commit d9380d4a3121864fd49f37d8ae3ae3545071a503
Author: Jyri Eerola
Date:   Thu Jun 29 20:30:04 2017 +0300

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4889003
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+# wid_tool.py - Wikipedia Image Downloader
+
+wid_tool.py is a tool for downloading the images in Wikipedia articles.
+Up-to-date usage instructions can always be found by running
+
+    $ python3 wid_tool.py -h
+
+## Requirements
+
+The requirements are the following:
+* Python 3.6 or newer (the script uses f-strings)
+* lxml
+* cssselect
+
+## Quick start
+
+Let's download all images from a particular article:
+
+    $ python3 wid_tool.py https://en.wikipedia.org/wiki/List_of_screw_drives
+
+All the images from the article end up in the current folder. There is
+also a URL-ending filter. Let's download only the SVG files from the
+article above:
+
+    $ python3 wid_tool.py -f svg https://en.wikipedia.org/wiki/List_of_screw_drives
+
+Now only files whose URLs end in _svg_ are downloaded.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e87585d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+lxml
+cssselect
diff --git a/wid_tool.py b/wid_tool.py
new file mode 100644
index 0000000..9e843f5
--- /dev/null
+++ b/wid_tool.py
@@ -0,0 +1,31 @@
+from lxml.html import parse
+from urllib.request import urlopen, urlretrieve
+from urllib.parse import urlparse, urljoin
+import argparse
+
+argparser = argparse.ArgumentParser(description='Download images from a Wikipedia article.')
+argparser.add_argument('url', help="The article's URL")
+argparser.add_argument('-f', dest='href_filter', default='',
+                       help='Only download images whose URL ends with this string.')
+args = argparser.parse_args()
+
+purl = urlparse(args.url)
+
+# Collect the thumbnail anchors; each links to the image's file description page.
+with urlopen(args.url) as response:
+    root = parse(response).getroot()
+    anchors = root.cssselect('a.image')
+
+# The hrefs are site-relative, so prepend the article's scheme and host.
+links = [a.get('href') for a in anchors]
+links = [f'{purl.scheme}://{purl.netloc}{link}'
+         for link in links if link.endswith(args.href_filter)]
+
+for link in links:
+    # The 'a.internal' anchor on the file description page points at the
+    # full-resolution original, usually as a protocol-relative URL.
+    with urlopen(link) as response:
+        original = parse(response).getroot().cssselect('a.internal')[0].get('href')
+        filename = original.split('/')[-1]
+        print(filename)
+        urlretrieve(urljoin('https://', original), filename=filename)
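
Note (not part of the commit): the download loop above raises an IndexError when a
file description page has no 'a.internal' anchor, and a single HTTP error aborts the
whole run. One possible hardening of the loop is sketched below; it is a minimal
sketch, not the committed code, and it reuses the script's imports and the `links`
list built above. The error messages are illustrative.

    from urllib.error import HTTPError, URLError

    for link in links:
        try:
            with urlopen(link) as response:
                internal = parse(response).getroot().cssselect('a.internal')
            if not internal:
                # Some file pages may lack the full-resolution link; skip them.
                print(f'skipping {link}: no original image link found')
                continue
            original = internal[0].get('href')
            filename = original.split('/')[-1]
            print(filename)
            urlretrieve(urljoin('https://', original), filename=filename)
        except (HTTPError, URLError) as err:
            # Report the failure and keep going instead of aborting the run.
            print(f'failed to fetch {link}: {err}')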