Initial commit
commit d9380d4a31
3 changed files with 55 additions and 0 deletions
README.md (new file)
@@ -0,0 +1,28 @@
# wid_tool.py - Wikipedia Image Downloader

wid_tool.py is a tool for downloading the images in Wikipedia articles.
Up-to-date usage instructions can always be found by running

    $ python3 wid_tool.py -h
## Requirements

The requirements are the following:

* A reasonably recent version of Python 3
* lxml
* cssselect
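Both libraries are listed in the bundled requirements.txt, so one way to
install them is with pip:

    $ python3 -m pip install -r requirements.txt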
## Quick start

Let's download all the images from a particular article:

    $ python3 wid_tool.py https://en.wikipedia.org/wiki/List_of_screw_drives

All the images should now be found in the current folder. There is also a
URL-ending filter available. Let's download only the SVG files from the
article above:

    $ python3 wid_tool.py -f svg https://en.wikipedia.org/wiki/List_of_screw_drives

Now only the files whose URLs end with _svg_ should be downloaded.
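Under the hood the filter is just a string-suffix check on each image link,
as wid_tool.py below shows. A minimal sketch of the idea (the example links
here are made up):

    links = ['/wiki/File:Torx_screw.svg', '/wiki/File:Phillips_screw.jpg']

    # Keep only the links whose URL ends with the requested suffix.
    svg_links = [link for link in links if link.endswith('svg')]
    print(svg_links)  # ['/wiki/File:Torx_screw.svg']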
requirements.txt (new file)
@@ -0,0 +1,2 @@
lxml
cssselect
wid_tool.py (new file)
@@ -0,0 +1,25 @@
from lxml.html import parse
from urllib.request import urlopen, urlretrieve
from urllib.parse import urlparse, urljoin
import argparse

argparser = argparse.ArgumentParser(description='Download images from a Wikipedia article.')
argparser.add_argument('url', help='The article\'s URL')
argparser.add_argument('-f', dest='href_filter', default='', help='Filter image file names with specified ending.')
args = argparser.parse_args()

# Remember the scheme and host so relative hrefs can be made absolute later.
purl = urlparse(args.url)

# Fetch the article and collect the anchors wrapping its images.
with urlopen(args.url) as response:
    root = parse(response).getroot()
    anchors = root.cssselect('a.image')

# Build absolute file-page URLs, keeping only those that end with the filter.
links = [a.get('href') for a in anchors]
links = [f'{purl.scheme}://{purl.netloc}{link}' for link in links if link.endswith(args.href_filter)]

for link in links:
    # Each file page links to the full-size original via an 'a.internal' anchor.
    with urlopen(link) as response:
        original = parse(response).getroot().cssselect('a.internal')[0].get('href')
    filename = original.split('/')[-1]
    print(filename)
    # The original's href is protocol-relative, so graft an https scheme onto it.
    urlretrieve(urljoin('https://', original), filename=filename)
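One detail worth calling out: the file page's a.internal href is
protocol-relative (it starts with //), which is why the script joins it with
a bare scheme in the last line. A quick illustration of that urljoin
behavior (the upload URL here is only an example):

    from urllib.parse import urljoin

    # A network-path relative URL ('//host/path') takes its scheme from the base.
    print(urljoin('https://', '//upload.wikimedia.org/wikipedia/commons/Example.svg'))
    # https://upload.wikimedia.org/wikipedia/commons/Example.svg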