commit d9380d4a3121864fd49f37d8ae3ae3545071a503
Author: Jyri Eerola
Date:   Thu Jun 29 20:30:04 2017 +0300

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4889003
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+# wid_tool.py - Wikipedia Image Downloader
+
+wid_tool.py is a tool for downloading the images in Wikipedia articles.
+Up-to-date usage instructions can always be found by running
+
+    $ python3 wid_tool.py -h
+
+## Requirements
+
+The requirements are the following:
+* Python 3.6 or newer (the script uses f-strings)
+* lxml
+* cssselect
+
+## Quick start
+
+Let's download all images from a particular article:
+
+    $ python3 wid_tool.py https://en.wikipedia.org/wiki/List_of_screw_drives
+
+All the images from the article end up in the current folder. There is
+also a URL-ending filter. Let's download only the SVG files from the
+article above:
+
+    $ python3 wid_tool.py -f svg https://en.wikipedia.org/wiki/List_of_screw_drives
+
+Now only files whose URLs end in _svg_ are downloaded.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e87585d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+lxml
+cssselect
diff --git a/wid_tool.py b/wid_tool.py
new file mode 100644
index 0000000..9e843f5
--- /dev/null
+++ b/wid_tool.py
@@ -0,0 +1,31 @@
+from lxml.html import parse
+from urllib.request import urlopen, urlretrieve
+from urllib.parse import urlparse, urljoin
+import argparse
+
+argparser = argparse.ArgumentParser(description='Download images from a Wikipedia article.')
+argparser.add_argument('url', help="The article's URL")
+argparser.add_argument('-f', dest='href_filter', default='',
+                       help='Only download images whose URL ends with this string.')
+args = argparser.parse_args()
+
+purl = urlparse(args.url)
+
+# Collect the thumbnail anchors; each links to the image's file description page.
+with urlopen(args.url) as response:
+    root = parse(response).getroot()
+    anchors = root.cssselect('a.image')
+
+# The hrefs are site-relative, so prepend the article's scheme and host.
+links = [a.get('href') for a in anchors]
+links = [f'{purl.scheme}://{purl.netloc}{link}'
+         for link in links if link.endswith(args.href_filter)]
+
+for link in links:
+    # The 'a.internal' anchor on the file description page points at the
+    # full-resolution original, usually as a protocol-relative URL.
+    with urlopen(link) as response:
+        original = parse(response).getroot().cssselect('a.internal')[0].get('href')
+        filename = original.split('/')[-1]
+        print(filename)
+        urlretrieve(urljoin('https://', original), filename=filename)
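
Note (not part of the commit): the download loop above raises an IndexError when a
file description page has no 'a.internal' anchor, and a single HTTP error aborts the
whole run. One possible hardening of the loop is sketched below; it is a minimal
sketch, not the committed code, and it reuses the script's imports and the `links`
list built above. The error messages are illustrative.

    from urllib.error import HTTPError, URLError

    for link in links:
        try:
            with urlopen(link) as response:
                internal = parse(response).getroot().cssselect('a.internal')
            if not internal:
                # Some file pages may lack the full-resolution link; skip them.
                print(f'skipping {link}: no original image link found')
                continue
            original = internal[0].get('href')
            filename = original.split('/')[-1]
            print(filename)
            urlretrieve(urljoin('https://', original), filename=filename)
        except (HTTPError, URLError) as err:
            # Report the failure and keep going instead of aborting the run.
            print(f'failed to fetch {link}: {err}')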