| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import os |
| | import time |
| | import datetime |
| | import urllib |
| | import urllib2 |
| | import urlparse |
| | import socket |
| | import argparse |
| | import multiprocessing |
| | import xml.etree.ElementTree as ElementTree |
| |
|
# Values substituted into the search request by compose_url.
PER_PAGE = 500
SORT = "date-posted-desc"

# %-template for a flickr.photos.search REST call.  The placeholders are, in
# order: api_key, text, sort, per_page, page, min_upload_date,
# max_upload_date.
URL = (
    "https://api.flickr.com/services/rest/?method=flickr.photos.search&"
    "api_key=%s&text=%s&sort=%s&per_page=%d&page=%d&min_upload_date=%s&"
    "max_upload_date=%s&format=rest&extras=url_o,url_l,url_c,url_z,url_n"
)

# Attempt limits for result pages and images, and the timeout handed to
# urlopen when fetching a result page.
MAX_PAGE_REQUESTS = 5
MAX_PAGE_TIMEOUT = 20
MAX_IMAGE_REQUESTS = 3

# Width of one search window in seconds (24 hours).
TIME_SKIP = 60 * 60 * 24

# Initial search window: the TIME_SKIP seconds that end at module load time.
MAX_DATE = time.time()
MIN_DATE = MAX_DATE - TIME_SKIP
| |
|
| |
|
def parse_args():
    """Parse the command line of the downloader.

    Required options: --search_text, --api_key, --image_path.
    Optional integer options: --num_procs (default 10) and
    --max_days_without_image (default 365).

    Returns:
        The argparse namespace holding the parsed options.
    """
    arg_parser = argparse.ArgumentParser()

    for required_flag in ("--search_text", "--api_key", "--image_path"):
        arg_parser.add_argument(required_flag, required=True)

    arg_parser.add_argument("--num_procs", type=int, default=10)
    arg_parser.add_argument("--max_days_without_image", type=int,
                            default=365)

    return arg_parser.parse_args()
| |
|
| |
|
def compose_url(page, api_key, text, min_date, max_date):
    """Build the search request URL for one result page.

    The arguments are substituted into the module-level URL template
    together with SORT and PER_PAGE.  min_date and max_date are converted
    with str().  No percent-encoding is applied to any value, so the caller
    is responsible for passing URL-safe strings.
    """
    date_range = (str(min_date), str(max_date))
    request_fields = (api_key, text, SORT, PER_PAGE, page) + date_range
    return URL % request_fields
| |
|
| |
|
def parse_page(page, api_key, text, min_date, max_date):
    """Fetch one page of search results and parse the XML reply.

    The request is attempted up to MAX_PAGE_REQUESTS times.  An attempt that
    times out, either while connecting or while reading the body, is
    retried; any other error propagates to the caller.

    Args:
        page: Number of the result page to request.
        api_key: API key inserted into the request URL.
        text: Search text inserted into the request URL.
        min_date: Lower bound of the upload-date window.
        max_date: Upper bound of the upload-date window.

    Returns:
        A pair (metadata, photos).  metadata is the attribute dict of the
        <photos> element and photos is a list holding the attribute dict of
        every <photo> element.  If every attempt timed out, metadata is a
        placeholder dict whose 'pages', 'total', 'page' and 'perpage' entries
        are all '0', and photos is an empty tuple.

    Raises:
        IOError: If the root element's "stat" attribute is not "ok".
    """
    request_url = compose_url(page, api_key, text, min_date, max_date)

    response = None
    for _ in range(MAX_PAGE_REQUESTS):
        try:
            f = urllib2.urlopen(request_url, timeout=MAX_PAGE_TIMEOUT)
            try:
                # Read inside the retry loop so that a timeout while
                # receiving the body is retried like a connect timeout.
                response = f.read()
            finally:
                # Always release the connection, even if read() fails.
                f.close()
        except socket.timeout:
            continue
        except urllib2.URLError as error:
            # A timeout raised while connecting is wrapped in URLError;
            # treat it like a plain socket.timeout and re-raise the rest.
            if isinstance(getattr(error, "reason", None), socket.timeout):
                continue
            raise
        else:
            break

    if response is None:
        return {'pages': '0',
                'total': '0',
                'page': '0',
                'perpage': '0'}, tuple()

    root = ElementTree.fromstring(response)

    if root.get("stat") != "ok":
        # NOTE(review): a failed reply is expected to carry an <err> element
        # describing the problem; include its attributes when present.
        err = root.find("err")
        detail = "" if err is None else ": %s" % (dict(err.attrib),)
        raise IOError("Flickr API request failed%s" % detail)

    photos = [photo.attrib for photo in root.iter("photo")]

    return root.find("photos").attrib, photos
| |
|
| |
|
class PhotoDownloader(object):
    """Callable that downloads a single photo into a fixed directory.

    main() hands an instance to multiprocessing.Pool.map_async, which calls
    it once per photo attribute dict.
    """

    def __init__(self, image_path):
        """Remember the directory that downloaded images are written to."""
        self.image_path = image_path

    def __call__(self, photo):
        """Download ``photo`` unless a file for it already exists.

        Args:
            photo: Mapping with the photo's attributes.  It must provide
                "id" and "secret" when a URL is present, and may provide
                "url_<suffix>" entries.

        The first available URL in the preference order below is used.  The
        file is stored as "<id>_<secret><ext>" inside ``image_path``, where
        <ext> is the extension of the URL path.  Nothing happens when no URL
        is available or the target file already exists.
        """
        # NOTE(review): the request template in this file asks for
        # url_o, url_l, url_c, url_z and url_n, while this list also probes
        # k/h/b and never probes n -- confirm which set is intended.
        url = None
        for url_suffix in ("o", "l", "k", "h", "b", "c", "z"):
            url = photo.get("url_%s" % url_suffix)
            if url is not None:
                break

        if url is None:
            return

        # Keep the extension of the remote file for the local file name.
        url_filename = urlparse.urlparse(url).path
        image_ext = os.path.splitext(url_filename)[1]

        image_name = "%s_%s%s" % (photo["id"], photo["secret"], image_ext)
        path = os.path.join(self.image_path, image_name)
        if os.path.exists(path):
            return

        print(url)
        for _ in range(MAX_IMAGE_REQUESTS):
            try:
                urllib.urlretrieve(url, path)
            except urllib.ContentTooShortError:
                # urlretrieve leaves the truncated data on disk.  Remove it
                # so that, if every attempt fails, a later run does not
                # mistake the partial file for a finished download.
                if os.path.exists(path):
                    os.remove(path)
                continue
            else:
                break
| |
|
| |
|
def main():
    """Crawl search results backwards in time and download the photos.

    Starting with the window [MIN_DATE, MAX_DATE], every result page of the
    window is fetched with parse_page and its photos are downloaded in a
    worker pool.  After the last page of a window, the window is moved back
    by TIME_SKIP seconds.  The crawl stops after
    ``--max_days_without_image`` consecutive windows without any result
    page, or when interrupted with Ctrl-C.
    """
    args = parse_args()

    downloader = PhotoDownloader(args.image_path)
    pool = multiprocessing.Pool(processes=args.num_procs)

    # Unknown until the first page of a window has been fetched.
    num_pages = float("inf")
    page = 0

    min_date = MIN_DATE
    max_date = MAX_DATE

    # Number of consecutive windows that produced no result page.
    days_in_row = 0

    search_text = args.search_text.replace(" ", "-")

    separator = 78 * "="

    try:
        while num_pages > page:
            page += 1

            metadata, photos = parse_page(page, args.api_key, search_text,
                                          min_date, max_date)

            num_pages = int(metadata["pages"])

            # Each line is formatted into a single string so that it prints
            # as text rather than as a tuple when print is a statement.
            print(separator)
            print("Page:\t\t %d of %d" % (page, num_pages))
            print("Min-Date:\t %s"
                  % datetime.datetime.fromtimestamp(min_date))
            print("Max-Date:\t %s"
                  % datetime.datetime.fromtimestamp(max_date))
            print("Num-Photos:\t %d" % len(photos))
            print(separator)

            try:
                # NOTE(review): the huge timeout is presumably the usual
                # workaround that lets Ctrl-C interrupt the wait -- confirm.
                pool.map_async(downloader, photos).get(1e10)
            except KeyboardInterrupt:
                # The pool is shut down in the finally clause below.
                break

            if page >= num_pages:
                # Last page of this window: step one window back in time.
                max_date -= TIME_SKIP
                min_date -= TIME_SKIP
                page = 0

                if num_pages == 0:
                    days_in_row += 1
                    # Reset so that the loop condition holds for the next
                    # window even though this one had no pages.
                    num_pages = float("inf")

                    print(" No images in %d days in a row" % days_in_row)

                    if days_in_row >= args.max_days_without_image:
                        break
                else:
                    days_in_row = 0
    finally:
        # Pool has no wait() method; terminate() stops the workers and
        # join() reaps them.  On the normal path no work is outstanding.
        pool.terminate()
        pool.join()
| |
|
| |
|
# Start the crawl only when this file is executed as a script; importing it
# as a module must not trigger any downloads.
if __name__ == "__main__":
    main()
| |
|