import os
import argparse

import requests
from urllib.parse import urlparse, parse_qs, urljoin, unquote
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

BASE_URL = "https://www.e-stat.go.jp"
START_URL = "https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00200521&tstat=000001136464"
DATA_DIR = "data"


def download_file(url, folder, dry_run=False):
    """Download a single file from the given URL into `folder`."""
    try:
        if dry_run:
            # tqdm.write() cooperates with the progress bar, unlike plain print().
            tqdm.write(f"[Dry Run] Would download: {url}")
            return
        # Determine the filename from the Content-Disposition header or the URL.
        # A HEAD request per file adds latency, but e-Stat download URLs only
        # carry opaque IDs, so the header is the most reliable source of names.
        response = requests.head(url, allow_redirects=True, timeout=30)
        if "Content-Disposition" in response.headers:
            filename = response.headers["Content-Disposition"].split("filename=")[-1].strip('"')
        else:
            parsed_url = urlparse(url)
            qs = parse_qs(parsed_url.query)
            if "statInfId" in qs:
                filename = f"{qs['statInfId'][0]}.xls"  # Default extension
            else:
                filename = os.path.basename(parsed_url.path)
        filename = unquote(filename)
        filepath = os.path.join(folder, filename)
        if os.path.exists(filepath):
            # Skip files we already have, so the script is resumable.
            return
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(response.content)
    except Exception as e:
        print(f"Failed to download {url}: {e}")


def get_links_from_page(url):
    """Fetch a page and return (file_links, nav_links)."""
    file_links = []
    nav_links = []
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link["href"]
            full_url = urljoin(BASE_URL, href)
            if "file-download" in href and "statInfId" in href:
                file_links.append(full_url)
            elif "stat-search/files" in href and "toukei=00200521" in href:
                # Avoid self-references; a simple equality check is enough here.
                if full_url != url:
                    nav_links.append(full_url)
    except Exception as e:
        print(f"Error processing {url}: {e}")
    return file_links, nav_links


def crawl_parallel(start_url, max_workers=10):
    """Crawl the e-Stat file pages in parallel and return unique file URLs."""
    print("Fetching main category page...")
    # 1. Get initial navigation links from the start page.
    #    We assume the start page lists the prefecture pages (nav links).
    _, prefecture_links = get_links_from_page(start_url)
    prefecture_links = list(set(prefecture_links))  # De-duplicate
    print(f"Found {len(prefecture_links)} category/prefecture pages. Scanning them in parallel...")

    all_file_links = []
    # 2. Process each prefecture page in parallel to find file links.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(get_links_from_page, url): url for url in prefecture_links}
        for future in tqdm(as_completed(future_to_url), total=len(prefecture_links), desc="Crawling Pages"):
            url = future_to_url[future]
            try:
                f_links, _ = future.result()
                if f_links:
                    all_file_links.extend(f_links)
            except Exception as e:
                print(f"Error scanning {url}: {e}")
    return list(set(all_file_links))


def main():
    parser = argparse.ArgumentParser(description="Download e-Stat Census Data")
    parser.add_argument("--dry-run", action="store_true", help="Print URLs without downloading")
    parser.add_argument("--workers", type=int, default=10, help="Number of parallel threads")
    args = parser.parse_args()

    if not args.dry_run:
        os.makedirs(DATA_DIR, exist_ok=True)

    # 1. Parallel crawl.
    links = crawl_parallel(START_URL, max_workers=args.workers)
    print(f"Total files found: {len(links)}")

    # 2. Parallel download. executor.map() doesn't pair easily with tqdm when we
    #    want to track completions, so we submit futures and iterate as_completed.
    print(f"Starting downloads with {args.workers} workers...")
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [executor.submit(download_file, url, DATA_DIR, args.dry_run) for url in links]
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading Files"):
            pass


if __name__ == "__main__":
    main()
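
# Example invocations (a sketch: the filename "download_estat.py" is an
# assumption, not taken from the source; the flags are defined in main() above):
#
#   python download_estat.py --dry-run        # crawl and print file URLs, no downloads
#   python download_estat.py --workers 20     # crawl and download with 20 threads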