# src/data/download_census_data.py
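"""Parallel downloader for Population Census tables from the e-Stat portal.

Starting from START_URL, the script collects the category/prefecture pages
for survey 00200521, scans them concurrently for file-download links, and
fetches every file into DATA_DIR. Use --dry-run to crawl without downloading
and --workers to control the size of the thread pool.
"""
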
import os
import requests
import argparse
from urllib.parse import urlparse, parse_qs, urljoin, unquote
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

BASE_URL = "https://www.e-stat.go.jp"
START_URL = "https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00200521&tstat=000001136464"
DATA_DIR = "data"


def download_file(url, folder, dry_run=False):
    """Downloads a file from the given URL into the target folder."""
    try:
        if dry_run:
            # print(f"[Dry Run] Would download: {url}")  # Silenced for the progress bar
            return

        # Get the filename from the Content-Disposition header, falling back to the URL.
        # Note: a HEAD request per file adds latency when run in parallel, but e-Stat URLs
        # only expose numeric IDs, so there is no reliable filename in the URL itself.
        response = requests.head(url, allow_redirects=True, timeout=30)
        if "Content-Disposition" in response.headers:
            filename = response.headers["Content-Disposition"].split("filename=")[-1].strip('"')
        else:
            parsed_url = urlparse(url)
            qs = parse_qs(parsed_url.query)
            if 'statInfId' in qs:
                filename = f"{qs['statInfId'][0]}.xls"  # Default extension
            else:
                filename = os.path.basename(parsed_url.path)
        filename = unquote(filename)

        filepath = os.path.join(folder, filename)
        if os.path.exists(filepath):
            # print(f"Skipping {filename} (already exists)")
            return

        # print(f"Downloading {filename}...")
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            f.write(response.content)
        # print(f"Saved {filename}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")


def get_links_from_page(url):
    """Fetches a page and returns (file_links, nav_links)."""
    file_links = []
    nav_links = []
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            full_url = urljoin(BASE_URL, href)
            if "file-download" in href and "statInfId" in href:
                file_links.append(full_url)
            elif "stat-search/files" in href and "toukei=00200521" in href:
                # Skip self-references; a simple equality check is enough here
                if full_url != url:
                    nav_links.append(full_url)
    except Exception as e:
        print(f"Error processing {url}: {e}")
    return file_links, nav_links


def crawl_parallel(start_url, max_workers=10):
    """Crawls the e-Stat file pages in parallel."""
    print("Fetching main category page...")
    # 1. Get the initial links (prefecture pages) from the start page.
    #    We assume the start page lists the prefectures as navigation links.
    _, prefecture_links = get_links_from_page(start_url)
    # Filter out duplicates
    prefecture_links = list(set(prefecture_links))
    print(f"Found {len(prefecture_links)} category/prefecture pages. Scanning them in parallel...")

    all_file_links = []
    # 2. Process each prefecture page in parallel to find file links
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks
        future_to_url = {executor.submit(get_links_from_page, url): url for url in prefecture_links}
        for future in tqdm(as_completed(future_to_url), total=len(prefecture_links), desc="Crawling Pages"):
            url = future_to_url[future]
            try:
                f_links, _ = future.result()
                if f_links:
                    all_file_links.extend(f_links)
            except Exception as e:
                print(f"Error scanning {url}: {e}")
    return list(set(all_file_links))


def main():
    parser = argparse.ArgumentParser(description="Download e-Stat Census Data")
    parser.add_argument("--dry-run", action="store_true", help="Print URLs without downloading")
    parser.add_argument("--workers", type=int, default=10, help="Number of parallel threads")
    args = parser.parse_args()

    if not args.dry_run and not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    # 1. Parallel crawl
    links = crawl_parallel(START_URL, max_workers=args.workers)
    print(f"Total files found: {len(links)}")

    # 2. Parallel download
    print(f"Starting downloads with {args.workers} workers...")
    # executor.map does not combine cleanly with tqdm when tracking completion,
    # so we submit futures and iterate over them with as_completed.
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [executor.submit(download_file, url, DATA_DIR, args.dry_run) for url in links]
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading Files"):
            pass


if __name__ == "__main__":
    main()
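
# Example invocations (assuming the script lives at src/data/download_census_data.py):
#   python src/data/download_census_data.py --dry-run       # crawl only, skip downloads
#   python src/data/download_census_data.py --workers 20    # crawl and download with 20 threads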