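"""Parallel crawler/downloader for e-Stat census data files.

Walks the category pages linked from START_URL, collects the
file-download links found one level down, and fetches them
concurrently into DATA_DIR. Run with --dry-run to count files
without downloading anything.
"""
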
import argparse
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, parse_qs, urljoin, unquote

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
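
# Third-party dependencies: pip install requests beautifulsoup4 tqdm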

BASE_URL = "https://www.e-stat.go.jp"
START_URL = "https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00200521&tstat=000001136464"
DATA_DIR = "data"

def download_file(url, folder, dry_run=False):
    """Downloads a single file, deriving its name from the server response."""
    try:
        if dry_run:
            return  # Stay quiet so the progress bar remains readable.

        # e-Stat download URLs carry only numeric IDs, so the real filename
        # has to come from the Content-Disposition header of a HEAD request.
        response = requests.head(url, allow_redirects=True, timeout=30)
        disposition = response.headers.get("Content-Disposition", "")
        if "filename=" in disposition:
            filename = disposition.split("filename=")[-1].strip().strip('"')
        else:
            parsed_url = urlparse(url)
            qs = parse_qs(parsed_url.query)
            if 'statInfId' in qs:
                filename = f"{qs['statInfId'][0]}.xls"  # Fallback: stat ID plus a default extension
            else:
                filename = os.path.basename(parsed_url.path)

        filename = unquote(filename)
        filepath = os.path.join(folder, filename)

        if os.path.exists(filepath):
            return  # Already downloaded on a previous run.

        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Don't save error pages as data files.
        with open(filepath, 'wb') as f:
            f.write(response.content)

    except Exception as e:
        print(f"Failed to download {url}: {e}")

def get_links_from_page(url):
    """Fetches a page and returns (file_links, nav_links)."""
    file_links = []
    nav_links = []
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            full_url = urljoin(BASE_URL, href)
            
            if "file-download" in href and "statInfId" in href:
                file_links.append(full_url)
            elif "stat-search/files" in href and "toukei=00200521" in href:
                # Skip self-references; the crawl is shallow, so no further
                # cycle detection is needed.
                if full_url != url:
                    nav_links.append(full_url)
                    
    except Exception as e:
        print(f"Error processing {url}: {e}")
    
    return file_links, nav_links

def crawl_parallel(start_url, max_workers=10):
    """Crawls the e-Stat file pages in parallel."""
    print("Fetching main category page...")
    # 1. Get the navigation links from the start page; these are assumed
    #    to be the prefecture/category listing pages.
    _, prefecture_links = get_links_from_page(start_url)
    
    # Filter out duplicates
    prefecture_links = list(set(prefecture_links))
    print(f"Found {len(prefecture_links)} category/prefecture pages. Scanning them in parallel...")

    all_file_links = []
    
    # 2. Process each prefecture page in parallel to find files
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks
        future_to_url = {executor.submit(get_links_from_page, url): url for url in prefecture_links}
        
        for future in tqdm(as_completed(future_to_url), total=len(prefecture_links), desc="Crawling Pages"):
            url = future_to_url[future]
            try:
                f_links, _ = future.result()
                if f_links:
                    all_file_links.extend(f_links)
            except Exception as e:
                print(f"Error scanning {url}: {e}")
                
    return list(set(all_file_links))

def main():
    parser = argparse.ArgumentParser(description="Download e-Stat Census Data")
    parser.add_argument("--dry-run", action="store_true", help="Print URLs without downloading")
    parser.add_argument("--workers", type=int, default=10, help="Number of parallel threads")
    args = parser.parse_args()
    
    if not args.dry_run:
        os.makedirs(DATA_DIR, exist_ok=True)

    # 1. Parallel Crawl
    links = crawl_parallel(START_URL, max_workers=args.workers)
    print(f"Total files found: {len(links)}")
    
    # 2. Parallel Download
    print(f"Starting downloads with {args.workers} workers...")
    # executor.map does not pair cleanly with tqdm for completion tracking,
    # so submit futures explicitly and drain them with as_completed.
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [executor.submit(download_file, url, DATA_DIR, args.dry_run) for url in links]
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading Files"):
            pass

if __name__ == "__main__":
    main()
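
# Example invocations (assuming this file is saved as download_estat.py):
#   python download_estat.py --dry-run       # crawl only and report the file count
#   python download_estat.py --workers 20    # crawl, then download with 20 threads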