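"""Parallel crawler/downloader for e-Stat census data files.

Walks the category pages linked from START_URL, collects the
file-download links found one level down, and fetches them
concurrently into DATA_DIR. Run with --dry-run to count files
without downloading anything.
"""
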
import argparse
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, parse_qs, urljoin, unquote

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
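
# Third-party dependencies: pip install requests beautifulsoup4 tqdm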

BASE_URL = "https://www.e-stat.go.jp"
START_URL = "https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00200521&tstat=000001136464"
DATA_DIR = "data"

def download_file(url, folder, dry_run=False):
    """Downloads a single file, deriving its name from the server response."""
    try:
        if dry_run:
            return  # Stay quiet so the progress bar remains readable.

        # e-Stat download URLs carry only numeric IDs, so the real filename
        # has to come from the Content-Disposition header of a HEAD request.
        response = requests.head(url, allow_redirects=True, timeout=30)
        disposition = response.headers.get("Content-Disposition", "")
        if "filename=" in disposition:
            filename = disposition.split("filename=")[-1].strip().strip('"')
        else:
            parsed_url = urlparse(url)
            qs = parse_qs(parsed_url.query)
            if 'statInfId' in qs:
                filename = f"{qs['statInfId'][0]}.xls"  # Fallback: stat ID plus a default extension
            else:
                filename = os.path.basename(parsed_url.path)

        filename = unquote(filename)
        filepath = os.path.join(folder, filename)

        if os.path.exists(filepath):
            return  # Already downloaded on a previous run.

        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Don't save error pages as data files.
        with open(filepath, 'wb') as f:
            f.write(response.content)

    except Exception as e:
        print(f"Failed to download {url}: {e}")

def get_links_from_page(url):
    """Fetches a page and returns (file_links, nav_links)."""
    file_links = []
    nav_links = []
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            full_url = urljoin(BASE_URL, href)
            
            if "file-download" in href and "statInfId" in href:
                file_links.append(full_url)
            elif "stat-search/files" in href and "toukei=00200521" in href:
                # Skip self-references; the crawl is shallow, so no further
                # cycle detection is needed.
                if full_url != url:
                    nav_links.append(full_url)
                    
    except Exception as e:
        print(f"Error processing {url}: {e}")
    
    return file_links, nav_links

def crawl_parallel(start_url, max_workers=10):
    """Crawls the e-Stat file pages in parallel."""
    print("Fetching main category page...")
    # 1. Get the navigation links from the start page; these are assumed
    #    to be the prefecture/category listing pages.
    _, prefecture_links = get_links_from_page(start_url)
    
    # Filter out duplicates
    prefecture_links = list(set(prefecture_links))
    print(f"Found {len(prefecture_links)} category/prefecture pages. Scanning them in parallel...")

    all_file_links = []
    
    # 2. Process each prefecture page in parallel to find files
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks
        future_to_url = {executor.submit(get_links_from_page, url): url for url in prefecture_links}
        
        for future in tqdm(as_completed(future_to_url), total=len(prefecture_links), desc="Crawling Pages"):
            url = future_to_url[future]
            try:
                f_links, _ = future.result()
                if f_links:
                    all_file_links.extend(f_links)
            except Exception as e:
                print(f"Error scanning {url}: {e}")
                
    return list(set(all_file_links))

def main():
    parser = argparse.ArgumentParser(description="Download e-Stat Census Data")
    parser.add_argument("--dry-run", action="store_true", help="Print URLs without downloading")
    parser.add_argument("--workers", type=int, default=10, help="Number of parallel threads")
    args = parser.parse_args()
    
    if not args.dry_run:
        os.makedirs(DATA_DIR, exist_ok=True)

    # 1. Parallel Crawl
    links = crawl_parallel(START_URL, max_workers=args.workers)
    print(f"Total files found: {len(links)}")
    
    # 2. Parallel Download
    print(f"Starting downloads with {args.workers} workers...")
    # executor.map does not pair cleanly with tqdm for completion tracking,
    # so submit futures explicitly and drain them with as_completed.
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [executor.submit(download_file, url, DATA_DIR, args.dry_run) for url in links]
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading Files"):
            pass

if __name__ == "__main__":
    main()
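
# Example invocations (assuming this file is saved as download_estat.py):
#   python download_estat.py --dry-run       # crawl only and report the file count
#   python download_estat.py --workers 20    # crawl, then download with 20 threads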