File size: 14,430 Bytes
46f2cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
import modal
from urllib.parse import urlparse, parse_qs, urljoin, unquote
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

# Create Modal app
app = modal.App("census-csv-downloader")

# Create a volume to store the downloaded files
volume = modal.Volume.from_name("census-csv-data", create_if_missing=True)

# Define the image with required dependencies
image = modal.Image.debian_slim().pip_install(
    "requests",
    "beautifulsoup4",
    "tqdm",
    "pandas",
    "openpyxl",
    "xlrd"
)

BASE_URL = "https://www.e-stat.go.jp"
START_URL = "https://www.e-stat.go.jp/en/stat-search/files?page=1&toukei=00200521&tstat=000001136464"
VOLUME_PATH = "/data"

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=900,  # Increased for conversion time
    retries=3,
    max_containers=100,  # Reduced for memory usage
    cpu=1.0,
    memory=1024,  # Added memory for Excel processing
)
def download_and_convert_to_csv(url: str) -> dict:
    """Downloads Excel file and converts to CSV."""
    import os
    
    try:
        # Extract statInfId from URL for filename
        parsed_url = urlparse(url)
        qs = parse_qs(parsed_url.query)
        if 'statInfId' not in qs:
            return {"url": url, "status": "error", "message": "No statInfId in URL"}
            
        stat_id = qs['statInfId'][0]
        
        # Check if CSV file already exists
        csv_filename = f"{stat_id}.csv"
        csv_filepath = os.path.join(VOLUME_PATH, csv_filename)
        
        if os.path.exists(csv_filepath) and os.path.getsize(csv_filepath) > 0:
            return {"url": url, "status": "skipped", "filename": csv_filename, "type": "csv"}
        
        # Download the Excel file
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        
        # Save temporary Excel file
        temp_excel_path = os.path.join(VOLUME_PATH, f"temp_{stat_id}.xlsx")
        with open(temp_excel_path, 'wb') as f:
            f.write(response.content)
        
        try:
            # Convert Excel to CSV
            # Try different Excel engines
            for engine in ['openpyxl', 'xlrd']:
                try:
                    # Read Excel file (try first sheet)
                    df = pd.read_excel(temp_excel_path, engine=engine, sheet_name=0)
                    
                    # Clean the data
                    df = df.dropna(how='all')  # Remove empty rows
                    df = df.fillna('')  # Replace NaN with empty string
                    
                    # Save as CSV
                    df.to_csv(csv_filepath, index=False, encoding='utf-8')
                    
                    # Remove temporary Excel file
                    os.remove(temp_excel_path)
                    
                    # Commit changes to volume
                    volume.commit()
                    
                    return {
                        "url": url, 
                        "status": "success", 
                        "filename": csv_filename, 
                        "type": "csv",
                        "rows": len(df),
                        "columns": len(df.columns)
                    }
                    
                except Exception as e:
                    continue  # Try next engine
            
            # If all engines failed
            os.remove(temp_excel_path)
            return {"url": url, "status": "error", "message": "Could not read Excel file"}
            
        except Exception as e:
            # Clean up temp file if conversion fails
            if os.path.exists(temp_excel_path):
                os.remove(temp_excel_path)
            raise e
        
    except Exception as e:
        return {"url": url, "status": "error", "message": str(e)}

@app.function(image=image, timeout=3600)
def get_links_from_page(url: str) -> tuple:
    """Fetches a page and returns (file_links, nav_links)."""
    file_links = []
    nav_links = []
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            full_url = urljoin(BASE_URL, href)
            
            if "file-download" in href and "statInfId" in href:
                file_links.append(full_url)
            elif "stat-search/files" in href and "toukei=00200521" in href:
                if full_url != url: 
                    nav_links.append(full_url)
                    
    except Exception as e:
        print(f"Error processing {url}: {e}")
    
    return file_links, nav_links

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=600
)
def analyze_file_batch(filenames: list) -> list:
    """Analyze a batch of files and return their metadata."""
    import os
    
    results = []
    for filename in filenames:
        try:
            filepath = os.path.join(VOLUME_PATH, filename)
            
            # Get basic file info
            file_info = {
                'filename': filename,
                'stat_id': filename.replace('.csv', ''),
                'size_bytes': os.path.getsize(filepath),
                'modified': os.path.getmtime(filepath)
            }
            
            # Read column information
            try:
                df = pd.read_csv(filepath, nrows=0)  # Just read header
                file_info['columns'] = len(df.columns)
                file_info['column_names'] = str(list(df.columns))  # Convert to string for CSV
            except Exception as e:
                file_info['columns'] = 0
                file_info['column_names'] = str([])
            
            results.append(file_info)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            
    return results

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=1800  # 30 minutes
)
def create_master_csv() -> dict:
    """Creates a master CSV file with metadata about all downloaded files."""
    import os
    import json
    
    try:
        print("Scanning volume for CSV files...")
        filenames = [f for f in os.listdir(VOLUME_PATH) if f.endswith('.csv') and not f.startswith('temp_')]
        print(f"Found {len(filenames)} CSV files to process")
        
        # Process files in parallel batches
        batch_size = 100
        batches = [filenames[i:i+batch_size] for i in range(0, len(filenames), batch_size)]
        
        print(f"Processing {len(batches)} batches of {batch_size} files each...")
        
        all_results = []
        for i, batch_results in enumerate(analyze_file_batch.map(batches)):
            all_results.extend(batch_results)
            print(f"Completed batch {i+1}/{len(batches)} ({len(all_results)} files processed)")
        
        print("Creating master CSV file...")
        # Create master CSV
        if all_results:
            master_df = pd.DataFrame(all_results)
            master_path = os.path.join(VOLUME_PATH, 'master_inventory.csv')
            master_df.to_csv(master_path, index=False)
            
            volume.commit()
            
            return {
                "status": "success",
                "total_files": len(all_results),
                "master_file": "master_inventory.csv"
            }
        else:
            return {"status": "error", "message": "No CSV files found"}
            
    except Exception as e:
        return {"status": "error", "message": str(e)}

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=300
)
def list_csv_files() -> dict:
    """Lists all CSV files in the volume."""
    import os
    
    try:
        files = []
        for filename in os.listdir(VOLUME_PATH):
            if filename.endswith('.csv'):
                filepath = os.path.join(VOLUME_PATH, filename)
                files.append({
                    'filename': filename,
                    'size_bytes': os.path.getsize(filepath)
                })
        
        return {
            "status": "success",
            "total_files": len(files),
            "files": files[:20]  # Show first 20 files
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}

@app.local_entrypoint()
def main():
    """Main function to orchestrate download and conversion."""
    print("Starting Japan Census Data Downloader (CSV Converter)...")
    
    # Get prefecture links
    print("Fetching main category page...")
    _, prefecture_links = get_links_from_page.remote(START_URL)
    prefecture_links = list(set(prefecture_links))
    print(f"Found {len(prefecture_links)} category/prefecture pages.")
    
    # Get all file links from prefecture pages in parallel
    print("Scanning prefecture pages for file links...")
    all_file_links = []
    
    for file_links, _ in get_links_from_page.map(prefecture_links):
        all_file_links.extend(file_links)
    
    all_file_links = list(set(all_file_links))
    print(f"Total files found: {len(all_file_links)}")
    
    # Download and convert all files to CSV in parallel
    print(f"Starting downloads and CSV conversion across Modal containers...")
    results = list(download_and_convert_to_csv.map(all_file_links))
    
    # Summary
    success = sum(1 for r in results if r["status"] == "success")
    skipped = sum(1 for r in results if r["status"] == "skipped")
    errors = sum(1 for r in results if r["status"] == "error")
    
    print(f"\n=== Download & Conversion Summary ===")
    print(f"Total files: {len(results)}")
    print(f"Successfully converted to CSV: {success}")
    print(f"Skipped (already exists): {skipped}")
    print(f"Errors: {errors}")
    
    # Show details of successful conversions
    if success > 0:
        total_rows = sum(r.get('rows', 0) for r in results if r["status"] == "success")
        print(f"\nTotal data rows converted: {total_rows:,}")
    
    # Show errors
    if errors > 0:
        print(f"\nFailed URLs:")
        for r in results:
            if r["status"] == "error":
                print(f"  - {r['url']}: {r.get('message', 'Unknown error')}")
    
    # Create master inventory
    print(f"\nCreating master inventory CSV...")
    master_result = create_master_csv.remote()
    if master_result["status"] == "success":
        print(f"Master inventory created: {master_result['total_files']} files indexed")
    else:
        print(f"Failed to create master inventory: {master_result.get('message', 'Unknown error')}")

@app.local_entrypoint()
def check_files():
    """Check what files are in the volume."""
    result = list_csv_files.remote()
    if result["status"] == "success":
        print(f"Found {result['total_files']} CSV files:")
        for file_info in result["files"]:
            print(f"  - {file_info['filename']} ({file_info['size_bytes']:,} bytes)")
    else:
        print(f"Error: {result.get('message', 'Unknown error')}")

@app.local_entrypoint()
def create_inventory():
    """Create master inventory of all files."""
    print("Creating master inventory CSV...")
    master_result = create_master_csv.remote()
    if master_result["status"] == "success":
        print(f"Master inventory created: {master_result['total_files']} files indexed")
        print(f"Master file: {master_result['master_file']}")
    else:
        print(f"Failed to create master inventory: {master_result.get('message', 'Unknown error')}")

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=300
)
def download_file(filename: str) -> str:
    """Download a specific file from the volume."""
    import os
    
    filepath = os.path.join(VOLUME_PATH, filename)
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        return content
    else:
        return f"File {filename} not found"

@app.local_entrypoint()
def get_master_inventory():
    """Get the master inventory content."""
    print("Fetching master inventory...")
    content = download_file.remote("master_inventory.csv")
    print("Master inventory content:")
    print(content[:1000] + "..." if len(content) > 1000 else content)

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=600
)
def analyze_column_patterns() -> dict:
    """Analyze column patterns across all files."""
    import os
    from collections import Counter
    
    try:
        # Read master inventory
        master_path = os.path.join(VOLUME_PATH, 'master_inventory.csv')
        df = pd.read_csv(master_path)
        
        # Analyze column counts
        column_counts = Counter(df['columns'])
        
        # Analyze unique column names
        all_columns = []
        for col_names_str in df['column_names'].dropna():
            try:
                cols = eval(col_names_str)  # Convert string back to list
                all_columns.extend(cols)
            except:
                continue
        
        column_frequency = Counter(all_columns)
        
        return {
            "status": "success",
            "total_files": len(df),
            "column_count_distribution": dict(column_counts),
            "most_common_columns": dict(column_frequency.most_common(20)),
            "unique_columns": len(column_frequency)
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}

@app.local_entrypoint()
def analyze_columns():
    """Analyze column patterns across all census files."""
    print("Analyzing column patterns...")
    result = analyze_column_patterns.remote()
    
    if result["status"] == "success":
        print(f"\n=== Column Analysis Results ===")
        print(f"Total files analyzed: {result['total_files']}")
        print(f"Unique column names found: {result['unique_columns']}")
        
        print(f"\nColumn count distribution:")
        for count, files in sorted(result["column_count_distribution"].items()):
            print(f"  {count} columns: {files} files")
        
        print(f"\nMost common column names:")
        for col, freq in result["most_common_columns"].items():
            print(f"  '{col}': {freq} files")
    else:
        print(f"Error: {result.get('message', 'Unknown error')}")

if __name__ == "__main__":
    main()