File size: 4,697 Bytes
7e1fb9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b251424
7e1fb9b
b251424
 
7e1fb9b
b251424
7e1fb9b
 
 
 
 
 
b251424
7e1fb9b
 
b251424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e1fb9b
 
 
 
 
b251424
7e1fb9b
b251424
 
7e1fb9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import modal
import os

# Create Modal app
app = modal.App("census-csv-converter")

# Use the same volume where census data was downloaded
volume = modal.Volume.from_name("census-data")

# Define image with pandas and openpyxl for Excel processing
image = modal.Image.debian_slim().pip_install(
    "pandas",
    "openpyxl",
    "xlrd"
)

VOLUME_PATH = "/data"

@app.function(
    image=image,
    volumes={VOLUME_PATH: volume},
    timeout=600,
    max_containers=100,
)
def convert_to_csv(file_path: str) -> dict:
    """Converts a single Excel file to CSV and saves it in a 'csv' subfolder with a human-readable name."""
    import pandas as pd
    import re
    
    try:
        # Determine directory
        directory, filename = os.path.split(file_path)
        name, ext = os.path.splitext(filename)
        
        if ext.lower() not in ['.xls', '.xlsx']:
            return {"file": file_path, "status": "skipped", "reason": "Not an Excel file"}
            
        # Create 'csv' subfolder
        csv_dir = os.path.join(directory, "csv")
        os.makedirs(csv_dir, exist_ok=True)

        # 1. Extract Title for Filename
        # Read first few rows to find a title
        try:
            df_meta = pd.read_excel(file_path, header=None, nrows=10)
            title = None
            # Search for the first long string which is likely the title
            for val in df_meta.values.flatten():
                if isinstance(val, str) and len(val) > 5:
                    title = val
                    break
            
            if title:
                # Sanitize title
                # Remove invalid chars
                clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
                # Replace whitespace with underscores
                clean_title = re.sub(r'\s+', "_", clean_title)
                # Limit length and strip
                clean_title = clean_title.strip()[:100]
                
                # Use ID + Title to ensure uniqueness and readability
                csv_filename = f"{name}_{clean_title}.csv"
            else:
                csv_filename = f"{name}.csv"
                
        except Exception as e:
            print(f"Warning: Could not extract title from {filename}: {e}")
            csv_filename = f"{name}.csv"
        
        csv_path = os.path.join(csv_dir, csv_filename)
        
        # Check if CSV already exists
        if os.path.exists(csv_path):
             return {"file": file_path, "status": "skipped", "reason": "CSV already exists", "csv_path": csv_path}

        # 2. Convert Content
        # Read full file
        df = pd.read_excel(file_path)
        
        # Save as CSV
        df.to_csv(csv_path, index=False, encoding='utf-8')
        
        # Commit changes
        volume.commit()
        
        return {"file": file_path, "status": "success", "csv_path": csv_path}
        
    except Exception as e:
        return {"file": file_path, "status": "error", "message": str(e)}

@app.function(image=image, volumes={VOLUME_PATH: volume})
def list_excel_files() -> list:
    """Lists all Excel files in the volume."""
    excel_files = []
    for root, dirs, files in os.walk(VOLUME_PATH):
        # Skip existing csv directories to avoid recursion if we re-run
        if 'csv' in dirs:
            # Don't traverse into csv directories
            # Note: os.walk allows modifying dirs in-place to prune traversal
            pass 
            
        for file in files:
            if file.lower().endswith(('.xls', '.xlsx')):
                excel_files.append(os.path.join(root, file))
    return excel_files

@app.local_entrypoint()
def main():
    """Main function to orchestrate CSV conversion."""
    print("Scanning for Excel files in census-data volume...")
    files = list_excel_files.remote()
    print(f"Found {len(files)} Excel files.")
    
    if not files:
        print("No files to convert.")
        return

    print("Starting conversion...")
    results = list(convert_to_csv.map(files))
    
    # Summary
    success = sum(1 for r in results if r["status"] == "success")
    skipped = sum(1 for r in results if r["status"] == "skipped")
    errors = sum(1 for r in results if r["status"] == "error")
    
    print(f"\n=== Conversion Summary ===")
    print(f"Total files processed: {len(results)}")
    print(f"Successfully converted: {success}")
    print(f"Skipped: {skipped}")
    print(f"Errors: {errors}")
    
    if errors > 0:
        print(f"\nFailed Conversions:")
        for r in results:
            if r["status"] == "error":
                print(f"  - {r['file']}: {r.get('message', 'Unknown error')}")

    print("\n✅ Conversion complete!")