Spaces:
Runtime error
Runtime error
| import modal | |
| import os | |
| # Create Modal app | |
| app = modal.App("census-csv-converter") | |
| # Use the same volume where census data was downloaded | |
| volume = modal.Volume.from_name("census-data") | |
| # Define image with pandas and openpyxl for Excel processing | |
| image = modal.Image.debian_slim().pip_install( | |
| "pandas", | |
| "openpyxl", | |
| "xlrd" | |
| ) | |
| VOLUME_PATH = "/data" | |
| def convert_to_csv(file_path: str) -> dict: | |
| """Converts a single Excel file to CSV and saves it in a 'csv' subfolder with a human-readable name.""" | |
| import pandas as pd | |
| import re | |
| try: | |
| # Determine directory | |
| directory, filename = os.path.split(file_path) | |
| name, ext = os.path.splitext(filename) | |
| if ext.lower() not in ['.xls', '.xlsx']: | |
| return {"file": file_path, "status": "skipped", "reason": "Not an Excel file"} | |
| # Create 'csv' subfolder | |
| csv_dir = os.path.join(directory, "csv") | |
| os.makedirs(csv_dir, exist_ok=True) | |
| # 1. Extract Title for Filename | |
| # Read first few rows to find a title | |
| try: | |
| df_meta = pd.read_excel(file_path, header=None, nrows=10) | |
| title = None | |
| # Search for the first long string which is likely the title | |
| for val in df_meta.values.flatten(): | |
| if isinstance(val, str) and len(val) > 5: | |
| title = val | |
| break | |
| if title: | |
| # Sanitize title | |
| # Remove invalid chars | |
| clean_title = re.sub(r'[\\/*?:"<>|]', "", title) | |
| # Replace whitespace with underscores | |
| clean_title = re.sub(r'\s+', "_", clean_title) | |
| # Limit length and strip | |
| clean_title = clean_title.strip()[:100] | |
| # Use ID + Title to ensure uniqueness and readability | |
| csv_filename = f"{name}_{clean_title}.csv" | |
| else: | |
| csv_filename = f"{name}.csv" | |
| except Exception as e: | |
| print(f"Warning: Could not extract title from {filename}: {e}") | |
| csv_filename = f"{name}.csv" | |
| csv_path = os.path.join(csv_dir, csv_filename) | |
| # Check if CSV already exists | |
| if os.path.exists(csv_path): | |
| return {"file": file_path, "status": "skipped", "reason": "CSV already exists", "csv_path": csv_path} | |
| # 2. Convert Content | |
| # Read full file | |
| df = pd.read_excel(file_path) | |
| # Save as CSV | |
| df.to_csv(csv_path, index=False, encoding='utf-8') | |
| # Commit changes | |
| volume.commit() | |
| return {"file": file_path, "status": "success", "csv_path": csv_path} | |
| except Exception as e: | |
| return {"file": file_path, "status": "error", "message": str(e)} | |
| def list_excel_files() -> list: | |
| """Lists all Excel files in the volume.""" | |
| excel_files = [] | |
| for root, dirs, files in os.walk(VOLUME_PATH): | |
| # Skip existing csv directories to avoid recursion if we re-run | |
| if 'csv' in dirs: | |
| # Don't traverse into csv directories | |
| # Note: os.walk allows modifying dirs in-place to prune traversal | |
| pass | |
| for file in files: | |
| if file.lower().endswith(('.xls', '.xlsx')): | |
| excel_files.append(os.path.join(root, file)) | |
| return excel_files | |
| def main(): | |
| """Main function to orchestrate CSV conversion.""" | |
| print("Scanning for Excel files in census-data volume...") | |
| files = list_excel_files.remote() | |
| print(f"Found {len(files)} Excel files.") | |
| if not files: | |
| print("No files to convert.") | |
| return | |
| print("Starting conversion...") | |
| results = list(convert_to_csv.map(files)) | |
| # Summary | |
| success = sum(1 for r in results if r["status"] == "success") | |
| skipped = sum(1 for r in results if r["status"] == "skipped") | |
| errors = sum(1 for r in results if r["status"] == "error") | |
| print(f"\n=== Conversion Summary ===") | |
| print(f"Total files processed: {len(results)}") | |
| print(f"Successfully converted: {success}") | |
| print(f"Skipped: {skipped}") | |
| print(f"Errors: {errors}") | |
| if errors > 0: | |
| print(f"\nFailed Conversions:") | |
| for r in results: | |
| if r["status"] == "error": | |
| print(f" - {r['file']}: {r.get('message', 'Unknown error')}") | |
| print("\n✅ Conversion complete!") | |