Spaces:
Runtime error
Runtime error
File size: 4,697 Bytes
7e1fb9b b251424 7e1fb9b b251424 7e1fb9b b251424 7e1fb9b b251424 7e1fb9b b251424 7e1fb9b b251424 7e1fb9b b251424 7e1fb9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import modal
import os
# Create Modal app
app = modal.App("census-csv-converter")
# Use the same volume where census data was downloaded
volume = modal.Volume.from_name("census-data")
# Define image with pandas and openpyxl for Excel processing
image = modal.Image.debian_slim().pip_install(
"pandas",
"openpyxl",
"xlrd"
)
VOLUME_PATH = "/data"
@app.function(
image=image,
volumes={VOLUME_PATH: volume},
timeout=600,
max_containers=100,
)
def convert_to_csv(file_path: str) -> dict:
"""Converts a single Excel file to CSV and saves it in a 'csv' subfolder with a human-readable name."""
import pandas as pd
import re
try:
# Determine directory
directory, filename = os.path.split(file_path)
name, ext = os.path.splitext(filename)
if ext.lower() not in ['.xls', '.xlsx']:
return {"file": file_path, "status": "skipped", "reason": "Not an Excel file"}
# Create 'csv' subfolder
csv_dir = os.path.join(directory, "csv")
os.makedirs(csv_dir, exist_ok=True)
# 1. Extract Title for Filename
# Read first few rows to find a title
try:
df_meta = pd.read_excel(file_path, header=None, nrows=10)
title = None
# Search for the first long string which is likely the title
for val in df_meta.values.flatten():
if isinstance(val, str) and len(val) > 5:
title = val
break
if title:
# Sanitize title
# Remove invalid chars
clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
# Replace whitespace with underscores
clean_title = re.sub(r'\s+', "_", clean_title)
# Limit length and strip
clean_title = clean_title.strip()[:100]
# Use ID + Title to ensure uniqueness and readability
csv_filename = f"{name}_{clean_title}.csv"
else:
csv_filename = f"{name}.csv"
except Exception as e:
print(f"Warning: Could not extract title from {filename}: {e}")
csv_filename = f"{name}.csv"
csv_path = os.path.join(csv_dir, csv_filename)
# Check if CSV already exists
if os.path.exists(csv_path):
return {"file": file_path, "status": "skipped", "reason": "CSV already exists", "csv_path": csv_path}
# 2. Convert Content
# Read full file
df = pd.read_excel(file_path)
# Save as CSV
df.to_csv(csv_path, index=False, encoding='utf-8')
# Commit changes
volume.commit()
return {"file": file_path, "status": "success", "csv_path": csv_path}
except Exception as e:
return {"file": file_path, "status": "error", "message": str(e)}
@app.function(image=image, volumes={VOLUME_PATH: volume})
def list_excel_files() -> list:
"""Lists all Excel files in the volume."""
excel_files = []
for root, dirs, files in os.walk(VOLUME_PATH):
# Skip existing csv directories to avoid recursion if we re-run
if 'csv' in dirs:
# Don't traverse into csv directories
# Note: os.walk allows modifying dirs in-place to prune traversal
pass
for file in files:
if file.lower().endswith(('.xls', '.xlsx')):
excel_files.append(os.path.join(root, file))
return excel_files
@app.local_entrypoint()
def main():
"""Main function to orchestrate CSV conversion."""
print("Scanning for Excel files in census-data volume...")
files = list_excel_files.remote()
print(f"Found {len(files)} Excel files.")
if not files:
print("No files to convert.")
return
print("Starting conversion...")
results = list(convert_to_csv.map(files))
# Summary
success = sum(1 for r in results if r["status"] == "success")
skipped = sum(1 for r in results if r["status"] == "skipped")
errors = sum(1 for r in results if r["status"] == "error")
print(f"\n=== Conversion Summary ===")
print(f"Total files processed: {len(results)}")
print(f"Successfully converted: {success}")
print(f"Skipped: {skipped}")
print(f"Errors: {errors}")
if errors > 0:
print(f"\nFailed Conversions:")
for r in results:
if r["status"] == "error":
print(f" - {r['file']}: {r.get('message', 'Unknown error')}")
print("\n✅ Conversion complete!")
|