sdlc-agent / src /data /convert_census_to_csv.py
Veeru-c's picture
initial commit
06bd253
import modal
import os
# Create Modal app
app = modal.App("census-csv-converter")
# Use the same volume where census data was downloaded
volume = modal.Volume.from_name("census-data")
# Define image with pandas and openpyxl for Excel processing
image = modal.Image.debian_slim().pip_install(
"pandas",
"openpyxl",
"xlrd"
)
VOLUME_PATH = "/data"
@app.function(
image=image,
volumes={VOLUME_PATH: volume},
timeout=600,
max_containers=100,
)
def convert_to_csv(file_path: str) -> dict:
"""Converts a single Excel file to CSV and saves it in a 'csv' subfolder with a human-readable name."""
import pandas as pd
import re
try:
# Determine directory
directory, filename = os.path.split(file_path)
name, ext = os.path.splitext(filename)
if ext.lower() not in ['.xls', '.xlsx']:
return {"file": file_path, "status": "skipped", "reason": "Not an Excel file"}
# Create 'csv' subfolder
csv_dir = os.path.join(directory, "csv")
os.makedirs(csv_dir, exist_ok=True)
# 1. Extract Title for Filename
# Read first few rows to find a title
try:
df_meta = pd.read_excel(file_path, header=None, nrows=10)
title = None
# Search for the first long string which is likely the title
for val in df_meta.values.flatten():
if isinstance(val, str) and len(val) > 5:
title = val
break
if title:
# Sanitize title
# Remove invalid chars
clean_title = re.sub(r'[\\/*?:"<>|]', "", title)
# Replace whitespace with underscores
clean_title = re.sub(r'\s+', "_", clean_title)
# Limit length and strip
clean_title = clean_title.strip()[:100]
# Use ID + Title to ensure uniqueness and readability
csv_filename = f"{name}_{clean_title}.csv"
else:
csv_filename = f"{name}.csv"
except Exception as e:
print(f"Warning: Could not extract title from {filename}: {e}")
csv_filename = f"{name}.csv"
csv_path = os.path.join(csv_dir, csv_filename)
# Check if CSV already exists
if os.path.exists(csv_path):
return {"file": file_path, "status": "skipped", "reason": "CSV already exists", "csv_path": csv_path}
# 2. Convert Content
# Read full file
df = pd.read_excel(file_path)
# Save as CSV
df.to_csv(csv_path, index=False, encoding='utf-8')
# Commit changes
volume.commit()
return {"file": file_path, "status": "success", "csv_path": csv_path}
except Exception as e:
return {"file": file_path, "status": "error", "message": str(e)}
@app.function(image=image, volumes={VOLUME_PATH: volume})
def list_excel_files() -> list:
"""Lists all Excel files in the volume."""
excel_files = []
for root, dirs, files in os.walk(VOLUME_PATH):
# Skip existing csv directories to avoid recursion if we re-run
if 'csv' in dirs:
# Don't traverse into csv directories
# Note: os.walk allows modifying dirs in-place to prune traversal
pass
for file in files:
if file.lower().endswith(('.xls', '.xlsx')):
excel_files.append(os.path.join(root, file))
return excel_files
@app.local_entrypoint()
def main():
"""Main function to orchestrate CSV conversion."""
print("Scanning for Excel files in census-data volume...")
files = list_excel_files.remote()
print(f"Found {len(files)} Excel files.")
if not files:
print("No files to convert.")
return
print("Starting conversion...")
results = list(convert_to_csv.map(files))
# Summary
success = sum(1 for r in results if r["status"] == "success")
skipped = sum(1 for r in results if r["status"] == "skipped")
errors = sum(1 for r in results if r["status"] == "error")
print(f"\n=== Conversion Summary ===")
print(f"Total files processed: {len(results)}")
print(f"Successfully converted: {success}")
print(f"Skipped: {skipped}")
print(f"Errors: {errors}")
if errors > 0:
print(f"\nFailed Conversions:")
for r in results:
if r["status"] == "error":
print(f" - {r['file']}: {r.get('message', 'Unknown error')}")
print("\n✅ Conversion complete!")