Spaces:

MCP-1st-Birthday
/

sdlc-agent

Runtime error

App Files Files Community

sdlc-agent / src /data /convert_census_to_csv.py

Veeru-c

initial commit

06bd253 22 days ago

raw

history blame contribute delete

4.7 kB

	import modal
	import os

	# Create Modal app
	app = modal.App("census-csv-converter")

	# Use the same volume where census data was downloaded
	volume = modal.Volume.from_name("census-data")

	# Define image with pandas and openpyxl for Excel processing
	image = modal.Image.debian_slim().pip_install(
	"pandas",
	"openpyxl",
	"xlrd"
	)

	VOLUME_PATH = "/data"

	@app.function(
	image=image,
	volumes={VOLUME_PATH: volume},
	timeout=600,
	max_containers=100,
	)
	def convert_to_csv(file_path: str) -> dict:
	"""Converts a single Excel file to CSV and saves it in a 'csv' subfolder with a human-readable name."""
	import pandas as pd
	import re

	try:
	# Determine directory
	directory, filename = os.path.split(file_path)
	name, ext = os.path.splitext(filename)

	if ext.lower() not in ['.xls', '.xlsx']:
	return {"file": file_path, "status": "skipped", "reason": "Not an Excel file"}

	# Create 'csv' subfolder
	csv_dir = os.path.join(directory, "csv")
	os.makedirs(csv_dir, exist_ok=True)

	# 1. Extract Title for Filename
	# Read first few rows to find a title
	try:
	df_meta = pd.read_excel(file_path, header=None, nrows=10)
	title = None
	# Search for the first long string which is likely the title
	for val in df_meta.values.flatten():
	if isinstance(val, str) and len(val) > 5:
	title = val
	break

	if title:
	# Sanitize title
	# Remove invalid chars
	clean_title = re.sub(r'[\\/*?:"<>\|]', "", title)
	# Replace whitespace with underscores
	clean_title = re.sub(r'\s+', "_", clean_title)
	# Limit length and strip
	clean_title = clean_title.strip()[:100]

	# Use ID + Title to ensure uniqueness and readability
	csv_filename = f"{name}_{clean_title}.csv"
	else:
	csv_filename = f"{name}.csv"

	except Exception as e:
	print(f"Warning: Could not extract title from {filename}: {e}")
	csv_filename = f"{name}.csv"

	csv_path = os.path.join(csv_dir, csv_filename)

	# Check if CSV already exists
	if os.path.exists(csv_path):
	return {"file": file_path, "status": "skipped", "reason": "CSV already exists", "csv_path": csv_path}

	# 2. Convert Content
	# Read full file
	df = pd.read_excel(file_path)

	# Save as CSV
	df.to_csv(csv_path, index=False, encoding='utf-8')

	# Commit changes
	volume.commit()

	return {"file": file_path, "status": "success", "csv_path": csv_path}

	except Exception as e:
	return {"file": file_path, "status": "error", "message": str(e)}

	@app.function(image=image, volumes={VOLUME_PATH: volume})
	def list_excel_files() -> list:
	"""Lists all Excel files in the volume."""
	excel_files = []
	for root, dirs, files in os.walk(VOLUME_PATH):
	# Skip existing csv directories to avoid recursion if we re-run
	if 'csv' in dirs:
	# Don't traverse into csv directories
	# Note: os.walk allows modifying dirs in-place to prune traversal
	pass

	for file in files:
	if file.lower().endswith(('.xls', '.xlsx')):
	excel_files.append(os.path.join(root, file))
	return excel_files

	@app.local_entrypoint()
	def main():
	"""Main function to orchestrate CSV conversion."""
	print("Scanning for Excel files in census-data volume...")
	files = list_excel_files.remote()
	print(f"Found {len(files)} Excel files.")

	if not files:
	print("No files to convert.")
	return

	print("Starting conversion...")
	results = list(convert_to_csv.map(files))

	# Summary
	success = sum(1 for r in results if r["status"] == "success")
	skipped = sum(1 for r in results if r["status"] == "skipped")
	errors = sum(1 for r in results if r["status"] == "error")

	print(f"\n=== Conversion Summary ===")
	print(f"Total files processed: {len(results)}")
	print(f"Successfully converted: {success}")
	print(f"Skipped: {skipped}")
	print(f"Errors: {errors}")

	if errors > 0:
	print(f"\nFailed Conversions:")
	for r in results:
	if r["status"] == "error":
	print(f" - {r['file']}: {r.get('message', 'Unknown error')}")

	print("\n✅ Conversion complete!")