Spaces:

MCP-1st-Birthday
/

sdlc-agent

Runtime error

App Files Files Community

sdlc-agent / src /data /cleanup_data.py

Veeru-c

initial commit

06bd253 22 days ago

raw

history blame contribute delete

3.45 kB

	import modal
	import os
	import glob

	# Modal application under which the cleanup function is registered.
	app = modal.App("data-cleanup")

	# Container image for the remote function: stock Debian slim, nothing added.
	image = modal.Image.debian_slim()

	# Named Modal volumes that the cleanup function mounts and commits.
	vol_census = modal.Volume.from_name("census-data")
	vol_economy = modal.Volume.from_name("economy-labor-data")

	def _is_excel_file(filename: str) -> bool:
	    """Return True if *filename* ends in .xls or .xlsx (case-insensitive)."""
	    return filename.lower().endswith(('.xls', '.xlsx'))


	def _find_duplicate_csvs(filenames: list) -> list:
	    """Return the plain ``ID.csv`` names that also have an ``ID_Title.csv`` sibling.

	    Only names ending in ``.csv`` (case-insensitive) are considered. A stem
	    containing an underscore is treated as ``ID_Title`` and its ID is taken
	    to be everything before the FIRST underscore; a stem without one is
	    treated as a bare ``ID``.

	    NOTE(review): this assumes dataset IDs never contain underscores. If
	    they can, ``AB.csv`` would be reported as a duplicate of ``AB_12.csv``.
	    """
	    simple_by_id = {}        # ID -> filename of the bare "ID.csv"
	    ids_with_title = set()   # IDs for which some "ID_Title.csv" exists

	    for filename in filenames:
	        if not filename.lower().endswith('.csv'):
	            continue
	        stem, _ = os.path.splitext(filename)
	        dataset_id, separator, _title = stem.partition('_')
	        if separator:
	            ids_with_title.add(dataset_id)
	        else:
	            simple_by_id[stem] = filename

	    return [
	        filename
	        for dataset_id, filename in simple_by_id.items()
	        if dataset_id in ids_with_title
	    ]


	def _remove_file(directory: str, filename: str) -> bool:
	    """Delete ``directory/filename``; report failure and return False instead of raising."""
	    try:
	        os.remove(os.path.join(directory, filename))
	    except OSError as e:
	        # Best effort: one undeletable file must not abort the whole sweep.
	        print(f"Error deleting {filename}: {e}")
	        return False
	    return True


	@app.function(
	    image=image,
	    volumes={
	        "/data/census": vol_census,
	        "/data/economy": vol_economy,
	    },
	    timeout=600,
	)
	def cleanup_volume(root_path: str, volume_name: str) -> None:
	    """Remove Excel files and superseded ``ID.csv`` files under *root_path*.

	    Walks *root_path* recursively and, directory by directory:

	    1. deletes every ``.xls`` / ``.xlsx`` file;
	    2. deletes ``ID.csv`` whenever an ``ID_Title.csv`` exists in the same
	       directory (see ``_find_duplicate_csvs`` for how the ID is derived).

	    Afterwards the matching volume is committed and a summary is printed.
	    Individual deletion failures are printed and skipped; they are not
	    counted in the summary.

	    Args:
	        root_path: Directory inside the container to clean.
	        volume_name: ``"census-data"`` commits ``vol_census``; any other
	            value commits ``vol_economy`` (a warning is printed if the
	            value is not ``"economy-labor-data"``).
	    """
	    print(f"🧹 Cleaning up {volume_name} at {root_path}...")

	    deleted_excel = 0
	    deleted_duplicates = 0

	    for root, _dirs, files in os.walk(root_path):
	        # 1. Delete Excel files
	        for filename in files:
	            if _is_excel_file(filename) and _remove_file(root, filename):
	                deleted_excel += 1

	        # 2. Delete duplicate CSVs: if 'ID_Title.csv' exists, delete 'ID.csv'
	        for filename in _find_duplicate_csvs(files):
	            if _remove_file(root, filename):
	                deleted_duplicates += 1

	    # Commit changes (needed for Modal Volumes)
	    volumes_by_name = {
	        "census-data": vol_census,
	        "economy-labor-data": vol_economy,
	    }
	    target_volume = volumes_by_name.get(volume_name)
	    if target_volume is None:
	        # Keep the historical fallback, but make it visible instead of silent.
	        print(f"⚠️ Unknown volume name {volume_name!r}; committing economy-labor-data as fallback.")
	        target_volume = vol_economy
	    target_volume.commit()

	    print(f"✅ {volume_name}: Deleted {deleted_excel} Excel files and {deleted_duplicates} duplicate CSVs.")

	@app.local_entrypoint()
	def main():
	    """Invoke the remote cleanup for the census volume, then the economy & labor volume."""
	    # (mount path inside the container, volume name) — processed in this order.
	    cleanup_targets = (
	        ("/data/census", "census-data"),
	        ("/data/economy", "economy-labor-data"),
	    )
	    for mount_path, name in cleanup_targets:
	        cleanup_volume.remote(mount_path, name)