Spaces:
Runtime error
Runtime error
| import modal | |
| import os | |
| import glob | |
# Modal application under which the cleanup job is registered.
app = modal.App("data-cleanup")

# Base container image for the remote cleanup function.
image = modal.Image.debian_slim()

# Handles to the named Modal volumes that hold the datasets to clean.
vol_economy = modal.Volume.from_name("economy-labor-data")
vol_census = modal.Volume.from_name("census-data")
def _remove_files(directory: str, filenames) -> int:
    """Delete each of *filenames* inside *directory* and return how many were removed.

    A failure on one file is reported and skipped so the remaining files are
    still processed.
    """
    removed = 0
    for filename in filenames:
        try:
            os.remove(os.path.join(directory, filename))
        except OSError as e:
            print(f"Error deleting {filename}: {e}")
        else:
            removed += 1
    return removed


def _superseded_csvs(filenames) -> list:
    """Return the 'ID.csv' names that also have an 'ID_Title.csv' sibling.

    The ID of a titled file is taken to be everything before the first
    underscore of its stem; a CSV whose stem has no underscore is treated as
    the plain 'ID.csv' form.
    """
    plain_by_id = {}
    titled_ids = set()
    for filename in filenames:
        if not filename.lower().endswith('.csv'):
            continue
        stem, _ = os.path.splitext(filename)
        file_id, separator, _title = stem.partition('_')
        if separator:
            titled_ids.add(file_id)
        else:
            plain_by_id[stem] = filename
    return [name for file_id, name in plain_by_id.items() if file_id in titled_ids]


# Registered as a Modal function so callers can use `cleanup_volume.remote(...)`.
# The mount points match the root paths that `main` passes in.
@app.function(
    image=image,
    volumes={"/data/census": vol_census, "/data/economy": vol_economy},
)
def cleanup_volume(root_path: str, volume_name: str):
    """Delete Excel files and superseded CSVs under *root_path*, then commit.

    In every directory below *root_path*:
      1. all ``.xls`` / ``.xlsx`` files are removed;
      2. every ``ID.csv`` that has an ``ID_Title.csv`` sibling in the same
         directory is removed.

    Args:
        root_path: Directory tree to clean.
        volume_name: Name of the volume to commit afterwards; one of
            ``"census-data"`` or ``"economy-labor-data"``.

    Raises:
        ValueError: If *volume_name* is not one of the known volumes. This is
            checked before any file is deleted.
    """
    volumes_by_name = {
        "census-data": vol_census,
        "economy-labor-data": vol_economy,
    }
    # Validate first: never delete files and then commit the wrong volume.
    if volume_name not in volumes_by_name:
        raise ValueError(
            f"Unknown volume {volume_name!r}; expected one of {sorted(volumes_by_name)}"
        )

    print(f"🧹 Cleaning up {volume_name} at {root_path}...")
    deleted_excel = 0
    deleted_duplicates = 0

    for root, _dirs, files in os.walk(root_path):
        # 1. Delete Excel files
        excel_files = [f for f in files if f.lower().endswith(('.xls', '.xlsx'))]
        deleted_excel += _remove_files(root, excel_files)

        # 2. Delete duplicate CSVs: if 'ID_Title.csv' exists, delete 'ID.csv'
        deleted_duplicates += _remove_files(root, _superseded_csvs(files))

    # Commit so the deletions are persisted to the Modal volume.
    volumes_by_name[volume_name].commit()

    print(f"✅ {volume_name}: Deleted {deleted_excel} Excel files and {deleted_duplicates} duplicate CSVs.")
# Registered as the app's local entrypoint so `modal run` invokes it with the
# app running, which the `.remote()` calls below require.
@app.local_entrypoint()
def main():
    """Run the cleanup remotely for the census and economy/labor volumes, in order."""
    targets = (
        ("/data/census", "census-data"),            # Census data
        ("/data/economy", "economy-labor-data"),    # Economy & labor data
    )
    for root_path, volume_name in targets:
        cleanup_volume.remote(root_path, volume_name)