Spaces:
Runtime error
Runtime error
File size: 3,445 Bytes
b251424 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import modal
import os
import glob
# Modal application object; the functions in this file register themselves on it.
app = modal.App("data-cleanup")
# Define volumes
# Named Modal volumes looked up by name; both are mounted by cleanup_volume.
vol_census = modal.Volume.from_name("census-data")
vol_economy = modal.Volume.from_name("economy-labor-data")
# Container image based on Modal's Debian slim image; nothing extra is installed here.
image = modal.Image.debian_slim()
def _find_superseded_csvs(filenames: list) -> list:
    """Return the plain ``ID.csv`` names that also exist as ``ID_Title.csv``.

    A CSV whose stem contains an underscore is treated as the newer
    ``ID_Title`` form, where the ID is everything before the *first*
    underscore (titles may contain further underscores).  A CSV whose stem
    has no underscore is treated as the older ``ID`` form.  The older file is
    reported when a newer file with the same ID is present in *filenames*.

    Plain files are kept in a list rather than a dict keyed by stem so that
    two names differing only in extension case (``ID.csv`` / ``ID.CSV``) are
    both reported instead of one silently shadowing the other.
    """
    titled_ids = set()
    plain_csvs = []  # (stem, filename) pairs, in directory-listing order
    for filename in filenames:
        if not filename.lower().endswith('.csv'):
            continue
        stem, _ext = os.path.splitext(filename)
        dataset_id, separator, _title = stem.partition('_')
        if separator:
            titled_ids.add(dataset_id)
        else:
            plain_csvs.append((stem, filename))
    return [filename for stem, filename in plain_csvs if stem in titled_ids]


def _remove_files(directory: str, filenames: list) -> int:
    """Delete each named file inside *directory* and return how many were removed.

    Deletion is best-effort: a failure is printed and the remaining files are
    still processed.
    """
    removed = 0
    for filename in filenames:
        try:
            os.remove(os.path.join(directory, filename))
        except OSError as e:
            print(f"Error deleting {filename}: {e}")
        else:
            removed += 1
    return removed


@app.function(
    image=image,
    volumes={
        "/data/census": vol_census,
        "/data/economy": vol_economy,
    },
    timeout=600,
)
def cleanup_volume(root_path: str, volume_name: str) -> None:
    """Remove Excel files and superseded CSVs under *root_path*, then commit.

    Walks the directory tree at *root_path* and, in every directory:

    1. deletes every ``.xls`` / ``.xlsx`` file (case-insensitive match);
    2. deletes each ``ID.csv`` for which an ``ID_Title.csv`` exists in the
       same directory.

    Args:
        root_path: Directory to clean; expected to be one of the mount points
            declared in the decorator.
        volume_name: Name of the volume being cleaned.  ``"census-data"``
            commits the census volume; any other value commits the economy
            volume.

    Returns:
        None.  A summary of the deletions is printed.
    """
    print(f"🧹 Cleaning up {volume_name} at {root_path}...")
    deleted_excel = 0
    deleted_duplicates = 0

    for root, _dirs, files in os.walk(root_path):
        # 1. Delete Excel files
        excel_files = [f for f in files if f.lower().endswith(('.xls', '.xlsx'))]
        deleted_excel += _remove_files(root, excel_files)

        # 2. Delete duplicate CSVs: if 'ID_Title.csv' exists, delete 'ID.csv'
        deleted_duplicates += _remove_files(root, _find_superseded_csvs(files))

    # Commit changes (needed for Modal Volumes)
    if volume_name == "census-data":
        vol_census.commit()
    else:
        vol_economy.commit()

    print(f"✅ {volume_name}: Deleted {deleted_excel} Excel files and {deleted_duplicates} duplicate CSVs.")
@app.local_entrypoint()
def main():
    """Run the cleanup remotely for the census volume, then the economy volume."""
    # (mount path inside the container, volume name) for each cleanup run
    targets = (
        ("/data/census", "census-data"),
        ("/data/economy", "economy-labor-data"),
    )
    for mount_path, name in targets:
        cleanup_volume.remote(mount_path, name)
|