Spaces:
Runtime error
Runtime error
File size: 2,882 Bytes
b251424 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import modal
import os
import urllib.parse
# Modal app object; the remote functions and the local entrypoint in this
# file are registered on it through its decorators.
app = modal.App("fix-csv-filenames")
# Volumes
# Each volume is looked up by name and mounted into one remote function
# through the `volumes=` argument of `@app.function`.
census_volume = modal.Volume.from_name("census-data")
economy_volume = modal.Volume.from_name("economy-labor-data")
# Container image passed as `image=` to every `@app.function` in this file.
image = modal.Image.debian_slim()
def clean_filename(filename: str) -> str:
    """Clean up a filename by removing Content-Disposition garbage prefixes.

    The name is URL-decoded, a leading ``attachment; filename...`` prefix is
    removed (matched case-insensitively), any directory components introduced
    by decoding are dropped, and surrounding quotes/spaces are stripped.

    Args:
        filename: Bare file name (no directory part), possibly URL-encoded.

    Returns:
        The cleaned base name. If cleaning would leave an empty string, the
        original ``filename`` is returned unchanged so callers comparing the
        result with the input simply see "nothing to rename".
    """
    strip_chars = '"\' '

    # 1. Unquote URL encoding
    # e.g. attachment%3B%20filename*%3DUTF-8%27%27a01e... -> attachment; filename*=UTF-8''a01e...
    cleaned = urllib.parse.unquote(filename)

    # 2. Remove common garbage prefixes. Longest first, so the most specific
    #    form wins. Header parameter names and the charset label are
    #    case-insensitive, so compare lowercased text but slice the original.
    prefixes = (
        "attachment; filename*=UTF-8''",
        "attachment; filename=",
        "attachment;",
    )
    for prefix in prefixes:
        if cleaned[:len(prefix)].lower() == prefix.lower():
            cleaned = cleaned[len(prefix):]

    # 3. Clean up any remaining quotes or whitespace
    cleaned = cleaned.strip(strip_chars)

    # 4. Decoding may have produced path separators (e.g. %2F or ../). Keep
    #    only the final component so the result always names a file in the
    #    same directory as the input.
    cleaned = os.path.basename(cleaned).strip(strip_chars)

    # Never hand back an empty name; leave such files untouched instead.
    return cleaned or filename
def process_volume(volume_path: str, volume_obj: modal.Volume) -> dict:
    """Rename garbage-prefixed CSV files under ``volume_path``, then commit.

    Walks ``volume_path`` recursively. Every file whose name ends in ``.csv``
    (case-insensitive) is passed through ``clean_filename``; if the cleaned
    name differs, the file is renamed in place within its own directory.

    Args:
        volume_path: Directory to scan (the mount point of the volume).
        volume_obj: Volume whose ``commit()`` is called once after the scan.

    Returns:
        A dict with the counts ``"renamed"``, ``"errors"`` (renames that
        raised) and ``"skipped"`` (renames not attempted because the target
        already exists or the cleaned name is not a plain file name).
    """
    renamed_count = 0
    skipped = 0
    errors = 0
    print(f"Scanning {volume_path}...")
    for root, _, files in os.walk(volume_path):
        for filename in files:
            if not filename.lower().endswith('.csv'):
                continue
            new_name = clean_filename(filename)
            if new_name == filename:
                continue

            # The cleaned name comes from URL-decoded text, so it may be empty
            # or contain path separators; joining that onto ``root`` would
            # target another directory. Refuse anything but a bare file name.
            if not new_name or os.path.basename(new_name) != new_name:
                print(f"Skipping rename {filename} -> {new_name} (Unsafe target name)")
                skipped += 1
                continue

            old_path = os.path.join(root, filename)
            new_path = os.path.join(root, new_name)

            # Avoid overwriting an existing file: os.rename would silently
            # replace it on POSIX.
            if os.path.exists(new_path):
                print(f"Skipping rename {filename} -> {new_name} (Target exists)")
                skipped += 1
                continue

            try:
                os.rename(old_path, new_path)
            except (OSError, ValueError) as e:
                # OSError: filesystem failure. ValueError: the decoded name
                # contains an embedded NUL byte. Keep going either way.
                print(f"Error renaming {filename}: {e}")
                errors += 1
            else:
                renamed_count += 1

    # Single commit after the whole scan.
    volume_obj.commit()
    return {"renamed": renamed_count, "errors": errors, "skipped": skipped}
@app.function(image=image, volumes={"/data/census": census_volume})
def fix_census():
    """Run the CSV filename cleanup over the census volume mounted at /data/census."""
    summary = process_volume("/data/census", census_volume)
    return summary
@app.function(image=image, volumes={"/data/economy": economy_volume})
def fix_economy():
    """Run the CSV filename cleanup over the economy volume mounted at /data/economy."""
    summary = process_volume("/data/economy", economy_volume)
    return summary
@app.local_entrypoint()
def main():
    """Local entrypoint: fix the census volume, then the economy volume.

    Each remote function is invoked in turn and its rename/error counts are
    printed once it returns.
    """
    # Census volume
    print("Fixing Census filenames...")
    census_res = fix_census.remote()
    print(f"Census: Renamed {census_res['renamed']} files. Errors: {census_res['errors']}")

    # Economy volume
    print("Fixing Economy filenames...")
    economy_res = fix_economy.remote()
    print(f"Economy: Renamed {economy_res['renamed']} files. Errors: {economy_res['errors']}")
|