# File size: 2,882 Bytes
# b251424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import modal
import os
import urllib.parse

# Modal application object; the functions and the local entrypoint below are
# registered on it through its decorators.
app = modal.App("fix-csv-filenames")

# Volumes
# Looked up by name; each one is attached to its own fix_* function below.
census_volume = modal.Volume.from_name("census-data")
economy_volume = modal.Volume.from_name("economy-labor-data")

# Plain debian_slim image with nothing added on top; this script only uses
# the standard library inside the container.
image = modal.Image.debian_slim()

def clean_filename(filename: str) -> str:
    """Return ``filename`` with URL-encoding and header junk removed.

    Handles names that look like a (possibly URL-encoded) ``Content-Disposition``
    value, e.g. ``attachment%3B%20filename*%3DUTF-8%27%27a01e.csv`` becomes
    ``a01e.csv``.  Names without any of that junk are returned unchanged.

    The result is always a bare file name: any directory part that appears
    after decoding (``%2F`` -> ``/``) is dropped, so joining the result onto a
    directory can never point outside that directory.

    Args:
        filename: The file name as found on disk (no directory part).

    Returns:
        The cleaned file name.  May be empty if nothing is left after cleaning.
    """
    # 1. Unquote URL encoding
    # e.g. attachment%3B%20filename*%3DUTF-8%27%27a01e... -> attachment; filename*=UTF-8''a01e...
    cleaned = urllib.parse.unquote(filename)

    # 2. Remove common garbage prefixes (most specific first; each one is
    #    checked in turn against whatever is left).
    prefixes = (
        "attachment; filename*=UTF-8''",
        "attachment; filename=",
        "attachment;",
    )
    for prefix in prefixes:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):]

    # 3. Keep only the last path component. Decoding can introduce "/" (from
    #    %2F); without this the caller's os.path.join(root, name) could resolve
    #    to another directory or, for a leading "/", to an absolute path.
    cleaned = cleaned.rsplit("/", 1)[-1]

    # 4. Clean up any remaining quotes or whitespace
    return cleaned.strip('"\' ')

def process_volume(volume_path: str, volume_obj: modal.Volume) -> dict:
    """Rename garbage-named CSV files under ``volume_path``, then commit.

    Walks ``volume_path`` recursively and passes every file name through
    ``clean_filename``.  A file is renamed in place (same directory) when the
    cleaned name ends in ``.csv`` and differs from the current name.  Existing
    targets are never overwritten; such files are reported and left alone.

    Args:
        volume_path: Directory to scan.
        volume_obj: Volume whose ``commit()`` is called once after the scan.

    Returns:
        ``{"renamed": int, "errors": int}`` - files successfully renamed and
        renames that failed.  Skipped files are counted in neither.
    """
    renamed_count = 0
    errors = 0

    print(f"Scanning {volume_path}...")

    for root, _, files in os.walk(volume_path):
        for filename in files:
            new_name = clean_filename(filename)

            # Filter on the *cleaned* name: a raw name such as
            # '...foo.csv%22' or '...foo.csv"' does not end in '.csv' itself,
            # but cleaning turns it into 'foo.csv'.
            if not new_name.lower().endswith('.csv'):
                continue
            if new_name == filename:
                continue

            old_path = os.path.join(root, filename)
            new_path = os.path.join(root, new_name)

            # Avoid overwriting an existing file (os.rename would replace it
            # silently on POSIX).
            if os.path.exists(new_path):
                print(f"Skipping rename {filename} -> {new_name} (Target exists)")
                continue

            try:
                os.rename(old_path, new_path)
            except (OSError, ValueError) as e:
                # OSError: filesystem-level failure. ValueError: name that the
                # OS layer rejects outright (e.g. an embedded NUL from '%00').
                # Either way keep going with the remaining files.
                print(f"Error renaming {filename}: {e}")
                errors += 1
            else:
                renamed_count += 1

    volume_obj.commit()
    return {"renamed": renamed_count, "errors": errors}

@app.function(image=image, volumes={"/data/census": census_volume})
def fix_census():
    """Clean up CSV file names on the census volume.

    Returns the summary dict produced by ``process_volume`` for the
    ``/data/census`` path.
    """
    summary = process_volume("/data/census", census_volume)
    return summary

@app.function(image=image, volumes={"/data/economy": economy_volume})
def fix_economy():
    """Clean up CSV file names on the economy volume.

    Returns the summary dict produced by ``process_volume`` for the
    ``/data/economy`` path.
    """
    summary = process_volume("/data/economy", economy_volume)
    return summary

@app.local_entrypoint()
def main():
    """Run both clean-up functions remotely, one after the other, and print
    the renamed/error counts each one returns."""
    # Census volume first.
    print("Fixing Census filenames...")
    census_summary = fix_census.remote()
    print(f"Census: Renamed {census_summary['renamed']} files. Errors: {census_summary['errors']}")

    # Then the economy volume.
    print("Fixing Economy filenames...")
    economy_summary = fix_economy.remote()
    print(f"Economy: Renamed {economy_summary['renamed']} files. Errors: {economy_summary['errors']}")