File size: 2,383 Bytes
46f2cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import modal

# Modal app object; the decorators further down register the remote function
# and the local entrypoint on it.
app = modal.App("debug-parser")
# Modal volumes referenced by name. debug_single_file mounts them at
# /data/census and /data/economy respectively.
vol_census = modal.Volume.from_name("census-data")
vol_economy = modal.Volume.from_name("economy-labor-data")

# Container image: Debian slim base with pandas installed through pip.
image = modal.Image.debian_slim().pip_install("pandas")

@app.function(
    image=image,
    volumes={"/data/census": vol_census, "/data/economy": vol_economy},
)
def debug_single_file() -> None:
    """Print header-detection diagnostics for one census CSV file.

    Walks ``/data/census`` and picks the first ``.csv`` file in sorted
    order (so repeated runs inspect the same file), loads it without a
    header, prints the raw shape and the first five rows, and then scores
    each of the first 15 rows as a potential header row.

    A row is reported as the header when all of the following hold:

    * at least 30% of the columns are non-null,
    * no more than 30% of the non-null cells contain ``"Unnamed"``,
    * at least 50% of the non-null cells are non-numeric text.

    The scan stops at the first row that qualifies. Everything is reported
    through ``print``; nothing is returned.
    """
    import os

    import pandas as pd

    search_root = "/data/census"
    preview_rows = 5
    scan_rows = 15
    min_fill_ratio = 0.3      # row must populate this share of the columns
    max_unnamed_ratio = 0.3   # above this share of "Unnamed" cells -> skip
    min_label_ratio = 0.5     # share of text-like cells needed for a header

    # Characters stripped before testing whether a cell is purely numeric,
    # so values such as "1,234.5" or "2010-2020" are not counted as labels.
    numeric_noise = str.maketrans("", "", ".,- ")

    def looks_like_label(val) -> bool:
        """Return True for a non-null, non-empty cell that is not numeric."""
        if pd.isna(val):
            return False
        text = str(val).strip()
        return bool(text) and not text.translate(numeric_noise).isdigit()

    # Pick the first CSV in a deterministic order: os.walk yields entries in
    # arbitrary filesystem order, so sort both directories and file names.
    file_path = None
    for root, dirnames, filenames in os.walk(search_root):
        dirnames.sort()  # in-place sort steers the order of the walk itself
        csv_names = sorted(f for f in filenames if f.lower().endswith(".csv"))
        if csv_names:
            file_path = os.path.join(root, csv_names[0])
            break

    if not file_path:
        print("No CSV files found!")
        return

    print(f"Testing file: {file_path}")

    # Read raw: header=None keeps every line of the file as a data row so
    # that the real header row can be located below.
    df_raw = pd.read_csv(file_path, header=None, low_memory=False)
    print(f"\nRaw shape: {df_raw.shape}")
    print("\nFirst 5 rows:")
    for i in range(min(preview_rows, len(df_raw))):
        print(f"Row {i}: {df_raw.iloc[i].tolist()[:5]}")

    # Test header detection
    total_cols = len(df_raw.columns)
    rows_to_scan = min(scan_rows, len(df_raw))
    for i in range(rows_to_scan):
        row = df_raw.iloc[i]
        non_null_count = row.count()

        unnamed_count = sum(
            1 for val in row if pd.notna(val) and "Unnamed" in str(val)
        )
        header_like = sum(1 for val in row if looks_like_label(val))

        # Guard against all-null rows when computing the ratios.
        unnamed_ratio = unnamed_count / non_null_count if non_null_count > 0 else 0
        header_ratio = header_like / non_null_count if non_null_count > 0 else 0

        print(f"\nRow {i}: non_null={non_null_count}, unnamed={unnamed_count}, header_like={header_like}")
        print(f"  Ratios: unnamed={unnamed_ratio:.2f}, header={header_ratio:.2f}")

        # Rows that are too sparse are never header candidates.
        if non_null_count < total_cols * min_fill_ratio:
            continue

        # "\u2192" is a right arrow; written as an escape so the source
        # stays ASCII and cannot be mangled by a wrong file encoding.
        if unnamed_count > non_null_count * max_unnamed_ratio:
            print("  \u2192 SKIPPED (too many Unnamed)")
        elif header_like >= non_null_count * min_label_ratio:
            print("  \u2192 DETECTED AS HEADER ROW!")
            print(f"  \u2192 Headers: {row.tolist()[:10]}")
            break
    else:
        # Only reached when the loop finished without hitting `break`.
        print(f"\nNo header row detected in the first {rows_to_scan} rows")

@app.local_entrypoint()
def main() -> None:
    """Local entrypoint: invoke ``debug_single_file`` through ``.remote()``."""
    debug_single_file.remote()