File size: 4,562 Bytes
46f2cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import re
import os

def clean_value(val):
    """Normalize a raw cell value into a clean string, or None.

    Drops NaN/None-like values, strips surrounding whitespace, removes a
    leading numeric code prefix such as "13103_", and unwraps stringified
    numpy scalar reprs like "np.int64(5)" down to their inner value.
    """
    if pd.isna(val):
        return None
    text = str(val).strip()
    # Strip a leading "<digits>_" code prefix.
    text = re.sub(r'^\d+_', '', text)
    # Unwrap stringified numpy scalars, e.g. "np.float64(1.5)" -> "1.5".
    text = re.sub(r'^np\.(int|float)\d*\((.+)\)$', r'\2', text)
    # Treat empty and textual-null values as missing.
    if not text or text.lower() in ('nan', 'none', ''):
        return None
    return text

def find_header_row(df):
    """
    Scan the first rows of *df* for the one that looks like real column
    headers (mostly non-numeric strings, few "Unnamed" placeholders).

    Returns:
        (header_row_index, data_start_row_index), or (None, None) when no
        candidate is found within the first 20 rows.
    """

    def _looks_textual(raw):
        # Headers are usually strings, not pure numbers.
        stripped = raw.strip()
        digits_only = (stripped.replace('.', '').replace(',', '')
                               .replace('-', '').replace(' ', ''))
        return bool(stripped) and not digits_only.isdigit()

    width = len(df.columns)
    for idx in range(min(20, len(df))):
        candidate = df.iloc[idx]
        filled = candidate.count()

        # Too sparse to be a header row.
        if filled < width * 0.3:
            continue

        values = [str(v) for v in candidate if pd.notna(v)]

        # Mostly pandas "Unnamed" placeholders -> not a real header row.
        if sum("Unnamed" in v for v in values) > filled * 0.3:
            continue

        # If most filled values read as text, declare this the header row.
        if sum(_looks_textual(v) for v in values) >= filled * 0.5:
            return idx, idx + 1

    return None, None

def process_local_file(file_path, output_path="clean_sample.csv",
                       dest_header_row=7, meta_header_row=9,
                       data_start_row=10, n_meta_cols=4):
    """Read a raw CSV export with split header rows and write a cleaned CSV.

    The raw file carries its headers on two different rows:
      * row ``meta_header_row``: origin/metadata column names
        (first ``n_meta_cols`` columns)
      * row ``dest_header_row``: destination-area names (remaining columns,
        emitted with a ``Dest_`` prefix)
    Actual data begins at ``data_start_row``.

    Args:
        file_path: path to the raw CSV to clean.
        output_path: where the cleaned CSV is written
            (default preserves the original hard-coded "clean_sample.csv").
        dest_header_row: 0-based row index of the destination headers.
        meta_header_row: 0-based row index of the metadata headers.
        data_start_row: 0-based row index where data rows begin.
        n_meta_cols: number of leading metadata columns.

    Errors are reported (with traceback) rather than raised, so a bad file
    does not crash the caller.
    """
    print(f"Processing {file_path}...")

    try:
        # Read just enough rows to capture both header rows
        # (was a magic nrows=15; now derived from the row indices).
        header_rows_needed = max(dest_header_row, meta_header_row) + 1
        df_headers = pd.read_csv(file_path, header=None,
                                 nrows=header_rows_needed, low_memory=False)

        headers = []

        # Leading metadata columns come from the meta header row.
        meta_row = df_headers.iloc[meta_header_row]
        for i in range(n_meta_cols):
            val = clean_value(meta_row.iloc[i])
            headers.append(val if val else f"Meta_{i}")

        # Remaining columns come from the destination header row;
        # prefix marks them as destination columns.
        dest_row = df_headers.iloc[dest_header_row]
        for i in range(n_meta_cols, len(dest_row)):
            val = clean_value(dest_row.iloc[i])
            headers.append(f"Dest_{val}" if val else f"Col_{i}")

        # Re-read the whole file, skipping the header block.
        df = pd.read_csv(file_path, header=None, skiprows=data_start_row,
                         low_memory=False)

        # Reconcile header count with the actual data width.
        if len(df.columns) != len(headers):
            print(f"Column count mismatch: Data has {len(df.columns)}, Headers have {len(headers)}")
            if len(df.columns) < len(headers):
                headers = headers[:len(df.columns)]
            else:
                headers += [f"Extra_{i}" for i in range(len(headers), len(df.columns))]

        # Suffix duplicate names (_1, _2, ...) so every label is unique.
        unique_headers = []
        seen_headers = {}
        for h in headers:
            if h in seen_headers:
                seen_headers[h] += 1
                unique_headers.append(f"{h}_{seen_headers[h]}")
            else:
                seen_headers[h] = 0
                unique_headers.append(h)

        df.columns = unique_headers

        # Normalize string cells only; numeric dtypes are left untouched.
        for col in df.columns:
            if df[col].dtype == 'object':
                df[col] = df[col].apply(clean_value)

        df.to_csv(output_path, index=False)
        print(f"Saved clean CSV to {output_path}")
        print(f"Shape: {df.shape}")
        print("\nFirst 5 rows:")
        # Print only the first 5 columns to keep the preview readable.
        print(df.iloc[:5, :5].to_markdown(index=False))

    except Exception as e:
        # Best-effort: report and continue rather than crash the caller.
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()

def main():
    """CLI entry point: clean the bundled sample export."""
    process_local_file("sample.csv")


if __name__ == "__main__":
    main()