import pandas as pd
import re
import os


def clean_value(val):
    """Clean and normalize values"""
    if pd.isna(val):
        return None
    val_str = str(val).strip()
    # Remove leading codes like "13103_"
    val_str = re.sub(r'^\d+_', '', val_str)
    # Remove numpy type wrappers
    val_str = re.sub(r'^np\.(int|float)\d*\((.+)\)$', r'\2', val_str)
    return val_str if val_str and val_str.lower() not in ['nan', 'none', ''] else None


def find_header_row(df):
    """
    Intelligently find the row that contains real column headers.
    Returns: (header_row_index, data_start_row_index)
    """
    for i in range(min(20, len(df))):
        row = df.iloc[i]

        # Count non-null values
        non_null_count = row.count()
        if non_null_count < len(df.columns) * 0.3:
            continue  # Skip sparse rows

        # Check if this row has "Unnamed" in most values (skip these)
        unnamed_count = sum(1 for val in row if pd.notna(val) and "Unnamed" in str(val))
        if unnamed_count > non_null_count * 0.3:
            continue  # Skip rows with too many "Unnamed"

        # Count how many values look like headers (strings, not numbers)
        header_like = 0
        for val in row:
            if pd.notna(val):
                val_str = str(val).strip()
                # Headers are usually strings, not pure numbers
                if val_str and not val_str.replace('.', '').replace(',', '').replace('-', '').replace(' ', '').isdigit():
                    header_like += 1

        # If most non-null values look like headers, this is likely the header row
        if header_like >= non_null_count * 0.5:
            return i, i + 1

    return None, None


def process_local_file(file_path):
    print(f"Processing {file_path}...")

    try:
        # Read the file to extract headers from specific rows.
        # Based on inspection:
        #   Row index 7: Destination Areas (cols 4+)
        #   Row index 9: Origin Areas metadata (cols 0-3)

        # Read the first 15 rows to get the header rows
        df_headers = pd.read_csv(file_path, header=None, nrows=15, low_memory=False)

        # Construct headers
        headers = []

        # Cols 0-3 from row index 9
        row9 = df_headers.iloc[9]
        for i in range(4):
            val = clean_value(row9[i])
            headers.append(val if val else f"Meta_{i}")

        # Cols 4+ from row index 7
        row7 = df_headers.iloc[7]
        for i in range(4, len(row7)):
            val = clean_value(row7[i])
            # Add a prefix to indicate it's a destination column
            headers.append(f"Dest_{val}" if val else f"Col_{i}")

        # Read the data starting from row index 10:
        # re-read the whole file, skipping the first 10 rows
        df = pd.read_csv(file_path, header=None, skiprows=10, low_memory=False)

        # Assign headers
        if len(df.columns) != len(headers):
            print(f"Column count mismatch: data has {len(df.columns)}, headers have {len(headers)}")
            # Adjust headers if needed
            if len(df.columns) < len(headers):
                headers = headers[:len(df.columns)]
            else:
                headers += [f"Extra_{i}" for i in range(len(headers), len(df.columns))]

        # Deduplicate headers
        unique_headers = []
        seen_headers = {}
        for h in headers:
            if h in seen_headers:
                seen_headers[h] += 1
                unique_headers.append(f"{h}_{seen_headers[h]}")
            else:
                seen_headers[h] = 0
                unique_headers.append(h)
        headers = unique_headers

        df.columns = headers

        # Clean values in all columns
        for col in df.columns:
            # Only clean string columns to avoid converting numbers to strings unnecessarily
            if df[col].dtype == 'object':
                df[col] = df[col].apply(clean_value)

        # Save clean CSV
        output_path = "clean_sample.csv"
        df.to_csv(output_path, index=False)
        print(f"Saved clean CSV to {output_path}")
        print(f"Shape: {df.shape}")

        print("\nFirst 5 rows:")
        # Print only the first 5 columns to keep the output readable
        print(df.iloc[:5, :5].to_markdown(index=False))

    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    process_local_file("sample.csv")
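
# Note: find_header_row() above is defined but not wired into process_local_file(),
# which hard-codes row indices 7 and 9 for this particular sample. A minimal sketch
# of how the heuristic could be used for files whose header position is unknown
# (it assumes a single header row rather than the split metadata/destination layout
# of sample.csv, and "detected_sample.csv" is just an illustrative output name):
#
#     raw = pd.read_csv("sample.csv", header=None, low_memory=False)
#     header_idx, data_idx = find_header_row(raw)
#     if header_idx is not None:
#         df = raw.iloc[data_idx:].reset_index(drop=True)
#         df.columns = [clean_value(v) or f"Col_{i}" for i, v in enumerate(raw.iloc[header_idx])]
#         df.to_csv("detected_sample.csv", index=False)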