Spaces:
Runtime error
Runtime error
import modal

# Modal application object; the string is the app's name.
app = modal.App("debug-parser")

# Handles to two named Modal volumes, looked up by name.
# NOTE(review): neither volume is attached to a function in the visible part
# of this file -- confirm where they are mounted (the debug function reads
# from "/data/census").
vol_census = modal.Volume.from_name("census-data")
vol_economy = modal.Volume.from_name("economy-labor-data")

# Container image: Debian slim base with pandas installed through pip.
image = modal.Image.debian_slim().pip_install("pandas")
def debug_single_file(data_dir: str = "/data/census", max_header_rows: int = 15) -> None:
    """Print a header-detection trace for the first CSV file under ``data_dir``.

    The file is read with ``header=None`` so that every line, including any
    title or header lines, is loaded as data. The function then prints the raw
    shape, a preview of the first five rows (first five cells each) and, for
    each of the first ``max_header_rows`` rows, the counts and ratios used by
    the header heuristic. Scanning stops at the first row classified as a
    header row.

    Args:
        data_dir: Directory searched recursively for files whose name ends in
            ``.csv`` (case-insensitive).
        max_header_rows: Maximum number of leading rows to evaluate.

    Returns:
        None. All findings are written to standard output.
    """
    # NOTE(review): `main` invokes this through `.remote()`, which implies the
    # function is registered with the Modal app, but no decorator is visible
    # in this file -- confirm it was not lost.
    # pandas is imported inside the function, as in the original;
    # presumably because it is only installed in the container image -- confirm.
    import pandas as pd

    file_path = _find_first_csv(data_dir)
    if file_path is None:
        print("No CSV files found!")
        return

    print(f"Testing file: {file_path}")

    try:
        df_raw = pd.read_csv(file_path, header=None, low_memory=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Empty files and ragged rows are reported instead of aborting the
        # debug run with a traceback.
        print(f"Could not parse {file_path}: {err}")
        return

    n_rows = len(df_raw)
    n_columns = len(df_raw.columns)

    print(f"\nRaw shape: {df_raw.shape}")
    print("\nFirst 5 rows:")
    for i in range(min(5, n_rows)):
        print(f"Row {i}: {df_raw.iloc[i].tolist()[:5]}")

    rows_to_scan = min(max_header_rows, n_rows)
    for i in range(rows_to_scan):
        row = df_raw.iloc[i]
        non_null, unnamed, header_like = _row_stats(row)
        unnamed_ratio = unnamed / non_null if non_null else 0.0
        header_ratio = header_like / non_null if non_null else 0.0

        print(f"\nRow {i}: non_null={non_null}, unnamed={unnamed}, header_like={header_like}")
        print(f" Ratios: unnamed={unnamed_ratio:.2f}, header={header_ratio:.2f}")

        # Rows with fewer than 30% of the columns filled are not candidates.
        if non_null < n_columns * 0.3:
            continue
        if unnamed > non_null * 0.3:
            print(" -> SKIPPED (too many Unnamed)")
        elif header_like >= non_null * 0.5:
            print(" -> DETECTED AS HEADER ROW!")
            print(f" -> Headers: {row.tolist()[:10]}")
            break
    else:
        print(f"\nNo header row detected in the first {rows_to_scan} rows")


# Characters removed from a cell before testing whether it is all digits.
_NUMERIC_PUNCTUATION = str.maketrans("", "", ".,- ")


def _find_first_csv(data_dir):
    """Return the path of the first ``.csv`` file under ``data_dir``, or None."""
    import os

    for root, dirnames, filenames in os.walk(data_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # same file is picked on every run.
        dirnames.sort()
        for name in sorted(filenames):
            if name.lower().endswith(".csv"):
                return os.path.join(root, name)
    return None


def _row_stats(row):
    """Return ``(non_null, unnamed, header_like)`` counts for one raw row."""
    cells = [str(value) for value in row.dropna()]
    unnamed = sum("Unnamed" in cell for cell in cells)
    header_like = 0
    for cell in cells:
        text = cell.strip()
        # A cell counts as header-like when it is non-empty and is not purely
        # digits once '.', ',', '-' and spaces are removed. A cell made only
        # of those characters (e.g. "-") therefore counts as header-like; the
        # heuristic is intentionally left unchanged here.
        if text and not text.translate(_NUMERIC_PUNCTUATION).isdigit():
            header_like += 1
    return len(cells), unnamed, header_like
def main():
    """Start the debug run by calling ``debug_single_file`` through ``.remote()``."""
    debug_single_file.remote()