# sdlc-agent/src/data/debug_parser.py
# Veeru-c's picture
# initial commit
# 06bd253
import modal
# Modal application named "debug-parser"; the decorated functions in this
# file are registered on it.
app = modal.App("debug-parser")
# Handles to Modal volumes looked up by name. They are mounted into the
# debug function's container under /data/census and /data/economy.
vol_census = modal.Volume.from_name("census-data")
vol_economy = modal.Volume.from_name("economy-labor-data")
# Container image: Debian slim base with pandas added via pip.
image = modal.Image.debian_slim().pip_install("pandas")
@app.function(
    image=image,
    volumes={"/data/census": vol_census, "/data/economy": vol_economy},
)
def debug_single_file():
    """Print a header-detection trace for one census CSV file.

    Picks the first ``.csv`` file found under ``/data/census`` (directories
    and file names are visited in sorted order, so repeated runs inspect the
    same file), reads it without a header row, prints the raw shape and the
    leading rows, and then scores each of the first rows as a header
    candidate. Scanning stops at the first row reported as a header.

    All results are written to stdout; nothing is returned.
    """
    # Imported here rather than at module level, as in the original layout:
    # pandas is installed in the container image built above.
    import os

    import pandas as pd

    census_root = "/data/census"
    preview_rows = 5        # rows echoed verbatim
    preview_cols = 5        # cells shown per echoed row
    scan_rows = 15          # rows scored as header candidates
    min_fill_ratio = 0.3    # a candidate must fill at least 30% of the columns
    max_unnamed_ratio = 0.3  # more "Unnamed" cells than this -> row is skipped
    min_header_ratio = 0.5  # at least half the cells must look like labels
    # Characters ignored when deciding whether a cell is purely numeric.
    numeric_punct = str.maketrans("", "", ".,- ")

    def is_header_like(value):
        """Return True for a non-null cell whose text is not purely numeric."""
        if pd.isna(value):
            return False
        text = str(value).strip()
        return bool(text) and not text.translate(numeric_punct).isdigit()

    # Locate the first CSV file. Sorting makes the choice deterministic,
    # since os.walk yields entries in arbitrary directory order.
    file_path = None
    for root, dirnames, filenames in os.walk(census_root):
        dirnames.sort()  # in-place sort steers the order os.walk descends in
        csv_names = sorted(f for f in filenames if f.lower().endswith(".csv"))
        if csv_names:
            file_path = os.path.join(root, csv_names[0])
            break

    if not file_path:
        print("No CSV files found!")
        return

    print(f"Testing file: {file_path}")

    # Read raw: header=None keeps every line, including the real header, as data.
    try:
        df_raw = pd.read_csv(file_path, header=None, low_memory=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no columns to parse; report it instead of crashing.
        print("File is empty - nothing to parse.")
        return

    print(f"\nRaw shape: {df_raw.shape}")
    print(f"\nFirst {preview_rows} rows:")
    for i in range(min(preview_rows, len(df_raw))):
        print(f"Row {i}: {df_raw.iloc[i].tolist()[:preview_cols]}")

    # Test header detection on the leading rows.
    for i in range(min(scan_rows, len(df_raw))):
        row = df_raw.iloc[i]
        non_null_count = row.count()
        unnamed_count = sum(
            1 for val in row if pd.notna(val) and "Unnamed" in str(val)
        )
        header_like = sum(1 for val in row if is_header_like(val))

        # Guard against dividing by zero on completely empty rows.
        if non_null_count > 0:
            unnamed_ratio = unnamed_count / non_null_count
            header_ratio = header_like / non_null_count
        else:
            unnamed_ratio = header_ratio = 0

        print(
            f"\nRow {i}: non_null={non_null_count}, "
            f"unnamed={unnamed_count}, header_like={header_like}"
        )
        print(f" Ratios: unnamed={unnamed_ratio:.2f}, header={header_ratio:.2f}")

        # Rows that are too sparse are never header candidates.
        if non_null_count < len(df_raw.columns) * min_fill_ratio:
            continue

        if unnamed_count > non_null_count * max_unnamed_ratio:
            print(" → SKIPPED (too many Unnamed)")
        elif header_like >= non_null_count * min_header_ratio:
            print(" → DETECTED AS HEADER ROW!")
            print(f" → Headers: {row.tolist()[:10]}")
            break
    else:
        # Reached only when no row triggered the break above.
        print(f"\nNo header row detected in the first {scan_rows} rows.")
@app.local_entrypoint()
def main():
    """Local entrypoint: trigger the single-file header-detection debug run."""
    # .remote() invokes the Modal function remotely instead of in this process.
    debug_single_file.remote()