""" extract.py ---------- Loads and validates the unified MNREGA CSV. Supports both the synthetic unified dataset and any real CSV that matches the schema. """ import pandas as pd REQUIRED_COLUMNS = { "state", "district", "financial_year", "person_days_lakhs", "expenditure_lakhs", "avg_wage_rate" } STAGE1_COLUMNS = REQUIRED_COLUMNS STAGE2_COLUMNS = STAGE1_COLUMNS | {"rainfall_mm", "crop_season_index", "rural_population_lakhs", "poverty_rate_pct"} STAGE3_COLUMNS = STAGE2_COLUMNS | {"pmkisan_beneficiaries", "pmay_houses_sanctioned", "budget_allocated_lakhs"} def load_csv(filepath: str, state_filter: str = None) -> pd.DataFrame: """ Load unified MNREGA CSV. Args: filepath : Path to CSV file. state_filter : If provided, filter to a single state e.g. "Maharashtra". Pass None for all-India (Stage 2+). Returns: Raw DataFrame. """ print(f"[extract] Loading: {filepath}") try: df = pd.read_csv(filepath) except FileNotFoundError: raise FileNotFoundError(f"[extract] File not found: {filepath}") # Normalize column names df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns] _validate_columns(df) if state_filter: before = len(df) df = df[df["state"] == state_filter].reset_index(drop=True) print(f"[extract] Filtered to '{state_filter}': {before} → {len(df)} rows") print(f"[extract] Loaded {len(df)} rows | {df['state'].nunique()} state(s) | {df['district'].nunique()} districts | {df['financial_year'].nunique()} years") print(f"[extract] Validation passed ✓") return df def _validate_columns(df: pd.DataFrame) -> None: actual = set(df.columns) missing = REQUIRED_COLUMNS - actual if missing: raise ValueError(f"[extract] Missing required columns: {missing}")