Spaces:
Sleeping
Sleeping
"""
extract.py
----------
Loads the unified MNREGA CSV and validates that its required columns
are present.
Supports both the synthetic unified dataset and any real CSV
that matches the schema.
"""
from typing import Optional

import pandas as pd
# Minimum schema every input CSV must provide; _validate_columns checks
# incoming data against this set.
REQUIRED_COLUMNS = {
    "state", "district", "financial_year",
    "person_days_lakhs", "expenditure_lakhs", "avg_wage_rate",
}

# Per-stage column sets; each stage extends the previous one.
# Stage 1 is an independent copy (not an alias) so that mutating it in place
# can never alter REQUIRED_COLUMNS and, with it, the validation.
STAGE1_COLUMNS = set(REQUIRED_COLUMNS)
STAGE2_COLUMNS = STAGE1_COLUMNS | {
    "rainfall_mm", "crop_season_index",
    "rural_population_lakhs", "poverty_rate_pct",
}
STAGE3_COLUMNS = STAGE2_COLUMNS | {
    "pmkisan_beneficiaries", "pmay_houses_sanctioned",
    "budget_allocated_lakhs",
}
def load_csv(filepath: str, state_filter: Optional[str] = None) -> pd.DataFrame:
    """
    Load the unified MNREGA CSV, normalize its headers and check the schema.

    Args:
        filepath     : Path to CSV file.
        state_filter : If provided, keep only rows whose "state" value equals
                       this string exactly, e.g. "Maharashtra".
                       Pass None (or an empty string) for all-India (Stage 2+).

    Returns:
        Raw DataFrame with normalized column names. When a state filter is
        applied the row index is reset.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If a required column is missing (raised by
            ``_validate_columns``).
    """
    print(f"[extract] Loading: {filepath}")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError as err:
        # Re-raise with the pipeline's log prefix, keeping the original cause.
        raise FileNotFoundError(f"[extract] File not found: {filepath}") from err

    # Normalize column names: trim, lower-case, spaces -> underscores.
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    _validate_columns(df)

    if state_filter:
        before = len(df)
        df = df[df["state"] == state_filter].reset_index(drop=True)
        print(f"[extract] Filtered to '{state_filter}': {before} → {len(df)} rows")
        if before and df.empty:
            # An empty result from a non-empty file is almost always a typo or
            # a case/whitespace mismatch; surface it instead of passing silently.
            print(
                f"[extract] WARNING: no rows match state '{state_filter}' "
                "(matching is exact and case-sensitive)"
            )

    print(f"[extract] Loaded {len(df)} rows | {df['state'].nunique()} state(s) | {df['district'].nunique()} districts | {df['financial_year'].nunique()} years")
    print("[extract] Validation passed ✓")
    return df
def _validate_columns(df: pd.DataFrame) -> None:
    """
    Ensure every column named in REQUIRED_COLUMNS is present in ``df``.

    Column names are compared as-is, so the caller is expected to have
    normalized them first.

    Args:
        df : DataFrame to check.

    Raises:
        ValueError: If one or more required columns are absent; the message
            lists the missing names in sorted order.
    """
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # Sort so the message is reproducible; set iteration order for strings
        # varies between interpreter runs.
        raise ValueError(f"[extract] Missing required columns: {sorted(missing)}")