Spaces:
Sleeping
Sleeping
"""
generate_synthetic.py
----------------------
Generates realistic synthetic MNREGA district-level data for Maharashtra.

Mimics the structure of real data available from:
  - nregarep1.nic.in (MoRD official portal)
  - dataful.in (district-wise persondays + expenditure)

Columns produced match what you'd get from real sources:
  state, district, financial_year,
  households_demanded, households_offered, households_availed,
  person_days_lakhs, expenditure_lakhs, avg_wage_rate, works_completed

Design principles for realism:
  - Each district has a stable "base capacity" (some districts are
    structurally larger / more active than others)
  - Year-on-year growth follows real MNREGA trends (spike in 2020-21
    due to COVID reverse migration, slowdown in urban-adjacent districts)
  - Expenditure correlates with person_days but has noise (efficiency varies)
  - Wage rate increases over years (matches real wage revision schedule)
  - ~8% of the values in each of the household-count columns and in
    works_completed are nullified at random to simulate real data quality
"""
| import numpy as np | |
| import pandas as pd | |
| import os | |
# ── Maharashtra districts (all 36) ──────────────────────────────────────────
# Alphabetical list of district names; one output row is generated per
# district per financial year.
MAHARASHTRA_DISTRICTS = [
    "Ahmednagar",
    "Akola",
    "Amravati",
    "Aurangabad",
    "Beed",
    "Bhandara",
    "Buldhana",
    "Chandrapur",
    "Dhule",
    "Gadchiroli",
    "Gondia",
    "Hingoli",
    "Jalgaon",
    "Jalna",
    "Kolhapur",
    "Latur",
    "Mumbai City",
    "Mumbai Suburban",
    "Nagpur",
    "Nanded",
    "Nandurbar",
    "Nashik",
    "Osmanabad",
    "Palghar",
    "Parbhani",
    "Pune",
    "Raigad",
    "Ratnagiri",
    "Sangli",
    "Satara",
    "Sindhudurg",
    "Solapur",
    "Thane",
    "Wardha",
    "Washim",
    "Yavatmal",
]
# Financial-year labels "2014-15" through "2023-24": the starting calendar
# year followed by the last two digits of the year in which it ends.
YEARS = [f"{start}-{(start + 1) % 100:02d}" for start in range(2014, 2024)]
# Real MNREGA wage rates in Maharashtra (approx ₹/day by year).
# Keyed by financial-year label; copied verbatim into the avg_wage_rate column.
WAGE_RATES = {
    "2014-15": 162,
    "2015-16": 174,
    "2016-17": 183,
    "2017-18": 194,
    "2018-19": 203,
    "2019-20": 213,
    "2020-21": 238,
    "2021-22": 256,
    "2022-23": 273,
    "2023-24": 289,
}
# Year-level demand multipliers based on real MNREGA trends.
# The COVID year (2020-21) saw a massive spike due to reverse migration.
# Each district's base person-days figure is scaled by the value for the year.
YEAR_MULTIPLIERS = {
    "2014-15": 0.85,
    "2015-16": 0.90,
    "2016-17": 0.92,
    "2017-18": 0.95,
    "2018-19": 1.00,
    "2019-20": 1.05,
    "2020-21": 1.45,
    "2021-22": 1.20,
    "2022-23": 1.10,
    "2023-24": 1.08,
}
# District profile: (base_persondays_lakhs, efficiency_score, rural_weight)
# Urban/peri-urban districts have lower base; tribal/rural have higher.
# Entries are listed in descending order of base person-days.
#
# Every district name must appear exactly once: a repeated key in a dict
# literal silently overwrites the earlier entry. The (5.8, 0.71, 0.82) row was
# previously keyed "Nandurbar" a second time, which clobbered Nandurbar's
# profile and left Nagpur with no profile at all; it is keyed "Nagpur" here.
DISTRICT_PROFILES = {
    "Gadchiroli": (18.5, 0.72, 0.95),
    "Nandurbar": (16.2, 0.68, 0.93),
    "Yavatmal": (15.8, 0.74, 0.91),
    "Amravati": (14.3, 0.76, 0.88),
    "Chandrapur": (13.9, 0.71, 0.87),
    "Washim": (12.1, 0.73, 0.89),
    "Buldhana": (11.8, 0.75, 0.86),
    "Beed": (11.5, 0.70, 0.90),
    "Hingoli": (10.9, 0.72, 0.88),
    "Osmanabad": (10.7, 0.69, 0.87),
    "Latur": (10.4, 0.71, 0.85),
    "Nanded": (10.2, 0.73, 0.84),
    "Jalna": (9.8, 0.74, 0.85),
    "Parbhani": (9.5, 0.72, 0.84),
    "Akola": (9.3, 0.75, 0.83),
    "Dhule": (9.1, 0.70, 0.85),
    "Gondia": (8.9, 0.76, 0.82),
    "Bhandara": (8.6, 0.74, 0.81),
    "Wardha": (8.3, 0.77, 0.80),
    "Ahmednagar": (8.1, 0.78, 0.79),
    "Solapur": (7.9, 0.76, 0.80),
    "Aurangabad": (7.6, 0.79, 0.75),
    "Jalgaon": (7.4, 0.77, 0.77),
    "Nashik": (7.1, 0.80, 0.73),
    "Satara": (6.8, 0.81, 0.74),
    "Sangli": (6.5, 0.80, 0.73),
    "Kolhapur": (6.2, 0.82, 0.71),
    "Palghar": (6.0, 0.75, 0.78),
    "Nagpur": (5.8, 0.71, 0.82),
    "Ratnagiri": (5.5, 0.79, 0.74),
    "Sindhudurg": (5.1, 0.80, 0.72),
    "Raigad": (4.8, 0.78, 0.68),
    "Pune": (4.2, 0.83, 0.55),
    "Thane": (3.5, 0.81, 0.45),
    "Mumbai Suburban": (1.2, 0.85, 0.15),
    "Mumbai City": (0.4, 0.88, 0.05),
}
def generate(seed: int = 42, missing_rate: float = 0.08) -> pd.DataFrame:
    """
    Generate a synthetic MNREGA dataset for Maharashtra.

    Produces one row for every (district, financial year) combination of
    MAHARASHTRA_DISTRICTS and YEARS, then blanks out a random subset of the
    household-count and works_completed values. A short summary is printed
    to stdout.

    Args:
        seed        : Random seed for reproducibility.
        missing_rate: Probability that any given cell in a nullable column
                      is set to NaN (simulates real data gaps).

    Returns:
        DataFrame with the columns state, district, financial_year,
        households_demanded, households_offered, households_availed,
        person_days_lakhs, expenditure_lakhs, avg_wage_rate and
        works_completed.
    """
    rng = np.random.default_rng(seed)

    # Fallback for any district that has no entry in DISTRICT_PROFILES.
    default_profile = (7.0, 0.75, 0.70)

    records = []
    for district in MAHARASHTRA_DISTRICTS:
        # The third profile field (rural weight) is not used by the generator.
        base_pd, efficiency, _rural_w = DISTRICT_PROFILES.get(district, default_profile)

        for year in YEARS:
            year_mult = YEAR_MULTIPLIERS[year]
            wage = WAGE_RATES[year]

            # NOTE: the random draws below must stay in this order, otherwise
            # the output for a given seed changes.

            # ── Person days (in lakhs) ──────────────────────────────────
            # Base capacity scaled by the year trend, with ~7% Gaussian
            # noise, floored at 0.1 so the value is always positive.
            noise = rng.normal(1.0, 0.07)
            person_days_lakhs = max(base_pd * year_mult * noise, 0.1)

            # ── Households ──────────────────────────────────────────────
            # households_demanded = person-days / 38, scaled up by a random
            # 5-15%; offered and availed are successive random fractions.
            # NOTE(review): the earlier comment said "~45 days per household"
            # while the divisor is 38 — confirm which value is intended.
            hh_demanded = int(person_days_lakhs * 1e5 / 38 * rng.uniform(1.05, 1.15))
            hh_offered = int(hh_demanded * rng.uniform(0.92, 0.99))
            hh_availed = int(hh_offered * rng.uniform(0.88, 0.97))

            # ── Expenditure (₹ lakhs) ───────────────────────────────────
            # lakh person-days -> person-days (* 1e5), times the daily wage
            # gives rupees, then back to lakhs (/ 1e5). Dividing by the
            # efficiency score makes less efficient districts spend more.
            base_expenditure = person_days_lakhs * 1e5 * wage / 1e5
            expenditure_lakhs = base_expenditure / efficiency * rng.uniform(0.93, 1.07)

            # ── Works completed ─────────────────────────────────────────
            works = int(person_days_lakhs * rng.uniform(18, 35))

            records.append({
                "state": "Maharashtra",
                "district": district,
                "financial_year": year,
                "households_demanded": hh_demanded,
                "households_offered": hh_offered,
                "households_availed": hh_availed,
                "person_days_lakhs": round(person_days_lakhs, 3),
                "expenditure_lakhs": round(expenditure_lakhs, 2),
                "avg_wage_rate": wage,
                "works_completed": works,
            })

    df = pd.DataFrame(records)

    # ── Inject realistic missing values ─────────────────────────────────────
    nullable_cols = [
        "households_demanded", "households_offered",
        "households_availed", "works_completed"
    ]
    for col in nullable_cols:
        mask = rng.random(len(df)) < missing_rate
        if mask.any():
            # These columns are integer-typed and cannot hold NaN. Cast to
            # float explicitly instead of relying on pandas' silent upcast
            # during assignment, which newer pandas versions deprecate.
            df[col] = df[col].astype("float64")
            df.loc[mask, col] = np.nan

    print(f"[generate] Created {len(df)} rows × {len(df.columns)} columns")
    print(f"[generate] Districts: {df['district'].nunique()} | Years: {df['financial_year'].nunique()}")
    print(f"[generate] Missing values injected: ~{missing_rate*100:.0f}% per nullable column")
    return df
def save(df: pd.DataFrame, path: str = "data/raw/mnrega_maharashtra_synthetic.csv") -> None:
    """
    Write the dataset to a CSV file, creating parent directories if needed.

    Args:
        df  : DataFrame to write (the index is not written).
        path: Destination CSV path. May be a bare filename, in which case
              the file is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # dirname() returns "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"[generate] Saved → {path}")
if __name__ == "__main__":
    # Script entry point: build the dataset with default settings, write it
    # to the default CSV path, then echo the first six rows as a quick check.
    df = generate()
    save(df)
    preview = df.head(6).to_string(index=False)
    print("\nSample:")
    print(preview)