Spaces:

sammeeer
/

SchemeImpactNet

Sleeping

App Files Files Community

SchemeImpactNet / src /generate_synthetic.py

sammeeer

Inital schemeimpactnet deployment

f87e795 about 1 month ago

raw

history blame contribute delete

7.91 kB

	"""
	generate_synthetic.py
	----------------------
	Generates realistic synthetic MNREGA district-level data for Maharashtra.

	Mimics the structure of real data available from:
	- nregarep1.nic.in (MoRD official portal)
	- dataful.in (district-wise persondays + expenditure)

	Columns produced match what you'd get from real sources:
	state, district, financial_year,
	households_demanded, households_offered, households_availed,
	person_days, expenditure_lakhs, avg_wage_rate, works_completed

	Design principles for realism:
	- Each district has a stable "base capacity" (some districts are
	structurally larger / more active than others)
	- Year-on-year growth follows real MNREGA trends (spike in 2020-21
	due to COVID reverse migration, slowdown in urban-adjacent districts)
	- Expenditure correlates with person_days but has noise (efficiency varies)
	- Wage rate increases over years (matches real wage revision schedule)
	- ~8% missing values injected randomly to simulate real data quality
	"""

	import numpy as np
	import pandas as pd
	import os

	# ── Maharashtra districts (all 36) ───────────────────────────────────────────
	MAHARASHTRA_DISTRICTS = [
	"Ahmednagar", "Akola", "Amravati", "Aurangabad", "Beed",
	"Bhandara", "Buldhana", "Chandrapur", "Dhule", "Gadchiroli",
	"Gondia", "Hingoli", "Jalgaon", "Jalna", "Kolhapur",
	"Latur", "Mumbai City", "Mumbai Suburban", "Nagpur", "Nanded",
	"Nandurbar", "Nashik", "Osmanabad", "Palghar", "Parbhani",
	"Pune", "Raigad", "Ratnagiri", "Sangli", "Satara",
	"Sindhudurg", "Solapur", "Thane", "Wardha", "Washim", "Yavatmal"
	]

	YEARS = [
	"2014-15", "2015-16", "2016-17", "2017-18", "2018-19",
	"2019-20", "2020-21", "2021-22", "2022-23", "2023-24"
	]

	# Real MNREGA wage rates in Maharashtra (approx ₹/day by year)
	WAGE_RATES = {
	"2014-15": 162, "2015-16": 174, "2016-17": 183, "2017-18": 194,
	"2018-19": 203, "2019-20": 213, "2020-21": 238, "2021-22": 256,
	"2022-23": 273, "2023-24": 289
	}

	# Year-level demand multipliers based on real MNREGA trends
	# COVID year (2020-21) saw massive spike due to reverse migration
	YEAR_MULTIPLIERS = {
	"2014-15": 0.85, "2015-16": 0.90, "2016-17": 0.92, "2017-18": 0.95,
	"2018-19": 1.00, "2019-20": 1.05, "2020-21": 1.45, "2021-22": 1.20,
	"2022-23": 1.10, "2023-24": 1.08
	}

	# District profile: (base_persondays_lakhs, efficiency_score, rural_weight)
	# Urban/peri-urban districts have lower base; tribal/rural have higher
	DISTRICT_PROFILES = {
	"Gadchiroli": (18.5, 0.72, 0.95),
	"Nandurbar": (16.2, 0.68, 0.93),
	"Yavatmal": (15.8, 0.74, 0.91),
	"Amravati": (14.3, 0.76, 0.88),
	"Chandrapur": (13.9, 0.71, 0.87),
	"Washim": (12.1, 0.73, 0.89),
	"Buldhana": (11.8, 0.75, 0.86),
	"Beed": (11.5, 0.70, 0.90),
	"Hingoli": (10.9, 0.72, 0.88),
	"Osmanabad": (10.7, 0.69, 0.87),
	"Latur": (10.4, 0.71, 0.85),
	"Nanded": (10.2, 0.73, 0.84),
	"Jalna": (9.8, 0.74, 0.85),
	"Parbhani": (9.5, 0.72, 0.84),
	"Akola": (9.3, 0.75, 0.83),
	"Dhule": (9.1, 0.70, 0.85),
	"Gondia": (8.9, 0.76, 0.82),
	"Bhandara": (8.6, 0.74, 0.81),
	"Wardha": (8.3, 0.77, 0.80),
	"Ahmednagar": (8.1, 0.78, 0.79),
	"Solapur": (7.9, 0.76, 0.80),
	"Aurangabad": (7.6, 0.79, 0.75),
	"Jalgaon": (7.4, 0.77, 0.77),
	"Nashik": (7.1, 0.80, 0.73),
	"Satara": (6.8, 0.81, 0.74),
	"Sangli": (6.5, 0.80, 0.73),
	"Kolhapur": (6.2, 0.82, 0.71),
	"Palghar": (6.0, 0.75, 0.78),
	"Nandurbar": (5.8, 0.71, 0.82),
	"Ratnagiri": (5.5, 0.79, 0.74),
	"Sindhudurg": (5.1, 0.80, 0.72),
	"Raigad": (4.8, 0.78, 0.68),
	"Pune": (4.2, 0.83, 0.55),
	"Thane": (3.5, 0.81, 0.45),
	"Mumbai Suburban": (1.2, 0.85, 0.15),
	"Mumbai City": (0.4, 0.88, 0.05),
	}


	def generate(seed: int = 42, missing_rate: float = 0.08) -> pd.DataFrame:
	"""
	Generate a synthetic MNREGA dataset for Maharashtra.

	Args:
	seed : Random seed for reproducibility.
	missing_rate: Fraction of cells to nullify (simulates real data gaps).

	Returns:
	DataFrame with realistic MNREGA data.
	"""
	rng = np.random.default_rng(seed)
	records = []

	for district in MAHARASHTRA_DISTRICTS:
	profile = DISTRICT_PROFILES.get(district, (7.0, 0.75, 0.70))
	base_pd, efficiency, rural_w = profile

	for year in YEARS:
	year_mult = YEAR_MULTIPLIERS[year]
	wage = WAGE_RATES[year]

	# ── Person days (in lakhs) ────────────────────────────────────
	noise = rng.normal(1.0, 0.07)
	person_days_lakhs = base_pd * year_mult * noise
	person_days_lakhs = max(person_days_lakhs, 0.1)

	# ── Households ───────────────────────────────────────────────
	# Avg ~45 days per household → households = person_days / 45
	hh_demanded = int(person_days_lakhs * 1e5 / 38 * rng.uniform(1.05, 1.15))
	hh_offered = int(hh_demanded * rng.uniform(0.92, 0.99))
	hh_availed = int(hh_offered * rng.uniform(0.88, 0.97))

	# ── Expenditure (₹ lakhs) ────────────────────────────────────
	# Base = person_days * wage_rate, efficiency introduces noise
	base_expenditure = person_days_lakhs * 1e5 * wage / 1e5
	expenditure_lakhs = base_expenditure / efficiency * rng.uniform(0.93, 1.07)

	# ── Works completed ──────────────────────────────────────────
	works = int(person_days_lakhs * rng.uniform(18, 35))

	records.append({
	"state": "Maharashtra",
	"district": district,
	"financial_year": year,
	"households_demanded": hh_demanded,
	"households_offered": hh_offered,
	"households_availed": hh_availed,
	"person_days_lakhs": round(person_days_lakhs, 3),
	"expenditure_lakhs": round(expenditure_lakhs, 2),
	"avg_wage_rate": wage,
	"works_completed": works,
	})

	df = pd.DataFrame(records)

	# ── Inject realistic missing values ──────────────────────────────────────
	nullable_cols = [
	"households_demanded", "households_offered",
	"households_availed", "works_completed"
	]
	for col in nullable_cols:
	mask = rng.random(len(df)) < missing_rate
	df.loc[mask, col] = np.nan

	print(f"[generate] Created {len(df)} rows × {len(df.columns)} columns")
	print(f"[generate] Districts: {df['district'].nunique()} \| Years: {df['financial_year'].nunique()}")
	print(f"[generate] Missing values injected: ~{missing_rate*100:.0f}% per nullable column")

	return df


	def save(df: pd.DataFrame, path: str = "data/raw/mnrega_maharashtra_synthetic.csv") -> None:
	os.makedirs(os.path.dirname(path), exist_ok=True)
	df.to_csv(path, index=False)
	print(f"[generate] Saved → {path}")


	if __name__ == "__main__":
	df = generate()
	save(df)
	print("\nSample:")
	print(df.head(6).to_string(index=False))