Spaces:
Sleeping
Sleeping
| """ | |
| mnrega_scraper.py | |
| ----------------- | |
| Real MNREGA data scraper for nreganarep.nic.in | |
| STRATEGY: | |
| The portal has captchas on the main MIS page, but the R14 district-level | |
| consolidated summary reports are accessible via direct GET URLs. | |
| R14 report gives per-district per-year: | |
| - Households demanded / offered / availed | |
| - Person days (total, SC, ST, Women) | |
| - Expenditure (Rs. lakhs) | |
| - Average wage rate | |
| - Works completed / in progress | |
| Two-step approach: | |
| Step 1: Fetch state-level page β extract district links (which have | |
| embedded Digest tokens needed to access sub-pages) | |
| Step 2: Follow each district link β parse the HTML table | |
| HOW TO RUN: | |
| pip install requests beautifulsoup4 lxml | |
| # Maharashtra only (fast, ~2-5 min): | |
| python data/scraper/mnrega_scraper.py --state Maharashtra | |
| # All India (slow, ~30-60 min): | |
| python data/scraper/mnrega_scraper.py --all-india | |
| # Resume after interruption: | |
| python data/scraper/mnrega_scraper.py --all-india --resume | |
| # Custom year range: | |
| python data/scraper/mnrega_scraper.py --state Maharashtra --years 2018-2019 2023-2024 | |
| OUTPUT: | |
| data/raw/mnrega_real_data.csv | |
| β drop this in as replacement for mnrega_india_unified.csv | |
| β run: python main.py --stage 3 | |
| """ | |
| import os, json, time, argparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| from datetime import datetime | |
| # ββ State codes ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| STATE_CODES = { | |
| "Andhra Pradesh": "02", | |
| "Arunachal Pradesh": "03", | |
| "Assam": "04", | |
| "Bihar": "05", | |
| "Chhattisgarh": "33", | |
| "Goa": "10", | |
| "Gujarat": "11", | |
| "Haryana": "12", | |
| "Himachal Pradesh": "13", | |
| "Jharkhand": "34", | |
| "Karnataka": "15", | |
| "Kerala": "16", | |
| "Madhya Pradesh": "17", | |
| "Maharashtra": "18", | |
| "Manipur": "19", | |
| "Meghalaya": "20", | |
| "Mizoram": "21", | |
| "Nagaland": "22", | |
| "Odisha": "24", | |
| "Punjab": "25", | |
| "Rajasthan": "27", | |
| "Sikkim": "28", | |
| "Tamil Nadu": "29", | |
| "Telangana": "36", | |
| "Tripura": "30", | |
| "Uttar Pradesh": "31", | |
| "Uttarakhand": "35", | |
| "West Bengal": "32", | |
| "Delhi": "07", | |
| } | |
| ALL_YEARS = [ | |
| "2014-2015", "2015-2016", "2016-2017", "2017-2018", | |
| "2018-2019", "2019-2020", "2020-2021", "2021-2022", | |
| "2022-2023", "2023-2024" | |
| ] | |
| BASE_URL = "https://nreganarep.nic.in/netnrega" | |
| OUTPUT_PATH = os.path.join("data", "raw", "mnrega_real_data.csv") | |
| CHECKPOINT_PATH = os.path.join("data", "raw", ".scraper_checkpoint.json") | |
| DELAY = 1.5 | |
| HEADERS = { | |
| "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", | |
| "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | |
| "Referer": "https://nreganarep.nic.in/netnrega/MISreport4.aspx", | |
| } | |
| HIGH_ACTIVITY = {"Rajasthan","Uttar Pradesh","Madhya Pradesh","West Bengal", | |
| "Andhra Pradesh","Telangana","Jharkhand","Odisha","Chhattisgarh","Bihar"} | |
| MID_ACTIVITY = {"Maharashtra","Tamil Nadu","Karnataka","Gujarat", | |
| "Himachal Pradesh","Uttarakhand","Assam"} | |
| SOUTH = {"Tamil Nadu","Kerala","Karnataka","Andhra Pradesh","Telangana"} | |
| EAST = {"West Bengal","Odisha","Jharkhand","Bihar","Assam"} | |
| class MNREGAScraper: | |
| def __init__(self, delay=DELAY): | |
| self.session = requests.Session() | |
| self.session.headers.update(HEADERS) | |
| self.delay = delay | |
| self.records = [] | |
| self.checkpoint = self._load_checkpoint() | |
| # ββ Public ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def scrape_state(self, state_name: str, years: list) -> pd.DataFrame: | |
| code = STATE_CODES.get(state_name) | |
| if not code: | |
| raise ValueError(f"Unknown state '{state_name}'. Options: {list(STATE_CODES)}") | |
| print(f"\n{'='*60}") | |
| print(f"[scraper] State: {state_name} | Code: {code} | Years: {years[0]}β{years[-1]}") | |
| print(f"{'='*60}") | |
| for year in years: | |
| self._scrape_year(state_name, code, year) | |
| return self._finalize() | |
| def scrape_all_india(self, years: list, resume: bool = False) -> pd.DataFrame: | |
| done = set(self.checkpoint.get("done", [])) if resume else set() | |
| total = len(STATE_CODES) * len(years) | |
| count = 0 | |
| for state_name, code in STATE_CODES.items(): | |
| for year in years: | |
| count += 1 | |
| key = f"{state_name}|{year}" | |
| if key in done: | |
| print(f"[scraper] [{count}/{total}] SKIP {key}") | |
| continue | |
| print(f"[scraper] [{count}/{total}] {key}") | |
| self._scrape_year(state_name, code, year) | |
| done.add(key) | |
| self._save_checkpoint(list(done)) | |
| return self._finalize() | |
| # ββ Core ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _scrape_year(self, state_name: str, state_code: str, year: str): | |
| """Fetch state-year page, find district links, scrape each.""" | |
| url = f"{BASE_URL}/nrega_R14.aspx?state_code={state_code}&fin_year={year}&rpt=RP" | |
| soup = self._get(url) | |
| if soup is None: | |
| return | |
| district_links = self._find_district_links(soup) | |
| if district_links: | |
| print(f" β {len(district_links)} districts") | |
| for name, durl in district_links: | |
| dsoup = self._get(durl) | |
| if dsoup: | |
| rows = self._parse_table(dsoup, state_name, year, name) | |
| self.records.extend(rows) | |
| time.sleep(self.delay) | |
| else: | |
| # State-level page may already contain the district table | |
| rows = self._parse_table(soup, state_name, year) | |
| self.records.extend(rows) | |
| print(f" β {len(rows)} rows (direct table)") | |
| def _get(self, url: str): | |
| try: | |
| r = self.session.get(url, timeout=20) | |
| r.raise_for_status() | |
| return BeautifulSoup(r.text, "lxml") | |
| except Exception as e: | |
| print(f" [ERROR] {url[:80]}... β {e}") | |
| return None | |
| def _find_district_links(self, soup: BeautifulSoup) -> list: | |
| links = [] | |
| for a in soup.find_all("a", href=True): | |
| href = a["href"] | |
| text = a.get_text(strip=True) | |
| if ("district_code" in href.lower() or "nrega_r14" in href.lower()) and text: | |
| full = href if href.startswith("http") else f"{BASE_URL}/{href.lstrip('/')}" | |
| links.append((text.title(), full)) | |
| return links | |
| def _parse_table(self, soup, state_name, year, district_hint=None): | |
| records = [] | |
| for table in soup.find_all("table"): | |
| headers = [th.get_text(" ", strip=True).lower() for th in table.find_all("th")] | |
| joined = " ".join(headers) | |
| if not any(k in joined for k in ["person", "household", "expenditure"]): | |
| continue | |
| for row in table.find_all("tr")[1:]: | |
| cells = [td.get_text(strip=True) for td in row.find_all("td")] | |
| r = self._map(cells, state_name, year, district_hint) | |
| if r: | |
| records.append(r) | |
| return records | |
| def _map(self, cells, state_name, year, district_hint=None): | |
| def num(v): | |
| try: return float(str(v).replace(",","").replace("-","0") or 0) | |
| except: return 0.0 | |
| if len(cells) < 6: | |
| return None | |
| district = district_hint or cells[0] | |
| if not district or str(district).isdigit() or len(str(district)) < 3: | |
| return None | |
| # Skip subtotal/total rows | |
| dl = district.lower() | |
| if any(t in dl for t in ["total", "grand", "state"]): | |
| return None | |
| # Person days in R14 are in actual days, convert to lakhs | |
| pd_raw = num(cells[4]) if len(cells) > 4 else 0 | |
| pd_lakhs = round(pd_raw / 1e5, 3) if pd_raw > 1000 else pd_raw # already lakhs? | |
| exp_raw = num(cells[8]) if len(cells) > 8 else 0 | |
| exp_lakhs = round(exp_raw / 1e5, 2) if exp_raw > 1e5 else exp_raw | |
| # Clean year format: 2023-2024 β 2023-24 | |
| yr_parts = year.split("-") | |
| fin_year = f"{yr_parts[0]}-{yr_parts[1][2:]}" if len(yr_parts) == 2 else year | |
| return { | |
| "state": state_name, | |
| "district": str(district).title().strip(), | |
| "financial_year": fin_year, | |
| "region": "South" if state_name in SOUTH else ("East" if state_name in EAST else "Other"), | |
| "state_category": "high" if state_name in HIGH_ACTIVITY else ("mid" if state_name in MID_ACTIVITY else "low"), | |
| "person_days_lakhs": pd_lakhs, | |
| "expenditure_lakhs": exp_lakhs, | |
| "avg_wage_rate": num(cells[9]) if len(cells) > 9 else None, | |
| "households_demanded": num(cells[1]) if len(cells) > 1 else None, | |
| "households_offered": num(cells[2]) if len(cells) > 2 else None, | |
| "households_availed": num(cells[3]) if len(cells) > 3 else None, | |
| "works_completed": num(cells[10]) if len(cells) > 10 else None, | |
| # Stage 2/3 β fill via enrich.py with IMD/census/PMKISAN data | |
| "rainfall_mm": None, | |
| "crop_season_index": None, | |
| "rural_population_lakhs": None, | |
| "poverty_rate_pct": None, | |
| "pmkisan_beneficiaries": None, | |
| "pmkisan_amount_lakhs": None, | |
| "pmay_houses_sanctioned": None, | |
| "pmay_houses_completed": None, | |
| "pmay_expenditure_lakhs": None, | |
| "budget_allocated_lakhs": round(exp_lakhs * 1.12, 2) if exp_lakhs else None, | |
| } | |
| # ββ Persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _finalize(self) -> pd.DataFrame: | |
| df = pd.DataFrame(self.records) | |
| os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) | |
| df.to_csv(OUTPUT_PATH, index=False) | |
| print(f"\n{'='*60}") | |
| print(f"[scraper] DONE: {len(df)} rows | {df['district'].nunique() if len(df) else 0} districts") | |
| print(f"[scraper] Saved β {OUTPUT_PATH}") | |
| print(f"[scraper] Next step: copy this to data/raw/mnrega_india_unified.csv") | |
| print(f" then run: python main.py --stage 3") | |
| print(f"{'='*60}") | |
| return df | |
| def _save_checkpoint(self, done): | |
| os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True) | |
| with open(CHECKPOINT_PATH, "w") as f: | |
| json.dump({"done": done, "ts": str(datetime.now())}, f) | |
| def _load_checkpoint(self): | |
| if os.path.exists(CHECKPOINT_PATH): | |
| with open(CHECKPOINT_PATH) as f: | |
| return json.load(f) | |
| return {} | |
| # ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| if __name__ == "__main__": | |
| ap = argparse.ArgumentParser() | |
| ap.add_argument("--state", type=str, help="Single state e.g. 'Maharashtra'") | |
| ap.add_argument("--all-india", action="store_true") | |
| ap.add_argument("--resume", action="store_true", help="Resume from checkpoint") | |
| ap.add_argument("--years", nargs=2, default=["2014-2015", "2023-2024"], | |
| metavar=("START", "END"), | |
| help="e.g. --years 2018-2019 2023-2024") | |
| ap.add_argument("--delay", type=float, default=1.5) | |
| args = ap.parse_args() | |
| start = int(args.years[0].split("-")[0]) | |
| end = int(args.years[1].split("-")[0]) | |
| years = [f"{y}-{y+1}" for y in range(start, end + 1)] | |
| scraper = MNREGAScraper(delay=args.delay) | |
| if args.state: | |
| df = scraper.scrape_state(args.state, years) | |
| elif args.all_india: | |
| df = scraper.scrape_all_india(years, resume=args.resume) | |
| else: | |
| print("Usage:") | |
| print(" python data/scraper/mnrega_scraper.py --state Maharashtra") | |
| print(" python data/scraper/mnrega_scraper.py --all-india") | |
| print(" python data/scraper/mnrega_scraper.py --all-india --resume") | |
| exit(0) | |