"""
mnrega_scraper.py
-----------------
Real MNREGA data scraper for nreganarep.nic.in
STRATEGY:
The portal has captchas on the main MIS page, but the R14 district-level
consolidated summary reports are accessible via direct GET URLs.
R14 report gives per-district per-year:
- Households demanded / offered / availed
- Person days (total, SC, ST, Women)
- Expenditure (Rs. lakhs)
- Average wage rate
- Works completed / in progress
Two-step approach:
Step 1: Fetch state-level page → extract district links (which have
embedded Digest tokens needed to access sub-pages)
Step 2: Follow each district link → parse the HTML table
HOW TO RUN:
pip install requests beautifulsoup4 lxml
# Maharashtra only (fast, ~2-5 min):
python data/scraper/mnrega_scraper.py --state Maharashtra
# All India (slow, ~30-60 min):
python data/scraper/mnrega_scraper.py --all-india
# Resume after interruption:
python data/scraper/mnrega_scraper.py --all-india --resume
# Custom year range:
python data/scraper/mnrega_scraper.py --state Maharashtra --years 2018-2019 2023-2024
OUTPUT:
data/raw/mnrega_real_data.csv
→ drop this in as replacement for mnrega_india_unified.csv
→ run: python main.py --stage 3
"""
import os, json, time, argparse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
# -- State codes --------------------------------------------------------------
# Numeric codes the nreganarep.nic.in portal embeds in R14 report URLs
# (consumed by MNREGAScraper._scrape_year).  NOTE(review): these are
# portal-specific codes, not census/LGD codes — confirm against the live
# MIS page before adding new states/UTs.
STATE_CODES = {
    "Andhra Pradesh": "02",
    "Arunachal Pradesh": "03",
    "Assam": "04",
    "Bihar": "05",
    "Chhattisgarh": "33",
    "Goa": "10",
    "Gujarat": "11",
    "Haryana": "12",
    "Himachal Pradesh": "13",
    "Jharkhand": "34",
    "Karnataka": "15",
    "Kerala": "16",
    "Madhya Pradesh": "17",
    "Maharashtra": "18",
    "Manipur": "19",
    "Meghalaya": "20",
    "Mizoram": "21",
    "Nagaland": "22",
    "Odisha": "24",
    "Punjab": "25",
    "Rajasthan": "27",
    "Sikkim": "28",
    "Tamil Nadu": "29",
    "Telangana": "36",
    "Tripura": "30",
    "Uttar Pradesh": "31",
    "Uttarakhand": "35",
    "West Bengal": "32",
    "Delhi": "07",
}
# Financial years covered by default, in the "YYYY-YYYY" format the portal
# expects in its fin_year query parameter.
ALL_YEARS = [
    "2014-2015", "2015-2016", "2016-2017", "2017-2018",
    "2018-2019", "2019-2020", "2020-2021", "2021-2022",
    "2022-2023", "2023-2024"
]
# Portal root plus local output/checkpoint paths (relative to the repo root).
BASE_URL = "https://nreganarep.nic.in/netnrega"
OUTPUT_PATH = os.path.join("data", "raw", "mnrega_real_data.csv")
CHECKPOINT_PATH = os.path.join("data", "raw", ".scraper_checkpoint.json")
# Seconds slept between HTTP requests (politeness; the --delay flag overrides).
DELAY = 1.5
# Browser-like headers; Referer mimics arriving from the MIS report page.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://nreganarep.nic.in/netnrega/MISreport4.aspx",
}
# Groupings consumed by MNREGAScraper._map to label each output row:
# state_category = "high" / "mid" (anything else -> "low"),
# region = "South" / "East" (anything else -> "Other").
HIGH_ACTIVITY = {"Rajasthan","Uttar Pradesh","Madhya Pradesh","West Bengal",
    "Andhra Pradesh","Telangana","Jharkhand","Odisha","Chhattisgarh","Bihar"}
MID_ACTIVITY = {"Maharashtra","Tamil Nadu","Karnataka","Gujarat",
    "Himachal Pradesh","Uttarakhand","Assam"}
SOUTH = {"Tamil Nadu","Kerala","Karnataka","Andhra Pradesh","Telangana"}
EAST = {"West Bengal","Odisha","Jharkhand","Bihar","Assam"}
class MNREGAScraper:
    """Scrapes the MNREGA R14 consolidated district reports from nreganarep.nic.in.

    Workflow: fetch the state-level R14 page, follow the embedded district
    links (or parse the table in place when there are none), map each table
    row to a flat record, and accumulate records in ``self.records``.
    ``_finalize`` writes everything to OUTPUT_PATH as CSV.
    """

    def __init__(self, delay=DELAY):
        # One Session so cookies / keep-alive are reused across requests.
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.delay = delay                     # polite sleep (seconds) between requests
        self.records = []                      # accumulated row dicts (shape: see _map)
        self.checkpoint = self._load_checkpoint()

    # -- Public ---------------------------------------------------------------
    def scrape_state(self, state_name: str, years: list) -> pd.DataFrame:
        """Scrape every year in `years` for one state and return the final DataFrame.

        Raises ValueError if `state_name` is not in STATE_CODES.
        """
        code = STATE_CODES.get(state_name)
        if not code:
            raise ValueError(f"Unknown state '{state_name}'. Options: {list(STATE_CODES)}")
        print(f"\n{'='*60}")
        print(f"[scraper] State: {state_name} | Code: {code} | Years: {years[0]}–{years[-1]}")
        print(f"{'='*60}")
        for year in years:
            self._scrape_year(state_name, code, year)
        return self._finalize()

    def scrape_all_india(self, years: list, resume: bool = False) -> pd.DataFrame:
        """Scrape every state/year pair; with resume=True, skip pairs already
        recorded in the checkpoint file."""
        done = set(self.checkpoint.get("done", [])) if resume else set()
        total = len(STATE_CODES) * len(years)
        count = 0
        for state_name, code in STATE_CODES.items():
            for year in years:
                count += 1
                key = f"{state_name}|{year}"
                if key in done:
                    print(f"[scraper] [{count}/{total}] SKIP {key}")
                    continue
                print(f"[scraper] [{count}/{total}] {key}")
                self._scrape_year(state_name, code, year)
                done.add(key)
                # Checkpoint after every pair so --resume loses at most one unit.
                self._save_checkpoint(list(done))
        return self._finalize()

    # -- Core -----------------------------------------------------------------
    def _scrape_year(self, state_name: str, state_code: str, year: str):
        """Fetch state-year page, find district links, scrape each."""
        url = f"{BASE_URL}/nrega_R14.aspx?state_code={state_code}&fin_year={year}&rpt=RP"
        soup = self._get(url)
        if soup is None:
            return
        district_links = self._find_district_links(soup)
        if district_links:
            print(f" → {len(district_links)} districts")
            for name, durl in district_links:
                dsoup = self._get(durl)
                if dsoup:
                    rows = self._parse_table(dsoup, state_name, year, name)
                    self.records.extend(rows)
                time.sleep(self.delay)
        else:
            # State-level page may already contain the district table
            rows = self._parse_table(soup, state_name, year)
            self.records.extend(rows)
            print(f" → {len(rows)} rows (direct table)")

    def _get(self, url: str):
        """GET `url` and return a parsed BeautifulSoup, or None on request failure."""
        try:
            r = self.session.get(url, timeout=20)
            r.raise_for_status()
        except requests.RequestException as e:
            # Narrow catch: a missing lxml parser or other programming error
            # should propagate, not be swallowed as a per-URL failure.
            print(f" [ERROR] {url[:80]}... → {e}")
            return None
        return BeautifulSoup(r.text, "lxml")

    def _find_district_links(self, soup: BeautifulSoup) -> list:
        """Return de-duplicated (district_name, absolute_url) pairs for R14 sub-pages."""
        links = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = a.get_text(strip=True)
            if ("district_code" in href.lower() or "nrega_r14" in href.lower()) and text:
                full = href if href.startswith("http") else f"{BASE_URL}/{href.lstrip('/')}"
                # Portal pages often repeat the same anchor; fetch each URL once
                # to avoid duplicate requests and duplicate output rows.
                if full not in seen:
                    seen.add(full)
                    links.append((text.title(), full))
        return links

    def _parse_table(self, soup, state_name, year, district_hint=None):
        """Extract mapped records from any table whose headers look like R14 data."""
        records = []
        for table in soup.find_all("table"):
            headers = [th.get_text(" ", strip=True).lower() for th in table.find_all("th")]
            joined = " ".join(headers)
            if not any(k in joined for k in ["person", "household", "expenditure"]):
                continue  # navigation/layout table, not the report
            for row in table.find_all("tr")[1:]:  # [0] is the header row
                cells = [td.get_text(strip=True) for td in row.find_all("td")]
                r = self._map(cells, state_name, year, district_hint)
                if r:
                    records.append(r)
        return records

    def _map(self, cells, state_name, year, district_hint=None):
        """Map one table row (list of cell strings) to an output record.

        Returns None for rows that are too short, numeric/blank district
        names, or subtotal rows.
        """
        def num(v):
            # The portal prints "-" (or blank) for missing values.  The old
            # replace("-", "0") corrupted negative numbers ("-5" -> "05" -> 5.0)
            # and any hyphenated cell; treat "-" as missing instead.
            s = str(v).replace(",", "").strip()
            if s in ("", "-"):
                return 0.0
            try:
                return float(s)
            except ValueError:
                return 0.0
        if len(cells) < 6:
            return None
        district = district_hint or cells[0]
        if not district or str(district).isdigit() or len(str(district)) < 3:
            return None
        # Skip subtotal/total rows
        dl = district.lower()
        if any(t in dl for t in ["total", "grand", "state"]):
            return None
        # Person days in R14 are in actual days, convert to lakhs
        pd_raw = num(cells[4]) if len(cells) > 4 else 0
        pd_lakhs = round(pd_raw / 1e5, 3) if pd_raw > 1000 else pd_raw  # already lakhs?
        exp_raw = num(cells[8]) if len(cells) > 8 else 0
        exp_lakhs = round(exp_raw / 1e5, 2) if exp_raw > 1e5 else exp_raw
        # Clean year format: 2023-2024 -> 2023-24
        yr_parts = year.split("-")
        fin_year = f"{yr_parts[0]}-{yr_parts[1][2:]}" if len(yr_parts) == 2 else year
        return {
            "state": state_name,
            "district": str(district).title().strip(),
            "financial_year": fin_year,
            "region": "South" if state_name in SOUTH else ("East" if state_name in EAST else "Other"),
            "state_category": "high" if state_name in HIGH_ACTIVITY else ("mid" if state_name in MID_ACTIVITY else "low"),
            "person_days_lakhs": pd_lakhs,
            "expenditure_lakhs": exp_lakhs,
            "avg_wage_rate": num(cells[9]) if len(cells) > 9 else None,
            "households_demanded": num(cells[1]) if len(cells) > 1 else None,
            "households_offered": num(cells[2]) if len(cells) > 2 else None,
            "households_availed": num(cells[3]) if len(cells) > 3 else None,
            "works_completed": num(cells[10]) if len(cells) > 10 else None,
            # Stage 2/3 — fill via enrich.py with IMD/census/PMKISAN data
            "rainfall_mm": None,
            "crop_season_index": None,
            "rural_population_lakhs": None,
            "poverty_rate_pct": None,
            "pmkisan_beneficiaries": None,
            "pmkisan_amount_lakhs": None,
            "pmay_houses_sanctioned": None,
            "pmay_houses_completed": None,
            "pmay_expenditure_lakhs": None,
            "budget_allocated_lakhs": round(exp_lakhs * 1.12, 2) if exp_lakhs else None,
        }

    # -- Persistence ----------------------------------------------------------
    def _finalize(self) -> pd.DataFrame:
        """Write all accumulated records to OUTPUT_PATH and return them as a DataFrame."""
        df = pd.DataFrame(self.records)
        os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
        df.to_csv(OUTPUT_PATH, index=False)
        print(f"\n{'='*60}")
        print(f"[scraper] DONE: {len(df)} rows | {df['district'].nunique() if len(df) else 0} districts")
        print(f"[scraper] Saved → {OUTPUT_PATH}")
        print(f"[scraper] Next step: copy this to data/raw/mnrega_india_unified.csv")
        print(f"          then run: python main.py --stage 3")
        print(f"{'='*60}")
        return df

    def _save_checkpoint(self, done):
        """Persist the list of completed 'state|year' keys (used by --resume)."""
        os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
        with open(CHECKPOINT_PATH, "w") as f:
            json.dump({"done": done, "ts": str(datetime.now())}, f)

    def _load_checkpoint(self):
        """Load the resume checkpoint; a missing or corrupt file yields {}."""
        if os.path.exists(CHECKPOINT_PATH):
            try:
                with open(CHECKPOINT_PATH) as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError):
                # A truncated checkpoint (e.g. killed mid-write) should not
                # prevent a fresh run from starting.
                print("[scraper] WARNING: unreadable checkpoint ignored")
        return {}
# -- CLI -----------------------------------------------------------------------
if __name__ == "__main__":
    ap = argparse.ArgumentParser(
        description="Scrape MNREGA R14 district reports from nreganarep.nic.in")
    ap.add_argument("--state", type=str, help="Single state e.g. 'Maharashtra'")
    ap.add_argument("--all-india", action="store_true")
    ap.add_argument("--resume", action="store_true", help="Resume from checkpoint")
    ap.add_argument("--years", nargs=2, default=["2014-2015", "2023-2024"],
                    metavar=("START", "END"),
                    help="e.g. --years 2018-2019 2023-2024")
    # Default comes from the module constant so the two stay in sync.
    ap.add_argument("--delay", type=float, default=DELAY,
                    help="Seconds to sleep between requests")
    args = ap.parse_args()
    # Expand START..END into every financial year in between.
    try:
        start = int(args.years[0].split("-")[0])
        end = int(args.years[1].split("-")[0])
    except ValueError:
        ap.error("--years expects values like 2018-2019 2023-2024")
    years = [f"{y}-{y+1}" for y in range(start, end + 1)]
    scraper = MNREGAScraper(delay=args.delay)
    if args.state:
        df = scraper.scrape_state(args.state, years)
    elif args.all_india:
        df = scraper.scrape_all_india(years, resume=args.resume)
    else:
        print("Usage:")
        print("  python data/scraper/mnrega_scraper.py --state Maharashtra")
        print("  python data/scraper/mnrega_scraper.py --all-india")
        print("  python data/scraper/mnrega_scraper.py --all-india --resume")
        # raise SystemExit instead of the site-module exit() helper, which is
        # not guaranteed to exist in non-interactive runs (python -S).
        raise SystemExit(0)