File size: 13,083 Bytes
f87e795
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
mnrega_scraper.py
-----------------
Real MNREGA data scraper for nreganarep.nic.in

STRATEGY:
  The portal has captchas on the main MIS page, but the R14 district-level
  consolidated summary reports are accessible via direct GET URLs.

  R14 report gives per-district per-year:
    - Households demanded / offered / availed
    - Person days (total, SC, ST, Women)
    - Expenditure (Rs. lakhs)
    - Average wage rate
    - Works completed / in progress

  Two-step approach:
    Step 1: Fetch state-level page → extract district links (which have
            embedded Digest tokens needed to access sub-pages)
    Step 2: Follow each district link β†’ parse the HTML table

HOW TO RUN:
  pip install requests beautifulsoup4 lxml

  # Maharashtra only (fast, ~2-5 min):
  python data/scraper/mnrega_scraper.py --state Maharashtra

  # All India (slow, ~30-60 min):
  python data/scraper/mnrega_scraper.py --all-india

  # Resume after interruption:
  python data/scraper/mnrega_scraper.py --all-india --resume

  # Custom year range:
  python data/scraper/mnrega_scraper.py --state Maharashtra --years 2018-2019 2023-2024

OUTPUT:
  data/raw/mnrega_real_data.csv
  → drop this in as replacement for mnrega_india_unified.csv
  → run: python main.py --stage 3
"""

import argparse
import json
import os
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ── State codes ────────────────────────────────────────────────────────────────
STATE_CODES = {
    "Andhra Pradesh":    "02",
    "Arunachal Pradesh": "03",
    "Assam":             "04",
    "Bihar":             "05",
    "Chhattisgarh":      "33",
    "Goa":               "10",
    "Gujarat":           "11",
    "Haryana":           "12",
    "Himachal Pradesh":  "13",
    "Jharkhand":         "34",
    "Karnataka":         "15",
    "Kerala":            "16",
    "Madhya Pradesh":    "17",
    "Maharashtra":       "18",
    "Manipur":           "19",
    "Meghalaya":         "20",
    "Mizoram":           "21",
    "Nagaland":          "22",
    "Odisha":            "24",
    "Punjab":            "25",
    "Rajasthan":         "27",
    "Sikkim":            "28",
    "Tamil Nadu":        "29",
    "Telangana":         "36",
    "Tripura":           "30",
    "Uttar Pradesh":     "31",
    "Uttarakhand":       "35",
    "West Bengal":       "32",
    "Delhi":             "07",
}

ALL_YEARS = [
    "2014-2015", "2015-2016", "2016-2017", "2017-2018",
    "2018-2019", "2019-2020", "2020-2021", "2021-2022",
    "2022-2023", "2023-2024"
]

BASE_URL        = "https://nreganarep.nic.in/netnrega"
OUTPUT_PATH     = os.path.join("data", "raw", "mnrega_real_data.csv")
CHECKPOINT_PATH = os.path.join("data", "raw", ".scraper_checkpoint.json")
DELAY           = 1.5

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Accept":     "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer":    "https://nreganarep.nic.in/netnrega/MISreport4.aspx",
}

HIGH_ACTIVITY = {"Rajasthan","Uttar Pradesh","Madhya Pradesh","West Bengal",
                 "Andhra Pradesh","Telangana","Jharkhand","Odisha","Chhattisgarh","Bihar"}
MID_ACTIVITY  = {"Maharashtra","Tamil Nadu","Karnataka","Gujarat",
                 "Himachal Pradesh","Uttarakhand","Assam"}
SOUTH         = {"Tamil Nadu","Kerala","Karnataka","Andhra Pradesh","Telangana"}
EAST          = {"West Bengal","Odisha","Jharkhand","Bihar","Assam"}


class MNREGAScraper:
    """Scrape R14 district-level MNREGA summary pages into a flat CSV.

    One shared ``requests.Session`` (carrying ``HEADERS``) is used for every
    request. Parsed rows accumulate in ``self.records`` as plain dicts and are
    written to ``OUTPUT_PATH``; all-India progress is tracked in a JSON
    checkpoint at ``CHECKPOINT_PATH`` so an interrupted run can be resumed.
    """

    def __init__(self, delay=DELAY):
        """Create the HTTP session and load any existing checkpoint.

        Args:
            delay: Seconds to sleep after each district page request.
        """
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.delay = delay
        self.records = []
        self.checkpoint = self._load_checkpoint()

    # ── Public ────────────────────────────────────────────────────────────────

    def scrape_state(self, state_name: str, years: list) -> pd.DataFrame:
        """Scrape every requested year for one state and write the CSV.

        Args:
            state_name: A key of ``STATE_CODES``.
            years: Financial years in the portal's ``"YYYY-YYYY"`` form.

        Returns:
            DataFrame of all records collected by this scraper instance.

        Raises:
            ValueError: If ``state_name`` is not a known state.
        """
        code = STATE_CODES.get(state_name)
        if not code:
            raise ValueError(f"Unknown state '{state_name}'. Options: {list(STATE_CODES)}")
        # Guard the banner so an empty year list does not raise IndexError.
        span = f"{years[0]}→{years[-1]}" if years else "none"
        print(f"\n{'='*60}")
        print(f"[scraper] State: {state_name} | Code: {code} | Years: {span}")
        print(f"{'='*60}")
        for year in years:
            self._scrape_year(state_name, code, year)
        return self._finalize()

    def scrape_all_india(self, years: list, resume: bool = False) -> pd.DataFrame:
        """Scrape every state in ``STATE_CODES`` for every requested year.

        After each successfully fetched state-year the CSV is rewritten and
        the checkpoint updated, so an interrupted run keeps what it collected.
        State-years whose state page could not be fetched are not marked done
        and are therefore retried by a later ``resume`` run.

        Args:
            years: Financial years in the portal's ``"YYYY-YYYY"`` form.
            resume: When true, skip ``"state|year"`` keys listed in the
                checkpoint and start from the rows already saved in
                ``OUTPUT_PATH``.

        Returns:
            DataFrame of all records (reloaded plus newly scraped).
        """
        done = set(self.checkpoint.get("done") or []) if resume else set()
        if done:
            # Skipped state-years are only on disk; without reloading them the
            # final write would overwrite the CSV with the new rows alone.
            self.records.extend(self._load_saved_records())

        total = len(STATE_CODES) * len(years)
        count = 0
        for state_name, code in STATE_CODES.items():
            for year in years:
                count += 1
                key = f"{state_name}|{year}"
                if key in done:
                    print(f"[scraper] [{count}/{total}] SKIP {key}")
                    continue
                print(f"[scraper] [{count}/{total}] {key}")
                if not self._scrape_year(state_name, code, year):
                    continue  # fetch failed: leave unmarked so resume retries it
                done.add(key)
                # Persist the rows before the checkpoint claims them as done.
                self._write_csv()
                self._save_checkpoint(sorted(done))
        return self._finalize()

    # ── Core ──────────────────────────────────────────────────────────────────

    def _scrape_year(self, state_name: str, state_code: str, year: str) -> bool:
        """Fetch one state-year page, then scrape each district link on it.

        If the page exposes no district links, the state page itself is parsed
        as the data table. Parsed rows are appended to ``self.records``.

        Returns:
            False when the state-level page could not be fetched, else True.
        """
        url = f"{BASE_URL}/nrega_R14.aspx?state_code={state_code}&fin_year={year}&rpt=RP"
        soup = self._get(url)
        if soup is None:
            return False

        district_links = self._find_district_links(soup)
        if district_links:
            print(f"  → {len(district_links)} districts")
            for name, durl in district_links:
                dsoup = self._get(durl)
                if dsoup is not None:
                    self.records.extend(self._parse_table(dsoup, state_name, year, name))
                time.sleep(self.delay)
        else:
            # State-level page may already contain the district table
            rows = self._parse_table(soup, state_name, year)
            self.records.extend(rows)
            print(f"  → {len(rows)} rows (direct table)")
        return True

    def _get(self, url: str):
        """GET ``url`` and return it parsed with lxml, or None on HTTP failure.

        Network and HTTP-status errors are logged and swallowed so one bad
        page does not abort the run; other errors (for example a missing
        parser) propagate.
        """
        try:
            r = self.session.get(url, timeout=20)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"  [ERROR] {url[:80]}... → {e}")
            return None
        return BeautifulSoup(r.text, "lxml")

    def _find_district_links(self, soup: BeautifulSoup) -> list:
        """Collect ``(title-cased link text, absolute URL)`` for district links.

        A link qualifies when it has visible text and its href mentions
        ``district_code`` or ``nrega_r14`` (case-insensitive). Hrefs are
        resolved against ``BASE_URL`` and each URL is returned once, in
        document order.
        """
        links = []
        seen = set()
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            text = a.get_text(strip=True)
            lowered = href.lower()
            if not text or ("district_code" not in lowered and "nrega_r14" not in lowered):
                continue
            # urljoin handles absolute, root-relative and "../" hrefs; plain
            # concatenation produced ".../netnrega/netnrega/..." for the
            # root-relative form.
            full = urljoin(f"{BASE_URL}/", href)
            if full in seen:
                continue  # the same report linked twice would duplicate rows
            seen.add(full)
            links.append((text.title(), full))
        return links

    def _parse_table(self, soup, state_name, year, district_hint=None):
        """Extract records from every table whose headers look like R14 data.

        A table is considered when the combined text of its ``<th>`` cells
        contains "person", "household" or "expenditure". Its first ``<tr>`` is
        skipped and each remaining row is passed to ``_map``.

        Returns:
            List of record dicts (possibly empty).
        """
        records = []
        # find_all is recursive, so a wrapper table and the table nested in it
        # both match and would yield the same <tr> twice. Rows are tracked by
        # identity because bs4 tags compare and hash by their markup.
        seen_rows = set()
        for table in soup.find_all("table"):
            joined = " ".join(
                th.get_text(" ", strip=True).lower() for th in table.find_all("th")
            )
            if not any(k in joined for k in ("person", "household", "expenditure")):
                continue
            for row in table.find_all("tr")[1:]:
                if id(row) in seen_rows:
                    continue
                seen_rows.add(id(row))
                cells = [td.get_text(strip=True) for td in row.find_all("td")]
                r = self._map(cells, state_name, year, district_hint)
                if r:
                    records.append(r)
        return records

    def _map(self, cells, state_name, year, district_hint=None):
        """Turn one table row into a record dict, or None if it is not data.

        Column positions used: 0 name, 1-3 households demanded/offered/availed,
        4 person days, 8 expenditure, 9 average wage, 10 works completed.
        NOTE(review): these positions are taken as-is from the existing code;
        confirm them against a live R14 page.

        Args:
            cells: Text of the row's ``<td>`` cells.
            state_name: State the row belongs to.
            year: Financial year as ``"YYYY-YYYY"``.
            district_hint: District name from the link that led to the page;
                overrides the first cell as the district name when given.

        Returns:
            The record dict, or None for short, unnamed or aggregate rows.
        """
        def num(value):
            # Empty cells and dash placeholders mean "no value". A leading
            # minus is kept; it was previously rewritten to a "0" digit.
            text = str(value).replace(",", "").strip()
            if not text.strip("-"):
                return 0.0
            try:
                return float(text)
            except ValueError:
                return 0.0

        def optional(index):
            # Columns past index 5 may be absent on narrower tables.
            return num(cells[index]) if len(cells) > index else None

        if len(cells) < 6:
            return None

        district = district_hint or cells[0]
        if not district or str(district).isdigit() or len(str(district)) < 3:
            return None

        # Skip subtotal/total rows. The row's own label is checked as well,
        # because with a district_hint the district name never says "total".
        labels = (str(district).lower(), str(cells[0]).lower())
        if any(t in label for label in labels for t in ("total", "grand", "state")):
            return None

        # Values above the thresholds are presumed to be raw counts / rupees
        # and are scaled to lakhs; smaller ones are presumed already in lakhs.
        pd_raw = num(cells[4])
        pd_lakhs = round(pd_raw / 1e5, 3) if pd_raw > 1000 else pd_raw

        exp_raw = num(cells[8]) if len(cells) > 8 else 0
        exp_lakhs = round(exp_raw / 1e5, 2) if exp_raw > 1e5 else exp_raw

        # Clean year format: 2023-2024 → 2023-24
        yr_parts = year.split("-")
        fin_year = f"{yr_parts[0]}-{yr_parts[1][2:]}" if len(yr_parts) == 2 else year

        if state_name in SOUTH:
            region = "South"
        elif state_name in EAST:
            region = "East"
        else:
            region = "Other"

        if state_name in HIGH_ACTIVITY:
            category = "high"
        elif state_name in MID_ACTIVITY:
            category = "mid"
        else:
            category = "low"

        return {
            "state":                  state_name,
            "district":               str(district).title().strip(),
            "financial_year":         fin_year,
            "region":                 region,
            "state_category":         category,
            "person_days_lakhs":      pd_lakhs,
            "expenditure_lakhs":      exp_lakhs,
            "avg_wage_rate":          optional(9),
            "households_demanded":    num(cells[1]),
            "households_offered":     num(cells[2]),
            "households_availed":     num(cells[3]),
            "works_completed":        optional(10),
            # Stage 2/3 — fill via enrich.py with IMD/census/PMKISAN data
            "rainfall_mm":            None,
            "crop_season_index":      None,
            "rural_population_lakhs": None,
            "poverty_rate_pct":       None,
            "pmkisan_beneficiaries":  None,
            "pmkisan_amount_lakhs":   None,
            "pmay_houses_sanctioned": None,
            "pmay_houses_completed":  None,
            "pmay_expenditure_lakhs": None,
            # Derived figure (expenditure x 1.12), not scraped from the portal.
            "budget_allocated_lakhs": round(exp_lakhs * 1.12, 2) if exp_lakhs else None,
        }

    # ── Persistence ───────────────────────────────────────────────────────────

    def _write_csv(self) -> pd.DataFrame:
        """Write ``self.records`` to ``OUTPUT_PATH`` and return the DataFrame."""
        df = pd.DataFrame(self.records)
        os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
        df.to_csv(OUTPUT_PATH, index=False)
        return df

    def _load_saved_records(self) -> list:
        """Return rows of an existing ``OUTPUT_PATH`` as dicts, or ``[]``.

        Used when resuming so rows for already-finished state-years survive
        the next write. An absent, empty or unreadable file yields ``[]``.
        """
        if not os.path.exists(OUTPUT_PATH):
            return []
        try:
            saved = pd.read_csv(OUTPUT_PATH)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, OSError) as e:
            print(f"  [WARN] could not reload {OUTPUT_PATH}: {e}")
            return []
        return saved.to_dict("records")

    def _finalize(self) -> pd.DataFrame:
        """Write the final CSV, print a summary and return the DataFrame."""
        df = self._write_csv()
        print(f"\n{'='*60}")
        print(f"[scraper] DONE: {len(df)} rows | {df['district'].nunique() if len(df) else 0} districts")
        print(f"[scraper] Saved → {OUTPUT_PATH}")
        print("[scraper] Next step: copy this to data/raw/mnrega_india_unified.csv")
        print("          then run:  python main.py --stage 3")
        print(f"{'='*60}")
        return df

    def _save_checkpoint(self, done):
        """Record the finished ``"state|year"`` keys in the checkpoint file.

        The JSON is written to a temporary file and then renamed over the
        checkpoint, so an interruption mid-write cannot leave it truncated.
        """
        os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
        tmp_path = f"{CHECKPOINT_PATH}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"done": list(done), "ts": str(datetime.now())}, f)
        os.replace(tmp_path, CHECKPOINT_PATH)

    def _load_checkpoint(self):
        """Return the saved checkpoint dict, or ``{}`` if none is usable.

        A missing, unreadable, malformed or non-dict checkpoint is treated as
        "no progress" instead of crashing the constructor.
        """
        if not os.path.exists(CHECKPOINT_PATH):
            return {}
        try:
            with open(CHECKPOINT_PATH, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
            print(f"  [WARN] ignoring unreadable checkpoint {CHECKPOINT_PATH}: {e}")
            return {}
        return data if isinstance(data, dict) else {}


# ── CLI ───────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Command-line entry point: scrape one state or all of India for a range
    # of financial years and write the result to OUTPUT_PATH.
    ap = argparse.ArgumentParser()
    ap.add_argument("--state",     type=str,  help="Single state e.g. 'Maharashtra'")
    ap.add_argument("--all-india", action="store_true")
    ap.add_argument("--resume",    action="store_true", help="Resume from checkpoint")
    ap.add_argument("--years",     nargs=2, default=["2014-2015", "2023-2024"],
                    metavar=("START", "END"),
                    help="e.g. --years 2018-2019 2023-2024")
    ap.add_argument("--delay",     type=float, default=DELAY)
    args = ap.parse_args()

    # No mode selected: show usage before doing any work (no session is
    # created, no checkpoint is read).
    if not (args.state or args.all_india):
        print("Usage:")
        print("  python data/scraper/mnrega_scraper.py --state Maharashtra")
        print("  python data/scraper/mnrega_scraper.py --all-india")
        print("  python data/scraper/mnrega_scraper.py --all-india --resume")
        # `exit()` is injected by the `site` module and may be absent.
        raise SystemExit(0)

    # Only the starting calendar year of each bound is used; every financial
    # year from START to END inclusive is generated in "YYYY-YYYY" form.
    try:
        start = int(args.years[0].split("-")[0])
        end   = int(args.years[1].split("-")[0])
    except ValueError:
        ap.error(f"--years expects values like 2018-2019, got {args.years}")
    if start > end:
        ap.error(f"--years START ({args.years[0]}) is after END ({args.years[1]})")
    years = [f"{y}-{y+1}" for y in range(start, end + 1)]

    scraper = MNREGAScraper(delay=args.delay)

    # --state takes precedence when both modes are given.
    if args.state:
        scraper.scrape_state(args.state, years)
    else:
        scraper.scrape_all_india(years, resume=args.resume)