gkdivya committed on
Commit
58bcc88
·
verified ·
1 Parent(s): 18f965b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +552 -0
app.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # app.py
3
+ # --------------------------------------------------
4
+ # Gradio app for state-specific school fuzzy matching
5
+ #
6
+ # - Loads school masters from Hugging Face dataset (Apf-AI4Good/Schools)
7
+ # - Uses normalization patterns from patterns.json (no hardcoded patterns)
8
+ # - Public "Search" tab for users
9
+ # - "Admin" tab to view/edit/save patterns (password protected for saving)
10
+ # --------------------------------------------------
11
+
12
+ import os
13
+ import re
14
+ import json
15
+ import pandas as pd
16
+ import gradio as gr
17
+ from rapidfuzz import process, fuzz
18
+ from datasets import load_dataset
19
+
20
+ # ====================================================
21
+ # CONFIG: columns, states, HF dataset, admin
22
+ # ====================================================
23
+
24
# Column names the state master CSVs must expose
# (they have to match the headers of the CSVs hosted on Hugging Face).
MASTER_SCHOOL_COL = "School_Name__c"
MASTER_DISTRICT_COL = "School_District__c"
MASTER_BLOCK_COL = "School_Block__c"  # optional
MASTER_UDISE_COL = "School_Udise_Code__c"
MASTER_STATE_COL = "School_State__c"  # optional

# Hugging Face dataset repository that holds one CSV per state.
HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"

# States available in the dataset, spelled exactly like their CSV file stems.
_STATE_DISPLAY_NAMES = (
    "Arunachal Pradesh",
    "Assam",
    "Bihar",
    "Chhattisgarh",
    "Jharkhand",
    "Madhya Pradesh",
    "Manipur",
    "Meghalaya",
    "Mizoram",
    "Nagaland",
    "Odisha",
    "Puducherry",
    "Rajasthan",
    "Sikkim",
    "Telangana",
    "Tripura",
    "Uttar Pradesh",
    "Uttarakhand",
)

# Upper-case state key -> CSV filename inside HF_SCHOOLS_DATASET.
# Every file is named "<Display Name>.csv", so the mapping is derived.
STATE_HF_FILES = {name.upper(): f"{name}.csv" for name in _STATE_DISPLAY_NAMES}

# State preselected in the State dropdown.
DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"

# How many fuzzy-match candidates are listed in the results table.
MAX_CANDIDATES = 5

# JSON file in which the normalization patterns are persisted.
PATTERN_FILE = "patterns.json"
65
+
66
# Admin password checked by the Admin tab's save callbacks.
# It is read from the ADMIN_PASSWORD environment variable so the real secret
# does not have to live in source control. The historical value "secret" is
# kept only as a fallback (also used when the variable is set but empty) and
# should be overridden in any real deployment.
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD") or "secret"
68
+
69
+ # Global master df (for currently selected state)
70
+ master_df: pd.DataFrame | None = None
71
+
72
+ # Global pattern config (loaded from patterns.json)
73
+ pattern_config: dict | None = None
74
+
75
+
76
+ # ====================================================
77
+ # PATTERN CONFIG LOAD / SAVE (JSON)
78
+ # ====================================================
79
+
80
# Empty pattern configuration skeleton; both sections start out empty and are
# meant to be managed through the Admin tab.
DEFAULT_PATTERN_CONFIG = {
    # Rules that are not tied to a particular state.
    "global": [],
    # Rules per state key, e.g. "KARNATAKA": [{pattern: "...", replacement: "..."}]
    "states": {},
}
84
+
85
+
86
def load_pattern_config() -> dict:
    """Load the pattern configuration from ``PATTERN_FILE``.

    If the file does not exist yet it is created with the empty
    ``DEFAULT_PATTERN_CONFIG`` structure, and an independent copy of that
    structure is returned.

    Returns:
        The parsed JSON content of the file, or a fresh default configuration.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        OSError: If the file cannot be read or created.
    """
    import copy  # local import: only needed on the first-run seeding path

    if os.path.exists(PATTERN_FILE):
        with open(PATTERN_FILE, "r", encoding="utf-8") as f:
            return json.load(f)

    # File not present: seed it with the empty structure.
    with open(PATTERN_FILE, "w", encoding="utf-8") as f:
        json.dump(DEFAULT_PATTERN_CONFIG, f, indent=2, ensure_ascii=False)

    # Deep copy on purpose: a shallow .copy() shares the nested "global" list
    # and "states" dict with DEFAULT_PATTERN_CONFIG, so callers that mutate the
    # returned config in place would silently change the module-level default.
    return copy.deepcopy(DEFAULT_PATTERN_CONFIG)
96
+
97
+
98
def save_pattern_config(config: dict) -> None:
    """Write *config* to ``PATTERN_FILE`` as indented UTF-8 JSON.

    Args:
        config: Pattern configuration to persist; must be JSON-serializable.

    Raises:
        TypeError: If *config* contains values that JSON cannot represent.
        OSError: If the file cannot be written.
    """
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization failure afterwards would destroy the existing patterns.
    serialized = json.dumps(config, indent=2, ensure_ascii=False)
    with open(PATTERN_FILE, "w", encoding="utf-8") as f:
        f.write(serialized)
101
+
102
+
103
+ def build_patterns_from_config(config: dict, state_key: str | None):
104
+ """
105
+ Convert JSON config into lists of (pattern, replacement) tuples
106
+ for global and state-specific patterns.
107
+ """
108
+ global_list = [(p["pattern"], p["replacement"]) for p in config.get("global", [])]
109
+
110
+ state_list = []
111
+ if state_key:
112
+ state_key_up = state_key.upper().strip()
113
+ state_patterns = config.get("states", {}).get(state_key_up, [])
114
+ state_list = [(p["pattern"], p["replacement"]) for p in state_patterns]
115
+
116
+ return global_list, state_list
117
+
118
+
119
+ # ====================================================
120
+ # NORMALIZATION
121
+ # ====================================================
122
+
123
+ def normalize_with_patterns_dynamic(s: str, state_key: str | None) -> str:
124
+ """Normalize using global + state patterns from pattern_config."""
125
+ global pattern_config
126
+
127
+ if not isinstance(s, str):
128
+ return ""
129
+ s = s.upper()
130
+
131
+ if pattern_config is None:
132
+ pattern_config = load_pattern_config()
133
+
134
+ global_patterns, state_patterns = build_patterns_from_config(pattern_config, state_key)
135
+
136
+ for pat, repl in global_patterns:
137
+ s = re.sub(pat, repl, s)
138
+ for pat, repl in state_patterns:
139
+ s = re.sub(pat, repl, s)
140
+
141
+ # Standard final cleaning
142
+ s = re.sub(r"[^A-Z0-9]+", " ", s)
143
+ s = re.sub(r"\s+", " ", s).strip()
144
+ return s
145
+
146
+
147
+ # ====================================================
148
+ # DATA LOADING FROM HF
149
+ # ====================================================
150
+
151
def load_master_for_state(state_key: str | None):
    """
    Load the master CSV for a state from the Hugging Face dataset.

    The loaded table, with missing values replaced by "", is stored in the
    module-level ``master_df``. ``master_df`` is reset to ``None`` when
    ``state_key`` is empty or is not a key of ``STATE_HF_FILES``.

    Args:
        state_key: State name; upper-cased and stripped before it is looked up
            in ``STATE_HF_FILES``.

    Returns:
        A ``(district_dropdown, block_dropdown)`` tuple of ``gr.Dropdown``
        objects. Districts are "All" followed by the sorted non-blank district
        names; blocks start as just "All". Either list is empty when the
        corresponding column is absent or no state could be loaded.
    """
    global master_df

    def _no_choices():
        # Dropdown pair used whenever there is no master table to offer.
        return (
            gr.Dropdown(choices=[], value=None),
            gr.Dropdown(choices=[], value=None),
        )

    state_key_norm = state_key.upper().strip() if state_key else ""
    if state_key_norm not in STATE_HF_FILES:
        master_df = None
        return _no_choices()

    # Load only that state's CSV file from the HF dataset repo.
    ds_dict = load_dataset(
        HF_SCHOOLS_DATASET,
        data_files={"train": STATE_HF_FILES[state_key_norm]},
    )

    # Convert to pandas and standardize missing values to "".
    master_df = ds_dict["train"].to_pandas().fillna("")

    if MASTER_DISTRICT_COL in master_df.columns:
        raw_districts = master_df[MASTER_DISTRICT_COL].dropna().unique().tolist()
        # fillna("") above turned missing districts into empty strings, which
        # dropna() cannot remove; drop them explicitly so the dropdown does not
        # offer a blank entry.
        districts = ["All"] + sorted(d for d in raw_districts if str(d).strip())
    else:
        districts = []

    # Initial blocks; the list is presumably refined when the district changes.
    blocks = ["All"] if MASTER_BLOCK_COL in master_df.columns else []

    return (
        gr.Dropdown(choices=districts, value="All" if districts else None),
        gr.Dropdown(choices=blocks, value="All" if blocks else None),
    )
202
+
203
+
204
def update_blocks(district: str | None):
    """
    Update the Block dropdown when the District changes.

    Args:
        district: Selected district; falsy or "All" means no district filter.

    Returns:
        A ``gr.Dropdown`` whose choices are "All" followed by the sorted,
        non-blank block names of the (optionally district-filtered)
        ``master_df``, with "All" selected. Only "All" is offered when no
        master table is loaded or it has no block column.
    """
    if master_df is None or MASTER_BLOCK_COL not in master_df.columns:
        return gr.Dropdown(choices=["All"], value="All")

    df = master_df
    if district and district != "All" and MASTER_DISTRICT_COL in df.columns:
        df = df[df[MASTER_DISTRICT_COL] == district]

    raw_blocks = df[MASTER_BLOCK_COL].dropna().unique().tolist()
    # Missing blocks may be stored as empty strings rather than NaN, which
    # dropna() does not remove; filter them so no blank choice is offered.
    blocks = ["All"] + sorted(b for b in raw_blocks if str(b).strip())
    return gr.Dropdown(choices=blocks, value="All")
224
+
225
+
226
+ # ====================================================
227
+ # FUZZY SEARCH
228
+ # ====================================================
229
+
230
def search_candidates(query_name: str, state_key: str | None, district: str | None, block: str | None):
    """
    Fuzzy-match a school name against the currently loaded master table.

    Args:
        query_name: School name typed by the user.
        state_key: State whose normalization patterns are applied;
            ``DEFAULT_STATE_KEY`` is used when it is falsy.
        district: District filter; falsy or "All" disables it.
        block: Block filter; falsy or "All" disables it.

    Returns:
        A ``(candidates_df, best_df)`` tuple: the top ``MAX_CANDIDATES``
        matches and a one-row frame holding the first of them. Both frames
        are empty when no master table is loaded, the query is blank, the
        filters leave no rows, the school-name column is missing, or nothing
        matched.
    """

    def _no_result():
        return pd.DataFrame(), pd.DataFrame()

    if master_df is None:
        return _no_result()

    query_name = (query_name or "").strip()
    if not query_name:
        return _no_result()

    # Narrow the table by district, then by block, when a filter is selected.
    df = master_df
    for column, selected in ((MASTER_DISTRICT_COL, district), (MASTER_BLOCK_COL, block)):
        if selected and selected != "All" and column in df.columns:
            df = df[df[column] == selected]

    # Without rows or without the school-name column there is nothing to match
    # against (indexing a missing column would raise KeyError).
    if df.empty or MASTER_SCHOOL_COL not in df.columns:
        return _no_result()

    state_for_patterns = (state_key or DEFAULT_STATE_KEY).upper().strip()

    choices = df[MASTER_SCHOOL_COL].astype(str)

    candidates_raw = process.extract(
        query_name,
        choices,
        scorer=fuzz.token_set_ratio,
        processor=lambda s: normalize_with_patterns_dynamic(s, state_for_patterns),
        limit=MAX_CANDIDATES,
    )  # (choice, score, key)

    def _cell(row, column, fallback=""):
        # Value of `column` for this row, or `fallback` if the column is absent.
        return row.get(column, fallback) if column in df.columns else fallback

    rows = []
    for _choice_name, score, key in candidates_raw:
        try:
            row = df.loc[key]
        except KeyError:
            # Key not present in the filtered frame: skip this candidate.
            continue

        rows.append({
            "School_Name": row.get(MASTER_SCHOOL_COL, ""),
            "State": _cell(row, MASTER_STATE_COL, state_for_patterns),
            "District": _cell(row, MASTER_DISTRICT_COL),
            "Block": _cell(row, MASTER_BLOCK_COL),
            "UDISE_Code": _cell(row, MASTER_UDISE_COL),
            "Score": score,
        })

    if not rows:
        return _no_result()

    candidates_df = pd.DataFrame(rows)
    best_df = candidates_df.head(1).copy()
    return candidates_df, best_df
303
+
304
+
305
+ # ====================================================
306
+ # ADMIN / PATTERN CALLBACKS (PASSWORD PROTECTED SAVE)
307
+ # ====================================================
308
+
309
def load_state_patterns_for_editor(selected_state: str | None, new_state_name: str | None):
    """
    Return the patterns of one state as a dataframe for the editor.

    A non-blank ``new_state_name`` takes precedence over ``selected_state``;
    the chosen name is stripped and upper-cased to form the state key.
    No password is needed here because nothing is written.

    Returns:
        A dataframe built from the stored pattern records of that key, or an
        empty dataframe with "pattern" / "replacement" columns when no key
        was given or the key has no patterns.
    """
    global pattern_config
    if pattern_config is None:
        pattern_config = load_pattern_config()

    typed_key = (new_state_name or "").strip()
    if typed_key:
        key = typed_key.upper()
    else:
        key = (selected_state or "").strip().upper()

    stored = pattern_config.get("states", {}).get(key, []) if key else []
    if stored:
        return pd.DataFrame(stored)
    return pd.DataFrame(columns=["pattern", "replacement"])
332
+
333
+
334
def save_global_patterns_from_editor(df: pd.DataFrame, password: str):
    """Persist the edited global patterns after checking the admin password.

    Args:
        df: Editor content; each row becomes one record of the "global" list
            (missing cells are stored as "").
        password: Password typed in the Admin tab.

    Returns:
        A status message: an error text when the password is wrong, otherwise
        a confirmation that the patterns were written to patterns.json.
    """
    global pattern_config
    import hmac  # local import: constant-time comparison for the password

    # The password comes from a public UI, so compare in constant time rather
    # than with !=, which can leak how many leading characters matched.
    supplied = (password or "").encode("utf-8")
    if not hmac.compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8")):
        return "❌ Invalid admin password. Global patterns NOT saved."

    if pattern_config is None:
        pattern_config = load_pattern_config()

    pattern_config["global"] = df.fillna("").to_dict(orient="records")
    save_pattern_config(pattern_config)
    return "✅ Global patterns saved to patterns.json"
345
+
346
+
347
def save_state_patterns_from_editor(selected_state: str | None, new_state_name: str | None, df: pd.DataFrame, password: str):
    """Persist the edited patterns of one state after checking the admin password.

    Args:
        selected_state: State key chosen in the dropdown, if any.
        new_state_name: Free-text state key; when non-blank it takes
            precedence over ``selected_state``.
        df: Editor content; each row becomes one record for that state
            (missing cells are stored as "").
        password: Password typed in the Admin tab.

    Returns:
        A status message: an error text when the password is wrong, a warning
        when no state key was provided, otherwise a confirmation naming the
        (stripped, upper-cased) state key that was saved.
    """
    global pattern_config
    import hmac  # local import: constant-time comparison for the password

    # The password comes from a public UI, so compare in constant time rather
    # than with !=, which can leak how many leading characters matched.
    supplied = (password or "").encode("utf-8")
    if not hmac.compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8")):
        return "❌ Invalid admin password. State patterns NOT saved."

    if pattern_config is None:
        pattern_config = load_pattern_config()

    key = None
    if new_state_name and new_state_name.strip():
        key = new_state_name.strip().upper()
    elif selected_state:
        key = selected_state.strip().upper()

    if not key:
        return "⚠️ Please select a state or type a new state key."

    pattern_config.setdefault("states", {})[key] = df.fillna("").to_dict(orient="records")
    save_pattern_config(pattern_config)
    return f"✅ Patterns for **{key}** saved to patterns.json"
367
+
368
+
369
def load_global_patterns_for_editor():
    """Return the global patterns as a dataframe for the editor.

    The pattern configuration is loaded on first use. Nothing is written, so
    no password is required.
    """
    global pattern_config
    config = pattern_config if pattern_config is not None else load_pattern_config()
    pattern_config = config

    global_rules = config.get("global", [])
    return pd.DataFrame(global_rules)
375
+
376
+
377
+ # ====================================================
378
+ # BUILD GRADIO UI
379
+ # ====================================================
380
+
381
# Load the pattern configuration once, before the UI is assembled.
pattern_config = load_pattern_config()

# Column layout shared by the two result tables.
_RESULT_HEADERS = ["School_Name", "State", "District", "Block", "UDISE_Code", "Score"]
_RESULT_DATATYPES = ["str", "str", "str", "str", "str", "number"]

# Column layout shared by the two pattern editors.
_PATTERN_HEADERS = ["pattern", "replacement"]
_PATTERN_DATATYPES = ["str", "str"]

with gr.Blocks(title="State School Fuzzy Matcher") as demo:
    gr.Markdown(
        """
        # State School Fuzzy Matcher (RapidFuzz)

        **Search tab** (public):
        1. Select **State** (loads master CSV from Hugging Face dataset).
        2. (Optional) choose **District** and **Block**.
        3. Type a **School Name** (as on marksheet).
        4. See top fuzzy-match candidates and the best candidate.

        **Admin tab** (protected for saving):
        - View / edit **global** and **state-specific** normalization patterns.
        - Saving changes requires an **admin password**.
        """
    )

    # ------------------------------------------------
    # Search tab (public)
    # ------------------------------------------------
    with gr.Tab("Search"):
        with gr.Row():
            state_dd = gr.Dropdown(
                label="State",
                choices=list(STATE_HF_FILES.keys()),
                value=DEFAULT_STATE_KEY if DEFAULT_STATE_KEY in STATE_HF_FILES else None,
                interactive=True,
            )

        with gr.Row():
            district_dd = gr.Dropdown(label="District", choices=[], value=None, interactive=True)
            block_dd = gr.Dropdown(label="Block", choices=[], value=None, interactive=True)

        school_input = gr.Textbox(
            label="Input School Name",
            placeholder="Type school name from marksheet...",
        )
        search_btn = gr.Button("Find Candidates")

        gr.Markdown("### Candidates (top matches)")
        candidates_table = gr.Dataframe(
            headers=_RESULT_HEADERS,
            datatype=_RESULT_DATATYPES,
            interactive=False,
        )

        gr.Markdown("### Best Candidate")
        best_table = gr.Dataframe(
            headers=_RESULT_HEADERS,
            datatype=_RESULT_DATATYPES,
            interactive=False,
        )

    # ------------------------------------------------
    # Admin tab (saving is password protected)
    # ------------------------------------------------
    with gr.Tab("Admin"):
        gr.Markdown(
            """
            ### Admin – Pattern configuration

            - **Global patterns**: applied to all states.
            - **State-specific patterns**: applied only for that state key.

            You can **view and edit** patterns freely, but **saving** requires the admin password.
            """
        )

        admin_pwd = gr.Textbox(
            label="Admin Password",
            type="password",
            placeholder="Enter admin password to save changes",
        )

        # ---- Editor for the global patterns ----
        gr.Markdown("#### Global Patterns")
        global_df = gr.Dataframe(
            value=load_global_patterns_for_editor(),
            headers=_PATTERN_HEADERS,
            datatype=_PATTERN_DATATYPES,
            interactive=True,
            # Show at least one row even when no global pattern exists yet.
            row_count=(len(pattern_config.get("global", [])) or 1),
        )
        save_global_btn = gr.Button("💾 Save Global Patterns")
        global_status = gr.Markdown("")

        # ---- Editor for the state-specific patterns ----
        gr.Markdown("#### State-specific Patterns")

        existing_state_keys = sorted(pattern_config.get("states", {}).keys())
        with gr.Row():
            state_pattern_dd = gr.Dropdown(
                label="Existing state key",
                choices=existing_state_keys,
                value=existing_state_keys[0] if existing_state_keys else None,
                interactive=True,
            )
            new_state_tb = gr.Textbox(
                label="Or type new state key",
                placeholder="e.g. KARNATAKA or TAMIL NADU",
            )

        state_df = gr.Dataframe(
            value=pd.DataFrame(columns=["pattern", "replacement"]),
            headers=_PATTERN_HEADERS,
            datatype=_PATTERN_DATATYPES,
            interactive=True,
            row_count=3,
        )
        load_state_btn = gr.Button("Load patterns for selected / new state")
        save_state_btn = gr.Button("💾 Save State Patterns")
        state_status = gr.Markdown("")

    # ====================================================
    # Event wiring
    # ====================================================

    # The master table is (re)loaded when the state changes and once on page
    # load for the default state; both events use the same handler.
    for register_event in (state_dd.change, demo.load):
        register_event(
            fn=load_master_for_state,
            inputs=state_dd,
            outputs=[district_dd, block_dd],
        )

    # Refresh the block choices when the district changes.
    district_dd.change(
        fn=update_blocks,
        inputs=district_dd,
        outputs=block_dd,
    )

    # Run the fuzzy search.
    search_btn.click(
        fn=search_candidates,
        inputs=[school_input, state_dd, district_dd, block_dd],
        outputs=[candidates_table, best_table],
    )

    # Admin: save global patterns (the handler checks the password).
    save_global_btn.click(
        fn=save_global_patterns_from_editor,
        inputs=[global_df, admin_pwd],
        outputs=global_status,
    )

    # Admin: load the patterns of a state into the editor.
    load_state_btn.click(
        fn=load_state_patterns_for_editor,
        inputs=[state_pattern_dd, new_state_tb],
        outputs=state_df,
    )

    # Admin: save state patterns (the handler checks the password).
    save_state_btn.click(
        fn=save_state_patterns_from_editor,
        inputs=[state_pattern_dd, new_state_tb, state_df, admin_pwd],
        outputs=state_status,
    )


if __name__ == "__main__":
    demo.launch()