gkdivya committed on
Commit
72d160b
·
verified ·
1 Parent(s): 5be6e81

Update searchschool.py

Browse files
Files changed (1) hide show
  1. searchschool.py +118 -192
searchschool.py CHANGED
@@ -6,7 +6,7 @@ from rapidfuzz import process, fuzz
6
  from web_search import tavily_search_codes
7
 
8
  # ====================================================
9
- # CONFIG: columns, states, HF dataset
10
  # ====================================================
11
  MASTER_SCHOOL_COL = "School_Name__c"
12
  MASTER_DISTRICT_COL = "School_District__c"
@@ -15,43 +15,41 @@ MASTER_UDISE_COL = "School_Udise_Code__c"
15
  MASTER_STATE_COL = "School_State__c"
16
 
17
  HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
 
18
 
19
- # Map state keys to CSV filenames inside that dataset
20
- STATE_HF_FILES = {
21
- "ARUNACHAL PRADESH": "Arunachal Pradesh.csv",
22
- "ASSAM": "Assam.csv",
23
- "BIHAR": "Bihar.csv",
24
- "CHHATTISGARH": "Chhattisgarh.csv",
25
- "JHARKHAND": "Jharkhand.csv",
26
- "MADHYA PRADESH": "Madhya Pradesh.csv",
27
- "MANIPUR": "Manipur.csv",
28
- "MEGHALAYA": "Meghalaya.csv",
29
- "MIZORAM": "Mizoram.csv",
30
- "NAGALAND": "Nagaland.csv",
31
- "ODISHA": "Odisha.csv",
32
- "PUDUCHERRY": "Puducherry.csv",
33
- "RAJASTHAN": "Rajasthan.csv",
34
- "SIKKIM": "Sikkim.csv",
35
- "TELANGANA": "Telangana.csv",
36
- "TRIPURA": "Tripura.csv",
37
- "UTTAR PRADESH": "Uttar Pradesh.csv",
38
- "UTTARAKHAND": "Uttarakhand.csv"
39
- }
40
  DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
41
  MAX_CANDIDATES = 5
42
 
43
- # global cache
44
  master_df = None
45
 
46
- # You will import normalize_with_patterns_dynamic from admin_patterns when needed
47
- # to avoid circular imports, main app passes runtime normalization in search_candidates
48
  try:
49
  from admin_patterns import normalize_with_patterns_dynamic
50
  except Exception:
51
- # if admin_patterns isn't importable at module import time, we will import inside functions
52
  normalize_with_patterns_dynamic = None
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def on_search_web(
56
  school_name: str,
57
  state_name: str,
@@ -59,234 +57,166 @@ def on_search_web(
59
  block: str = None
60
  ):
61
  """
62
- 1. Performs Tavily search → returns list of valid UDISE codes.
63
- 2. Looks up these UDISE codes in our HF Schools dataset using
64
- get_school_rows_by_udise().
65
- 3. Converts results into the standard DataFrame your Gradio app expects.
66
-
67
- Returns:
68
- pandas.DataFrame with columns:
69
- School_Name, State, District, Block, UDISE_Code, Score
70
  """
71
 
72
- # Step 1: Tavily → list of UDISE codes
73
  udise_list = tavily_search_codes(
74
  school_name=school_name,
75
  state_name=state_name,
76
  district=district,
77
- api_key=None, # use HuggingFace secret instead
78
- enforce_state_prefix=True
79
  )
80
 
81
- print(udise_list)
82
-
83
  if not udise_list:
84
- # Always return an empty DF with correct schema
85
  return pd.DataFrame(
86
  columns=["School_Name", "State", "District", "Block", "UDISE_Code"]
87
  )
88
 
89
- # Step 2: HF dataset lookup
90
- rows = get_school_rows_by_udise(state_name, udise_list, try_global=True)
91
 
92
- # Step 3: Convert list → DataFrame
93
  df = pd.DataFrame(rows)
94
 
95
- # Make sure all expected columns exist
96
  expected = ["School_Name", "State", "District", "Block", "UDISE_Code"]
97
  for col in expected:
98
  if col not in df.columns:
99
- df[col] = None # keep schema consistent
100
 
101
- # Reorder to canonical format
102
- df = df[expected]
103
 
104
- # Score is not applicable for web search → keep None
105
- return df
106
 
107
-
108
def get_school_rows_by_udise(state_name: str, udise_codes: list[str], try_global: bool = True) -> list:
    """
    Resolve UDISE codes to school rows using the per-state CSV files.

    The file of the requested state is searched first; if it yields any
    match, those rows are returned immediately. Otherwise, when
    ``try_global`` is true, every file listed in STATE_HF_FILES is scanned
    and all matches are collected.

    Each returned row is a dict with the keys:
        School_Name, State, District, Block, UDISE_Code
    """
    if not udise_codes:
        return []

    # De-duplicate and compare as strings (the CSVs are read with dtype=str).
    wanted_codes = list({str(code) for code in udise_codes})

    def _fetch_frame(csv_name):
        # Best effort: any download or parse failure yields an empty frame.
        try:
            local_file = hf_hub_download(
                repo_id=HF_SCHOOLS_DATASET,
                repo_type="dataset",
                filename=csv_name
            )
            return pd.read_csv(local_file, dtype=str).fillna("")
        except Exception:
            return pd.DataFrame()

    def _rows_from(frame, fallback_state):
        # Rows of ``frame`` whose UDISE code is wanted, as standard dicts.
        if frame.empty or MASTER_UDISE_COL not in frame.columns:
            return []
        hits = frame[frame[MASTER_UDISE_COL].isin(wanted_codes)]
        return [
            {
                "School_Name": rec.get(MASTER_SCHOOL_COL, ""),
                "State": rec.get(MASTER_STATE_COL, fallback_state),
                "District": rec.get(MASTER_DISTRICT_COL, ""),
                "Block": rec.get(MASTER_BLOCK_COL, ""),
                "UDISE_Code": rec.get(MASTER_UDISE_COL, "")
            }
            for _, rec in hits.iterrows()
        ]

    # Map the free-text state name onto a STATE_HF_FILES key, if possible.
    state_key = None
    if state_name:
        target = state_name.strip().upper()
        state_key = next(
            (key for key in STATE_HF_FILES if key.upper() == target),
            None,
        )

    # 1) Requested state first.
    if state_key:
        state_rows = _rows_from(_fetch_frame(STATE_HF_FILES[state_key]), state_key)
        if state_rows:
            return state_rows

    # 2) Global fallback across every known state file.
    collected = []
    if try_global:
        for key, csv_name in STATE_HF_FILES.items():
            collected.extend(_rows_from(_fetch_frame(csv_name), key))

    return collected
 
 
 
 
 
 
 
 
178
 
 
179
 
180
 
 
 
 
181
  def load_master_for_state(state_key: str | None):
182
  """
183
- Load the master CSV for a state from Hugging Face Hub (dataset repo),
184
- set global master_df, and return District & Block dropdown configs.
185
  """
186
- global master_df
187
-
188
- if not state_key:
189
- master_df = None
190
- return gr.Dropdown(choices=[], value=None), gr.Dropdown(choices=[], value=None) # gr referenced in app; kept for signature
191
-
192
- state_key_norm = state_key.upper().strip()
193
- if state_key_norm not in STATE_HF_FILES:
194
- master_df = None
195
- return gr.Dropdown(choices=[], value=None), gr.Dropdown(choices=[], value=None)
196
 
197
- csv_filename = STATE_HF_FILES[state_key_norm]
198
 
199
- # Download the CSV file from the dataset repo
200
- local_path = hf_hub_download(
201
- repo_id=HF_SCHOOLS_DATASET,
202
- repo_type="dataset",
203
- filename=csv_filename,
204
- )
205
 
206
- master_df = pd.read_csv(local_path, dtype=str).fillna("")
 
207
 
208
- # District choices
209
- if MASTER_DISTRICT_COL in master_df.columns:
210
- districts = sorted(master_df[MASTER_DISTRICT_COL].dropna().unique().tolist())
211
  districts = ["All"] + districts
212
  else:
213
  districts = []
214
 
215
- # Initial blocks
216
- blocks = ["All"] if MASTER_BLOCK_COL in master_df.columns else []
217
 
218
- # Return gr-compatible Dropdown values (constructed in app)
219
- # To avoid importing gr here (keeping logic separate), return lists and let app assemble Dropdowns if needed.
220
- # However, in our app we directly return gr.Dropdown — so keep compatibility.
221
- import gradio as gr # local import to avoid circular imports at top
222
- return gr.Dropdown(choices=districts, value="All" if districts else None), gr.Dropdown(choices=blocks, value="All" if blocks else None)
223
 
224
 
225
  def update_blocks(district: str | None):
226
  """
227
- Update Block dropdown when District changes.
228
  """
229
- global master_df
230
-
231
  import gradio as gr
232
- if master_df is None or MASTER_BLOCK_COL not in master_df.columns:
233
- return gr.Dropdown(choices=["All"], value="All")
234
 
235
  df = master_df
236
- if (
237
- district
238
- and district != "All"
239
- and MASTER_DISTRICT_COL in df.columns
240
- ):
241
  df = df[df[MASTER_DISTRICT_COL] == district]
242
 
243
- blocks = sorted(df[MASTER_BLOCK_COL].dropna().unique().tolist())
244
- blocks = ["All"] + blocks if blocks else ["All"]
 
 
 
 
245
  return gr.Dropdown(choices=blocks, value="All")
246
 
247
 
248
- def search_candidates(query_name: str, state_key: str | None, district: str | None, block: str | None):
 
 
 
 
 
 
 
 
249
  """
250
- Given school name + state + district + block, return:
251
- - candidates table (top N matches)
252
- - best-candidate table (single row)
253
  """
254
- global master_df, normalize_with_patterns_dynamic
255
 
256
- # import normalize function if not loaded yet (avoids circular import)
257
  if normalize_with_patterns_dynamic is None:
258
- from admin_patterns import normalize_with_patterns_dynamic # local import
259
- normalize_with_patterns_dynamic = normalize_with_patterns_dynamic
260
 
261
- if master_df is None:
262
- return pd.DataFrame(), pd.DataFrame()
263
-
264
- query_name = (query_name or "").strip()
265
  if not query_name:
266
  return pd.DataFrame(), pd.DataFrame()
267
 
 
 
268
  df = master_df
269
 
 
 
 
 
270
  # Filter by district
271
- if (
272
- district
273
- and district != "All"
274
- and MASTER_DISTRICT_COL in df.columns
275
- ):
276
  df = df[df[MASTER_DISTRICT_COL] == district]
277
 
278
  # Filter by block
279
- if (
280
- block
281
- and block != "All"
282
- and MASTER_BLOCK_COL in df.columns
283
- ):
284
  df = df[df[MASTER_BLOCK_COL] == block]
285
 
286
  if df.empty:
287
  return pd.DataFrame(), pd.DataFrame()
288
 
289
- state_for_patterns = (state_key or DEFAULT_STATE_KEY).upper().strip()
290
 
291
  choices = df[MASTER_SCHOOL_COL].astype(str)
292
 
@@ -294,29 +224,25 @@ def search_candidates(query_name: str, state_key: str | None, district: str | No
294
  query_name,
295
  choices,
296
  scorer=fuzz.token_set_ratio,
297
- processor=lambda s: normalize_with_patterns_dynamic(s, state_for_patterns),
 
 
298
  limit=MAX_CANDIDATES,
299
- ) # (choice, score, key)
300
-
301
- if not candidates_raw:
302
- return pd.DataFrame(), pd.DataFrame()
303
 
304
  rows = []
305
- for choice_name, score, key in candidates_raw:
306
- try:
307
- row = df.loc[key]
308
- except Exception:
309
- continue
310
-
311
  rows.append({
312
- "School_Name": row.get(MASTER_SCHOOL_COL, ""),
313
- "State": row.get(MASTER_STATE_COL, "") if MASTER_STATE_COL in df.columns else state_for_patterns,
314
- "District": row.get(MASTER_DISTRICT_COL, "") if MASTER_DISTRICT_COL in df.columns else "",
315
- "Block": row.get(MASTER_BLOCK_COL, "") if MASTER_BLOCK_COL in df.columns else "",
316
- "UDISE_Code": row.get(MASTER_UDISE_COL, "") if MASTER_UDISE_COL in df.columns else "",
317
  "Score": score,
318
  })
319
 
320
  candidates_df = pd.DataFrame(rows)
321
  best_df = candidates_df.head(1).copy()
 
322
  return candidates_df, best_df
 
6
  from web_search import tavily_search_codes
7
 
8
  # ====================================================
9
+ # CONFIG: columns + HF dataset
10
  # ====================================================
11
  MASTER_SCHOOL_COL = "School_Name__c"
12
  MASTER_DISTRICT_COL = "School_District__c"
 
15
  MASTER_STATE_COL = "School_State__c"
16
 
17
  HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
18
+ MASTER_ALL_STATES_FILE = "master_all_states.csv"
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
21
  MAX_CANDIDATES = 5
22
 
23
+ # global cache (loaded once)
24
  master_df = None
25
 
26
+ # normalization helper (lazy import to avoid circular deps)
 
27
  try:
28
  from admin_patterns import normalize_with_patterns_dynamic
29
  except Exception:
 
30
  normalize_with_patterns_dynamic = None
31
 
32
 
33
+ # ====================================================
34
+ # INTERNAL: load master CSV once
35
+ # ====================================================
36
def _load_master_if_needed():
    """
    Fill the module-level ``master_df`` cache on first use.

    Downloads MASTER_ALL_STATES_FILE from the HF_SCHOOLS_DATASET dataset
    repo and reads it with every column as ``str`` and missing cells
    replaced by "". Does nothing when ``master_df`` is already set.
    """
    global master_df

    if master_df is None:
        csv_path = hf_hub_download(
            repo_id=HF_SCHOOLS_DATASET,
            repo_type="dataset",
            filename=MASTER_ALL_STATES_FILE,
        )
        frame = pd.read_csv(csv_path, dtype=str)
        master_df = frame.fillna("")
48
+
49
+
50
+ # ====================================================
51
+ # WEB SEARCH → UDISE → MASTER LOOKUP
52
+ # ====================================================
53
  def on_search_web(
54
  school_name: str,
55
  state_name: str,
 
57
  block: str = None
58
  ):
59
  """
60
+ 1. Tavily search → list of UDISE codes
61
+ 2. Lookup those UDISE codes in master_all_states.csv
62
+ 3. Return standardized DataFrame
 
 
 
 
 
63
  """
64
 
65
+ # Step 1: Tavily search
66
  udise_list = tavily_search_codes(
67
  school_name=school_name,
68
  state_name=state_name,
69
  district=district,
70
+ api_key=None,
71
+ enforce_state_prefix=True,
72
  )
73
 
 
 
74
  if not udise_list:
 
75
  return pd.DataFrame(
76
  columns=["School_Name", "State", "District", "Block", "UDISE_Code"]
77
  )
78
 
79
+ # Step 2: lookup
80
+ rows = get_school_rows_by_udise(state_name, udise_list)
81
 
82
+ # Step 3: to DataFrame
83
  df = pd.DataFrame(rows)
84
 
 
85
  expected = ["School_Name", "State", "District", "Block", "UDISE_Code"]
86
  for col in expected:
87
  if col not in df.columns:
88
+ df[col] = None
89
 
90
+ return df[expected]
 
91
 
 
 
92
 
93
def get_school_rows_by_udise(state_name: str, udise_codes: list[str]):
    """
    Look up school rows in master_all_states.csv by UDISE code.

    Args:
        state_name: Optional state to restrict the matches to. Compared
            case-insensitively and ignoring surrounding whitespace; a falsy
            or blank value disables the state filter.
        udise_codes: UDISE codes to look up. Each is converted with ``str``
            and stripped; duplicates are ignored.

    Returns:
        A list of dicts with the keys School_Name, State, District, Block
        and UDISE_Code, one per matching master row. Empty when
        ``udise_codes`` is empty or nothing matches.
    """
    if not udise_codes:
        return []

    _load_master_if_needed()

    # A set de-duplicates the codes; the master frame holds strings only.
    wanted = {str(code).strip() for code in udise_codes}

    matched = master_df[master_df[MASTER_UDISE_COL].isin(wanted)]

    # Strip before comparing so values such as " Assam " still match.
    state_filter = (state_name or "").strip().upper()
    if state_filter:
        state_values = matched[MASTER_STATE_COL].str.strip().str.upper()
        matched = matched[state_values == state_filter]

    return [
        {
            "School_Name": rec.get(MASTER_SCHOOL_COL, ""),
            "State": rec.get(MASTER_STATE_COL, ""),
            "District": rec.get(MASTER_DISTRICT_COL, ""),
            "Block": rec.get(MASTER_BLOCK_COL, ""),
            "UDISE_Code": rec.get(MASTER_UDISE_COL, ""),
        }
        for rec in matched.to_dict("records")
    ]
123
 
124
 
125
+ # ====================================================
126
+ # MASTER LOAD FOR UI (STATE → DISTRICT → BLOCK)
127
+ # ====================================================
128
  def load_master_for_state(state_key: str | None):
129
  """
130
+ Load master_all_states.csv once.
131
+ Filter districts by selected state.
132
  """
133
+ import gradio as gr
 
 
 
 
 
 
 
 
 
134
 
135
+ _load_master_if_needed()
136
 
137
+ df = master_df
 
 
 
 
 
138
 
139
+ if state_key:
140
+ df = df[df[MASTER_STATE_COL].str.upper() == state_key.upper()]
141
 
142
+ if MASTER_DISTRICT_COL in df.columns:
143
+ districts = sorted(df[MASTER_DISTRICT_COL].unique().tolist())
 
144
  districts = ["All"] + districts
145
  else:
146
  districts = []
147
 
148
+ blocks = ["All"]
 
149
 
150
+ return (
151
+ gr.Dropdown(choices=districts, value="All" if districts else None),
152
+ gr.Dropdown(choices=blocks, value="All"),
153
+ )
 
154
 
155
 
156
  def update_blocks(district: str | None):
157
  """
158
+ Update block dropdown when district changes
159
  """
 
 
160
  import gradio as gr
161
+
162
+ _load_master_if_needed()
163
 
164
  df = master_df
165
+
166
+ if district and district != "All":
 
 
 
167
  df = df[df[MASTER_DISTRICT_COL] == district]
168
 
169
+ if MASTER_BLOCK_COL in df.columns:
170
+ blocks = sorted(df[MASTER_BLOCK_COL].unique().tolist())
171
+ blocks = ["All"] + blocks if blocks else ["All"]
172
+ else:
173
+ blocks = ["All"]
174
+
175
  return gr.Dropdown(choices=blocks, value="All")
176
 
177
 
178
+ # ====================================================
179
+ # RAPIDFUZZ SEARCH
180
+ # ====================================================
181
+ def search_candidates(
182
+ query_name: str,
183
+ state_key: str | None,
184
+ district: str | None,
185
+ block: str | None,
186
+ ):
187
  """
188
+ Given school name + filters, return:
189
+ - candidates table
190
+ - best candidate table
191
  """
192
+ global normalize_with_patterns_dynamic
193
 
 
194
  if normalize_with_patterns_dynamic is None:
195
+ from admin_patterns import normalize_with_patterns_dynamic
 
196
 
 
 
 
 
197
  if not query_name:
198
  return pd.DataFrame(), pd.DataFrame()
199
 
200
+ _load_master_if_needed()
201
+
202
  df = master_df
203
 
204
+ # Filter by state
205
+ if state_key:
206
+ df = df[df[MASTER_STATE_COL].str.upper() == state_key.upper()]
207
+
208
  # Filter by district
209
+ if district and district != "All":
 
 
 
 
210
  df = df[df[MASTER_DISTRICT_COL] == district]
211
 
212
  # Filter by block
213
+ if block and block != "All":
 
 
 
 
214
  df = df[df[MASTER_BLOCK_COL] == block]
215
 
216
  if df.empty:
217
  return pd.DataFrame(), pd.DataFrame()
218
 
219
+ state_for_patterns = (state_key or DEFAULT_STATE_KEY).upper()
220
 
221
  choices = df[MASTER_SCHOOL_COL].astype(str)
222
 
 
224
  query_name,
225
  choices,
226
  scorer=fuzz.token_set_ratio,
227
+ processor=lambda s: normalize_with_patterns_dynamic(
228
+ s, state_for_patterns
229
+ ),
230
  limit=MAX_CANDIDATES,
231
+ )
 
 
 
232
 
233
  rows = []
234
+ for choice_name, score, idx in candidates_raw:
235
+ r = df.loc[idx]
 
 
 
 
236
  rows.append({
237
+ "School_Name": r.get(MASTER_SCHOOL_COL, ""),
238
+ "State": r.get(MASTER_STATE_COL, ""),
239
+ "District": r.get(MASTER_DISTRICT_COL, ""),
240
+ "Block": r.get(MASTER_BLOCK_COL, ""),
241
+ "UDISE_Code": r.get(MASTER_UDISE_COL, ""),
242
  "Score": score,
243
  })
244
 
245
  candidates_df = pd.DataFrame(rows)
246
  best_df = candidates_df.head(1).copy()
247
+
248
  return candidates_df, best_df