gkdivya committed on
Commit
b5de4f2
·
verified ·
1 Parent(s): 2c5da0c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -87
app.py CHANGED
@@ -6,58 +6,90 @@ import requests
6
  import pandas as pd
7
  import json
8
  import re
 
9
  from tavily import TavilyClient
10
 
11
  KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
12
 
13
- # List of valid state codes for UDISE (first 2 digits of UDISE code)
14
- # List of valid state codes for UDISE (first 2 digits of UDISE code)
15
- VALID_UDISE_STATE_CODES = [
16
- '01', # Jammu & Kashmir
17
- '02', # Himachal Pradesh
18
- '03', # Punjab
19
- '04', # Chandigarh
20
- '05', # Uttarakhand
21
- '06', # Haryana
22
- '07', # Delhi
23
- '08', # Rajasthan
24
- '09', # Uttar Pradesh
25
- '10', # Bihar
26
- '11', # Sikkim
27
- '12', # Arunachal Pradesh
28
- '13', # Nagaland
29
- '14', # Manipur
30
- '15', # Mizoram
31
- '16', # Tripura
32
- '17', # Meghalaya
33
- '18', # Assam
34
- '19', # West Bengal
35
- '20', # Jharkhand
36
- '21', # Odisha
37
- '22', # Chhattisgarh
38
- '23', # Madhya Pradesh
39
- '24', # Gujarat
40
- '25', # Daman & Diu
41
- '26', # Dadra & Nagar Haveli
42
- '27', # Maharashtra
43
- '28', # Andhra Pradesh
44
- '29', # Karnataka
45
- '30', # Goa
46
- '31', # Lakshadweep
47
- '32', # Kerala
48
- '33', # Tamil Nadu
49
- '34', # Puducherry
50
- '35', # Andaman & Nicobar Islands
51
- '36', # Telangana
52
- '37' # Ladakh (added in newer datasets)
53
- ]
54
-
55
- def is_valid_udise(code):
56
- """Check if a string is a valid UDISE code."""
 
 
 
 
 
 
 
 
 
 
 
 
57
  if not (code and code.isdigit() and len(code) == 11):
58
  return False
59
- # Optional: Validate state code (first 2 digits)
60
- return code[:2] in VALID_UDISE_STATE_CODES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  STATES = [
63
  "Arunachal_pradesh",
@@ -103,13 +135,24 @@ def call_search_sdk(api_key, payload_text):
103
  return {"ok": False, "error": str(e)}
104
 
105
 
106
- def extract_udise_candidates_from_search(search_json):
107
  """
108
- Extract UDISE codes from Tavily search results.
109
- Returns a list of unique UDISE codes found in the search results.
 
 
 
 
 
 
 
110
  """
111
  print("\n===== Extracting UDISE Codes =====")
112
- found = set()
 
 
 
 
113
 
114
  # Check if we have valid search results
115
  if not search_json or not isinstance(search_json, dict):
@@ -123,7 +166,7 @@ def extract_udise_candidates_from_search(search_json):
123
 
124
  print(f"Found {len(results)} search results")
125
 
126
- # Patterns to match UDISE codes
127
  patterns = [
128
  r'UDISE[^\d]*(?:code|Code|CODE)[^\d]*(\d{11})(?![0-9])',
129
  r'Udise[^\d]*(?:School[^\d]*Code|Code)[^\d]*(\d{11})(?![0-9])',
@@ -134,26 +177,70 @@ def extract_udise_candidates_from_search(search_json):
134
  if not isinstance(result, dict):
135
  continue
136
 
137
- # Combine title and content for searching
138
- text = f"{result.get('title', '')} {result.get('content', '')}"
 
 
 
139
 
140
  # Check for UDISE codes using all patterns
141
  for pattern in patterns:
142
  matches = re.finditer(pattern, text, re.IGNORECASE)
143
  for match in matches:
144
  udise_code = match.group(1) if len(match.groups()) > 0 else match.group(0)
145
- if udise_code and is_valid_udise(udise_code):
146
- print(f"Found UDISE code: {udise_code}")
147
- found.add(udise_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # Convert set to list and sort
150
- result = sorted(list(found))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- if not result:
153
- print("No valid UDISE codes found")
154
  return []
155
 
156
- return result
157
 
158
 
159
  def json_to_table(obj):
@@ -204,18 +291,29 @@ def to_table_from_kys(kys_json):
204
 
205
 
206
  def search_workflow(school_name, state_name, search_key, use_search=True, use_kys=True):
207
- out = {"kys": None, "search": None, "suggestions": [], "first_candidate": None}
208
  payload_text = f"{school_name or ''} {state_name or ''} UDISE code".strip()
209
 
210
  if use_search:
211
  search_res = call_search_sdk(search_key, payload_text)
212
  out["search"] = search_res
213
  if search_res.get("ok"):
214
- # Pass both school_name and state_name to improve UDISE code extraction
215
- candidates = extract_udise_candidates_from_search(
216
- search_res["data"]
 
 
217
  )
218
- out["suggestions"] = candidates
 
 
 
 
 
 
 
 
 
219
  if candidates and candidates[0] != "No UDISE codes found":
220
  out["first_candidate"] = candidates[0]
221
  else:
@@ -231,10 +329,9 @@ def search_workflow(school_name, state_name, search_key, use_search=True, use_ky
231
  with gr.Blocks() as demo:
232
  gr.Markdown(
233
  """
234
- # Search + KYS Lookup (Hugging Face Space)
235
- This version uses the Search SDK internally. Provide your API key in the textbox.
236
- Enter a school name (or UDISE code) and select the state; the app calls Search SDK
237
- with the combined query `<school> <state> UDISE code`, then optionally calls KYS.
238
  """
239
  )
240
 
@@ -270,11 +367,21 @@ with the combined query `<school> <state> UDISE code`, then optionally calls KYS
270
  if res.get("search") and res["search"].get("ok"):
271
  tbl = json_to_table(res["search"]["data"])
272
 
273
- # Ensure suggestions is always a list and handle empty state
274
- suggestions = res.get("suggestions", [])
275
- if not suggestions:
276
- suggestions = ["No UDISE codes found"]
277
-
 
 
 
 
 
 
 
 
 
 
278
  # Always save the key to state if a new one is provided
279
  new_saved_key = key or saved_key
280
 
@@ -283,7 +390,7 @@ with the combined query `<school> <state> UDISE code`, then optionally calls KYS
283
  res.get("search"), # output_json
284
  tbl, # search_table
285
  {"choices": suggestions, "__type__": "update"}, # Update dropdown choices
286
- res.get("first_candidate", ""), # This will update udise_input
287
  new_saved_key, # saved_key_state
288
  res.get("kys") # kys_output_json
289
  )
@@ -302,8 +409,15 @@ with the combined query `<school> <state> UDISE code`, then optionally calls KYS
302
  )
303
 
304
  def on_select_suggestion(choice):
305
- # Only update if a valid choice is selected (not the placeholder)
306
- return choice if choice and choice != "No UDISE codes found" else ""
 
 
 
 
 
 
 
307
 
308
  suggestions_dropdown.change(
309
  on_select_suggestion,
@@ -328,14 +442,5 @@ with the combined query `<school> <state> UDISE code`, then optionally calls KYS
328
 
329
  show_raw_checkbox.change(toggle_raw, inputs=[show_raw_checkbox], outputs=[output_json, kys_output_json])
330
 
331
- gr.Markdown(
332
- """
333
- ---
334
- **Notes:**
335
- - Your API key is stored in-memory for the current session only.
336
- - It is not written to disk or logged.
337
- """
338
- )
339
-
340
  if __name__ == "__main__":
341
  demo.launch()
 
6
  import pandas as pd
7
  import json
8
  import re
9
+ from difflib import get_close_matches
10
  from tavily import TavilyClient
11
 
12
  KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
13
 
14
# Mapping of state / union-territory names to UDISE state codes
# (the first 2 digits of an 11-digit UDISE code).
STATE_TO_UDISE_CODE = {
    'Jammu & Kashmir': '01',
    'Himachal Pradesh': '02',
    'Punjab': '03',
    'Chandigarh': '04',
    'Uttarakhand': '05',
    'Haryana': '06',
    'Delhi': '07',
    'Rajasthan': '08',
    'Uttar Pradesh': '09',
    'Bihar': '10',
    'Sikkim': '11',
    'Arunachal Pradesh': '12',
    'Nagaland': '13',
    'Manipur': '14',
    'Mizoram': '15',
    'Tripura': '16',
    'Meghalaya': '17',
    'Assam': '18',
    'West Bengal': '19',
    'Jharkhand': '20',
    'Odisha': '21',
    'Chhattisgarh': '22',
    'Madhya Pradesh': '23',
    'Gujarat': '24',
    'Daman & Diu': '25',
    'Dadra & Nagar Haveli': '26',
    'Maharashtra': '27',
    'Andhra Pradesh': '28',
    'Karnataka': '29',
    'Goa': '30',
    'Lakshadweep': '31',
    'Kerala': '32',
    'Tamil Nadu': '33',
    'Puducherry': '34',
    'Andaman & Nicobar Islands': '35',
    'Telangana': '36',
    'Ladakh': '37',
}

# For backward compatibility
VALID_UDISE_STATE_CODES = list(STATE_TO_UDISE_CODE.values())


def _normalize_state_name(state_name):
    """Return a canonical lookup key for a state name.

    Underscores become spaces, '&' becomes 'and', case is folded and runs of
    whitespace collapse, so 'Andaman_and_nicobar_islands',
    'ANDAMAN & NICOBAR ISLANDS' and 'Andaman & Nicobar Islands' all map to
    the same key.
    """
    text = str(state_name).replace('_', ' ').replace('&', ' and ')
    return ' '.join(text.casefold().split())


# Same data as STATE_TO_UDISE_CODE, keyed by the normalized state name so
# lookups do not depend on the caller's capitalisation or separator style.
_NORMALIZED_STATE_TO_UDISE_CODE = {
    _normalize_state_name(name): state_code
    for name, state_code in STATE_TO_UDISE_CODE.items()
}


def is_valid_udise(code, state_name=None):
    """
    Check if a string is a valid UDISE code.

    A code is valid when it is a string of exactly 11 ASCII digits whose first
    two digits are a known state code. When ``state_name`` is given, those two
    digits must also be the code registered for that state.

    Args:
        code: The UDISE code to validate. Anything that is not a string is
            rejected (returns False) rather than raising.
        state_name: Optional state name to validate against the UDISE state
            code. Matching ignores case, underscores vs. spaces and
            '&' vs. 'and'.

    Returns:
        bool: True if the code is valid, False otherwise. An unrecognised
        ``state_name`` yields False (with a warning printed).
    """
    # Basic validation: 11 ASCII digits. The isinstance guard keeps ints and
    # None from raising AttributeError on the string methods below.
    if not (isinstance(code, str) and len(code) == 11
            and code.isascii() and code.isdigit()):
        return False

    state_code = code[:2]

    # Check if state code is valid
    if state_code not in VALID_UDISE_STATE_CODES:
        return False

    # If state_name is provided, validate against it
    if state_name:
        expected_code = _NORMALIZED_STATE_TO_UDISE_CODE.get(
            _normalize_state_name(state_name)
        )
        if not expected_code:
            print(f"Warning: Unknown state name: {state_name}")
            return False
        if state_code != expected_code:
            print(f"UDISE code {code} state code {state_code} does not match expected state {state_name} ({expected_code})")
            return False

    return True
93
 
94
  STATES = [
95
  "Arunachal_pradesh",
 
135
  return {"ok": False, "error": str(e)}
136
 
137
 
138
+ def extract_udise_candidates_from_search(search_json, state_name=None, search_query=None):
139
  """
140
+ Extract UDISE codes and school information from Tavily search results.
141
+
142
+ Args:
143
+ search_json: JSON response from Tavily search
144
+ state_name: Optional state name to validate UDISE codes against
145
+ search_query: Original search query to help with fuzzy matching
146
+
147
+ Returns:
148
+ list: List of dictionaries containing UDISE codes and school information
149
  """
150
  print("\n===== Extracting UDISE Codes =====")
151
+ if state_name:
152
+ print(f"Validating UDISE codes against state: {state_name}")
153
+
154
+ found_codes = set()
155
+ school_info = [] # List to store school information
156
 
157
  # Check if we have valid search results
158
  if not search_json or not isinstance(search_json, dict):
 
166
 
167
  print(f"Found {len(results)} search results")
168
 
169
+ # Patterns to match UDISE codes and school information
170
  patterns = [
171
  r'UDISE[^\d]*(?:code|Code|CODE)[^\d]*(\d{11})(?![0-9])',
172
  r'Udise[^\d]*(?:School[^\d]*Code|Code)[^\d]*(\d{11})(?![0-9])',
 
177
  if not isinstance(result, dict):
178
  continue
179
 
180
+ # Get title and content
181
+ title = result.get('title', '')
182
+ content = result.get('content', '')
183
+ url = result.get('url', '')
184
+ text = f"{title} {content}"
185
 
186
  # Check for UDISE codes using all patterns
187
  for pattern in patterns:
188
  matches = re.finditer(pattern, text, re.IGNORECASE)
189
  for match in matches:
190
  udise_code = match.group(1) if len(match.groups()) > 0 else match.group(0)
191
+ if udise_code and is_valid_udise(udise_code, state_name) and udise_code not in found_codes:
192
+ print(f"Found valid UDISE code: {udise_code}")
193
+ found_codes.add(udise_code)
194
+
195
+ # Extract school name - try to find the most relevant text
196
+ school_name = title
197
+
198
+ # If title is too short or doesn't seem like a school name, try to find a better match
199
+ if len(school_name.split()) < 2 or any(word in school_name.lower() for word in ['udise', 'code', 'school']):
200
+ # Look for a school-like name in the content
201
+ school_matches = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s*(?:School|School|High School|High School|Vidyalaya|Vidyalaya|Vidyalayam|Vidyalayam|Vidhya|Vidhya|Vidya|Vidya|Public School|Public School|Govt|Government|Kendriya|Jawahar|Navodaya|Sainik|Army|Air Force|Navy|Central School|Central School|CBSE|ICSE|State Board|State Board|EM|EM|TM|TM|Primary|Primary|Upper Primary|Upper Primary|Higher Secondary|Higher Secondary|HSS|HSS|HS|HS|UPS|UPS|PS|PS))', content, re.IGNORECASE)
202
+ if school_matches:
203
+ school_name = school_matches[0][0].strip()
204
+
205
+ school_info.append({
206
+ 'udise': udise_code,
207
+ 'name': school_name,
208
+ 'source': url,
209
+ 'snippet': content[:200] + '...' if len(content) > 200 else content
210
+ })
211
 
212
+ # If we have a search query, sort results by relevance to the query
213
+ if search_query and school_info:
214
+ # Extract just the school names for fuzzy matching
215
+ school_names = [s['name'] for s in school_info]
216
+
217
+ # Get fuzzy matches and their scores
218
+ matches = get_close_matches(
219
+ search_query.lower(),
220
+ [name.lower() for name in school_names],
221
+ n=len(school_names),
222
+ cutoff=0.3 # Lower cutoff to allow more fuzzy matches
223
+ )
224
+
225
+ # Create a dictionary to map lowercase names to their original objects with scores
226
+ school_map = {s['name'].lower(): s for s in school_info}
227
+
228
+ # Rebuild the school_info list in order of best match
229
+ sorted_schools = []
230
+ for match in matches:
231
+ if match in school_map:
232
+ sorted_schools.append(school_map[match])
233
+ del school_map[match]
234
+
235
+ # Add any remaining schools that didn't match the fuzzy search
236
+ sorted_schools.extend(school_map.values())
237
+ school_info = sorted_schools
238
 
239
+ if not school_info:
240
+ print("No valid school information found with UDISE codes")
241
  return []
242
 
243
+ return school_info
244
 
245
 
246
  def json_to_table(obj):
 
291
 
292
 
293
  def search_workflow(school_name, state_name, search_key, use_search=True, use_kys=True):
294
+ out = {"kys": None, "search": None, "suggestions": [], "first_candidate": None, "school_info": []}
295
  payload_text = f"{school_name or ''} {state_name or ''} UDISE code".strip()
296
 
297
  if use_search:
298
  search_res = call_search_sdk(search_key, payload_text)
299
  out["search"] = search_res
300
  if search_res.get("ok"):
301
+ # Pass school_name for fuzzy matching and state_name for validation
302
+ school_info = extract_udise_candidates_from_search(
303
+ search_res["data"],
304
+ state_name=state_name,
305
+ search_query=school_name
306
  )
307
+
308
+ # Extract just the UDISE codes for backward compatibility
309
+ candidates = [info['udise'] for info in school_info]
310
+
311
+ out["suggestions"] = [
312
+ f"{info['name']} (UDISE: {info['udise']})"
313
+ for info in school_info
314
+ ]
315
+ out["school_info"] = school_info
316
+
317
  if candidates and candidates[0] != "No UDISE codes found":
318
  out["first_candidate"] = candidates[0]
319
  else:
 
329
  with gr.Blocks() as demo:
330
  gr.Markdown(
331
  """
332
+ # Find School UDISE Code
333
+ Provide your API key in the textbox.
334
+ Enter a school name and select the state
 
335
  """
336
  )
337
 
 
367
  if res.get("search") and res["search"].get("ok"):
368
  tbl = json_to_table(res["search"]["data"])
369
 
370
+ # Get school info and format suggestions with school names and UDISE codes
371
+ school_info = res.get("school_info", [])
372
+ suggestions = []
373
+ first_candidate = ""
374
+
375
+ if school_info:
376
+ # Format suggestions as "School Name (UDISE: 12345678901)"
377
+ suggestions = [
378
+ f"{info['name']} (UDISE: {info['udise']})"
379
+ for info in school_info
380
+ ]
381
+ first_candidate = school_info[0]['udise'] if school_info else ""
382
+ else:
383
+ suggestions = ["No matching schools found"]
384
+
385
  # Always save the key to state if a new one is provided
386
  new_saved_key = key or saved_key
387
 
 
390
  res.get("search"), # output_json
391
  tbl, # search_table
392
  {"choices": suggestions, "__type__": "update"}, # Update dropdown choices
393
+ first_candidate, # This will update udise_input with the UDISE code
394
  new_saved_key, # saved_key_state
395
  res.get("kys") # kys_output_json
396
  )
 
409
  )
410
 
411
  def on_select_suggestion(choice):
412
+ # Extract UDISE code from the selected choice
413
+ if not choice or choice in ["No matching schools found", "No UDISE codes found"]:
414
+ return ""
415
+
416
+ # Extract UDISE code from the format "School Name (UDISE: 12345678901)"
417
+ match = re.search(r'\(UDISE:\s*(\d+)\)', choice)
418
+ if match:
419
+ return match.group(1)
420
+ return ""
421
 
422
  suggestions_dropdown.change(
423
  on_select_suggestion,
 
442
 
443
  show_raw_checkbox.change(toggle_raw, inputs=[show_raw_checkbox], outputs=[output_json, kys_output_json])
444
 
 
 
 
 
 
 
 
 
 
445
  if __name__ == "__main__":
446
  demo.launch()