Spaces:
Sleeping
Sleeping
File size: 16,510 Bytes
300b404 0cad454 684f84d 4842e86 06d5eae b5de4f2 686aead 684f84d 1a0de33 b5de4f2 1a0de33 b5de4f2 1a0de33 6c05f60 b662c97 4842e86 b662c97 4842e86 b662c97 4842e86 684f84d 4842e86 684f84d 300b404 684f84d 300b404 684f84d b5de4f2 1289deb b5de4f2 1289deb 7e65b3f b5de4f2 1289deb 7e65b3f 1289deb 7e65b3f 1289deb 7e65b3f b5de4f2 1289deb 7e65b3f 1289deb b5de4f2 acb72da 7e65b3f acb72da 7e65b3f acb72da b5de4f2 acb72da b5de4f2 acb72da b5de4f2 7e65b3f 1289deb b5de4f2 4842e86 684f84d 0cad454 684f84d 4842e86 684f84d 4842e86 684f84d b662c97 4842e86 b662c97 4842e86 c70e992 684f84d 4842e86 b662c97 4842e86 b662c97 c70e992 b662c97 684f84d 686aead b5de4f2 300b404 684f84d 300b404 b5de4f2 acb72da b5de4f2 acb72da 1289deb 684f84d 300b404 684f84d 300b404 684f84d 4842e86 b5de4f2 4842e86 684f84d 4842e86 0cad454 300b404 686aead 684f84d 4842e86 b662c97 4842e86 b662c97 300b404 684f84d 300b404 4842e86 684f84d b662c97 684f84d 0cad454 686aead 300b404 686aead 300b404 686aead b5de4f2 686aead 1289deb 686aead b5de4f2 1289deb 686aead 1289deb 684f84d b5de4f2 684f84d 686aead 684f84d 4842e86 b662c97 684f84d 4842e86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 |
# search_kys_space.py
# Gradio app for Hugging Face Spaces (uses Search API via Tavily SDK internally)
import gradio as gr
import requests
import pandas as pd
import json
import re
from difflib import get_close_matches
from tavily import TavilyClient
# URL template for the KYS search-schools endpoint; "{udise}" is replaced by
# the code being looked up.
KYS_SAMPLE = (
    "https://kys.udiseplus.gov.in/webapp/api/search-schools"
    "?searchType=3&searchParam={udise}"
)

# State / union-territory name -> two-digit prefix that starts every UDISE
# code issued for that state.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Flat list of the known prefixes, kept for backward compatibility.
VALID_UDISE_STATE_CODES = [*STATE_TO_UDISE_CODE.values()]
def is_valid_udise(code, state_name=None):
    """
    Decide whether *code* looks like a UDISE code.

    A code is accepted when it is an 11-character digit string whose first
    two characters are a known state prefix. When *state_name* is given, the
    prefix must additionally be the one registered for that state.

    Args:
        code: Candidate UDISE code (string).
        state_name: Optional state name; surrounding whitespace is removed,
            words are title-cased and underscores are treated as spaces
            before the lookup.
    Returns:
        bool: True when every check passes, False otherwise.
    """
    # Shape check: non-empty, digits only, exactly 11 characters.
    if not code or not code.isdigit() or len(code) != 11:
        return False

    state_code = code[:2]
    if state_code not in VALID_UDISE_STATE_CODES:
        return False

    # Without a state to compare against, the prefix check is all we need.
    if not state_name:
        return True

    # Normalise spellings such as "Uttar pradesh" or "Arunachal_pradesh" to
    # the keys used by STATE_TO_UDISE_CODE.
    state_name = state_name.strip().title().replace('_', ' ')
    expected_code = STATE_TO_UDISE_CODE.get(state_name)
    if not expected_code:
        print(f"Warning: Unknown state name: {state_name}")
        return False

    if state_code == expected_code:
        return True

    print(f"UDISE code {code} state code {state_code} does not match expected state {state_name} ({expected_code})")
    return False
# State choices offered to the user; spellings are kept exactly as listed.
STATES = [
    "Arunachal_pradesh", "Assam", "Bihar", "Chhattisgarh", "Jharkhand",
    "Karnataka", "Madhya pradesh", "Manipur", "Meghalaya", "Mizoram",
    "Nagaland", "Odisha", "Puducherry", "Rajasthan", "Sikkim",
    "Telangana", "Tripura", "Uttar pradesh", "Uttarakhand",
]
def call_kys_by_udise(udise_code):
    """
    Query the KYS endpoint for *udise_code*.

    Returns:
        dict: ``{"ok": True, "url": ..., "data": <decoded JSON>}`` on
        success, or ``{"ok": False, "error": <message>, "url": ...}`` when
        the request, the HTTP status check or the JSON decoding fails.
    """
    url = KYS_SAMPLE.format(udise=udise_code)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        # Any failure is reported to the caller instead of being raised.
        return {"ok": False, "error": str(e), "url": url}
    return {"ok": True, "url": url, "data": payload}
def call_search_sdk(api_key, payload_text):
    """
    Run *payload_text* as a query through the Tavily client.

    Returns:
        dict: ``{"ok": True, "data": <search response>}`` on success, or
        ``{"ok": False, "error": <message>}`` if creating the client or
        running the search raises.
    """
    try:
        response = TavilyClient(api_key).search(query=payload_text)
    except Exception as e:
        # Errors are handed back as data so the UI can display them.
        return {"ok": False, "error": str(e)}
    return {"ok": True, "data": response}
def _guess_school_name(title, content, name_pattern):
    """Return the display name for a hit: its title, or a school-like phrase from the content."""
    lowered = title.lower()
    title_is_usable = len(title.split()) >= 2 and not any(
        word in lowered for word in ('udise', 'code', 'school')
    )
    if title_is_usable:
        return title
    # The pattern has a single capturing group spanning the whole phrase, so
    # group(1) is the complete name (not just its first character).
    found = name_pattern.search(content)
    return found.group(1).strip() if found else title


def _rank_by_query(school_info, search_query):
    """Order school records by fuzzy similarity of their name to the query, keeping every record."""
    # Several records can share one name (e.g. many codes found in a single
    # hit), so each lowercase name maps to a list rather than a single record.
    by_name = {}
    for school in school_info:
        by_name.setdefault(school['name'].lower(), []).append(school)

    ranked_names = get_close_matches(
        search_query.lower(),
        list(by_name),
        n=len(by_name),
        cutoff=0.3  # Lower cutoff to allow more fuzzy matches
    )

    ordered = []
    for name in ranked_names:
        ordered.extend(by_name.pop(name))
    # Names below the cutoff follow, in their original order.
    for leftovers in by_name.values():
        ordered.extend(leftovers)
    return ordered


def extract_udise_candidates_from_search(search_json, state_name=None, search_query=None):
    """
    Extract UDISE codes and school information from Tavily search results.

    Each result's title and content are scanned for 11-digit numbers; those
    accepted by ``is_valid_udise`` are reported once each, together with a
    school name, the result URL and a content snippet.

    Args:
        search_json: Search response dict. Results are read from
            ``search_json['results']`` or, failing that, from
            ``search_json['data']['results']``.
        state_name: Optional state name to validate UDISE codes against
        search_query: Original search query; when given, records are
            re-ordered so names closest to it come first
    Returns:
        list: Dictionaries with keys 'udise', 'name', 'source' and 'snippet';
        empty when the input is unusable or nothing valid is found
    """
    print("\n===== Extracting UDISE Codes =====")
    if state_name:
        print(f"Validating UDISE codes against state: {state_name}")

    # Check if we have valid search results
    if not search_json or not isinstance(search_json, dict):
        print("Invalid search JSON")
        return []

    results = search_json.get('results')
    if not results:
        # Only descend into 'data' when it really is a mapping.
        nested = search_json.get('data')
        results = nested.get('results') if isinstance(nested, dict) else None
    if not results or not isinstance(results, list):
        print("No results found in search JSON")
        return []
    print(f"Found {len(results)} search results")

    # Patterns to match UDISE codes; each captures the code in group 1.
    udise_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'UDISE[^\d]*(?:code|Code|CODE)[^\d]*(\d{11})(?![0-9])',
            r'Udise[^\d]*(?:School[^\d]*Code|Code)[^\d]*(\d{11})(?![0-9])',
            r'(?<![0-9])(\d{11})(?![0-9])',  # Fallback: any 11-digit number
        )
    ]
    # Capitalised words followed by a school-type keyword.
    name_pattern = re.compile(
        r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s*(?:School|School|High School|High School|Vidyalaya|Vidyalaya|Vidyalayam|Vidyalayam|Vidhya|Vidhya|Vidya|Vidya|Public School|Public School|Govt|Government|Kendriya|Jawahar|Navodaya|Sainik|Army|Air Force|Navy|Central School|Central School|CBSE|ICSE|State Board|State Board|EM|EM|TM|TM|Primary|Primary|Upper Primary|Upper Primary|Higher Secondary|Higher Secondary|HSS|HSS|HS|HS|UPS|UPS|PS|PS))',
        re.IGNORECASE,
    )

    found_codes = set()
    school_info = []  # List to store school information
    for result in results:
        if not isinstance(result, dict):
            continue
        # A key that is present but null is treated like a missing one.
        title = result.get('title') or ''
        content = result.get('content') or ''
        url = result.get('url', '')
        text = f"{title} {content}"

        for pattern in udise_patterns:
            for match in pattern.finditer(text):
                udise_code = match.group(1)
                if udise_code in found_codes or not is_valid_udise(udise_code, state_name):
                    continue
                print(f"Found valid UDISE code: {udise_code}")
                found_codes.add(udise_code)
                school_info.append({
                    'udise': udise_code,
                    'name': _guess_school_name(title, content, name_pattern),
                    'source': url,
                    'snippet': (content[:200] + '...') if len(content) > 200 else content
                })

    # If we have a search query, sort results by relevance to the query
    if search_query and school_info:
        school_info = _rank_by_query(school_info, search_query)

    if not school_info:
        print("No valid school information found with UDISE codes")
        return []
    return school_info
def json_to_table(obj):
    """
    Flatten a JSON-like object into a pandas DataFrame.

    A list is normalised directly. For a dict, the first of the keys
    "results", "data", "hits", "items" that holds a list is normalised;
    if none does, the dict itself becomes a single row. Anything else, or
    any conversion failure, yields an empty DataFrame.

    Args:
        obj: Decoded JSON value (list, dict or other).
    Returns:
        pandas.DataFrame: The flattened table, possibly empty.
    """
    try:
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for key in ("results", "data", "hits", "items"):
                rows = obj.get(key)
                if isinstance(rows, list):
                    return pd.json_normalize(rows)
            return pd.json_normalize([obj])
    except Exception as e:
        # Still best-effort, but report the failure instead of hiding it.
        print("json_to_table error:", e)
    return pd.DataFrame()
def to_table_from_kys(kys_json):
    """
    Convert KYS JSON wrapper into a simplified pandas DataFrame showing only
    selected fields from the `content` list.

    The `content` list is looked for at ``kys_json["data"]["content"]``,
    then ``kys_json["data"]["data"]["content"]``, then
    ``kys_json["content"]``. Entries of that list that are not dicts are
    skipped.

    Args:
        kys_json: Decoded KYS response (expected to be a dict).
    Returns:
        pandas.DataFrame: One row per school with the columns "School Name",
        "School ID", "Pincode", "State", "District" and "Management Type";
        empty when no content list is found or on error.
    """
    try:
        content = None
        if isinstance(kys_json, dict):
            inner = kys_json.get("data")
            if isinstance(inner, dict) and isinstance(inner.get("content"), list):
                content = inner["content"]
            elif (
                isinstance(inner, dict)
                and isinstance(inner.get("data"), dict)
                and isinstance(inner["data"].get("content"), list)
            ):
                content = inner["data"]["content"]
            elif isinstance(kys_json.get("content"), list):
                content = kys_json["content"]
        if not content:
            return pd.DataFrame()

        # A stray non-dict entry is dropped rather than discarding every row.
        rows = [
            {
                "School Name": r.get("schoolName"),
                "School ID": r.get("schoolId"),
                "Pincode": r.get("pincode"),
                "State": r.get("stateName"),
                "District": r.get("districtName"),
                "Management Type": r.get("schMgmtType")
            }
            for r in content
            if isinstance(r, dict)
        ]
        return pd.DataFrame(rows)
    except Exception as e:
        print("to_table_from_kys error:", e)
        return pd.DataFrame()
def search_workflow(school_name, state_name, search_key, use_search=True, use_kys=True):
    """
    Run the web search and, for numeric input, the direct KYS lookup.

    Args:
        school_name: School name, or a UDISE code typed by the user.
        state_name: State used in the query text and for code validation.
        search_key: API key passed to the search client.
        use_search: When False the search step is skipped and an error
            placeholder is stored under "search".
        use_kys: When True and *school_name* is 6 to 14 digits, the KYS
            endpoint is queried with it.
    Returns:
        dict: Keys "kys", "search", "suggestions", "first_candidate" and
        "school_info".
    """
    out = {"kys": None, "search": None, "suggestions": [], "first_candidate": None, "school_info": []}
    payload_text = f"{school_name or ''} {state_name or ''} UDISE code".strip()

    if not use_search:
        out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}
    else:
        response = call_search_sdk(search_key, payload_text)
        out["search"] = response
        if response.get("ok"):
            # school_name drives the fuzzy ordering, state_name the validation.
            schools = extract_udise_candidates_from_search(
                response["data"],
                state_name=state_name,
                search_query=school_name
            )
            # Bare codes, kept for backward compatibility.
            codes = [info['udise'] for info in schools]
            out["suggestions"] = [
                f"{info['name']} (UDISE: {info['udise']})"
                for info in schools
            ]
            out["school_info"] = schools
            if codes and codes[0] != "No UDISE codes found":
                out["first_candidate"] = codes[0]

    if use_kys and school_name:
        typed = school_name.strip()
        # Numeric input is treated as a code and looked up directly.
        if typed.isdigit() and 6 <= len(typed) <= 14:
            out["kys"] = call_kys_by_udise(typed)
    return out
# UI definition: widgets, per-session key storage and the event wiring.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Find School UDISE Code
Provide your API key in the textbox.
Enter a school name and select the state
"""
    )
    # NOTE(review): which widgets sit inside this row is a reconstruction;
    # confirm against the intended layout.
    with gr.Row():
        inp = gr.Textbox(label="School name or UDISE code", placeholder="e.g. GOVT SEC SCHOOL DARLONG or 12345678901", lines=1)
        state_dropdown = gr.Dropdown(choices=STATES, label="State", value=STATES[0] if STATES else "", interactive=True, allow_custom_value=True)
    search_key = gr.Textbox(label="Search API Key (required)", placeholder="api-key...", lines=1)
    run = gr.Button("Search", variant="primary")

    # By default hide raw JSON outputs; users can toggle visibility with `show_raw_checkbox`
    show_raw_checkbox = gr.Checkbox(value=False, label="Show raw JSON outputs")
    output_json = gr.JSON(label="Raw Search Output (JSON)", visible=False)
    search_table = gr.DataFrame(headers=None, label="Search results (table)")

    gr.Markdown("### UDISE candidates found in Search results")
    suggestions_dropdown = gr.Dropdown(choices=[], label="UDISE candidates (from Search)")
    udise_input = gr.Textbox(label="UDISE to lookup (editable)", placeholder="Pick a candidate or type a UDISE code...", lines=1)
    lookup_btn = gr.Button("Lookup UDISE (Call KYS)")
    kys_output_json = gr.JSON(label="KYS Raw Output", visible=False)
    kys_table = gr.DataFrame(headers=None, label="KYS results (table)")

    # Remembers the last API key used so the textbox may be left empty later.
    saved_key_state = gr.State("")

    def on_run(school, state, key, saved_key):
        """Run the search and return values for the six outputs wired to `run`."""
        # A key typed now wins over the remembered one, so a corrected key
        # takes effect on this very run; the remembered key is the fallback.
        effective_key = (key or "").strip() or saved_key

        # Always enable both search and KYS by default
        res = search_workflow(school, state, effective_key, use_search=True, use_kys=True)

        tbl = pd.DataFrame()
        if res.get("search") and res["search"].get("ok"):
            tbl = json_to_table(res["search"]["data"])

        school_info = res.get("school_info", [])
        if school_info:
            # Format suggestions as "School Name (UDISE: 12345678901)"
            suggestions = [
                f"{info['name']} (UDISE: {info['udise']})"
                for info in school_info
            ]
            first_candidate = school_info[0]['udise']
        else:
            suggestions = ["No matching schools found"]
            first_candidate = ""

        return (
            res.get("search"),               # output_json
            tbl,                             # search_table
            gr.update(choices=suggestions),  # suggestions_dropdown
            first_candidate,                 # udise_input
            effective_key,                   # saved_key_state
            res.get("kys"),                  # kys_output_json
        )

    run.click(
        on_run,
        inputs=[inp, state_dropdown, search_key, saved_key_state],
        outputs=[
            output_json,
            search_table,
            suggestions_dropdown,
            udise_input,  # This will be updated with first_candidate
            saved_key_state,
            kys_output_json
        ]
    )

    def on_select_suggestion(choice):
        """Return the UDISE code embedded in the chosen suggestion, or "" if there is none."""
        if not choice or choice in ["No matching schools found", "No UDISE codes found"]:
            return ""
        # Extract UDISE code from the format "School Name (UDISE: 12345678901)"
        match = re.search(r'\(UDISE:\s*(\d+)\)', choice)
        return match.group(1) if match else ""

    suggestions_dropdown.change(
        on_select_suggestion,
        inputs=[suggestions_dropdown],
        outputs=[udise_input]
    )

    def on_lookup_udise(udise_code):
        """Look the code up through KYS; return (raw result dict, results table)."""
        code = (udise_code or "").strip()
        # Enforce the length range the error message promises (the same
        # range search_workflow applies before calling KYS).
        if not (code.isdigit() and 6 <= len(code) <= 14):
            return {"ok": False, "error": "Provide a numeric UDISE code (6-14 digits)."}, pd.DataFrame()
        kys_res = call_kys_by_udise(code)
        df = pd.DataFrame()
        if kys_res.get("ok") and kys_res.get("data"):
            df = to_table_from_kys(kys_res["data"])
        return kys_res, df

    lookup_btn.click(on_lookup_udise, inputs=[udise_input], outputs=[kys_output_json, kys_table])

    # Toggle visibility handler for raw JSON outputs
    def toggle_raw(visible: bool):
        """Show or hide both raw JSON panels together."""
        return gr.update(visible=visible), gr.update(visible=visible)

    show_raw_checkbox.change(toggle_raw, inputs=[show_raw_checkbox], outputs=[output_json, kys_output_json])
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()