# findschool / app.py
# Page header residue: gkdivya - "Update app.py" - b5de4f2 (verified)
# search_kys_space.py
# Gradio app for Hugging Face Spaces (uses Search API via Tavily SDK internally)
import gradio as gr
import requests
import pandas as pd
import json
import re
from difflib import get_close_matches
from tavily import TavilyClient
# URL template for the KYS school-search endpoint on kys.udiseplus.gov.in.
# `{udise}` is substituted with the code being looked up.
# NOTE(review): searchType=3 presumably selects "search by UDISE code" - confirm.
KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"

# State / union-territory name -> two-digit prefix that starts every UDISE
# code issued in that state. Keys are title-cased and use "&" (not "and").
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Flat list of the known prefixes, kept for backward compatibility with code
# that only needs a membership check.
VALID_UDISE_STATE_CODES = [*STATE_TO_UDISE_CODE.values()]
def is_valid_udise(code, state_name=None):
    """Return True if *code* is a well-formed UDISE code.

    A code is accepted when it is a string of exactly 11 ASCII digits whose
    first two digits appear in ``VALID_UDISE_STATE_CODES``. When *state_name*
    is given, that two-digit prefix must also equal the state's entry in
    ``STATE_TO_UDISE_CODE``.

    Args:
        code: Candidate UDISE code. Anything that is not a string is rejected.
        state_name: Optional state name. Matching ignores case, surrounding
            and repeated whitespace, underscores, and "and" vs "&".

    Returns:
        bool: True if the code is valid (and matches the state, if given).
    """
    # fullmatch on [0-9] rather than str.isdigit(): isdigit() is also true for
    # non-ASCII digits (e.g. superscripts), which can never be a real code.
    if not isinstance(code, str) or not re.fullmatch(r"[0-9]{11}", code):
        return False

    state_code = code[:2]
    if state_code not in VALID_UDISE_STATE_CODES:
        return False

    if state_name:
        # Normalise to the key style used by STATE_TO_UDISE_CODE: underscores
        # become spaces, whitespace is collapsed, words are title-cased and
        # "And" is written as "&" ("Jammu and Kashmir" -> "Jammu & Kashmir").
        state_name = " ".join(state_name.replace('_', ' ').split()).title()
        state_name = state_name.replace(" And ", " & ")
        expected_code = STATE_TO_UDISE_CODE.get(state_name)
        if not expected_code:
            print(f"Warning: Unknown state name: {state_name}")
            return False
        if state_code != expected_code:
            print(f"UDISE code {code} state code {state_code} does not match expected state {state_name} ({expected_code})")
            return False
    return True
# State choices offered in the UI dropdown. The spellings (underscore in the
# first entry, lower-case second word in some) are kept exactly as they are:
# is_valid_udise() normalises them before looking up the state code.
STATES = [
    "Arunachal_pradesh", "Assam", "Bihar", "Chhattisgarh",
    "Jharkhand", "Karnataka", "Madhya pradesh", "Manipur",
    "Meghalaya", "Mizoram", "Nagaland", "Odisha",
    "Puducherry", "Rajasthan", "Sikkim", "Telangana",
    "Tripura", "Uttar pradesh", "Uttarakhand",
]
def call_kys_by_udise(udise_code):
    """Look up *udise_code* on the KYS search endpoint.

    Returns a dict that always carries ``ok`` and ``url``. On success it also
    holds the decoded JSON body under ``data``; if the request, the HTTP
    status check or the JSON decoding raises, it holds the exception text
    under ``error`` instead.
    """
    endpoint = KYS_SAMPLE.format(udise=udise_code)
    try:
        response = requests.get(endpoint, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        return {"ok": False, "error": str(exc), "url": endpoint}
    else:
        return {"ok": True, "url": endpoint, "data": payload}
def call_search_sdk(api_key, payload_text):
    """Run *payload_text* as a query through the Tavily client.

    Returns ``{"ok": True, "data": <search response>}`` on success, or
    ``{"ok": False, "error": <message>}`` if creating the client or running
    the search raises.
    """
    try:
        tavily = TavilyClient(api_key)
        response = tavily.search(query=payload_text)
    except Exception as exc:
        return {"ok": False, "error": str(exc)}
    else:
        return {"ok": True, "data": response}
# Patterns that locate an 11-digit UDISE code in free text, tried from the
# most specific (explicit "UDISE ... code" label) down to a bare 11-digit run.
_UDISE_CODE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'UDISE[^\d]*(?:code|Code|CODE)[^\d]*(\d{11})(?![0-9])',
        r'Udise[^\d]*(?:School[^\d]*Code|Code)[^\d]*(\d{11})(?![0-9])',
        r'(?<![0-9])(\d{11})(?![0-9])',  # Fallback: any 11-digit number
    )
)

# Trailing keywords that mark a phrase as school-like. Order matters: it is
# the order in which the regex alternation is tried.
_SCHOOL_KEYWORDS = (
    "School", "High School", "Vidyalaya", "Vidyalayam", "Vidhya", "Vidya",
    "Public School", "Govt", "Government", "Kendriya", "Jawahar", "Navodaya",
    "Sainik", "Army", "Air Force", "Navy", "Central School", "CBSE", "ICSE",
    "State Board", "EM", "TM", "Primary", "Upper Primary", "Higher Secondary",
    "HSS", "HS", "UPS", "PS",
)

# Two or more words followed by one of the keywords above.
# NOTE(review): IGNORECASE makes [A-Z][a-z]+ match any alphabetic word, so
# this is looser than the "capitalised words" shape it appears to describe.
_SCHOOL_NAME_PATTERN = re.compile(
    r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s*(?:' + "|".join(_SCHOOL_KEYWORDS) + r'))',
    re.IGNORECASE,
)


def _guess_school_name(title, content):
    """Return the hit's title, or a school-like phrase from *content* if the title looks generic."""
    looks_generic = len(title.split()) < 2 or any(
        word in title.lower() for word in ('udise', 'code', 'school')
    )
    if looks_generic:
        found = _SCHOOL_NAME_PATTERN.search(content)
        if found:
            # group(1) is the whole matched phrase. The previous code indexed
            # a findall() string result and kept only its first character.
            return found.group(1).strip()
    return title


def _rank_by_similarity(school_info, search_query):
    """Order *school_info* by fuzzy similarity of each name to *search_query*, keeping every entry."""
    # Group by lowercase name: several UDISE codes often share one title, and
    # a plain name -> entry dict would silently drop all but the last one.
    by_name = {}
    for info in school_info:
        by_name.setdefault(info['name'].lower(), []).append(info)

    ranked_names = get_close_matches(
        search_query.lower(),
        list(by_name),
        n=len(by_name),
        cutoff=0.3,  # Lower cutoff to allow more fuzzy matches
    )

    ordered = []
    for name in ranked_names:
        ordered.extend(by_name.pop(name))
    # Names below the cutoff keep their original relative order at the end.
    for leftovers in by_name.values():
        ordered.extend(leftovers)
    return ordered


def extract_udise_candidates_from_search(search_json, state_name=None, search_query=None):
    """Extract UDISE codes and school information from Tavily search results.

    Each result's title and content are scanned for 11-digit codes. Every
    distinct code that passes ``is_valid_udise`` (against *state_name*, when
    given) yields one entry. If *search_query* is given, entries are ordered
    by fuzzy similarity of their name to the query; no entry is dropped.

    Args:
        search_json: Search response dict. Results are read from
            ``search_json['results']`` or ``search_json['data']['results']``.
        state_name: Optional state name to validate UDISE codes against.
        search_query: Optional original query, used only for ordering.

    Returns:
        list: Dicts with keys ``udise``, ``name``, ``source`` (result URL)
        and ``snippet`` (content truncated to 200 characters). Empty when
        nothing valid is found.
    """
    print("\n===== Extracting UDISE Codes =====")
    if state_name:
        print(f"Validating UDISE codes against state: {state_name}")

    if not search_json or not isinstance(search_json, dict):
        print("Invalid search JSON")
        return []

    results = search_json.get('results')
    if not results:
        # Only descend into 'data' when it really is a dict; a list or None
        # there used to raise AttributeError.
        nested = search_json.get('data')
        results = nested.get('results') if isinstance(nested, dict) else None
    if not results:
        print("No results found in search JSON")
        return []
    print(f"Found {len(results)} search results")

    seen_codes = set()  # every code already examined, valid or not
    school_info = []
    for result in results:
        if not isinstance(result, dict):
            continue
        # `or ''` also covers keys that are present but set to None.
        title = result.get('title') or ''
        content = result.get('content') or ''
        url = result.get('url') or ''
        text = f"{title} {content}"

        for pattern in _UDISE_CODE_PATTERNS:
            for match in pattern.finditer(text):
                udise_code = match.group(1)
                if udise_code in seen_codes:
                    continue
                # Validate each distinct code once, so the same rejection is
                # not re-checked and re-logged for every pattern.
                seen_codes.add(udise_code)
                if not is_valid_udise(udise_code, state_name):
                    continue
                print(f"Found valid UDISE code: {udise_code}")
                school_info.append({
                    'udise': udise_code,
                    'name': _guess_school_name(title, content),
                    'source': url,
                    'snippet': content[:200] + '...' if len(content) > 200 else content,
                })

    if search_query and school_info:
        school_info = _rank_by_similarity(school_info, search_query)

    if not school_info:
        print("No valid school information found with UDISE codes")
        return []
    return school_info
def json_to_table(obj):
    """Flatten a JSON-like object into a DataFrame.

    A list is normalised directly. For a dict, the first of the keys
    ``results``, ``data``, ``hits``, ``items`` that holds a list is
    normalised; if none does, the dict itself becomes a single row. Any other
    input, or any error during normalisation, yields an empty DataFrame.
    """
    if isinstance(obj, list):
        records = obj
    elif isinstance(obj, dict):
        list_keys = ("results", "data", "hits", "items")
        records = next(
            (obj[key] for key in list_keys if isinstance(obj.get(key), list)),
            [obj],
        )
    else:
        return pd.DataFrame()

    try:
        return pd.json_normalize(records)
    except Exception:
        # Best-effort conversion: callers only need "a table, possibly empty".
        return pd.DataFrame()
def to_table_from_kys(kys_json):
    """Build a slim DataFrame from a KYS response.

    The row list is taken from the first of ``data.content``,
    ``data.data.content`` or top-level ``content`` that is a list. Each row
    contributes six columns (school name, id, pincode, state, district and
    management type). A missing or empty list, or any error while reading it,
    yields an empty DataFrame.
    """
    # Output column label -> key in each KYS content row.
    column_keys = {
        "School Name": "schoolName",
        "School ID": "schoolId",
        "Pincode": "pincode",
        "State": "stateName",
        "District": "districtName",
        "Management Type": "schMgmtType",
    }
    try:
        entries = None
        if isinstance(kys_json, dict):
            payload = kys_json.get("data")
            nested = payload.get("data") if isinstance(payload, dict) else None
            if isinstance(payload, dict) and isinstance(payload.get("content"), list):
                entries = payload["content"]
            elif isinstance(nested, dict) and isinstance(nested.get("content"), list):
                entries = nested["content"]
            elif isinstance(kys_json.get("content"), list):
                entries = kys_json["content"]

        if not entries:
            return pd.DataFrame()

        return pd.DataFrame([
            {label: row.get(key) for label, key in column_keys.items()}
            for row in entries
        ])
    except Exception as e:
        print("to_table_from_kys error:", e)
        return pd.DataFrame()
def search_workflow(school_name, state_name, search_key, use_search=True, use_kys=True):
    """Run the web search and/or the direct KYS lookup for one user query.

    Args:
        school_name: School name, or a numeric UDISE code.
        state_name: State name; added to the search query and used to
            validate the codes found.
        search_key: API key passed to the search SDK.
        use_search: When False, the search is skipped and recorded as disabled.
        use_kys: When True and *school_name* is 6-14 digits, KYS is queried
            with it directly.

    Returns:
        dict with keys ``kys``, ``search``, ``suggestions``,
        ``first_candidate`` and ``school_info``.
    """
    result = {
        "kys": None,
        "search": None,
        "suggestions": [],
        "first_candidate": None,
        "school_info": [],
    }
    query = f"{school_name or ''} {state_name or ''} UDISE code".strip()

    if not use_search:
        result["search"] = {"ok": False, "error": "Search disabled or SDK not used"}
    else:
        response = call_search_sdk(search_key, query)
        result["search"] = response
        if response.get("ok"):
            # school_name drives the fuzzy ordering, state_name the validation.
            schools = extract_udise_candidates_from_search(
                response["data"],
                state_name=state_name,
                search_query=school_name,
            )
            codes = [entry['udise'] for entry in schools]
            result["suggestions"] = [
                f"{entry['name']} (UDISE: {entry['udise']})" for entry in schools
            ]
            result["school_info"] = schools
            if codes and codes[0] != "No UDISE codes found":
                result["first_candidate"] = codes[0]

    if use_kys and school_name:
        digits = school_name.strip()
        if digits.isdigit() and 6 <= len(digits) <= 14:
            result["kys"] = call_kys_by_udise(digits)

    return result
# Gradio UI: a search form, a table of search hits, a dropdown of UDISE
# candidates, and a separate KYS lookup for a chosen or typed code.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Find School UDISE Code
Provide your API key in the textbox.
Enter a school name and select the state
"""
    )
    # NOTE(review): the source lost its indentation, so which inputs sit
    # inside this Row is a reconstruction - confirm against the deployed app.
    with gr.Row():
        inp = gr.Textbox(label="School name or UDISE code", placeholder="e.g. GOVT SEC SCHOOL DARLONG or 12345678901", lines=1)
        state_dropdown = gr.Dropdown(choices=STATES, label="State", value=STATES[0] if STATES else "", interactive=True, allow_custom_value=True)
    search_key = gr.Textbox(label="Search API Key (required)", placeholder="api-key...", lines=1)
    run = gr.Button("Search", variant="primary")

    # Raw JSON outputs start hidden; `show_raw_checkbox` toggles them.
    show_raw_checkbox = gr.Checkbox(value=False, label="Show raw JSON outputs")
    output_json = gr.JSON(label="Raw Search Output (JSON)", visible=False)
    search_table = gr.DataFrame(headers=None, label="Search results (table)")

    gr.Markdown("### UDISE candidates found in Search results")
    suggestions_dropdown = gr.Dropdown(choices=[], label="UDISE candidates (from Search)")
    udise_input = gr.Textbox(label="UDISE to lookup (editable)", placeholder="Pick a candidate or type a UDISE code...", lines=1)
    lookup_btn = gr.Button("Lookup UDISE (Call KYS)")
    kys_output_json = gr.JSON(label="KYS Raw Output", visible=False)
    kys_table = gr.DataFrame(headers=None, label="KYS results (table)")

    # Remembers the last API key used, so the textbox may be cleared later.
    saved_key_state = gr.State("")

    def on_run(school, state, key, saved_key):
        """Run the search and return values for the six outputs wired to `run`.

        Returns, in order: raw search result, search table, dropdown update,
        first UDISE candidate (or ""), the key to remember, raw KYS result.
        """
        # A key typed now must win over the remembered one. Preferring the
        # saved key kept using a stale key for the run that tried to fix it.
        effective_key = (key or "").strip() or saved_key
        res = search_workflow(school, state, effective_key, use_search=True, use_kys=True)

        tbl = pd.DataFrame()
        if res.get("search") and res["search"].get("ok"):
            tbl = json_to_table(res["search"]["data"])

        school_info = res.get("school_info", [])
        if school_info:
            # Format suggestions as "School Name (UDISE: 12345678901)".
            suggestions = [
                f"{info['name']} (UDISE: {info['udise']})"
                for info in school_info
            ]
            first_candidate = school_info[0]['udise']
        else:
            suggestions = ["No matching schools found"]
            first_candidate = ""

        return (
            res.get("search"),               # output_json
            tbl,                             # search_table
            gr.update(choices=suggestions),  # suggestions_dropdown
            first_candidate,                 # udise_input
            effective_key,                   # saved_key_state
            res.get("kys"),                  # kys_output_json
        )

    run.click(
        on_run,
        inputs=[inp, state_dropdown, search_key, saved_key_state],
        outputs=[
            output_json,
            search_table,
            suggestions_dropdown,
            udise_input,
            saved_key_state,
            kys_output_json,
        ],
    )

    def on_select_suggestion(choice):
        """Return the UDISE code embedded in a dropdown choice, or "" if there is none."""
        if not choice or choice in ["No matching schools found", "No UDISE codes found"]:
            return ""
        # Choices look like "School Name (UDISE: 12345678901)".
        match = re.search(r'\(UDISE:\s*(\d+)\)', choice)
        if match:
            return match.group(1)
        return ""

    suggestions_dropdown.change(
        on_select_suggestion,
        inputs=[suggestions_dropdown],
        outputs=[udise_input],
    )

    def on_lookup_udise(udise_code):
        """Query KYS for *udise_code*; return (raw result, table of the result)."""
        code = (udise_code or "").strip()
        # Enforce the 6-14 digit range that the message states and that
        # search_workflow() already applies before calling KYS.
        if not (code.isdigit() and 6 <= len(code) <= 14):
            return {"ok": False, "error": "Provide a numeric UDISE code (6-14 digits)."}, pd.DataFrame()
        kys_res = call_kys_by_udise(code)
        df = pd.DataFrame()
        if kys_res.get("ok"):
            df = to_table_from_kys(kys_res["data"]) if kys_res.get("data") else pd.DataFrame()
        return kys_res, df

    lookup_btn.click(on_lookup_udise, inputs=[udise_input], outputs=[kys_output_json, kys_table])

    def toggle_raw(visible: bool):
        """Show or hide both raw JSON panels together."""
        return gr.update(visible=visible), gr.update(visible=visible)

    show_raw_checkbox.change(toggle_raw, inputs=[show_raw_checkbox], outputs=[output_json, kys_output_json])

if __name__ == "__main__":
    demo.launch()