# search_kys_space.py
# Gradio app for Hugging Face Spaces (uses Search API via Tavily SDK internally)
"""Find a school's UDISE code via web search and look it up on the KYS portal.

The app runs a Tavily web search for "<school> <state> UDISE code", pulls
11-digit UDISE candidates out of the result titles/snippets, and lets the user
look a chosen code up through the KYS (Know Your School) search endpoint.
"""

import json
import re
from difflib import get_close_matches

import gradio as gr
import pandas as pd
import requests
from tavily import TavilyClient

KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"

# Mapping of state names to UDISE state codes
STATE_TO_UDISE_CODE = {
    'Jammu & Kashmir': '01',
    'Himachal Pradesh': '02',
    'Punjab': '03',
    'Chandigarh': '04',
    'Uttarakhand': '05',
    'Haryana': '06',
    'Delhi': '07',
    'Rajasthan': '08',
    'Uttar Pradesh': '09',
    'Bihar': '10',
    'Sikkim': '11',
    'Arunachal Pradesh': '12',
    'Nagaland': '13',
    'Manipur': '14',
    'Mizoram': '15',
    'Tripura': '16',
    'Meghalaya': '17',
    'Assam': '18',
    'West Bengal': '19',
    'Jharkhand': '20',
    'Odisha': '21',
    'Chhattisgarh': '22',
    'Madhya Pradesh': '23',
    'Gujarat': '24',
    'Daman & Diu': '25',
    'Dadra & Nagar Haveli': '26',
    'Maharashtra': '27',
    'Andhra Pradesh': '28',
    'Karnataka': '29',
    'Goa': '30',
    'Lakshadweep': '31',
    'Kerala': '32',
    'Tamil Nadu': '33',
    'Puducherry': '34',
    'Andaman & Nicobar Islands': '35',
    'Telangana': '36',
    'Ladakh': '37',
}

# For backward compatibility
VALID_UDISE_STATE_CODES = list(STATE_TO_UDISE_CODE.values())

# Patterns used to pull 11-digit UDISE codes out of free text. Compiled once at
# import time because they are applied to every search result.
# NOTE(review): the third pattern (and anything that followed it in the list)
# was truncated in the source; it is reconstructed as a bare "11 digits not
# adjacent to other digits" matcher -- confirm against the original file.
_UDISE_PATTERNS = [
    re.compile(r'UDISE[^\d]*(?:code|Code|CODE)[^\d]*(\d{11})(?![0-9])'),
    re.compile(r'Udise[^\d]*(?:School[^\d]*Code|Code)[^\d]*(\d{11})(?![0-9])'),
    re.compile(r'(?<!\d)(\d{11})(?!\d)'),
]

# Heuristic matcher for "Some Words <school-ish suffix>" inside a snippet.
# Alternatives are tried left to right, so their order is kept as in the
# original expression (duplicates removed, which does not change matching).
_SCHOOL_NAME_RE = re.compile(
    r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s*'
    r'(?:School|High School|Vidyalaya|Vidyalayam|Vidhya|Vidya|Public School'
    r'|Govt|Government|Kendriya|Jawahar|Navodaya|Sainik|Army|Air Force|Navy'
    r'|Central School|CBSE|ICSE|State Board|EM|TM|Primary|Upper Primary'
    r'|Higher Secondary|HSS|HS|UPS|PS))',
    re.IGNORECASE,
)


def is_valid_udise(code, state_name=None):
    """
    Check if a string is a valid UDISE code.

    Args:
        code: The UDISE code to validate
        state_name: Optional state name to validate against the UDISE state code

    Returns:
        bool: True if the code is valid, False otherwise
    """
    # Basic validation: exactly 11 ASCII digits (non-strings are never valid).
    if not isinstance(code, str) or not re.fullmatch(r'[0-9]{11}', code):
        return False

    # The first two digits identify the state.
    state_code = code[:2]
    if state_code not in VALID_UDISE_STATE_CODES:
        return False

    # If state_name is provided, validate against it
    if state_name:
        # Dropdown values look like 'Arunachal_pradesh' / 'Uttar pradesh';
        # normalise separators, spacing and case before the lookup.
        state_name = " ".join(state_name.replace('_', ' ').split()).title()
        expected_code = STATE_TO_UDISE_CODE.get(state_name)
        if not expected_code:
            print(f"Warning: Unknown state name: {state_name}")
            return False
        if state_code != expected_code:
            print(f"UDISE code {code} state code {state_code} does not match expected state {state_name} ({expected_code})")
            return False

    return True


STATES = [
    "Arunachal_pradesh", "Assam", "Bihar", "Chhattisgarh", "Jharkhand",
    "Karnataka", "Madhya pradesh", "Manipur", "Meghalaya", "Mizoram",
    "Nagaland", "Odisha", "Puducherry", "Rajasthan", "Sikkim", "Telangana",
    "Tripura", "Uttar pradesh", "Uttarakhand"
]


def _looks_like_udise(text):
    """Return True if *text* is a 6-14 digit string (the range the UI advertises)."""
    return bool(text) and text.isdigit() and 6 <= len(text) <= 14


def call_kys_by_udise(udise_code):
    """
    Query the KYS search endpoint for a UDISE code.

    Args:
        udise_code: Code substituted into the KYS_SAMPLE URL template

    Returns:
        dict: {"ok": True, "url", "data"} on success, or
              {"ok": False, "error", "url"} if the request or JSON decoding fails
    """
    url = KYS_SAMPLE.format(udise=udise_code)
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body; RequestException covers
        # connection errors, timeouts and HTTP error statuses.
        return {"ok": False, "error": str(e), "url": url}
    return {"ok": True, "url": url, "data": data}


def call_search_sdk(api_key, payload_text):
    """
    Run a Tavily search and wrap the outcome in an ok/error dict.

    Args:
        api_key: Tavily API key
        payload_text: Query text passed to TavilyClient.search

    Returns:
        dict: {"ok": True, "data": <SDK response>} or {"ok": False, "error": str}
    """
    try:
        client = TavilyClient(api_key)
        resp = client.search(query=payload_text)
        return {"ok": True, "data": resp}
    except Exception as e:
        # Deliberately broad: this is the UI boundary and any SDK failure
        # (bad key, network, quota) should surface as an error payload.
        return {"ok": False, "error": str(e)}


def _guess_school_name(title, content):
    """Return the result title, or a school-like phrase from *content* if the title looks generic."""
    looks_generic = len(title.split()) < 2 or any(
        word in title.lower() for word in ('udise', 'code', 'school')
    )
    if looks_generic:
        names = _SCHOOL_NAME_RE.findall(content)
        if names:
            # findall() with a single group yields plain strings, so take the
            # whole first match (indexing it again would keep one character).
            return names[0].strip()
    return title


def _rank_by_query(school_info, search_query):
    """Order entries by fuzzy similarity of their name to *search_query*; unmatched entries keep their order at the end."""
    # Group by lowercased name so schools sharing a name are all kept.
    by_name = {}
    for info in school_info:
        by_name.setdefault(info['name'].lower(), []).append(info)

    ranked_names = get_close_matches(
        search_query.lower(),
        list(by_name),
        n=len(by_name),
        cutoff=0.3,  # Lower cutoff to allow more fuzzy matches
    )

    ordered = []
    for name in ranked_names:
        ordered.extend(by_name.pop(name))
    # Add any remaining schools that didn't match the fuzzy search
    for leftovers in by_name.values():
        ordered.extend(leftovers)
    return ordered


def extract_udise_candidates_from_search(search_json, state_name=None, search_query=None):
    """
    Extract UDISE codes and school information from Tavily search results.

    Args:
        search_json: JSON response from Tavily search
        state_name: Optional state name to validate UDISE codes against
        search_query: Original search query to help with fuzzy matching

    Returns:
        list: List of dictionaries containing UDISE codes and school information
              (keys: 'udise', 'name', 'source', 'snippet')
    """
    print("\n===== Extracting UDISE Codes =====")
    if state_name:
        print(f"Validating UDISE codes against state: {state_name}")

    # Check if we have valid search results
    if not search_json or not isinstance(search_json, dict):
        print("Invalid search JSON")
        return []

    results = search_json.get('results')
    if not results:
        # Some callers pass a wrapper whose results live under 'data'.
        nested = search_json.get('data')
        results = nested.get('results') if isinstance(nested, dict) else None
    if not results:
        print("No results found in search JSON")
        return []

    print(f"Found {len(results)} search results")

    found_codes = set()
    school_info = []  # List to store school information

    # NOTE(review): this loop header was lost in the source; it is rebuilt from
    # the names used below (title, content, url, match) -- confirm the result
    # field names against the Tavily response schema.
    for result in results:
        if not isinstance(result, dict):
            continue
        title = str(result.get('title') or '')
        content = str(result.get('content') or '')
        url = result.get('url') or ''
        # Newline separator so digits at the end of the title cannot fuse
        # with digits at the start of the content.
        text = f"{title}\n{content}"

        for pattern in _UDISE_PATTERNS:
            for match in pattern.finditer(text):
                udise_code = match.group(1) if match.groups() else match.group(0)
                if not udise_code or udise_code in found_codes:
                    continue
                if not is_valid_udise(udise_code, state_name):
                    continue

                print(f"Found valid UDISE code: {udise_code}")
                found_codes.add(udise_code)
                school_info.append({
                    'udise': udise_code,
                    'name': _guess_school_name(title, content),
                    'source': url,
                    'snippet': content[:200] + '...' if len(content) > 200 else content,
                })

    # If we have a search query, sort results by relevance to the query
    if search_query and school_info:
        school_info = _rank_by_query(school_info, search_query)

    if not school_info:
        print("No valid school information found with UDISE codes")
        return []

    return school_info


def json_to_table(obj):
    """
    Flatten a JSON-like object into a DataFrame.

    A list is normalised directly; for a dict the first list found under
    'results', 'data', 'hits' or 'items' is used, otherwise the dict itself
    becomes a single row. Anything else (or a normalisation error) yields an
    empty DataFrame.
    """
    try:
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for k in ("results", "data", "hits", "items"):
                if isinstance(obj.get(k), list):
                    return pd.json_normalize(obj[k])
            return pd.json_normalize([obj])
    except Exception as e:
        # Best effort: the table is a convenience view, never fail the UI.
        print("json_to_table error:", e)
    return pd.DataFrame()


def to_table_from_kys(kys_json):
    """
    Convert KYS JSON wrapper into a simplified pandas DataFrame showing only
    selected fields from the `content` list.

    The `content` list is looked up, in order, at data.content,
    data.data.content and top-level content. Returns an empty DataFrame when
    none is found or on any error.
    """
    try:
        candidates = []
        if isinstance(kys_json, dict):
            inner = kys_json.get("data")
            if isinstance(inner, dict):
                candidates.append(inner.get("content"))
                nested = inner.get("data")
                if isinstance(nested, dict):
                    candidates.append(nested.get("content"))
            candidates.append(kys_json.get("content"))

        content = next((c for c in candidates if isinstance(c, list)), None)
        if not content:
            return pd.DataFrame()

        rows = [
            {
                "School Name": r.get("schoolName"),
                "School ID": r.get("schoolId"),
                "Pincode": r.get("pincode"),
                "State": r.get("stateName"),
                "District": r.get("districtName"),
                "Management Type": r.get("schMgmtType"),
            }
            for r in content
            if isinstance(r, dict)
        ]
        return pd.DataFrame(rows)
    except Exception as e:
        print("to_table_from_kys error:", e)
        return pd.DataFrame()


def search_workflow(school_name, state_name, search_key, use_search=True, use_kys=True):
    """
    Run the web search (and, for numeric input, a direct KYS lookup).

    Args:
        school_name: School name, or a UDISE code typed into the same box
        state_name: State used in the query and to validate candidate codes
        search_key: Tavily API key
        use_search: Whether to run the web search
        use_kys: Whether to call KYS directly when school_name is 6-14 digits

    Returns:
        dict: keys 'kys', 'search', 'suggestions', 'first_candidate', 'school_info'
    """
    out = {"kys": None, "search": None, "suggestions": [], "first_candidate": None, "school_info": []}

    school_text = (school_name or '').strip()
    # Dropdown values may use '_' as a word separator; search on plain words.
    state_text = (state_name or '').replace('_', ' ').strip()
    payload_text = " ".join(part for part in (school_text, state_text, "UDISE code") if part)

    if use_search:
        search_res = call_search_sdk(search_key, payload_text)
        out["search"] = search_res
        if search_res.get("ok"):
            # Pass school_name for fuzzy matching and state_name for validation
            school_info = extract_udise_candidates_from_search(
                search_res["data"],
                state_name=state_name,
                search_query=school_name,
            )
            out["suggestions"] = [
                f"{info['name']} (UDISE: {info['udise']})" for info in school_info
            ]
            out["school_info"] = school_info
            if school_info:
                out["first_candidate"] = school_info[0]['udise']
    else:
        out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}

    if use_kys and _looks_like_udise(school_text):
        out["kys"] = call_kys_by_udise(school_text)

    return out


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Find School UDISE Code
        Provide your API key in the textbox. Enter a school name and select the state
        """
    )
    # NOTE(review): which widgets sit inside this row was not recoverable from
    # the flattened source; adjust if the original layout differed.
    with gr.Row():
        inp = gr.Textbox(label="School name or UDISE code", placeholder="e.g. GOVT SEC SCHOOL DARLONG or 12345678901", lines=1)
        state_dropdown = gr.Dropdown(choices=STATES, label="State", value=STATES[0] if STATES else "", interactive=True, allow_custom_value=True)
    # Masked so the key is not shown on screen.
    search_key = gr.Textbox(label="Search API Key (required)", placeholder="api-key...", lines=1, type="password")
    run = gr.Button("Search", variant="primary")

    # By default hide raw JSON outputs; users can toggle visibility with `show_raw_checkbox`
    show_raw_checkbox = gr.Checkbox(value=False, label="Show raw JSON outputs")
    output_json = gr.JSON(label="Raw Search Output (JSON)", visible=False)
    search_table = gr.DataFrame(headers=None, label="Search results (table)")

    gr.Markdown("### UDISE candidates found in Search results")
    suggestions_dropdown = gr.Dropdown(choices=[], label="UDISE candidates (from Search)")
    udise_input = gr.Textbox(label="UDISE to lookup (editable)", placeholder="Pick a candidate or type a UDISE code...", lines=1)
    lookup_btn = gr.Button("Lookup UDISE (Call KYS)")
    kys_output_json = gr.JSON(label="KYS Raw Output", visible=False)
    kys_table = gr.DataFrame(headers=None, label="KYS results (table)")

    saved_key_state = gr.State("")

    def on_run(school, state, key, saved_key):
        """Run the search and return values for the six wired output components."""
        # A key typed now wins over the remembered one; the remembered key is
        # only a fallback for when the textbox is empty.
        typed_key = (key or "").strip()
        effective_key = typed_key or saved_key

        # Always enable both search and KYS by default
        res = search_workflow(school, state, effective_key, use_search=True, use_kys=True)

        tbl = pd.DataFrame()
        if res.get("search") and res["search"].get("ok"):
            tbl = json_to_table(res["search"]["data"])

        # Suggestions are already formatted as "School Name (UDISE: 12345678901)"
        suggestions = res.get("suggestions") or ["No matching schools found"]
        first_candidate = res.get("first_candidate") or ""

        return (
            res.get("search"),                # output_json
            tbl,                              # search_table
            gr.update(choices=suggestions),   # suggestions_dropdown choices
            first_candidate,                  # udise_input
            effective_key,                    # saved_key_state
            res.get("kys"),                   # kys_output_json
        )

    run.click(
        on_run,
        inputs=[inp, state_dropdown, search_key, saved_key_state],
        outputs=[
            output_json,
            search_table,
            suggestions_dropdown,
            udise_input,  # This will be updated with first_candidate
            saved_key_state,
            kys_output_json,
        ],
    )

    def on_select_suggestion(choice):
        """Return the UDISE code embedded in a "Name (UDISE: <digits>)" choice, or ""."""
        if not choice or choice in ["No matching schools found", "No UDISE codes found"]:
            return ""
        match = re.search(r'\(UDISE:\s*(\d+)\)', choice)
        return match.group(1) if match else ""

    suggestions_dropdown.change(
        on_select_suggestion,
        inputs=[suggestions_dropdown],
        outputs=[udise_input],
    )

    def on_lookup_udise(udise_code):
        """Call KYS for the given code; return (raw result dict, table DataFrame)."""
        code = (udise_code or "").strip()
        # Same 6-14 digit rule as search_workflow, matching the message below.
        if not _looks_like_udise(code):
            return {"ok": False, "error": "Provide a numeric UDISE code (6-14 digits)."}, pd.DataFrame()

        kys_res = call_kys_by_udise(code)
        df = pd.DataFrame()
        if kys_res.get("ok") and kys_res.get("data"):
            df = to_table_from_kys(kys_res["data"])
        return kys_res, df

    lookup_btn.click(on_lookup_udise, inputs=[udise_input], outputs=[kys_output_json, kys_table])

    # Toggle visibility handler for raw JSON outputs
    def toggle_raw(visible: bool):
        """Show or hide both raw JSON panels together."""
        return gr.update(visible=visible), gr.update(visible=visible)

    show_raw_checkbox.change(toggle_raw, inputs=[show_raw_checkbox], outputs=[output_json, kys_output_json])


if __name__ == "__main__":
    demo.launch()