Spaces:
Running
Running
| # ==================================================== | |
| # web_search.py | |
| # ==================================================== | |
| """ | |
| Minimal Tavily wrapper: run a web search and return a list of UDISE codes. | |
| Enhancement: | |
| - Optional village support for more precise queries | |
| """ | |
| import os | |
| import re | |
| from typing import List, Optional | |
| # optional Tavily SDK | |
| try: | |
| from tavily import TavilyClient | |
| except Exception: | |
| TavilyClient = None | |
| # ---------------------------------------------------- | |
| # State → UDISE prefix mapping | |
| # ---------------------------------------------------- | |
# Display name of each state / union territory -> the two-digit code that
# forms the first two characters of a UDISE code. Insertion order follows the
# numeric code order.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Captures a run of exactly 11 digits in group 1. The lookbehind/lookahead
# reject runs that are part of a longer digit string, so a 12-digit number
# never yields an 11-digit match.
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
| # ---------------------------------------------------- | |
| # Query builder (Village-aware) | |
| # ---------------------------------------------------- | |
def _build_query(
    school_name: Optional[str],
    state_name: Optional[str],
    district: Optional[str],
    village: Optional[str] = None,
) -> str:
    """Compose the free-text search query for a school's UDISE code.

    The query always starts with "UDISE code of" and then appends, in this
    order, whichever of school, village, district and state were supplied,
    each preceded by its label.

    Args:
        school_name: School name; labelled "School".
        state_name: State name; labelled "state".
        district: District name; labelled "district".
        village: Optional village name; labelled "in village".

    Returns:
        The single-line query string. Fields that are None, empty or
        whitespace-only are left out entirely.
    """
    labelled_fields = (
        ("School", school_name),
        ("in village", village),
        ("district", district),
        ("state", state_name),
    )
    parts = ["UDISE code of"]
    for label, value in labelled_fields:
        # Strip *before* the emptiness test so whitespace-only input does not
        # leave a dangling label (e.g. "... in village") in the query.
        text = value.strip() if value else ""
        if text:
            parts.append(f"{label} {text}")
    return " ".join(parts)
def _call_tavily(api_key: Optional[str], query: str) -> dict:
    """Run one Tavily search and report the outcome as a status dict.

    Args:
        api_key: Tavily API key. When falsy, the TAVILY_API_KEY environment
            variable is used instead.
        query: Search query text passed to the client's ``search`` call.

    Returns:
        ``{"ok": True, "data": <response>}`` on success, otherwise
        ``{"ok": False, "error": <message>}``. This function does not raise
        for a missing key, a missing ``tavily`` package, or a failed search;
        all of those are reported through the "error" entry.
    """
    # Function-local import keeps the file's top-level import list untouched.
    import logging

    log = logging.getLogger(__name__)

    key = api_key or os.getenv("TAVILY_API_KEY")
    if not key:
        return {"ok": False, "error": "No Tavily API key provided."}
    # TavilyClient is None when the optional SDK import at module top failed.
    if TavilyClient is None:
        return {"ok": False, "error": "tavily package not installed."}

    log.debug("Tavily query: %s", query)
    try:
        client = TavilyClient(key)
        resp = client.search(query=query, country="india")
    except Exception as e:
        # Boundary handler: any SDK/network failure becomes an error result.
        # Log it here because callers may drop the message.
        log.warning("Tavily search failed: %s", e)
        return {"ok": False, "error": str(e)}
    # Debug-level logging replaces the former unconditional print() of the
    # whole response to stdout.
    log.debug("Tavily response: %r", resp)
    return {"ok": True, "data": resp}
def _state_name_forms(name: str) -> set:
    """Return the comparison forms of a state name (lowercase ASCII letters only).

    Two forms are produced: one with "&" simply dropped and one with "&"
    spelled out as "and", so "X & Y", "X and Y" and "X Y" can all be matched.
    """
    ampersand_dropped = re.sub(r"[^A-Za-z]", "", name).lower()
    ampersand_spelled = re.sub(r"[^A-Za-z]", "", name.replace("&", " and ")).lower()
    return {ampersand_dropped, ampersand_spelled}


def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
    """Map a free-form state name onto its key in STATE_TO_UDISE_CODE.

    Matching ignores case, whitespace, punctuation and digits, and treats
    "&" and the word "and" as interchangeable (the table's keys use "&").

    Args:
        state_name: State name as supplied by the caller; may be None.

    Returns:
        The matching key exactly as it is spelled in STATE_TO_UDISE_CODE,
        or None when the input is empty or matches no key.
    """
    if not state_name:
        return None
    wanted = _state_name_forms(state_name)
    for key in STATE_TO_UDISE_CODE:
        # Any shared form counts as a match. Key forms are never empty, so an
        # input with no letters at all cannot match.
        if wanted & _state_name_forms(key):
            return key
    return None
| # ---------------------------------------------------- | |
| # Public API | |
| # ---------------------------------------------------- | |
def _result_snippets(raw) -> List[str]:
    """Flatten a search response into a list of text snippets to scan for codes."""
    if not isinstance(raw, dict):
        if isinstance(raw, list):
            return [str(entry) for entry in raw]
        return [str(raw)]

    # "data" may be absent, None, or not a dict; only descend into it when it
    # really is a dict, so a malformed response cannot raise AttributeError.
    nested = raw.get("data")
    nested_results = nested.get("results") if isinstance(nested, dict) else None
    candidates = raw.get("results") or nested_results or raw.get("items") or []

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            fields = (
                item.get("title", ""),
                item.get("content", ""),
                item.get("text", ""),
                item.get("url", ""),
            )
            snippets.append(" ".join(str(value) for value in fields))
        else:
            snippets.append(str(item))
    return snippets


def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    village: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True,
) -> List[str]:
    """Search the web via Tavily and return the UDISE codes found in the results.

    Args:
        school_name: School to look up. None, empty or whitespace-only
            returns [] without performing a search.
        state_name: Optional state name; used in the query and, when it maps
            to a known state, for prefix filtering.
        district: Optional district name; used only in the query.
        village: Optional village name; used only in the query.
        api_key: Optional Tavily API key, forwarded to the search call.
        enforce_state_prefix: When True and ``state_name`` is recognised,
            only codes starting with that state's two-digit prefix are kept.

    Returns:
        Unique 11-digit codes in order of first appearance. Codes whose first
        two digits are not a known state prefix are always dropped. A failed
        search (no key, SDK missing, request error) yields [].
    """
    if not school_name or not school_name.strip():
        return []

    query = _build_query(school_name, state_name, district, village)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        return []
    snippets = _result_snippets(call.get("data") or {})

    # Build the prefix set once instead of scanning dict values per match.
    valid_prefixes = frozenset(STATE_TO_UDISE_CODE.values())
    allowed_prefix = None
    state_key = _normalize_state_key(state_name)
    if enforce_state_prefix and state_key:
        allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    found, seen = [], set()
    for text in snippets:
        for match in _UDISE_RE.finditer(text):
            code = match.group(1)
            if code in seen:
                continue
            prefix = code[:2]
            # Reject 11-digit numbers that cannot be UDISE codes at all.
            if prefix not in valid_prefixes:
                continue
            # Reject codes belonging to a different state than requested.
            if allowed_prefix and prefix != allowed_prefix:
                continue
            seen.add(code)
            found.append(code)
    return found