# FindSchoolByName / web_search.py
# gkdivya's picture
# Update web_search.py
# 45d3479 verified
# ====================================================
# web_search.py
# ====================================================
"""
Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
Enhancement:
- Optional village support for more precise queries
"""
import os
import re
from typing import List, Optional
# optional Tavily SDK
try:
from tavily import TavilyClient
except Exception:
TavilyClient = None
# ----------------------------------------------------
# State → UDISE prefix mapping
# ----------------------------------------------------
# Maps a state / union-territory name to the two-character string expected at
# the start of its UDISE codes. tavily_search_codes() compares the first two
# digits of every candidate code against these values, and
# _normalize_state_key() resolves free-form state input to one of these keys.
STATE_TO_UDISE_CODE = {
"Jammu & Kashmir": "01", "Himachal Pradesh": "02", "Punjab": "03",
"Chandigarh": "04", "Uttarakhand": "05", "Haryana": "06", "Delhi": "07",
"Rajasthan": "08", "Uttar Pradesh": "09", "Bihar": "10", "Sikkim": "11",
"Arunachal Pradesh": "12", "Nagaland": "13", "Manipur": "14", "Mizoram": "15",
"Tripura": "16", "Meghalaya": "17", "Assam": "18", "West Bengal": "19",
"Jharkhand": "20", "Odisha": "21", "Chhattisgarh": "22",
"Madhya Pradesh": "23", "Gujarat": "24", "Daman & Diu": "25",
"Dadra & Nagar Haveli": "26", "Maharashtra": "27",
"Andhra Pradesh": "28", "Karnataka": "29", "Goa": "30",
"Lakshadweep": "31", "Kerala": "32", "Tamil Nadu": "33",
"Puducherry": "34", "Andaman & Nicobar Islands": "35",
"Telangana": "36", "Ladakh": "37",
}
# strict 11-digit UDISE match
# Group 1 captures a run of exactly 11 digits; the lookbehind/lookahead reject
# runs that are part of a longer digit sequence (e.g. a 12-digit number).
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
# ----------------------------------------------------
# Query builder (Village-aware)
# ----------------------------------------------------
def _build_query(
    school_name: Optional[str],
    state_name: Optional[str],
    district: Optional[str],
    village: Optional[str] = None,
) -> str:
    """Compose the natural-language search query for a school's UDISE code.

    Each supplied component is appended, in a fixed order, after the lead-in
    ``"UDISE code of"`` together with its label ("School", "in village",
    "district", "state"). Components that are ``None``, empty, or consist only
    of whitespace are left out entirely.

    Args:
        school_name: Name of the school.
        state_name: Name of the state.
        district: Name of the district.
        village: Optional village name used to narrow the search.

    Returns:
        The single-space-joined query string.
    """
    labelled_fields = (
        ("School", school_name),
        ("in village", village),
        ("district", district),
        ("state", state_name),
    )
    parts = ["UDISE code of"]
    for label, value in labelled_fields:
        # Strip *before* the emptiness test: a whitespace-only value must not
        # leave a dangling label (e.g. "district ") in the query.
        text = value.strip() if value else ""
        if text:
            parts.append(f"{label} {text}")
    return " ".join(parts)
def _call_tavily(api_key: Optional[str], query: str):
    """Send *query* to Tavily and wrap the outcome in a status dict.

    An explicitly passed ``api_key`` takes precedence; otherwise the
    ``TAVILY_API_KEY`` environment variable is consulted.

    Returns:
        ``{"ok": True, "data": <response>}`` when the search call completes, or
        ``{"ok": False, "error": <message>}`` when no key is available, the
        ``tavily`` package could not be imported, or the call raised.
    """
    token = api_key if api_key else os.getenv("TAVILY_API_KEY")
    if not token:
        return {"ok": False, "error": "No Tavily API key provided."}
    if TavilyClient is None:
        return {"ok": False, "error": "tavily package not installed."}

    try:
        tavily = TavilyClient(token)
        # Query and raw response are echoed to stdout as diagnostics.
        print(query)
        response = tavily.search(query=query, country="india")
        print(response)
    except Exception as exc:
        # Best-effort wrapper: any failure is reported to the caller as data.
        return {"ok": False, "error": str(exc)}
    else:
        return {"ok": True, "data": response}
def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
    """Resolve free-form state input to a key of ``STATE_TO_UDISE_CODE``.

    Matching ignores case and every non-letter character. If that exact
    letters-only comparison finds nothing, a second pass also ignores the
    standalone word "and", so that "Jammu and Kashmir" resolves to the
    "Jammu & Kashmir" key (the mapping spells the conjunction as "&").

    Args:
        state_name: State name as supplied by the caller; may be ``None``.

    Returns:
        The matching mapping key, or ``None`` when the input is empty or no
        key matches.
    """
    if not state_name:
        return None

    def letters_only(text: str) -> str:
        return re.sub(r"[^A-Za-z]", "", text).lower()

    def without_conjunction(text: str) -> str:
        # Work on whole words so names that merely contain "and"
        # (Nagaland, Uttarakhand, Andhra Pradesh, ...) are left intact.
        words = re.findall(r"[A-Za-z]+", text.lower())
        return "".join(word for word in words if word != "and")

    # First pass: the original letters-only comparison, kept so every input
    # that matched before still resolves to the same key.
    target = letters_only(state_name)
    for key in STATE_TO_UDISE_CODE:
        if letters_only(key) == target:
            return key

    # Second pass: treat "and" and "&" as interchangeable.
    relaxed = without_conjunction(state_name)
    if relaxed:
        for key in STATE_TO_UDISE_CODE:
            if without_conjunction(key) == relaxed:
                return key
    return None
# ----------------------------------------------------
# Public API
# ----------------------------------------------------
def _collect_snippets(raw) -> List[str]:
    """Flatten a search response into text blobs to be scanned for codes."""
    if isinstance(raw, list):
        return [str(entry) for entry in raw]
    if not isinstance(raw, dict):
        return [str(raw)]

    # "data" may be present without being a mapping (e.g. None); only look
    # inside it when it really is a dict instead of raising AttributeError.
    nested = raw.get("data")
    nested_results = nested.get("results") if isinstance(nested, dict) else None
    candidates = raw.get("results") or nested_results or raw.get("items") or []

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            fields = (
                item.get("title", ""),
                item.get("content", ""),
                item.get("text", ""),
                item.get("url", ""),
            )
            snippets.append(" ".join(str(field) for field in fields))
        else:
            snippets.append(str(item))
    return snippets


def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    village: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True,
) -> List[str]:
    """
    Perform a Tavily search and return a list of unique UDISE codes.

    Village is used only to improve search precision.

    Args:
        school_name: School to look up; an empty value short-circuits to ``[]``.
        state_name: Optional state name, used in the query and, when it
            resolves to a known state, for prefix filtering.
        district: Optional district name added to the query.
        village: Optional village name added to the query.
        api_key: Tavily API key; falls back to the ``TAVILY_API_KEY``
            environment variable inside ``_call_tavily``.
        enforce_state_prefix: When true and ``state_name`` resolves to a known
            state, keep only codes whose first two digits match that state.

    Returns:
        11-digit codes in order of first appearance, without duplicates.
        Codes whose first two digits are not a known state prefix are dropped.
        An empty list is returned when the search call fails.
    """
    if not school_name:
        return []

    query = _build_query(school_name, state_name, district, village)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        return []

    snippets = _collect_snippets(call.get("data") or {})

    allowed_prefix = None
    state_key = _normalize_state_key(state_name)
    if enforce_state_prefix and state_key:
        allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    # Built once: membership is tested for every regex match below.
    valid_prefixes = set(STATE_TO_UDISE_CODE.values())

    found: List[str] = []
    seen = set()
    for text in snippets:
        for match in _UDISE_RE.finditer(text):
            code = match.group(1)
            if code in seen:
                continue
            prefix = code[:2]
            if prefix not in valid_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue
            seen.add(code)
            found.append(code)
    return found