Spaces:
Sleeping
Sleeping
Create web_search.py
Browse files- web_search.py +165 -0
web_search.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# web_search.py
|
| 2 |
+
"""
|
| 3 |
+
Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
from web_search import tavily_search_codes
|
| 7 |
+
codes = tavily_search_codes("GOVT HIGH SCHOOL", state_name="Karnataka", district="Bengaluru")
|
| 8 |
+
|
| 9 |
+
Behavior:
|
| 10 |
+
- Uses TAVILY_API_KEY env var if api_key is not provided.
|
| 11 |
+
- Extracts 11-digit sequences that look like UDISE codes.
|
| 12 |
+
- If state_name provided and matches a known mapping, only returns codes whose
|
| 13 |
+
first two digits match that state's prefix.
|
| 14 |
+
- Returns a list of unique UDISE codes (strings) in the order they were found.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import logging
import os
import re
from typing import List, Optional
|
| 20 |
+
|
| 21 |
+
# optional Tavily SDK
|
| 22 |
+
try:
|
| 23 |
+
from tavily import TavilyClient
|
| 24 |
+
except Exception:
|
| 25 |
+
TavilyClient = None
|
| 26 |
+
|
| 27 |
+
# State / union-territory name -> two-digit prefix that starts every UDISE
# code issued in that state.  Keys are the canonical spellings this module
# matches against; add entries here if stricter or wider filtering is needed.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Compiled once at import time.  Matches a run of exactly 11 digits: the
# lookbehind/lookahead reject digits on either side, so an 11-digit slice of
# a longer number is never picked up.
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
|
| 45 |
+
|
| 46 |
+
def _build_query(school_name: Optional[str], state_name: Optional[str], district: Optional[str]) -> str:
    """Assemble the free-text search query sent to Tavily.

    Args:
        school_name: partial or full school name; omitted when empty/blank.
        state_name: state name; added as ``state <name>`` when non-blank.
        district: district name; added as ``district <name>`` when non-blank.

    Returns:
        A single space-separated query string.  It always ends with the
        ``India`` bias term and the UDISE keyword alternatives, so it is
        never empty even when every argument is missing.
    """
    # Strip before testing for emptiness: a whitespace-only value is truthy
    # and would otherwise leave a dangling "district " / "state " keyword
    # (and doubled spaces) in the query.
    school = (school_name or "").strip()
    district_clean = (district or "").strip()
    state = (state_name or "").strip()

    parts = []
    if school:
        parts.append(school)
    if district_clean:
        parts.append(f"district {district_clean}")
    if state:
        parts.append(f"state {state}")

    # Bias results towards India and ask specifically for UDISE mentions.
    parts.append("India")
    parts.append("UDISE code OR UDISE number OR Udise")
    return " ".join(parts)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _call_tavily(api_key: Optional[str], query: str):
    """Run one Tavily search and report the outcome as a plain dict.

    Args:
        api_key: explicit Tavily API key; when missing or blank the
            ``TAVILY_API_KEY`` environment variable is used instead.
        query: the search query string.

    Returns:
        ``{"ok": True, "data": <raw SDK response>}`` on success, or
        ``{"ok": False, "error": <message>}`` when no key is available, the
        Tavily SDK could not be imported, or the SDK call raised.  This
        function itself never raises for those conditions.
    """
    # Keys read from env files often carry a trailing newline or spaces;
    # strip them, and treat a blank explicit key as "not provided" so the
    # environment fallback still applies.
    key = (api_key or "").strip() or (os.getenv("TAVILY_API_KEY") or "").strip()
    if not key:
        return {"ok": False, "error": "No Tavily API key provided (set TAVILY_API_KEY or pass api_key)."}
    if TavilyClient is None:
        return {"ok": False, "error": "tavily package not installed or TavilyClient not importable."}
    try:
        client = TavilyClient(key)
        # The SDK may return a complex object; pass it through untouched and
        # let the caller decide how to read it.
        resp = client.search(query=query)
    except Exception as exc:
        # Deliberate best-effort boundary: any SDK/network failure becomes an
        # error result.  Include the exception type because some exceptions
        # stringify to an empty message.
        return {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
    return {"ok": True, "data": resp}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
    """Map a user-supplied state name onto a key of ``STATE_TO_UDISE_CODE``.

    Matching is attempted in two passes:

    1. Direct lookups of the name as given, stripped, title-cased and
       upper-cased.
    2. A loose comparison that ignores case and every non-letter character,
       and treats ``&`` as the word "and" so that e.g. "Jammu and Kashmir"
       resolves to the "Jammu & Kashmir" entry.

    Args:
        state_name: state name in any casing/spacing, or None.

    Returns:
        The matching dictionary key, or None when the name is empty or no
        entry matches.
    """
    if not state_name:
        return None

    trimmed = state_name.strip()
    for candidate in (state_name, trimmed, trimmed.title(), trimmed.upper()):
        if candidate in STATE_TO_UDISE_CODE:
            return candidate

    def _fold(text: str) -> str:
        # Expand "&" first so both spellings of a name collapse to the same
        # letters-only form; only then drop everything that is not a letter.
        return re.sub(r"[^A-Za-z]", "", text.replace("&", " and ")).lower()

    target = _fold(state_name)
    if not target:
        return None
    for key in STATE_TO_UDISE_CODE:
        if _fold(key) == target:
            return key
    return None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _extract_snippets(raw: object) -> List[str]:
    """Flatten a raw Tavily response into text snippets to scan for codes."""
    if isinstance(raw, list):
        return [str(item) for item in raw]
    if not isinstance(raw, dict):
        return [str(raw)]

    # Result lists have been seen under several keys; probe them in order.
    # "data" may itself be a dict wrapping "results", a list, or absent, so
    # only call .get() on it when it really is a dict.
    nested = raw.get("data")
    candidates = (
        raw.get("results")
        or (nested.get("results") if isinstance(nested, dict) else None)
        or nested
        or raw.get("hits")
        or raw.get("items")
    )
    if not isinstance(candidates, list):
        # Unknown shape: fall back to scanning the stringified response.
        return [str(raw)]

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            # Missing or None fields contribute an empty string.
            fields = (item.get(name) for name in ("title", "content", "text", "url"))
            snippets.append(" ".join(str(value or "") for value in fields))
        else:
            snippets.append(str(item))
    return snippets


def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True
) -> List[str]:
    """
    Perform a Tavily search and return a list of unique UDISE codes (strings).

    Args:
        school_name: partial or full school name to search
        state_name: optional state name to restrict to that state's UDISE prefix
        district: optional district to include in query
        api_key: optional Tavily API key (falls back to TAVILY_API_KEY env)
        enforce_state_prefix: if True and state_name is known, only return codes
            whose first two digits match that state's prefix.

    Returns:
        List[str] of unique UDISE codes in the order found.  Codes whose
        first two digits are not a known state prefix are always dropped.
        An empty list is returned when the search fails; the failure reason
        is logged as a warning on this module's logger.
    """
    query = _build_query(school_name, state_name, district)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        # Best-effort API: never raise to the caller, but leave a trace so
        # the failure can actually be diagnosed from the logs.
        logging.getLogger(__name__).warning(
            "Tavily search failed for query %r: %s", query, call.get("error")
        )
        return []

    snippets = _extract_snippets(call.get("data") or {})

    # Built once per call rather than once per regex match.
    valid_prefixes = frozenset(STATE_TO_UDISE_CODE.values())

    # Restrict to one state's prefix only when asked to and the name resolves.
    allowed_prefix = None
    if enforce_state_prefix:
        state_key = _normalize_state_key(state_name)
        if state_key:
            allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    found = []
    seen = set()
    for text in snippets:
        for match in _UDISE_RE.finditer(text):
            code = match.group(1)
            if code in seen:
                continue
            prefix = code[:2]
            # Reject 11-digit numbers that cannot be UDISE codes at all.
            if prefix not in valid_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue
            seen.add(code)
            found.append(code)

    return found
|