Spaces:
Running
Running
Update web_search.py
Browse files- web_search.py +82 -76
web_search.py
CHANGED
|
@@ -1,17 +1,12 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
| 2 |
"""
|
| 3 |
Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
codes = tavily_search_codes("GOVT HIGH SCHOOL", state_name="Karnataka", district="Bengaluru")
|
| 8 |
-
|
| 9 |
-
Behavior:
|
| 10 |
-
- Uses TAVILY_API_KEY env var if api_key is not provided.
|
| 11 |
-
- Extracts 11-digit sequences that look like UDISE codes.
|
| 12 |
-
- If state_name provided and matches a known mapping, only returns codes whose
|
| 13 |
-
first two digits match that state's prefix.
|
| 14 |
-
- Returns a list of unique UDISE codes (strings) in the order they were found.
|
| 15 |
"""
|
| 16 |
|
| 17 |
import os
|
|
@@ -24,46 +19,62 @@ try:
|
|
| 24 |
except Exception:
|
| 25 |
TavilyClient = None
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
#
|
|
|
|
| 29 |
STATE_TO_UDISE_CODE = {
|
| 30 |
"Jammu & Kashmir": "01", "Himachal Pradesh": "02", "Punjab": "03",
|
| 31 |
"Chandigarh": "04", "Uttarakhand": "05", "Haryana": "06", "Delhi": "07",
|
| 32 |
"Rajasthan": "08", "Uttar Pradesh": "09", "Bihar": "10", "Sikkim": "11",
|
| 33 |
"Arunachal Pradesh": "12", "Nagaland": "13", "Manipur": "14", "Mizoram": "15",
|
| 34 |
"Tripura": "16", "Meghalaya": "17", "Assam": "18", "West Bengal": "19",
|
| 35 |
-
"Jharkhand": "20", "Odisha": "21", "Chhattisgarh": "22",
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
-
#
|
| 44 |
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
if school_name:
|
| 50 |
parts.append(f"School {school_name.strip()}")
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
| 53 |
if district:
|
| 54 |
-
parts.append(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
return " ".join(parts).strip()
|
| 56 |
|
| 57 |
|
| 58 |
def _call_tavily(api_key: Optional[str], query: str):
|
| 59 |
key = api_key or os.getenv("TAVILY_API_KEY")
|
| 60 |
if not key:
|
| 61 |
-
return {"ok": False, "error": "No Tavily API key provided
|
| 62 |
if TavilyClient is None:
|
| 63 |
-
return {"ok": False, "error": "tavily package not installed
|
| 64 |
try:
|
| 65 |
client = TavilyClient(key)
|
| 66 |
-
# SDK may return complex object; keep raw
|
| 67 |
resp = client.search(query=query, country="india")
|
| 68 |
return {"ok": True, "data": resp}
|
| 69 |
except Exception as e:
|
|
@@ -73,11 +84,7 @@ def _call_tavily(api_key: Optional[str], query: str):
|
|
| 73 |
def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
|
| 74 |
if not state_name:
|
| 75 |
return None
|
| 76 |
-
|
| 77 |
-
for k in (state_name, state_name.title(), state_name.upper()):
|
| 78 |
-
if k in STATE_TO_UDISE_CODE:
|
| 79 |
-
return k
|
| 80 |
-
# fallback: compare cleaned names (remove non-letters)
|
| 81 |
cleaned = re.sub(r"[^A-Za-z]", "", state_name).lower()
|
| 82 |
for k in STATE_TO_UDISE_CODE:
|
| 83 |
if re.sub(r"[^A-Za-z]", "", k).lower() == cleaned:
|
|
@@ -85,81 +92,80 @@ def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
|
|
| 85 |
return None
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
| 88 |
def tavily_search_codes(
|
| 89 |
school_name: Optional[str],
|
| 90 |
state_name: Optional[str] = None,
|
| 91 |
district: Optional[str] = None,
|
|
|
|
| 92 |
api_key: Optional[str] = None,
|
| 93 |
-
enforce_state_prefix: bool = True
|
| 94 |
) -> List[str]:
|
| 95 |
"""
|
| 96 |
-
Perform a Tavily search and return a list of unique UDISE codes
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
school_name: partial or full school name to search
|
| 100 |
-
state_name: optional state name to restrict to that state's UDISE prefix
|
| 101 |
-
district: optional district to include in query
|
| 102 |
-
api_key: optional Tavily API key (falls back to TAVILY_API_KEY env)
|
| 103 |
-
enforce_state_prefix: if True and state_name is known, only return codes
|
| 104 |
-
whose first two digits match that state's prefix.
|
| 105 |
-
|
| 106 |
-
Returns:
|
| 107 |
-
List[str] of unique UDISE codes in the order found.
|
| 108 |
"""
|
|
|
|
| 109 |
if not school_name:
|
| 110 |
-
return
|
| 111 |
-
|
|
|
|
| 112 |
call = _call_tavily(api_key, query)
|
|
|
|
| 113 |
if not call.get("ok"):
|
| 114 |
-
# on error return empty list (caller can inspect logs)
|
| 115 |
return []
|
| 116 |
|
| 117 |
raw = call.get("data") or {}
|
| 118 |
-
# flatten likely text fields from raw into strings to search for codes
|
| 119 |
snippets = []
|
| 120 |
|
| 121 |
-
# common shapes: dict with 'results' or 'data' or a list
|
| 122 |
if isinstance(raw, dict):
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
elif isinstance(raw, list):
|
| 135 |
-
for
|
| 136 |
-
snippets.append(str(item))
|
| 137 |
else:
|
| 138 |
-
snippets
|
| 139 |
|
| 140 |
-
# Determine allowed prefix if requested
|
| 141 |
allowed_prefix = None
|
| 142 |
state_key = _normalize_state_key(state_name)
|
| 143 |
if enforce_state_prefix and state_key:
|
| 144 |
allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)
|
| 145 |
|
| 146 |
-
found = []
|
| 147 |
-
seen = set()
|
| 148 |
for text in snippets:
|
| 149 |
-
if not text:
|
| 150 |
-
continue
|
| 151 |
for m in _UDISE_RE.finditer(text):
|
| 152 |
code = m.group(1)
|
| 153 |
-
if
|
| 154 |
continue
|
| 155 |
-
|
| 156 |
prefix = code[:2]
|
| 157 |
-
|
| 158 |
-
if prefix not in set(STATE_TO_UDISE_CODE.values()):
|
| 159 |
continue
|
| 160 |
if allowed_prefix and prefix != allowed_prefix:
|
| 161 |
continue
|
|
|
|
| 162 |
seen.add(code)
|
| 163 |
found.append(code)
|
| 164 |
|
| 165 |
-
return found
|
|
|
|
| 1 |
+
# ====================================================
|
| 2 |
+
# web_search.py — Village Enabled
|
| 3 |
+
# ====================================================
|
| 4 |
+
|
| 5 |
"""
|
| 6 |
Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
|
| 7 |
|
| 8 |
+
Enhancement:
|
| 9 |
+
- Optional village support for more precise queries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
|
|
|
| 19 |
except Exception:
|
| 20 |
TavilyClient = None
|
| 21 |
|
| 22 |
+
# ----------------------------------------------------
|
| 23 |
+
# State → UDISE prefix mapping
|
| 24 |
+
# ----------------------------------------------------
|
| 25 |
STATE_TO_UDISE_CODE = {
|
| 26 |
"Jammu & Kashmir": "01", "Himachal Pradesh": "02", "Punjab": "03",
|
| 27 |
"Chandigarh": "04", "Uttarakhand": "05", "Haryana": "06", "Delhi": "07",
|
| 28 |
"Rajasthan": "08", "Uttar Pradesh": "09", "Bihar": "10", "Sikkim": "11",
|
| 29 |
"Arunachal Pradesh": "12", "Nagaland": "13", "Manipur": "14", "Mizoram": "15",
|
| 30 |
"Tripura": "16", "Meghalaya": "17", "Assam": "18", "West Bengal": "19",
|
| 31 |
+
"Jharkhand": "20", "Odisha": "21", "Chhattisgarh": "22",
|
| 32 |
+
"Madhya Pradesh": "23", "Gujarat": "24", "Daman & Diu": "25",
|
| 33 |
+
"Dadra & Nagar Haveli": "26", "Maharashtra": "27",
|
| 34 |
+
"Andhra Pradesh": "28", "Karnataka": "29", "Goa": "30",
|
| 35 |
+
"Lakshadweep": "31", "Kerala": "32", "Tamil Nadu": "33",
|
| 36 |
+
"Puducherry": "34", "Andaman & Nicobar Islands": "35",
|
| 37 |
+
"Telangana": "36", "Ladakh": "37",
|
| 38 |
}
|
| 39 |
|
| 40 |
+
# strict 11-digit UDISE match
|
| 41 |
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
|
| 42 |
|
| 43 |
+
|
| 44 |
+
# ----------------------------------------------------
|
| 45 |
+
# Query builder (Village-aware)
|
| 46 |
+
# ----------------------------------------------------
|
| 47 |
+
def _build_query(
|
| 48 |
+
school_name: Optional[str],
|
| 49 |
+
state_name: Optional[str],
|
| 50 |
+
district: Optional[str],
|
| 51 |
+
village: Optional[str] = None,
|
| 52 |
+
) -> str:
|
| 53 |
+
parts = ["UDISE code of"]
|
| 54 |
+
|
| 55 |
if school_name:
|
| 56 |
parts.append(f"School {school_name.strip()}")
|
| 57 |
+
|
| 58 |
+
if village:
|
| 59 |
+
parts.append(f"in village {village.strip()}")
|
| 60 |
+
|
| 61 |
if district:
|
| 62 |
+
parts.append(f"district {district.strip()}")
|
| 63 |
+
|
| 64 |
+
if state_name:
|
| 65 |
+
parts.append(f"state {state_name.strip()}")
|
| 66 |
+
|
| 67 |
return " ".join(parts).strip()
|
| 68 |
|
| 69 |
|
| 70 |
def _call_tavily(api_key: Optional[str], query: str):
|
| 71 |
key = api_key or os.getenv("TAVILY_API_KEY")
|
| 72 |
if not key:
|
| 73 |
+
return {"ok": False, "error": "No Tavily API key provided."}
|
| 74 |
if TavilyClient is None:
|
| 75 |
+
return {"ok": False, "error": "tavily package not installed."}
|
| 76 |
try:
|
| 77 |
client = TavilyClient(key)
|
|
|
|
| 78 |
resp = client.search(query=query, country="india")
|
| 79 |
return {"ok": True, "data": resp}
|
| 80 |
except Exception as e:
|
|
|
|
| 84 |
def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
|
| 85 |
if not state_name:
|
| 86 |
return None
|
| 87 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
cleaned = re.sub(r"[^A-Za-z]", "", state_name).lower()
|
| 89 |
for k in STATE_TO_UDISE_CODE:
|
| 90 |
if re.sub(r"[^A-Za-z]", "", k).lower() == cleaned:
|
|
|
|
| 92 |
return None
|
| 93 |
|
| 94 |
|
| 95 |
+
# ----------------------------------------------------
|
| 96 |
+
# Public API
|
| 97 |
+
# ----------------------------------------------------
|
| 98 |
def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    village: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True,
) -> List[str]:
    """Perform a Tavily search and return a list of unique UDISE codes.

    Args:
        school_name: Partial or full school name to search for. When it is
            ``None``, empty, or whitespace-only, no search is performed.
        state_name: Optional state name. It is added to the query and, when it
            resolves to a key of ``STATE_TO_UDISE_CODE``, may restrict results
            (see ``enforce_state_prefix``).
        district: Optional district name, used only in the query.
        village: Optional village name, used only to improve search precision.
        api_key: Optional Tavily API key, passed through to ``_call_tavily``.
        enforce_state_prefix: If true and ``state_name`` resolves to a known
            state, keep only codes whose first two digits equal that state's
            prefix.

    Returns:
        Unique codes matched by ``_UDISE_RE``, in the order they were found.
        Codes whose two-digit prefix is not a value of ``STATE_TO_UDISE_CODE``
        are always dropped. An empty list is returned when there is nothing to
        search for or when the Tavily call does not report ``ok``.
    """
    # Guard: a blank name would otherwise cost a network call for a useless query.
    if not school_name or not school_name.strip():
        return []

    query = _build_query(school_name, state_name, district, village)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        return []

    snippets = _collect_snippets(call.get("data") or {})

    allowed_prefix = None
    state_key = _normalize_state_key(state_name)
    if enforce_state_prefix and state_key:
        allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    # Built once: the original re-scanned dict.values() for every regex match.
    known_prefixes = frozenset(STATE_TO_UDISE_CODE.values())

    found: List[str] = []
    seen = set()
    for text in snippets:
        for match in _UDISE_RE.finditer(text):
            code = match.group(1)
            if code in seen:
                continue

            prefix = code[:2]
            if prefix not in known_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue

            seen.add(code)
            found.append(code)

    return found


def _collect_snippets(raw) -> List[str]:
    """Flatten a search response of unknown shape into strings to scan for codes."""
    if isinstance(raw, list):
        return [str(entry) for entry in raw]
    if not isinstance(raw, dict):
        return [str(raw)]

    # "data" may be present but None or a list; only dereference it when it is
    # really a dict, otherwise the chained .get() would raise AttributeError.
    nested = raw.get("data")
    nested_results = nested.get("results") if isinstance(nested, dict) else None
    candidates = raw.get("results") or nested_results or raw.get("items") or []

    # A lone str/dict payload would be iterated per character / per key and
    # never match; treat it as a single candidate instead.
    if not isinstance(candidates, (list, tuple)):
        candidates = [candidates]

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            fields = ("title", "content", "text", "url")
            snippets.append(" ".join(str(item.get(name, "")) for name in fields))
        else:
            snippets.append(str(item))
    return snippets
|