gkdivya committed on
Commit
f930d0c
·
verified ·
1 Parent(s): 9180c17

Create web_search.py

Browse files
Files changed (1) hide show
  1. web_search.py +165 -0
web_search.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # web_search.py
2
+ """
3
+ Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
4
+
5
+ Usage:
6
+ from web_search import tavily_search_codes
7
+ codes = tavily_search_codes("GOVT HIGH SCHOOL", state_name="Karnataka", district="Bengaluru")
8
+
9
+ Behavior:
10
+ - Uses TAVILY_API_KEY env var if api_key is not provided.
11
+ - Extracts 11-digit sequences that look like UDISE codes.
12
+ - If state_name provided and matches a known mapping, only returns codes whose
13
+ first two digits match that state's prefix.
14
+ - Returns a list of unique UDISE codes (strings) in the order they were found.
15
+ """
16
+
17
+ import os
18
+ import re
19
+ from typing import List, Optional
20
+
21
+ # optional Tavily SDK
22
+ try:
23
+ from tavily import TavilyClient
24
+ except Exception:
25
+ TavilyClient = None
26
+
27
# State / union-territory name -> two-digit UDISE state prefix.
# Add entries here if your canonical state names differ and you rely on the
# strict prefix filtering done by tavily_search_codes.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Compiled once at import time. Matches exactly 11 consecutive digits; the
# lookbehind/lookahead reject a digit on either side, so an 11-digit slice of
# a longer number is never returned.
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
45
+
46
def _build_query(school_name: Optional[str], state_name: Optional[str], district: Optional[str]) -> str:
    """Compose the free-text query string sent to the search API.

    Each supplied field is stripped of surrounding whitespace. A field that is
    None, empty, or whitespace-only is left out entirely, so the query never
    contains a dangling "district" / "state" label or doubled spaces.

    Args:
        school_name: partial or full school name.
        state_name: state name, emitted as "state <name>".
        district: district name, emitted as "district <name>".

    Returns:
        The space-joined query. It always ends with the "India" bias term and
        the UDISE keyword alternatives, even when every argument is missing.
    """
    school = school_name.strip() if school_name else ""
    district_name = district.strip() if district else ""
    state = state_name.strip() if state_name else ""

    parts = []
    if school:
        parts.append(school)
    if district_name:
        parts.append(f"district {district_name}")
    if state:
        parts.append(f"state {state}")
    # Bias results towards India and ask specifically for UDISE mentions.
    parts.append("India")
    parts.append("UDISE code OR UDISE number OR Udise")
    return " ".join(parts)
58
+
59
+
60
def _call_tavily(api_key: Optional[str], query: str):
    """Run one Tavily search and wrap the outcome in a status dict.

    Args:
        api_key: explicit API key; when falsy, the TAVILY_API_KEY environment
            variable is used instead.
        query: search string passed to the client's ``search`` call.

    Returns:
        ``{"ok": True, "data": <raw SDK response>}`` on success, otherwise
        ``{"ok": False, "error": <message>}``. This function never raises:
        a missing key, a missing SDK and any exception raised by the client
        are all reported through the "error" entry.
    """
    resolved_key = api_key if api_key else os.getenv("TAVILY_API_KEY")
    if not resolved_key:
        return {"ok": False, "error": "No Tavily API key provided (set TAVILY_API_KEY or pass api_key)."}

    # TavilyClient is None when the optional tavily import at module load failed.
    if TavilyClient is None:
        return {"ok": False, "error": "tavily package not installed or TavilyClient not importable."}

    try:
        # The SDK may return a complex object; it is passed through untouched.
        response = TavilyClient(resolved_key).search(query=query)
    except Exception as exc:
        return {"ok": False, "error": str(exc)}
    else:
        return {"ok": True, "data": response}
73
+
74
+
75
def _state_name_forms(name: str) -> set:
    """Return the lower-cased, letters-only spellings used to compare state names.

    Two forms are produced: one with "&" simply dropped and one with "&"
    spelled out as "and". For names without "&" the two forms coincide.
    """
    letters_only = re.sub(r"[^A-Za-z]", "", name).lower()
    ampersand_spelled = re.sub(r"[^A-Za-z]", "", name.replace("&", " and ")).lower()
    return {letters_only, ampersand_spelled}


def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
    """Map a user-supplied state name onto a key of STATE_TO_UDISE_CODE.

    Matching is attempted in this order:
      1. the name as given, then its title-cased and upper-cased variants;
      2. a loose comparison that ignores case and every non-letter character,
         and treats "&" and "and" as interchangeable, so that
         "Jammu and Kashmir" resolves to the "Jammu & Kashmir" key.

    Args:
        state_name: free-form state name; may be None or empty.

    Returns:
        The matching dictionary key, or None when the name is missing or
        cannot be matched.
    """
    if not state_name:
        return None

    # Cheap exact / title / upper checks first.
    for candidate in (state_name, state_name.title(), state_name.upper()):
        if candidate in STATE_TO_UDISE_CODE:
            return candidate

    # Loose fallback. Both the "&"-dropped and the "&"-as-"and" spellings are
    # compared, so inputs that matched with "&" simply removed keep matching.
    wanted = _state_name_forms(state_name)
    for key in STATE_TO_UDISE_CODE:
        if not wanted.isdisjoint(_state_name_forms(key)):
            return key
    return None
88
+
89
+
90
def _collect_snippets(raw) -> List[str]:
    """Flatten a search response of unknown shape into text chunks to scan.

    Handles a dict carrying a result list under "results", "data.results",
    "data", "hits" or "items" (first truthy one wins), a bare list, or any
    other object. When no result list is found, the whole response is
    stringified so codes anywhere in it can still be picked up.
    """
    if isinstance(raw, list):
        return [str(item) for item in raw]
    if not isinstance(raw, dict):
        return [str(raw)]

    data = raw.get("data")
    # Only look inside "data" when it is itself a mapping: a list, None or
    # string under that key has no .get() and must not be dereferenced.
    nested_results = data.get("results") if isinstance(data, dict) else None
    candidates = raw.get("results") or nested_results or data or raw.get("hits") or raw.get("items")
    if not isinstance(candidates, list):
        return [str(raw)]

    snippets = []
    for item in candidates:
        if isinstance(item, dict):
            snippets.append(" ".join(str(item.get(field, "")) for field in ("title", "content", "text", "url")))
        else:
            snippets.append(str(item))
    return snippets


def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True
) -> List[str]:
    """
    Perform a Tavily search and return a list of unique UDISE codes (strings).

    Args:
        school_name: partial or full school name to search
        state_name: optional state name to restrict to that state's UDISE prefix
        district: optional district to include in query
        api_key: optional Tavily API key (falls back to TAVILY_API_KEY env)
        enforce_state_prefix: if True and state_name is known, only return codes
            whose first two digits match that state's prefix.

    Returns:
        List[str] of unique UDISE codes in the order found. Only 11-digit
        sequences whose first two digits are a known state prefix are kept.
        An empty list is returned both when nothing matches and when the
        search itself fails (missing key, missing SDK, client error).
    """
    query = _build_query(school_name, state_name, district)
    call = _call_tavily(api_key, query)
    if not call.get("ok"):
        # Best-effort API: failures are reported as "no codes". The error text
        # in call["error"] is not logged or surfaced from here.
        return []

    snippets = _collect_snippets(call.get("data") or {})

    # Determine the single allowed prefix, if filtering was requested and the
    # state name resolves to a known key.
    allowed_prefix = None
    state_key = _normalize_state_key(state_name)
    if enforce_state_prefix and state_key:
        allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    # Built once instead of once per regex match.
    valid_prefixes = set(STATE_TO_UDISE_CODE.values())

    found = []
    seen = set()
    for text in snippets:
        if not text:
            continue
        for match in _UDISE_RE.finditer(text):
            code = match.group(1)
            if code in seen:
                continue
            prefix = code[:2]
            # Ignore 11-digit numbers that do not start with any state prefix
            # (phone numbers, ids, ...).
            if prefix not in valid_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue
            seen.add(code)
            found.append(code)

    return found