gkdivya committed on
Commit
ef31980
·
verified ·
1 Parent(s): 4cfcde5

Update web_search.py

Browse files
Files changed (1) hide show
  1. web_search.py +82 -76
web_search.py CHANGED
@@ -1,17 +1,12 @@
1
- # web_search.py
 
 
 
2
  """
3
  Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
4
 
5
- Usage:
6
- from web_search import tavily_search_codes
7
- codes = tavily_search_codes("GOVT HIGH SCHOOL", state_name="Karnataka", district="Bengaluru")
8
-
9
- Behavior:
10
- - Uses TAVILY_API_KEY env var if api_key is not provided.
11
- - Extracts 11-digit sequences that look like UDISE codes.
12
- - If state_name provided and matches a known mapping, only returns codes whose
13
- first two digits match that state's prefix.
14
- - Returns a list of unique UDISE codes (strings) in the order they were found.
15
  """
16
 
17
  import os
@@ -24,46 +19,62 @@ try:
24
  except Exception:
25
  TavilyClient = None
26
 
27
- # Minimal mapping of state name -> 2-digit UDISE prefix.
28
- # Extend this map to match your canonical state names if you want strict filtering.
 
29
  STATE_TO_UDISE_CODE = {
30
  "Jammu & Kashmir": "01", "Himachal Pradesh": "02", "Punjab": "03",
31
  "Chandigarh": "04", "Uttarakhand": "05", "Haryana": "06", "Delhi": "07",
32
  "Rajasthan": "08", "Uttar Pradesh": "09", "Bihar": "10", "Sikkim": "11",
33
  "Arunachal Pradesh": "12", "Nagaland": "13", "Manipur": "14", "Mizoram": "15",
34
  "Tripura": "16", "Meghalaya": "17", "Assam": "18", "West Bengal": "19",
35
- "Jharkhand": "20", "Odisha": "21", "Chhattisgarh": "22", "Madhya Pradesh": "23",
36
- "Gujarat": "24", "Daman & Diu": "25", "Dadra & Nagar Haveli": "26",
37
- "Maharashtra": "27", "Andhra Pradesh": "28", "Karnataka": "29",
38
- "Goa": "30", "Lakshadweep": "31", "Kerala": "32", "Tamil Nadu": "33",
39
- "Puducherry": "34", "Andaman & Nicobar Islands": "35", "Telangana": "36",
40
- "Ladakh": "37"
 
41
  }
42
 
43
- # compile once: strict 11-digit match
44
  _UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
45
 
46
- def _build_query(school_name: Optional[str], state_name: Optional[str], district: Optional[str]) -> str:
47
- parts = []
48
- parts.append("UDISE code of")
 
 
 
 
 
 
 
 
 
49
  if school_name:
50
  parts.append(f"School {school_name.strip()}")
51
- if state_name:
52
- parts.append(f"in state {state_name.strip()}")
 
 
53
  if district:
54
- parts.append(f"of district {district.strip()}")
 
 
 
 
55
  return " ".join(parts).strip()
56
 
57
 
58
  def _call_tavily(api_key: Optional[str], query: str):
59
  key = api_key or os.getenv("TAVILY_API_KEY")
60
  if not key:
61
- return {"ok": False, "error": "No Tavily API key provided (set TAVILY_API_KEY or pass api_key)."}
62
  if TavilyClient is None:
63
- return {"ok": False, "error": "tavily package not installed or TavilyClient not importable."}
64
  try:
65
  client = TavilyClient(key)
66
- # SDK may return complex object; keep raw
67
  resp = client.search(query=query, country="india")
68
  return {"ok": True, "data": resp}
69
  except Exception as e:
@@ -73,11 +84,7 @@ def _call_tavily(api_key: Optional[str], query: str):
73
  def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
74
  if not state_name:
75
  return None
76
- # simple exact / title / upper checks
77
- for k in (state_name, state_name.title(), state_name.upper()):
78
- if k in STATE_TO_UDISE_CODE:
79
- return k
80
- # fallback: compare cleaned names (remove non-letters)
81
  cleaned = re.sub(r"[^A-Za-z]", "", state_name).lower()
82
  for k in STATE_TO_UDISE_CODE:
83
  if re.sub(r"[^A-Za-z]", "", k).lower() == cleaned:
@@ -85,81 +92,80 @@ def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
85
  return None
86
 
87
 
 
 
 
88
  def tavily_search_codes(
89
  school_name: Optional[str],
90
  state_name: Optional[str] = None,
91
  district: Optional[str] = None,
 
92
  api_key: Optional[str] = None,
93
- enforce_state_prefix: bool = True
94
  ) -> List[str]:
95
  """
96
- Perform a Tavily search and return a list of unique UDISE codes (strings).
97
-
98
- Args:
99
- school_name: partial or full school name to search
100
- state_name: optional state name to restrict to that state's UDISE prefix
101
- district: optional district to include in query
102
- api_key: optional Tavily API key (falls back to TAVILY_API_KEY env)
103
- enforce_state_prefix: if True and state_name is known, only return codes
104
- whose first two digits match that state's prefix.
105
-
106
- Returns:
107
- List[str] of unique UDISE codes in the order found.
108
  """
 
109
  if not school_name:
110
- return None
111
- query = _build_query(school_name, state_name, district)
 
112
  call = _call_tavily(api_key, query)
 
113
  if not call.get("ok"):
114
- # on error return empty list (caller can inspect logs)
115
  return []
116
 
117
  raw = call.get("data") or {}
118
- # flatten likely text fields from raw into strings to search for codes
119
  snippets = []
120
 
121
- # common shapes: dict with 'results' or 'data' or a list
122
  if isinstance(raw, dict):
123
- # try results/data/results/data.items etc
124
- candidates = raw.get("results") or raw.get("data", {}).get("results") or raw.get("data") or raw.get("hits") or raw.get("items")
125
- if isinstance(candidates, list):
126
- for item in candidates:
127
- if isinstance(item, dict):
128
- snippets.append(" ".join([str(item.get("title", "")), str(item.get("content", "")), str(item.get("text", "")), str(item.get("url", ""))]))
129
- else:
130
- snippets.append(str(item))
131
- else:
132
- # fallback stringify the whole dict
133
- snippets.append(str(raw))
 
 
 
 
 
 
 
 
 
134
  elif isinstance(raw, list):
135
- for item in raw:
136
- snippets.append(str(item))
137
  else:
138
- snippets.append(str(raw))
139
 
140
- # Determine allowed prefix if requested
141
  allowed_prefix = None
142
  state_key = _normalize_state_key(state_name)
143
  if enforce_state_prefix and state_key:
144
  allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)
145
 
146
- found = []
147
- seen = set()
148
  for text in snippets:
149
- if not text:
150
- continue
151
  for m in _UDISE_RE.finditer(text):
152
  code = m.group(1)
153
- if not code or code in seen:
154
  continue
155
- # quick prefix check
156
  prefix = code[:2]
157
- # ignore codes that do not start with a valid state prefix
158
- if prefix not in set(STATE_TO_UDISE_CODE.values()):
159
  continue
160
  if allowed_prefix and prefix != allowed_prefix:
161
  continue
 
162
  seen.add(code)
163
  found.append(code)
164
 
165
- return found
 
1
+ # ====================================================
2
+ # web_search.py — Village Enabled
3
+ # ====================================================
4
+
5
  """
6
  Minimal Tavily wrapper: run a web search and return a list of UDISE codes.
7
 
8
+ Enhancement:
9
+ - Optional village support for more precise queries
 
 
 
 
 
 
 
 
10
  """
11
 
12
  import os
 
19
  except Exception:
20
  TavilyClient = None
21
 
# ----------------------------------------------------
# State UDISE prefix mapping
# ----------------------------------------------------
# Maps a state / union-territory name to the two-digit prefix that the
# search code expects at the start of a UDISE code for that state.
# Keys are matched case- and punctuation-insensitively by
# _normalize_state_key, so only one spelling per state is needed here.
STATE_TO_UDISE_CODE = {
    "Jammu & Kashmir": "01",
    "Himachal Pradesh": "02",
    "Punjab": "03",
    "Chandigarh": "04",
    "Uttarakhand": "05",
    "Haryana": "06",
    "Delhi": "07",
    "Rajasthan": "08",
    "Uttar Pradesh": "09",
    "Bihar": "10",
    "Sikkim": "11",
    "Arunachal Pradesh": "12",
    "Nagaland": "13",
    "Manipur": "14",
    "Mizoram": "15",
    "Tripura": "16",
    "Meghalaya": "17",
    "Assam": "18",
    "West Bengal": "19",
    "Jharkhand": "20",
    "Odisha": "21",
    "Chhattisgarh": "22",
    "Madhya Pradesh": "23",
    "Gujarat": "24",
    "Daman & Diu": "25",
    "Dadra & Nagar Haveli": "26",
    "Maharashtra": "27",
    "Andhra Pradesh": "28",
    "Karnataka": "29",
    "Goa": "30",
    "Lakshadweep": "31",
    "Kerala": "32",
    "Tamil Nadu": "33",
    "Puducherry": "34",
    "Andaman & Nicobar Islands": "35",
    "Telangana": "36",
    "Ladakh": "37",
}

# Strict 11-digit UDISE match: the lookbehind/lookahead reject an 11-digit
# window that is merely part of a longer run of digits (e.g. a phone number
# with country code or a 12+ digit identifier).
_UDISE_RE = re.compile(r"(?<!\d)(\d{11})(?!\d)")
42
 
43
+
# ----------------------------------------------------
# Query builder (Village-aware)
# ----------------------------------------------------
def _build_query(
    school_name: Optional[str],
    state_name: Optional[str],
    district: Optional[str],
    village: Optional[str] = None,
) -> str:
    """Compose the free-text search query used to look up a UDISE code.

    The query always starts with ``"UDISE code of"`` and then appends, in
    this order, whichever of the school, village, district and state were
    supplied, each introduced by a short label.

    Args:
        school_name: School name; rendered as ``"School <name>"``.
        state_name: State name; rendered as ``"state <name>"``.
        district: District name; rendered as ``"district <name>"``.
        village: Optional village name; rendered as ``"in village <name>"``.

    Returns:
        The space-joined query string. Components that are ``None``, empty
        or whitespace-only are omitted entirely, so the query never ends
        with a dangling label such as ``"... state"``.
    """
    parts = ["UDISE code of"]

    # (label, raw value) pairs in the order they appear in the query.
    labelled = (
        ("School", school_name),
        ("in village", village),
        ("district", district),
        ("state", state_name),
    )
    for label, value in labelled:
        cleaned = value.strip() if value else ""
        # Skip blanks *after* stripping: "  " is truthy but carries no
        # information and would otherwise leave a bare label in the query.
        if cleaned:
            parts.append(f"{label} {cleaned}")

    return " ".join(parts)
68
 
69
 
70
  def _call_tavily(api_key: Optional[str], query: str):
71
  key = api_key or os.getenv("TAVILY_API_KEY")
72
  if not key:
73
+ return {"ok": False, "error": "No Tavily API key provided."}
74
  if TavilyClient is None:
75
+ return {"ok": False, "error": "tavily package not installed."}
76
  try:
77
  client = TavilyClient(key)
 
78
  resp = client.search(query=query, country="india")
79
  return {"ok": True, "data": resp}
80
  except Exception as e:
 
84
  def _normalize_state_key(state_name: Optional[str]) -> Optional[str]:
85
  if not state_name:
86
  return None
87
+
 
 
 
 
88
  cleaned = re.sub(r"[^A-Za-z]", "", state_name).lower()
89
  for k in STATE_TO_UDISE_CODE:
90
  if re.sub(r"[^A-Za-z]", "", k).lower() == cleaned:
 
92
  return None
93
 
94
 
# ----------------------------------------------------
# Public API
# ----------------------------------------------------
def _collect_snippets(raw) -> List[str]:
    """Flatten a search response into text blobs to scan for UDISE codes.

    For a dict response the result list is read from ``raw["results"]``,
    then ``raw["data"]["results"]``, then ``raw["items"]`` (first truthy
    one wins). Each dict item contributes its ``title``, ``content``,
    ``text`` and ``url`` fields joined by spaces; any other item is
    stringified. A list response is stringified element-wise and anything
    else is stringified as a whole.
    """
    if isinstance(raw, dict):
        nested = raw.get("data")
        candidates = (
            raw.get("results")
            # "data" may be present but None / a list; only descend into dicts.
            or (nested.get("results") if isinstance(nested, dict) else None)
            or raw.get("items")
            or []
        )
        # A lone string or dict must be treated as one item, not iterated
        # character-by-character or key-by-key.
        if not isinstance(candidates, (list, tuple)):
            candidates = [candidates]

        snippets = []
        for item in candidates:
            if isinstance(item, dict):
                snippets.append(
                    " ".join(
                        str(item.get(field, ""))
                        for field in ("title", "content", "text", "url")
                    )
                )
            else:
                snippets.append(str(item))
        return snippets

    if isinstance(raw, list):
        return [str(x) for x in raw]

    return [str(raw)]


def tavily_search_codes(
    school_name: Optional[str],
    state_name: Optional[str] = None,
    district: Optional[str] = None,
    village: Optional[str] = None,
    api_key: Optional[str] = None,
    enforce_state_prefix: bool = True,
) -> List[str]:
    """Perform a Tavily search and return a list of unique UDISE codes.

    Args:
        school_name: Partial or full school name to search for. If it is
            ``None``, empty or whitespace-only, no search is made.
        state_name: Optional state name. It is added to the query and, when
            it resolves to a known entry of ``STATE_TO_UDISE_CODE``, may
            restrict results to that state's prefix (see
            ``enforce_state_prefix``).
        district: Optional district name added to the query.
        village: Optional village name; used only to improve search
            precision, never for filtering.
        api_key: Optional Tavily API key, forwarded to ``_call_tavily``.
        enforce_state_prefix: If true and ``state_name`` is recognised,
            keep only codes whose first two digits equal that state's
            prefix.

    Returns:
        Unique 11-digit codes, as strings, in the order they were first
        found. Codes whose first two digits are not a known state prefix
        are always discarded. An empty list is returned when there is no
        school name or when the search call reports failure.
    """
    if not school_name or not school_name.strip():
        return []

    query = _build_query(school_name, state_name, district, village)
    call = _call_tavily(api_key, query)

    if not call.get("ok"):
        return []

    snippets = _collect_snippets(call.get("data") or {})

    allowed_prefix = None
    if enforce_state_prefix:
        state_key = _normalize_state_key(state_name)
        if state_key:
            allowed_prefix = STATE_TO_UDISE_CODE.get(state_key)

    # Built once so each candidate is an O(1) membership test instead of a
    # linear scan over dict values.
    valid_prefixes = set(STATE_TO_UDISE_CODE.values())

    found, seen = [], set()
    for text in snippets:
        for m in _UDISE_RE.finditer(text):
            code = m.group(1)
            if code in seen:
                continue

            prefix = code[:2]
            # Drop 11-digit numbers that cannot be UDISE codes at all.
            if prefix not in valid_prefixes:
                continue
            if allowed_prefix and prefix != allowed_prefix:
                continue

            seen.add(code)
            found.append(code)

    return found