gkdivya committed on
Commit
acb72da
·
verified ·
1 Parent(s): 686aead

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -28
app.py CHANGED
@@ -103,54 +103,90 @@ def call_search_sdk(api_key, payload_text):
103
  return {"ok": False, "error": str(e)}
104
 
105
 
106
- def extract_udise_candidates_from_search(search_json):
107
  """
108
- Extract UDISE codes from Tavily search results.
109
- Looks for patterns like "UDISE Code: 12345678901" or "Udise School Code: 12345678901"
110
- in the content and title fields of search results.
 
 
 
 
 
 
111
  """
112
  found = set()
 
 
 
 
 
113
 
114
  # Check if we have valid search results
115
  if not search_json or not isinstance(search_json, dict):
116
  return []
117
 
118
- data = search_json.get('data', {})
119
- results = data.get('results', [])
120
-
121
  if not isinstance(results, list):
122
  return []
123
 
124
- # Compile patterns once for better performance
125
  patterns = [
126
  r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])', # Matches "UDISE Code: 12345678901"
127
  r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])', # Matches "Udise School Code: 12345678901"
 
128
  r'(?<![0-9])(\d{11})(?![0-9])' # Matches exactly 11 digits not part of a longer number
129
  ]
130
 
131
- # Compile patterns for better performance
132
- compiled_patterns = [re.compile(pattern) for pattern in patterns]
133
-
134
  for result in results:
135
  if not isinstance(result, dict):
136
  continue
137
 
138
- # Check both content and title fields
139
- for field in ['content', 'title']:
140
- text = result.get(field, '')
141
- if not isinstance(text, str):
142
- continue
143
-
144
- # Try each pattern
145
- for pattern in compiled_patterns:
146
- matches = pattern.finditer(text)
147
- for match in matches:
148
- # Get the first group if it exists, otherwise the whole match
149
- udise_code = match.group(1) if len(match.groups()) > 0 else match.group(0)
150
- if udise_code and is_valid_udise(udise_code):
151
- found.add(udise_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
- return sorted(list(found))
154
 
155
 
156
  def json_to_table(obj):
@@ -208,9 +244,14 @@ def search_workflow(school_name, state_name, search_key, use_search=True, use_ky
208
  search_res = call_search_sdk(search_key, payload_text)
209
  out["search"] = search_res
210
  if search_res.get("ok"):
211
- candidates = extract_udise_candidates_from_search(search_res["data"])
 
 
 
 
 
212
  out["suggestions"] = candidates
213
- if candidates: # If we have candidates, set the first one
214
  out["first_candidate"] = candidates[0]
215
  else:
216
  out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}
 
103
  return {"ok": False, "error": str(e)}
104
 
105
 
106
def extract_udise_candidates_from_search(search_json, school_name="", state_name=""):
    """
    Extract UDISE codes from Tavily search results, ordered by relevance.

    Each search result whose title/content contains at least one valid
    11-digit code is given a score, and every code found in that result
    inherits the score:

    * +2 if any token of the school name occurs in the result text
    * +1 if the state name occurs in the result text
    * +3 if the whole (whitespace-normalised) school name occurs in the text

    Args:
        search_json: Search response as a dict. The result list is read from
            ``search_json['data']['results']`` when ``'data'`` is a dict,
            otherwise from ``search_json['results']``.
        school_name: Optional school name used for scoring (``None`` is
            treated as empty).
        state_name: Optional state name used for scoring (``None`` is
            treated as empty).

    Returns:
        List of unique UDISE code strings, highest score first. Codes with
        equal scores keep the order in which they were encountered. An empty
        list is returned for missing or malformed input.
    """

    def _normalise(text):
        # Collapse whitespace runs and lowercase so substring comparisons are
        # not defeated by stray spaces/newlines or by letter case.
        return re.sub(r"\s+", " ", str(text or "")).strip().lower()

    if not search_json or not isinstance(search_json, dict):
        return []

    # Accept both the wrapped shape ({"data": {"results": [...]}}) and a
    # response that carries "results" at the top level. A present-but-None
    # "data" value must not raise.
    payload = search_json.get('data')
    if not isinstance(payload, dict):
        payload = search_json
    results = payload.get('results', [])
    if not isinstance(results, list):
        return []

    school_norm = _normalise(school_name)
    school_tokens = school_norm.split()
    state_norm = _normalise(state_name)

    # Labelled patterns come first so that, within one result, explicitly
    # labelled codes are listed before bare 11-digit numbers. The patterns are
    # compiled once, case-insensitively, which also covers lowercase
    # "udise code" spellings.
    code_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])',         # "UDISE Code: 12345678901"
            r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])',   # "Udise School Code: 12345678901"
            r'(?<![0-9])(\d{11})(?![0-9])',                      # exactly 11 digits, not part of a longer number
        )
    ]

    scored = []  # (score, code) pairs in encounter order
    for result in results:
        if not isinstance(result, dict):
            continue

        # `or ''` keeps a None title/content from becoming the text "None".
        title = result.get('title') or ''
        content = result.get('content') or ''
        snippet = f"{title} {content}"

        # Collect this result's codes first; results without any valid code
        # are skipped before scoring.
        codes = []
        for pattern in code_patterns:
            for match in pattern.finditer(snippet):
                code = match.group(1)
                # is_valid_udise is defined elsewhere in this file.
                if code not in codes and is_valid_udise(code):
                    codes.append(code)
        if not codes:
            continue

        snippet_low = _normalise(snippet)
        score = 0
        if any(token in snippet_low for token in school_tokens):
            score += 2
        if state_norm and state_norm in snippet_low:
            score += 1
        if school_norm and school_norm in snippet_low:
            score += 3

        scored.extend((score, code) for code in codes)

    # list.sort is stable, so equal scores keep encounter order; dict.fromkeys
    # then drops later duplicates while preserving that order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return list(dict.fromkeys(code for _, code in scored))
190
 
191
 
192
  def json_to_table(obj):
 
244
  search_res = call_search_sdk(search_key, payload_text)
245
  out["search"] = search_res
246
  if search_res.get("ok"):
247
+ # Pass both school_name and state_name to improve UDISE code extraction
248
+ candidates = extract_udise_candidates_from_search(
249
+ search_res["data"],
250
+ school_name=school_name,
251
+ state_name=state_name
252
+ )
253
  out["suggestions"] = candidates
254
+ if candidates and candidates[0] != "No UDISE codes found":
255
  out["first_candidate"] = candidates[0]
256
  else:
257
  out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}