gkdivya committed on
Commit
7e65b3f
·
verified ·
1 Parent(s): acb72da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -57
app.py CHANGED
@@ -103,88 +103,55 @@ def call_search_sdk(api_key, payload_text):
103
  return {"ok": False, "error": str(e)}
104
 
105
 
106
def extract_udise_candidates_from_search(search_json, school_name="", state_name=""):
    """
    Extract UDISE codes from Tavily search results, ranked by relevance.

    Each result's title and content are scanned for 11-digit numbers; only
    codes accepted by ``is_valid_udise`` are kept. Every result gets a
    relevance score that is shared by all codes found in it:

    * +2 if any whitespace-separated token of ``school_name`` occurs in it
    * +1 if ``state_name`` occurs in it
    * +3 if the full ``school_name`` occurs in it

    Matching is case-insensitive and ignores differences in whitespace runs.

    Args:
        search_json: The raw search response (a dict). The result list is read
            from ``data.results`` and, if that is absent or not a list, from a
            top-level ``results`` key.
        school_name: Optional school name for better matching (``None`` is
            treated like an empty string).
        state_name: Optional state name for better matching (``None`` is
            treated like an empty string).

    Returns:
        List of unique UDISE codes sorted by descending score. Ties keep the
        order in which the codes were encountered; within a single result,
        codes that follow an explicit "UDISE code" label come before bare
        11-digit numbers.
    """
    if not search_json or not isinstance(search_json, dict):
        return []

    # 'data' may be missing, None, or some other non-dict payload; never call
    # .get() on it blindly.
    data = search_json.get('data')
    results = data.get('results') if isinstance(data, dict) else None
    if not isinstance(results, list):
        results = search_json.get('results')
    if not isinstance(results, list):
        return []

    # Normalize search terms once; `or ""` guards against None being passed.
    school_norm = re.sub(r"\s+", " ", school_name or "").strip().lower()
    school_tokens = school_norm.split()
    state_norm = re.sub(r"\s+", " ", state_name or "").strip().lower()

    # Labelled patterns come first so that, on equal score, explicitly labelled
    # codes are emitted before arbitrary 11-digit numbers.
    patterns = [
        re.compile(r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])', re.IGNORECASE),
        re.compile(r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])', re.IGNORECASE),
        re.compile(r'(?<![0-9])(\d{11})(?![0-9])'),  # exactly 11 digits
    ]

    candidates = []
    for result in results:
        if not isinstance(result, dict):
            continue

        # Combine title and content for better matching
        snippet = f"{result.get('title', '')} {result.get('content', '')}"
        # Collapse whitespace so the normalized search terms can match text
        # that is spread over several spaces or lines.
        snippet_low = re.sub(r"\s+", " ", snippet).lower()

        score = 0
        if school_tokens and any(tok in snippet_low for tok in school_tokens):
            score += 2
        if state_norm and state_norm in snippet_low:
            score += 1
        if school_norm and school_norm in snippet_low:
            score += 3

        for pattern in patterns:
            for match in pattern.finditer(snippet):
                udise_code = match.group(1)
                if is_valid_udise(udise_code):
                    candidates.append((score, udise_code))

    # Stable sort keeps encounter order among equal scores; dict.fromkeys then
    # drops repeats while keeping the best-ranked occurrence of each code.
    candidates.sort(key=lambda item: item[0], reverse=True)
    return list(dict.fromkeys(code for _, code in candidates))
190
 
 
103
  return {"ok": False, "error": str(e)}
104
 
105
 
106
def extract_udise_candidates_from_search(search_json):
    """
    Extract UDISE codes from Tavily search results.

    The result list is taken from a top-level ``results`` key or, when that is
    missing or empty, from ``data.results``. Each result's title and content
    are scanned for standalone 11-digit numbers, and only those accepted by
    ``is_valid_udise`` are kept. Progress is printed to stdout.

    Args:
        search_json: The raw search response (expected to be a dict).

    Returns:
        Sorted list of unique UDISE codes; an empty list when the input is not
        a usable response or no valid code is found.
    """
    print("\n===== Extracting UDISE Codes =====")

    # Check if we have valid search results
    if not search_json or not isinstance(search_json, dict):
        print("Invalid search JSON")
        return []

    results = search_json.get('results')
    if not results:
        # 'data' may be missing, None, or a non-dict payload; only dig into it
        # when it really is a dict.
        data = search_json.get('data')
        results = data.get('results') if isinstance(data, dict) else None
    if not results or not isinstance(results, list):
        print("No results found in search JSON")
        return []

    print(f"Found {len(results)} search results")

    # Any 11-digit number that is not part of a longer digit run. Patterns
    # keyed on a preceding "UDISE ... code" label can only match a subset of
    # these numbers, and the output is a sorted set, so this single pattern
    # yields the same codes without reporting each one several times.
    code_pattern = re.compile(r'(?<![0-9])(\d{11})(?![0-9])')

    found = set()
    for result in results:
        if not isinstance(result, dict):
            continue

        # Combine title and content for searching
        text = f"{result.get('title', '')} {result.get('content', '')}"

        for udise_code in code_pattern.findall(text):
            if is_valid_udise(udise_code):
                print(f"Found UDISE code: {udise_code}")
                found.add(udise_code)

    if not found:
        print("No valid UDISE codes found")
        return []

    return sorted(found)
157