Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -103,54 +103,90 @@ def call_search_sdk(api_key, payload_text):
|
|
| 103 |
return {"ok": False, "error": str(e)}
|
| 104 |
|
| 105 |
|
| 106 |
-
def extract_udise_candidates_from_search(search_json):
|
| 107 |
"""
|
| 108 |
-
Extract UDISE codes from Tavily search results.
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"""
|
| 112 |
found = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
# Check if we have valid search results
|
| 115 |
if not search_json or not isinstance(search_json, dict):
|
| 116 |
return []
|
| 117 |
|
| 118 |
-
|
| 119 |
-
results = data.get('results', [])
|
| 120 |
-
|
| 121 |
if not isinstance(results, list):
|
| 122 |
return []
|
| 123 |
|
| 124 |
-
#
|
| 125 |
patterns = [
|
| 126 |
r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])', # Matches "UDISE Code: 12345678901"
|
| 127 |
r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])', # Matches "Udise School Code: 12345678901"
|
|
|
|
| 128 |
r'(?<![0-9])(\d{11})(?![0-9])' # Matches exactly 11 digits not part of a longer number
|
| 129 |
]
|
| 130 |
|
| 131 |
-
# Compile patterns for better performance
|
| 132 |
-
compiled_patterns = [re.compile(pattern) for pattern in patterns]
|
| 133 |
-
|
| 134 |
for result in results:
|
| 135 |
if not isinstance(result, dict):
|
| 136 |
continue
|
| 137 |
|
| 138 |
-
#
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
return
|
| 154 |
|
| 155 |
|
| 156 |
def json_to_table(obj):
|
|
@@ -208,9 +244,14 @@ def search_workflow(school_name, state_name, search_key, use_search=True, use_ky
|
|
| 208 |
search_res = call_search_sdk(search_key, payload_text)
|
| 209 |
out["search"] = search_res
|
| 210 |
if search_res.get("ok"):
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
out["suggestions"] = candidates
|
| 213 |
-
if candidates
|
| 214 |
out["first_candidate"] = candidates[0]
|
| 215 |
else:
|
| 216 |
out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}
|
|
|
|
| 103 |
return {"ok": False, "error": str(e)}
|
| 104 |
|
| 105 |
|
| 106 |
+
# Patterns are tried in order, so codes that carry an explicit "UDISE code"
# label rank ahead of bare 11-digit numbers found in the same search result.
# All patterns are compiled case-insensitively, once, at import time.
_UDISE_CODE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])',  # Matches "UDISE Code: 12345678901"
        r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])',  # Matches "Udise School Code: 12345678901"
        r'(?<![0-9])(\d{11})(?![0-9])',  # Matches exactly 11 digits not part of a longer number
    )
)


def extract_udise_candidates_from_search(search_json, school_name="", state_name=""):
    """
    Extract UDISE codes from Tavily search results, ordered by relevance.

    Each result's title and content are scanned for 11-digit codes. A result
    earns 2 points if any school-name token appears in it, 1 point if the
    state name appears, and 3 more points if the whole (whitespace-normalised)
    school name appears. Codes inherit the score of the result they came from.

    Args:
        search_json: Search response dict. The ``results`` list is read from
            the top level (raw Tavily response); if absent, it is read from a
            nested ``data`` dict so wrapped responses keep working.
        school_name: Optional school name used for scoring (``None`` allowed).
        state_name: Optional state name used for scoring (``None`` allowed).

    Returns:
        List of unique UDISE code strings accepted by ``is_valid_udise``,
        highest-scoring first. Ties keep discovery order. Returns ``[]`` when
        the input is missing, malformed, or contains no valid code.
    """
    # Check if we have valid search results
    if not search_json or not isinstance(search_json, dict):
        return []

    # Accept both the raw response and a {"data": {...}} wrapper around it.
    results = search_json.get('results')
    if results is None:
        nested = search_json.get('data')
        results = nested.get('results', []) if isinstance(nested, dict) else []
    if not isinstance(results, list):
        return []

    # Normalize search terms for matching
    school_norm = re.sub(r"\s+", " ", str(school_name or "")).strip().lower()
    school_tokens = school_norm.split()
    state_norm = re.sub(r"\s+", " ", str(state_name or "")).strip().lower()

    candidates = []
    for result in results:
        if not isinstance(result, dict):
            continue

        # Combine title and content; `or ''` keeps a None field from
        # becoming the literal text "None".
        title = str(result.get('title') or '')
        content = str(result.get('content') or '')
        snippet = f"{title} {content}"

        # Collect each distinct code once per result, labelled matches first,
        # so the validator runs once per code instead of once per pattern hit.
        codes = []
        for pattern in _UDISE_CODE_PATTERNS:
            for match in pattern.finditer(snippet):
                code = match.group(1)
                if code not in codes:
                    codes.append(code)

        valid_codes = [code for code in codes if is_valid_udise(code)]
        if not valid_codes:
            continue

        # Calculate relevance score for this result
        snippet_low = snippet.lower()
        score = 0
        if school_tokens and any(tok in snippet_low for tok in school_tokens):
            score += 2
        if state_norm and state_norm in snippet_low:
            score += 1
        if school_norm and school_norm in snippet_low:
            score += 3

        candidates.extend((score, code) for code in valid_codes)

    # Stable sort: equal scores keep discovery order; dict.fromkeys keeps the
    # first (highest-scoring) occurrence of each code.
    candidates.sort(key=lambda item: item[0], reverse=True)
    return list(dict.fromkeys(code for _, code in candidates))
|
| 190 |
|
| 191 |
|
| 192 |
def json_to_table(obj):
|
|
|
|
| 244 |
search_res = call_search_sdk(search_key, payload_text)
|
| 245 |
out["search"] = search_res
|
| 246 |
if search_res.get("ok"):
|
| 247 |
+
# Pass both school_name and state_name to improve UDISE code extraction
|
| 248 |
+
candidates = extract_udise_candidates_from_search(
|
| 249 |
+
search_res["data"],
|
| 250 |
+
school_name=school_name,
|
| 251 |
+
state_name=state_name
|
| 252 |
+
)
|
| 253 |
out["suggestions"] = candidates
|
| 254 |
+
if candidates and candidates[0] != "No UDISE codes found":
|
| 255 |
out["first_candidate"] = candidates[0]
|
| 256 |
else:
|
| 257 |
out["search"] = {"ok": False, "error": "Search disabled or SDK not used"}
|