Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,23 @@ import json
|
|
| 8 |
from tavily import TavilyClient
|
| 9 |
|
| 10 |
KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
STATES = [
|
| 12 |
"Arunachal_pradesh",
|
| 13 |
"Assam",
|
|
@@ -70,13 +87,16 @@ def extract_udise_candidates_from_search(search_json):
|
|
| 70 |
if not isinstance(results, list):
|
| 71 |
return []
|
| 72 |
|
| 73 |
-
#
|
| 74 |
patterns = [
|
| 75 |
-
r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})', # Matches "UDISE Code: 12345678901"
|
| 76 |
-
r'Udise\s+School\s+Code\s*:?\s*(\d{11})', # Matches "Udise School Code: 12345678901"
|
| 77 |
-
r'
|
| 78 |
]
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
for result in results:
|
| 81 |
if not isinstance(result, dict):
|
| 82 |
continue
|
|
@@ -88,14 +108,13 @@ def extract_udise_candidates_from_search(search_json):
|
|
| 88 |
continue
|
| 89 |
|
| 90 |
# Try each pattern
|
| 91 |
-
for pattern in
|
| 92 |
-
|
| 93 |
-
matches = re.findall(pattern, text)
|
| 94 |
for match in matches:
|
| 95 |
-
if
|
| 96 |
-
|
| 97 |
-
if
|
| 98 |
-
found.add(
|
| 99 |
|
| 100 |
return sorted(list(found))
|
| 101 |
|
|
|
|
| 8 |
from tavily import TavilyClient
|
| 9 |
|
| 10 |
KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
|
| 11 |
+
|
| 12 |
+
# Two-digit state/UT prefixes accepted as the first two digits of a UDISE code.
# The national state/UT numbering used by UDISE begins at '01'; the prefixes
# '01'-'10' were previously missing, so every code from those states was
# rejected as invalid.
# NOTE(review): the higher prefixes are retained unchanged for backward
# compatibility; confirm the upper bound against the official UDISE+
# state-code table and trim any prefixes that are not actually assigned.
VALID_UDISE_STATE_CODES = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
    '21', '22', '23', '24', '25', '26', '27', '28', '29', '30',
    '31', '32', '33', '34', '35', '36', '37', '38', '39', '40',
    '41', '42', '43', '44', '45', '46', '47', '48', '49', '50',
    '51', '52',
]


def is_valid_udise(code):
    """Return True if *code* is a well-formed UDISE school code.

    A code is accepted when it is a string of exactly 11 ASCII digits whose
    first two digits appear in ``VALID_UDISE_STATE_CODES``. Any other value
    (``None``, an empty string, a non-string, a wrong length, or non-digit
    characters) yields ``False`` rather than raising.
    """
    if not isinstance(code, str) or len(code) != 11:
        return False
    # str.isdigit() on its own also accepts non-ASCII digits (other scripts,
    # superscripts), and the regex class \d matches those too, so restrict
    # the check to plain ASCII digits.
    if not (code.isascii() and code.isdigit()):
        return False
    return code[:2] in VALID_UDISE_STATE_CODES
|
| 27 |
+
|
| 28 |
STATES = [
|
| 29 |
"Arunachal_pradesh",
|
| 30 |
"Assam",
|
|
|
|
| 87 |
if not isinstance(results, list):
|
| 88 |
return []
|
| 89 |
|
| 90 |
+
# Regex patterns that locate candidate 11-digit UDISE codes in free text
|
| 91 |
patterns = [
|
| 92 |
+
r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])', # Matches "UDISE Code: 12345678901"
|
| 93 |
+
r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])', # Matches "Udise School Code: 12345678901"
|
| 94 |
+
r'(?<![0-9])(\d{11})(?![0-9])' # Matches exactly 11 digits not part of a longer number
|
| 95 |
]
|
| 96 |
|
| 97 |
+
# Compile patterns for better performance
|
| 98 |
+
compiled_patterns = [re.compile(pattern) for pattern in patterns]
|
| 99 |
+
|
| 100 |
for result in results:
|
| 101 |
if not isinstance(result, dict):
|
| 102 |
continue
|
|
|
|
| 108 |
continue
|
| 109 |
|
| 110 |
# Try each pattern
|
| 111 |
+
for pattern in compiled_patterns:
|
| 112 |
+
matches = pattern.finditer(text)
|
|
|
|
| 113 |
for match in matches:
|
| 114 |
+
# Get the first group if it exists, otherwise the whole match
|
| 115 |
+
udise_code = match.group(1) if len(match.groups()) > 0 else match.group(0)
|
| 116 |
+
if udise_code and is_valid_udise(udise_code):
|
| 117 |
+
found.add(udise_code)
|
| 118 |
|
| 119 |
return sorted(list(found))
|
| 120 |
|