gkdivya committed on
Commit
1a0de33
·
verified ·
1 Parent(s): 6c05f60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -11
app.py CHANGED
@@ -8,6 +8,23 @@ import json
8
  from tavily import TavilyClient
9
 
10
  KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  STATES = [
12
  "Arunachal_pradesh",
13
  "Assam",
@@ -70,13 +87,16 @@ def extract_udise_candidates_from_search(search_json):
70
  if not isinstance(results, list):
71
  return []
72
 
73
- # Patterns to match UDISE codes in text
74
  patterns = [
75
- r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})', # Matches "UDISE Code: 12345678901"
76
- r'Udise\s+School\s+Code\s*:?\s*(\d{11})', # Matches "Udise School Code: 12345678901"
77
- r'\b(\d{11})\b' # Matches any 11-digit number as fallback
78
  ]
79
 
 
 
 
80
  for result in results:
81
  if not isinstance(result, dict):
82
  continue
@@ -88,14 +108,13 @@ def extract_udise_candidates_from_search(search_json):
88
  continue
89
 
90
  # Try each pattern
91
- for pattern in patterns:
92
- import re
93
- matches = re.findall(pattern, text)
94
  for match in matches:
95
- if isinstance(match, tuple): # If pattern has groups
96
- match = match[0] # Take the first group
97
- if match.isdigit() and len(match) == 11: # UDISE codes are 11 digits
98
- found.add(match)
99
 
100
  return sorted(list(found))
101
 
 
8
  from tavily import TavilyClient
9
 
10
  KYS_SAMPLE = "https://kys.udiseplus.gov.in/webapp/api/search-schools?searchType=3&searchParam={udise}"
11
+
12
# Two-digit prefixes accepted as the state part (first 2 digits) of an
# 11-digit UDISE code.
# NOTE(review): this table currently accepts every prefix from 11 to 52.
# UDISE state prefixes presumably follow the census state numbering, which
# would also include 01-10 -- confirm against the official code list before
# relying on this as a strict filter.
VALID_UDISE_STATE_CODES = [
    '11', '12', '13', '14', '15', '16', '17', '18', '19',
    '20', '21', '22', '23', '24', '25', '26', '27', '28',
    '29', '30', '31', '32', '33', '34', '35', '36',
    '37', '38', '39', '40', '41', '42', '43', '44',
    '45', '46', '47', '48', '49', '50', '51', '52',
]


def is_valid_udise(code):
    """Return True if *code* looks like a valid UDISE code.

    A code is accepted when it is a string of exactly 11 ASCII digits whose
    first two digits appear in ``VALID_UDISE_STATE_CODES``.

    Args:
        code: Candidate value; anything that is not a ``str`` is rejected.

    Returns:
        bool: True when the candidate passes all checks, False otherwise.
    """
    # Reject None, ints and other non-strings instead of raising.
    if not isinstance(code, str) or len(code) != 11:
        return False
    # str.isdigit() (and regex \d) also accept non-ASCII Unicode digits such
    # as Devanagari or full-width numerals; only plain 0-9 is acceptable in a
    # code that is later substituted into a URL.
    if not all("0" <= ch <= "9" for ch in code):
        return False
    return code[:2] in VALID_UDISE_STATE_CODES
27
+
28
  STATES = [
29
  "Arunachal_pradesh",
30
  "Assam",
 
87
  if not isinstance(results, list):
88
  return []
89
 
90
+ # Patterns to match UDISE codes in text (compiled once, below)
91
  patterns = [
92
+ r'UDISE\s*[Cc]ode\s*:?\s*(\d{11})(?![0-9])', # Matches "UDISE Code: 12345678901"
93
+ r'Udise\s+School\s+Code\s*:?\s*(\d{11})(?![0-9])', # Matches "Udise School Code: 12345678901"
94
+ r'(?<![0-9])(\d{11})(?![0-9])' # Matches exactly 11 digits not part of a longer number
95
  ]
96
 
97
+ # Compile patterns for better performance
98
+ compiled_patterns = [re.compile(pattern) for pattern in patterns]
99
+
100
  for result in results:
101
  if not isinstance(result, dict):
102
  continue
 
108
  continue
109
 
110
  # Try each pattern
111
+ for pattern in compiled_patterns:
112
+ matches = pattern.finditer(text)
 
113
  for match in matches:
114
+ # Get the first group if it exists, otherwise the whole match
115
+ udise_code = match.group(1) if len(match.groups()) > 0 else match.group(0)
116
+ if udise_code and is_valid_udise(udise_code):
117
+ found.add(udise_code)
118
 
119
  return sorted(list(found))
120