gkdivya committed on
Commit
205f666
·
verified ·
1 Parent(s): f930d0c

Update searchschool.py

Browse files
Files changed (1) hide show
  1. searchschool.py +125 -0
searchschool.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download
5
  from rapidfuzz import process, fuzz
 
6
 
7
  # ====================================================
8
  # CONFIG: columns, states, HF dataset
@@ -36,6 +37,130 @@ except Exception:
36
  normalize_with_patterns_dynamic = None
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def load_master_for_state(state_key: str | None):
40
  """
41
  Load the master CSV for a state from Hugging Face Hub (dataset repo),
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download
5
  from rapidfuzz import process, fuzz
6
+ from web_search import tavily_search_codes
7
 
8
  # ====================================================
9
  # CONFIG: columns, states, HF dataset
 
37
  normalize_with_patterns_dynamic = None
38
 
39
 
40
def on_search_web(
    school_name: str,
    state_name: str,
    district: str | None = None,
    block: str | None = None,
) -> pd.DataFrame:
    """
    Search the web for a school's UDISE code(s) and resolve them to dataset rows.

    Steps:
      1. Call tavily_search_codes() to obtain a list of candidate UDISE codes.
      2. Look those codes up with get_school_rows_by_udise(), allowing the
         lookup to fall back to all states.
      3. Return the matches as a DataFrame with a fixed column order.

    Args:
        school_name: School name forwarded to the web search.
        state_name: State forwarded to the web search and used as the first
            state to look in during the dataset lookup.
        district: Optional district forwarded to the web search.
        block: Accepted for signature compatibility; it is currently not
            forwarded to the search and not used for filtering.

    Returns:
        pandas.DataFrame with exactly these columns, in this order:
            School_Name, State, District, Block, UDISE_Code
        No Score column is produced (a match score does not apply to web
        search results). The frame is empty, but still has these columns,
        when the search yields no codes or no code is found in the dataset.
    """
    # Single definition of the output schema, shared by the empty and the
    # populated return paths so the two can never drift apart.
    columns = ["School_Name", "State", "District", "Block", "UDISE_Code"]

    # Step 1: web search -> list of UDISE codes.
    udise_list = tavily_search_codes(
        school_name=school_name,
        state_name=state_name,
        district=district,
        # No key passed explicitly; per the original note the key is meant to
        # come from a HuggingFace secret (handled by tavily_search_codes --
        # TODO confirm against web_search.py).
        api_key=None,
        enforce_state_prefix=True,
    )

    if not udise_list:
        # Always hand back the canonical schema, even with nothing to show.
        return pd.DataFrame(columns=columns)

    # Step 2: dataset lookup (requested state first, then every state).
    rows = get_school_rows_by_udise(state_name, udise_list, try_global=True)

    # Step 3: list of dicts -> DataFrame. An empty list gives a frame with no
    # columns at all, so any missing column is added back explicitly.
    df = pd.DataFrame(rows)
    for col in columns:
        if col not in df.columns:
            df[col] = None

    # Drop any extra columns and enforce the canonical order.
    return df[columns]
89
+
90
+
91
def get_school_rows_by_udise(state_name: str, udise_codes: list[str], try_global: bool = True) -> list:
    """
    Look up school rows in the Hugging Face schools dataset by UDISE code.

    The requested state's CSV is searched first; if it yields at least one
    match those rows are returned immediately. Otherwise, when try_global is
    true, every other state file listed in STATE_HF_FILES is searched and all
    matches are collected.

    Args:
        state_name: State to try first. Matched case-insensitively (after
            stripping whitespace) against the keys of STATE_HF_FILES. A falsy
            or unknown value skips straight to the global search.
        udise_codes: Candidate UDISE codes. Values are cast to str, stripped
            and de-duplicated; None and blank entries are ignored.
        try_global: Whether to search all state files when the requested
            state produced no match.

    Returns:
        A list of dicts with the keys
            School_Name, State, District, Block, UDISE_Code
        or an empty list when there is nothing to look up or nothing matched.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import logging

    if not udise_codes:
        return []

    # Unique, string-typed, whitespace-free codes. Blank codes are dropped
    # because the CSVs are loaded with fillna(""), so an empty code would
    # otherwise "match" every row whose UDISE cell is missing.
    wanted = {str(code).strip() for code in udise_codes if code is not None}
    wanted.discard("")
    if not wanted:
        return []

    # --- Resolve the caller's state name to a STATE_HF_FILES key ---
    state_key = None
    if state_name:
        target = state_name.strip().upper()
        state_key = next((k for k in STATE_HF_FILES if k.upper() == target), None)

    # --- Helper: download + read one CSV, best-effort ---
    def load_csv(filename):
        try:
            path = hf_hub_download(
                repo_id=HF_SCHOOLS_DATASET,
                repo_type="dataset",
                filename=filename,
            )
            # Everything as text so codes keep leading zeros; NaN -> "".
            return pd.read_csv(path, dtype=str).fillna("")
        except Exception:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole lookup, but leave a trace instead of failing silently.
            logging.getLogger(__name__).warning(
                "Could not load %s from the schools dataset", filename, exc_info=True
            )
            return pd.DataFrame()

    # --- Helper: pull the matching rows out of one state's frame ---
    def extract_rows(df, state_label):
        if df.empty or MASTER_UDISE_COL not in df.columns:
            return []
        matched = df[df[MASTER_UDISE_COL].isin(wanted)]
        return [
            {
                "School_Name": rec.get(MASTER_SCHOOL_COL, ""),
                # Fall back to the file's state label when the column is
                # absent *or* the cell is blank (blank cells are "" after
                # fillna, so a plain .get() default would never apply).
                "State": rec.get(MASTER_STATE_COL) or state_label,
                "District": rec.get(MASTER_DISTRICT_COL, ""),
                "Block": rec.get(MASTER_BLOCK_COL, ""),
                "UDISE_Code": rec.get(MASTER_UDISE_COL, ""),
            }
            for rec in matched.to_dict("records")
        ]

    # --- 1) Try the requested state first ---
    if state_key:
        rows = extract_rows(load_csv(STATE_HF_FILES[state_key]), state_label=state_key)
        if rows:
            return rows

    # --- 2) Global fallback over the remaining states ---
    results = []
    if try_global:
        for sk, fname in STATE_HF_FILES.items():
            if sk == state_key:
                # Already searched above with no match; skip the repeat
                # download and scan.
                continue
            results.extend(extract_rows(load_csv(fname), state_label=sk))

    return results
161
+
162
+
163
+
164
  def load_master_for_state(state_key: str | None):
165
  """
166
  Load the master CSV for a state from Hugging Face Hub (dataset repo),