Spaces:
Sleeping
Sleeping
File size: 5,763 Bytes
4cfcde5 87ecf4c 205f666 87ecf4c 72d160b 87ecf4c 4cfcde5 87ecf4c 3275737 87ecf4c 4cfcde5 87ecf4c 4cfcde5 87ecf4c 72d160b 4cfcde5 72d160b 3275737 72d160b 205f666 4cfcde5 205f666 72d160b 4cfcde5 72d160b 205f666 72d160b 205f666 4cfcde5 205f666 4cfcde5 205f666 4cfcde5 205f666 72d160b 205f666 72d160b 205f666 4cfcde5 205f666 72d160b 205f666 4cfcde5 72d160b 205f666 4cfcde5 205f666 72d160b 4cfcde5 205f666 72d160b 4cfcde5 72d160b 4cfcde5 72d160b 205f666 72d160b 205f666 72d160b 4cfcde5 72d160b 66b4504 72d160b 4cfcde5 72d160b 87ecf4c 72d160b 87ecf4c 72d160b 87ecf4c 4cfcde5 66b4504 72d160b 4cfcde5 87ecf4c 4cfcde5 87ecf4c 4cfcde5 87ecf4c 66b4504 87ecf4c 72d160b 87ecf4c 72d160b 87ecf4c 72d160b 87ecf4c 72d160b 4cfcde5 72d160b 87ecf4c 72d160b 87ecf4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# ====================================================
# searchschool.py — Village Enabled
# ====================================================
import pandas as pd
from huggingface_hub import hf_hub_download
from rapidfuzz import process, fuzz
from web_search import tavily_search_codes
# ====================================================
# CONFIG: columns + HF dataset
# ====================================================
# Column headers used to index the master school table (master_df).
MASTER_SCHOOL_COL = "School_Name__c"
MASTER_DISTRICT_COL = "School_District__c"
MASTER_BLOCK_COL = "School_Block__c"
MASTER_VILLAGE_COL = "School_Village__c"
MASTER_UDISE_COL = "School_Udise_Code__c"
MASTER_STATE_COL = "School_State__c"
# Hugging Face dataset repo, and the Excel file inside it, that hold the
# master table downloaded by _load_master_if_needed().
HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
MASTER_ALL_STATES_FILE = "master_all_states.xlsx"
# State key handed to the name-normalization helper when search_candidates()
# is called without a state.
DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
# Upper bound on the number of fuzzy-match candidates returned per search.
MAX_CANDIDATES = 5
# Module-level cache of the master table; stays None until
# _load_master_if_needed() populates it on first use.
master_df = None
# Optional normalization helper.
# Start from "unavailable" and upgrade to the real function only if the
# import succeeds. Any failure is tolerated here on purpose:
# search_candidates() retries the import when it actually needs the helper.
normalize_with_patterns_dynamic = None
try:
    from admin_patterns import normalize_with_patterns_dynamic
except Exception:
    pass
# ====================================================
# INTERNAL: load master once
# ====================================================
def _load_master_if_needed():
    """Populate the module-level ``master_df`` cache on first use.

    Does nothing when ``master_df`` is already set. Otherwise it fetches
    ``MASTER_ALL_STATES_FILE`` from the ``HF_SCHOOLS_DATASET`` dataset repo
    via ``hf_hub_download`` and reads it with every column as ``str``,
    replacing missing cells with empty strings.
    """
    global master_df
    if master_df is None:
        workbook_path = hf_hub_download(
            repo_id=HF_SCHOOLS_DATASET,
            repo_type="dataset",
            filename=MASTER_ALL_STATES_FILE,
        )
        # dtype=str keeps every cell (including UDISE codes) as text.
        raw_table = pd.read_excel(workbook_path, dtype=str)
        master_df = raw_table.fillna("")
# ====================================================
# WEB SEARCH → UDISE → MASTER LOOKUP
# ====================================================
def on_search_web(
    school_name: str,
    state_name: str,
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
):
    """Resolve a school through web search and the master table.

    Steps:
      1. Ask ``tavily_search_codes`` for a list of UDISE codes.
      2. Look those codes up in the master table via
         ``get_school_rows_by_udise`` (state/district/block/village are
         forwarded as filters).
      3. Return the matches as a DataFrame with a fixed column layout.

    Returns:
        A DataFrame with exactly the columns ``School_Name``, ``State``,
        ``District``, ``Block``, ``Village``, ``UDISE_Code`` (in that
        order). It is empty when the web search yields no codes or when
        none of the codes match the master table.
    """
    output_columns = [
        "School_Name", "State",
        "District", "Block", "Village",
        "UDISE_Code"
    ]

    codes = tavily_search_codes(
        school_name=school_name,
        state_name=state_name,
        district=district,
        api_key=None,
        enforce_state_prefix=True,
    )
    if not codes:
        return pd.DataFrame(columns=output_columns)

    matched_rows = get_school_rows_by_udise(
        state_name, codes, district, block, village
    )
    result = pd.DataFrame(matched_rows)

    # An empty match list produces a frame with no columns at all, so
    # back-fill whatever is missing before selecting the fixed layout.
    absent = [name for name in output_columns if name not in result.columns]
    for name in absent:
        result[name] = None
    return result[output_columns]
def get_school_rows_by_udise(
state_name: str,
udise_codes: list[str],
district: str | None = None,
block: str | None = None,
village: str | None = None,
):
if not udise_codes:
return []
_load_master_if_needed()
df = master_df
udise_codes = {str(u) for u in udise_codes}
df = df[df[MASTER_UDISE_COL].isin(udise_codes)]
if state_name:
df = df[df[MASTER_STATE_COL].str.upper() == state_name.upper()]
if district:
df = df[df[MASTER_DISTRICT_COL] == district]
if block:
df = df[df[MASTER_BLOCK_COL] == block]
if village and MASTER_VILLAGE_COL in df.columns:
df = df[df[MASTER_VILLAGE_COL] == village]
rows = []
for _, r in df.iterrows():
rows.append({
"School_Name": r.get(MASTER_SCHOOL_COL, ""),
"State": r.get(MASTER_STATE_COL, ""),
"District": r.get(MASTER_DISTRICT_COL, ""),
"Block": r.get(MASTER_BLOCK_COL, ""),
"Village": r.get(MASTER_VILLAGE_COL, ""),
"UDISE_Code": r.get(MASTER_UDISE_COL, ""),
})
return rows
# ====================================================
# RAPIDFUZZ SEARCH (Village-aware)
# ====================================================
def search_candidates(
    query_name: str,
    state: str | None,
    district: str | None,
    block: str | None,
    village: str | None = None,
):
    """Fuzzy-match ``query_name`` against school names in the master table.

    The master table is first narrowed by the optional filters (state is
    compared case-insensitively; district, block and village exactly).
    The remaining school names are then ranked with RapidFuzz's
    ``token_set_ratio``, using ``normalize_with_patterns_dynamic`` as the
    processor.

    Returns:
        ``(candidates_df, best_df)``: up to ``MAX_CANDIDATES`` matches with
        a ``Score`` column, and a copy of the first row of that frame.
        Both are empty DataFrames when ``query_name`` is falsy or the
        filters leave no rows.
    """
    global normalize_with_patterns_dynamic
    # The module-level import may have failed; retry now that it is needed.
    if normalize_with_patterns_dynamic is None:
        from admin_patterns import normalize_with_patterns_dynamic

    if not query_name:
        return pd.DataFrame(), pd.DataFrame()

    _load_master_if_needed()
    pool = master_df

    # -------- Filters --------
    if state:
        pool = pool[pool[MASTER_STATE_COL].str.upper() == state.upper()]
    for column, value in (
        (MASTER_DISTRICT_COL, district),
        (MASTER_BLOCK_COL, block),
    ):
        if value:
            pool = pool[pool[column] == value]
    if village and MASTER_VILLAGE_COL in pool.columns:
        pool = pool[pool[MASTER_VILLAGE_COL] == village]

    if pool.empty:
        return pd.DataFrame(), pd.DataFrame()

    pattern_state = (state or DEFAULT_STATE_KEY).upper()

    def _normalize(text):
        return normalize_with_patterns_dynamic(text, pattern_state)

    # Passing the Series (not a list) makes RapidFuzz report each match's
    # index label, which is used below to fetch the full row.
    ranked = process.extract(
        query_name,
        pool[MASTER_SCHOOL_COL].astype(str),
        scorer=fuzz.token_set_ratio,
        processor=_normalize,
        limit=MAX_CANDIDATES,
    )

    # Output key -> master column; a missing column falls back to "".
    field_map = {
        "School_Name": MASTER_SCHOOL_COL,
        "State": MASTER_STATE_COL,
        "District": MASTER_DISTRICT_COL,
        "Block": MASTER_BLOCK_COL,
        "Village": MASTER_VILLAGE_COL,
        "UDISE_Code": MASTER_UDISE_COL,
    }
    records = []
    for _matched_name, score, label in ranked:
        source_row = pool.loc[label]
        record = {
            out_key: source_row.get(src_col, "")
            for out_key, src_col in field_map.items()
        }
        record["Score"] = score
        records.append(record)

    candidates_df = pd.DataFrame(records)
    return candidates_df, candidates_df.head(1).copy()
|