FindSchoolByName / searchschool.py
gkdivya's picture
Update searchschool.py
66b4504 verified
# ====================================================
# searchschool.py — Village Enabled
# ====================================================
import pandas as pd
from huggingface_hub import hf_hub_download
from rapidfuzz import process, fuzz
from web_search import tavily_search_codes
# ====================================================
# CONFIG: columns + HF dataset
# ====================================================
# Column names used to address the master schools DataFrame.
MASTER_SCHOOL_COL = "School_Name__c"
MASTER_DISTRICT_COL = "School_District__c"
MASTER_BLOCK_COL = "School_Block__c"
# The village column is treated as optional: filters check for its presence first.
MASTER_VILLAGE_COL = "School_Village__c"
MASTER_UDISE_COL = "School_Udise_Code__c"
MASTER_STATE_COL = "School_State__c"
# Hugging Face dataset repo id and the file inside it that is downloaded
# and read with pd.read_excel as the master list.
HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
MASTER_ALL_STATES_FILE = "master_all_states.xlsx"
# State key handed to the name normalizer when search_candidates() gets no state.
DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
# Upper bound on the number of fuzzy matches requested from rapidfuzz.
MAX_CANDIDATES = 5
# Global cache of the master DataFrame; stays None until
# _load_master_if_needed() populates it.
master_df = None
# Normalization helper (optional at import time).
# If admin_patterns cannot be imported for any reason, the name is bound to
# None so this module still loads; search_candidates() retries the import
# when it finds None here.
try:
    from admin_patterns import normalize_with_patterns_dynamic
except Exception:
    normalize_with_patterns_dynamic = None
# ====================================================
# INTERNAL: load master once
# ====================================================
def _load_master_if_needed():
    """Populate the module-level ``master_df`` cache on first use.

    Does nothing when ``master_df`` is already set. Otherwise the file
    ``MASTER_ALL_STATES_FILE`` is fetched from the ``HF_SCHOOLS_DATASET``
    dataset repo with ``hf_hub_download`` and read with ``pd.read_excel``.
    Every column is read as ``str`` and missing cells are replaced by
    empty strings.
    """
    global master_df
    if master_df is None:
        workbook_path = hf_hub_download(
            filename=MASTER_ALL_STATES_FILE,
            repo_type="dataset",
            repo_id=HF_SCHOOLS_DATASET,
        )
        raw_frame = pd.read_excel(workbook_path, dtype=str)
        master_df = raw_frame.fillna("")
# ====================================================
# WEB SEARCH → UDISE → MASTER LOOKUP
# ====================================================
def on_search_web(
    school_name: str,
    state_name: str,
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
):
    """Find schools through a web search and resolve them in the master list.

    Steps:
      1. ``tavily_search_codes`` is asked for UDISE codes using the school
         name, state and district (block and village are not forwarded to
         the web search).
      2. The codes are looked up with ``get_school_rows_by_udise``, which
         also applies the state / district / block / village filters.
      3. The rows are returned as a DataFrame with a fixed column order.

    Returns:
        A DataFrame with the columns ``School_Name``, ``State``,
        ``District``, ``Block``, ``Village`` and ``UDISE_Code``. It has no
        rows when the web search yields no codes or none of them survive
        the master lookup.
    """
    output_columns = [
        "School_Name", "State",
        "District", "Block", "Village",
        "UDISE_Code",
    ]

    codes = tavily_search_codes(
        school_name=school_name,
        state_name=state_name,
        district=district,
        api_key=None,
        enforce_state_prefix=True,
    )
    if not codes:
        return pd.DataFrame(columns=output_columns)

    matched_rows = get_school_rows_by_udise(
        state_name, codes, district, block, village
    )
    result = pd.DataFrame(matched_rows)

    # An empty row list produces a frame without columns; add whatever is
    # missing so the final column selection never fails.
    for column in output_columns:
        if column not in result.columns:
            result[column] = None
    return result[output_columns]
def get_school_rows_by_udise(
state_name: str,
udise_codes: list[str],
district: str | None = None,
block: str | None = None,
village: str | None = None,
):
if not udise_codes:
return []
_load_master_if_needed()
df = master_df
udise_codes = {str(u) for u in udise_codes}
df = df[df[MASTER_UDISE_COL].isin(udise_codes)]
if state_name:
df = df[df[MASTER_STATE_COL].str.upper() == state_name.upper()]
if district:
df = df[df[MASTER_DISTRICT_COL] == district]
if block:
df = df[df[MASTER_BLOCK_COL] == block]
if village and MASTER_VILLAGE_COL in df.columns:
df = df[df[MASTER_VILLAGE_COL] == village]
rows = []
for _, r in df.iterrows():
rows.append({
"School_Name": r.get(MASTER_SCHOOL_COL, ""),
"State": r.get(MASTER_STATE_COL, ""),
"District": r.get(MASTER_DISTRICT_COL, ""),
"Block": r.get(MASTER_BLOCK_COL, ""),
"Village": r.get(MASTER_VILLAGE_COL, ""),
"UDISE_Code": r.get(MASTER_UDISE_COL, ""),
})
return rows
# ====================================================
# RAPIDFUZZ SEARCH (Village-aware)
# ====================================================
def search_candidates(
    query_name: str,
    state: str | None,
    district: str | None,
    block: str | None,
    village: str | None = None,
):
    """Fuzzy-match ``query_name`` against school names in the master list.

    The master list is first narrowed by the optional filters: ``state``
    is compared ignoring case, while ``district``, ``block`` and
    ``village`` are compared exactly (the village filter is skipped when
    the master list has no village column). The remaining school names are
    scored with ``rapidfuzz.process.extract`` using ``fuzz.token_set_ratio``.
    Both the query and the names go through
    ``normalize_with_patterns_dynamic`` with the upper-cased state, or
    ``DEFAULT_STATE_KEY`` when no state is given. At most
    ``MAX_CANDIDATES`` matches are requested.

    Returns:
        ``(candidates_df, best_df)``. ``candidates_df`` has one row per
        match with the columns ``School_Name``, ``State``, ``District``,
        ``Block``, ``Village``, ``UDISE_Code`` and ``Score``, in the order
        rapidfuzz returned them. ``best_df`` is a copy of its first row.
        Both are empty DataFrames when ``query_name`` is empty or the
        filters leave no rows.

    Raises:
        ImportError: If the normalizer was unavailable at module import
            and the retry inside this function fails as well.
    """
    global normalize_with_patterns_dynamic
    if normalize_with_patterns_dynamic is None:
        # The module-level import fell back to None; try again so a real
        # import problem surfaces at the point of use.
        from admin_patterns import normalize_with_patterns_dynamic

    if not query_name:
        return pd.DataFrame(), pd.DataFrame()

    _load_master_if_needed()

    # -------- Filters --------
    pool = master_df
    if state:
        pool = pool[pool[MASTER_STATE_COL].str.upper() == state.upper()]
    for column, value in (
        (MASTER_DISTRICT_COL, district),
        (MASTER_BLOCK_COL, block),
    ):
        if value:
            pool = pool[pool[column] == value]
    if village and MASTER_VILLAGE_COL in pool.columns:
        pool = pool[pool[MASTER_VILLAGE_COL] == village]

    if pool.empty:
        return pd.DataFrame(), pd.DataFrame()

    pattern_state = (state or DEFAULT_STATE_KEY).upper()

    def _normalize(text):
        return normalize_with_patterns_dynamic(text, pattern_state)

    # Passing a Series keeps the frame's index labels, which come back as
    # the third element of every match and are used to fetch the full row.
    matches = process.extract(
        query_name,
        pool[MASTER_SCHOOL_COL].astype(str),
        scorer=fuzz.token_set_ratio,
        processor=_normalize,
        limit=MAX_CANDIDATES,
    )

    # Output key -> master column, in output column order.
    field_map = (
        ("School_Name", MASTER_SCHOOL_COL),
        ("State", MASTER_STATE_COL),
        ("District", MASTER_DISTRICT_COL),
        ("Block", MASTER_BLOCK_COL),
        ("Village", MASTER_VILLAGE_COL),
        ("UDISE_Code", MASTER_UDISE_COL),
    )
    records = []
    for _matched_name, score, label in matches:
        source_row = pool.loc[label]
        record = {
            out_key: source_row.get(src_col, "")
            for out_key, src_col in field_map
        }
        record["Score"] = score
        records.append(record)

    candidates_df = pd.DataFrame(records)
    best_df = candidates_df.head(1).copy()
    return candidates_df, best_df