# ====================================================
# searchschool.py — Village Enabled
# ====================================================

import pandas as pd
from huggingface_hub import hf_hub_download
from rapidfuzz import process, fuzz
from web_search import tavily_search_codes

# ====================================================
# CONFIG: columns + HF dataset
# ====================================================

MASTER_SCHOOL_COL = "School_Name__c"
MASTER_DISTRICT_COL = "School_District__c"
MASTER_BLOCK_COL = "School_Block__c"
MASTER_VILLAGE_COL = "School_Village__c"
MASTER_UDISE_COL = "School_Udise_Code__c"
MASTER_STATE_COL = "School_State__c"

HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
MASTER_ALL_STATES_FILE = "master_all_states.xlsx"

DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
MAX_CANDIDATES = 5

# Output column -> master column, in the order results are returned.
# Single source of truth for every result record built in this module.
_RESULT_COLUMN_MAP = {
    "School_Name": MASTER_SCHOOL_COL,
    "State": MASTER_STATE_COL,
    "District": MASTER_DISTRICT_COL,
    "Block": MASTER_BLOCK_COL,
    "Village": MASTER_VILLAGE_COL,
    "UDISE_Code": MASTER_UDISE_COL,
}
_RESULT_COLUMNS = list(_RESULT_COLUMN_MAP)

# global cache
master_df = None

# normalization helper
# Best-effort import: if it fails here, search_candidates() retries it lazily.
try:
    from admin_patterns import normalize_with_patterns_dynamic
except Exception:
    normalize_with_patterns_dynamic = None


# ====================================================
# INTERNAL: load master once
# ====================================================

def _load_master_if_needed():
    """Populate the module-level ``master_df`` cache on first use.

    Downloads ``MASTER_ALL_STATES_FILE`` from the ``HF_SCHOOLS_DATASET``
    dataset repo and reads it with every column as ``str`` and missing
    cells replaced by ``""``. Does nothing if the cache is already set.
    """
    global master_df
    if master_df is not None:
        return

    local_path = hf_hub_download(
        repo_id=HF_SCHOOLS_DATASET,
        repo_type="dataset",
        filename=MASTER_ALL_STATES_FILE,
    )
    master_df = pd.read_excel(local_path, dtype=str).fillna("")


# ====================================================
# INTERNAL: shared filtering / record helpers
# ====================================================

def _filter_master(df, state, district, block, village):
    """Return *df* narrowed by whichever location filters are truthy.

    State is compared case-insensitively; district, block and village are
    compared exactly. The village filter is skipped when the frame has no
    ``MASTER_VILLAGE_COL`` column.
    """
    if state:
        df = df[df[MASTER_STATE_COL].str.upper() == state.upper()]
    if district:
        df = df[df[MASTER_DISTRICT_COL] == district]
    if block:
        df = df[df[MASTER_BLOCK_COL] == block]
    if village and MASTER_VILLAGE_COL in df.columns:
        df = df[df[MASTER_VILLAGE_COL] == village]
    return df


def _row_to_record(row):
    """Map one master row to a result dict; absent columns become ``""``."""
    return {
        out_col: row.get(master_col, "")
        for out_col, master_col in _RESULT_COLUMN_MAP.items()
    }


# ====================================================
# WEB SEARCH → UDISE → MASTER LOOKUP
# ====================================================

def on_search_web(
    school_name: str,
    state_name: str,
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
) -> pd.DataFrame:
    """Find schools via web search and resolve them against the master list.

    1. Tavily search → list of UDISE codes
    2. Lookup those UDISE codes in master
    3. Return standardized DataFrame

    Only ``school_name``, ``state_name`` and ``district`` are passed to the
    web search; ``block`` and ``village`` are applied solely when filtering
    the master rows.

    Returns:
        A DataFrame with columns ``School_Name``, ``State``, ``District``,
        ``Block``, ``Village`` and ``UDISE_Code`` (empty when nothing is
        found).
    """
    udise_list = tavily_search_codes(
        school_name=school_name,
        state_name=state_name,
        district=district,
        api_key=None,
        enforce_state_prefix=True,
    )

    if not udise_list:
        return pd.DataFrame(columns=_RESULT_COLUMNS)

    rows = get_school_rows_by_udise(
        state_name, udise_list, district, block, village
    )

    # ``columns=`` fixes both the column order and the schema of an empty
    # result, so no per-column back-filling is required.
    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def get_school_rows_by_udise(
    state_name: str,
    udise_codes: list[str],
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
) -> list[dict]:
    """Return master rows whose UDISE code is in *udise_codes*.

    The matches are further narrowed by the optional state (case-insensitive),
    district, block and village filters (exact match).

    Returns:
        A list of dicts keyed by ``School_Name``, ``State``, ``District``,
        ``Block``, ``Village`` and ``UDISE_Code``; empty when *udise_codes*
        is empty or nothing matches.
    """
    if not udise_codes:
        return []

    _load_master_if_needed()

    # The master is loaded with dtype=str, so compare codes as strings.
    wanted_codes = {str(code) for code in udise_codes}
    df = master_df[master_df[MASTER_UDISE_COL].isin(wanted_codes)]
    df = _filter_master(df, state_name, district, block, village)

    return [_row_to_record(row) for _, row in df.iterrows()]


# ====================================================
# RAPIDFUZZ SEARCH (Village-aware)
# ====================================================

def search_candidates(
    query_name: str,
    state: str | None,
    district: str | None,
    block: str | None,
    village: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fuzzy-match *query_name* against school names in the filtered master.

    The master is first narrowed by the optional state/district/block/village
    filters. The remaining school names are then scored with
    ``fuzz.token_set_ratio``, using ``normalize_with_patterns_dynamic`` as
    the processor, and at most ``MAX_CANDIDATES`` matches are kept.

    Returns:
        ``(candidates_df, best_df)``. ``candidates_df`` holds the matches in
        the order RapidFuzz returned them, with the standard result columns
        plus ``Score``; ``best_df`` is a copy of its first row. Both are
        empty, column-less DataFrames when *query_name* is empty or the
        filters leave no rows.

    Raises:
        ImportError: if ``admin_patterns`` could not be imported at module
            load and still cannot be imported now.
    """
    global normalize_with_patterns_dynamic
    if normalize_with_patterns_dynamic is None:
        # Module-level import failed; retry and cache the result globally.
        from admin_patterns import normalize_with_patterns_dynamic

    if not query_name:
        return pd.DataFrame(), pd.DataFrame()

    _load_master_if_needed()

    # -------- Filters --------
    df = _filter_master(master_df, state, district, block, village)
    if df.empty:
        return pd.DataFrame(), pd.DataFrame()

    state_for_patterns = (state or DEFAULT_STATE_KEY).upper()

    # Passing a Series keeps the frame's index labels attached to each
    # choice; the third element of every result tuple is used below as the
    # label for looking the row back up.
    choices = df[MASTER_SCHOOL_COL].astype(str)
    candidates_raw = process.extract(
        query_name,
        choices,
        scorer=fuzz.token_set_ratio,
        processor=lambda s: normalize_with_patterns_dynamic(
            s, state_for_patterns
        ),
        limit=MAX_CANDIDATES,
    )

    rows = []
    for _choice, score, idx in candidates_raw:
        record = _row_to_record(df.loc[idx])
        record["Score"] = score
        rows.append(record)

    candidates_df = pd.DataFrame(rows)
    best_df = candidates_df.head(1).copy()
    return candidates_df, best_df