Spaces:
Running
Running
| # ==================================================== | |
| # searchschool.py — Village Enabled | |
| # ==================================================== | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download | |
| from rapidfuzz import process, fuzz | |
| from web_search import tavily_search_codes | |
| # ==================================================== | |
| # CONFIG: columns + HF dataset | |
| # ==================================================== | |
# Column names used to index the master schools DataFrame.
MASTER_SCHOOL_COL = "School_Name__c"
MASTER_DISTRICT_COL = "School_District__c"
MASTER_BLOCK_COL = "School_Block__c"
MASTER_VILLAGE_COL = "School_Village__c"
MASTER_UDISE_COL = "School_Udise_Code__c"
MASTER_STATE_COL = "School_State__c"
# Hugging Face Hub dataset repo, and the file inside it that holds the
# master table (both are passed to hf_hub_download).
HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
MASTER_ALL_STATES_FILE = "master_all_states.xlsx"
# State key handed to the pattern normalizer when the caller gives no state.
DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
# Upper bound on the number of fuzzy-match candidates requested per search.
MAX_CANDIDATES = 5
# global cache: the master DataFrame, filled on first use by
# _load_master_if_needed(); None until then.
master_df = None
# normalization helper: the import is best-effort at module load. If it
# fails for any reason the name is set to None, and search_candidates()
# retries the import when it is called.
try:
    from admin_patterns import normalize_with_patterns_dynamic
except Exception:
    normalize_with_patterns_dynamic = None
| # ==================================================== | |
| # INTERNAL: load master once | |
| # ==================================================== | |
def _load_master_if_needed():
    """Populate the module-level ``master_df`` cache on first use.

    When the cache is still ``None``, the master workbook is fetched from
    the Hugging Face dataset repo via ``hf_hub_download`` and read with
    every column as ``str``; missing cells are replaced by empty strings.
    When the cache is already populated the call does nothing.
    """
    global master_df

    if master_df is None:
        workbook_path = hf_hub_download(
            filename=MASTER_ALL_STATES_FILE,
            repo_type="dataset",
            repo_id=HF_SCHOOLS_DATASET,
        )
        # Read everything as text so codes keep their exact spelling, and
        # assign to the cache only after the frame is fully prepared.
        loaded = pd.read_excel(workbook_path, dtype=str)
        master_df = loaded.fillna("")
| # ==================================================== | |
| # WEB SEARCH → UDISE → MASTER LOOKUP | |
| # ==================================================== | |
def on_search_web(
    school_name: str,
    state_name: str,
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
):
    """Find schools via web search and resolve them against the master table.

    Steps:
        1. Ask ``tavily_search_codes`` for UDISE codes matching the school
           name, state and district.
        2. Look those codes up in the master table, narrowed by the
           state / district / block / village filters.
        3. Return the matches as a DataFrame with a fixed column layout.

    ``block`` and ``village`` are not sent to the web search; they only
    narrow the master-table lookup.

    Returns:
        A DataFrame with exactly the columns ``School_Name``, ``State``,
        ``District``, ``Block``, ``Village`` and ``UDISE_Code`` (in that
        order). It is empty when the web search yields no codes or none of
        the codes survive the master lookup.
    """
    output_columns = [
        "School_Name", "State",
        "District", "Block", "Village",
        "UDISE_Code"
    ]

    found_codes = tavily_search_codes(
        school_name=school_name,
        state_name=state_name,
        district=district,
        api_key=None,
        enforce_state_prefix=True,
    )
    if not found_codes:
        return pd.DataFrame(columns=output_columns)

    matched_rows = get_school_rows_by_udise(
        state_name, found_codes, district, block, village
    )
    result = pd.DataFrame(matched_rows)

    # An empty lookup produces a frame without columns; add whichever of
    # the standard columns are absent so the selection below always works.
    absent = [name for name in output_columns if name not in result.columns]
    for name in absent:
        result[name] = None

    return result[output_columns]
def get_school_rows_by_udise(
    state_name: str,
    udise_codes: list[str],
    district: str | None = None,
    block: str | None = None,
    village: str | None = None,
):
    """Return master-table rows whose UDISE code is in ``udise_codes``.

    Args:
        state_name: If non-empty, keep only rows whose state equals it,
            compared case-insensitively.
        udise_codes: Codes to look up; each is converted with ``str``
            before matching.
        district: If non-empty, keep only rows with exactly this district.
        block: If non-empty, keep only rows with exactly this block.
        village: If non-empty (and the master table has a village column),
            keep only rows with exactly this village.

    Returns:
        A list of dicts with the keys ``School_Name``, ``State``,
        ``District``, ``Block``, ``Village`` and ``UDISE_Code``. The list
        is empty when ``udise_codes`` is empty or nothing matches.
    """
    if not udise_codes:
        # Nothing to look up, so skip loading the master table entirely.
        return []

    _load_master_if_needed()

    wanted_codes = {str(code) for code in udise_codes}
    matches = master_df[master_df[MASTER_UDISE_COL].isin(wanted_codes)]

    if state_name:
        state_upper = state_name.upper()
        matches = matches[matches[MASTER_STATE_COL].str.upper() == state_upper]

    # District, block and village are exact, case-sensitive comparisons.
    exact_filters = [
        (MASTER_DISTRICT_COL, district),
        (MASTER_BLOCK_COL, block),
    ]
    if MASTER_VILLAGE_COL in matches.columns:
        exact_filters.append((MASTER_VILLAGE_COL, village))
    for column, wanted in exact_filters:
        if wanted:
            matches = matches[matches[column] == wanted]

    # Output key -> master column; a column missing from the row yields "".
    field_map = (
        ("School_Name", MASTER_SCHOOL_COL),
        ("State", MASTER_STATE_COL),
        ("District", MASTER_DISTRICT_COL),
        ("Block", MASTER_BLOCK_COL),
        ("Village", MASTER_VILLAGE_COL),
        ("UDISE_Code", MASTER_UDISE_COL),
    )
    return [
        {out_key: record.get(src_col, "") for out_key, src_col in field_map}
        for _, record in matches.iterrows()
    ]
| # ==================================================== | |
| # RAPIDFUZZ SEARCH (Village-aware) | |
| # ==================================================== | |
def search_candidates(
    query_name: str,
    state: str | None,
    district: str | None,
    block: str | None,
    village: str | None = None,
):
    """Fuzzy-match a school name against the master table.

    The master table is first narrowed by the optional location filters,
    then the remaining school names are ranked against ``query_name`` with
    RapidFuzz ``token_set_ratio``. ``normalize_with_patterns_dynamic`` is
    supplied to RapidFuzz as the ``processor``, keyed by the upper-cased
    state (``DEFAULT_STATE_KEY`` when no state is given).

    Args:
        query_name: School name to search for. An empty, ``None`` or
            whitespace-only value yields an empty result.
        state: If non-empty, keep only rows whose state equals it,
            compared case-insensitively.
        district: If non-empty, keep only rows with exactly this district.
        block: If non-empty, keep only rows with exactly this block.
        village: If non-empty (and the master table has a village column),
            keep only rows with exactly this village.

    Returns:
        ``(candidates_df, best_df)``. ``candidates_df`` holds at most
        ``MAX_CANDIDATES`` rows in the order RapidFuzz returned them, with
        the columns ``School_Name``, ``State``, ``District``, ``Block``,
        ``Village``, ``UDISE_Code`` and ``Score``. ``best_df`` is a copy of
        its first row. When there is nothing to search, both frames are
        empty but still carry those columns.

    Raises:
        Propagates whatever the deferred ``admin_patterns`` import raises
        when the module-level import had failed and the retry fails too.
    """
    global normalize_with_patterns_dynamic
    # The module-level import is best-effort; retry it here so a failure
    # at load time does not permanently disable searching.
    if normalize_with_patterns_dynamic is None:
        from admin_patterns import normalize_with_patterns_dynamic

    result_columns = [
        "School_Name", "State",
        "District", "Block", "Village",
        "UDISE_Code", "Score",
    ]

    def _empty_result():
        # Two independent frames, so mutating one never affects the other.
        return (
            pd.DataFrame(columns=result_columns),
            pd.DataFrame(columns=result_columns),
        )

    # A blank query would otherwise be scored against every school and
    # return arbitrary rows, so treat whitespace-only input as empty.
    if not query_name or not str(query_name).strip():
        return _empty_result()

    _load_master_if_needed()
    filtered = master_df

    # -------- Filters --------
    if state:
        filtered = filtered[
            filtered[MASTER_STATE_COL].str.upper() == state.upper()
        ]
    if district:
        filtered = filtered[filtered[MASTER_DISTRICT_COL] == district]
    if block:
        filtered = filtered[filtered[MASTER_BLOCK_COL] == block]
    if village and MASTER_VILLAGE_COL in filtered.columns:
        filtered = filtered[filtered[MASTER_VILLAGE_COL] == village]

    if filtered.empty:
        return _empty_result()

    state_for_patterns = (state or DEFAULT_STATE_KEY).upper()

    # Passing a Series makes RapidFuzz report each hit's index label as
    # the third tuple element, which is used to fetch the full row below.
    matches = process.extract(
        query_name,
        filtered[MASTER_SCHOOL_COL].astype(str),
        scorer=fuzz.token_set_ratio,
        processor=lambda text: normalize_with_patterns_dynamic(
            text, state_for_patterns
        ),
        limit=MAX_CANDIDATES,
    )

    records = []
    for _matched_name, score, label in matches:
        row = filtered.loc[label]
        records.append({
            "School_Name": row.get(MASTER_SCHOOL_COL, ""),
            "State": row.get(MASTER_STATE_COL, ""),
            "District": row.get(MASTER_DISTRICT_COL, ""),
            "Block": row.get(MASTER_BLOCK_COL, ""),
            "Village": row.get(MASTER_VILLAGE_COL, ""),
            "UDISE_Code": row.get(MASTER_UDISE_COL, ""),
            "Score": score,
        })

    # Fix the column layout explicitly so it matches the empty-result case.
    candidates_df = pd.DataFrame(records, columns=result_columns)
    best_df = candidates_df.head(1).copy()
    return candidates_df, best_df