Spaces:

Apf-AI4Good
/

FindSchoolByName

Sleeping

App Files Files Community

gkdivya committed on Dec 17, 2025

Commit

72d160b

verified ·

1 Parent(s): 5be6e81

Update searchschool.py

Browse files

Files changed (1) hide show

searchschool.py +118 -192

searchschool.py CHANGED Viewed

@@ -6,7 +6,7 @@ from rapidfuzz import process, fuzz
 from web_search import tavily_search_codes
 # ====================================================
-# CONFIG: columns, states, HF dataset
 # ====================================================
 MASTER_SCHOOL_COL   = "School_Name__c"
 MASTER_DISTRICT_COL = "School_District__c"
@@ -15,43 +15,41 @@ MASTER_UDISE_COL    = "School_Udise_Code__c"
 MASTER_STATE_COL    = "School_State__c"
 HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
-# Map state keys to CSV filenames inside that dataset
-STATE_HF_FILES = {
-    "ARUNACHAL PRADESH": "Arunachal Pradesh.csv",
-    "ASSAM": "Assam.csv",
-    "BIHAR": "Bihar.csv",
-    "CHHATTISGARH": "Chhattisgarh.csv",
-    "JHARKHAND": "Jharkhand.csv",
-    "MADHYA PRADESH": "Madhya Pradesh.csv",
-    "MANIPUR": "Manipur.csv",
-    "MEGHALAYA": "Meghalaya.csv",
-    "MIZORAM": "Mizoram.csv",
-    "NAGALAND": "Nagaland.csv",
-    "ODISHA": "Odisha.csv",
-    "PUDUCHERRY": "Puducherry.csv",
-    "RAJASTHAN": "Rajasthan.csv",
-    "SIKKIM": "Sikkim.csv",
-    "TELANGANA": "Telangana.csv",
-    "TRIPURA": "Tripura.csv",
-    "UTTAR PRADESH": "Uttar Pradesh.csv",
-    "UTTARAKHAND": "Uttarakhand.csv"
-}
 DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
 MAX_CANDIDATES = 5
-# global cache
 master_df = None
-# You will import normalize_with_patterns_dynamic from admin_patterns when needed
-# to avoid circular imports, main app passes runtime normalization in search_candidates
 try:
     from admin_patterns import normalize_with_patterns_dynamic
 except Exception:
-    # if admin_patterns isn't importable at module import time, we will import inside functions
     normalize_with_patterns_dynamic = None
 def on_search_web(
     school_name: str,
     state_name: str,
@@ -59,234 +57,166 @@ def on_search_web(
     block: str = None
 ):
     """
-    1. Performs Tavily search → returns list of valid UDISE codes.
-    2. Looks up these UDISE codes in our HF Schools dataset using
-       get_school_rows_by_udise().
-    3. Converts results into the standard DataFrame your Gradio app expects.
-    Returns:
-        pandas.DataFrame with columns:
-        School_Name, State, District, Block, UDISE_Code, Score
     """
-    # Step 1: Tavily → list of UDISE codes
     udise_list = tavily_search_codes(
         school_name=school_name,
         state_name=state_name,
         district=district,
-        api_key=None,               # use HuggingFace secret instead
-        enforce_state_prefix=True
     )
-    print(udise_list)
     if not udise_list:
-        # Always return an empty DF with correct schema
         return pd.DataFrame(
             columns=["School_Name", "State", "District", "Block", "UDISE_Code"]
         )
-    # Step 2: HF dataset lookup
-    rows = get_school_rows_by_udise(state_name, udise_list, try_global=True)
-    # Step 3: Convert list → DataFrame
     df = pd.DataFrame(rows)
-    # Make sure all expected columns exist
     expected = ["School_Name", "State", "District", "Block", "UDISE_Code"]
     for col in expected:
         if col not in df.columns:
-            df[col] = None   # keep schema consistent
-    # Reorder to canonical format
-    df = df[expected]
-    # Score is not applicable for web search → keep None
-    return df
-def get_school_rows_by_udise(state_name: str, udise_codes: list[str], try_global: bool = True) -> list:
     """
-    Very simplified UDISE → school rows lookup.
-    Returns list of dicts:
-        School_Name, State, District, Block, UDISE_Code
     """
     if not udise_codes:
         return []
-    udise_codes = list({str(u) for u in udise_codes})  # unique + cast to str
-    results = []
-    # --- Normalize state key ---
-    state_key = None
-    if state_name:
-        upper = state_name.strip().upper()
-        for k in STATE_HF_FILES.keys():
-            if k.upper() == upper:
-                state_key = k
-                break
-    # --- Helper: read CSV safely ---
-    def load_csv(filename):
-        try:
-            path = hf_hub_download(
-                repo_id=HF_SCHOOLS_DATASET,
-                repo_type="dataset",
-                filename=filename
-            )
-            return pd.read_csv(path, dtype=str).fillna("")
-        except Exception:
-            return pd.DataFrame()
-    # --- Helper: extract rows for given DF ---
-    def extract_rows(df, state_label):
-        if df.empty or MASTER_UDISE_COL not in df.columns:
-            return []
-        matched = df[df[MASTER_UDISE_COL].isin(udise_codes)]
-        if matched.empty:
-            return []
-        rows = []
-        for _, r in matched.iterrows():
-            rows.append({
-                "School_Name": r.get(MASTER_SCHOOL_COL, ""),
-                "State": r.get(MASTER_STATE_COL, state_label),
-                "District": r.get(MASTER_DISTRICT_COL, ""),
-                "Block": r.get(MASTER_BLOCK_COL, ""),
-                "UDISE_Code": r.get(MASTER_UDISE_COL, "")
-            })
-        return rows
-    # --- 1) Try requested state first ---
-    if state_key:
-        fname = STATE_HF_FILES[state_key]
-        df_state = load_csv(fname)
-        rows = extract_rows(df_state, state_label=state_key)
-        if rows:
-            return rows
-    # --- 2) Try all states (global fallback) ---
-    if try_global:
-        for sk, fname in STATE_HF_FILES.items():
-            df = load_csv(fname)
-            rows = extract_rows(df, state_label=sk)
-            if rows:
-                results.extend(rows)
-    return results
 def load_master_for_state(state_key: str | None):
     """
-    Load the master CSV for a state from Hugging Face Hub (dataset repo),
-    set global master_df, and return District & Block dropdown configs.
     """
-    global master_df
-    if not state_key:
-        master_df = None
-        return gr.Dropdown(choices=[], value=None), gr.Dropdown(choices=[], value=None)  # gr referenced in app; kept for signature
-    state_key_norm = state_key.upper().strip()
-    if state_key_norm not in STATE_HF_FILES:
-        master_df = None
-        return gr.Dropdown(choices=[], value=None), gr.Dropdown(choices=[], value=None)
-    csv_filename = STATE_HF_FILES[state_key_norm]
-    # Download the CSV file from the dataset repo
-    local_path = hf_hub_download(
-        repo_id=HF_SCHOOLS_DATASET,
-        repo_type="dataset",
-        filename=csv_filename,
-    )
-    master_df = pd.read_csv(local_path, dtype=str).fillna("")
-    # District choices
-    if MASTER_DISTRICT_COL in master_df.columns:
-        districts = sorted(master_df[MASTER_DISTRICT_COL].dropna().unique().tolist())
         districts = ["All"] + districts
     else:
         districts = []
-    # Initial blocks
-    blocks = ["All"] if MASTER_BLOCK_COL in master_df.columns else []
-    # Return gr-compatible Dropdown values (constructed in app)
-    # To avoid importing gr here (keeping logic separate), return lists and let app assemble Dropdowns if needed.
-    # However, in our app we directly return gr.Dropdown — so keep compatibility.
-    import gradio as gr  # local import to avoid circular imports at top
-    return gr.Dropdown(choices=districts, value="All" if districts else None), gr.Dropdown(choices=blocks, value="All" if blocks else None)
 def update_blocks(district: str | None):
     """
-    Update Block dropdown when District changes.
     """
-    global master_df
     import gradio as gr
-    if master_df is None or MASTER_BLOCK_COL not in master_df.columns:
-        return gr.Dropdown(choices=["All"], value="All")
     df = master_df
-    if (
-        district
-        and district != "All"
-        and MASTER_DISTRICT_COL in df.columns
-    ):
         df = df[df[MASTER_DISTRICT_COL] == district]
-    blocks = sorted(df[MASTER_BLOCK_COL].dropna().unique().tolist())
-    blocks = ["All"] + blocks if blocks else ["All"]
     return gr.Dropdown(choices=blocks, value="All")
-def search_candidates(query_name: str, state_key: str | None, district: str | None, block: str | None):
     """
-    Given school name + state + district + block, return:
-    - candidates table (top N matches)
-    - best-candidate table (single row)
     """
-    global master_df, normalize_with_patterns_dynamic
-    # import normalize function if not loaded yet (avoids circular import)
     if normalize_with_patterns_dynamic is None:
-        from admin_patterns import normalize_with_patterns_dynamic  # local import
-        normalize_with_patterns_dynamic = normalize_with_patterns_dynamic
-    if master_df is None:
-        return pd.DataFrame(), pd.DataFrame()
-    query_name = (query_name or "").strip()
     if not query_name:
         return pd.DataFrame(), pd.DataFrame()
     df = master_df
     # Filter by district
-    if (
-        district
-        and district != "All"
-        and MASTER_DISTRICT_COL in df.columns
-    ):
         df = df[df[MASTER_DISTRICT_COL] == district]
     # Filter by block
-    if (
-        block
-        and block != "All"
-        and MASTER_BLOCK_COL in df.columns
-    ):
         df = df[df[MASTER_BLOCK_COL] == block]
     if df.empty:
         return pd.DataFrame(), pd.DataFrame()
-    state_for_patterns = (state_key or DEFAULT_STATE_KEY).upper().strip()
     choices = df[MASTER_SCHOOL_COL].astype(str)
@@ -294,29 +224,25 @@ def search_candidates(query_name: str, state_key: str | None, district: str | No
         query_name,
         choices,
         scorer=fuzz.token_set_ratio,
-        processor=lambda s: normalize_with_patterns_dynamic(s, state_for_patterns),
         limit=MAX_CANDIDATES,
-    )  # (choice, score, key)
-    if not candidates_raw:
-        return pd.DataFrame(), pd.DataFrame()
     rows = []
-    for choice_name, score, key in candidates_raw:
-        try:
-            row = df.loc[key]
-        except Exception:
-            continue
         rows.append({
-            "School_Name": row.get(MASTER_SCHOOL_COL, ""),
-            "State": row.get(MASTER_STATE_COL, "") if MASTER_STATE_COL in df.columns else state_for_patterns,
-            "District": row.get(MASTER_DISTRICT_COL, "") if MASTER_DISTRICT_COL in df.columns else "",
-            "Block": row.get(MASTER_BLOCK_COL, "") if MASTER_BLOCK_COL in df.columns else "",
-            "UDISE_Code": row.get(MASTER_UDISE_COL, "") if MASTER_UDISE_COL in df.columns else "",
             "Score": score,
         })
     candidates_df = pd.DataFrame(rows)
     best_df = candidates_df.head(1).copy()
     return candidates_df, best_df

 from web_search import tavily_search_codes
 # ====================================================
+# CONFIG: columns + HF dataset
 # ====================================================
 MASTER_SCHOOL_COL   = "School_Name__c"
 MASTER_DISTRICT_COL = "School_District__c"
 MASTER_STATE_COL    = "School_State__c"
 HF_SCHOOLS_DATASET = "Apf-AI4Good/Schools"
+MASTER_ALL_STATES_FILE = "master_all_states.csv"
 DEFAULT_STATE_KEY = "ARUNACHAL PRADESH"
 MAX_CANDIDATES = 5
+# global cache (loaded once)
 master_df = None
+# normalization helper (import attempted here; search_candidates retries it lazily to avoid circular deps)
 try:
     from admin_patterns import normalize_with_patterns_dynamic
 except Exception:
     normalize_with_patterns_dynamic = None
+# ====================================================
+# INTERNAL: load master CSV once
+# ====================================================
+def _load_master_if_needed():
+    """
+    Populate the module-level ``master_df`` cache on first use.
+
+    Returns immediately when ``master_df`` is already set. Otherwise
+    downloads MASTER_ALL_STATES_FILE from the HF_SCHOOLS_DATASET dataset
+    repo with ``hf_hub_download`` and reads it into ``master_df``.
+    No exception handling is done here, so download or parse errors
+    propagate to the caller.
+    """
+    global master_df
+    if master_df is not None:
+        return
+    local_path = hf_hub_download(
+        repo_id=HF_SCHOOLS_DATASET,
+        repo_type="dataset",
+        filename=MASTER_ALL_STATES_FILE,
+    )
+    # dtype=str reads every column as text; fillna("") then replaces
+    # missing cells with empty strings.
+    master_df = pd.read_csv(local_path, dtype=str).fillna("")
+# ====================================================
+# WEB SEARCH → UDISE → MASTER LOOKUP
+# ====================================================
 def on_search_web(
     school_name: str,
     state_name: str,
     block: str = None
 ):
     """
+    1. Tavily search → list of UDISE codes
+    2. Lookup those UDISE codes in master_all_states.csv
+    3. Return standardized DataFrame
     """
+    # Step 1: Tavily search
     udise_list = tavily_search_codes(
         school_name=school_name,
         state_name=state_name,
         district=district,
+        api_key=None,
+        enforce_state_prefix=True,
     )
     if not udise_list:
         return pd.DataFrame(
             columns=["School_Name", "State", "District", "Block", "UDISE_Code"]
         )
+    # Step 2: lookup
+    rows = get_school_rows_by_udise(state_name, udise_list)
+    # Step 3: to DataFrame
     df = pd.DataFrame(rows)
     expected = ["School_Name", "State", "District", "Block", "UDISE_Code"]
     for col in expected:
         if col not in df.columns:
+            df[col] = None
+    return df[expected]
+def get_school_rows_by_udise(state_name: str, udise_codes: list[str]):
     """
+    UDISE → school rows lookup from master_all_states.csv
+
+    Args:
+        state_name: when truthy, only rows whose MASTER_STATE_COL equals
+            it case-insensitively are kept; when falsy, no state filter.
+        udise_codes: UDISE codes to look up; each is cast to ``str``.
+
+    Returns:
+        A list of dicts with keys School_Name, State, District, Block and
+        UDISE_Code, one per matching row; ``[]`` when udise_codes is empty.
     """
     if not udise_codes:
         return []
+    _load_master_if_needed()
+    # Set of strings: drops duplicates and matches the master columns,
+    # which are read as text (dtype=str).
+    udise_codes = {str(u) for u in udise_codes}
+    df = master_df
+    matched = df[df[MASTER_UDISE_COL].isin(udise_codes)]
+    if state_name:
+        # Upper-cased on both sides; surrounding whitespace is not stripped.
+        matched = matched[
+            matched[MASTER_STATE_COL].str.upper() == state_name.upper()
+        ]
+    rows = []
+    for _, r in matched.iterrows():
+        # .get falls back to "" when a column is absent from the row.
+        rows.append({
+            "School_Name": r.get(MASTER_SCHOOL_COL, ""),
+            "State": r.get(MASTER_STATE_COL, ""),
+            "District": r.get(MASTER_DISTRICT_COL, ""),
+            "Block": r.get(MASTER_BLOCK_COL, ""),
+            "UDISE_Code": r.get(MASTER_UDISE_COL, ""),
+        })
+    return rows
+# ====================================================
+# MASTER LOAD FOR UI (STATE → DISTRICT → BLOCK)
+# ====================================================
 def load_master_for_state(state_key: str | None):
     """
+    Load master_all_states.csv once.
+    Filter districts by selected state.
+
+    Args:
+        state_key: state to filter on (compared upper-cased, without
+            stripping whitespace); when falsy, all rows are used.
+
+    Returns:
+        A tuple of two ``gr.Dropdown`` objects: the district dropdown
+        ("All" followed by the sorted unique districts, or no choices when
+        the district column is missing) and a block dropdown that only
+        offers "All".
     """
+    # Imported inside the function rather than at module top level.
+    import gradio as gr
+    _load_master_if_needed()
+    df = master_df
+    if state_key:
+        df = df[df[MASTER_STATE_COL].str.upper() == state_key.upper()]
+    if MASTER_DISTRICT_COL in df.columns:
+        # NOTE(review): the master frame is loaded with fillna(""), so an
+        # empty-string district would show up here as a choice — confirm.
+        districts = sorted(df[MASTER_DISTRICT_COL].unique().tolist())
         districts = ["All"] + districts
     else:
         districts = []
+    # Blocks are not computed here; update_blocks fills them per district.
+    blocks = ["All"]
+    return (
+        gr.Dropdown(choices=districts, value="All" if districts else None),
+        gr.Dropdown(choices=blocks, value="All"),
+    )
 def update_blocks(district: str | None):
     """
+    Update block dropdown when district changes
+
+    Args:
+        district: selected district; falsy or "All" means no district
+            filter is applied.
+
+    Returns:
+        A ``gr.Dropdown`` whose choices are "All" followed by the sorted
+        unique blocks of the (optionally filtered) master data, with
+        "All" selected. Only "All" is offered when the block column is
+        missing.
     """
     import gradio as gr
+    _load_master_if_needed()
     df = master_df
+    # NOTE(review): only the district is matched here, not the state, so
+    # identically named districts in different states would be merged —
+    # confirm this is acceptable.
+    if district and district != "All":
         df = df[df[MASTER_DISTRICT_COL] == district]
+    if MASTER_BLOCK_COL in df.columns:
+        blocks = sorted(df[MASTER_BLOCK_COL].unique().tolist())
+        blocks = ["All"] + blocks if blocks else ["All"]
+    else:
+        blocks = ["All"]
     return gr.Dropdown(choices=blocks, value="All")
+# ====================================================
+# RAPIDFUZZ SEARCH
+# ====================================================
+def search_candidates(
+    query_name: str,
+    state_key: str | None,
+    district: str | None,
+    block: str | None,
+):
     """
+    Given school name + filters, return:
+    - candidates table
+    - best candidate table
     """
+    global normalize_with_patterns_dynamic
     if normalize_with_patterns_dynamic is None:
+        from admin_patterns import normalize_with_patterns_dynamic
     if not query_name:
         return pd.DataFrame(), pd.DataFrame()
+    _load_master_if_needed()
     df = master_df
+    # Filter by state
+    if state_key:
+        df = df[df[MASTER_STATE_COL].str.upper() == state_key.upper()]
     # Filter by district
+    if district and district != "All":
         df = df[df[MASTER_DISTRICT_COL] == district]
     # Filter by block
+    if block and block != "All":
         df = df[df[MASTER_BLOCK_COL] == block]
     if df.empty:
         return pd.DataFrame(), pd.DataFrame()
+    state_for_patterns = (state_key or DEFAULT_STATE_KEY).upper()
     choices = df[MASTER_SCHOOL_COL].astype(str)
         query_name,
         choices,
         scorer=fuzz.token_set_ratio,
+        processor=lambda s: normalize_with_patterns_dynamic(
+            s, state_for_patterns
+        ),
         limit=MAX_CANDIDATES,
+    )
     rows = []
+    for choice_name, score, idx in candidates_raw:
+        r = df.loc[idx]
         rows.append({
+            "School_Name": r.get(MASTER_SCHOOL_COL, ""),
+            "State": r.get(MASTER_STATE_COL, ""),
+            "District": r.get(MASTER_DISTRICT_COL, ""),
+            "Block": r.get(MASTER_BLOCK_COL, ""),
+            "UDISE_Code": r.get(MASTER_UDISE_COL, ""),
             "Score": score,
         })
     candidates_df = pd.DataFrame(rows)
     best_df = candidates_df.head(1).copy()
     return candidates_df, best_df