Spaces:

Alshargi
/

hadeethapi

Running

App Files Community

Alshargi committed on Jan 29

Commit

397688f

verified ·

1 Parent(s): 90c65a7

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -112

app.py CHANGED Viewed

@@ -3,24 +3,27 @@ from __future__ import annotations
 import os
 import re
 import time
-import math
 from functools import lru_cache
-from typing import List, Dict, Any, Tuple, Optional
 import numpy as np
 import pandas as pd
 import faiss
 from flask import Flask, request, jsonify, Response
 from sentence_transformers import SentenceTransformer
 # =========================
-# Config
 # =========================
-INDEX_PATH = os.getenv("HADITH_INDEX_PATH", "hadith_ar.faiss")
 META_PATH  = os.getenv("HADITH_META_PATH",  "hadith_meta.parquet")
-MODEL_NAME = os.getenv("HADITH_MODEL_NAME", "intfloat/multilingual-e5-base")
 DEFAULT_TOP_K = 10
 MAX_TOP_K = 50
@@ -29,21 +32,21 @@ DEFAULT_RERANK_K = 35
 MAX_RERANK_K = 120
 MIN_RERANK_K = 10
-DEFAULT_HL_TOPN = 6        # for /highlight and html responses
 MAX_HL_TOPN = 25
 DEFAULT_SEG_MAXLEN = 220
 MAX_SEG_MAXLEN = 420
 MIN_SEG_MAXLEN = 120
-# Rerank speed/quality knobs (safe defaults)
-RERANK_MAX_SEGS_PER_DOC = int(os.getenv("RERANK_MAX_SEGS_PER_DOC", "10"))   # keep it small for speed
-RERANK_SEG_MAXLEN = int(os.getenv("RERANK_SEG_MAXLEN", "240"))             # segment length during rerank
-RERANK_WEIGHT = float(os.getenv("RERANK_WEIGHT", "0.65"))                  # 0..1 combine rerank with faiss
 RERANK_ENABLE = os.getenv("RERANK_ENABLE", "1").strip() != "0"
 # CORS
-CORS_ALLOW_ORIGIN = os.getenv("CORS_ALLOW_ORIGIN", "*")  # set to your domain if you want strict
 # =========================
@@ -75,7 +78,6 @@ def ar_tokens(text: str) -> List[str]:
     t = normalize_ar(text)
     t = _AR_PUNCT.sub(" ", t)
     toks = [x.strip() for x in t.split() if x.strip()]
-    # remove super short tokens
     toks = [x for x in toks if len(x) >= 2]
     return toks
@@ -117,19 +119,15 @@ def split_ar_segments(text: str, max_len: int) -> List[str]:
     if buf:
         segs.append(buf)
-    # fallback chunking
     if len(segs) <= 1 and len(t) > max_len:
         segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
     return segs
 def pick_segs_for_rerank(segs: List[str], max_keep: int) -> List[str]:
-    """Pick up to max_keep segments spread out (for speed)."""
     if len(segs) <= max_keep:
         return segs
-    # spread indices evenly
     idxs = np.linspace(0, len(segs) - 1, num=max_keep)
     idxs = [int(round(x)) for x in idxs]
-    # unique preserve order
     seen = set()
     out = []
     for i in idxs:
@@ -141,10 +139,11 @@ def pick_segs_for_rerank(segs: List[str], max_keep: int) -> List[str]:
 # =========================
 # Embedding helpers (cached)
 # =========================
 @lru_cache(maxsize=2048)
 def cached_query_emb(query_norm: str) -> bytes:
-    emb = model.encode(["query: " + query_norm], normalize_embeddings=True).astype("float32")[0]
     return emb.tobytes()
 def get_query_emb(query_norm: str) -> np.ndarray:
@@ -152,10 +151,9 @@ def get_query_emb(query_norm: str) -> np.ndarray:
 # =========================
-# Rerank + evidence HTML (no extra encode)
 # =========================
 def build_heatmap_html(segs: List[str], sims: np.ndarray, top_n: int = 6) -> str:
-    """Small bar-like heatmap using segment similarity (already computed)."""
     if not segs or sims.size == 0:
         return ""
@@ -166,14 +164,12 @@ def build_heatmap_html(segs: List[str], sims: np.ndarray, top_n: int = 6) -> str
     s_max = float(np.max(sims))
     denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
-    # choose top indices
     order = np.argsort(-sims)
     keep = set(order[:top_n])
     blocks = []
     for i in range(n):
-        w = (float(sims[i]) - s_min) / denom  # 0..1
-        # stronger for top segments
         alpha = (0.20 + 0.60 * w) if i in keep else (0.08 + 0.18 * w)
         alpha = max(0.06, min(alpha, 0.85))
         blocks.append(
@@ -193,7 +189,11 @@ def best_seg_html(segs: List[str], sims: np.ndarray) -> str:
     if not segs or sims.size == 0:
         return ""
     i = int(np.argmax(sims))
-    return f'<span style="background:rgba(255,230,120,0.55);border:1px solid rgba(234,179,8,0.35);border-radius:12px;padding:3px 8px;display:inline;">{escape_html(segs[i])}</span>'
 def lexical_ratio(query_norm: str, doc_norm: str, max_terms: int = 10) -> Tuple[float, str]:
     q_toks = ar_tokens(query_norm)
@@ -206,38 +206,29 @@ def lexical_ratio(query_norm: str, doc_norm: str, max_terms: int = 10) -> Tuple[
     return float(ratio), terms
 def confidence_label(score: float) -> Tuple[str, str]:
-    """
-    Simple score->label mapping.
-    Assumes cosine-like range ~[0..1] after normalization & blending.
-    """
     if score >= 0.78:
         return "HIGH", "bHigh"
     if score >= 0.62:
         return "MED", "bMed"
     return "LOW", "bLow"
 def rerank_rows(
     query_norm: str,
     df: pd.DataFrame,
     k_final: int,
 ) -> Tuple[pd.DataFrame, Dict[int, Dict[str, Any]]]:
-    """
-    Rerank using segment max similarity:
-    - Split each doc to segments (short)
-    - Pick a limited set of segments (speed)
-    - One encode call for all segments
-    Returns reranked df and per-hadith evidence dict (sims/segs + prebuilt html).
-    """
     evidence: Dict[int, Dict[str, Any]] = {}
     if (not RERANK_ENABLE) or df.empty:
-        # still fill basic fields
         for _, row in df.iterrows():
             hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
             evidence[hid] = {"mode": "disabled"}
         return df.head(k_final), evidence
-    # Collect segments for each candidate
     cand_rows = df.copy()
     per_doc_segs: List[List[str]] = []
@@ -247,21 +238,16 @@ def rerank_rows(
         hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
         doc_hids.append(hid)
-        ar = str(row.get("arabic", "") or "")
-        ar_clean = row.get("arabic_clean", "")
-        if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
-            ar_clean = ""
-        ar_clean = str(ar_clean).strip()
-        if not ar_clean:
-            ar_clean = normalize_ar(ar)
-        segs = split_ar_segments(ar_clean, max_len=RERANK_SEG_MAXLEN)
         segs = pick_segs_for_rerank(segs, max_keep=RERANK_MAX_SEGS_PER_DOC)
-        if not segs:
-            segs = [ar_clean[:RERANK_SEG_MAXLEN]] if ar_clean else []
         per_doc_segs.append(segs)
-    # Flatten
     all_segs: List[str] = []
     offsets: List[Tuple[int, int]] = []
     cur = 0
@@ -272,21 +258,14 @@ def rerank_rows(
         offsets.append((start, cur))
     if not all_segs:
-        # fallback: no rerank
         for hid in doc_hids:
             evidence[hid] = {"mode": "empty"}
         return cand_rows.head(k_final), evidence
-    # Encode query once + all segments once
-    q_emb = get_query_emb(query_norm)  # (d,)
-    seg_emb = model.encode(
-        ["passage: " + s for s in all_segs],
-        normalize_embeddings=True
-    ).astype("float32")  # (N, d)
-    sims_all = (seg_emb @ q_emb).astype(np.float32)  # (N,)
-    # Compute per-doc rerank score = max(sim)
     rr_scores: List[float] = []
     for hid, (start, end), segs in zip(doc_hids, offsets, per_doc_segs):
         if start == end:
@@ -297,7 +276,6 @@ def rerank_rows(
             rr = float(np.max(sims))
         rr_scores.append(rr)
-        # Build evidence HTML now (no extra encode)
         hm = build_heatmap_html(segs, sims, top_n=min(6, len(segs))) if sims.size else ""
         best = best_seg_html(segs, sims) if sims.size else ""
         evidence[hid] = {
@@ -305,14 +283,10 @@ def rerank_rows(
             "rerank_score": rr,
             "heatmap_html": hm,
             "best_seg_html": best,
-            "rerank_segs": segs,   # keep for debugging (can omit if you want)
-            "rerank_sims": None,   # don't ship full sims to client
         }
     cand_rows["rerank_score"] = rr_scores
-    # Blend: score_final = (1-w)*faiss + w*rerank
-    # Both are cosine-ish in [0,1] in your setup (normalize embeddings + IP index)
     faiss_scores = cand_rows["score"].astype(float).to_numpy()
     rr = cand_rows["rerank_score"].astype(float).to_numpy()
@@ -325,7 +299,7 @@ def rerank_rows(
 # =========================
-# Full highlight for ONE hadith (on click)
 # =========================
 def full_highlight_html(
     query_norm: str,
@@ -342,11 +316,7 @@ def full_highlight_html(
         }
     q_emb = get_query_emb(query_norm)
-    seg_emb = model.encode(
-        ["passage: " + s for s in segs],
-        normalize_embeddings=True
-    ).astype("float32")
     sims = (seg_emb @ q_emb).astype(np.float32)
     s_min = float(np.min(sims))
@@ -392,6 +362,13 @@ model = SentenceTransformer(MODEL_NAME)
 index = faiss.read_index(INDEX_PATH)
 meta  = pd.read_parquet(META_PATH)
 required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
 missing = required_cols - set(meta.columns)
 if missing:
@@ -400,6 +377,10 @@ if missing:
 if "arabic_clean" not in meta.columns:
     meta["arabic_clean"] = ""
 # =========================
 # FAISS Search
@@ -410,7 +391,7 @@ def semantic_search_df(query: str, top_k: int) -> pd.DataFrame:
         return meta.iloc[0:0].copy()
     top_k = max(1, min(int(top_k), MAX_TOP_K))
-    q_norm = normalize_ar(q)
     q_emb = get_query_emb(q_norm).reshape(1, -1)
     scores, idx = index.search(q_emb, top_k)
@@ -418,9 +399,6 @@ def semantic_search_df(query: str, top_k: int) -> pd.DataFrame:
     res = meta.iloc[idx[0]].copy()
     res["score"] = scores[0]
     res = res.sort_values("score", ascending=False)
-    # ensure arabic
-    res["arabic"] = res["arabic"].fillna("").astype(str)
     res = res[res["arabic"].str.strip() != ""]
     return res
@@ -473,7 +451,7 @@ def health():
 def search():
     q = request.args.get("q", "").strip()
-    # TopK final
     k_raw = request.args.get("k", str(DEFAULT_TOP_K)).strip()
     try:
         k = int(k_raw) if k_raw else DEFAULT_TOP_K
@@ -481,7 +459,7 @@ def search():
         k = DEFAULT_TOP_K
     k = max(1, min(k, MAX_TOP_K))
-    # rerank pool
     rk_raw = request.args.get("rerank_k", str(DEFAULT_RERANK_K)).strip()
     try:
         rerank_k = int(rk_raw) if rk_raw else DEFAULT_RERANK_K
@@ -490,7 +468,7 @@ def search():
     rerank_k = max(MIN_RERANK_K, min(rerank_k, MAX_RERANK_K))
     rerank_k = max(rerank_k, k)
-    # Highlight controls (only used for format=html; for fast mode you can still send hl_topn=0)
     hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
     seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
     try:
@@ -526,33 +504,27 @@ def search():
     t0 = time.time()
-    # 1) FAISS retrieve pool (rerank_k)
     df_pool = semantic_search_df(q, top_k=rerank_k)
     q_norm = normalize_ar(q)
-    # 2) rerank to final k + evidence (no extra encode)
     df_final, ev = rerank_rows(query_norm=q_norm, df=df_pool, k_final=k)
     took_ms = int((time.time() - t0) * 1000)
-    # Build results
     results: List[Dict[str, Any]] = []
     for _, row in df_final.iterrows():
         hid = int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None
         arabic = str(row.get("arabic", "") or "")
         english = str(row.get("english", "") or "")
-        ar_clean = row.get("arabic_clean", "")
-        if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
-            ar_clean = ""
-        ar_clean = str(ar_clean).strip()
         if not ar_clean:
             ar_clean = normalize_ar(arabic)
-        # lexical
         lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
-        # scores
         faiss_score = float(row.get("score")) if pd.notna(row.get("score")) else 0.0
         rerank_score = float(row.get("rerank_score")) if pd.notna(row.get("rerank_score")) else faiss_score
         final_score = float(row.get("final_score")) if pd.notna(row.get("final_score")) else faiss_score
@@ -567,31 +539,20 @@ def search():
             "hadithID": hid,
             "collection": str(row.get("collection", "") or ""),
             "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
-            # unified score the UI should use
             "score": final_score,
-            # diagnostics
             "faiss_score": faiss_score,
             "rerank_score": rerank_score,
             "conf_label": conf_label,
             "conf_class": conf_class,
             "lex_ratio": float(lex_r),
             "lex_terms": lex_terms,
             "arabic": arabic,
             "arabic_clean": ar_clean,
             "english": english,
-            # Provide evidence html even in json (cheap: already computed in rerank)
             "heatmap_html": heatmap_html,
             "best_seg_html": best_html,
         }
-        # If the caller asked for html AND did not disable highlight, also compute full highlight for each result.
-        # This is heavier. Recommended: keep hl_topn=0 for fast mode and use /highlight on click.
         if want_html and hl_topn > 0:
             extras = full_highlight_html(
                 query_norm=q_norm,
@@ -600,7 +561,6 @@ def search():
                 seg_maxlen=seg_maxlen,
             )
             r["arabic_clean_html"] = extras["arabic_clean_html"]
-            # You can overwrite with full-doc ones (optional):
             r["heatmap_html"] = extras["heatmap_html"] or r["heatmap_html"]
             r["best_seg_html"] = extras["best_seg_html"] or r["best_seg_html"]
@@ -624,10 +584,6 @@ def search():
 @app.get("/highlight")
 def highlight():
-    """
-    Highlight a single hadith on-demand (for fast UI).
-    GET /highlight?q=...&hadithID=123&format=html&hl_topn=6&seg_maxlen=220
-    """
     q = request.args.get("q", "").strip()
     hid_raw = request.args.get("hadithID", "").strip()
@@ -666,14 +622,10 @@ def highlight():
     arabic = str(row.get("arabic", "") or "")
     english = str(row.get("english", "") or "")
-    ar_clean = row.get("arabic_clean", "")
-    if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
-        ar_clean = ""
-    ar_clean = str(ar_clean).strip()
     if not ar_clean:
         ar_clean = normalize_ar(arabic)
-    # Always produce evidence + highlight here (one doc only)
     extras = full_highlight_html(
         query_norm=q_norm,
         arabic_clean_text=ar_clean,
@@ -681,7 +633,6 @@ def highlight():
         seg_maxlen=seg_maxlen,
     )
-    # lexical
     lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
     return jsonify({
@@ -692,14 +643,11 @@ def highlight():
         "format": "html" if want_html else "json",
         "hl_topn": hl_topn,
         "seg_maxlen": seg_maxlen,
         "lex_ratio": float(lex_r),
         "lex_terms": lex_terms,
         "arabic": arabic,
         "arabic_clean": ar_clean,
         "english": english,
         "arabic_clean_html": extras.get("arabic_clean_html", "") if want_html else "",
         "heatmap_html": extras.get("heatmap_html", ""),
         "best_seg_html": extras.get("best_seg_html", ""),
@@ -707,5 +655,6 @@ def highlight():
 if __name__ == "__main__":
-    # local run only
-    app.run(host="127.0.0.1", port=5000, debug=True)

 import os
 import re
 import time
 from functools import lru_cache
+from typing import List, Dict, Any, Tuple
 import numpy as np
 import pandas as pd
 import faiss
 from flask import Flask, request, jsonify, Response
 from sentence_transformers import SentenceTransformer
 # =========================
+# Config (HF Space defaults)
 # =========================
+INDEX_PATH = os.getenv("HADITH_INDEX_PATH", "hadith_semantic.faiss")
 META_PATH  = os.getenv("HADITH_META_PATH",  "hadith_meta.parquet")
+# Small/fast multilingual model (good on free CPU)
+MODEL_NAME = os.getenv(
+    "HADITH_MODEL_NAME",
+    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+)
 DEFAULT_TOP_K = 10
 MAX_TOP_K = 50
 MAX_RERANK_K = 120
 MIN_RERANK_K = 10
+DEFAULT_HL_TOPN = 6
 MAX_HL_TOPN = 25
 DEFAULT_SEG_MAXLEN = 220
 MAX_SEG_MAXLEN = 420
 MIN_SEG_MAXLEN = 120
+# Rerank knobs (keep small for HF free CPU)
+RERANK_MAX_SEGS_PER_DOC = int(os.getenv("RERANK_MAX_SEGS_PER_DOC", "8"))
+RERANK_SEG_MAXLEN = int(os.getenv("RERANK_SEG_MAXLEN", "240"))
+RERANK_WEIGHT = float(os.getenv("RERANK_WEIGHT", "0.65"))
 RERANK_ENABLE = os.getenv("RERANK_ENABLE", "1").strip() != "0"
 # CORS
+CORS_ALLOW_ORIGIN = os.getenv("CORS_ALLOW_ORIGIN", "*")
 # =========================
     t = normalize_ar(text)
     t = _AR_PUNCT.sub(" ", t)
     toks = [x.strip() for x in t.split() if x.strip()]
     toks = [x for x in toks if len(x) >= 2]
     return toks
     if buf:
         segs.append(buf)
     if len(segs) <= 1 and len(t) > max_len:
         segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
     return segs
 def pick_segs_for_rerank(segs: List[str], max_keep: int) -> List[str]:
     if len(segs) <= max_keep:
         return segs
     idxs = np.linspace(0, len(segs) - 1, num=max_keep)
     idxs = [int(round(x)) for x in idxs]
     seen = set()
     out = []
     for i in idxs:
 # =========================
 # Embedding helpers (cached)
+# IMPORTANT: This model does NOT use "query:" / "passage:" prefixes.
 # =========================
 @lru_cache(maxsize=2048)
 def cached_query_emb(query_norm: str) -> bytes:
+    emb = model.encode([query_norm], normalize_embeddings=True).astype("float32")[0]
     return emb.tobytes()
 def get_query_emb(query_norm: str) -> np.ndarray:
 # =========================
+# Evidence HTML
 # =========================
 def build_heatmap_html(segs: List[str], sims: np.ndarray, top_n: int = 6) -> str:
     if not segs or sims.size == 0:
         return ""
     s_max = float(np.max(sims))
     denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
     order = np.argsort(-sims)
     keep = set(order[:top_n])
     blocks = []
     for i in range(n):
+        w = (float(sims[i]) - s_min) / denom
         alpha = (0.20 + 0.60 * w) if i in keep else (0.08 + 0.18 * w)
         alpha = max(0.06, min(alpha, 0.85))
         blocks.append(
     if not segs or sims.size == 0:
         return ""
     i = int(np.argmax(sims))
+    return (
+        '<span style="background:rgba(255,230,120,0.55);'
+        'border:1px solid rgba(234,179,8,0.35);border-radius:12px;padding:3px 8px;display:inline;">'
+        f'{escape_html(segs[i])}</span>'
+    )
 def lexical_ratio(query_norm: str, doc_norm: str, max_terms: int = 10) -> Tuple[float, str]:
     q_toks = ar_tokens(query_norm)
     return float(ratio), terms
 def confidence_label(score: float) -> Tuple[str, str]:
     if score >= 0.78:
         return "HIGH", "bHigh"
     if score >= 0.62:
         return "MED", "bMed"
     return "LOW", "bLow"
+# =========================
+# Rerank
+# =========================
 def rerank_rows(
     query_norm: str,
     df: pd.DataFrame,
     k_final: int,
 ) -> Tuple[pd.DataFrame, Dict[int, Dict[str, Any]]]:
     evidence: Dict[int, Dict[str, Any]] = {}
     if (not RERANK_ENABLE) or df.empty:
         for _, row in df.iterrows():
             hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
             evidence[hid] = {"mode": "disabled"}
         return df.head(k_final), evidence
     cand_rows = df.copy()
     per_doc_segs: List[List[str]] = []
         hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
         doc_hids.append(hid)
+        ar = str(row.get("arabic_clean", "") or "").strip()
+        if not ar:
+            ar = normalize_ar(str(row.get("arabic", "") or ""))
+        segs = split_ar_segments(ar, max_len=RERANK_SEG_MAXLEN)
         segs = pick_segs_for_rerank(segs, max_keep=RERANK_MAX_SEGS_PER_DOC)
+        if not segs and ar:
+            segs = [ar[:RERANK_SEG_MAXLEN]]
         per_doc_segs.append(segs)
     all_segs: List[str] = []
     offsets: List[Tuple[int, int]] = []
     cur = 0
         offsets.append((start, cur))
     if not all_segs:
         for hid in doc_hids:
             evidence[hid] = {"mode": "empty"}
         return cand_rows.head(k_final), evidence
+    q_emb = get_query_emb(query_norm)
+    seg_emb = model.encode(all_segs, normalize_embeddings=True).astype("float32")
+    sims_all = (seg_emb @ q_emb).astype(np.float32)
     rr_scores: List[float] = []
     for hid, (start, end), segs in zip(doc_hids, offsets, per_doc_segs):
         if start == end:
             rr = float(np.max(sims))
         rr_scores.append(rr)
         hm = build_heatmap_html(segs, sims, top_n=min(6, len(segs))) if sims.size else ""
         best = best_seg_html(segs, sims) if sims.size else ""
         evidence[hid] = {
             "rerank_score": rr,
             "heatmap_html": hm,
             "best_seg_html": best,
         }
     cand_rows["rerank_score"] = rr_scores
     faiss_scores = cand_rows["score"].astype(float).to_numpy()
     rr = cand_rows["rerank_score"].astype(float).to_numpy()
 # =========================
+# Full highlight for ONE hadith
 # =========================
 def full_highlight_html(
     query_norm: str,
         }
     q_emb = get_query_emb(query_norm)
+    seg_emb = model.encode(segs, normalize_embeddings=True).astype("float32")
     sims = (seg_emb @ q_emb).astype(np.float32)
     s_min = float(np.min(sims))
 index = faiss.read_index(INDEX_PATH)
 meta  = pd.read_parquet(META_PATH)
+# Accept corpusID or hadithID, normalize to hadithID
+id_col = "hadithID" if "hadithID" in meta.columns else ("corpusID" if "corpusID" in meta.columns else None)
+if id_col is None:
+    raise ValueError("Meta must contain 'hadithID' or 'corpusID'")
+if id_col != "hadithID":
+    meta = meta.rename(columns={id_col: "hadithID"})
 required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
 missing = required_cols - set(meta.columns)
 if missing:
 if "arabic_clean" not in meta.columns:
     meta["arabic_clean"] = ""
+meta["arabic"] = meta["arabic"].fillna("").astype(str)
+meta["english"] = meta["english"].fillna("").astype(str)
+meta["arabic_clean"] = meta["arabic_clean"].fillna("").astype(str)
 # =========================
 # FAISS Search
         return meta.iloc[0:0].copy()
     top_k = max(1, min(int(top_k), MAX_TOP_K))
+    q_norm = normalize_ar(q)  # Arabic normalize, safe for English too
     q_emb = get_query_emb(q_norm).reshape(1, -1)
     scores, idx = index.search(q_emb, top_k)
     res = meta.iloc[idx[0]].copy()
     res["score"] = scores[0]
     res = res.sort_values("score", ascending=False)
     res = res[res["arabic"].str.strip() != ""]
     return res
 def search():
     q = request.args.get("q", "").strip()
+    # final top-k
     k_raw = request.args.get("k", str(DEFAULT_TOP_K)).strip()
     try:
         k = int(k_raw) if k_raw else DEFAULT_TOP_K
         k = DEFAULT_TOP_K
     k = max(1, min(k, MAX_TOP_K))
+    # rerank pool size
     rk_raw = request.args.get("rerank_k", str(DEFAULT_RERANK_K)).strip()
     try:
         rerank_k = int(rk_raw) if rk_raw else DEFAULT_RERANK_K
     rerank_k = max(MIN_RERANK_K, min(rerank_k, MAX_RERANK_K))
     rerank_k = max(rerank_k, k)
+    # highlight controls
     hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
     seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
     try:
     t0 = time.time()
+    # 1) retrieve pool
     df_pool = semantic_search_df(q, top_k=rerank_k)
     q_norm = normalize_ar(q)
+    # 2) rerank -> final
     df_final, ev = rerank_rows(query_norm=q_norm, df=df_pool, k_final=k)
     took_ms = int((time.time() - t0) * 1000)
     results: List[Dict[str, Any]] = []
     for _, row in df_final.iterrows():
         hid = int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None
         arabic = str(row.get("arabic", "") or "")
         english = str(row.get("english", "") or "")
+        ar_clean = str(row.get("arabic_clean", "") or "").strip()
         if not ar_clean:
             ar_clean = normalize_ar(arabic)
         lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
         faiss_score = float(row.get("score")) if pd.notna(row.get("score")) else 0.0
         rerank_score = float(row.get("rerank_score")) if pd.notna(row.get("rerank_score")) else faiss_score
         final_score = float(row.get("final_score")) if pd.notna(row.get("final_score")) else faiss_score
             "hadithID": hid,
             "collection": str(row.get("collection", "") or ""),
             "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
             "score": final_score,
             "faiss_score": faiss_score,
             "rerank_score": rerank_score,
             "conf_label": conf_label,
             "conf_class": conf_class,
             "lex_ratio": float(lex_r),
             "lex_terms": lex_terms,
             "arabic": arabic,
             "arabic_clean": ar_clean,
             "english": english,
             "heatmap_html": heatmap_html,
             "best_seg_html": best_html,
         }
         if want_html and hl_topn > 0:
             extras = full_highlight_html(
                 query_norm=q_norm,
                 seg_maxlen=seg_maxlen,
             )
             r["arabic_clean_html"] = extras["arabic_clean_html"]
             r["heatmap_html"] = extras["heatmap_html"] or r["heatmap_html"]
             r["best_seg_html"] = extras["best_seg_html"] or r["best_seg_html"]
 @app.get("/highlight")
 def highlight():
     q = request.args.get("q", "").strip()
     hid_raw = request.args.get("hadithID", "").strip()
     arabic = str(row.get("arabic", "") or "")
     english = str(row.get("english", "") or "")
+    ar_clean = str(row.get("arabic_clean", "") or "").strip()
     if not ar_clean:
         ar_clean = normalize_ar(arabic)
     extras = full_highlight_html(
         query_norm=q_norm,
         arabic_clean_text=ar_clean,
         seg_maxlen=seg_maxlen,
     )
     lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
     return jsonify({
         "format": "html" if want_html else "json",
         "hl_topn": hl_topn,
         "seg_maxlen": seg_maxlen,
         "lex_ratio": float(lex_r),
         "lex_terms": lex_terms,
         "arabic": arabic,
         "arabic_clean": ar_clean,
         "english": english,
         "arabic_clean_html": extras.get("arabic_clean_html", "") if want_html else "",
         "heatmap_html": extras.get("heatmap_html", ""),
         "best_seg_html": extras.get("best_seg_html", ""),
 if __name__ == "__main__":
+    # Hugging Face Spaces uses PORT=7860
+    port = int(os.getenv("PORT", "7860"))
+    app.run(host="0.0.0.0", port=port, debug=False)