Spaces:

Alshargi
/

hadeethapi

Running

App Files Community

Alshargi committed on Jan 27

Commit

50fe70f

verified ·

1 Parent(s): d293589

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -55

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 import re
 import time
-from typing import Any, Dict, List, Optional
 import numpy as np
 import pandas as pd
@@ -23,12 +23,9 @@ MODEL_NAME = os.getenv("HADITH_MODEL_NAME", "intfloat/multilingual-e5-base")
 DEFAULT_TOP_K = 10
 MAX_TOP_K = 50
-# If you want a smaller response payload
-DEFAULT_INCLUDE_TEXT = True
 # =========================
-# Arabic normalization
 # =========================
 _AR_DIACRITICS = re.compile(r"""
     [\u0610-\u061A]
@@ -38,7 +35,6 @@ _AR_DIACRITICS = re.compile(r"""
 """, re.VERBOSE)
 def normalize_ar(text: str) -> str:
-    """Remove tashkeel + normalize common Arabic letter variants."""
     if text is None:
         return ""
     text = str(text)
@@ -53,33 +49,50 @@ def normalize_ar(text: str) -> str:
 # =========================
-# Load model + index + meta (once)
 # =========================
-if not os.path.exists(INDEX_PATH):
-    raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
-if not os.path.exists(META_PATH):
-    raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")
-model = SentenceTransformer(MODEL_NAME)
-index = faiss.read_index(INDEX_PATH)
-meta  = pd.read_parquet(META_PATH)
-required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
-missing = required_cols - set(meta.columns)
-if missing:
-    raise ValueError(f"Meta is missing required columns: {missing}")
-if "arabic_clean" not in meta.columns:
-    meta["arabic_clean"] = ""
-# Normalize column types to avoid NaN surprises
-for col in ["arabic", "english", "arabic_clean", "collection"]:
-    if col in meta.columns:
-        meta[col] = meta[col].fillna("").astype(str)
 def semantic_search(query: str, top_k: int = DEFAULT_TOP_K) -> pd.DataFrame:
     q = str(query or "").strip()
     if not q:
         return meta.iloc[0:0].copy()
@@ -88,13 +101,14 @@ def semantic_search(query: str, top_k: int = DEFAULT_TOP_K) -> pd.DataFrame:
     q_norm = normalize_ar(q)
     q_emb = model.encode(["query: " + q_norm], normalize_embeddings=True).astype("float32")
     scores, idx = index.search(q_emb, top_k)
     res = meta.iloc[idx[0]].copy()
     res["score"] = scores[0].astype(float)
     res = res.sort_values("score", ascending=False)
-    # Ensure no empty Arabic (avoid useless results)
     res["arabic"] = res["arabic"].fillna("").astype(str)
     res = res[res["arabic"].str.strip() != ""]
@@ -103,11 +117,13 @@ def semantic_search(query: str, top_k: int = DEFAULT_TOP_K) -> pd.DataFrame:
 def row_to_json(row: pd.Series, include_text: bool = True) -> Dict[str, Any]:
     arabic = str(row.get("arabic", "") or "")
     arabic_clean = str(row.get("arabic_clean", "") or "").strip()
     if not arabic_clean:
         arabic_clean = normalize_ar(arabic)
-    base = {
         "score": float(row.get("score", 0.0)),
         "hadithID": int(row.get("hadithID")),
         "collection": str(row.get("collection", "")),
@@ -118,58 +134,88 @@ def row_to_json(row: pd.Series, include_text: bool = True) -> Dict[str, Any]:
         base.update({
             "arabic": arabic,
             "arabic_clean": arabic_clean,
-            "english": str(row.get("english", "") or ""),
         })
     return base
 # =========================
-# Flask API app
 # =========================
 app = Flask(__name__)
-CORS(app, resources={r"/*": {"origins": "*"}})  # allow calls from other hosts
-@app.get("/health")
-def health():
     return jsonify({
         "ok": True,
-        "rows": int(len(meta)),
-        "index_ntotal": int(getattr(index, "ntotal", -1)),
-        "model": MODEL_NAME
     })
 @app.post("/search")
-def search():
     """
-    JSON body:
     {
-      "q": "الزرق و سبيل الرزق",
       "k": 10,
       "include_text": true
     }
     """
     payload = request.get_json(silent=True) or {}
-    q = (payload.get("q") or "").strip()
-    k = payload.get("k", DEFAULT_TOP_K)
-    include_text = payload.get("include_text", DEFAULT_INCLUDE_TEXT)
-    # Validate
     if not q:
         return jsonify({"ok": False, "error": "Missing 'q'"}), 400
     try:
         k = int(k)
     except Exception:
         k = DEFAULT_TOP_K
     k = max(1, min(k, MAX_TOP_K))
     t0 = time.time()
-    res_df = semantic_search(q, top_k=k)
     took_ms = int((time.time() - t0) * 1000)
-    results = [row_to_json(r, include_text=bool(include_text)) for _, r in res_df.iterrows()]
     return jsonify({
         "ok": True,
@@ -186,34 +232,35 @@ def search():
 def search_get():
     """
     GET /search?q=...&k=10&include_text=1
-    Useful for quick testing in browser.
     """
     q = (request.args.get("q") or "").strip()
-    k = request.args.get("k", str(DEFAULT_TOP_K))
-    include_text = request.args.get("include_text", "1")
     if not q:
         return jsonify({"ok": False, "error": "Missing 'q'"}), 400
     try:
-        k_int = int(k)
     except Exception:
-        k_int = DEFAULT_TOP_K
-    k_int = max(1, min(k_int, MAX_TOP_K))
-    include_text_bool = include_text not in ("0", "false", "False", "")
     t0 = time.time()
-    res_df = semantic_search(q, top_k=k_int)
     took_ms = int((time.time() - t0) * 1000)
-    results = [row_to_json(r, include_text=include_text_bool) for _, r in res_df.iterrows()]
     return jsonify({
         "ok": True,
         "query": q,
         "query_norm": normalize_ar(q),
-        "k": k_int,
         "took_ms": took_ms,
         "results_count": len(results),
         "results": results
@@ -221,5 +268,5 @@ def search_get():
 if __name__ == "__main__":
-    # For local debug only. On HF Spaces, gunicorn/uvicorn handles it.
     app.run(host="0.0.0.0", port=7860, debug=False)

 import os
 import re
 import time
+from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 DEFAULT_TOP_K = 10
 MAX_TOP_K = 50
 # =========================
+# Arabic normalization (remove tashkeel + normalize letters)
 # =========================
 _AR_DIACRITICS = re.compile(r"""
     [\u0610-\u061A]
 """, re.VERBOSE)
 def normalize_ar(text: str) -> str:
     if text is None:
         return ""
     text = str(text)
 # =========================
+# Lazy load (load resources on demand)
 # =========================
+_model: Optional[SentenceTransformer] = None
+_index = None
+_meta: Optional[pd.DataFrame] = None
+def get_resources() -> Tuple[SentenceTransformer, Any, pd.DataFrame]:
+    global _model, _index, _meta
+    if _model is not None and _index is not None and _meta is not None:
+        return _model, _index, _meta
+    if not os.path.exists(INDEX_PATH):
+        raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
+    if not os.path.exists(META_PATH):
+        raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")
+    _model = SentenceTransformer(MODEL_NAME)
+    _index = faiss.read_index(INDEX_PATH)
+    _meta  = pd.read_parquet(META_PATH)
+    required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
+    missing = required_cols - set(_meta.columns)
+    if missing:
+        raise ValueError(f"Meta is missing required columns: {missing}")
+    if "arabic_clean" not in _meta.columns:
+        _meta["arabic_clean"] = ""
+    # Normalize types / fill missing
+    for col in ["arabic", "english", "arabic_clean", "collection"]:
+        if col in _meta.columns:
+            _meta[col] = _meta[col].fillna("").astype(str)
+    return _model, _index, _meta
+# =========================
+# Search
+# =========================
 def semantic_search(query: str, top_k: int = DEFAULT_TOP_K) -> pd.DataFrame:
+    model, index, meta = get_resources()
     q = str(query or "").strip()
     if not q:
         return meta.iloc[0:0].copy()
     q_norm = normalize_ar(q)
     q_emb = model.encode(["query: " + q_norm], normalize_embeddings=True).astype("float32")
     scores, idx = index.search(q_emb, top_k)
     res = meta.iloc[idx[0]].copy()
     res["score"] = scores[0].astype(float)
     res = res.sort_values("score", ascending=False)
+    # Filter empty arabic just in case
     res["arabic"] = res["arabic"].fillna("").astype(str)
     res = res[res["arabic"].str.strip() != ""]
 def row_to_json(row: pd.Series, include_text: bool = True) -> Dict[str, Any]:
     arabic = str(row.get("arabic", "") or "")
+    english = str(row.get("english", "") or "")
     arabic_clean = str(row.get("arabic_clean", "") or "").strip()
     if not arabic_clean:
         arabic_clean = normalize_ar(arabic)
+    base: Dict[str, Any] = {
         "score": float(row.get("score", 0.0)),
         "hadithID": int(row.get("hadithID")),
         "collection": str(row.get("collection", "")),
         base.update({
             "arabic": arabic,
             "arabic_clean": arabic_clean,
+            "english": english,
         })
     return base
 # =========================
+# Flask API
 # =========================
 app = Flask(__name__)
+CORS(app, resources={r"/*": {"origins": "*"}})
+@app.get("/")
+def root():
+    """Return a small JSON descriptor naming the service and its endpoints."""
     return jsonify({
         "ok": True,
+        "service": "hadeeth semantic search api",
+        "endpoints": ["/health", "/search (GET/POST)"]
     })
+@app.get("/health")
+def health():
+    """Report file presence, configuration and resource-load status as JSON.
+
+    The response always carries ``"ok": True``; whether the model, index and
+    metadata could actually be loaded is reported separately in ``"loaded"``
+    (with ``"load_error"`` holding the exception text on failure).
+    """
+    # Cheap check that does not touch the model: are the index and metadata
+    # files present on disk?
+    files_ok = os.path.exists(INDEX_PATH) and os.path.exists(META_PATH)
+    info = {
+        "ok": True,
+        "files_ok": files_ok,
+        "index_path": INDEX_PATH,
+        "meta_path": META_PATH,
+        "model": MODEL_NAME,
+    }
+    # NOTE: this call triggers the lazy load if resources are not cached yet,
+    # so the first /health request pays the full loading cost.
+    try:
+        _, index, meta = get_resources()
+        info["rows"] = int(len(meta))
+        info["index_ntotal"] = int(getattr(index, "ntotal", -1))
+        info["loaded"] = True
+    except Exception as e:
+        # Load failures are reported in the payload instead of raising.
+        info["loaded"] = False
+        info["load_error"] = str(e)
+    return jsonify(info)
 @app.post("/search")
+def search_post():
     """
+    Body JSON:
     {
+      "q": "الرزق",
       "k": 10,
       "include_text": true
     }
     """
     payload = request.get_json(silent=True) or {}
+    q = (payload.get("q") or "").strip()
     if not q:
         return jsonify({"ok": False, "error": "Missing 'q'"}), 400
+    k = payload.get("k", DEFAULT_TOP_K)
     try:
         k = int(k)
     except Exception:
         k = DEFAULT_TOP_K
     k = max(1, min(k, MAX_TOP_K))
+    include_text = payload.get("include_text", True)
+    include_text = bool(include_text)
     t0 = time.time()
+    try:
+        res_df = semantic_search(q, top_k=k)
+    except Exception as e:
+        return jsonify({"ok": False, "error": str(e)}), 500
     took_ms = int((time.time() - t0) * 1000)
+    results = [row_to_json(r, include_text=include_text) for _, r in res_df.iterrows()]
     return jsonify({
         "ok": True,
 def search_get():
     """
     GET /search?q=...&k=10&include_text=1
     """
     q = (request.args.get("q") or "").strip()
     if not q:
         return jsonify({"ok": False, "error": "Missing 'q'"}), 400
+    k_raw = request.args.get("k", str(DEFAULT_TOP_K))
     try:
+        k = int(k_raw)
     except Exception:
+        k = DEFAULT_TOP_K
+    k = max(1, min(k, MAX_TOP_K))
+    include_text_raw = request.args.get("include_text", "1")
+    include_text = include_text_raw not in ("0", "false", "False", "")
     t0 = time.time()
+    try:
+        res_df = semantic_search(q, top_k=k)
+    except Exception as e:
+        return jsonify({"ok": False, "error": str(e)}), 500
     took_ms = int((time.time() - t0) * 1000)
+    results = [row_to_json(r, include_text=include_text) for _, r in res_df.iterrows()]
     return jsonify({
         "ok": True,
         "query": q,
         "query_norm": normalize_ar(q),
+        "k": k,
         "took_ms": took_ms,
         "results_count": len(results),
         "results": results
 if __name__ == "__main__":
+    # Local dev only
     app.run(host="0.0.0.0", port=7860, debug=False)