Spaces:

Alshargi
/

hadith-faiss

Sleeping

App Files Community

Alshargi committed on Feb 8

Commit

8662bec

verified ·

1 Parent(s): e11a55d

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -26

app.py CHANGED Viewed

@@ -20,10 +20,17 @@ JSON_PATH = os.path.join(BASE_DIR, "hadith_corpus25k.json")
 ART_DIR = os.path.join(BASE_DIR, "artifacts_hadith_faiss")
 INDEX_PATH = os.path.join(ART_DIR, "faiss.index")
-EMB_PATH = os.path.join(ART_DIR, "embeddings.npy")
 ID_BY_POS_PATH = os.path.join(ART_DIR, "id_by_pos.json")
 POS_BY_ID_PATH = os.path.join(ART_DIR, "pos_by_id.json")
 # -----------------------------
 # App
 # -----------------------------
@@ -31,8 +38,8 @@ app = FastAPI(title="Hadith FAISS API", version="1.0")
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],   # لو تبي تقفلها على دومين موقعك فقط قل لي
-    allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
@@ -71,7 +78,7 @@ def ensure_dirs():
 def artifacts_exist() -> bool:
     return (
         os.path.exists(INDEX_PATH)
-        and os.path.exists(EMB_PATH)
         and os.path.exists(ID_BY_POS_PATH)
         and os.path.exists(POS_BY_ID_PATH)
     )
@@ -85,19 +92,21 @@ def load_items():
     with open(JSON_PATH, "r", encoding="utf-8") as f:
         _items = json.load(f)
-    # Build id map
     _item_by_id = {}
     for it in _items:
         cid = it.get("corpusID")
-        if cid is not None:
-            _item_by_id[int(cid)] = it
 def get_model() -> SentenceTransformer:
     global _model
     if _model is None:
-        # intfloat/multilingual-e5-base
-        _model = SentenceTransformer("intfloat/multilingual-e5-base")
     return _model
@@ -105,12 +114,11 @@ def save_artifacts(index: faiss.Index, emb: np.ndarray, id_by_pos: List[int], po
     ensure_dirs()
     faiss.write_index(index, INDEX_PATH)
-    np.save(EMB_PATH, emb)
     with open(ID_BY_POS_PATH, "w", encoding="utf-8") as f:
         json.dump(id_by_pos, f, ensure_ascii=False)
-    # keys must be str in json; we convert to str
     pos_by_id_str = {str(k): int(v) for k, v in pos_by_id.items()}
     with open(POS_BY_ID_PATH, "w", encoding="utf-8") as f:
         json.dump(pos_by_id_str, f, ensure_ascii=False)
@@ -120,7 +128,7 @@ def load_artifacts():
     global _index, _emb, _id_by_pos, _pos_by_id, _DIM
     _index = faiss.read_index(INDEX_PATH)
-    _emb = np.load(EMB_PATH).astype("float32")
     with open(ID_BY_POS_PATH, "r", encoding="utf-8") as f:
         _id_by_pos = [int(x) for x in json.load(f)]
@@ -142,14 +150,12 @@ def build_all():
     model = get_model()
     texts = [build_text(x) for x in _items]
-    # E5 recommends prefixes
-    passages = ["passage: " + t for t in texts]
     emb = model.encode(
         passages,
         normalize_embeddings=True,
-        batch_size=64,
         show_progress_bar=True,
     )
     emb = np.asarray(emb, dtype="float32")
@@ -158,7 +164,13 @@ def build_all():
     index = faiss.IndexFlatIP(dim)  # cosine via IP since normalized
     index.add(emb)
-    id_by_pos = [int(x["corpusID"]) for x in _items]
     pos_by_id = {cid: i for i, cid in enumerate(id_by_pos)}
     save_artifacts(index, emb, id_by_pos, pos_by_id)
@@ -174,12 +186,11 @@ def build_all():
 def require_ready():
-    if not _READY or _index is None or _emb is None:
         raise HTTPException(status_code=503, detail="API is not ready yet. Try again in a moment.")
 def pack_item(it: Dict[str, Any]) -> Dict[str, Any]:
-    # return only what you need (خفيف)
     return {
         "corpusID": it.get("corpusID"),
         "book": it.get("book"),
@@ -193,8 +204,7 @@ def pack_item(it: Dict[str, Any]) -> Dict[str, Any]:
 def embed_query(q: str) -> np.ndarray:
     model = get_model()
-    # E5 query prefix:
-    vec = model.encode(["query: " + q], normalize_embeddings=True)
     return np.asarray(vec, dtype="float32")
@@ -231,7 +241,6 @@ def on_startup():
     except Exception as e:
         _READY = False
         print("[startup] FAILED ❌", str(e))
-        # keep app up but not ready
 # -----------------------------
@@ -254,6 +263,7 @@ def stats():
         "items": len(_items),
         "dim": _DIM,
         "index_type": type(_index).__name__,
     }
@@ -269,14 +279,17 @@ def get_item(corpus_id: int):
 @app.get("/similar/{corpus_id}")
 def similar(corpus_id: int, topk: int = 10):
     require_ready()
     cid = int(corpus_id)
     if cid not in _pos_by_id:
         raise HTTPException(status_code=404, detail="corpusID not found in index")
     pos = _pos_by_id[cid]
     q = _emb[pos:pos + 1]  # already normalized
-    scores, idxs = _index.search(q, int(topk) + 1)  # +1 to skip itself
     scores = scores[0].tolist()
     idxs = idxs[0].tolist()
@@ -295,20 +308,21 @@ def similar(corpus_id: int, topk: int = 10):
             "score": float(sc),
             "item": pack_item(it),
         })
-        if len(results) >= int(topk):
             break
-    return {"query_id": cid, "topk": int(topk), "results": results}
 @app.post("/search")
 def search(req: SearchRequest):
     require_ready()
     q = (req.query or "").strip()
     if not q:
         raise HTTPException(status_code=400, detail="query is empty")
-    topk = max(1, min(int(req.topk), 50))
     qv = embed_query(q)
     scores, idxs = _index.search(qv, topk)

 ART_DIR = os.path.join(BASE_DIR, "artifacts_hadith_faiss")
 INDEX_PATH = os.path.join(ART_DIR, "faiss.index")
+# IMPORTANT: np.save adds ".npy" if not present; keep path WITHOUT extension
+EMB_PATH = os.path.join(ART_DIR, "embeddings")  # will produce embeddings.npy
 ID_BY_POS_PATH = os.path.join(ART_DIR, "id_by_pos.json")
 POS_BY_ID_PATH = os.path.join(ART_DIR, "pos_by_id.json")
+# Settings
+MODEL_NAME = os.getenv("MODEL_NAME", "intfloat/multilingual-e5-base")
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "64"))
+TOPK_MAX = int(os.getenv("TOPK_MAX", "50"))
 # -----------------------------
 # App
 # -----------------------------
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],   # لاحقاً: استبدلها بدومين موقعك للأمان
+    allow_credentials=False,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 def artifacts_exist() -> bool:
     return (
         os.path.exists(INDEX_PATH)
+        and os.path.exists(EMB_PATH + ".npy")
         and os.path.exists(ID_BY_POS_PATH)
         and os.path.exists(POS_BY_ID_PATH)
     )
     with open(JSON_PATH, "r", encoding="utf-8") as f:
         _items = json.load(f)
+    if not isinstance(_items, list):
+        raise RuntimeError("Dataset JSON root must be a list")
     _item_by_id = {}
     for it in _items:
         cid = it.get("corpusID")
+        if cid is None:
+            continue
+        _item_by_id[int(cid)] = it
 def get_model() -> SentenceTransformer:
     global _model
     if _model is None:
+        _model = SentenceTransformer(MODEL_NAME)
     return _model
     ensure_dirs()
     faiss.write_index(index, INDEX_PATH)
+    np.save(EMB_PATH, emb)  # creates EMB_PATH + ".npy"
     with open(ID_BY_POS_PATH, "w", encoding="utf-8") as f:
         json.dump(id_by_pos, f, ensure_ascii=False)
     pos_by_id_str = {str(k): int(v) for k, v in pos_by_id.items()}
     with open(POS_BY_ID_PATH, "w", encoding="utf-8") as f:
         json.dump(pos_by_id_str, f, ensure_ascii=False)
     global _index, _emb, _id_by_pos, _pos_by_id, _DIM
     _index = faiss.read_index(INDEX_PATH)
+    _emb = np.load(EMB_PATH + ".npy").astype("float32", copy=False)
     with open(ID_BY_POS_PATH, "r", encoding="utf-8") as f:
         _id_by_pos = [int(x) for x in json.load(f)]
     model = get_model()
     texts = [build_text(x) for x in _items]
+    passages = ["passage: " + t for t in texts]  # E5 passage prefix
     emb = model.encode(
         passages,
         normalize_embeddings=True,
+        batch_size=BATCH_SIZE,
         show_progress_bar=True,
     )
     emb = np.asarray(emb, dtype="float32")
     index = faiss.IndexFlatIP(dim)  # cosine via IP since normalized
     index.add(emb)
+    # Build ID mappings
+    id_by_pos = []
+    for x in _items:
+        if "corpusID" not in x:
+            raise RuntimeError("Each item must have corpusID")
+        id_by_pos.append(int(x["corpusID"]))
     pos_by_id = {cid: i for i, cid in enumerate(id_by_pos)}
     save_artifacts(index, emb, id_by_pos, pos_by_id)
 def require_ready():
+    if (not _READY) or (_index is None) or (_emb is None):
         raise HTTPException(status_code=503, detail="API is not ready yet. Try again in a moment.")
 def pack_item(it: Dict[str, Any]) -> Dict[str, Any]:
     return {
         "corpusID": it.get("corpusID"),
         "book": it.get("book"),
 def embed_query(q: str) -> np.ndarray:
     model = get_model()
+    vec = model.encode(["query: " + q], normalize_embeddings=True)  # E5 query prefix
     return np.asarray(vec, dtype="float32")
     except Exception as e:
         _READY = False
         print("[startup] FAILED ❌", str(e))
 # -----------------------------
         "items": len(_items),
         "dim": _DIM,
         "index_type": type(_index).__name__,
+        "model": MODEL_NAME,
     }
 @app.get("/similar/{corpus_id}")
 def similar(corpus_id: int, topk: int = 10):
     require_ready()
     cid = int(corpus_id)
     if cid not in _pos_by_id:
         raise HTTPException(status_code=404, detail="corpusID not found in index")
+    topk = max(1, min(int(topk), TOPK_MAX))
     pos = _pos_by_id[cid]
     q = _emb[pos:pos + 1]  # already normalized
+    scores, idxs = _index.search(q, topk + 1)  # +1 to skip itself
     scores = scores[0].tolist()
     idxs = idxs[0].tolist()
             "score": float(sc),
             "item": pack_item(it),
         })
+        if len(results) >= topk:
             break
+    return {"query_id": cid, "topk": topk, "results": results}
 @app.post("/search")
 def search(req: SearchRequest):
     require_ready()
     q = (req.query or "").strip()
     if not q:
         raise HTTPException(status_code=400, detail="query is empty")
+    topk = max(1, min(int(req.topk), TOPK_MAX))
     qv = embed_query(q)
     scores, idxs = _index.search(qv, topk)