Spaces:

Ashanasri
/

construction-rag

Sleeping

App Files Files Community

Ashanasri committed on Feb 26

Commit

0eeb787

verified ·

1 Parent(s): 9f0e12c

Upload app/rag/search.py with huggingface_hub

Browse files

Files changed (1) hide show

app/rag/search.py +195 -0

app/rag/search.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# from __future__ import annotations
+# from typing import List, Dict, Any
+# import re
+# import numpy as np
+# from pathlib import Path
+# import faiss
+# from app.rag.embeddings import BGEM3Embedder
+# from app.rag.storage import load_faiss, load_jsonl, search as faiss_search
+# def clean_text(text: str) -> str:
+#     """
+#     Clean text extracted from PDFs:
+#     - Collapse multiple spaces/tabs
+#     - Replace line breaks with spaces (unless paragraph breaks)
+#     - Normalize multiple newlines
+#     - Add spaces between lowercase-uppercase and letter-digit transitions
+#     - Strip leading/trailing whitespace
+#     """
+#     if not text:
+#         return ""
+#     text = re.sub(r"[ \t]+", " ", text)                     # collapse spaces/tabs
+#     text = re.sub(r"\n(?!\n)", " ", text)                    # single newline -> space
+#     text = re.sub(r"\n{2,}", "\n", text)                     # multi newlines -> single newline
+#     text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)         # split lowercase-uppercase
+#     text = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", text)         # split letter-digit
+#     text = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", text)         # split digit-letter
+#     return text.strip()
+# class RAGSearcher:
+#     def __init__(self, index_path: Path, meta_path: Path, device: str = "cpu"):
+#         self.index_path = index_path
+#         self.meta_path = meta_path
+#         self.meta = load_jsonl(meta_path)
+#         self.embedder = BGEM3Embedder(device=device)
+#         self.index: faiss.Index = load_faiss(index_path)
+#         self.d = self.index.d  # embedding dimension sanity check
+#         # Clean text in metadata on load
+#         for m in self.meta:
+#             m["text"] = clean_text(m.get("text", ""))
+#     def embed_query(self, query: str) -> np.ndarray:
+#         return self.embedder.embed_one(query, mode="query")
+#     def top_k(self, query: str, k: int = 5, rerank: bool = True) -> List[Dict[str, Any]]:
+#         """
+#         Search the FAISS index for top-k passages matching the query.
+#         Optionally rerank using fresh embeddings for better accuracy.
+#         """
+#         q = self.embed_query(query).reshape(1, -1)
+#         scores, ids = faiss_search(self.index, q, top_k=k)
+#         ids_row = ids[0].tolist()
+#         scores_row = scores[0].tolist()
+#         items = []
+#         for i, sc in zip(ids_row, scores_row):
+#             if i < 0:
+#                 continue
+#             m = self.meta[i]
+#             items.append({
+#                 "id": i,
+#                 "score": float(sc),
+#                 "page": m["page"],
+#                 "chunk_index": m["chunk_index"],
+#                 "source": m["source"],
+#                 "text": m["text"],
+#             })
+#         if rerank and items:
+#             # Re-embed candidate passages and recompute cosine similarity
+#             passages = [it["text"] for it in items]
+#             P = self.embedder.embed_texts(passages, mode="passage")
+#             qv = q.astype("float32")  # [1, d]
+#             rerank_scores = (P @ qv.T).reshape(-1)  # cosine sim with L2 normed vectors
+#             for it, rs in zip(items, rerank_scores.tolist()):
+#                 it["rerank_score"] = float(rs)
+#             items.sort(key=lambda x: x.get("rerank_score", x["score"]), reverse=True)
+#         else:
+#             items.sort(key=lambda x: x["score"], reverse=True)
+#         return items
+from __future__ import annotations
+"""
+search.py
+=========
+RAGSearcher — wraps FAISS index + BGE-M3 embedder.
+Exposes _top_k_sync() (blocking) used by utils.py pipelines.
+"""
import asyncio
import logging
import re
from pathlib import Path
from typing import Any, Dict, List

import faiss
import numpy as np

from app.rag.embeddings import BGEM3Embedder
from app.rag.storage import load_faiss, load_jsonl, search as faiss_search
+# ── Text cleaner ──────────────────────────────────────────────────────────────
def clean_text(text: str) -> str:
    """Normalise whitespace and glued-together tokens in extracted text.

    Steps, in order:

    1. A lone newline (not adjacent to another newline) becomes a space.
    2. A run of two or more newlines becomes a single newline.
    3. Runs of spaces/tabs collapse to one space.
    4. A single space directly before or after a kept newline is removed.
    5. A space is inserted at lowercase->uppercase, letter->digit and
       digit->letter boundaries (e.g. ``"200mm"`` -> ``"200 mm"``).
    6. Leading/trailing whitespace is stripped.

    Args:
        text: Raw text; any falsy value (``""``, ``None``) is accepted.

    Returns:
        The cleaned text, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # The lookbehind matters: without it the final "\n" of a paragraph break
    # would also be turned into a space, leaving "\n " behind.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    # Collapse *after* joining lines so "foo \nbar" does not become "foo  bar".
    text = re.sub(r"[ \t]+", " ", text)
    # Drop the padding that may still sit next to a kept paragraph break.
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    text = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", text)
    text = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", text)
    return text.strip()
+# ── RAGSearcher ───────────────────────────────────────────────────────────────
class RAGSearcher:
    """Passage retriever backed by a FAISS index and a BGE-M3 embedder.

    The index and its JSONL metadata are loaded once in ``__init__``. Index
    row ids are used directly as positions into the metadata list.

    Methods:
        _top_k_sync: blocking retrieval (the module notes say this is what
            the ``utils.py`` pipelines call).
        top_k: async wrapper that runs ``_top_k_sync`` in the event loop's
            default executor.
    """

    def __init__(self, index_path: Path, meta_path: Path, device: str = "cpu"):
        """Load metadata, embedder and index, then clean the metadata text.

        Args:
            index_path: Location of the FAISS index, passed to ``load_faiss``.
            meta_path: Location of the JSONL metadata, passed to ``load_jsonl``.
            device: Device string forwarded to ``BGEM3Embedder``.
        """
        self.meta = load_jsonl(meta_path)
        self.embedder = BGEM3Embedder(device=device)
        self.index: faiss.Index = load_faiss(index_path)
        self.d = self.index.d  # embedding dimension reported by the index
        # Clean once here so every later hit carries already-normalised text.
        for record in self.meta:
            record["text"] = clean_text(record.get("text", ""))

    def embed_query(self, query: str) -> np.ndarray:
        """Return the embedder's query-mode embedding for *query*."""
        return self.embedder.embed_one(query, mode="query")

    # ── Blocking retrieval ────────────────────────────────────────────────────
    def _top_k_sync(
        self, query: str, k: int = 5, rerank: bool = True
    ) -> List[Dict[str, Any]]:
        """Retrieve up to *k* passages for *query* (blocking).

        1. Embed the query.
        2. Search the index for the top-k rows.
        3. If *rerank* is true, re-embed the retrieved passages and rescore
           them with a dot product against the query vector (equal to cosine
           similarity only if the embedder returns L2-normalised vectors —
           assumed, not checked here).

        Args:
            query: Search text.
            k: Maximum number of hits; ``k <= 0`` yields an empty list.
            rerank: Whether to rescore hits with fresh passage embeddings.

        Returns:
            Hit dicts with keys ``id``, ``score``, ``page``, ``chunk_index``,
            ``source``, ``text`` (plus ``rerank_score`` when reranked), sorted
            best-first by ``rerank_score`` when reranked, else by ``score``.

        Raises:
            IndexError: If the index returns a row id beyond the metadata.
        """
        if k <= 0:
            # Nothing was requested; skip the embedding and index calls.
            return []

        log = logging.getLogger(__name__)
        q = self.embed_query(query).reshape(1, -1)
        scores, ids = faiss_search(self.index, q, top_k=k)

        items: List[Dict[str, Any]] = []
        for row_id, raw_score in zip(ids[0].tolist(), scores[0].tolist()):
            if row_id < 0:
                # Negative ids are skipped (presumably "no result" padding
                # from the index search — confirm against app.rag.storage).
                continue
            record = self.meta[row_id]
            items.append({
                "id": row_id,
                "score": float(raw_score),
                "page": record.get("page"),
                "chunk_index": record.get("chunk_index"),
                "source": record.get("source"),
                "text": record["text"],
            })
            log.debug(
                "Retrieved chunk %d with initial score %.4f", row_id, raw_score
            )

        if rerank and items:
            passages = [hit["text"] for hit in items]
            passage_vecs = self.embedder.embed_texts(passages, mode="passage")
            rerank_scores = (passage_vecs @ q.astype("float32").T).reshape(-1)
            for hit, new_score in zip(items, rerank_scores.tolist()):
                hit["rerank_score"] = float(new_score)
            sort_field = "rerank_score"
        else:
            sort_field = "score"

        items.sort(key=lambda hit: hit[sort_field], reverse=True)
        return items

    async def top_k(
        self, query: str, k: int = 5, rerank: bool = True
    ) -> List[Dict[str, Any]]:
        """Run ``_top_k_sync`` in the default executor and await its result."""
        # get_running_loop() is the documented call inside a coroutine;
        # get_event_loop() is discouraged there.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._top_k_sync, query, k, rerank)