Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 17

Commit

e9faa78

verified ·

1 Parent(s): edaeee6

Update src/qa.py

Browse files

Files changed (1) hide show

src/qa.py +84 -41

src/qa.py CHANGED Viewed

@@ -94,16 +94,30 @@ REASONING_PROMPT = (
 # ==========================================================
-# 5️⃣ Retrieval — FAISS + Re-rank + Neighbor Fill (Auto-Healing)
 # ==========================================================
 from vectorstore import build_faiss_index
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
                     min_similarity: float = 0.6, candidate_multiplier: int = 3,
-                    embeddings: list = None):
     """
-    Re-rank and optionally fill with neighbors for context continuity.
-    Auto-detects and rebuilds FAISS index if dimension mismatch occurs.
     """
     if not index or not chunks:
@@ -111,52 +125,67 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
         return []
     try:
-        # Encode query embedding
-        q_emb = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
-        # ✅ Sanity check: dimension match between query and FAISS index
-        if hasattr(index, "d") and q_emb.shape[0] != index.d:
-            print(f"⚠️ FAISS index dimension mismatch: index={index.d}, query={q_emb.shape[0]}")
-            if embeddings:
-                print("🔄 Rebuilding FAISS index to match embedding dimensions...")
-                index = build_faiss_index(embeddings)
-                print("✅ FAISS index successfully rebuilt.")
-                # ✅ Regenerate query embedding now that we have a matching index
-                q_emb = _query_model.encode(
-                    [f"query: {query.strip()}"],
-                    convert_to_numpy=True,
-                    normalize_embeddings=True
-                )[0]
-            else:
-                print("❌ No embeddings available to rebuild FAISS index.")
-                return []
-        # Step 1️⃣ — Initial FAISS retrieval
-        num_candidates = max(top_k * candidate_multiplier, top_k + 2)
-        distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
-        candidate_indices = [int(i) for i in indices[0] if i >= 0]
-        candidate_indices = list(dict.fromkeys(candidate_indices))  # de-dupe
-        # Step 2️⃣ — Re-rank by cosine similarity
         doc_embs = _query_model.encode(
             [f"passage: {chunks[i]}" for i in candidate_indices],
             convert_to_numpy=True,
             normalize_embeddings=True,
         )
-        sims = cosine_similarity([q_emb], doc_embs)[0]
         ranked = sorted(zip(candidate_indices, sims), key=lambda x: x[1], reverse=True)
-        # Step 3️⃣ — Filter by similarity threshold
-        filtered = [idx for idx, sim in ranked if sim >= min_similarity]
-        if len(filtered) > top_k:
-            filtered = filtered[:top_k]
-        # Step 4️⃣ — Neighbor fill (if not enough)
         if len(filtered) < top_k:
             expanded = set(filtered)
             for idx in filtered:
@@ -167,11 +196,25 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
                             break
                 if len(expanded) >= top_k:
                     break
-            filtered = sorted(expanded)[:top_k]
-        # Step 5️⃣ — Build final chunk list
         final_chunks = [chunks[i] for i in filtered]
-        print(f"✅ Retrieved {len(final_chunks)} chunks (semantic + neighbor fill).")
         return final_chunks
     except Exception as e:

 # ==========================================================
+# 🔍 Improved Retrieval — Multi-Span Query + Adaptive Similarity + Context Expansion
 # ==========================================================
 from vectorstore import build_faiss_index
+def _split_query(query: str):
+    """
+    Breaks long or compound questions into smaller sub-queries for richer retrieval coverage.
+    """
+    separators = [".", "?", "and", "then", "also", ",", ";"]
+    for sep in separators:
+        query = query.replace(sep, "|")
+    parts = [q.strip() for q in query.split("|") if len(q.strip()) > 3]
+    return parts[:3] if parts else [query.strip()]
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
                     min_similarity: float = 0.6, candidate_multiplier: int = 3,
+                    embeddings: list = None, token_budget: int = 3500):
     """
+    Enhanced retrieval:
+      ✅ Handles large / multi-part questions
+      ✅ Dynamically adjusts similarity threshold
+      ✅ Expands context until token budget is reached
+      ✅ Keeps neighbor fill for continuity
     """
     if not index or not chunks:
         return []
     try:
+        # 🔹 Step 0 — Split into sub-queries
+        sub_queries = _split_query(query)
+        dynamic_min_sim = max(0.45, min(0.6, 0.6 - 0.02 * len(sub_queries)))
+        print(f"🧩 Sub-queries: {sub_queries} | Dynamic min_similarity={dynamic_min_sim:.2f}")
+        # 🔹 Step 1 — Embed all sub-queries and gather candidate indices
+        all_candidates = set()
+        for sub_q in sub_queries:
+            q_emb = _query_model.encode(
+                [f"query: {sub_q.strip()}"],
+                convert_to_numpy=True,
+                normalize_embeddings=True
+            )[0]
+            # ✅ Auto-heal FAISS index dimension mismatch
+            if hasattr(index, "d") and q_emb.shape[0] != index.d:
+                print(f"⚠️ FAISS index dimension mismatch: index={index.d}, query={q_emb.shape[0]}")
+                if embeddings:
+                    print("🔄 Rebuilding FAISS index to match embedding dimensions...")
+                    index = build_faiss_index(embeddings)
+                    print("✅ FAISS index successfully rebuilt.")
+                    q_emb = _query_model.encode(
+                        [f"query: {sub_q.strip()}"],
+                        convert_to_numpy=True,
+                        normalize_embeddings=True
+                    )[0]
+                else:
+                    print("❌ No embeddings available to rebuild FAISS index.")
+                    continue
+            # Initial retrieval for each sub-query
+            num_candidates = max(top_k * candidate_multiplier, top_k + 2)
+            distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
+            all_candidates.update([int(i) for i in indices[0] if i >= 0])
+        if not all_candidates:
+            print("⚠️ No retrieval candidates found.")
+            return []
+        candidate_indices = list(all_candidates)
+        # 🔹 Step 2 — Re-rank by cosine similarity
+        q_emb_global = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
         doc_embs = _query_model.encode(
             [f"passage: {chunks[i]}" for i in candidate_indices],
             convert_to_numpy=True,
             normalize_embeddings=True,
         )
+        sims = cosine_similarity([q_emb_global], doc_embs)[0]
         ranked = sorted(zip(candidate_indices, sims), key=lambda x: x[1], reverse=True)
+        # 🔹 Step 3 — Dynamic filtering
+        filtered = [idx for idx, sim in ranked if sim >= dynamic_min_sim]
+        if not filtered:
+            filtered = [idx for idx, _ in ranked[:top_k]]
+        # 🔹 Step 4 — Neighbor fill for continuity
         if len(filtered) < top_k:
             expanded = set(filtered)
             for idx in filtered:
                             break
                 if len(expanded) >= top_k:
                     break
+            filtered = sorted(expanded)
+        # 🔹 Step 5 — Context expansion (token-budget-aware)
+        context_limit = token_budget  # approx. by word count
+        context_accum, current_len = [], 0
+        for idx, sim in ranked:
+            if idx not in filtered:
+                filtered.append(idx)
+            chunk_len = len(chunks[idx].split())
+            if current_len + chunk_len > context_limit:
+                break
+            context_accum.append(idx)
+            current_len += chunk_len
+        filtered = sorted(set(context_accum or filtered))[: max(top_k, len(filtered))]
+        # 🔹 Step 6 — Final context prep
         final_chunks = [chunks[i] for i in filtered]
+        print(f"✅ Retrieved {len(final_chunks)} chunks (multi-span + adaptive threshold).")
         return final_chunks
     except Exception as e: