Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 18

Commit

f86b15f

verified ·

1 Parent(s): 5f3c646

Update src/qa.py

Browse files

Files changed (1) hide show

src/qa.py +76 -56

src/qa.py CHANGED Viewed

@@ -3,6 +3,7 @@ qa.py — GPT-4o (SAP Gen AI Hub) + ReRank Retrieval
 --------------------------------------------------
 ✅ Semantic retrieval (FAISS + cosine re-rank + neighbor fill)
 ✅ Bullet-aware similarity boost for procedural chunks
 ✅ Smart factual mode (fast)
 ✅ Deep reasoning mode (ChatGPT-like)
 """
@@ -10,16 +11,18 @@ qa.py — GPT-4o (SAP Gen AI Hub) + ReRank Retrieval
 import os
 import re
 import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
 from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
-print("✅ qa.py (GPT-4o via Gen AI Hub + Bullet-Aware Retrieval) loaded from:", __file__)
 # ==========================================================
-# 1️⃣ Hugging Face Cache
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -35,7 +38,7 @@ os.environ.update({
 # ==========================================================
 try:
     _query_model = SentenceTransformer(
-        "intfloat/e5-small-v2",   # ⚡ Faster, 384-dim embeddings
         cache_folder=CACHE_DIR
     )
     print("✅ Loaded embedding model: intfloat/e5-small-v2 (fast mode)")
@@ -74,7 +77,52 @@ except Exception as e:
     chat_llm = None
 # ==========================================================
-# 4️⃣ Prompt Templates
 # ==========================================================
 STRICT_PROMPT = (
     "You are an enterprise documentation assistant.\n"
@@ -97,7 +145,7 @@ REASONING_PROMPT = (
 )
 # ==========================================================
-# 5️⃣ Retrieval — FAISS + Bullet-Aware Re-rank + Neighbor Fill
 # ==========================================================
 from vectorstore import build_faiss_index
@@ -105,9 +153,8 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
                     min_similarity: float = 0.6, candidate_multiplier: int = 3,
                     embeddings: list = None):
     """
-    Re-rank and optionally fill with neighbors for context continuity.
-    Adds small similarity boost for bullet-style or step-based chunks.
-    Auto-detects and rebuilds FAISS index if dimension mismatch occurs.
     """
     if not index or not chunks:
@@ -115,60 +162,45 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
         return []
     try:
-        # Encode query embedding
         q_emb = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
-        # ✅ Sanity check: dimension match between query and FAISS index
         if hasattr(index, "d") and q_emb.shape[0] != index.d:
-            print(f"⚠️ FAISS index dimension mismatch: index={index.d}, query={q_emb.shape[0]}")
             if embeddings:
-                print("🔄 Rebuilding FAISS index to match embedding dimensions...")
                 index = build_faiss_index(embeddings)
-                print("✅ FAISS index successfully rebuilt.")
-                q_emb = _query_model.encode(
-                    [f"query: {query.strip()}"],
-                    convert_to_numpy=True,
-                    normalize_embeddings=True
-                )[0]
             else:
-                print("❌ No embeddings available to rebuild FAISS index.")
                 return []
         # Step 1️⃣ — Initial FAISS retrieval
         num_candidates = max(top_k * candidate_multiplier, top_k + 2)
         distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
         candidate_indices = [int(i) for i in indices[0] if i >= 0]
-        candidate_indices = list(dict.fromkeys(candidate_indices))  # de-dupe
-        # Step 2️⃣ — Compute similarities
         doc_embs = _query_model.encode(
             [f"passage: {chunks[i]}" for i in candidate_indices],
             convert_to_numpy=True,
             normalize_embeddings=True,
         )
         sims = cosine_similarity([q_emb], doc_embs)[0]
-        # 🔹 NEW: Boost similarity for bullet-style or step-based chunks
         boosted_sims = []
         for idx, sim in zip(candidate_indices, sims):
-            chunk_text = chunks[idx].strip()
-            if re.match(r"^[-•\d]+[\.\s]", chunk_text):  # bullet or numbered
-                sim += 0.05  # small procedural context boost
             boosted_sims.append((idx, sim))
         ranked = sorted(boosted_sims, key=lambda x: x[1], reverse=True)
-        # Step 3️⃣ — Filter by similarity threshold
-        filtered = [idx for idx, sim in ranked if sim >= min_similarity]
-        if len(filtered) > top_k:
-            filtered = filtered[:top_k]
-        # Step 4️⃣ — Neighbor fill (context continuity)
         neighbors = set()
         for idx in filtered:
             for n in [idx - 1, idx + 1]:
@@ -176,7 +208,6 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
                     neighbors.add(n)
         filtered = sorted(set(filtered) | neighbors)
-        # Step 5️⃣ — Build final chunk list
         final_chunks = [chunks[i] for i in filtered]
         print(f"✅ Retrieved {len(final_chunks)} chunks (bullet-aware + continuity).")
         return final_chunks
@@ -186,34 +217,25 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5,
         return []
 # ==========================================================
-# 6️⃣ Answer Generation — GPT-4o via Gen AI Hub
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
-    """
-    reasoning_mode=False → strict factual mode (fast)
-    reasoning_mode=True  → deep reasoning mode (ChatGPT-like)
-    """
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
     if chat_llm is None:
         return "⚠️ GPT-4o not initialized. Check credentials or rebuild the Space."
-    # Combine chunks with markers
     context = "\n".join(f"[Chunk {i+1}] {chunk.strip()}" for i, chunk in enumerate(retrieved_chunks))
     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)
     messages = [
-        {
-            "role": "system",
-            "content": (
-                "You are an expert enterprise documentation assistant. "
-                "When reasoning_mode is off, stay strictly factual and concise. "
-                "When reasoning_mode is on, combine insights across chunks logically "
-                "and explain the reasoning briefly. "
-                "If the answer is not in the document, reply exactly: "
-                "'I don't know based on the provided document.'"
-            ),
-        },
         {"role": "user", "content": prompt},
     ]
@@ -225,7 +247,7 @@ def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = F
         return "⚠️ Error: Could not generate an answer."
 # ==========================================================
-# 7️⃣ Local Test
 # ==========================================================
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
@@ -236,10 +258,8 @@ if __name__ == "__main__":
         "Setup instructions and configuration details.",
         "Prerequisites for automation are described here."
     ]
-    embeddings = [
-        _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-        for c in dummy_chunks
-    ]
     index = build_faiss_index(embeddings)
     query = "What are the prerequisites for commerce automation?"

 --------------------------------------------------
 ✅ Semantic retrieval (FAISS + cosine re-rank + neighbor fill)
 ✅ Bullet-aware similarity boost for procedural chunks
+✅ Embedding caching (per PDF)
 ✅ Smart factual mode (fast)
 ✅ Deep reasoning mode (ChatGPT-like)
 """
 import os
 import re
 import json
+import pickle
+import hashlib
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
 from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
+print("✅ qa.py (GPT-4o via Gen AI Hub + Bullet-Aware Retrieval + Cache) loaded from:", __file__)
 # ==========================================================
+# 1️⃣ Hugging Face Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 # ==========================================================
 try:
     _query_model = SentenceTransformer(
+        "intfloat/e5-small-v2",  # ⚡ Faster, 384-dim embeddings
         cache_folder=CACHE_DIR
     )
     print("✅ Loaded embedding model: intfloat/e5-small-v2 (fast mode)")
     chat_llm = None
 # ==========================================================
+# 4️⃣ Embedding Cache Manager
+# ==========================================================
+# Directory holding the pickled embedding caches written by cache_embeddings();
+# created at import time if it does not already exist.
+CACHE_EMB_DIR = "/tmp/embed_cache"
+os.makedirs(CACHE_EMB_DIR, exist_ok=True)
+def _hash_name(file_name: str):
+    """Return the MD5 hex digest of the given file name string.
+
+    Only the name is hashed, not the file's contents, so the digest identifies
+    a name rather than a particular version of a document. It is used here as
+    a cache file name, not for any security purpose.
+    """
+    return hashlib.md5(file_name.encode()).hexdigest()
+def cache_embeddings(file_name: str, chunks, embed_func):
+    """
+    Return embeddings for a document, using an on-disk pickle cache.
+
+    Args:
+        file_name: Name used to derive the cache file path (via _hash_name).
+        chunks: Passed unchanged to ``embed_func`` on a cache miss.
+        embed_func: Callable invoked as ``embed_func(chunks)`` to compute the
+            embeddings when no cache file exists.
+
+    Returns:
+        The unpickled cache contents on a hit, otherwise the value returned by
+        ``embed_func(chunks)`` (which is also written to the cache).
+
+    Note:
+        The cache key depends only on ``file_name``; ``chunks`` is not part of
+        it. A document re-uploaded under the same name with different content
+        will therefore get the previously cached embeddings back.
+    """
+    cache_path = os.path.join(CACHE_EMB_DIR, f"{_hash_name(file_name)}.pkl")
+    if os.path.exists(cache_path):
+        print(f"🧠 Loaded cached embeddings for {file_name}")
+        # NOTE(review): pickle.load can execute arbitrary code if the file was
+        # tampered with; the cache lives under /tmp — confirm that directory
+        # is not writable by untrusted parties in the deployment environment.
+        with open(cache_path, "rb") as f:
+            return pickle.load(f)
+    # Cache miss: compute, persist, then return the fresh embeddings.
+    print(f"💡 No cache found for {file_name}. Generating embeddings...")
+    embeddings = embed_func(chunks)
+    with open(cache_path, "wb") as f:
+        pickle.dump(embeddings, f)
+    print(f"💾 Cached embeddings saved for {file_name}")
+    return embeddings
+def embed_chunks(chunks, batch_size=32):
+    """
+    Encode text chunks with the module-level ``_query_model`` in batches.
+
+    Args:
+        chunks: Sliceable sequence of chunk texts; each is prefixed with
+            ``"passage: "`` before encoding.
+        batch_size: Number of chunks sent to the model per ``encode`` call.
+
+    Returns:
+        ``np.array`` built from all per-chunk embeddings, in input order.
+        Embeddings are requested with ``normalize_embeddings=True``.
+        Presumably shape (len(chunks), 384) given the model comment elsewhere
+        in the file — TODO confirm. An empty ``chunks`` yields an empty array.
+    """
+    all_embeddings = []
+    for i in range(0, len(chunks), batch_size):
+        # Same "passage: " prefix the retrieval code uses when re-encoding
+        # candidate chunks (queries use "query: " instead).
+        batch = [f"passage: {c}" for c in chunks[i:i+batch_size]]
+        batch_embs = _query_model.encode(
+            batch,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+            show_progress_bar=False
+        )
+        # extend() adds one entry per row of the batch result.
+        all_embeddings.extend(batch_embs)
+    print(f"⚡ Embedded {len(all_embeddings)} chunks in batches of {batch_size}")
+    return np.array(all_embeddings)
+# ==========================================================
+# 5️⃣ Prompt Templates
 # ==========================================================
 STRICT_PROMPT = (
     "You are an enterprise documentation assistant.\n"
 )
 # ==========================================================
+# 6️⃣ Retrieval — FAISS + Bullet-Aware Re-rank + Neighbor Fill
 # ==========================================================
 from vectorstore import build_faiss_index
                     min_similarity: float = 0.6, candidate_multiplier: int = 3,
                     embeddings: list = None):
     """
+    Retrieves top relevant chunks and preserves context continuity.
+    Adds small similarity boost for procedural (bullet or numbered) chunks.
     """
     if not index or not chunks:
         return []
     try:
         q_emb = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
+        # ✅ Dimension sanity check
         if hasattr(index, "d") and q_emb.shape[0] != index.d:
+            print(f"⚠️ FAISS dimension mismatch: index={index.d}, query={q_emb.shape[0]}")
             if embeddings:
+                print("🔄 Rebuilding FAISS index...")
                 index = build_faiss_index(embeddings)
             else:
                 return []
         # Step 1️⃣ — Initial FAISS retrieval
         num_candidates = max(top_k * candidate_multiplier, top_k + 2)
         distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
         candidate_indices = [int(i) for i in indices[0] if i >= 0]
+        candidate_indices = list(dict.fromkeys(candidate_indices))
+        # Step 2️⃣ — Re-rank with bullet-aware boost
         doc_embs = _query_model.encode(
             [f"passage: {chunks[i]}" for i in candidate_indices],
             convert_to_numpy=True,
             normalize_embeddings=True,
         )
         sims = cosine_similarity([q_emb], doc_embs)[0]
         boosted_sims = []
         for idx, sim in zip(candidate_indices, sims):
+            text = chunks[idx].strip()
+            if re.match(r"^[-•\d]+[\.\s]", text):  # bullet or step pattern
+                sim += 0.05
             boosted_sims.append((idx, sim))
         ranked = sorted(boosted_sims, key=lambda x: x[1], reverse=True)
+        filtered = [idx for idx, sim in ranked if sim >= min_similarity][:top_k]
+        # Step 3️⃣ — Add neighboring chunks for continuity
         neighbors = set()
         for idx in filtered:
             for n in [idx - 1, idx + 1]:
                     neighbors.add(n)
         filtered = sorted(set(filtered) | neighbors)
         final_chunks = [chunks[i] for i in filtered]
         print(f"✅ Retrieved {len(final_chunks)} chunks (bullet-aware + continuity).")
         return final_chunks
         return []
 # ==========================================================
+# 7️⃣ Answer Generation
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
     if chat_llm is None:
         return "⚠️ GPT-4o not initialized. Check credentials or rebuild the Space."
     context = "\n".join(f"[Chunk {i+1}] {chunk.strip()}" for i, chunk in enumerate(retrieved_chunks))
     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)
     messages = [
+        {"role": "system", "content":
+            "You are an expert enterprise documentation assistant. "
+            "When reasoning_mode is off, stay strictly factual and concise. "
+            "When reasoning_mode is on, combine insights across chunks logically "
+            "and explain briefly. "
+            "If the answer is not in the document, reply exactly: "
+            "'I don't know based on the provided document.'"},
         {"role": "user", "content": prompt},
     ]
         return "⚠️ Error: Could not generate an answer."
 # ==========================================================
+# 8️⃣ Local Test
 # ==========================================================
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
         "Setup instructions and configuration details.",
         "Prerequisites for automation are described here."
     ]
+    embeddings = embed_chunks(dummy_chunks)
     index = build_faiss_index(embeddings)
     query = "What are the prerequisites for commerce automation?"