Shubham170793 committed
Commit 7b7e367 · verified · 1 Parent(s): 7e98078

Update src/qa.py

Files changed (1): src/qa.py (+38 -65)
src/qa.py CHANGED
@@ -1,5 +1,5 @@
 """
-qa.py — GPT-4o (SAP Gen AI Hub) + ReRank Retrieval
+qa.py — GPT-4o (SAP Gen AI Hub) + ReRank Retrieval + PRF Query Expansion
 --------------------------------------------------
 ✅ Semantic retrieval (FAISS + cosine re-rank + neighbor fill)
 ✅ Bullet-aware similarity boost for procedural chunks
@@ -7,6 +7,7 @@ qa.py — GPT-4o (SAP Gen AI Hub) + ReRank Retrieval
 ✅ Smart factual mode (fast)
 ✅ Deep reasoning mode (ChatGPT-like)
 ✅ genai_generate() helper for suggestions
+✅ NEW: Lightweight PRF query expansion to fix synonym-based retrieval misses
 """
 
 import os
@@ -15,12 +16,13 @@ import json
 import pickle
 import hashlib
 import numpy as np
+from collections import Counter
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
 from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
 
-print("✅ qa.py (GPT-4o via Gen AI Hub + Bullet-Aware Retrieval + Cache) loaded from:", __file__)
+print("✅ qa.py (GPT-4o via Gen AI Hub + Bullet-Aware Retrieval + PRF) loaded from:", __file__)
 
 # ==========================================================
 # 🧱 Permanent Embeddings Cache Directory
@@ -28,7 +30,6 @@ print("✅ qa.py (GPT-4o via Gen AI Hub + Bullet-Aware Retrieval + Cache) loaded
 CACHE_EMB_DIR = os.path.join(os.path.dirname(__file__), "embed_cache")
 os.makedirs(CACHE_EMB_DIR, exist_ok=True)
 
-# Verify write permission
 try:
     test_file = os.path.join(CACHE_EMB_DIR, "test_write.tmp")
     with open(test_file, "w") as f:
@@ -57,10 +58,7 @@ os.environ.update({
 # 2️⃣ Embedding Model (E5-small-v2)
 # ==========================================================
 try:
-    _query_model = SentenceTransformer(
-        "intfloat/e5-small-v2",  # ⚡ Faster, 384-dim embeddings
-        cache_folder=CACHE_DIR
-    )
+    _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
     print("✅ Loaded embedding model: intfloat/e5-small-v2 (fast mode)")
 except Exception as e:
     print(f"⚠️ Embedding load failed ({e}), using MiniLM fallback")
@@ -69,21 +67,15 @@ except Exception as e:
 # ==========================================================
 # 3️⃣ GPT-4o via SAP Gen AI Hub — Lazy / On-demand initialization
 # ==========================================================
-
 CRED_PATH = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
-_chat_llm = None  # cached instance
+_chat_llm = None
 
 def get_chat_llm(model_name: str = "gpt-4o", temperature: float = 0.3, max_tokens: int = 1500):
-    """
-    Lazily initializes ChatOpenAI via Gen AI Hub proxy.
-    Only runs when first needed; cached afterward.
-    """
     global _chat_llm
     if _chat_llm is not None:
        return _chat_llm
 
    try:
-        # Optional: set environment variables from service key if present
        if os.path.exists(CRED_PATH):
            with open(CRED_PATH, "r") as key_file:
                svcKey = json.load(key_file)
@@ -109,15 +101,10 @@ def get_chat_llm(model_name: str = "gpt-4o", temperature: float = 0.3, max_token
         _chat_llm = None
         raise
 
-
 # ==========================================================
 # 4️⃣ Embedding Generator (batch-optimized)
 # ==========================================================
 def embed_chunks(chunks, batch_size: int = 32):
-    """
-    Batch-encode text chunks using the global embedding model.
-    Normalized 384-dim embeddings for FAISS retrieval.
-    """
     if not chunks:
         return np.array([])
 
@@ -135,18 +122,13 @@ def embed_chunks(chunks, batch_size: int = 32):
     return np.array(all_embeddings)
 
 # ==========================================================
-# 5️⃣ Embedding Cache Manager (Chunk-Aware + Auto-Cleanup)
+# 5️⃣ Embedding Cache Manager
 # ==========================================================
-CACHE_EMB_DIR = "/tmp/embed_cache"
-os.makedirs(CACHE_EMB_DIR, exist_ok=True)
-
 def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
-    """Generate unique short hash for a file + chunking configuration."""
     combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
     return hashlib.md5(combo.encode()).hexdigest()[:8]
 
 def _clean_old_caches(base_name: str, keep_latest: int = 5):
-    """Keep only latest few embedding caches for each document."""
     files = [
         (os.path.getmtime(os.path.join(CACHE_EMB_DIR, f)), f)
         for f in os.listdir(CACHE_EMB_DIR)
@@ -162,7 +144,6 @@ def _clean_old_caches(base_name: str, keep_latest: int = 5):
             pass
 
 def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
-    """Load or create embeddings cache (chunk size + overlap aware)."""
     cache_key = _hash_name(file_name, chunk_size or 1000, overlap or 100, len(chunks))
     cache_file = f"{os.path.basename(file_name)}_cs{chunk_size}_ov{overlap}_{cache_key}.pkl"
     cache_path = os.path.join(CACHE_EMB_DIR, cache_file)
@@ -182,9 +163,8 @@ def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None,
     return embeddings
 
 # ==========================================================
-# 6️⃣ Prompt Templates (Enhanced for Structured Formatting + Clean Output)
+# 6️⃣ Prompt Templates
 # ==========================================================
-
 STRICT_PROMPT = (
     "You are an enterprise documentation assistant.\n"
     "Use all relevant information from the CONTEXT below.\n"
@@ -198,8 +178,6 @@ STRICT_PROMPT = (
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 
-
-
 REASONING_PROMPT = (
     "You are an expert enterprise assistant capable of reasoning.\n"
    "Think step by step and synthesize information even if scattered across chunks.\n"
@@ -211,6 +189,30 @@ REASONING_PROMPT = (
     "Context:\n{context}\n\nQuestion: {query}\nLet's reason step-by-step:\nAnswer:"
 )
 
+# ==========================================================
+# 🔹 NEW: Lightweight PRF Query Expansion
+# ==========================================================
+def expand_query_embedding(query, model, index, chunks, topN=40, alpha=0.75):
+    """
+    Expands the query embedding slightly using top candidate chunks (PRF-style).
+    Helps when query wording differs from document phrasing.
+    """
+    q_emb = model.encode([f"query: {query}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+    try:
+        D, I = index.search(np.array([q_emb]).astype("float32"), topN)
+        texts = " ".join(chunks[i] for i in I[0] if i >= 0)
+        words = re.findall(r"[A-Za-z]{4,}", texts)
+        common = [w for w, _ in Counter(words).most_common(6) if w.lower() not in query.lower()]
+        if not common:
+            return q_emb
+        e_emb = model.encode([f"passage: {' '.join(common)}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+        combined = alpha * q_emb + (1 - alpha) * e_emb
+        combined /= np.linalg.norm(combined)
+        print(f"🔍 Query expanded with: {common}")
+        return combined
+    except Exception as e:
+        print(f"⚠️ Query expansion skipped due to error: {e}")
+        return q_emb
 
 # ==========================================================
 # 7️⃣ Retrieval — FAISS + Bullet-Aware Re-rank + Neighbor Fill
@@ -220,24 +222,14 @@ from vectorstore import build_faiss_index
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 7,
                     min_similarity: float = 0.6, candidate_multiplier: int = 3,
                     embeddings: list = None):
-    """
-    Retrieves the most relevant chunks using FAISS similarity + reranking.
-    Includes bullet-aware similarity boost and a fallback mechanism if
-    similarity threshold isn't met — ensuring predictable, complete retrieval.
-    """
     if not index or not chunks:
         print("⚠️ No FAISS index or chunks provided — returning empty result.")
         return []
 
     try:
-        # --- Encode query
-        q_emb = _query_model.encode(
-            [f"query: {query.strip()}"],
-            convert_to_numpy=True,
-            normalize_embeddings=True
-        )[0]
+        # --- PRF-enhanced query embedding
+        q_emb = expand_query_embedding(query, _query_model, index, chunks)
 
-        # --- Rebuild index if mismatch occurs
         if hasattr(index, "d") and q_emb.shape[0] != index.d:
             print(f"⚠️ FAISS dimension mismatch: index={index.d}, query={q_emb.shape[0]}")
             if embeddings:
@@ -246,46 +238,35 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 7,
             else:
                 return []
 
-        # --- Retrieve top candidate chunks
         num_candidates = max(top_k * candidate_multiplier, top_k + 2)
         distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
-        candidate_indices = [int(i) for i in indices[0] if i >= 0]
-        candidate_indices = list(dict.fromkeys(candidate_indices))  # remove duplicates
+        candidate_indices = list(dict.fromkeys([int(i) for i in indices[0] if i >= 0]))
 
-        # --- Re-rank using cosine similarity
         doc_embs = _query_model.encode(
             [f"passage: {chunks[i]}" for i in candidate_indices],
             convert_to_numpy=True,
             normalize_embeddings=True,
         )
         sims = cosine_similarity([q_emb], doc_embs)[0]
-
         boosted_sims = []
         for idx, sim in zip(candidate_indices, sims):
             text = chunks[idx].strip()
             if re.match(r"^[-•\d]+[\.\s]", text):
-                sim += 0.05  # slight boost for procedural bullets
+                sim += 0.05
             boosted_sims.append((idx, sim))
 
         ranked = sorted(boosted_sims, key=lambda x: x[1], reverse=True)
-
-        # --- Filter based on similarity threshold
         filtered = [idx for idx, sim in ranked if sim >= min_similarity][:top_k]
-
-        # --- Fallback: if no matches above threshold, pick top_k anyway
         if not filtered:
             print(f"⚠️ No chunks ≥ {min_similarity:.2f} — using top {top_k} ranked chunks instead.")
             filtered = [idx for idx, sim in ranked[:top_k]]
 
-        # --- Neighbor continuity: include nearby chunks
         neighbors = set()
         for idx in filtered:
             for n in [idx - 1, idx + 1]:
                 if 0 <= n < len(chunks):
                     neighbors.add(n)
         filtered = sorted(set(filtered) | neighbors)
-
-        # --- Return final chunk set
         final_chunks = [chunks[i] for i in filtered]
         avg_sim = np.mean([s for _, s in ranked[:top_k]])
         print(f"✅ Retrieved {len(final_chunks)} chunks | avg_sim={avg_sim:.3f} | threshold={min_similarity:.2f}")
@@ -295,21 +276,18 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 7,
         print(f"⚠️ Retrieval error: {repr(e)}")
         return []
 
-
 # ==========================================================
-# 8️⃣ Answer Generation (Lazy GPT-4o Initialization)
+# 8️⃣ Answer Generation
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
 
-    # Try lazy initialization
     try:
         chat_llm_local = get_chat_llm()
     except Exception:
         return "⚠️ GPT-4o not initialized. Check credentials or rebuild the Space."
 
-    # Build context and prompt
     context = "\n".join(f"[Chunk {i+1}] {chunk.strip()}" for i, chunk in enumerate(retrieved_chunks))
     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)
 
@@ -323,8 +301,6 @@ def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = F
          "'I don't know based on the provided document.'"},
         {"role": "user", "content": prompt},
     ]
-
-    # Invoke GPT-4o
     try:
         response = chat_llm_local.invoke(messages)
         return response.content.strip()
@@ -332,12 +308,10 @@ def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = F
         print(f"⚠️ GPT-4o generation failed: {e}")
         return "⚠️ Error: Could not generate an answer."
 
-
 # ==========================================================
-# 9️⃣ Generic Text Generation Helper (for AI suggestions)
+# 9️⃣ Generic Text Generation Helper
 # ==========================================================
 def genai_generate(prompt: str) -> str:
-    # Try lazy initialization
     try:
         chat_llm_local = get_chat_llm()
     except Exception:
@@ -370,7 +344,6 @@ if __name__ == "__main__":
 
     embeddings = embed_chunks(dummy_chunks)
     index = build_faiss_index(embeddings)
-
     query = "What are the prerequisites for commerce automation?"
     retrieved = retrieve_chunks(query, index, dummy_chunks)
     print("🔍 Retrieved:", retrieved)
 
 
 
 
127
  def _hash_name(file_name: str, chunk_size: int, overlap: int, num_chunks: int):
 
128
  combo = f"{file_name}_{chunk_size}_{overlap}_{num_chunks}"
129
  return hashlib.md5(combo.encode()).hexdigest()[:8]
130
 
131
  def _clean_old_caches(base_name: str, keep_latest: int = 5):
 
132
  files = [
133
  (os.path.getmtime(os.path.join(CACHE_EMB_DIR, f)), f)
134
  for f in os.listdir(CACHE_EMB_DIR)
 
144
  pass
145
 
146
  def cache_embeddings(file_name: str, chunks, embed_func, chunk_size: int = None, overlap: int = None):
 
147
  cache_key = _hash_name(file_name, chunk_size or 1000, overlap or 100, len(chunks))
148
  cache_file = f"{os.path.basename(file_name)}_cs{chunk_size}_ov{overlap}_{cache_key}.pkl"
149
  cache_path = os.path.join(CACHE_EMB_DIR, cache_file)
 
163
  return embeddings
164
 
165
  # ==========================================================
166
+ # 6️⃣ Prompt Templates
167
  # ==========================================================
 
168
  STRICT_PROMPT = (
169
  "You are an enterprise documentation assistant.\n"
170
  "Use all relevant information from the CONTEXT below.\n"
 
178
  "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
179
  )
180
 
 
 
181
  REASONING_PROMPT = (
182
  "You are an expert enterprise assistant capable of reasoning.\n"
183
  "Think step by step and synthesize information even if scattered across chunks.\n"
 
189
  "Context:\n{context}\n\nQuestion: {query}\nLet's reason step-by-step:\nAnswer:"
190
  )
191
 
192
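Neighbor fill is equally mechanical: every retained index pulls in its immediate predecessor and successor, so procedures split across chunk boundaries stay intact. A toy run with hypothetical indices:

chunks = [f"chunk {i}" for i in range(12)]
filtered = [5, 9]
neighbors = {n for idx in filtered for n in (idx - 1, idx + 1) if 0 <= n < len(chunks)}
print(sorted(set(filtered) | neighbors))  # [4, 5, 6, 8, 9, 10]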