Shubham170793 committed on
Commit 49c4268 · verified · 1 Parent(s): a5ea9d2

Update src/qa.py

Files changed (1)
  1. src/qa.py +49 -45
src/qa.py CHANGED
@@ -1,21 +1,21 @@
 """
-qa.py — Retrieval + Generation Layer (Optimized Mistral Version)
----------------------------------------------------------------
+qa.py — Retrieval + Generation Layer (Mistral Optimized v2)
+-----------------------------------------------------------
 Handles:
-• Query embedding (SentenceTransformer / E5-compatible)
-• Chunk retrieval (FAISS, no redundant encoding)
-• Answer generation (Mistral-7B-Instruct, quantized for CPU)
-Optimized for Hugging Face Spaces & Streamlit.
+• Query embedding (SentenceTransformer / E5)
+• Fast FAISS retrieval with context merging
+• Answer generation via Mistral-7B-Instruct (optimized for CPU)
+-----------------------------------------------------------
+Built for Hugging Face Spaces / Streamlit apps.
 """

 import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-from vectorstore import search_faiss
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

-print("✅ qa.py (Mistral version) loaded from:", __file__)
+print("✅ qa.py (Mistral Optimized v2) loaded from:", __file__)

 # ==========================================================
 # 1️⃣ Hugging Face Cache Setup
@@ -31,19 +31,19 @@ os.environ.update({
 print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

 # ==========================================================
-# 2️⃣ Query Embedding Model (fast, efficient)
+# 2️⃣ Query Embedding Model (E5-small, lightweight)
 # ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
-    print("✅ Loaded model: intfloat/e5-small-v2")
+    print("✅ Loaded query model: intfloat/e5-small-v2")
 except Exception as e:
-    print(f"⚠️ Embedding model load failed ({e}), falling back to MiniLM.")
+    print(f"⚠️ Embedding model load failed ({e}), using MiniLM fallback.")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)

 # ==========================================================
-# 3️⃣ LLM Setup (Mistral 7B-Instruct, quantized)
+# 3️⃣ LLM Setup: Mistral-7B-Instruct (quantized + optimized)
 # ==========================================================
-MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # slightly faster and stable
 print(f"✅ Loading LLM: {MODEL_NAME}")

 _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
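A note on the "quantized" wording in the section header: neither torch_dtype="auto" nor low_cpu_mem_usage=True in the next hunk actually quantizes anything; they only control the load dtype and weight staging. For reference, a minimal sketch of what real 4-bit quantized loading looks like with transformers' BitsAndBytesConfig. This is not part of the commit, and bitsandbytes 4-bit requires a CUDA GPU, so on a CPU-only Space a GGUF build served by llama-cpp-python would be the more realistic route:

    # Illustrative only (not in this commit): true 4-bit quantized loading.
    # Requires a CUDA GPU and the bitsandbytes package.
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                       # quantize weights to 4-bit
        bnb_4bit_compute_dtype=torch.bfloat16,   # dtype used for the actual matmuls
    )
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2",
        quantization_config=bnb_config,
        device_map="auto",
    )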
@@ -51,8 +51,8 @@ _model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     cache_dir=CACHE_DIR,
     torch_dtype="auto",
-    device_map="auto",  # smart layer placement
-    low_cpu_mem_usage=True,  # enables disk offloading on CPU
+    device_map="auto",
+    low_cpu_mem_usage=True,
 )
 _answer_model = pipeline(
     "text-generation",
@@ -64,29 +64,23 @@ _answer_model = pipeline(
 print("✅ Mistral text-generation pipeline ready.")

 # ==========================================================
-# 4️⃣ Prompt Template
+# 4️⃣ Prompt Template (compact + efficient)
 # ==========================================================
-PROMPT_TEMPLATE = """
-You are an enterprise knowledge assistant.
-Use ONLY the CONTEXT below to answer the QUESTION clearly, completely, and factually.
-If the context doesn’t contain the answer, reply exactly:
-"I don't know based on the provided document."
-
----
-Context:
-{context}
----
-Question:
-{query}
----
-Answer:
-"""
+PROMPT_TEMPLATE = (
+    "Answer the question using only the document context below. "
+    "If the answer isn’t clearly in the document, say: "
+    "'I don't know based on the provided document.'\n\n"
+    "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
+)

 # ==========================================================
-# 5️⃣ Chunk Retrieval Function (FAST)
+# 5️⃣ Fast Chunk Retrieval with Context Merging
 # ==========================================================
-def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
-    """Fast semantic retrieval with FAISS — no redundant re-encoding."""
+def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5, merge_window: int = 1):
+    """
+    Fast semantic retrieval with lightweight neighborhood expansion.
+    Retrieves top-K relevant chunks, then merges nearby ones for context continuity.
+    """
     if not index or not chunks:
         return []

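Because the rewritten PROMPT_TEMPLATE is now a single format string, the assembled prompt can be sanity-checked in isolation, which is handy when tuning the wording. A quick check with dummy values:

    # Quick sanity check of the compact template (dummy values, run in a REPL):
    context = "Mistral-7B is a 7-billion-parameter language model."
    query = "How many parameters does Mistral-7B have?"
    print(PROMPT_TEMPLATE.format(context=context, query=query))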
@@ -98,18 +92,25 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
             normalize_embeddings=True
         )[0]

-        # Step 2: FAISS search only (already has precomputed embeddings)
-        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
+        # Step 2: Retrieve top-K*2 candidates
+        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k * 2)

-        # Step 3: Return top chunks directly (fast)
-        return [chunks[i] for i in indices[0]]
+        # Step 3: Expand retrieval to nearby chunks
+        selected = set()
+        for idx in indices[0]:
+            for n in range(max(0, idx - merge_window), min(len(chunks), idx + merge_window + 1)):
+                selected.add(n)
+
+        # Step 4: Preserve order (important for sequential text like steps)
+        ordered = [chunks[i] for i in sorted(selected)]
+        return ordered

     except Exception as e:
         print(f"⚠️ Retrieval error: {e}")
         return []

 # ==========================================================
-# 6️⃣ Answer Generation Function (Optimized for Speed)
+# 6️⃣ Answer Generation Function (Faster + Cleaner Output)
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list):
     """Generate factual, context-grounded answers using Mistral."""
@@ -117,11 +118,7 @@ def generate_answer(query: str, retrieved_chunks: list):
         return "Sorry, I couldn’t find relevant information in the document."

     # Merge retrieved chunks
-    context = "\n\n".join([
-        f"[Chunk {i+1}]: {chunk.strip()}"
-        for i, chunk in enumerate(retrieved_chunks)
-    ])
-
+    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
     prompt = PROMPT_TEMPLATE.format(context=context, query=query)

     try:
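Returning to the retrieval change above: the Step 3 neighborhood expansion is easiest to see with concrete numbers. A self-contained run of the same loop on dummy values:

    # Same expansion logic as Step 3 in retrieve_chunks, run on dummy data.
    chunks = [f"chunk {i}" for i in range(12)]
    hits = [4, 9]        # pretend FAISS returned these row indices
    merge_window = 1

    selected = set()
    for idx in hits:
        for n in range(max(0, idx - merge_window), min(len(chunks), idx + merge_window + 1)):
            selected.add(n)

    print(sorted(selected))  # [3, 4, 5, 8, 9, 10]: each hit plus its immediate neighbors

Two things worth keeping in mind. With top_k * 2 candidates and merge_window=1, up to 6 * top_k chunks can reach the prompt, so the compact template is carrying a larger context than before. And when the index holds fewer than top_k * 2 vectors, FAISS pads the indices array with -1, which this loop can silently map to chunk 0; filtering out idx < 0 before expanding would be a cheap guard.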
@@ -133,6 +130,13 @@ def generate_answer(query: str, retrieved_chunks: list):
             pad_token_id=_tokenizer.eos_token_id,
         )
         answer = result[0]["generated_text"].strip()
+
+        # Cleanup redundant prompt echo
+        if "Question:" in answer:
+            answer = answer.split("Question:")[-1].strip()
+        if answer.startswith(query):
+            answer = answer[len(query):].strip()
+
         return answer

     except Exception as e:
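The echo cleanup above exists because text-generation pipelines return the prompt together with the completion by default. The pipeline API has a switch for this: passing return_full_text=False (not used in this commit) makes it return only the newly generated text, which would make the string surgery unnecessary. A sketch, with max_new_tokens as an assumed value since the real setting is outside the hunks shown:

    # Alternative to the manual echo-stripping above (not in this commit):
    result = _answer_model(
        prompt,
        max_new_tokens=256,                   # assumed value
        pad_token_id=_tokenizer.eos_token_id,
        return_full_text=False,               # return only the completion, no prompt echo
    )
    answer = result[0]["generated_text"].strip()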
@@ -140,7 +144,7 @@ def generate_answer(query: str, retrieved_chunks: list):
         return "⚠️ Error: Could not generate an answer at the moment."

 # ==========================================================
-# 7️⃣ Local Test (run only in dev mode)
+# 7️⃣ Local Dev Test (optional)
 # ==========================================================
 if __name__ == "__main__":
     dummy_chunks = [
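For anyone wanting to exercise the new retrieval path locally, a minimal end-to-end sketch. The FAISS index construction and the passage prefix are assumptions about the rest of the repo, since only qa.py changes in this commit; E5 models are trained with query/passage prefixes, and importing qa also loads the 7B model, so run this only where that is feasible:

    import faiss                       # pip install faiss-cpu
    import numpy as np
    from sentence_transformers import SentenceTransformer
    from qa import retrieve_chunks, generate_answer   # assumes src/ is on sys.path

    encoder = SentenceTransformer("intfloat/e5-small-v2")
    chunks = ["first passage ...", "second passage ...", "third passage ..."]
    emb = encoder.encode([f"passage: {c}" for c in chunks], normalize_embeddings=True)

    index = faiss.IndexFlatIP(emb.shape[1])   # inner product = cosine on normalized vectors
    index.add(np.asarray(emb, dtype="float32"))

    hits = retrieve_chunks("what does the second passage say?", index, chunks, top_k=2)
    print(generate_answer("what does the second passage say?", hits))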
 