Update src/qa.py
src/qa.py
CHANGED
@@ -1,10 +1,10 @@
 """
-qa.py — Retrieval + Generation Layer
-
+qa.py — Retrieval + Generation Layer (Optimized Mistral Version)
+---------------------------------------------------------------
 Handles:
 • Query embedding (SentenceTransformer / E5-compatible)
-• Chunk retrieval (FAISS)
-• Answer generation (Mistral-7B-Instruct)
+• Chunk retrieval (FAISS, no redundant encoding)
+• Answer generation (Mistral-7B-Instruct, quantized for CPU)
 Optimized for Hugging Face Spaces & Streamlit.
 """
 
@@ -12,8 +12,8 @@ import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from vectorstore import search_faiss
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 print("✅ qa.py (Mistral version) loaded from:", __file__)
 
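Review note on the import hunk: the cosine re-ranking pass that used cosine_similarity is deleted later in this diff, and search_faiss is never called in any hunk shown here, so both imports may now be dead. If nothing outside the visible lines still uses them, the block could be trimmed as in this sketch (an assumption about the rest of the file, not part of the commit):

# Sketch: trimmed imports, assuming qa.py has no other call sites
# for cosine_similarity or search_faiss.
import os

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline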
@@ -28,19 +28,20 @@ os.environ.update({
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HF_MODULES_CACHE": CACHE_DIR
 })
+print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
 
 # ==========================================================
-# 2️⃣ Query Embedding Model
+# 2️⃣ Query Embedding Model (fast, efficient)
 # ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
-    print("✅ Loaded
+    print("✅ Loaded model: intfloat/e5-small-v2")
 except Exception as e:
-    print(f"⚠️
+    print(f"⚠️ Embedding model load failed ({e}), falling back to MiniLM.")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 
 # ==========================================================
-# 3️⃣ LLM Setup
+# 3️⃣ LLM Setup (Mistral 7B-Instruct, quantized)
 # ==========================================================
 MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
 print(f"✅ Loading LLM: {MODEL_NAME}")
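Review note on the fallback path: intfloat/e5-small-v2 and sentence-transformers/all-MiniLM-L6-v2 both publish 384-dimensional embeddings, so a FAISS index built with one will still accept query vectors from the other, but MiniLM was not trained with E5's "query: "/"passage: " prefixes, so retrieval quality can drift silently after a fallback. A cheap guard before searching, as a minimal sketch (check_encoder_matches_index is hypothetical; _query_model is the encoder defined above, and index.d is the dimension FAISS stores):

def check_encoder_matches_index(index) -> None:
    # Fail fast instead of silently searching with a mismatched
    # encoder after the MiniLM fallback has kicked in.
    dim = _query_model.get_sentence_embedding_dimension()
    if index.d != dim:
        raise ValueError(f"Encoder dim {dim} != index dim {index.d}")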
@@ -50,101 +51,96 @@ _model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     cache_dir=CACHE_DIR,
     torch_dtype="auto",
-    device_map="auto"
+    device_map="auto",       # smart layer placement
+    low_cpu_mem_usage=True,  # enables disk offloading on CPU
 )
-
 _answer_model = pipeline(
     "text-generation",
     model=_model,
     tokenizer=_tokenizer,
-    max_new_tokens=
-
-    do_sample=False
+    max_new_tokens=600,
+    do_sample=False,
 )
 print("✅ Mistral text-generation pipeline ready.")
 
 # ==========================================================
 # 4️⃣ Prompt Template
 # ==========================================================
-PROMPT_TEMPLATE = """
-
-
+PROMPT_TEMPLATE = """
+You are an enterprise knowledge assistant.
+Use ONLY the CONTEXT below to answer the QUESTION clearly, completely, and factually.
+If the context doesn’t contain the answer, reply exactly:
 "I don't know based on the provided document."
 
+---
 Context:
 {context}
-
+---
 Question:
 {query}
-
-Answer:
+---
+Answer:
+"""
 
 # ==========================================================
-# 5️⃣ Chunk Retrieval Function
+# 5️⃣ Chunk Retrieval Function (FAST)
 # ==========================================================
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
-    """
+    """Fast semantic retrieval with FAISS — no redundant re-encoding."""
     if not index or not chunks:
         return []
 
     try:
-        # Encode
+        # Step 1: Encode query once
         query_emb = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
 
-        #
-        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
-
-        # Merge neighboring chunks
-        merged_chunks = []
-        for idx in indices[0]:
-            neighbors = [chunks[i] for i in range(max(0, idx - 1), min(len(chunks), idx + 2))]
-            merged_chunks.append(" ".join(neighbors))
-
-        # Re-rank by cosine similarity
-        chunk_vecs = np.array([
-            _query_model.encode([c], convert_to_numpy=True, normalize_embeddings=True)[0]
-            for c in merged_chunks
-        ])
-        scores = cosine_similarity(np.array([query_emb]), chunk_vecs)[0]
-        sorted_indices = np.argsort(scores)[::-1]
+        # Step 2: FAISS search only (already has precomputed embeddings)
+        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
 
-
+        # Step 3: Return top chunks directly (fast)
+        return [chunks[i] for i in indices[0]]
 
     except Exception as e:
         print(f"⚠️ Retrieval error: {e}")
         return []
 
 # ==========================================================
-# 6️⃣ Answer Generation Function
+# 6️⃣ Answer Generation Function (Optimized for Speed)
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list):
-    """Generate factual, context-grounded answers using Mistral
+    """Generate factual, context-grounded answers using Mistral."""
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
 
-    #
+    # Merge retrieved chunks
     context = "\n\n".join([
         f"[Chunk {i+1}]: {chunk.strip()}"
         for i, chunk in enumerate(retrieved_chunks)
     ])
+
    prompt = PROMPT_TEMPLATE.format(context=context, query=query)
 
    try:
-        result = _answer_model(
-
-
-
+        result = _answer_model(
+            prompt,
+            max_new_tokens=700,
+            temperature=None,
+            do_sample=False,
+            pad_token_id=_tokenizer.eos_token_id,
+        )
+        answer = result[0]["generated_text"].strip()
         return answer
+
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
        return "⚠️ Error: Could not generate an answer at the moment."
 
 # ==========================================================
-# 7️⃣ Local Test
+# 7️⃣ Local Test (run only in dev mode)
 # ==========================================================
 if __name__ == "__main__":
     dummy_chunks = [
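Review note on retrieval: the rewrite replaces neighbor merging plus cosine re-ranking with a single index.search call. That is a pure speed win only if the index already holds normalized passage embeddings in an inner-product index, so that the returned scores are cosine similarities and no second pass is needed. A compatible build might look like this sketch (build_compatible_index and the "passage: " prefix follow the E5 convention; neither is code from this repo):

import faiss  # assumption: faiss-cpu is installed

def build_compatible_index(chunks: list):
    # Embed documents with E5's "passage: " prefix, mirroring the
    # "query: " prefix that retrieve_chunks adds to queries.
    vecs = _query_model.encode(
        [f"passage: {c}" for c in chunks],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    index = faiss.IndexFlatIP(vecs.shape[1])  # inner product equals cosine on unit vectors
    index.add(vecs)
    return index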
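Review note on generation: a transformers text-generation pipeline returns the prompt followed by the completion in generated_text by default, so result[0]["generated_text"].strip() in the new generate_answer will most likely still contain the entire prompt. Passing return_full_text=False keeps only the completion. (The new section headers also mention quantization, but nothing in the visible hunks configures it; only torch_dtype="auto" and low_cpu_mem_usage=True appear.) A sketch of the adjusted call:

result = _answer_model(
    prompt,
    max_new_tokens=700,
    do_sample=False,
    return_full_text=False,  # drop the echoed prompt, keep only the completion
    pad_token_id=_tokenizer.eos_token_id,
)
answer = result[0]["generated_text"].strip()

End to end, the two public functions compose like this (a usage sketch; build_compatible_index comes from the retrieval note above, and the chunk texts are made up):

chunks = ["Refunds are processed within 30 days.", "Support is available 24/7."]
index = build_compatible_index(chunks)
hits = retrieve_chunks("How long do refunds take?", index, chunks, top_k=2)
print(generate_answer("How long do refunds take?", hits))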