Update src/qa.py
src/qa.py
CHANGED
@@ -1,25 +1,22 @@
 """
-qa.py — Phi-2 …
-…
-• …
-• …
-• …
-Optimized for Hugging Face Spaces & Streamlit.
+qa.py — Fast, Reasoning-Enabled Phi-2 Version
+----------------------------------------------
+• Uses SentenceTransformer (E5-small) for embeddings
+• Uses microsoft/phi-2 for generation
+• Retains reasoning vs factual modes
+• Optimized for speed and low VRAM on CPU
 """

 import os
 import numpy as np
-import torch
 from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from …
+from sklearn.metrics.pairwise import cosine_similarity

-print("✅ qa.py (Phi-2 …
+print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)

 # ==========================================================
-# …
+# Hugging Face Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -32,22 +29,17 @@ os.environ.update({
 print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

 # ==========================================================
-# …
-# ==========================================================
-torch.set_num_threads(2)  # Limit CPU threads for faster execution
-
-# ==========================================================
-# 3️⃣ Query Embedding Model
+# Query Embedding Model
 # ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
     print("✅ Loaded embedding model: intfloat/e5-small-v2")
 except Exception as e:
-    print(f"⚠️ …
+    print(f"⚠️ Fallback to MiniLM due to {e}")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)

 # ==========================================================
-# …
+# Phi-2 Model (Causal LM)
 # ==========================================================
 MODEL_NAME = "microsoft/phi-2"
 print(f"✅ Loading LLM: {MODEL_NAME}")
@@ -57,103 +49,109 @@ _model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     cache_dir=CACHE_DIR,
     torch_dtype="auto",
-    …
+    low_cpu_mem_usage=True
+)
+_answer_model = pipeline(
+    "text-generation",
+    model=_model,
+    tokenizer=_tokenizer,
+    device=-1  # CPU-compatible
 )
-_answer_model = pipeline("text-generation", model=_model, tokenizer=_tokenizer, device_map="auto")
 print("✅ Phi-2 generation pipeline ready.")

 # ==========================================================
-# …
+# Prompt Templates
 # ==========================================================
-…
-    "CONTEXT:\n{context}\n\nQUESTION: {query}\nANSWER:"
-)
+REASONING_PROMPT = """
+You are an intelligent enterprise assistant.
+Use the CONTEXT below and your general understanding to answer the QUESTION logically and clearly.
+Explain your reasoning briefly if helpful.

-…
+---
+CONTEXT:
+{context}
+---
+QUESTION:
+{query}
+---
+ANSWER:
+"""
+
+STRICT_PROMPT = """
+You are an enterprise document assistant.
+Use ONLY the CONTEXT below to answer the QUESTION clearly and factually.
+If the answer is not found in the context, reply exactly:
+"I don't know based on the provided document."
+
+---
+CONTEXT:
+{context}
+---
+QUESTION:
+{query}
+---
+ANSWER:
+"""

 # ==========================================================
-# …
+# Retrieve Chunks
 # ==========================================================
-def retrieve_chunks(query: str, index, chunks: list, top_k: int = …
-    """Retrieve top-K relevant chunks quickly …
+def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
+    """Retrieve top-K most relevant chunks quickly (no re-ranking for speed)."""
     if not index or not chunks:
         return []
-        …
-        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
-        return [chunks[i] for i in indices[0]]
-    except Exception as e:
-        print(f"⚠️ Retrieval error: {e}")
-        return []
+    query_emb = _query_model.encode(
+        [f"query: {query.strip()}"],
+        convert_to_numpy=True,
+        normalize_embeddings=True
+    )[0]
+    distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
+    return [chunks[i] for i in indices[0]]
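
Note on retrieval: cosine_similarity is imported from scikit-learn but does not appear to be used anywhere in the code shown, and the new docstring says re-ranking is skipped for speed. If a re-ranking pass were wanted later, a minimal sketch could reuse this module's _query_model; the helper name rerank_chunks below is hypothetical and not part of this change.

from sklearn.metrics.pairwise import cosine_similarity

def rerank_chunks(query: str, candidates: list, keep: int = 3):
    """Hypothetical re-ranking pass: score candidate chunks against the query
    with cosine similarity and keep the highest-scoring ones."""
    query_emb = _query_model.encode([f"query: {query.strip()}"],
                                    convert_to_numpy=True, normalize_embeddings=True)
    chunk_embs = _query_model.encode([f"passage: {c}" for c in candidates],
                                     convert_to_numpy=True, normalize_embeddings=True)
    scores = cosine_similarity(query_emb, chunk_embs)[0]   # shape: (len(candidates),)
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [chunk for chunk, _ in ranked[:keep]]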

 # ==========================================================
-# …
+# Generate Answer (Phi-2)
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True):
-    """
-    Generates answers using Phi-2.
-    reasoning_mode=True → reasoning + external knowledge
-    reasoning_mode=False → strict chunk-only factual mode
-    """
+    """Generate answers using Phi-2. Supports reasoning or strict factual modes."""
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."

-    # Merge retrieved context
     context = "\n".join([chunk.strip() for chunk in retrieved_chunks])
-
-    # Select prompt based on mode
-    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
-        context=context, query=query
-    )
+    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)

     try:
-        # ⚡ Speed-optimized generation
         result = _answer_model(
             prompt,
-            max_new_tokens=…
-            temperature=0.…
-            do_sample=False,
-            …
+            max_new_tokens=180,          # keeps output short & fast
+            temperature=0.4 if reasoning_mode else 0.2,
+            do_sample=False,             # deterministic
+            num_beams=1,                 # no beam search for speed
+            early_stopping=True,
         )
-
-        answer = result[0]["generated_text"].split("ANSWER:")[-1].strip()
-
-        # Safety: truncate overly long rambles
-        if len(answer.split()) > 150:
-            answer = " ".join(answer.split()[:150]) + "..."
-
-        return answer
-
+        text = result[0]["generated_text"].split("ANSWER:")[-1].strip()
+        return text
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer."

 # ==========================================================
-# …
+# Local Test (optional)
 # ==========================================================
 if __name__ == "__main__":
+    from vectorstore import build_faiss_index
+
     dummy_chunks = [
         "Step 1: Open the dashboard and navigate to reports.",
-        "Step 2: Click 'Export' to download a CSV summary."
+        "Step 2: Click 'Export' to download a CSV summary.",
+        "Step 3: Review the generated report in your downloads folder."
     ]
-
+
     index = build_faiss_index([
-        _query_model.encode([f"passage: {…
-        for …
+        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+        for chunk in dummy_chunks
     ])
-…
+
+    query = "What are the steps to export a report?"
+    retrieved = retrieve_chunks(query, index, dummy_chunks)
+    print("🔍 Retrieved:", retrieved)
+    print("💬 Answer:", generate_answer(query, retrieved))