Shubham170793 committed
Commit 6718956 · verified · 1 Parent(s): 74cc3b2

Update src/qa.py

Files changed (1):
  1. src/qa.py +76 -91
src/qa.py CHANGED
@@ -1,23 +1,25 @@
 """
-qa.py — Optimized Phi-2 Retrieval + Generation
-----------------------------------------------
-Uses:
-    intfloat/e5-small-v2 for embeddings
-    microsoft/phi-2 for reasoning-rich generation (fast on CPU)
-Optimized for: speed + stability in Streamlit / Hugging Face Spaces
+qa.py — Phi-2 Hybrid Mode (Reasoning + Strict)
+-------------------------------------
+Handles:
+    Query embedding (SentenceTransformer / E5-small-v2)
+    Chunk retrieval (FAISS)
+    Answer generation (Phi-2, with toggleable reasoning)
+Optimized for Hugging Face Spaces & Streamlit.
 """
 
 import os
 import numpy as np
+import torch
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import torch
+from vectorstore import search_faiss
 
-print("✅ qa.py (Phi-2 optimized fast) loaded from:", __file__)
+print("✅ qa.py (Phi-2 Hybrid Mode) loaded from:", __file__)
 
 # ==========================================================
-# 1️⃣ Cache Setup
+# 1️⃣ Hugging Face Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -27,132 +29,115 @@ os.environ.update({
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HF_MODULES_CACHE": CACHE_DIR
 })
+print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
+
+# ==========================================================
+# 2️⃣ Speed Tweaks for CPU
+# ==========================================================
+torch.set_num_threads(2)  # Limit CPU threads for faster execution
 
 # ==========================================================
-# 2️⃣ Embedding Model
+# 3️⃣ Query Embedding Model
 # ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
     print("✅ Loaded embedding model: intfloat/e5-small-v2")
 except Exception as e:
-    print(f"⚠️ Fallback to MiniLM due to {e}")
+    print(f"⚠️ Query model load failed ({e}), using fallback MiniLM.")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 
 # ==========================================================
-# 3️⃣ Phi-2 LLM Setup (Quantized for CPU)
+# 4️⃣ LLM Setup Phi-2 (Optimized)
 # ==========================================================
-try:
-    MODEL_NAME = "microsoft/phi-2"
-    print(f"✅ Loading LLM: {MODEL_NAME} (quantized, CPU-optimized)")
-
-    _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-
-    # ✅ Load model in mixed precision for 4–6× faster inference
-    _model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        cache_dir=CACHE_DIR,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
-        low_cpu_mem_usage=True,
-    ).to("cpu")
-
-    # ✅ Create generation pipeline (keep in memory)
-    _answer_model = pipeline(
-        "text-generation",
-        model=_model,
-        tokenizer=_tokenizer,
-        device=-1,
-        model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
-    )
-
-    print("✅ Phi-2 text-generation pipeline ready (optimized).")
-
-except Exception as e:
-    print(f"⚠️ Phi-2 load failed: {e}")
-    _answer_model = None
+MODEL_NAME = "microsoft/phi-2"
+print(f"✅ Loading LLM: {MODEL_NAME}")
+
+_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    cache_dir=CACHE_DIR,
+    torch_dtype="auto",
+    device_map="auto"
+)
+_answer_model = pipeline("text-generation", model=_model, tokenizer=_tokenizer, device_map="auto")
+print("✅ Phi-2 generation pipeline ready.")
 
 # ==========================================================
-# 4️⃣ Prompt Template (Balanced Mode — quality + speed)
+# 5️⃣ Prompt Templates (Two Modes)
 # ==========================================================
-PROMPT_TEMPLATE = (
-    "You are a helpful enterprise document assistant. "
-    "Use ONLY the following context to answer the question clearly and factually. "
-    "If the information is missing, say exactly: 'I don't know based on the provided document.'\n\n"
-    "Keep your answer concise (2–5 sentences) but ensure it covers all relevant details.\n\n"
-    "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
+STRICT_PROMPT = (
+    "You are a factual assistant. Use ONLY the CONTEXT below to answer. "
+    "If the answer is not explicitly in the context, say exactly: "
+    "'I don't know based on the provided document.'\n\n"
+    "CONTEXT:\n{context}\n\nQUESTION: {query}\nANSWER:"
+)
+
+REASONING_PROMPT = (
+    "You are an intelligent assistant. Use the CONTEXT below and your general knowledge "
+    "to provide the most complete and helpful answer. If unsure, say 'I don't know.'\n\n"
+    "CONTEXT:\n{context}\n\nQUESTION: {query}\nANSWER:"
 )
 
 # ==========================================================
-# 5️⃣ Retrieve Top-K Chunks
+# 6️⃣ Chunk Retrieval Function
 # ==========================================================
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
-    """Efficient FAISS retrieval using cosine similarity."""
+    """Retrieve top-K relevant chunks quickly using FAISS."""
     if not index or not chunks:
         return []
 
     try:
-        q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
-
-        selected = set()
-        for idx in indices[0]:
-            for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
-                selected.add(i)
-
-        ordered_chunks = [chunks[i] for i in sorted(selected)]
-        return ordered_chunks
+        query_emb = _query_model.encode(
+            [f"query: {query.strip()}"],
+            convert_to_numpy=True,
+            normalize_embeddings=True
+        )[0]
+        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k)
+        return [chunks[i] for i in indices[0]]
     except Exception as e:
        print(f"⚠️ Retrieval error: {e}")
        return []
 
 # ==========================================================
-# 6️⃣ Answer Generation (fast)
+# 7️⃣ Answer Generation Function (with Mode Toggle)
 # ==========================================================
-def generate_answer(query: str, retrieved_chunks: list):
-    """Generate concise, grounded answers using Phi-2."""
+def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = True):
+    """
+    Generates answers using Phi-2.
+    reasoning_mode=True  → reasoning + external knowledge
+    reasoning_mode=False → strict chunk-only factual mode
+    """
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
 
-    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
-    prompt = PROMPT_TEMPLATE.format(context=context, query=query)
+    context = "\n".join([chunk.strip() for chunk in retrieved_chunks])
+    prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)
 
    try:
-        # ✅ Limit tokens to speed up inference
        result = _answer_model(
            prompt,
-            max_new_tokens=120,  # reduced for faster completion
+            max_new_tokens=180,
+            temperature=0.4 if reasoning_mode else 0.2,
            do_sample=False,
-            early_stopping=True,
-            pad_token_id=_tokenizer.eos_token_id,
        )
-        answer = result[0]["generated_text"].strip()
-
-        # Clean excessive prompt echo
-        if "Answer:" in answer:
-            answer = answer.split("Answer:")[-1].strip()
-
-        return answer
-
+        return result[0]["generated_text"].split("ANSWER:")[-1].strip()
    except Exception as e:
        print(f"⚠️ Generation failed: {e}")
-        return "⚠️ Error: Could not generate an answer at the moment."
+        return "⚠️ Error: Could not generate an answer."
 
 # ==========================================================
-# 7️⃣ Local Test
+# 8️⃣ Local Test (Optional)
 # ==========================================================
 if __name__ == "__main__":
-    from vectorstore import build_faiss_index
    dummy_chunks = [
        "Step 1: Open the dashboard and navigate to reports.",
-        "Step 2: Click 'Export' to download a CSV summary.",
-        "Step 3: Review the generated report in your downloads folder."
+        "Step 2: Click 'Export' to download a CSV summary."
    ]
-    embeddings = [
-        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-        for chunk in dummy_chunks
-    ]
-    index = build_faiss_index(embeddings)
-
-    query = "What are the steps to export a report?"
-    retrieved = retrieve_chunks(query, index, dummy_chunks)
-    print("🔍 Retrieved:", retrieved)
-    print("💬 Answer:", generate_answer(query, retrieved))
+    from vectorstore import build_faiss_index
+    index = build_faiss_index([
+        _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+        for c in dummy_chunks
+    ])
+    query = "How to export a report?"
+    print("💬 Strict:", generate_answer(query, dummy_chunks, reasoning_mode=False))
+    print("💬 Reasoning:", generate_answer(query, dummy_chunks, reasoning_mode=True))