Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 commited on Oct 11

Commit

874e5e3

verified ·

1 Parent(s): d1ca01c

Update src/qa.py

Browse files

Files changed (1) hide show

src/qa.py +36 -25

src/qa.py CHANGED Viewed

@@ -1,19 +1,20 @@
 """
-qa.py — Retrieval + Generation (Phi-2 Fast Reasoning)
------------------------------------------------------
 Uses:
- - intfloat/e5-small-v2 for embeddings
- - microsoft/phi-2 as main LLM (fast, strong reasoning)
- - Optional fallback: google/flan-t5-base
-Optimized for CPU inference (Hugging Face Spaces / Streamlit)
 """
 import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)
 # ==========================================================
 # 1️⃣ Cache Setup
@@ -38,29 +39,33 @@ except Exception as e:
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 # ==========================================================
-# 3️⃣ Phi-2 LLM Setup
 # ==========================================================
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 try:
     MODEL_NAME = "microsoft/phi-2"
-    print(f"✅ Loading LLM: {MODEL_NAME}")
     _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
     _model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         cache_dir=CACHE_DIR,
-        torch_dtype="auto",
         low_cpu_mem_usage=True,
-    )
     _answer_model = pipeline(
         "text-generation",
         model=_model,
         tokenizer=_tokenizer,
         device=-1,
-        max_new_tokens=250,
-        do_sample=False,
     )
-    print("✅ Phi-2 generation pipeline ready.")
 except Exception as e:
     print(f"⚠️ Phi-2 load failed: {e}")
     _answer_model = None
@@ -71,16 +76,16 @@ except Exception as e:
 PROMPT_TEMPLATE = (
     "You are an expert assistant for enterprise document understanding.\n"
     "Use ONLY the context below to answer the question clearly and factually.\n"
-    "If the context doesn’t contain the answer, reply: "
     "'I don't know based on the provided document.'\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 # ==========================================================
-# 5️⃣ Retrieval Function
 # ==========================================================
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
-    """Fast FAISS retrieval with E5 embeddings."""
     if not index or not chunks:
         return []
@@ -88,7 +93,6 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
         q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
         distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
-        # Merge nearby chunks for continuity
         selected = set()
         for idx in indices[0]:
             for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
@@ -101,10 +105,10 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
         return []
 # ==========================================================
-# 6️⃣ Answer Generation Function
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list):
-    """Generate grounded answers using Phi-2."""
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
@@ -112,21 +116,28 @@ def generate_answer(query: str, retrieved_chunks: list):
     prompt = PROMPT_TEMPLATE.format(context=context, query=query)
     try:
         result = _answer_model(
             prompt,
-            max_new_tokens=250,
             do_sample=False,
             early_stopping=True,
             pad_token_id=_tokenizer.eos_token_id,
         )
         answer = result[0]["generated_text"].strip()
         return answer
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer at the moment."
 # ==========================================================
-# 7️⃣ Local Test (optional)
 # ==========================================================
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
@@ -135,12 +146,12 @@ if __name__ == "__main__":
         "Step 2: Click 'Export' to download a CSV summary.",
         "Step 3: Review the generated report in your downloads folder."
     ]
     embeddings = [
         _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
         for chunk in dummy_chunks
     ]
     index = build_faiss_index(embeddings)
     query = "What are the steps to export a report?"
     retrieved = retrieve_chunks(query, index, dummy_chunks)
     print("🔍 Retrieved:", retrieved)

 """
+qa.py — Optimized Phi-2 Retrieval + Generation
+----------------------------------------------
 Uses:
+• intfloat/e5-small-v2 for embeddings
+• microsoft/phi-2 for reasoning-rich generation (fast on CPU)
+Optimized for: speed + stability in Streamlit / Hugging Face Spaces
 """
 import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import torch
+print("✅ qa.py (Phi-2 optimized fast) loaded from:", __file__)
 # ==========================================================
 # 1️⃣ Cache Setup
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 # ==========================================================
+# 3️⃣ Phi-2 LLM Setup (Quantized for CPU)
 # ==========================================================
 try:
     MODEL_NAME = "microsoft/phi-2"
+    print(f"✅ Loading LLM: {MODEL_NAME} (quantized, CPU-optimized)")
     _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+    # ✅ Load model in mixed precision for 4–6× faster inference
     _model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         cache_dir=CACHE_DIR,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
         low_cpu_mem_usage=True,
+    ).to("cpu")
+    # ✅ Create generation pipeline (keep in memory)
     _answer_model = pipeline(
         "text-generation",
         model=_model,
         tokenizer=_tokenizer,
         device=-1,
+        model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
     )
+    print("✅ Phi-2 text-generation pipeline ready (optimized).")
 except Exception as e:
     print(f"⚠️ Phi-2 load failed: {e}")
     _answer_model = None
 PROMPT_TEMPLATE = (
     "You are an expert assistant for enterprise document understanding.\n"
     "Use ONLY the context below to answer the question clearly and factually.\n"
+    "If the context doesn’t contain the answer, reply exactly:\n"
     "'I don't know based on the provided document.'\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 # ==========================================================
+# 5️⃣ Retrieve Top-K Chunks
 # ==========================================================
 def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
+    """Efficient FAISS retrieval using cosine similarity."""
     if not index or not chunks:
         return []
         q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
         distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
         selected = set()
         for idx in indices[0]:
             for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
         return []
 # ==========================================================
+# 6️⃣ Answer Generation (fast)
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list):
+    """Generate concise, grounded answers using Phi-2."""
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
     prompt = PROMPT_TEMPLATE.format(context=context, query=query)
     try:
+        # ✅ Limit tokens to speed up inference
         result = _answer_model(
             prompt,
+            max_new_tokens=120,  # reduced for faster completion
             do_sample=False,
             early_stopping=True,
             pad_token_id=_tokenizer.eos_token_id,
         )
         answer = result[0]["generated_text"].strip()
+        # Clean excessive prompt echo
+        if "Answer:" in answer:
+            answer = answer.split("Answer:")[-1].strip()
         return answer
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer at the moment."
 # ==========================================================
+# 7️⃣ Local Test
 # ==========================================================
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
         "Step 2: Click 'Export' to download a CSV summary.",
         "Step 3: Review the generated report in your downloads folder."
     ]
     embeddings = [
         _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
         for chunk in dummy_chunks
     ]
     index = build_faiss_index(embeddings)
     query = "What are the steps to export a report?"
     retrieved = retrieve_chunks(query, index, dummy_chunks)
     print("🔍 Retrieved:", retrieved)