Shubham170793 committed
Commit d7aaa8f (verified)
1 Parent(s): 00be68d

Update src/qa.py

Files changed (1):
  src/qa.py (+61 -83)
src/qa.py CHANGED
@@ -1,24 +1,22 @@
 """
-qa.py — Retrieval + Generation Layer (Mistral Optimized v2)
------------------------------------------------------------
-Handles:
-Query embedding (SentenceTransformer / E5)
-Fast FAISS retrieval with context merging
-Answer generation via Mistral-7B-Instruct (optimized for CPU)
------------------------------------------------------------
-Built for Hugging Face Spaces / Streamlit apps.
+qa.py — Retrieval + Generation (Phi-2 Fast Reasoning)
+-----------------------------------------------------
+Uses:
+- intfloat/e5-small-v2 for embeddings
+- microsoft/phi-2 as main LLM (fast, strong reasoning)
+- Optional fallback: google/flan-t5-base
+Optimized for CPU inference (Hugging Face Spaces / Streamlit)
 """
 
 import os
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
-print("✅ qa.py (Mistral Optimized v2) loaded from:", __file__)
+print("✅ qa.py (Phi-2 optimized) loaded from:", __file__)
 
 # ==========================================================
-# 1️⃣ Hugging Face Cache Setup
+# 1️⃣ Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -28,141 +26,121 @@ os.environ.update({
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HF_MODULES_CACHE": CACHE_DIR
 })
-print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
 
 # ==========================================================
-# 2️⃣ Query Embedding Model (E5-small, lightweight)
+# 2️⃣ Embedding Model
 # ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
-    print("✅ Loaded query model: intfloat/e5-small-v2")
+    print("✅ Loaded embedding model: intfloat/e5-small-v2")
 except Exception as e:
-    print(f"⚠️ Embedding model load failed ({e}), using MiniLM fallback.")
+    print(f"⚠️ Fallback to MiniLM due to {e}")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 
 # ==========================================================
-# 3️⃣ LLM Setup: Mistral-7B-Instruct (quantized + optimized)
+# 3️⃣ Phi-2 LLM Setup
 # ==========================================================
-MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # slightly faster and stable
-print(f"✅ Loading LLM: {MODEL_NAME}")
-
-_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-_model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    cache_dir=CACHE_DIR,
-    torch_dtype="auto",
-    device_map="auto",
-    low_cpu_mem_usage=True,
-)
-_answer_model = pipeline(
-    "text-generation",
-    model=_model,
-    tokenizer=_tokenizer,
-    max_new_tokens=600,
-    do_sample=False,
-)
-print("✅ Mistral text-generation pipeline ready.")
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+
+try:
+    MODEL_NAME = "microsoft/phi-2"
+    print(f"✅ Loading LLM: {MODEL_NAME}")
+    _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+    _model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        cache_dir=CACHE_DIR,
+        torch_dtype="auto",
+        low_cpu_mem_usage=True,
+    )
+    _answer_model = pipeline(
+        "text-generation",
+        model=_model,
+        tokenizer=_tokenizer,
+        device=-1,
+        max_new_tokens=250,
+        do_sample=False,
+    )
+    print("✅ Phi-2 generation pipeline ready.")
+except Exception as e:
+    print(f"⚠️ Phi-2 load failed: {e}")
+    _answer_model = None
 
 # ==========================================================
-# 4️⃣ Prompt Template (compact + efficient)
+# 4️⃣ Prompt Template
 # ==========================================================
 PROMPT_TEMPLATE = (
-    "Answer the question using only the document context below. "
-    "If the answer isn’t clearly in the document, say: "
+    "You are an expert assistant for enterprise document understanding.\n"
+    "Use ONLY the context below to answer the question clearly and factually.\n"
+    "If the context doesn’t contain the answer, reply: "
     "'I don't know based on the provided document.'\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 
 # ==========================================================
-# 5️⃣ Fast Chunk Retrieval with Context Merging
+# 5️⃣ Retrieval Function
 # ==========================================================
-def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5, merge_window: int = 1):
-    """
-    Fast semantic retrieval with lightweight neighborhood expansion.
-    Retrieves top-K relevant chunks, then merges nearby ones for context continuity.
-    """
+def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
+    """Fast FAISS retrieval with E5 embeddings."""
     if not index or not chunks:
         return []
 
     try:
-        # Step 1: Encode query once
-        query_emb = _query_model.encode(
-            [f"query: {query.strip()}"],
-            convert_to_numpy=True,
-            normalize_embeddings=True
-        )[0]
-
-        # Step 2: Retrieve top-K*2 candidates
-        distances, indices = index.search(np.array([query_emb]).astype("float32"), top_k * 2)
+        q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
 
-        # Step 3: Expand retrieval to nearby chunks
+        # Merge nearby chunks for continuity
         selected = set()
         for idx in indices[0]:
-            for n in range(max(0, idx - merge_window), min(len(chunks), idx + merge_window + 1)):
-                selected.add(n)
-
-        # Step 4: Preserve order (important for sequential text like steps)
-        ordered = [chunks[i] for i in sorted(selected)]
-        return ordered
-
+            for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
+                selected.add(i)
+
+        ordered_chunks = [chunks[i] for i in sorted(selected)]
+        return ordered_chunks
     except Exception as e:
         print(f"⚠️ Retrieval error: {e}")
         return []
 
 # ==========================================================
-# 6️⃣ Answer Generation Function (Faster + Cleaner Output)
+# 6️⃣ Answer Generation Function
 # ==========================================================
 def generate_answer(query: str, retrieved_chunks: list):
-    """Generate factual, context-grounded answers using Mistral."""
+    """Generate grounded answers using Phi-2."""
    if not retrieved_chunks:
        return "Sorry, I couldn’t find relevant information in the document."
 
-    # Merge retrieved chunks
     context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
     prompt = PROMPT_TEMPLATE.format(context=context, query=query)
 
     try:
         result = _answer_model(
             prompt,
-            max_new_tokens=700,
-            temperature=None,
+            max_new_tokens=250,
             do_sample=False,
+            early_stopping=True,
             pad_token_id=_tokenizer.eos_token_id,
         )
         answer = result[0]["generated_text"].strip()
-
-        # Cleanup redundant prompt echo
-        if "Question:" in answer:
-            answer = answer.split("Question:")[-1].strip()
-        if answer.startswith(query):
-            answer = answer[len(query):].strip()
-
         return answer
-
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer at the moment."
 
 # ==========================================================
-# 7️⃣ Local Dev Test (optional)
+# 7️⃣ Local Test (optional)
 # ==========================================================
 if __name__ == "__main__":
+    from vectorstore import build_faiss_index
     dummy_chunks = [
         "Step 1: Open the dashboard and navigate to reports.",
         "Step 2: Click 'Export' to download a CSV summary.",
         "Step 3: Review the generated report in your downloads folder."
     ]
-    from vectorstore import build_faiss_index
 
-    index = build_faiss_index([
-        _query_model.encode(
-            [f"passage: {chunk}"],
-            convert_to_numpy=True,
-            normalize_embeddings=True
-        )[0]
+    embeddings = [
+        _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
         for chunk in dummy_chunks
-    ])
-
+    ]
+    index = build_faiss_index(embeddings)
     query = "What are the steps to export a report?"
     retrieved = retrieve_chunks(query, index, dummy_chunks)
     print("🔍 Retrieved:", retrieved)
 
 
40
  # ==========================================================
41
+ # 3️⃣ Phi-2 LLM Setup
42
  # ==========================================================
43
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
44
+
45
+ try:
46
+ MODEL_NAME = "microsoft/phi-2"
47
+ print(f"✅ Loading LLM: {MODEL_NAME}")
48
+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
49
+ _model = AutoModelForCausalLM.from_pretrained(
50
+ MODEL_NAME,
51
+ cache_dir=CACHE_DIR,
52
+ torch_dtype="auto",
53
+ low_cpu_mem_usage=True,
54
+ )
55
+ _answer_model = pipeline(
56
+ "text-generation",
57
+ model=_model,
58
+ tokenizer=_tokenizer,
59
+ device=-1,
60
+ max_new_tokens=250,
61
+ do_sample=False,
62
+ )
63
+ print("✅ Phi-2 generation pipeline ready.")
64
+ except Exception as e:
65
+ print(f"⚠️ Phi-2 load failed: {e}")
66
+ _answer_model = None
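The module docstring advertises google/flan-t5-base as an optional fallback, but the except branch only sets _answer_model = None, which later crashes generate_answer with a TypeError if Phi-2 fails to load. A hypothetical sketch of the advertised fallback (not in this commit); flan-t5 is an encoder-decoder model, so it needs the text2text-generation task:

    from transformers import pipeline

    def load_answer_model(cache_dir: str = "/tmp/hf_cache"):
        """Try Phi-2 first; otherwise fall back to flan-t5-base."""
        try:
            return pipeline(
                "text-generation", model="microsoft/phi-2",
                device=-1, max_new_tokens=250, do_sample=False,
                model_kwargs={"cache_dir": cache_dir},
            )
        except Exception as e:
            print(f"⚠️ Phi-2 load failed: {e}; falling back to flan-t5-base")
            # flan-t5 is seq2seq, hence text2text-generation; its output
            # contains only the answer, with no prompt echo.
            return pipeline(
                "text2text-generation", model="google/flan-t5-base",
                device=-1, max_new_tokens=250,
                model_kwargs={"cache_dir": cache_dir},
            )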
 
79
  # ==========================================================
80
+ # 5️⃣ Retrieval Function
81
  # ==========================================================
82
+ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
83
+ """Fast FAISS retrieval with E5 embeddings."""
 
 
 
84
  if not index or not chunks:
85
  return []
86
 
87
  try:
88
+ q_emb = _query_model.encode([f"query: {query.strip()}"], convert_to_numpy=True, normalize_embeddings=True)[0]
89
+ distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
 
 
 
 
 
 
 
90
 
91
+ # Merge nearby chunks for continuity
92
  selected = set()
93
  for idx in indices[0]:
94
+ for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
95
+ selected.add(i)
 
 
 
 
96
 
97
+ ordered_chunks = [chunks[i] for i in sorted(selected)]
98
+ return ordered_chunks
99
  except Exception as e:
100
  print(f"⚠️ Retrieval error: {e}")
101
  return []
102
 
103
  # ==========================================================
104
+ # 6️⃣ Answer Generation Function
105
  # ==========================================================
106
  def generate_answer(query: str, retrieved_chunks: list):
107
+ """Generate grounded answers using Phi-2."""
108
  if not retrieved_chunks:
109
  return "Sorry, I couldn’t find relevant information in the document."
110
 
 
111
  context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
112
  prompt = PROMPT_TEMPLATE.format(context=context, query=query)
113
 
114
  try:
115
  result = _answer_model(
116
  prompt,
117
+ max_new_tokens=250,
 
118
  do_sample=False,
119
+ early_stopping=True,
120
  pad_token_id=_tokenizer.eos_token_id,
121
  )
122
  answer = result[0]["generated_text"].strip()
 
 
 
 
 
 
 
123
  return answer
 
124
  except Exception as e:
125
  print(f"⚠️ Generation failed: {e}")
126
  return "⚠️ Error: Could not generate an answer at the moment."