Shubham170793 committed
Commit 41ac7b0 · verified · 1 Parent(s): 66bfc48

Update src/qa.py

Files changed (1)
  1. src/qa.py +124 -98
src/qa.py CHANGED
@@ -1,22 +1,23 @@
  """
- qa.py — Phi-2 Fast + Smart Reasoning Mode (CPU-only Stable)
- -----------------------------------------------------------
- Uses intfloat/e5-small-v2 for embeddings
- Uses microsoft/phi-2 (CPU-only, no GPU / quantization)
- Reasoning Mode toggle integrated cleanly
- Retrieval and chunking unchanged
+ qa.py — Phi-2 FAST + RERANKED RETRIEVAL
+ --------------------------------------
+ Uses:
+     intfloat/e5-small-v2 — embeddings
+     microsoft/phi-2 — generation
+ Optimized for: speed, factual accuracy, and semantic retrieval on Hugging Face Spaces
  """

  import os
  import numpy as np
- import torch
  from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch

- print("✅ qa.py (Phi-2 CPU) loaded from:", __file__)
+ print("✅ qa.py (Phi-2 FAST + ReRank) loaded from:", __file__)

  # ==========================================================
- # 1️⃣ Cache Setup
+ # 1️⃣ Cache Setup (Hugging Face /tmp cache)
  # ==========================================================
  CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)
@@ -26,6 +27,7 @@ os.environ.update({
      "HF_DATASETS_CACHE": CACHE_DIR,
      "HF_MODULES_CACHE": CACHE_DIR
  })
+ print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

  # ==========================================================
  # 2️⃣ Embedding Model
@@ -34,119 +36,145 @@ try:
      _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
      print("✅ Loaded embedding model: intfloat/e5-small-v2")
  except Exception as e:
-     print(f"⚠️ Fallback to MiniLM due to {e}")
+     print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
      _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)

  # ==========================================================
- # 3️⃣ Phi-2 Model (CPU-only, no quantization)
+ # 3️⃣ Phi-2 LLM Setup
  # ==========================================================
- try:
-     MODEL_NAME = "microsoft/phi-2"
-     print(f"✅ Loading LLM: {MODEL_NAME} (CPU mode)")
-
-     _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-
-     _model = AutoModelForCausalLM.from_pretrained(
-         MODEL_NAME,
-         cache_dir=CACHE_DIR,
-         torch_dtype=torch.float32, # safest for CPU
-         low_cpu_mem_usage=True,
-     ).to("cpu")
-
-     _answer_model = pipeline(
-         "text-generation",
-         model=_model,
-         tokenizer=_tokenizer,
-         device=-1, # Force CPU
-         model_kwargs={"low_cpu_mem_usage": True},
-     )
-
-     print("✅ Phi-2 text-generation pipeline ready (CPU).")
-
- except Exception as e:
-     print(f"⚠️ Phi-2 load failed: {e}")
-     _answer_model = None
+ MODEL_NAME = "microsoft/phi-2"
+ print(f"✅ Loading LLM: {MODEL_NAME}")
+
+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+ _model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     cache_dir=CACHE_DIR,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
+     low_cpu_mem_usage=True,
+ ).to("cpu")
+
+ _answer_model = pipeline(
+     "text-generation",
+     model=_model,
+     tokenizer=_tokenizer,
+     device=-1,
+     model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
+ )
+ print("✅ Phi-2 text-generation pipeline ready (optimized).")

  # ==========================================================
- # 4️⃣ Prompt Templates
+ # 4️⃣ Prompt Template
  # ==========================================================
  STRICT_PROMPT = (
-     "Answer based ONLY on the context below.\n"
-     "If the answer isn’t in the context, say: 'I don't know based on the provided document.'\n\n"
+     "You are an enterprise documentation assistant.\n"
+     "Answer factually using ONLY the context below.\n"
+     "If the answer isn’t present, reply exactly:\n"
+     "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  REASONING_PROMPT = (
-     "You are an expert assistant. Use the context and your reasoning ability to form a clear, step-by-step answer.\n"
-     "Be concise yet complete. If the context doesn’t contain the answer, say: 'I don't know based on the provided document.'\n\n"
+     "You are an expert enterprise assistant with reasoning ability.\n"
+     "Think carefully about the context and question.\n"
+     "Use world knowledge and inference if necessary, but prefer factual accuracy.\n"
+     "If the document lacks the answer, say:\n"
+     "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  # ==========================================================
- # 5️⃣ Retrieval (unchanged)
+ # 5️⃣ Retrieve Chunks (FAISS + Rerank + Neighbor Expansion)
  # ==========================================================
- def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5):
+ def retrieve_chunks(
+     query: str,
+     index,
+     chunks: list,
+     top_k: int = 3,
+     topn_candidates: int = 20,
+     neighbor_threshold: float = 0.68,
+     expansion_window: int = 1,
+     max_context_chunks: int = 6,
+ ):
+     """Retrieve semantically relevant chunks with reranking and neighbor expansion."""
      if not index or not chunks:
          return []
-     try:
-         q_emb = _query_model.encode(
-             [f"query: {query.strip()}"],
-             convert_to_numpy=True,
-             normalize_embeddings=True
-         )[0]
-         distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
-         selected = set()
-         for idx in indices[0]:
-             for i in range(max(0, idx - 1), min(len(chunks), idx + 2)):
-                 selected.add(i)
-         return [chunks[i] for i in sorted(selected)]
-     except Exception as e:
-         print(f"⚠️ Retrieval error: {e}")
-         return []
+
+     # 1️⃣ Encode query (normalized)
+     query_emb = _query_model.encode(
+         [f"query: {query.strip()}"],
+         convert_to_numpy=True,
+         normalize_embeddings=True
+     )[0].astype("float32")
+
+     # 2️⃣ FAISS search (initial candidates)
+     topn_candidates = min(topn_candidates, getattr(index, "ntotal", topn_candidates))
+     _, candidate_ids = index.search(np.array([query_emb]).astype("float32"), topn_candidates)
+     candidate_ids = [int(i) for i in candidate_ids[0] if i != -1]
+
+     # 3️⃣ Re-encode candidate chunks and compute cosine similarities
+     candidate_texts = [chunks[i] for i in candidate_ids]
+     candidate_vecs = np.array([
+         _query_model.encode([t], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for t in candidate_texts
+     ])
+     sims = cosine_similarity([query_emb], candidate_vecs)[0]
+     sorted_idx = np.argsort(sims)[::-1]
+     reranked_ids = [candidate_ids[i] for i in sorted_idx]
+
+     # 4️⃣ Select top-k base chunks
+     selected, selected_set = [], set()
+     for rid in reranked_ids:
+         if len(selected) >= top_k:
+             break
+         selected.append(rid)
+         selected_set.add(rid)
+
+     # 5️⃣ Conditional neighbor expansion
+     final_order = list(selected)
+     for base_id in selected:
+         if len(final_order) >= max_context_chunks:
+             break
+         for offset in range(1, expansion_window + 1):
+             for neighbor in (base_id - offset, base_id + offset):
+                 if neighbor < 0 or neighbor >= len(chunks) or neighbor in selected_set:
+                     continue
+                 # Check semantic closeness
+                 neighbor_vec = _query_model.encode([chunks[neighbor]], convert_to_numpy=True, normalize_embeddings=True)[0]
+                 sim = float(cosine_similarity([query_emb], [neighbor_vec])[0][0])
+                 if sim >= neighbor_threshold:
+                     final_order.append(neighbor)
+                     selected_set.add(neighbor)
+                 if len(final_order) >= max_context_chunks:
+                     break
+             if len(final_order) >= max_context_chunks:
+                 break
+
+     return [chunks[i] for i in final_order]

  # ==========================================================
- # 6️⃣ Answer Generation (CPU Stable)
+ # 6️⃣ Answer Generation
  # ==========================================================
  def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
+     """Generate concise, factual or reasoning-based answers using Phi-2."""
      if not retrieved_chunks:
          return "Sorry, I couldn’t find relevant information in the document."

      context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
-     context = context[:2500] # keep context short to avoid overflow & massive slowdowns
-     print(f"🧩 Context length (chars): {len(context)}, chunks used: {len(retrieved_chunks)}")
-
-
-     reasoning_prompt = (
-         "You are an expert assistant for enterprise document understanding.\n"
-         "Use the context below and your reasoning ability to form a complete, explanatory answer.\n"
-         "If the context doesn’t contain the answer, you can logically infer based on general knowledge, "
-         "but mention that explicitly.\n\n"
-         "Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"
+     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
+         context=context, query=query
      )

-     strict_prompt = (
-         "You are an assistant that must answer only using the information in the provided context.\n"
-         "If the context does not contain relevant information, respond exactly:\n"
-         "'I don't know based on the provided document.'\n\n"
-         "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
-     )
-
-     prompt = (reasoning_prompt if reasoning_mode else strict_prompt).format(context=context, query=query)
-
      try:
          result = _answer_model(
              prompt,
-             max_new_tokens=120 if not reasoning_mode else 180,
-             temperature=0.2 if not reasoning_mode else 0.4,
-             do_sample=False,
+             max_new_tokens=180 if reasoning_mode else 120,
+             temperature=0.6 if reasoning_mode else 0.3,
+             do_sample=reasoning_mode,
+             early_stopping=True,
              pad_token_id=_tokenizer.eos_token_id,
          )
-
-         raw = result[0]["generated_text"]
-         if "Answer:" in raw:
-             raw = raw.split("Answer:")[-1].strip()
-         return raw.strip()
-
+         text = result[0]["generated_text"].strip()
+         return text.split("Answer:")[-1].strip() if "Answer:" in text else text
      except Exception as e:
          print(f"⚠️ Generation failed: {e}")
          return "⚠️ Error: Could not generate an answer."
@@ -156,22 +184,20 @@ def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = F
  # ==========================================================
  if __name__ == "__main__":
      from vectorstore import build_faiss_index
+
      dummy_chunks = [
          "Step 1: Open the dashboard and navigate to reports.",
          "Step 2: Click 'Export' to download a CSV summary.",
-         "Step 3: Review the generated report in your downloads folder."
+         "Step 3: Review the generated report in your downloads folder.",
+         "Appendix: Communication user creation steps are explained later in this guide."
      ]
      embeddings = [
-         _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-         for chunk in dummy_chunks
+         _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for c in dummy_chunks
      ]
      index = build_faiss_index(embeddings)

-     query = "What are the steps to export a report?"
+     query = "How do I create a communication user?"
      retrieved = retrieve_chunks(query, index, dummy_chunks)
-
-     print("\n--- Strict Mode ---")
-     print(generate_answer(query, retrieved, reasoning_mode=False))
-
-     print("\n--- Reasoning Mode ---")
-     print(generate_answer(query, retrieved, reasoning_mode=True))
+     print("🔍 Retrieved:", retrieved)
+     print("💬 Answer:", generate_answer(query, retrieved))