Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Community

Shubham170793 committed on Oct 14

Commit

28eda6f

verified ·

1 Parent(s): 960fe58

Update src/qa.py

Browse files

Files changed (1) hide show

src/qa.py +148 -102

src/qa.py CHANGED Viewed

@@ -1,14 +1,11 @@
 """
-qa.py — Phi-2 FAST + RERANKED RETRIEVAL + INTENT WEIGHTING (with Debug)
------------------------------------------------------------------------
-Uses:
-• intfloat/e5-small-v2  — embeddings
-• microsoft/phi-2       — generation
-Optimized for: speed, factual accuracy, and semantic retrieval on Hugging Face Spaces
-Now includes:
-• Intent-weighted query embedding
-• Intent-aware prompting (LLM focuses on “how”, “what”, “why”)
-• Debug printout showing detected query intent for verification
 """
 import os
@@ -18,11 +15,11 @@ from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
-print("✅ qa.py (Phi-2 FAST + ReRank + Intent + Debug) loaded from:", __file__)
-# ==========================================================
-# 1️⃣ Cache Setup (Hugging Face /tmp cache)
-# ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ.update({
@@ -33,9 +30,9 @@ os.environ.update({
 })
 print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
-# ==========================================================
-# 2️⃣ Embedding Model
-# ==========================================================
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
     print("✅ Loaded embedding model: intfloat/e5-small-v2")
@@ -43,9 +40,9 @@ except Exception as e:
     print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
-# ==========================================================
-# 3️⃣ Phi-2 LLM Setup
-# ==========================================================
 MODEL_NAME = "microsoft/phi-2"
 print(f"✅ Loading LLM: {MODEL_NAME}")
@@ -66,145 +63,194 @@ _answer_model = pipeline(
 )
 print("✅ Phi-2 text-generation pipeline ready (optimized).")
-# ==========================================================
-# 4️⃣ Prompt Templates (intent-aware)
-# ==========================================================
 STRICT_PROMPT = (
     "You are an enterprise documentation assistant.\n"
-    "Understand the intent of the question before answering:\n"
-    "• If it asks 'how', focus only on step-by-step or procedural instructions.\n"
-    "• If it asks 'what', provide definitions or factual explanations.\n"
-    "• If it asks 'why', explain reasons or purposes.\n"
-    "Use ONLY the provided context below to answer factually.\n"
-    "If the answer isn’t present, reply exactly:\n"
-    "'I don't know based on the provided document.'\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 REASONING_PROMPT = (
-    "You are an expert enterprise assistant with reasoning ability.\n"
-    "Think carefully about the context and question intent.\n"
-    "If it's procedural, outline steps clearly.\n"
-    "If it's conceptual, explain in detail.\n"
-    "Prefer factual accuracy but you may infer if clearly implied.\n"
     "If the document lacks the answer, say:\n"
-    "'I don't know based on the provided document.'\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
-# ==========================================================
-# 5️⃣ Retrieve Chunks (FAISS + Rerank + Intent-weighting + Debug)
-# ==========================================================
-def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5, min_similarity: float = 0.6):
     """
-    Hybrid retrieval:
-    1️⃣ Detect query intent and embed accordingly.
-    2️⃣ Get semantic top-K chunks via FAISS.
-    3️⃣ Re-rank by cosine similarity and apply a minimum similarity filter.
-    4️⃣ If fewer than top_k remain, fill remaining seats with adjacent chunks (±1) for continuity.
     """
     if not index or not chunks:
         return []
     try:
-        # 🔍 Detect and encode query intent
-        intent_hint = ""
-        query_type = "factual"
-        if any(kw in query.lower() for kw in ["how", "create", "steps", "procedure", "setup", "configure"]):
-            query_type = "procedural"
-            intent_hint = " This is an instructional query; focus on procedure and step-by-step instructions."
-        elif any(kw in query.lower() for kw in ["why", "reason", "purpose", "benefit"]):
-            query_type = "conceptual"
-            intent_hint = " This is a conceptual query; focus on rationale and explanation."
-        print(f"🧩 Detected query type: {query_type}")
         q_emb = _query_model.encode(
-            [f"query: {query.strip()}{intent_hint}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
-        # Step 1️⃣ — FAISS initial retrieval
-        distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 2)
-        retrieved_indices = list(indices[0])
-        # Step 2️⃣ — Compute cosine similarity for re-ranking
-        retrieved_texts = [chunks[i] for i in retrieved_indices]
         doc_embs = _query_model.encode(
-            [f"passage: {c}" for c in retrieved_texts],
             convert_to_numpy=True,
             normalize_embeddings=True
         )
         sims = cosine_similarity([q_emb], doc_embs)[0]
-        ranked = sorted(zip(retrieved_indices, sims), key=lambda x: x[1], reverse=True)
-        # Step 3️⃣ — Apply minimum similarity filter
-        filtered_indices = [idx for idx, score in ranked if score >= min_similarity]
-        # Step 4️⃣ — If not enough, add ±1 neighbors for continuity
-        if len(filtered_indices) < top_k:
-            expanded_indices = set(filtered_indices)
-            for idx in filtered_indices:
-                for neighbor in [idx - 1, idx + 1]:
-                    if 0 <= neighbor < len(chunks):
-                        expanded_indices.add(neighbor)
-                        if len(expanded_indices) >= top_k:
-                            break
-                if len(expanded_indices) >= top_k:
                     break
-            filtered_indices = list(sorted(expanded_indices))[:top_k]
-        # Step 5️⃣ — Build final ordered list of chunks
-        final_chunks = [chunks[i] for i in filtered_indices]
-        print(f"✅ Retrieved {len(final_chunks)} chunks (intent-weighted, semantic + neighbor fill).")
         return final_chunks
     except Exception as e:
         print(f"⚠️ Retrieval error: {e}")
         return []
-# ==========================================================
-# 6️⃣ Answer Generation
-# ==========================================================
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
-    """Generate concise, factual or reasoning-based answers using Phi-2."""
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
-    context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
         context=context, query=query
     )
     try:
-        # 🧠 Adaptive length for factual mode (based on question complexity)
         if reasoning_mode:
-            max_tokens = 180  # keep reasoning slightly longer
         else:
-            max_tokens = 120 if len(query.split()) < 6 else 180  # short factual queries stay fast
         result = _answer_model(
             prompt,
-            max_new_tokens=max_tokens,
-            temperature=0.6 if reasoning_mode else 0.3,
-            do_sample=reasoning_mode,
             early_stopping=True,
             pad_token_id=_tokenizer.eos_token_id,
         )
-        text = result[0]["generated_text"].strip()
-        return text.split("Answer:")[-1].strip() if "Answer:" in text else text
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer."
-# ==========================================================
-# 7️⃣ Local Test
-# ==========================================================
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
@@ -221,6 +267,6 @@ if __name__ == "__main__":
     index = build_faiss_index(embeddings)
     query = "How do I create a communication user?"
-    retrieved = retrieve_chunks(query, index, dummy_chunks)
     print("🔍 Retrieved:", retrieved)
-    print("💬 Answer:", generate_answer(query, retrieved))

 """
+qa.py — Phi-2 FAST + ReRank (stable) — Prefer semantic ranking, neighbor-fill last-resort
+---------------------------------------------------------------------------------------
+- Uses intfloat/e5-small-v2 for embeddings
+- Uses microsoft/phi-2 for generation
+- Re-ranks candidate pool from FAISS then picks top_k by true cosine similarity
+- Neighbor expansion only if not enough high-sim items
+- Logs chunk indices + similarity scores for debugging
 """
 import os
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
+print("✅ qa.py (Phi-2 FAST + ReRank stable) loaded from:", __file__)
+# ---------------------------
+# Cache
+# ---------------------------
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ.update({
 })
 print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
+# ---------------------------
+# Embeddings
+# ---------------------------
 try:
     _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
     print("✅ Loaded embedding model: intfloat/e5-small-v2")
     print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
     _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
+# ---------------------------
+# Phi-2 model
+# ---------------------------
 MODEL_NAME = "microsoft/phi-2"
 print(f"✅ Loading LLM: {MODEL_NAME}")
 )
 print("✅ Phi-2 text-generation pipeline ready (optimized).")
+# ---------------------------
+# Prompts
+# ---------------------------
 STRICT_PROMPT = (
     "You are an enterprise documentation assistant.\n"
+    "Use ONLY the CONTEXT chunks below to answer the QUESTION.\n"
+    "Cite the chunk number(s) you used, e.g. [Chunk 3].\n"
+    "If the document does not contain the answer, reply exactly:\n"
+    "\"I don't know based on the provided document.\"\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
 REASONING_PROMPT = (
+    "You are an expert enterprise assistant with reasoning capacity.\n"
+    "Prefer the provided CONTEXT but you may cautiously infer when reasonable.\n"
+    "If you infer, say so and prefer facts from the document.\n"
     "If the document lacks the answer, say:\n"
+    "\"I don't know based on the provided document.\"\n\n"
     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
 )
+# ---------------------------
+# Retrieval: FAISS -> rerank -> neighbor fill (last resort)
+# ---------------------------
+def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3, min_similarity: float = 0.55, candidate_multiplier: int = 4):
     """
+    Steps:
+    1. Encode query (E5 style).
+    2. Run FAISS search for max(top_k * candidate_multiplier, top_k + 2) candidates.
+    3. Re-embed those candidate texts and compute cosine similarity with query embedding.
+    4. Sort by similarity and pick top_k where similarity >= min_similarity.
+    5. If fewer than top_k passed threshold, fill remaining slots by:
+       - selecting neighboring chunks around the *highest-scoring* chunk(s),
+         but only if absolutely necessary (keeps noise low).
+    Returns: ordered list of chunks (strings)
+    Also prints indices + similarity scores for debugging.
     """
     if not index or not chunks:
         return []
     try:
+        # 1. encode query
         q_emb = _query_model.encode(
+            [f"query: {query.strip()}"],
             convert_to_numpy=True,
             normalize_embeddings=True
         )[0]
+        # 2. FAISS initial retrieval (get a larger candidate pool)
+        num_candidates = max(top_k * candidate_multiplier, top_k + 2)
+        distances, indices = index.search(np.array([q_emb]).astype("float32"), num_candidates)
+        candidate_indices = [int(i) for i in indices[0] if i >= 0]
+        # negative (padding) ids were dropped above; now de-duplicate (no upper-bound clamp against len(chunks) is applied)
+        candidate_indices = list(dict.fromkeys(candidate_indices))  # preserve order, unique
+        # 3. Re-embed candidate texts and compute true cosine similarity
+        candidate_texts = [chunks[i] for i in candidate_indices]
+        # Encode passages (passage prefix helps alignment)
         doc_embs = _query_model.encode(
+            [f"passage: {c}" for c in candidate_texts],
             convert_to_numpy=True,
             normalize_embeddings=True
         )
         sims = cosine_similarity([q_emb], doc_embs)[0]
+        # Pair up indices and sims and sort descending
+        paired = [(candidate_indices[i], float(sims[i])) for i in range(len(candidate_indices))]
+        paired_sorted = sorted(paired, key=lambda x: x[1], reverse=True)
+        # Debug print: top candidates and their similarity
+        print("🔎 Candidate ranking (index : sim):")
+        for idx, sim in paired_sorted[: min(len(paired_sorted), top_k * 3)]:
+            print(f"  - Chunk {idx} : {sim:.4f}")
+        # 4. Pick those meeting threshold
+        selected = [idx for idx, sim in paired_sorted if sim >= min_similarity]
+        # Preserve order by similarity
+        selected = selected[:top_k]
+        # 5. If not enough, fill by neighbors around highest-scoring items
+        if len(selected) < top_k:
+            needed = top_k - len(selected)
+            # pick highest scoring indices as anchor(s)
+            anchors = [idx for idx, _ in paired_sorted[:3]]  # top 3 anchors
+            expanded = []
+            for a in anchors:
+                # neighbors ordered by proximity: a, a-1, a+1, a-2, a+2 ...
+                if a not in expanded:
+                    expanded.append(a)
+                offset = 1
+                while len(expanded) < top_k and offset < 5:
+                    for cand in (a - offset, a + offset):
+                        if 0 <= cand < len(chunks) and cand not in expanded:
+                            expanded.append(cand)
+                            if len(expanded) >= top_k:
+                                break
+                    offset += 1
+                if len(expanded) >= top_k:
                     break
+            # final selected: first maintain previously selected, then add neighbors from expanded preserving order
+            final_order = []
+            for idx, _sim in paired_sorted:
+                if idx in selected and idx not in final_order:
+                    final_order.append(idx)
+            for idx in expanded:
+                if idx not in final_order:
+                    final_order.append(idx)
+            selected = final_order[:top_k]
+        # final chunk strings (ordered by selected list)
+        final_chunks = [chunks[i] for i in selected]
+        print(f"✅ retrieve_chunks: returning {len(final_chunks)} chunks (top_k={top_k}, min_sim={min_similarity})")
+        print(f"  chunk indices: {selected}")
+        # Also return the indices? (if you want to display chunk numbers in UI, you can)
         return final_chunks
     except Exception as e:
         print(f"⚠️ Retrieval error: {e}")
         return []
+# ---------------------------
+# Answer generation
+# ---------------------------
 def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
+    """
+    - reasoning_mode=False => strict factual, deterministic
+    - reasoning_mode=True  => allow cautious inference (slower / longer)
+    """
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."
+    # Add chunk headings so model can cite them if needed
+    context_lines = []
+    for i, chunk in enumerate(retrieved_chunks, start=1):
+        # Use [Chunk i] markers — LLM will echo them when asked to cite sources
+        context_lines.append(f"[Chunk {i}]: {chunk.strip()}")
+    context = "\n".join(context_lines)
     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
         context=context, query=query
     )
     try:
+        # deterministic in strict mode
         if reasoning_mode:
+            max_new_tokens = 220
+            temp = 0.6
+            do_sample = True
         else:
+            max_new_tokens = 140
+            temp = 0.0
+            do_sample = False
         result = _answer_model(
             prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temp,
+            do_sample=do_sample,
             early_stopping=True,
             pad_token_id=_tokenizer.eos_token_id,
         )
+        text = result[0].get("generated_text", "").strip()
+        # remove the prompt echo if present
+        if "Answer:" in text:
+            out = text.split("Answer:")[-1].strip()
+        else:
+            out = text
+        # Enforce exact fallback phrase if model tries to paraphrase missing-answer
+        if not reasoning_mode and ("i don't know" in out.lower() or "not present" in out.lower()):
+            return "I don't know based on the provided document."
+        return out
     except Exception as e:
         print(f"⚠️ Generation failed: {e}")
         return "⚠️ Error: Could not generate an answer."
+# ---------------------------
+# Local debug main
+# ---------------------------
 if __name__ == "__main__":
     from vectorstore import build_faiss_index
     index = build_faiss_index(embeddings)
     query = "How do I create a communication user?"
+    retrieved = retrieve_chunks(query, index, dummy_chunks, top_k=3, min_similarity=0.55)
     print("🔍 Retrieved:", retrieved)
+    print("💬 Answer:", generate_answer(query, retrieved, reasoning_mode=False))