Shubham170793 committed on
Commit 197e569 · verified · 1 Parent(s): f384f96

Update src/qa.py

Files changed (1)
  1. src/qa.py +120 -116
src/qa.py CHANGED
@@ -1,11 +1,10 @@
  """
- qa.py — Phi-2 Hybrid (Fast + Reasoning) with Rerank & Similarity Filtering
- --------------------------------------------------------------------------
- ✅ Optimized for Hugging Face Spaces & Streamlit
- intfloat/e5-small-v2 for embeddings
- microsoft/phi-2 for generation (fast CPU-optimized)
- Re-ranking + minimum similarity threshold for clean retrieval
- ✅ reasoning_mode toggle for deeper answers
  """

  import os
@@ -15,10 +14,10 @@ from sklearn.metrics.pairwise import cosine_similarity
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  import torch

- print("✅ qa.py (Phi-2 Hybrid + Rerank + Similarity Filter) loaded from:", __file__)

  # ==========================================================
- # 1️⃣ Hugging Face Cache Setup
  # ==========================================================
  CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)
@@ -31,169 +30,174 @@ os.environ.update({
  print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

  # ==========================================================
- # 2️⃣ Embedding Model (E5-small-v2)
  # ==========================================================
  try:
      _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
      print("✅ Loaded embedding model: intfloat/e5-small-v2")
  except Exception as e:
-     print(f"⚠️ Embedding model load failed ({e}), falling back to MiniLM.")
      _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)

  # ==========================================================
- # 3️⃣ LLM Setup — Phi-2 (Fast)
  # ==========================================================
  MODEL_NAME = "microsoft/phi-2"
  print(f"✅ Loading LLM: {MODEL_NAME}")

- try:
-     _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
-     _model = AutoModelForCausalLM.from_pretrained(
-         MODEL_NAME,
-         cache_dir=CACHE_DIR,
-         torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
-         low_cpu_mem_usage=True
-     )
-
-     _answer_model = pipeline(
-         "text-generation",
-         model=_model,
-         tokenizer=_tokenizer,
-         device_map="auto"
-     )
-     print("✅ Phi-2 text-generation pipeline ready.")
- except Exception as e:
-     print(f"⚠️ Phi-2 load failed: {e}")
-     _answer_model = None

  # ==========================================================
- # 4️⃣ Prompt Templates
  # ==========================================================
  STRICT_PROMPT = (
-     "You are an assistant for enterprise documentation.\n"
-     "Answer the question based ONLY on the context below.\n"
-     "If the answer is not in the context, reply exactly:\n"
      "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  REASONING_PROMPT = (
-     "You are an expert enterprise assistant.\n"
-     "Carefully reason about the following context and provide a detailed, step-by-step answer.\n"
-     "If the context does not provide enough information, you may make cautious inferences based on logical reasoning.\n"
-     "However, always note when you are inferring beyond the text.\n\n"
-     "Context:\n{context}\n\nQuestion: {query}\n\nReasoning and Answer:"
  )

  # ==========================================================
- # 5️⃣ Retrieve Chunks FAISS + Re-rank + Similarity Filter
  # ==========================================================
- def retrieve_chunks(query: str, index, chunks: list, top_k: int = 5, min_similarity: float = 0.6):
-     """
-     Retrieves top-K relevant chunks with re-ranking and similarity threshold filtering.
-     Steps:
-     1️⃣ Use FAISS to get approximate top candidates.
-     2️⃣ Re-rank them by cosine similarity with the query.
-     3️⃣ Filter out low-similarity chunks below min_similarity.
-     """
      if not index or not chunks:
          return []

-     try:
-         # --- Encode query ---
-         q_emb = _query_model.encode(
-             [f"query: {query.strip()}"],
-             convert_to_numpy=True,
-             normalize_embeddings=True
-         )[0]
-
-         # --- FAISS initial retrieval ---
-         distances, indices = index.search(np.array([q_emb]).astype("float32"), top_k * 3)
-         retrieved = [chunks[i] for i in indices[0]]
-
-         # --- Compute re-ranking similarity scores ---
-         doc_embs = _query_model.encode(
-             [f"passage: {c}" for c in retrieved],
-             convert_to_numpy=True,
-             normalize_embeddings=True
-         )
-         sims = cosine_similarity([q_emb], doc_embs)[0]
-
-         # --- Combine and sort by similarity ---
-         scored = sorted(zip(retrieved, sims), key=lambda x: x[1], reverse=True)
-
-         # --- Apply minimum similarity filter ---
-         filtered = [(chunk, score) for chunk, score in scored if score >= min_similarity]
-
-         # --- Select final top_k results ---
-         final_chunks = [chunk for chunk, _ in filtered[:top_k]]
-
-         print(f"✅ Retrieved {len(final_chunks)} chunks (min sim={min_similarity})")
-         return final_chunks
-
-     except Exception as e:
-         print(f"⚠️ Retrieval error: {e}")
-         return []

  # ==========================================================
- # 6️⃣ Answer Generation (Fast / Reasoning Hybrid)
  # ==========================================================
  def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
-     """
-     Generates concise or reasoning-rich answers using Phi-2.
-     reasoning_mode=True → longer, more explanatory (slower)
-     reasoning_mode=False → short factual (fast)
-     """
      if not retrieved_chunks:
          return "Sorry, I couldn’t find relevant information in the document."

      context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
-     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(context=context, query=query)

      try:
          result = _answer_model(
              prompt,
-             max_new_tokens=200 if reasoning_mode else 120,
-             temperature=0.6 if reasoning_mode else 0.2,
              do_sample=reasoning_mode,
              pad_token_id=_tokenizer.eos_token_id,
          )
-         answer = result[0]["generated_text"].strip()
-
-         if "Answer:" in answer:
-             answer = answer.split("Answer:")[-1].strip()
-
-         return answer
-
      except Exception as e:
          print(f"⚠️ Generation failed: {e}")
          return "⚠️ Error: Could not generate an answer."

  # ==========================================================
- # 7️⃣ Local Test (Optional)
  # ==========================================================
  if __name__ == "__main__":
      from vectorstore import build_faiss_index
-     import faiss

      dummy_chunks = [
          "Step 1: Open the dashboard and navigate to reports.",
          "Step 2: Click 'Export' to download a CSV summary.",
-         "Step 3: Review the generated report in your downloads folder."
      ]
-
      embeddings = [
-         _query_model.encode([f"passage: {chunk}"], convert_to_numpy=True, normalize_embeddings=True)[0]
-         for chunk in dummy_chunks
      ]

-     dim = embeddings[0].shape[0]
-     index = faiss.IndexFlatL2(dim)
-     index.add(np.array(embeddings).astype("float32"))
-
-     query = "How to export a report?"
-     retrieved = retrieve_chunks(query, index, dummy_chunks, top_k=3, min_similarity=0.6)
-
-     print("\n🔍 Retrieved chunks:", retrieved)
-     print("\n💬 FAST Answer:", generate_answer(query, retrieved, reasoning_mode=False))
-     print("\n🧠 REASONING Answer:", generate_answer(query, retrieved, reasoning_mode=True))
  """
+ qa.py — Phi-2 FAST + RERANKED RETRIEVAL
+ --------------------------------------
+ Uses:
+ intfloat/e5-small-v2 embeddings
+ microsoft/phi-2 generation
+ Optimized for: speed, factual accuracy, and semantic retrieval on Hugging Face Spaces
  """

  import os

  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
  import torch

+ print("✅ qa.py (Phi-2 FAST + ReRank) loaded from:", __file__)

  # ==========================================================
+ # 1️⃣ Cache Setup (Hugging Face /tmp cache)
  # ==========================================================
  CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)

  print(f"✅ Using Hugging Face cache at {CACHE_DIR}")

  # ==========================================================
+ # 2️⃣ Embedding Model
  # ==========================================================
  try:
      _query_model = SentenceTransformer("intfloat/e5-small-v2", cache_folder=CACHE_DIR)
      print("✅ Loaded embedding model: intfloat/e5-small-v2")
  except Exception as e:
+     print(f"⚠️ Embedding load failed ({e}), falling back to MiniLM")
      _query_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
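
# Note: e5 models are trained with "query: "/"passage: " input prefixes, so text
# should be encoded with the matching prefix (as retrieve_chunks and the local
# test below do); the MiniLM fallback simply treats those prefixes as plain text.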

  # ==========================================================
+ # 3️⃣ Phi-2 LLM Setup
  # ==========================================================
  MODEL_NAME = "microsoft/phi-2"
  print(f"✅ Loading LLM: {MODEL_NAME}")

+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
+ _model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     cache_dir=CACHE_DIR,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
+     low_cpu_mem_usage=True,
+ ).to("cpu")
+
+ _answer_model = pipeline(
+     "text-generation",
+     model=_model,
+     tokenizer=_tokenizer,
+     device=-1,
+     model_kwargs={"torch_dtype": torch.bfloat16, "low_cpu_mem_usage": True},
+ )
+ print("✅ Phi-2 text-generation pipeline ready (optimized).")
 
 
 

  # ==========================================================
+ # 4️⃣ Prompt Templates
  # ==========================================================
  STRICT_PROMPT = (
+     "You are an enterprise documentation assistant.\n"
+     "Answer factually using ONLY the context below.\n"
+     "If the answer isn’t present, reply exactly:\n"
      "'I don't know based on the provided document.'\n\n"
      "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  REASONING_PROMPT = (
+     "You are an expert enterprise assistant with reasoning ability.\n"
+     "Think carefully about the context and question.\n"
+     "Use world knowledge and inference if necessary, but prefer factual accuracy.\n"
+     "If the document lacks the answer, say:\n"
+     "'I don't know based on the provided document.'\n\n"
+     "Context:\n{context}\n\nQuestion: {query}\nAnswer:"
  )

  # ==========================================================
+ # 5️⃣ Retrieve Chunks (FAISS + Rerank + Neighbor Expansion)
  # ==========================================================
+ def retrieve_chunks(
+     query: str,
+     index,
+     chunks: list,
+     top_k: int = 3,
+     topn_candidates: int = 20,
+     neighbor_threshold: float = 0.68,
+     expansion_window: int = 1,
+     max_context_chunks: int = 6,
+ ):
+     """Retrieve semantically relevant chunks with reranking and neighbor expansion."""
      if not index or not chunks:
          return []

+     # 1️⃣ Encode query (normalized)
+     query_emb = _query_model.encode(
+         [f"query: {query.strip()}"],
+         convert_to_numpy=True,
+         normalize_embeddings=True
+     )[0].astype("float32")
+
+     # 2️⃣ FAISS search (initial candidates)
+     topn_candidates = min(topn_candidates, getattr(index, "ntotal", topn_candidates))
+     _, candidate_ids = index.search(np.array([query_emb]).astype("float32"), topn_candidates)
+     candidate_ids = [int(i) for i in candidate_ids[0] if i != -1]
+
+     # 3️⃣ Re-encode candidate chunks (with the e5 "passage: " prefix, matching
+     # how the index embeddings are built) and compute cosine similarities
+     candidate_texts = [chunks[i] for i in candidate_ids]
+     candidate_vecs = np.array([
+         _query_model.encode([f"passage: {t}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for t in candidate_texts
+     ])
+     sims = cosine_similarity([query_emb], candidate_vecs)[0]
+     sorted_idx = np.argsort(sims)[::-1]
+     reranked_ids = [candidate_ids[i] for i in sorted_idx]
+
+     # 4️⃣ Select top-k base chunks
+     selected, selected_set = [], set()
+     for rid in reranked_ids:
+         if len(selected) >= top_k:
+             break
+         selected.append(rid)
+         selected_set.add(rid)
+
+     # 5️⃣ Conditional neighbor expansion
+     final_order = list(selected)
+     for base_id in selected:
+         if len(final_order) >= max_context_chunks:
+             break
+         for offset in range(1, expansion_window + 1):
+             for neighbor in (base_id - offset, base_id + offset):
+                 if neighbor < 0 or neighbor >= len(chunks) or neighbor in selected_set:
+                     continue
+                 # Check semantic closeness
+                 neighbor_vec = _query_model.encode([f"passage: {chunks[neighbor]}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+                 sim = float(cosine_similarity([query_emb], [neighbor_vec])[0][0])
+                 if sim >= neighbor_threshold:
+                     final_order.append(neighbor)
+                     selected_set.add(neighbor)
+                 if len(final_order) >= max_context_chunks:
+                     break
+             if len(final_order) >= max_context_chunks:
+                 break
+
+     return [chunks[i] for i in final_order]
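
# Illustrative call, assuming an index and chunk list built as in the local test
# below: retrieve_chunks("How do I export a report?", index, chunks) returns up
# to max_context_chunks strings reranked by cosine similarity, with adjacent
# chunks appended only when they clear neighbor_threshold.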

  # ==========================================================
+ # 6️⃣ Answer Generation
  # ==========================================================
  def generate_answer(query: str, retrieved_chunks: list, reasoning_mode: bool = False):
+     """Generate concise, factual or reasoning-based answers using Phi-2."""
      if not retrieved_chunks:
          return "Sorry, I couldn’t find relevant information in the document."

      context = "\n".join(chunk.strip() for chunk in retrieved_chunks)
+     prompt = (REASONING_PROMPT if reasoning_mode else STRICT_PROMPT).format(
+         context=context, query=query
+     )

      try:
          result = _answer_model(
              prompt,
+             max_new_tokens=180 if reasoning_mode else 120,
+             temperature=0.6 if reasoning_mode else 0.3,
              do_sample=reasoning_mode,
+             early_stopping=True,
              pad_token_id=_tokenizer.eos_token_id,
          )
+         text = result[0]["generated_text"].strip()
+         return text.split("Answer:")[-1].strip() if "Answer:" in text else text
      except Exception as e:
          print(f"⚠️ Generation failed: {e}")
          return "⚠️ Error: Could not generate an answer."

  # ==========================================================
+ # 7️⃣ Local Test
  # ==========================================================
  if __name__ == "__main__":
      from vectorstore import build_faiss_index

      dummy_chunks = [
          "Step 1: Open the dashboard and navigate to reports.",
          "Step 2: Click 'Export' to download a CSV summary.",
+         "Step 3: Review the generated report in your downloads folder.",
+         "Appendix: Communication user creation steps are explained later in this guide."
      ]

      embeddings = [
+         _query_model.encode([f"passage: {c}"], convert_to_numpy=True, normalize_embeddings=True)[0]
+         for c in dummy_chunks
      ]
+     index = build_faiss_index(embeddings)

+     query = "How do I create a communication user?"
+     retrieved = retrieve_chunks(query, index, dummy_chunks)
+     print("🔍 Retrieved:", retrieved)
+     print("💬 Answer:", generate_answer(query, retrieved))