Update src/qa.py
src/qa.py
CHANGED
@@ -31,7 +31,6 @@ os.environ.update({
 # ==========================================================
 # 2️⃣ Query Embedding Model
 # ==========================================================
-# Use E5-small-v2 for retrieval consistency with embeddings.py
 try:
     _query_model = SentenceTransformer(
         "intfloat/e5-small-v2",
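Worth noting for reviewers: e5-small-v2 is an asymmetric retrieval model, which is why a `passage:` prefix appears for documents and a `query:` prefix for queries later in this diff. A minimal sketch of that convention, assuming only the model name shown above (the sample strings are illustrative):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-small-v2")

# E5 models are trained with role prefixes; mixing them up degrades recall.
doc_embs = model.encode(
    ["passage: SAP ERP supports real-time data synchronization."],
    convert_to_numpy=True,
    normalize_embeddings=True,  # unit vectors -> inner product == cosine
)
query_emb = model.encode(
    ["query: Does the system integrate with SAP?"],
    convert_to_numpy=True,
    normalize_embeddings=True,
)
score = float(doc_embs[0] @ query_emb[0])  # cosine similarity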
@@ -49,7 +48,7 @@ except Exception as e:
 # ==========================================================
 # 3️⃣ LLM for Answer Generation (FLAN-T5)
 # ==========================================================
-MODEL_NAME = "google/flan-t5-base"  #
+MODEL_NAME = "google/flan-t5-base"  # Switch to 'large' if enough memory
 print(f"✅ Loading LLM: {MODEL_NAME}")

 _tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
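The new MODEL_NAME comment suggests swapping in flan-t5-large when memory allows. A hedged sketch of how that swap could be made configurable; the QA_USE_LARGE_LLM environment variable is invented for illustration and is not part of this repo:

import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Illustrative only: choose the checkpoint by an env flag instead of editing code.
MODEL_NAME = (
    "google/flan-t5-large"
    if os.environ.get("QA_USE_LARGE_LLM") == "1"
    else "google/flan-t5-base"
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
answerer = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 = CPU; use 0 for the first CUDA GPU
)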
@@ -59,15 +58,15 @@ _answer_model = pipeline(
     "text2text-generation",
     model=_model,
     tokenizer=_tokenizer,
-    device=-1  # CPU-safe
+    device=-1  # CPU-safe (Hugging Face Spaces)
 )

 # ==========================================================
-# 4️⃣ Prompt Template
+# 4️⃣ Prompt Template
 # ==========================================================
 PROMPT_TEMPLATE = """
-You are an expert enterprise assistant.
-
+You are an expert enterprise knowledge assistant.
+Use ONLY the CONTEXT below to answer the QUESTION clearly, factually, and completely.
 If the context doesn’t contain the answer, reply exactly:
 "I don't know based on the provided document."

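The diff truncates PROMPT_TEMPLATE after its opening lines, but the later `PROMPT_TEMPLATE.format(context=..., query=...)` call implies `{context}` and `{query}` placeholders. A plausible full template under that assumption; the CONTEXT/QUESTION/ANSWER section labels are guesses, not from the source:

# Hypothetical reconstruction of the truncated template; only the opening
# lines and the two placeholders are confirmed by this diff.
PROMPT_TEMPLATE = """
You are an expert enterprise knowledge assistant.
Use ONLY the CONTEXT below to answer the QUESTION clearly, factually, and completely.
If the context doesn't contain the answer, reply exactly:
"I don't know based on the provided document."

CONTEXT:
{context}

QUESTION:
{query}

ANSWER:
"""

prompt = PROMPT_TEMPLATE.format(context="[Chunk 1]: ...", query="...")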
@@ -93,7 +92,6 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
         return []

     try:
-        # E5 expects 'query:' prefix for better retrieval accuracy
         query_emb = _query_model.encode(
             [f"query: {query.strip()}"],
             convert_to_numpy=True,
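The body of retrieve_chunks after the encode call is not shown in this diff. Assuming the project uses a FAISS inner-product index over normalized embeddings (consistent with normalize_embeddings=True elsewhere in the file), the search step would typically look like the sketch below; the helper name and signature are illustrative, not the repo's actual code:

import numpy as np
import faiss  # assumed dependency; the real search code is elided from this diff

def search(index: faiss.Index, query_emb: np.ndarray, chunks: list, top_k: int = 3):
    # query_emb: shape (1, dim), already L2-normalized, so inner product == cosine
    scores, ids = index.search(query_emb.astype(np.float32), top_k)
    # FAISS pads with -1 when fewer than top_k results exist
    return [chunks[i] for i in ids[0] if i != -1]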
@@ -114,45 +112,47 @@ def retrieve_chunks(query: str, index, chunks: list, top_k: int = 3):
 def generate_answer(query: str, retrieved_chunks: list):
     """
     Generates an answer using FLAN-T5 and retrieved chunks as context.
+    Includes dynamic length, sampling for expressiveness, and fallback logic.
     """
     if not retrieved_chunks:
         return "Sorry, I couldn’t find relevant information in the document."

-    # Merge retrieved chunks
-    context = "\n\n".join([
+    # Merge retrieved chunks into one coherent context
+    context = "\n\n".join([
+        f"[Chunk {i+1}]: {chunk.strip()}"
+        for i, chunk in enumerate(retrieved_chunks)
+    ])

-    # Build structured prompt
     prompt = PROMPT_TEMPLATE.format(context=context, query=query)

     try:
-        answer = result[0]["generated_text"].strip()
-
-        # 🧩 If the model outputs something too short, expand gracefully
-        if len(answer.split()) < 8:
-            answer = (
-                "The document mentions this briefly. Based on the context, here's what it suggests: "
-                + answer
-            )
+        result = _answer_model(
+            prompt,
+            max_new_tokens=400,      # allow more elaborate responses
+            do_sample=True,          # enable natural variability
+            temperature=0.7,         # creativity balance
+            top_p=0.9,               # nucleus sampling for relevance
+            repetition_penalty=1.15  # discourage repetition
+        )
+
+        answer = result[0]["generated_text"].strip()
+
+        # 🧩 Handle overly short answers
+        if len(answer.split()) < 8:
+            answer = (
+                "The document briefly mentions this. Based on the context, here's what it implies: "
+                + answer
+            )
+
+        return answer
+
+    except Exception as e:
+        print(f"⚠️ Generation failed: {e}")
+        return "⚠️ Error: Could not generate an answer at the moment."

 # ==========================================================
-# 7️⃣ Optional Local Test
+# 7️⃣ Optional Local Test (runs only in dev mode)
 # ==========================================================
 if __name__ == "__main__":
     dummy_chunks = [
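One design note on the new generation call: do_sample=True with temperature/top_p makes answers non-deterministic, which can be awkward for regression tests on a QA endpoint. A deterministic variant using the same pipeline, sketched under the assumption that reproducibility matters more than variety:

# Hypothetical deterministic alternative to the sampled call above.
result = _answer_model(
    prompt,
    max_new_tokens=400,
    do_sample=False,          # reproducible output for the same prompt
    num_beams=4,              # beam search instead of sampling
    repetition_penalty=1.15,  # still discourage repetition
)
answer = result[0]["generated_text"].strip()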
@@ -161,10 +161,13 @@ if __name__ == "__main__":
|
|
| 161 |
"Integration with SAP ERP allows for seamless data synchronization."
|
| 162 |
]
|
| 163 |
from vectorstore import build_faiss_index
|
| 164 |
-
import numpy as np
|
| 165 |
|
| 166 |
index = build_faiss_index([
|
| 167 |
-
_query_model.encode(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
for chunk in dummy_chunks
|
| 169 |
])
|
| 170 |
|
|
|
|
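Putting the new pieces together, the local test implies an end-to-end flow like the sketch below. It reuses index and dummy_chunks from the test block above, assumes build_faiss_index returns a searchable index as its call site suggests, and the query string is illustrative:

# Assumed end-to-end flow based on the functions shown in this diff.
query = "Does the system integrate with SAP ERP?"
top_chunks = retrieve_chunks(query, index, dummy_chunks, top_k=2)
print(generate_answer(query, top_chunks))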