sofzcc committed on
Commit
e202573
·
verified ·
1 Parent(s): 1826392

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -36
app.py CHANGED
@@ -5,6 +5,9 @@ from typing import List, Tuple
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
 
 
 
8
 
9
 
10
  # -----------------------------
@@ -12,12 +15,11 @@ from sentence_transformers import SentenceTransformer
12
  # -----------------------------
13
  KB_DIR = "./kb" # optional: folder with .txt or .md files
14
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
15
- TOP_K = 3 # how many chunks to retrieve per answer
 
16
  CHUNK_SIZE = 500 # characters
17
  CHUNK_OVERLAP = 100 # characters
18
 
19
- # FLAN-T5 model (RAG LLM)
20
- FLAN_MODEL_NAME = "google/flan-t5-large"
21
 
22
 
23
  # -----------------------------
@@ -152,6 +154,13 @@ class KBIndex:
152
 
153
  kb_index = KBIndex()
154
 
 
 
 
 
 
 
 
155
 
156
  # -----------------------------
157
  # LLM (FLAN-T5-Large) - lazy load
@@ -191,9 +200,25 @@ def get_llm():
191
  # CHAT LOGIC
192
  # -----------------------------
193
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def build_answer(query: str) -> str:
195
- """Use the KB index + FLAN-T5-Large to build a natural-language answer."""
 
 
 
196
  results = kb_index.search(query, top_k=TOP_K)
 
197
  if not results:
198
  return (
199
  "I couldn't find anything relevant in the knowledge base for this query yet.\n\n"
@@ -202,48 +227,46 @@ def build_answer(query: str) -> str:
202
  "- Improve the existing documentation for this topic."
203
  )
204
 
205
- # Combine retrieved chunks into a single context
206
- chunks, sources, _scores = zip(*[(c, s, sc) for (c, s, sc) in results])
207
- context = "\n\n".join(chunks)
208
-
209
- # Trim context a bit so it doesn't explode the token limit
210
- # (FLAN-T5-Large handles a limited input length)
211
- max_context_chars = 3000
212
- if len(context) > max_context_chars:
213
- context = context[:max_context_chars]
214
 
215
- llm = get_llm()
 
 
216
 
 
217
  prompt = (
218
- "You are a helpful knowledge base assistant. "
219
- "Using only the information in the context below, answer the user's question in a clear, natural, and friendly way. "
220
- "If the answer is not fully covered by the context, say so honestly.\n\n"
221
  f"Context:\n{context}\n\n"
222
  f"Question: {query}\n\n"
223
- "Answer:"
224
  )
225
 
226
- try:
227
- result = llm(
228
- prompt,
229
- max_new_tokens=256,
230
- num_return_sequences=1,
231
- )
232
- answer_text = result[0]["generated_text"].strip()
233
- except Exception as e:
234
- print(f"LLM generation error: {e}")
235
- # Fallback: still show something useful instead of crashing
236
- answer_text = (
237
- "I had trouble generating a summarized answer from the knowledge base just now. "
238
- "Here are some relevant excerpts instead:\n\n" + context
 
239
  )
240
 
241
- # Optionally add a subtle note about sources (file names)
242
- unique_sources = sorted(set(sources))
243
- if unique_sources:
244
- answer_text += "\n\n— Based on information from: " + ", ".join(unique_sources)
 
 
245
 
246
- return answer_text
247
 
248
 
249
  def chat_respond(message: str, history):
 
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ import torch
10
+
11
 
12
 
13
  # -----------------------------
 
15
# -----------------------------
KB_DIR = "./kb"  # optional: folder with .txt or .md files to index
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # embedding model for retrieval
GEN_MODEL_NAME = "google/flan-t5-base"  # seq2seq model used to generate answers
TOP_K = 3  # number of KB chunks retrieved per query
CHUNK_SIZE = 500  # characters per KB chunk
CHUNK_OVERLAP = 100  # characters of overlap between consecutive chunks
22
 
 
 
23
 
24
 
25
  # -----------------------------
 
154
 
155
  kb_index = KBIndex()
156
 
157
# Eager, module-level load of the generation model (runs at import time).
# NOTE(review): a nearby section header still says "FLAN-T5-Large - lazy load";
# this code loads GEN_MODEL_NAME ("google/flan-t5-base") eagerly — confirm which
# is intended and update the stale comment.
print("Loading generation model...")
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
# Prefer GPU when available; the model is inference-only, so switch to eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gen_model.to(device)
gen_model.eval()
print("Generation model ready.")
164
 
165
  # -----------------------------
166
  # LLM (FLAN-T5-Large) - lazy load
 
200
  # CHAT LOGIC
201
  # -----------------------------
202
 
203
def build_context_from_results(results: List[Tuple[str, str, float]]) -> str:
    """Turn retrieved chunks into a compact context string for the LLM.

    Args:
        results: ``(chunk_text, source_name, score)`` triples as returned by
            the KB search. The score is not used here; only the text and its
            provenance matter for the prompt.

    Returns:
        The chunks joined by blank lines, each prefixed with its source file
        (``From <source>:`` on its own line). An empty ``results`` yields "".
    """
    # Generator expression replaces the manual append loop; the unused score
    # is discarded explicitly as ``_score`` instead of binding a dead name.
    return "\n\n".join(
        f"From {source}:\n{chunk.strip()}" for chunk, source, _score in results
    )
213
+
214
+
215
  def build_answer(query: str) -> str:
216
+ """
217
+ Use the KB index to retrieve relevant chunks,
218
+ then ask FLAN-T5 to write a natural answer based ONLY on that context.
219
+ """
220
  results = kb_index.search(query, top_k=TOP_K)
221
+
222
  if not results:
223
  return (
224
  "I couldn't find anything relevant in the knowledge base for this query yet.\n\n"
 
227
  "- Improve the existing documentation for this topic."
228
  )
229
 
230
+ # Build context for the model
231
+ context = build_context_from_results(results)
 
 
 
 
 
 
 
232
 
233
+ # Short list of sources for a small citation line
234
+ source_names = list({src for _, src, _ in results})
235
+ source_line = "Based on: " + ", ".join(source_names)
236
 
237
+ # Prompt for FLAN-T5
238
  prompt = (
239
+ "You are a helpful knowledge base assistant.\n"
240
+ "Using ONLY the information in the context below, answer the user's question "
241
+ "in a clear, concise, and natural way. Focus on practical guidance.\n\n"
242
  f"Context:\n{context}\n\n"
243
  f"Question: {query}\n\n"
244
+ "Answer in 2–5 short paragraphs. If something is not covered in the context, say that.\n"
245
  )
246
 
247
+ inputs = gen_tokenizer(
248
+ prompt,
249
+ return_tensors="pt",
250
+ truncation=True,
251
+ max_length=2048,
252
+ ).to(device)
253
+
254
+ with torch.no_grad():
255
+ output_ids = gen_model.generate(
256
+ **inputs,
257
+ max_length=512,
258
+ temperature=0.7,
259
+ top_p=0.95,
260
+ num_beams=4,
261
  )
262
 
263
+ answer_text = gen_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
264
+
265
+ # Add a subtle source hint at the end
266
+ final_answer = f"{answer_text}\n\n— {source_line}"
267
+
268
+ return final_answer
269
 
 
270
 
271
 
272
  def chat_respond(message: str, history):