sofzcc committed on
Commit
a68912a
·
verified ·
1 Parent(s): dac13d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -17
app.py CHANGED
@@ -5,16 +5,19 @@ from typing import List, Tuple
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
 
8
 
9
 
10
  # -----------------------------
11
  # CONFIG
12
  # -----------------------------
13
- KB_DIR = "./kb" # optional: folder with .txt or .md files
14
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
15
- TOP_K = 3 # how many chunks to show per answer
16
- CHUNK_SIZE = 500 # characters
17
- CHUNK_OVERLAP = 100 # characters
 
 
18
 
19
 
20
  # -----------------------------
@@ -92,7 +95,7 @@ class KBIndex:
92
  def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
93
  print("Loading embedding model...")
94
  self.model = SentenceTransformer(model_name)
95
- print("Model loaded.")
96
  self.chunks: List[str] = []
97
  self.chunk_sources: List[str] = []
98
  self.embeddings: np.ndarray | None = None
@@ -150,12 +153,30 @@ class KBIndex:
150
  kb_index = KBIndex()
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # -----------------------------
154
  # CHAT LOGIC
155
  # -----------------------------
156
 
157
  def build_answer(query: str) -> str:
158
- """Use the KB index to build a human-readable answer."""
159
  results = kb_index.search(query, top_k=TOP_K)
160
  if not results:
161
  return (
@@ -165,16 +186,36 @@ def build_answer(query: str) -> str:
165
  "- Improve the existing documentation for this topic."
166
  )
167
 
168
- intro = "Here’s what I found in the knowledge base:\n"
169
- bullets = []
170
- for i, (chunk, source, score) in enumerate(results, start=1):
171
- bullets.append(f"{i}. From **{source}**:\n{chunk.strip()}\n")
172
 
173
- guidance = (
174
- "\nYou can ask follow-up questions, or try a more specific query if this doesn't fully answer your question."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  )
 
176
 
177
- return intro + "\n".join(bullets) + guidance
 
 
178
 
179
 
180
  def chat_respond(message: str, history):
@@ -185,8 +226,7 @@ def chat_respond(message: str, history):
185
 
186
  We only need to return the assistant's reply as a string.
187
  """
188
- answer = build_answer(message)
189
- return answer
190
 
191
 
192
  # -----------------------------
@@ -197,7 +237,7 @@ description = """
197
  Ask questions as if you were talking to a knowledge base assistant.
198
  In a real scenario, this assistant would be connected to your own
199
  help center or internal documentation. Here, it's using a small demo
200
- knowledge base to show how retrieval-based self-service can work.
201
  """
202
 
203
  chat = gr.ChatInterface(
@@ -210,7 +250,7 @@ chat = gr.ChatInterface(
210
  "How could a KB assistant help agents?",
211
  "Why is self-service important for customer support?",
212
  ],
213
- cache_examples=False, # avoid example pre-caching issues on HF Spaces
214
  )
215
 
216
 
 
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
 
10
 
11
# -----------------------------
# CONFIG
# -----------------------------
# Knowledge-base source directory and model identifiers, plus the
# retrieval/chunking knobs used throughout the app.
KB_DIR = "./kb"  # folder with .txt or .md files
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "google/flan-t5-large"

TOP_K = 3  # how many chunks to use per answer
CHUNK_SIZE = 500  # characters
CHUNK_OVERLAP = 100  # characters
21
 
22
 
23
  # -----------------------------
 
95
  def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
96
  print("Loading embedding model...")
97
  self.model = SentenceTransformer(model_name)
98
+ print("Embedding model loaded.")
99
  self.chunks: List[str] = []
100
  self.chunk_sources: List[str] = []
101
  self.embeddings: np.ndarray | None = None
 
153
  kb_index = KBIndex()
154
 
155
 
156
# -----------------------------
# LLM (FLAN-T5-LARGE) – LAZY LOAD
# -----------------------------

# Module-level cache for the generation model; both stay None until the
# first call to get_llm() so app startup doesn't pay the download cost.
_llm_tokenizer = None
_llm_model = None


def get_llm():
    """Return the (tokenizer, model) pair, loading FLAN-T5-Large on first use.

    Subsequent calls reuse the cached module-level objects instead of
    re-downloading or re-instantiating the model.
    """
    global _llm_tokenizer, _llm_model
    if _llm_model is None or _llm_tokenizer is None:
        print("Loading FLAN-T5-Large...")
        _llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
        _llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
        print("FLAN-T5-Large loaded.")
    return _llm_tokenizer, _llm_model
172
+
173
+
174
  # -----------------------------
175
  # CHAT LOGIC
176
  # -----------------------------
177
 
178
  def build_answer(query: str) -> str:
179
+ """Use the KB index + FLAN-T5 to build a natural, human-sounding answer."""
180
  results = kb_index.search(query, top_k=TOP_K)
181
  if not results:
182
  return (
 
186
  "- Improve the existing documentation for this topic."
187
  )
188
 
189
+ # Collect contexts (just the text, ignore filenames in the answer)
190
+ contexts = [chunk for (chunk, _source, _score) in results]
 
 
191
 
192
+ tokenizer, model = get_llm()
193
+
194
+ # Build a prompt for FLAN-T5
195
+ context_block = "\n\n---\n\n".join(contexts[:TOP_K])
196
+
197
+ prompt = (
198
+ "You are a helpful knowledge base assistant. "
199
+ "Using ONLY the information in the context below, answer the user's question "
200
+ "in a clear, concise, and human, conversational tone. "
201
+ "Do not list file names or raw chunks; write a smooth answer. "
202
+ "If something is not covered in the context, say that you don't have that information.\n\n"
203
+ f"QUESTION: {query}\n\n"
204
+ f"CONTEXT:\n{context_block}\n"
205
+ )
206
+
207
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
208
+ outputs = model.generate(
209
+ **inputs,
210
+ max_length=256,
211
+ num_beams=4,
212
+ early_stopping=True,
213
  )
214
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
215
 
216
+ # Small post-touch to avoid the answer looking too abrupt
217
+ answer = answer.strip()
218
+ return answer
219
 
220
 
221
  def chat_respond(message: str, history):
 
226
 
227
  We only need to return the assistant's reply as a string.
228
  """
229
+ return build_answer(message)
 
230
 
231
 
232
  # -----------------------------
 
237
  Ask questions as if you were talking to a knowledge base assistant.
238
  In a real scenario, this assistant would be connected to your own
239
  help center or internal documentation. Here, it's using a small demo
240
+ knowledge base to show how retrieval-augmented self-service can work.
241
  """
242
 
243
  chat = gr.ChatInterface(
 
250
  "How could a KB assistant help agents?",
251
  "Why is self-service important for customer support?",
252
  ],
253
+ cache_examples=False, # avoids example caching issues on HF Spaces
254
  )
255
 
256