Spaces:

sofzcc
/

Self-Service-KB-Assistant

Configuration error

App Files Files Community

sofzcc committed on Nov 27, 2025

Commit

1826392

verified ·

1 Parent(s): a68912a

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -45

app.py CHANGED Viewed

@@ -5,19 +5,19 @@ from typing import List, Tuple
 import gradio as gr
 import numpy as np
 from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # -----------------------------
 # CONFIG
 # -----------------------------
-KB_DIR = "./kb"  # folder with .txt or .md files
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-LLM_MODEL_NAME = "google/flan-t5-large"
-TOP_K = 3           # how many chunks to use per answer
-CHUNK_SIZE = 500    # characters
-CHUNK_OVERLAP = 100 # characters
 # -----------------------------
@@ -95,7 +95,7 @@ class KBIndex:
     def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
         print("Loading embedding model...")
         self.model = SentenceTransformer(model_name)
-        print("Embedding model loaded.")
         self.chunks: List[str] = []
         self.chunk_sources: List[str] = []
         self.embeddings: np.ndarray | None = None
@@ -154,21 +154,37 @@ kb_index = KBIndex()
 # -----------------------------
-# LLM (FLAN-T5-LARGE) – LAZY LOAD
 # -----------------------------
-_llm_tokenizer = None
-_llm_model = None
 def get_llm():
-    """Load FLAN-T5-Large only once, when first needed."""
-    global _llm_tokenizer, _llm_model
-    if _llm_tokenizer is None or _llm_model is None:
-        print("Loading FLAN-T5-Large...")
-        _llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
-        _llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
-        print("FLAN-T5-Large loaded.")
-    return _llm_tokenizer, _llm_model
 # -----------------------------
@@ -176,7 +192,7 @@ def get_llm():
 # -----------------------------
 def build_answer(query: str) -> str:
-    """Use the KB index + FLAN-T5 to build a natural, human-sounding answer."""
     results = kb_index.search(query, top_k=TOP_K)
     if not results:
         return (
@@ -186,47 +202,60 @@ def build_answer(query: str) -> str:
             "- Improve the existing documentation for this topic."
         )
-    # Collect contexts (just the text, ignore filenames in the answer)
-    contexts = [chunk for (chunk, _source, _score) in results]
-    tokenizer, model = get_llm()
-    # Build a prompt for FLAN-T5
-    context_block = "\n\n---\n\n".join(contexts[:TOP_K])
     prompt = (
         "You are a helpful knowledge base assistant. "
-        "Using ONLY the information in the context below, answer the user's question "
-        "in a clear, concise, and human, conversational tone. "
-        "Do not list file names or raw chunks; write a smooth answer. "
-        "If something is not covered in the context, say that you don't have that information.\n\n"
-        f"QUESTION: {query}\n\n"
-        f"CONTEXT:\n{context_block}\n"
     )
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-    outputs = model.generate(
-        **inputs,
-        max_length=256,
-        num_beams=4,
-        early_stopping=True,
-    )
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Small post-touch to avoid the answer looking too abrupt
-    answer = answer.strip()
-    return answer
 def chat_respond(message: str, history):
     """
     Gradio ChatInterface (type='messages') calls this with:
       - message: latest user message (str)
-      - history: list of previous messages (handled internally by Gradio)
     We only need to return the assistant's reply as a string.
     """
-    return build_answer(message)
 # -----------------------------
@@ -237,7 +266,7 @@ description = """
 Ask questions as if you were talking to a knowledge base assistant.
 In a real scenario, this assistant would be connected to your own
 help center or internal documentation. Here, it's using a small demo
-knowledge base to show how retrieval-augmented self-service can work.
 """
 chat = gr.ChatInterface(
@@ -250,7 +279,7 @@ chat = gr.ChatInterface(
         "How could a KB assistant help agents?",
         "Why is self-service important for customer support?",
     ],
-    cache_examples=False,  # avoids example caching issues on HF Spaces
 )

 import gradio as gr
 import numpy as np
 from sentence_transformers import SentenceTransformer
 # -----------------------------
 # CONFIG
 # -----------------------------
+KB_DIR = "./kb"  # optional: folder with .txt or .md files
 EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+TOP_K = 3  # how many chunks to retrieve per answer
+CHUNK_SIZE = 500  # characters
+CHUNK_OVERLAP = 100  # characters
+# FLAN-T5 model (RAG LLM)
+FLAN_MODEL_NAME = "google/flan-t5-large"
 # -----------------------------
     def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
         print("Loading embedding model...")
         self.model = SentenceTransformer(model_name)
+        print("Model loaded.")
         self.chunks: List[str] = []
         self.chunk_sources: List[str] = []
         self.embeddings: np.ndarray | None = None
 # -----------------------------
+# LLM (FLAN-T5-Large) - lazy load
 # -----------------------------
+_llm_pipeline = None
 def get_llm():
+    """
+    Lazily load FLAN-T5-Large as a text2text-generation pipeline.
+    This avoids blocking startup too much.
+    """
+    global _llm_pipeline  # module-level cache, initialised to None above
+    if _llm_pipeline is not None:
+        return _llm_pipeline  # already built by an earlier call; reuse it
+    print("Loading FLAN-T5-Large model...")
+    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline  # function-scope import: only executed on the first call
+    import torch  # used here only to probe for CUDA
+    tokenizer = AutoTokenizer.from_pretrained(FLAN_MODEL_NAME)
+    model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_MODEL_NAME)
+    device = 0 if torch.cuda.is_available() else -1  # 0 when CUDA is available; -1 otherwise (presumably CPU per the transformers pipeline convention; confirm)
+    _llm_pipeline = pipeline(  # stored in the module-level global so later calls take the early return
+        "text2text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        device=device,
+    )
+    print("FLAN-T5-Large loaded.")
+    return _llm_pipeline
 # -----------------------------
 # -----------------------------
 def build_answer(query: str) -> str:
+    """Use the KB index + FLAN-T5-Large to build a natural-language answer."""
     results = kb_index.search(query, top_k=TOP_K)
     if not results:
         return (
             "- Improve the existing documentation for this topic."
         )
+    # Combine retrieved chunks into a single context
+    chunks, sources, _scores = zip(*[(c, s, sc) for (c, s, sc) in results])
+    context = "\n\n".join(chunks)
+    # Trim context a bit so it doesn't explode the token limit
+    # (FLAN-T5-Large handles a limited input length)
+    max_context_chars = 3000
+    if len(context) > max_context_chars:
+        context = context[:max_context_chars]
+    llm = get_llm()
     prompt = (
         "You are a helpful knowledge base assistant. "
+        "Using only the information in the context below, answer the user's question in a clear, natural, and friendly way. "
+        "If the answer is not fully covered by the context, say so honestly.\n\n"
+        f"Context:\n{context}\n\n"
+        f"Question: {query}\n\n"
+        "Answer:"
     )
+    try:
+        result = llm(
+            prompt,
+            max_new_tokens=256,
+            num_return_sequences=1,
+        )
+        answer_text = result[0]["generated_text"].strip()
+    except Exception as e:
+        print(f"LLM generation error: {e}")
+        # Fallback: still show something useful instead of crashing
+        answer_text = (
+            "I had trouble generating a summarized answer from the knowledge base just now. "
+            "Here are some relevant excerpts instead:\n\n" + context
+        )
+    # Optionally add a subtle note about sources (file names)
+    unique_sources = sorted(set(sources))
+    if unique_sources:
+        answer_text += "\n\n— Based on information from: " + ", ".join(unique_sources)
+    return answer_text
 def chat_respond(message: str, history):  # history is accepted but never read in this body
     """
     Gradio ChatInterface (type='messages') calls this with:
       - message: latest user message (str)
+      - history: list of previous messages (handled by Gradio)
     We only need to return the assistant's reply as a string.
     """
+    answer = build_answer(message)  # only the latest message is forwarded; build_answer is annotated to return str
+    return answer
 # -----------------------------
 Ask questions as if you were talking to a knowledge base assistant.
 In a real scenario, this assistant would be connected to your own
 help center or internal documentation. Here, it's using a small demo
+knowledge base to show how retrieval-based self-service can work.
 """
 chat = gr.ChatInterface(
         "How could a KB assistant help agents?",
         "Why is self-service important for customer support?",
     ],
+    cache_examples=False,  # avoid example pre-caching issues on HF Spaces
 )