sofzcc committed on
Commit
a68912a
·
verified ·
1 Parent(s): dac13d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -17
app.py CHANGED
@@ -5,16 +5,19 @@ from typing import List, Tuple
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
 
8
 
9
 
10
  # -----------------------------
11
  # CONFIG
12
  # -----------------------------
13
- KB_DIR = "./kb" # optional: folder with .txt or .md files
14
  EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
15
- TOP_K = 3 # how many chunks to show per answer
16
- CHUNK_SIZE = 500 # characters
17
- CHUNK_OVERLAP = 100 # characters
 
 
18
 
19
 
20
  # -----------------------------
@@ -92,7 +95,7 @@ class KBIndex:
92
  def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
93
  print("Loading embedding model...")
94
  self.model = SentenceTransformer(model_name)
95
- print("Model loaded.")
96
  self.chunks: List[str] = []
97
  self.chunk_sources: List[str] = []
98
  self.embeddings: np.ndarray | None = None
@@ -150,12 +153,30 @@ class KBIndex:
150
  kb_index = KBIndex()
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # -----------------------------
154
  # CHAT LOGIC
155
  # -----------------------------
156
 
157
  def build_answer(query: str) -> str:
158
- """Use the KB index to build a human-readable answer."""
159
  results = kb_index.search(query, top_k=TOP_K)
160
  if not results:
161
  return (
@@ -165,16 +186,36 @@ def build_answer(query: str) -> str:
165
  "- Improve the existing documentation for this topic."
166
  )
167
 
168
- intro = "Here’s what I found in the knowledge base:\n"
169
- bullets = []
170
- for i, (chunk, source, score) in enumerate(results, start=1):
171
- bullets.append(f"{i}. From **{source}**:\n{chunk.strip()}\n")
172
 
173
- guidance = (
174
- "\nYou can ask follow-up questions, or try a more specific query if this doesn't fully answer your question."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  )
 
176
 
177
- return intro + "\n".join(bullets) + guidance
 
 
178
 
179
 
180
  def chat_respond(message: str, history):
@@ -185,8 +226,7 @@ def chat_respond(message: str, history):
185
 
186
  We only need to return the assistant's reply as a string.
187
  """
188
- answer = build_answer(message)
189
- return answer
190
 
191
 
192
  # -----------------------------
@@ -197,7 +237,7 @@ description = """
197
  Ask questions as if you were talking to a knowledge base assistant.
198
  In a real scenario, this assistant would be connected to your own
199
  help center or internal documentation. Here, it's using a small demo
200
- knowledge base to show how retrieval-based self-service can work.
201
  """
202
 
203
  chat = gr.ChatInterface(
@@ -210,7 +250,7 @@ chat = gr.ChatInterface(
210
  "How could a KB assistant help agents?",
211
  "Why is self-service important for customer support?",
212
  ],
213
- cache_examples=False, # avoid example pre-caching issues on HF Spaces
214
  )
215
 
216
 
 
5
  import gradio as gr
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
 
10
 
11
# -----------------------------
# CONFIG
# -----------------------------
# Knowledge-base source directory and model identifiers, plus the
# retrieval/chunking knobs used throughout the app.
KB_DIR = "./kb"  # folder with .txt or .md files
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "google/flan-t5-large"

TOP_K = 3  # how many chunks to use per answer
CHUNK_SIZE = 500  # characters
CHUNK_OVERLAP = 100  # characters
21
 
22
 
23
  # -----------------------------
 
95
  def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
96
  print("Loading embedding model...")
97
  self.model = SentenceTransformer(model_name)
98
+ print("Embedding model loaded.")
99
  self.chunks: List[str] = []
100
  self.chunk_sources: List[str] = []
101
  self.embeddings: np.ndarray | None = None
 
153
  kb_index = KBIndex()
154
 
155
 
156
# -----------------------------
# LLM (FLAN-T5-LARGE) – LAZY LOAD
# -----------------------------

# Module-level cache for the generation model; both stay None until the
# first call to get_llm() so app startup doesn't pay the download cost.
_llm_tokenizer = None
_llm_model = None


def get_llm():
    """Return the (tokenizer, model) pair, loading FLAN-T5-Large on first use.

    Subsequent calls reuse the cached module-level objects instead of
    re-downloading or re-instantiating the model.
    """
    global _llm_tokenizer, _llm_model
    if _llm_model is None or _llm_tokenizer is None:
        print("Loading FLAN-T5-Large...")
        _llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
        _llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
        print("FLAN-T5-Large loaded.")
    return _llm_tokenizer, _llm_model
172
+
173
+
174
  # -----------------------------
175
  # CHAT LOGIC
176
  # -----------------------------
177
 
178
  def build_answer(query: str) -> str:
179
+ """Use the KB index + FLAN-T5 to build a natural, human-sounding answer."""
180
  results = kb_index.search(query, top_k=TOP_K)
181
  if not results:
182
  return (
 
186
  "- Improve the existing documentation for this topic."
187
  )
188
 
189
+ # Collect contexts (just the text, ignore filenames in the answer)
190
+ contexts = [chunk for (chunk, _source, _score) in results]
 
 
191
 
192
+ tokenizer, model = get_llm()
193
+
194
+ # Build a prompt for FLAN-T5
195
+ context_block = "\n\n---\n\n".join(contexts[:TOP_K])
196
+
197
+ prompt = (
198
+ "You are a helpful knowledge base assistant. "
199
+ "Using ONLY the information in the context below, answer the user's question "
200
+ "in a clear, concise, and human, conversational tone. "
201
+ "Do not list file names or raw chunks; write a smooth answer. "
202
+ "If something is not covered in the context, say that you don't have that information.\n\n"
203
+ f"QUESTION: {query}\n\n"
204
+ f"CONTEXT:\n{context_block}\n"
205
+ )
206
+
207
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
208
+ outputs = model.generate(
209
+ **inputs,
210
+ max_length=256,
211
+ num_beams=4,
212
+ early_stopping=True,
213
  )
214
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
215
 
216
+ # Small post-touch to avoid the answer looking too abrupt
217
+ answer = answer.strip()
218
+ return answer
219
 
220
 
221
  def chat_respond(message: str, history):
 
226
 
227
  We only need to return the assistant's reply as a string.
228
  """
229
+ return build_answer(message)
 
230
 
231
 
232
  # -----------------------------
 
237
  Ask questions as if you were talking to a knowledge base assistant.
238
  In a real scenario, this assistant would be connected to your own
239
  help center or internal documentation. Here, it's using a small demo
240
+ knowledge base to show how retrieval-augmented self-service can work.
241
  """
242
 
243
  chat = gr.ChatInterface(
 
250
  "How could a KB assistant help agents?",
251
  "Why is self-service important for customer support?",
252
  ],
253
+ cache_examples=False, # avoids example caching issues on HF Spaces
254
  )
255
 
256