sofzcc committed on
Commit
9207c22
·
verified ·
1 Parent(s): 4ee6d34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -25
app.py CHANGED
@@ -173,10 +173,11 @@ def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
173
 
174
  def clean_context_text(text: str) -> str:
175
  """
176
- Clean raw document context before sending to the generator:
177
  - Remove markdown headings (#, ##, ###)
178
  - Remove list markers (1., 2), -, *)
179
  - Remove duplicate lines
 
180
  """
181
  lines = text.splitlines()
182
  cleaned = []
@@ -200,6 +201,14 @@ def clean_context_text(text: str) -> str:
200
  if len(l) < 5:
201
  continue
202
 
 
 
 
 
 
 
 
 
203
  # Avoid exact duplicates
204
  if l in seen:
205
  continue
@@ -409,7 +418,7 @@ class RAGIndex:
409
  return answer
410
 
411
  def answer(self, question: str) -> str:
412
- """Answer a question using RAG with a simple extractive approach."""
413
  if not self.initialized:
414
  return "❌ Assistant not properly initialized. Please check the logs."
415
 
@@ -432,33 +441,24 @@ class RAGIndex:
432
  f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
433
  )
434
 
435
- used_sources = set()
436
- context_texts = []
 
437
 
438
- # Clean and collect the retrieved chunks
439
- for ctx, source, score in contexts:
440
- used_sources.add(source)
441
- cleaned_ctx = clean_context_text(ctx)
442
- if cleaned_ctx:
443
- context_texts.append(cleaned_ctx)
444
-
445
- if not context_texts:
446
  return (
447
  f"{NO_ANSWER_MSG}\n\n"
448
  f"💡 Try adding more detailed documents to the knowledge base."
449
  )
450
 
451
- # 2) Combine contexts into a single evidence block
452
- combined_context = "\n\n".join(context_texts)
453
-
454
- # Keep context at a reasonable size
455
- max_context_chars = 3000
456
- if len(combined_context) > max_context_chars:
457
- combined_context = combined_context[:max_context_chars]
458
 
459
- # 3) Sentence-level relevance scoring
460
- # We pick the sentences that best match the question terms
461
- raw_sentences = re.split(r'(?<=[.!?])\s+', combined_context)
462
  question_words = {
463
  w.lower()
464
  for w in re.findall(r"\w+", question)
@@ -482,14 +482,14 @@ class RAGIndex:
482
  # Take the best 2–3 sentences that have some overlap
483
  top = [s for score, s in scored_sentences if score > 0][:3]
484
 
485
- # If none have overlap (e.g., very vague question), just take the top 2 sentences overall
486
  if not top:
487
  top = [s for _, s in scored_sentences[:2]]
488
 
489
  answer_text = " ".join(top)
490
  else:
491
- # Fallback: just take a slice of the combined context
492
- answer_text = combined_context[:400].strip()
493
 
494
  if not answer_text:
495
  answer_text = NO_ANSWER_MSG
 
173
 
174
  def clean_context_text(text: str) -> str:
175
  """
176
+ Clean raw document context before sending to the answer builder:
177
  - Remove markdown headings (#, ##, ###)
178
  - Remove list markers (1., 2), -, *)
179
  - Remove duplicate lines
180
+ - Remove title-like lines (e.g. 'Knowledge Base Structure & Information Architecture Best Practices')
181
  """
182
  lines = text.splitlines()
183
  cleaned = []
 
201
  if len(l) < 5:
202
  continue
203
 
204
+ # Heuristic: skip "title-like" lines where almost every word is capitalized
205
+ words = l.split()
206
+ if words:
207
+ cap_words = sum(1 for w in words if w[:1].isupper())
208
+ if len(words) <= 10 and cap_words >= len(words) - 1:
209
+ # Looks like a heading / title, skip it
210
+ continue
211
+
212
  # Avoid exact duplicates
213
  if l in seen:
214
  continue
 
418
  return answer
419
 
420
  def answer(self, question: str) -> str:
421
+ """Answer a question using RAG with a simple extractive approach from the best chunk only."""
422
  if not self.initialized:
423
  return "❌ Assistant not properly initialized. Please check the logs."
424
 
 
441
  f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
442
  )
443
 
444
+ # Use ONLY the single best scoring context (top-1)
445
+ best_ctx, best_source, best_score = contexts[0]
446
+ used_sources = {best_source}
447
 
448
+ cleaned_ctx = clean_context_text(best_ctx)
449
+ if not cleaned_ctx:
 
 
 
 
 
 
450
  return (
451
  f"{NO_ANSWER_MSG}\n\n"
452
  f"💡 Try adding more detailed documents to the knowledge base."
453
  )
454
 
455
+ # 2) Limit context size just in case
456
+ max_context_chars = 1500
457
+ if len(cleaned_ctx) > max_context_chars:
458
+ cleaned_ctx = cleaned_ctx[:max_context_chars]
 
 
 
459
 
460
+ # 3) Sentence-level relevance scoring within this single chunk
461
+ raw_sentences = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
 
462
  question_words = {
463
  w.lower()
464
  for w in re.findall(r"\w+", question)
 
482
  # Take the best 2–3 sentences that have some overlap
483
  top = [s for score, s in scored_sentences if score > 0][:3]
484
 
485
+ # If none have overlap (e.g., vague question), just take the top 2 sentences overall
486
  if not top:
487
  top = [s for _, s in scored_sentences[:2]]
488
 
489
  answer_text = " ".join(top)
490
  else:
491
+ # Fallback: just take a slice of the cleaned context
492
+ answer_text = cleaned_ctx[:400].strip()
493
 
494
  if not answer_text:
495
  answer_text = NO_ANSWER_MSG