Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -173,10 +173,11 @@ def load_kb_documents(kb_dir: str) -> List[Tuple[str, str]]:
|
|
| 173 |
|
| 174 |
def clean_context_text(text: str) -> str:
|
| 175 |
"""
|
| 176 |
-
Clean raw document context before sending to the
|
| 177 |
- Remove markdown headings (#, ##, ###)
|
| 178 |
- Remove list markers (1., 2), -, *)
|
| 179 |
- Remove duplicate lines
|
|
|
|
| 180 |
"""
|
| 181 |
lines = text.splitlines()
|
| 182 |
cleaned = []
|
|
@@ -200,6 +201,14 @@ def clean_context_text(text: str) -> str:
|
|
| 200 |
if len(l) < 5:
|
| 201 |
continue
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# Avoid exact duplicates
|
| 204 |
if l in seen:
|
| 205 |
continue
|
|
@@ -409,7 +418,7 @@ class RAGIndex:
|
|
| 409 |
return answer
|
| 410 |
|
| 411 |
def answer(self, question: str) -> str:
|
| 412 |
-
"""Answer a question using RAG with a simple extractive approach."""
|
| 413 |
if not self.initialized:
|
| 414 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 415 |
|
|
@@ -432,33 +441,24 @@ class RAGIndex:
|
|
| 432 |
f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
|
| 433 |
)
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
|
|
|
| 437 |
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
used_sources.add(source)
|
| 441 |
-
cleaned_ctx = clean_context_text(ctx)
|
| 442 |
-
if cleaned_ctx:
|
| 443 |
-
context_texts.append(cleaned_ctx)
|
| 444 |
-
|
| 445 |
-
if not context_texts:
|
| 446 |
return (
|
| 447 |
f"{NO_ANSWER_MSG}\n\n"
|
| 448 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 449 |
)
|
| 450 |
|
| 451 |
-
# 2)
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
max_context_chars = 3000
|
| 456 |
-
if len(combined_context) > max_context_chars:
|
| 457 |
-
combined_context = combined_context[:max_context_chars]
|
| 458 |
|
| 459 |
-
# 3) Sentence-level relevance scoring
|
| 460 |
-
|
| 461 |
-
raw_sentences = re.split(r'(?<=[.!?])\s+', combined_context)
|
| 462 |
question_words = {
|
| 463 |
w.lower()
|
| 464 |
for w in re.findall(r"\w+", question)
|
|
@@ -482,14 +482,14 @@ class RAGIndex:
|
|
| 482 |
# Take the best 2–3 sentences that have some overlap
|
| 483 |
top = [s for score, s in scored_sentences if score > 0][:3]
|
| 484 |
|
| 485 |
-
# If none have overlap (e.g.,
|
| 486 |
if not top:
|
| 487 |
top = [s for _, s in scored_sentences[:2]]
|
| 488 |
|
| 489 |
answer_text = " ".join(top)
|
| 490 |
else:
|
| 491 |
-
# Fallback: just take a slice of the
|
| 492 |
-
answer_text =
|
| 493 |
|
| 494 |
if not answer_text:
|
| 495 |
answer_text = NO_ANSWER_MSG
|
|
|
|
| 173 |
|
| 174 |
def clean_context_text(text: str) -> str:
|
| 175 |
"""
|
| 176 |
+
Clean raw document context before sending to the answer builder:
|
| 177 |
- Remove markdown headings (#, ##, ###)
|
| 178 |
- Remove list markers (1., 2), -, *)
|
| 179 |
- Remove duplicate lines
|
| 180 |
+
- Remove title-like lines (e.g. 'Knowledge Base Structure & Information Architecture Best Practices')
|
| 181 |
"""
|
| 182 |
lines = text.splitlines()
|
| 183 |
cleaned = []
|
|
|
|
| 201 |
if len(l) < 5:
|
| 202 |
continue
|
| 203 |
|
| 204 |
+
# Heuristic: skip "title-like" lines where almost every word is capitalized
|
| 205 |
+
words = l.split()
|
| 206 |
+
if words:
|
| 207 |
+
cap_words = sum(1 for w in words if w[:1].isupper())
|
| 208 |
+
if len(words) <= 10 and cap_words >= len(words) - 1:
|
| 209 |
+
# Looks like a heading / title, skip it
|
| 210 |
+
continue
|
| 211 |
+
|
| 212 |
# Avoid exact duplicates
|
| 213 |
if l in seen:
|
| 214 |
continue
|
|
|
|
| 418 |
return answer
|
| 419 |
|
| 420 |
def answer(self, question: str) -> str:
|
| 421 |
+
"""Answer a question using RAG with a simple extractive approach from the best chunk only."""
|
| 422 |
if not self.initialized:
|
| 423 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 424 |
|
|
|
|
| 441 |
f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
|
| 442 |
)
|
| 443 |
|
| 444 |
+
# Use ONLY the single best scoring context (top-1)
|
| 445 |
+
best_ctx, best_source, best_score = contexts[0]
|
| 446 |
+
used_sources = {best_source}
|
| 447 |
|
| 448 |
+
cleaned_ctx = clean_context_text(best_ctx)
|
| 449 |
+
if not cleaned_ctx:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
return (
|
| 451 |
f"{NO_ANSWER_MSG}\n\n"
|
| 452 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 453 |
)
|
| 454 |
|
| 455 |
+
# 2) Limit context size just in case
|
| 456 |
+
max_context_chars = 1500
|
| 457 |
+
if len(cleaned_ctx) > max_context_chars:
|
| 458 |
+
cleaned_ctx = cleaned_ctx[:max_context_chars]
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
+
# 3) Sentence-level relevance scoring within this single chunk
|
| 461 |
+
raw_sentences = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
|
|
|
|
| 462 |
question_words = {
|
| 463 |
w.lower()
|
| 464 |
for w in re.findall(r"\w+", question)
|
|
|
|
| 482 |
# Take the best 2–3 sentences that have some overlap
|
| 483 |
top = [s for score, s in scored_sentences if score > 0][:3]
|
| 484 |
|
| 485 |
+
# If none have overlap (e.g., vague question), just take the top 2 sentences overall
|
| 486 |
if not top:
|
| 487 |
top = [s for _, s in scored_sentences[:2]]
|
| 488 |
|
| 489 |
answer_text = " ".join(top)
|
| 490 |
else:
|
| 491 |
+
# Fallback: just take a slice of the cleaned context
|
| 492 |
+
answer_text = cleaned_ctx[:400].strip()
|
| 493 |
|
| 494 |
if not answer_text:
|
| 495 |
answer_text = NO_ANSWER_MSG
|