Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -339,7 +339,7 @@ class RAGIndex:
|
|
| 339 |
self.chunks = all_chunks
|
| 340 |
self.chunk_sources = all_sources
|
| 341 |
|
| 342 |
-
def retrieve(self, query: str, top_k: int =
|
| 343 |
"""Retrieve relevant chunks for a query"""
|
| 344 |
if not query or not query.strip():
|
| 345 |
return []
|
|
@@ -418,7 +418,7 @@ class RAGIndex:
|
|
| 418 |
return answer
|
| 419 |
|
| 420 |
def answer(self, question: str) -> str:
|
| 421 |
-
"""Answer a question using RAG with
|
| 422 |
if not self.initialized:
|
| 423 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 424 |
|
|
@@ -432,8 +432,10 @@ class RAGIndex:
|
|
| 432 |
f"Supported formats: .txt, .md, .pdf, .docx"
|
| 433 |
)
|
| 434 |
|
| 435 |
-
#
|
| 436 |
-
contexts
|
|
|
|
|
|
|
| 437 |
|
| 438 |
if not contexts:
|
| 439 |
return (
|
|
@@ -441,58 +443,124 @@ class RAGIndex:
|
|
| 441 |
f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
|
| 442 |
)
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
used_sources = {best_source}
|
| 447 |
|
| 448 |
-
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
return (
|
| 451 |
f"{NO_ANSWER_MSG}\n\n"
|
| 452 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 453 |
)
|
| 454 |
|
| 455 |
-
#
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
}
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
if
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
answer_text = cleaned_ctx[:400].strip()
|
| 493 |
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
answer_text = NO_ANSWER_MSG
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
|
| 497 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 498 |
|
|
|
|
| 339 |
self.chunks = all_chunks
|
| 340 |
self.chunk_sources = all_sources
|
| 341 |
|
| 342 |
+
def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
|
| 343 |
"""Retrieve relevant chunks for a query"""
|
| 344 |
if not query or not query.strip():
|
| 345 |
return []
|
|
|
|
| 418 |
return answer
|
| 419 |
|
| 420 |
def answer(self, question: str) -> str:
|
| 421 |
+
"""Answer a question using RAG with sentence-level semantic selection."""
|
| 422 |
if not self.initialized:
|
| 423 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 424 |
|
|
|
|
| 432 |
f"Supported formats: .txt, .md, .pdf, .docx"
|
| 433 |
)
|
| 434 |
|
| 435 |
+
# -----------------------------
|
| 436 |
+
# 1) Retrieve relevant contexts (top-3)
|
| 437 |
+
# -----------------------------
|
| 438 |
+
contexts = self.retrieve(question, top_k=3)
|
| 439 |
|
| 440 |
if not contexts:
|
| 441 |
return (
|
|
|
|
| 443 |
f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
|
| 444 |
)
|
| 445 |
|
| 446 |
+
used_sources = set()
|
| 447 |
+
all_sentences = []
|
|
|
|
| 448 |
|
| 449 |
+
# -----------------------------
|
| 450 |
+
# 2) Collect & clean sentences from the retrieved chunks
|
| 451 |
+
# -----------------------------
|
| 452 |
+
for ctx, source, score in contexts:
|
| 453 |
+
used_sources.add(source)
|
| 454 |
+
cleaned_ctx = clean_context_text(ctx)
|
| 455 |
+
if not cleaned_ctx:
|
| 456 |
+
continue
|
| 457 |
+
|
| 458 |
+
# Split into sentences (simple regex: ., !, ? or line breaks)
|
| 459 |
+
raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
|
| 460 |
+
for s in raw_sents:
|
| 461 |
+
s_clean = s.strip()
|
| 462 |
+
# Skip fragments shorter than 25 characters (too short to serve as an answer sentence)
|
| 463 |
+
if len(s_clean) < 25:
|
| 464 |
+
continue
|
| 465 |
+
all_sentences.append((s_clean, source))
|
| 466 |
+
|
| 467 |
+
if not all_sentences:
|
| 468 |
return (
|
| 469 |
f"{NO_ANSWER_MSG}\n\n"
|
| 470 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 471 |
)
|
| 472 |
|
| 473 |
+
# -----------------------------
|
| 474 |
+
# 3) Topic-aware filtering based on the question
|
| 475 |
+
# -----------------------------
|
| 476 |
+
q_lower = question.lower()
|
| 477 |
+
|
| 478 |
+
topic_keywords = {
|
| 479 |
+
"structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
|
| 480 |
+
"maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
|
| 481 |
+
"quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
|
| 482 |
+
"gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
|
| 483 |
+
"definition": {"what is", "define", "definition"},
|
| 484 |
}
|
| 485 |
|
| 486 |
+
active_topics = set()
|
| 487 |
+
|
| 488 |
+
if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
|
| 489 |
+
active_topics.add("structure")
|
| 490 |
+
if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
|
| 491 |
+
active_topics.add("maintenance")
|
| 492 |
+
if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
|
| 493 |
+
active_topics.add("quality")
|
| 494 |
+
if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
|
| 495 |
+
active_topics.add("gaps")
|
| 496 |
+
if any(k in q_lower for k in ["what is", "define", "definition"]):
|
| 497 |
+
active_topics.add("definition")
|
| 498 |
+
|
| 499 |
+
# Keep only sentences matching an active topic; if no topic was detected or nothing matched, the fallback below keeps all sentences
|
| 500 |
+
filtered_sentences = []
|
| 501 |
+
if active_topics:
|
| 502 |
+
# Collect all keywords from active topics
|
| 503 |
+
active_kw = set()
|
| 504 |
+
for t in active_topics:
|
| 505 |
+
active_kw |= topic_keywords.get(t, set())
|
| 506 |
+
|
| 507 |
+
for sent, source in all_sentences:
|
| 508 |
+
s_lower = sent.lower()
|
| 509 |
+
if any(kw in s_lower for kw in active_kw):
|
| 510 |
+
filtered_sentences.append((sent, source))
|
| 511 |
+
|
| 512 |
+
# Fallback to all sentences if filtering removed everything
|
| 513 |
+
if not filtered_sentences:
|
| 514 |
+
filtered_sentences = all_sentences
|
| 515 |
+
|
| 516 |
+
# Keep only the sentence text for embedding
|
| 517 |
+
candidate_sents = [s for s, _ in filtered_sentences]
|
| 518 |
+
|
| 519 |
+
# -----------------------------
|
| 520 |
+
# 4) Semantic scoring with SentenceTransformer
|
| 521 |
+
# -----------------------------
|
| 522 |
+
try:
|
| 523 |
+
q_emb = self.embedder.encode([question], convert_to_numpy=True)
|
| 524 |
+
sent_embs = self.embedder.encode(candidate_sents, convert_to_numpy=True)
|
| 525 |
|
| 526 |
+
# Normalize for cosine similarity
|
| 527 |
+
faiss.normalize_L2(q_emb)
|
| 528 |
+
faiss.normalize_L2(sent_embs)
|
|
|
|
| 529 |
|
| 530 |
+
# Cosine similarity = dot product after normalization
|
| 531 |
+
sims = np.dot(sent_embs, q_emb.T).reshape(-1)
|
| 532 |
+
except Exception as e:
|
| 533 |
+
print(f"Sentence embedding error, falling back to lexical scoring: {e}")
|
| 534 |
+
# Lexical fallback: score each sentence by the number of words (longer than 3 characters) it shares with the question
|
| 535 |
+
q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
|
| 536 |
+
sims = []
|
| 537 |
+
for sent in candidate_sents:
|
| 538 |
+
s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
|
| 539 |
+
overlap = len(q_words & s_words)
|
| 540 |
+
sims.append(float(overlap))
|
| 541 |
+
sims = np.array(sims, dtype=float)
|
| 542 |
+
|
| 543 |
+
# -----------------------------
|
| 544 |
+
# 5) Pick top-N sentences & compose answer
|
| 545 |
+
# -----------------------------
|
| 546 |
+
if len(sims) == 0:
|
| 547 |
answer_text = NO_ANSWER_MSG
|
| 548 |
+
else:
|
| 549 |
+
# Indices sorted by similarity descending
|
| 550 |
+
top_idx = np.argsort(-sims)
|
| 551 |
+
top_k = min(3, len(top_idx)) # use up to 3 sentences
|
| 552 |
+
chosen = []
|
| 553 |
+
|
| 554 |
+
for i in top_idx[:top_k]:
|
| 555 |
+
s = candidate_sents[i].strip()
|
| 556 |
+
if s and s not in chosen:
|
| 557 |
+
chosen.append(s)
|
| 558 |
+
|
| 559 |
+
if not chosen:
|
| 560 |
+
answer_text = NO_ANSWER_MSG
|
| 561 |
+
else:
|
| 562 |
+
# Join with spaces, ensure it reads like a paragraph
|
| 563 |
+
answer_text = " ".join(chosen)
|
| 564 |
|
| 565 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 566 |
|