Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -418,7 +418,12 @@ class RAGIndex:
|
|
| 418 |
return answer
|
| 419 |
|
| 420 |
def answer(self, question: str) -> str:
|
| 421 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
if not self.initialized:
|
| 423 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 424 |
|
|
@@ -433,9 +438,9 @@ class RAGIndex:
|
|
| 433 |
)
|
| 434 |
|
| 435 |
# -----------------------------
|
| 436 |
-
# 1) Retrieve
|
| 437 |
# -----------------------------
|
| 438 |
-
contexts = self.retrieve(question, top_k=
|
| 439 |
|
| 440 |
if not contexts:
|
| 441 |
return (
|
|
@@ -444,124 +449,108 @@ class RAGIndex:
|
|
| 444 |
)
|
| 445 |
|
| 446 |
used_sources = set()
|
| 447 |
-
|
|
|
|
| 448 |
|
| 449 |
# -----------------------------
|
| 450 |
-
# 2)
|
| 451 |
# -----------------------------
|
| 452 |
for ctx, source, score in contexts:
|
| 453 |
used_sources.add(source)
|
|
|
|
| 454 |
cleaned_ctx = clean_context_text(ctx)
|
| 455 |
if not cleaned_ctx:
|
| 456 |
continue
|
| 457 |
|
| 458 |
-
#
|
| 459 |
raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
|
|
|
|
| 460 |
for s in raw_sents:
|
| 461 |
s_clean = s.strip()
|
| 462 |
-
#
|
| 463 |
if len(s_clean) < 25:
|
| 464 |
continue
|
| 465 |
-
all_sentences.append((s_clean, source))
|
| 466 |
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
| 468 |
return (
|
| 469 |
f"{NO_ANSWER_MSG}\n\n"
|
| 470 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 471 |
)
|
| 472 |
|
| 473 |
# -----------------------------
|
| 474 |
-
# 3)
|
| 475 |
-
# -----------------------------
|
| 476 |
-
q_lower = question.lower()
|
| 477 |
-
|
| 478 |
-
topic_keywords = {
|
| 479 |
-
"structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
|
| 480 |
-
"maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
|
| 481 |
-
"quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
|
| 482 |
-
"gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
|
| 483 |
-
"definition": {"what is", "define", "definition"},
|
| 484 |
-
}
|
| 485 |
-
|
| 486 |
-
active_topics = set()
|
| 487 |
-
|
| 488 |
-
if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
|
| 489 |
-
active_topics.add("structure")
|
| 490 |
-
if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
|
| 491 |
-
active_topics.add("maintenance")
|
| 492 |
-
if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
|
| 493 |
-
active_topics.add("quality")
|
| 494 |
-
if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
|
| 495 |
-
active_topics.add("gaps")
|
| 496 |
-
if any(k in q_lower for k in ["what is", "define", "definition"]):
|
| 497 |
-
active_topics.add("definition")
|
| 498 |
-
|
| 499 |
-
# If no explicit topic detected, we keep all sentences as candidates
|
| 500 |
-
filtered_sentences = []
|
| 501 |
-
if active_topics:
|
| 502 |
-
# Collect all keywords from active topics
|
| 503 |
-
active_kw = set()
|
| 504 |
-
for t in active_topics:
|
| 505 |
-
active_kw |= topic_keywords.get(t, set())
|
| 506 |
-
|
| 507 |
-
for sent, source in all_sentences:
|
| 508 |
-
s_lower = sent.lower()
|
| 509 |
-
if any(kw in s_lower for kw in active_kw):
|
| 510 |
-
filtered_sentences.append((sent, source))
|
| 511 |
-
|
| 512 |
-
# Fallback to all sentences if filtering removed everything
|
| 513 |
-
if not filtered_sentences:
|
| 514 |
-
filtered_sentences = all_sentences
|
| 515 |
-
|
| 516 |
-
# Keep only the sentence text for embedding
|
| 517 |
-
candidate_sents = [s for s, _ in filtered_sentences]
|
| 518 |
-
|
| 519 |
-
# -----------------------------
|
| 520 |
-
# 4) Semantic scoring with SentenceTransformer
|
| 521 |
# -----------------------------
|
| 522 |
try:
|
|
|
|
| 523 |
q_emb = self.embedder.encode([question], convert_to_numpy=True)
|
| 524 |
-
|
| 525 |
|
| 526 |
-
# Normalize for cosine similarity
|
| 527 |
faiss.normalize_L2(q_emb)
|
| 528 |
-
faiss.normalize_L2(
|
| 529 |
|
| 530 |
-
|
| 531 |
-
sims = np.dot(sent_embs, q_emb.T).reshape(-1)
|
| 532 |
except Exception as e:
|
| 533 |
-
print(f"Sentence embedding error, falling back to lexical scoring: {e}")
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
# -----------------------------
|
| 544 |
-
#
|
| 545 |
# -----------------------------
|
| 546 |
-
if len(
|
| 547 |
answer_text = NO_ANSWER_MSG
|
| 548 |
else:
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
for i in top_idx
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
-
if not
|
| 560 |
answer_text = NO_ANSWER_MSG
|
| 561 |
else:
|
| 562 |
-
|
| 563 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
|
|
|
|
| 565 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 566 |
|
| 567 |
return (
|
|
@@ -570,6 +559,7 @@ class RAGIndex:
|
|
| 570 |
)
|
| 571 |
|
| 572 |
|
|
|
|
| 573 |
# Initialize RAG system
|
| 574 |
print("=" * 50)
|
| 575 |
rag_index = RAGIndex()
|
|
|
|
| 418 |
return answer
|
| 419 |
|
| 420 |
def answer(self, question: str) -> str:
|
| 421 |
+
"""
|
| 422 |
+
Answer a question using RAG with sentence-level semantic selection
|
| 423 |
+
and a generic seq2seq model (Flan-T5, BART, etc.).
|
| 424 |
+
This function is fully stateless per call: it only uses the question
|
| 425 |
+
and the indexed knowledge base, never previous answers.
|
| 426 |
+
"""
|
| 427 |
if not self.initialized:
|
| 428 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 429 |
|
|
|
|
| 438 |
)
|
| 439 |
|
| 440 |
# -----------------------------
|
| 441 |
+
# 1) Retrieve top-K chunks for this question
|
| 442 |
# -----------------------------
|
| 443 |
+
contexts = self.retrieve(question, top_k=5)
|
| 444 |
|
| 445 |
if not contexts:
|
| 446 |
return (
|
|
|
|
| 449 |
)
|
| 450 |
|
| 451 |
used_sources = set()
|
| 452 |
+
candidate_sentences = []
|
| 453 |
+
candidate_sources = []
|
| 454 |
|
| 455 |
# -----------------------------
|
| 456 |
+
# 2) Split retrieved chunks into sentences (generic, no KB-specific logic)
|
| 457 |
# -----------------------------
|
| 458 |
for ctx, source, score in contexts:
|
| 459 |
used_sources.add(source)
|
| 460 |
+
|
| 461 |
cleaned_ctx = clean_context_text(ctx)
|
| 462 |
if not cleaned_ctx:
|
| 463 |
continue
|
| 464 |
|
| 465 |
+
# Simple sentence splitter: split on ., ?, ! plus newlines
|
| 466 |
raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
|
| 467 |
+
|
| 468 |
for s in raw_sents:
|
| 469 |
s_clean = s.strip()
|
| 470 |
+
# skip very short sentences
|
| 471 |
if len(s_clean) < 25:
|
| 472 |
continue
|
|
|
|
| 473 |
|
| 474 |
+
candidate_sentences.append(s_clean)
|
| 475 |
+
candidate_sources.append(source)
|
| 476 |
+
|
| 477 |
+
if not candidate_sentences:
|
| 478 |
return (
|
| 479 |
f"{NO_ANSWER_MSG}\n\n"
|
| 480 |
f"💡 Try adding more detailed documents to the knowledge base."
|
| 481 |
)
|
| 482 |
|
| 483 |
# -----------------------------
|
| 484 |
+
# 3) Score sentences: semantic + lexical (generic)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
# -----------------------------
|
| 486 |
try:
|
| 487 |
+
# Semantic similarity via sentence embeddings
|
| 488 |
q_emb = self.embedder.encode([question], convert_to_numpy=True)
|
| 489 |
+
s_embs = self.embedder.encode(candidate_sentences, convert_to_numpy=True)
|
| 490 |
|
|
|
|
| 491 |
faiss.normalize_L2(q_emb)
|
| 492 |
+
faiss.normalize_L2(s_embs)
|
| 493 |
|
| 494 |
+
sims = np.dot(s_embs, q_emb.T).reshape(-1) # cosine similarity
|
|
|
|
| 495 |
except Exception as e:
|
| 496 |
+
print(f"Sentence embedding error, falling back to lexical scoring only: {e}")
|
| 497 |
+
sims = np.zeros(len(candidate_sentences), dtype=float)
|
| 498 |
+
|
| 499 |
+
# Lexical overlap (shared content words)
|
| 500 |
+
q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
|
| 501 |
+
lex_scores = []
|
| 502 |
+
for sent in candidate_sentences:
|
| 503 |
+
s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
|
| 504 |
+
lex_scores.append(len(q_words & s_words))
|
| 505 |
+
lex_scores = np.array(lex_scores, dtype=float)
|
| 506 |
+
|
| 507 |
+
# Combine scores in a generic way: semantic + a bit of lexical
|
| 508 |
+
combined = (1.5 * sims) + (0.5 * lex_scores)
|
| 509 |
|
| 510 |
# -----------------------------
|
| 511 |
+
# 4) Pick top-N sentences to form the context
|
| 512 |
# -----------------------------
|
| 513 |
+
if len(combined) == 0:
|
| 514 |
answer_text = NO_ANSWER_MSG
|
| 515 |
else:
|
| 516 |
+
top_idx = np.argsort(-combined)
|
| 517 |
+
max_sentences = 5 # you can tune this
|
| 518 |
+
chosen_sentences = []
|
| 519 |
+
chosen_sources = set()
|
| 520 |
+
|
| 521 |
+
for i in top_idx:
|
| 522 |
+
if len(chosen_sentences) >= max_sentences:
|
| 523 |
+
break
|
| 524 |
+
s = candidate_sentences[i].strip()
|
| 525 |
+
if not s:
|
| 526 |
+
continue
|
| 527 |
+
if s in chosen_sentences:
|
| 528 |
+
continue # avoid duplicates
|
| 529 |
+
chosen_sentences.append(s)
|
| 530 |
+
chosen_sources.add(candidate_sources[i])
|
| 531 |
|
| 532 |
+
if not chosen_sentences:
|
| 533 |
answer_text = NO_ANSWER_MSG
|
| 534 |
else:
|
| 535 |
+
context_for_llm = "\n".join(chosen_sentences)
|
| 536 |
+
|
| 537 |
+
# -----------------------------
|
| 538 |
+
# 5) Let the seq2seq model generate a natural answer
|
| 539 |
+
# -----------------------------
|
| 540 |
+
try:
|
| 541 |
+
answer_text = self._generate_from_context(
|
| 542 |
+
question=question,
|
| 543 |
+
context=context_for_llm,
|
| 544 |
+
max_new_tokens=200,
|
| 545 |
+
).strip()
|
| 546 |
+
except Exception as e:
|
| 547 |
+
print(f"Generation error, falling back to extractive answer: {e}")
|
| 548 |
+
answer_text = " ".join(chosen_sentences)
|
| 549 |
+
|
| 550 |
+
if not answer_text:
|
| 551 |
+
answer_text = NO_ANSWER_MSG
|
| 552 |
|
| 553 |
+
# Track sources from retrieved chunks (or from chosen sentences if you prefer)
|
| 554 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 555 |
|
| 556 |
return (
|
|
|
|
| 559 |
)
|
| 560 |
|
| 561 |
|
| 562 |
+
|
| 563 |
# Initialize RAG system
|
| 564 |
print("=" * 50)
|
| 565 |
rag_index = RAGIndex()
|