sofzcc committed on
Commit
c08571d
·
verified ·
1 Parent(s): 3155864

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -84
app.py CHANGED
@@ -418,7 +418,12 @@ class RAGIndex:
418
  return answer
419
 
420
  def answer(self, question: str) -> str:
421
- """Answer a question using RAG with sentence-level semantic selection."""
 
 
 
 
 
422
  if not self.initialized:
423
  return "❌ Assistant not properly initialized. Please check the logs."
424
 
@@ -433,9 +438,9 @@ class RAGIndex:
433
  )
434
 
435
  # -----------------------------
436
- # 1) Retrieve relevant contexts (top-3)
437
  # -----------------------------
438
- contexts = self.retrieve(question, top_k=3)
439
 
440
  if not contexts:
441
  return (
@@ -444,124 +449,108 @@ class RAGIndex:
444
  )
445
 
446
  used_sources = set()
447
- all_sentences = []
 
448
 
449
  # -----------------------------
450
- # 2) Collect & clean sentences from the retrieved chunks
451
  # -----------------------------
452
  for ctx, source, score in contexts:
453
  used_sources.add(source)
 
454
  cleaned_ctx = clean_context_text(ctx)
455
  if not cleaned_ctx:
456
  continue
457
 
458
- # Split into sentences (simple regex: ., !, ? or line breaks)
459
  raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
 
460
  for s in raw_sents:
461
  s_clean = s.strip()
462
- # Ignore very short or weirdly short lines
463
  if len(s_clean) < 25:
464
  continue
465
- all_sentences.append((s_clean, source))
466
 
467
- if not all_sentences:
 
 
 
468
  return (
469
  f"{NO_ANSWER_MSG}\n\n"
470
  f"💡 Try adding more detailed documents to the knowledge base."
471
  )
472
 
473
  # -----------------------------
474
- # 3) Topic-aware filtering based on the question
475
- # -----------------------------
476
- q_lower = question.lower()
477
-
478
- topic_keywords = {
479
- "structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
480
- "maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
481
- "quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
482
- "gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
483
- "definition": {"what is", "define", "definition"},
484
- }
485
-
486
- active_topics = set()
487
-
488
- if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
489
- active_topics.add("structure")
490
- if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
491
- active_topics.add("maintenance")
492
- if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
493
- active_topics.add("quality")
494
- if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
495
- active_topics.add("gaps")
496
- if any(k in q_lower for k in ["what is", "define", "definition"]):
497
- active_topics.add("definition")
498
-
499
- # If no explicit topic detected, we keep all sentences as candidates
500
- filtered_sentences = []
501
- if active_topics:
502
- # Collect all keywords from active topics
503
- active_kw = set()
504
- for t in active_topics:
505
- active_kw |= topic_keywords.get(t, set())
506
-
507
- for sent, source in all_sentences:
508
- s_lower = sent.lower()
509
- if any(kw in s_lower for kw in active_kw):
510
- filtered_sentences.append((sent, source))
511
-
512
- # Fallback to all sentences if filtering removed everything
513
- if not filtered_sentences:
514
- filtered_sentences = all_sentences
515
-
516
- # Keep only the sentence text for embedding
517
- candidate_sents = [s for s, _ in filtered_sentences]
518
-
519
- # -----------------------------
520
- # 4) Semantic scoring with SentenceTransformer
521
  # -----------------------------
522
  try:
 
523
  q_emb = self.embedder.encode([question], convert_to_numpy=True)
524
- sent_embs = self.embedder.encode(candidate_sents, convert_to_numpy=True)
525
 
526
- # Normalize for cosine similarity
527
  faiss.normalize_L2(q_emb)
528
- faiss.normalize_L2(sent_embs)
529
 
530
- # Cosine similarity = dot product after normalization
531
- sims = np.dot(sent_embs, q_emb.T).reshape(-1)
532
  except Exception as e:
533
- print(f"Sentence embedding error, falling back to lexical scoring: {e}")
534
- # Lexical fallback: overlap of content words
535
- q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
536
- sims = []
537
- for sent in candidate_sents:
538
- s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
539
- overlap = len(q_words & s_words)
540
- sims.append(float(overlap))
541
- sims = np.array(sims, dtype=float)
 
 
 
 
542
 
543
  # -----------------------------
544
- # 5) Pick top-N sentences & compose answer
545
  # -----------------------------
546
- if len(sims) == 0:
547
  answer_text = NO_ANSWER_MSG
548
  else:
549
- # Indices sorted by similarity descending
550
- top_idx = np.argsort(-sims)
551
- top_k = min(3, len(top_idx)) # use up to 3 sentences
552
- chosen = []
553
-
554
- for i in top_idx[:top_k]:
555
- s = candidate_sents[i].strip()
556
- if s and s not in chosen:
557
- chosen.append(s)
 
 
 
 
 
 
558
 
559
- if not chosen:
560
  answer_text = NO_ANSWER_MSG
561
  else:
562
- # Join with spaces, ensure it reads like a paragraph
563
- answer_text = " ".join(chosen)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
 
565
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
566
 
567
  return (
@@ -570,6 +559,7 @@ class RAGIndex:
570
  )
571
 
572
 
 
573
  # Initialize RAG system
574
  print("=" * 50)
575
  rag_index = RAGIndex()
 
418
  return answer
419
 
420
  def answer(self, question: str) -> str:
421
+ """
422
+ Answer a question using RAG with sentence-level semantic selection
423
+ and a generic seq2seq model (Flan-T5, BART, etc.).
424
+ This function is fully stateless per call: it only uses the question
425
+ and the indexed knowledge base, never previous answers.
426
+ """
427
  if not self.initialized:
428
  return "❌ Assistant not properly initialized. Please check the logs."
429
 
 
438
  )
439
 
440
  # -----------------------------
441
+ # 1) Retrieve top-K chunks for this question
442
  # -----------------------------
443
+ contexts = self.retrieve(question, top_k=5)
444
 
445
  if not contexts:
446
  return (
 
449
  )
450
 
451
  used_sources = set()
452
+ candidate_sentences = []
453
+ candidate_sources = []
454
 
455
  # -----------------------------
456
+ # 2) Split retrieved chunks into sentences (generic, no KB-specific logic)
457
  # -----------------------------
458
  for ctx, source, score in contexts:
459
  used_sources.add(source)
460
+
461
  cleaned_ctx = clean_context_text(ctx)
462
  if not cleaned_ctx:
463
  continue
464
 
465
+ # Simple sentence splitter: split on ., ?, ! plus newlines
466
  raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
467
+
468
  for s in raw_sents:
469
  s_clean = s.strip()
470
+ # skip very short sentences
471
  if len(s_clean) < 25:
472
  continue
 
473
 
474
+ candidate_sentences.append(s_clean)
475
+ candidate_sources.append(source)
476
+
477
+ if not candidate_sentences:
478
  return (
479
  f"{NO_ANSWER_MSG}\n\n"
480
  f"💡 Try adding more detailed documents to the knowledge base."
481
  )
482
 
483
  # -----------------------------
484
+ # 3) Score sentences: semantic + lexical (generic)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  # -----------------------------
486
  try:
487
+ # Semantic similarity via sentence embeddings
488
  q_emb = self.embedder.encode([question], convert_to_numpy=True)
489
+ s_embs = self.embedder.encode(candidate_sentences, convert_to_numpy=True)
490
 
 
491
  faiss.normalize_L2(q_emb)
492
+ faiss.normalize_L2(s_embs)
493
 
494
+ sims = np.dot(s_embs, q_emb.T).reshape(-1) # cosine similarity
 
495
  except Exception as e:
496
+ print(f"Sentence embedding error, falling back to lexical scoring only: {e}")
497
+ sims = np.zeros(len(candidate_sentences), dtype=float)
498
+
499
+ # Lexical overlap (shared content words)
500
+ q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
501
+ lex_scores = []
502
+ for sent in candidate_sentences:
503
+ s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
504
+ lex_scores.append(len(q_words & s_words))
505
+ lex_scores = np.array(lex_scores, dtype=float)
506
+
507
+ # Combine scores in a generic way: semantic + a bit of lexical
508
+ combined = (1.5 * sims) + (0.5 * lex_scores)
509
 
510
  # -----------------------------
511
+ # 4) Pick top-N sentences to form the context
512
  # -----------------------------
513
+ if len(combined) == 0:
514
  answer_text = NO_ANSWER_MSG
515
  else:
516
+ top_idx = np.argsort(-combined)
517
+ max_sentences = 5 # you can tune this
518
+ chosen_sentences = []
519
+ chosen_sources = set()
520
+
521
+ for i in top_idx:
522
+ if len(chosen_sentences) >= max_sentences:
523
+ break
524
+ s = candidate_sentences[i].strip()
525
+ if not s:
526
+ continue
527
+ if s in chosen_sentences:
528
+ continue # avoid duplicates
529
+ chosen_sentences.append(s)
530
+ chosen_sources.add(candidate_sources[i])
531
 
532
+ if not chosen_sentences:
533
  answer_text = NO_ANSWER_MSG
534
  else:
535
+ context_for_llm = "\n".join(chosen_sentences)
536
+
537
+ # -----------------------------
538
+ # 5) Let the seq2seq model generate a natural answer
539
+ # -----------------------------
540
+ try:
541
+ answer_text = self._generate_from_context(
542
+ question=question,
543
+ context=context_for_llm,
544
+ max_new_tokens=200,
545
+ ).strip()
546
+ except Exception as e:
547
+ print(f"Generation error, falling back to extractive answer: {e}")
548
+ answer_text = " ".join(chosen_sentences)
549
+
550
+ if not answer_text:
551
+ answer_text = NO_ANSWER_MSG
552
 
553
+ # Track sources from retrieved chunks (or from chosen sentences if you prefer)
554
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
555
 
556
  return (
 
559
  )
560
 
561
 
562
+
563
  # Initialize RAG system
564
  print("=" * 50)
565
  rag_index = RAGIndex()