sofzcc committed on
Commit
3155864
·
verified ·
1 Parent(s): 1781439

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -45
app.py CHANGED
@@ -339,7 +339,7 @@ class RAGIndex:
339
  self.chunks = all_chunks
340
  self.chunk_sources = all_sources
341
 
342
- def retrieve(self, query: str, top_k: int = 15) -> List[Tuple[str, str, float]]:
343
  """Retrieve relevant chunks for a query"""
344
  if not query or not query.strip():
345
  return []
@@ -418,7 +418,7 @@ class RAGIndex:
418
  return answer
419
 
420
  def answer(self, question: str) -> str:
421
- """Answer a question using RAG with a simple extractive approach from the best chunk only."""
422
  if not self.initialized:
423
  return "❌ Assistant not properly initialized. Please check the logs."
424
 
@@ -432,8 +432,10 @@ class RAGIndex:
432
  f"Supported formats: .txt, .md, .pdf, .docx"
433
  )
434
 
435
- # 1) Retrieve relevant contexts
436
- contexts = self.retrieve(question, top_k=1)
 
 
437
 
438
  if not contexts:
439
  return (
@@ -441,58 +443,124 @@ class RAGIndex:
441
  f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
442
  )
443
 
444
- # Use ONLY the single best scoring context (top-1)
445
- best_ctx, best_source, best_score = contexts[0]
446
- used_sources = {best_source}
447
 
448
- cleaned_ctx = clean_context_text(best_ctx)
449
- if not cleaned_ctx:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  return (
451
  f"{NO_ANSWER_MSG}\n\n"
452
  f"💡 Try adding more detailed documents to the knowledge base."
453
  )
454
 
455
- # 2) Limit context size just in case
456
- max_context_chars = 1500
457
- if len(cleaned_ctx) > max_context_chars:
458
- cleaned_ctx = cleaned_ctx[:max_context_chars]
459
-
460
- # 3) Sentence-level relevance scoring within this single chunk
461
- raw_sentences = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
462
- question_words = {
463
- w.lower()
464
- for w in re.findall(r"\w+", question)
465
- if len(w) > 3 # ignore very short/common words
466
  }
467
 
468
- scored_sentences = []
469
- for s in raw_sentences:
470
- s_clean = s.strip()
471
- if len(s_clean) < 20:
472
- continue
473
- words = {w.lower() for w in re.findall(r"\w+", s_clean)}
474
- overlap = question_words & words
475
- score = len(overlap)
476
- scored_sentences.append((score, s_clean))
477
-
478
- if scored_sentences:
479
- # Sort by overlap score (descending)
480
- scored_sentences.sort(key=lambda x: x[0], reverse=True)
481
-
482
- # Take the best 2–3 sentences that have some overlap
483
- top = [s for score, s in scored_sentences if score > 0][:3]
484
-
485
- # If none have overlap (e.g., vague question), just take the top 2 sentences overall
486
- if not top:
487
- top = [s for _, s in scored_sentences[:2]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
 
489
- answer_text = " ".join(top)
490
- else:
491
- # Fallback: just take a slice of the cleaned context
492
- answer_text = cleaned_ctx[:400].strip()
493
 
494
- if not answer_text:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  answer_text = NO_ANSWER_MSG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
498
 
 
339
  self.chunks = all_chunks
340
  self.chunk_sources = all_sources
341
 
342
+ def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
343
  """Retrieve relevant chunks for a query"""
344
  if not query or not query.strip():
345
  return []
 
418
  return answer
419
 
420
  def answer(self, question: str) -> str:
421
+ """Answer a question using RAG with sentence-level semantic selection."""
422
  if not self.initialized:
423
  return "❌ Assistant not properly initialized. Please check the logs."
424
 
 
432
  f"Supported formats: .txt, .md, .pdf, .docx"
433
  )
434
 
435
+ # -----------------------------
436
+ # 1) Retrieve relevant contexts (top-3)
437
+ # -----------------------------
438
+ contexts = self.retrieve(question, top_k=3)
439
 
440
  if not contexts:
441
  return (
 
443
  f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
444
  )
445
 
446
+ used_sources = set()
447
+ all_sentences = []
 
448
 
449
+ # -----------------------------
450
+ # 2) Collect & clean sentences from the retrieved chunks
451
+ # -----------------------------
452
+ for ctx, source, score in contexts:
453
+ used_sources.add(source)
454
+ cleaned_ctx = clean_context_text(ctx)
455
+ if not cleaned_ctx:
456
+ continue
457
+
458
+ # Split into sentences (simple regex: ., !, ? or line breaks)
459
+ raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
460
+ for s in raw_sents:
461
+ s_clean = s.strip()
462
+ # Ignore very short or weirdly short lines
463
+ if len(s_clean) < 25:
464
+ continue
465
+ all_sentences.append((s_clean, source))
466
+
467
+ if not all_sentences:
468
  return (
469
  f"{NO_ANSWER_MSG}\n\n"
470
  f"💡 Try adding more detailed documents to the knowledge base."
471
  )
472
 
473
+ # -----------------------------
474
+ # 3) Topic-aware filtering based on the question
475
+ # -----------------------------
476
+ q_lower = question.lower()
477
+
478
+ topic_keywords = {
479
+ "structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
480
+ "maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
481
+ "quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
482
+ "gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
483
+ "definition": {"what is", "define", "definition"},
484
  }
485
 
486
+ active_topics = set()
487
+
488
+ if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
489
+ active_topics.add("structure")
490
+ if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
491
+ active_topics.add("maintenance")
492
+ if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
493
+ active_topics.add("quality")
494
+ if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
495
+ active_topics.add("gaps")
496
+ if any(k in q_lower for k in ["what is", "define", "definition"]):
497
+ active_topics.add("definition")
498
+
499
+ # If no explicit topic detected, we keep all sentences as candidates
500
+ filtered_sentences = []
501
+ if active_topics:
502
+ # Collect all keywords from active topics
503
+ active_kw = set()
504
+ for t in active_topics:
505
+ active_kw |= topic_keywords.get(t, set())
506
+
507
+ for sent, source in all_sentences:
508
+ s_lower = sent.lower()
509
+ if any(kw in s_lower for kw in active_kw):
510
+ filtered_sentences.append((sent, source))
511
+
512
+ # Fallback to all sentences if filtering removed everything
513
+ if not filtered_sentences:
514
+ filtered_sentences = all_sentences
515
+
516
+ # Keep only the sentence text for embedding
517
+ candidate_sents = [s for s, _ in filtered_sentences]
518
+
519
+ # -----------------------------
520
+ # 4) Semantic scoring with SentenceTransformer
521
+ # -----------------------------
522
+ try:
523
+ q_emb = self.embedder.encode([question], convert_to_numpy=True)
524
+ sent_embs = self.embedder.encode(candidate_sents, convert_to_numpy=True)
525
 
526
+ # Normalize for cosine similarity
527
+ faiss.normalize_L2(q_emb)
528
+ faiss.normalize_L2(sent_embs)
 
529
 
530
+ # Cosine similarity = dot product after normalization
531
+ sims = np.dot(sent_embs, q_emb.T).reshape(-1)
532
+ except Exception as e:
533
+ print(f"Sentence embedding error, falling back to lexical scoring: {e}")
534
+ # Lexical fallback: overlap of content words
535
+ q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
536
+ sims = []
537
+ for sent in candidate_sents:
538
+ s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
539
+ overlap = len(q_words & s_words)
540
+ sims.append(float(overlap))
541
+ sims = np.array(sims, dtype=float)
542
+
543
+ # -----------------------------
544
+ # 5) Pick top-N sentences & compose answer
545
+ # -----------------------------
546
+ if len(sims) == 0:
547
  answer_text = NO_ANSWER_MSG
548
+ else:
549
+ # Indices sorted by similarity descending
550
+ top_idx = np.argsort(-sims)
551
+ top_k = min(3, len(top_idx)) # use up to 3 sentences
552
+ chosen = []
553
+
554
+ for i in top_idx[:top_k]:
555
+ s = candidate_sents[i].strip()
556
+ if s and s not in chosen:
557
+ chosen.append(s)
558
+
559
+ if not chosen:
560
+ answer_text = NO_ANSWER_MSG
561
+ else:
562
+ # Join with spaces, ensure it reads like a paragraph
563
+ answer_text = " ".join(chosen)
564
 
565
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
566