Spaces:

sofzcc
/

Full_RAG_Assistant

Sleeping

App Files Files Community

sofzcc committed on Dec 2, 2025

Commit

3155864

verified ·

1 Parent(s): 1781439

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -45

app.py CHANGED Viewed

@@ -339,7 +339,7 @@ class RAGIndex:
         self.chunks = all_chunks
         self.chunk_sources = all_sources
-    def retrieve(self, query: str, top_k: int = 15) -> List[Tuple[str, str, float]]:
         """Retrieve relevant chunks for a query"""
         if not query or not query.strip():
             return []
@@ -418,7 +418,7 @@ class RAGIndex:
         return answer
     def answer(self, question: str) -> str:
-        """Answer a question using RAG with a simple extractive approach from the best chunk only."""
         if not self.initialized:
             return "❌ Assistant not properly initialized. Please check the logs."
@@ -432,8 +432,10 @@ class RAGIndex:
                 f"Supported formats: .txt, .md, .pdf, .docx"
             )
-        # 1) Retrieve relevant contexts
-        contexts = self.retrieve(question, top_k=1)
         if not contexts:
             return (
@@ -441,58 +443,124 @@ class RAGIndex:
                 f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
             )
-        # Use ONLY the single best scoring context (top-1)
-        best_ctx, best_source, best_score = contexts[0]
-        used_sources = {best_source}
-        cleaned_ctx = clean_context_text(best_ctx)
-        if not cleaned_ctx:
             return (
                 f"{NO_ANSWER_MSG}\n\n"
                 f"💡 Try adding more detailed documents to the knowledge base."
             )
-        # 2) Limit context size just in case
-        max_context_chars = 1500
-        if len(cleaned_ctx) > max_context_chars:
-            cleaned_ctx = cleaned_ctx[:max_context_chars]
-        # 3) Sentence-level relevance scoring within this single chunk
-        raw_sentences = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
-        question_words = {
-            w.lower()
-            for w in re.findall(r"\w+", question)
-            if len(w) > 3  # ignore very short/common words
         }
-        scored_sentences = []
-        for s in raw_sentences:
-            s_clean = s.strip()
-            if len(s_clean) < 20:
-                continue
-            words = {w.lower() for w in re.findall(r"\w+", s_clean)}
-            overlap = question_words & words
-            score = len(overlap)
-            scored_sentences.append((score, s_clean))
-        if scored_sentences:
-            # Sort by overlap score (descending)
-            scored_sentences.sort(key=lambda x: x[0], reverse=True)
-            # Take the best 2–3 sentences that have some overlap
-            top = [s for score, s in scored_sentences if score > 0][:3]
-            # If none have overlap (e.g., vague question), just take the top 2 sentences overall
-            if not top:
-                top = [s for _, s in scored_sentences[:2]]
-            answer_text = " ".join(top)
-        else:
-            # Fallback: just take a slice of the cleaned context
-            answer_text = cleaned_ctx[:400].strip()
-        if not answer_text:
             answer_text = NO_ANSWER_MSG
         sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"

         self.chunks = all_chunks
         self.chunk_sources = all_sources
+    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[str, str, float]]:
         """Retrieve relevant chunks for a query"""
         if not query or not query.strip():
             return []
         return answer
     def answer(self, question: str) -> str:
+        """Answer a question using RAG with sentence-level semantic selection."""
         if not self.initialized:
             return "❌ Assistant not properly initialized. Please check the logs."
                 f"Supported formats: .txt, .md, .pdf, .docx"
             )
+        # -----------------------------
+        # 1) Retrieve relevant contexts (top-3)
+        # -----------------------------
+        contexts = self.retrieve(question, top_k=3)
         if not contexts:
             return (
                 f"💡 Try rephrasing your question or check if relevant documents exist in the knowledge base."
             )
+        used_sources = set()
+        all_sentences = []
+        # -----------------------------
+        # 2) Collect & clean sentences from the retrieved chunks
+        # -----------------------------
+        for ctx, source, score in contexts:
+            used_sources.add(source)
+            cleaned_ctx = clean_context_text(ctx)
+            if not cleaned_ctx:
+                continue
+            # Split into sentences (simple regex: ., !, ? or line breaks)
+            raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
+            for s in raw_sents:
+                s_clean = s.strip()
+                # Ignore very short or weirdly short lines
+                if len(s_clean) < 25:
+                    continue
+                all_sentences.append((s_clean, source))
+        if not all_sentences:
             return (
                 f"{NO_ANSWER_MSG}\n\n"
                 f"💡 Try adding more detailed documents to the knowledge base."
             )
+        # -----------------------------
+        # 3) Topic-aware filtering based on the question
+        # -----------------------------
+        q_lower = question.lower()
+        topic_keywords = {
+            "structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
+            "maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
+            "quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
+            "gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
+            "definition": {"what is", "define", "definition"},
         }
+        active_topics = set()
+        if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
+            active_topics.add("structure")
+        if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
+            active_topics.add("maintenance")
+        if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
+            active_topics.add("quality")
+        if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
+            active_topics.add("gaps")
+        if any(k in q_lower for k in ["what is", "define", "definition"]):
+            active_topics.add("definition")
+        # If no explicit topic detected, we keep all sentences as candidates
+        filtered_sentences = []
+        if active_topics:
+            # Collect all keywords from active topics
+            active_kw = set()
+            for t in active_topics:
+                active_kw |= topic_keywords.get(t, set())
+            for sent, source in all_sentences:
+                s_lower = sent.lower()
+                if any(kw in s_lower for kw in active_kw):
+                    filtered_sentences.append((sent, source))
+        # Fallback to all sentences if filtering removed everything
+        if not filtered_sentences:
+            filtered_sentences = all_sentences
+        # Keep only the sentence text for embedding
+        candidate_sents = [s for s, _ in filtered_sentences]
+        # -----------------------------
+        # 4) Semantic scoring with SentenceTransformer
+        # -----------------------------
+        try:
+            q_emb = self.embedder.encode([question], convert_to_numpy=True)
+            sent_embs = self.embedder.encode(candidate_sents, convert_to_numpy=True)
+            # Normalize for cosine similarity
+            faiss.normalize_L2(q_emb)
+            faiss.normalize_L2(sent_embs)
+            # Cosine similarity = dot product after normalization
+            sims = np.dot(sent_embs, q_emb.T).reshape(-1)
+        except Exception as e:
+            print(f"Sentence embedding error, falling back to lexical scoring: {e}")
+            # Lexical fallback: overlap of content words
+            q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
+            sims = []
+            for sent in candidate_sents:
+                s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
+                overlap = len(q_words & s_words)
+                sims.append(float(overlap))
+            sims = np.array(sims, dtype=float)
+        # -----------------------------
+        # 5) Pick top-N sentences & compose answer
+        # -----------------------------
+        if len(sims) == 0:
             answer_text = NO_ANSWER_MSG
+        else:
+            # Indices sorted by similarity descending
+            top_idx = np.argsort(-sims)
+            top_k = min(3, len(top_idx))  # use up to 3 sentences
+            chosen = []
+            for i in top_idx[:top_k]:
+                s = candidate_sents[i].strip()
+                if s and s not in chosen:
+                    chosen.append(s)
+            if not chosen:
+                answer_text = NO_ANSWER_MSG
+            else:
+                # Join with spaces, ensure it reads like a paragraph
+                answer_text = " ".join(chosen)
         sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"