Spaces:

sofzcc
/

Full_RAG_Assistant

Sleeping

App Files Community

sofzcc committed on Dec 2, 2025

Commit

c08571d

verified ·

1 Parent(s): 3155864

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -84

app.py CHANGED Viewed

@@ -418,7 +418,12 @@ class RAGIndex:
         return answer
     def answer(self, question: str) -> str:
-        """Answer a question using RAG with sentence-level semantic selection."""
         if not self.initialized:
             return "❌ Assistant not properly initialized. Please check the logs."
@@ -433,9 +438,9 @@ class RAGIndex:
             )
         # -----------------------------
-        # 1) Retrieve relevant contexts (top-3)
         # -----------------------------
-        contexts = self.retrieve(question, top_k=3)
         if not contexts:
             return (
@@ -444,124 +449,108 @@ class RAGIndex:
             )
         used_sources = set()
-        all_sentences = []
         # -----------------------------
-        # 2) Collect & clean sentences from the retrieved chunks
         # -----------------------------
         for ctx, source, score in contexts:
             used_sources.add(source)
             cleaned_ctx = clean_context_text(ctx)
             if not cleaned_ctx:
                 continue
-            # Split into sentences (simple regex: ., !, ? or line breaks)
             raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
             for s in raw_sents:
                 s_clean = s.strip()
-                # Ignore very short or weirdly short lines
                 if len(s_clean) < 25:
                     continue
-                all_sentences.append((s_clean, source))
-        if not all_sentences:
             return (
                 f"{NO_ANSWER_MSG}\n\n"
                 f"💡 Try adding more detailed documents to the knowledge base."
             )
         # -----------------------------
-        # 3) Topic-aware filtering based on the question
-        # -----------------------------
-        q_lower = question.lower()
-        topic_keywords = {
-            "structure": {"structure", "organize", "hierarchy", "taxonomy", "categories", "information architecture"},
-            "maintenance": {"maintain", "maintenance", "update", "review", "governance", "version", "changelog"},
-            "quality": {"excellent", "good article", "tone", "style", "writing", "quality"},
-            "gaps": {"gap", "gaps", "missing", "search logs", "zero-result", "content gaps"},
-            "definition": {"what is", "define", "definition"},
-        }
-        active_topics = set()
-        if any(k in q_lower for k in ["structure", "organize", "hierarchy", "taxonomy"]):
-            active_topics.add("structure")
-        if any(k in q_lower for k in ["maintain", "maintenance", "update", "review", "governance", "keep up to date"]):
-            active_topics.add("maintenance")
-        if any(k in q_lower for k in ["good article", "excellent article", "tone", "style", "how to write"]):
-            active_topics.add("quality")
-        if any(k in q_lower for k in ["gap", "gaps", "content gaps", "missing", "search logs"]):
-            active_topics.add("gaps")
-        if any(k in q_lower for k in ["what is", "define", "definition"]):
-            active_topics.add("definition")
-        # If no explicit topic detected, we keep all sentences as candidates
-        filtered_sentences = []
-        if active_topics:
-            # Collect all keywords from active topics
-            active_kw = set()
-            for t in active_topics:
-                active_kw |= topic_keywords.get(t, set())
-            for sent, source in all_sentences:
-                s_lower = sent.lower()
-                if any(kw in s_lower for kw in active_kw):
-                    filtered_sentences.append((sent, source))
-        # Fallback to all sentences if filtering removed everything
-        if not filtered_sentences:
-            filtered_sentences = all_sentences
-        # Keep only the sentence text for embedding
-        candidate_sents = [s for s, _ in filtered_sentences]
-        # -----------------------------
-        # 4) Semantic scoring with SentenceTransformer
         # -----------------------------
         try:
             q_emb = self.embedder.encode([question], convert_to_numpy=True)
-            sent_embs = self.embedder.encode(candidate_sents, convert_to_numpy=True)
-            # Normalize for cosine similarity
             faiss.normalize_L2(q_emb)
-            faiss.normalize_L2(sent_embs)
-            # Cosine similarity = dot product after normalization
-            sims = np.dot(sent_embs, q_emb.T).reshape(-1)
         except Exception as e:
-            print(f"Sentence embedding error, falling back to lexical scoring: {e}")
-            # Lexical fallback: overlap of content words
-            q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
-            sims = []
-            for sent in candidate_sents:
-                s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
-                overlap = len(q_words & s_words)
-                sims.append(float(overlap))
-            sims = np.array(sims, dtype=float)
         # -----------------------------
-        # 5) Pick top-N sentences & compose answer
         # -----------------------------
-        if len(sims) == 0:
             answer_text = NO_ANSWER_MSG
         else:
-            # Indices sorted by similarity descending
-            top_idx = np.argsort(-sims)
-            top_k = min(3, len(top_idx))  # use up to 3 sentences
-            chosen = []
-            for i in top_idx[:top_k]:
-                s = candidate_sents[i].strip()
-                if s and s not in chosen:
-                    chosen.append(s)
-            if not chosen:
                 answer_text = NO_ANSWER_MSG
             else:
-                # Join with spaces, ensure it reads like a paragraph
-                answer_text = " ".join(chosen)
         sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
         return (
@@ -570,6 +559,7 @@ class RAGIndex:
         )
 # Initialize RAG system
 print("=" * 50)
 rag_index = RAGIndex()

         return answer
     def answer(self, question: str) -> str:
+        """
+        Answer a question using RAG with sentence-level semantic selection
+        and a generic seq2seq model (Flan-T5, BART, etc.).
+        This function is fully stateless per call: it only uses the question
+        and the indexed knowledge base, never previous answers.
+        """
         if not self.initialized:
             return "❌ Assistant not properly initialized. Please check the logs."
             )
         # -----------------------------
+        # 1) Retrieve top-K chunks for this question
         # -----------------------------
+        contexts = self.retrieve(question, top_k=5)
         if not contexts:
             return (
             )
         used_sources = set()
+        candidate_sentences = []
+        candidate_sources = []
         # -----------------------------
+        # 2) Split retrieved chunks into sentences (generic, no KB-specific logic)
         # -----------------------------
         for ctx, source, score in contexts:
             used_sources.add(source)
             cleaned_ctx = clean_context_text(ctx)
             if not cleaned_ctx:
                 continue
+            # Simple sentence splitter: split on ., ?, ! plus newlines
             raw_sents = re.split(r'(?<=[.!?])\s+|\n+', cleaned_ctx)
             for s in raw_sents:
                 s_clean = s.strip()
+                # skip very short sentences
                 if len(s_clean) < 25:
                     continue
+                candidate_sentences.append(s_clean)
+                candidate_sources.append(source)
+        if not candidate_sentences:
             return (
                 f"{NO_ANSWER_MSG}\n\n"
                 f"💡 Try adding more detailed documents to the knowledge base."
             )
         # -----------------------------
+        # 3) Score sentences: semantic + lexical (generic)
         # -----------------------------
         try:
+            # Semantic similarity via sentence embeddings
             q_emb = self.embedder.encode([question], convert_to_numpy=True)
+            s_embs = self.embedder.encode(candidate_sentences, convert_to_numpy=True)
             faiss.normalize_L2(q_emb)
+            faiss.normalize_L2(s_embs)
+            sims = np.dot(s_embs, q_emb.T).reshape(-1)  # cosine similarity
         except Exception as e:
+            print(f"Sentence embedding error, falling back to lexical scoring only: {e}")
+            sims = np.zeros(len(candidate_sentences), dtype=float)
+        # Lexical overlap (shared content words)
+        q_words = {w.lower() for w in re.findall(r"\w+", question) if len(w) > 3}
+        lex_scores = []
+        for sent in candidate_sentences:
+            s_words = {w.lower() for w in re.findall(r"\w+", sent) if len(w) > 3}
+            lex_scores.append(len(q_words & s_words))
+        lex_scores = np.array(lex_scores, dtype=float)
+        # Combine scores in a generic way: semantic + a bit of lexical
+        combined = (1.5 * sims) + (0.5 * lex_scores)
         # -----------------------------
+        # 4) Pick top-N sentences to form the context
         # -----------------------------
+        if len(combined) == 0:
             answer_text = NO_ANSWER_MSG
         else:
+            top_idx = np.argsort(-combined)
+            max_sentences = 5  # you can tune this
+            chosen_sentences = []
+            chosen_sources = set()
+            for i in top_idx:
+                if len(chosen_sentences) >= max_sentences:
+                    break
+                s = candidate_sentences[i].strip()
+                if not s:
+                    continue
+                if s in chosen_sentences:
+                    continue  # avoid duplicates
+                chosen_sentences.append(s)
+                chosen_sources.add(candidate_sources[i])
+            if not chosen_sentences:
                 answer_text = NO_ANSWER_MSG
             else:
+                context_for_llm = "\n".join(chosen_sentences)
+                # -----------------------------
+                # 5) Let the seq2seq model generate a natural answer
+                # -----------------------------
+                try:
+                    answer_text = self._generate_from_context(
+                        question=question,
+                        context=context_for_llm,
+                        max_new_tokens=200,
+                    ).strip()
+                except Exception as e:
+                    print(f"Generation error, falling back to extractive answer: {e}")
+                    answer_text = " ".join(chosen_sentences)
+        if not answer_text:
+            answer_text = NO_ANSWER_MSG
+        # Track sources from retrieved chunks (or from chosen sentences if you prefer)
         sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
         return (
         )
 # Initialize RAG system
 print("=" * 50)
 rag_index = RAGIndex()