sofzcc committed on
Commit
ff5c1a5
·
verified ·
1 Parent(s): 5de21d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -14
app.py CHANGED
@@ -362,11 +362,31 @@ class RAGIndex:
362
  print(f"Retrieval error: {e}")
363
  return []
364
 
365
- def _generate_from_context(self, prompt: str, max_new_tokens: int = 128) -> str:
366
- """Run Flan-T5 on the given prompt and return the decoded answer."""
 
 
 
 
 
 
 
 
367
  if self.qa_model is None or self.qa_tokenizer is None:
368
  raise RuntimeError("QA model not loaded.")
369
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  inputs = self.qa_tokenizer(
371
  prompt,
372
  return_tensors="pt",
@@ -377,6 +397,7 @@ class RAGIndex:
377
  outputs = self.qa_model.generate(
378
  **inputs,
379
  max_new_tokens=max_new_tokens,
 
380
  do_sample=False,
381
  )
382
 
@@ -414,6 +435,7 @@ class RAGIndex:
414
  used_sources = set()
415
  context_texts = []
416
 
 
417
  for ctx, source, score in contexts:
418
  used_sources.add(source)
419
  cleaned_ctx = clean_context_text(ctx)
@@ -434,19 +456,13 @@ class RAGIndex:
434
  if len(combined_context) > max_context_chars:
435
  combined_context = combined_context[:max_context_chars]
436
 
437
- # 3) Build a prompt that works for both BART (summarization-style)
438
- # and instruction-tuned models like Flan-T5.
439
- prompt = (
440
- "You are an assistant that answers questions about a knowledge base.\n"
441
- "Using only the information in the passages below, answer the question in 2–4 sentences.\n"
442
- "Explain in clear, natural language. Do NOT copy section numbers, markdown headings, or bullet symbols.\n\n"
443
- f"Passages:\n{combined_context}\n\n"
444
- f"Question: {question}\n\n"
445
- "Answer:"
446
- )
447
-
448
  try:
449
- answer_text = self._generate_from_context(prompt, max_new_tokens=180).strip()
 
 
 
 
450
  except Exception as e:
451
  print(f"Generation error: {e}")
452
  return (
 
362
  print(f"Retrieval error: {e}")
363
  return []
364
 
365
+ def _generate_from_context(
366
+ self,
367
+ question: str,
368
+ context: str,
369
+ max_new_tokens: int = 180,
370
+ ) -> str:
371
+ """
372
+ Generate a grounded answer from the retrieved context using a seq2seq model
373
+ (FLAN-T5, BART, etc.). The prompt forces the model to only use the context.
374
+ """
375
  if self.qa_model is None or self.qa_tokenizer is None:
376
  raise RuntimeError("QA model not loaded.")
377
 
378
+ prompt = (
379
+ "You are a knowledge base assistant. Answer the question ONLY using the information "
380
+ "in the context below.\n"
381
+ "If the context does not contain the answer, say exactly: "
382
+ "\"The documents do not contain enough information to answer this.\"\n\n"
383
+ f"Question: {question}\n\n"
384
+ "Context:\n"
385
+ f"{context}\n\n"
386
+ "Write a helpful answer in 2–4 sentences. Keep it factual and concise. "
387
+ "Do NOT repeat the question. Do NOT include section titles or headings."
388
+ )
389
+
390
  inputs = self.qa_tokenizer(
391
  prompt,
392
  return_tensors="pt",
 
397
  outputs = self.qa_model.generate(
398
  **inputs,
399
  max_new_tokens=max_new_tokens,
400
+ temperature=0.0, # deterministic
401
  do_sample=False,
402
  )
403
 
 
435
  used_sources = set()
436
  context_texts = []
437
 
438
+ # Clean and collect the retrieved chunks
439
  for ctx, source, score in contexts:
440
  used_sources.add(source)
441
  cleaned_ctx = clean_context_text(ctx)
 
456
  if len(combined_context) > max_context_chars:
457
  combined_context = combined_context[:max_context_chars]
458
 
459
+ # 3) Generate grounded answer from context
 
 
 
 
 
 
 
 
 
 
460
  try:
461
+ answer_text = self._generate_from_context(
462
+ question=question,
463
+ context=combined_context,
464
+ max_new_tokens=180,
465
+ ).strip()
466
  except Exception as e:
467
  print(f"Generation error: {e}")
468
  return (