Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import glob
|
| 3 |
import yaml
|
|
@@ -102,7 +103,45 @@ def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
|
|
| 102 |
start += chunk_size - overlap
|
| 103 |
|
| 104 |
return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
def load_file_text(path: str) -> str:
|
| 108 |
"""Load text from various file formats with error handling"""
|
|
@@ -342,6 +381,8 @@ class RAGIndex:
|
|
| 342 |
**inputs,
|
| 343 |
max_new_tokens=128,
|
| 344 |
do_sample=False,
|
|
|
|
|
|
|
| 345 |
)
|
| 346 |
|
| 347 |
answer = self.qa_tokenizer.decode(
|
|
@@ -386,6 +427,8 @@ class RAGIndex:
|
|
| 386 |
|
| 387 |
combined_text = "\n\n".join(combined_context)
|
| 388 |
|
|
|
|
|
|
|
| 389 |
|
| 390 |
# Limit context length to keep it manageable
|
| 391 |
max_context_chars = 4000
|
|
@@ -395,15 +438,19 @@ class RAGIndex:
|
|
| 395 |
# Prompt for the generative model
|
| 396 |
prompt = (
|
| 397 |
"You are an AI assistant that answers questions using only the provided context.\n"
|
| 398 |
-
"
|
| 399 |
-
"- Do NOT
|
|
|
|
|
|
|
|
|
|
| 400 |
"- If the answer cannot be found in the context, reply exactly with: "
|
| 401 |
"\"I don't know based on the provided documents.\"\n\n"
|
| 402 |
f"Context:\n{combined_text}\n\n"
|
| 403 |
f"Question: {question}\n\n"
|
| 404 |
-
"Answer in 1–3 concise sentences:"
|
| 405 |
)
|
| 406 |
|
|
|
|
| 407 |
try:
|
| 408 |
answer_text = self._generate_from_context(prompt)
|
| 409 |
except Exception as e:
|
|
|
|
| 1 |
+
import re
|
| 2 |
import os
|
| 3 |
import glob
|
| 4 |
import yaml
|
|
|
|
| 103 |
start += chunk_size - overlap
|
| 104 |
|
| 105 |
return chunks
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def clean_context_text(text: str) -> str:
    """Clean raw document context before sending to the generator.

    Strips markdown headings (``#``, ``##``, ...), ordered-list prefixes
    (``1. ``, ``2) ``), and bullet markers (``- ``, ``* ``); drops blank
    or very short "noise" lines; and removes exact duplicate lines while
    preserving first-seen order.

    Args:
        text: Raw context text, possibly containing markdown formatting.

    Returns:
        The cleaned text, one surviving line per input line, joined with
        newlines. Empty input yields an empty string.
    """
    seen = set()
    cleaned = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Order matters: strip the heading marker first so that
        # "# 1. Title" is reduced to "1. Title" and then to "Title".
        # Remove markdown headings like "# 1. Title", "## Section".
        line = re.sub(r"^#+\s*", "", line)

        # Remove ordered-list prefixes like "1. ", "2) ".
        line = re.sub(r"^\d+[.)]\s*", "", line)

        # Remove bullet markers like "- ", "* ".
        line = re.sub(r"^[-*]\s*", "", line)

        # Skip very short "noise" lines left over after stripping.
        if len(line) < 5:
            continue

        # Avoid exact duplicates.
        if line in seen:
            continue
        seen.add(line)
        cleaned.append(line)

    return "\n".join(cleaned)
|
| 145 |
|
| 146 |
def load_file_text(path: str) -> str:
|
| 147 |
"""Load text from various file formats with error handling"""
|
|
|
|
| 381 |
**inputs,
|
| 382 |
max_new_tokens=128,
|
| 383 |
do_sample=False,
|
| 384 |
+
top_p=0.9,
|
| 385 |
+
temperature=0.7,
|
| 386 |
)
|
| 387 |
|
| 388 |
answer = self.qa_tokenizer.decode(
|
|
|
|
| 427 |
|
| 428 |
combined_text = "\n\n".join(combined_context)
|
| 429 |
|
| 430 |
+
# Clean markdown / numbering / duplicates
|
| 431 |
+
combined_text = clean_context_text(combined_text)
|
| 432 |
|
| 433 |
# Limit context length to keep it manageable
|
| 434 |
max_context_chars = 4000
|
|
|
|
| 438 |
# Prompt for the generative model
|
| 439 |
prompt = (
|
| 440 |
"You are an AI assistant that answers questions using only the provided context.\n"
|
| 441 |
+
"Your task is to synthesize a clear, natural explanation in your own words.\n"
|
| 442 |
+
"- Do NOT copy headings or section numbers from the context.\n"
|
| 443 |
+
"- Do NOT include markdown like '#', '##', '---', or bullet/list markers.\n"
|
| 444 |
+
"- Do NOT mention file names, sources, or internal labels in your answer.\n"
|
| 445 |
+
"- Do NOT just repeat full sentences from the context; always paraphrase.\n"
|
| 446 |
"- If the answer cannot be found in the context, reply exactly with: "
|
| 447 |
"\"I don't know based on the provided documents.\"\n\n"
|
| 448 |
f"Context:\n{combined_text}\n\n"
|
| 449 |
f"Question: {question}\n\n"
|
| 450 |
+
"Answer in 1–3 concise sentences of plain text:"
|
| 451 |
)
|
| 452 |
|
| 453 |
+
|
| 454 |
try:
|
| 455 |
answer_text = self._generate_from_context(prompt)
|
| 456 |
except Exception as e:
|