Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import glob
|
| 3 |
import yaml
|
|
@@ -102,7 +103,45 @@ def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
|
|
| 102 |
start += chunk_size - overlap
|
| 103 |
|
| 104 |
return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
def load_file_text(path: str) -> str:
|
| 108 |
"""Load text from various file formats with error handling"""
|
|
@@ -342,6 +381,8 @@ class RAGIndex:
|
|
| 342 |
**inputs,
|
| 343 |
max_new_tokens=128,
|
| 344 |
do_sample=False,
|
|
|
|
|
|
|
| 345 |
)
|
| 346 |
|
| 347 |
answer = self.qa_tokenizer.decode(
|
|
@@ -386,6 +427,8 @@ class RAGIndex:
|
|
| 386 |
|
| 387 |
combined_text = "\n\n".join(combined_context)
|
| 388 |
|
|
|
|
|
|
|
| 389 |
|
| 390 |
# Limit context length to keep it manageable
|
| 391 |
max_context_chars = 4000
|
|
@@ -395,15 +438,19 @@ class RAGIndex:
|
|
| 395 |
# Prompt for the generative model
|
| 396 |
prompt = (
|
| 397 |
"You are an AI assistant that answers questions using only the provided context.\n"
|
| 398 |
-
"
|
| 399 |
-
"- Do NOT
|
|
|
|
|
|
|
|
|
|
| 400 |
"- If the answer cannot be found in the context, reply exactly with: "
|
| 401 |
"\"I don't know based on the provided documents.\"\n\n"
|
| 402 |
f"Context:\n{combined_text}\n\n"
|
| 403 |
f"Question: {question}\n\n"
|
| 404 |
-
"Answer in 1–3 concise sentences:"
|
| 405 |
)
|
| 406 |
|
|
|
|
| 407 |
try:
|
| 408 |
answer_text = self._generate_from_context(prompt)
|
| 409 |
except Exception as e:
|
|
|
|
| 1 |
+
import re
|
| 2 |
import os
|
| 3 |
import glob
|
| 4 |
import yaml
|
|
|
|
| 103 |
start += chunk_size - overlap
|
| 104 |
|
| 105 |
return chunks
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def clean_context_text(text: str) -> str:
    """Clean raw document context before sending to the generator.

    Strips markdown headings (``#``, ``##``, ...), ordered-list prefixes
    (``1. ``, ``2) ``), and bullet markers (``- ``, ``* ``); drops blank
    or very short "noise" lines; and removes exact duplicate lines while
    preserving first-seen order.

    Args:
        text: Raw context text, possibly containing markdown formatting.

    Returns:
        The cleaned text, one surviving line per input line, joined with
        newlines. Empty input yields an empty string.
    """
    seen = set()
    cleaned = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Order matters: strip the heading marker first so that
        # "# 1. Title" is reduced to "1. Title" and then to "Title".
        # Remove markdown headings like "# 1. Title", "## Section".
        line = re.sub(r"^#+\s*", "", line)

        # Remove ordered-list prefixes like "1. ", "2) ".
        line = re.sub(r"^\d+[.)]\s*", "", line)

        # Remove bullet markers like "- ", "* ".
        line = re.sub(r"^[-*]\s*", "", line)

        # Skip very short "noise" lines left over after stripping.
        if len(line) < 5:
            continue

        # Avoid exact duplicates.
        if line in seen:
            continue
        seen.add(line)
        cleaned.append(line)

    return "\n".join(cleaned)
|
| 145 |
|
| 146 |
def load_file_text(path: str) -> str:
|
| 147 |
"""Load text from various file formats with error handling"""
|
|
|
|
| 381 |
**inputs,
|
| 382 |
max_new_tokens=128,
|
| 383 |
do_sample=False,
|
| 384 |
+
top_p=0.9,
|
| 385 |
+
temperature=0.7,
|
| 386 |
)
|
| 387 |
|
| 388 |
answer = self.qa_tokenizer.decode(
|
|
|
|
| 427 |
|
| 428 |
combined_text = "\n\n".join(combined_context)
|
| 429 |
|
| 430 |
+
# Clean markdown / numbering / duplicates
|
| 431 |
+
combined_text = clean_context_text(combined_text)
|
| 432 |
|
| 433 |
# Limit context length to keep it manageable
|
| 434 |
max_context_chars = 4000
|
|
|
|
| 438 |
# Prompt for the generative model
|
| 439 |
prompt = (
|
| 440 |
"You are an AI assistant that answers questions using only the provided context.\n"
|
| 441 |
+
"Your task is to synthesize a clear, natural explanation in your own words.\n"
|
| 442 |
+
"- Do NOT copy headings or section numbers from the context.\n"
|
| 443 |
+
"- Do NOT include markdown like '#', '##', '---', or bullet/list markers.\n"
|
| 444 |
+
"- Do NOT mention file names, sources, or internal labels in your answer.\n"
|
| 445 |
+
"- Do NOT just repeat full sentences from the context; always paraphrase.\n"
|
| 446 |
"- If the answer cannot be found in the context, reply exactly with: "
|
| 447 |
"\"I don't know based on the provided documents.\"\n\n"
|
| 448 |
f"Context:\n{combined_text}\n\n"
|
| 449 |
f"Question: {question}\n\n"
|
| 450 |
+
"Answer in 1–3 concise sentences of plain text:"
|
| 451 |
)
|
| 452 |
|
| 453 |
+
|
| 454 |
try:
|
| 455 |
answer_text = self._generate_from_context(prompt)
|
| 456 |
except Exception as e:
|