Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -388,7 +388,7 @@ class RAGIndex:
|
|
| 388 |
return answer
|
| 389 |
|
| 390 |
def answer(self, question: str) -> str:
|
| 391 |
-
"""Answer a question using RAG
|
| 392 |
if not self.initialized:
|
| 393 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 394 |
|
|
@@ -402,7 +402,7 @@ class RAGIndex:
|
|
| 402 |
f"Supported formats: .txt, .md, .pdf, .docx"
|
| 403 |
)
|
| 404 |
|
| 405 |
-
#
|
| 406 |
contexts = self.retrieve(question, top_k=3)
|
| 407 |
|
| 408 |
if not contexts:
|
|
@@ -412,64 +412,69 @@ class RAGIndex:
|
|
| 412 |
)
|
| 413 |
|
| 414 |
used_sources = set()
|
|
|
|
|
|
|
| 415 |
|
| 416 |
-
#
|
| 417 |
-
evidence_parts = []
|
| 418 |
for ctx, source, score in contexts:
|
| 419 |
used_sources.add(source)
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
|
| 424 |
-
if not
|
| 425 |
-
return
|
| 426 |
-
f"{NO_ANSWER_MSG}\n\n"
|
| 427 |
-
f"π‘ Try rephrasing your question or adding more detailed documents to the knowledge base."
|
| 428 |
-
)
|
| 429 |
|
| 430 |
-
#
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
answer_text = self._generate_from_context(answer_prompt, max_new_tokens=200)
|
| 444 |
-
answer_text = answer_text.strip()
|
| 445 |
-
|
| 446 |
-
# Clean up common artifacts
|
| 447 |
-
answer_text = answer_text.replace("**", "").replace("##", "").strip()
|
| 448 |
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
answer_text.startswith("Do NOT") or
|
| 453 |
-
"tone tone tone" in answer_text.lower()):
|
| 454 |
|
| 455 |
-
#
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
#
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
return (
|
| 467 |
-
"There was an error while generating the answer. "
|
| 468 |
-
"Please try again with a shorter question or different wording."
|
| 469 |
-
)
|
| 470 |
-
|
| 471 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 472 |
-
|
| 473 |
return (
|
| 474 |
f"**Answer:** {answer_text}\n\n"
|
| 475 |
f"**Sources:** {sources_str}"
|
|
|
|
| 388 |
return answer
|
| 389 |
|
| 390 |
def answer(self, question: str) -> str:
|
| 391 |
+
"""Answer a question using RAG - simplified extractive approach."""
|
| 392 |
if not self.initialized:
|
| 393 |
return "❌ Assistant not properly initialized. Please check the logs."
|
| 394 |
|
|
|
|
| 402 |
f"Supported formats: .txt, .md, .pdf, .docx"
|
| 403 |
)
|
| 404 |
|
| 405 |
+
# Retrieve relevant contexts
|
| 406 |
contexts = self.retrieve(question, top_k=3)
|
| 407 |
|
| 408 |
if not contexts:
|
|
|
|
| 412 |
)
|
| 413 |
|
| 414 |
used_sources = set()
|
| 415 |
+
best_context = None
|
| 416 |
+
best_score = 0
|
| 417 |
|
| 418 |
+
# Find the best matching context
|
|
|
|
| 419 |
for ctx, source, score in contexts:
|
| 420 |
used_sources.add(source)
|
| 421 |
+
if score > best_score:
|
| 422 |
+
best_score = score
|
| 423 |
+
best_context = ctx
|
| 424 |
|
| 425 |
+
if not best_context:
|
| 426 |
+
return f"{NO_ANSWER_MSG}"
|
|
|
|
|
|
|
|
|
|
| 427 |
|
| 428 |
+
# AGGRESSIVE cleaning of the context
|
| 429 |
+
def deep_clean(text):
|
| 430 |
+
"""Remove ALL markdown, bullets, numbers, emojis, and formatting."""
|
| 431 |
+
# Remove emojis and special characters
|
| 432 |
+
text = re.sub(r'[πππ’π‘π ββββ οΈπ‘π]', '', text)
|
| 433 |
+
# Remove markdown headers (# ## ###)
|
| 434 |
+
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
|
| 435 |
+
# Remove numbered lists (1. 2. 3.)
|
| 436 |
+
text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
|
| 437 |
+
# Remove bullet points (- * •)
|
| 438 |
+
text = re.sub(r'^\s*[-*•]\s+', '', text, flags=re.MULTILINE)
|
| 439 |
+
# Remove bold/italic (**text** *text*)
|
| 440 |
+
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
|
| 441 |
+
text = re.sub(r'\*([^*]+)\*', r'\1', text)
|
| 442 |
+
# Remove extra colons from labels
|
| 443 |
+
text = re.sub(r':\s*$', '', text, flags=re.MULTILINE)
|
| 444 |
+
# Clean multiple spaces
|
| 445 |
+
text = re.sub(r'\s+', ' ', text)
|
| 446 |
+
# Remove "Good:" "Bad:" type prefixes
|
| 447 |
+
text = re.sub(r'^(Good|Bad|Example|Note):\s*', '', text, flags=re.MULTILINE)
|
| 448 |
+
return text.strip()
|
| 449 |
|
| 450 |
+
cleaned = deep_clean(best_context)
|
| 451 |
+
|
| 452 |
+
# Extract just the most relevant sentences (3-4 sentences max)
|
| 453 |
+
sentences = [s.strip() + '.' for s in cleaned.split('.') if len(s.strip()) > 20]
|
| 454 |
+
answer_text = ' '.join(sentences[:4]) # First 4 good sentences
|
| 455 |
+
|
| 456 |
+
# If we got good text, try to generate a natural answer
|
| 457 |
+
if len(answer_text) > 50:
|
| 458 |
+
# Simple prompt for FLAN-T5
|
| 459 |
+
prompt = f"Question: {question}\n\nInformation: {answer_text[:800]}\n\nWrite a clear answer in 2-3 sentences:"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
+
try:
|
| 462 |
+
generated = self._generate_from_context(prompt, max_new_tokens=150)
|
| 463 |
+
generated = generated.strip()
|
|
|
|
|
|
|
| 464 |
|
| 465 |
+
# Only use generated answer if it looks good
|
| 466 |
+
if (len(generated) > 30 and
|
| 467 |
+
not generated.startswith(("Do NOT", "You are", "##", "**")) and
|
| 468 |
+
generated.count(':') < 3):
|
| 469 |
+
answer_text = generated
|
| 470 |
+
# Otherwise, keep the cleaned extractive answer
|
| 471 |
+
|
| 472 |
+
except Exception as e:
|
| 473 |
+
print(f"Generation error (using extractive fallback): {e}")
|
| 474 |
+
# Keep the cleaned extractive answer
|
| 475 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
|
| 477 |
+
|
| 478 |
return (
|
| 479 |
f"**Answer:** {answer_text}\n\n"
|
| 480 |
f"**Sources:** {sources_str}"
|