sofzcc committed on
Commit
c90e2a0
·
verified ·
1 Parent(s): f7a09e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -51
app.py CHANGED
@@ -388,7 +388,7 @@ class RAGIndex:
388
  return answer
389
 
390
  def answer(self, question: str) -> str:
391
- """Answer a question using RAG with improved generation."""
392
  if not self.initialized:
393
  return "❌ Assistant not properly initialized. Please check the logs."
394
 
@@ -402,7 +402,7 @@ class RAGIndex:
402
  f"Supported formats: .txt, .md, .pdf, .docx"
403
  )
404
 
405
- # 1) Retrieve relevant contexts
406
  contexts = self.retrieve(question, top_k=3)
407
 
408
  if not contexts:
@@ -412,64 +412,69 @@ class RAGIndex:
412
  )
413
 
414
  used_sources = set()
 
 
415
 
416
- # 2) Collect and clean the best contexts
417
- evidence_parts = []
418
  for ctx, source, score in contexts:
419
  used_sources.add(source)
420
- cleaned_ctx = clean_context_text(ctx)
421
- if cleaned_ctx.strip():
422
- evidence_parts.append(cleaned_ctx)
423
 
424
- if not evidence_parts:
425
- return (
426
- f"{NO_ANSWER_MSG}\n\n"
427
- f"💡 Try rephrasing your question or adding more detailed documents to the knowledge base."
428
- )
429
 
430
- # Combine contexts - use MORE context for better answers
431
- combined_context = " ".join(evidence_parts[:3])[:1500] # Top 3, up to 1500 chars
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
- # 3) Better prompt structure for FLAN-T5
434
- answer_prompt = f"""Read the context and answer the question in 2-3 complete sentences.
435
-
436
- Context: {combined_context}
437
-
438
- Question: {question}
439
-
440
- Answer in complete sentences:"""
441
-
442
- try:
443
- answer_text = self._generate_from_context(answer_prompt, max_new_tokens=200)
444
- answer_text = answer_text.strip()
445
-
446
- # Clean up common artifacts
447
- answer_text = answer_text.replace("**", "").replace("##", "").strip()
448
 
449
- # If answer is poor quality, try alternative approach
450
- if (len(answer_text) < 20 or
451
- answer_text.count(":") > 3 or
452
- answer_text.startswith("Do NOT") or
453
- "tone tone tone" in answer_text.lower()):
454
 
455
- # Try extractive approach: just clean and present the best context
456
- best_context = evidence_parts[0]
457
- # Remove list markers and clean
458
- best_context = re.sub(r'^\s*[-*]\s*', '', best_context, flags=re.MULTILINE)
459
- best_context = re.sub(r'^\s*\d+\.\s*', '', best_context, flags=re.MULTILINE)
460
- # Take first few sentences
461
- sentences = best_context.split('.')[:3]
462
- answer_text = '. '.join(s.strip() for s in sentences if s.strip()) + '.'
463
-
464
- except Exception as e:
465
- print(f"Generation error: {e}")
466
- return (
467
- "There was an error while generating the answer. "
468
- "Please try again with a shorter question or different wording."
469
- )
470
-
471
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
472
-
473
  return (
474
  f"**Answer:** {answer_text}\n\n"
475
  f"**Sources:** {sources_str}"
 
388
  return answer
389
 
390
  def answer(self, question: str) -> str:
391
+ """Answer a question using RAG - simplified extractive approach."""
392
  if not self.initialized:
393
  return "❌ Assistant not properly initialized. Please check the logs."
394
 
 
402
  f"Supported formats: .txt, .md, .pdf, .docx"
403
  )
404
 
405
+ # Retrieve relevant contexts
406
  contexts = self.retrieve(question, top_k=3)
407
 
408
  if not contexts:
 
412
  )
413
 
414
  used_sources = set()
415
+ best_context = None
416
+ best_score = 0
417
 
418
+ # Find the best matching context
 
419
  for ctx, source, score in contexts:
420
  used_sources.add(source)
421
+ if score > best_score:
422
+ best_score = score
423
+ best_context = ctx
424
 
425
+ if not best_context:
426
+ return f"{NO_ANSWER_MSG}"
 
 
 
427
 
428
+ # AGGRESSIVE cleaning of the context
429
+ def deep_clean(text):
430
+ """Remove ALL markdown, bullets, numbers, emojis, and formatting."""
431
+ # Remove emojis and special characters
432
+ text = re.sub(r'[📘📄🟢🟡🟠✓✗❌⚠️💡📚]', '', text)
433
+ # Remove markdown headers (# ## ###)
434
+ text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
435
+ # Remove numbered lists (1. 2. 3.)
436
+ text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
437
+ # Remove bullet points (- * •)
438
+ text = re.sub(r'^\s*[-*•]\s+', '', text, flags=re.MULTILINE)
439
+ # Remove bold/italic (**text** *text*)
440
+ text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
441
+ text = re.sub(r'\*([^*]+)\*', r'\1', text)
442
+ # Remove extra colons from labels
443
+ text = re.sub(r':\s*$', '', text, flags=re.MULTILINE)
444
+ # Clean multiple spaces
445
+ text = re.sub(r'\s+', ' ', text)
446
+ # Remove "Good:" "Bad:" type prefixes
447
+ text = re.sub(r'^(Good|Bad|Example|Note):\s*', '', text, flags=re.MULTILINE)
448
+ return text.strip()
449
 
450
+ cleaned = deep_clean(best_context)
451
+
452
+ # Extract just the most relevant sentences (3-4 sentences max)
453
+ sentences = [s.strip() + '.' for s in cleaned.split('.') if len(s.strip()) > 20]
454
+ answer_text = ' '.join(sentences[:4]) # First 4 good sentences
455
+
456
+ # If we got good text, try to generate a natural answer
457
+ if len(answer_text) > 50:
458
+ # Simple prompt for FLAN-T5
459
+ prompt = f"Question: {question}\n\nInformation: {answer_text[:800]}\n\nWrite a clear answer in 2-3 sentences:"
 
 
 
 
 
460
 
461
+ try:
462
+ generated = self._generate_from_context(prompt, max_new_tokens=150)
463
+ generated = generated.strip()
 
 
464
 
465
+ # Only use generated answer if it looks good
466
+ if (len(generated) > 30 and
467
+ not generated.startswith(("Do NOT", "You are", "##", "**")) and
468
+ generated.count(':') < 3):
469
+ answer_text = generated
470
+ # Otherwise, keep the cleaned extractive answer
471
+
472
+ except Exception as e:
473
+ print(f"Generation error (using extractive fallback): {e}")
474
+ # Keep the cleaned extractive answer
475
+
 
 
 
 
 
476
  sources_str = ", ".join(sorted(used_sources)) if used_sources else "N/A"
477
+
478
  return (
479
  f"**Answer:** {answer_text}\n\n"
480
  f"**Sources:** {sources_str}"