rohannsinghal committed on
Commit
ae9db2f
·
1 Parent(s): 9c2df30

update main_api.py

Browse files
Files changed (1) hide show
  1. app/main_api.py +85 -46
app/main_api.py CHANGED
@@ -420,37 +420,79 @@ class OptimizedSemanticRAGPipeline:
420
  logger.info(f"✅ Optimized semantic RAG pipeline initialized: {collection_name}")
421
 
422
  def clean_response(self, answer: str) -> str:
423
- """Clean up the response formatting for better readability"""
424
  if not answer:
425
  return answer
426
 
427
- # Remove excessive newlines
428
- answer = re.sub(r'\n\s*\n\s*\n+', '\n\n', answer) # Multiple newlines to double
429
- answer = re.sub(r'\n\s*\n', '\n\n', answer) # Ensure consistent double newlines for paragraphs
430
-
431
- # Remove quotes around single words and short phrases
432
- answer = re.sub(r'"([A-Z\s]{2,20})"', r'\1', answer) # Remove quotes from short caps phrases
433
- answer = re.sub(r'"(\w+)"', r'\1', answer) # Remove quotes from single words
434
- answer = re.sub(r'"(Rs\. [\d,]+[/-]*)"', r'\1', answer) # Remove quotes from amounts
435
- answer = re.sub(r'"(\d+%)"', r'\1', answer) # Remove quotes from percentages
436
- answer = re.sub(r'"(\d+ (?:days?|months?|years?))"', r'\1', answer) # Remove quotes from time periods
437
-
438
- # Clean up policy references - keep important quotes but make them flow better
439
- answer = re.sub(r'As stated in the policy: "([^"]+)"', r'The policy states that \1', answer)
440
- answer = re.sub(r'According to the policy document: "([^"]+)"', r'According to the policy document, \1', answer)
441
- answer = re.sub(r'The policy states: "([^"]+)"', r'The policy states that \1', answer)
442
- answer = re.sub(r'As per the policy: "([^"]+)"', r'As per the policy, \1', answer)
443
-
444
- # Fix spacing and formatting
445
- answer = re.sub(r'\s+', ' ', answer) # Multiple spaces to single space
446
- answer = answer.replace(' ,', ',') # Fix spacing before commas
447
- answer = answer.replace(' .', '.') # Fix spacing before periods
448
- answer = answer.strip() # Remove leading/trailing whitespace
449
-
450
- # Clean up excessive line breaks in the middle of sentences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  answer = re.sub(r'([a-z,])\s*\n\s*([a-z])', r'\1 \2', answer)
452
 
453
- return answer
 
 
 
 
 
 
 
 
454
 
455
  def add_documents(self, chunks: List[Dict[str, Any]]):
456
  if not chunks:
@@ -521,36 +563,33 @@ class OptimizedSemanticRAGPipeline:
521
  }
522
  )
523
 
524
- # Enhanced semantic prompt template with better formatting
525
  prompt_template = PromptTemplate(
526
  input_variables=["context", "question"],
527
- template="""You are an expert insurance policy analyst with semantic understanding capabilities. Analyze the policy document context to provide accurate, detailed answers.
528
 
529
  POLICY DOCUMENT CONTEXT:
530
  {context}
531
 
532
  QUESTION: {question}
533
 
534
- SEMANTIC ANALYSIS INSTRUCTIONS:
535
- - Carefully analyze the semantic meaning and relationships in the policy context
536
- - Extract specific facts: numbers, percentages, time periods, conditions, and requirements
537
- - Understand implicit connections between different policy sections
538
- - Quote exact policy language when providing specific details, but format quotes naturally
539
- - Synthesize information from multiple context sections when relevant
540
- - Distinguish between explicit statements and reasonable inferences
541
- - If information is partial, provide what's available and note limitations
 
 
 
 
542
  - Be precise about conditions, exceptions, and qualifying circumstances
 
543
 
544
- FORMATTING GUIDELINES:
545
- - Write in clear, professional paragraphs without unnecessary line breaks
546
- - When quoting policy text, integrate quotes smoothly into sentences
547
- - Use bullet points or numbered lists only when listing multiple related items
548
- - Avoid excessive quotation marks around single words or short phrases
549
- - Write numbers and percentages directly (e.g., 30 days, 5%, Rs. 10,000) without quotes
550
- - Make the response flow naturally and be easy to read
551
-
552
- ANSWER FORMAT:
553
- Provide a comprehensive, well-formatted answer based on your semantic analysis of the policy document context.
554
 
555
  ANSWER:"""
556
  )
 
420
  logger.info(f"✅ Optimized semantic RAG pipeline initialized: {collection_name}")
421
 
422
  def clean_response(self, answer: str) -> str:
423
+ """Comprehensive response cleaning for professional formatting"""
424
  if not answer:
425
  return answer
426
 
427
+ # Remove excessive newlines first
428
+ answer = re.sub(r'\n\s*\n\s*\n+', '\n\n', answer)
429
+ answer = re.sub(r'\n\s*\n', '\n\n', answer)
430
+
431
+ # Remove ALL excessive quotation marks - comprehensive patterns
432
+ # Remove quotes around single words
433
+ answer = re.sub(r'"(\w+)"', r'\1', answer)
434
+
435
+ # Remove quotes around short phrases (2-5 words)
436
+ answer = re.sub(r'"([^"]{1,50})"', r'\1', answer)
437
+
438
+ # Remove quotes around ALL CAPS words/phrases
439
+ answer = re.sub(r'"([A-Z\s]{2,50})"', r'\1', answer)
440
+
441
+ # Remove quotes around numbers, percentages, amounts
442
+ answer = re.sub(r'"(Rs\.?\s*[\d,]+[/-]*)"', r'\1', answer)
443
+ answer = re.sub(r'"(\d+%)"', r'\1', answer)
444
+ answer = re.sub(r'"(\d+\s*(?:days?|months?|years?|lacs?))"', r'\1', answer)
445
+ answer = re.sub(r'"(\d+[.,]\d+)"', r'\1', answer)
446
+
447
+ # Remove quotes around plan names and policy terms
448
+ answer = re.sub(r'"(Plan\s+[A-Z])"', r'\1', answer)
449
+ answer = re.sub(r'"([A-Z]+\s*[A-Z]*)"', r'\1', answer)
450
+
451
+ # Clean up policy statement formats - make them flow naturally
452
+ answer = re.sub(r'[Aa]s stated in the policy[:\s]*"([^"]+)"', r'As per the policy, \1', answer)
453
+ answer = re.sub(r'[Aa]ccording to the policy[:\s]*"([^"]+)"', r'According to the policy, \1', answer)
454
+ answer = re.sub(r'[Tt]he policy states[:\s]*"([^"]+)"', r'The policy states that \1', answer)
455
+ answer = re.sub(r'[Aa]s per the policy[:\s]*"([^"]+)"', r'As per the policy, \1', answer)
456
+ answer = re.sub(r'[Tt]he policy mentions[:\s]*"([^"]+)"', r'The policy mentions that \1', answer)
457
+
458
+ # Remove quotes from technical terms and common insurance phrases
459
+ insurance_terms = [
460
+ 'sum insured', 'waiting period', 'grace period', 'pre-existing',
461
+ 'cumulative bonus', 'no claim discount', 'room rent', 'icu charges',
462
+ 'ayush', 'hospital', 'medical expenses', 'policy period', 'exclusion',
463
+ 'inpatient', 'outpatient', 'domiciliary', 'cashless', 'reimbursement'
464
+ ]
465
+
466
+ for term in insurance_terms:
467
+ # Remove quotes around these terms (case insensitive)
468
+ pattern = f'"{re.escape(term)}"'
469
+ answer = re.sub(pattern, term, answer, flags=re.IGNORECASE)
470
+ # Also handle capitalized versions
471
+ pattern = f'"{re.escape(term.upper())}"'
472
+ answer = re.sub(pattern, term.upper(), answer, flags=re.IGNORECASE)
473
+
474
+ # Clean up remaining problematic quote patterns
475
+ answer = re.sub(r'"\s*([^"]*)\s*"', r'\1', answer) # Any remaining quoted text
476
+
477
+ # Fix spacing issues
478
+ answer = re.sub(r'\s+', ' ', answer) # Multiple spaces to single
479
+ answer = answer.replace(' ,', ',') # Space before comma
480
+ answer = answer.replace(' .', '.') # Space before period
481
+ answer = answer.replace('( ', '(') # Space after opening parenthesis
482
+ answer = answer.replace(' )', ')') # Space before closing parenthesis
483
+
484
+ # Clean up line breaks within sentences
485
  answer = re.sub(r'([a-z,])\s*\n\s*([a-z])', r'\1 \2', answer)
486
 
487
+ # Final cleanup - remove any remaining escape characters
488
+ answer = answer.replace('\\"', '"') # Remove escape characters
489
+ answer = answer.replace('\\n', ' ') # Convert literal \n to space
490
+ answer = answer.replace('\\"', '') # Remove any remaining escaped quotes
491
+
492
+ # Ensure proper sentence structure
493
+ answer = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', answer) # Space after sentence end
494
+
495
+ return answer.strip()
496
 
497
  def add_documents(self, chunks: List[Dict[str, Any]]):
498
  if not chunks:
 
563
  }
564
  )
565
 
566
+ # Enhanced semantic prompt template with strict formatting rules
567
  prompt_template = PromptTemplate(
568
  input_variables=["context", "question"],
569
+ template="""You are an expert insurance policy analyst. Analyze the policy document context to provide accurate, detailed answers.
570
 
571
  POLICY DOCUMENT CONTEXT:
572
  {context}
573
 
574
  QUESTION: {question}
575
 
576
+ CRITICAL FORMATTING INSTRUCTIONS:
577
+ - Write in natural, flowing sentences without excessive quotation marks
578
+ - When referencing policy text, paraphrase or integrate naturally into sentences
579
+ - Do NOT put quotes around single words, numbers, percentages, or short phrases
580
+ - Do NOT put quotes around plan names (Plan A), amounts (Rs. 5,000), or time periods (30 days)
581
+ - Write numbers and amounts directly: 30 days, 5%, Rs. 10,000, Plan A
582
+ - Use quotes ONLY for exact lengthy policy clauses that need verbatim citation
583
+ - Make the text read like professional analysis, not a quote-heavy document
584
+
585
+ ANALYSIS INSTRUCTIONS:
586
+ - Extract specific facts: numbers, percentages, time periods, conditions
587
+ - Understand relationships between different policy sections
588
  - Be precise about conditions, exceptions, and qualifying circumstances
589
+ - If information is partial, state what's available and note limitations
590
 
591
+ RESPONSE STYLE:
592
+ Write a comprehensive, naturally flowing analysis that reads professionally without excessive quotation marks or formatting issues.
 
 
 
 
 
 
 
 
593
 
594
  ANSWER:"""
595
  )