Spaces:
Sleeping
Sleeping
Commit ·
ae9db2f
1
Parent(s): 9c2df30
update main_api.py
Browse files- app/main_api.py +85 -46
app/main_api.py
CHANGED
|
@@ -420,37 +420,79 @@ class OptimizedSemanticRAGPipeline:
|
|
| 420 |
logger.info(f"✅ Optimized semantic RAG pipeline initialized: {collection_name}")
|
| 421 |
|
| 422 |
def clean_response(self, answer: str) -> str:
|
| 423 |
-
"""
|
| 424 |
if not answer:
|
| 425 |
return answer
|
| 426 |
|
| 427 |
-
# Remove excessive newlines
|
| 428 |
-
answer = re.sub(r'\n\s*\n\s*\n+', '\n\n', answer)
|
| 429 |
-
answer = re.sub(r'\n\s*\n', '\n\n', answer)
|
| 430 |
-
|
| 431 |
-
# Remove
|
| 432 |
-
|
| 433 |
-
answer = re.sub(r'"(\w+)"', r'\1', answer)
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
answer = re.sub(r'"(
|
| 437 |
-
|
| 438 |
-
#
|
| 439 |
-
answer = re.sub(r'
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
answer = re.sub(r'
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
answer = re.sub(r'\
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
answer =
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
answer = re.sub(r'([a-z,])\s*\n\s*([a-z])', r'\1 \2', answer)
|
| 452 |
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 456 |
if not chunks:
|
|
@@ -521,36 +563,33 @@ class OptimizedSemanticRAGPipeline:
|
|
| 521 |
}
|
| 522 |
)
|
| 523 |
|
| 524 |
-
# Enhanced semantic prompt template with
|
| 525 |
prompt_template = PromptTemplate(
|
| 526 |
input_variables=["context", "question"],
|
| 527 |
-
template="""You are an expert insurance policy analyst
|
| 528 |
|
| 529 |
POLICY DOCUMENT CONTEXT:
|
| 530 |
{context}
|
| 531 |
|
| 532 |
QUESTION: {question}
|
| 533 |
|
| 534 |
-
|
| 535 |
-
-
|
| 536 |
-
-
|
| 537 |
-
-
|
| 538 |
-
-
|
| 539 |
-
-
|
| 540 |
-
-
|
| 541 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
- Be precise about conditions, exceptions, and qualifying circumstances
|
|
|
|
| 543 |
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
- When quoting policy text, integrate quotes smoothly into sentences
|
| 547 |
-
- Use bullet points or numbered lists only when listing multiple related items
|
| 548 |
-
- Avoid excessive quotation marks around single words or short phrases
|
| 549 |
-
- Write numbers and percentages directly (e.g., 30 days, 5%, Rs. 10,000) without quotes
|
| 550 |
-
- Make the response flow naturally and be easy to read
|
| 551 |
-
|
| 552 |
-
ANSWER FORMAT:
|
| 553 |
-
Provide a comprehensive, well-formatted answer based on your semantic analysis of the policy document context.
|
| 554 |
|
| 555 |
ANSWER:"""
|
| 556 |
)
|
|
|
|
| 420 |
logger.info(f"✅ Optimized semantic RAG pipeline initialized: {collection_name}")
|
| 421 |
|
| 422 |
def clean_response(self, answer: str) -> str:
    """Normalise an LLM answer into a single, quote-free line of prose.

    The passes run in this order:

    1. Escaped quotes (backslash + quote) and literal backslash-n
       sequences are unescaped first, so every later pass sees real
       characters. Doing this last leaves stray backslashes behind,
       because the quote pass pairs the opening quote with the escaped
       closing one.
    2. Double quotes are removed pair by pair, scanning left to right.
       When a quotation longer than 50 characters directly follows a
       lead-in such as "the policy states:", the lead-in is rewritten so
       the clause reads as running text ("the policy states that ...").
       Shorter quotations are treated as quoted terms: only the quotes
       are dropped and the lead-in is left untouched.
    3. Every whitespace run, newlines included, collapses to one space,
       so the result is always a single line.
    4. Spaces before ``,`` / ``.`` / ``)`` and after ``(`` are removed,
       and a space is enforced between sentence-ending punctuation and a
       following capital letter.

    Args:
        answer: Raw answer text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned answer with surrounding whitespace stripped.
    """
    if not answer:
        return answer

    # Quotations up to this length are terms/values, not verbatim clauses.
    short_quote_limit = 50

    # Lower-cased lead-in -> replacement used before a long quotation.
    lead_in_rewrites = {
        'as stated in the policy': 'as per the policy, ',
        'according to the policy': 'according to the policy, ',
        'the policy states': 'the policy states that ',
        'as per the policy': 'as per the policy, ',
        'the policy mentions': 'the policy mentions that ',
    }

    # One pair-aware pattern: an optional lead-in (plus its colon/space
    # separator) followed by a complete "..." pair.
    quoted_span = re.compile(
        r'(?:(?P<intro>[Aa]s stated in the policy|[Aa]ccording to the policy|'
        r'[Tt]he policy states|[Aa]s per the policy|[Tt]he policy mentions)'
        r'(?P<sep>[:\s]*))?'
        r'"(?P<body>[^"]*)"'
    )

    def _unquote(match):
        body = match.group('body')
        intro = match.group('intro')
        if intro is None:
            return body
        if len(body) <= short_quote_limit:
            # Short quoted term: keep the lead-in exactly as written.
            return intro + match.group('sep') + body
        lead = lead_in_rewrites[intro.lower()]
        # Keep the author's capitalisation so a mid-sentence lead-in is
        # not turned into "... that The policy states ...".
        if intro[0].isupper():
            lead = lead[0].upper() + lead[1:]
        return lead + body

    # 1. Unescape before anything else looks at quotes or whitespace.
    answer = answer.replace('\\"', '"').replace('\\n', ' ')

    # 2. Drop paired quotes, rewriting lead-ins of long citations.
    answer = quoted_span.sub(_unquote, answer)

    # 3. Flatten all whitespace (spaces, tabs, newlines) to single spaces.
    answer = re.sub(r'\s+', ' ', answer)

    # 4. Tidy spacing around punctuation and parentheses.
    for untidy, tidy in ((' ,', ','), (' .', '.'), ('( ', '('), (' )', ')')):
        answer = answer.replace(untidy, tidy)

    # Guarantee exactly one space between a sentence end and the next
    # capitalised sentence.
    answer = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', answer)

    return answer.strip()
|
| 496 |
|
| 497 |
def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 498 |
if not chunks:
|
|
|
|
| 563 |
}
|
| 564 |
)
|
| 565 |
|
| 566 |
+
# Enhanced semantic prompt template with strict formatting rules
|
| 567 |
prompt_template = PromptTemplate(
|
| 568 |
input_variables=["context", "question"],
|
| 569 |
+
template="""You are an expert insurance policy analyst. Analyze the policy document context to provide accurate, detailed answers.
|
| 570 |
|
| 571 |
POLICY DOCUMENT CONTEXT:
|
| 572 |
{context}
|
| 573 |
|
| 574 |
QUESTION: {question}
|
| 575 |
|
| 576 |
+
CRITICAL FORMATTING INSTRUCTIONS:
|
| 577 |
+
- Write in natural, flowing sentences without excessive quotation marks
|
| 578 |
+
- When referencing policy text, paraphrase or integrate naturally into sentences
|
| 579 |
+
- Do NOT put quotes around single words, numbers, percentages, or short phrases
|
| 580 |
+
- Do NOT put quotes around plan names (Plan A), amounts (Rs. 5,000), or time periods (30 days)
|
| 581 |
+
- Write numbers and amounts directly: 30 days, 5%, Rs. 10,000, Plan A
|
| 582 |
+
- Use quotes ONLY for exact lengthy policy clauses that need verbatim citation
|
| 583 |
+
- Make the text read like professional analysis, not a quote-heavy document
|
| 584 |
+
|
| 585 |
+
ANALYSIS INSTRUCTIONS:
|
| 586 |
+
- Extract specific facts: numbers, percentages, time periods, conditions
|
| 587 |
+
- Understand relationships between different policy sections
|
| 588 |
- Be precise about conditions, exceptions, and qualifying circumstances
|
| 589 |
+
- If information is partial, state what's available and note limitations
|
| 590 |
|
| 591 |
+
RESPONSE STYLE:
|
| 592 |
+
Write a comprehensive, naturally flowing analysis that reads professionally without excessive quotation marks or formatting issues.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
ANSWER:"""
|
| 595 |
)
|