Spaces:
Sleeping
Sleeping
Commit ·
9c2df30
1
Parent(s): 43a0745
updated main_api.py
Browse files- app/main_api.py +50 -6
app/main_api.py
CHANGED
|
@@ -419,6 +419,39 @@ class OptimizedSemanticRAGPipeline:
|
|
| 419 |
self.qa_chain = None
|
| 420 |
logger.info(f"β
Optimized semantic RAG pipeline initialized: {collection_name}")
|
| 421 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 423 |
if not chunks:
|
| 424 |
logger.error("β No chunks provided!")
|
|
@@ -488,7 +521,7 @@ class OptimizedSemanticRAGPipeline:
|
|
| 488 |
}
|
| 489 |
)
|
| 490 |
|
| 491 |
-
# Enhanced semantic prompt template
|
| 492 |
prompt_template = PromptTemplate(
|
| 493 |
input_variables=["context", "question"],
|
| 494 |
template="""You are an expert insurance policy analyst with semantic understanding capabilities. Analyze the policy document context to provide accurate, detailed answers.
|
|
@@ -502,14 +535,22 @@ SEMANTIC ANALYSIS INSTRUCTIONS:
|
|
| 502 |
- Carefully analyze the semantic meaning and relationships in the policy context
|
| 503 |
- Extract specific facts: numbers, percentages, time periods, conditions, and requirements
|
| 504 |
- Understand implicit connections between different policy sections
|
| 505 |
-
- Quote exact policy language when providing specific details
|
| 506 |
- Synthesize information from multiple context sections when relevant
|
| 507 |
- Distinguish between explicit statements and reasonable inferences
|
| 508 |
- If information is partial, provide what's available and note limitations
|
| 509 |
- Be precise about conditions, exceptions, and qualifying circumstances
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
ANSWER FORMAT:
|
| 512 |
-
Provide a comprehensive answer based on your semantic analysis of the policy document context.
|
| 513 |
|
| 514 |
ANSWER:"""
|
| 515 |
)
|
|
@@ -533,10 +574,13 @@ ANSWER:"""
|
|
| 533 |
try:
|
| 534 |
# Retrieve with semantic understanding
|
| 535 |
result = await asyncio.to_thread(self.qa_chain, {"query": question})
|
| 536 |
-
|
| 537 |
|
| 538 |
-
|
| 539 |
-
|
|
|
|
|
|
|
|
|
|
| 540 |
|
| 541 |
except Exception as e:
|
| 542 |
logger.error(f"β Error during semantic QA: {e}")
|
|
|
|
| 419 |
self.qa_chain = None
|
| 420 |
logger.info(f"β
Optimized semantic RAG pipeline initialized: {collection_name}")
|
| 421 |
|
| 422 |
+
def clean_response(self, answer: str) -> str:
    """Tidy the formatting of a generated answer while keeping its paragraphs.

    The clean-up removes quotation marks around single words, short
    all-caps phrases, rupee amounts, percentages and time periods, rewrites
    a few fixed "policy says" lead-ins so the quoted text reads as part of
    the sentence, and normalises whitespace.

    Whitespace handling is newline-aware: runs of spaces/tabs collapse to a
    single space, a lone line break between a lowercase letter (or comma)
    and a lowercase letter is treated as a mid-sentence wrap and replaced
    by a space, and two or more consecutive line breaks collapse to exactly
    one blank line. Paragraph breaks and list items therefore survive.

    Args:
        answer: Raw answer text. Falsy values (``None`` or ``""``) are
            returned unchanged.

    Returns:
        The cleaned answer with leading/trailing whitespace removed.
    """
    if not answer:
        return answer

    # Normalise line endings first so every later rule only has to reason
    # about "\n".
    answer = answer.replace('\r\n', '\n').replace('\r', '\n')

    # Remove quotes around single words and short phrases.
    # The all-caps phrase must start and end with a capital letter; allowing
    # whitespace at the edges would let the pattern pair the closing quote
    # of one quotation with the opening quote of the next one.
    answer = re.sub(r'"([A-Z][A-Z\s]{0,18}[A-Z])"', r'\1', answer)
    answer = re.sub(r'"(\w+)"', r'\1', answer)  # single words
    answer = re.sub(r'"(Rs\. [\d,]+[/-]*)"', r'\1', answer)  # amounts
    answer = re.sub(r'"(\d+%)"', r'\1', answer)  # percentages
    answer = re.sub(r'"(\d+ (?:days?|months?|years?))"', r'\1', answer)  # time periods

    # Fold fixed lead-in phrases and their quoted text into one sentence.
    answer = re.sub(r'As stated in the policy: "([^"]+)"', r'The policy states that \1', answer)
    answer = re.sub(r'According to the policy document: "([^"]+)"', r'According to the policy document, \1', answer)
    answer = re.sub(r'The policy states: "([^"]+)"', r'The policy states that \1', answer)
    answer = re.sub(r'As per the policy: "([^"]+)"', r'As per the policy, \1', answer)

    # Collapse horizontal whitespace only; newlines are handled separately
    # so paragraph structure is not flattened into a single line.
    answer = re.sub(r'[^\S\n]+', ' ', answer)
    answer = re.sub(r' ?\n ?', '\n', answer)  # no stray spaces around line breaks

    # A single line break in the middle of a sentence becomes a space.
    # Lookarounds keep the neighbouring characters unconsumed so that
    # consecutive wrapped lines are all joined.
    answer = re.sub(r'(?<=[a-z,])\n(?=[a-z])', ' ', answer)

    # Any run of blank lines becomes exactly one paragraph break.
    answer = re.sub(r'\n{2,}', '\n\n', answer)

    # Fix spacing before commas and periods.
    answer = answer.replace(' ,', ',').replace(' .', '.')

    return answer.strip()
|
| 454 |
+
|
| 455 |
def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 456 |
if not chunks:
|
| 457 |
logger.error("β No chunks provided!")
|
|
|
|
| 521 |
}
|
| 522 |
)
|
| 523 |
|
| 524 |
+
# Enhanced semantic prompt template with better formatting
|
| 525 |
prompt_template = PromptTemplate(
|
| 526 |
input_variables=["context", "question"],
|
| 527 |
template="""You are an expert insurance policy analyst with semantic understanding capabilities. Analyze the policy document context to provide accurate, detailed answers.
|
|
|
|
| 535 |
- Carefully analyze the semantic meaning and relationships in the policy context
|
| 536 |
- Extract specific facts: numbers, percentages, time periods, conditions, and requirements
|
| 537 |
- Understand implicit connections between different policy sections
|
| 538 |
+
- Quote exact policy language when providing specific details, but format quotes naturally
|
| 539 |
- Synthesize information from multiple context sections when relevant
|
| 540 |
- Distinguish between explicit statements and reasonable inferences
|
| 541 |
- If information is partial, provide what's available and note limitations
|
| 542 |
- Be precise about conditions, exceptions, and qualifying circumstances
|
| 543 |
|
| 544 |
+
FORMATTING GUIDELINES:
|
| 545 |
+
- Write in clear, professional paragraphs without unnecessary line breaks
|
| 546 |
+
- When quoting policy text, integrate quotes smoothly into sentences
|
| 547 |
+
- Use bullet points or numbered lists only when listing multiple related items
|
| 548 |
+
- Avoid excessive quotation marks around single words or short phrases
|
| 549 |
+
- Write numbers and percentages directly (e.g., 30 days, 5%, Rs. 10,000) without quotes
|
| 550 |
+
- Make the response flow naturally and be easy to read
|
| 551 |
+
|
| 552 |
ANSWER FORMAT:
|
| 553 |
+
Provide a comprehensive, well-formatted answer based on your semantic analysis of the policy document context.
|
| 554 |
|
| 555 |
ANSWER:"""
|
| 556 |
)
|
|
|
|
| 574 |
try:
|
| 575 |
# Retrieve with semantic understanding
|
| 576 |
result = await asyncio.to_thread(self.qa_chain, {"query": question})
|
| 577 |
+
raw_answer = result.get("result", "Failed to generate semantic answer.")
|
| 578 |
|
| 579 |
+
# Clean up the response formatting
|
| 580 |
+
clean_answer = self.clean_response(raw_answer)
|
| 581 |
+
|
| 582 |
+
logger.info(f"β
Semantic answer generated: {len(clean_answer)} characters")
|
| 583 |
+
return clean_answer
|
| 584 |
|
| 585 |
except Exception as e:
|
| 586 |
logger.error(f"β Error during semantic QA: {e}")
|