Spaces:

ChAbhishek28
/

PensionBot

Runtime error

ChAbhishek28 committed on Oct 2, 2025

Commit

1a4bf58

1 Parent(s): e0ad8eb

Fix document relevance filtering for voice bot

- Simplified content matching keywords to be more inclusive
- Changed 'retirement benefit' to 'retirement' and 'benefits' separately
- Reduced penalty from -1.2 to -0.8 for non-matching content
- Lowered relevance threshold from 0.3 to 0.1 for better recall
- Added debug logging for content matches
- Now correctly finds pension documents for 'pension rules impact' queries

Files changed (1) hide show

rag_service.py +8 -6

rag_service.py CHANGED Viewed

@@ -279,9 +279,9 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
             # Content categories - what each document type contains
             content_categories = {
-                'pension': ['pension', 'retirement benefit', 'gratuity', 'provident fund', 'superannuation'],
-                'leave': ['leave rules', 'casual leave', 'earned leave', 'medical leave', 'maternity leave'],
-                'allowance': ['dearness allowance', 'house rent allowance', 'travel allowance', 'da', 'hra', 'increment'],
                 'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
                 'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
                 'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
@@ -300,18 +300,20 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
             if detected_query_category:
                 # Boost score if document content matches query category
                 matching_content_keywords = content_categories.get(detected_query_category, [])
-                if any(keyword in content for keyword in matching_content_keywords):
                     relevance_score += 1.5  # Strong boost for matching content
                 # Penalize documents from different categories
                 for other_category, other_keywords in content_categories.items():
                     if other_category != detected_query_category:
                         if any(keyword in content for keyword in other_keywords):
-                            relevance_score -= 1.2  # Heavy penalty for non-matching content
                 logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
-            if relevance_score > 0.3:  # Only include relevant documents
                 # Add relevance score to document (create dict if needed)
                 if hasattr(doc, 'metadata'):
                     doc.metadata['relevance_score'] = relevance_score

             # Content categories - what each document type contains
             content_categories = {
+                'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'superannuation', 'benefits'],
+                'leave': ['leave', 'casual', 'earned', 'medical leave', 'maternity'],
+                'allowance': ['allowance', 'dearness', 'house rent', 'travel', 'da', 'hra', 'increment'],
                 'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
                 'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
                 'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
             if detected_query_category:
                 # Boost score if document content matches query category
                 matching_content_keywords = content_categories.get(detected_query_category, [])
+                content_matches = [kw for kw in matching_content_keywords if kw in content]
+                if content_matches:
                     relevance_score += 1.5  # Strong boost for matching content
+                    logger.debug(f"✅ Content match found: {content_matches} for category: {detected_query_category}")
                 # Penalize documents from different categories
                 for other_category, other_keywords in content_categories.items():
                     if other_category != detected_query_category:
                         if any(keyword in content for keyword in other_keywords):
+                            relevance_score -= 0.8  # Moderate penalty for non-matching content
                 logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
+            if relevance_score > 0.1:  # More inclusive threshold for relevant documents
                 # Add relevance score to document (create dict if needed)
                 if hasattr(doc, 'metadata'):
                     doc.metadata['relevance_score'] = relevance_score