Spaces:
Runtime error
Runtime error
Commit ·
1a4bf58
1
Parent(s): e0ad8eb
Fix document relevance filtering for voice bot
Browse files
- Simplified content matching keywords to be more inclusive
- Changed 'retirement benefit' to 'retirement' and 'benefits' separately
- Reduced penalty from -1.2 to -0.8 for non-matching content
- Lowered relevance threshold from 0.3 to 0.1 for better recall
- Added debug logging for content matches
- Now correctly finds pension documents for 'pension rules impact' queries
- rag_service.py +8 -6
rag_service.py
CHANGED
|
@@ -279,9 +279,9 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
|
|
| 279 |
|
| 280 |
# Content categories - what each document type contains
|
| 281 |
content_categories = {
|
| 282 |
-
'pension': ['pension', 'retirement
|
| 283 |
-
'leave': ['leave
|
| 284 |
-
'allowance': ['
|
| 285 |
'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
|
| 286 |
'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
|
| 287 |
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
|
@@ -300,18 +300,20 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
|
|
| 300 |
if detected_query_category:
|
| 301 |
# Boost score if document content matches query category
|
| 302 |
matching_content_keywords = content_categories.get(detected_query_category, [])
|
| 303 |
-
|
|
|
|
| 304 |
relevance_score += 1.5 # Strong boost for matching content
|
|
|
|
| 305 |
|
| 306 |
# Penalize documents from different categories
|
| 307 |
for other_category, other_keywords in content_categories.items():
|
| 308 |
if other_category != detected_query_category:
|
| 309 |
if any(keyword in content for keyword in other_keywords):
|
| 310 |
-
relevance_score -= 1.2
|
| 311 |
|
| 312 |
logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
|
| 313 |
|
| 314 |
-
if relevance_score > 0.3:
|
| 315 |
# Add relevance score to document (create dict if needed)
|
| 316 |
if hasattr(doc, 'metadata'):
|
| 317 |
doc.metadata['relevance_score'] = relevance_score
|
|
|
|
| 279 |
|
| 280 |
# Content categories - what each document type contains
|
| 281 |
content_categories = {
|
| 282 |
+
'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'superannuation', 'benefits'],
|
| 283 |
+
'leave': ['leave', 'casual', 'earned', 'medical leave', 'maternity'],
|
| 284 |
+
'allowance': ['allowance', 'dearness', 'house rent', 'travel', 'da', 'hra', 'increment'],
|
| 285 |
'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
|
| 286 |
'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
|
| 287 |
'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
|
|
|
|
| 300 |
if detected_query_category:
|
| 301 |
# Boost score if document content matches query category
|
| 302 |
matching_content_keywords = content_categories.get(detected_query_category, [])
|
| 303 |
+
content_matches = [kw for kw in matching_content_keywords if kw in content]
|
| 304 |
+
if content_matches:
|
| 305 |
relevance_score += 1.5 # Strong boost for matching content
|
| 306 |
+
logger.debug(f"✅ Content match found: {content_matches} for category: {detected_query_category}")
|
| 307 |
|
| 308 |
# Penalize documents from different categories
|
| 309 |
for other_category, other_keywords in content_categories.items():
|
| 310 |
if other_category != detected_query_category:
|
| 311 |
if any(keyword in content for keyword in other_keywords):
|
| 312 |
+
relevance_score -= 0.8 # Moderate penalty for non-matching content
|
| 313 |
|
| 314 |
logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
|
| 315 |
|
| 316 |
+
if relevance_score > 0.1: # More inclusive threshold for relevant documents
|
| 317 |
# Add relevance score to document (create dict if needed)
|
| 318 |
if hasattr(doc, 'metadata'):
|
| 319 |
doc.metadata['relevance_score'] = relevance_score
|