ChAbhishek28 committed on
Commit
1a4bf58
·
1 Parent(s): e0ad8eb

Fix document relevance filtering for voice bot

Browse files

- Simplified content matching keywords to be more inclusive
- Split the 'retirement benefit' keyword into two separate keywords, 'retirement' and 'benefits'
- Reduced the penalty applied for each other category whose keywords appear in the content from 1.2 to 0.8
- Lowered relevance threshold from 0.3 to 0.1 for better recall
- Added debug logging for content matches
- Now correctly finds pension documents for 'pension rules impact' queries

Files changed (1) hide show
  1. rag_service.py +8 -6
rag_service.py CHANGED
@@ -279,9 +279,9 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
279
 
280
  # Content categories - what each document type contains
281
  content_categories = {
282
- 'pension': ['pension', 'retirement benefit', 'gratuity', 'provident fund', 'superannuation'],
283
- 'leave': ['leave rules', 'casual leave', 'earned leave', 'medical leave', 'maternity leave'],
284
- 'allowance': ['dearness allowance', 'house rent allowance', 'travel allowance', 'da', 'hra', 'increment'],
285
  'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
286
  'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
287
  'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
@@ -300,18 +300,20 @@ async def search_documents_async(query: str, limit: int = 5) -> List[Dict[str, A
300
  if detected_query_category:
301
  # Boost score if document content matches query category
302
  matching_content_keywords = content_categories.get(detected_query_category, [])
303
- if any(keyword in content for keyword in matching_content_keywords):
 
304
  relevance_score += 1.5 # Strong boost for matching content
 
305
 
306
  # Penalize documents from different categories
307
  for other_category, other_keywords in content_categories.items():
308
  if other_category != detected_query_category:
309
  if any(keyword in content for keyword in other_keywords):
310
- relevance_score -= 1.2 # Heavy penalty for non-matching content
311
 
312
  logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
313
 
314
- if relevance_score > 0.3: # Only include relevant documents
315
  # Add relevance score to document (create dict if needed)
316
  if hasattr(doc, 'metadata'):
317
  doc.metadata['relevance_score'] = relevance_score
 
279
 
280
  # Content categories - what each document type contains
281
  content_categories = {
282
+ 'pension': ['pension', 'retirement', 'gratuity', 'provident fund', 'superannuation', 'benefits'],
283
+ 'leave': ['leave', 'casual', 'earned', 'medical leave', 'maternity'],
284
+ 'allowance': ['allowance', 'dearness', 'house rent', 'travel', 'da', 'hra', 'increment'],
285
  'procurement': ['procurement', 'tender', 'bidding', 'contract', 'vendor', 'gem', 'purchase'],
286
  'medical': ['medical', 'health', 'cghs', 'reimbursement', 'treatment'],
287
  'transfer': ['transfer', 'posting', 'deputation', 'cadre'],
 
300
  if detected_query_category:
301
  # Boost score if document content matches query category
302
  matching_content_keywords = content_categories.get(detected_query_category, [])
303
+ content_matches = [kw for kw in matching_content_keywords if kw in content]
304
+ if content_matches:
305
  relevance_score += 1.5 # Strong boost for matching content
306
+ logger.debug(f"✅ Content match found: {content_matches} for category: {detected_query_category}")
307
 
308
  # Penalize documents from different categories
309
  for other_category, other_keywords in content_categories.items():
310
  if other_category != detected_query_category:
311
  if any(keyword in content for keyword in other_keywords):
312
+ relevance_score -= 0.8 # Moderate penalty for non-matching content
313
 
314
  logger.debug(f"Query category: {detected_query_category}, Relevance: {relevance_score:.2f} for content: {content[:50]}...")
315
 
316
+ if relevance_score > 0.1: # More inclusive threshold for relevant documents
317
  # Add relevance score to document (create dict if needed)
318
  if hasattr(doc, 'metadata'):
319
  doc.metadata['relevance_score'] = relevance_score