jashdoshi77 committed on
Commit
84cc717
·
1 Parent(s): abc646e

HYBRID RAG ADD

Browse files
Files changed (1) hide show
  1. services/rag_service.py +181 -24
services/rag_service.py CHANGED
@@ -333,11 +333,48 @@ Query: "show me policies for XYZ Industries"
333
 
334
  except Exception as e:
335
  print(f"[AI QUERY PARSER] Error: {e}, falling back to pattern matching")
336
- # Fallback to basic detection with new fields
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  return {
338
- "intent": "specific",
339
- "needs_metadata": False,
340
- "filters": {},
341
  "sort_by": None,
342
  "sort_order": "desc",
343
  "limit": None,
@@ -1211,6 +1248,29 @@ Summary: {summary[:300] if summary else 'No summary available'}
1211
  if filter_terms:
1212
  search_query = f"{query} {' '.join(filter_terms)}"
1213
  print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1214
 
1215
  # For fallback searches, use more aggressive parameters
1216
  if is_fallback:
@@ -1354,14 +1414,29 @@ Summary: {summary[:300] if summary else 'No summary available'}
1354
  metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
1355
  print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")
1356
 
1357
- # Step 2: Always get RAG context for detailed content
1358
- # If metadata returned 0, use fallback mode for more aggressive search
1359
- # Also detect document names in query for targeted search
 
 
 
 
 
 
 
 
 
 
1360
  metadata_has_results = metadata_result.get('total_documents', 0) > 0
 
 
 
 
 
1361
  rag_result = self._get_rag_context_for_query(
1362
  user_id, bucket_id, query,
1363
  filters=parsed.get('filters'),
1364
- is_fallback=not metadata_has_results, # Use fallback mode if metadata failed
1365
  doc_ids=None # Document name detection happens inside the method
1366
  )
1367
  print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")
@@ -1436,11 +1511,21 @@ Do NOT say information is missing - search through ALL provided context thorough
1436
 
1437
  CRITICAL INSTRUCTIONS:
1438
  1. You have BOTH structured metadata AND detailed document content.
1439
- 2. Use metadata for: policy number, insured name, sum insured, premium, dates.
1440
- 3. Use detailed content for: coverage details, terms, conditions, exclusions.
1441
  4. Provide a comprehensive answer covering all relevant information.
1442
  5. Format clearly with headers and bullet points.
1443
 
 
 
 
 
 
 
 
 
 
 
1444
  {format_instructions}
1445
 
1446
  Do NOT say information is missing - search through ALL provided context thoroughly."""
@@ -1452,10 +1537,17 @@ CRITICAL INSTRUCTIONS:
1452
  1. You have BOTH structured metadata AND detailed document content.
1453
  2. Search thoroughly through ALL provided context before answering.
1454
  3. Use metadata for structured fields like names, amounts, dates.
1455
- 4. Use detailed content for explanations, terms, conditions.
1456
  5. Provide a complete and accurate answer based on the documents.
1457
  6. Format clearly with headers and bullet points where appropriate.
1458
 
 
 
 
 
 
 
 
1459
  {format_instructions}
1460
 
1461
  Do NOT say information is missing - search through ALL provided context thoroughly."""
@@ -1474,7 +1566,12 @@ Do NOT say information is missing - search through ALL provided context thorough
1474
  except Exception as e:
1475
  print(f"[HYBRID STREAM] Failed to load history: {e}")
1476
 
1477
- # Step 6: Build messages
 
 
 
 
 
1478
  messages = [{"role": "system", "content": system_prompt}]
1479
 
1480
  for msg in stored_history:
@@ -1485,14 +1582,40 @@ Do NOT say information is missing - search through ALL provided context thorough
1485
 
1486
  format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""
1487
 
1488
- user_message = f"""Based on the following document data, answer my question comprehensively.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1489
 
1490
  DOCUMENT DATA:
1491
  {context}
1492
-
1493
  QUESTION: {query}
1494
 
1495
- Instructions: Use both the structured metadata AND detailed content to provide a complete answer.{format_reminder}"""
 
 
 
 
 
1496
 
1497
  messages.append({"role": "user", "content": user_message})
1498
 
@@ -2457,21 +2580,55 @@ Instructions: Synthesize from multiple documents if relevant. Be detailed but co
2457
  # Route based on AI-parsed intent
2458
  intent = parsed.get('intent', 'specific')
2459
  needs_metadata = parsed.get('needs_metadata', False)
 
2460
 
2461
- # HYBRID ROUTING LOGIC:
2462
- # 1. For aggregate/list/count/rank queries: Use metadata (with RAG fallback)
2463
- # 2. For ALL other queries: Use HYBRID (metadata + RAG together) for comprehensive answers
2464
-
 
 
 
 
 
 
 
 
2465
  if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
2466
- # Aggregate queries - metadata is primary, RAG is fallback (handled inside)
2467
- print(f"[QUERY ROUTING] Using METADATA path for {intent} query")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2468
  yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
2469
  return
2470
 
2471
  else:
2472
- # ALL other queries (specific, compare, general, summarize, followup)
2473
- # Use HYBRID approach - both metadata AND RAG for comprehensive answers
2474
- print(f"[QUERY ROUTING] Using HYBRID path for {intent} query")
 
 
 
2475
  yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
2476
  return
2477
 
 
333
 
334
  except Exception as e:
335
  print(f"[AI QUERY PARSER] Error: {e}, falling back to pattern matching")
336
+ # Fallback: Try to extract entity names from query even when JSON parsing fails
337
+ filters = {}
338
+ query_lower = query.lower()
339
+
340
+ # Try to extract entity names (common patterns for company/college names)
341
+ # Look for capitalized words or multi-word entities (handles both uppercase and lowercase)
342
+ import re
343
+ # Pattern: "how many total students are insured in prahladrai dalmia"
344
+ # Extract names that appear after "in", "for", "about", "of"
345
+ name_patterns = [
346
+ r'(?:in|for|about|of|at)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', # "in Prahladrai Dalmia" (capitalized)
347
+ r'(?:in|for|about|of|at)\s+([a-z]+(?:\s+[a-z]+){1,4})', # "in prahladrai dalmia" (lowercase)
348
+ r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})', # Multi-word capitalized names anywhere
349
+ ]
350
+
351
+ for pattern in name_patterns:
352
+ matches = re.findall(pattern, query)
353
+ if matches:
354
+ # Take the longest match (most specific)
355
+ entity_name = max(matches, key=len)
356
+ if len(entity_name.split()) >= 2: # At least 2 words
357
+ # Capitalize first letter of each word for consistency
358
+ entity_name = ' '.join(word.capitalize() for word in entity_name.split())
359
+ filters['insured_name'] = entity_name
360
+ print(f"[AI QUERY PARSER] Fallback extracted entity: {entity_name}")
361
+ break
362
+
363
+ # Detect intent from keywords
364
+ intent = "specific"
365
+ if any(word in query_lower for word in ['how many', 'count', 'total number']):
366
+ intent = "count"
367
+ elif any(word in query_lower for word in ['list all', 'show all', 'all policies']):
368
+ intent = "list"
369
+ needs_metadata = True
370
+ else:
371
+ needs_metadata = False
372
+
373
+ # Fallback to basic detection with extracted filters
374
  return {
375
+ "intent": intent,
376
+ "needs_metadata": needs_metadata,
377
+ "filters": filters,
378
  "sort_by": None,
379
  "sort_order": "desc",
380
  "limit": None,
 
1248
  if filter_terms:
1249
  search_query = f"{query} {' '.join(filter_terms)}"
1250
  print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
1251
+ else:
1252
+ # If no filters, try to extract entity names directly from query for better search
1253
+ # This helps when AI parser fails but query contains entity names
1254
+ import re
1255
+ # Look for multi-word names (handles both uppercase and lowercase)
1256
+ # Pattern: "how many total students are insured in prahladrai dalmia"
1257
+ name_patterns = [
1258
+ r'(?:in|for|about|of|at)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', # Capitalized after preposition
1259
+ r'(?:in|for|about|of|at)\s+([a-z]+(?:\s+[a-z]+){1,4})', # Lowercase after preposition
1260
+ r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})', # Capitalized anywhere
1261
+ ]
1262
+
1263
+ for pattern in name_patterns:
1264
+ name_matches = re.findall(pattern, query)
1265
+ if name_matches:
1266
+ # Use the longest match (most specific entity name)
1267
+ entity_name = max(name_matches, key=len)
1268
+ if len(entity_name.split()) >= 2: # At least 2 words
1269
+ # Capitalize first letter of each word for consistency
1270
+ entity_name = ' '.join(word.capitalize() for word in entity_name.split())
1271
+ search_query = f"{query} {entity_name}"
1272
+ print(f"[HYBRID RAG] Extracted entity from query for search: {entity_name}")
1273
+ break
1274
 
1275
  # For fallback searches, use more aggressive parameters
1276
  if is_fallback:
 
1414
  metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
1415
  print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")
1416
 
1417
+ # Step 2: Detect if query needs detailed content (numbers, counts, totals, students, etc.)
1418
+ # For these queries, ALWAYS use aggressive RAG search even if metadata has results
1419
+ query_lower = query.lower()
1420
+ needs_detailed_content = any(keyword in query_lower for keyword in [
1421
+ 'how many', 'total', 'count', 'number of', 'students', 'sum insured',
1422
+ 'total sum', 'aggregate', 'amount', 'quantity', 'coverage', 'insured persons',
1423
+ 'lives', 'members', 'people', 'individuals'
1424
+ ])
1425
+
1426
+ # Step 3: Always get RAG context for detailed content
1427
+ # Use aggressive search (is_fallback=True) if:
1428
+ # - Metadata has no results, OR
1429
+ # - Query needs detailed content (numbers, counts, totals) - metadata might not have these
1430
  metadata_has_results = metadata_result.get('total_documents', 0) > 0
1431
+ use_aggressive_rag = not metadata_has_results or needs_detailed_content
1432
+
1433
+ if needs_detailed_content:
1434
+ print(f"[HYBRID STREAM] Query needs detailed content - using aggressive RAG search regardless of metadata results")
1435
+
1436
  rag_result = self._get_rag_context_for_query(
1437
  user_id, bucket_id, query,
1438
  filters=parsed.get('filters'),
1439
+ is_fallback=use_aggressive_rag, # Use aggressive search for detailed content queries
1440
  doc_ids=None # Document name detection happens inside the method
1441
  )
1442
  print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")
 
1511
 
1512
  CRITICAL INSTRUCTIONS:
1513
  1. You have BOTH structured metadata AND detailed document content.
1514
+ 2. Use metadata for: policy number, insured name, basic premium, dates.
1515
+ 3. Use detailed content for: coverage details, terms, conditions, exclusions, numbers, counts, totals, sum insured, students, etc.
1516
  4. Provide a comprehensive answer covering all relevant information.
1517
  5. Format clearly with headers and bullet points.
1518
 
1519
+ FINDING NUMBERS AND TOTALS (CRITICAL):
1520
+ - When asked about "how many", "total", "sum insured", "students", "count" - search EVERY section
1521
+ - The DETAILED DOCUMENT CONTENT section is MORE IMPORTANT than metadata for finding numbers
1522
+ - Metadata may have policy info but NOT the detailed numbers - always check detailed content
1523
+ - Look for: numbers, totals, aggregates, counts, quantities, amounts
1524
+ - Information may be phrased as: "total sum insured", "aggregate SI", "Sum Insured", "number of students", "insured students", etc.
1525
+ - NEVER say "cannot be determined" or "not available" unless you've checked EVERY single document section
1526
+ - If you find ANY number related to the question, include it in your answer
1527
+ - If metadata doesn't have the answer, it's DEFINITELY in the detailed content - keep searching!
1528
+
1529
  {format_instructions}
1530
 
1531
  Do NOT say information is missing - search through ALL provided context thoroughly."""
 
1537
  1. You have BOTH structured metadata AND detailed document content.
1538
  2. Search thoroughly through ALL provided context before answering.
1539
  3. Use metadata for structured fields like names, amounts, dates.
1540
+ 4. Use detailed content for explanations, terms, conditions, numbers, totals, counts.
1541
  5. Provide a complete and accurate answer based on the documents.
1542
  6. Format clearly with headers and bullet points where appropriate.
1543
 
1544
+ FINDING NUMBERS AND TOTALS (CRITICAL):
1545
+ - When asked about "how many", "total", "sum insured", "students", "count" - search EVERY section
1546
+ - Look for: numbers, totals, aggregates, counts, quantities, amounts
1547
+ - Information may be phrased as: "total sum insured", "aggregate SI", "Sum Insured", "number of students", "insured students", etc.
1548
+ - NEVER say "cannot be determined" or "not available" unless you've checked EVERY single document section
1549
+ - If you find ANY number related to the question, include it in your answer
1550
+
1551
  {format_instructions}
1552
 
1553
  Do NOT say information is missing - search through ALL provided context thoroughly."""
 
1566
  except Exception as e:
1567
  print(f"[HYBRID STREAM] Failed to load history: {e}")
1568
 
1569
+ # Step 6: Detect query type and build conversation context for pronoun resolution
1570
+ query_type = self._detect_query_type(query, stored_history)
1571
+ conversation_context = self._build_conversation_context(stored_history, query)
1572
+ print(f"[HYBRID STREAM] Query type: {query_type}, has conversation context: {bool(conversation_context)}")
1573
+
1574
+ # Step 7: Build messages
1575
  messages = [{"role": "system", "content": system_prompt}]
1576
 
1577
  for msg in stored_history:
 
1582
 
1583
  format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""
1584
 
1585
+ # Build user message with context injection for pronouns
1586
+ context_injection = ""
1587
+ if query_type == 'followup' and conversation_context:
1588
+ context_injection = f"""
1589
+ CONVERSATION CONTEXT (use this to understand pronouns like "it", "this", "that"):
1590
+ {conversation_context}
1591
+
1592
+ """
1593
+
1594
+ # Add emphasis on using RAG content when query needs detailed information
1595
+ detailed_content_emphasis = ""
1596
+ if needs_detailed_content:
1597
+ detailed_content_emphasis = """
1598
+
1599
+ CRITICAL: This query asks for detailed information (numbers, counts, totals, students, sum insured, etc.).
1600
+ - The METADATA section may have policy information but NOT the detailed numbers
1601
+ - The DETAILED DOCUMENT CONTENT section contains the actual numbers, counts, and totals
1602
+ - You MUST search through the DETAILED DOCUMENT CONTENT section to find the answer
1603
+ - If metadata doesn't have the answer, the answer is definitely in the detailed content - keep searching!
1604
+ """
1605
+
1606
+ user_message = f"""{context_injection}Based on the following document data, answer my question comprehensively.
1607
 
1608
  DOCUMENT DATA:
1609
  {context}
1610
+ {detailed_content_emphasis}
1611
  QUESTION: {query}
1612
 
1613
+ Instructions:
1614
+ - Use both the structured metadata AND detailed content to provide a complete answer
1615
+ - If this is a follow-up, use conversation history to understand what I'm referring to
1616
+ - Search THOROUGHLY through ALL document sections for numbers, totals, counts, students, sum insured, etc.
1617
+ - For questions about numbers/counts/totals: The DETAILED DOCUMENT CONTENT section is more important than metadata
1618
+ - NEVER say information is missing unless you've checked every single section{format_reminder}"""
1619
 
1620
  messages.append({"role": "user", "content": user_message})
1621
 
 
2580
  # Route based on AI-parsed intent
2581
  intent = parsed.get('intent', 'specific')
2582
  needs_metadata = parsed.get('needs_metadata', False)
2583
+ filters = parsed.get('filters', {})
2584
 
2585
+ # HYBRID ROUTING LOGIC (UPDATED):
2586
+ # 1. Use METADATA path ONLY for true aggregate queries that need ALL documents:
2587
+ # - Queries with "list all", "all policies", "all documents" that don't filter by specific entity
2588
+ # - Queries asking for aggregate data across ALL documents (e.g., "all GMC policies", "renewals in march")
2589
+ # - These queries need to scan ALL documents, so metadata is more efficient
2590
+ # 2. Use HYBRID path for EVERYTHING else:
2591
+ # - Specific entity queries (even if they say "list all X policies" for a specific company)
2592
+ # - Questions about specific documents/entities
2593
+ # - Any query that filters by insured_name, insurer_name, or other specific entity
2594
+
2595
+ # Check if this is a TRUE aggregate query (needs all documents, no specific entity filter)
2596
+ is_true_aggregate = False
2597
  if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
2598
+ # It's a true aggregate if:
2599
+ # 1. Query explicitly asks for "all" documents/policies (not filtered to specific entity)
2600
+ # 2. No specific entity filters (insured_name, insurer_name) are present
2601
+ # 3. OR it's asking for aggregate data like "all GMC policies", "all fire policies" (policy type, not entity)
2602
+ query_lower = query.lower()
2603
+ has_all_keyword = any(phrase in query_lower for phrase in [
2604
+ 'list all', 'all policies', 'all documents', 'all the policies',
2605
+ 'every policy', 'every document', 'all the documents'
2606
+ ])
2607
+
2608
+ # Check if filtering by specific entity (company, person, etc.)
2609
+ has_entity_filter = bool(filters.get('insured_name') or filters.get('insurer_name') or filters.get('broker_name'))
2610
+
2611
+ # True aggregate = has "all" keyword AND no specific entity filter
2612
+ # OR it's asking for aggregate by type (policy_type, industry) without entity
2613
+ if has_all_keyword and not has_entity_filter:
2614
+ is_true_aggregate = True
2615
+ elif not has_entity_filter and (filters.get('policy_type') or filters.get('industry')):
2616
+ # Aggregate by type (e.g., "all fire policies", "all manufacturing") - use metadata
2617
+ is_true_aggregate = True
2618
+
2619
+ if is_true_aggregate:
2620
+ # True aggregate queries - metadata is primary, RAG is fallback (handled inside)
2621
+ print(f"[QUERY ROUTING] Using METADATA path for aggregate {intent} query (needs all documents)")
2622
  yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
2623
  return
2624
 
2625
  else:
2626
+ # ALL other queries - use HYBRID approach (metadata + RAG together)
2627
+ # This includes:
2628
+ # - Specific entity queries (even if they say "list all X policies")
2629
+ # - Questions about specific documents/entities
2630
+ # - Any query with entity filters
2631
+ print(f"[QUERY ROUTING] Using HYBRID path for {intent} query (specific entity or detailed content)")
2632
  yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
2633
  return
2634