Spaces:

jashdoshi77
/

notebooklm-fast

Running

App Files Community

jashdoshi77 committed 1 day ago

Commit

84cc717

1 Parent(s): abc646e

HYBRID RAG ADD

Browse files

Files changed (1) hide show

services/rag_service.py +181 -24

services/rag_service.py CHANGED Viewed

@@ -333,11 +333,48 @@ Query: "show me policies for XYZ Industries"
         except Exception as e:
             print(f"[AI QUERY PARSER] Error: {e}, falling back to pattern matching")
-            # Fallback to basic detection with new fields
             return {
-                "intent": "specific",
-                "needs_metadata": False,
-                "filters": {},
                 "sort_by": None,
                 "sort_order": "desc",
                 "limit": None,
@@ -1211,6 +1248,29 @@ Summary: {summary[:300] if summary else 'No summary available'}
             if filter_terms:
                 search_query = f"{query} {' '.join(filter_terms)}"
                 print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
         # For fallback searches, use more aggressive parameters
         if is_fallback:
@@ -1354,14 +1414,29 @@ Summary: {summary[:300] if summary else 'No summary available'}
         metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
         print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")
-        # Step 2: Always get RAG context for detailed content
-        # If metadata returned 0, use fallback mode for more aggressive search
-        # Also detect document names in query for targeted search
         metadata_has_results = metadata_result.get('total_documents', 0) > 0
         rag_result = self._get_rag_context_for_query(
             user_id, bucket_id, query,
             filters=parsed.get('filters'),
-            is_fallback=not metadata_has_results,  # Use fallback mode if metadata failed
             doc_ids=None  # Document name detection happens inside the method
         )
         print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")
@@ -1436,11 +1511,21 @@ Do NOT say information is missing - search through ALL provided context thorough
 CRITICAL INSTRUCTIONS:
 1. You have BOTH structured metadata AND detailed document content.
-2. Use metadata for: policy number, insured name, sum insured, premium, dates.
-3. Use detailed content for: coverage details, terms, conditions, exclusions.
 4. Provide a comprehensive answer covering all relevant information.
 5. Format clearly with headers and bullet points.
 {format_instructions}
 Do NOT say information is missing - search through ALL provided context thoroughly."""
@@ -1452,10 +1537,17 @@ CRITICAL INSTRUCTIONS:
 1. You have BOTH structured metadata AND detailed document content.
 2. Search thoroughly through ALL provided context before answering.
 3. Use metadata for structured fields like names, amounts, dates.
-4. Use detailed content for explanations, terms, conditions.
 5. Provide a complete and accurate answer based on the documents.
 6. Format clearly with headers and bullet points where appropriate.
 {format_instructions}
 Do NOT say information is missing - search through ALL provided context thoroughly."""
@@ -1474,7 +1566,12 @@ Do NOT say information is missing - search through ALL provided context thorough
             except Exception as e:
                 print(f"[HYBRID STREAM] Failed to load history: {e}")
-        # Step 6: Build messages
         messages = [{"role": "system", "content": system_prompt}]
         for msg in stored_history:
@@ -1485,14 +1582,40 @@ Do NOT say information is missing - search through ALL provided context thorough
         format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""
-        user_message = f"""Based on the following document data, answer my question comprehensively.
 DOCUMENT DATA:
 {context}
 QUESTION: {query}
-Instructions: Use both the structured metadata AND detailed content to provide a complete answer.{format_reminder}"""
         messages.append({"role": "user", "content": user_message})
@@ -2457,21 +2580,55 @@ Instructions: Synthesize from multiple documents if relevant. Be detailed but co
         # Route based on AI-parsed intent
         intent = parsed.get('intent', 'specific')
         needs_metadata = parsed.get('needs_metadata', False)
-        # HYBRID ROUTING LOGIC:
-        # 1. For aggregate/list/count/rank queries: Use metadata (with RAG fallback)
-        # 2. For ALL other queries: Use HYBRID (metadata + RAG together) for comprehensive answers
         if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
-            # Aggregate queries - metadata is primary, RAG is fallback (handled inside)
-            print(f"[QUERY ROUTING] Using METADATA path for {intent} query")
             yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
             return
         else:
-            # ALL other queries (specific, compare, general, summarize, followup)
-            # Use HYBRID approach - both metadata AND RAG for comprehensive answers
-            print(f"[QUERY ROUTING] Using HYBRID path for {intent} query")
             yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
             return

         except Exception as e:
             print(f"[AI QUERY PARSER] Error: {e}, falling back to pattern matching")
+            # Fallback: Try to extract entity names from query even when JSON parsing fails
+            filters = {}
+            query_lower = query.lower()
+            # Try to extract entity names (common patterns for company/college names)
+            # Look for capitalized words or multi-word entities (handles both uppercase and lowercase)
+            import re
+            # Pattern: "how many total students are insured in prahladrai dalmia"
+            # Extract names that appear after "in", "for", "about", "of"
+            name_patterns = [
+                r'(?:in|for|about|of|at)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',  # "in Prahladrai Dalmia" (capitalized)
+                r'(?:in|for|about|of|at)\s+([a-z]+(?:\s+[a-z]+){1,4})',  # "in prahladrai dalmia" (lowercase)
+                r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})',  # Multi-word capitalized names anywhere
+            ]
+            for pattern in name_patterns:
+                matches = re.findall(pattern, query)
+                if matches:
+                    # Take the longest match (most specific)
+                    entity_name = max(matches, key=len)
+                    if len(entity_name.split()) >= 2:  # At least 2 words
+                        # Capitalize first letter of each word for consistency
+                        entity_name = ' '.join(word.capitalize() for word in entity_name.split())
+                        filters['insured_name'] = entity_name
+                        print(f"[AI QUERY PARSER] Fallback extracted entity: {entity_name}")
+                        break
+            # Detect intent from keywords
+            intent = "specific"
+            if any(word in query_lower for word in ['how many', 'count', 'total number']):
+                intent = "count"
+            elif any(word in query_lower for word in ['list all', 'show all', 'all policies']):
+                intent = "list"
+                needs_metadata = True
+            else:
+                needs_metadata = False
+            # Fallback to basic detection with extracted filters
             return {
+                "intent": intent,
+                "needs_metadata": needs_metadata,
+                "filters": filters,
                 "sort_by": None,
                 "sort_order": "desc",
                 "limit": None,
             if filter_terms:
                 search_query = f"{query} {' '.join(filter_terms)}"
                 print(f"[HYBRID RAG] Enhanced search query: {search_query[:80]}...")
+        else:
+            # If no filters, try to extract entity names directly from query for better search
+            # This helps when AI parser fails but query contains entity names
+            import re
+            # Look for multi-word names (handles both uppercase and lowercase)
+            # Pattern: "how many total students are insured in prahladrai dalmia"
+            name_patterns = [
+                r'(?:in|for|about|of|at)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)',  # Capitalized after preposition
+                r'(?:in|for|about|of|at)\s+([a-z]+(?:\s+[a-z]+){1,4})',  # Lowercase after preposition
+                r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})',  # Capitalized anywhere
+            ]
+            for pattern in name_patterns:
+                name_matches = re.findall(pattern, query)
+                if name_matches:
+                    # Use the longest match (most specific entity name)
+                    entity_name = max(name_matches, key=len)
+                    if len(entity_name.split()) >= 2:  # At least 2 words
+                        # Capitalize first letter of each word for consistency
+                        entity_name = ' '.join(word.capitalize() for word in entity_name.split())
+                        search_query = f"{query} {entity_name}"
+                        print(f"[HYBRID RAG] Extracted entity from query for search: {entity_name}")
+                        break
         # For fallback searches, use more aggressive parameters
         if is_fallback:
         metadata_result = self._handle_metadata_query(user_id, bucket_id, query, parsed)
         print(f"[HYBRID STREAM] Metadata returned {metadata_result.get('total_documents', 0)} docs")
+        # Step 2: Detect if query needs detailed content (numbers, counts, totals, students, etc.)
+        # For these queries, ALWAYS use aggressive RAG search even if metadata has results
+        query_lower = query.lower()
+        needs_detailed_content = any(keyword in query_lower for keyword in [
+            'how many', 'total', 'count', 'number of', 'students', 'sum insured',
+            'total sum', 'aggregate', 'amount', 'quantity', 'coverage', 'insured persons',
+            'lives', 'members', 'people', 'individuals'
+        ])
+        # Step 3: Always get RAG context for detailed content
+        # Use aggressive search (is_fallback=True) if:
+        #   - Metadata has no results, OR
+        #   - Query needs detailed content (numbers, counts, totals) - metadata might not have these
         metadata_has_results = metadata_result.get('total_documents', 0) > 0
+        use_aggressive_rag = not metadata_has_results or needs_detailed_content
+        if needs_detailed_content:
+            print(f"[HYBRID STREAM] Query needs detailed content - using aggressive RAG search regardless of metadata results")
         rag_result = self._get_rag_context_for_query(
             user_id, bucket_id, query,
             filters=parsed.get('filters'),
+            is_fallback=use_aggressive_rag,  # Use aggressive search for detailed content queries
             doc_ids=None  # Document name detection happens inside the method
         )
         print(f"[HYBRID STREAM] RAG returned {rag_result.get('chunk_count', 0)} chunks")
 CRITICAL INSTRUCTIONS:
 1. You have BOTH structured metadata AND detailed document content.
+2. Use metadata for: policy number, insured name, basic premium, dates.
+3. Use detailed content for: coverage details, terms, conditions, exclusions, numbers, counts, totals, sum insured, students, etc.
 4. Provide a comprehensive answer covering all relevant information.
 5. Format clearly with headers and bullet points.
+FINDING NUMBERS AND TOTALS (CRITICAL):
+- When asked about "how many", "total", "sum insured", "students", "count" - search EVERY section
+- The DETAILED DOCUMENT CONTENT section is MORE IMPORTANT than metadata for finding numbers
+- Metadata may have policy info but NOT the detailed numbers - always check detailed content
+- Look for: numbers, totals, aggregates, counts, quantities, amounts
+- Information may be phrased as: "total sum insured", "aggregate SI", "Sum Insured", "number of students", "insured students", etc.
+- NEVER say "cannot be determined" or "not available" unless you've checked EVERY single document section
+- If you find ANY number related to the question, include it in your answer
+- If metadata doesn't have the answer, it's DEFINITELY in the detailed content - keep searching!
 {format_instructions}
 Do NOT say information is missing - search through ALL provided context thoroughly."""
 1. You have BOTH structured metadata AND detailed document content.
 2. Search thoroughly through ALL provided context before answering.
 3. Use metadata for structured fields like names, amounts, dates.
+4. Use detailed content for explanations, terms, conditions, numbers, totals, counts.
 5. Provide a complete and accurate answer based on the documents.
 6. Format clearly with headers and bullet points where appropriate.
+FINDING NUMBERS AND TOTALS (CRITICAL):
+- When asked about "how many", "total", "sum insured", "students", "count" - search EVERY section
+- Look for: numbers, totals, aggregates, counts, quantities, amounts
+- Information may be phrased as: "total sum insured", "aggregate SI", "Sum Insured", "number of students", "insured students", etc.
+- NEVER say "cannot be determined" or "not available" unless you've checked EVERY single document section
+- If you find ANY number related to the question, include it in your answer
 {format_instructions}
 Do NOT say information is missing - search through ALL provided context thoroughly."""
             except Exception as e:
                 print(f"[HYBRID STREAM] Failed to load history: {e}")
+        # Step 6: Detect query type and build conversation context for pronoun resolution
+        query_type = self._detect_query_type(query, stored_history)
+        conversation_context = self._build_conversation_context(stored_history, query)
+        print(f"[HYBRID STREAM] Query type: {query_type}, has conversation context: {bool(conversation_context)}")
+        # Step 7: Build messages
         messages = [{"role": "system", "content": system_prompt}]
         for msg in stored_history:
         format_reminder = f"\n\nRemember: Format response as {format_preference}." if format_preference else ""
+        # Build user message with context injection for pronouns
+        context_injection = ""
+        if query_type == 'followup' and conversation_context:
+            context_injection = f"""
+CONVERSATION CONTEXT (use this to understand pronouns like "it", "this", "that"):
+{conversation_context}
+"""
+        # Add emphasis on using RAG content when query needs detailed information
+        detailed_content_emphasis = ""
+        if needs_detailed_content:
+            detailed_content_emphasis = """
+CRITICAL: This query asks for detailed information (numbers, counts, totals, students, sum insured, etc.).
+- The METADATA section may have policy information but NOT the detailed numbers
+- The DETAILED DOCUMENT CONTENT section contains the actual numbers, counts, and totals
+- You MUST search through the DETAILED DOCUMENT CONTENT section to find the answer
+- If metadata doesn't have the answer, the answer is definitely in the detailed content - keep searching!
+"""
+        user_message = f"""{context_injection}Based on the following document data, answer my question comprehensively.
 DOCUMENT DATA:
 {context}
+{detailed_content_emphasis}
 QUESTION: {query}
+Instructions:
+- Use both the structured metadata AND detailed content to provide a complete answer
+- If this is a follow-up, use conversation history to understand what I'm referring to
+- Search THOROUGHLY through ALL document sections for numbers, totals, counts, students, sum insured, etc.
+- For questions about numbers/counts/totals: The DETAILED DOCUMENT CONTENT section is more important than metadata
+- NEVER say information is missing unless you've checked every single section{format_reminder}"""
         messages.append({"role": "user", "content": user_message})
         # Route based on AI-parsed intent
         intent = parsed.get('intent', 'specific')
         needs_metadata = parsed.get('needs_metadata', False)
+        filters = parsed.get('filters', {})
+        # HYBRID ROUTING LOGIC (UPDATED):
+        # 1. Use METADATA path ONLY for true aggregate queries that need ALL documents:
+        #    - Queries with "list all", "all policies", "all documents" that don't filter by specific entity
+        #    - Queries asking for aggregate data across ALL documents (e.g., "all GMC policies", "renewals in march")
+        #    - These queries need to scan ALL documents, so metadata is more efficient
+        # 2. Use HYBRID path for EVERYTHING else:
+        #    - Specific entity queries (even if they say "list all X policies" for a specific company)
+        #    - Questions about specific documents/entities
+        #    - Any query that filters by insured_name, insurer_name, or other specific entity
+        # Check if this is a TRUE aggregate query (needs all documents, no specific entity filter)
+        is_true_aggregate = False
         if intent in ['list', 'count', 'rank', 'calculate'] and needs_metadata:
+            # It's a true aggregate if:
+            # 1. Query explicitly asks for "all" documents/policies (not filtered to specific entity)
+            # 2. No specific entity filters (insured_name, insurer_name) are present
+            # 3. OR it's asking for aggregate data like "all GMC policies", "all fire policies" (policy type, not entity)
+            query_lower = query.lower()
+            has_all_keyword = any(phrase in query_lower for phrase in [
+                'list all', 'all policies', 'all documents', 'all the policies',
+                'every policy', 'every document', 'all the documents'
+            ])
+            # Check if filtering by specific entity (company, person, etc.)
+            has_entity_filter = bool(filters.get('insured_name') or filters.get('insurer_name') or filters.get('broker_name'))
+            # True aggregate = has "all" keyword AND no specific entity filter
+            # OR it's asking for aggregate by type (policy_type, industry) without entity
+            if has_all_keyword and not has_entity_filter:
+                is_true_aggregate = True
+            elif not has_entity_filter and (filters.get('policy_type') or filters.get('industry')):
+                # Aggregate by type (e.g., "all fire policies", "all manufacturing") - use metadata
+                is_true_aggregate = True
+        if is_true_aggregate:
+            # True aggregate queries - metadata is primary, RAG is fallback (handled inside)
+            print(f"[QUERY ROUTING] Using METADATA path for aggregate {intent} query (needs all documents)")
             yield from self._stream_metadata_query(user_id, bucket_id, query, parsed, chat_id)
             return
         else:
+            # ALL other queries - use HYBRID approach (metadata + RAG together)
+            # This includes:
+            # - Specific entity queries (even if they say "list all X policies")
+            # - Questions about specific documents/entities
+            # - Any query with entity filters
+            print(f"[QUERY ROUTING] Using HYBRID path for {intent} query (specific entity or detailed content)")
             yield from self._stream_hybrid_query(user_id, bucket_id, query, parsed, chat_id)
             return