Spaces:

akryldigital
/

audit_assistant

Running

App Files Files Community

akryldigital committed on Dec 28, 2025

Commit

600e8b4

verified ·

1 Parent(s): ca739e3

Add fallback policy to multi-modal when metadata:text is empty

Browse files

Files changed (2) hide show

src/agents/visual_chatbot.py +8 -5
src/agents/visual_multi_agent_chatbot.py +140 -17

src/agents/visual_chatbot.py CHANGED Viewed

@@ -233,7 +233,7 @@ Please provide a detailed answer based on the documents above. Cite your sources
             {"role": "user", "content": user_prompt}
         ]
-        response = self.llm.invoke(messages)
         return response.content
@@ -242,7 +242,7 @@ def get_visual_chatbot() -> VisualChatbot:
     Factory function to create a visual chatbot.
     Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
-    but connects to the 'colSmol-500M' collection instead of v1's collections.
     Returns:
         Initialized VisualChatbot
@@ -262,6 +262,9 @@ def get_visual_chatbot() -> VisualChatbot:
         os.environ.get("QDRANT_API_KEY")            # Fallback
     )
     if not qdrant_url or not qdrant_api_key:
         raise ValueError(
             "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
@@ -269,17 +272,17 @@ def get_visual_chatbot() -> VisualChatbot:
             "  - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
             "  - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
             "  - QDRANT_URL and QDRANT_API_KEY\n\n"
-            "These should point to the cluster containing the 'colSmol-500M' collection."
         )
     logger.info(f"   Using Qdrant URL: {qdrant_url}")
-    logger.info(f"   Collection: colSmol-500M")
     # Create visual search adapter with explicit credentials
     visual_search = VisualSearchAdapter(
         qdrant_url=qdrant_url,
         qdrant_api_key=qdrant_api_key,
-        collection_name="colSmol-500M"
     )
     # Get LLM config from settings.yaml

             {"role": "user", "content": user_prompt}
         ]
+        response = self.llm.invoke(messages, prompt_name="visual_simple_answer")
         return response.content
     Factory function to create a visual chatbot.
     Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
+    and connects to the collection specified by QDRANT_COLLECTION_VISUAL env var.
     Returns:
         Initialized VisualChatbot
         os.environ.get("QDRANT_API_KEY")            # Fallback
     )
+    # Get collection name from env var (default to colSmol-500M-v2 for new processing)
+    collection_name = os.environ.get("QDRANT_COLLECTION_VISUAL", "colSmol-500M-v2")
     if not qdrant_url or not qdrant_api_key:
         raise ValueError(
             "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
             "  - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
             "  - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
             "  - QDRANT_URL and QDRANT_API_KEY\n\n"
+            "And optionally set QDRANT_COLLECTION_VISUAL (default: colSmol-500M-v2)"
         )
     logger.info(f"   Using Qdrant URL: {qdrant_url}")
+    logger.info(f"   Collection: {collection_name}")
     # Create visual search adapter with explicit credentials
     visual_search = VisualSearchAdapter(
         qdrant_url=qdrant_url,
         qdrant_api_key=qdrant_api_key,
+        collection_name=collection_name
     )
     # Get LLM config from settings.yaml

src/agents/visual_multi_agent_chatbot.py CHANGED Viewed

@@ -39,8 +39,13 @@ logger = logging.getLogger(__name__)
 # Multi-modal LLM configuration
 MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o")  # GPT-4o supports vision
 MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3"))  # Top N images by relevance score
-MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true"  # Toggle for multi-modal mode
 class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
     """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
@@ -77,6 +82,36 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
         super().__init__(config_path)
         logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
     def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
         """
@@ -89,9 +124,7 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
         Returns:
             Result object with .sources and .answer attributes
         """
-        logger.info(f"🔍 VISUAL RETRIEVAL: Performing visual search")
-        logger.info(f"🔍 VISUAL RETRIEVAL: Query: '{query}'")
-        logger.info(f"🔍 VISUAL RETRIEVAL: Filters: {filters}")
         # Convert filters to visual search format
         visual_filters = {}
@@ -110,7 +143,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
         if filters.get("filenames"):
             visual_filters["filenames"] = filters["filenames"]
-        logger.info(f"🔍 VISUAL RETRIEVAL: Converted filters: {visual_filters}")
         # Perform visual search
         try:
@@ -155,8 +187,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
         rag_query = state["rag_query"]
         filters = state["rag_filters"]
-        logger.info(f"📝 VISUAL RESPONSE AGENT: Query: '{rag_query}'")
-        logger.info(f"📝 VISUAL RESPONSE AGENT: Filters: {filters}")
         try:
             # Call visual retrieval
@@ -239,7 +269,8 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
         documents: List[Any],
         conversation_context: str,
         correct_names: str,
-        filters: Dict[str, Any] = None
     ) -> Optional[str]:
         """
         Generate response using GPT-4o with images (multi-modal).
@@ -252,11 +283,16 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
             conversation_context: Formatted conversation history
             correct_names: Correct district/source names from metadata
             filters: Applied filters
         Returns:
             LLM response string, or None if multi-modal generation failed
         """
-        if not self.openai_client or not self.enable_multimodal:
             logger.info("🖼️ Multi-modal disabled, skipping")
             return None
@@ -454,8 +490,8 @@ Now analyze the images and answer the question:"""
         # Build conversation history context
         conversation_context = self._build_conversation_context_for_response(messages)
-        # Build detailed document information
-        document_details = self._build_visual_document_details(documents)
         logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
         # Extract correct names from documents
@@ -463,17 +499,44 @@ Now analyze the images and answer the question:"""
         logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
         # ============================================================
-        # PHASE 2: Try multi-modal generation first (GPT-4o with images)
         # ============================================================
-        if self.enable_multimodal:
-            logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
             multimodal_response = self._generate_multimodal_response(
                 query=query,
                 documents=documents,
                 conversation_context=conversation_context,
                 correct_names=correct_names,
-                filters=filters
             )
             if multimodal_response:
@@ -557,7 +620,7 @@ Generate a conversational response with proper document references:""")
         try:
             logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
-            response = self.llm.invoke(response_prompt.format_messages())
             response_text = response.content.strip()
             logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
@@ -639,7 +702,7 @@ Generate a helpful response:""")
         ])
         try:
-            response = self.llm.invoke(response_prompt.format_messages())
             return response.content.strip()
         except Exception as e:
             logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
@@ -719,6 +782,66 @@ Generate a helpful response:""")
         return "\n\n".join(details) if details else "No document details available."
     def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
         """Extract correct district/source names from documents to correct misspellings"""
         districts = set()

 # Multi-modal LLM configuration
 MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o")  # GPT-4o supports vision
 MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3"))  # Top N images by relevance score
+# MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true"  # Toggle for multi-modal mode
+MULTIMODAL_ENABLED=False
+# Query rewriting configuration
+# By default, SKIP query rewriting for visual RAG (use original query for saliency accuracy)
+# Set ENABLE_VISUAL_QUERY_REWRITE=true to enable query rewriting
+SKIP_QUERY_REWRITE = os.environ.get("ENABLE_VISUAL_QUERY_REWRITE", "false").lower() != "true"
 class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
     """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
         super().__init__(config_path)
         logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
+        logger.info(f"🎨 Query rewriting: {'ENABLED' if not SKIP_QUERY_REWRITE else 'DISABLED (using original query)'}")
+    def _rag_agent(self, state: MultiAgentState) -> MultiAgentState:
+        """
+        Visual-RAG override of the RAG agent step.
+        When the module-level SKIP_QUERY_REWRITE flag is set, the current query
+        is forwarded unchanged (kept verbatim for saliency maps) and only the
+        filters are built. Otherwise the parent implementation handles the step.
+        Set ENABLE_VISUAL_QUERY_REWRITE=true to enable rewriting.
+        Args:
+            state: Agent state; "current_query", "query_context" and
+                "agent_logs" are read on the pass-through path.
+        Returns:
+            The same state object with "rag_query" and "rag_filters" set, or
+            the parent's _rag_agent result when rewriting is enabled.
+        """
+        from src.agents.base_multi_agent_chatbot import MultiAgentState
+        # Guard clause: rewriting enabled -> defer entirely to the parent class
+        if not SKIP_QUERY_REWRITE:
+            logger.info(f"🔍 RAG AGENT (Visual): Query rewriting ENABLED")
+            return super()._rag_agent(state)
+        # Pass-through path: keep the user's wording exactly as received
+        query = state["current_query"]
+        logger.info(f"🔍 RAG AGENT (Visual): Using ORIGINAL query (no rewriting): '{query}'")
+        # Filters are still derived from the query context
+        query_context = state["query_context"]
+        rag_filters = self._build_filters(query_context)
+        logger.info(f"🔍 RAG AGENT (Visual): Filters: {rag_filters}")
+        state["agent_logs"].append(f"RAG AGENT: Query='{query}' (original), Filters={rag_filters}")
+        state["rag_query"] = query
+        state["rag_filters"] = rag_filters
+        return state
     def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
         """
         Returns:
             Result object with .sources and .answer attributes
         """
+        logger.info(f"🔍 VISUAL RETRIEVAL: Searching with {len(filters.get('filenames', []))} filename filters")
         # Convert filters to visual search format
         visual_filters = {}
         if filters.get("filenames"):
             visual_filters["filenames"] = filters["filenames"]
         # Perform visual search
         try:
         rag_query = state["rag_query"]
         filters = state["rag_filters"]
         try:
             # Call visual retrieval
         documents: List[Any],
         conversation_context: str,
         correct_names: str,
+        filters: Dict[str, Any] = None,
+        force_multimodal: bool = False
     ) -> Optional[str]:
         """
         Generate response using GPT-4o with images (multi-modal).
             conversation_context: Formatted conversation history
             correct_names: Correct district/source names from metadata
             filters: Applied filters
+            force_multimodal: Force multi-modal even if globally disabled (for auto-fallback)
         Returns:
             LLM response string, or None if multi-modal generation failed
         """
+        if not self.openai_client:
+            logger.warning("🖼️ Multi-modal: OpenAI client not initialized")
+            return None
+        if not self.enable_multimodal and not force_multimodal:
             logger.info("🖼️ Multi-modal disabled, skipping")
             return None
         # Build conversation history context
         conversation_context = self._build_conversation_context_for_response(messages)
+        # Build detailed document information and check text content availability
+        document_details, docs_with_text, docs_without_text = self._build_visual_document_details_with_counts(documents)
         logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
         # Extract correct names from documents
         logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
         # ============================================================
+        # AUTO-FALLBACK: If most documents lack text, force multi-modal
         # ============================================================
+        use_multimodal = self.enable_multimodal
+        force_multimodal = False
+        if docs_without_text > docs_with_text and not use_multimodal:
+            logger.warning(f"⚠️ AUTO-FALLBACK: {docs_without_text}/{len(documents)} docs lack text content!")
+            logger.info("🖼️ AUTO-FALLBACK: Temporarily enabling multi-modal to analyze images...")
+            if self.openai_client is None:
+                api_key = os.environ.get("OPENAI_API_KEY")
+                if api_key:
+                    self.openai_client = OpenAI(api_key=api_key)
+                    use_multimodal = True
+                    force_multimodal = True
+                    logger.info(f"🖼️ AUTO-FALLBACK: Initialized OpenAI client for {MULTIMODAL_MODEL}")
+                else:
+                    logger.warning("⚠️ AUTO-FALLBACK: Cannot enable multi-modal - OPENAI_API_KEY not set")
+            else:
+                use_multimodal = True
+                force_multimodal = True
+        # ============================================================
+        # PHASE 2: Try multi-modal generation (GPT-4o with images)
+        # ============================================================
+        if use_multimodal:
+            if force_multimodal:
+                logger.info("🖼️ VISUAL RESPONSE GENERATION: Using AUTO-FALLBACK multi-modal (most docs lack text)...")
+            else:
+                logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
             multimodal_response = self._generate_multimodal_response(
                 query=query,
                 documents=documents,
                 conversation_context=conversation_context,
                 correct_names=correct_names,
+                filters=filters,
+                force_multimodal=force_multimodal
             )
             if multimodal_response:
         try:
             logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
+            response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_rag_answer")
             response_text = response.content.strip()
             logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
         ])
         try:
+            response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_no_docs_fallback")
             return response.content.strip()
         except Exception as e:
             logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
         return "\n\n".join(details) if details else "No document details available."
+    def _build_visual_document_details_with_counts(self, documents: List[Any]) -> tuple:
+        """
+        Build document details and return counts of docs with/without text.
+        Each document becomes a "[Doc i]" entry listing score, filename, year,
+        source, optional district, page, optional visual tile/token counts and
+        up to 500 characters of text content. Text is taken from the first
+        non-empty of `page_content`, `content`, or `metadata["text"]`.
+        Args:
+            documents: Retrieved documents. None is treated as an empty list.
+        Returns:
+            Tuple of (document_details_string, docs_with_text_count, docs_without_text_count)
+        """
+        # Tolerate a missing result set instead of failing on len(None)
+        documents = documents or []
+        details = []
+        docs_with_content = 0
+        docs_without_content = 0
+        total_content_length = 0
+        logger.info(f"📄 BUILD_DETAILS: Processing {len(documents)} documents for LLM context")
+        for i, doc in enumerate(documents, 1):
+            metadata = getattr(doc, 'metadata', {}) or {}
+            content = getattr(doc, 'page_content', '') or getattr(doc, 'content', '') or metadata.get('text', '')
+            # A missing or None score falls back to 0 so the :.3f format below cannot raise
+            score = getattr(doc, 'score', 0) or 0
+            filename = metadata.get('filename', 'Unknown')
+            year = metadata.get('year', 'Unknown')
+            source = metadata.get('source', 'Unknown')
+            page = metadata.get('page', metadata.get('page_number', 'Unknown'))
+            district = metadata.get('district', 'Unknown')
+            num_tiles = metadata.get('num_tiles')
+            num_visual_tokens = metadata.get('num_visual_tokens')
+            doc_info = f"[Doc {i}] (Score: {score:.3f})"
+            doc_info += f"\n  Filename: {filename}"
+            doc_info += f"\n  Year: {year}"
+            doc_info += f"\n  Source: {source}"
+            if district != 'Unknown':
+                doc_info += f"\n  District: {district}"
+            doc_info += f"\n  Page: {page}"
+            if num_tiles or num_visual_tokens:
+                doc_info += f"\n  Visual: {num_tiles} tiles, {num_visual_tokens} tokens"
+            # Whitespace-only text counts as "no text" for the with/without tally
+            if content and content.strip():
+                doc_info += f"\n  Content: {content[:500]}{'...' if len(content) > 500 else ''}"
+                docs_with_content += 1
+                total_content_length += len(content)
+            else:
+                doc_info += "\n  Content: (No text extracted - image-only page)"
+                docs_without_content += 1
+            details.append(doc_info)
+        logger.info(f"📄 BUILD_DETAILS SUMMARY:")
+        logger.info(f"   - Documents with text content: {docs_with_content}")
+        logger.info(f"   - Documents WITHOUT text (image-only): {docs_without_content}")
+        logger.info(f"   - Total text content length: {total_content_length} chars")
+        if docs_without_content > docs_with_content:
+            logger.warning(f"⚠️ BUILD_DETAILS: Most documents have NO TEXT CONTENT!")
+            logger.warning(f"⚠️ Auto-fallback to multi-modal will be attempted...")
+        details_str = "\n\n".join(details) if details else "No document details available."
+        return details_str, docs_with_content, docs_without_content
     def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
         """Extract correct district/source names from documents to correct misspellings"""
         districts = set()