akryldigital committed on
Commit
600e8b4
·
verified ·
1 Parent(s): ca739e3

Add fallback policy to multi-modal when metadata:text is empty

Browse files
src/agents/visual_chatbot.py CHANGED
@@ -233,7 +233,7 @@ Please provide a detailed answer based on the documents above. Cite your sources
233
  {"role": "user", "content": user_prompt}
234
  ]
235
 
236
- response = self.llm.invoke(messages)
237
  return response.content
238
 
239
 
@@ -242,7 +242,7 @@ def get_visual_chatbot() -> VisualChatbot:
242
  Factory function to create a visual chatbot.
243
 
244
  Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
245
- but connects to the 'colSmol-500M' collection instead of v1's collections.
246
 
247
  Returns:
248
  Initialized VisualChatbot
@@ -262,6 +262,9 @@ def get_visual_chatbot() -> VisualChatbot:
262
  os.environ.get("QDRANT_API_KEY") # Fallback
263
  )
264
 
 
 
 
265
  if not qdrant_url or not qdrant_api_key:
266
  raise ValueError(
267
  "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
@@ -269,17 +272,17 @@ def get_visual_chatbot() -> VisualChatbot:
269
  " - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
270
  " - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
271
  " - QDRANT_URL and QDRANT_API_KEY\n\n"
272
- "These should point to the cluster containing the 'colSmol-500M' collection."
273
  )
274
 
275
  logger.info(f" Using Qdrant URL: {qdrant_url}")
276
- logger.info(f" Collection: colSmol-500M")
277
 
278
  # Create visual search adapter with explicit credentials
279
  visual_search = VisualSearchAdapter(
280
  qdrant_url=qdrant_url,
281
  qdrant_api_key=qdrant_api_key,
282
- collection_name="colSmol-500M"
283
  )
284
 
285
  # Get LLM config from settings.yaml
 
233
  {"role": "user", "content": user_prompt}
234
  ]
235
 
236
+ response = self.llm.invoke(messages, prompt_name="visual_simple_answer")
237
  return response.content
238
 
239
 
 
242
  Factory function to create a visual chatbot.
243
 
244
  Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
245
+ and connects to the collection specified by QDRANT_COLLECTION_VISUAL env var.
246
 
247
  Returns:
248
  Initialized VisualChatbot
 
262
  os.environ.get("QDRANT_API_KEY") # Fallback
263
  )
264
 
265
+ # Get collection name from env var (default to colSmol-500M-v2 for new processing)
266
+ collection_name = os.environ.get("QDRANT_COLLECTION_VISUAL", "colSmol-500M-v2")
267
+
268
  if not qdrant_url or not qdrant_api_key:
269
  raise ValueError(
270
  "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
 
272
  " - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
273
  " - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
274
  " - QDRANT_URL and QDRANT_API_KEY\n\n"
275
+ "And optionally set QDRANT_COLLECTION_VISUAL (default: colSmol-500M-v2)"
276
  )
277
 
278
  logger.info(f" Using Qdrant URL: {qdrant_url}")
279
+ logger.info(f" Collection: {collection_name}")
280
 
281
  # Create visual search adapter with explicit credentials
282
  visual_search = VisualSearchAdapter(
283
  qdrant_url=qdrant_url,
284
  qdrant_api_key=qdrant_api_key,
285
+ collection_name=collection_name
286
  )
287
 
288
  # Get LLM config from settings.yaml
src/agents/visual_multi_agent_chatbot.py CHANGED
@@ -39,8 +39,13 @@ logger = logging.getLogger(__name__)
39
  # Multi-modal LLM configuration
40
  MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o") # GPT-4o supports vision
41
  MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3")) # Top N images by relevance score
42
- MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true" # Toggle for multi-modal mode
 
43
 
 
 
 
 
44
 
45
  class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
46
  """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
@@ -77,6 +82,36 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
77
  super().__init__(config_path)
78
 
79
  logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
82
  """
@@ -89,9 +124,7 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
89
  Returns:
90
  Result object with .sources and .answer attributes
91
  """
92
- logger.info(f"🔍 VISUAL RETRIEVAL: Performing visual search")
93
- logger.info(f"🔍 VISUAL RETRIEVAL: Query: '{query}'")
94
- logger.info(f"🔍 VISUAL RETRIEVAL: Filters: {filters}")
95
 
96
  # Convert filters to visual search format
97
  visual_filters = {}
@@ -110,7 +143,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
110
  if filters.get("filenames"):
111
  visual_filters["filenames"] = filters["filenames"]
112
 
113
- logger.info(f"🔍 VISUAL RETRIEVAL: Converted filters: {visual_filters}")
114
 
115
  # Perform visual search
116
  try:
@@ -155,8 +187,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
155
  rag_query = state["rag_query"]
156
  filters = state["rag_filters"]
157
 
158
- logger.info(f"📝 VISUAL RESPONSE AGENT: Query: '{rag_query}'")
159
- logger.info(f"📝 VISUAL RESPONSE AGENT: Filters: {filters}")
160
 
161
  try:
162
  # Call visual retrieval
@@ -239,7 +269,8 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
239
  documents: List[Any],
240
  conversation_context: str,
241
  correct_names: str,
242
- filters: Dict[str, Any] = None
 
243
  ) -> Optional[str]:
244
  """
245
  Generate response using GPT-4o with images (multi-modal).
@@ -252,11 +283,16 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
252
  conversation_context: Formatted conversation history
253
  correct_names: Correct district/source names from metadata
254
  filters: Applied filters
 
255
 
256
  Returns:
257
  LLM response string, or None if multi-modal generation failed
258
  """
259
- if not self.openai_client or not self.enable_multimodal:
 
 
 
 
260
  logger.info("🖼️ Multi-modal disabled, skipping")
261
  return None
262
 
@@ -454,8 +490,8 @@ Now analyze the images and answer the question:"""
454
  # Build conversation history context
455
  conversation_context = self._build_conversation_context_for_response(messages)
456
 
457
- # Build detailed document information
458
- document_details = self._build_visual_document_details(documents)
459
  logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
460
 
461
  # Extract correct names from documents
@@ -463,17 +499,44 @@ Now analyze the images and answer the question:"""
463
  logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
464
 
465
  # ============================================================
466
- # PHASE 2: Try multi-modal generation first (GPT-4o with images)
467
  # ============================================================
468
- if self.enable_multimodal:
469
- logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
471
  multimodal_response = self._generate_multimodal_response(
472
  query=query,
473
  documents=documents,
474
  conversation_context=conversation_context,
475
  correct_names=correct_names,
476
- filters=filters
 
477
  )
478
 
479
  if multimodal_response:
@@ -557,7 +620,7 @@ Generate a conversational response with proper document references:""")
557
 
558
  try:
559
  logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
560
- response = self.llm.invoke(response_prompt.format_messages())
561
  response_text = response.content.strip()
562
 
563
  logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
@@ -639,7 +702,7 @@ Generate a helpful response:""")
639
  ])
640
 
641
  try:
642
- response = self.llm.invoke(response_prompt.format_messages())
643
  return response.content.strip()
644
  except Exception as e:
645
  logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
@@ -719,6 +782,66 @@ Generate a helpful response:""")
719
 
720
  return "\n\n".join(details) if details else "No document details available."
721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
723
  """Extract correct district/source names from documents to correct misspellings"""
724
  districts = set()
 
39
  # Multi-modal LLM configuration
40
  MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o") # GPT-4o supports vision
41
  MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3")) # Top N images by relevance score
42
+ # MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true" # Toggle for multi-modal mode
43
+ MULTIMODAL_ENABLED=False
44
 
45
+ # Query rewriting configuration
46
+ # By default, SKIP query rewriting for visual RAG (use original query for saliency accuracy)
47
+ # Set ENABLE_VISUAL_QUERY_REWRITE=true to enable query rewriting
48
+ SKIP_QUERY_REWRITE = os.environ.get("ENABLE_VISUAL_QUERY_REWRITE", "false").lower() != "true"
49
 
50
  class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
51
  """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
 
82
  super().__init__(config_path)
83
 
84
  logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
85
+ logger.info(f"🎨 Query rewriting: {'ENABLED' if not SKIP_QUERY_REWRITE else 'DISABLED (using original query)'}")
86
+
87
def _rag_agent(self, state: MultiAgentState) -> MultiAgentState:
    """
    RAG agent override for visual RAG: pick the retrieval query and filters.

    When the module-level ``SKIP_QUERY_REWRITE`` flag is truthy, the user's
    original query is used unchanged (the original author's stated reason is
    saliency-map accuracy) and only the filters are built. Otherwise the
    parent class's ``_rag_agent`` is called, which performs query rewriting.
    Set ENABLE_VISUAL_QUERY_REWRITE=true to enable rewriting.

    Args:
        state: Agent state. In the no-rewrite path this reads
            ``state["current_query"]`` and ``state["query_context"]``,
            appends to ``state["agent_logs"]`` and writes
            ``state["rag_query"]`` and ``state["rag_filters"]``.

    Returns:
        The same ``state`` object, mutated in place (no-rewrite path), or
        whatever the parent implementation returns (rewrite path).
    """
    if not SKIP_QUERY_REWRITE:
        # Use parent's query rewriting
        logger.info("🔍 RAG AGENT (Visual): Query rewriting ENABLED")
        return super()._rag_agent(state)

    # Use original query (no rewriting) - better for saliency accuracy
    original_query = state["current_query"]
    logger.info(f"🔍 RAG AGENT (Visual): Using ORIGINAL query (no rewriting): '{original_query}'")

    # Filters are still derived from the query context even without rewriting
    filters = self._build_filters(state["query_context"])
    logger.info(f"🔍 RAG AGENT (Visual): Filters: {filters}")

    state["agent_logs"].append(f"RAG AGENT: Query='{original_query}' (original), Filters={filters}")
    state["rag_query"] = original_query
    state["rag_filters"] = filters

    return state
115
 
116
  def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
117
  """
 
124
  Returns:
125
  Result object with .sources and .answer attributes
126
  """
127
+ logger.info(f"🔍 VISUAL RETRIEVAL: Searching with {len(filters.get('filenames', []))} filename filters")
 
 
128
 
129
  # Convert filters to visual search format
130
  visual_filters = {}
 
143
  if filters.get("filenames"):
144
  visual_filters["filenames"] = filters["filenames"]
145
 
 
146
 
147
  # Perform visual search
148
  try:
 
187
  rag_query = state["rag_query"]
188
  filters = state["rag_filters"]
189
 
 
 
190
 
191
  try:
192
  # Call visual retrieval
 
269
  documents: List[Any],
270
  conversation_context: str,
271
  correct_names: str,
272
+ filters: Dict[str, Any] = None,
273
+ force_multimodal: bool = False
274
  ) -> Optional[str]:
275
  """
276
  Generate response using GPT-4o with images (multi-modal).
 
283
  conversation_context: Formatted conversation history
284
  correct_names: Correct district/source names from metadata
285
  filters: Applied filters
286
+ force_multimodal: Force multi-modal even if globally disabled (for auto-fallback)
287
 
288
  Returns:
289
  LLM response string, or None if multi-modal generation failed
290
  """
291
+ if not self.openai_client:
292
+ logger.warning("🖼️ Multi-modal: OpenAI client not initialized")
293
+ return None
294
+
295
+ if not self.enable_multimodal and not force_multimodal:
296
  logger.info("🖼️ Multi-modal disabled, skipping")
297
  return None
298
 
 
490
  # Build conversation history context
491
  conversation_context = self._build_conversation_context_for_response(messages)
492
 
493
+ # Build detailed document information and check text content availability
494
+ document_details, docs_with_text, docs_without_text = self._build_visual_document_details_with_counts(documents)
495
  logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
496
 
497
  # Extract correct names from documents
 
499
  logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
500
 
501
  # ============================================================
502
+ # AUTO-FALLBACK: If most documents lack text, force multi-modal
503
  # ============================================================
504
+ use_multimodal = self.enable_multimodal
505
+ force_multimodal = False
506
+
507
+ if docs_without_text > docs_with_text and not use_multimodal:
508
+ logger.warning(f"⚠️ AUTO-FALLBACK: {docs_without_text}/{len(documents)} docs lack text content!")
509
+ logger.info("🖼️ AUTO-FALLBACK: Temporarily enabling multi-modal to analyze images...")
510
+
511
+ if self.openai_client is None:
512
+ api_key = os.environ.get("OPENAI_API_KEY")
513
+ if api_key:
514
+ self.openai_client = OpenAI(api_key=api_key)
515
+ use_multimodal = True
516
+ force_multimodal = True
517
+ logger.info(f"🖼️ AUTO-FALLBACK: Initialized OpenAI client for {MULTIMODAL_MODEL}")
518
+ else:
519
+ logger.warning("⚠️ AUTO-FALLBACK: Cannot enable multi-modal - OPENAI_API_KEY not set")
520
+ else:
521
+ use_multimodal = True
522
+ force_multimodal = True
523
+
524
+ # ============================================================
525
+ # PHASE 2: Try multi-modal generation (GPT-4o with images)
526
+ # ============================================================
527
+ if use_multimodal:
528
+ if force_multimodal:
529
+ logger.info("🖼️ VISUAL RESPONSE GENERATION: Using AUTO-FALLBACK multi-modal (most docs lack text)...")
530
+ else:
531
+ logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
532
 
533
  multimodal_response = self._generate_multimodal_response(
534
  query=query,
535
  documents=documents,
536
  conversation_context=conversation_context,
537
  correct_names=correct_names,
538
+ filters=filters,
539
+ force_multimodal=force_multimodal
540
  )
541
 
542
  if multimodal_response:
 
620
 
621
  try:
622
  logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
623
+ response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_rag_answer")
624
  response_text = response.content.strip()
625
 
626
  logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
 
702
  ])
703
 
704
  try:
705
+ response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_no_docs_fallback")
706
  return response.content.strip()
707
  except Exception as e:
708
  logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
 
782
 
783
  return "\n\n".join(details) if details else "No document details available."
784
 
785
def _build_visual_document_details_with_counts(self, documents: List[Any]) -> tuple:
    """
    Build the per-document details text and count docs with/without text.

    For each document a short block is produced containing its score,
    filename, year, source, optional district, page, optional visual
    tile/token counts, and up to the first 500 characters of its text.
    Text is taken from the first non-empty of ``doc.page_content``,
    ``doc.content`` and ``metadata["text"]``.

    Args:
        documents: Retrieved documents. Each is read via ``getattr`` for
            ``metadata``, ``page_content``, ``content`` and ``score``, so
            missing attributes fall back to defaults.

    Returns:
        Tuple of (document_details_string, docs_with_text_count,
        docs_without_text_count). The string is
        "No document details available." when ``documents`` is empty.
    """
    details = []
    docs_with_content = 0
    docs_without_content = 0
    total_content_length = 0

    logger.info(f"📄 BUILD_DETAILS: Processing {len(documents)} documents for LLM context")

    for i, doc in enumerate(documents, 1):
        metadata = getattr(doc, 'metadata', {}) or {}
        content = (
            getattr(doc, 'page_content', '')
            or getattr(doc, 'content', '')
            or metadata.get('text', '')
        )
        # A missing or None score is treated as 0 so the ':.3f' format below
        # cannot raise TypeError.
        score = getattr(doc, 'score', 0) or 0

        filename = metadata.get('filename', 'Unknown')
        year = metadata.get('year', 'Unknown')
        source = metadata.get('source', 'Unknown')
        page = metadata.get('page', metadata.get('page_number', 'Unknown'))
        district = metadata.get('district', 'Unknown')
        num_tiles = metadata.get('num_tiles')
        num_visual_tokens = metadata.get('num_visual_tokens')

        # One entry per output line; joined with newlines below.
        lines = [
            f"[Doc {i}] (Score: {score:.3f})",
            f" Filename: {filename}",
            f" Year: {year}",
            f" Source: {source}",
        ]
        if district != 'Unknown':
            lines.append(f" District: {district}")
        lines.append(f" Page: {page}")

        if num_tiles or num_visual_tokens:
            lines.append(f" Visual: {num_tiles} tiles, {num_visual_tokens} tokens")

        if content and content.strip():
            # Only a 500-character preview is passed on to the LLM context.
            lines.append(f" Content: {content[:500]}{'...' if len(content) > 500 else ''}")
            docs_with_content += 1
            total_content_length += len(content)
        else:
            lines.append(" Content: (No text extracted - image-only page)")
            docs_without_content += 1

        details.append("\n".join(lines))

    logger.info("📄 BUILD_DETAILS SUMMARY:")
    logger.info(f" - Documents with text content: {docs_with_content}")
    logger.info(f" - Documents WITHOUT text (image-only): {docs_without_content}")
    logger.info(f" - Total text content length: {total_content_length} chars")

    if docs_without_content > docs_with_content:
        logger.warning("⚠️ BUILD_DETAILS: Most documents have NO TEXT CONTENT!")
        logger.warning("⚠️ Auto-fallback to multi-modal will be attempted...")

    details_str = "\n\n".join(details) if details else "No document details available."
    return details_str, docs_with_content, docs_without_content
844
+
845
  def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
846
  """Extract correct district/source names from documents to correct misspellings"""
847
  districts = set()