akryldigital committed on
Commit 582a267 · verified · 1 Parent(s): 9db763a

add VisionRAG Agent

src/agents/visual_multi_agent_chatbot.py ADDED
@@ -0,0 +1,891 @@
+ """
+ Visual Multi-Agent RAG Chatbot using LangGraph
+
+ This system uses the same sophisticated multi-agent architecture as v1,
+ but with visual document retrieval (ColPali) instead of text-based RAG.
+
+ Inherits from BaseMultiAgentChatbot to get:
+ - LLM-based query analysis
+ - Filter extraction and validation
+ - Query rewriting
+ - Main/RAG/Response agent orchestration
+
+ Only implements:
+ - Visual search retrieval
+ - Response generation with visual context
+
+ Phase 2 IMPLEMENTED: Multi-modal LLM support
+ - Top 3 images (by relevance) are sent directly to GPT-4o
+ - LLM can see tables, charts, figures directly
+ - Falls back to text-only if multi-modal fails
+ """
+ import os
+ import time
+ import base64
+ import logging
+ import traceback
+ import httpx
+ from typing import Dict, List, Any, Optional
+
+ from openai import OpenAI
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+
+ from src.agents.base_multi_agent_chatbot import BaseMultiAgentChatbot, MultiAgentState
+ from src.colpali.visual_search import VisualSearchAdapter
+
+ logger = logging.getLogger(__name__)
+
+ # Multi-modal LLM configuration
+ MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o")  # GPT-4o supports vision
+ MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3"))  # Top N images by relevance score
+ MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true"  # Toggle for multi-modal mode
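+
+ # Example environment configuration (illustrative values only; the variable names
+ # are the ones read in this module and in get_visual_multi_agent_chatbot() below):
+ #
+ #   OPENAI_API_KEY=sk-...              # required for the multi-modal GPT-4o path
+ #   VISUAL_RAG_MODEL=gpt-4o            # any vision-capable chat model
+ #   VISUAL_RAG_MAX_IMAGES=3            # top-N pages sent to the LLM as images
+ #   VISUAL_RAG_MULTIMODAL=true         # set to "false" to force text-only mode
+ #   QDRANT_URL_AKRYL=https://...       # or DEST_QDRANT_URL / QDRANT_URL
+ #   QDRANT_API_KEY_AKRYL=...           # or DEST_QDRANT_API_KEY / QDRANT_API_KEY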
+
+
+ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
+     """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
+
+     def __init__(
+         self,
+         visual_search: VisualSearchAdapter,
+         config_path: str = "src/config/settings.yaml",
+         enable_multimodal: bool = True
+     ):
+         """
+         Initialize the visual multi-agent chatbot.
+
+         Args:
+             visual_search: Visual search adapter (ColPali)
+             config_path: Path to config file
+             enable_multimodal: Whether to use multi-modal LLM (GPT-4o with images)
+         """
+         self.visual_search = visual_search
+         self.enable_multimodal = enable_multimodal and MULTIMODAL_ENABLED
+
+         # Initialize OpenAI client for multi-modal (GPT-4o)
+         self.openai_client = None
+         if self.enable_multimodal:
+             api_key = os.environ.get("OPENAI_API_KEY")
+             if api_key:
+                 self.openai_client = OpenAI(api_key=api_key)
+                 logger.info(f"🖼️ Multi-modal LLM initialized: {MULTIMODAL_MODEL}")
+             else:
+                 logger.warning("⚠️ OPENAI_API_KEY not set, multi-modal disabled")
+                 self.enable_multimodal = False
+
+         # Call parent init (sets up LLM, filters, graph, etc.)
+         super().__init__(config_path)
+
+         logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
+
+     def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
+         """
+         Perform visual search retrieval.
+
+         Args:
+             query: The rewritten query
+             filters: The filters to apply
+
+         Returns:
+             Result object with .sources and .answer attributes
+         """
+         logger.info(f"🔍 VISUAL RETRIEVAL: Performing visual search")
+         logger.info(f"🔍 VISUAL RETRIEVAL: Query: '{query}'")
+         logger.info(f"🔍 VISUAL RETRIEVAL: Filters: {filters}")
+
+         # Convert filters to visual search format
+         visual_filters = {}
+
+         if filters.get("sources"):
+             visual_filters["sources"] = filters["sources"]
+
+         if filters.get("year"):
+             # Convert to "years" (plural) for visual search
+             visual_filters["years"] = filters["year"]
+
+         if filters.get("district"):
+             # Convert to "districts" (plural) for visual search
+             visual_filters["districts"] = filters["district"]
+
+         if filters.get("filenames"):
+             visual_filters["filenames"] = filters["filenames"]
+
+         logger.info(f"🔍 VISUAL RETRIEVAL: Converted filters: {visual_filters}")
+
+         # Lightweight wrapper matching the .sources/.answer interface expected by the base class
+         class Result:
+             def __init__(self, sources, answer=""):
+                 self.sources = sources
+                 self.answer = answer
+
+         # Perform visual search
+         try:
+             visual_results = self.visual_search.search(
+                 query=query,
+                 top_k=10,
+                 filters=visual_filters,
+                 search_strategy="multi_vector"
+             )
+
+             logger.info(f"🔍 VISUAL RETRIEVAL: Retrieved {len(visual_results)} visual documents")
+
+             # Return in format expected by base class
+             return Result(visual_results, "")
+
+         except Exception as e:
+             logger.error(f"❌ VISUAL RETRIEVAL: Error during visual search: {e}")
+             traceback.print_exc()
+
+             # Return empty result
+             return Result([], "")
+
+     def _response_agent(self, state: MultiAgentState) -> MultiAgentState:
+         """
+         Override response agent for Visual RAG.
+
+         Visual RAG uses MaxSim scores (typically 10-30+) instead of cosine similarity (0-1),
+         so we skip the similarity score threshold check that's in the base class.
+         """
+         logger.info("📝 VISUAL RESPONSE AGENT: Starting document retrieval and answer generation")
+
+         rag_query = state["rag_query"]
+         filters = state["rag_filters"]
+
+         logger.info(f"📝 VISUAL RESPONSE AGENT: Query: '{rag_query}'")
+         logger.info(f"📝 VISUAL RESPONSE AGENT: Filters: {filters}")
+
+         try:
+             # Call visual retrieval
+             result = self._perform_retrieval(rag_query, filters)
+
+             state["retrieved_documents"] = result.sources
+             state["agent_logs"].append(f"VISUAL RESPONSE AGENT: Retrieved {len(result.sources)} documents")
+
+             logger.info(f"📝 VISUAL RESPONSE AGENT: Retrieved {len(result.sources)} documents")
+
+             # For Visual RAG, we don't check similarity scores (MaxSim scores are different scale)
+             # Just check if we have any documents
+             if not result.sources:
+                 logger.warning(f"⚠️ VISUAL RESPONSE AGENT: No documents retrieved, using LLM knowledge only")
+                 response = self._generate_conversational_response_without_docs(
+                     state["current_query"],
+                     state["messages"]
+                 )
+             else:
+                 # Generate conversational response with documents
+                 response = self._generate_conversational_response(
+                     state["current_query"],
+                     result.sources,
+                     result.answer,
+                     state["messages"],
+                     filters
+                 )
+
+             state["final_response"] = response
+             state["last_ai_message_time"] = time.time()
+
+             logger.info(f"📝 VISUAL RESPONSE AGENT: Answer generation complete")
+
+         except Exception as e:
+             logger.error(f"❌ VISUAL RESPONSE AGENT ERROR: {e}")
+             traceback.print_exc()
+             state["final_response"] = "I apologize, but I encountered an error while retrieving visual documents. Please try again."
+             state["last_ai_message_time"] = time.time()
+
+         return state
+
+     def _fetch_image_as_base64(self, image_url: str, timeout: float = 10.0) -> Optional[str]:
+         """
+         Fetch an image from URL and convert to base64.
+
+         Args:
+             image_url: URL of the image
+             timeout: Request timeout in seconds
+
+         Returns:
+             Base64 encoded image string, or None if failed
+         """
+         try:
+             with httpx.Client(timeout=timeout) as client:
+                 response = client.get(image_url)
+                 response.raise_for_status()
+
+                 # Determine content type
+                 content_type = response.headers.get('content-type', 'image/jpeg')
+                 if 'png' in content_type:
+                     media_type = 'image/png'
+                 elif 'gif' in content_type:
+                     media_type = 'image/gif'
+                 elif 'webp' in content_type:
+                     media_type = 'image/webp'
+                 else:
+                     media_type = 'image/jpeg'
+
+                 # Encode to base64
+                 base64_image = base64.b64encode(response.content).decode('utf-8')
+                 return f"data:{media_type};base64,{base64_image}"
+
+         except Exception as e:
+             logger.warning(f"⚠️ Failed to fetch image {image_url}: {e}")
+             return None
+
+     def _generate_multimodal_response(
+         self,
+         query: str,
+         documents: List[Any],
+         conversation_context: str,
+         correct_names: str,
+         filters: Optional[Dict[str, Any]] = None
+     ) -> Optional[str]:
+         """
+         Generate response using GPT-4o with images (multi-modal).
+
+         Sends the top 3 images by relevance score directly to the LLM.
+
+         Args:
+             query: User query
+             documents: Retrieved visual documents (sorted by score)
+             conversation_context: Formatted conversation history
+             correct_names: Correct district/source names from metadata
+             filters: Applied filters
+
+         Returns:
+             LLM response string, or None if multi-modal generation failed
+         """
+         if not self.openai_client or not self.enable_multimodal:
+             logger.info("🖼️ Multi-modal disabled, skipping")
+             return None
+
+         logger.info("=" * 80)
+         logger.info("🖼️ MULTI-MODAL RESPONSE GENERATION: Starting")
+         logger.info("=" * 80)
+
+         # Get top N images by relevance score
+         top_docs = documents[:MULTIMODAL_MAX_IMAGES]
+         logger.info(f"🖼️ MULTI-MODAL: Processing top {len(top_docs)} documents for image injection")
+
+         # Fetch images and build content
+         image_contents = []
+         image_descriptions = []
+
+         for i, doc in enumerate(top_docs):
+             metadata = getattr(doc, 'metadata', {})
+             content = getattr(doc, 'page_content', '')
+             score = getattr(doc, 'score', 0.0)
+
+             # Get image URL (prefer original for better quality)
+             image_url = metadata.get('original_url') or metadata.get('resized_url') or metadata.get('page')
+
+             if image_url and isinstance(image_url, str) and image_url.startswith('http'):
+                 logger.info(f"🖼️ MULTI-MODAL: Fetching image {i+1}: {image_url[:80]}...")
+
+                 # Fetch and encode image
+                 base64_image = self._fetch_image_as_base64(image_url)
+
+                 if base64_image:
+                     image_contents.append({
+                         "type": "image_url",
+                         "image_url": {
+                             "url": base64_image,
+                             "detail": "high"  # High detail for document analysis
+                         }
+                     })
+
+                     # Build description for this image
+                     desc = f"[Image {i+1}] "
+                     desc += f"File: {metadata.get('filename', 'Unknown')}, "
+                     desc += f"Page: {metadata.get('page_number', 'N/A')}, "
+                     desc += f"Year: {metadata.get('year', 'N/A')}, "
+                     desc += f"District: {metadata.get('district', 'N/A')}, "
+                     desc += f"Score: {score:.3f}"
+                     if content:
+                         desc += f"\nExtracted text preview: {content[:300]}..."
+                     image_descriptions.append(desc)
+
+                     logger.info(f"🖼️ MULTI-MODAL: Image {i+1} loaded successfully")
+                 else:
+                     logger.warning(f"⚠️ MULTI-MODAL: Failed to load image {i+1}")
+             else:
+                 logger.warning(f"⚠️ MULTI-MODAL: No valid image URL for doc {i+1}")
+
+         if not image_contents:
+             logger.warning("⚠️ MULTI-MODAL: No images loaded, falling back to text-only")
+             return None
+
+         logger.info(f"🖼️ MULTI-MODAL: {len(image_contents)} images loaded for LLM")
+
+         # Build the multi-modal prompt
+         system_prompt = """You are a helpful audit report assistant with the ability to SEE document images directly.
+
+ CRITICAL RULES - VISUAL ANALYSIS:
+ 1. **LOOK AT THE IMAGES**: You can see the actual document pages. Analyze tables, charts, figures, and text directly.
+ 2. **ONLY use information visible in the images or provided text** - DO NOT hallucinate
+ 3. **EVERY claim MUST reference which image/document it came from** using [Image 1], [Image 2], etc.
+ 4. **If you see tables or figures in the images, describe what they show**
+ 5. **If information is not visible in any image, explicitly state that**
+ 6. **USE CORRECT NAMES**: Always use the exact names from document metadata provided.
+
+ RESPONSE STYLE:
+ - Be conversational, not technical
+ - Use bullet points and lists when appropriate
+ - Reference specific images: "As shown in [Image 1]..." or "The table in [Image 2] indicates..."
+ - If multiple images show similar information, cite all: [Image 1, Image 2]
+ - Don't describe the image layout/format, focus on the CONTENT
+
+ TONE: Professional but friendly, like talking to a colleague who can see the same documents."""
+
+         # Build user message with images
+         user_content = []
+
+         # Add text context first
+         text_context = f"""Conversation History:
+ {conversation_context}
+
+ Current User Question: {query}
+
+ CORRECT NAMES TO USE (from document metadata):
+ {correct_names}
+
+ Image Descriptions (for reference):
+ {chr(10).join(image_descriptions)}
+
+ INSTRUCTIONS:
+ 1. LOOK at the {len(image_contents)} document images below
+ 2. Answer the user's question based on what you SEE in the images
+ 3. Reference specific images when citing information
+ 4. If the answer isn't visible in any image, say so
+
+ Now analyze the images and answer the question:"""
+
+         user_content.append({
+             "type": "text",
+             "text": text_context
+         })
+
+         # Add images
+         user_content.extend(image_contents)
+
+         try:
+             logger.info(f"🖼️ MULTI-MODAL: Calling {MULTIMODAL_MODEL}...")
+
+             response = self.openai_client.chat.completions.create(
+                 model=MULTIMODAL_MODEL,
+                 messages=[
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_content}
+                 ],
+                 max_tokens=2000,
+                 temperature=0.3
+             )
+
+             response_text = response.choices[0].message.content.strip()
+
+             # Detailed token usage logging
+             usage = response.usage
+             logger.info(f"🖼️ MULTI-MODAL: Response received")
+             logger.info(f"🖼️ MULTI-MODAL: Response length: {len(response_text)} chars")
+             logger.info(f"🖼️ MULTI-MODAL: Response preview: {response_text[:300]}...")
+             logger.info(f"🖼️ MULTI-MODAL: Token usage breakdown:")
+             logger.info(f" 📥 Input tokens (prompt + images): {usage.prompt_tokens}")
+             logger.info(f" 📤 Output tokens (response): {usage.completion_tokens}")
+             logger.info(f" 📊 Total tokens: {usage.total_tokens}")
+
+             # Estimate cost (GPT-4o pricing: $2.50/1M input, $10/1M output as of 2024)
+             input_cost = (usage.prompt_tokens / 1_000_000) * 2.50
+             output_cost = (usage.completion_tokens / 1_000_000) * 10.00
+             total_cost = input_cost + output_cost
+             logger.info(f" 💰 Estimated cost: ${total_cost:.4f} (input: ${input_cost:.4f}, output: ${output_cost:.4f})")
+
+             return response_text
+
+         except Exception as e:
+             logger.error(f"❌ MULTI-MODAL ERROR: {e}")
+             traceback.print_exc()
+             return None
+
+     def _generate_conversational_response(
+         self,
+         query: str,
+         documents: List[Any],
+         rag_answer: str,
+         messages: List[Any],
+         filters: Optional[Dict[str, Any]] = None
+     ) -> str:
+         """
+         Generate conversational response from visually retrieved documents.
+
+         Phase 2 IMPLEMENTED: Multi-modal LLM support
+         - Sends top 3 images (by relevance) directly to GPT-4o
+         - LLM can see tables, charts, figures directly
+         - Falls back to text-only if multi-modal fails
+
+         Current implementation: Multi-modal with text fallback
+
+         Args:
+             query: User query
+             documents: Retrieved visual documents
+             rag_answer: RAG answer (empty for visual search)
+             messages: Conversation history
+             filters: Applied filters
+
+         Returns:
+             LLM response
+         """
+         logger.info("=" * 80)
+         logger.info("💬 VISUAL RESPONSE GENERATION: Starting")
+         logger.info("=" * 80)
+         logger.info(f"💬 VISUAL RESPONSE GENERATION: Processing {len(documents)} visual documents")
+         logger.info(f"💬 VISUAL RESPONSE GENERATION: Query: '{query}'")
+         logger.info(f"💬 VISUAL RESPONSE GENERATION: Filters applied: {filters}")
+
+         # Log each document's metadata and content preview
+         for i, doc in enumerate(documents[:5]):  # Log first 5 docs
+             metadata = getattr(doc, 'metadata', {})
+             content = getattr(doc, 'page_content', '')
+             logger.info(f"📄 DOC {i+1} METADATA: filename={metadata.get('filename', 'N/A')}, "
+                         f"year={metadata.get('year', 'N/A')}, district={metadata.get('district', 'N/A')}, "
+                         f"source={metadata.get('source', 'N/A')}, page={metadata.get('page_number', 'N/A')}")
+             logger.info(f"📄 DOC {i+1} CONTENT PREVIEW: {content[:200]}..." if content else f"📄 DOC {i+1} CONTENT: EMPTY/NONE")
+             logger.info(f"📄 DOC {i+1} CONTENT LENGTH: {len(content)} chars" if content else f"📄 DOC {i+1} CONTENT LENGTH: 0")
+
+         # Build conversation history context
+         conversation_context = self._build_conversation_context_for_response(messages)
+
+         # Build detailed document information
+         document_details = self._build_visual_document_details(documents)
+         logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
+
+         # Extract correct names from documents
+         correct_names = self._extract_correct_names_from_documents(documents)
+         logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
+
+         # ============================================================
+         # PHASE 2: Try multi-modal generation first (GPT-4o with images)
+         # ============================================================
+         if self.enable_multimodal:
+             logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
+
+             multimodal_response = self._generate_multimodal_response(
+                 query=query,
+                 documents=documents,
+                 conversation_context=conversation_context,
+                 correct_names=correct_names,
+                 filters=filters
+             )
+
+             if multimodal_response:
+                 logger.info("✅ VISUAL RESPONSE GENERATION: Multi-modal generation successful!")
+
+                 # Validate and enhance the response
+                 final_response = self._validate_and_enhance_response(
+                     multimodal_response,
+                     documents,
+                     query,
+                     filters
+                 )
+
+                 logger.info("=" * 80)
+                 return final_response
+             else:
+                 logger.warning("⚠️ VISUAL RESPONSE GENERATION: Multi-modal failed, falling back to text-only")
+         else:
+             logger.info("📝 VISUAL RESPONSE GENERATION: Multi-modal disabled, using text-only mode")
+
+         # ============================================================
+         # FALLBACK: Text-only generation (extracted text from pages)
+         # ============================================================
+         logger.info("📝 VISUAL RESPONSE GENERATION: Using text-only mode")
+
+         # Create response prompt
+         response_prompt = ChatPromptTemplate.from_messages([
+             SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.
+
+ CRITICAL RULES - NO HALLUCINATION:
+ 1. **ONLY use information from the retrieved documents provided below**
+ 2. **EVERY sentence with facts, numbers, or specific claims MUST have a [Doc i] reference**
+ 3. **If a document doesn't contain the information, DO NOT make it up**
+ 4. **If the user asks about a year/district that's NOT in the retrieved documents, explicitly state that**
+ 5. **Check the document years/districts before making any claims about them**
+ 6. **USE CORRECT NAMES**: Always use the exact names from document metadata, not misspellings from conversation.
+
+ RULES:
+ 1. Answer the user's question directly and clearly
+ 2. Use ONLY the retrieved documents as evidence - DO NOT use your training data
+ 3. Be conversational, not technical
+ 4. Don't mention scores, retrieval details, or technical implementation
+ 5. If relevant documents were found, reference them naturally
+ 6. If no relevant documents, say you do not have enough information - DO NOT hallucinate
+ 7. If the passages have useful facts or numbers, use them in your answer WITH references
+ 8. **MANDATORY**: When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence.
+ 9. Do not use the sentence 'Doc i says ...' to say where information came from.
+ 10. If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
+ 11. Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+ 12. If it makes sense, use bullet points and lists to make your answers easier to understand.
+ 13. You do not need to use every passage. Only use the ones that help answer the question.
+ 14. **VERIFY**: Before mentioning any year, district, or number, check that it exists in the retrieved documents.
+ 15. **NO HALLUCINATION**: If documents show years 2021, 2022, 2023 but user asks about 2020, DO NOT provide 2020 data.
+ 16. **USE CORRECT SPELLING**: Always use the district/source names exactly as they appear in the document metadata below.
+
+ NOTE: These documents were retrieved using advanced visual search (ColPali), so they may contain tables, figures, or structured data.
+
+ TONE: Professional but friendly, like talking to a colleague."""),
+             HumanMessage(content=f"""Conversation History:
+ {conversation_context}
+
+ Current User Question: {query}
+
+ Retrieved Documents: {len(documents)} documents found
+
+ CORRECT NAMES TO USE (from document metadata - use these exact spellings):
+ {correct_names}
+
+ Full Document Details:
+ {document_details}
+
+ CRITICAL:
+ - Responses should be grounded to what is available in the retrieved documents
+ - If user asks about a specific year but documents show other years, explicitly state "can't provide response on ... because ..."
+ - Every factual claim MUST have [Doc i] reference
+ - If information is not in documents, explicitly state it's not available
+ - **USE THE CORRECT DISTRICT/SOURCE NAMES from the document metadata above**
+
+ Generate a conversational response with proper document references:""")
+         ])
+
+         try:
+             logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
+             response = self.llm.invoke(response_prompt.format_messages())
+             response_text = response.content.strip()
+
+             logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
+             logger.info(f"📝 TEXT-ONLY GENERATION: Response length: {len(response_text)} chars")
+             logger.info(f"📝 TEXT-ONLY GENERATION: Response preview: {response_text[:300]}...")
+
+             # Check if response indicates no information found
+             no_info_indicators = [
+                 "don't have", "do not have", "isn't available", "is not available",
+                 "no information", "cannot provide", "can't provide", "not in the retrieved"
+             ]
+             if any(indicator in response_text.lower() for indicator in no_info_indicators):
+                 logger.warning("⚠️ TEXT-ONLY GENERATION: Response indicates NO INFORMATION FOUND")
+                 logger.warning("⚠️ This could mean:")
+                 logger.warning(" 1. Retrieved documents don't contain relevant content")
+                 logger.warning(" 2. Extracted text from visual documents is empty/poor quality")
+                 logger.warning(" 3. Query doesn't match document content")
+
+             # Validate and enhance the response
+             final_response = self._validate_and_enhance_response(
+                 response_text,
+                 documents,
+                 query,
+                 filters
+             )
+
+             logger.info("=" * 80)
+             return final_response
+
+         except Exception as e:
+             logger.error(f"❌ TEXT-ONLY GENERATION: Error during generation: {e}")
+             traceback.print_exc()
+             return "I apologize, but I encountered an error generating the response."
+
+     def _generate_conversational_response_without_docs(
+         self,
+         query: str,
+         messages: List[Any]
+     ) -> str:
+         """
+         Generate conversational response using only LLM knowledge.
+
+         Args:
+             query: User query
+             messages: Conversation history
+
+         Returns:
+             LLM response
+         """
+         logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
+
+         # Build conversation context
+         conversation_context = ""
+         for msg in messages[-6:]:
+             if isinstance(msg, HumanMessage):
+                 conversation_context += f"User: {msg.content}\n"
+             elif isinstance(msg, AIMessage):
+                 conversation_context += f"Assistant: {msg.content}\n"
+
+         # Create response prompt
+         response_prompt = ChatPromptTemplate.from_messages([
+             SystemMessage(content="""You are a helpful audit report assistant.
+
+ RULES:
+ 1. Politely explain that no relevant documents were found with high enough similarity
+ 2. Suggest rephrasing the query or being more specific
+ 3. Suggest checking if the information might be in a different year/source/district
+ 4. Stay professional but friendly
+
+ TONE: Professional but friendly, like talking to a colleague."""),
+             HumanMessage(content=f"""Current Question: {query}
+
+ Conversation History:
+ {conversation_context}
+
+ Note: No relevant documents were found with high enough similarity scores.
+
+ Generate a helpful response:""")
+         ])
+
+         try:
+             response = self.llm.invoke(response_prompt.format_messages())
+             return response.content.strip()
+         except Exception as e:
+             logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
+             return "I couldn't find relevant documents for your query. Please try rephrasing or being more specific."
+
+     def _build_conversation_context_for_response(self, messages: List[Any]) -> str:
+         """Build conversation history context for response generation"""
+         context_lines = []
+         for msg in messages[-6:]:
+             if isinstance(msg, HumanMessage):
+                 context_lines.append(f"User: {msg.content}")
+             elif isinstance(msg, AIMessage):
+                 context_lines.append(f"Assistant: {msg.content}")
+
+         return "\n".join(context_lines) if context_lines else "No previous conversation."
+
+     def _build_visual_document_details(self, documents: List[Any]) -> str:
+         """Build detailed document information for response generation"""
+         if not documents:
+             logger.warning("📄 BUILD_DETAILS: No documents to process!")
+             return "No documents retrieved."
+
+         logger.info(f"📄 BUILD_DETAILS: Processing {len(documents)} documents for LLM context")
+
+         details = []
+         docs_with_content = 0
+         docs_without_content = 0
+         total_content_length = 0
+
+         for i, doc in enumerate(documents[:15], 1):
+             metadata = getattr(doc, 'metadata', {})
+             content = getattr(doc, 'page_content', '')
+             score = getattr(doc, 'score', 0.0)
+
+             filename = metadata.get('filename', 'Unknown')
+             year = metadata.get('year', 'Unknown')
+             district = metadata.get('district', 'Unknown')
+             source = metadata.get('source', 'Unknown')
+             page = metadata.get('page_number', metadata.get('page', 'Unknown'))
+
+             # Visual metadata
+             num_tiles = metadata.get('num_tiles')
+             num_visual_tokens = metadata.get('num_visual_tokens')
+
+             doc_info = f"[Doc {i}] (Score: {score:.3f})"
+             doc_info += f"\n Filename: {filename}"
+             doc_info += f"\n Year: {year}"
+             doc_info += f"\n Source: {source}"
+             if district != 'Unknown':
+                 doc_info += f"\n District: {district}"
+             doc_info += f"\n Page: {page}"
+
+             # Add visual metadata if available
+             if num_tiles or num_visual_tokens:
+                 doc_info += f"\n Visual: {num_tiles} tiles, {num_visual_tokens} tokens"
+
+             # Add content preview
+             if content and content.strip():
+                 doc_info += f"\n Content: {content[:500]}{'...' if len(content) > 500 else ''}"
+                 docs_with_content += 1
+                 total_content_length += len(content)
+             else:
+                 doc_info += "\n Content: (No text extracted - image-only page)"
+                 docs_without_content += 1
+
+             details.append(doc_info)
+
+         # Log summary
+         logger.info(f"📄 BUILD_DETAILS SUMMARY:")
+         logger.info(f" - Documents with text content: {docs_with_content}")
+         logger.info(f" - Documents WITHOUT text (image-only): {docs_without_content}")
+         logger.info(f" - Total text content length: {total_content_length} chars")
+
+         if docs_without_content > docs_with_content:
+             logger.warning(f"⚠️ BUILD_DETAILS: Most documents have NO TEXT CONTENT!")
+             logger.warning(f"⚠️ This is likely why the LLM says 'no information available'")
+
+         return "\n\n".join(details) if details else "No document details available."
+
+     def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
+         """Extract correct district/source names from documents to correct misspellings"""
+         districts = set()
+         sources = set()
+         years = set()
+
+         for doc in documents:
+             metadata = getattr(doc, 'metadata', {})
+             if metadata.get('district'):
+                 districts.add(str(metadata['district']))
+             if metadata.get('source'):
+                 sources.add(str(metadata['source']))
+             if metadata.get('year'):
+                 years.add(str(metadata['year']))
+
+         result = []
+         if districts:
+             result.append(f"Districts: {', '.join(sorted(districts))}")
+         if sources:
+             result.append(f"Sources: {', '.join(sorted(sources))}")
+         if years:
+             result.append(f"Years: {', '.join(sorted(years))}")
+
+         if result:
+             return "\n".join(result) + "\n\nIMPORTANT: Use these EXACT spellings in your response."
+         return "No metadata available."
+
+     def _validate_and_enhance_response(
+         self,
+         response: str,
+         documents: List[Any],
+         query: str,
+         filters: Optional[Dict[str, Any]] = None
+     ) -> str:
+         """
+         Validate response and add warnings about data coverage gaps.
+
+         Compares REQUESTED filters against RETRIEVED document metadata.
+
+         Args:
+             response: LLM-generated response
+             documents: Retrieved documents
+             query: User query
+             filters: Applied filters (year, district, etc.)
+
+         Returns:
+             Response with optional warnings appended
+         """
+         # Extract years and districts from RETRIEVED documents
+         doc_years = set()
+         doc_districts = set()
+
+         for doc in documents:
+             metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else {}
+             if isinstance(metadata, dict):
+                 if metadata.get('year'):
+                     doc_years.add(str(metadata['year']))
+                 if metadata.get('district'):
+                     doc_districts.add(str(metadata['district']))
+
+         logger.info(f"📊 VALIDATION: Retrieved docs cover years={doc_years}, districts={doc_districts}")
+
+         warnings = []
+
+         # Get REQUESTED filters
+         requested_years = set()
+         requested_districts = set()
+
+         if filters:
+             if filters.get('year'):
+                 if isinstance(filters['year'], list):
+                     requested_years = set(str(y) for y in filters['year'])
+                 else:
+                     requested_years = {str(filters['year'])}
+             if filters.get('district'):
+                 if isinstance(filters['district'], list):
+                     requested_districts = set(str(d) for d in filters['district'])
+                 else:
+                     requested_districts = {str(filters['district'])}
+
+         logger.info(f"📊 VALIDATION: Requested years={requested_years}, districts={requested_districts}")
+
+         # Compare requested vs retrieved YEARS
+         if requested_years and doc_years:
+             missing_years = requested_years - doc_years
+             if missing_years:
+                 warnings.append(
+                     f"You requested data for years {', '.join(sorted(requested_years))}, "
+                     f"but the retrieved documents only cover {', '.join(sorted(doc_years))}. "
+                     f"Data for {', '.join(sorted(missing_years))} may not be available."
+                 )
+
+         # Compare requested vs retrieved DISTRICTS
+         if requested_districts and doc_districts:
+             # Normalize for comparison (case-insensitive)
+             requested_lower = {d.lower() for d in requested_districts}
+             doc_lower = {d.lower() for d in doc_districts}
+             missing_lower = requested_lower - doc_lower
+
+             if missing_lower:
+                 missing_districts = [d for d in requested_districts if d.lower() in missing_lower]
+                 warnings.append(
+                     f"You requested data for districts {', '.join(sorted(requested_districts))}, "
+                     f"but the retrieved documents only cover {', '.join(sorted(doc_districts))}. "
+                     f"Data for {', '.join(sorted(missing_districts))} may not be available."
+                 )
+
+         # Add warnings to response if any
+         if warnings and "⚠️" not in response:
+             warning_text = "\n\n⚠️ **Note:** " + " ".join(warnings)
+             response = response + warning_text
+             logger.info(f"📊 VALIDATION: Added warning about data coverage")
+
+         return response
+
+
+ def get_visual_multi_agent_chatbot() -> VisualMultiAgentChatbot:
+     """
+     Factory function to create a visual multi-agent chatbot.
+
+     Returns:
+         Initialized VisualMultiAgentChatbot
+     """
+     logger.info("🎨 Creating Visual Multi-Agent Chatbot...")
+
+     # Get Qdrant credentials for ColPali cluster
+     qdrant_url = (
+         os.environ.get("QDRANT_URL_AKRYL") or
+         os.environ.get("DEST_QDRANT_URL") or
+         os.environ.get("QDRANT_URL")
+     )
+     qdrant_api_key = (
+         os.environ.get("QDRANT_API_KEY_AKRYL") or
+         os.environ.get("DEST_QDRANT_API_KEY") or
+         os.environ.get("QDRANT_API_KEY")
+     )
+
+     if not qdrant_url or not qdrant_api_key:
+         raise ValueError(
+             "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
+             "Please set one of these in your .env file:\n"
+             " - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
+             " - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
+             " - QDRANT_URL and QDRANT_API_KEY"
+         )
+
+     logger.info(f" Using Qdrant URL: {qdrant_url}")
+     logger.info(f" Collection: colSmol-500M")
+     logger.info(f" Multi-modal: {MULTIMODAL_ENABLED} (model: {MULTIMODAL_MODEL}, max_images: {MULTIMODAL_MAX_IMAGES})")
+
+     # Create visual search adapter
+     visual_search = VisualSearchAdapter(
+         qdrant_url=qdrant_url,
+         qdrant_api_key=qdrant_api_key,
+         collection_name="colSmol-500M"
+     )
+
+     # Create multi-agent chatbot with multi-modal enabled
+     chatbot = VisualMultiAgentChatbot(
+         visual_search=visual_search,
+         config_path="src/config/settings.yaml",
+         enable_multimodal=MULTIMODAL_ENABLED
+     )
+
+     return chatbot
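+
+
+ # Usage sketch (illustrative only, kept as a comment so nothing runs on import).
+ # get_visual_multi_agent_chatbot() is defined above; the query entry point is
+ # inherited from BaseMultiAgentChatbot, so the method name "chat" used below is
+ # an assumption and should be replaced with whatever the base class actually exposes.
+ #
+ #   from src.agents.visual_multi_agent_chatbot import get_visual_multi_agent_chatbot
+ #
+ #   bot = get_visual_multi_agent_chatbot()
+ #   answer = bot.chat("Summarize the audit findings for 2023")  # hypothetical method name
+ #   print(answer)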