add VisionRAG Agent

src/agents/visual_multi_agent_chatbot.py
ADDED
@@ -0,0 +1,891 @@
"""
Visual Multi-Agent RAG Chatbot using LangGraph

This system uses the same sophisticated multi-agent architecture as v1,
but with visual document retrieval (ColPali) instead of text-based RAG.

Inherits from BaseMultiAgentChatbot to get:
- LLM-based query analysis
- Filter extraction and validation
- Query rewriting
- Main/RAG/Response agent orchestration

Only implements:
- Visual search retrieval
- Response generation with visual context

Phase 2 IMPLEMENTED: Multi-modal LLM support
- Top 3 images (by relevance) are sent directly to GPT-4o
- LLM can see tables, charts, figures directly
- Falls back to text-only if multi-modal fails
"""
import os
import time
import base64
import logging
import traceback
import httpx
from typing import Dict, List, Any, Optional

from openai import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage

from src.agents.base_multi_agent_chatbot import BaseMultiAgentChatbot, MultiAgentState
from src.colpali.visual_search import VisualSearchAdapter

logger = logging.getLogger(__name__)

# Multi-modal LLM configuration
MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o")  # GPT-4o supports vision
MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3"))  # Top N images by relevance score
MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true"  # Toggle for multi-modal mode
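
# A minimal configuration sketch (illustrative values only): the three settings
# above can be overridden via environment variables, e.g. in a .env file:
#   VISUAL_RAG_MODEL=gpt-4o
#   VISUAL_RAG_MAX_IMAGES=3
#   VISUAL_RAG_MULTIMODAL=true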


class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
    """Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""

    def __init__(
        self,
        visual_search: VisualSearchAdapter,
        config_path: str = "src/config/settings.yaml",
        enable_multimodal: bool = True
    ):
        """
        Initialize the visual multi-agent chatbot.

        Args:
            visual_search: Visual search adapter (ColPali)
            config_path: Path to config file
            enable_multimodal: Whether to use multi-modal LLM (GPT-4o with images)
        """
        self.visual_search = visual_search
        self.enable_multimodal = enable_multimodal and MULTIMODAL_ENABLED

        # Initialize OpenAI client for multi-modal (GPT-4o)
        self.openai_client = None
        if self.enable_multimodal:
            api_key = os.environ.get("OPENAI_API_KEY")
            if api_key:
                self.openai_client = OpenAI(api_key=api_key)
                logger.info(f"🖼️ Multi-modal LLM initialized: {MULTIMODAL_MODEL}")
            else:
                logger.warning("⚠️ OPENAI_API_KEY not set, multi-modal disabled")
                self.enable_multimodal = False

        # Call parent init (sets up LLM, filters, graph, etc.)
        super().__init__(config_path)

        logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")

    def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
        """
        Perform visual search retrieval.

        Args:
            query: The rewritten query
            filters: The filters to apply

        Returns:
            Result object with .sources and .answer attributes
        """
        logger.info("🔍 VISUAL RETRIEVAL: Performing visual search")
        logger.info(f"🔍 VISUAL RETRIEVAL: Query: '{query}'")
        logger.info(f"🔍 VISUAL RETRIEVAL: Filters: {filters}")

        # Convert filters to visual search format
        visual_filters = {}

        if filters.get("sources"):
            visual_filters["sources"] = filters["sources"]

        if filters.get("year"):
            # Convert to "years" (plural) for visual search
            visual_filters["years"] = filters["year"]

        if filters.get("district"):
            # Convert to "districts" (plural) for visual search
            visual_filters["districts"] = filters["district"]

        if filters.get("filenames"):
            visual_filters["filenames"] = filters["filenames"]

        logger.info(f"🔍 VISUAL RETRIEVAL: Converted filters: {visual_filters}")
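        # Shape sketch (hypothetical values): an incoming {"year": [2022],
        # "district": ["Gulu"]} becomes {"years": [2022], "districts": ["Gulu"]},
        # since the visual index keys year/district filters in the plural form.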

        # Lightweight result container in the format expected by the base class
        class Result:
            def __init__(self, sources, answer=""):
                self.sources = sources
                self.answer = answer

        # Perform visual search
        try:
            visual_results = self.visual_search.search(
                query=query,
                top_k=10,
                filters=visual_filters,
                search_strategy="multi_vector"
            )

            logger.info(f"🔍 VISUAL RETRIEVAL: Retrieved {len(visual_results)} visual documents")

            # Return in format expected by base class
            return Result(visual_results, "")

        except Exception as e:
            logger.error(f"❌ VISUAL RETRIEVAL: Error during visual search: {e}")
            traceback.print_exc()

            # Return empty result
            return Result([], "")

    def _response_agent(self, state: MultiAgentState) -> MultiAgentState:
        """
        Override response agent for Visual RAG.

        Visual RAG uses MaxSim scores (typically 10-30+) instead of cosine similarity (0-1),
        so we skip the similarity score threshold check that's in the base class.
        """
        logger.info("📊 VISUAL RESPONSE AGENT: Starting document retrieval and answer generation")

        rag_query = state["rag_query"]
        filters = state["rag_filters"]

        logger.info(f"📊 VISUAL RESPONSE AGENT: Query: '{rag_query}'")
        logger.info(f"📊 VISUAL RESPONSE AGENT: Filters: {filters}")

        try:
            # Call visual retrieval
            result = self._perform_retrieval(rag_query, filters)

            state["retrieved_documents"] = result.sources
            state["agent_logs"].append(f"VISUAL RESPONSE AGENT: Retrieved {len(result.sources)} documents")

            logger.info(f"📊 VISUAL RESPONSE AGENT: Retrieved {len(result.sources)} documents")

            # For Visual RAG, we don't check similarity scores (MaxSim scores are different scale)
            # Just check if we have any documents
            if not result.sources:
                logger.warning("⚠️ VISUAL RESPONSE AGENT: No documents retrieved, using LLM knowledge only")
                response = self._generate_conversational_response_without_docs(
                    state["current_query"],
                    state["messages"]
                )
            else:
                # Generate conversational response with documents
                response = self._generate_conversational_response(
                    state["current_query"],
                    result.sources,
                    result.answer,
                    state["messages"],
                    filters
                )

            state["final_response"] = response
            state["last_ai_message_time"] = time.time()

            logger.info("📊 VISUAL RESPONSE AGENT: Answer generation complete")

        except Exception as e:
            logger.error(f"❌ VISUAL RESPONSE AGENT ERROR: {e}")
            traceback.print_exc()
            state["final_response"] = "I apologize, but I encountered an error while retrieving visual documents. Please try again."
            state["last_ai_message_time"] = time.time()

        return state

    def _fetch_image_as_base64(self, image_url: str, timeout: float = 10.0) -> Optional[str]:
        """
        Fetch an image from URL and convert to base64.

        Args:
            image_url: URL of the image
            timeout: Request timeout in seconds

        Returns:
            Base64 encoded image string, or None if failed
        """
        try:
            with httpx.Client(timeout=timeout) as client:
                response = client.get(image_url)
                response.raise_for_status()

                # Determine content type
                content_type = response.headers.get('content-type', 'image/jpeg')
                if 'png' in content_type:
                    media_type = 'image/png'
                elif 'gif' in content_type:
                    media_type = 'image/gif'
                elif 'webp' in content_type:
                    media_type = 'image/webp'
                else:
                    media_type = 'image/jpeg'

                # Encode to base64
                base64_image = base64.b64encode(response.content).decode('utf-8')
                return f"data:{media_type};base64,{base64_image}"

        except Exception as e:
            logger.warning(f"⚠️ Failed to fetch image {image_url}: {e}")
            return None

    def _generate_multimodal_response(
        self,
        query: str,
        documents: List[Any],
        conversation_context: str,
        correct_names: str,
        filters: Dict[str, Any] = None
    ) -> Optional[str]:
        """
        Generate response using GPT-4o with images (multi-modal).

        Sends top 3 images by relevance score directly to the LLM.

        Args:
            query: User query
            documents: Retrieved visual documents (sorted by score)
            conversation_context: Formatted conversation history
            correct_names: Correct district/source names from metadata
            filters: Applied filters

        Returns:
            LLM response string, or None if multi-modal generation failed
        """
        if not self.openai_client or not self.enable_multimodal:
            logger.info("🖼️ Multi-modal disabled, skipping")
            return None

        logger.info("=" * 80)
        logger.info("🖼️ MULTI-MODAL RESPONSE GENERATION: Starting")
        logger.info("=" * 80)

        # Get top N images by relevance score
        top_docs = documents[:MULTIMODAL_MAX_IMAGES]
        logger.info(f"🖼️ MULTI-MODAL: Processing top {len(top_docs)} documents for image injection")

        # Fetch images and build content
        image_contents = []
        image_descriptions = []

        for i, doc in enumerate(top_docs):
            metadata = getattr(doc, 'metadata', {})
            content = getattr(doc, 'page_content', '')
            score = getattr(doc, 'score', 0.0)

            # Get image URL (prefer original for better quality)
            image_url = metadata.get('original_url') or metadata.get('resized_url') or metadata.get('page')

            if image_url and isinstance(image_url, str) and image_url.startswith('http'):
                logger.info(f"🖼️ MULTI-MODAL: Fetching image {i+1}: {image_url[:80]}...")

                # Fetch and encode image
                base64_image = self._fetch_image_as_base64(image_url)

                if base64_image:
                    image_contents.append({
                        "type": "image_url",
                        "image_url": {
                            "url": base64_image,
                            "detail": "high"  # High detail for document analysis
                        }
                    })

                    # Build description for this image
                    desc = f"[Image {i+1}] "
                    desc += f"File: {metadata.get('filename', 'Unknown')}, "
                    desc += f"Page: {metadata.get('page_number', 'N/A')}, "
                    desc += f"Year: {metadata.get('year', 'N/A')}, "
                    desc += f"District: {metadata.get('district', 'N/A')}, "
                    desc += f"Score: {score:.3f}"
                    if content:
                        desc += f"\nExtracted text preview: {content[:300]}..."
                    image_descriptions.append(desc)

                    logger.info(f"🖼️ MULTI-MODAL: Image {i+1} loaded successfully")
                else:
                    logger.warning(f"⚠️ MULTI-MODAL: Failed to load image {i+1}")
            else:
                logger.warning(f"⚠️ MULTI-MODAL: No valid image URL for doc {i+1}")

        if not image_contents:
            logger.warning("⚠️ MULTI-MODAL: No images loaded, falling back to text-only")
            return None

        logger.info(f"🖼️ MULTI-MODAL: {len(image_contents)} images loaded for LLM")

        # Build the multi-modal prompt
        system_prompt = """You are a helpful audit report assistant with the ability to SEE document images directly.

CRITICAL RULES - VISUAL ANALYSIS:
1. **LOOK AT THE IMAGES**: You can see the actual document pages. Analyze tables, charts, figures, and text directly.
2. **ONLY use information visible in the images or provided text** - DO NOT hallucinate
3. **EVERY claim MUST reference which image/document it came from** using [Image 1], [Image 2], etc.
4. **If you see tables or figures in the images, describe what they show**
5. **If information is not visible in any image, explicitly state that**
6. **USE CORRECT NAMES**: Always use the exact names from document metadata provided.

RESPONSE STYLE:
- Be conversational, not technical
- Use bullet points and lists when appropriate
- Reference specific images: "As shown in [Image 1]..." or "The table in [Image 2] indicates..."
- If multiple images show similar information, cite all: [Image 1, Image 2]
- Don't describe the image layout/format, focus on the CONTENT

TONE: Professional but friendly, like talking to a colleague who can see the same documents."""

        # Build user message with images
        user_content = []

        # Add text context first
        text_context = f"""Conversation History:
{conversation_context}

Current User Question: {query}

CORRECT NAMES TO USE (from document metadata):
{correct_names}

Image Descriptions (for reference):
{chr(10).join(image_descriptions)}

INSTRUCTIONS:
1. LOOK at the {len(image_contents)} document images below
2. Answer the user's question based on what you SEE in the images
3. Reference specific images when citing information
4. If the answer isn't visible in any image, say so

Now analyze the images and answer the question:"""

        user_content.append({
            "type": "text",
            "text": text_context
        })

        # Add images
        user_content.extend(image_contents)

        try:
            logger.info(f"🖼️ MULTI-MODAL: Calling {MULTIMODAL_MODEL}...")

            response = self.openai_client.chat.completions.create(
                model=MULTIMODAL_MODEL,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content}
                ],
                max_tokens=2000,
                temperature=0.3
            )

            response_text = response.choices[0].message.content.strip()

            # Detailed token usage logging
            usage = response.usage
            logger.info("🖼️ MULTI-MODAL: Response received")
            logger.info(f"🖼️ MULTI-MODAL: Response length: {len(response_text)} chars")
            logger.info(f"🖼️ MULTI-MODAL: Response preview: {response_text[:300]}...")
            logger.info("🖼️ MULTI-MODAL: Token usage breakdown:")
            logger.info(f"   📥 Input tokens (prompt + images): {usage.prompt_tokens}")
            logger.info(f"   📤 Output tokens (response): {usage.completion_tokens}")
            logger.info(f"   📊 Total tokens: {usage.total_tokens}")

            # Estimate cost (GPT-4o pricing: $2.50/1M input, $10/1M output as of 2024)
            input_cost = (usage.prompt_tokens / 1_000_000) * 2.50
            output_cost = (usage.completion_tokens / 1_000_000) * 10.00
            total_cost = input_cost + output_cost
            logger.info(f"   💰 Estimated cost: ${total_cost:.4f} (input: ${input_cost:.4f}, output: ${output_cost:.4f})")

            return response_text

        except Exception as e:
            logger.error(f"❌ MULTI-MODAL ERROR: {e}")
            traceback.print_exc()
            return None

    def _generate_conversational_response(
        self,
        query: str,
        documents: List[Any],
        rag_answer: str,
        messages: List[Any],
        filters: Dict[str, Any] = None
    ) -> str:
        """
        Generate conversational response from visually retrieved documents.

        Phase 2 IMPLEMENTED: Multi-modal LLM support
        - Sends top 3 images (by relevance) directly to GPT-4o
        - LLM can see tables, charts, figures directly
        - Falls back to text-only if multi-modal fails

        Current implementation: Multi-modal with text fallback

        Args:
            query: User query
            documents: Retrieved visual documents
            rag_answer: RAG answer (empty for visual search)
            messages: Conversation history

        Returns:
            LLM response
        """
        logger.info("=" * 80)
        logger.info("💬 VISUAL RESPONSE GENERATION: Starting")
        logger.info("=" * 80)
        logger.info(f"💬 VISUAL RESPONSE GENERATION: Processing {len(documents)} visual documents")
        logger.info(f"💬 VISUAL RESPONSE GENERATION: Query: '{query}'")
        logger.info(f"💬 VISUAL RESPONSE GENERATION: Filters applied: {filters}")

        # Log each document's metadata and content preview
        for i, doc in enumerate(documents[:5]):  # Log first 5 docs
            metadata = getattr(doc, 'metadata', {})
            content = getattr(doc, 'page_content', '')
            logger.info(f"📄 DOC {i+1} METADATA: filename={metadata.get('filename', 'N/A')}, "
                        f"year={metadata.get('year', 'N/A')}, district={metadata.get('district', 'N/A')}, "
                        f"source={metadata.get('source', 'N/A')}, page={metadata.get('page_number', 'N/A')}")
            logger.info(f"📄 DOC {i+1} CONTENT PREVIEW: {content[:200]}..." if content else f"📄 DOC {i+1} CONTENT: EMPTY/NONE")
            logger.info(f"📄 DOC {i+1} CONTENT LENGTH: {len(content)} chars" if content else f"📄 DOC {i+1} CONTENT LENGTH: 0")

        # Build conversation history context
        conversation_context = self._build_conversation_context_for_response(messages)

        # Build detailed document information
        document_details = self._build_visual_document_details(documents)
        logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")

        # Extract correct names from documents
        correct_names = self._extract_correct_names_from_documents(documents)
        logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")

        # ============================================================
        # PHASE 2: Try multi-modal generation first (GPT-4o with images)
        # ============================================================
        if self.enable_multimodal:
            logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")

            multimodal_response = self._generate_multimodal_response(
                query=query,
                documents=documents,
                conversation_context=conversation_context,
                correct_names=correct_names,
                filters=filters
            )

            if multimodal_response:
                logger.info("✅ VISUAL RESPONSE GENERATION: Multi-modal generation successful!")

                # Validate and enhance the response
                final_response = self._validate_and_enhance_response(
                    multimodal_response,
                    documents,
                    query,
                    filters
                )

                logger.info("=" * 80)
                return final_response
            else:
                logger.warning("⚠️ VISUAL RESPONSE GENERATION: Multi-modal failed, falling back to text-only")
        else:
            logger.info("📝 VISUAL RESPONSE GENERATION: Multi-modal disabled, using text-only mode")

        # ============================================================
        # FALLBACK: Text-only generation (extracted text from pages)
        # ============================================================
        logger.info("📝 VISUAL RESPONSE GENERATION: Using text-only mode")

        # Create response prompt
        response_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.

CRITICAL RULES - NO HALLUCINATION:
1. **ONLY use information from the retrieved documents provided below**
2. **EVERY sentence with facts, numbers, or specific claims MUST have a [Doc i] reference**
3. **If a document doesn't contain the information, DO NOT make it up**
4. **If the user asks about a year/district that's NOT in the retrieved documents, explicitly state that**
5. **Check the document years/districts before making any claims about them**
6. **USE CORRECT NAMES**: Always use the exact names from document metadata, not misspellings from conversation.

RULES:
1. Answer the user's question directly and clearly
2. Use ONLY the retrieved documents as evidence - DO NOT use your training data
3. Be conversational, not technical
4. Don't mention scores, retrieval details, or technical implementation
5. If relevant documents were found, reference them naturally
6. If no relevant documents, say you do not have enough information - DO NOT hallucinate
7. If the passages have useful facts or numbers, use them in your answer WITH references
8. **MANDATORY**: When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence.
9. Do not use the sentence 'Doc i says ...' to say where information came from.
10. If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
11. Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
12. If it makes sense, use bullet points and lists to make your answers easier to understand.
13. You do not need to use every passage. Only use the ones that help answer the question.
14. **VERIFY**: Before mentioning any year, district, or number, check that it exists in the retrieved documents.
15. **NO HALLUCINATION**: If documents show years 2021, 2022, 2023 but user asks about 2020, DO NOT provide 2020 data.
16. **USE CORRECT SPELLING**: Always use the district/source names exactly as they appear in the document metadata below.

NOTE: These documents were retrieved using advanced visual search (ColPali), so they may contain tables, figures, or structured data.

TONE: Professional but friendly, like talking to a colleague."""),
            HumanMessage(content=f"""Conversation History:
{conversation_context}

Current User Question: {query}

Retrieved Documents: {len(documents)} documents found

CORRECT NAMES TO USE (from document metadata - use these exact spellings):
{correct_names}

Full Document Details:
{document_details}

CRITICAL:
- Responses should be grounded to what is available in the retrieved documents
- If user asks about a specific year but documents show other years, explicitly state "can't provide response on ... because ..."
- Every factual claim MUST have [Doc i] reference
- If information is not in documents, explicitly state it's not available
- **USE THE CORRECT DISTRICT/SOURCE NAMES from the document metadata above**

Generate a conversational response with proper document references:""")
        ])

        try:
            logger.info("📝 TEXT-ONLY GENERATION: Calling LLM...")
            response = self.llm.invoke(response_prompt.format_messages())
            response_text = response.content.strip()

            logger.info("📝 TEXT-ONLY GENERATION: LLM response received")
            logger.info(f"📝 TEXT-ONLY GENERATION: Response length: {len(response_text)} chars")
            logger.info(f"📝 TEXT-ONLY GENERATION: Response preview: {response_text[:300]}...")

            # Check if response indicates no information found
            no_info_indicators = [
                "don't have", "do not have", "isn't available", "is not available",
                "no information", "cannot provide", "can't provide", "not in the retrieved"
            ]
            if any(indicator in response_text.lower() for indicator in no_info_indicators):
                logger.warning("⚠️ TEXT-ONLY GENERATION: Response indicates NO INFORMATION FOUND")
                logger.warning("⚠️ This could mean:")
                logger.warning("   1. Retrieved documents don't contain relevant content")
                logger.warning("   2. Extracted text from visual documents is empty/poor quality")
                logger.warning("   3. Query doesn't match document content")

            # Validate and enhance the response
            final_response = self._validate_and_enhance_response(
                response_text,
                documents,
                query,
                filters
            )

            logger.info("=" * 80)
            return final_response

        except Exception as e:
            logger.error(f"❌ TEXT-ONLY GENERATION: Error during generation: {e}")
            traceback.print_exc()
            return "I apologize, but I encountered an error generating the response."

    def _generate_conversational_response_without_docs(
        self,
        query: str,
        messages: List[Any]
    ) -> str:
        """
        Generate conversational response using only LLM knowledge.

        Args:
            query: User query
            messages: Conversation history

        Returns:
            LLM response
        """
        logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")

        # Build conversation context
        conversation_context = ""
        for msg in messages[-6:]:
            if isinstance(msg, HumanMessage):
                conversation_context += f"User: {msg.content}\n"
            elif isinstance(msg, AIMessage):
                conversation_context += f"Assistant: {msg.content}\n"

        # Create response prompt
        response_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="""You are a helpful audit report assistant.

RULES:
1. Politely explain that no relevant documents were found with high enough similarity
2. Suggest rephrasing the query or being more specific
3. Suggest checking if the information might be in a different year/source/district
4. Stay professional but friendly

TONE: Professional but friendly, like talking to a colleague."""),
            HumanMessage(content=f"""Current Question: {query}

Conversation History:
{conversation_context}

Note: No relevant documents were found with high enough similarity scores.

Generate a helpful response:""")
        ])

        try:
            response = self.llm.invoke(response_prompt.format_messages())
            return response.content.strip()
        except Exception as e:
            logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
            return "I couldn't find relevant documents for your query. Please try rephrasing or being more specific."

    def _build_conversation_context_for_response(self, messages: List[Any]) -> str:
        """Build conversation history context for response generation"""
        context_lines = []
        for msg in messages[-6:]:
            if isinstance(msg, HumanMessage):
                context_lines.append(f"User: {msg.content}")
            elif isinstance(msg, AIMessage):
                context_lines.append(f"Assistant: {msg.content}")

        return "\n".join(context_lines) if context_lines else "No previous conversation."

    def _build_visual_document_details(self, documents: List[Any]) -> str:
        """Build detailed document information for response generation"""
        if not documents:
            logger.warning("📋 BUILD_DETAILS: No documents to process!")
            return "No documents retrieved."

        logger.info(f"📋 BUILD_DETAILS: Processing {len(documents)} documents for LLM context")

        details = []
        docs_with_content = 0
        docs_without_content = 0
        total_content_length = 0

        for i, doc in enumerate(documents[:15], 1):
            metadata = getattr(doc, 'metadata', {})
            content = getattr(doc, 'page_content', '')
            score = getattr(doc, 'score', 0.0)

            filename = metadata.get('filename', 'Unknown')
            year = metadata.get('year', 'Unknown')
            district = metadata.get('district', 'Unknown')
            source = metadata.get('source', 'Unknown')
            page = metadata.get('page_number', metadata.get('page', 'Unknown'))

            # Visual metadata
            num_tiles = metadata.get('num_tiles')
            num_visual_tokens = metadata.get('num_visual_tokens')

            doc_info = f"[Doc {i}] (Score: {score:.3f})"
            doc_info += f"\n  Filename: {filename}"
            doc_info += f"\n  Year: {year}"
            doc_info += f"\n  Source: {source}"
            if district != 'Unknown':
                doc_info += f"\n  District: {district}"
            doc_info += f"\n  Page: {page}"

            # Add visual metadata if available
            if num_tiles or num_visual_tokens:
                doc_info += f"\n  Visual: {num_tiles} tiles, {num_visual_tokens} tokens"

            # Add content preview
            if content and content.strip():
                doc_info += f"\n  Content: {content[:500]}{'...' if len(content) > 500 else ''}"
                docs_with_content += 1
                total_content_length += len(content)
            else:
                doc_info += "\n  Content: (No text extracted - image-only page)"
                docs_without_content += 1

            details.append(doc_info)

        # Log summary
        logger.info("📋 BUILD_DETAILS SUMMARY:")
        logger.info(f"   - Documents with text content: {docs_with_content}")
        logger.info(f"   - Documents WITHOUT text (image-only): {docs_without_content}")
        logger.info(f"   - Total text content length: {total_content_length} chars")

        if docs_without_content > docs_with_content:
            logger.warning("⚠️ BUILD_DETAILS: Most documents have NO TEXT CONTENT!")
            logger.warning("⚠️ This is likely why the LLM says 'no information available'")

        return "\n\n".join(details) if details else "No document details available."

    def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
        """Extract correct district/source names from documents to correct misspellings"""
        districts = set()
        sources = set()
        years = set()

        for doc in documents:
            metadata = getattr(doc, 'metadata', {})
            if metadata.get('district'):
                districts.add(str(metadata['district']))
            if metadata.get('source'):
                sources.add(str(metadata['source']))
            if metadata.get('year'):
                years.add(str(metadata['year']))

        result = []
        if districts:
            result.append(f"Districts: {', '.join(sorted(districts))}")
        if sources:
            result.append(f"Sources: {', '.join(sorted(sources))}")
        if years:
            result.append(f"Years: {', '.join(sorted(years))}")

        if result:
            return "\n".join(result) + "\n\nIMPORTANT: Use these EXACT spellings in your response."
        return "No metadata available."

    def _validate_and_enhance_response(
        self,
        response: str,
        documents: List[Any],
        query: str,
        filters: Dict[str, Any] = None
    ) -> str:
        """
        Validate response and add warnings about data coverage gaps.

        Compares REQUESTED filters against RETRIEVED document metadata.

        Args:
            response: LLM-generated response
            documents: Retrieved documents
            query: User query
            filters: Applied filters (year, district, etc.)

        Returns:
            Response with optional warnings appended
        """
        # Extract years and districts from RETRIEVED documents
        doc_years = set()
        doc_districts = set()

        for doc in documents:
            metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else {}
            if isinstance(metadata, dict):
                if metadata.get('year'):
                    doc_years.add(str(metadata['year']))
                if metadata.get('district'):
                    doc_districts.add(str(metadata['district']))

        logger.info(f"🔍 VALIDATION: Retrieved docs cover years={doc_years}, districts={doc_districts}")

        warnings = []

        # Get REQUESTED filters
        requested_years = set()
        requested_districts = set()

        if filters:
            if filters.get('year'):
                if isinstance(filters['year'], list):
                    requested_years = set(str(y) for y in filters['year'])
                else:
                    requested_years = {str(filters['year'])}
            if filters.get('district'):
                if isinstance(filters['district'], list):
                    requested_districts = set(str(d) for d in filters['district'])
                else:
                    requested_districts = {str(filters['district'])}

        logger.info(f"🔍 VALIDATION: Requested years={requested_years}, districts={requested_districts}")

        # Compare requested vs retrieved YEARS
        if requested_years and doc_years:
            missing_years = requested_years - doc_years
            if missing_years:
                warnings.append(
                    f"You requested data for years {', '.join(sorted(requested_years))}, "
                    f"but the retrieved documents only cover {', '.join(sorted(doc_years))}. "
                    f"Data for {', '.join(sorted(missing_years))} may not be available."
                )

        # Compare requested vs retrieved DISTRICTS
        if requested_districts and doc_districts:
            # Normalize for comparison (case-insensitive)
            requested_lower = {d.lower() for d in requested_districts}
            doc_lower = {d.lower() for d in doc_districts}
            missing_lower = requested_lower - doc_lower

            if missing_lower:
                missing_districts = [d for d in requested_districts if d.lower() in missing_lower]
                warnings.append(
                    f"You requested data for districts {', '.join(sorted(requested_districts))}, "
                    f"but the retrieved documents only cover {', '.join(sorted(doc_districts))}. "
                    f"Data for {', '.join(sorted(missing_districts))} may not be available."
                )

        # Add warnings to response if any
        if warnings and "⚠️" not in response:
            warning_text = "\n\n⚠️ **Note:** " + " ".join(warnings)
            response = response + warning_text
            logger.info("🔍 VALIDATION: Added warning about data coverage")

        return response


def get_visual_multi_agent_chatbot() -> VisualMultiAgentChatbot:
    """
    Factory function to create a visual multi-agent chatbot.

    Returns:
        Initialized VisualMultiAgentChatbot
    """
    logger.info("🎨 Creating Visual Multi-Agent Chatbot...")

    # Get Qdrant credentials for ColPali cluster
    qdrant_url = (
        os.environ.get("QDRANT_URL_AKRYL") or
        os.environ.get("DEST_QDRANT_URL") or
        os.environ.get("QDRANT_URL")
    )
    qdrant_api_key = (
        os.environ.get("QDRANT_API_KEY_AKRYL") or
        os.environ.get("DEST_QDRANT_API_KEY") or
        os.environ.get("QDRANT_API_KEY")
    )

    if not qdrant_url or not qdrant_api_key:
        raise ValueError(
            "Visual mode requires Qdrant credentials for the ColPali cluster.\n"
            "Please set one of these in your .env file:\n"
            "  - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
            "  - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
            "  - QDRANT_URL and QDRANT_API_KEY"
        )

    logger.info(f"   Using Qdrant URL: {qdrant_url}")
    logger.info("   Collection: colSmol-500M")
    logger.info(f"   Multi-modal: {MULTIMODAL_ENABLED} (model: {MULTIMODAL_MODEL}, max_images: {MULTIMODAL_MAX_IMAGES})")

    # Create visual search adapter
    visual_search = VisualSearchAdapter(
        qdrant_url=qdrant_url,
        qdrant_api_key=qdrant_api_key,
        collection_name="colSmol-500M"
    )

    # Create multi-agent chatbot with multi-modal enabled
    chatbot = VisualMultiAgentChatbot(
        visual_search=visual_search,
        config_path="src/config/settings.yaml",
        enable_multimodal=MULTIMODAL_ENABLED
    )

    return chatbot
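

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions: OPENAI_API_KEY and Qdrant credentials
    # are set in the environment; the conversational entry point is inherited
    # from BaseMultiAgentChatbot, so the `chat` call below is hypothetical.
    logging.basicConfig(level=logging.INFO)
    bot = get_visual_multi_agent_chatbot()
    # answer = bot.chat("Summarize the key audit findings for 2023")  # hypothetical API
    # print(answer)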