Spaces:
Running
Running
Add fallback policy to multi-modal when metadata:text is empty
Browse files
src/agents/visual_chatbot.py
CHANGED
|
@@ -233,7 +233,7 @@ Please provide a detailed answer based on the documents above. Cite your sources
|
|
| 233 |
{"role": "user", "content": user_prompt}
|
| 234 |
]
|
| 235 |
|
| 236 |
-
response = self.llm.invoke(messages)
|
| 237 |
return response.content
|
| 238 |
|
| 239 |
|
|
@@ -242,7 +242,7 @@ def get_visual_chatbot() -> VisualChatbot:
|
|
| 242 |
Factory function to create a visual chatbot.
|
| 243 |
|
| 244 |
Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
|
| 245 |
-
|
| 246 |
|
| 247 |
Returns:
|
| 248 |
Initialized VisualChatbot
|
|
@@ -262,6 +262,9 @@ def get_visual_chatbot() -> VisualChatbot:
|
|
| 262 |
os.environ.get("QDRANT_API_KEY") # Fallback
|
| 263 |
)
|
| 264 |
|
|
|
|
|
|
|
|
|
|
| 265 |
if not qdrant_url or not qdrant_api_key:
|
| 266 |
raise ValueError(
|
| 267 |
"Visual mode requires Qdrant credentials for the ColPali cluster.\n"
|
|
@@ -269,17 +272,17 @@ def get_visual_chatbot() -> VisualChatbot:
|
|
| 269 |
" - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
|
| 270 |
" - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
|
| 271 |
" - QDRANT_URL and QDRANT_API_KEY\n\n"
|
| 272 |
-
"
|
| 273 |
)
|
| 274 |
|
| 275 |
logger.info(f" Using Qdrant URL: {qdrant_url}")
|
| 276 |
-
logger.info(f" Collection:
|
| 277 |
|
| 278 |
# Create visual search adapter with explicit credentials
|
| 279 |
visual_search = VisualSearchAdapter(
|
| 280 |
qdrant_url=qdrant_url,
|
| 281 |
qdrant_api_key=qdrant_api_key,
|
| 282 |
-
collection_name=
|
| 283 |
)
|
| 284 |
|
| 285 |
# Get LLM config from settings.yaml
|
|
|
|
| 233 |
{"role": "user", "content": user_prompt}
|
| 234 |
]
|
| 235 |
|
| 236 |
+
response = self.llm.invoke(messages, prompt_name="visual_simple_answer")
|
| 237 |
return response.content
|
| 238 |
|
| 239 |
|
|
|
|
| 242 |
Factory function to create a visual chatbot.
|
| 243 |
|
| 244 |
Uses the same QDRANT_URL and QDRANT_API_KEY as the colpali_colab_package,
|
| 245 |
+
and connects to the collection specified by QDRANT_COLLECTION_VISUAL env var.
|
| 246 |
|
| 247 |
Returns:
|
| 248 |
Initialized VisualChatbot
|
|
|
|
| 262 |
os.environ.get("QDRANT_API_KEY") # Fallback
|
| 263 |
)
|
| 264 |
|
| 265 |
+
# Get collection name from env var (default to colSmol-500M-v2 for new processing)
|
| 266 |
+
collection_name = os.environ.get("QDRANT_COLLECTION_VISUAL", "colSmol-500M-v2")
|
| 267 |
+
|
| 268 |
if not qdrant_url or not qdrant_api_key:
|
| 269 |
raise ValueError(
|
| 270 |
"Visual mode requires Qdrant credentials for the ColPali cluster.\n"
|
|
|
|
| 272 |
" - QDRANT_URL_AKRYL and QDRANT_API_KEY_AKRYL\n"
|
| 273 |
" - DEST_QDRANT_URL and DEST_QDRANT_API_KEY\n"
|
| 274 |
" - QDRANT_URL and QDRANT_API_KEY\n\n"
|
| 275 |
+
"And optionally set QDRANT_COLLECTION_VISUAL (default: colSmol-500M-v2)"
|
| 276 |
)
|
| 277 |
|
| 278 |
logger.info(f" Using Qdrant URL: {qdrant_url}")
|
| 279 |
+
logger.info(f" Collection: {collection_name}")
|
| 280 |
|
| 281 |
# Create visual search adapter with explicit credentials
|
| 282 |
visual_search = VisualSearchAdapter(
|
| 283 |
qdrant_url=qdrant_url,
|
| 284 |
qdrant_api_key=qdrant_api_key,
|
| 285 |
+
collection_name=collection_name
|
| 286 |
)
|
| 287 |
|
| 288 |
# Get LLM config from settings.yaml
|
src/agents/visual_multi_agent_chatbot.py
CHANGED
|
@@ -39,8 +39,13 @@ logger = logging.getLogger(__name__)
|
|
| 39 |
# Multi-modal LLM configuration
|
| 40 |
MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o") # GPT-4o supports vision
|
| 41 |
MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3")) # Top N images by relevance score
|
| 42 |
-
MULTIMODAL_ENABLED = os.environ.get("VISUAL_RAG_MULTIMODAL", "true").lower() == "true" # Toggle for multi-modal mode
|
|
|
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
| 46 |
"""Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
|
|
@@ -77,6 +82,36 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 77 |
super().__init__(config_path)
|
| 78 |
|
| 79 |
logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
|
| 82 |
"""
|
|
@@ -89,9 +124,7 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 89 |
Returns:
|
| 90 |
Result object with .sources and .answer attributes
|
| 91 |
"""
|
| 92 |
-
logger.info(f"🔍 VISUAL RETRIEVAL:
|
| 93 |
-
logger.info(f"🔍 VISUAL RETRIEVAL: Query: '{query}'")
|
| 94 |
-
logger.info(f"🔍 VISUAL RETRIEVAL: Filters: {filters}")
|
| 95 |
|
| 96 |
# Convert filters to visual search format
|
| 97 |
visual_filters = {}
|
|
@@ -110,7 +143,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 110 |
if filters.get("filenames"):
|
| 111 |
visual_filters["filenames"] = filters["filenames"]
|
| 112 |
|
| 113 |
-
logger.info(f"🔍 VISUAL RETRIEVAL: Converted filters: {visual_filters}")
|
| 114 |
|
| 115 |
# Perform visual search
|
| 116 |
try:
|
|
@@ -155,8 +187,6 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 155 |
rag_query = state["rag_query"]
|
| 156 |
filters = state["rag_filters"]
|
| 157 |
|
| 158 |
-
logger.info(f"📝 VISUAL RESPONSE AGENT: Query: '{rag_query}'")
|
| 159 |
-
logger.info(f"📝 VISUAL RESPONSE AGENT: Filters: {filters}")
|
| 160 |
|
| 161 |
try:
|
| 162 |
# Call visual retrieval
|
|
@@ -239,7 +269,8 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 239 |
documents: List[Any],
|
| 240 |
conversation_context: str,
|
| 241 |
correct_names: str,
|
| 242 |
-
filters: Dict[str, Any] = None
|
|
|
|
| 243 |
) -> Optional[str]:
|
| 244 |
"""
|
| 245 |
Generate response using GPT-4o with images (multi-modal).
|
|
@@ -252,11 +283,16 @@ class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
|
| 252 |
conversation_context: Formatted conversation history
|
| 253 |
correct_names: Correct district/source names from metadata
|
| 254 |
filters: Applied filters
|
|
|
|
| 255 |
|
| 256 |
Returns:
|
| 257 |
LLM response string, or None if multi-modal generation failed
|
| 258 |
"""
|
| 259 |
-
if not self.openai_client
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
logger.info("🖼️ Multi-modal disabled, skipping")
|
| 261 |
return None
|
| 262 |
|
|
@@ -454,8 +490,8 @@ Now analyze the images and answer the question:"""
|
|
| 454 |
# Build conversation history context
|
| 455 |
conversation_context = self._build_conversation_context_for_response(messages)
|
| 456 |
|
| 457 |
-
# Build detailed document information
|
| 458 |
-
document_details = self.
|
| 459 |
logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
|
| 460 |
|
| 461 |
# Extract correct names from documents
|
|
@@ -463,17 +499,44 @@ Now analyze the images and answer the question:"""
|
|
| 463 |
logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
|
| 464 |
|
| 465 |
# ============================================================
|
| 466 |
-
#
|
| 467 |
# ============================================================
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
multimodal_response = self._generate_multimodal_response(
|
| 472 |
query=query,
|
| 473 |
documents=documents,
|
| 474 |
conversation_context=conversation_context,
|
| 475 |
correct_names=correct_names,
|
| 476 |
-
filters=filters
|
|
|
|
| 477 |
)
|
| 478 |
|
| 479 |
if multimodal_response:
|
|
@@ -557,7 +620,7 @@ Generate a conversational response with proper document references:""")
|
|
| 557 |
|
| 558 |
try:
|
| 559 |
logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
|
| 560 |
-
response = self.llm.invoke(response_prompt.format_messages())
|
| 561 |
response_text = response.content.strip()
|
| 562 |
|
| 563 |
logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
|
|
@@ -639,7 +702,7 @@ Generate a helpful response:""")
|
|
| 639 |
])
|
| 640 |
|
| 641 |
try:
|
| 642 |
-
response = self.llm.invoke(response_prompt.format_messages())
|
| 643 |
return response.content.strip()
|
| 644 |
except Exception as e:
|
| 645 |
logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
|
|
@@ -719,6 +782,66 @@ Generate a helpful response:""")
|
|
| 719 |
|
| 720 |
return "\n\n".join(details) if details else "No document details available."
|
| 721 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
|
| 723 |
"""Extract correct district/source names from documents to correct misspellings"""
|
| 724 |
districts = set()
|
|
|
|
| 39 |
# Multi-modal LLM configuration
# Model used for multi-modal generation; override with VISUAL_RAG_MODEL.
MULTIMODAL_MODEL = os.environ.get("VISUAL_RAG_MODEL", "gpt-4o")  # GPT-4o supports vision

# Top N images by relevance score; override with VISUAL_RAG_MAX_IMAGES.
try:
    MULTIMODAL_MAX_IMAGES = int(os.environ.get("VISUAL_RAG_MAX_IMAGES", "3"))
except ValueError:
    # A malformed value must not abort the module import: report it and use the default.
    logging.getLogger(__name__).warning(
        "Invalid VISUAL_RAG_MAX_IMAGES=%r; falling back to 3",
        os.environ.get("VISUAL_RAG_MAX_IMAGES"),
    )
    MULTIMODAL_MAX_IMAGES = 3

# Multi-modal mode is hard-coded off here; the VISUAL_RAG_MULTIMODAL environment
# variable is not consulted.
MULTIMODAL_ENABLED = False

# Query rewriting configuration
# By default, SKIP query rewriting for visual RAG (use original query for saliency accuracy)
# Set ENABLE_VISUAL_QUERY_REWRITE=true to enable query rewriting
SKIP_QUERY_REWRITE = os.environ.get("ENABLE_VISUAL_QUERY_REWRITE", "false").lower() != "true"
|
| 49 |
|
| 50 |
class VisualMultiAgentChatbot(BaseMultiAgentChatbot):
|
| 51 |
"""Multi-agent chatbot with visual RAG (ColPali) and multi-modal response generation"""
|
|
|
|
| 82 |
super().__init__(config_path)
|
| 83 |
|
| 84 |
logger.info(f"🎨 Visual Multi-Agent Chatbot initialized (multi-modal: {self.enable_multimodal})")
|
| 85 |
+
logger.info(f"🎨 Query rewriting: {'ENABLED' if not SKIP_QUERY_REWRITE else 'DISABLED (using original query)'}")
|
| 86 |
+
|
| 87 |
+
def _rag_agent(self, state: MultiAgentState) -> MultiAgentState:
    """
    RAG Agent override for Visual RAG.

    By default, SKIPS query rewriting to preserve the original query for
    saliency maps. Set ENABLE_VISUAL_QUERY_REWRITE=true to enable rewriting,
    in which case the call is delegated to the parent implementation.

    Args:
        state: Multi-agent state. When rewriting is skipped this method reads
            "current_query" and "query_context", appends one entry to
            "agent_logs", and writes "rag_query" and "rag_filters".

    Returns:
        The same state object, updated in place; or the parent's
        _rag_agent result when query rewriting is enabled.
    """
    if not SKIP_QUERY_REWRITE:
        # Use parent's query rewriting
        logger.info("🔍 RAG AGENT (Visual): Query rewriting ENABLED")
        return super()._rag_agent(state)

    # Use original query (no rewriting) - better for saliency accuracy
    original_query = state["current_query"]
    logger.info(f"🔍 RAG AGENT (Visual): Using ORIGINAL query (no rewriting): '{original_query}'")

    # Filters are still derived from the query context even though the
    # query text itself is passed through untouched.
    filters = self._build_filters(state["query_context"])
    logger.info(f"🔍 RAG AGENT (Visual): Filters: {filters}")

    state["agent_logs"].append(f"RAG AGENT: Query='{original_query}' (original), Filters={filters}")
    state["rag_query"] = original_query
    state["rag_filters"] = filters
    return state
|
| 115 |
|
| 116 |
def _perform_retrieval(self, query: str, filters: Dict[str, Any]) -> Any:
|
| 117 |
"""
|
|
|
|
| 124 |
Returns:
|
| 125 |
Result object with .sources and .answer attributes
|
| 126 |
"""
|
| 127 |
+
logger.info(f"🔍 VISUAL RETRIEVAL: Searching with {len(filters.get('filenames', []))} filename filters")
|
|
|
|
|
|
|
| 128 |
|
| 129 |
# Convert filters to visual search format
|
| 130 |
visual_filters = {}
|
|
|
|
| 143 |
if filters.get("filenames"):
|
| 144 |
visual_filters["filenames"] = filters["filenames"]
|
| 145 |
|
|
|
|
| 146 |
|
| 147 |
# Perform visual search
|
| 148 |
try:
|
|
|
|
| 187 |
rag_query = state["rag_query"]
|
| 188 |
filters = state["rag_filters"]
|
| 189 |
|
|
|
|
|
|
|
| 190 |
|
| 191 |
try:
|
| 192 |
# Call visual retrieval
|
|
|
|
| 269 |
documents: List[Any],
|
| 270 |
conversation_context: str,
|
| 271 |
correct_names: str,
|
| 272 |
+
filters: Dict[str, Any] = None,
|
| 273 |
+
force_multimodal: bool = False
|
| 274 |
) -> Optional[str]:
|
| 275 |
"""
|
| 276 |
Generate response using GPT-4o with images (multi-modal).
|
|
|
|
| 283 |
conversation_context: Formatted conversation history
|
| 284 |
correct_names: Correct district/source names from metadata
|
| 285 |
filters: Applied filters
|
| 286 |
+
force_multimodal: Force multi-modal even if globally disabled (for auto-fallback)
|
| 287 |
|
| 288 |
Returns:
|
| 289 |
LLM response string, or None if multi-modal generation failed
|
| 290 |
"""
|
| 291 |
+
if not self.openai_client:
|
| 292 |
+
logger.warning("🖼️ Multi-modal: OpenAI client not initialized")
|
| 293 |
+
return None
|
| 294 |
+
|
| 295 |
+
if not self.enable_multimodal and not force_multimodal:
|
| 296 |
logger.info("🖼️ Multi-modal disabled, skipping")
|
| 297 |
return None
|
| 298 |
|
|
|
|
| 490 |
# Build conversation history context
|
| 491 |
conversation_context = self._build_conversation_context_for_response(messages)
|
| 492 |
|
| 493 |
+
# Build detailed document information and check text content availability
|
| 494 |
+
document_details, docs_with_text, docs_without_text = self._build_visual_document_details_with_counts(documents)
|
| 495 |
logger.info(f"💬 VISUAL RESPONSE GENERATION: Document details length: {len(document_details)} chars")
|
| 496 |
|
| 497 |
# Extract correct names from documents
|
|
|
|
| 499 |
logger.info(f"💬 VISUAL RESPONSE GENERATION: Correct names: {correct_names}")
|
| 500 |
|
| 501 |
# ============================================================
|
| 502 |
+
# AUTO-FALLBACK: If most documents lack text, force multi-modal
|
| 503 |
# ============================================================
|
| 504 |
+
use_multimodal = self.enable_multimodal
|
| 505 |
+
force_multimodal = False
|
| 506 |
+
|
| 507 |
+
if docs_without_text > docs_with_text and not use_multimodal:
|
| 508 |
+
logger.warning(f"⚠️ AUTO-FALLBACK: {docs_without_text}/{len(documents)} docs lack text content!")
|
| 509 |
+
logger.info("🖼️ AUTO-FALLBACK: Temporarily enabling multi-modal to analyze images...")
|
| 510 |
+
|
| 511 |
+
if self.openai_client is None:
|
| 512 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
| 513 |
+
if api_key:
|
| 514 |
+
self.openai_client = OpenAI(api_key=api_key)
|
| 515 |
+
use_multimodal = True
|
| 516 |
+
force_multimodal = True
|
| 517 |
+
logger.info(f"🖼️ AUTO-FALLBACK: Initialized OpenAI client for {MULTIMODAL_MODEL}")
|
| 518 |
+
else:
|
| 519 |
+
logger.warning("⚠️ AUTO-FALLBACK: Cannot enable multi-modal - OPENAI_API_KEY not set")
|
| 520 |
+
else:
|
| 521 |
+
use_multimodal = True
|
| 522 |
+
force_multimodal = True
|
| 523 |
+
|
| 524 |
+
# ============================================================
|
| 525 |
+
# PHASE 2: Try multi-modal generation (GPT-4o with images)
|
| 526 |
+
# ============================================================
|
| 527 |
+
if use_multimodal:
|
| 528 |
+
if force_multimodal:
|
| 529 |
+
logger.info("🖼️ VISUAL RESPONSE GENERATION: Using AUTO-FALLBACK multi-modal (most docs lack text)...")
|
| 530 |
+
else:
|
| 531 |
+
logger.info("🖼️ VISUAL RESPONSE GENERATION: Attempting multi-modal generation (GPT-4o with images)...")
|
| 532 |
|
| 533 |
multimodal_response = self._generate_multimodal_response(
|
| 534 |
query=query,
|
| 535 |
documents=documents,
|
| 536 |
conversation_context=conversation_context,
|
| 537 |
correct_names=correct_names,
|
| 538 |
+
filters=filters,
|
| 539 |
+
force_multimodal=force_multimodal
|
| 540 |
)
|
| 541 |
|
| 542 |
if multimodal_response:
|
|
|
|
| 620 |
|
| 621 |
try:
|
| 622 |
logger.info(f"📝 TEXT-ONLY GENERATION: Calling LLM...")
|
| 623 |
+
response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_rag_answer")
|
| 624 |
response_text = response.content.strip()
|
| 625 |
|
| 626 |
logger.info(f"📝 TEXT-ONLY GENERATION: LLM response received")
|
|
|
|
| 702 |
])
|
| 703 |
|
| 704 |
try:
|
| 705 |
+
response = self.llm.invoke(response_prompt.format_messages(), prompt_name="visual_no_docs_fallback")
|
| 706 |
return response.content.strip()
|
| 707 |
except Exception as e:
|
| 708 |
logger.error(f"❌ RESPONSE GENERATION (NO DOCS): Error: {e}")
|
|
|
|
| 782 |
|
| 783 |
return "\n\n".join(details) if details else "No document details available."
|
| 784 |
|
| 785 |
+
def _build_visual_document_details_with_counts(self, documents: List[Any]) -> tuple:
    """
    Build document details and return counts of docs with/without text.

    For every document the text is taken from the first non-empty of
    `page_content`, `content`, or metadata["text"]. A document counts as
    having text when that value is not blank; at most the first 500
    characters are included in the details string.

    Args:
        documents: Retrieved documents; each may expose `metadata`,
            `page_content` / `content` and `score` attributes.

    Returns:
        Tuple of (document_details_string, docs_with_text_count, docs_without_text_count)
    """
    details = []
    docs_with_content = 0
    docs_without_content = 0
    total_content_length = 0

    logger.info(f"📄 BUILD_DETAILS: Processing {len(documents)} documents for LLM context")

    for i, doc in enumerate(documents, 1):
        metadata = getattr(doc, 'metadata', {}) or {}
        content = getattr(doc, 'page_content', '') or getattr(doc, 'content', '') or metadata.get('text', '')

        # A missing, None or non-numeric score is reported as 0 instead of
        # raising inside the ":.3f" format below.
        try:
            score = float(getattr(doc, 'score', 0) or 0)
        except (TypeError, ValueError):
            score = 0.0

        filename = metadata.get('filename', 'Unknown')
        year = metadata.get('year', 'Unknown')
        source = metadata.get('source', 'Unknown')
        page = metadata.get('page', metadata.get('page_number', 'Unknown'))
        district = metadata.get('district', 'Unknown')
        num_tiles = metadata.get('num_tiles')
        num_visual_tokens = metadata.get('num_visual_tokens')

        parts = [
            f"[Doc {i}] (Score: {score:.3f})",
            f"\n Filename: {filename}",
            f"\n Year: {year}",
            f"\n Source: {source}",
        ]
        if district != 'Unknown':
            parts.append(f"\n District: {district}")
        parts.append(f"\n Page: {page}")

        if num_tiles or num_visual_tokens:
            # Only list the metrics that are actually present, so a missing
            # one is not rendered as "None".
            visual_bits = []
            if num_tiles is not None:
                visual_bits.append(f"{num_tiles} tiles")
            if num_visual_tokens is not None:
                visual_bits.append(f"{num_visual_tokens} tokens")
            parts.append(f"\n Visual: {', '.join(visual_bits)}")

        if content and content.strip():
            ellipsis = '...' if len(content) > 500 else ''
            parts.append(f"\n Content: {content[:500]}{ellipsis}")
            docs_with_content += 1
            total_content_length += len(content)
        else:
            parts.append("\n Content: (No text extracted - image-only page)")
            docs_without_content += 1

        details.append("".join(parts))

    logger.info("📄 BUILD_DETAILS SUMMARY:")
    logger.info(f" - Documents with text content: {docs_with_content}")
    logger.info(f" - Documents WITHOUT text (image-only): {docs_without_content}")
    logger.info(f" - Total text content length: {total_content_length} chars")

    if docs_without_content > docs_with_content:
        logger.warning("⚠️ BUILD_DETAILS: Most documents have NO TEXT CONTENT!")
        logger.warning("⚠️ Auto-fallback to multi-modal will be attempted...")

    details_str = "\n\n".join(details) if details else "No document details available."
    return details_str, docs_with_content, docs_without_content
|
| 844 |
+
|
| 845 |
def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
|
| 846 |
"""Extract correct district/source names from documents to correct misspellings"""
|
| 847 |
districts = set()
|