akryldigital committed on
Commit
215981c
·
verified ·
1 Parent(s): efdbd84

delete bug

Browse files
Files changed (1) hide show
  1. src/agents/visual_documents.py +0 -418
src/agents/visual_documents.py DELETED
@@ -1,418 +0,0 @@
1
- """
2
- Visual Document Display Components
3
-
4
- UI components for displaying visual search results with enhanced metadata.
5
- Includes saliency map visualization for tile-aware ColPali embeddings.
6
- """
7
-
8
- import streamlit as st
9
- import pandas as pd
10
- import numpy as np
11
- import logging
12
- from typing import List, Any, Dict, Optional
13
- from collections import Counter
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
def display_visual_document_statistics(sources: List[Any]) -> None:
    """
    Display statistics for visual search results in a bordered box with tables.

    Shows four headline metrics (total chunks, unique files, unique years,
    unique sources) followed by count tables for districts, sources, years
    and files. Nothing is rendered when ``sources`` is empty.

    Args:
        sources: List of VisualSearchResult objects. Only the ``metadata``
            attribute is read (keys ``filename``, ``year``, ``source`` and
            ``district``); a missing or ``None`` ``metadata`` is treated as
            an empty mapping.
    """
    if not sources:
        return

    # Extract statistics
    filenames = []
    years = []
    sources_list = []
    districts = []

    for doc in sources:
        # ``or {}`` also covers results whose ``metadata`` attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        # Coerce to str so the length check / slicing in the file table
        # cannot fail on a None or non-string filename.
        filename = metadata.get('filename', 'Unknown')
        filenames.append('Unknown' if filename is None else str(filename))

        year = metadata.get('year')
        if year:
            years.append(year)

        source = metadata.get('source')
        if source:
            sources_list.append(source)

        # The literal string 'None' is treated the same as "no district".
        district = metadata.get('district')
        if district and district != 'None':
            districts.append(district)

    # Count unique values
    unique_files = len(set(filenames))
    unique_years = len(set(years))
    unique_sources = len(set(sources_list))

    # Newest year first. Year values of mixed types (e.g. 2021 and "2021")
    # cannot be compared with each other, so fall back to ordering by text.
    year_counts = Counter(years)
    try:
        year_rows = sorted(year_counts.items(), reverse=True)
    except TypeError:
        year_rows = sorted(
            year_counts.items(), key=lambda item: str(item[0]), reverse=True
        )

    # Long filenames are shortened to 30 characters plus an ellipsis.
    file_rows = [
        (name[:30] + "..." if len(name) > 30 else name, count)
        for name, count in Counter(filenames).most_common(10)
    ]

    # Create bordered container
    with st.container():
        st.markdown("""
        <style>
        .stats-container {
            border: 2px solid #e0e0e0;
            border-radius: 10px;
            padding: 20px;
            margin: 10px 0;
            background-color: #f9f9f9;
        }
        </style>
        """, unsafe_allow_html=True)

        # NOTE(review): the opening and closing <div> are emitted as separate
        # markdown elements; confirm visually that the border really wraps
        # the widgets rendered in between.
        st.markdown('<div class="stats-container">', unsafe_allow_html=True)
        st.markdown("### 📊 Retrieval Statistics")

        # Metrics in columns
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Chunks", len(sources))
        with col2:
            st.metric("Unique Files", unique_files)
        with col3:
            st.metric("Unique Years", unique_years if unique_years > 0 else "N/A")
        with col4:
            st.metric("Unique Sources", unique_sources if unique_sources > 0 else "N/A")

        st.markdown("---")

        # Distribution tables in columns
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            # District distribution (top 10)
            if districts:
                _render_count_table(
                    "**🏘️ Districts**", "District", Counter(districts).most_common(10)
                )

        with col2:
            # Source distribution
            if sources_list:
                _render_count_table(
                    "**🏛️ Sources**", "Source", Counter(sources_list).most_common()
                )

        with col3:
            # Year distribution
            if years:
                _render_count_table("**📅 Years**", "Year", year_rows)

        with col4:
            # File distribution (top 10); always shown because sources is non-empty
            _render_count_table("**📄 Files**", "File", file_rows)

        st.markdown('</div>', unsafe_allow_html=True)


def _render_count_table(heading: str, label: str, rows: List[Any]) -> None:
    """Render a markdown heading followed by a two-column (label, Count) table."""
    st.markdown(heading)
    table = pd.DataFrame([{label: value, "Count": count} for value, count in rows])
    st.dataframe(table, hide_index=True, use_container_width=True)
130
-
131
-
132
def display_visual_document_details(
    sources: List[Any],
    show_images: bool = False,
    show_saliency: bool = False,
    qdrant_client: Any = None,
    collection_name: Optional[str] = None,
    query_embedding: Optional[np.ndarray] = None,
    saliency_alpha: float = 0.4,
    saliency_colormap: str = 'hot',
    saliency_threshold: int = 50
) -> None:
    """
    Display detailed information for each visual search result.

    Each result gets its own expander (the first one starts expanded) with a
    metadata column on the left and the page image or saliency map on the
    right. When a saliency map cannot be produced the plain page image is
    shown instead.

    Args:
        sources: List of VisualSearchResult objects. ``metadata``, ``score``,
            ``id`` and ``page_content`` are read via ``getattr``; a missing
            or ``None`` ``metadata`` is treated as an empty mapping.
        show_images: Whether to display document images (from Cloudinary)
        show_saliency: Whether to generate and display saliency maps
            (only takes effect together with ``show_images``)
        qdrant_client: Qdrant client (required for saliency)
        collection_name: Qdrant collection name (required for saliency)
        query_embedding: Query embedding for saliency computation
        saliency_alpha: Saliency overlay transparency (0.0-1.0)
        saliency_colormap: Matplotlib colormap for saliency (default: 'hot')
        saliency_threshold: Threshold percentile for saliency (default: 50)
    """
    st.markdown("### 📄 Document Details")

    # Import saliency functions only when needed
    if show_saliency:
        from .saliency import generate_tile_aware_saliency, can_generate_saliency

    for i, doc in enumerate(sources):
        doc_num = i + 1
        # ``or {}`` also covers results whose ``metadata`` attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        # Get basic metadata
        filename = metadata.get('filename', 'Unknown')
        page_number = metadata.get('page_number', '?')
        year = metadata.get('year', 'Unknown')
        source = metadata.get('source', 'Unknown')
        district = metadata.get('district')
        # A ``score`` attribute that exists but is None would break the
        # ``:.3f`` formatting below, so treat it like a missing score.
        raw_score = getattr(doc, 'score', 0.0)
        score = 0.0 if raw_score is None else raw_score

        # Get visual-specific metadata
        num_tiles = metadata.get('num_tiles')
        tile_rows = metadata.get('tile_rows')
        tile_cols = metadata.get('tile_cols')
        num_visual_tokens = metadata.get('num_visual_tokens')
        original_width = metadata.get('original_width')
        original_height = metadata.get('original_height')
        resized_width = metadata.get('resized_width')
        resized_height = metadata.get('resized_height')

        # Get image URLs
        original_url = metadata.get('original_url')
        resized_url = metadata.get('resized_url')
        page_url = metadata.get('page')  # Fallback

        # Get point_id for saliency (doc.id first, then metadata). Pick the
        # first candidate that is actually present rather than the first
        # truthy one, so a legitimate id of 0 is not discarded.
        point_id = next(
            (
                candidate
                for candidate in (
                    getattr(doc, 'id', None),
                    metadata.get('point_id'),
                    metadata.get('_id'),
                )
                if candidate is not None and candidate != ''
            ),
            None,
        )

        # Debug logging for saliency
        if show_saliency:
            logger.debug(
                "Doc %d: point_id=%s, has_tiles=%s",
                doc_num, point_id, metadata.get('num_tiles') is not None,
            )

        # Build title; the ellipsis is added only when the name is truncated.
        filename_text = str(filename)
        if len(filename_text) > 50:
            short_name = filename_text[:50] + "..."
        else:
            short_name = filename_text
        score_text = f" (Score: {score:.3f})"
        title = f"📄 Document {doc_num}: {short_name}{score_text}"

        with st.expander(title, expanded=(i == 0)):  # Expand first result
            # Two-column layout: Metadata (left) and Image (right)
            col_meta, col_image = st.columns([1, 2])

            with col_meta:
                st.markdown("### 📋 Metadata")

                # Basic metadata
                st.write(f"📄 **File:** {filename}")
                st.write(f"🏛️ **Source:** {source}")
                st.write(f"📅 **Year:** {year}")
                st.write(f"📖 **Page:** {page_number}")

                if district and district != 'None':
                    st.write(f"📍 **District:** {district}")

                # Relevance score
                st.markdown("---")
                st.markdown("### 🎯 Relevance")
                score_color = "🟢" if score > 0.7 else "🟡" if score > 0.5 else "🔴"
                st.markdown(f"**Score:** {score_color} **{score:.3f}**")

                # Visual metadata (if available)
                if num_tiles or num_visual_tokens:
                    st.markdown("---")
                    st.markdown("### 🎨 Visual Metadata")

                    if num_tiles:
                        st.write(f"🔲 **Tiles:** {num_tiles} ({tile_rows}×{tile_cols})")
                    if num_visual_tokens:
                        st.write(f"🔢 **Visual Tokens:** {num_visual_tokens}")
                    if original_width and original_height:
                        st.write(f"📐 **Original Size:** {original_width}×{original_height}")
                    if resized_width and resized_height:
                        st.write(f"📐 **Resized Size:** {resized_width}×{resized_height}")

                    processing_version = metadata.get('processing_version')
                    if processing_version:
                        st.write(f"⚙️ **Processing:** {processing_version}")

                # Text content preview. This section already lives inside the
                # per-document expander; Streamlit versions that forbid
                # nesting expanders raise an exception for an inner
                # st.expander, so a plain heading is used instead.
                content = getattr(doc, 'page_content', '')
                st.markdown("---")
                if content:
                    st.markdown("**📝 Extracted Text**")
                    st.text_area(
                        "Content",
                        value=content[:500] + ("..." if len(content) > 500 else ""),
                        height=150,
                        disabled=True,
                        label_visibility="collapsed",
                        key=f"visual_doc_text_{i}"
                    )
                else:
                    st.caption("_No text extracted (image-only page)_")

                # Show image URLs under text (plain heading for the same
                # no-nested-expander reason as above)
                if original_url and resized_url:
                    st.markdown("**🔗 Image URLs**")
                    st.markdown(f"**Original:** [{original_url}]({original_url})")
                    st.markdown(f"**Resized (for embeddings):** [{resized_url}]({resized_url})")

            with col_image:
                st.markdown("### 📸 Document Page")

                # Tracks whether a saliency map was shown; if not, the plain
                # page image is rendered below.
                saliency_generated = False

                if show_saliency and show_images:
                    # Everything saliency generation needs, in report order
                    requirements = {
                        "qdrant_client": qdrant_client is not None,
                        "collection_name": collection_name is not None,
                        "query_embedding": query_embedding is not None,
                        "point_id": point_id is not None,
                        "tile_metadata": can_generate_saliency(metadata),
                    }
                    missing = [name for name, ok in requirements.items() if not ok]

                    if missing:
                        logger.warning(
                            "Doc %d: Saliency unavailable, missing: %s", doc_num, missing
                        )
                        # Only these two causes are surfaced to the user.
                        if "tile_metadata" in missing:
                            st.caption("_Saliency unavailable: missing tile metadata_")
                        elif "point_id" in missing:
                            st.caption("_Saliency unavailable: missing point_id_")
                    else:
                        try:
                            with st.spinner(f"🔥 Generating saliency map for Doc {doc_num}..."):
                                # Convert query embedding if needed
                                query_emb = query_embedding
                                if hasattr(query_emb, 'cpu'):
                                    query_emb = query_emb.cpu().float().numpy()
                                if query_emb.ndim == 3:
                                    query_emb = query_emb.squeeze(0)  # Remove batch dimension

                                logger.info(
                                    "🔥 Generating saliency for doc %d: point_id=%s, colormap=%s",
                                    doc_num, point_id, saliency_colormap,
                                )

                                saliency_img = generate_tile_aware_saliency(
                                    qdrant_client=qdrant_client,
                                    collection_name=collection_name,
                                    point_id=point_id,
                                    query_embedding=query_emb,
                                    alpha=saliency_alpha,
                                    colormap=saliency_colormap,
                                    threshold_percentile=saliency_threshold
                                )

                                # Explicit None check: truth-testing an image
                                # returned as a multi-element ndarray raises
                                # ValueError.
                                if saliency_img is not None:
                                    # Display saliency map
                                    st.image(
                                        saliency_img,
                                        width=700,
                                        caption=f"🔥 Saliency Map - Page {page_number}",
                                    )
                                    saliency_generated = True
                                    logger.info("✅ Saliency map displayed for doc %d", doc_num)
                                else:
                                    logger.warning(
                                        "Saliency generation returned None for doc %d", doc_num
                                    )
                                    st.caption("_Saliency map could not be generated_")
                        except Exception as e:
                            # Best effort: a saliency failure must not prevent
                            # the remaining results from rendering.
                            logger.error(f"Saliency generation failed for doc {doc_num}: {e}")
                            import traceback
                            logger.debug(traceback.format_exc())
                            st.warning(f"⚠️ Saliency generation failed: {str(e)[:100]}")

                # Display original image if saliency wasn't generated
                if show_images and not saliency_generated:
                    # Use ORIGINAL image (not resized) for display
                    image_url = original_url or resized_url or page_url

                    if image_url and isinstance(image_url, str) and image_url.startswith('http'):
                        try:
                            # Use width parameter for medium-sized image
                            st.image(image_url, width=700, caption=f"Page {page_number}")
                        except Exception as e:
                            st.error(f"Failed to load image: {e}")
                    else:
                        st.info("No image URL available")
                elif not show_images:
                    st.info("Enable image display in settings to view document pages")
344
-
345
-
346
def display_visual_search_results(
    sources: List[Any],
    show_statistics: bool = True,
    show_images: bool = False,
    show_saliency: bool = False,
    qdrant_client: Any = None,
    collection_name: Optional[str] = None,
    query_embedding: Optional[np.ndarray] = None,
    saliency_alpha: float = 0.4,
    saliency_colormap: str = 'hot',
    saliency_threshold: int = 50,
    max_display: int = 20
) -> None:
    """
    Display visual search results with statistics and details.

    Renders a summary line, an optional statistics panel computed over all
    results, and the per-document detail view for the first ``max_display``
    results. An informational message is shown when ``sources`` is empty.

    Args:
        sources: List of VisualSearchResult objects
        show_statistics: Whether to show statistics
        show_images: Whether to show document images
        show_saliency: Whether to generate and display saliency maps
        qdrant_client: Qdrant client (required for saliency)
        collection_name: Qdrant collection name (required for saliency)
        query_embedding: Query embedding for saliency computation
        saliency_alpha: Saliency overlay transparency (0.0-1.0)
        saliency_colormap: Matplotlib colormap for saliency (default: 'hot')
        saliency_threshold: Threshold percentile for saliency (default: 50)
        max_display: Maximum number of documents to display in detail;
            negative values are treated as 0
    """
    if not sources:
        st.info("No documents were retrieved for the last query.")
        return

    # A negative limit would slice from the end of the list and overstate
    # the "not shown" count, so clamp it to zero.
    max_display = max(0, max_display)
    total = len(sources)
    hidden = total - max_display

    # Count unique filenames; ``or {}`` covers a ``metadata`` attribute of None.
    unique_filenames = {
        (getattr(doc, 'metadata', None) or {}).get('filename', 'Unknown')
        for doc in sources
    }
    unique_count = len(unique_filenames)

    st.markdown(f"**Found {total} document chunks from {unique_count} unique documents:**")

    if unique_count < total:
        st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {total} chunks from {unique_count} documents.")

    # Show saliency info if enabled
    if show_saliency:
        st.info(f"🔥 **Saliency Maps Enabled**: Showing which image regions are most relevant to your query (using '{saliency_colormap}' colormap)")

    # Show statistics (computed over every result, not just the displayed ones)
    if show_statistics:
        display_visual_document_statistics(sources)
        st.markdown("---")

    # Show detailed results (limit to max_display)
    if hidden > 0:
        st.warning(f"⚠️ Showing top {max_display} of {total} results")

    display_visual_document_details(
        sources[:max_display],
        show_images=show_images,
        show_saliency=show_saliency,
        qdrant_client=qdrant_client,
        collection_name=collection_name,
        query_embedding=query_embedding,
        saliency_alpha=saliency_alpha,
        saliency_colormap=saliency_colormap,
        saliency_threshold=saliency_threshold
    )

    if hidden > 0:
        st.info(f"💡 {hidden} more results not shown")
418
-