akryldigital committed on
Commit
b632fe0
·
verified ·
1 Parent(s): e53ce4e

add VisionRAG ui components

Browse files
src/ui_components/styles.py CHANGED
@@ -115,3 +115,5 @@ def get_custom_css() -> str:
115
  </style>
116
  """
117
 
 
 
 
115
  </style>
116
  """
117
 
118
+
119
+
src/ui_components/visual_documents.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visual Document Display Components
3
+
4
+ UI components for displaying visual search results with enhanced metadata.
5
+ """
6
+
7
+ import streamlit as st
8
+ import pandas as pd
9
+ from typing import List, Any, Dict
10
+ from collections import Counter
11
+
12
+
13
def _collect_visual_statistics(sources: List[Any]) -> Dict[str, List[Any]]:
    """
    Collect the metadata values that feed the retrieval statistics panel.

    Args:
        sources: Objects that may expose a ``metadata`` mapping.

    Returns:
        Dict with the keys ``filenames``, ``years``, ``sources`` and
        ``districts``. ``filenames`` holds one entry per document
        (``'Unknown'`` when the filename is missing); the other lists only
        contain values that were actually present.
    """
    collected: Dict[str, List[Any]] = {
        "filenames": [],
        "years": [],
        "sources": [],
        "districts": [],
    }

    for doc in sources:
        # ``metadata`` may be absent or explicitly None; treat both as empty.
        metadata = getattr(doc, 'metadata', None) or {}

        # Coerce to str so the later len()/slicing never sees None.
        collected["filenames"].append(str(metadata.get('filename') or 'Unknown'))

        year = metadata.get('year')
        if year:
            # Normalise to str: mixed int/str years would otherwise make
            # sorted() raise TypeError and be counted as distinct values.
            collected["years"].append(str(year))

        source = metadata.get('source')
        if source:
            collected["sources"].append(source)

        district = metadata.get('district')
        # The literal string 'None' is treated as "no district".
        if district and district != 'None':
            collected["districts"].append(district)

    return collected


def _render_count_table(heading: str, column: str, rows: List[Any]) -> None:
    """Render a heading followed by a two-column ``<column>``/``Count`` table.

    Args:
        heading: Markdown shown above the table.
        column: Name of the value column.
        rows: ``(value, count)`` pairs, already in display order.
    """
    st.markdown(heading)
    table = pd.DataFrame([{column: value, "Count": count} for value, count in rows])
    st.dataframe(table, hide_index=True, use_container_width=True)


def display_visual_document_statistics(sources: List[Any]) -> None:
    """
    Display statistics for visual search results in a bordered box with tables.

    Shows four summary metrics (chunk count, unique files, years, sources)
    followed by per-district, per-source, per-year and per-file count tables.
    Nothing is rendered when ``sources`` is empty.

    Args:
        sources: List of VisualSearchResult objects
    """
    if not sources:
        return

    stats = _collect_visual_statistics(sources)
    filenames = stats["filenames"]
    years = stats["years"]
    sources_list = stats["sources"]
    districts = stats["districts"]

    # Count unique values
    unique_files = len(set(filenames))
    unique_years = len(set(years))
    unique_sources = len(set(sources_list))

    with st.container():
        st.markdown("""
        <style>
        .stats-container {
        border: 2px solid #e0e0e0;
        border-radius: 10px;
        padding: 20px;
        margin: 10px 0;
        background-color: #f9f9f9;
        }
        </style>
        """, unsafe_allow_html=True)

        # NOTE(review): the opening <div> here and the closing </div> at the
        # end are emitted by separate st.markdown calls, so they may not
        # actually wrap the widgets in between -- confirm in the running app.
        st.markdown('<div class="stats-container">', unsafe_allow_html=True)
        st.markdown("### 📊 Retrieval Statistics")

        # Metrics in columns
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Chunks", len(sources))
        with col2:
            st.metric("Unique Files", unique_files)
        with col3:
            st.metric("Unique Years", unique_years if unique_years > 0 else "N/A")
        with col4:
            st.metric("Unique Sources", unique_sources if unique_sources > 0 else "N/A")

        st.markdown("---")

        # Distribution tables in columns
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            # District distribution (top 10)
            if districts:
                _render_count_table(
                    "**🏘️ Districts**",
                    "District",
                    Counter(districts).most_common(10),
                )

        with col2:
            # Source distribution
            if sources_list:
                _render_count_table(
                    "**🏛️ Sources**",
                    "Source",
                    Counter(sources_list).most_common(),
                )

        with col3:
            # Year distribution, newest first
            if years:
                _render_count_table(
                    "**📅 Years**",
                    "Year",
                    sorted(Counter(years).items(), reverse=True),
                )

        with col4:
            # File distribution (top 10); long names are shortened for display
            file_rows = [
                (name[:30] + "..." if len(name) > 30 else name, count)
                for name, count in Counter(filenames).most_common(10)
            ]
            _render_count_table("**📄 Files**", "File", file_rows)

        st.markdown('</div>', unsafe_allow_html=True)
125
+
126
+
127
def display_visual_document_details(sources: List[Any], show_images: bool = False) -> None:
    """
    Display detailed information for each visual search result.

    Each result is rendered in its own expander (the first one expanded) with
    a metadata column on the left and the page image on the right. Nothing is
    rendered when ``sources`` is empty.

    Args:
        sources: List of VisualSearchResult objects
        show_images: Whether to display document images (from Cloudinary)
    """
    if not sources:
        return

    st.markdown("### 📄 Document Details")

    for i, doc in enumerate(sources):
        # ``metadata`` may be absent or explicitly None; treat both as empty.
        metadata = getattr(doc, 'metadata', None) or {}

        # Get basic metadata
        filename = str(metadata.get('filename') or 'Unknown')
        page_number = metadata.get('page_number', '?')
        year = metadata.get('year', 'Unknown')
        source = metadata.get('source', 'Unknown')
        district = metadata.get('district')

        # A missing, None or non-numeric score falls back to 0.0 so the
        # ``:.3f`` formatting below cannot raise.
        raw_score = getattr(doc, 'score', None)
        try:
            score = float(raw_score) if raw_score is not None else 0.0
        except (TypeError, ValueError):
            score = 0.0

        # Get visual-specific metadata
        num_tiles = metadata.get('num_tiles')
        tile_rows = metadata.get('tile_rows')
        tile_cols = metadata.get('tile_cols')
        num_visual_tokens = metadata.get('num_visual_tokens')
        original_width = metadata.get('original_width')
        original_height = metadata.get('original_height')
        resized_width = metadata.get('resized_width')
        resized_height = metadata.get('resized_height')

        # Get image URLs
        original_url = metadata.get('original_url')
        resized_url = metadata.get('resized_url')
        page_url = metadata.get('page')  # Fallback

        # Build title; only add an ellipsis when the name is really cut off.
        short_name = filename[:50] + "..." if len(filename) > 50 else filename
        title = f"📄 Document {i+1}: {short_name} (Score: {score:.3f})"

        with st.expander(title, expanded=(i == 0)):  # Expand first result
            # Two-column layout: Metadata (left) and Image (right)
            col_meta, col_image = st.columns([1, 2])

            with col_meta:
                st.markdown("### 📋 Metadata")

                # Basic metadata
                st.write(f"📄 **File:** {filename}")
                st.write(f"🏛️ **Source:** {source}")
                st.write(f"📅 **Year:** {year}")
                st.write(f"📖 **Page:** {page_number}")

                # The literal string 'None' is treated as "no district".
                if district and district != 'None':
                    st.write(f"📍 **District:** {district}")

                # Relevance score
                st.markdown("---")
                st.markdown("### 🎯 Relevance")
                score_color = "🟢" if score > 0.7 else "🟡" if score > 0.5 else "🔴"
                st.markdown(f"**Score:** {score_color} **{score:.3f}**")

                # Visual metadata (if available)
                if num_tiles or num_visual_tokens:
                    st.markdown("---")
                    st.markdown("### 🎨 Visual Metadata")

                    if num_tiles:
                        if tile_rows and tile_cols:
                            st.write(f"🔲 **Tiles:** {num_tiles} ({tile_rows}×{tile_cols})")
                        else:
                            # Avoid printing "None×None" when the grid is unknown.
                            st.write(f"🔲 **Tiles:** {num_tiles}")
                    if num_visual_tokens:
                        st.write(f"🔢 **Visual Tokens:** {num_visual_tokens}")
                    if original_width and original_height:
                        st.write(f"📐 **Original Size:** {original_width}×{original_height}")
                    if resized_width and resized_height:
                        st.write(f"📏 **Resized Size:** {resized_width}×{resized_height}")

                    processing_version = metadata.get('processing_version')
                    if processing_version:
                        st.write(f"⚙️ **Processing:** {processing_version}")

                # Text content preview (first 500 characters)
                content = getattr(doc, 'page_content', '') or ''
                st.markdown("---")
                if content:
                    # NOTE(review): this expander sits inside the per-document
                    # expander; some Streamlit releases reject nested
                    # expanders -- confirm against the deployed version.
                    with st.expander("📝 Extracted Text", expanded=True):
                        st.text_area(
                            "Content",
                            value=content[:500] + ("..." if len(content) > 500 else ""),
                            height=150,
                            disabled=True,
                            label_visibility="collapsed",
                            key=f"visual_doc_text_{i}"
                        )
                else:
                    st.caption("_No text extracted (image-only page)_")

                # Show image URLs under text; list whichever URLs exist.
                if original_url or resized_url:
                    with st.expander("🔗 Image URLs", expanded=True):
                        if original_url:
                            st.markdown(f"**Original:** [{original_url}]({original_url})")
                        if resized_url:
                            st.markdown(f"**Resized (for embeddings):** [{resized_url}]({resized_url})")

            with col_image:
                st.markdown("### 📄 Document Page")

                # Display image (if available and requested)
                if show_images:
                    # Prefer the ORIGINAL image (not resized) for display
                    image_url = original_url or resized_url or page_url

                    if image_url and isinstance(image_url, str) and image_url.startswith('http'):
                        try:
                            # Use width parameter for medium-sized image
                            st.image(image_url, width=750, caption=f"Page {page_number}")
                        except Exception as e:
                            # UI boundary: report the failure and keep
                            # rendering the remaining documents.
                            st.error(f"Failed to load image: {e}")
                    else:
                        st.info("No image URL available")
                else:
                    st.info("Enable image display in settings to view document pages")
248
+
249
+
250
def display_visual_search_results(
    sources: List[Any],
    show_statistics: bool = True,
    show_images: bool = False,
    max_display: int = 20
) -> None:
    """
    Display visual search results with statistics and details.

    Renders a summary line, an optional statistics panel, and then the
    detailed view for at most ``max_display`` results. When ``sources`` is
    empty only an informational message is shown.

    Args:
        sources: List of VisualSearchResult objects
        show_statistics: Whether to show statistics
        show_images: Whether to show document images
        max_display: Maximum number of documents to display in detail;
            negative values are treated as 0
    """
    if not sources:
        st.info("No documents were retrieved for the last query.")
        return

    # Count unique filenames; ``metadata`` may be absent or None.
    unique_filenames = {
        str((getattr(doc, 'metadata', None) or {}).get('filename') or 'Unknown')
        for doc in sources
    }
    total = len(sources)
    unique_total = len(unique_filenames)

    st.markdown(f"**Found {total} document chunks from {unique_total} unique documents:**")

    if unique_total < total:
        st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {total} chunks from {unique_total} documents.")

    # Show statistics
    if show_statistics:
        display_visual_document_statistics(sources)
        st.markdown("---")

    # Show detailed results (limit to max_display). Clamp at 0 so a negative
    # limit cannot turn the slice into "all but the last N" or inflate the
    # "more results" count below.
    limit = max(0, max_display)
    display_sources = sources[:limit]
    truncated = total > limit

    if truncated:
        st.warning(f"⚠️ Showing top {limit} of {total} results")

    display_visual_document_details(display_sources, show_images=show_images)

    if truncated:
        st.info(f"💡 {total - limit} more results not shown")
294
+