Spaces:

akryldigital
/

audit_assistant

Running

App Files Files Community

akryldigital committed on Nov 24, 2025

Commit

b632fe0

verified ·

1 Parent(s): e53ce4e

add VisionRAG ui components

Browse files

Files changed (2) hide show

src/ui_components/styles.py +2 -0
src/ui_components/visual_documents.py +294 -0

src/ui_components/styles.py CHANGED Viewed

@@ -115,3 +115,5 @@ def get_custom_css() -> str:
 </style>
 """


115	</style>
116	"""
117
118	+
119	+

src/ui_components/visual_documents.py ADDED Viewed

	@@ -0,0 +1,294 @@

+"""
+Visual Document Display Components
+UI components for displaying visual search results with enhanced metadata.
+"""
+import streamlit as st
+import pandas as pd
+from typing import List, Any, Dict
+from collections import Counter
+def display_visual_document_statistics(sources: List[Any]) -> None:
+    """
+    Display summary statistics for visual search results inside a bordered box.
+    Renders four headline metrics (total chunks, unique files, unique years,
+    unique sources) followed by district, source, year and file count tables.
+    Nothing is rendered when ``sources`` is empty.
+    Args:
+        sources: List of VisualSearchResult objects. Each is read through its
+            ``metadata`` attribute; a missing or ``None`` value is treated as
+            an empty mapping.
+    """
+    if not sources:
+        return
+    # Collect the raw facet values across all results
+    filenames = []
+    years = []
+    sources_list = []
+    districts = []
+    for doc in sources:
+        # `or {}` also covers results whose metadata attribute exists but is None
+        metadata = getattr(doc, 'metadata', None) or {}
+        # Coerce to str so a None/non-string filename cannot break the
+        # len()/slice truncation in the file table below
+        filenames.append(str(metadata.get('filename') or 'Unknown'))
+        year = metadata.get('year')
+        if year:
+            years.append(year)
+        source = metadata.get('source')
+        if source:
+            sources_list.append(source)
+        district = metadata.get('district')
+        # The literal string 'None' is treated the same as a missing district
+        if district and district != 'None':
+            districts.append(district)
+    # Count unique values
+    unique_files = len(set(filenames))
+    unique_years = len(set(years))
+    unique_sources = len(set(sources_list))
+    # Use a real bordered container. Emitting '<div>' and '</div>' through
+    # separate st.markdown calls does not enclose the elements rendered in
+    # between, because every call is rendered as its own element.
+    try:
+        stats_box = st.container(border=True)
+    except TypeError:
+        # Streamlit releases that predate the `border` keyword
+        stats_box = st.container()
+    with stats_box:
+        st.markdown("### 📊 Retrieval Statistics")
+        # Metrics in columns
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            st.metric("Total Chunks", len(sources))
+        with col2:
+            st.metric("Unique Files", unique_files)
+        with col3:
+            st.metric("Unique Years", unique_years if unique_years > 0 else "N/A")
+        with col4:
+            st.metric("Unique Sources", unique_sources if unique_sources > 0 else "N/A")
+        st.markdown("---")
+        # Distribution tables in columns
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            # District distribution (top 10)
+            if districts:
+                district_counts = Counter(districts)
+                st.markdown("**🏘️ Districts**")
+                district_df = pd.DataFrame([
+                    {"District": dist, "Count": count}
+                    for dist, count in district_counts.most_common(10)
+                ])
+                st.dataframe(district_df, hide_index=True, use_container_width=True)
+        with col2:
+            # Source distribution (all sources, most frequent first)
+            if sources_list:
+                source_counts = Counter(sources_list)
+                st.markdown("**🏛️ Sources**")
+                source_df = pd.DataFrame([
+                    {"Source": src, "Count": count}
+                    for src, count in source_counts.most_common()
+                ])
+                st.dataframe(source_df, hide_index=True, use_container_width=True)
+        with col3:
+            # Year distribution, newest first
+            if years:
+                year_counts = Counter(years)
+                try:
+                    ordered_years = sorted(year_counts.items(), reverse=True)
+                except TypeError:
+                    # Mixed int/str year values are not mutually comparable;
+                    # order them by their text form instead of crashing
+                    ordered_years = sorted(
+                        year_counts.items(),
+                        key=lambda item: str(item[0]),
+                        reverse=True,
+                    )
+                st.markdown("**📅 Years**")
+                year_df = pd.DataFrame([
+                    {"Year": year, "Count": count}
+                    for year, count in ordered_years
+                ])
+                st.dataframe(year_df, hide_index=True, use_container_width=True)
+        with col4:
+            # File distribution (top 10); long names are cut to 30 characters
+            file_counts = Counter(filenames)
+            st.markdown("**📄 Files**")
+            file_df = pd.DataFrame([
+                {"File": filename[:30] + "..." if len(filename) > 30 else filename, "Count": count}
+                for filename, count in file_counts.most_common(10)
+            ])
+            st.dataframe(file_df, hide_index=True, use_container_width=True)
+def display_visual_document_details(sources: List[Any], show_images: bool = False) -> None:
+    """
+    Display detailed information for each visual search result.
+    Each result is rendered in its own expander (only the first is expanded)
+    with a metadata column on the left and a page-image column on the right.
+    Args:
+        sources: List of VisualSearchResult objects. Each is read through its
+            ``metadata``, ``score`` and ``page_content`` attributes; missing
+            attributes fall back to defaults.
+        show_images: Whether to render the page image from a URL stored in the
+            metadata (``original_url``, then ``resized_url``, then ``page``).
+    """
+    st.markdown("### 📄 Document Details")
+    for i, doc in enumerate(sources):
+        # `or {}` also covers results whose metadata attribute exists but is None
+        metadata = getattr(doc, 'metadata', None) or {}
+        # Get basic metadata
+        # Coerce to str so len()/slicing for the title cannot fail on None
+        filename = str(metadata.get('filename') or 'Unknown')
+        page_number = metadata.get('page_number', '?')
+        year = metadata.get('year', 'Unknown')
+        source = metadata.get('source', 'Unknown')
+        district = metadata.get('district')
+        # A None or non-numeric score would break the ':.3f' formatting below
+        raw_score = getattr(doc, 'score', None)
+        try:
+            score = float(raw_score) if raw_score is not None else 0.0
+        except (TypeError, ValueError):
+            score = 0.0
+        # Get visual-specific metadata
+        num_tiles = metadata.get('num_tiles')
+        tile_rows = metadata.get('tile_rows')
+        tile_cols = metadata.get('tile_cols')
+        num_visual_tokens = metadata.get('num_visual_tokens')
+        original_width = metadata.get('original_width')
+        original_height = metadata.get('original_height')
+        resized_width = metadata.get('resized_width')
+        resized_height = metadata.get('resized_height')
+        # Get image URLs
+        original_url = metadata.get('original_url')
+        resized_url = metadata.get('resized_url')
+        page_url = metadata.get('page')  # Fallback
+        # Build title; the ellipsis is added only when the name is actually cut
+        short_name = filename[:50] + "..." if len(filename) > 50 else filename
+        score_text = f" (Score: {score:.3f})"
+        title = f"📄 Document {i+1}: {short_name}{score_text}"
+        with st.expander(title, expanded=(i == 0)):  # Expand first result
+            # Two-column layout: Metadata (left) and Image (right)
+            col_meta, col_image = st.columns([1, 2])
+            with col_meta:
+                st.markdown("### 📋 Metadata")
+                # Basic metadata
+                st.write(f"📄 **File:** {filename}")
+                st.write(f"🏛️ **Source:** {source}")
+                st.write(f"📅 **Year:** {year}")
+                st.write(f"📖 **Page:** {page_number}")
+                # The literal string 'None' is treated the same as a missing district
+                if district and district != 'None':
+                    st.write(f"📍 **District:** {district}")
+                # Relevance score: green above 0.7, yellow above 0.5, red otherwise
+                st.markdown("---")
+                st.markdown("### 🎯 Relevance")
+                score_color = "🟢" if score > 0.7 else "🟡" if score > 0.5 else "🔴"
+                st.markdown(f"**Score:** {score_color} **{score:.3f}**")
+                # Visual metadata (if available)
+                if num_tiles or num_visual_tokens:
+                    st.markdown("---")
+                    st.markdown("### 🎨 Visual Metadata")
+                    if num_tiles:
+                        if tile_rows and tile_cols:
+                            st.write(f"🔲 **Tiles:** {num_tiles} ({tile_rows}×{tile_cols})")
+                        else:
+                            # Avoid printing "None×None" when the grid shape is unknown
+                            st.write(f"🔲 **Tiles:** {num_tiles}")
+                    if num_visual_tokens:
+                        st.write(f"🔢 **Visual Tokens:** {num_visual_tokens}")
+                    if original_width and original_height:
+                        st.write(f"📐 **Original Size:** {original_width}×{original_height}")
+                    if resized_width and resized_height:
+                        st.write(f"📐 **Resized Size:** {resized_width}×{resized_height}")
+                    processing_version = metadata.get('processing_version')
+                    if processing_version:
+                        st.write(f"⚙️ **Processing:** {processing_version}")
+                # Text content preview (first 500 characters)
+                content = getattr(doc, 'page_content', '')
+                if content:
+                    st.markdown("---")
+                    with st.expander("📝 Extracted Text", expanded=True):
+                        st.text_area(
+                            "Content",
+                            value=content[:500] + ("..." if len(content) > 500 else ""),
+                            height=150,
+                            disabled=True,
+                            label_visibility="collapsed",
+                            # Widget keys must be unique, hence the result index
+                            key=f"visual_doc_text_{i}"
+                        )
+                else:
+                    st.markdown("---")
+                    st.caption("_No text extracted (image-only page)_")
+                # Show image URLs under text (only when both are present)
+                if original_url and resized_url:
+                    with st.expander("🔗 Image URLs", expanded=True):
+                        st.markdown(f"**Original:** [{original_url}]({original_url})")
+                        st.markdown(f"**Resized (for embeddings):** [{resized_url}]({resized_url})")
+            with col_image:
+                st.markdown("### 📄 Document Page")
+                # Display image (if available and requested)
+                if show_images:
+                    # Prefer the ORIGINAL image (not resized) for display
+                    image_url = original_url or resized_url or page_url
+                    if image_url and isinstance(image_url, str) and image_url.startswith('http'):
+                        try:
+                            # Use width parameter for medium-sized image
+                            st.image(image_url, width=750, caption=f"Page {page_number}")
+                        except Exception as e:
+                            # Report the failure in the UI and continue with the next result
+                            st.error(f"Failed to load image: {e}")
+                    else:
+                        st.info("No image URL available")
+                else:
+                    st.info("Enable image display in settings to view document pages")
+def display_visual_search_results(
+    sources: List[Any],
+    show_statistics: bool = True,
+    show_images: bool = False,
+    max_display: int = 20
+) -> None:
+    """
+    Display visual search results with statistics and details.
+    Shows a summary line, optionally the statistics panel, then the detail view
+    for at most ``max_display`` results. An info message is shown instead when
+    ``sources`` is empty.
+    Args:
+        sources: List of VisualSearchResult objects
+        show_statistics: Whether to show statistics
+        show_images: Whether to show document images
+        max_display: Maximum number of documents to display in detail;
+            negative values are treated as 0
+    """
+    if not sources:
+        st.info("No documents were retrieved for the last query.")
+        return
+    # A negative limit would slice from the end of the list and yield a
+    # "not shown" count larger than the result set, so clamp it to zero
+    max_display = max(0, max_display)
+    # Count unique filenames; `or {}` covers metadata that is present but None
+    unique_filenames = {
+        (getattr(doc, 'metadata', None) or {}).get('filename', 'Unknown')
+        for doc in sources
+    }
+    st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents:**")
+    if len(unique_filenames) < len(sources):
+        st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
+    # Show statistics (computed over all results, not only the displayed ones)
+    if show_statistics:
+        display_visual_document_statistics(sources)
+        st.markdown("---")
+    # Show detailed results (limit to max_display)
+    display_sources = sources[:max_display]
+    hidden_count = len(sources) - len(display_sources)
+    if hidden_count > 0:
+        st.warning(f"⚠️ Showing top {max_display} of {len(sources)} results")
+    display_visual_document_details(display_sources, show_images=show_images)
+    if hidden_count > 0:
+        st.info(f"💡 {hidden_count} more results not shown")