Spaces:

akryldigital
/

audit_assistant

Sleeping

App Files Files Community

akryldigital committed 20 days ago

Commit

215981c

verified ·

1 Parent(s): efdbd84

delete bug

Browse files

Files changed (1) hide show

src/agents/visual_documents.py +0 -418

src/agents/visual_documents.py DELETED Viewed

@@ -1,418 +0,0 @@
-"""
-Visual Document Display Components
-UI components for displaying visual search results with enhanced metadata.
-Includes saliency map visualization for tile-aware ColPali embeddings.
-"""
-import streamlit as st
-import pandas as pd
-import numpy as np
-import logging
-from typing import List, Any, Dict, Optional
-from collections import Counter
-logger = logging.getLogger(__name__)
-def display_visual_document_statistics(sources: List[Any]) -> None:
-    """
-    Display statistics for visual search results in a bordered box with tables.
-    Renders four headline metrics (total chunks, unique files, unique years,
-    unique sources) followed by count tables for districts, sources, years
-    and files. Nothing is rendered when ``sources`` is empty.
-    Args:
-        sources: List of VisualSearchResult objects. Only each object's
-            ``metadata`` mapping is read (keys ``filename``, ``year``,
-            ``source``, ``district``); a missing or ``None`` mapping is
-            treated as empty.
-    """
-    if not sources:
-        return
-    # Extract statistics
-    filenames = []
-    years = []
-    sources_list = []
-    districts = []
-    for doc in sources:
-        # ``or {}`` also covers a metadata attribute that exists but is None.
-        metadata = getattr(doc, 'metadata', None) or {}
-        # A stored None/empty filename falls back to 'Unknown' so the
-        # truncation in the files table always receives a string.
-        filenames.append(str(metadata.get('filename') or 'Unknown'))
-        year = metadata.get('year')
-        if year:
-            years.append(year)
-        source = metadata.get('source')
-        if source:
-            sources_list.append(source)
-        district = metadata.get('district')
-        # The literal string 'None' is skipped as well as falsy values.
-        if district and district != 'None':
-            districts.append(district)
-    # Count unique values
-    unique_files = len(set(filenames))
-    unique_years = len(set(years))
-    unique_sources = len(set(sources_list))
-    # Create bordered container
-    with st.container():
-        st.markdown("""
-        <style>
-        .stats-container {
-            border: 2px solid #e0e0e0;
-            border-radius: 10px;
-            padding: 20px;
-            margin: 10px 0;
-            background-color: #f9f9f9;
-        }
-        </style>
-        """, unsafe_allow_html=True)
-        # NOTE(review): the opening and closing <div> are emitted by separate
-        # markdown calls; confirm the border really wraps the widgets between
-        # them in the deployed Streamlit version.
-        st.markdown('<div class="stats-container">', unsafe_allow_html=True)
-        st.markdown("### 📊 Retrieval Statistics")
-        # Metrics in columns
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("Total Chunks", len(sources))
-        with col2:
-            st.metric("Unique Files", unique_files)
-        with col3:
-            st.metric("Unique Years", unique_years if unique_years > 0 else "N/A")
-        with col4:
-            st.metric("Unique Sources", unique_sources if unique_sources > 0 else "N/A")
-        st.markdown("---")
-        # Distribution tables in columns
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            # District distribution (ten most frequent)
-            if districts:
-                district_counts = Counter(districts)
-                st.markdown("**🏘️ Districts**")
-                district_df = pd.DataFrame([
-                    {"District": dist, "Count": count}
-                    for dist, count in district_counts.most_common(10)
-                ])
-                st.dataframe(district_df, hide_index=True, use_container_width=True)
-        with col2:
-            # Source distribution (all sources, most frequent first)
-            if sources_list:
-                source_counts = Counter(sources_list)
-                st.markdown("**🏛️ Sources**")
-                source_df = pd.DataFrame([
-                    {"Source": src, "Count": count}
-                    for src, count in source_counts.most_common()
-                ])
-                st.dataframe(source_df, hide_index=True, use_container_width=True)
-        with col3:
-            # Year distribution, newest first
-            if years:
-                year_counts = Counter(years)
-                try:
-                    ordered_years = sorted(year_counts.items(), reverse=True)
-                except TypeError:
-                    # Mixed int/str year values are not mutually orderable;
-                    # fall back to comparing their string forms.
-                    ordered_years = sorted(
-                        year_counts.items(),
-                        key=lambda item: str(item[0]),
-                        reverse=True
-                    )
-                st.markdown("**📅 Years**")
-                year_df = pd.DataFrame([
-                    {"Year": year, "Count": count}
-                    for year, count in ordered_years
-                ])
-                st.dataframe(year_df, hide_index=True, use_container_width=True)
-        with col4:
-            # File distribution (top 10); long names are cut to 30 characters
-            file_counts = Counter(filenames)
-            st.markdown("**📄 Files**")
-            file_df = pd.DataFrame([
-                {"File": filename[:30] + "..." if len(filename) > 30 else filename, "Count": count}
-                for filename, count in file_counts.most_common(10)
-            ])
-            st.dataframe(file_df, hide_index=True, use_container_width=True)
-        st.markdown('</div>', unsafe_allow_html=True)
-def display_visual_document_details(
-    sources: List[Any],
-    show_images: bool = False,
-    show_saliency: bool = False,
-    qdrant_client: Any = None,
-    collection_name: Optional[str] = None,
-    query_embedding: Optional[np.ndarray] = None,
-    saliency_alpha: float = 0.4,
-    saliency_colormap: str = 'hot',
-    saliency_threshold: int = 50
-) -> None:
-    """
-    Display detailed information for each visual search result.
-    Each result gets its own expander (the first one starts expanded) with a
-    metadata column on the left and the page image or saliency overlay on
-    the right. Saliency is attempted only when both ``show_saliency`` and
-    ``show_images`` are set and every prerequisite is present; if no overlay
-    is produced the plain page image is shown instead.
-    Args:
-        sources: List of VisualSearchResult objects
-        show_images: Whether to display document images (from Cloudinary)
-        show_saliency: Whether to generate and display saliency maps
-        qdrant_client: Qdrant client (required for saliency)
-        collection_name: Qdrant collection name (required for saliency)
-        query_embedding: Query embedding for saliency computation
-        saliency_alpha: Saliency overlay transparency (0.0-1.0)
-        saliency_colormap: Matplotlib colormap for saliency (default: 'hot')
-        saliency_threshold: Threshold percentile for saliency (default: 50)
-    """
-    st.markdown("### 📄 Document Details")
-    # Import saliency functions only when needed, so the module still loads
-    # when saliency is never requested.
-    if show_saliency:
-        from .saliency import generate_tile_aware_saliency, can_generate_saliency
-    for i, doc in enumerate(sources):
-        doc_num = i + 1
-        # ``or {}`` also covers a metadata attribute that exists but is None.
-        metadata = getattr(doc, 'metadata', None) or {}
-        # Get basic metadata
-        filename = str(metadata.get('filename') or 'Unknown')
-        page_number = metadata.get('page_number', '?')
-        year = metadata.get('year', 'Unknown')
-        source = metadata.get('source', 'Unknown')
-        district = metadata.get('district')
-        score = getattr(doc, 'score', None)
-        if score is None:
-            # Keep the ``:.3f`` formatting below from failing on a None score.
-            score = 0.0
-        # Get visual-specific metadata
-        num_tiles = metadata.get('num_tiles')
-        tile_rows = metadata.get('tile_rows')
-        tile_cols = metadata.get('tile_cols')
-        num_visual_tokens = metadata.get('num_visual_tokens')
-        original_width = metadata.get('original_width')
-        original_height = metadata.get('original_height')
-        resized_width = metadata.get('resized_width')
-        resized_height = metadata.get('resized_height')
-        # Get image URLs
-        original_url = metadata.get('original_url')
-        resized_url = metadata.get('resized_url')
-        page_url = metadata.get('page')  # Fallback
-        # Get point_id for saliency (check doc.id first, then metadata).
-        # Only None and '' count as missing, so an integer id of 0 is kept.
-        point_id = None
-        for candidate in (
-            getattr(doc, 'id', None),
-            metadata.get('point_id'),
-            metadata.get('_id'),
-        ):
-            if candidate is not None and candidate != '':
-                point_id = candidate
-                break
-        # Debug logging for saliency
-        if show_saliency:
-            logger.debug(f"Doc {doc_num}: point_id={point_id}, has_tiles={num_tiles is not None}")
-        # Build title; the ellipsis is added only when the name is really cut
-        score_text = f" (Score: {score:.3f})"
-        short_name = filename[:50] + "..." if len(filename) > 50 else filename
-        title = f"📄 Document {doc_num}: {short_name}{score_text}"
-        with st.expander(title, expanded=(i == 0)):  # Expand first result
-            # Two-column layout: Metadata (left) and Image (right)
-            col_meta, col_image = st.columns([1, 2])
-            with col_meta:
-                st.markdown("### 📋 Metadata")
-                # Basic metadata
-                st.write(f"📄 **File:** {filename}")
-                st.write(f"🏛️ **Source:** {source}")
-                st.write(f"📅 **Year:** {year}")
-                st.write(f"📖 **Page:** {page_number}")
-                if district and district != 'None':
-                    st.write(f"📍 **District:** {district}")
-                # Relevance score
-                st.markdown("---")
-                st.markdown("### 🎯 Relevance")
-                score_color = "🟢" if score > 0.7 else "🟡" if score > 0.5 else "🔴"
-                st.markdown(f"**Score:** {score_color} **{score:.3f}**")
-                # Visual metadata (if available)
-                if num_tiles or num_visual_tokens:
-                    st.markdown("---")
-                    st.markdown("### 🎨 Visual Metadata")
-                    if num_tiles:
-                        st.write(f"🔲 **Tiles:** {num_tiles} ({tile_rows}×{tile_cols})")
-                    if num_visual_tokens:
-                        st.write(f"🔢 **Visual Tokens:** {num_visual_tokens}")
-                    if original_width and original_height:
-                        st.write(f"📐 **Original Size:** {original_width}×{original_height}")
-                    if resized_width and resized_height:
-                        st.write(f"📐 **Resized Size:** {resized_width}×{resized_height}")
-                    processing_version = metadata.get('processing_version')
-                    if processing_version:
-                        st.write(f"⚙️ **Processing:** {processing_version}")
-                # Text content preview (first 500 characters)
-                content = getattr(doc, 'page_content', '')
-                if content:
-                    st.markdown("---")
-                    # NOTE(review): this expander sits inside the per-document
-                    # expander; some Streamlit versions reject nested
-                    # expanders - confirm against the deployed version.
-                    with st.expander("📝 Extracted Text", expanded=True):
-                        st.text_area(
-                            "Content",
-                            value=content[:500] + ("..." if len(content) > 500 else ""),
-                            height=150,
-                            disabled=True,
-                            label_visibility="collapsed",
-                            key=f"visual_doc_text_{i}"
-                        )
-                else:
-                    st.markdown("---")
-                    st.caption("_No text extracted (image-only page)_")
-                # Show image URLs under text (only when both are known)
-                if original_url and resized_url:
-                    with st.expander("🔗 Image URLs", expanded=True):
-                        st.markdown(f"**Original:** [{original_url}]({original_url})")
-                        st.markdown(f"**Resized (for embeddings):** [{resized_url}]({resized_url})")
-            with col_image:
-                st.markdown("### 📸 Document Page")
-                # Tracks whether an overlay was shown; if not, the plain
-                # page image is displayed further down.
-                saliency_generated = False
-                if show_saliency and show_images:
-                    # Check if we have all requirements for saliency
-                    has_client = qdrant_client is not None
-                    has_collection = collection_name is not None
-                    has_query = query_embedding is not None
-                    has_point_id = point_id is not None
-                    has_tile_metadata = can_generate_saliency(metadata)
-                    can_saliency = has_client and has_collection and has_query and has_point_id and has_tile_metadata
-                    if not can_saliency:
-                        missing = []
-                        if not has_client: missing.append("qdrant_client")
-                        if not has_collection: missing.append("collection_name")
-                        if not has_query: missing.append("query_embedding")
-                        if not has_point_id: missing.append("point_id")
-                        if not has_tile_metadata: missing.append("tile_metadata")
-                        logger.warning(f"Doc {doc_num}: Saliency unavailable, missing: {missing}")
-                        # Only per-document causes are surfaced in the UI.
-                        if not has_tile_metadata:
-                            st.caption("_Saliency unavailable: missing tile metadata_")
-                        elif not has_point_id:
-                            st.caption("_Saliency unavailable: missing point_id_")
-                    else:
-                        try:
-                            with st.spinner(f"🔥 Generating saliency map for Doc {doc_num}..."):
-                                # Convert query embedding if needed: objects
-                                # exposing ``.cpu()`` are moved to NumPy.
-                                query_emb = query_embedding
-                                if hasattr(query_emb, 'cpu'):
-                                    query_emb = query_emb.cpu().float().numpy()
-                                if query_emb.ndim == 3:
-                                    query_emb = query_emb.squeeze(0)  # Remove batch dimension
-                                logger.info(f"🔥 Generating saliency for doc {doc_num}: point_id={point_id}, colormap={saliency_colormap}")
-                                saliency_img = generate_tile_aware_saliency(
-                                    qdrant_client=qdrant_client,
-                                    collection_name=collection_name,
-                                    point_id=point_id,
-                                    query_embedding=query_emb,
-                                    alpha=saliency_alpha,
-                                    colormap=saliency_colormap,
-                                    threshold_percentile=saliency_threshold
-                                )
-                                # Explicit None test: truthiness of an
-                                # array-like image would be ambiguous.
-                                if saliency_img is not None:
-                                    # Display saliency map
-                                    st.image(saliency_img, width=700, caption=f"🔥 Saliency Map - Page {page_number}")
-                                    saliency_generated = True
-                                    logger.info(f"✅ Saliency map displayed for doc {doc_num}")
-                                else:
-                                    logger.warning(f"Saliency generation returned None for doc {doc_num}")
-                                    st.caption("_Saliency map could not be generated_")
-                        except Exception as e:
-                            # Best effort: a failed overlay must not break the
-                            # page; the plain image is shown below instead.
-                            logger.error(f"Saliency generation failed for doc {doc_num}: {e}")
-                            logger.debug("Saliency traceback for doc %s", doc_num, exc_info=True)
-                            st.warning(f"⚠️ Saliency generation failed: {str(e)[:100]}")
-                # Display original image if saliency wasn't generated
-                if show_images and not saliency_generated:
-                    # Use ORIGINAL image (not resized) for display
-                    image_url = original_url or resized_url or page_url
-                    if image_url and isinstance(image_url, str) and image_url.startswith('http'):
-                        try:
-                            # Use width parameter for medium-sized image
-                            st.image(image_url, width=700, caption=f"Page {page_number}")
-                        except Exception as e:
-                            st.error(f"Failed to load image: {e}")
-                    else:
-                        st.info("No image URL available")
-                elif not show_images:
-                    st.info("Enable image display in settings to view document pages")
-def display_visual_search_results(
-    sources: List[Any],
-    show_statistics: bool = True,
-    show_images: bool = False,
-    show_saliency: bool = False,
-    qdrant_client: Any = None,
-    collection_name: Optional[str] = None,
-    query_embedding: Optional[np.ndarray] = None,
-    saliency_alpha: float = 0.4,
-    saliency_colormap: str = 'hot',
-    saliency_threshold: int = 50,
-    max_display: int = 20
-) -> None:
-    """
-    Display visual search results with statistics and details.
-    Shows a summary line, optional statistics, then the per-document details
-    for at most ``max_display`` results. An info message is shown instead
-    when ``sources`` is empty.
-    Args:
-        sources: List of VisualSearchResult objects
-        show_statistics: Whether to show statistics
-        show_images: Whether to show document images
-        show_saliency: Whether to generate and display saliency maps
-        qdrant_client: Qdrant client (required for saliency)
-        collection_name: Qdrant collection name (required for saliency)
-        query_embedding: Query embedding for saliency computation
-        saliency_alpha: Saliency overlay transparency (0.0-1.0)
-        saliency_colormap: Matplotlib colormap for saliency (default: 'hot')
-        saliency_threshold: Threshold percentile for saliency (default: 50)
-        max_display: Maximum number of documents to display in detail;
-            negative values are treated as 0
-    """
-    if not sources:
-        st.info("No documents were retrieved for the last query.")
-        return
-    # A negative limit would slice from the end of the list and report a
-    # negative "top N"; clamp it so the messages stay meaningful.
-    max_display = max(0, max_display)
-    # Count unique filenames; ``or {}`` covers metadata that is None and
-    # ``or 'Unknown'`` covers a stored None/empty filename.
-    unique_filenames = {
-        (getattr(doc, 'metadata', None) or {}).get('filename') or 'Unknown'
-        for doc in sources
-    }
-    total = len(sources)
-    st.markdown(f"**Found {total} document chunks from {len(unique_filenames)} unique documents:**")
-    if len(unique_filenames) < total:
-        st.info(f"💡 **Note**: Each document is split into multiple chunks. You're seeing {total} chunks from {len(unique_filenames)} documents.")
-    # Show saliency info if enabled
-    if show_saliency:
-        st.info(f"🔥 **Saliency Maps Enabled**: Showing which image regions are most relevant to your query (using '{saliency_colormap}' colormap)")
-    # Show statistics
-    if show_statistics:
-        display_visual_document_statistics(sources)
-        st.markdown("---")
-    # Show detailed results (limit to max_display)
-    display_sources = sources[:max_display]
-    truncated = total > max_display
-    if truncated:
-        st.warning(f"⚠️ Showing top {max_display} of {total} results")
-    display_visual_document_details(
-        display_sources,
-        show_images=show_images,
-        show_saliency=show_saliency,
-        qdrant_client=qdrant_client,
-        collection_name=collection_name,
-        query_embedding=query_embedding,
-        saliency_alpha=saliency_alpha,
-        saliency_colormap=saliency_colormap,
-        saliency_threshold=saliency_threshold
-    )
-    if truncated:
-        st.info(f"💡 {total - max_display} more results not shown")