# Spaces: docsumo / ocr-results (Running) - File size: 6,527 Bytes


import streamlit as st
import os
import glob
from pathlib import Path

# Configure the Streamlit page: page title, wide layout, and a sidebar that
# starts collapsed. This executes at module level, ahead of every other st.*
# call in this file.
# NOTE(review): Streamlit documents set_page_config as having to precede other
# Streamlit commands on a page; keep it first unless the installed version is
# confirmed to allow otherwise.
st.set_page_config(
    page_title="OCR analysis results",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS for a cleaner interface.
# The <style> block is injected once at module level through st.markdown with
# unsafe_allow_html=True. It styles the tab bar and defines three helper
# classes (.image-container, .markdown-container, .toggle-container) that the
# <div class='...'> wrappers emitted later in this file refer to by name.
st.markdown("""
    <style>
    .main { padding-top: 1rem; }
    .stTabs [data-baseweb="tab-list"] { 
        gap: 1rem; 
        margin-bottom: 1rem;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        border-radius: 4px 4px 0 0;
        font-weight: bold;
        letter-spacing: 1px;
    }
    .image-container {
        display: flex;
        justify-content: center;
    }
    .markdown-container {
        margin-top: 2rem;
        border: 1px solid #f0f0f0;
        padding: 1rem;
        border-radius: 5px;
        background-color: #f9f9f9;
    }
    .toggle-container {
        margin-bottom: 1rem;
    }
    </style>
""", unsafe_allow_html=True)

def get_all_docs(repo_path="markdowns"):
    """
    Collect every document directory from the two-level markdowns layout.

    Expected structure: ``repo_path/folder_id/doc_id/``. Only directories are
    considered at both levels; plain files are ignored. Entries whose names
    start with a dot are skipped, because the ``*`` glob pattern does not
    match them.

    Args:
        repo_path: Root directory to scan. Defaults to ``"markdowns"``.

    Returns:
        A list of ``(doc_id, doc_path)`` tuples, where ``doc_id`` is the
        directory's base name and ``doc_path`` is its path under
        ``repo_path``. Folders, and documents within a folder, are visited in
        sorted order so the result is deterministic. Returns an empty list
        when ``repo_path`` is missing or is not a directory.
    """
    if not os.path.isdir(repo_path):
        return []

    all_docs = []

    # Directory names are data, not patterns: escape them so characters such
    # as "[" or "*" in a folder name are matched literally by glob.
    folder_pattern = os.path.join(glob.escape(repo_path), '*')

    for folder_path in sorted(glob.glob(folder_pattern)):
        if not os.path.isdir(folder_path):
            continue

        doc_pattern = os.path.join(glob.escape(folder_path), '*')
        all_docs.extend(
            (os.path.basename(doc_path), doc_path)
            for doc_path in sorted(glob.glob(doc_pattern))
            if os.path.isdir(doc_path)
        )

    return all_docs

def read_markdown_file(file_path):
    """Return the UTF-8 text of a markdown file, or a message if it can't be read.

    The function never raises for a missing or unreadable file: a missing
    path yields ``"Markdown file not found."`` and any exception raised while
    opening or decoding the file yields a string starting with
    ``"Error reading markdown file: "`` followed by the exception text.
    """
    # Guard clause: nothing to read when the path is absent.
    if not os.path.exists(file_path):
        return "Markdown file not found."

    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            content = handle.read()
    except Exception as exc:
        # The caller shows the returned text directly, so report the failure
        # as text rather than propagating it.
        return f"Error reading markdown file: {exc}"

    return content

def display_model_outputs(doc_path):
    """Render each model's markdown output for one document as a row of tabs.

    Every ``*.md`` file directly inside ``doc_path`` is treated as one model's
    output; the file name without its final extension is the model name, and
    the tab label is that name in upper case. A single checkbox switches all
    tabs between the raw markdown source and the rendered view, and its value
    is mirrored into ``st.session_state.show_parsed``.

    Args:
        doc_path: Directory that holds the per-model markdown files.

    Returns:
        None. When no markdown files are found a warning is shown and nothing
        else is rendered.
    """
    # Escape the directory part so glob metacharacters in the path are
    # matched literally instead of being read as a pattern.
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))

    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Keep each model name together with the file it came from. splitext
    # drops only the trailing extension, so a name such as "a.md.v2.md" stays
    # "a.md.v2" and still points at the real file (str.replace would remove
    # every ".md" and the rebuilt path would not exist).
    models = sorted(
        (os.path.splitext(os.path.basename(md_file))[0], md_file)
        for md_file in md_files
    )

    # Tab labels are the model names in upper case, in sorted name order.
    display_names = [name.upper() for name, _ in models]

    # Initialize show_parsed in session_state if not already set
    if 'show_parsed' not in st.session_state:
        st.session_state.show_parsed = False

    def _remember_toggle():
        # Copy the widget's value into show_parsed so the choice is still
        # available as the checkbox's initial value on a later run.
        st.session_state.show_parsed = st.session_state.parsed_markdown_toggle

    # NOTE(review): each st.markdown call is emitted as its own element, so
    # these opening/closing <div> pairs may not actually wrap the widget
    # placed between them - confirm in the rendered page.
    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=_remember_toggle,
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # Create tabs for each model
    tabs = st.tabs(display_names)

    for tab, (_, md_path) in zip(tabs, models):
        md_content = read_markdown_file(md_path)

        with tab:
            if show_parsed:
                # NOTE(review): HTML embedded in the markdown file is rendered
                # as-is; this is only safe while the files are trusted.
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)

def main():
    """Render the page: title, document navigation, image, and model outputs.

    Documents are loaded from the fixed ``markdowns`` directory and sorted.
    The index of the document being shown lives in
    ``st.session_state.current_index``; the Previous/Next buttons move it
    with wrap-around and immediately rerun the script. When no documents are
    found, an error plus a hint about the path is shown and the function
    returns early.
    """
    st.title("Document Analysis Leaderboard")

    # Documents are always read from this fixed directory.
    repo_path = "markdowns"

    with st.spinner("Loading documents..."):
        documents = get_all_docs(repo_path)

    # Nothing to browse: explain why and stop.
    if not documents:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if not os.path.exists(repo_path):
            st.info(f"The path {repo_path} does not exist.")
        else:
            st.info(f"The path {repo_path} exists, but no documents were found.")
        return

    # Tuples sort by doc_id first, which keeps the ordering stable between runs.
    documents.sort()
    total = len(documents)
    state = st.session_state

    # First run starts at the first document; afterwards clamp the stored
    # index in case the number of documents shrank since it was saved.
    if 'current_index' not in state:
        state.current_index = 0
    state.current_index = min(state.current_index, total - 1)

    doc_id, doc_path = documents[state.current_index]

    # Navigation row: previous button, heading, next button.
    prev_col, heading_col, next_col = st.columns([1, 4, 1])

    with prev_col:
        if st.button("← Previous", use_container_width=True):
            # Modulo wraps from the first document around to the last.
            state.current_index = (state.current_index - 1) % total
            st.rerun()

    with heading_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {state.current_index + 1} of {total}")

    with next_col:
        if st.button("Next →", use_container_width=True):
            # Modulo wraps from the last document around to the first.
            state.current_index = (state.current_index + 1) % total
            st.rerun()

    # The document image, when present, is shown in the middle of three columns.
    image_path = os.path.join(doc_path, "image.jpg")

    try:
        if not os.path.exists(image_path):
            st.info("Image not available for this document")
        else:
            _, centre_col, _ = st.columns([1, 2, 1])
            with centre_col:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)  # fixed width keeps the image small
                st.markdown("</div>", unsafe_allow_html=True)
    except Exception as exc:
        st.error(f"Error loading image: {exc}")

    # Horizontal rule between the image and the model outputs.
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    display_model_outputs(doc_path)

# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()