File size: 6,527 Bytes
cc944f5
 
 
 
 
 
 
5d177e2
cc944f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84dea15
cc944f5
84dea15
 
cc944f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d177e2
 
 
 
 
cc944f5
5d177e2
 
 
 
 
 
cc944f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84dea15
cc944f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import streamlit as st
import os
import glob
from pathlib import Path

# Set page configuration: browser tab title, full-width layout, and a sidebar
# that starts collapsed.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first st.* call in the script — confirm before moving this statement.
st.set_page_config(
    page_title="OCR analysis results",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS for a cleaner interface.
# The <style> block is injected once at import time through st.markdown with
# unsafe_allow_html=True so the raw HTML tag is passed through.
# The .image-container, .markdown-container and .toggle-container classes are
# the ones used by the <div> wrappers emitted in main() and
# display_model_outputs(); the .stTabs selectors presumably target Streamlit's
# own tab widget markup (verify against the installed Streamlit version).
st.markdown("""
    <style>
    .main { padding-top: 1rem; }
    .stTabs [data-baseweb="tab-list"] { 
        gap: 1rem; 
        margin-bottom: 1rem;
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        border-radius: 4px 4px 0 0;
        font-weight: bold;
        letter-spacing: 1px;
    }
    .image-container {
        display: flex;
        justify-content: center;
    }
    .markdown-container {
        margin-top: 2rem;
        border: 1px solid #f0f0f0;
        padding: 1rem;
        border-radius: 5px;
        background-color: #f9f9f9;
    }
    .toggle-container {
        margin-bottom: 1rem;
    }
    </style>
""", unsafe_allow_html=True)

def get_all_docs(repo_path="markdowns"):
    """
    Collect every document directory from the nested markdowns layout.

    Expected structure: ``repo_path/folder_id/doc_id/``. Only directories are
    considered at both levels; plain files are ignored. Entries are matched
    with the ``*`` glob pattern, so dot-prefixed names are not included.

    Args:
        repo_path: Root directory to scan. Defaults to "markdowns".

    Returns:
        A list of ``(doc_id, doc_path)`` tuples, sorted by doc_id and then by
        path, where doc_id is the base name of the document directory. An
        empty list is returned when repo_path is not an existing directory or
        holds no document directories.
    """
    if not os.path.isdir(repo_path):
        return []

    def _subdirs(parent):
        # glob.escape keeps "[", "?" and "*" in directory names literal.
        # Without it a name such as "batch[1]" is read as a character class
        # and none of its children are ever matched.
        pattern = os.path.join(glob.escape(parent), '*')
        return [entry for entry in glob.glob(pattern) if os.path.isdir(entry)]

    # folder_id directories are only traversed; they are not part of the result.
    all_docs = [
        (os.path.basename(doc_path), doc_path)
        for folder_path in _subdirs(repo_path)
        for doc_path in _subdirs(folder_path)
    ]

    # glob returns entries in filesystem order; sort so callers get a stable list.
    all_docs.sort()
    return all_docs

def read_markdown_file(file_path):
    """Return the UTF-8 text of a markdown file, or a message string on failure.

    A path that does not exist yields "Markdown file not found."; any error
    raised while opening or reading yields an "Error reading markdown file"
    message instead of propagating.
    """
    # Guard clause: nothing to read when the path is absent.
    if not os.path.exists(file_path):
        return "Markdown file not found."

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as err:
        # Report the failure as displayable text rather than raising.
        return f"Error reading markdown file: {err!s}"
    return text

def display_model_outputs(doc_path):
    """Render each model's markdown output for one document as a row of tabs.

    Every ``*.md`` file directly inside ``doc_path`` is treated as one model's
    output; the file name without its extension is the model name. A checkbox
    switches all tabs between the raw markdown source and rendered markdown.

    Args:
        doc_path: Directory of the document whose markdown files are shown.

    Returns:
        None. Shows a warning and returns early when no markdown file exists.
    """
    # Escape doc_path so "[", "?" or "*" in a directory name is matched literally.
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))

    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Pair each model name with the path it came from, sorted by name for a
    # consistent tab order. splitext removes only the trailing ".md"; the
    # earlier str.replace stripped every ".md" occurrence, so a file such as
    # "model.md.v2.md" was mapped back to a path that does not exist.
    models = sorted(
        (os.path.splitext(os.path.basename(md_file))[0], md_file)
        for md_file in md_files
    )

    # Tab labels are the model names in uppercase
    display_names = [model_name.upper() for model_name, _ in models]

    # Initialize show_parsed in session_state if not already set
    if 'show_parsed' not in st.session_state:
        st.session_state.show_parsed = False

    # Toggle for raw/parsed markdown. The choice is mirrored into
    # st.session_state.show_parsed by the on_change callback and fed back in
    # as the checkbox's initial value on later runs.
    # NOTE(review): each st.markdown call appears to render as its own element,
    # so the opening/closing <div> pairs here and below likely do not wrap the
    # widgets placed between them — confirm against Streamlit's behavior.
    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=lambda: setattr(st.session_state, 'show_parsed', st.session_state.parsed_markdown_toggle)
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # Create tabs for each model
    tabs = st.tabs(display_names)

    for tab, (_, md_path) in zip(tabs, models):
        md_content = read_markdown_file(md_path)

        with tab:
            if show_parsed:
                # NOTE(review): raw HTML in the model output is rendered as-is;
                # if these files can come from an untrusted source this allows
                # HTML injection — confirm this is intended.
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)

def main():
    """Entry point: build the page for browsing documents one at a time.

    Loads the document list from the fixed "markdowns" directory, keeps the
    selected position in ``st.session_state.current_index``, and renders the
    navigation buttons, the document image and the model outputs.
    """
    st.title("Document Analysis Leaderboard")

    # Documents always come from this fixed repository path
    repo_path = "markdowns"

    with st.spinner("Loading documents..."):
        all_docs = get_all_docs(repo_path)

    # Nothing to browse: explain why and stop
    if not all_docs:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        path_status = (
            f"The path {repo_path} exists, but no documents were found."
            if os.path.exists(repo_path)
            else f"The path {repo_path} does not exist."
        )
        st.info(path_status)
        return

    # Order by doc_id so positions are consistent between runs
    all_docs.sort()
    doc_count = len(all_docs)
    state = st.session_state

    # Start at the first document, and clamp a stored index that is now too large
    if 'current_index' not in state:
        state.current_index = 0
    state.current_index = min(state.current_index, doc_count - 1)

    doc_id, doc_path = all_docs[state.current_index]

    # Navigation row: previous button, heading, next button
    prev_col, heading_col, next_col = st.columns([1, 4, 1])

    with prev_col:
        if st.button("← Previous", use_container_width=True):
            # Modulo makes the first document wrap around to the last
            state.current_index = (state.current_index - 1) % doc_count
            st.rerun()

    with heading_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {state.current_index + 1} of {doc_count}")

    with next_col:
        if st.button("Next →", use_container_width=True):
            # Modulo makes the last document wrap around to the first
            state.current_index = (state.current_index + 1) % doc_count
            st.rerun()

    # Document image, shown at a fixed width in the middle column
    image_path = os.path.join(doc_path, "image.jpg")

    try:
        if not os.path.exists(image_path):
            st.info("Image not available for this document")
        else:
            _, center_col, _ = st.columns([1, 2, 1])
            with center_col:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)
                st.markdown("</div>", unsafe_allow_html=True)
    except Exception as err:
        st.error(f"Error loading image: {str(err)}")

    # Separator between the image and the model outputs
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    # Model outputs for the selected document
    display_model_outputs(doc_path)

# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()