Spaces:
Running
Running
File size: 6,527 Bytes
cc944f5 5d177e2 cc944f5 84dea15 cc944f5 84dea15 cc944f5 5d177e2 cc944f5 5d177e2 cc944f5 84dea15 cc944f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import streamlit as st
import os
import glob
from pathlib import Path
# Page-level settings: browser tab title, wide layout, sidebar collapsed on load.
st.set_page_config(
    page_title="OCR analysis results",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Stylesheet injected into the page for a cleaner interface. The text is kept
# flush-left on purpose: it is passed through st.markdown, so its exact
# whitespace is part of what gets rendered.
_CUSTOM_CSS = """
<style>
.main { padding-top: 1rem; }
.stTabs [data-baseweb="tab-list"] {
gap: 1rem;
margin-bottom: 1rem;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
white-space: pre-wrap;
border-radius: 4px 4px 0 0;
font-weight: bold;
letter-spacing: 1px;
}
.image-container {
display: flex;
justify-content: center;
}
.markdown-container {
margin-top: 2rem;
border: 1px solid #f0f0f0;
padding: 1rem;
border-radius: 5px;
background-color: #f9f9f9;
}
.toggle-container {
margin-bottom: 1rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def get_all_docs(repo_path="markdowns"):
    """
    Collect every document directory found under ``repo_path``.

    The expected layout is ``repo_path/folder_id/doc_id/``: each first-level
    directory is a folder and each directory inside a folder is a document.
    Plain files at either level are ignored, as are dot-prefixed entries,
    which the ``*`` glob pattern does not match.

    Args:
        repo_path: Root directory to scan. Defaults to ``"markdowns"``.

    Returns:
        A list of ``(doc_id, doc_path)`` tuples sorted by ``doc_id`` (then by
        path), where ``doc_id`` is the base name of the document directory.
        An empty list is returned when ``repo_path`` does not exist.
    """
    if not os.path.exists(repo_path):
        return []

    def _subdirectories(parent):
        # Escape the parent so that characters such as "[" or "*" in a
        # directory name are matched literally instead of acting as glob
        # wildcards (which would silently hide the directory's contents).
        pattern = os.path.join(glob.escape(parent), '*')
        return [entry for entry in glob.glob(pattern) if os.path.isdir(entry)]

    all_docs = []
    # Folder ids are only used to reach the documents; they are not returned.
    for folder_path in _subdirectories(repo_path):
        all_docs.extend(
            (os.path.basename(doc_path), doc_path)
            for doc_path in _subdirectories(folder_path)
        )

    # glob yields entries in arbitrary, filesystem-dependent order; sort so
    # callers always see the same sequence.
    all_docs.sort()
    return all_docs
def read_markdown_file(file_path):
    """Return the text of the markdown file at ``file_path``.

    The file is decoded as UTF-8. This function does not raise for a missing
    or unreadable file; it returns a human-readable message instead:
    ``"Markdown file not found."`` when the path does not exist, or a string
    starting with ``"Error reading markdown file: "`` when opening or reading
    the file fails.
    """
    # Guard clause: nothing to read when the path is absent.
    if not os.path.exists(file_path):
        return "Markdown file not found."

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as exc:
        # Any failure is reported as text so the caller can display it.
        return f"Error reading markdown file: {exc!s}"
    return contents
def display_model_outputs(doc_path):
    """Render each model's markdown output for one document, one tab per model.

    Every ``*.md`` file directly inside ``doc_path`` is treated as the output
    of one model; the file name without its final extension is the model name,
    shown upper-cased as the tab label. A checkbox switches all tabs between
    the raw markdown source (shown as a code block) and the rendered markdown.
    A warning is shown, and nothing else is drawn, when the directory holds no
    markdown files.

    Args:
        doc_path: Directory containing the per-model ``.md`` files.
    """
    # Escape doc_path so glob metacharacters in a directory name are literal.
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))
    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Pair each model name with the file it was derived from, sorted by name
    # for a consistent tab order. splitext removes only the final extension;
    # the previous str.replace(".md", "") also removed ".md" from the middle
    # of a name, after which the rebuilt path pointed at a missing file.
    models = sorted(
        (os.path.splitext(os.path.basename(md_file))[0], md_file)
        for md_file in md_files
    )
    display_names = [name.upper() for name, _ in models]

    # Initialize show_parsed in session_state if not already set
    if 'show_parsed' not in st.session_state:
        st.session_state.show_parsed = False

    def _remember_toggle():
        # Copy the widget's value into show_parsed so it seeds the checkbox
        # on later runs.
        st.session_state.show_parsed = st.session_state.parsed_markdown_toggle

    # NOTE(review): each st.markdown call is rendered as its own element, so
    # these opening/closing <div> tags likely do not wrap the widget between
    # them - confirm before relying on the CSS classes.
    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=_remember_toggle,
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # One tab per model, in the same order as display_names.
    tabs = st.tabs(display_names)
    for tab, (_, md_path) in zip(tabs, models):
        md_content = read_markdown_file(md_path)
        with tab:
            if show_parsed:
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)
def main():
    """Run the Streamlit app: document navigation, image preview, model outputs.

    Documents are loaded from the fixed ``markdowns`` directory. The index of
    the document being viewed lives in ``st.session_state.current_index``; the
    Previous/Next buttons move it with wrap-around and trigger a rerun.
    """
    st.title("Document Analysis Leaderboard")

    # Documents always come from this fixed location.
    repo_path = "markdowns"
    with st.spinner("Loading documents..."):
        all_docs = get_all_docs(repo_path)

    if not all_docs:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if os.path.exists(repo_path):
            detail = f"The path {repo_path} exists, but no documents were found."
        else:
            detail = f"The path {repo_path} does not exist."
        st.info(detail)
        return

    # Order by doc_id so navigation is consistent between runs.
    all_docs.sort()
    doc_count = len(all_docs)
    state = st.session_state

    # Start at the first document, and clamp the stored index in case it
    # exceeds the number of documents found on this run.
    if 'current_index' not in state:
        state.current_index = 0
    state.current_index = min(state.current_index, doc_count - 1)

    doc_id, doc_path = all_docs[state.current_index]

    # Navigation row: previous button, heading, next button.
    prev_col, heading_col, next_col = st.columns([1, 4, 1])
    with prev_col:
        if st.button("← Previous", use_container_width=True):
            state.current_index = (state.current_index - 1) % doc_count
            st.rerun()
    with heading_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {state.current_index + 1} of {doc_count}")
    with next_col:
        if st.button("Next →", use_container_width=True):
            state.current_index = (state.current_index + 1) % doc_count
            st.rerun()

    # Document image, shown at a fixed width in the middle column.
    image_path = os.path.join(doc_path, "image.jpg")
    try:
        if not os.path.exists(image_path):
            st.info("Image not available for this document")
        else:
            _, image_col, _ = st.columns([1, 2, 1])
            with image_col:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)  # Fixed width for smaller size
                st.markdown("</div>", unsafe_allow_html=True)
    except Exception as exc:
        st.error(f"Error loading image: {str(exc)}")

    # Visual separator between the image and the model outputs.
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    display_model_outputs(doc_path)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()