"""Streamlit viewer for browsing per-document OCR markdown outputs.

Documents are discovered under ``markdowns/<folder_id>/<doc_id>/``. For the
selected document the app shows ``image.jpg`` (when present) and one tab per
``*.md`` file found in the document directory.
"""

import glob
import os
from pathlib import Path

import streamlit as st

# Set page configuration
st.set_page_config(
    page_title="OCR analysis results",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS for a cleaner interface
# NOTE(review): the payload is a single space, so no CSS is actually injected
# here -- confirm whether the stylesheet was lost from this string.
st.markdown(""" """, unsafe_allow_html=True)


def _list_subdirs(parent):
    """Return the paths of the non-hidden directories directly inside ``parent``."""
    # glob.escape keeps characters such as "[" or "*" in the directory name
    # from being interpreted as wildcards.
    pattern = os.path.join(glob.escape(parent), "*")
    return [entry for entry in glob.glob(pattern) if os.path.isdir(entry)]


def get_all_docs(repo_path="markdowns"):
    """Collect every document directory found under ``repo_path``.

    The expected layout is ``repo_path/folder_id/doc_id/``. Folder IDs are
    only used to reach the documents; they are not part of the result.

    Args:
        repo_path: Root directory to scan.

    Returns:
        A list of ``(doc_id, doc_path)`` tuples, where ``doc_id`` is the
        directory name and ``doc_path`` the path to that directory. The list
        is empty when ``repo_path`` does not exist. Order is whatever glob
        yields; callers that need a stable order must sort.
    """
    if not os.path.exists(repo_path):
        return []

    return [
        (os.path.basename(doc_path), doc_path)
        for folder_path in _list_subdirs(repo_path)
        for doc_path in _list_subdirs(folder_path)
    ]


def read_markdown_file(file_path):
    """Read a markdown file and return its text.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        The file content on success. On failure a human-readable message is
        returned instead of raising, so the caller can render it directly:
        ``"Markdown file not found."`` when the file does not exist, or
        ``"Error reading markdown file: ..."`` for any other read/decode
        problem.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return "Markdown file not found."
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 content.
        return f"Error reading markdown file: {e}"


def _sync_show_parsed():
    """Copy the checkbox widget value into the persistent ``show_parsed`` flag."""
    st.session_state.show_parsed = st.session_state.parsed_markdown_toggle


def display_model_outputs(doc_path):
    """Render one tab per ``*.md`` file in ``doc_path``.

    Each markdown file is treated as the output of one model; the tab label
    is the file name without its extension, upper-cased. A checkbox switches
    every tab between rendered markdown and the raw text in a code block.

    Args:
        doc_path: Directory of the document whose outputs are displayed.
    """
    # Check which markdown files are available
    md_files = glob.glob(os.path.join(glob.escape(doc_path), "*.md"))
    if not md_files:
        st.warning("No markdown files found for this document")
        return

    # Pair each model name with the path glob actually returned. splitext
    # strips only the final extension, so names that merely contain ".md"
    # (e.g. "v1.mdl.md") are not mangled. Sorting keeps the tab order stable.
    models = sorted(
        (os.path.splitext(os.path.basename(md_file))[0], md_file)
        for md_file in md_files
    )
    display_names = [name.upper() for name, _ in models]

    # Initialize show_parsed in session_state if not already set
    if "show_parsed" not in st.session_state:
        st.session_state.show_parsed = False

    # Toggle for raw/parsed markdown that preserves state across documents.
    # NOTE(review): the whitespace-only markdown calls below look like HTML
    # wrappers whose markup is missing -- confirm the intended content.
    st.markdown("\n\n", unsafe_allow_html=True)
    show_parsed = st.checkbox(
        "Show Parsed Markdown",
        value=st.session_state.show_parsed,
        key="parsed_markdown_toggle",
        on_change=_sync_show_parsed,
    )
    st.markdown("\n\n", unsafe_allow_html=True)

    # Create tabs for each model
    tabs = st.tabs(display_names)
    for tab, (_, md_path) in zip(tabs, models):
        md_content = read_markdown_file(md_path)
        with tab:
            if show_parsed:
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("\n\n", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("\n\n", unsafe_allow_html=True)


def main():
    """Main function to run the Streamlit app.

    Loads the document list, keeps the selected position in
    ``st.session_state.current_index``, and renders navigation buttons, the
    document image, and the model outputs for the selected document.
    """
    st.title("Document Analysis Leaderboard")

    # Get all doc-ids from the fixed repository path
    repo_path = "markdowns"
    with st.spinner("Loading documents..."):
        all_docs = get_all_docs(repo_path)

    if not all_docs:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if os.path.exists(repo_path):
            st.info(f"The path {repo_path} exists, but no documents were found.")
        else:
            st.info(f"The path {repo_path} does not exist.")
        return

    # Sort docs by doc_id for consistent ordering
    all_docs.sort()

    # Initialize session state for current index
    if "current_index" not in st.session_state:
        st.session_state.current_index = 0

    # Ensure current_index is within bounds (the document set may have shrunk
    # since the index was stored).
    st.session_state.current_index = min(st.session_state.current_index, len(all_docs) - 1)

    # Current document info
    doc_id, doc_path = all_docs[st.session_state.current_index]

    # Navigation buttons; the modulo makes Previous/Next wrap around.
    prev_col, title_col, next_col = st.columns([1, 4, 1])
    with prev_col:
        if st.button("← Previous", use_container_width=True):
            st.session_state.current_index = (st.session_state.current_index - 1) % len(all_docs)
            st.rerun()
    with title_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {st.session_state.current_index + 1} of {len(all_docs)}")
    with next_col:
        if st.button("Next →", use_container_width=True):
            st.session_state.current_index = (st.session_state.current_index + 1) % len(all_docs)
            st.rerun()

    # Display document image with reduced size and centered
    image_path = os.path.join(doc_path, "image.jpg")
    try:
        if os.path.exists(image_path):
            # Use columns to center and size the image
            _, image_col, _ = st.columns([1, 2, 1])
            with image_col:
                st.markdown("\n\n", unsafe_allow_html=True)
                st.image(image_path, width=500)  # Fixed width for smaller size
                st.markdown("\n\n", unsafe_allow_html=True)
        else:
            st.info("Image not available for this document")
    except Exception as e:
        # Best-effort: a broken image must not prevent the outputs from showing.
        st.error(f"Error loading image: {str(e)}")

    # Add separator between image and model outputs
    st.markdown("\n\n", unsafe_allow_html=True)

    # Display model outputs
    display_model_outputs(doc_path)


if __name__ == "__main__":
    main()