Spaces:

docsumo
/

ocr-results

Running

App Files Files Community

spookie-boogie committed on Apr 2, 2025

Commit

cc944f5

verified ·

1 Parent(s): 875bafd

Create app.py

Browse files

Files changed (1) hide show

app.py +185 -0

app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import streamlit as st
+import os
+import glob
+from pathlib import Path
+# Set page configuration
+st.set_page_config(
+    page_title="Document Analysis Leaderboard",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+# Custom CSS for a cleaner interface
+st.markdown("""
+    <style>
+    .main { padding-top: 1rem; }
+    .stTabs [data-baseweb="tab-list"] {
+        gap: 1rem;
+        margin-bottom: 1rem;
+    }
+    .stTabs [data-baseweb="tab"] {
+        height: 50px;
+        white-space: pre-wrap;
+        border-radius: 4px 4px 0 0;
+        font-weight: bold;
+        letter-spacing: 1px;
+    }
+    .image-container {
+        display: flex;
+        justify-content: center;
+    }
+    .markdown-container {
+        margin-top: 2rem;
+        border: 1px solid #f0f0f0;
+        padding: 1rem;
+        border-radius: 5px;
+        background-color: #f9f9f9;
+    }
+    .toggle-container {
+        margin-bottom: 1rem;
+    }
+    </style>
+""", unsafe_allow_html=True)
+def get_all_docs(repo_path="sampled_markdown"):
+    """
+    Gets all document IDs from the nested structure in sampled_markdown directory.
+    Structure: sampled_markdown/folder_id/doc_id/
+    """
+    all_docs = []
+    if not os.path.exists(repo_path):
+        return []
+    # Get all folder_ids (we'll use the paths but skip showing them in UI)
+    folder_paths = [f for f in glob.glob(os.path.join(repo_path, '*')) if os.path.isdir(f)]
+    # For each folder, get all doc_ids
+    for folder_path in folder_paths:
+        doc_paths = [f for f in glob.glob(os.path.join(folder_path, '*')) if os.path.isdir(f)]
+        for doc_path in doc_paths:
+            doc_id = os.path.basename(doc_path)
+            all_docs.append((doc_id, doc_path))
+    return all_docs
+def read_markdown_file(file_path):
+    """Reads the content of a markdown file."""
+    if os.path.exists(file_path):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception as e:
+            return f"Error reading markdown file: {str(e)}"
+    return "Markdown file not found."
+def display_model_outputs(doc_path):
+    """Displays the markdown outputs from different models for the current document."""
+    # Check which markdown files are available
+    md_files = glob.glob(os.path.join(doc_path, "*.md"))
+    if not md_files:
+        st.warning("No markdown files found for this document")
+        return
+    # Extract model names and sort them
+    model_names = [os.path.basename(md_file).replace(".md", "") for md_file in md_files]
+    model_names.sort()  # Ensure consistent order
+    # Convert model names to uppercase
+    display_names = [name.upper() for name in model_names]
+    # Toggle for raw/parsed markdown
+    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
+    show_parsed = st.checkbox("Show Parsed Markdown", value=False)
+    st.markdown("</div>", unsafe_allow_html=True)
+    # Create tabs for each model
+    tabs = st.tabs(display_names)
+    for i, model_name in enumerate(model_names):
+        md_path = os.path.join(doc_path, f"{model_name}.md")
+        md_content = read_markdown_file(md_path)
+        with tabs[i]:
+            if show_parsed:
+                st.markdown(md_content, unsafe_allow_html=True)
+            else:
+                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
+                st.code(md_content, language="markdown")
+                st.markdown("</div>", unsafe_allow_html=True)
+def main():
+    """Main function to run the Streamlit app."""
+    st.title("Document Analysis Leaderboard")
+    # Get all doc-ids from the fixed repository path
+    repo_path = "sampled_markdown"
+    with st.spinner("Loading documents..."):
+        all_docs = get_all_docs(repo_path)
+    if not all_docs:
+        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
+        if os.path.exists(repo_path):
+            st.info(f"The path {repo_path} exists, but no documents were found.")
+        else:
+            st.info(f"The path {repo_path} does not exist.")
+        return
+    # Sort docs by doc_id for consistent ordering
+    all_docs.sort()
+    # Initialize session state for current index
+    if 'current_index' not in st.session_state:
+        st.session_state.current_index = 0
+    # Ensure current_index is within bounds
+    st.session_state.current_index = min(st.session_state.current_index, len(all_docs) - 1)
+    # Current document info
+    doc_id, doc_path = all_docs[st.session_state.current_index]
+    # Navigation buttons
+    col1, col2, col3 = st.columns([1, 4, 1])
+    with col1:
+        if st.button("← Previous", use_container_width=True):
+            st.session_state.current_index = (st.session_state.current_index - 1) % len(all_docs)
+            st.rerun()
+    with col2:
+        st.markdown(f"### Document: {doc_id}")
+        st.caption(f"Document {st.session_state.current_index + 1} of {len(all_docs)}")
+    with col3:
+        if st.button("Next →", use_container_width=True):
+            st.session_state.current_index = (st.session_state.current_index + 1) % len(all_docs)
+            st.rerun()
+    # Display document image with reduced size and centered
+    image_path = os.path.join(doc_path, "image.jpg")
+    try:
+        if os.path.exists(image_path):
+            # Use columns to center and size the image
+            col1, col2, col3 = st.columns([1, 2, 1])
+            with col2:
+                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
+                st.image(image_path, width=500)  # Fixed width for smaller size
+                st.markdown("</div>", unsafe_allow_html=True)
+        else:
+            st.info("Image not available for this document")
+    except Exception as e:
+        st.error(f"Error loading image: {str(e)}")
+    # Add separator between image and model outputs
+    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)
+    # Display model outputs
+    display_model_outputs(doc_path)
+# Launch the app only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    main()