spookie-boogie committed on
Commit
cc944f5
·
verified ·
1 Parent(s): 875bafd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import glob
4
+ from pathlib import Path
5
+
6
# Set page configuration.
# This is the first st.* call in the file; it sets the browser tab title,
# uses the wide layout, and starts with the sidebar collapsed.
st.set_page_config(
    page_title="Document Analysis Leaderboard",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS for a cleaner interface.
# Styles the tab strip (.stTabs selectors) and defines the .image-container,
# .markdown-container and .toggle-container classes that the <div> wrappers
# emitted further down in this file refer to.
# unsafe_allow_html=True is passed so the <style> element is emitted as HTML.
st.markdown("""
<style>
.main { padding-top: 1rem; }
.stTabs [data-baseweb="tab-list"] {
gap: 1rem;
margin-bottom: 1rem;
}
.stTabs [data-baseweb="tab"] {
height: 50px;
white-space: pre-wrap;
border-radius: 4px 4px 0 0;
font-weight: bold;
letter-spacing: 1px;
}
.image-container {
display: flex;
justify-content: center;
}
.markdown-container {
margin-top: 2rem;
border: 1px solid #f0f0f0;
padding: 1rem;
border-radius: 5px;
background-color: #f9f9f9;
}
.toggle-container {
margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)
44
+
45
def get_all_docs(repo_path="sampled_markdown"):
    """
    Collect every document directory found two levels below ``repo_path``.

    Expected layout: ``repo_path/folder_id/doc_id/``. Only directories are
    considered at both levels; plain files are ignored. Entries whose name
    starts with a dot are skipped because the ``*`` glob pattern does not
    match them.

    Args:
        repo_path: Root directory that holds the ``folder_id`` directories.

    Returns:
        A list of ``(doc_id, doc_path)`` tuples, sorted by ``doc_id`` and then
        by path. The list is empty when ``repo_path`` is not a directory or
        contains no documents.
    """
    if not os.path.isdir(repo_path):
        return []

    all_docs = []

    # glob.escape() keeps "[", "]", "*" and "?" in real directory names from
    # being interpreted as wildcards, which would silently drop documents.
    folder_pattern = os.path.join(glob.escape(repo_path), '*')

    for folder_path in glob.glob(folder_pattern):
        if not os.path.isdir(folder_path):
            continue

        doc_pattern = os.path.join(glob.escape(folder_path), '*')
        all_docs.extend(
            (os.path.basename(doc_path), doc_path)
            for doc_path in glob.glob(doc_pattern)
            if os.path.isdir(doc_path)
        )

    # glob order depends on the filesystem; sort so callers get a stable list.
    return sorted(all_docs)
67
+
68
def read_markdown_file(file_path):
    """
    Return the UTF-8 text stored at ``file_path``.

    Never raises: a missing path yields "Markdown file not found." and any
    failure while opening or reading yields a message starting with
    "Error reading markdown file: ".
    """
    # Guard clause: nothing to read when the path is absent.
    if not os.path.exists(file_path):
        return "Markdown file not found."

    try:
        with open(file_path, encoding='utf-8') as handle:
            text = handle.read()
    except Exception as err:
        # Report the problem as text so the caller can display it in the UI.
        return f"Error reading markdown file: {err}"

    return text
77
+
78
def _list_model_markdown(doc_path):
    """Return sorted ``(model_name, md_path)`` pairs for the ``*.md`` files directly inside ``doc_path``."""
    # glob.escape() stops "[", "]", "*" and "?" in doc_path acting as wildcards.
    pattern = os.path.join(glob.escape(doc_path), "*.md")
    # splitext() removes only the final extension. The previous
    # str.replace(".md", "") removed every ".md" occurrence, so a file such as
    # "model.mdx.md" became "modelx" and its path could not be rebuilt.
    return sorted(
        (os.path.splitext(os.path.basename(md_path))[0], md_path)
        for md_path in glob.glob(pattern)
    )


def display_model_outputs(doc_path):
    """
    Display the markdown outputs from different models for one document.

    Every ``*.md`` file in ``doc_path`` is treated as one model's output and
    gets its own tab, labelled with the upper-cased file stem. A checkbox
    switches all tabs between the raw markdown source (default) and the
    rendered markdown.

    Args:
        doc_path: Directory containing one ``<model_name>.md`` file per model.

    Returns:
        None. Shows a warning and returns early when no markdown files exist.
    """
    model_files = _list_model_markdown(doc_path)

    if not model_files:
        st.warning("No markdown files found for this document")
        return

    # Toggle for raw/parsed markdown.
    # NOTE(review): each st.markdown call is likely rendered as its own
    # element, so these open/close <div> pairs may not actually wrap the
    # widget between them - confirm in the browser.
    st.markdown("<div class='toggle-container'>", unsafe_allow_html=True)
    show_parsed = st.checkbox("Show Parsed Markdown", value=False)
    st.markdown("</div>", unsafe_allow_html=True)

    # One tab per model, in the same sorted order as model_files.
    tabs = st.tabs([model_name.upper() for model_name, _ in model_files])

    for tab, (_, md_path) in zip(tabs, model_files):
        # Read from the path glob actually found instead of rebuilding it
        # from the display name.
        md_content = read_markdown_file(md_path)

        with tab:
            if show_parsed:
                # NOTE(review): unsafe_allow_html=True passes any HTML in the
                # model output through to the page; acceptable only if the
                # markdown files are trusted - confirm.
                st.markdown(md_content, unsafe_allow_html=True)
            else:
                st.markdown("<div class='markdown-container'>", unsafe_allow_html=True)
                st.code(md_content, language="markdown")
                st.markdown("</div>", unsafe_allow_html=True)
113
+
114
def main():
    """
    Entry point of the Streamlit page.

    Loads the document list, keeps the selected position in
    ``st.session_state.current_index``, draws wrap-around Previous/Next
    navigation, shows the document image when present, and finally renders
    the per-model markdown outputs.
    """
    st.title("Document Analysis Leaderboard")

    # The documents live under a fixed relative directory.
    repo_path = "sampled_markdown"

    with st.spinner("Loading documents..."):
        documents = get_all_docs(repo_path)

    # Nothing to browse: explain why and stop.
    if not documents:
        st.error(f"No documents found in {repo_path}. Please check the directory structure.")
        if os.path.exists(repo_path):
            hint = f"The path {repo_path} exists, but no documents were found."
        else:
            hint = f"The path {repo_path} does not exist."
        st.info(hint)
        return

    # Stable ordering by doc_id so the index means the same thing on every rerun.
    documents.sort()
    total = len(documents)
    state = st.session_state

    # First run: start at the first document.
    if 'current_index' not in state:
        state.current_index = 0

    # Clamp in case the document list shrank since the last run.
    state.current_index = min(state.current_index, total - 1)

    position = state.current_index
    doc_id, doc_path = documents[position]

    # Navigation row: previous button, heading, next button.
    prev_col, heading_col, next_col = st.columns([1, 4, 1])

    with prev_col:
        if st.button("← Previous", use_container_width=True):
            # Modulo arithmetic wraps from the first document to the last.
            state.current_index = (position - 1) % total
            st.rerun()

    with heading_col:
        st.markdown(f"### Document: {doc_id}")
        st.caption(f"Document {position + 1} of {total}")

    with next_col:
        if st.button("Next →", use_container_width=True):
            # Modulo arithmetic wraps from the last document to the first.
            state.current_index = (position + 1) % total
            st.rerun()

    # Document image, shown at a fixed width in the middle column.
    image_path = os.path.join(doc_path, "image.jpg")

    try:
        if not os.path.exists(image_path):
            st.info("Image not available for this document")
        else:
            _, center_col, _ = st.columns([1, 2, 1])
            with center_col:
                st.markdown("<div class='image-container'>", unsafe_allow_html=True)
                st.image(image_path, width=500)
                st.markdown("</div>", unsafe_allow_html=True)
    except Exception as err:
        st.error(f"Error loading image: {err}")

    # Horizontal rule between the image and the model outputs.
    st.markdown("<hr style='margin: 2rem 0;'>", unsafe_allow_html=True)

    # Tabs with each model's markdown for the selected document.
    display_model_outputs(doc_path)
184
# Build the page only when this file is run as the main script, so that
# importing the module does not trigger rendering.
if __name__ == "__main__":
    main()