ysif9 committed on
Commit
8c17874
·
verified ·
1 Parent(s): 71b1d53

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +224 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,226 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
 
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import difflib
2
+ import tempfile
3
+ import time
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+
7
  import streamlit as st
8
+ from docling.datamodel.base_models import DocumentStream
9
+ from docling.document_converter import DocumentConverter
10
+ from marker.converters.pdf import PdfConverter
11
+ from marker.models import create_model_dict
12
+ from marker.output import text_from_rendered
13
+ from st_diff_viewer import diff_viewer
14
+
15
+
16
@st.cache_resource
def load_marker_models() -> dict:
    """Create Marker's model dictionary.

    Wrapped in ``st.cache_resource`` so the result of ``create_model_dict()``
    is cached by Streamlit instead of being rebuilt on every call.
    """
    models = create_model_dict()
    return models
20
+
21
def extract_with_marker(pdf_bytes: bytes):
    """Extract Markdown text from a PDF using Marker.

    Marker's converter is called with a file path, so the bytes are first
    written to a temporary ``.pdf`` file. That file is removed again whether
    the conversion succeeds or fails.

    :param pdf_bytes: Raw contents of the PDF document.
    :return: ``(text, processing_time, None)`` on success, where
        ``processing_time`` is the conversion duration in seconds, or
        ``(None, None, error_message)`` if anything raised.
    """
    tmp_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so the
        # converter can open it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is cleaned up too.
            tmp_path = Path(tmp_file.name)
            tmp_file.write(pdf_bytes)

        converter = PdfConverter(
            artifact_dict=load_marker_models(),
        )

        # Time only the conversion itself; perf_counter is monotonic, so the
        # measured interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        rendered = converter(str(tmp_path))
        text, _, _ = text_from_rendered(rendered)
        processing_time = time.perf_counter() - start_time

        return text, processing_time, None

    except Exception as e:
        # Callers expect an error string rather than an exception.
        return None, None, str(e)

    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # result (or error) being returned above.
                pass
50
+
51
+
52
def extract_with_docling(pdf_bytes: bytes, filename: str):
    """Extract Markdown text from a PDF using Docling.

    The bytes are wrapped in an in-memory ``DocumentStream``, so no temporary
    file is written.

    :param pdf_bytes: Raw contents of the PDF document.
    :param filename: Name passed to ``DocumentStream`` for the in-memory document.
    :return: ``(markdown_text, processing_time, None)`` on success, where
        ``processing_time`` is the duration in seconds of conversion plus
        Markdown export, or ``(None, None, error_message)`` if anything raised.
    """
    try:
        source = DocumentStream(name=filename, stream=BytesIO(pdf_bytes))
        converter = DocumentConverter()

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        result = converter.convert(source)
        markdown_text = result.document.export_to_markdown()
        processing_time = time.perf_counter() - start_time

        return markdown_text, processing_time, None

    except Exception as e:
        # Callers expect an error string rather than an exception.
        return None, None, str(e)
76
+
77
+
78
def calculate_similarity(text1: str, text2: str, *, autojunk: bool = False) -> float:
    """Return the character-level similarity ratio of two texts, in [0.0, 1.0].

    ``difflib.SequenceMatcher`` enables an "autojunk" heuristic by default:
    once a sequence has 200 or more items, any item whose duplicates make up
    more than 1% of it is treated as junk. For character sequences of real
    documents that covers most letters, which badly skews the ratio, so the
    heuristic is disabled here by default.

    :param text1: First text.
    :param text2: Second text.
    :param autojunk: Forwarded to ``SequenceMatcher``. Pass ``True`` to trade
        accuracy for speed on very large inputs.
    :return: ``2 * matches / (len(text1) + len(text2))``; ``1.0`` for equal texts.
    """
    # Equal inputs (including two empty strings) need no matching pass.
    if text1 == text2:
        return 1.0
    return difflib.SequenceMatcher(None, text1, text2, autojunk=autojunk).ratio()
81
+
82
+
83
def _speed_comparison(marker_time: float, docling_time: float) -> tuple:
    """Return the ``(label, value)`` pair for the relative-speed metric.

    The percentage is always measured against the faster library's time, so
    the figure means the same thing whichever library wins. A tie is reported
    as Marker being faster by 0.0%.
    """
    if marker_time > docling_time:
        faster_library, faster_time, slower_time = "Docling", docling_time, marker_time
    else:
        faster_library, faster_time, slower_time = "Marker", marker_time, docling_time

    label = f"{faster_library} is faster by"
    if faster_time <= 0:
        # A zero duration would make the ratio undefined; avoid dividing by it.
        return label, "n/a"

    speed_diff = (slower_time - faster_time) / faster_time * 100
    return label, f"{speed_diff:.1f}%"


def main() -> None:
    """Render the Streamlit page comparing Marker and Docling PDF extraction.

    The page asks for a PDF upload, runs ``extract_with_marker`` and then
    ``extract_with_docling`` on the same bytes, and shows:

    * processing time per library and the relative speed difference
      (only when both runs produced a time);
    * similarity ratio, output lengths, both Markdown outputs and a diff
      view (only when both runs produced text);
    * any error message reported by either extractor.

    Without an uploaded file only an informational prompt is shown.
    """
    # NOTE(review): the emoji in the string literals below look mis-encoded
    # in this view of the file; confirm the file is saved as UTF-8.
    st.set_page_config(
        page_title="PDF Extraction Comparison: Marker vs Docling",
        page_icon="πŸ“„",
        layout="wide",
    )

    st.title("πŸ“„ PDF Extraction Comparison: Marker vs Docling")
    st.markdown("Compare PDF-to-Markdown extraction performance between Marker and Docling libraries")

    # File upload
    st.header("πŸ“€ Upload PDF Document")
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type="pdf",
        help="Upload a PDF document to compare extraction performance",
    )

    if uploaded_file is None:
        st.info("πŸ‘† Please upload a PDF file to begin comparison")
        return

    st.success(f"File uploaded: {uploaded_file.name}")
    # getvalue() returns the whole buffer regardless of the current read
    # position, unlike read(), which only returns what is left.
    pdf_bytes = uploaded_file.getvalue()

    st.header("πŸ”„ Processing...")

    # Two columns so each library gets its own status area. The extractions
    # themselves run one after the other, not in parallel.
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("🏷️ Marker Processing")
        marker_placeholder = st.empty()

    with col2:
        st.subheader("πŸ“‹ Docling Processing")
        docling_placeholder = st.empty()

    with marker_placeholder.container():
        with st.spinner("Processing with Marker..."):
            marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)

    with docling_placeholder.container():
        with st.spinner("Processing with Docling..."):
            docling_text, docling_time, docling_error = extract_with_docling(pdf_bytes, uploaded_file.name)

    st.header("πŸ“Š Results")

    # Performance metrics: need a timing from both libraries to compare.
    if marker_time is not None and docling_time is not None:
        metrics_col1, metrics_col2, metrics_col3 = st.columns(3)

        with metrics_col1:
            st.metric("Marker Processing Time", f"{marker_time:.2f}s")

        with metrics_col2:
            st.metric("Docling Processing Time", f"{docling_time:.2f}s")

        with metrics_col3:
            speed_label, speed_value = _speed_comparison(marker_time, docling_time)
            st.metric(speed_label, speed_value)

    # Text comparison: need output from both libraries.
    if marker_text is not None and docling_text is not None:
        similarity = calculate_similarity(marker_text, docling_text)
        st.subheader(f"πŸ“ Text Similarity: {similarity:.1%}")

        len_col1, len_col2 = st.columns(2)
        with len_col1:
            st.info(f"Marker output: {len(marker_text)} characters")
        with len_col2:
            st.info(f"Docling output: {len(docling_text)} characters")

        st.subheader("πŸ“„ Markdown Output Comparison")

        tab1, tab2, tab3 = st.tabs(["Marker Output", "Docling Output", "Diff View"])

        with tab1:
            st.markdown("### Marker Output")
            st.text_area(
                "Marker Markdown",
                marker_text,
                height=800,
                key="marker_output",
            )

        with tab2:
            st.markdown("### Docling Output")
            st.text_area(
                "Docling Markdown",
                docling_text,
                height=800,
                key="docling_output",
            )

        with tab3:
            st.markdown("### Text Differences")
            try:
                diff_viewer(
                    old_text=marker_text,
                    new_text=docling_text,
                    left_title="Marker",
                    right_title="Docling",
                )
            except ImportError as e:
                st.error(f"streamlit-diff-viewer not available: {e}")

    # Report extractor failures last so they appear below any partial results.
    if marker_error:
        st.error(f"Marker Error: {marker_error}")

    if docling_error:
        st.error(f"Docling Error: {docling_error}")
223
+
224
 
225
# Script entry point: build the page only when this module is run as the main script.
if __name__ == "__main__":
    main()