Spaces:
Sleeping
Sleeping
Yousif Abdulhafiz committed on
Commit ·
9a3de0d
1
Parent(s): a19b20f
Add OCR capabilities with PyMuPDF to enhance PDF extraction comparison
Browse files- pyproject.toml +1 -0
- src/streamlit_app.py +221 -54
- uv.lock +34 -0
pyproject.toml
CHANGED
|
@@ -10,5 +10,6 @@ dependencies = [
|
|
| 10 |
"marker-pdf",
|
| 11 |
"streamlit",
|
| 12 |
"st-diff-viewer",
|
|
|
|
| 13 |
]
|
| 14 |
|
|
|
|
| 10 |
"marker-pdf",
|
| 11 |
"streamlit",
|
| 12 |
"st-diff-viewer",
|
| 13 |
+
"pymupdf>=1.26.4",
|
| 14 |
]
|
| 15 |
|
src/streamlit_app.py
CHANGED
|
@@ -5,19 +5,22 @@ from io import BytesIO
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
import streamlit as st
|
| 8 |
-
from docling.datamodel.base_models import DocumentStream
|
| 9 |
-
from docling.document_converter import DocumentConverter
|
|
|
|
| 10 |
from marker.converters.pdf import PdfConverter
|
| 11 |
from marker.models import create_model_dict
|
| 12 |
from marker.output import text_from_rendered
|
| 13 |
from st_diff_viewer import diff_viewer
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
@st.cache_resource
|
| 17 |
def load_marker_models() -> dict:
|
| 18 |
"""Load Marker models"""
|
| 19 |
return create_model_dict()
|
| 20 |
|
|
|
|
| 21 |
def extract_with_marker(pdf_bytes: bytes):
|
| 22 |
"""Extract text from PDF using Marker"""
|
| 23 |
|
|
@@ -32,7 +35,6 @@ def extract_with_marker(pdf_bytes: bytes):
|
|
| 32 |
artifact_dict=load_marker_models(),
|
| 33 |
)
|
| 34 |
|
| 35 |
-
# Time the conversion
|
| 36 |
start_time = time.time()
|
| 37 |
rendered = converter(tmp_file_path)
|
| 38 |
text, _, images = text_from_rendered(rendered)
|
|
@@ -49,28 +51,116 @@ def extract_with_marker(pdf_bytes: bytes):
|
|
| 49 |
return None, None, str(e)
|
| 50 |
|
| 51 |
|
| 52 |
-
def
|
| 53 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
try:
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
source = DocumentStream(name=filename, stream=buf)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
result = converter.convert(source)
|
| 66 |
-
markdown_text = result.document.export_to_markdown()
|
| 67 |
-
end_time = time.time()
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
except Exception as e:
|
| 75 |
return None, None, str(e)
|
| 76 |
|
|
@@ -98,7 +188,7 @@ def main() -> None:
|
|
| 98 |
)
|
| 99 |
|
| 100 |
st.title("📄 PDF Extraction Comparison: Marker vs Docling")
|
| 101 |
-
st.markdown("Compare PDF-to-Markdown extraction performance between Marker and Docling
|
| 102 |
|
| 103 |
# File upload
|
| 104 |
st.header("📤 Upload PDF Document")
|
|
@@ -108,39 +198,70 @@ def main() -> None:
|
|
| 108 |
help="Upload a PDF document to compare extraction performance"
|
| 109 |
)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
if uploaded_file is not None:
|
| 112 |
st.success(f"File uploaded: {uploaded_file.name}")
|
| 113 |
pdf_bytes = uploaded_file.read()
|
| 114 |
|
| 115 |
-
# Process with
|
| 116 |
st.header("🔄 Processing...")
|
| 117 |
|
| 118 |
# Create columns for parallel processing display
|
| 119 |
-
col1, col2 = st.columns(
|
| 120 |
|
| 121 |
with col1:
|
| 122 |
st.subheader("🏷️ Marker Processing")
|
| 123 |
marker_placeholder = st.empty()
|
| 124 |
|
| 125 |
with col2:
|
| 126 |
-
st.subheader("📋 Docling
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
# Process with Marker
|
| 130 |
with marker_placeholder.container():
|
| 131 |
with st.spinner("Processing with Marker..."):
|
| 132 |
marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
|
| 133 |
|
| 134 |
-
# Process with Docling
|
| 135 |
-
with
|
| 136 |
-
with st.spinner("Processing with Docling..."):
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
# Display results
|
| 140 |
st.header("📊 Results")
|
| 141 |
|
| 142 |
# Performance metrics
|
| 143 |
-
if marker_time is not None and
|
| 144 |
metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
|
| 145 |
|
| 146 |
with metrics_col1:
|
|
@@ -151,35 +272,47 @@ def main() -> None:
|
|
| 151 |
|
| 152 |
with metrics_col2:
|
| 153 |
st.metric(
|
| 154 |
-
"Docling
|
| 155 |
-
f"{
|
| 156 |
)
|
| 157 |
|
| 158 |
with metrics_col3:
|
| 159 |
-
speed_diff = ((marker_time - docling_time) / docling_time) * 100
|
| 160 |
-
faster_library = "Docling" if marker_time > docling_time else "Marker"
|
| 161 |
st.metric(
|
| 162 |
-
|
| 163 |
-
f"{
|
| 164 |
)
|
| 165 |
|
| 166 |
# Text comparison
|
| 167 |
-
if marker_text is not None and
|
| 168 |
-
# Calculate
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Length comparison
|
| 173 |
-
len_col1, len_col2 = st.columns(
|
| 174 |
with len_col1:
|
| 175 |
st.info(f"Marker output: {len(marker_text)} characters")
|
| 176 |
with len_col2:
|
| 177 |
-
st.info(f"Docling
|
|
|
|
|
|
|
| 178 |
|
| 179 |
-
#
|
| 180 |
st.subheader("📄 Markdown Output Comparison")
|
| 181 |
|
| 182 |
-
tab1, tab2, tab3 = st.tabs(["Marker Output", "Docling
|
| 183 |
|
| 184 |
with tab1:
|
| 185 |
st.markdown("### Marker Output")
|
|
@@ -191,23 +324,54 @@ def main() -> None:
|
|
| 191 |
)
|
| 192 |
|
| 193 |
with tab2:
|
| 194 |
-
st.markdown("### Docling Output")
|
| 195 |
st.text_area(
|
| 196 |
-
"Docling Markdown",
|
| 197 |
-
|
| 198 |
height=800,
|
| 199 |
-
key="
|
| 200 |
)
|
| 201 |
|
| 202 |
with tab3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
st.markdown("### Text Differences")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
try:
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
except ImportError as e:
|
| 212 |
st.error(f"streamlit-diff-viewer not available: {e}")
|
| 213 |
|
|
@@ -215,8 +379,11 @@ def main() -> None:
|
|
| 215 |
if marker_error:
|
| 216 |
st.error(f"Marker Error: {marker_error}")
|
| 217 |
|
| 218 |
-
if
|
| 219 |
-
st.error(f"Docling Error: {
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
else:
|
| 222 |
st.info("👆 Please upload a PDF file to begin comparison")
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
import streamlit as st
|
| 8 |
+
from docling.datamodel.base_models import DocumentStream, InputFormat
|
| 9 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
|
| 10 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
| 11 |
from marker.converters.pdf import PdfConverter
|
| 12 |
from marker.models import create_model_dict
|
| 13 |
from marker.output import text_from_rendered
|
| 14 |
from st_diff_viewer import diff_viewer
|
| 15 |
|
| 16 |
+
import fitz
|
| 17 |
|
| 18 |
@st.cache_resource
|
| 19 |
def load_marker_models() -> dict:
|
| 20 |
"""Load Marker models"""
|
| 21 |
return create_model_dict()
|
| 22 |
|
| 23 |
+
@st.cache_data(show_spinner=False)
|
| 24 |
def extract_with_marker(pdf_bytes: bytes):
|
| 25 |
"""Extract text from PDF using Marker"""
|
| 26 |
|
|
|
|
| 35 |
artifact_dict=load_marker_models(),
|
| 36 |
)
|
| 37 |
|
|
|
|
| 38 |
start_time = time.time()
|
| 39 |
rendered = converter(tmp_file_path)
|
| 40 |
text, _, images = text_from_rendered(rendered)
|
|
|
|
| 51 |
return None, None, str(e)
|
| 52 |
|
| 53 |
|
| 54 |
+
def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[bytes]:
    """Render every page of a PDF to PNG-encoded bytes using PyMuPDF.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Target rendering resolution; pages are scaled by ``dpi / 72``.

    Returns:
        One PNG byte string per page, in document order. These are encoded
        image bytes, not PIL ``Image`` objects.
    """
    # Build the scaling matrix before opening the document so nothing can
    # fail between acquiring the document handle and entering the try block.
    zoom = float(dpi) / 72.0
    mat = fitz.Matrix(zoom, zoom)

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return [page.get_pixmap(matrix=mat).tobytes("png") for page in pdf_doc]
    finally:
        # Always release the document, even if rendering a page fails.
        pdf_doc.close()
|
| 74 |
+
|
| 75 |
+
def _build_ocr_options(ocr_engine: str, force_full_page_ocr: bool = False):
    """Return the Docling OCR options object for the selected engine.

    Any value other than "Tesseract" selects EasyOCR.
    """
    options_cls = TesseractOcrOptions if ocr_engine == "Tesseract" else EasyOcrOptions
    # Pass the flag only when forcing, so the standard path keeps the
    # library's own defaults untouched.
    if force_full_page_ocr:
        return options_cls(force_full_page_ocr=True)
    return options_cls()


@st.cache_data(show_spinner=False)
def extract_with_docling(pdf_bytes: bytes, filename: str, ocr_engine: str = "EasyOCR", full_ocr_mode: bool = False):
    """Extract Markdown from a PDF using Docling with configurable OCR options.

    Args:
        pdf_bytes: PDF file content as bytes.
        filename: Name of the PDF file; used to name the Docling input streams.
        ocr_engine: OCR engine to use ("Tesseract", otherwise EasyOCR).
        full_ocr_mode: If True, each page is rendered to a PNG at 300 DPI and
            converted as an image with full-page OCR forced; if False, the
            PDF is converted directly.

    Returns:
        A ``(markdown_text, processing_seconds, error)`` tuple. On success
        ``error`` is None. On any exception the first two items are None and
        ``error`` is the exception message. The reported time covers only the
        ``converter.convert`` calls, not page rendering or Markdown export.
    """
    # NOTE(review): failures are returned rather than raised, so
    # st.cache_data will presumably cache the error tuple as well — confirm
    # whether transient failures should be retried on rerun.
    try:
        if full_ocr_mode:
            # Convert PDF pages to images first
            images = pdf_to_images(pdf_bytes, dpi=300)

            pipeline_options = PdfPipelineOptions()
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = _build_ocr_options(
                ocr_engine, force_full_page_ocr=True
            )

            # Initialize converter for images
            converter = DocumentConverter(
                format_options={
                    InputFormat.IMAGE: ImageFormatOption(
                        pipeline_options=pipeline_options
                    )
                }
            )

            page_sections = []
            total_processing_time = 0.0
            for page_number, png_bytes in enumerate(images, start=1):
                img_stream = DocumentStream(
                    name=f"{filename}_page_{page_number}.png",
                    stream=BytesIO(png_bytes),
                )

                # Time only the conversion itself, page by page.
                start_time = time.time()
                result = converter.convert(img_stream)
                total_processing_time += time.time() - start_time

                page_markdown = result.document.export_to_markdown()
                # Pages that produce no text are omitted from the output.
                if page_markdown.strip():
                    page_sections.append(f"# Page {page_number}\n\n{page_markdown}")

            # Combine all pages
            markdown_text = "\n\n---\n\n".join(page_sections)
            return markdown_text, total_processing_time, None

        # Standard PDF processing
        source = DocumentStream(name=filename, stream=BytesIO(pdf_bytes))

        pipeline_options = PdfPipelineOptions()
        pipeline_options.ocr_options = _build_ocr_options(ocr_engine)

        # Initialize Docling converter with custom options
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        start_time = time.time()
        result = converter.convert(source)
        end_time = time.time()
        markdown_text = result.document.export_to_markdown()
        return markdown_text, end_time - start_time, None
    except Exception as e:
        return None, None, str(e)
|
| 166 |
|
|
|
|
| 188 |
)
|
| 189 |
|
| 190 |
st.title("📄 PDF Extraction Comparison: Marker vs Docling")
|
| 191 |
+
st.markdown("Compare PDF-to-Markdown extraction performance between **Marker**, **Docling Standard** (PDF text extraction), and **Docling Full OCR** (page-to-image + OCR processing)")
|
| 192 |
|
| 193 |
# File upload
|
| 194 |
st.header("📤 Upload PDF Document")
|
|
|
|
| 198 |
help="Upload a PDF document to compare extraction performance"
|
| 199 |
)
|
| 200 |
|
| 201 |
+
# OCR Configuration Section
|
| 202 |
+
st.header("⚙️ OCR Configuration")
|
| 203 |
+
|
| 204 |
+
ocr_engine = st.selectbox(
|
| 205 |
+
"OCR Engine",
|
| 206 |
+
options=["EasyOCR", "Tesseract"],
|
| 207 |
+
index=0,
|
| 208 |
+
help="Choose the OCR engine for text extraction. EasyOCR is generally faster, while Tesseract may be more accurate for certain document types."
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
st.info("📋 **Processing modes**: The app will run both Docling Standard (PDF text extraction) and Docling Full OCR (page-to-image + OCR) modes for comparison.")
|
| 212 |
+
|
| 213 |
if uploaded_file is not None:
|
| 214 |
st.success(f"File uploaded: {uploaded_file.name}")
|
| 215 |
pdf_bytes = uploaded_file.read()
|
| 216 |
|
| 217 |
+
# Process with all three methods
|
| 218 |
st.header("🔄 Processing...")
|
| 219 |
|
| 220 |
# Create columns for parallel processing display
|
| 221 |
+
col1, col2, col3 = st.columns(3)
|
| 222 |
|
| 223 |
with col1:
|
| 224 |
st.subheader("🏷️ Marker Processing")
|
| 225 |
marker_placeholder = st.empty()
|
| 226 |
|
| 227 |
with col2:
|
| 228 |
+
st.subheader("📋 Docling Standard")
|
| 229 |
+
docling_standard_placeholder = st.empty()
|
| 230 |
+
|
| 231 |
+
with col3:
|
| 232 |
+
st.subheader("🔍 Docling Full OCR")
|
| 233 |
+
docling_ocr_placeholder = st.empty()
|
| 234 |
|
| 235 |
# Process with Marker
|
| 236 |
with marker_placeholder.container():
|
| 237 |
with st.spinner("Processing with Marker..."):
|
| 238 |
marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
|
| 239 |
|
| 240 |
+
# Process with Docling Standard Mode
|
| 241 |
+
with docling_standard_placeholder.container():
|
| 242 |
+
with st.spinner(f"Processing with Docling Standard ({ocr_engine} OCR)..."):
|
| 243 |
+
docling_standard_text, docling_standard_time, docling_standard_error = extract_with_docling(
|
| 244 |
+
pdf_bytes,
|
| 245 |
+
uploaded_file.name,
|
| 246 |
+
ocr_engine=ocr_engine,
|
| 247 |
+
full_ocr_mode=False
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
# Process with Docling Full OCR Mode
|
| 251 |
+
with docling_ocr_placeholder.container():
|
| 252 |
+
with st.spinner(f"Processing with Docling Full OCR ({ocr_engine} OCR)..."):
|
| 253 |
+
docling_ocr_text, docling_ocr_time, docling_ocr_error = extract_with_docling(
|
| 254 |
+
pdf_bytes,
|
| 255 |
+
uploaded_file.name,
|
| 256 |
+
ocr_engine=ocr_engine,
|
| 257 |
+
full_ocr_mode=True
|
| 258 |
+
)
|
| 259 |
|
| 260 |
# Display results
|
| 261 |
st.header("📊 Results")
|
| 262 |
|
| 263 |
# Performance metrics
|
| 264 |
+
if marker_time is not None and docling_standard_time is not None and docling_ocr_time is not None:
|
| 265 |
metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
|
| 266 |
|
| 267 |
with metrics_col1:
|
|
|
|
| 272 |
|
| 273 |
with metrics_col2:
|
| 274 |
st.metric(
|
| 275 |
+
"Docling Standard Time",
|
| 276 |
+
f"{docling_standard_time:.2f}s"
|
| 277 |
)
|
| 278 |
|
| 279 |
with metrics_col3:
|
|
|
|
|
|
|
| 280 |
st.metric(
|
| 281 |
+
"Docling Full OCR Time",
|
| 282 |
+
f"{docling_ocr_time:.2f}s"
|
| 283 |
)
|
| 284 |
|
| 285 |
# Text comparison
|
| 286 |
+
if marker_text is not None and docling_standard_text is not None and docling_ocr_text is not None:
|
| 287 |
+
# Calculate similarities between all methods
|
| 288 |
+
similarity_marker_standard = calculate_similarity(marker_text, docling_standard_text)
|
| 289 |
+
similarity_marker_ocr = calculate_similarity(marker_text, docling_ocr_text)
|
| 290 |
+
similarity_standard_ocr = calculate_similarity(docling_standard_text, docling_ocr_text)
|
| 291 |
+
|
| 292 |
+
# Display similarity metrics
|
| 293 |
+
st.subheader("📝 Text Similarity Comparison")
|
| 294 |
+
sim_col1, sim_col2, sim_col3 = st.columns(3)
|
| 295 |
+
|
| 296 |
+
with sim_col1:
|
| 297 |
+
st.metric("Marker ↔ Docling Standard", f"{similarity_marker_standard:.1%}")
|
| 298 |
+
with sim_col2:
|
| 299 |
+
st.metric("Marker ↔ Docling Full OCR", f"{similarity_marker_ocr:.1%}")
|
| 300 |
+
with sim_col3:
|
| 301 |
+
st.metric("Docling Standard ↔ Full OCR", f"{similarity_standard_ocr:.1%}")
|
| 302 |
|
| 303 |
# Length comparison
|
| 304 |
+
len_col1, len_col2, len_col3 = st.columns(3)
|
| 305 |
with len_col1:
|
| 306 |
st.info(f"Marker output: {len(marker_text)} characters")
|
| 307 |
with len_col2:
|
| 308 |
+
st.info(f"Docling Standard: {len(docling_standard_text)} characters")
|
| 309 |
+
with len_col3:
|
| 310 |
+
st.info(f"Docling Full OCR: {len(docling_ocr_text)} characters")
|
| 311 |
|
| 312 |
+
# Three-way comparison tabs
|
| 313 |
st.subheader("📄 Markdown Output Comparison")
|
| 314 |
|
| 315 |
+
tab1, tab2, tab3, tab4 = st.tabs(["Marker Output", "Docling Standard", "Docling Full OCR", "Diff View"])
|
| 316 |
|
| 317 |
with tab1:
|
| 318 |
st.markdown("### Marker Output")
|
|
|
|
| 324 |
)
|
| 325 |
|
| 326 |
with tab2:
|
| 327 |
+
st.markdown("### Docling Standard Output")
|
| 328 |
st.text_area(
|
| 329 |
+
"Docling Standard Markdown",
|
| 330 |
+
docling_standard_text,
|
| 331 |
height=800,
|
| 332 |
+
key="docling_standard_output"
|
| 333 |
)
|
| 334 |
|
| 335 |
with tab3:
|
| 336 |
+
st.markdown("### Docling Full OCR Output")
|
| 337 |
+
st.text_area(
|
| 338 |
+
"Docling Full OCR Markdown",
|
| 339 |
+
docling_ocr_text,
|
| 340 |
+
height=800,
|
| 341 |
+
key="docling_ocr_output"
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
with tab4:
|
| 345 |
st.markdown("### Text Differences")
|
| 346 |
+
|
| 347 |
+
# Allow user to choose which comparison to view
|
| 348 |
+
diff_option = st.selectbox(
|
| 349 |
+
"Choose comparison:",
|
| 350 |
+
["Marker vs Docling Standard", "Marker vs Docling Full OCR", "Docling Standard vs Full OCR"]
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
try:
|
| 354 |
+
if diff_option == "Marker vs Docling Standard":
|
| 355 |
+
diff_viewer(
|
| 356 |
+
old_text=marker_text,
|
| 357 |
+
new_text=docling_standard_text,
|
| 358 |
+
left_title="Marker",
|
| 359 |
+
right_title="Docling Standard",
|
| 360 |
+
)
|
| 361 |
+
elif diff_option == "Marker vs Docling Full OCR":
|
| 362 |
+
diff_viewer(
|
| 363 |
+
old_text=marker_text,
|
| 364 |
+
new_text=docling_ocr_text,
|
| 365 |
+
left_title="Marker",
|
| 366 |
+
right_title="Docling Full OCR",
|
| 367 |
+
)
|
| 368 |
+
else: # Docling Standard vs Full OCR
|
| 369 |
+
diff_viewer(
|
| 370 |
+
old_text=docling_standard_text,
|
| 371 |
+
new_text=docling_ocr_text,
|
| 372 |
+
left_title="Docling Standard",
|
| 373 |
+
right_title="Docling Full OCR",
|
| 374 |
+
)
|
| 375 |
except ImportError as e:
|
| 376 |
st.error(f"streamlit-diff-viewer not available: {e}")
|
| 377 |
|
|
|
|
| 379 |
if marker_error:
|
| 380 |
st.error(f"Marker Error: {marker_error}")
|
| 381 |
|
| 382 |
+
if docling_standard_error:
|
| 383 |
+
st.error(f"Docling Standard Error: {docling_standard_error}")
|
| 384 |
+
|
| 385 |
+
if docling_ocr_error:
|
| 386 |
+
st.error(f"Docling Full OCR Error: {docling_ocr_error}")
|
| 387 |
|
| 388 |
else:
|
| 389 |
st.info("👆 Please upload a PDF file to begin comparison")
|
uv.lock
CHANGED
|
@@ -335,16 +335,22 @@ source = { virtual = "." }
|
|
| 335 |
dependencies = [
|
| 336 |
{ name = "docling" },
|
| 337 |
{ name = "marker-pdf" },
|
|
|
|
|
|
|
| 338 |
{ name = "st-diff-viewer" },
|
| 339 |
{ name = "streamlit" },
|
|
|
|
| 340 |
]
|
| 341 |
|
| 342 |
[package.metadata]
|
| 343 |
requires-dist = [
|
| 344 |
{ name = "docling" },
|
| 345 |
{ name = "marker-pdf" },
|
|
|
|
|
|
|
| 346 |
{ name = "st-diff-viewer" },
|
| 347 |
{ name = "streamlit" },
|
|
|
|
| 348 |
]
|
| 349 |
|
| 350 |
[[package]]
|
|
@@ -1472,6 +1478,21 @@ version = "2.10"
|
|
| 1472 |
source = { registry = "https://pypi.org/simple" }
|
| 1473 |
sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
|
| 1474 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1475 |
[[package]]
|
| 1476 |
name = "pypdfium2"
|
| 1477 |
version = "4.30.0"
|
|
@@ -2129,6 +2150,19 @@ wheels = [
|
|
| 2129 |
{ url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
|
| 2130 |
]
|
| 2131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2132 |
[[package]]
|
| 2133 |
name = "threadpoolctl"
|
| 2134 |
version = "3.6.0"
|
|
|
|
| 335 |
dependencies = [
|
| 336 |
{ name = "docling" },
|
| 337 |
{ name = "marker-pdf" },
|
| 338 |
+
{ name = "pillow" },
|
| 339 |
+
{ name = "pymupdf" },
|
| 340 |
{ name = "st-diff-viewer" },
|
| 341 |
{ name = "streamlit" },
|
| 342 |
+
{ name = "tesserocr" },
|
| 343 |
]
|
| 344 |
|
| 345 |
[package.metadata]
|
| 346 |
requires-dist = [
|
| 347 |
{ name = "docling" },
|
| 348 |
{ name = "marker-pdf" },
|
| 349 |
+
{ name = "pillow", specifier = ">=10.4.0" },
|
| 350 |
+
{ name = "pymupdf", specifier = ">=1.26.4" },
|
| 351 |
{ name = "st-diff-viewer" },
|
| 352 |
{ name = "streamlit" },
|
| 353 |
+
{ name = "tesserocr", specifier = ">=2.8.0" },
|
| 354 |
]
|
| 355 |
|
| 356 |
[[package]]
|
|
|
|
| 1478 |
source = { registry = "https://pypi.org/simple" }
|
| 1479 |
sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
|
| 1480 |
|
| 1481 |
+
[[package]]
|
| 1482 |
+
name = "pymupdf"
|
| 1483 |
+
version = "1.26.4"
|
| 1484 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1485 |
+
sdist = { url = "https://files.pythonhosted.org/packages/90/35/031556dfc0d332d8e9ed9b61ca105138606d3f8971b9eb02e20118629334/pymupdf-1.26.4.tar.gz", hash = "sha256:be13a066d42bfaed343a488168656637c4d9843ddc63b768dc827c9dfc6b9989", size = 83077563, upload-time = "2025-08-25T14:20:29.499Z" }
|
| 1486 |
+
wheels = [
|
| 1487 |
+
{ url = "https://files.pythonhosted.org/packages/27/ae/3be722886cc7be2093585cd94f466db1199133ab005645a7a567b249560f/pymupdf-1.26.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cb95562a0a63ce906fd788bdad5239063b63068cf4a991684f43acb09052cb99", size = 23061974, upload-time = "2025-08-25T14:16:58.811Z" },
|
| 1488 |
+
{ url = "https://files.pythonhosted.org/packages/fc/b0/9a451d837e1fe18ecdbfbc34a6499f153c8a008763229cc634725383a93f/pymupdf-1.26.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:67e9e6b45832c33726651c2a031e9a20108fd9e759140b9e843f934de813a7ff", size = 22410112, upload-time = "2025-08-25T14:17:24.511Z" },
|
| 1489 |
+
{ url = "https://files.pythonhosted.org/packages/d8/13/0916e8e02cb5453161fb9d9167c747d0a20d58633e30728645374153f815/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2604f687dd02b6a1b98c81bd8becfc0024899a2d2085adfe3f9e91607721fd22", size = 23454948, upload-time = "2025-08-25T21:20:07.71Z" },
|
| 1490 |
+
{ url = "https://files.pythonhosted.org/packages/4e/c6/d3cfafc75d383603884edeabe4821a549345df954a88d79e6764e2c87601/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:973a6dda61ebd34040e4df3753bf004b669017663fbbfdaa294d44eceba98de0", size = 24060686, upload-time = "2025-08-25T14:17:56.536Z" },
|
| 1491 |
+
{ url = "https://files.pythonhosted.org/packages/72/08/035e9d22c801e801bba50c6745bc90ba8696a042fe2c68793e28bf0c3b07/pymupdf-1.26.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:299a49797df5b558e695647fa791329ba3911cbbb31ed65f24a6266c118ef1a7", size = 24265046, upload-time = "2025-08-25T14:18:21.238Z" },
|
| 1492 |
+
{ url = "https://files.pythonhosted.org/packages/28/8c/c201e4846ec0fb6ae5d52aa3a5d66f9355f0c69fb94230265714df0de65e/pymupdf-1.26.4-cp39-abi3-win32.whl", hash = "sha256:51b38379aad8c71bd7a8dd24d93fbe7580c2a5d9d7e1f9cd29ebbba315aa1bd1", size = 17127332, upload-time = "2025-08-25T14:18:39.132Z" },
|
| 1493 |
+
{ url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491, upload-time = "2025-08-25T14:19:01.104Z" },
|
| 1494 |
+
]
|
| 1495 |
+
|
| 1496 |
[[package]]
|
| 1497 |
name = "pypdfium2"
|
| 1498 |
version = "4.30.0"
|
|
|
|
| 2150 |
{ url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
|
| 2151 |
]
|
| 2152 |
|
| 2153 |
+
[[package]]
|
| 2154 |
+
name = "tesserocr"
|
| 2155 |
+
version = "2.8.0"
|
| 2156 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2157 |
+
sdist = { url = "https://files.pythonhosted.org/packages/4f/d6/145858a1aff0310cdf709b8c5895d43660680202296ce6e5980dd2412d53/tesserocr-2.8.0.tar.gz", hash = "sha256:be518d1b1b5ff54c11aada1e0fd12942509ea70581e0a8b39a2a473a0b2dbd36", size = 72564, upload-time = "2025-02-12T12:41:53.7Z" }
|
| 2158 |
+
wheels = [
|
| 2159 |
+
{ url = "https://files.pythonhosted.org/packages/b2/43/1739cf5e2223bf0ea270c933b71763b8a7c4616064e309e660c8e43bec02/tesserocr-2.8.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:44b3396d52379155fd838931b78b044129c7c77a8f02a92574cde626cff9b4a8", size = 4099019, upload-time = "2025-02-12T12:41:39.368Z" },
|
| 2160 |
+
{ url = "https://files.pythonhosted.org/packages/d9/9d/7b8a8e29050d90446b81ccc5a3cc3256d62cff145628e718f7286a64dd14/tesserocr-2.8.0-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1edd2302f4a91b5491a4ce3f63e612441adf92fd81b339b85cbedb3b5b40f206", size = 3609710, upload-time = "2025-02-12T12:41:43.128Z" },
|
| 2161 |
+
{ url = "https://files.pythonhosted.org/packages/76/0b/b445adba94ccbabfe59e5cd0247285ccc4263103bed8fd54b835a973c200/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b0dd849ce77373f9ac4b54d345b4d7115414e525e57a158e948887d744c6f909", size = 4886946, upload-time = "2025-02-12T12:41:46.594Z" },
|
| 2162 |
+
{ url = "https://files.pythonhosted.org/packages/13/e4/bf4ab45d49459d0e9e727603d5ed077552afd252e6e7886259e57fc9f10d/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9ce710a73308964f2ac53f94b4980d2791bb67a82863bb7ef0ca445c1b325aa4", size = 5206055, upload-time = "2025-02-12T12:41:49.217Z" },
|
| 2163 |
+
{ url = "https://files.pythonhosted.org/packages/05/11/cf253d8de880f72924084e2570bc9df54e9d0013094c602a85cd962a70ff/tesserocr-2.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7a36af39aaf29a152c629cf62457192944f8854fbdd28395ef92d283e800662", size = 6599015, upload-time = "2025-02-12T12:41:52.017Z" },
|
| 2164 |
+
]
|
| 2165 |
+
|
| 2166 |
[[package]]
|
| 2167 |
name = "threadpoolctl"
|
| 2168 |
version = "3.6.0"
|