# NOTE: stray page-status text ("Spaces: Sleeping") captured along with this file; it is not part of the program.
import difflib
import hashlib
import tempfile
import time
from io import BytesIO
from pathlib import Path

import fitz
import streamlit as st
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from st_diff_viewer import diff_viewer
def load_marker_models() -> dict:
    """Build and return the model artifacts used by Marker's converter.

    Delegates to ``create_model_dict`` with its default arguments. Nothing is
    cached at this level, so every call builds the dictionary again.

    NOTE(review): if rebuilding per call proves expensive, consider wrapping
    this loader with Streamlit's resource cache -- confirm against usage.
    """
    model_artifacts = create_model_dict()
    return model_artifacts
def extract_with_marker(pdf_bytes: bytes):
    """Convert a PDF to text with Marker and time the conversion.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A ``(text, processing_time, error)`` tuple. On success ``text`` is the
        extracted text, ``processing_time`` is the elapsed wall-clock time in
        seconds and ``error`` is ``None``. On any failure the first two items
        are ``None`` and ``error`` holds the exception message.
    """
    tmp_file_path = None
    try:
        # Marker's converter is invoked with a file path, so the bytes are
        # written to a named temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is cleaned up too.
            tmp_file_path = tmp_file.name
            tmp_file.write(pdf_bytes)

        converter = PdfConverter(
            artifact_dict=load_marker_models(),
        )

        # Only conversion and text rendering are timed; building the
        # converter (and its models) above is deliberately excluded.
        start_time = time.time()
        rendered = converter(tmp_file_path)
        text, _, _ = text_from_rendered(rendered)
        processing_time = time.time() - start_time

        return text, processing_time, None
    except Exception as e:
        return None, None, str(e)
    finally:
        # Remove the temp file on success AND on failure; it was created with
        # delete=False, so nothing else will ever remove it.
        if tmp_file_path is not None:
            try:
                Path(tmp_file_path).unlink()
            except OSError:
                # Best-effort cleanup must not mask the extraction outcome.
                pass
def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[bytes]:
    """Render every page of a PDF to PNG-encoded bytes using PyMuPDF.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Target rendering resolution; must be positive.

    Returns:
        One PNG byte string per page, in page order.

    Raises:
        ValueError: If ``dpi`` is zero or negative.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    # The scale factor is expressed relative to 72 units per inch.
    zoom = float(dpi) / 72.0
    mat = fitz.Matrix(zoom, zoom)

    # Open the document only after the matrix exists, so nothing between
    # opening and the try/finally can leak the handle.
    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return [page.get_pixmap(matrix=mat).tobytes("png") for page in pdf_doc]
    finally:
        pdf_doc.close()
def extract_with_docling(pdf_bytes: bytes, filename: str, ocr_engine: str = "EasyOCR", full_ocr_mode: bool = False):
    """Convert a PDF to Markdown with Docling and time the conversion.

    Args:
        pdf_bytes: PDF file content as bytes.
        filename: Name of the PDF file; used to name the streams given to Docling.
        ocr_engine: ``"Tesseract"`` selects Tesseract options; any other value
            selects EasyOCR options.
        full_ocr_mode: If True, pages are rendered to images at 300 dpi and each
            image is converted with forced full-page OCR. If False, the PDF
            itself is handed to Docling.

    Returns:
        A ``(markdown, processing_time, error)`` tuple. On success ``error`` is
        ``None``; on any exception the first two items are ``None`` and
        ``error`` is the exception message. Only the ``convert`` calls are timed.
    """
    use_tesseract = ocr_engine == "Tesseract"
    try:
        options = PdfPipelineOptions()

        if not full_ocr_mode:
            # Standard path: Docling receives the PDF bytes directly.
            options.ocr_options = TesseractOcrOptions() if use_tesseract else EasyOcrOptions()
            pdf_converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=options),
                }
            )
            document = DocumentStream(name=filename, stream=BytesIO(pdf_bytes))

            started = time.time()
            conversion = pdf_converter.convert(document)
            elapsed = time.time() - started

            return conversion.document.export_to_markdown(), elapsed, None

        # Full-OCR path: rasterize each page, then OCR the images one by one.
        page_pngs = pdf_to_images(pdf_bytes, dpi=300)
        options.do_ocr = True
        if use_tesseract:
            options.ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
        else:
            options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
        image_converter = DocumentConverter(
            format_options={
                InputFormat.IMAGE: ImageFormatOption(pipeline_options=options),
            }
        )

        sections = []
        total_elapsed = 0.0
        for page_number, png in enumerate(page_pngs, start=1):
            page_stream = DocumentStream(
                name=f"{filename}_page_{page_number}.png",
                stream=BytesIO(png),
            )

            started = time.time()
            conversion = image_converter.convert(page_stream)
            total_elapsed += time.time() - started

            page_markdown = conversion.document.export_to_markdown()
            # Pages that produced only whitespace are left out of the output.
            if page_markdown.strip():
                sections.append(f"# Page {page_number}\n\n{page_markdown}")

        return "\n\n---\n\n".join(sections), total_elapsed, None
    except Exception as e:
        return None, None, str(e)
def calculate_similarity(text1: str, text2: str) -> float:
    """Return the ``difflib`` similarity ratio of two texts.

    The texts are compared as character sequences with ``SequenceMatcher``'s
    default settings and no junk filter.

    NOTE(review): difflib's default ``autojunk`` heuristic applies once the
    second sequence reaches 200 items and can skew the ratio for long
    documents -- confirm this is acceptable for the texts compared here.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=text1, b=text2)
    return matcher.ratio()
_EXTRACTION_CACHE_KEY = "extraction_cache"


def _extraction_results_for(doc_id: tuple) -> dict:
    """Return this session's result cache, reset whenever the document changes."""
    if (
        _EXTRACTION_CACHE_KEY not in st.session_state
        or st.session_state[_EXTRACTION_CACHE_KEY]["doc_id"] != doc_id
    ):
        st.session_state[_EXTRACTION_CACHE_KEY] = {"doc_id": doc_id, "results": {}}
    return st.session_state[_EXTRACTION_CACHE_KEY]["results"]


def _run_cached(results: dict, key: tuple, placeholder, message: str, extract):
    """Return the cached ``(text, time, error)`` for ``key`` or compute it under a spinner."""
    if key in results:
        return results[key]
    with placeholder.container():
        with st.spinner(message):
            outcome = extract()
    # Only successes are stored, so a failed extraction is retried on the next rerun.
    if outcome[2] is None:
        results[key] = outcome
    return outcome


def main() -> None:
    """
    Streamlit entry point for comparing PDF-to-Markdown extraction between Marker,
    Docling Standard and Docling Full OCR.

    The function renders the upload and OCR-engine widgets, runs the three extractions
    on the uploaded PDF, and shows processing times, pairwise similarity, output lengths,
    the raw outputs and a side-by-side diff. Times are shown only when all three
    extractions report one, and the comparison only when all three return text.
    Extraction errors are shown as error messages.

    Streamlit re-executes the script on every widget interaction, so successful
    extraction results are kept in ``st.session_state`` (keyed by the uploaded
    document and, for Docling, the OCR engine) and reused instead of recomputed.
    """
    # NOTE(review): several icon glyphs in the UI strings below look mis-encoded
    # (e.g. "π"); they are left untouched because the intended characters cannot
    # be determined from this file.
    st.set_page_config(
        page_title="PDF Extraction Comparison: Marker vs Docling",
        page_icon="π",
        layout="wide"
    )
    st.title("π PDF Extraction Comparison: Marker vs Docling")
    st.markdown("Compare PDF-to-Markdown extraction performance between **Marker**, **Docling Standard** (PDF text extraction), and **Docling Full OCR** (page-to-image + OCR processing)")

    # File upload
    st.header("π€ Upload PDF Document")
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type="pdf",
        help="Upload a PDF document to compare extraction performance"
    )

    # OCR Configuration Section
    st.header("βοΈ OCR Configuration")
    ocr_engine = st.selectbox(
        "OCR Engine",
        options=["EasyOCR", "Tesseract"],
        index=0,
        help="Choose the OCR engine for text extraction. EasyOCR is generally faster, while Tesseract may be more accurate for certain document types."
    )
    st.info("π **Processing modes**: The app will run both Docling Standard (PDF text extraction) and Docling Full OCR (page-to-image + OCR) modes for comparison.")

    if uploaded_file is None:
        st.info("π Please upload a PDF file to begin comparison")
        return

    st.success(f"File uploaded: {uploaded_file.name}")
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so a rerun never ends up with a partial or empty read.
    pdf_bytes = uploaded_file.getvalue()
    doc_id = (uploaded_file.name, hashlib.sha256(pdf_bytes).hexdigest())
    results = _extraction_results_for(doc_id)

    # Process with all three methods
    st.header("π Processing...")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.subheader("π·οΈ Marker Processing")
        marker_placeholder = st.empty()
    with col2:
        st.subheader("π Docling Standard")
        docling_standard_placeholder = st.empty()
    with col3:
        st.subheader("π Docling Full OCR")
        docling_ocr_placeholder = st.empty()

    # Marker does not take the OCR engine, so its cache key omits it.
    marker_text, marker_time, marker_error = _run_cached(
        results,
        ("marker",),
        marker_placeholder,
        "Processing with Marker...",
        lambda: extract_with_marker(pdf_bytes),
    )
    docling_standard_text, docling_standard_time, docling_standard_error = _run_cached(
        results,
        ("docling_standard", ocr_engine),
        docling_standard_placeholder,
        f"Processing with Docling Standard ({ocr_engine} OCR)...",
        lambda: extract_with_docling(
            pdf_bytes,
            uploaded_file.name,
            ocr_engine=ocr_engine,
            full_ocr_mode=False
        ),
    )
    docling_ocr_text, docling_ocr_time, docling_ocr_error = _run_cached(
        results,
        ("docling_ocr", ocr_engine),
        docling_ocr_placeholder,
        f"Processing with Docling Full OCR ({ocr_engine} OCR)...",
        lambda: extract_with_docling(
            pdf_bytes,
            uploaded_file.name,
            ocr_engine=ocr_engine,
            full_ocr_mode=True
        ),
    )

    # Display results
    st.header("π Results")

    # Performance metrics (shown only when every method produced a timing)
    timings = (
        ("Marker Processing Time", marker_time),
        ("Docling Standard Time", docling_standard_time),
        ("Docling Full OCR Time", docling_ocr_time),
    )
    if all(elapsed is not None for _, elapsed in timings):
        for column, (label, elapsed) in zip(st.columns(3), timings):
            with column:
                st.metric(label, f"{elapsed:.2f}s")

    # Text comparison (shown only when every method produced text)
    if marker_text is not None and docling_standard_text is not None and docling_ocr_text is not None:
        similarity_marker_standard = calculate_similarity(marker_text, docling_standard_text)
        similarity_marker_ocr = calculate_similarity(marker_text, docling_ocr_text)
        similarity_standard_ocr = calculate_similarity(docling_standard_text, docling_ocr_text)

        # Display similarity metrics
        st.subheader("π Text Similarity Comparison")
        sim_col1, sim_col2, sim_col3 = st.columns(3)
        with sim_col1:
            st.metric("Marker β Docling Standard", f"{similarity_marker_standard:.1%}")
        with sim_col2:
            st.metric("Marker β Docling Full OCR", f"{similarity_marker_ocr:.1%}")
        with sim_col3:
            st.metric("Docling Standard β Full OCR", f"{similarity_standard_ocr:.1%}")

        # Length comparison
        len_col1, len_col2, len_col3 = st.columns(3)
        with len_col1:
            st.info(f"Marker output: {len(marker_text)} characters")
        with len_col2:
            st.info(f"Docling Standard: {len(docling_standard_text)} characters")
        with len_col3:
            st.info(f"Docling Full OCR: {len(docling_ocr_text)} characters")

        # Three-way comparison tabs
        st.subheader("π Markdown Output Comparison")
        tab1, tab2, tab3, tab4 = st.tabs(["Marker Output", "Docling Standard", "Docling Full OCR", "Diff View"])
        with tab1:
            st.markdown("### Marker Output")
            st.text_area(
                "Marker Markdown",
                marker_text,
                height=800,
                key="marker_output"
            )
        with tab2:
            st.markdown("### Docling Standard Output")
            st.text_area(
                "Docling Standard Markdown",
                docling_standard_text,
                height=800,
                key="docling_standard_output"
            )
        with tab3:
            st.markdown("### Docling Full OCR Output")
            st.text_area(
                "Docling Full OCR Markdown",
                docling_ocr_text,
                height=800,
                key="docling_ocr_output"
            )
        with tab4:
            st.markdown("### Text Differences")
            # Each option maps to (left title, left text, right title, right text).
            comparisons = {
                "Marker vs Docling Standard": ("Marker", marker_text, "Docling Standard", docling_standard_text),
                "Marker vs Docling Full OCR": ("Marker", marker_text, "Docling Full OCR", docling_ocr_text),
                "Docling Standard vs Full OCR": ("Docling Standard", docling_standard_text, "Docling Full OCR", docling_ocr_text),
            }
            diff_option = st.selectbox(
                "Choose comparison:",
                list(comparisons)
            )
            left_title, old_text, right_title, new_text = comparisons[diff_option]
            try:
                diff_viewer(
                    old_text=old_text,
                    new_text=new_text,
                    left_title=left_title,
                    right_title=right_title,
                )
            except ImportError as e:
                st.error(f"streamlit-diff-viewer not available: {e}")

    # Error handling
    if marker_error:
        st.error(f"Marker Error: {marker_error}")
    if docling_standard_error:
        st.error(f"Docling Standard Error: {docling_standard_error}")
    if docling_ocr_error:
        st.error(f"Docling Full OCR Error: {docling_ocr_error}")
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()