Spaces:

ysif9
/

compare-document-processing

Sleeping

App Files Files Community

Yousif Abdulhafiz committed on Sep 17, 2025

Commit

9a3de0d

1 Parent(s): a19b20f

Add OCR capabilities with PyMuPDF to enhance PDF extraction comparison

Browse files

Files changed (3) hide show

pyproject.toml +1 -0
src/streamlit_app.py +221 -54
uv.lock +34 -0

pyproject.toml CHANGED Viewed

@@ -10,5 +10,6 @@ dependencies = [
   "marker-pdf",
   "streamlit",
   "st-diff-viewer",
 ]

   "marker-pdf",
   "streamlit",
   "st-diff-viewer",
+  "pymupdf>=1.26.4",
 ]

src/streamlit_app.py CHANGED Viewed

@@ -5,19 +5,22 @@ from io import BytesIO
 from pathlib import Path
 import streamlit as st
-from docling.datamodel.base_models import DocumentStream
-from docling.document_converter import DocumentConverter
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
 from st_diff_viewer import diff_viewer
 @st.cache_resource
 def load_marker_models() -> dict:
     """Load Marker models"""
     return create_model_dict()
 def extract_with_marker(pdf_bytes: bytes):
     """Extract text from PDF using Marker"""
@@ -32,7 +35,6 @@ def extract_with_marker(pdf_bytes: bytes):
             artifact_dict=load_marker_models(),
         )
-        # Time the conversion
         start_time = time.time()
         rendered = converter(tmp_file_path)
         text, _, images = text_from_rendered(rendered)
@@ -49,28 +51,116 @@ def extract_with_marker(pdf_bytes: bytes):
         return None, None, str(e)
-def extract_with_docling(pdf_bytes: bytes, filename: str):
-    """Extract text from PDF using Docling"""
     try:
-        # Create DocumentStream from bytes
-        buf = BytesIO(pdf_bytes)
-        source = DocumentStream(name=filename, stream=buf)
-        # Initialize Docling converter
-        converter = DocumentConverter()
-        # Time the conversion
-        start_time = time.time()
-        result = converter.convert(source)
-        markdown_text = result.document.export_to_markdown()
-        end_time = time.time()
-        processing_time = end_time - start_time
-        return markdown_text, processing_time, None
     except Exception as e:
         return None, None, str(e)
@@ -98,7 +188,7 @@ def main() -> None:
     )
     st.title("📄 PDF Extraction Comparison: Marker vs Docling")
-    st.markdown("Compare PDF-to-Markdown extraction performance between Marker and Docling libraries")
     # File upload
     st.header("📤 Upload PDF Document")
@@ -108,39 +198,70 @@ def main() -> None:
         help="Upload a PDF document to compare extraction performance"
     )
     if uploaded_file is not None:
         st.success(f"File uploaded: {uploaded_file.name}")
         pdf_bytes = uploaded_file.read()
-        # Process with both libraries
         st.header("🔄 Processing...")
         # Create columns for parallel processing display
-        col1, col2 = st.columns(2)
         with col1:
             st.subheader("🏷️ Marker Processing")
             marker_placeholder = st.empty()
         with col2:
-            st.subheader("📋 Docling Processing")
-            docling_placeholder = st.empty()
         # Process with Marker
         with marker_placeholder.container():
             with st.spinner("Processing with Marker..."):
                 marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
-        # Process with Docling
-        with docling_placeholder.container():
-            with st.spinner("Processing with Docling..."):
-                docling_text, docling_time, docling_error = extract_with_docling(pdf_bytes, uploaded_file.name)
         # Display results
         st.header("📊 Results")
         # Performance metrics
-        if marker_time is not None and docling_time is not None:
             metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
             with metrics_col1:
@@ -151,35 +272,47 @@ def main() -> None:
             with metrics_col2:
                 st.metric(
-                    "Docling Processing Time",
-                    f"{docling_time:.2f}s"
                 )
             with metrics_col3:
-                speed_diff = ((marker_time - docling_time) / docling_time) * 100
-                faster_library = "Docling" if marker_time > docling_time else "Marker"
                 st.metric(
-                    f"{faster_library} is faster by",
-                    f"{abs(speed_diff):.1f}%"
                 )
         # Text comparison
-        if marker_text is not None and docling_text is not None:
-            # Calculate similarity
-            similarity = calculate_similarity(marker_text, docling_text)
-            st.subheader(f"📝 Text Similarity: {similarity:.1%}")
             # Length comparison
-            len_col1, len_col2 = st.columns(2)
             with len_col1:
                 st.info(f"Marker output: {len(marker_text)} characters")
             with len_col2:
-                st.info(f"Docling output: {len(docling_text)} characters")
-            # Side-by-side comparison
             st.subheader("📄 Markdown Output Comparison")
-            tab1, tab2, tab3 = st.tabs(["Marker Output", "Docling Output", "Diff View"])
             with tab1:
                 st.markdown("### Marker Output")
@@ -191,23 +324,54 @@ def main() -> None:
                 )
             with tab2:
-                st.markdown("### Docling Output")
                 st.text_area(
-                    "Docling Markdown",
-                    docling_text,
                     height=800,
-                    key="docling_output"
                 )
             with tab3:
                 st.markdown("### Text Differences")
                 try:
-                    diff_viewer(
-                        old_text=marker_text,
-                        new_text=docling_text,
-                        left_title="Marker",
-                        right_title="Docling",
-                    )
                 except ImportError as e:
                     st.error(f"streamlit-diff-viewer not available: {e}")
@@ -215,8 +379,11 @@ def main() -> None:
         if marker_error:
             st.error(f"Marker Error: {marker_error}")
-        if docling_error:
-            st.error(f"Docling Error: {docling_error}")
     else:
         st.info("👆 Please upload a PDF file to begin comparison")

 from pathlib import Path
 import streamlit as st
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
+from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
 from st_diff_viewer import diff_viewer
+import fitz
 @st.cache_resource
 def load_marker_models() -> dict:
     """Load Marker models"""
     return create_model_dict()
+@st.cache_data(show_spinner=False)
 def extract_with_marker(pdf_bytes: bytes):
     """Extract text from PDF using Marker"""
             artifact_dict=load_marker_models(),
         )
         start_time = time.time()
         rendered = converter(tmp_file_path)
         text, _, images = text_from_rendered(rendered)
         return None, None, str(e)
+def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[bytes]:
+    """Render every page of a PDF to PNG-encoded bytes using PyMuPDF.
+
+    Despite the function name, no PIL objects are created: each list element
+    is the byte string returned by ``pix.tobytes("png")``.
+
+    Args:
+        pdf_bytes: PDF file content as bytes.
+        dpi: Target resolution; converted below to a zoom factor of dpi / 72.
+
+    Returns:
+        One PNG byte string per page, in the order the document yields pages.
+    """
+    images = []
+    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    # PyMuPDF renders at 72 dpi when no matrix is given, so dpi / 72 is the
+    # scale factor needed to reach the requested resolution on both axes.
+    zoom = float(dpi) / 72.0
+    mat = fitz.Matrix(zoom, zoom)
     try:
+        for page in pdf_doc:
+            pix = page.get_pixmap(matrix=mat)
+            img_data = pix.tobytes("png")
+            # The PNG bytes are stored as-is; the PIL conversion below was
+            # left disabled by the author.
+            # img = Image.open(BytesIO(img_data))
+            images.append(img_data)
+    finally:
+        # Close the document even if rendering a page raises.
+        pdf_doc.close()
+    return images
+@st.cache_data(show_spinner=False)
+def extract_with_docling(pdf_bytes: bytes, filename: str, ocr_engine: str = "EasyOCR", full_ocr_mode: bool = False):
+    """Extract Markdown from a PDF using Docling with configurable OCR options.
+
+    Args:
+        pdf_bytes: PDF file content as bytes
+        filename: Name of the PDF file
+        ocr_engine: OCR engine to use ("Tesseract" selects Tesseract; any
+            other value falls back to EasyOCR)
+        full_ocr_mode: If True, converts pages to images and applies full OCR
+
+    Returns:
+        A ``(markdown_text, processing_time, error)`` tuple. On success
+        ``error`` is None. If any exception is raised inside the function the
+        result is ``(None, None, str(exception))`` instead of propagating it.
+        ``processing_time`` only covers the ``converter.convert`` calls; page
+        rendering and Markdown export happen outside the timed region.
+
+    NOTE(review): the function is wrapped in ``st.cache_data``, so a cache hit
+    presumably returns the stored timing rather than a fresh measurement --
+    confirm this is acceptable for the performance comparison.
+    """
+    try:
+        if full_ocr_mode:
+            # Convert PDF pages to images first
+            # (rendered at 300 dpi, overriding the helper's default of 200)
+            images = pdf_to_images(pdf_bytes, dpi=300)
+            pipeline_options = PdfPipelineOptions()
+            pipeline_options.do_ocr = True
+            # Both engines are asked to OCR the whole page in this mode.
+            if ocr_engine == "Tesseract":
+                pipeline_options.ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
+            else:
+                pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
+            # Initialize converter for images
+            converter = DocumentConverter(
+                format_options={
+                    InputFormat.IMAGE: ImageFormatOption(
+                        pipeline_options=pipeline_options
+                    )
+                }
+            )
+            all_markdown = []
+            total_processing_time = 0.0
+            for i, img in enumerate(images):
+                # img_buffer = BytesIO()
+                # img.save(img_buffer, format='PNG')
+                # `img` is already PNG bytes, so it is wrapped directly.
+                img_bytes = BytesIO(img)
+                # Create DocumentStream for the image
+                img_stream = DocumentStream(
+                    name=f"{filename}_page_{i+1}.png",
+                    stream=img_bytes
+                )
+                # Convert image with OCR
+                start_time = time.time()
+                result = converter.convert(img_stream)
+                end_time = time.time()
+                processing_time = end_time - start_time
+                total_processing_time += processing_time
+                page_markdown = result.document.export_to_markdown()
+                # Pages whose export is empty or whitespace-only are skipped,
+                # so they get no "# Page N" heading in the combined output.
+                if page_markdown.strip():
+                    all_markdown.append(f"# Page {i+1}\n\n{page_markdown}")
+            # Combine all pages
+            markdown_text = "\n\n---\n\n".join(all_markdown)
+            return markdown_text, total_processing_time, None
+        else:
+            # Standard PDF processing
+            buf = BytesIO(pdf_bytes)
+            source = DocumentStream(name=filename, stream=buf)
+            # Configure pipeline options
+            # (do_ocr is left at the PdfPipelineOptions default here, unlike
+            # the full-OCR branch, which sets it to True explicitly)
+            pipeline_options = PdfPipelineOptions()
+            # Configure OCR engine
+            if ocr_engine == "Tesseract":
+                pipeline_options.ocr_options = TesseractOcrOptions()
+            else:
+                pipeline_options.ocr_options = EasyOcrOptions()
+            # Initialize Docling converter with custom options
+            converter = DocumentConverter(
+                format_options={
+                    InputFormat.PDF: PdfFormatOption(
+                        pipeline_options=pipeline_options
+                    )
+                }
+            )
+            start_time = time.time()
+            result = converter.convert(source)
+            end_time = time.time()
+            markdown_text = result.document.export_to_markdown()
+            processing_time = end_time - start_time
+            return markdown_text, processing_time, None
     except Exception as e:
         return None, None, str(e)
     )
     st.title("📄 PDF Extraction Comparison: Marker vs Docling")
+    st.markdown("Compare PDF-to-Markdown extraction performance between **Marker**, **Docling Standard** (PDF text extraction), and **Docling Full OCR** (page-to-image + OCR processing)")
     # File upload
     st.header("📤 Upload PDF Document")
         help="Upload a PDF document to compare extraction performance"
     )
+    # OCR Configuration Section
+    st.header("⚙️ OCR Configuration")
+    ocr_engine = st.selectbox(
+        "OCR Engine",
+        options=["EasyOCR", "Tesseract"],
+        index=0,
+        help="Choose the OCR engine for text extraction. EasyOCR is generally faster, while Tesseract may be more accurate for certain document types."
+    )
+    st.info("📋 **Processing modes**: The app will run both Docling Standard (PDF text extraction) and Docling Full OCR (page-to-image + OCR) modes for comparison.")
     if uploaded_file is not None:
         st.success(f"File uploaded: {uploaded_file.name}")
         pdf_bytes = uploaded_file.read()
+        # Process with all three methods
         st.header("🔄 Processing...")
         # Create columns for parallel processing display
+        col1, col2, col3 = st.columns(3)
         with col1:
             st.subheader("🏷️ Marker Processing")
             marker_placeholder = st.empty()
         with col2:
+            st.subheader("📋 Docling Standard")
+            docling_standard_placeholder = st.empty()
+        with col3:
+            st.subheader("🔍 Docling Full OCR")
+            docling_ocr_placeholder = st.empty()
         # Process with Marker
         with marker_placeholder.container():
             with st.spinner("Processing with Marker..."):
                 marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
+        # Process with Docling Standard Mode
+        with docling_standard_placeholder.container():
+            with st.spinner(f"Processing with Docling Standard ({ocr_engine} OCR)..."):
+                docling_standard_text, docling_standard_time, docling_standard_error = extract_with_docling(
+                    pdf_bytes,
+                    uploaded_file.name,
+                    ocr_engine=ocr_engine,
+                    full_ocr_mode=False
+                )
+        # Process with Docling Full OCR Mode
+        with docling_ocr_placeholder.container():
+            with st.spinner(f"Processing with Docling Full OCR ({ocr_engine} OCR)..."):
+                docling_ocr_text, docling_ocr_time, docling_ocr_error = extract_with_docling(
+                    pdf_bytes,
+                    uploaded_file.name,
+                    ocr_engine=ocr_engine,
+                    full_ocr_mode=True
+                )
         # Display results
         st.header("📊 Results")
         # Performance metrics
+        if marker_time is not None and docling_standard_time is not None and docling_ocr_time is not None:
             metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
             with metrics_col1:
             with metrics_col2:
                 st.metric(
+                    "Docling Standard Time",
+                    f"{docling_standard_time:.2f}s"
                 )
             with metrics_col3:
                 st.metric(
+                    "Docling Full OCR Time",
+                    f"{docling_ocr_time:.2f}s"
                 )
         # Text comparison
+        if marker_text is not None and docling_standard_text is not None and docling_ocr_text is not None:
+            # Calculate similarities between all methods
+            similarity_marker_standard = calculate_similarity(marker_text, docling_standard_text)
+            similarity_marker_ocr = calculate_similarity(marker_text, docling_ocr_text)
+            similarity_standard_ocr = calculate_similarity(docling_standard_text, docling_ocr_text)
+            # Display similarity metrics
+            st.subheader("📝 Text Similarity Comparison")
+            sim_col1, sim_col2, sim_col3 = st.columns(3)
+            with sim_col1:
+                st.metric("Marker ↔ Docling Standard", f"{similarity_marker_standard:.1%}")
+            with sim_col2:
+                st.metric("Marker ↔ Docling Full OCR", f"{similarity_marker_ocr:.1%}")
+            with sim_col3:
+                st.metric("Docling Standard ↔ Full OCR", f"{similarity_standard_ocr:.1%}")
             # Length comparison
+            len_col1, len_col2, len_col3 = st.columns(3)
             with len_col1:
                 st.info(f"Marker output: {len(marker_text)} characters")
             with len_col2:
+                st.info(f"Docling Standard: {len(docling_standard_text)} characters")
+            with len_col3:
+                st.info(f"Docling Full OCR: {len(docling_ocr_text)} characters")
+            # Three-way comparison tabs
             st.subheader("📄 Markdown Output Comparison")
+            tab1, tab2, tab3, tab4 = st.tabs(["Marker Output", "Docling Standard", "Docling Full OCR", "Diff View"])
             with tab1:
                 st.markdown("### Marker Output")
                 )
             with tab2:
+                st.markdown("### Docling Standard Output")
                 st.text_area(
+                    "Docling Standard Markdown",
+                    docling_standard_text,
                     height=800,
+                    key="docling_standard_output"
                 )
             with tab3:
+                st.markdown("### Docling Full OCR Output")
+                st.text_area(
+                    "Docling Full OCR Markdown",
+                    docling_ocr_text,
+                    height=800,
+                    key="docling_ocr_output"
+                )
+            with tab4:
                 st.markdown("### Text Differences")
+                # Allow user to choose which comparison to view
+                diff_option = st.selectbox(
+                    "Choose comparison:",
+                    ["Marker vs Docling Standard", "Marker vs Docling Full OCR", "Docling Standard vs Full OCR"]
+                )
                 try:
+                    if diff_option == "Marker vs Docling Standard":
+                        diff_viewer(
+                            old_text=marker_text,
+                            new_text=docling_standard_text,
+                            left_title="Marker",
+                            right_title="Docling Standard",
+                        )
+                    elif diff_option == "Marker vs Docling Full OCR":
+                        diff_viewer(
+                            old_text=marker_text,
+                            new_text=docling_ocr_text,
+                            left_title="Marker",
+                            right_title="Docling Full OCR",
+                        )
+                    else:  # Docling Standard vs Full OCR
+                        diff_viewer(
+                            old_text=docling_standard_text,
+                            new_text=docling_ocr_text,
+                            left_title="Docling Standard",
+                            right_title="Docling Full OCR",
+                        )
                 except ImportError as e:
                     st.error(f"streamlit-diff-viewer not available: {e}")
         if marker_error:
             st.error(f"Marker Error: {marker_error}")
+        if docling_standard_error:
+            st.error(f"Docling Standard Error: {docling_standard_error}")
+        if docling_ocr_error:
+            st.error(f"Docling Full OCR Error: {docling_ocr_error}")
     else:
         st.info("👆 Please upload a PDF file to begin comparison")

uv.lock CHANGED Viewed

@@ -335,16 +335,22 @@ source = { virtual = "." }
 dependencies = [
     { name = "docling" },
     { name = "marker-pdf" },
     { name = "st-diff-viewer" },
     { name = "streamlit" },
 ]
 [package.metadata]
 requires-dist = [
     { name = "docling" },
     { name = "marker-pdf" },
     { name = "st-diff-viewer" },
     { name = "streamlit" },
 ]
 [[package]]
@@ -1472,6 +1478,21 @@ version = "2.10"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
 [[package]]
 name = "pypdfium2"
 version = "4.30.0"
@@ -2129,6 +2150,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
 ]
 [[package]]
 name = "threadpoolctl"
 version = "3.6.0"

 dependencies = [
     { name = "docling" },
     { name = "marker-pdf" },
+    { name = "pillow" },
+    { name = "pymupdf" },
     { name = "st-diff-viewer" },
     { name = "streamlit" },
+    { name = "tesserocr" },
 ]
 [package.metadata]
 requires-dist = [
     { name = "docling" },
     { name = "marker-pdf" },
+    { name = "pillow", specifier = ">=10.4.0" },
+    { name = "pymupdf", specifier = ">=1.26.4" },
     { name = "st-diff-viewer" },
     { name = "streamlit" },
+    { name = "tesserocr", specifier = ">=2.8.0" },
 ]
 [[package]]
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
+[[package]]
+name = "pymupdf"
+version = "1.26.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/90/35/031556dfc0d332d8e9ed9b61ca105138606d3f8971b9eb02e20118629334/pymupdf-1.26.4.tar.gz", hash = "sha256:be13a066d42bfaed343a488168656637c4d9843ddc63b768dc827c9dfc6b9989", size = 83077563, upload-time = "2025-08-25T14:20:29.499Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/27/ae/3be722886cc7be2093585cd94f466db1199133ab005645a7a567b249560f/pymupdf-1.26.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cb95562a0a63ce906fd788bdad5239063b63068cf4a991684f43acb09052cb99", size = 23061974, upload-time = "2025-08-25T14:16:58.811Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/b0/9a451d837e1fe18ecdbfbc34a6499f153c8a008763229cc634725383a93f/pymupdf-1.26.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:67e9e6b45832c33726651c2a031e9a20108fd9e759140b9e843f934de813a7ff", size = 22410112, upload-time = "2025-08-25T14:17:24.511Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/13/0916e8e02cb5453161fb9d9167c747d0a20d58633e30728645374153f815/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2604f687dd02b6a1b98c81bd8becfc0024899a2d2085adfe3f9e91607721fd22", size = 23454948, upload-time = "2025-08-25T21:20:07.71Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/c6/d3cfafc75d383603884edeabe4821a549345df954a88d79e6764e2c87601/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:973a6dda61ebd34040e4df3753bf004b669017663fbbfdaa294d44eceba98de0", size = 24060686, upload-time = "2025-08-25T14:17:56.536Z" },
+    { url = "https://files.pythonhosted.org/packages/72/08/035e9d22c801e801bba50c6745bc90ba8696a042fe2c68793e28bf0c3b07/pymupdf-1.26.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:299a49797df5b558e695647fa791329ba3911cbbb31ed65f24a6266c118ef1a7", size = 24265046, upload-time = "2025-08-25T14:18:21.238Z" },
+    { url = "https://files.pythonhosted.org/packages/28/8c/c201e4846ec0fb6ae5d52aa3a5d66f9355f0c69fb94230265714df0de65e/pymupdf-1.26.4-cp39-abi3-win32.whl", hash = "sha256:51b38379aad8c71bd7a8dd24d93fbe7580c2a5d9d7e1f9cd29ebbba315aa1bd1", size = 17127332, upload-time = "2025-08-25T14:18:39.132Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491, upload-time = "2025-08-25T14:19:01.104Z" },
+]
 [[package]]
 name = "pypdfium2"
 version = "4.30.0"
     { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
 ]
+[[package]]
+name = "tesserocr"
+version = "2.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4f/d6/145858a1aff0310cdf709b8c5895d43660680202296ce6e5980dd2412d53/tesserocr-2.8.0.tar.gz", hash = "sha256:be518d1b1b5ff54c11aada1e0fd12942509ea70581e0a8b39a2a473a0b2dbd36", size = 72564, upload-time = "2025-02-12T12:41:53.7Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b2/43/1739cf5e2223bf0ea270c933b71763b8a7c4616064e309e660c8e43bec02/tesserocr-2.8.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:44b3396d52379155fd838931b78b044129c7c77a8f02a92574cde626cff9b4a8", size = 4099019, upload-time = "2025-02-12T12:41:39.368Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/9d/7b8a8e29050d90446b81ccc5a3cc3256d62cff145628e718f7286a64dd14/tesserocr-2.8.0-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1edd2302f4a91b5491a4ce3f63e612441adf92fd81b339b85cbedb3b5b40f206", size = 3609710, upload-time = "2025-02-12T12:41:43.128Z" },
+    { url = "https://files.pythonhosted.org/packages/76/0b/b445adba94ccbabfe59e5cd0247285ccc4263103bed8fd54b835a973c200/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b0dd849ce77373f9ac4b54d345b4d7115414e525e57a158e948887d744c6f909", size = 4886946, upload-time = "2025-02-12T12:41:46.594Z" },
+    { url = "https://files.pythonhosted.org/packages/13/e4/bf4ab45d49459d0e9e727603d5ed077552afd252e6e7886259e57fc9f10d/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9ce710a73308964f2ac53f94b4980d2791bb67a82863bb7ef0ca445c1b325aa4", size = 5206055, upload-time = "2025-02-12T12:41:49.217Z" },
+    { url = "https://files.pythonhosted.org/packages/05/11/cf253d8de880f72924084e2570bc9df54e9d0013094c602a85cd962a70ff/tesserocr-2.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7a36af39aaf29a152c629cf62457192944f8854fbdd28395ef92d283e800662", size = 6599015, upload-time = "2025-02-12T12:41:52.017Z" },
+]
 [[package]]
 name = "threadpoolctl"
 version = "3.6.0"