Yousif Abdulhafiz committed on
Commit
9a3de0d
·
1 Parent(s): a19b20f

Add OCR capabilities with PyMuPDF to enhance PDF extraction comparison

Browse files
Files changed (3) hide show
  1. pyproject.toml +1 -0
  2. src/streamlit_app.py +221 -54
  3. uv.lock +34 -0
pyproject.toml CHANGED
@@ -10,5 +10,6 @@ dependencies = [
10
  "marker-pdf",
11
  "streamlit",
12
  "st-diff-viewer",
 
13
  ]
14
 
 
10
  "marker-pdf",
11
  "streamlit",
12
  "st-diff-viewer",
13
+ "pymupdf>=1.26.4",
14
  ]
15
 
src/streamlit_app.py CHANGED
@@ -5,19 +5,22 @@ from io import BytesIO
5
  from pathlib import Path
6
 
7
  import streamlit as st
8
- from docling.datamodel.base_models import DocumentStream
9
- from docling.document_converter import DocumentConverter
 
10
  from marker.converters.pdf import PdfConverter
11
  from marker.models import create_model_dict
12
  from marker.output import text_from_rendered
13
  from st_diff_viewer import diff_viewer
14
 
 
15
 
16
  @st.cache_resource
17
  def load_marker_models() -> dict:
18
  """Load Marker models"""
19
  return create_model_dict()
20
 
 
21
  def extract_with_marker(pdf_bytes: bytes):
22
  """Extract text from PDF using Marker"""
23
 
@@ -32,7 +35,6 @@ def extract_with_marker(pdf_bytes: bytes):
32
  artifact_dict=load_marker_models(),
33
  )
34
 
35
- # Time the conversion
36
  start_time = time.time()
37
  rendered = converter(tmp_file_path)
38
  text, _, images = text_from_rendered(rendered)
@@ -49,28 +51,116 @@ def extract_with_marker(pdf_bytes: bytes):
49
  return None, None, str(e)
50
 
51
 
52
- def extract_with_docling(pdf_bytes: bytes, filename: str):
53
- """Extract text from PDF using Docling"""
 
 
 
 
 
54
 
55
  try:
56
- # Create DocumentStream from bytes
57
- buf = BytesIO(pdf_bytes)
58
- source = DocumentStream(name=filename, stream=buf)
59
 
60
- # Initialize Docling converter
61
- converter = DocumentConverter()
 
62
 
63
- # Time the conversion
64
- start_time = time.time()
65
- result = converter.convert(source)
66
- markdown_text = result.document.export_to_markdown()
67
- end_time = time.time()
68
 
69
- processing_time = end_time - start_time
 
 
 
 
 
 
 
 
 
 
 
70
 
71
-
72
- return markdown_text, processing_time, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  except Exception as e:
75
  return None, None, str(e)
76
 
@@ -98,7 +188,7 @@ def main() -> None:
98
  )
99
 
100
  st.title("📄 PDF Extraction Comparison: Marker vs Docling")
101
- st.markdown("Compare PDF-to-Markdown extraction performance between Marker and Docling libraries")
102
 
103
  # File upload
104
  st.header("📤 Upload PDF Document")
@@ -108,39 +198,70 @@ def main() -> None:
108
  help="Upload a PDF document to compare extraction performance"
109
  )
110
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  if uploaded_file is not None:
112
  st.success(f"File uploaded: {uploaded_file.name}")
113
  pdf_bytes = uploaded_file.read()
114
 
115
- # Process with both libraries
116
  st.header("🔄 Processing...")
117
 
118
  # Create columns for parallel processing display
119
- col1, col2 = st.columns(2)
120
 
121
  with col1:
122
  st.subheader("🏷️ Marker Processing")
123
  marker_placeholder = st.empty()
124
 
125
  with col2:
126
- st.subheader("📋 Docling Processing")
127
- docling_placeholder = st.empty()
 
 
 
 
128
 
129
  # Process with Marker
130
  with marker_placeholder.container():
131
  with st.spinner("Processing with Marker..."):
132
  marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
133
 
134
- # Process with Docling
135
- with docling_placeholder.container():
136
- with st.spinner("Processing with Docling..."):
137
- docling_text, docling_time, docling_error = extract_with_docling(pdf_bytes, uploaded_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  # Display results
140
  st.header("📊 Results")
141
 
142
  # Performance metrics
143
- if marker_time is not None and docling_time is not None:
144
  metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
145
 
146
  with metrics_col1:
@@ -151,35 +272,47 @@ def main() -> None:
151
 
152
  with metrics_col2:
153
  st.metric(
154
- "Docling Processing Time",
155
- f"{docling_time:.2f}s"
156
  )
157
 
158
  with metrics_col3:
159
- speed_diff = ((marker_time - docling_time) / docling_time) * 100
160
- faster_library = "Docling" if marker_time > docling_time else "Marker"
161
  st.metric(
162
- f"{faster_library} is faster by",
163
- f"{abs(speed_diff):.1f}%"
164
  )
165
 
166
  # Text comparison
167
- if marker_text is not None and docling_text is not None:
168
- # Calculate similarity
169
- similarity = calculate_similarity(marker_text, docling_text)
170
- st.subheader(f"📝 Text Similarity: {similarity:.1%}")
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  # Length comparison
173
- len_col1, len_col2 = st.columns(2)
174
  with len_col1:
175
  st.info(f"Marker output: {len(marker_text)} characters")
176
  with len_col2:
177
- st.info(f"Docling output: {len(docling_text)} characters")
 
 
178
 
179
- # Side-by-side comparison
180
  st.subheader("📄 Markdown Output Comparison")
181
 
182
- tab1, tab2, tab3 = st.tabs(["Marker Output", "Docling Output", "Diff View"])
183
 
184
  with tab1:
185
  st.markdown("### Marker Output")
@@ -191,23 +324,54 @@ def main() -> None:
191
  )
192
 
193
  with tab2:
194
- st.markdown("### Docling Output")
195
  st.text_area(
196
- "Docling Markdown",
197
- docling_text,
198
  height=800,
199
- key="docling_output"
200
  )
201
 
202
  with tab3:
 
 
 
 
 
 
 
 
 
203
  st.markdown("### Text Differences")
 
 
 
 
 
 
 
204
  try:
205
- diff_viewer(
206
- old_text=marker_text,
207
- new_text=docling_text,
208
- left_title="Marker",
209
- right_title="Docling",
210
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  except ImportError as e:
212
  st.error(f"streamlit-diff-viewer not available: {e}")
213
 
@@ -215,8 +379,11 @@ def main() -> None:
215
  if marker_error:
216
  st.error(f"Marker Error: {marker_error}")
217
 
218
- if docling_error:
219
- st.error(f"Docling Error: {docling_error}")
 
 
 
220
 
221
  else:
222
  st.info("👆 Please upload a PDF file to begin comparison")
 
5
  from pathlib import Path
6
 
7
  import streamlit as st
8
+ from docling.datamodel.base_models import DocumentStream, InputFormat
9
+ from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption
10
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TesseractOcrOptions
11
  from marker.converters.pdf import PdfConverter
12
  from marker.models import create_model_dict
13
  from marker.output import text_from_rendered
14
  from st_diff_viewer import diff_viewer
15
 
16
+ import fitz
17
 
18
@st.cache_resource
def load_marker_models() -> dict:
    """Create the Marker model dictionary via ``create_model_dict``."""
    marker_models = create_model_dict()
    return marker_models
22
 
23
+ @st.cache_data(show_spinner=False)
24
  def extract_with_marker(pdf_bytes: bytes):
25
  """Extract text from PDF using Marker"""
26
 
 
35
  artifact_dict=load_marker_models(),
36
  )
37
 
 
38
  start_time = time.time()
39
  rendered = converter(tmp_file_path)
40
  text, _, images = text_from_rendered(rendered)
 
51
  return None, None, str(e)
52
 
53
 
54
def pdf_to_images(pdf_bytes: bytes, dpi: int = 200) -> list[bytes]:
    """Render every page of a PDF to PNG-encoded bytes using PyMuPDF.

    Args:
        pdf_bytes: Raw content of the PDF file.
        dpi: Rendering resolution; must be positive. The zoom factor
            applied to each page is ``dpi / 72``.

    Returns:
        One PNG byte string per page, in page order.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    # Build the scaling matrix before opening the document so that nothing
    # can fail between acquiring the document handle and entering the
    # try/finally that releases it.
    zoom = float(dpi) / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return [
            page.get_pixmap(matrix=matrix).tobytes("png")
            for page in pdf_doc
        ]
    finally:
        pdf_doc.close()
74
+
75
def _build_ocr_options(ocr_engine: str, force_full_page_ocr: bool):
    """Return the Docling OCR options object for the requested engine."""
    # Any value other than "Tesseract" falls back to EasyOCR.
    options_cls = TesseractOcrOptions if ocr_engine == "Tesseract" else EasyOcrOptions
    if force_full_page_ocr:
        return options_cls(force_full_page_ocr=True)
    return options_cls()


@st.cache_data(show_spinner=False)
def _convert_with_docling(
    pdf_bytes: bytes,
    filename: str,
    ocr_engine: str,
    full_ocr_mode: bool,
) -> tuple[str, float]:
    """Run the Docling conversion and return ``(markdown, seconds)``.

    Errors propagate to the caller instead of being returned, so that only
    successful conversions are memoised by ``st.cache_data``.
    """
    if not full_ocr_mode:
        # Standard PDF processing: hand the PDF itself to Docling.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.ocr_options = _build_ocr_options(ocr_engine, False)
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )
        source = DocumentStream(name=filename, stream=BytesIO(pdf_bytes))

        start_time = time.time()
        result = converter.convert(source)
        processing_time = time.time() - start_time
        return result.document.export_to_markdown(), processing_time

    # Full OCR: rasterise each page first, then OCR every page image.
    images = pdf_to_images(pdf_bytes, dpi=300)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = _build_ocr_options(ocr_engine, True)
    converter = DocumentConverter(
        format_options={
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_options=pipeline_options
            )
        }
    )

    page_sections = []
    total_processing_time = 0.0
    for page_number, png_bytes in enumerate(images, start=1):
        img_stream = DocumentStream(
            name=f"{filename}_page_{page_number}.png",
            stream=BytesIO(png_bytes),
        )

        # Only the conversion call itself is timed, page by page.
        start_time = time.time()
        result = converter.convert(img_stream)
        total_processing_time += time.time() - start_time

        page_markdown = result.document.export_to_markdown()
        # Pages whose output is only whitespace are left out of the result.
        if page_markdown.strip():
            page_sections.append(f"# Page {page_number}\n\n{page_markdown}")

    return "\n\n---\n\n".join(page_sections), total_processing_time


def extract_with_docling(pdf_bytes: bytes, filename: str, ocr_engine: str = "EasyOCR", full_ocr_mode: bool = False):
    """Extract text from PDF using Docling with configurable OCR options

    Args:
        pdf_bytes: PDF file content as bytes
        filename: Name of the PDF file
        ocr_engine: OCR engine to use ("EasyOCR" or "Tesseract"); any other
            value is treated as "EasyOCR"
        full_ocr_mode: If True, converts pages to images and applies full OCR

    Returns:
        ``(markdown_text, processing_time, None)`` on success, or
        ``(None, None, error_message)`` if the conversion raised.
    """
    # The cached helper raises on failure and this wrapper turns the
    # exception into the error tuple. Keeping the try/except outside the
    # cached function means a failed run is retried on the next call rather
    # than having its error tuple served from the cache.
    try:
        markdown_text, processing_time = _convert_with_docling(
            pdf_bytes, filename, ocr_engine, full_ocr_mode
        )
    except Exception as e:
        return None, None, str(e)
    return markdown_text, processing_time, None
166
 
 
188
  )
189
 
190
  st.title("📄 PDF Extraction Comparison: Marker vs Docling")
191
+ st.markdown("Compare PDF-to-Markdown extraction performance between **Marker**, **Docling Standard** (PDF text extraction), and **Docling Full OCR** (page-to-image + OCR processing)")
192
 
193
  # File upload
194
  st.header("📤 Upload PDF Document")
 
198
  help="Upload a PDF document to compare extraction performance"
199
  )
200
 
201
+ # OCR Configuration Section
202
+ st.header("⚙️ OCR Configuration")
203
+
204
+ ocr_engine = st.selectbox(
205
+ "OCR Engine",
206
+ options=["EasyOCR", "Tesseract"],
207
+ index=0,
208
+ help="Choose the OCR engine for text extraction. EasyOCR is generally faster, while Tesseract may be more accurate for certain document types."
209
+ )
210
+
211
+ st.info("📋 **Processing modes**: The app will run both Docling Standard (PDF text extraction) and Docling Full OCR (page-to-image + OCR) modes for comparison.")
212
+
213
  if uploaded_file is not None:
214
  st.success(f"File uploaded: {uploaded_file.name}")
215
  pdf_bytes = uploaded_file.read()
216
 
217
+ # Process with all three methods
218
  st.header("🔄 Processing...")
219
 
220
  # Create columns for parallel processing display
221
+ col1, col2, col3 = st.columns(3)
222
 
223
  with col1:
224
  st.subheader("🏷️ Marker Processing")
225
  marker_placeholder = st.empty()
226
 
227
  with col2:
228
+ st.subheader("📋 Docling Standard")
229
+ docling_standard_placeholder = st.empty()
230
+
231
+ with col3:
232
+ st.subheader("🔍 Docling Full OCR")
233
+ docling_ocr_placeholder = st.empty()
234
 
235
  # Process with Marker
236
  with marker_placeholder.container():
237
  with st.spinner("Processing with Marker..."):
238
  marker_text, marker_time, marker_error = extract_with_marker(pdf_bytes)
239
 
240
+ # Process with Docling Standard Mode
241
+ with docling_standard_placeholder.container():
242
+ with st.spinner(f"Processing with Docling Standard ({ocr_engine} OCR)..."):
243
+ docling_standard_text, docling_standard_time, docling_standard_error = extract_with_docling(
244
+ pdf_bytes,
245
+ uploaded_file.name,
246
+ ocr_engine=ocr_engine,
247
+ full_ocr_mode=False
248
+ )
249
+
250
+ # Process with Docling Full OCR Mode
251
+ with docling_ocr_placeholder.container():
252
+ with st.spinner(f"Processing with Docling Full OCR ({ocr_engine} OCR)..."):
253
+ docling_ocr_text, docling_ocr_time, docling_ocr_error = extract_with_docling(
254
+ pdf_bytes,
255
+ uploaded_file.name,
256
+ ocr_engine=ocr_engine,
257
+ full_ocr_mode=True
258
+ )
259
 
260
  # Display results
261
  st.header("📊 Results")
262
 
263
  # Performance metrics
264
+ if marker_time is not None and docling_standard_time is not None and docling_ocr_time is not None:
265
  metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
266
 
267
  with metrics_col1:
 
272
 
273
  with metrics_col2:
274
  st.metric(
275
+ "Docling Standard Time",
276
+ f"{docling_standard_time:.2f}s"
277
  )
278
 
279
  with metrics_col3:
 
 
280
  st.metric(
281
+ "Docling Full OCR Time",
282
+ f"{docling_ocr_time:.2f}s"
283
  )
284
 
285
  # Text comparison
286
+ if marker_text is not None and docling_standard_text is not None and docling_ocr_text is not None:
287
+ # Calculate similarities between all methods
288
+ similarity_marker_standard = calculate_similarity(marker_text, docling_standard_text)
289
+ similarity_marker_ocr = calculate_similarity(marker_text, docling_ocr_text)
290
+ similarity_standard_ocr = calculate_similarity(docling_standard_text, docling_ocr_text)
291
+
292
+ # Display similarity metrics
293
+ st.subheader("📝 Text Similarity Comparison")
294
+ sim_col1, sim_col2, sim_col3 = st.columns(3)
295
+
296
+ with sim_col1:
297
+ st.metric("Marker ↔ Docling Standard", f"{similarity_marker_standard:.1%}")
298
+ with sim_col2:
299
+ st.metric("Marker ↔ Docling Full OCR", f"{similarity_marker_ocr:.1%}")
300
+ with sim_col3:
301
+ st.metric("Docling Standard ↔ Full OCR", f"{similarity_standard_ocr:.1%}")
302
 
303
  # Length comparison
304
+ len_col1, len_col2, len_col3 = st.columns(3)
305
  with len_col1:
306
  st.info(f"Marker output: {len(marker_text)} characters")
307
  with len_col2:
308
+ st.info(f"Docling Standard: {len(docling_standard_text)} characters")
309
+ with len_col3:
310
+ st.info(f"Docling Full OCR: {len(docling_ocr_text)} characters")
311
 
312
+ # Three-way comparison tabs
313
  st.subheader("📄 Markdown Output Comparison")
314
 
315
+ tab1, tab2, tab3, tab4 = st.tabs(["Marker Output", "Docling Standard", "Docling Full OCR", "Diff View"])
316
 
317
  with tab1:
318
  st.markdown("### Marker Output")
 
324
  )
325
 
326
  with tab2:
327
+ st.markdown("### Docling Standard Output")
328
  st.text_area(
329
+ "Docling Standard Markdown",
330
+ docling_standard_text,
331
  height=800,
332
+ key="docling_standard_output"
333
  )
334
 
335
  with tab3:
336
+ st.markdown("### Docling Full OCR Output")
337
+ st.text_area(
338
+ "Docling Full OCR Markdown",
339
+ docling_ocr_text,
340
+ height=800,
341
+ key="docling_ocr_output"
342
+ )
343
+
344
+ with tab4:
345
  st.markdown("### Text Differences")
346
+
347
+ # Allow user to choose which comparison to view
348
+ diff_option = st.selectbox(
349
+ "Choose comparison:",
350
+ ["Marker vs Docling Standard", "Marker vs Docling Full OCR", "Docling Standard vs Full OCR"]
351
+ )
352
+
353
  try:
354
+ if diff_option == "Marker vs Docling Standard":
355
+ diff_viewer(
356
+ old_text=marker_text,
357
+ new_text=docling_standard_text,
358
+ left_title="Marker",
359
+ right_title="Docling Standard",
360
+ )
361
+ elif diff_option == "Marker vs Docling Full OCR":
362
+ diff_viewer(
363
+ old_text=marker_text,
364
+ new_text=docling_ocr_text,
365
+ left_title="Marker",
366
+ right_title="Docling Full OCR",
367
+ )
368
+ else: # Docling Standard vs Full OCR
369
+ diff_viewer(
370
+ old_text=docling_standard_text,
371
+ new_text=docling_ocr_text,
372
+ left_title="Docling Standard",
373
+ right_title="Docling Full OCR",
374
+ )
375
  except ImportError as e:
376
  st.error(f"streamlit-diff-viewer not available: {e}")
377
 
 
379
  if marker_error:
380
  st.error(f"Marker Error: {marker_error}")
381
 
382
+ if docling_standard_error:
383
+ st.error(f"Docling Standard Error: {docling_standard_error}")
384
+
385
+ if docling_ocr_error:
386
+ st.error(f"Docling Full OCR Error: {docling_ocr_error}")
387
 
388
  else:
389
  st.info("👆 Please upload a PDF file to begin comparison")
uv.lock CHANGED
@@ -335,16 +335,22 @@ source = { virtual = "." }
335
  dependencies = [
336
  { name = "docling" },
337
  { name = "marker-pdf" },
 
 
338
  { name = "st-diff-viewer" },
339
  { name = "streamlit" },
 
340
  ]
341
 
342
  [package.metadata]
343
  requires-dist = [
344
  { name = "docling" },
345
  { name = "marker-pdf" },
 
 
346
  { name = "st-diff-viewer" },
347
  { name = "streamlit" },
 
348
  ]
349
 
350
  [[package]]
@@ -1472,6 +1478,21 @@ version = "2.10"
1472
  source = { registry = "https://pypi.org/simple" }
1473
  sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
1474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1475
  [[package]]
1476
  name = "pypdfium2"
1477
  version = "4.30.0"
@@ -2129,6 +2150,19 @@ wheels = [
2129
  { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
2130
  ]
2131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2132
  [[package]]
2133
  name = "threadpoolctl"
2134
  version = "3.6.0"
 
335
  dependencies = [
336
  { name = "docling" },
337
  { name = "marker-pdf" },
338
+ { name = "pillow" },
339
+ { name = "pymupdf" },
340
  { name = "st-diff-viewer" },
341
  { name = "streamlit" },
342
+ { name = "tesserocr" },
343
  ]
344
 
345
  [package.metadata]
346
  requires-dist = [
347
  { name = "docling" },
348
  { name = "marker-pdf" },
349
+ { name = "pillow", specifier = ">=10.4.0" },
350
+ { name = "pymupdf", specifier = ">=1.26.4" },
351
  { name = "st-diff-viewer" },
352
  { name = "streamlit" },
353
+ { name = "tesserocr", specifier = ">=2.8.0" },
354
  ]
355
 
356
  [[package]]
 
1478
  source = { registry = "https://pypi.org/simple" }
1479
  sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597, upload-time = "2021-04-06T07:56:07.854Z" }
1480
 
1481
+ [[package]]
1482
+ name = "pymupdf"
1483
+ version = "1.26.4"
1484
+ source = { registry = "https://pypi.org/simple" }
1485
+ sdist = { url = "https://files.pythonhosted.org/packages/90/35/031556dfc0d332d8e9ed9b61ca105138606d3f8971b9eb02e20118629334/pymupdf-1.26.4.tar.gz", hash = "sha256:be13a066d42bfaed343a488168656637c4d9843ddc63b768dc827c9dfc6b9989", size = 83077563, upload-time = "2025-08-25T14:20:29.499Z" }
1486
+ wheels = [
1487
+ { url = "https://files.pythonhosted.org/packages/27/ae/3be722886cc7be2093585cd94f466db1199133ab005645a7a567b249560f/pymupdf-1.26.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cb95562a0a63ce906fd788bdad5239063b63068cf4a991684f43acb09052cb99", size = 23061974, upload-time = "2025-08-25T14:16:58.811Z" },
1488
+ { url = "https://files.pythonhosted.org/packages/fc/b0/9a451d837e1fe18ecdbfbc34a6499f153c8a008763229cc634725383a93f/pymupdf-1.26.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:67e9e6b45832c33726651c2a031e9a20108fd9e759140b9e843f934de813a7ff", size = 22410112, upload-time = "2025-08-25T14:17:24.511Z" },
1489
+ { url = "https://files.pythonhosted.org/packages/d8/13/0916e8e02cb5453161fb9d9167c747d0a20d58633e30728645374153f815/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2604f687dd02b6a1b98c81bd8becfc0024899a2d2085adfe3f9e91607721fd22", size = 23454948, upload-time = "2025-08-25T21:20:07.71Z" },
1490
+ { url = "https://files.pythonhosted.org/packages/4e/c6/d3cfafc75d383603884edeabe4821a549345df954a88d79e6764e2c87601/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:973a6dda61ebd34040e4df3753bf004b669017663fbbfdaa294d44eceba98de0", size = 24060686, upload-time = "2025-08-25T14:17:56.536Z" },
1491
+ { url = "https://files.pythonhosted.org/packages/72/08/035e9d22c801e801bba50c6745bc90ba8696a042fe2c68793e28bf0c3b07/pymupdf-1.26.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:299a49797df5b558e695647fa791329ba3911cbbb31ed65f24a6266c118ef1a7", size = 24265046, upload-time = "2025-08-25T14:18:21.238Z" },
1492
+ { url = "https://files.pythonhosted.org/packages/28/8c/c201e4846ec0fb6ae5d52aa3a5d66f9355f0c69fb94230265714df0de65e/pymupdf-1.26.4-cp39-abi3-win32.whl", hash = "sha256:51b38379aad8c71bd7a8dd24d93fbe7580c2a5d9d7e1f9cd29ebbba315aa1bd1", size = 17127332, upload-time = "2025-08-25T14:18:39.132Z" },
1493
+ { url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491, upload-time = "2025-08-25T14:19:01.104Z" },
1494
+ ]
1495
+
1496
  [[package]]
1497
  name = "pypdfium2"
1498
  version = "4.30.0"
 
2150
  { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" },
2151
  ]
2152
 
2153
+ [[package]]
2154
+ name = "tesserocr"
2155
+ version = "2.8.0"
2156
+ source = { registry = "https://pypi.org/simple" }
2157
+ sdist = { url = "https://files.pythonhosted.org/packages/4f/d6/145858a1aff0310cdf709b8c5895d43660680202296ce6e5980dd2412d53/tesserocr-2.8.0.tar.gz", hash = "sha256:be518d1b1b5ff54c11aada1e0fd12942509ea70581e0a8b39a2a473a0b2dbd36", size = 72564, upload-time = "2025-02-12T12:41:53.7Z" }
2158
+ wheels = [
2159
+ { url = "https://files.pythonhosted.org/packages/b2/43/1739cf5e2223bf0ea270c933b71763b8a7c4616064e309e660c8e43bec02/tesserocr-2.8.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:44b3396d52379155fd838931b78b044129c7c77a8f02a92574cde626cff9b4a8", size = 4099019, upload-time = "2025-02-12T12:41:39.368Z" },
2160
+ { url = "https://files.pythonhosted.org/packages/d9/9d/7b8a8e29050d90446b81ccc5a3cc3256d62cff145628e718f7286a64dd14/tesserocr-2.8.0-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1edd2302f4a91b5491a4ce3f63e612441adf92fd81b339b85cbedb3b5b40f206", size = 3609710, upload-time = "2025-02-12T12:41:43.128Z" },
2161
+ { url = "https://files.pythonhosted.org/packages/76/0b/b445adba94ccbabfe59e5cd0247285ccc4263103bed8fd54b835a973c200/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b0dd849ce77373f9ac4b54d345b4d7115414e525e57a158e948887d744c6f909", size = 4886946, upload-time = "2025-02-12T12:41:46.594Z" },
2162
+ { url = "https://files.pythonhosted.org/packages/13/e4/bf4ab45d49459d0e9e727603d5ed077552afd252e6e7886259e57fc9f10d/tesserocr-2.8.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9ce710a73308964f2ac53f94b4980d2791bb67a82863bb7ef0ca445c1b325aa4", size = 5206055, upload-time = "2025-02-12T12:41:49.217Z" },
2163
+ { url = "https://files.pythonhosted.org/packages/05/11/cf253d8de880f72924084e2570bc9df54e9d0013094c602a85cd962a70ff/tesserocr-2.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a7a36af39aaf29a152c629cf62457192944f8854fbdd28395ef92d283e800662", size = 6599015, upload-time = "2025-02-12T12:41:52.017Z" },
2164
+ ]
2165
+
2166
  [[package]]
2167
  name = "threadpoolctl"
2168
  version = "3.6.0"