Vik Paruchuri committed on
Commit
7e40faa
·
1 Parent(s): b28bf13

Bump pdftext, fix tests

Browse files
README.md CHANGED
@@ -66,7 +66,7 @@ Install with:
66
  pip install marker-pdf
67
  ```
68
 
69
- By default, marker will work on PDFs and images. If you also want to use marker on XLSX, DOCX, HTML, etc, you will need to run:
70
 
71
  ```shell
72
  pip install marker-pdf[full]
 
66
  pip install marker-pdf
67
  ```
68
 
69
+ If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
70
 
71
  ```shell
72
  pip install marker-pdf[full]
marker/processors/llm/llm_equation.py CHANGED
@@ -74,8 +74,12 @@ analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
74
  for block_data in blocks:
75
  block = block_data["block"]
76
  page = block_data["page"]
 
77
  # If we redo inline math, we redo all equations
78
- if block.polygon.height / page.polygon.height < self.min_equation_height and not self.redo_inline_math:
 
 
 
79
  continue
80
  out_blocks.append(block_data)
81
  return out_blocks
 
74
  for block_data in blocks:
75
  block = block_data["block"]
76
  page = block_data["page"]
77
+
78
  # If we redo inline math, we redo all equations
79
+ if all([
80
+ block.polygon.height / page.polygon.height < self.min_equation_height,
81
+ not self.redo_inline_math
82
+ ]):
83
  continue
84
  out_blocks.append(block_data)
85
  return out_blocks
marker/providers/document.py CHANGED
@@ -1,11 +1,9 @@
1
  import base64
2
- import logging
3
  import os
4
  import re
5
  import tempfile
6
  from io import BytesIO
7
 
8
- import mammoth
9
  from PIL import Image
10
 
11
  from marker.providers.pdf import PdfProvider
@@ -69,6 +67,7 @@ class DocumentProvider(PdfProvider):
69
 
70
  def convert_docx_to_pdf(self, filepath: str):
71
  from weasyprint import CSS, HTML
 
72
 
73
  with open(filepath, "rb") as docx_file:
74
  # we convert the docx to HTML
 
1
  import base64
 
2
  import os
3
  import re
4
  import tempfile
5
  from io import BytesIO
6
 
 
7
  from PIL import Image
8
 
9
  from marker.providers.pdf import PdfProvider
 
67
 
68
  def convert_docx_to_pdf(self, filepath: str):
69
  from weasyprint import CSS, HTML
70
+ import mammoth
71
 
72
  with open(filepath, "rb") as docx_file:
73
  # we convert the docx to HTML
marker/providers/epub.py CHANGED
@@ -2,9 +2,7 @@ import base64
2
  import os
3
  import tempfile
4
 
5
- import ebooklib
6
  from bs4 import BeautifulSoup
7
- from ebooklib import epub
8
 
9
  from marker.providers.pdf import PdfProvider
10
 
@@ -67,6 +65,8 @@ class EpubProvider(PdfProvider):
67
 
68
  def convert_epub_to_pdf(self, filepath):
69
  from weasyprint import CSS, HTML
 
 
70
 
71
  ebook = epub.read_epub(filepath)
72
 
@@ -104,7 +104,7 @@ class EpubProvider(PdfProvider):
104
  full_style = ''.join([css]) # + styles)
105
 
106
  # we convert the epub to HTML
107
- result = HTML(string=html_content, base_url=filepath).write_pdf(
108
  self.temp_pdf_path,
109
  stylesheets=[CSS(string=full_style), self.get_font_css()]
110
  )
 
2
  import os
3
  import tempfile
4
 
 
5
  from bs4 import BeautifulSoup
 
6
 
7
  from marker.providers.pdf import PdfProvider
8
 
 
65
 
66
  def convert_epub_to_pdf(self, filepath):
67
  from weasyprint import CSS, HTML
68
+ from ebooklib import epub
69
+ import ebooklib
70
 
71
  ebook = epub.read_epub(filepath)
72
 
 
104
  full_style = ''.join([css]) # + styles)
105
 
106
  # we convert the epub to HTML
107
+ HTML(string=html_content, base_url=filepath).write_pdf(
108
  self.temp_pdf_path,
109
  stylesheets=[CSS(string=full_style), self.get_font_css()]
110
  )
marker/providers/powerpoint.py CHANGED
@@ -3,9 +3,6 @@ import os
3
  import tempfile
4
  import traceback
5
 
6
- from pptx import Presentation
7
- from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
8
-
9
  from marker.providers.pdf import PdfProvider
10
 
11
  css = '''
@@ -63,6 +60,8 @@ class PowerPointProvider(PdfProvider):
63
 
64
  def convert_pptx_to_pdf(self, filepath):
65
  from weasyprint import CSS, HTML
 
 
66
 
67
  pptx = Presentation(filepath)
68
 
@@ -112,6 +111,7 @@ class PowerPointProvider(PdfProvider):
112
  """
113
  Recursively handle shapes in a group. Returns HTML string for the entire group.
114
  """
 
115
 
116
  group_parts = []
117
  for shape in group_shape.shapes:
@@ -140,6 +140,7 @@ class PowerPointProvider(PdfProvider):
140
  Processes shape text, including bullet/numbered list detection and placeholders
141
  (title, subtitle, etc.). Returns HTML for the text block(s).
142
  """
 
143
 
144
  # Distinguish placeholders to see if it's a title or subtitle
145
  label_html_tag = "p"
 
3
  import tempfile
4
  import traceback
5
 
 
 
 
6
  from marker.providers.pdf import PdfProvider
7
 
8
  css = '''
 
60
 
61
  def convert_pptx_to_pdf(self, filepath):
62
  from weasyprint import CSS, HTML
63
+ from pptx import Presentation
64
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
65
 
66
  pptx = Presentation(filepath)
67
 
 
111
  """
112
  Recursively handle shapes in a group. Returns HTML string for the entire group.
113
  """
114
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
115
 
116
  group_parts = []
117
  for shape in group_shape.shapes:
 
140
  Processes shape text, including bullet/numbered list detection and placeholders
141
  (title, subtitle, etc.). Returns HTML for the text block(s).
142
  """
143
+ from pptx.enum.shapes import PP_PLACEHOLDER
144
 
145
  # Distinguish placeholders to see if it's a title or subtitle
146
  label_html_tag = "p"
marker/providers/spreadsheet.py CHANGED
@@ -1,9 +1,6 @@
1
  import os
2
  import tempfile
3
 
4
- from openpyxl import load_workbook
5
- from openpyxl.worksheet.worksheet import Worksheet
6
-
7
  from marker.providers.pdf import PdfProvider
8
 
9
  css = '''
@@ -52,6 +49,7 @@ class SpreadSheetProvider(PdfProvider):
52
 
53
  def convert_xlsx_to_pdf(self, filepath: str):
54
  from weasyprint import CSS, HTML
 
55
 
56
  html = ""
57
  workbook = load_workbook(filepath)
@@ -69,7 +67,7 @@ class SpreadSheetProvider(PdfProvider):
69
  )
70
 
71
  @staticmethod
72
- def _get_merged_cell_ranges(sheet: Worksheet):
73
  merged_info = {}
74
  for merged_range in sheet.merged_cells.ranges:
75
  min_col, min_row, max_col, max_row = merged_range.bounds
@@ -80,7 +78,7 @@ class SpreadSheetProvider(PdfProvider):
80
  }
81
  return merged_info
82
 
83
- def _excel_to_html_table(self, sheet: Worksheet):
84
  merged_cells = self._get_merged_cell_ranges(sheet)
85
 
86
  html = f'<table>'
 
1
  import os
2
  import tempfile
3
 
 
 
 
4
  from marker.providers.pdf import PdfProvider
5
 
6
  css = '''
 
49
 
50
  def convert_xlsx_to_pdf(self, filepath: str):
51
  from weasyprint import CSS, HTML
52
+ from openpyxl import load_workbook
53
 
54
  html = ""
55
  workbook = load_workbook(filepath)
 
67
  )
68
 
69
  @staticmethod
70
+ def _get_merged_cell_ranges(sheet):
71
  merged_info = {}
72
  for merged_range in sheet.merged_cells.ranges:
73
  min_col, min_row, max_col, max_row = merged_range.bounds
 
78
  }
79
  return merged_info
80
 
81
+ def _excel_to_html_table(self, sheet):
82
  merged_cells = self._get_merged_cell_ranges(sheet)
83
 
84
  html = f'<table>'
poetry.lock CHANGED
@@ -3289,13 +3289,13 @@ testing = ["docopt", "pytest"]
3289
 
3290
  [[package]]
3291
  name = "pdftext"
3292
- version = "0.6.1"
3293
  description = "Extract structured text from pdfs quickly"
3294
  optional = false
3295
  python-versions = "<4.0,>=3.10"
3296
  files = [
3297
- {file = "pdftext-0.6.1-py3-none-any.whl", hash = "sha256:9c437a05262277dede2f6953eebc7b46d7393bb11ee373267814af2aa5e02e4d"},
3298
- {file = "pdftext-0.6.1.tar.gz", hash = "sha256:ffec41064804e157b48b76c834051ed7a5aa456257b78d9b87a5e8f54cebe307"},
3299
  ]
3300
 
3301
  [package.dependencies]
@@ -6112,4 +6112,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
6112
  [metadata]
6113
  lock-version = "2.0"
6114
  python-versions = "^3.10"
6115
- content-hash = "e03ee53a4be2afc3661be448bc16a102b47273a57125b37e54184f88030b233b"
 
3289
 
3290
  [[package]]
3291
  name = "pdftext"
3292
+ version = "0.6.2"
3293
  description = "Extract structured text from pdfs quickly"
3294
  optional = false
3295
  python-versions = "<4.0,>=3.10"
3296
  files = [
3297
+ {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
3298
+ {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
3299
  ]
3300
 
3301
  [package.dependencies]
 
6112
  [metadata]
6113
  lock-version = "2.0"
6114
  python-versions = "^3.10"
6115
+ content-hash = "4609798f8e0c4bc0c7a9ab4bcb6f92289ea03bdd902a1c35c12699f874f67298"
pyproject.toml CHANGED
@@ -28,7 +28,7 @@ ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
  surya-ocr = "~0.13.0"
30
  regex = "^2024.4.28"
31
- pdftext = "~0.6.1"
32
  markdownify = "^0.13.1"
33
  click = "^8.1.7"
34
  markdown2 = "^2.5.2"
 
28
  rapidfuzz = "^3.8.1"
29
  surya-ocr = "~0.13.0"
30
  regex = "^2024.4.28"
31
+ pdftext = "~0.6.2"
32
  markdownify = "^0.13.1"
33
  click = "^8.1.7"
34
  markdown2 = "^2.5.2"
tests/builders/test_garbled_pdf.py CHANGED
@@ -15,7 +15,7 @@ def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec
15
 
16
  table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
17
  assert table_cell.block_type == BlockTypes.Line
18
- assert table_cell.structure[0] == "/page/0/Span/2"
19
 
20
  # We don't OCR in the initial pass, only with the TableProcessor
21
  processor = TableProcessor(detection_model, recognition_model, table_rec_model)
 
15
 
16
  table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
17
  assert table_cell.block_type == BlockTypes.Line
18
+ assert table_cell.structure[0] == "/page/0/Span/3"
19
 
20
  # We don't OCR in the initial pass, only with the TableProcessor
21
  processor = TableProcessor(detection_model, recognition_model, table_rec_model)
tests/builders/test_ocr_pipeline.py CHANGED
@@ -23,7 +23,7 @@ def _ocr_pipeline_test(pdf_document):
23
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
24
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
25
  text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
26
- assert len(text_lines) == 71
27
 
28
  # Ensure the bbox sizes match up
29
  max_line_position = max([line.polygon.y_end for line in text_lines])
 
23
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
24
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
25
  text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
26
+ assert len(text_lines) == 84
27
 
28
  # Ensure the bbox sizes match up
29
  max_line_position = max([line.polygon.y_end for line in text_lines])
tests/conftest.py CHANGED
@@ -147,7 +147,7 @@ def llm_service(request, config):
147
  def temp_image():
148
  img = Image.new("RGB", (512, 512), color="white")
149
  draw = ImageDraw.Draw(img)
150
- draw.text((10, 10), "Hello, World!", fill="black")
151
  with tempfile.NamedTemporaryFile(suffix=".png") as f:
152
  img.save(f.name)
153
  f.flush()
 
147
  def temp_image():
148
  img = Image.new("RGB", (512, 512), color="white")
149
  draw = ImageDraw.Draw(img)
150
+ draw.text((10, 10), "Hello, World!", fill="black", font_size=24)
151
  with tempfile.NamedTemporaryFile(suffix=".png") as f:
152
  img.save(f.name)
153
  f.flush()
tests/processors/test_llm_processors.py CHANGED
@@ -168,7 +168,7 @@ def test_llm_complex_region_processor(pdf_document):
168
  def test_multi_llm_processors(pdf_document):
169
  description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
170
  mock_cls = Mock()
171
- mock_cls.return_value = {"image_description": description, "html_equation": description}
172
 
173
  config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
174
  processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
 
168
  def test_multi_llm_processors(pdf_document):
169
  description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
170
  mock_cls = Mock()
171
+ mock_cls.return_value = {"image_description": description, "corrected_equation": description}
172
 
173
  config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
174
  processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]