Vik Paruchuri
committed on
Commit
·
7e40faa
1
Parent(s):
b28bf13
Bump pdftext, fix tests
Browse files
- README.md +1 -1
- marker/processors/llm/llm_equation.py +5 -1
- marker/providers/document.py +1 -2
- marker/providers/epub.py +3 -3
- marker/providers/powerpoint.py +4 -3
- marker/providers/spreadsheet.py +3 -5
- poetry.lock +4 -4
- pyproject.toml +1 -1
- tests/builders/test_garbled_pdf.py +1 -1
- tests/builders/test_ocr_pipeline.py +1 -1
- tests/conftest.py +1 -1
- tests/processors/test_llm_processors.py +1 -1
README.md
CHANGED
|
@@ -66,7 +66,7 @@ Install with:
|
|
| 66 |
pip install marker-pdf
|
| 67 |
```
|
| 68 |
|
| 69 |
-
|
| 70 |
|
| 71 |
```shell
|
| 72 |
pip install marker-pdf[full]
|
|
|
|
| 66 |
pip install marker-pdf
|
| 67 |
```
|
| 68 |
|
| 69 |
+
If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
|
| 70 |
|
| 71 |
```shell
|
| 72 |
pip install marker-pdf[full]
|
marker/processors/llm/llm_equation.py
CHANGED
|
@@ -74,8 +74,12 @@ analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
|
|
| 74 |
for block_data in blocks:
|
| 75 |
block = block_data["block"]
|
| 76 |
page = block_data["page"]
|
|
|
|
| 77 |
# If we redo inline math, we redo all equations
|
| 78 |
-
if
|
|
|
|
|
|
|
|
|
|
| 79 |
continue
|
| 80 |
out_blocks.append(block_data)
|
| 81 |
return out_blocks
|
|
|
|
| 74 |
for block_data in blocks:
|
| 75 |
block = block_data["block"]
|
| 76 |
page = block_data["page"]
|
| 77 |
+
|
| 78 |
# If we redo inline math, we redo all equations
|
| 79 |
+
if all([
|
| 80 |
+
block.polygon.height / page.polygon.height < self.min_equation_height,
|
| 81 |
+
not self.redo_inline_math
|
| 82 |
+
]):
|
| 83 |
continue
|
| 84 |
out_blocks.append(block_data)
|
| 85 |
return out_blocks
|
marker/providers/document.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
import base64
|
| 2 |
-
import logging
|
| 3 |
import os
|
| 4 |
import re
|
| 5 |
import tempfile
|
| 6 |
from io import BytesIO
|
| 7 |
|
| 8 |
-
import mammoth
|
| 9 |
from PIL import Image
|
| 10 |
|
| 11 |
from marker.providers.pdf import PdfProvider
|
|
@@ -69,6 +67,7 @@ class DocumentProvider(PdfProvider):
|
|
| 69 |
|
| 70 |
def convert_docx_to_pdf(self, filepath: str):
|
| 71 |
from weasyprint import CSS, HTML
|
|
|
|
| 72 |
|
| 73 |
with open(filepath, "rb") as docx_file:
|
| 74 |
# we convert the docx to HTML
|
|
|
|
| 1 |
import base64
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import tempfile
|
| 5 |
from io import BytesIO
|
| 6 |
|
|
|
|
| 7 |
from PIL import Image
|
| 8 |
|
| 9 |
from marker.providers.pdf import PdfProvider
|
|
|
|
| 67 |
|
| 68 |
def convert_docx_to_pdf(self, filepath: str):
|
| 69 |
from weasyprint import CSS, HTML
|
| 70 |
+
import mammoth
|
| 71 |
|
| 72 |
with open(filepath, "rb") as docx_file:
|
| 73 |
# we convert the docx to HTML
|
marker/providers/epub.py
CHANGED
|
@@ -2,9 +2,7 @@ import base64
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
|
| 5 |
-
import ebooklib
|
| 6 |
from bs4 import BeautifulSoup
|
| 7 |
-
from ebooklib import epub
|
| 8 |
|
| 9 |
from marker.providers.pdf import PdfProvider
|
| 10 |
|
|
@@ -67,6 +65,8 @@ class EpubProvider(PdfProvider):
|
|
| 67 |
|
| 68 |
def convert_epub_to_pdf(self, filepath):
|
| 69 |
from weasyprint import CSS, HTML
|
|
|
|
|
|
|
| 70 |
|
| 71 |
ebook = epub.read_epub(filepath)
|
| 72 |
|
|
@@ -104,7 +104,7 @@ class EpubProvider(PdfProvider):
|
|
| 104 |
full_style = ''.join([css]) # + styles)
|
| 105 |
|
| 106 |
# we convert the epub to HTML
|
| 107 |
-
|
| 108 |
self.temp_pdf_path,
|
| 109 |
stylesheets=[CSS(string=full_style), self.get_font_css()]
|
| 110 |
)
|
|
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
|
|
|
|
| 5 |
from bs4 import BeautifulSoup
|
|
|
|
| 6 |
|
| 7 |
from marker.providers.pdf import PdfProvider
|
| 8 |
|
|
|
|
| 65 |
|
| 66 |
def convert_epub_to_pdf(self, filepath):
|
| 67 |
from weasyprint import CSS, HTML
|
| 68 |
+
from ebooklib import epub
|
| 69 |
+
import ebooklib
|
| 70 |
|
| 71 |
ebook = epub.read_epub(filepath)
|
| 72 |
|
|
|
|
| 104 |
full_style = ''.join([css]) # + styles)
|
| 105 |
|
| 106 |
# we convert the epub to HTML
|
| 107 |
+
HTML(string=html_content, base_url=filepath).write_pdf(
|
| 108 |
self.temp_pdf_path,
|
| 109 |
stylesheets=[CSS(string=full_style), self.get_font_css()]
|
| 110 |
)
|
marker/providers/powerpoint.py
CHANGED
|
@@ -3,9 +3,6 @@ import os
|
|
| 3 |
import tempfile
|
| 4 |
import traceback
|
| 5 |
|
| 6 |
-
from pptx import Presentation
|
| 7 |
-
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
| 8 |
-
|
| 9 |
from marker.providers.pdf import PdfProvider
|
| 10 |
|
| 11 |
css = '''
|
|
@@ -63,6 +60,8 @@ class PowerPointProvider(PdfProvider):
|
|
| 63 |
|
| 64 |
def convert_pptx_to_pdf(self, filepath):
|
| 65 |
from weasyprint import CSS, HTML
|
|
|
|
|
|
|
| 66 |
|
| 67 |
pptx = Presentation(filepath)
|
| 68 |
|
|
@@ -112,6 +111,7 @@ class PowerPointProvider(PdfProvider):
|
|
| 112 |
"""
|
| 113 |
Recursively handle shapes in a group. Returns HTML string for the entire group.
|
| 114 |
"""
|
|
|
|
| 115 |
|
| 116 |
group_parts = []
|
| 117 |
for shape in group_shape.shapes:
|
|
@@ -140,6 +140,7 @@ class PowerPointProvider(PdfProvider):
|
|
| 140 |
Processes shape text, including bullet/numbered list detection and placeholders
|
| 141 |
(title, subtitle, etc.). Returns HTML for the text block(s).
|
| 142 |
"""
|
|
|
|
| 143 |
|
| 144 |
# Distinguish placeholders to see if it's a title or subtitle
|
| 145 |
label_html_tag = "p"
|
|
|
|
| 3 |
import tempfile
|
| 4 |
import traceback
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
from marker.providers.pdf import PdfProvider
|
| 7 |
|
| 8 |
css = '''
|
|
|
|
| 60 |
|
| 61 |
def convert_pptx_to_pdf(self, filepath):
|
| 62 |
from weasyprint import CSS, HTML
|
| 63 |
+
from pptx import Presentation
|
| 64 |
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
| 65 |
|
| 66 |
pptx = Presentation(filepath)
|
| 67 |
|
|
|
|
| 111 |
"""
|
| 112 |
Recursively handle shapes in a group. Returns HTML string for the entire group.
|
| 113 |
"""
|
| 114 |
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
| 115 |
|
| 116 |
group_parts = []
|
| 117 |
for shape in group_shape.shapes:
|
|
|
|
| 140 |
Processes shape text, including bullet/numbered list detection and placeholders
|
| 141 |
(title, subtitle, etc.). Returns HTML for the text block(s).
|
| 142 |
"""
|
| 143 |
+
from pptx.enum.shapes import PP_PLACEHOLDER
|
| 144 |
|
| 145 |
# Distinguish placeholders to see if it's a title or subtitle
|
| 146 |
label_html_tag = "p"
|
marker/providers/spreadsheet.py
CHANGED
|
@@ -1,9 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import tempfile
|
| 3 |
|
| 4 |
-
from openpyxl import load_workbook
|
| 5 |
-
from openpyxl.worksheet.worksheet import Worksheet
|
| 6 |
-
|
| 7 |
from marker.providers.pdf import PdfProvider
|
| 8 |
|
| 9 |
css = '''
|
|
@@ -52,6 +49,7 @@ class SpreadSheetProvider(PdfProvider):
|
|
| 52 |
|
| 53 |
def convert_xlsx_to_pdf(self, filepath: str):
|
| 54 |
from weasyprint import CSS, HTML
|
|
|
|
| 55 |
|
| 56 |
html = ""
|
| 57 |
workbook = load_workbook(filepath)
|
|
@@ -69,7 +67,7 @@ class SpreadSheetProvider(PdfProvider):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
@staticmethod
|
| 72 |
-
def _get_merged_cell_ranges(sheet
|
| 73 |
merged_info = {}
|
| 74 |
for merged_range in sheet.merged_cells.ranges:
|
| 75 |
min_col, min_row, max_col, max_row = merged_range.bounds
|
|
@@ -80,7 +78,7 @@ class SpreadSheetProvider(PdfProvider):
|
|
| 80 |
}
|
| 81 |
return merged_info
|
| 82 |
|
| 83 |
-
def _excel_to_html_table(self, sheet
|
| 84 |
merged_cells = self._get_merged_cell_ranges(sheet)
|
| 85 |
|
| 86 |
html = f'<table>'
|
|
|
|
| 1 |
import os
|
| 2 |
import tempfile
|
| 3 |
|
|
|
|
|
|
|
|
|
|
| 4 |
from marker.providers.pdf import PdfProvider
|
| 5 |
|
| 6 |
css = '''
|
|
|
|
| 49 |
|
| 50 |
def convert_xlsx_to_pdf(self, filepath: str):
|
| 51 |
from weasyprint import CSS, HTML
|
| 52 |
+
from openpyxl import load_workbook
|
| 53 |
|
| 54 |
html = ""
|
| 55 |
workbook = load_workbook(filepath)
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
@staticmethod
|
| 70 |
+
def _get_merged_cell_ranges(sheet):
|
| 71 |
merged_info = {}
|
| 72 |
for merged_range in sheet.merged_cells.ranges:
|
| 73 |
min_col, min_row, max_col, max_row = merged_range.bounds
|
|
|
|
| 78 |
}
|
| 79 |
return merged_info
|
| 80 |
|
| 81 |
+
def _excel_to_html_table(self, sheet):
|
| 82 |
merged_cells = self._get_merged_cell_ranges(sheet)
|
| 83 |
|
| 84 |
html = f'<table>'
|
poetry.lock
CHANGED
|
@@ -3289,13 +3289,13 @@ testing = ["docopt", "pytest"]
|
|
| 3289 |
|
| 3290 |
[[package]]
|
| 3291 |
name = "pdftext"
|
| 3292 |
-
version = "0.6.
|
| 3293 |
description = "Extract structured text from pdfs quickly"
|
| 3294 |
optional = false
|
| 3295 |
python-versions = "<4.0,>=3.10"
|
| 3296 |
files = [
|
| 3297 |
-
{file = "pdftext-0.6.
|
| 3298 |
-
{file = "pdftext-0.6.
|
| 3299 |
]
|
| 3300 |
|
| 3301 |
[package.dependencies]
|
|
@@ -6112,4 +6112,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
|
|
| 6112 |
[metadata]
|
| 6113 |
lock-version = "2.0"
|
| 6114 |
python-versions = "^3.10"
|
| 6115 |
-
content-hash = "
|
|
|
|
| 3289 |
|
| 3290 |
[[package]]
|
| 3291 |
name = "pdftext"
|
| 3292 |
+
version = "0.6.2"
|
| 3293 |
description = "Extract structured text from pdfs quickly"
|
| 3294 |
optional = false
|
| 3295 |
python-versions = "<4.0,>=3.10"
|
| 3296 |
files = [
|
| 3297 |
+
{file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
|
| 3298 |
+
{file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
|
| 3299 |
]
|
| 3300 |
|
| 3301 |
[package.dependencies]
|
|
|
|
| 6112 |
[metadata]
|
| 6113 |
lock-version = "2.0"
|
| 6114 |
python-versions = "^3.10"
|
| 6115 |
+
content-hash = "4609798f8e0c4bc0c7a9ab4bcb6f92289ea03bdd902a1c35c12699f874f67298"
|
pyproject.toml
CHANGED
|
@@ -28,7 +28,7 @@ ftfy = "^6.1.1"
|
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
surya-ocr = "~0.13.0"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
-
pdftext = "~0.6.
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
click = "^8.1.7"
|
| 34 |
markdown2 = "^2.5.2"
|
|
|
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
surya-ocr = "~0.13.0"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
+
pdftext = "~0.6.2"
|
| 32 |
markdownify = "^0.13.1"
|
| 33 |
click = "^8.1.7"
|
| 34 |
markdown2 = "^2.5.2"
|
tests/builders/test_garbled_pdf.py
CHANGED
|
@@ -15,7 +15,7 @@ def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec
|
|
| 15 |
|
| 16 |
table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
|
| 17 |
assert table_cell.block_type == BlockTypes.Line
|
| 18 |
-
assert table_cell.structure[0] == "/page/0/Span/
|
| 19 |
|
| 20 |
# We don't OCR in the initial pass, only with the TableProcessor
|
| 21 |
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
|
|
|
| 15 |
|
| 16 |
table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
|
| 17 |
assert table_cell.block_type == BlockTypes.Line
|
| 18 |
+
assert table_cell.structure[0] == "/page/0/Span/3"
|
| 19 |
|
| 20 |
# We don't OCR in the initial pass, only with the TableProcessor
|
| 21 |
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
tests/builders/test_ocr_pipeline.py
CHANGED
|
@@ -23,7 +23,7 @@ def _ocr_pipeline_test(pdf_document):
|
|
| 23 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 24 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 25 |
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
|
| 26 |
-
assert len(text_lines) ==
|
| 27 |
|
| 28 |
# Ensure the bbox sizes match up
|
| 29 |
max_line_position = max([line.polygon.y_end for line in text_lines])
|
|
|
|
| 23 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 24 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 25 |
text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
|
| 26 |
+
assert len(text_lines) == 84
|
| 27 |
|
| 28 |
# Ensure the bbox sizes match up
|
| 29 |
max_line_position = max([line.polygon.y_end for line in text_lines])
|
tests/conftest.py
CHANGED
|
@@ -147,7 +147,7 @@ def llm_service(request, config):
|
|
| 147 |
def temp_image():
|
| 148 |
img = Image.new("RGB", (512, 512), color="white")
|
| 149 |
draw = ImageDraw.Draw(img)
|
| 150 |
-
draw.text((10, 10), "Hello, World!", fill="black")
|
| 151 |
with tempfile.NamedTemporaryFile(suffix=".png") as f:
|
| 152 |
img.save(f.name)
|
| 153 |
f.flush()
|
|
|
|
| 147 |
def temp_image():
|
| 148 |
img = Image.new("RGB", (512, 512), color="white")
|
| 149 |
draw = ImageDraw.Draw(img)
|
| 150 |
+
draw.text((10, 10), "Hello, World!", fill="black", font_size=24)
|
| 151 |
with tempfile.NamedTemporaryFile(suffix=".png") as f:
|
| 152 |
img.save(f.name)
|
| 153 |
f.flush()
|
tests/processors/test_llm_processors.py
CHANGED
|
@@ -168,7 +168,7 @@ def test_llm_complex_region_processor(pdf_document):
|
|
| 168 |
def test_multi_llm_processors(pdf_document):
|
| 169 |
description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
|
| 170 |
mock_cls = Mock()
|
| 171 |
-
mock_cls.return_value = {"image_description": description, "
|
| 172 |
|
| 173 |
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
|
| 174 |
processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
|
|
|
|
| 168 |
def test_multi_llm_processors(pdf_document):
|
| 169 |
description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
|
| 170 |
mock_cls = Mock()
|
| 171 |
+
mock_cls.return_value = {"image_description": description, "corrected_equation": description}
|
| 172 |
|
| 173 |
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
|
| 174 |
processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
|