Spaces:

rt4u
/

marker

Sleeping

App Files Community

Vik Paruchuri committed on Feb 28

Commit

7e40faa

1 Parent(s): b28bf13

Bump pdftext, fix tests

Browse files

Files changed (12) hide show

README.md +1 -1
marker/processors/llm/llm_equation.py +5 -1
marker/providers/document.py +1 -2
marker/providers/epub.py +3 -3
marker/providers/powerpoint.py +4 -3
marker/providers/spreadsheet.py +3 -5
poetry.lock +4 -4
pyproject.toml +1 -1
tests/builders/test_garbled_pdf.py +1 -1
tests/builders/test_ocr_pipeline.py +1 -1
tests/conftest.py +1 -1
tests/processors/test_llm_processors.py +1 -1

README.md CHANGED Viewed

@@ -66,7 +66,7 @@ Install with:
 pip install marker-pdf
 ```
-By default, marker will work on PDFs and images.  If you also want to use marker on XLSX, DOCX, HTML, etc, you will need to run:
 ```shell
 pip install marker-pdf[full]

 pip install marker-pdf
 ```
+If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
 ```shell
 pip install marker-pdf[full]

marker/processors/llm/llm_equation.py CHANGED Viewed

@@ -74,8 +74,12 @@ analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
         for block_data in blocks:
             block = block_data["block"]
             page = block_data["page"]
             # If we redo inline math, we redo all equations
-            if block.polygon.height / page.polygon.height < self.min_equation_height and not self.redo_inline_math:
                 continue
             out_blocks.append(block_data)
         return out_blocks

         for block_data in blocks:
             block = block_data["block"]
             page = block_data["page"]
             # If we redo inline math, we redo all equations
+            if all([
+                block.polygon.height / page.polygon.height < self.min_equation_height,
+                not self.redo_inline_math
+            ]):
                 continue
             out_blocks.append(block_data)
         return out_blocks

marker/providers/document.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import base64
-import logging
 import os
 import re
 import tempfile
 from io import BytesIO
-import mammoth
 from PIL import Image
 from marker.providers.pdf import PdfProvider
@@ -69,6 +67,7 @@ class DocumentProvider(PdfProvider):
     def convert_docx_to_pdf(self, filepath: str):
         from weasyprint import CSS, HTML
         with open(filepath, "rb") as docx_file:
             # we convert the docx to HTML

 import base64
 import os
 import re
 import tempfile
 from io import BytesIO
 from PIL import Image
 from marker.providers.pdf import PdfProvider
     def convert_docx_to_pdf(self, filepath: str):
         from weasyprint import CSS, HTML
+        import mammoth
         with open(filepath, "rb") as docx_file:
             # we convert the docx to HTML

marker/providers/epub.py CHANGED Viewed

@@ -2,9 +2,7 @@ import base64
 import os
 import tempfile
-import ebooklib
 from bs4 import BeautifulSoup
-from ebooklib import epub
 from marker.providers.pdf import PdfProvider
@@ -67,6 +65,8 @@ class EpubProvider(PdfProvider):
     def convert_epub_to_pdf(self, filepath):
         from weasyprint import CSS, HTML
         ebook = epub.read_epub(filepath)
@@ -104,7 +104,7 @@ class EpubProvider(PdfProvider):
         full_style = ''.join([css])  # + styles)
         # we convert the epub to HTML
-        result = HTML(string=html_content, base_url=filepath).write_pdf(
             self.temp_pdf_path,
             stylesheets=[CSS(string=full_style), self.get_font_css()]
         )

 import os
 import tempfile
 from bs4 import BeautifulSoup
 from marker.providers.pdf import PdfProvider
     def convert_epub_to_pdf(self, filepath):
         from weasyprint import CSS, HTML
+        from ebooklib import epub
+        import ebooklib
         ebook = epub.read_epub(filepath)
         full_style = ''.join([css])  # + styles)
         # we convert the epub to HTML
+        HTML(string=html_content, base_url=filepath).write_pdf(
             self.temp_pdf_path,
             stylesheets=[CSS(string=full_style), self.get_font_css()]
         )

marker/providers/powerpoint.py CHANGED Viewed

@@ -3,9 +3,6 @@ import os
 import tempfile
 import traceback
-from pptx import Presentation
-from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 from marker.providers.pdf import PdfProvider
 css = '''
@@ -63,6 +60,8 @@ class PowerPointProvider(PdfProvider):
     def convert_pptx_to_pdf(self, filepath):
         from weasyprint import CSS, HTML
         pptx = Presentation(filepath)
@@ -112,6 +111,7 @@ class PowerPointProvider(PdfProvider):
         """
         Recursively handle shapes in a group. Returns HTML string for the entire group.
         """
         group_parts = []
         for shape in group_shape.shapes:
@@ -140,6 +140,7 @@ class PowerPointProvider(PdfProvider):
         Processes shape text, including bullet/numbered list detection and placeholders
         (title, subtitle, etc.). Returns HTML for the text block(s).
         """
         # Distinguish placeholders to see if it's a title or subtitle
         label_html_tag = "p"

 import tempfile
 import traceback
 from marker.providers.pdf import PdfProvider
 css = '''
     def convert_pptx_to_pdf(self, filepath):
         from weasyprint import CSS, HTML
+        from pptx import Presentation
+        from pptx.enum.shapes import MSO_SHAPE_TYPE
         pptx = Presentation(filepath)
         """
         Recursively handle shapes in a group. Returns HTML string for the entire group.
         """
+        from pptx.enum.shapes import MSO_SHAPE_TYPE
         group_parts = []
         for shape in group_shape.shapes:
         Processes shape text, including bullet/numbered list detection and placeholders
         (title, subtitle, etc.). Returns HTML for the text block(s).
         """
+        from pptx.enum.shapes import PP_PLACEHOLDER
         # Distinguish placeholders to see if it's a title or subtitle
         label_html_tag = "p"

marker/providers/spreadsheet.py CHANGED Viewed

@@ -1,9 +1,6 @@
 import os
 import tempfile
-from openpyxl import load_workbook
-from openpyxl.worksheet.worksheet import Worksheet
 from marker.providers.pdf import PdfProvider
 css = '''
@@ -52,6 +49,7 @@ class SpreadSheetProvider(PdfProvider):
     def convert_xlsx_to_pdf(self, filepath: str):
         from weasyprint import CSS, HTML
         html = ""
         workbook = load_workbook(filepath)
@@ -69,7 +67,7 @@ class SpreadSheetProvider(PdfProvider):
         )
     @staticmethod
-    def _get_merged_cell_ranges(sheet: Worksheet):
         merged_info = {}
         for merged_range in sheet.merged_cells.ranges:
             min_col, min_row, max_col, max_row = merged_range.bounds
@@ -80,7 +78,7 @@ class SpreadSheetProvider(PdfProvider):
             }
         return merged_info
-    def _excel_to_html_table(self, sheet: Worksheet):
         merged_cells = self._get_merged_cell_ranges(sheet)
         html = f'<table>'

 import os
 import tempfile
 from marker.providers.pdf import PdfProvider
 css = '''
     def convert_xlsx_to_pdf(self, filepath: str):
         from weasyprint import CSS, HTML
+        from openpyxl import load_workbook
         html = ""
         workbook = load_workbook(filepath)
         )
     @staticmethod
+    def _get_merged_cell_ranges(sheet):
         merged_info = {}
         for merged_range in sheet.merged_cells.ranges:
             min_col, min_row, max_col, max_row = merged_range.bounds
             }
         return merged_info
+    def _excel_to_html_table(self, sheet):
         merged_cells = self._get_merged_cell_ranges(sheet)
         html = f'<table>'

poetry.lock CHANGED Viewed

@@ -3289,13 +3289,13 @@ testing = ["docopt", "pytest"]
 [[package]]
 name = "pdftext"
-version = "0.6.1"
 description = "Extract structured text from pdfs quickly"
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
-    {file = "pdftext-0.6.1-py3-none-any.whl", hash = "sha256:9c437a05262277dede2f6953eebc7b46d7393bb11ee373267814af2aa5e02e4d"},
-    {file = "pdftext-0.6.1.tar.gz", hash = "sha256:ffec41064804e157b48b76c834051ed7a5aa456257b78d9b87a5e8f54cebe307"},
 ]
 [package.dependencies]
@@ -6112,4 +6112,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "e03ee53a4be2afc3661be448bc16a102b47273a57125b37e54184f88030b233b"

 [[package]]
 name = "pdftext"
+version = "0.6.2"
 description = "Extract structured text from pdfs quickly"
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
+    {file = "pdftext-0.6.2-py3-none-any.whl", hash = "sha256:905d11e62d548e307933c25865a69c8e993947bb5b40b1535b0a2aa8f07a71d4"},
+    {file = "pdftext-0.6.2.tar.gz", hash = "sha256:ff5b92462ac03ae63a23429384ae123d45c162dcda30e7bf2c5c92a6b208c9de"},
 ]
 [package.dependencies]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
+content-hash = "4609798f8e0c4bc0c7a9ab4bcb6f92289ea03bdd902a1c35c12699f874f67298"

pyproject.toml CHANGED Viewed

@@ -28,7 +28,7 @@ ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
 surya-ocr = "~0.13.0"
 regex = "^2024.4.28"
-pdftext = "~0.6.1"
 markdownify = "^0.13.1"
 click = "^8.1.7"
 markdown2 = "^2.5.2"

 rapidfuzz = "^3.8.1"
 surya-ocr = "~0.13.0"
 regex = "^2024.4.28"
+pdftext = "~0.6.2"
 markdownify = "^0.13.1"
 click = "^8.1.7"
 markdown2 = "^2.5.2"

tests/builders/test_garbled_pdf.py CHANGED Viewed

@@ -15,7 +15,7 @@ def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec
     table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
     assert table_cell.block_type == BlockTypes.Line
-    assert table_cell.structure[0] == "/page/0/Span/2"
     # We don't OCR in the initial pass, only with the TableProcessor
     processor = TableProcessor(detection_model, recognition_model, table_rec_model)

     table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
     assert table_cell.block_type == BlockTypes.Line
+    assert table_cell.structure[0] == "/page/0/Span/3"
     # We don't OCR in the initial pass, only with the TableProcessor
     processor = TableProcessor(detection_model, recognition_model, table_rec_model)

tests/builders/test_ocr_pipeline.py CHANGED Viewed

@@ -23,7 +23,7 @@ def _ocr_pipeline_test(pdf_document):
     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
     text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
-    assert len(text_lines) == 71
     # Ensure the bbox sizes match up
     max_line_position = max([line.polygon.y_end for line in text_lines])

     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
     text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
+    assert len(text_lines) == 84
     # Ensure the bbox sizes match up
     max_line_position = max([line.polygon.y_end for line in text_lines])

tests/conftest.py CHANGED Viewed

@@ -147,7 +147,7 @@ def llm_service(request, config):
 def temp_image():
     img = Image.new("RGB", (512, 512), color="white")
     draw = ImageDraw.Draw(img)
-    draw.text((10, 10), "Hello, World!", fill="black")
     with tempfile.NamedTemporaryFile(suffix=".png") as f:
         img.save(f.name)
         f.flush()

 def temp_image():
     img = Image.new("RGB", (512, 512), color="white")
     draw = ImageDraw.Draw(img)
+    draw.text((10, 10), "Hello, World!", fill="black", font_size=24)
     with tempfile.NamedTemporaryFile(suffix=".png") as f:
         img.save(f.name)
         f.flush()

tests/processors/test_llm_processors.py CHANGED Viewed

@@ -168,7 +168,7 @@ def test_llm_complex_region_processor(pdf_document):
 def test_multi_llm_processors(pdf_document):
     description = "<math>This is an image description.  And here is a lot of writing about it.</math>" * 10
     mock_cls = Mock()
-    mock_cls.return_value = {"image_description": description, "html_equation": description}
     config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
     processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]

 def test_multi_llm_processors(pdf_document):
     description = "<math>This is an image description.  And here is a lot of writing about it.</math>" * 10
     mock_cls = Mock()
+    mock_cls.return_value = {"image_description": description, "corrected_equation": description}
     config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
     processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]