Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on May 19

Commit

530b509

1 Parent(s): 9d63262

Patch tests

Browse files

Files changed (5) hide show

README.md +2 -1
marker/converters/extraction.py +3 -3
marker/renderers/extraction.py +2 -1
tests/builders/test_ocr_pipeline.py +11 -7
tests/converters/test_ocr_converter.py +2 -2

README.md CHANGED Viewed

@@ -3,6 +3,7 @@
 Marker converts documents to markdown, JSON, and HTML quickly and accurately.
 - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
 - Formats tables, forms, equations, inline math, links, references, and code blocks
 - Extracts and saves images
 - Removes headers/footers/other artifacts
@@ -249,7 +250,7 @@ You can also run this via the CLI with
 marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
 ```
-### Structured Extraction (alpha)
 You can run structured extraction via the `ExtractionConverter`.  This requires an LLM service to be set up first (see [here](#llm-services) for details).  You'll get a JSON output with the extracted values.

 Marker converts documents to markdown, JSON, and HTML quickly and accurately.
 - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
+- Does structured extraction, given a JSON schema (beta)
 - Formats tables, forms, equations, inline math, links, references, and code blocks
 - Extracts and saves images
 - Removes headers/footers/other artifacts
 marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
 ```
+### Structured Extraction (beta)
 You can run structured extraction via the `ExtractionConverter`.  This requires an LLM service to be set up first (see [here](#llm-services) for details).  You'll get a JSON output with the extracted values.

marker/converters/extraction.py CHANGED Viewed

@@ -9,7 +9,7 @@ from marker.converters.pdf import PdfConverter
 from marker.extractors.page import PageExtractor, json_schema_to_base_model
 from marker.providers.registry import provider_from_filepath
-from marker.renderers.extraction import ExtractionMerger, ExtractionOutput
 from marker.renderers.markdown import MarkdownRenderer
 from marker.logger import get_logger
@@ -65,7 +65,7 @@ class ExtractionConverter(PdfConverter):
             )
         extractor = self.resolve_dependencies(PageExtractor)
-        merger = self.resolve_dependencies(ExtractionMerger)
         pnums = provider.page_range
         all_json = {}
@@ -73,5 +73,5 @@ class ExtractionConverter(PdfConverter):
             extracted_json = extractor(document, page, page_md.strip())
             all_json[pnum] = extracted_json
-        merged = merger(all_json)
         return merged

 from marker.extractors.page import PageExtractor, json_schema_to_base_model
 from marker.providers.registry import provider_from_filepath
+from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
 from marker.renderers.markdown import MarkdownRenderer
 from marker.logger import get_logger
             )
         extractor = self.resolve_dependencies(PageExtractor)
+        renderer = self.resolve_dependencies(ExtractionRenderer)
         pnums = provider.page_range
         all_json = {}
             extracted_json = extractor(document, page, page_md.strip())
             all_json[pnum] = extracted_json
+        merged = renderer(all_json)
         return merged

marker/renderers/extraction.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Dict
 from pydantic import BaseModel
 from marker.extractors import ExtractionResult
 @dataclass
@@ -44,7 +45,7 @@ class ExtractionOutput(BaseModel):
     json: dict
-class ExtractionMerger:
     def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
         pnums = sorted(list(outputs.keys()))
         merged_result = outputs[pnums[0]].extracted_data.copy()

 from pydantic import BaseModel
 from marker.extractors import ExtractionResult
+from marker.renderers import BaseRenderer
 @dataclass
     json: dict
+class ExtractionRenderer(BaseRenderer):
     def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
         pnums = sorted(list(outputs.keys()))
         merged_result = outputs[pnums[0]].extracted_data.copy()

tests/builders/test_ocr_pipeline.py CHANGED Viewed

@@ -6,10 +6,10 @@ from marker.schema.text.line import Line
 def _ocr_pipeline_test(pdf_document):
     first_page = pdf_document.pages[0]
-    assert first_page.structure[0] == '/page/0/SectionHeader/0'
     first_block = first_page.get_block(first_page.structure[0])
-    assert first_block.text_extraction_method == 'surya'
     assert first_block.block_type == BlockTypes.SectionHeader
     first_text_block: Line = first_page.get_block(first_block.structure[0])
@@ -17,17 +17,21 @@ def _ocr_pipeline_test(pdf_document):
     first_span = first_page.get_block(first_text_block.structure[0])
     assert first_span.block_type == BlockTypes.Span
-    assert first_span.text.strip() == 'Subspace Adversarial Training'
     # Ensure we match all text lines up properly
     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
-    text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
-    assert len(text_lines) == 84
     # Ensure the bbox sizes match up
     max_line_position = max([line.polygon.y_end for line in text_lines])
-    max_block_position = max([block.polygon.y_end for block in text_blocks if block.source == "layout"])
     assert max_line_position <= (max_block_position * 1.02)
@@ -35,7 +39,7 @@ def _ocr_pipeline_test(pdf_document):
 def test_ocr_pipeline(pdf_document):
     _ocr_pipeline_test(pdf_document)
 @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
 def test_ocr_with_inline_pipeline(pdf_document):
     _ocr_pipeline_test(pdf_document)

 def _ocr_pipeline_test(pdf_document):
     first_page = pdf_document.pages[0]
+    assert first_page.structure[0] == "/page/0/SectionHeader/0"
     first_block = first_page.get_block(first_page.structure[0])
+    assert first_block.text_extraction_method == "surya"
     assert first_block.block_type == BlockTypes.SectionHeader
     first_text_block: Line = first_page.get_block(first_block.structure[0])
     first_span = first_page.get_block(first_text_block.structure[0])
     assert first_span.block_type == BlockTypes.Span
+    assert first_span.text.strip() == "Subspace Adversarial Training"
     # Ensure we match all text lines up properly
     # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
     text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
+    text_blocks = first_page.contained_blocks(
+        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
+    )
+    assert len(text_lines) == 83
     # Ensure the bbox sizes match up
     max_line_position = max([line.polygon.y_end for line in text_lines])
+    max_block_position = max(
+        [block.polygon.y_end for block in text_blocks if block.source == "layout"]
+    )
     assert max_line_position <= (max_block_position * 1.02)
 def test_ocr_pipeline(pdf_document):
     _ocr_pipeline_test(pdf_document)
 @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
 def test_ocr_with_inline_pipeline(pdf_document):
     _ocr_pipeline_test(pdf_document)

tests/converters/test_ocr_converter.py CHANGED Viewed

@@ -35,13 +35,13 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
 @pytest.mark.config({"page_range": [0]})
 def test_ocr_converter(config, model_dict, temp_doc):
-    _ocr_converter(config, model_dict, temp_doc, 85, 2)
 @pytest.mark.filename("pres.pdf")
 @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
 def test_ocr_converter_force(config, model_dict, temp_doc):
-    pages = _ocr_converter(config, model_dict, temp_doc, 9, 0)
     lines = [line for line in pages[0].children if line.block_type == "Line"]
     check_bboxes(pages[0], lines)

 @pytest.mark.config({"page_range": [0]})
 def test_ocr_converter(config, model_dict, temp_doc):
+    _ocr_converter(config, model_dict, temp_doc, 84, 2)
 @pytest.mark.filename("pres.pdf")
 @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
 def test_ocr_converter_force(config, model_dict, temp_doc):
+    pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
     lines = [line for line in pages[0].children if line.block_type == "Line"]
     check_bboxes(pages[0], lines)