Vik Paruchuri committed on
Commit
530b509
·
1 Parent(s): 9d63262

Patch tests

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@
3
  Marker converts documents to markdown, JSON, and HTML quickly and accurately.
4
 
5
  - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
 
6
  - Formats tables, forms, equations, inline math, links, references, and code blocks
7
  - Extracts and saves images
8
  - Removes headers/footers/other artifacts
@@ -249,7 +250,7 @@ You can also run this via the CLI with
249
  marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
250
  ```
251
 
252
- ### Structured Extraction (alpha)
253
 
254
  You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
255
 
 
3
  Marker converts documents to markdown, JSON, and HTML quickly and accurately.
4
 
5
  - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
6
+ - Does structured extraction, given a JSON schema (beta)
7
  - Formats tables, forms, equations, inline math, links, references, and code blocks
8
  - Extracts and saves images
9
  - Removes headers/footers/other artifacts
 
250
  marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
251
  ```
252
 
253
+ ### Structured Extraction (beta)
254
 
255
  You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
256
 
marker/converters/extraction.py CHANGED
@@ -9,7 +9,7 @@ from marker.converters.pdf import PdfConverter
9
  from marker.extractors.page import PageExtractor, json_schema_to_base_model
10
  from marker.providers.registry import provider_from_filepath
11
 
12
- from marker.renderers.extraction import ExtractionMerger, ExtractionOutput
13
  from marker.renderers.markdown import MarkdownRenderer
14
 
15
  from marker.logger import get_logger
@@ -65,7 +65,7 @@ class ExtractionConverter(PdfConverter):
65
  )
66
 
67
  extractor = self.resolve_dependencies(PageExtractor)
68
- merger = self.resolve_dependencies(ExtractionMerger)
69
 
70
  pnums = provider.page_range
71
  all_json = {}
@@ -73,5 +73,5 @@ class ExtractionConverter(PdfConverter):
73
  extracted_json = extractor(document, page, page_md.strip())
74
  all_json[pnum] = extracted_json
75
 
76
- merged = merger(all_json)
77
  return merged
 
9
  from marker.extractors.page import PageExtractor, json_schema_to_base_model
10
  from marker.providers.registry import provider_from_filepath
11
 
12
+ from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
13
  from marker.renderers.markdown import MarkdownRenderer
14
 
15
  from marker.logger import get_logger
 
65
  )
66
 
67
  extractor = self.resolve_dependencies(PageExtractor)
68
+ renderer = self.resolve_dependencies(ExtractionRenderer)
69
 
70
  pnums = provider.page_range
71
  all_json = {}
 
73
  extracted_json = extractor(document, page, page_md.strip())
74
  all_json[pnum] = extracted_json
75
 
76
+ merged = renderer(all_json)
77
  return merged
marker/renderers/extraction.py CHANGED
@@ -4,6 +4,7 @@ from typing import Dict
4
  from pydantic import BaseModel
5
 
6
  from marker.extractors import ExtractionResult
 
7
 
8
 
9
  @dataclass
@@ -44,7 +45,7 @@ class ExtractionOutput(BaseModel):
44
  json: dict
45
 
46
 
47
- class ExtractionMerger:
48
  def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
49
  pnums = sorted(list(outputs.keys()))
50
  merged_result = outputs[pnums[0]].extracted_data.copy()
 
4
  from pydantic import BaseModel
5
 
6
  from marker.extractors import ExtractionResult
7
+ from marker.renderers import BaseRenderer
8
 
9
 
10
  @dataclass
 
45
  json: dict
46
 
47
 
48
+ class ExtractionRenderer(BaseRenderer):
49
  def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
50
  pnums = sorted(list(outputs.keys()))
51
  merged_result = outputs[pnums[0]].extracted_data.copy()
tests/builders/test_ocr_pipeline.py CHANGED
@@ -6,10 +6,10 @@ from marker.schema.text.line import Line
6
 
7
  def _ocr_pipeline_test(pdf_document):
8
  first_page = pdf_document.pages[0]
9
- assert first_page.structure[0] == '/page/0/SectionHeader/0'
10
 
11
  first_block = first_page.get_block(first_page.structure[0])
12
- assert first_block.text_extraction_method == 'surya'
13
  assert first_block.block_type == BlockTypes.SectionHeader
14
 
15
  first_text_block: Line = first_page.get_block(first_block.structure[0])
@@ -17,17 +17,21 @@ def _ocr_pipeline_test(pdf_document):
17
 
18
  first_span = first_page.get_block(first_text_block.structure[0])
19
  assert first_span.block_type == BlockTypes.Span
20
- assert first_span.text.strip() == 'Subspace Adversarial Training'
21
 
22
  # Ensure we match all text lines up properly
23
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
24
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
25
- text_blocks = first_page.contained_blocks(pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath))
26
- assert len(text_lines) == 84
 
 
27
 
28
  # Ensure the bbox sizes match up
29
  max_line_position = max([line.polygon.y_end for line in text_lines])
30
- max_block_position = max([block.polygon.y_end for block in text_blocks if block.source == "layout"])
 
 
31
  assert max_line_position <= (max_block_position * 1.02)
32
 
33
 
@@ -35,7 +39,7 @@ def _ocr_pipeline_test(pdf_document):
35
  def test_ocr_pipeline(pdf_document):
36
  _ocr_pipeline_test(pdf_document)
37
 
 
38
  @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
39
  def test_ocr_with_inline_pipeline(pdf_document):
40
  _ocr_pipeline_test(pdf_document)
41
-
 
6
 
7
  def _ocr_pipeline_test(pdf_document):
8
  first_page = pdf_document.pages[0]
9
+ assert first_page.structure[0] == "/page/0/SectionHeader/0"
10
 
11
  first_block = first_page.get_block(first_page.structure[0])
12
+ assert first_block.text_extraction_method == "surya"
13
  assert first_block.block_type == BlockTypes.SectionHeader
14
 
15
  first_text_block: Line = first_page.get_block(first_block.structure[0])
 
17
 
18
  first_span = first_page.get_block(first_text_block.structure[0])
19
  assert first_span.block_type == BlockTypes.Span
20
+ assert first_span.text.strip() == "Subspace Adversarial Training"
21
 
22
  # Ensure we match all text lines up properly
23
  # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
24
  text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
25
+ text_blocks = first_page.contained_blocks(
26
+ pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
27
+ )
28
+ assert len(text_lines) == 83
29
 
30
  # Ensure the bbox sizes match up
31
  max_line_position = max([line.polygon.y_end for line in text_lines])
32
+ max_block_position = max(
33
+ [block.polygon.y_end for block in text_blocks if block.source == "layout"]
34
+ )
35
  assert max_line_position <= (max_block_position * 1.02)
36
 
37
 
 
39
  def test_ocr_pipeline(pdf_document):
40
  _ocr_pipeline_test(pdf_document)
41
 
42
+
43
  @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
44
  def test_ocr_with_inline_pipeline(pdf_document):
45
  _ocr_pipeline_test(pdf_document)
 
tests/converters/test_ocr_converter.py CHANGED
@@ -35,13 +35,13 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
35
 
36
  @pytest.mark.config({"page_range": [0]})
37
  def test_ocr_converter(config, model_dict, temp_doc):
38
- _ocr_converter(config, model_dict, temp_doc, 85, 2)
39
 
40
 
41
  @pytest.mark.filename("pres.pdf")
42
  @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
43
  def test_ocr_converter_force(config, model_dict, temp_doc):
44
- pages = _ocr_converter(config, model_dict, temp_doc, 9, 0)
45
  lines = [line for line in pages[0].children if line.block_type == "Line"]
46
  check_bboxes(pages[0], lines)
47
 
 
35
 
36
  @pytest.mark.config({"page_range": [0]})
37
  def test_ocr_converter(config, model_dict, temp_doc):
38
+ _ocr_converter(config, model_dict, temp_doc, 84, 2)
39
 
40
 
41
  @pytest.mark.filename("pres.pdf")
42
  @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
43
  def test_ocr_converter_force(config, model_dict, temp_doc):
44
+ pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
45
  lines = [line for line in pages[0].children if line.block_type == "Line"]
46
  check_bboxes(pages[0], lines)
47