Vik Paruchuri
committed on
Commit
·
530b509
1
Parent(s):
9d63262
Patch tests
Browse files
README.md
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
Marker converts documents to markdown, JSON, and HTML quickly and accurately.
|
| 4 |
|
| 5 |
- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
|
|
|
|
| 6 |
- Formats tables, forms, equations, inline math, links, references, and code blocks
|
| 7 |
- Extracts and saves images
|
| 8 |
- Removes headers/footers/other artifacts
|
|
@@ -249,7 +250,7 @@ You can also run this via the CLI with
|
|
| 249 |
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
|
| 250 |
```
|
| 251 |
|
| 252 |
-
### Structured Extraction (
|
| 253 |
|
| 254 |
You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
|
| 255 |
|
|
|
|
| 3 |
Marker converts documents to markdown, JSON, and HTML quickly and accurately.
|
| 4 |
|
| 5 |
- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
|
| 6 |
+
- Does structured extraction, given a JSON schema (beta)
|
| 7 |
- Formats tables, forms, equations, inline math, links, references, and code blocks
|
| 8 |
- Extracts and saves images
|
| 9 |
- Removes headers/footers/other artifacts
|
|
|
|
| 250 |
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
|
| 251 |
```
|
| 252 |
|
| 253 |
+
### Structured Extraction (beta)
|
| 254 |
|
| 255 |
You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
|
| 256 |
|
marker/converters/extraction.py
CHANGED
|
@@ -9,7 +9,7 @@ from marker.converters.pdf import PdfConverter
|
|
| 9 |
from marker.extractors.page import PageExtractor, json_schema_to_base_model
|
| 10 |
from marker.providers.registry import provider_from_filepath
|
| 11 |
|
| 12 |
-
from marker.renderers.extraction import
|
| 13 |
from marker.renderers.markdown import MarkdownRenderer
|
| 14 |
|
| 15 |
from marker.logger import get_logger
|
|
@@ -65,7 +65,7 @@ class ExtractionConverter(PdfConverter):
|
|
| 65 |
)
|
| 66 |
|
| 67 |
extractor = self.resolve_dependencies(PageExtractor)
|
| 68 |
-
|
| 69 |
|
| 70 |
pnums = provider.page_range
|
| 71 |
all_json = {}
|
|
@@ -73,5 +73,5 @@ class ExtractionConverter(PdfConverter):
|
|
| 73 |
extracted_json = extractor(document, page, page_md.strip())
|
| 74 |
all_json[pnum] = extracted_json
|
| 75 |
|
| 76 |
-
merged =
|
| 77 |
return merged
|
|
|
|
| 9 |
from marker.extractors.page import PageExtractor, json_schema_to_base_model
|
| 10 |
from marker.providers.registry import provider_from_filepath
|
| 11 |
|
| 12 |
+
from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
|
| 13 |
from marker.renderers.markdown import MarkdownRenderer
|
| 14 |
|
| 15 |
from marker.logger import get_logger
|
|
|
|
| 65 |
)
|
| 66 |
|
| 67 |
extractor = self.resolve_dependencies(PageExtractor)
|
| 68 |
+
renderer = self.resolve_dependencies(ExtractionRenderer)
|
| 69 |
|
| 70 |
pnums = provider.page_range
|
| 71 |
all_json = {}
|
|
|
|
| 73 |
extracted_json = extractor(document, page, page_md.strip())
|
| 74 |
all_json[pnum] = extracted_json
|
| 75 |
|
| 76 |
+
merged = renderer(all_json)
|
| 77 |
return merged
|
marker/renderers/extraction.py
CHANGED
|
@@ -4,6 +4,7 @@ from typing import Dict
|
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.extractors import ExtractionResult
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
@dataclass
|
|
@@ -44,7 +45,7 @@ class ExtractionOutput(BaseModel):
|
|
| 44 |
json: dict
|
| 45 |
|
| 46 |
|
| 47 |
-
class
|
| 48 |
def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
|
| 49 |
pnums = sorted(list(outputs.keys()))
|
| 50 |
merged_result = outputs[pnums[0]].extracted_data.copy()
|
|
|
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.extractors import ExtractionResult
|
| 7 |
+
from marker.renderers import BaseRenderer
|
| 8 |
|
| 9 |
|
| 10 |
@dataclass
|
|
|
|
| 45 |
json: dict
|
| 46 |
|
| 47 |
|
| 48 |
+
class ExtractionRenderer(BaseRenderer):
|
| 49 |
def __call__(self, outputs: Dict[int, ExtractionResult]) -> ExtractionOutput:
|
| 50 |
pnums = sorted(list(outputs.keys()))
|
| 51 |
merged_result = outputs[pnums[0]].extracted_data.copy()
|
tests/builders/test_ocr_pipeline.py
CHANGED
|
@@ -6,10 +6,10 @@ from marker.schema.text.line import Line
|
|
| 6 |
|
| 7 |
def _ocr_pipeline_test(pdf_document):
|
| 8 |
first_page = pdf_document.pages[0]
|
| 9 |
-
assert first_page.structure[0] ==
|
| 10 |
|
| 11 |
first_block = first_page.get_block(first_page.structure[0])
|
| 12 |
-
assert first_block.text_extraction_method ==
|
| 13 |
assert first_block.block_type == BlockTypes.SectionHeader
|
| 14 |
|
| 15 |
first_text_block: Line = first_page.get_block(first_block.structure[0])
|
|
@@ -17,17 +17,21 @@ def _ocr_pipeline_test(pdf_document):
|
|
| 17 |
|
| 18 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 19 |
assert first_span.block_type == BlockTypes.Span
|
| 20 |
-
assert first_span.text.strip() ==
|
| 21 |
|
| 22 |
# Ensure we match all text lines up properly
|
| 23 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 24 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 25 |
-
text_blocks = first_page.contained_blocks(
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Ensure the bbox sizes match up
|
| 29 |
max_line_position = max([line.polygon.y_end for line in text_lines])
|
| 30 |
-
max_block_position = max(
|
|
|
|
|
|
|
| 31 |
assert max_line_position <= (max_block_position * 1.02)
|
| 32 |
|
| 33 |
|
|
@@ -35,7 +39,7 @@ def _ocr_pipeline_test(pdf_document):
|
|
| 35 |
def test_ocr_pipeline(pdf_document):
|
| 36 |
_ocr_pipeline_test(pdf_document)
|
| 37 |
|
|
|
|
| 38 |
@pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
|
| 39 |
def test_ocr_with_inline_pipeline(pdf_document):
|
| 40 |
_ocr_pipeline_test(pdf_document)
|
| 41 |
-
|
|
|
|
| 6 |
|
| 7 |
def _ocr_pipeline_test(pdf_document):
|
| 8 |
first_page = pdf_document.pages[0]
|
| 9 |
+
assert first_page.structure[0] == "/page/0/SectionHeader/0"
|
| 10 |
|
| 11 |
first_block = first_page.get_block(first_page.structure[0])
|
| 12 |
+
assert first_block.text_extraction_method == "surya"
|
| 13 |
assert first_block.block_type == BlockTypes.SectionHeader
|
| 14 |
|
| 15 |
first_text_block: Line = first_page.get_block(first_block.structure[0])
|
|
|
|
| 17 |
|
| 18 |
first_span = first_page.get_block(first_text_block.structure[0])
|
| 19 |
assert first_span.block_type == BlockTypes.Span
|
| 20 |
+
assert first_span.text.strip() == "Subspace Adversarial Training"
|
| 21 |
|
| 22 |
# Ensure we match all text lines up properly
|
| 23 |
# Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
|
| 24 |
text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 25 |
+
text_blocks = first_page.contained_blocks(
|
| 26 |
+
pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 27 |
+
)
|
| 28 |
+
assert len(text_lines) == 83
|
| 29 |
|
| 30 |
# Ensure the bbox sizes match up
|
| 31 |
max_line_position = max([line.polygon.y_end for line in text_lines])
|
| 32 |
+
max_block_position = max(
|
| 33 |
+
[block.polygon.y_end for block in text_blocks if block.source == "layout"]
|
| 34 |
+
)
|
| 35 |
assert max_line_position <= (max_block_position * 1.02)
|
| 36 |
|
| 37 |
|
|
|
|
| 39 |
def test_ocr_pipeline(pdf_document):
|
| 40 |
_ocr_pipeline_test(pdf_document)
|
| 41 |
|
| 42 |
+
|
| 43 |
@pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
|
| 44 |
def test_ocr_with_inline_pipeline(pdf_document):
|
| 45 |
_ocr_pipeline_test(pdf_document)
|
|
|
tests/converters/test_ocr_converter.py
CHANGED
|
@@ -35,13 +35,13 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
|
|
| 35 |
|
| 36 |
@pytest.mark.config({"page_range": [0]})
|
| 37 |
def test_ocr_converter(config, model_dict, temp_doc):
|
| 38 |
-
_ocr_converter(config, model_dict, temp_doc,
|
| 39 |
|
| 40 |
|
| 41 |
@pytest.mark.filename("pres.pdf")
|
| 42 |
@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
|
| 43 |
def test_ocr_converter_force(config, model_dict, temp_doc):
|
| 44 |
-
pages = _ocr_converter(config, model_dict, temp_doc,
|
| 45 |
lines = [line for line in pages[0].children if line.block_type == "Line"]
|
| 46 |
check_bboxes(pages[0], lines)
|
| 47 |
|
|
|
|
| 35 |
|
| 36 |
@pytest.mark.config({"page_range": [0]})
|
| 37 |
def test_ocr_converter(config, model_dict, temp_doc):
|
| 38 |
+
_ocr_converter(config, model_dict, temp_doc, 84, 2)
|
| 39 |
|
| 40 |
|
| 41 |
@pytest.mark.filename("pres.pdf")
|
| 42 |
@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
|
| 43 |
def test_ocr_converter_force(config, model_dict, temp_doc):
|
| 44 |
+
pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
|
| 45 |
lines = [line for line in pages[0].children if line.block_type == "Line"]
|
| 46 |
check_bboxes(pages[0], lines)
|
| 47 |
|