Merge in dev
Browse files- .github/workflows/scripts.yml +29 -0
- marker/builders/document.py +2 -1
- marker/converters/pdf.py +4 -4
- marker/processors/reference.py +55 -0
- marker/providers/pdf.py +7 -2
- marker/renderers/__init__.py +0 -1
- marker/renderers/html.py +2 -4
- marker/schema/__init__.py +1 -0
- marker/schema/blocks/__init__.py +1 -0
- marker/schema/blocks/base.py +6 -0
- marker/schema/blocks/basetable.py +6 -3
- marker/schema/blocks/caption.py +2 -0
- marker/schema/blocks/equation.py +8 -6
- marker/schema/blocks/figure.py +4 -3
- marker/schema/blocks/footnote.py +2 -0
- marker/schema/blocks/handwriting.py +2 -0
- marker/schema/blocks/pagefooter.py +3 -0
- marker/schema/blocks/pageheader.py +3 -0
- marker/schema/blocks/reference.py +11 -0
- marker/schema/groups/page.py +2 -0
- marker/schema/registry.py +2 -1
- marker/schema/text/span.py +0 -3
- tests/builders/test_pdf_links.py +7 -4
.github/workflows/scripts.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Test CLI scripts

# Smoke-test the command-line entry points on every push.
on: [push]

env:
  TORCH_DEVICE: "cpu"
  OCR_ENGINE: "surya"

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version a string instead of parsing it as a float.
          python-version: "3.11"
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Download benchmark data
        run: |
          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
          unzip -o benchmark_data.zip
      - name: Test single script
        run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
      - name: Test convert script
        run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
|
marker/builders/document.py
CHANGED
|
@@ -43,7 +43,8 @@ class DocumentBuilder(BaseBuilder):
|
|
| 43 |
page_id=p,
|
| 44 |
lowres_image=lowres_images[i],
|
| 45 |
highres_image=highres_images[i],
|
| 46 |
-
polygon=provider.get_page_bbox(p)
|
|
|
|
| 47 |
) for i, p in enumerate(provider.page_range)
|
| 48 |
]
|
| 49 |
DocumentClass: Document = get_block_class(BlockTypes.Document)
|
|
|
|
| 43 |
page_id=p,
|
| 44 |
lowres_image=lowres_images[i],
|
| 45 |
highres_image=highres_images[i],
|
| 46 |
+
polygon=provider.get_page_bbox(p),
|
| 47 |
+
refs=provider.get_page_refs(p)
|
| 48 |
) for i, p in enumerate(provider.page_range)
|
| 49 |
]
|
| 50 |
DocumentClass: Document = get_block_class(BlockTypes.Document)
|
marker/converters/pdf.py
CHANGED
|
@@ -1,13 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
-
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
|
| 4 |
-
|
| 5 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 6 |
|
| 7 |
import inspect
|
| 8 |
from collections import defaultdict
|
| 9 |
-
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
|
| 10 |
from functools import cache
|
|
|
|
| 11 |
|
| 12 |
from marker.processors import BaseProcessor
|
| 13 |
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
|
|
@@ -33,6 +30,7 @@ from marker.processors.llm.llm_image_description import LLMImageDescriptionProce
|
|
| 33 |
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 34 |
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 35 |
from marker.processors.page_header import PageHeaderProcessor
|
|
|
|
| 36 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 37 |
from marker.processors.table import TableProcessor
|
| 38 |
from marker.processors.text import TextProcessor
|
|
@@ -42,6 +40,7 @@ from marker.schema import BlockTypes
|
|
| 42 |
from marker.schema.blocks import Block
|
| 43 |
from marker.schema.registry import register_block_class
|
| 44 |
from marker.util import strings_to_classes
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
class PdfConverter(BaseConverter):
|
|
@@ -80,6 +79,7 @@ class PdfConverter(BaseConverter):
|
|
| 80 |
LLMImageDescriptionProcessor,
|
| 81 |
LLMEquationProcessor,
|
| 82 |
LLMHandwritingProcessor,
|
|
|
|
| 83 |
DebugProcessor,
|
| 84 |
)
|
| 85 |
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 3 |
|
| 4 |
import inspect
|
| 5 |
from collections import defaultdict
|
|
|
|
| 6 |
from functools import cache
|
| 7 |
+
from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
|
| 8 |
|
| 9 |
from marker.processors import BaseProcessor
|
| 10 |
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
|
|
|
|
| 30 |
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 31 |
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 32 |
from marker.processors.page_header import PageHeaderProcessor
|
| 33 |
+
from marker.processors.reference import ReferenceProcessor
|
| 34 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 35 |
from marker.processors.table import TableProcessor
|
| 36 |
from marker.processors.text import TextProcessor
|
|
|
|
| 40 |
from marker.schema.blocks import Block
|
| 41 |
from marker.schema.registry import register_block_class
|
| 42 |
from marker.util import strings_to_classes
|
| 43 |
+
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
|
| 44 |
|
| 45 |
|
| 46 |
class PdfConverter(BaseConverter):
|
|
|
|
| 79 |
LLMImageDescriptionProcessor,
|
| 80 |
LLMEquationProcessor,
|
| 81 |
LLMHandwritingProcessor,
|
| 82 |
+
ReferenceProcessor,
|
| 83 |
DebugProcessor,
|
| 84 |
)
|
| 85 |
|
marker/processors/reference.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
from marker.processors import BaseProcessor
|
| 4 |
+
from marker.schema import BlockTypes
|
| 5 |
+
from marker.schema.blocks import Reference
|
| 6 |
+
from marker.schema.document import Document
|
| 7 |
+
from marker.schema.groups.list import ListGroup
|
| 8 |
+
from marker.schema.groups.table import TableGroup
|
| 9 |
+
from marker.schema.registry import get_block_class
|
| 10 |
+
from marker.schema.groups.picture import PictureGroup
|
| 11 |
+
from marker.schema.groups.figure import FigureGroup
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ReferenceProcessor(BaseProcessor):
    """
    A processor for adding references to the document.

    For each page, every entry in ``page.refs`` is attached to the output
    block whose top-left bbox corner is closest (Euclidean distance) to the
    reference's ``coord``. A new ``Reference`` block is created with the
    matched block's polygon and inserted at the front of that block's
    ``structure``.
    """

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Attach a Reference block to the nearest block for every page reference.

        Args:
            document: The document whose pages are updated in place.
        """
        ReferenceClass: Reference = get_block_class(BlockTypes.Reference)

        for page in document.pages:
            # ``refs`` is declared Optional on the page; treat None like "no refs"
            # and skip early so no block list is built for such pages.
            refs = page.refs or []
            if not refs:
                continue

            # Candidate blocks: top-level blocks, with list/figure/table groups
            # expanded to their direct children.
            # NOTE(review): PictureGroup is imported by this module but is not
            # expanded here - confirm whether that is intentional.
            blocks = []
            for block_id in page.structure or []:
                block = page.get_block(block_id)
                if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
                    blocks.extend(page.get_block(b) for b in block.structure or [])
                else:
                    blocks.append(block)
            blocks = [b for b in blocks if not b.ignore_for_output]
            if not blocks:
                continue

            # presumably each ref.coord is an (x, y) pair in the same space as
            # polygon.bbox - TODO confirm against pdftext's Reference schema
            ref_starts = np.array([ref.coord for ref in refs])
            block_starts = np.array([block.polygon.bbox[:2] for block in blocks])

            # distances[i, j] = distance from block i's corner to ref j's coord
            distances = np.linalg.norm(
                block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :],
                axis=2,
            )
            nearest_block_idxs = distances.argmin(axis=0)

            for ref, block_idx in zip(refs, nearest_block_idxs):
                block = blocks[int(block_idx)]

                ref_block = page.add_full_block(ReferenceClass(
                    ref=ref.ref,
                    polygon=block.polygon,
                    page_id=page.page_id
                ))
                if block.structure is None:
                    block.structure = []
                # Insert first so the reference comes before the block's other children.
                block.structure.insert(0, ref_block.id)
|
marker/providers/pdf.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
-
from typing import Annotated, List, Optional, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
| 8 |
from ftfy import fix_text
|
| 9 |
from pdftext.extraction import dictionary_output
|
|
|
|
| 10 |
from PIL import Image
|
| 11 |
from pypdfium2 import PdfiumError
|
| 12 |
|
|
@@ -75,6 +76,7 @@ class PdfProvider(BaseProvider):
|
|
| 75 |
|
| 76 |
self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
|
| 77 |
self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
|
|
|
|
| 78 |
|
| 79 |
if self.page_range is None:
|
| 80 |
self.page_range = range(len(self.doc))
|
|
@@ -210,7 +212,6 @@ class PdfProvider(BaseProvider):
|
|
| 210 |
page_id=page_id,
|
| 211 |
text_extraction_method="pdftext",
|
| 212 |
url=span.get("url"),
|
| 213 |
-
anchors=span.get("anchors"),
|
| 214 |
)
|
| 215 |
)
|
| 216 |
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
|
|
@@ -222,6 +223,7 @@ class PdfProvider(BaseProvider):
|
|
| 222 |
)
|
| 223 |
if self.check_line_spans(lines):
|
| 224 |
page_lines[page_id] = lines
|
|
|
|
| 225 |
|
| 226 |
return page_lines
|
| 227 |
|
|
@@ -326,6 +328,9 @@ class PdfProvider(BaseProvider):
|
|
| 326 |
def get_page_lines(self, idx: int) -> List[ProviderOutput]:
|
| 327 |
return self.page_lines[idx]
|
| 328 |
|
|
|
|
|
|
|
|
|
|
| 329 |
@staticmethod
|
| 330 |
def _get_fontname(font) -> str:
|
| 331 |
font_name = ""
|
|
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
+
from typing import Annotated, Dict, List, Optional, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
| 8 |
from ftfy import fix_text
|
| 9 |
from pdftext.extraction import dictionary_output
|
| 10 |
+
from pdftext.schema import Reference
|
| 11 |
from PIL import Image
|
| 12 |
from pypdfium2 import PdfiumError
|
| 13 |
|
|
|
|
| 76 |
|
| 77 |
self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
|
| 78 |
self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
|
| 79 |
+
self.page_refs: Dict[int, List[Reference]] = {i: [] for i in range(len(self.doc))}
|
| 80 |
|
| 81 |
if self.page_range is None:
|
| 82 |
self.page_range = range(len(self.doc))
|
|
|
|
| 212 |
page_id=page_id,
|
| 213 |
text_extraction_method="pdftext",
|
| 214 |
url=span.get("url"),
|
|
|
|
| 215 |
)
|
| 216 |
)
|
| 217 |
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
|
|
|
|
| 223 |
)
|
| 224 |
if self.check_line_spans(lines):
|
| 225 |
page_lines[page_id] = lines
|
| 226 |
+
self.page_refs[page_id] = page["refs"]
|
| 227 |
|
| 228 |
return page_lines
|
| 229 |
|
|
|
|
| 328 |
def get_page_lines(self, idx: int) -> List[ProviderOutput]:
|
| 329 |
return self.page_lines[idx]
|
| 330 |
|
| 331 |
+
def get_page_refs(self, idx: int):
|
| 332 |
+
return self.page_refs[idx]
|
| 333 |
+
|
| 334 |
@staticmethod
|
| 335 |
def _get_fontname(font) -> str:
|
| 336 |
font_name = ""
|
marker/renderers/__init__.py
CHANGED
|
@@ -15,7 +15,6 @@ from marker.util import assign_config
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
| 18 |
-
remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 19 |
image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 20 |
extract_images: Annotated[bool, "Extract images from the document."] = True
|
| 21 |
image_extraction_mode: Annotated[
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
|
|
|
| 18 |
image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 19 |
extract_images: Annotated[bool, "Extract images from the document."] = True
|
| 20 |
image_extraction_mode: Annotated[
|
marker/renderers/html.py
CHANGED
|
@@ -60,14 +60,12 @@ class HTMLRenderer(BaseRenderer):
|
|
| 60 |
ref_block_id: BlockId = item.id
|
| 61 |
break
|
| 62 |
|
| 63 |
-
if ref_block_id.block_type in self.
|
| 64 |
-
ref.replace_with('')
|
| 65 |
-
elif ref_block_id.block_type in self.image_blocks:
|
| 66 |
if self.extract_images:
|
| 67 |
image = self.extract_image(document, ref_block_id)
|
| 68 |
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
|
| 69 |
images[image_name] = image
|
| 70 |
-
ref.replace_with(BeautifulSoup(f"<p
|
| 71 |
else:
|
| 72 |
# This will be the image description if using llm mode, or empty if not
|
| 73 |
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
|
|
|
|
| 60 |
ref_block_id: BlockId = item.id
|
| 61 |
break
|
| 62 |
|
| 63 |
+
if ref_block_id.block_type in self.image_blocks:
|
|
|
|
|
|
|
| 64 |
if self.extract_images:
|
| 65 |
image = self.extract_image(document, ref_block_id)
|
| 66 |
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
|
| 67 |
images[image_name] = image
|
| 68 |
+
ref.replace_with(BeautifulSoup(f"<p>{content}<img src='{image_name}'></p>", 'html.parser'))
|
| 69 |
else:
|
| 70 |
# This will be the image description if using llm mode, or empty if not
|
| 71 |
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
|
marker/schema/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ class BlockTypes(str, Enum):
|
|
| 28 |
Document = auto()
|
| 29 |
ComplexRegion = auto()
|
| 30 |
TableCell = auto()
|
|
|
|
| 31 |
|
| 32 |
def __str__(self):
|
| 33 |
return self.name
|
|
|
|
| 28 |
Document = auto()
|
| 29 |
ComplexRegion = auto()
|
| 30 |
TableCell = auto()
|
| 31 |
+
Reference = auto()
|
| 32 |
|
| 33 |
def __str__(self):
|
| 34 |
return self.name
|
marker/schema/blocks/__init__.py
CHANGED
|
@@ -19,3 +19,4 @@ from marker.schema.blocks.text import Text
|
|
| 19 |
from marker.schema.blocks.toc import TableOfContents
|
| 20 |
from marker.schema.blocks.complexregion import ComplexRegion
|
| 21 |
from marker.schema.blocks.tablecell import TableCell
|
|
|
|
|
|
| 19 |
from marker.schema.blocks.toc import TableOfContents
|
| 20 |
from marker.schema.blocks.complexregion import ComplexRegion
|
| 21 |
from marker.schema.blocks.tablecell import TableCell
|
| 22 |
+
from marker.schema.blocks.reference import Reference
|
marker/schema/blocks/base.py
CHANGED
|
@@ -12,6 +12,7 @@ if TYPE_CHECKING:
|
|
| 12 |
from marker.schema.document import Document
|
| 13 |
from marker.schema.groups.page import PageGroup
|
| 14 |
|
|
|
|
| 15 |
class BlockMetadata(BaseModel):
|
| 16 |
llm_request_count: int = 0
|
| 17 |
llm_error_count: int = 0
|
|
@@ -78,6 +79,7 @@ class Block(BaseModel):
|
|
| 78 |
text_extraction_method: Optional[Literal['pdftext', 'surya', 'gemini']] = None
|
| 79 |
structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
|
| 80 |
ignore_for_output: bool = False # Whether this block should be ignored in output
|
|
|
|
| 81 |
source: Literal['layout', 'heuristics', 'processor'] = 'layout'
|
| 82 |
top_k: Optional[Dict[BlockTypes, float]] = None
|
| 83 |
metadata: BlockMetadata | None = None
|
|
@@ -187,6 +189,10 @@ class Block(BaseModel):
|
|
| 187 |
template = ""
|
| 188 |
for c in child_blocks:
|
| 189 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
return template
|
| 191 |
|
| 192 |
def assign_section_hierarchy(self, section_hierarchy):
|
|
|
|
| 12 |
from marker.schema.document import Document
|
| 13 |
from marker.schema.groups.page import PageGroup
|
| 14 |
|
| 15 |
+
|
| 16 |
class BlockMetadata(BaseModel):
|
| 17 |
llm_request_count: int = 0
|
| 18 |
llm_error_count: int = 0
|
|
|
|
| 79 |
text_extraction_method: Optional[Literal['pdftext', 'surya', 'gemini']] = None
|
| 80 |
structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
|
| 81 |
ignore_for_output: bool = False # Whether this block should be ignored in output
|
| 82 |
+
replace_output_newlines: bool = False # Whether to replace newlines with spaces in output
|
| 83 |
source: Literal['layout', 'heuristics', 'processor'] = 'layout'
|
| 84 |
top_k: Optional[Dict[BlockTypes, float]] = None
|
| 85 |
metadata: BlockMetadata | None = None
|
|
|
|
| 189 |
template = ""
|
| 190 |
for c in child_blocks:
|
| 191 |
template += f"<content-ref src='{c.id}'></content-ref>"
|
| 192 |
+
|
| 193 |
+
if self.replace_output_newlines:
|
| 194 |
+
template = "<p>" + template.replace("\n", " ") + "</p>"
|
| 195 |
+
|
| 196 |
return template
|
| 197 |
|
| 198 |
def assign_section_hierarchy(self, section_hierarchy):
|
marker/schema/blocks/basetable.py
CHANGED
|
@@ -24,13 +24,16 @@ class BaseTable(Block):
|
|
| 24 |
|
| 25 |
|
| 26 |
def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
if self.html:
|
| 28 |
# LLM processor
|
| 29 |
-
return self.html
|
| 30 |
elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
|
| 31 |
# Table processor
|
| 32 |
-
return self.format_cells(document, child_blocks)
|
| 33 |
else:
|
| 34 |
# Default text lines and spans
|
| 35 |
-
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 36 |
return f"<p>{template}</p>"
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
|
| 27 |
+
# Filter out the table cells, so they don't render twice
|
| 28 |
+
selected_blocks = [b for b in child_blocks if b.id.block_type != BlockTypes.TableCell]
|
| 29 |
+
template = super().assemble_html(document, selected_blocks, parent_structure)
|
| 30 |
+
|
| 31 |
if self.html:
|
| 32 |
# LLM processor
|
| 33 |
+
return template + self.html
|
| 34 |
elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
|
| 35 |
# Table processor
|
| 36 |
+
return template + self.format_cells(document, child_blocks)
|
| 37 |
else:
|
| 38 |
# Default text lines and spans
|
|
|
|
| 39 |
return f"<p>{template}</p>"
|
marker/schema/blocks/caption.py
CHANGED
|
@@ -5,8 +5,10 @@ from marker.schema.blocks import Block
|
|
| 5 |
class Caption(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.Caption
|
| 7 |
block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
|
|
|
|
| 8 |
|
| 9 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 10 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 11 |
template = template.replace("\n", " ")
|
| 12 |
return f"<p>{template}</p>"
|
|
|
|
|
|
| 5 |
class Caption(Block):
    """Caption block; rendered as a single paragraph with newlines flattened."""
    block_type: BlockTypes = BlockTypes.Caption
    block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
    # Makes Block.assemble_html replace newlines with spaces and wrap the output in <p>.
    replace_output_newlines: bool = True

    def assemble_html(self, document, child_blocks, parent_structure):
        """
        Return the caption HTML.

        With replace_output_newlines set, the base implementation already
        flattens newlines and wraps the template in <p>...</p>; wrapping it
        again here would emit nested <p><p>...</p></p>.
        """
        return super().assemble_html(document, child_blocks, parent_structure)
|
| 14 |
+
|
marker/schema/blocks/equation.py
CHANGED
|
@@ -11,7 +11,9 @@ class Equation(Block):
|
|
| 11 |
|
| 12 |
def assemble_html(self, document, child_blocks, parent_structure=None):
|
| 13 |
if self.latex:
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
try:
|
| 17 |
latex = self.parse_latex(html.escape(self.latex))
|
|
@@ -44,9 +46,9 @@ class Equation(Block):
|
|
| 44 |
("$$", "block"),
|
| 45 |
("$", "inline")
|
| 46 |
]
|
| 47 |
-
|
| 48 |
-
text = text.replace("\n", "<br>")
|
| 49 |
-
|
| 50 |
i = 0
|
| 51 |
stack = []
|
| 52 |
result = []
|
|
@@ -73,7 +75,7 @@ class Equation(Block):
|
|
| 73 |
else: # No delimiter match
|
| 74 |
buffer += text[i]
|
| 75 |
i += 1
|
| 76 |
-
|
| 77 |
if buffer:
|
| 78 |
result.append({"class": "text", "content": buffer})
|
| 79 |
-
return result
|
|
|
|
| 11 |
|
| 12 |
def assemble_html(self, document, child_blocks, parent_structure=None):
|
| 13 |
if self.latex:
|
| 14 |
+
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
|
| 15 |
+
html_out = super().assemble_html(child_ref_blocks, parent_structure)
|
| 16 |
+
html_out += f"<p block-type='{self.block_type}'>"
|
| 17 |
|
| 18 |
try:
|
| 19 |
latex = self.parse_latex(html.escape(self.latex))
|
|
|
|
| 46 |
("$$", "block"),
|
| 47 |
("$", "inline")
|
| 48 |
]
|
| 49 |
+
|
| 50 |
+
text = text.replace("\n", "<br>") # we can't handle \n's inside <p> properly if we don't do this
|
| 51 |
+
|
| 52 |
i = 0
|
| 53 |
stack = []
|
| 54 |
result = []
|
|
|
|
| 75 |
else: # No delimiter match
|
| 76 |
buffer += text[i]
|
| 77 |
i += 1
|
| 78 |
+
|
| 79 |
if buffer:
|
| 80 |
result.append({"class": "text", "content": buffer})
|
| 81 |
+
return result
|
marker/schema/blocks/figure.py
CHANGED
|
@@ -8,7 +8,8 @@ class Figure(Block):
|
|
| 8 |
block_description: str = "A chart or other image that contains data."
|
| 9 |
|
| 10 |
def assemble_html(self, document, child_blocks, parent_structure):
|
|
|
|
|
|
|
| 11 |
if self.description:
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
return ""
|
|
|
|
| 8 |
block_description: str = "A chart or other image that contains data."
|
| 9 |
|
| 10 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 11 |
+
child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
|
| 12 |
+
html = super().assemble_html(document, child_ref_blocks, parent_structure)
|
| 13 |
if self.description:
|
| 14 |
+
html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
|
| 15 |
+
return html
|
|
|
marker/schema/blocks/footnote.py
CHANGED
|
@@ -5,9 +5,11 @@ from marker.schema.blocks import Block
|
|
| 5 |
class Footnote(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.Footnote
|
| 7 |
block_description: str = "A footnote that explains a term or concept in the document."
|
|
|
|
| 8 |
|
| 9 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 10 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 11 |
template = template.replace("\n", " ")
|
| 12 |
|
| 13 |
return f"<p>{template}</p>"
|
|
|
|
|
|
| 5 |
class Footnote(Block):
    """Footnote block; rendered as a single paragraph with newlines flattened."""
    block_type: BlockTypes = BlockTypes.Footnote
    block_description: str = "A footnote that explains a term or concept in the document."
    # Makes Block.assemble_html replace newlines with spaces and wrap the output in <p>.
    replace_output_newlines: bool = True

    def assemble_html(self, document, child_blocks, parent_structure):
        """
        Return the footnote HTML.

        With replace_output_newlines set, the base implementation already
        flattens newlines and wraps the template in <p>...</p>; wrapping it
        again here would emit nested <p><p>...</p></p>.
        """
        return super().assemble_html(document, child_blocks, parent_structure)
|
| 15 |
+
|
marker/schema/blocks/handwriting.py
CHANGED
|
@@ -6,6 +6,7 @@ class Handwriting(Block):
|
|
| 6 |
block_type: BlockTypes = BlockTypes.Handwriting
|
| 7 |
block_description: str = "A region that contains handwriting."
|
| 8 |
html: str | None = None
|
|
|
|
| 9 |
|
| 10 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 11 |
if self.html:
|
|
@@ -14,3 +15,4 @@ class Handwriting(Block):
|
|
| 14 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 15 |
template = template.replace("\n", " ")
|
| 16 |
return f"<p>{template}</p>"
|
|
|
|
|
|
| 6 |
block_type: BlockTypes = BlockTypes.Handwriting
|
| 7 |
block_description: str = "A region that contains handwriting."
|
| 8 |
html: str | None = None
|
| 9 |
+
replace_output_newlines: bool = True
|
| 10 |
|
| 11 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 12 |
if self.html:
|
|
|
|
| 15 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 16 |
template = template.replace("\n", " ")
|
| 17 |
return f"<p>{template}</p>"
|
| 18 |
+
|
marker/schema/blocks/pagefooter.py
CHANGED
|
@@ -5,6 +5,8 @@ from marker.schema.blocks import Block
|
|
| 5 |
class PageFooter(Block):
|
| 6 |
block_type: str = BlockTypes.PageFooter
|
| 7 |
block_description: str = "Text that appears at the bottom of a page, like a page number."
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 10 |
if self.ignore_for_output:
|
|
@@ -13,3 +15,4 @@ class PageFooter(Block):
|
|
| 13 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 14 |
template = template.replace("\n", " ")
|
| 15 |
return f"<p>{template}</p>"
|
|
|
|
|
|
| 5 |
class PageFooter(Block):
|
| 6 |
block_type: str = BlockTypes.PageFooter
|
| 7 |
block_description: str = "Text that appears at the bottom of a page, like a page number."
|
| 8 |
+
replace_output_newlines: bool = True
|
| 9 |
+
ignore_for_output: bool = True
|
| 10 |
|
| 11 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 12 |
if self.ignore_for_output:
|
|
|
|
| 15 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 16 |
template = template.replace("\n", " ")
|
| 17 |
return f"<p>{template}</p>"
|
| 18 |
+
|
marker/schema/blocks/pageheader.py
CHANGED
|
@@ -5,6 +5,8 @@ from marker.schema.blocks import Block
|
|
| 5 |
class PageHeader(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.PageHeader
|
| 7 |
block_description: str = "Text that appears at the top of a page, like a page title."
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 10 |
if self.ignore_for_output:
|
|
@@ -13,3 +15,4 @@ class PageHeader(Block):
|
|
| 13 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 14 |
template = template.replace("\n", " ")
|
| 15 |
return f"<p>{template}</p>"
|
|
|
|
|
|
| 5 |
class PageHeader(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.PageHeader
|
| 7 |
block_description: str = "Text that appears at the top of a page, like a page title."
|
| 8 |
+
replace_output_newlines: bool = True
|
| 9 |
+
ignore_for_output: bool = True
|
| 10 |
|
| 11 |
def assemble_html(self, document, child_blocks, parent_structure):
|
| 12 |
if self.ignore_for_output:
|
|
|
|
| 15 |
template = super().assemble_html(document, child_blocks, parent_structure)
|
| 16 |
template = template.replace("\n", " ")
|
| 17 |
return f"<p>{template}</p>"
|
| 18 |
+
|
marker/schema/blocks/reference.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from marker.schema import BlockTypes
|
| 2 |
+
from marker.schema.blocks import Block
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Reference(Block):
    """Anchor block that renders its children inside ``<span id='<ref>'>``."""
    block_type: BlockTypes = BlockTypes.Reference
    ref: str  # emitted as the span's id attribute

    def assemble_html(self, document, child_blocks, parent_structure=None):
        """
        Wrap the base template in a span whose id is this block's ``ref``.

        Takes ``document`` as the first argument, matching the other
        ``assemble_html`` overrides; without it the positional arguments
        passed on to the base class are shifted by one.
        """
        template = super().assemble_html(document, child_blocks, parent_structure)
        return f"<span id='{self.ref}'>{template}</span>"
|
marker/schema/groups/page.py
CHANGED
|
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
| 3 |
|
| 4 |
from PIL import Image
|
| 5 |
|
|
|
|
| 6 |
from marker.providers import ProviderOutput
|
| 7 |
from marker.schema import BlockTypes
|
| 8 |
from marker.schema.blocks import Block, BlockId, Text
|
|
@@ -23,6 +24,7 @@ class PageGroup(Group):
|
|
| 23 |
excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
|
| 24 |
maximum_assignment_distance: float = 20 # pixels
|
| 25 |
block_description: str = "A single page in the document."
|
|
|
|
| 26 |
|
| 27 |
def incr_block_id(self):
|
| 28 |
if self.block_id is None:
|
|
|
|
| 3 |
|
| 4 |
from PIL import Image
|
| 5 |
|
| 6 |
+
from pdftext.schema import Reference
|
| 7 |
from marker.providers import ProviderOutput
|
| 8 |
from marker.schema import BlockTypes
|
| 9 |
from marker.schema.blocks import Block, BlockId, Text
|
|
|
|
| 24 |
excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
|
| 25 |
maximum_assignment_distance: float = 20 # pixels
|
| 26 |
block_description: str = "A single page in the document."
|
| 27 |
+
refs: List[Reference] | None = None
|
| 28 |
|
| 29 |
def incr_block_id(self):
|
| 30 |
if self.block_id is None:
|
marker/schema/registry.py
CHANGED
|
@@ -6,7 +6,7 @@ from marker.schema.blocks import Block, Caption, Code, Equation, Figure, \
|
|
| 6 |
Footnote, Form, Handwriting, InlineMath, \
|
| 7 |
ListItem, PageFooter, PageHeader, Picture, \
|
| 8 |
SectionHeader, Table, TableOfContents, \
|
| 9 |
-
Text, ComplexRegion, TableCell
|
| 10 |
from marker.schema.document import Document
|
| 11 |
from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \
|
| 12 |
PictureGroup, TableGroup
|
|
@@ -51,6 +51,7 @@ register_block_class(BlockTypes.Text, Text)
|
|
| 51 |
register_block_class(BlockTypes.TableOfContents, TableOfContents)
|
| 52 |
register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
|
| 53 |
register_block_class(BlockTypes.TableCell, TableCell)
|
|
|
|
| 54 |
register_block_class(BlockTypes.Document, Document)
|
| 55 |
|
| 56 |
assert len(BLOCK_REGISTRY) == len(BlockTypes)
|
|
|
|
| 6 |
Footnote, Form, Handwriting, InlineMath, \
|
| 7 |
ListItem, PageFooter, PageHeader, Picture, \
|
| 8 |
SectionHeader, Table, TableOfContents, \
|
| 9 |
+
Text, ComplexRegion, TableCell, Reference
|
| 10 |
from marker.schema.document import Document
|
| 11 |
from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \
|
| 12 |
PictureGroup, TableGroup
|
|
|
|
| 51 |
register_block_class(BlockTypes.TableOfContents, TableOfContents)
|
| 52 |
register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
|
| 53 |
register_block_class(BlockTypes.TableCell, TableCell)
|
| 54 |
+
register_block_class(BlockTypes.Reference, Reference)
|
| 55 |
register_block_class(BlockTypes.Document, Document)
|
| 56 |
|
| 57 |
assert len(BLOCK_REGISTRY) == len(BlockTypes)
|
marker/schema/text/span.py
CHANGED
|
@@ -25,7 +25,6 @@ class Span(Block):
|
|
| 25 |
formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
|
| 26 |
has_superscript: bool = False
|
| 27 |
url: Optional[str] = None
|
| 28 |
-
anchors: Optional[List[str]] = None
|
| 29 |
|
| 30 |
@property
|
| 31 |
def bold(self):
|
|
@@ -75,6 +74,4 @@ class Span(Block):
|
|
| 75 |
elif self.math:
|
| 76 |
text = f"<math display='inline'>{text}</math>"
|
| 77 |
|
| 78 |
-
if self.anchors:
|
| 79 |
-
text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
|
| 80 |
return text
|
|
|
|
| 25 |
formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
|
| 26 |
has_superscript: bool = False
|
| 27 |
url: Optional[str] = None
|
|
|
|
| 28 |
|
| 29 |
@property
|
| 30 |
def bold(self):
|
|
|
|
| 74 |
elif self.math:
|
| 75 |
text = f"<math display='inline'>{text}</math>"
|
| 76 |
|
|
|
|
|
|
|
| 77 |
return text
|
tests/builders/test_pdf_links.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import pytest
|
| 2 |
|
| 3 |
from marker.converters.pdf import PdfConverter
|
|
@@ -8,9 +10,8 @@ from marker.schema.document import Document
|
|
| 8 |
|
| 9 |
@pytest.mark.filename("arxiv_test.pdf")
|
| 10 |
@pytest.mark.output_format("markdown")
|
| 11 |
-
@pytest.mark.config({"page_range": [1]})
|
| 12 |
def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
|
| 13 |
-
first_page = pdf_document.pages[
|
| 14 |
|
| 15 |
for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
|
| 16 |
if "II." in section_header_span.text:
|
|
@@ -22,11 +23,13 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
|
|
| 22 |
section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
|
| 23 |
assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
|
| 24 |
|
| 25 |
-
|
| 26 |
-
assert section_header_span.anchors == ['page-1-0']
|
| 27 |
|
| 28 |
markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
|
| 29 |
markdown = markdown_output.markdown
|
| 30 |
|
| 31 |
assert '[II.](#page-1-0)' in markdown
|
| 32 |
assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
import pytest
|
| 4 |
|
| 5 |
from marker.converters.pdf import PdfConverter
|
|
|
|
| 10 |
|
| 11 |
@pytest.mark.filename("arxiv_test.pdf")
|
| 12 |
@pytest.mark.output_format("markdown")
|
|
|
|
| 13 |
def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
|
| 14 |
+
first_page = pdf_document.pages[1]
|
| 15 |
|
| 16 |
for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
|
| 17 |
if "II." in section_header_span.text:
|
|
|
|
| 23 |
section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
|
| 24 |
assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
|
| 25 |
|
| 26 |
+
assert first_page.refs[0].ref == "page-1-0"
|
|
|
|
| 27 |
|
| 28 |
markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
|
| 29 |
markdown = markdown_output.markdown
|
| 30 |
|
| 31 |
assert '[II.](#page-1-0)' in markdown
|
| 32 |
assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
|
| 33 |
+
|
| 34 |
+
for ref in set([f'<span id="page-{m[0]}-{m[1]}"/>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
|
| 35 |
+
assert ref in markdown, f"Reference {ref} not found in markdown"
|