Vik Paruchuri committed on
Commit
32b6790
·
2 Parent(s): adc9952 333b95b

Merge in dev

Browse files
.github/workflows/scripts.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CI workflow: on every push, install the project with poetry, download the
# benchmark PDFs, and smoke-test both CLI entry points on a single page.
name: Test CLI scripts

on: [push]

env:
  # Run on CPU and select the surya OCR engine for the CLI invocations below.
  TORCH_DEVICE: "cpu"
  OCR_ENGINE: "surya"

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the version as a string instead of a float
          # (an unquoted 3.10 would be read as 3.1).
          python-version: "3.11"
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Download benchmark data
        run: |
          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
          unzip -o benchmark_data.zip
      - name: Test single script
        run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
      - name: Test convert script
        run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
marker/builders/document.py CHANGED
@@ -43,7 +43,8 @@ class DocumentBuilder(BaseBuilder):
43
  page_id=p,
44
  lowres_image=lowres_images[i],
45
  highres_image=highres_images[i],
46
- polygon=provider.get_page_bbox(p)
 
47
  ) for i, p in enumerate(provider.page_range)
48
  ]
49
  DocumentClass: Document = get_block_class(BlockTypes.Document)
 
43
  page_id=p,
44
  lowres_image=lowres_images[i],
45
  highres_image=highres_images[i],
46
+ polygon=provider.get_page_bbox(p),
47
+ refs=provider.get_page_refs(p)
48
  ) for i, p in enumerate(provider.page_range)
49
  ]
50
  DocumentClass: Document = get_block_class(BlockTypes.Document)
marker/converters/pdf.py CHANGED
@@ -1,13 +1,10 @@
1
  import os
2
-
3
- from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
4
-
5
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
6
 
7
  import inspect
8
  from collections import defaultdict
9
- from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
10
  from functools import cache
 
11
 
12
  from marker.processors import BaseProcessor
13
  from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
@@ -33,6 +30,7 @@ from marker.processors.llm.llm_image_description import LLMImageDescriptionProce
33
  from marker.processors.llm.llm_table import LLMTableProcessor
34
  from marker.processors.llm.llm_text import LLMTextProcessor
35
  from marker.processors.page_header import PageHeaderProcessor
 
36
  from marker.processors.sectionheader import SectionHeaderProcessor
37
  from marker.processors.table import TableProcessor
38
  from marker.processors.text import TextProcessor
@@ -42,6 +40,7 @@ from marker.schema import BlockTypes
42
  from marker.schema.blocks import Block
43
  from marker.schema.registry import register_block_class
44
  from marker.util import strings_to_classes
 
45
 
46
 
47
  class PdfConverter(BaseConverter):
@@ -80,6 +79,7 @@ class PdfConverter(BaseConverter):
80
  LLMImageDescriptionProcessor,
81
  LLMEquationProcessor,
82
  LLMHandwritingProcessor,
 
83
  DebugProcessor,
84
  )
85
 
 
1
  import os
 
 
 
2
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
3
 
4
  import inspect
5
  from collections import defaultdict
 
6
  from functools import cache
7
+ from typing import Annotated, Any, Dict, List, Optional, Type, Tuple
8
 
9
  from marker.processors import BaseProcessor
10
  from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
 
30
  from marker.processors.llm.llm_table import LLMTableProcessor
31
  from marker.processors.llm.llm_text import LLMTextProcessor
32
  from marker.processors.page_header import PageHeaderProcessor
33
+ from marker.processors.reference import ReferenceProcessor
34
  from marker.processors.sectionheader import SectionHeaderProcessor
35
  from marker.processors.table import TableProcessor
36
  from marker.processors.text import TextProcessor
 
40
  from marker.schema.blocks import Block
41
  from marker.schema.registry import register_block_class
42
  from marker.util import strings_to_classes
43
+ from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
44
 
45
 
46
  class PdfConverter(BaseConverter):
 
79
  LLMImageDescriptionProcessor,
80
  LLMEquationProcessor,
81
  LLMHandwritingProcessor,
82
+ ReferenceProcessor,
83
  DebugProcessor,
84
  )
85
 
marker/processors/reference.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from marker.processors import BaseProcessor
4
+ from marker.schema import BlockTypes
5
+ from marker.schema.blocks import Reference
6
+ from marker.schema.document import Document
7
+ from marker.schema.groups.list import ListGroup
8
+ from marker.schema.groups.table import TableGroup
9
+ from marker.schema.registry import get_block_class
10
+ from marker.schema.groups.picture import PictureGroup
11
+ from marker.schema.groups.figure import FigureGroup
12
+
13
+
14
class ReferenceProcessor(BaseProcessor):
    """
    A processor for adding references to the document.

    For each reference recorded on a page, the output block whose top-left
    corner is nearest to the reference coordinate receives a new Reference
    block, inserted at the front of that block's structure.
    """

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """
        Attach a Reference block to the nearest block for every page reference.

        Args:
            document: The document whose pages are updated in place.
        """
        ReferenceClass: Reference = get_block_class(BlockTypes.Reference)

        for page in document.pages:
            # `refs` is declared Optional on the page and defaults to None;
            # treat a missing list the same as an empty one.
            refs = page.refs or []
            if not refs:
                continue

            # Candidate targets: top-level blocks, with list/figure/table
            # groups expanded into their direct children.
            # NOTE(review): PictureGroup is imported by this module but is not
            # expanded here - confirm whether that is intentional.
            blocks = []
            for block_id in page.structure or []:
                block = page.get_block(block_id)
                if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
                    blocks.extend(page.get_block(b) for b in block.structure or [])
                else:
                    blocks.append(block)
            blocks = [b for b in blocks if not b.ignore_for_output]
            if not blocks:
                continue

            # assumes ref.coord is an (x, y) pair comparable to bbox[:2] - TODO confirm
            ref_starts = np.array([ref.coord for ref in refs])
            block_starts = np.array([block.polygon.bbox[:2] for block in blocks])

            # distances[i, j] is the Euclidean distance from block i to ref j.
            distances = np.linalg.norm(
                block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :],
                axis=2,
            )
            # One argmin over the block axis gives the nearest block per ref.
            nearest_block = np.argmin(distances, axis=0)

            for ref, block_idx in zip(refs, nearest_block):
                block = blocks[int(block_idx)]

                ref_block = page.add_full_block(ReferenceClass(
                    ref=ref.ref,
                    polygon=block.polygon,
                    page_id=page.page_id
                ))
                if block.structure is None:
                    block.structure = []
                # Put the anchor first so it precedes the block's own content.
                block.structure.insert(0, ref_block.id)
marker/providers/pdf.py CHANGED
@@ -1,12 +1,13 @@
1
  import atexit
2
  import ctypes
3
  import re
4
- from typing import Annotated, List, Optional, Set
5
 
6
  import pypdfium2 as pdfium
7
  import pypdfium2.raw as pdfium_c
8
  from ftfy import fix_text
9
  from pdftext.extraction import dictionary_output
 
10
  from PIL import Image
11
  from pypdfium2 import PdfiumError
12
 
@@ -75,6 +76,7 @@ class PdfProvider(BaseProvider):
75
 
76
  self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
77
  self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
 
78
 
79
  if self.page_range is None:
80
  self.page_range = range(len(self.doc))
@@ -210,7 +212,6 @@ class PdfProvider(BaseProvider):
210
  page_id=page_id,
211
  text_extraction_method="pdftext",
212
  url=span.get("url"),
213
- anchors=span.get("anchors"),
214
  )
215
  )
216
  polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
@@ -222,6 +223,7 @@ class PdfProvider(BaseProvider):
222
  )
223
  if self.check_line_spans(lines):
224
  page_lines[page_id] = lines
 
225
 
226
  return page_lines
227
 
@@ -326,6 +328,9 @@ class PdfProvider(BaseProvider):
326
  def get_page_lines(self, idx: int) -> List[ProviderOutput]:
327
  return self.page_lines[idx]
328
 
 
 
 
329
  @staticmethod
330
  def _get_fontname(font) -> str:
331
  font_name = ""
 
1
  import atexit
2
  import ctypes
3
  import re
4
+ from typing import Annotated, Dict, List, Optional, Set
5
 
6
  import pypdfium2 as pdfium
7
  import pypdfium2.raw as pdfium_c
8
  from ftfy import fix_text
9
  from pdftext.extraction import dictionary_output
10
+ from pdftext.schema import Reference
11
  from PIL import Image
12
  from pypdfium2 import PdfiumError
13
 
 
76
 
77
  self.doc: pdfium.PdfDocument = pdfium.PdfDocument(self.filepath)
78
  self.page_lines: ProviderPageLines = {i: [] for i in range(len(self.doc))}
79
+ self.page_refs: Dict[int, List[Reference]] = {i: [] for i in range(len(self.doc))}
80
 
81
  if self.page_range is None:
82
  self.page_range = range(len(self.doc))
 
212
  page_id=page_id,
213
  text_extraction_method="pdftext",
214
  url=span.get("url"),
 
215
  )
216
  )
217
  polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
 
223
  )
224
  if self.check_line_spans(lines):
225
  page_lines[page_id] = lines
226
+ self.page_refs[page_id] = page["refs"]
227
 
228
  return page_lines
229
 
 
328
  def get_page_lines(self, idx: int) -> List[ProviderOutput]:
329
  return self.page_lines[idx]
330
 
331
+ def get_page_refs(self, idx: int):
332
+ return self.page_refs[idx]
333
+
334
  @staticmethod
335
  def _get_fontname(font) -> str:
336
  font_name = ""
marker/renderers/__init__.py CHANGED
@@ -15,7 +15,6 @@ from marker.util import assign_config
15
 
16
 
17
  class BaseRenderer:
18
- remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
19
  image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
20
  extract_images: Annotated[bool, "Extract images from the document."] = True
21
  image_extraction_mode: Annotated[
 
15
 
16
 
17
  class BaseRenderer:
 
18
  image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
19
  extract_images: Annotated[bool, "Extract images from the document."] = True
20
  image_extraction_mode: Annotated[
marker/renderers/html.py CHANGED
@@ -60,14 +60,12 @@ class HTMLRenderer(BaseRenderer):
60
  ref_block_id: BlockId = item.id
61
  break
62
 
63
- if ref_block_id.block_type in self.remove_blocks:
64
- ref.replace_with('')
65
- elif ref_block_id.block_type in self.image_blocks:
66
  if self.extract_images:
67
  image = self.extract_image(document, ref_block_id)
68
  image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
69
  images[image_name] = image
70
- ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
71
  else:
72
  # This will be the image description if using llm mode, or empty if not
73
  ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
 
60
  ref_block_id: BlockId = item.id
61
  break
62
 
63
+ if ref_block_id.block_type in self.image_blocks:
 
 
64
  if self.extract_images:
65
  image = self.extract_image(document, ref_block_id)
66
  image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
67
  images[image_name] = image
68
+ ref.replace_with(BeautifulSoup(f"<p>{content}<img src='{image_name}'></p>", 'html.parser'))
69
  else:
70
  # This will be the image description if using llm mode, or empty if not
71
  ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
marker/schema/__init__.py CHANGED
@@ -28,6 +28,7 @@ class BlockTypes(str, Enum):
28
  Document = auto()
29
  ComplexRegion = auto()
30
  TableCell = auto()
 
31
 
32
  def __str__(self):
33
  return self.name
 
28
  Document = auto()
29
  ComplexRegion = auto()
30
  TableCell = auto()
31
+ Reference = auto()
32
 
33
  def __str__(self):
34
  return self.name
marker/schema/blocks/__init__.py CHANGED
@@ -19,3 +19,4 @@ from marker.schema.blocks.text import Text
19
  from marker.schema.blocks.toc import TableOfContents
20
  from marker.schema.blocks.complexregion import ComplexRegion
21
  from marker.schema.blocks.tablecell import TableCell
 
 
19
  from marker.schema.blocks.toc import TableOfContents
20
  from marker.schema.blocks.complexregion import ComplexRegion
21
  from marker.schema.blocks.tablecell import TableCell
22
+ from marker.schema.blocks.reference import Reference
marker/schema/blocks/base.py CHANGED
@@ -12,6 +12,7 @@ if TYPE_CHECKING:
12
  from marker.schema.document import Document
13
  from marker.schema.groups.page import PageGroup
14
 
 
15
  class BlockMetadata(BaseModel):
16
  llm_request_count: int = 0
17
  llm_error_count: int = 0
@@ -78,6 +79,7 @@ class Block(BaseModel):
78
  text_extraction_method: Optional[Literal['pdftext', 'surya', 'gemini']] = None
79
  structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
80
  ignore_for_output: bool = False # Whether this block should be ignored in output
 
81
  source: Literal['layout', 'heuristics', 'processor'] = 'layout'
82
  top_k: Optional[Dict[BlockTypes, float]] = None
83
  metadata: BlockMetadata | None = None
@@ -187,6 +189,10 @@ class Block(BaseModel):
187
  template = ""
188
  for c in child_blocks:
189
  template += f"<content-ref src='{c.id}'></content-ref>"
 
 
 
 
190
  return template
191
 
192
  def assign_section_hierarchy(self, section_hierarchy):
 
12
  from marker.schema.document import Document
13
  from marker.schema.groups.page import PageGroup
14
 
15
+
16
  class BlockMetadata(BaseModel):
17
  llm_request_count: int = 0
18
  llm_error_count: int = 0
 
79
  text_extraction_method: Optional[Literal['pdftext', 'surya', 'gemini']] = None
80
  structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
81
  ignore_for_output: bool = False # Whether this block should be ignored in output
82
+ replace_output_newlines: bool = False # Whether to replace newlines with spaces in output
83
  source: Literal['layout', 'heuristics', 'processor'] = 'layout'
84
  top_k: Optional[Dict[BlockTypes, float]] = None
85
  metadata: BlockMetadata | None = None
 
189
  template = ""
190
  for c in child_blocks:
191
  template += f"<content-ref src='{c.id}'></content-ref>"
192
+
193
+ if self.replace_output_newlines:
194
+ template = "<p>" + template.replace("\n", " ") + "</p>"
195
+
196
  return template
197
 
198
  def assign_section_hierarchy(self, section_hierarchy):
marker/schema/blocks/basetable.py CHANGED
@@ -24,13 +24,16 @@ class BaseTable(Block):
24
 
25
 
26
  def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
 
 
 
 
27
  if self.html:
28
  # LLM processor
29
- return self.html
30
  elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
31
  # Table processor
32
- return self.format_cells(document, child_blocks)
33
  else:
34
  # Default text lines and spans
35
- template = super().assemble_html(document, child_blocks, parent_structure)
36
  return f"<p>{template}</p>"
 
24
 
25
 
26
  def assemble_html(self, document, child_blocks: List[BlockOutput], parent_structure=None):
27
+ # Filter out the table cells, so they don't render twice
28
+ selected_blocks = [b for b in child_blocks if b.id.block_type != BlockTypes.TableCell]
29
+ template = super().assemble_html(document, selected_blocks, parent_structure)
30
+
31
  if self.html:
32
  # LLM processor
33
+ return template + self.html
34
  elif len(child_blocks) > 0 and child_blocks[0].id.block_type == BlockTypes.TableCell:
35
  # Table processor
36
+ return template + self.format_cells(document, child_blocks)
37
  else:
38
  # Default text lines and spans
 
39
  return f"<p>{template}</p>"
marker/schema/blocks/caption.py CHANGED
@@ -5,8 +5,10 @@ from marker.schema.blocks import Block
5
  class Caption(Block):
6
  block_type: BlockTypes = BlockTypes.Caption
7
  block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
 
8
 
9
  def assemble_html(self, document, child_blocks, parent_structure):
10
  template = super().assemble_html(document, child_blocks, parent_structure)
11
  template = template.replace("\n", " ")
12
  return f"<p>{template}</p>"
 
 
5
  class Caption(Block):
6
  block_type: BlockTypes = BlockTypes.Caption
7
  block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
8
+ replace_output_newlines: bool = True
9
 
10
  def assemble_html(self, document, child_blocks, parent_structure):
11
  template = super().assemble_html(document, child_blocks, parent_structure)
12
  template = template.replace("\n", " ")
13
  return f"<p>{template}</p>"
14
+
marker/schema/blocks/equation.py CHANGED
@@ -11,7 +11,9 @@ class Equation(Block):
11
 
12
  def assemble_html(self, document, child_blocks, parent_structure=None):
13
  if self.latex:
14
- html_out = f"<p block-type='{self.block_type}'>"
 
 
15
 
16
  try:
17
  latex = self.parse_latex(html.escape(self.latex))
@@ -44,9 +46,9 @@ class Equation(Block):
44
  ("$$", "block"),
45
  ("$", "inline")
46
  ]
47
-
48
- text = text.replace("\n", "<br>") # we can't handle \n's inside <p> properly if we don't do this
49
-
50
  i = 0
51
  stack = []
52
  result = []
@@ -73,7 +75,7 @@ class Equation(Block):
73
  else: # No delimiter match
74
  buffer += text[i]
75
  i += 1
76
-
77
  if buffer:
78
  result.append({"class": "text", "content": buffer})
79
- return result
 
11
 
12
  def assemble_html(self, document, child_blocks, parent_structure=None):
13
  if self.latex:
14
+ child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
15
+ html_out = super().assemble_html(child_ref_blocks, parent_structure)
16
+ html_out += f"<p block-type='{self.block_type}'>"
17
 
18
  try:
19
  latex = self.parse_latex(html.escape(self.latex))
 
46
  ("$$", "block"),
47
  ("$", "inline")
48
  ]
49
+
50
+ text = text.replace("\n", "<br>") # we can't handle \n's inside <p> properly if we don't do this
51
+
52
  i = 0
53
  stack = []
54
  result = []
 
75
  else: # No delimiter match
76
  buffer += text[i]
77
  i += 1
78
+
79
  if buffer:
80
  result.append({"class": "text", "content": buffer})
81
+ return result
marker/schema/blocks/figure.py CHANGED
@@ -8,7 +8,8 @@ class Figure(Block):
8
  block_description: str = "A chart or other image that contains data."
9
 
10
  def assemble_html(self, document, child_blocks, parent_structure):
 
 
11
  if self.description:
12
- return f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
13
- else:
14
- return ""
 
8
  block_description: str = "A chart or other image that contains data."
9
 
10
  def assemble_html(self, document, child_blocks, parent_structure):
11
+ child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference]
12
+ html = super().assemble_html(document, child_ref_blocks, parent_structure)
13
  if self.description:
14
+ html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
15
+ return html
 
marker/schema/blocks/footnote.py CHANGED
@@ -5,9 +5,11 @@ from marker.schema.blocks import Block
5
  class Footnote(Block):
6
  block_type: BlockTypes = BlockTypes.Footnote
7
  block_description: str = "A footnote that explains a term or concept in the document."
 
8
 
9
  def assemble_html(self, document, child_blocks, parent_structure):
10
  template = super().assemble_html(document, child_blocks, parent_structure)
11
  template = template.replace("\n", " ")
12
 
13
  return f"<p>{template}</p>"
 
 
5
  class Footnote(Block):
6
  block_type: BlockTypes = BlockTypes.Footnote
7
  block_description: str = "A footnote that explains a term or concept in the document."
8
+ replace_output_newlines: bool = True
9
 
10
  def assemble_html(self, document, child_blocks, parent_structure):
11
  template = super().assemble_html(document, child_blocks, parent_structure)
12
  template = template.replace("\n", " ")
13
 
14
  return f"<p>{template}</p>"
15
+
marker/schema/blocks/handwriting.py CHANGED
@@ -6,6 +6,7 @@ class Handwriting(Block):
6
  block_type: BlockTypes = BlockTypes.Handwriting
7
  block_description: str = "A region that contains handwriting."
8
  html: str | None = None
 
9
 
10
  def assemble_html(self, document, child_blocks, parent_structure):
11
  if self.html:
@@ -14,3 +15,4 @@ class Handwriting(Block):
14
  template = super().assemble_html(document, child_blocks, parent_structure)
15
  template = template.replace("\n", " ")
16
  return f"<p>{template}</p>"
 
 
6
  block_type: BlockTypes = BlockTypes.Handwriting
7
  block_description: str = "A region that contains handwriting."
8
  html: str | None = None
9
+ replace_output_newlines: bool = True
10
 
11
  def assemble_html(self, document, child_blocks, parent_structure):
12
  if self.html:
 
15
  template = super().assemble_html(document, child_blocks, parent_structure)
16
  template = template.replace("\n", " ")
17
  return f"<p>{template}</p>"
18
+
marker/schema/blocks/pagefooter.py CHANGED
@@ -5,6 +5,8 @@ from marker.schema.blocks import Block
5
  class PageFooter(Block):
6
  block_type: str = BlockTypes.PageFooter
7
  block_description: str = "Text that appears at the bottom of a page, like a page number."
 
 
8
 
9
  def assemble_html(self, document, child_blocks, parent_structure):
10
  if self.ignore_for_output:
@@ -13,3 +15,4 @@ class PageFooter(Block):
13
  template = super().assemble_html(document, child_blocks, parent_structure)
14
  template = template.replace("\n", " ")
15
  return f"<p>{template}</p>"
 
 
5
  class PageFooter(Block):
6
  block_type: str = BlockTypes.PageFooter
7
  block_description: str = "Text that appears at the bottom of a page, like a page number."
8
+ replace_output_newlines: bool = True
9
+ ignore_for_output: bool = True
10
 
11
  def assemble_html(self, document, child_blocks, parent_structure):
12
  if self.ignore_for_output:
 
15
  template = super().assemble_html(document, child_blocks, parent_structure)
16
  template = template.replace("\n", " ")
17
  return f"<p>{template}</p>"
18
+
marker/schema/blocks/pageheader.py CHANGED
@@ -5,6 +5,8 @@ from marker.schema.blocks import Block
5
  class PageHeader(Block):
6
  block_type: BlockTypes = BlockTypes.PageHeader
7
  block_description: str = "Text that appears at the top of a page, like a page title."
 
 
8
 
9
  def assemble_html(self, document, child_blocks, parent_structure):
10
  if self.ignore_for_output:
@@ -13,3 +15,4 @@ class PageHeader(Block):
13
  template = super().assemble_html(document, child_blocks, parent_structure)
14
  template = template.replace("\n", " ")
15
  return f"<p>{template}</p>"
 
 
5
  class PageHeader(Block):
6
  block_type: BlockTypes = BlockTypes.PageHeader
7
  block_description: str = "Text that appears at the top of a page, like a page title."
8
+ replace_output_newlines: bool = True
9
+ ignore_for_output: bool = True
10
 
11
  def assemble_html(self, document, child_blocks, parent_structure):
12
  if self.ignore_for_output:
 
15
  template = super().assemble_html(document, child_blocks, parent_structure)
16
  template = template.replace("\n", " ")
17
  return f"<p>{template}</p>"
18
+
marker/schema/blocks/reference.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from marker.schema import BlockTypes
2
+ from marker.schema.blocks import Block
3
+
4
+
5
class Reference(Block):
    """A block rendered as a ``<span>`` whose ``id`` is the reference name."""
    block_type: BlockTypes = BlockTypes.Reference
    ref: str  # value written into the span's id attribute

    def assemble_html(self, document, child_blocks, parent_structure=None):
        """
        Render this reference as ``<span id='...'>`` around its children.

        The signature takes ``document`` first, matching the other block
        classes, which call ``super().assemble_html(document, child_blocks,
        parent_structure)``.

        Args:
            document: The document being rendered; forwarded to the parent.
            child_blocks: Rendered child outputs; forwarded to the parent.
            parent_structure: Structure of the parent block, if any.

        Returns:
            The parent template wrapped in a span carrying ``self.ref`` as id.
        """
        template = super().assemble_html(document, child_blocks, parent_structure)
        return f"<span id='{self.ref}'>{template}</span>"
marker/schema/groups/page.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
3
 
4
  from PIL import Image
5
 
 
6
  from marker.providers import ProviderOutput
7
  from marker.schema import BlockTypes
8
  from marker.schema.blocks import Block, BlockId, Text
@@ -23,6 +24,7 @@ class PageGroup(Group):
23
  excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
24
  maximum_assignment_distance: float = 20 # pixels
25
  block_description: str = "A single page in the document."
 
26
 
27
  def incr_block_id(self):
28
  if self.block_id is None:
 
3
 
4
  from PIL import Image
5
 
6
+ from pdftext.schema import Reference
7
  from marker.providers import ProviderOutput
8
  from marker.schema import BlockTypes
9
  from marker.schema.blocks import Block, BlockId, Text
 
24
  excluded_block_types: Sequence[BlockTypes] = (BlockTypes.Line, BlockTypes.Span,)
25
  maximum_assignment_distance: float = 20 # pixels
26
  block_description: str = "A single page in the document."
27
+ refs: List[Reference] | None = None
28
 
29
  def incr_block_id(self):
30
  if self.block_id is None:
marker/schema/registry.py CHANGED
@@ -6,7 +6,7 @@ from marker.schema.blocks import Block, Caption, Code, Equation, Figure, \
6
  Footnote, Form, Handwriting, InlineMath, \
7
  ListItem, PageFooter, PageHeader, Picture, \
8
  SectionHeader, Table, TableOfContents, \
9
- Text, ComplexRegion, TableCell
10
  from marker.schema.document import Document
11
  from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \
12
  PictureGroup, TableGroup
@@ -51,6 +51,7 @@ register_block_class(BlockTypes.Text, Text)
51
  register_block_class(BlockTypes.TableOfContents, TableOfContents)
52
  register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
53
  register_block_class(BlockTypes.TableCell, TableCell)
 
54
  register_block_class(BlockTypes.Document, Document)
55
 
56
  assert len(BLOCK_REGISTRY) == len(BlockTypes)
 
6
  Footnote, Form, Handwriting, InlineMath, \
7
  ListItem, PageFooter, PageHeader, Picture, \
8
  SectionHeader, Table, TableOfContents, \
9
+ Text, ComplexRegion, TableCell, Reference
10
  from marker.schema.document import Document
11
  from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \
12
  PictureGroup, TableGroup
 
51
  register_block_class(BlockTypes.TableOfContents, TableOfContents)
52
  register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
53
  register_block_class(BlockTypes.TableCell, TableCell)
54
+ register_block_class(BlockTypes.Reference, Reference)
55
  register_block_class(BlockTypes.Document, Document)
56
 
57
  assert len(BLOCK_REGISTRY) == len(BlockTypes)
marker/schema/text/span.py CHANGED
@@ -25,7 +25,6 @@ class Span(Block):
25
  formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
26
  has_superscript: bool = False
27
  url: Optional[str] = None
28
- anchors: Optional[List[str]] = None
29
 
30
  @property
31
  def bold(self):
@@ -75,6 +74,4 @@ class Span(Block):
75
  elif self.math:
76
  text = f"<math display='inline'>{text}</math>"
77
 
78
- if self.anchors:
79
- text = "".join(f"<span id='{anchor}'/>" for anchor in self.anchors) + text
80
  return text
 
25
  formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]
26
  has_superscript: bool = False
27
  url: Optional[str] = None
 
28
 
29
  @property
30
  def bold(self):
 
74
  elif self.math:
75
  text = f"<math display='inline'>{text}</math>"
76
 
 
 
77
  return text
tests/builders/test_pdf_links.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import pytest
2
 
3
  from marker.converters.pdf import PdfConverter
@@ -8,9 +10,8 @@ from marker.schema.document import Document
8
 
9
  @pytest.mark.filename("arxiv_test.pdf")
10
  @pytest.mark.output_format("markdown")
11
- @pytest.mark.config({"page_range": [1]})
12
  def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
13
- first_page = pdf_document.pages[0]
14
 
15
  for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
16
  if "II." in section_header_span.text:
@@ -22,11 +23,13 @@ def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf
22
  section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
23
  assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
24
 
25
- section_header_span = section_header_block.contained_blocks(pdf_document, (BlockTypes.Span,))[0]
26
- assert section_header_span.anchors == ['page-1-0']
27
 
28
  markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
29
  markdown = markdown_output.markdown
30
 
31
  assert '[II.](#page-1-0)' in markdown
32
  assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
 
 
 
 
1
+ import re
2
+
3
  import pytest
4
 
5
  from marker.converters.pdf import PdfConverter
 
10
 
11
  @pytest.mark.filename("arxiv_test.pdf")
12
  @pytest.mark.output_format("markdown")
 
13
  def test_pdf_links(pdf_document: Document, pdf_converter: PdfConverter, temp_pdf):
14
+ first_page = pdf_document.pages[1]
15
 
16
  for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
17
  if "II." in section_header_span.text:
 
23
  section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
24
  assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
25
 
26
+ assert first_page.refs[0].ref == "page-1-0"
 
27
 
28
  markdown_output: MarkdownOutput = pdf_converter(temp_pdf.name)
29
  markdown = markdown_output.markdown
30
 
31
  assert '[II.](#page-1-0)' in markdown
32
  assert '<span id="page-1-0"/>II. THEORETICAL FRAMEWORK' in markdown
33
+
34
+ for ref in set([f'<span id="page-{m[0]}-{m[1]}"/>' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
35
+ assert ref in markdown, f"Reference {ref} not found in markdown"