Vik Paruchuri committed on
Commit
6ff9f43
·
2 Parent(s): 3453dd8 b0a6c31

Merge remote-tracking branch 'origin/dev' into vik_dev

Browse files
convert.py CHANGED
@@ -13,6 +13,7 @@ import torch.multiprocessing as mp
13
  from tqdm import tqdm
14
 
15
  from marker.config.parser import ConfigParser
 
16
  from marker.converters.pdf import PdfConverter
17
  from marker.logger import configure_logging
18
  from marker.models import create_model_dict
@@ -59,7 +60,7 @@ def process_single_pdf(args):
59
  print(traceback.format_exc())
60
 
61
 
62
- @click.command()
63
  @click.argument("in_folder", type=str)
64
  @ConfigParser.common_options
65
  @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
 
13
  from tqdm import tqdm
14
 
15
  from marker.config.parser import ConfigParser
16
+ from marker.config.printer import CustomClickPrinter
17
  from marker.converters.pdf import PdfConverter
18
  from marker.logger import configure_logging
19
  from marker.models import create_model_dict
 
60
  print(traceback.format_exc())
61
 
62
 
63
+ @click.command(cls=CustomClickPrinter)
64
  @click.argument("in_folder", type=str)
65
  @ConfigParser.common_options
66
  @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
marker/builders/document.py CHANGED
@@ -1,4 +1,5 @@
1
- from marker.settings import settings
 
2
  from marker.builders import BaseBuilder
3
  from marker.builders.layout import LayoutBuilder
4
  from marker.builders.ocr import OcrBuilder
@@ -12,18 +13,15 @@ from marker.schema.registry import get_block_class
12
  class DocumentBuilder(BaseBuilder):
13
  """
14
  Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
15
-
16
- Attributes:
17
- lowres_image_dpi (int):
18
- DPI setting for low-resolution page images used for Layout and Line Detection.
19
- Default is 96.
20
-
21
- highres_image_dpi (int):
22
- DPI setting for high-resolution page images used for OCR.
23
- Default is 192.
24
  """
25
- lowres_image_dpi: int = 96
26
- highres_image_dpi: int = 192
 
 
 
 
 
 
27
 
28
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
29
  document = self.build_document(provider)
 
1
+ from typing import Annotated
2
+
3
  from marker.builders import BaseBuilder
4
  from marker.builders.layout import LayoutBuilder
5
  from marker.builders.ocr import OcrBuilder
 
13
  class DocumentBuilder(BaseBuilder):
14
  """
15
  Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
 
 
 
 
 
 
 
 
 
16
  """
17
+ lowres_image_dpi: Annotated[
18
+ int,
19
+ "DPI setting for low-resolution page images used for Layout and Line Detection.",
20
+ ] = 96
21
+ highres_image_dpi: Annotated[
22
+ int,
23
+ "DPI setting for high-resolution page images used for OCR.",
24
+ ] = 192
25
 
26
  def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
27
  document = self.build_document(provider)
marker/builders/layout.py CHANGED
@@ -1,15 +1,12 @@
1
- from typing import List
2
 
3
  import numpy as np
4
  from surya.layout import batch_layout_detection
5
- from surya.schema import LayoutResult
6
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
7
-
8
- from surya.ocr_error import batch_ocr_error_detection
9
- from surya.schema import OCRErrorDetectionResult
10
  from surya.model.ocr_error.model import DistilBertForSequenceClassification
 
 
11
 
12
- from marker.settings import settings
13
  from marker.builders import BaseBuilder
14
  from marker.providers import ProviderOutput, ProviderPageLines
15
  from marker.providers.pdf import PdfProvider
@@ -18,40 +15,42 @@ from marker.schema.document import Document
18
  from marker.schema.groups.page import PageGroup
19
  from marker.schema.polygon import PolygonBox
20
  from marker.schema.registry import get_block_class
 
21
  from marker.util import matrix_intersection_area
22
 
23
 
24
  class LayoutBuilder(BaseBuilder):
25
  """
26
  A builder for performing layout detection on PDF pages and merging the results into the document.
27
-
28
- Attributes:
29
- batch_size (int):
30
- The batch size to use for the layout model.
31
- Default is None, which will use the default batch size for the model.
32
-
33
- layout_coverage_min_lines (int):
34
- The minimum number of PdfProvider lines that must be covered by the layout model
35
- to consider the lines from the PdfProvider valid. Default is 1.
36
-
37
- layout_coverage_threshold (float):
38
- The minimum coverage ratio required for the layout model to consider
39
- the lines from the PdfProvider valid. Default is 0.3.
40
-
41
- document_ocr_threshold (float):
42
- The minimum ratio of pages that must pass the layout coverage check
43
- to avoid OCR. Default is 0.8.
44
-
45
- error_model_segment_length (int):
46
- The maximum number of characters to send to the OCR error model.
47
- Default is 1024.
48
  """
49
- batch_size = None
50
- layout_coverage_min_lines = 1
51
- layout_coverage_threshold = .1
52
- document_ocr_threshold = .8
53
- error_model_segment_length = 512
54
- excluded_for_coverage = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
57
  self.layout_model = layout_model
@@ -81,7 +80,7 @@ class LayoutBuilder(BaseBuilder):
81
  )
82
  return layout_results
83
 
84
- def surya_ocr_error_detection(self, pages:List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
85
  page_texts = []
86
  for document_page in pages:
87
  page_text = ''
@@ -102,7 +101,7 @@ class LayoutBuilder(BaseBuilder):
102
  page_texts,
103
  self.ocr_error_model,
104
  self.ocr_error_model.tokenizer,
105
- batch_size=int(self.get_batch_size()) #TODO Better Multiplier
106
  )
107
  return ocr_error_detection_results
108
 
 
1
+ from typing import Annotated, List, Optional, Tuple
2
 
3
  import numpy as np
4
  from surya.layout import batch_layout_detection
 
5
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
 
 
 
6
  from surya.model.ocr_error.model import DistilBertForSequenceClassification
7
+ from surya.ocr_error import batch_ocr_error_detection
8
+ from surya.schema import LayoutResult, OCRErrorDetectionResult
9
 
 
10
  from marker.builders import BaseBuilder
11
  from marker.providers import ProviderOutput, ProviderPageLines
12
  from marker.providers.pdf import PdfProvider
 
15
  from marker.schema.groups.page import PageGroup
16
  from marker.schema.polygon import PolygonBox
17
  from marker.schema.registry import get_block_class
18
+ from marker.settings import settings
19
  from marker.util import matrix_intersection_area
20
 
21
 
22
  class LayoutBuilder(BaseBuilder):
23
  """
24
  A builder for performing layout detection on PDF pages and merging the results into the document.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  """
26
+ batch_size: Annotated[
27
+ Optional[int],
28
+ "The batch size to use for the layout model.",
29
+ "Default is None, which will use the default batch size for the model."
30
+ ] = None
31
+ layout_coverage_min_lines: Annotated[
32
+ int,
33
+ "The minimum number of PdfProvider lines that must be covered by the layout model",
34
+ "to consider the lines from the PdfProvider valid.",
35
+ ] = 1
36
+ layout_coverage_threshold: Annotated[
37
+ float,
38
+ "The minimum coverage ratio required for the layout model to consider",
39
+ "the lines from the PdfProvider valid.",
40
+ ] = .1
41
+ document_ocr_threshold: Annotated[
42
+ float,
43
+ "The minimum ratio of pages that must pass the layout coverage check",
44
+ "to avoid OCR.",
45
+ ] = .8
46
+ error_model_segment_length: Annotated[
47
+ int,
48
+ "The maximum number of characters to send to the OCR error model.",
49
+ ] = 512
50
+ excluded_for_coverage: Annotated[
51
+ Tuple[BlockTypes],
52
+ "A list of block types to exclude from the layout coverage check.",
53
+ ] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
54
 
55
  def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
56
  self.layout_model = layout_model
 
80
  )
81
  return layout_results
82
 
83
+ def surya_ocr_error_detection(self, pages: List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
84
  page_texts = []
85
  for document_page in pages:
86
  page_text = ''
 
101
  page_texts,
102
  self.ocr_error_model,
103
  self.ocr_error_model.tokenizer,
104
+ batch_size=int(self.get_batch_size()) # TODO Better Multiplier
105
  )
106
  return ocr_error_detection_results
107
 
marker/builders/llm_layout.py CHANGED
@@ -1,13 +1,8 @@
1
  import json
2
- import time
3
- import traceback
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
- from typing import Optional
6
 
7
- import google.generativeai as genai
8
- import PIL
9
  from google.ai.generativelanguage_v1beta.types import content
10
- from google.api_core.exceptions import ResourceExhausted
11
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
12
  from surya.model.ocr_error.model import DistilBertForSequenceClassification
13
  from tqdm import tqdm
@@ -26,45 +21,41 @@ from marker.settings import settings
26
  class LLMLayoutBuilder(LayoutBuilder):
27
  """
28
  A builder for relabelling blocks to improve the quality of the layout.
29
-
30
- Attributes:
31
- google_api_key (str):
32
- The Google API key to use for the Gemini model.
33
- Default is None.
34
- confidence_threshold (float):
35
- The confidence threshold to use for relabeling.
36
- Default is 0.75.
37
- picture_height_threshold (float):
38
- The height threshold for pictures that may actually be complex regions.
39
- model_name (str):
40
- The name of the Gemini model to use.
41
- Default is "gemini-1.5-flash".
42
- max_retries (int):
43
- The maximum number of retries to use for the Gemini model.
44
- Default is 3.
45
- max_concurrency (int):
46
- The maximum number of concurrent requests to make to the Gemini model.
47
- Default is 3.
48
- timeout (int):
49
- The timeout for requests to the Gemini model.
50
- Default is 60 seconds.
51
- topk_relabelling_prompt (str):
52
- The prompt to use for relabelling blocks.
53
- Default is a string containing the Gemini relabelling prompt.
54
- complex_relabeling_prompt (str):
55
- The prompt to use for complex relabelling blocks.
56
- Default is a string containing the complex relabelling prompt.
57
  """
58
 
59
- google_api_key: Optional[str] = settings.GOOGLE_API_KEY
60
- confidence_threshold: float = 0.75
61
- picture_height_threshold: float = 0.8
62
- model_name: str = "gemini-1.5-flash"
63
- max_retries: int = 3
64
- max_concurrency: int = 3
65
- timeout: int = 60
66
-
67
- topk_relabelling_prompt = """You are a layout expert specializing in document analysis.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
69
  You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
70
  Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
@@ -75,7 +66,11 @@ Choose the label you believe is the most accurate representation of the layout b
75
  Here are the top k predictions from the model followed by the image:
76
 
77
  """
78
- complex_relabeling_prompt = """You are a layout expert specializing in document analysis.
 
 
 
 
79
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
80
  You will be provided with an image of a layout block and some potential labels.
81
  Your job is to analyze the image and choose the single most appropriate label from the provided labels.
@@ -140,7 +135,6 @@ Here is the image of the layout block:
140
  complex_prompt = self.complex_relabeling_prompt
141
  return self.process_block_relabeling(page, block, complex_prompt)
142
 
143
-
144
  def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
145
  image = self.extract_image(page, block)
146
  response_schema = content.Schema(
@@ -174,4 +168,4 @@ Here is the image of the layout block:
174
  .rescale(page.polygon.size, page_img.size)\
175
  .expand(expand, expand)
176
  cropped = page_img.crop(image_box.bbox)
177
- return cropped
 
1
  import json
 
 
2
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from typing import Annotated, Optional
4
 
 
 
5
  from google.ai.generativelanguage_v1beta.types import content
 
6
  from surya.model.layout.encoderdecoder import SuryaLayoutModel
7
  from surya.model.ocr_error.model import DistilBertForSequenceClassification
8
  from tqdm import tqdm
 
21
  class LLMLayoutBuilder(LayoutBuilder):
22
  """
23
  A builder for relabelling blocks to improve the quality of the layout.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  """
25
 
26
+ google_api_key: Annotated[
27
+ Optional[str],
28
+ "The Google API key to use for the Gemini model.",
29
+ ] = settings.GOOGLE_API_KEY
30
+ confidence_threshold: Annotated[
31
+ float,
32
+ "The confidence threshold to use for relabeling.",
33
+ ] = 0.75
34
+ picture_height_threshold: Annotated[
35
+ float,
36
+ "The height threshold for pictures that may actually be complex regions.",
37
+ ] = 0.8
38
+ model_name: Annotated[
39
+ str,
40
+ "The name of the Gemini model to use.",
41
+ ] = "gemini-1.5-flash"
42
+ max_retries: Annotated[
43
+ int,
44
+ "The maximum number of retries to use for the Gemini model.",
45
+ ] = 3
46
+ max_concurrency: Annotated[
47
+ int,
48
+ "The maximum number of concurrent requests to make to the Gemini model.",
49
+ ] = 3
50
+ timeout: Annotated[
51
+ int,
52
+ "The timeout for requests to the Gemini model.",
53
+ ] = 60
54
+ topk_relabelling_prompt: Annotated[
55
+ str,
56
+ "The prompt to use for relabelling blocks.",
57
+ "Default is a string containing the Gemini relabelling prompt."
58
+ ] = """You are a layout expert specializing in document analysis.
59
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
60
  You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
61
  Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
 
66
  Here are the top k predictions from the model followed by the image:
67
 
68
  """
69
+ complex_relabeling_prompt: Annotated[
70
+ str,
71
+ "The prompt to use for complex relabelling blocks.",
72
+ "Default is a string containing the complex relabelling prompt."
73
+ ] = """You are a layout expert specializing in document analysis.
74
  Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
75
  You will be provided with an image of a layout block and some potential labels.
76
  Your job is to analyze the image and choose the single most appropriate label from the provided labels.
 
135
  complex_prompt = self.complex_relabeling_prompt
136
  return self.process_block_relabeling(page, block, complex_prompt)
137
 
 
138
  def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
139
  image = self.extract_image(page, block)
140
  response_schema = content.Schema(
 
168
  .rescale(page.polygon.size, page_img.size)\
169
  .expand(expand, expand)
170
  cropped = page_img.crop(image_box.bbox)
171
+ return cropped
marker/builders/ocr.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
 
3
  from ftfy import fix_text
4
  from surya.model.detection.model import EfficientViTForSemanticSegmentation
@@ -20,22 +20,22 @@ from marker.settings import settings
20
  class OcrBuilder(BaseBuilder):
21
  """
22
  A builder for performing OCR on PDF pages and merging the results into the document.
23
-
24
- Attributes:
25
- detection_batch_size (int):
26
- The batch size to use for the detection model.
27
- Default is None, which will use the default batch size for the model.
28
-
29
- recognition_batch_size (int):
30
- The batch size to use for the recognition model.
31
- Default is None, which will use the default batch size for the model.
32
-
33
- languages (List[str]):
34
- A list of languages to use for OCR. Default is None.
35
  """
36
- recognition_batch_size: int | None = None
37
- detection_batch_size: int | None = None
38
- languages: List[str] | None = None
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
41
  super().__init__(config)
 
1
+ from typing import Annotated, List, Optional
2
 
3
  from ftfy import fix_text
4
  from surya.model.detection.model import EfficientViTForSemanticSegmentation
 
20
  class OcrBuilder(BaseBuilder):
21
  """
22
  A builder for performing OCR on PDF pages and merging the results into the document.
 
 
 
 
 
 
 
 
 
 
 
 
23
  """
24
+ recognition_batch_size: Annotated[
25
+ Optional[int],
26
+ "The batch size to use for the recognition model.",
27
+ "Default is None, which will use the default batch size for the model."
28
+ ] = None
29
+ detection_batch_size: Annotated[
30
+ Optional[int],
31
+ "The batch size to use for the detection model.",
32
+ "Default is None, which will use the default batch size for the model."
33
+ ] = None
34
+ languages: Annotated[
35
+ Optional[List[str]],
36
+ "A list of languages to use for OCR.",
37
+ "Default is None."
38
+ ] = None
39
 
40
  def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
41
  super().__init__(config)
marker/builders/structure.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from marker.builders import BaseBuilder
2
  from marker.schema import BlockTypes
3
  from marker.schema.document import Document
@@ -9,18 +11,15 @@ from marker.schema.registry import get_block_class
9
  class StructureBuilder(BaseBuilder):
10
  """
11
  A builder for grouping blocks together based on their structure.
12
-
13
- Attributes:
14
- gap_threshold (float):
15
- The minimum gap between blocks to consider them part of the same group.
16
- Default is 0.05.
17
-
18
- list_gap_threshold (float):
19
- The minimum gap between list items to consider them part of the same group.
20
- Default is 0.1.
21
  """
22
- gap_threshold: int = .05
23
- list_gap_threshold: int = .1
 
 
 
 
 
 
24
 
25
  def __init__(self, config=None):
26
  super().__init__(config)
@@ -58,8 +57,8 @@ class StructureBuilder(BaseBuilder):
58
  selected_polygons.append(prev_block.polygon)
59
 
60
  if next_block and \
61
- next_block.block_type in caption_types and \
62
- next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
63
  block_structure.append(next_block.id)
64
  selected_polygons.append(next_block.polygon)
65
 
 
1
+ from typing import Annotated
2
+
3
  from marker.builders import BaseBuilder
4
  from marker.schema import BlockTypes
5
  from marker.schema.document import Document
 
11
  class StructureBuilder(BaseBuilder):
12
  """
13
  A builder for grouping blocks together based on their structure.
 
 
 
 
 
 
 
 
 
14
  """
15
+ gap_threshold: Annotated[
16
+ float,
17
+ "The minimum gap between blocks to consider them part of the same group.",
18
+ ] = 0.05
19
+ list_gap_threshold: Annotated[
20
+ float,
21
+ "The minimum gap between list items to consider them part of the same group.",
22
+ ] = 0.1
23
 
24
  def __init__(self, config=None):
25
  super().__init__(config)
 
57
  selected_polygons.append(prev_block.polygon)
58
 
59
  if next_block and \
60
+ next_block.block_type in caption_types and \
61
+ next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
62
  block_structure.append(next_block.id)
63
  selected_polygons.append(next_block.polygon)
64
 
marker/config/crawler.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import inspect
3
+ import pkgutil
4
+ from functools import cached_property
5
+ from typing import Annotated, Dict, Set, Type, get_args, get_origin
6
+
7
+ from marker.builders import BaseBuilder
8
+ from marker.converters import BaseConverter
9
+ from marker.processors import BaseProcessor
10
+ from marker.providers import BaseProvider
11
+ from marker.renderers import BaseRenderer
12
+
13
+
14
+ class ConfigCrawler:
15
+ def __init__(self, base_classes=(BaseBuilder, BaseProcessor, BaseConverter, BaseProvider, BaseRenderer)):
16
+ self.base_classes = base_classes
17
+ self.class_config_map = {}
18
+
19
+ self._crawl_config()
20
+
21
+ def _crawl_config(self):
22
+ for base in self.base_classes:
23
+ base_class_type = base.__name__.removeprefix('Base')
24
+ self.class_config_map.setdefault(base_class_type, {})
25
+ for class_name, class_type in self._find_subclasses(base).items():
26
+ if class_name.startswith('Base'):
27
+ continue
28
+
29
+ self.class_config_map[base_class_type].setdefault(class_name, {
30
+ 'class_type': class_type,
31
+ 'config': {}
32
+ })
33
+ for attr, attr_type in self._gather_super_annotations(class_type).items():
34
+ default = getattr(class_type, attr)
35
+ metadata = (f"Default is {default}.",)
36
+
37
+ if get_origin(attr_type) is Annotated:
38
+ if any('Default' in desc for desc in attr_type.__metadata__):
39
+ metadata = attr_type.__metadata__
40
+ else:
41
+ metadata = attr_type.__metadata__ + metadata
42
+ attr_type = get_args(attr_type)[0]
43
+
44
+ formatted_type = self._format_type(attr_type)
45
+ self.class_config_map[base_class_type][class_name]['config'][attr] = (attr_type, formatted_type, default, metadata)
46
+
47
+ def _gather_super_annotations(self, cls: Type) -> Dict[str, Type]:
48
+ """
49
+ Collect all annotated attributes from `cls` and its superclasses, bottom-up.
50
+ Subclass attributes overwrite superclass attributes with the same name.
51
+ """
52
+ # We'll walk the MRO from base -> derived so subclass attributes overwrite
53
+ # the same attribute name from superclasses.
54
+ annotations = {}
55
+ for base in reversed(cls.__mro__):
56
+ if base is object:
57
+ continue
58
+ if hasattr(base, "__annotations__"):
59
+ for name, annotation in base.__annotations__.items():
60
+ annotations[name] = annotation
61
+ return annotations
62
+
63
+ @cached_property
64
+ def attr_counts(self) -> Dict[str, int]:
65
+ counts: Dict[str, int] = {}
66
+ for base_type_dict in self.class_config_map.values():
67
+ for class_map in base_type_dict.values():
68
+ for attr in class_map['config'].keys():
69
+ counts[attr] = counts.get(attr, 0) + 1
70
+ return counts
71
+
72
+ @cached_property
73
+ def attr_set(self) -> Set[str]:
74
+ attr_set: Set[str] = set()
75
+ for base_type_dict in self.class_config_map.values():
76
+ for class_name, class_map in base_type_dict.items():
77
+ for attr in class_map['config'].keys():
78
+ attr_set.add(attr)
79
+ attr_set.add(f"{class_name}_{attr}")
80
+ return attr_set
81
+
82
+ def _find_subclasses(self, base_class):
83
+ subclasses = {}
84
+ module_name = base_class.__module__
85
+ package = importlib.import_module(module_name)
86
+ if hasattr(package, '__path__'):
87
+ for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
88
+ try:
89
+ module = importlib.import_module(module_name)
90
+ for name, obj in inspect.getmembers(module, inspect.isclass):
91
+ if issubclass(obj, base_class) and obj is not base_class:
92
+ subclasses[name] = obj
93
+ except ImportError:
94
+ pass
95
+ return subclasses
96
+
97
+ def _format_type(self, t: Type) -> str:
98
+ """Format a typing type like Optional[int] into a readable string."""
99
+
100
+ if get_origin(t): # Handle Optional and types with origins separately
101
+ return f"{t}".removeprefix('typing.')
102
+ else: # Regular types like int, str
103
+ return t.__name__
104
+
105
+
106
+ crawler = ConfigCrawler()
marker/config/parser.py CHANGED
@@ -4,11 +4,12 @@ from typing import Dict
4
 
5
  import click
6
 
 
7
  from marker.renderers.html import HTMLRenderer
8
- from marker.settings import settings
9
- from marker.util import parse_range_str, strings_to_classes, classes_to_strings
10
- from marker.renderers.markdown import MarkdownRenderer
11
  from marker.renderers.json import JSONRenderer
 
 
 
12
 
13
 
14
  class ConfigParser:
@@ -22,20 +23,22 @@ class ConfigParser:
22
  fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
23
  fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
24
  help="Format to output results in.")(fn)
25
- fn = click.option("--page_range", type=str, default=None,
26
- help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
27
- fn)
28
- fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
29
  fn = click.option("--processors", type=str, default=None,
30
  help="Comma separated list of processors to use. Must use full module path.")(fn)
31
  fn = click.option("--config_json", type=str, default=None,
32
  help="Path to JSON file with additional configuration.")(fn)
33
- fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
34
  fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
35
- fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
36
  fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
 
 
 
 
 
 
 
 
 
37
  fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
38
- fn = click.option("--strip_existing_ocr", is_flag=True, default=False, help="Strip existing OCR text from the PDF.")(fn)
39
  return fn
40
 
41
  def generate_config_dict(self) -> Dict[str, any]:
@@ -53,8 +56,6 @@ class ConfigParser:
53
  config["debug_data_folder"] = output_dir
54
  case "page_range":
55
  config["page_range"] = parse_range_str(v)
56
- case "force_ocr":
57
- config["force_ocr"] = True
58
  case "languages":
59
  config["languages"] = v.split(",")
60
  case "config_json":
@@ -62,14 +63,11 @@ class ConfigParser:
62
  config.update(json.load(f))
63
  case "disable_multiprocessing":
64
  config["pdftext_workers"] = 1
65
- case "paginate_output":
66
- config["paginate_output"] = True
67
  case "disable_image_extraction":
68
  config["extract_images"] = False
69
- case "use_llm":
70
- config["use_llm"] = True
71
- case "strip_existing_ocr":
72
- config["strip_existing_ocr"] = True
73
  return config
74
 
75
  def get_renderer(self):
 
4
 
5
  import click
6
 
7
+ from marker.config.crawler import crawler
8
  from marker.renderers.html import HTMLRenderer
 
 
 
9
  from marker.renderers.json import JSONRenderer
10
+ from marker.renderers.markdown import MarkdownRenderer
11
+ from marker.settings import settings
12
+ from marker.util import classes_to_strings, parse_range_str, strings_to_classes
13
 
14
 
15
  class ConfigParser:
 
23
  fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
24
  fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
25
  help="Format to output results in.")(fn)
 
 
 
 
26
  fn = click.option("--processors", type=str, default=None,
27
  help="Comma separated list of processors to use. Must use full module path.")(fn)
28
  fn = click.option("--config_json", type=str, default=None,
29
  help="Path to JSON file with additional configuration.")(fn)
 
30
  fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
 
31
  fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
32
+
33
+ # these are options that need a list transformation, i.e splitting/parsing a string
34
+ fn = click.option("--page_range", type=str, default=None,
35
+ help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
36
+ fn)
37
+ fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
38
+
39
+ # we put common options here
40
+ fn = click.option("--google_api_key", type=str, default=None, help="Google API key for using LLMs.")(fn)
41
  fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
 
42
  return fn
43
 
44
  def generate_config_dict(self) -> Dict[str, any]:
 
56
  config["debug_data_folder"] = output_dir
57
  case "page_range":
58
  config["page_range"] = parse_range_str(v)
 
 
59
  case "languages":
60
  config["languages"] = v.split(",")
61
  case "config_json":
 
63
  config.update(json.load(f))
64
  case "disable_multiprocessing":
65
  config["pdftext_workers"] = 1
 
 
66
  case "disable_image_extraction":
67
  config["extract_images"] = False
68
+ case _:
69
+ if k in crawler.attr_set:
70
+ config[k] = v
 
71
  return config
72
 
73
  def get_renderer(self):
marker/config/printer.py CHANGED
@@ -1,32 +1,8 @@
1
- import importlib
2
- import inspect
3
- import pkgutil
4
 
5
  import click
6
 
7
- from marker.builders import BaseBuilder
8
- from marker.converters import BaseConverter
9
- from marker.processors import BaseProcessor
10
-
11
-
12
- def find_subclasses(base_class):
13
- """
14
- Dynamically find all subclasses of a base class in the module where the base class is defined
15
- and its submodules.
16
- """
17
- subclasses = {}
18
- module_name = base_class.__module__
19
- package = importlib.import_module(module_name)
20
- if hasattr(package, '__path__'):
21
- for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
22
- try:
23
- module = importlib.import_module(module_name)
24
- for name, obj in inspect.getmembers(module, inspect.isclass):
25
- if issubclass(obj, base_class) and obj is not base_class:
26
- subclasses[name] = obj
27
- except ImportError:
28
- pass
29
- return subclasses
30
 
31
 
32
  class CustomClickPrinter(click.Command):
@@ -39,16 +15,41 @@ class CustomClickPrinter(click.Command):
39
  click.echo(help_text)
40
 
41
  def parse_args(self, ctx, args):
42
- if 'config' in args and '--help' in args:
43
- click.echo("Here is a list of all the Builders, Processors, and Converters in Marker along with their attributes:")
44
- base_classes = [BaseBuilder, BaseProcessor, BaseConverter]
45
- for base in base_classes:
46
- click.echo(f"{base.__name__.removeprefix('Base')}s:\n")
47
-
48
- subclasses = find_subclasses(base)
49
- for class_name, class_type in subclasses.items():
50
- doc = class_type.__doc__
51
- if doc and "Attributes:" in doc:
52
- click.echo(f" {class_name}: {doc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  ctx.exit()
 
54
  super().parse_args(ctx, args)
 
1
+ from typing import Optional
 
 
2
 
3
  import click
4
 
5
+ from marker.config.crawler import crawler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  class CustomClickPrinter(click.Command):
 
15
  click.echo(help_text)
16
 
17
  def parse_args(self, ctx, args):
18
+ display_help = 'config' in args and '--help' in args
19
+ if display_help:
20
+ click.echo("Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
21
+
22
+ for base_type, base_type_dict in crawler.class_config_map.items():
23
+ if display_help:
24
+ click.echo(f"{base_type}s:")
25
+ for class_name, class_map in base_type_dict.items():
26
+ if display_help and class_map['config']:
27
+ click.echo(f"\n {class_name}: {class_map['class_type'].__doc__ or ''}")
28
+ click.echo(" " * 4 + "Attributes:")
29
+ for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
30
+ class_name_attr = class_name + "_" + attr
31
+
32
+ if display_help:
33
+ click.echo(" " * 8 + f"{attr} ({formatted_type}):")
34
+ click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
35
+ if attr_type in [str, int, float, bool, Optional[int], Optional[float], Optional[str]]:
36
+ is_flag = attr_type in [bool, Optional[bool]] and not default
37
+ if crawler.attr_counts.get(attr) > 1:
38
+ options = ["--" + class_name_attr]
39
+ else:
40
+ options = ["--" + attr, "--" + class_name_attr]
41
+ options.append(class_name_attr)
42
+ ctx.command.params.append(
43
+ click.Option(
44
+ options,
45
+ type=attr_type,
46
+ help=" ".join(metadata),
47
+ default=default,
48
+ is_flag=is_flag,
49
+ )
50
+ )
51
+
52
+ if display_help:
53
  ctx.exit()
54
+
55
  super().parse_args(ctx, args)
marker/converters/pdf.py CHANGED
@@ -1,30 +1,31 @@
1
  import os
2
- os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
 
3
 
4
  import inspect
5
  from collections import defaultdict
6
- from typing import Any, Dict, List, Type
7
 
8
  from marker.builders.document import DocumentBuilder
9
- from marker.builders.llm_layout import LLMLayoutBuilder
10
  from marker.builders.layout import LayoutBuilder
 
11
  from marker.builders.ocr import OcrBuilder
12
  from marker.builders.structure import StructureBuilder
13
  from marker.converters import BaseConverter
14
- from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
15
  from marker.processors.blockquote import BlockquoteProcessor
16
  from marker.processors.code import CodeProcessor
17
  from marker.processors.debug import DebugProcessor
18
  from marker.processors.document_toc import DocumentTOCProcessor
19
  from marker.processors.equation import EquationProcessor
20
  from marker.processors.footnote import FootnoteProcessor
21
- from marker.processors.llm.llm_form import LLMFormProcessor
22
- from marker.processors.llm.llm_table import LLMTableProcessor
23
- from marker.processors.llm.llm_text import LLMTextProcessor
24
- from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
25
  from marker.processors.ignoretext import IgnoreTextProcessor
26
  from marker.processors.line_numbers import LineNumbersProcessor
27
  from marker.processors.list import ListProcessor
 
 
 
 
 
28
  from marker.processors.page_header import PageHeaderProcessor
29
  from marker.processors.sectionheader import SectionHeaderProcessor
30
  from marker.processors.table import TableProcessor
@@ -40,18 +41,20 @@ from marker.util import strings_to_classes
40
  class PdfConverter(BaseConverter):
41
  """
42
  A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
43
-
44
- Attributes:
45
- override_map (Dict[BlockTypes, Type[Block]]):
46
- A mapping to override the default block classes for specific block types.
47
- The keys are `BlockTypes` enum values, representing the types of blocks,
48
- and the values are corresponding `Block` class implementations to use
49
- instead of the defaults.
50
  """
51
- override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
52
- use_llm: bool = False
53
-
54
- def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
 
 
 
 
 
 
 
 
 
55
  super().__init__(config)
56
 
57
  for block_type, override_block_type in self.override_map.items():
 
1
  import os
2
+
3
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
4
 
5
  import inspect
6
  from collections import defaultdict
7
+ from typing import Annotated, Any, Dict, List, Optional, Type
8
 
9
  from marker.builders.document import DocumentBuilder
 
10
  from marker.builders.layout import LayoutBuilder
11
+ from marker.builders.llm_layout import LLMLayoutBuilder
12
  from marker.builders.ocr import OcrBuilder
13
  from marker.builders.structure import StructureBuilder
14
  from marker.converters import BaseConverter
 
15
  from marker.processors.blockquote import BlockquoteProcessor
16
  from marker.processors.code import CodeProcessor
17
  from marker.processors.debug import DebugProcessor
18
  from marker.processors.document_toc import DocumentTOCProcessor
19
  from marker.processors.equation import EquationProcessor
20
  from marker.processors.footnote import FootnoteProcessor
 
 
 
 
21
  from marker.processors.ignoretext import IgnoreTextProcessor
22
  from marker.processors.line_numbers import LineNumbersProcessor
23
  from marker.processors.list import ListProcessor
24
+ from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
25
+ from marker.processors.llm.llm_form import LLMFormProcessor
26
+ from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
27
+ from marker.processors.llm.llm_table import LLMTableProcessor
28
+ from marker.processors.llm.llm_text import LLMTextProcessor
29
  from marker.processors.page_header import PageHeaderProcessor
30
  from marker.processors.sectionheader import SectionHeaderProcessor
31
  from marker.processors.table import TableProcessor
 
41
  class PdfConverter(BaseConverter):
42
  """
43
  A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
 
 
 
 
 
 
 
44
  """
45
+ override_map: Annotated[
46
+ Dict[BlockTypes, Type[Block]],
47
+ "A mapping to override the default block classes for specific block types.",
48
+ "The keys are `BlockTypes` enum values, representing the types of blocks,",
49
+ "and the values are corresponding `Block` class implementations to use",
50
+ "instead of the defaults."
51
+ ] = defaultdict()
52
+ use_llm: Annotated[
53
+ bool,
54
+ "Enable higher quality processing with LLMs.",
55
+ ] = False
56
+
57
+ def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, config=None):
58
  super().__init__(config)
59
 
60
  for block_type, override_block_type in self.override_map.items():
marker/processors/blockquote.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from marker.processors import BaseProcessor
2
  from marker.schema import BlockTypes
3
  from marker.schema.document import Document
@@ -5,12 +7,27 @@ from marker.schema.document import Document
5
 
6
  class BlockquoteProcessor(BaseProcessor):
7
  """
8
- A processor for tagging blockquotes
9
  """
10
- block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
11
- min_x_indent = 0.05 # % of block width
12
- x_start_tolerance = 0.01 # % of block width
13
- x_end_tolerance = 0.01 # % of block width
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def __init__(self, config):
16
  super().__init__(config)
 
1
+ from typing import Annotated, Tuple
2
+
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
5
  from marker.schema.document import Document
 
7
 
8
  class BlockquoteProcessor(BaseProcessor):
9
  """
10
+ A processor for tagging blockquotes.
11
  """
12
+ block_types: Annotated[
13
+ Tuple[BlockTypes],
14
+ "The block types to process.",
15
+ ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
16
+ min_x_indent: Annotated[
17
+ float,
18
+ "The minimum horizontal indentation required to consider a block as part of a blockquote.",
19
+ "Expressed as a percentage of the block width.",
20
+ ] = 0.05
21
+ x_start_tolerance: Annotated[
22
+ float,
23
+ "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
24
+ "Expressed as a percentage of the block width.",
25
+ ] = 0.01
26
+ x_end_tolerance: Annotated[
27
+ float,
28
+ "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
29
+ "Expressed as a percentage of the block width.",
30
+ ] = 0.01
31
 
32
  def __init__(self, config):
33
  super().__init__(config)
marker/processors/debug.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import os
 
3
 
4
  import requests
5
  from PIL import Image, ImageDraw, ImageFont
@@ -13,39 +14,36 @@ from marker.settings import settings
13
  class DebugProcessor(BaseProcessor):
14
  """
15
  A processor for debugging the document.
16
-
17
- Attributes:
18
- debug_data_folder (str):
19
- The folder to dump debug data to.
20
- Default is "debug_data".
21
-
22
- debug_layout_images (bool):
23
- Whether to dump layout debug images.
24
- Default is False.
25
-
26
- debug_pdf_images (bool):
27
- Whether to dump PDF debug images.
28
- Default is False.
29
-
30
- debug_json (bool):
31
- Whether to dump block debug data.
32
- Default is False.
33
-
34
- render_font (str):
35
- The path to the font to use for rendering debug images.
36
- Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder.
37
-
38
- font_dl_path (str):
39
- The path to download the font from.
40
- Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0".
41
  """
42
- block_types = tuple()
43
- debug_data_folder: str = "debug_data"
44
- debug_layout_images: bool = False
45
- debug_pdf_images: bool = False
46
- debug_json: bool = False
47
- render_font: str = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
48
- font_dl_path: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def __call__(self, document: Document):
51
  # Remove extension from doc name
@@ -90,7 +88,6 @@ class DebugProcessor(BaseProcessor):
90
  debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
91
  png_image.save(debug_file)
92
 
93
-
94
  def draw_layout_debug_images(self, document: Document, pdf_mode=False):
95
  for page in document.pages:
96
  img_size = page.highres_image.size
@@ -113,7 +110,6 @@ class DebugProcessor(BaseProcessor):
113
  debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
114
  png_image.save(debug_file)
115
 
116
-
117
  def render_layout_boxes(self, page, png_image):
118
  layout_bboxes = []
119
  layout_labels = []
 
1
  import json
2
  import os
3
+ from typing import Annotated
4
 
5
  import requests
6
  from PIL import Image, ImageDraw, ImageFont
 
14
  class DebugProcessor(BaseProcessor):
15
  """
16
  A processor for debugging the document.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  """
18
+ block_types: Annotated[
19
+ tuple,
20
+ "The block types to process.",
21
+ "Default is an empty tuple."
22
+ ] = tuple()
23
+ debug_data_folder: Annotated[
24
+ str,
25
+ "The folder to dump debug data to.",
26
+ ] = "debug_data"
27
+ debug_layout_images: Annotated[
28
+ bool,
29
+ "Whether to dump layout debug images.",
30
+ ] = False
31
+ debug_pdf_images: Annotated[
32
+ bool,
33
+ "Whether to dump PDF debug images.",
34
+ ] = False
35
+ debug_json: Annotated[
36
+ bool,
37
+ "Whether to dump block debug data.",
38
+ ] = False
39
+ render_font: Annotated[
40
+ str,
41
+ "The path to the font to use for rendering debug images.",
42
+ ] = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
43
+ font_dl_path: Annotated[
44
+ str,
45
+ "The path to download the font from.",
46
+ ] = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
47
 
48
  def __call__(self, document: Document):
49
  # Remove extension from doc name
 
88
  debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
89
  png_image.save(debug_file)
90
 
 
91
  def draw_layout_debug_images(self, document: Document, pdf_mode=False):
92
  for page in document.pages:
93
  img_size = page.highres_image.size
 
110
  debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
111
  png_image.save(debug_file)
112
 
 
113
  def render_layout_boxes(self, page, png_image):
114
  layout_bboxes = []
115
  layout_labels = []
marker/processors/equation.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
 
3
  from texify.inference import batch_inference
4
  from texify.model.model import GenerateVisionEncoderDecoderModel
@@ -13,24 +13,24 @@ from marker.settings import settings
13
  class EquationProcessor(BaseProcessor):
14
  """
15
  A processor for recognizing equations in the document.
16
-
17
- Attributes:
18
- model_max_length (int):
19
- The maximum number of tokens to allow for the Texify model.
20
- Default is 384.
21
-
22
- batch_size (int):
23
- The batch size to use for the Texify model.
24
- Default is None, which will use the default batch size for the model.
25
-
26
- token_buffer (int):
27
- The number of tokens to buffer above max for the Texify model.
28
- Default is 256.
29
  """
30
- block_types = (BlockTypes.Equation, )
31
- model_max_length = 384
32
- texify_batch_size = None
33
- token_buffer = 256
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
36
  super().__init__(config)
 
1
+ from typing import Annotated, List, Optional, Tuple
2
 
3
  from texify.inference import batch_inference
4
  from texify.model.model import GenerateVisionEncoderDecoderModel
 
13
  class EquationProcessor(BaseProcessor):
14
  """
15
  A processor for recognizing equations in the document.
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
+ block_types: Annotated[
18
+ Tuple[BlockTypes],
19
+ "The block types to process.",
20
+ ] = (BlockTypes.Equation,)
21
+ model_max_length: Annotated[
22
+ int,
23
+ "The maximum number of tokens to allow for the Texify model.",
24
+ ] = 384
25
+ texify_batch_size: Annotated[
26
+ Optional[int],
27
+ "The batch size to use for the Texify model.",
28
+ "Default is None, which will use the default batch size for the model."
29
+ ] = None
30
+ token_buffer: Annotated[
31
+ int,
32
+ "The number of tokens to buffer above max for the Texify model.",
33
+ ] = 256
34
 
35
  def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
36
  super().__init__(config)
marker/processors/footnote.py CHANGED
@@ -1,27 +1,12 @@
1
- from statistics import mean
2
-
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
5
- from marker.schema.blocks import Footnote
6
  from marker.schema.document import Document
7
-
8
- from rapidfuzz import fuzz
9
-
10
  from marker.schema.groups import PageGroup
11
 
12
 
13
  class FootnoteProcessor(BaseProcessor):
14
  """
15
  A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
16
-
17
- Attributes:
18
- page_bottom_threshold (float):
19
- The fraction of page height that is considered the bottom.
20
- Default is .8
21
-
22
- line_height_scaler (float):
23
- The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
24
- Default is .99
25
  """
26
  block_types = (BlockTypes.Footnote,)
27
 
@@ -29,7 +14,6 @@ class FootnoteProcessor(BaseProcessor):
29
  for page in document.pages:
30
  self.push_footnotes_to_bottom(page, document)
31
 
32
-
33
  def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
34
  footnote_blocks = page.contained_blocks(document, self.block_types)
35
 
@@ -39,4 +23,4 @@ class FootnoteProcessor(BaseProcessor):
39
  if block.id in page.structure:
40
  # Move to bottom if it is
41
  page.structure.remove(block.id)
42
- page.add_structure(block)
 
 
 
1
  from marker.processors import BaseProcessor
2
  from marker.schema import BlockTypes
 
3
  from marker.schema.document import Document
 
 
 
4
  from marker.schema.groups import PageGroup
5
 
6
 
7
  class FootnoteProcessor(BaseProcessor):
8
  """
9
  A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
 
 
 
 
 
 
 
 
 
10
  """
11
  block_types = (BlockTypes.Footnote,)
12
 
 
14
  for page in document.pages:
15
  self.push_footnotes_to_bottom(page, document)
16
 
 
17
  def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
18
  footnote_blocks = page.contained_blocks(document, self.block_types)
19
 
 
23
  if block.id in page.structure:
24
  # Move to bottom if it is
25
  page.structure.remove(block.id)
26
+ page.add_structure(block)
marker/processors/ignoretext.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  from collections import Counter
3
  from itertools import groupby
4
- from typing import List
5
 
6
  from rapidfuzz import fuzz
7
 
@@ -13,22 +13,34 @@ from marker.schema.document import Document
13
 
14
  class IgnoreTextProcessor(BaseProcessor):
15
  """
16
- A processor for ignoring text blocks that are common elements in the document.
17
-
18
- Attributes:
19
- common_element_threshold (float):
20
- The minimum fraction of pages that a block must appear in to be considered a common element.
21
- Default is 0.6.
22
  """
23
  block_types = (
24
- BlockTypes.Text, BlockTypes.PageHeader,
25
  BlockTypes.PageFooter, BlockTypes.SectionHeader,
26
  BlockTypes.TextInlineMath
27
  )
28
- common_element_threshold = .20
29
- common_element_min_blocks = 3
30
- max_streak = 3 # The maximum number of blocks in a row to consider a common element
31
- text_match_threshold = 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def __call__(self, document: Document):
34
  first_blocks = []
@@ -55,8 +67,8 @@ class IgnoreTextProcessor(BaseProcessor):
55
  @staticmethod
56
  def clean_text(text):
57
  text = text.replace("\n", "").strip()
58
- text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
59
- text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
60
  return text
61
 
62
  def filter_common_elements(self, document, blocks: List[Block]):
@@ -74,7 +86,7 @@ class IgnoreTextProcessor(BaseProcessor):
74
  common = [
75
  k for k, v in counter.items()
76
  if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
77
- and v > self.common_element_min_blocks
78
  ]
79
  if len(common) == 0:
80
  return
 
1
  import re
2
  from collections import Counter
3
  from itertools import groupby
4
+ from typing import Annotated, List
5
 
6
  from rapidfuzz import fuzz
7
 
 
13
 
14
  class IgnoreTextProcessor(BaseProcessor):
15
  """
16
+ A processor for identifying and ignoring common text blocks in a document.
17
+ These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
 
 
 
 
18
  """
19
  block_types = (
20
+ BlockTypes.Text, BlockTypes.PageHeader,
21
  BlockTypes.PageFooter, BlockTypes.SectionHeader,
22
  BlockTypes.TextInlineMath
23
  )
24
+ common_element_threshold: Annotated[
25
+ float,
26
+ "The minimum ratio of pages a text block must appear on to be considered a common element.",
27
+ "Blocks that meet or exceed this threshold are marked as common elements.",
28
+ ] = 0.2
29
+ common_element_min_blocks: Annotated[
30
+ int,
31
+ "The minimum number of occurrences of a text block within a document to consider it a common element.",
32
+ "This ensures that rare blocks are not mistakenly flagged.",
33
+ ] = 3
34
+ max_streak: Annotated[
35
+ int,
36
+ "The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.",
37
+ "Helps to identify patterns like repeated headers or footers.",
38
+ ] = 3
39
+ text_match_threshold: Annotated[
40
+ int,
41
+ "The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.",
42
+ "Higher values enforce stricter matching.",
43
+ ] = 90
44
 
45
  def __call__(self, document: Document):
46
  first_blocks = []
 
67
  @staticmethod
68
  def clean_text(text):
69
  text = text.replace("\n", "").strip()
70
+ text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
71
+ text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
72
  return text
73
 
74
  def filter_common_elements(self, document, blocks: List[Block]):
 
86
  common = [
87
  k for k, v in counter.items()
88
  if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
89
+ and v > self.common_element_min_blocks
90
  ]
91
  if len(common) == 0:
92
  return
marker/processors/line_numbers.py CHANGED
@@ -1,13 +1,29 @@
 
 
1
  from marker.processors import BaseProcessor
2
  from marker.schema import BlockTypes
3
  from marker.schema.document import Document
4
 
5
 
6
  class LineNumbersProcessor(BaseProcessor):
 
 
 
7
  block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
8
- strip_numbers_threshold: int = .6
9
- min_lines_in_block: int = 4
10
- min_line_length: int = 10
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def __init__(self, config):
13
  super().__init__(config)
@@ -27,11 +43,10 @@ class LineNumbersProcessor(BaseProcessor):
27
  tokens_are_numbers = [token.isdigit() for token in tokens]
28
  if all([
29
  sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
30
- block.polygon.height > block.polygon.width # Ensure block is taller than it is wide, like vertical page numbers
31
  ]):
32
  block.ignore_for_output = True
33
 
34
-
35
  def ignore_line_starts_ends(self, document: Document):
36
  for page in document.pages:
37
  for block in page.contained_blocks(document, self.block_types):
@@ -57,7 +72,7 @@ class LineNumbersProcessor(BaseProcessor):
57
  len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
58
  ])
59
 
60
- ends= all([
61
  spans[-1].text.strip().isdigit(),
62
  len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
63
  ])
@@ -76,4 +91,3 @@ class LineNumbersProcessor(BaseProcessor):
76
  if ends:
77
  span = page.get_block(line.structure[-1])
78
  span.ignore_for_output = True
79
-
 
1
+ from typing import Annotated
2
+
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
5
  from marker.schema.document import Document
6
 
7
 
8
  class LineNumbersProcessor(BaseProcessor):
9
+ """
10
+ A processor for ignoring line numbers.
11
+ """
12
  block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
13
+ strip_numbers_threshold: Annotated[
14
+ float,
15
+ "The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
16
+ ] = 0.6
17
+ min_lines_in_block: Annotated[
18
+ int,
19
+ "The minimum number of lines required in a block for it to be considered during processing.",
20
+ "Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
21
+ ] = 4
22
+ min_line_length: Annotated[
23
+ int,
24
+ "The minimum length of a line (in characters) to consider it significant when checking for",
25
+ "numeric prefixes or suffixes. Prevents false positives for short lines.",
26
+ ] = 10
27
 
28
  def __init__(self, config):
29
  super().__init__(config)
 
43
  tokens_are_numbers = [token.isdigit() for token in tokens]
44
  if all([
45
  sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
46
+ block.polygon.height > block.polygon.width # Ensure block is taller than it is wide, like vertical page numbers
47
  ]):
48
  block.ignore_for_output = True
49
 
 
50
  def ignore_line_starts_ends(self, document: Document):
51
  for page in document.pages:
52
  for block in page.contained_blocks(document, self.block_types):
 
72
  len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
73
  ])
74
 
75
+ ends = all([
76
  spans[-1].text.strip().isdigit(),
77
  len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
78
  ])
 
91
  if ends:
92
  span = page.get_block(line.structure[-1])
93
  span.ignore_for_output = True
 
marker/processors/list.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
 
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
@@ -11,8 +11,14 @@ class ListProcessor(BaseProcessor):
11
  A processor for merging lists across pages and columns
12
  """
13
  block_types = (BlockTypes.ListGroup,)
14
- ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
15
- min_x_indent = 0.01 # % of page width
 
 
 
 
 
 
16
 
17
  def __init__(self, config):
18
  super().__init__(config)
 
1
+ from typing import Annotated, List, Tuple
2
 
3
  from marker.processors import BaseProcessor
4
  from marker.schema import BlockTypes
 
11
  A processor for merging lists across pages and columns
12
  """
13
  block_types = (BlockTypes.ListGroup,)
14
+ ignored_block_types: Annotated[
15
+ Tuple[BlockTypes],
16
+ "The list of block types to ignore when merging lists.",
17
+ ] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
18
+ min_x_indent: Annotated[
19
+ float, "The minimum horizontal indentation required to consider a block as a nested list item.",
20
+ "This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
21
+ ] = 0.01
22
 
23
  def __init__(self, config):
24
  super().__init__(config)
marker/processors/llm/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
  from concurrent.futures import ThreadPoolExecutor, as_completed
2
- from typing import Optional
3
 
4
  from tqdm import tqdm
5
 
@@ -14,37 +14,40 @@ from marker.settings import settings
14
  class BaseLLMProcessor(BaseProcessor):
15
  """
16
  A processor for using LLMs to convert blocks.
17
- Attributes:
18
- google_api_key (str):
19
- The Google API key to use for the Gemini model.
20
- Default is None.
21
- model_name (str):
22
- The name of the Gemini model to use.
23
- Default is "gemini-1.5-flash".
24
- max_retries (int):
25
- The maximum number of retries to use for the Gemini model.
26
- Default is 3.
27
- max_concurrency (int):
28
- The maximum number of concurrent requests to make to the Gemini model.
29
- Default is 3.
30
- timeout (int):
31
- The timeout for requests to the Gemini model.
32
- gemini_rewriting_prompt (str):
33
- The prompt to use for rewriting text.
34
- Default is a string containing the Gemini rewriting prompt.
35
- use_llm (bool):
36
- Whether to use the LLM model.
37
- Default is False.
38
  """
39
-
40
- google_api_key: Optional[str] = settings.GOOGLE_API_KEY
41
- model_name: str = "gemini-1.5-flash"
42
- use_llm: bool = False
43
- max_retries: int = 3
44
- max_concurrency: int = 3
45
- timeout: int = 60
46
- image_expansion_ratio: float = 0.01
47
- gemini_rewriting_prompt = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  block_types = None
49
 
50
  def __init__(self, config=None):
@@ -87,4 +90,4 @@ class BaseLLMProcessor(BaseProcessor):
87
  .rescale(page.polygon.size, page_img.size)\
88
  .expand(self.image_expansion_ratio, self.image_expansion_ratio)
89
  cropped = page_img.crop(image_box.bbox)
90
- return cropped
 
1
  from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ from typing import Annotated, Optional
3
 
4
  from tqdm import tqdm
5
 
 
14
  class BaseLLMProcessor(BaseProcessor):
15
  """
16
  A processor for using LLMs to convert blocks.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  """
18
+ google_api_key: Annotated[
19
+ Optional[str],
20
+ "The Google API key to use for the Gemini model.",
21
+ ] = settings.GOOGLE_API_KEY
22
+ model_name: Annotated[
23
+ str,
24
+ "The name of the Gemini model to use.",
25
+ ] = "gemini-1.5-flash"
26
+ max_retries: Annotated[
27
+ int,
28
+ "The maximum number of retries to use for the Gemini model.",
29
+ ] = 3
30
+ max_concurrency: Annotated[
31
+ int,
32
+ "The maximum number of concurrent requests to make to the Gemini model.",
33
+ ] = 3
34
+ timeout: Annotated[
35
+ int,
36
+ "The timeout for requests to the Gemini model.",
37
+ ] = 60
38
+ image_expansion_ratio: Annotated[
39
+ float,
40
+ "The ratio to expand the image by when cropping.",
41
+ ] = 0.01
42
+ gemini_rewriting_prompt: Annotated[
43
+ str,
44
+ "The prompt to use for rewriting text.",
45
+ "Default is a string containing the Gemini rewriting prompt."
46
+ ] = ''
47
+ use_llm: Annotated[
48
+ bool,
49
+ "Whether to use the LLM model.",
50
+ ] = False
51
  block_types = None
52
 
53
  def __init__(self, config=None):
 
90
  .rescale(page.polygon.size, page_img.size)\
91
  .expand(self.image_expansion_ratio, self.image_expansion_ratio)
92
  cropped = page_img.crop(image_box.bbox)
93
+ return cropped
marker/processors/llm/llm_image_description.py CHANGED
@@ -7,11 +7,20 @@ from marker.schema.blocks import Block
7
  from marker.schema.document import Document
8
  from marker.schema.groups.page import PageGroup
9
 
 
 
10
 
11
  class LLMImageDescriptionProcessor(BaseLLMProcessor):
12
  block_types = (BlockTypes.Picture, BlockTypes.Figure,)
13
- extract_images: bool = True
14
- image_description_prompt = """You are a document analysis expert who specializes in creating text descriptions for images.
 
 
 
 
 
 
 
15
  You will receive an image of a picture or figure. Your job will be to create a short description of the image.
16
  **Instructions:**
17
  1. Carefully examine the provided image.
 
7
  from marker.schema.document import Document
8
  from marker.schema.groups.page import PageGroup
9
 
10
+ from typing import Annotated
11
+
12
 
13
  class LLMImageDescriptionProcessor(BaseLLMProcessor):
14
  block_types = (BlockTypes.Picture, BlockTypes.Figure,)
15
+ extract_images: Annotated[
16
+ bool,
17
+ "Extract images from the document."
18
+ ] = True
19
+ image_description_prompt: Annotated[
20
+ str,
21
+ "The prompt to use for generating image descriptions.",
22
+ "Default is a string containing the Gemini prompt."
23
+ ] = """You are a document analysis expert who specializes in creating text descriptions for images.
24
  You will receive an image of a picture or figure. Your job will be to create a short description of the image.
25
  **Instructions:**
26
  1. Carefully examine the provided image.
marker/processors/llm/llm_table.py CHANGED
@@ -1,12 +1,11 @@
1
- from tabled.schema import SpanTableCell
2
 
3
- from marker.processors.llm import BaseLLMProcessor
4
  from bs4 import BeautifulSoup
5
- from typing import List
6
-
7
  from google.ai.generativelanguage_v1beta.types import content
8
  from tabled.formats import html_format
 
9
 
 
10
  from marker.schema import BlockTypes
11
  from marker.schema.blocks import Block
12
  from marker.schema.document import Document
@@ -15,8 +14,15 @@ from marker.schema.polygon import PolygonBox
15
 
16
 
17
  class LLMTableProcessor(BaseLLMProcessor):
18
- block_types = (BlockTypes.Table,)
19
- gemini_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
 
 
 
 
 
 
 
20
  You will receive an image of a text block and an html representation of the table in the image.
21
  Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
22
  **Instructions:**
@@ -92,10 +98,8 @@ No corrections needed.
92
  block.update_metadata(llm_error_count=1)
93
  return
94
 
95
-
96
  block.cells = parsed_cells
97
 
98
-
99
  def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
100
  soup = BeautifulSoup(html_text, 'html.parser')
101
  table = soup.find('table')
@@ -151,5 +155,4 @@ No corrections needed.
151
  cells.append(cell_obj)
152
  cur_col += colspan
153
 
154
-
155
  return cells
 
1
+ from typing import Annotated, List, Tuple
2
 
 
3
  from bs4 import BeautifulSoup
 
 
4
  from google.ai.generativelanguage_v1beta.types import content
5
  from tabled.formats import html_format
6
+ from tabled.schema import SpanTableCell
7
 
8
+ from marker.processors.llm import BaseLLMProcessor
9
  from marker.schema import BlockTypes
10
  from marker.schema.blocks import Block
11
  from marker.schema.document import Document
 
14
 
15
 
16
  class LLMTableProcessor(BaseLLMProcessor):
17
+ block_types: Annotated[
18
+ Tuple[BlockTypes],
19
+ "The block types to process.",
20
+ ] = (BlockTypes.Table,)
21
+ gemini_rewriting_prompt: Annotated[
22
+ str,
23
+ "The prompt to use for rewriting text.",
24
+ "Default is a string containing the Gemini rewriting prompt."
25
+ ] = """You are a text correction expert specializing in accurately reproducing text from images.
26
  You will receive an image of a text block and an html representation of the table in the image.
27
  Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
28
  **Instructions:**
 
98
  block.update_metadata(llm_error_count=1)
99
  return
100
 
 
101
  block.cells = parsed_cells
102
 
 
103
  def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
104
  soup = BeautifulSoup(html_text, 'html.parser')
105
  table = soup.find('table')
 
155
  cells.append(cell_obj)
156
  cur_col += colspan
157
 
 
158
  return cells
marker/processors/sectionheader.py CHANGED
@@ -1,5 +1,5 @@
1
  import warnings
2
- from typing import Dict, List
3
 
4
  import numpy as np
5
  from sklearn.cluster import KMeans
@@ -16,29 +16,24 @@ warnings.filterwarnings("ignore", category=ConvergenceWarning)
16
  class SectionHeaderProcessor(BaseProcessor):
17
  """
18
  A processor for recognizing section headers in the document.
19
-
20
- Attributes:
21
- level_count (int):
22
- The number of levels to use for headings.
23
- Default is 4.
24
-
25
- merge_threshold (float):
26
- The minimum gap between headings to consider them part of the same group.
27
- Default is 0.25.
28
-
29
- default_level (int):
30
- The default heading level to use if no heading level is detected.
31
- Default is 2.
32
-
33
- height_tolerance (float):
34
- The minimum height of a heading to consider it a heading.
35
- Default is 0.99.
36
  """
37
  block_types = (BlockTypes.SectionHeader, )
38
- level_count = 4
39
- merge_threshold = .25
40
- default_level = 2
41
- height_tolerance = .99
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def __call__(self, document: Document):
44
  line_heights: Dict[int, List[float]] = {}
@@ -48,7 +43,7 @@ class SectionHeaderProcessor(BaseProcessor):
48
  line_heights[block.id] = block.line_height(document)
49
  else:
50
  line_heights[block.id] = 0
51
- block.ignore_for_output = True # Don't output an empty section header
52
 
53
  flat_line_heights = list(line_heights.values())
54
  heading_ranges = self.bucket_headings(flat_line_heights)
 
1
  import warnings
2
+ from typing import Annotated, Dict, List
3
 
4
  import numpy as np
5
  from sklearn.cluster import KMeans
 
16
  class SectionHeaderProcessor(BaseProcessor):
17
  """
18
  A processor for recognizing section headers in the document.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  """
20
  block_types = (BlockTypes.SectionHeader, )
21
+ level_count: Annotated[
22
+ int,
23
+ "The number of levels to use for headings.",
24
+ ] = 4
25
+ merge_threshold: Annotated[
26
+ float,
27
+ "The minimum gap between headings to consider them part of the same group.",
28
+ ] = 0.25
29
+ default_level: Annotated[
30
+ int,
31
+ "The default heading level to use if no heading level is detected.",
32
+ ] = 2
33
+ height_tolerance: Annotated[
34
+ float,
35
+ "The minimum height of a heading to consider it a heading.",
36
+ ] = 0.99
37
 
38
  def __call__(self, document: Document):
39
  line_heights: Dict[int, List[float]] = {}
 
43
  line_heights[block.id] = block.line_height(document)
44
  else:
45
  line_heights[block.id] = 0
46
+ block.ignore_for_output = True # Don't output an empty section header
47
 
48
  flat_line_heights = list(line_heights.values())
49
  heading_ranges = self.bucket_headings(flat_line_heights)
marker/processors/table.py CHANGED
@@ -1,4 +1,6 @@
1
 
 
 
2
  from ftfy import fix_text
3
  from surya.input.pdflines import get_page_text_lines
4
  from surya.model.detection.model import EfficientViTForSemanticSegmentation
@@ -16,29 +18,27 @@ from marker.settings import settings
16
  class TableProcessor(BaseProcessor):
17
  """
18
  A processor for recognizing tables in the document.
19
-
20
- Attributes:
21
- detect_boxes (bool):
22
- Whether to detect boxes for the table recognition model.
23
- Default is False.
24
-
25
- detector_batch_size (int):
26
- The batch size to use for the table detection model.
27
- Default is None, which will use the default batch size for the model.
28
-
29
- table_rec_batch_size (int):
30
- The batch size to use for the table recognition model.
31
- Default is None, which will use the default batch size for the model.
32
-
33
- recognition_batch_size (int):
34
- The batch size to use for the table recognition model.
35
- Default is None, which will use the default batch size for the model.
36
  """
37
  block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
38
- detect_boxes = False
39
- detector_batch_size = None
40
- table_rec_batch_size = None
41
- recognition_batch_size = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def __init__(
44
  self,
 
1
 
2
+ from typing import Annotated
3
+
4
  from ftfy import fix_text
5
  from surya.input.pdflines import get_page_text_lines
6
  from surya.model.detection.model import EfficientViTForSemanticSegmentation
 
18
  class TableProcessor(BaseProcessor):
19
  """
20
  A processor for recognizing tables in the document.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  """
22
  block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
23
+ detect_boxes: Annotated[
24
+ bool,
25
+ "Whether to detect boxes for the table recognition model.",
26
+ ] = False
27
+ detector_batch_size: Annotated[
28
+ int,
29
+ "The batch size to use for the table detection model.",
30
+ "Default is None, which will use the default batch size for the model."
31
+ ] = None
32
+ table_rec_batch_size: Annotated[
33
+ int,
34
+ "The batch size to use for the table recognition model.",
35
+ "Default is None, which will use the default batch size for the model."
36
+ ] = None
37
+ recognition_batch_size: Annotated[
38
+ int,
39
+ "The batch size to use for the table recognition model.",
40
+ "Default is None, which will use the default batch size for the model."
41
+ ] = None
42
 
43
  def __init__(
44
  self,
marker/processors/text.py CHANGED
@@ -1,5 +1,5 @@
1
  import math
2
- from typing import List
3
 
4
  import regex
5
 
@@ -12,15 +12,13 @@ from marker.schema.text.line import Line
12
  class TextProcessor(BaseProcessor):
13
  """
14
  A processor for merging text across pages and columns.
15
-
16
- Attributes:
17
- column_gap_ratio (float):
18
- The minimum ratio of the page width to the column gap to consider a column break.
19
- Default is 0.02.
20
  """
21
  block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
22
  ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
23
- column_gap_ratio = 0.02 # column gaps are atleast 2% of the current column width
 
 
 
24
 
25
  def __init__(self, config):
26
  super().__init__(config)
@@ -35,14 +33,14 @@ class TextProcessor(BaseProcessor):
35
  continue
36
 
37
  next_block = document.get_next_block(block, self.ignored_block_types)
38
- if next_block is None: # we've reached the end of the document
39
  continue
40
  if next_block.block_type not in self.block_types:
41
- continue # we found a non-text block
42
  if next_block.structure is None:
43
  continue # This is odd though, why do we have text blocks with no structure?
44
  if next_block.ignore_for_output:
45
- continue # skip ignored blocks
46
 
47
  column_gap = block.polygon.width * self.column_gap_ratio
48
 
@@ -53,7 +51,7 @@ class TextProcessor(BaseProcessor):
53
  last_line_is_hyphentated = False
54
  new_block_lines = []
55
 
56
- if next_block.page_id == block.page_id: # block on the same page
57
  # we check for a column break
58
  column_break = (
59
  math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
@@ -63,11 +61,11 @@ class TextProcessor(BaseProcessor):
63
  page_break = True
64
  next_page = document.get_page(next_block.page_id)
65
  next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
66
- (next_block.polygon.y_start < next_page.polygon.height // 2)
67
 
68
  if not (column_break or page_break):
69
  continue
70
-
71
  new_block_lines = next_block.structure_blocks(document)
72
 
73
  # we check for next_block indentation
 
1
  import math
2
+ from typing import Annotated, List
3
 
4
  import regex
5
 
 
12
  class TextProcessor(BaseProcessor):
13
  """
14
  A processor for merging text across pages and columns.
 
 
 
 
 
15
  """
16
  block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
17
  ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
18
+ column_gap_ratio: Annotated[
19
+ float,
20
+ "The minimum ratio of the page width to the column gap to consider a column break.",
21
+ ] = 0.02
22
 
23
  def __init__(self, config):
24
  super().__init__(config)
 
33
  continue
34
 
35
  next_block = document.get_next_block(block, self.ignored_block_types)
36
+ if next_block is None: # we've reached the end of the document
37
  continue
38
  if next_block.block_type not in self.block_types:
39
+ continue # we found a non-text block
40
  if next_block.structure is None:
41
  continue # This is odd though, why do we have text blocks with no structure?
42
  if next_block.ignore_for_output:
43
+ continue # skip ignored blocks
44
 
45
  column_gap = block.polygon.width * self.column_gap_ratio
46
 
 
51
  last_line_is_hyphentated = False
52
  new_block_lines = []
53
 
54
+ if next_block.page_id == block.page_id: # block on the same page
55
  # we check for a column break
56
  column_break = (
57
  math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
 
61
  page_break = True
62
  next_page = document.get_page(next_block.page_id)
63
  next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
64
+ (next_block.polygon.y_start < next_page.polygon.height // 2)
65
 
66
  if not (column_break or page_break):
67
  continue
68
+
69
  new_block_lines = next_block.structure_blocks(document)
70
 
71
  # we check for next_block indentation
marker/providers/pdf.py CHANGED
@@ -1,7 +1,7 @@
1
  import atexit
2
  import ctypes
3
  import re
4
- from typing import List, Set
5
 
6
  import pypdfium2 as pdfium
7
  import pypdfium2.raw as pdfium_c
@@ -19,16 +19,51 @@ from marker.schema.text.span import Span
19
 
20
 
21
  class PdfProvider(BaseProvider):
22
- page_range: List[int] | None = None
23
- pdftext_workers: int = 4
24
- flatten_pdf: bool = True
25
- force_ocr: bool = False
26
- ocr_invalid_chars: tuple = (chr(0xfffd), "�")
27
- ocr_space_threshold: float = .7
28
- ocr_newline_threshold: float = .6
29
- ocr_alphanum_threshold: float = .3
30
- image_threshold: float = .65
31
- strip_existing_ocr: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def __init__(self, filepath: str, config=None):
34
  super().__init__(filepath, config)
@@ -57,7 +92,7 @@ class PdfProvider(BaseProvider):
57
  if self.doc is not None:
58
  self.doc.close()
59
 
60
- def font_flags_to_format(self, flags: int | None) -> Set[str]:
61
  if flags is None:
62
  return {"plain"}
63
 
@@ -188,35 +223,33 @@ class PdfProvider(BaseProvider):
188
  if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
189
  return False
190
 
191
- if not self.strip_existing_ocr:
192
- return True
193
-
194
- # If any text objects on the page are in invisible render mode, skip this page
195
- for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
196
- if pdfium_c.FPDFTextObj_GetTextRenderMode(text_obj) in [pdfium_c.FPDF_TEXTRENDERMODE_INVISIBLE, pdfium_c.FPDF_TEXTRENDERMODE_UNKNOWN]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  return False
198
 
199
- non_embedded_fonts = []
200
- empty_fonts = []
201
- font_map = {}
202
- for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
203
- font = pdfium_c.FPDFTextObj_GetFont(text_obj)
204
- font_name = self.get_fontname(font)
205
-
206
- # we also skip pages without embedded fonts and fonts without names
207
- non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
208
- empty_fonts.append(not font_name or font_name == "GlyphLessFont")
209
- if font_name not in font_map:
210
- font_map[font_name or 'Unknown'] = font
211
-
212
- if all(non_embedded_fonts) or all(empty_fonts):
213
- return False
214
-
215
- # if we see very large images covering most of the page, we can skip this page
216
- for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
217
- img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
218
- if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
219
- return False
220
 
221
  return True
222
 
@@ -265,8 +298,8 @@ class PdfProvider(BaseProvider):
265
 
266
  def get_fontname(self, font) -> str:
267
  font_name = ""
268
- buffer_size = 256
269
-
270
  try:
271
  font_name_buffer = ctypes.create_string_buffer(buffer_size)
272
  length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
 
1
  import atexit
2
  import ctypes
3
  import re
4
+ from typing import Annotated, List, Optional, Set
5
 
6
  import pypdfium2 as pdfium
7
  import pypdfium2.raw as pdfium_c
 
19
 
20
 
21
  class PdfProvider(BaseProvider):
22
+ """
23
+ A provider for PDF files.
24
+ """
25
+
26
+ page_range: Annotated[
27
+ Optional[List[int]],
28
+ "The range of pages to process.",
29
+ "Default is None, which will process all pages."
30
+ ] = None
31
+ pdftext_workers: Annotated[
32
+ int,
33
+ "The number of workers to use for pdftext.",
34
+ ] = 4
35
+ flatten_pdf: Annotated[
36
+ bool,
37
+ "Whether to flatten the PDF structure.",
38
+ ] = True
39
+ force_ocr: Annotated[
40
+ bool,
41
+ "Whether to force OCR on the whole document.",
42
+ ] = False
43
+ ocr_invalid_chars: Annotated[
44
+ tuple,
45
+ "The characters to consider invalid for OCR.",
46
+ ] = (chr(0xfffd), "�")
47
+ ocr_space_threshold: Annotated[
48
+ float,
49
+ "The minimum ratio of spaces to non-spaces to detect bad text.",
50
+ ] = .7
51
+ ocr_newline_threshold: Annotated[
52
+ float,
53
+ "The minimum ratio of newlines to non-newlines to detect bad text.",
54
+ ] = .6
55
+ ocr_alphanum_threshold: Annotated[
56
+ float,
57
+ "The minimum ratio of alphanumeric characters to non-alphanumeric characters to consider an alphanumeric character.",
58
+ ] = .3
59
+ image_threshold: Annotated[
60
+ float,
61
+ "The minimum coverage ratio of the image to the page to consider skipping the page.",
62
+ ] = .65
63
+ strip_existing_ocr: Annotated[
64
+ bool,
65
+ "Whether to strip existing OCR text from the PDF.",
66
+ ] = False
67
 
68
  def __init__(self, filepath: str, config=None):
69
  super().__init__(filepath, config)
 
92
  if self.doc is not None:
93
  self.doc.close()
94
 
95
+ def font_flags_to_format(self, flags: Optional[int]) -> Set[str]:
96
  if flags is None:
97
  return {"plain"}
98
 
 
223
  if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
224
  return False
225
 
226
+ if self.strip_existing_ocr:
227
+ # If any text objects on the page are in invisible render mode, skip this page
228
+ for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
229
+ if pdfium_c.FPDFTextObj_GetTextRenderMode(text_obj) in [pdfium_c.FPDF_TEXTRENDERMODE_INVISIBLE, pdfium_c.FPDF_TEXTRENDERMODE_UNKNOWN]:
230
+ return False
231
+
232
+ non_embedded_fonts = []
233
+ empty_fonts = []
234
+ font_map = {}
235
+ for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
236
+ font = pdfium_c.FPDFTextObj_GetFont(text_obj)
237
+ font_name = self.get_fontname(font)
238
+
239
+ # we also skip pages without embedded fonts and fonts without names
240
+ non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
241
+ empty_fonts.append(not font_name or font_name == "GlyphLessFont")
242
+ if font_name not in font_map:
243
+ font_map[font_name or 'Unknown'] = font
244
+
245
+ if all(non_embedded_fonts) or all(empty_fonts):
246
  return False
247
 
248
+ # if we see very large images covering most of the page, we can skip this page
249
+ for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
250
+ img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
251
+ if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
252
+ return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  return True
255
 
 
298
 
299
  def get_fontname(self, font) -> str:
300
  font_name = ""
301
+ buffer_size = 256
302
+
303
  try:
304
  font_name_buffer = ctypes.create_string_buffer(buffer_size)
305
  length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
marker/renderers/__init__.py CHANGED
@@ -2,7 +2,7 @@ import base64
2
  import io
3
  import re
4
  from collections import Counter
5
- from typing import Optional
6
 
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
@@ -15,9 +15,9 @@ from marker.util import assign_config
15
 
16
 
17
  class BaseRenderer:
18
- remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
19
- image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
20
- extract_images: bool = True
21
 
22
  def __init__(self, config: Optional[BaseModel | dict] = None):
23
  assign_config(self, config)
@@ -71,7 +71,7 @@ class BaseRenderer:
71
  return page_stats
72
 
73
  def generate_document_metadata(self, document: Document, document_output):
74
- metadata = {
75
  "table_of_contents": document.table_of_contents,
76
  "page_stats": self.generate_page_stats(document, document_output),
77
  }
 
2
  import io
3
  import re
4
  from collections import Counter
5
+ from typing import Annotated, Optional, Tuple
6
 
7
  from bs4 import BeautifulSoup
8
  from pydantic import BaseModel
 
15
 
16
 
17
  class BaseRenderer:
18
+ remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
19
+ image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
20
+ extract_images: Annotated[bool, "Extract images from the document."] = True
21
 
22
  def __init__(self, config: Optional[BaseModel | dict] = None):
23
  assign_config(self, config)
 
71
  return page_stats
72
 
73
  def generate_document_metadata(self, document: Document, document_output):
74
+ metadata = {
75
  "table_of_contents": document.table_of_contents,
76
  "page_stats": self.generate_page_stats(document, document_output),
77
  }
marker/renderers/html.py CHANGED
@@ -1,4 +1,5 @@
1
- from typing import Literal
 
2
 
3
  from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
4
  from pydantic import BaseModel
@@ -13,7 +14,6 @@ import warnings
13
  warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
14
 
15
  # Suppress DecompressionBombError
16
- from PIL import Image
17
  Image.MAX_IMAGE_PIXELS = None
18
 
19
 
@@ -24,9 +24,21 @@ class HTMLOutput(BaseModel):
24
 
25
 
26
  class HTMLRenderer(BaseRenderer):
27
- page_blocks: list = [BlockTypes.Page]
28
- paginate_output: bool = False
29
- image_extraction_mode: Literal["lowres", "highres"] = "highres"
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def extract_image(self, document, image_id):
32
  image_block = document.get_block(image_id)
 
1
+ from PIL import Image
2
+ from typing import Annotated, Literal, Tuple
3
 
4
  from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
5
  from pydantic import BaseModel
 
14
  warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
15
 
16
  # Suppress DecompressionBombError
 
17
  Image.MAX_IMAGE_PIXELS = None
18
 
19
 
 
24
 
25
 
26
  class HTMLRenderer(BaseRenderer):
27
+ """
28
+ A renderer for HTML output.
29
+ """
30
+ page_blocks: Annotated[
31
+ Tuple[BlockTypes],
32
+ "The block types to consider as pages.",
33
+ ] = (BlockTypes.Page,)
34
+ paginate_output: Annotated[
35
+ bool,
36
+ "Whether to paginate the output.",
37
+ ] = False
38
+ image_extraction_mode: Annotated[
39
+ Literal["lowres", "highres"],
40
+ "The mode to use for extracting images.",
41
+ ] = "highres"
42
 
43
  def extract_image(self, document, image_id):
44
  image_block = document.get_block(image_id)
marker/renderers/json.py CHANGED
@@ -1,6 +1,4 @@
1
- from __future__ import annotations
2
-
3
- from typing import Dict, List
4
 
5
  from pydantic import BaseModel
6
 
@@ -16,7 +14,7 @@ class JSONBlockOutput(BaseModel):
16
  block_type: str
17
  html: str
18
  polygon: List[List[float]]
19
- children: List[JSONBlockOutput] | None = None
20
  section_hierarchy: Dict[int, str] | None = None
21
  images: dict | None = None
22
 
@@ -35,8 +33,17 @@ def reformat_section_hierarchy(section_hierarchy):
35
 
36
 
37
  class JSONRenderer(BaseRenderer):
38
- image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
39
- page_blocks: list = [BlockTypes.Page]
 
 
 
 
 
 
 
 
 
40
 
41
  def extract_json(self, document: Document, block_output: BlockOutput):
42
  cls = get_block_class(block_output.id.block_type)
 
1
+ from typing import Annotated, Dict, List, Tuple
 
 
2
 
3
  from pydantic import BaseModel
4
 
 
14
  block_type: str
15
  html: str
16
  polygon: List[List[float]]
17
+ children: List['JSONBlockOutput'] | None = None
18
  section_hierarchy: Dict[int, str] | None = None
19
  images: dict | None = None
20
 
 
33
 
34
 
35
  class JSONRenderer(BaseRenderer):
36
+ """
37
+ A renderer for JSON output.
38
+ """
39
+ image_blocks: Annotated[
40
+ Tuple[BlockTypes],
41
+ "The list of block types to consider as images.",
42
+ ] = (BlockTypes.Picture, BlockTypes.Figure)
43
+ page_blocks: Annotated[
44
+ Tuple[BlockTypes],
45
+ "The list of block types to consider as pages.",
46
+ ] = (BlockTypes.Page,)
47
 
48
  def extract_json(self, document: Document, block_output: BlockOutput):
49
  cls = get_block_class(block_output.id.block_type)
marker/renderers/markdown.py CHANGED
@@ -1,5 +1,5 @@
1
  import re
2
- from typing import List
3
 
4
  import regex
5
  from markdownify import MarkdownConverter
@@ -62,7 +62,6 @@ class Markdownify(MarkdownConverter):
62
  return super().convert_th(el, text, convert_as_inline)
63
 
64
 
65
-
66
  class MarkdownOutput(BaseModel):
67
  markdown: str
68
  images: dict
@@ -70,9 +69,9 @@ class MarkdownOutput(BaseModel):
70
 
71
 
72
  class MarkdownRenderer(HTMLRenderer):
73
- page_separator: str = "-" * 48
74
- inline_math_delimiters: List[str] = ["$", "$"]
75
- block_math_delimiters: List[str] = ["$$", "$$"]
76
 
77
  def __call__(self, document: Document) -> MarkdownOutput:
78
  document_output = document.render()
 
1
  import re
2
+ from typing import Annotated, Tuple
3
 
4
  import regex
5
  from markdownify import MarkdownConverter
 
62
  return super().convert_th(el, text, convert_as_inline)
63
 
64
 
 
65
  class MarkdownOutput(BaseModel):
66
  markdown: str
67
  images: dict
 
69
 
70
 
71
  class MarkdownRenderer(HTMLRenderer):
72
+ page_separator: Annotated[str, "The separator to use between pages.", "Default is '-' * 48."] = "-" * 48
73
+ inline_math_delimiters: Annotated[Tuple[str], "The delimiters to use for inline math."] = ("$", "$")
74
+ block_math_delimiters: Annotated[Tuple[str], "The delimiters to use for block math."] = ("$$", "$$")
75
 
76
  def __call__(self, document: Document) -> MarkdownOutput:
77
  document_output = document.render()
marker/schema/blocks/base.py CHANGED
@@ -1,6 +1,6 @@
1
  from __future__ import annotations
2
 
3
- from typing import TYPE_CHECKING, List, Literal, Optional, Dict, Sequence
4
 
5
  from pydantic import BaseModel, ConfigDict, field_validator
6
 
@@ -33,7 +33,7 @@ class BlockOutput(BaseModel):
33
 
34
  class BlockId(BaseModel):
35
  page_id: int
36
- block_id: int | None = None
37
  block_type: BlockTypes | None = None
38
 
39
  def __str__(self):
 
1
  from __future__ import annotations
2
 
3
+ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence
4
 
5
  from pydantic import BaseModel, ConfigDict, field_validator
6
 
 
33
 
34
  class BlockId(BaseModel):
35
  page_id: int
36
+ block_id: Optional[int] = None
37
  block_type: BlockTypes | None = None
38
 
39
  def __str__(self):
marker/schema/blocks/sectionheader.py CHANGED
@@ -1,10 +1,12 @@
 
 
1
  from marker.schema import BlockTypes
2
  from marker.schema.blocks import Block
3
 
4
 
5
  class SectionHeader(Block):
6
  block_type: BlockTypes = BlockTypes.SectionHeader
7
- heading_level: int | None = None
8
 
9
  def assemble_html(self, child_blocks, parent_structure):
10
  if self.ignore_for_output:
 
1
+ from typing import Optional
2
+
3
  from marker.schema import BlockTypes
4
  from marker.schema.blocks import Block
5
 
6
 
7
  class SectionHeader(Block):
8
  block_type: BlockTypes = BlockTypes.SectionHeader
9
+ heading_level: Optional[int] = None
10
 
11
  def assemble_html(self, child_blocks, parent_structure):
12
  if self.ignore_for_output:
marker/util.py CHANGED
@@ -1,4 +1,5 @@
1
  import inspect
 
2
  from importlib import import_module
3
  from typing import List
4
 
@@ -56,7 +57,7 @@ def parse_range_str(range_str: str) -> List[int]:
56
  page_lst += list(range(int(start), int(end) + 1))
57
  else:
58
  page_lst.append(int(i))
59
- page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
60
  return page_lst
61
 
62
 
 
1
  import inspect
2
+ import re
3
  from importlib import import_module
4
  from typing import List
5
 
 
57
  page_lst += list(range(int(start), int(end) + 1))
58
  else:
59
  page_lst.append(int(i))
60
+ page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
61
  return page_lst
62
 
63