Merge remote-tracking branch 'origin/dev' into vik_dev
Browse files

Files changed:
- convert.py +2 -1
- marker/builders/document.py +10 -12
- marker/builders/layout.py +34 -35
- marker/builders/llm_layout.py +40 -46
- marker/builders/ocr.py +16 -16
- marker/builders/structure.py +12 -13
- marker/config/crawler.py +106 -0
- marker/config/parser.py +16 -18
- marker/config/printer.py +38 -37
- marker/converters/pdf.py +22 -19
- marker/processors/blockquote.py +22 -5
- marker/processors/debug.py +30 -34
- marker/processors/equation.py +18 -18
- marker/processors/footnote.py +1 -17
- marker/processors/ignoretext.py +27 -15
- marker/processors/line_numbers.py +21 -7
- marker/processors/list.py +9 -3
- marker/processors/llm/__init__.py +35 -32
- marker/processors/llm/llm_image_description.py +11 -2
- marker/processors/llm/llm_table.py +12 -9
- marker/processors/sectionheader.py +18 -23
- marker/processors/table.py +21 -21
- marker/processors/text.py +11 -13
- marker/providers/pdf.py +74 -41
- marker/renderers/__init__.py +5 -5
- marker/renderers/html.py +17 -5
- marker/renderers/json.py +13 -6
- marker/renderers/markdown.py +4 -5
- marker/schema/blocks/base.py +2 -2
- marker/schema/blocks/sectionheader.py +3 -1
- marker/util.py +2 -1
convert.py
CHANGED
|
@@ -13,6 +13,7 @@ import torch.multiprocessing as mp
|
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
from marker.config.parser import ConfigParser
|
|
|
|
| 16 |
from marker.converters.pdf import PdfConverter
|
| 17 |
from marker.logger import configure_logging
|
| 18 |
from marker.models import create_model_dict
|
|
@@ -59,7 +60,7 @@ def process_single_pdf(args):
|
|
| 59 |
print(traceback.format_exc())
|
| 60 |
|
| 61 |
|
| 62 |
-
@click.command()
|
| 63 |
@click.argument("in_folder", type=str)
|
| 64 |
@ConfigParser.common_options
|
| 65 |
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
|
|
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
from marker.config.parser import ConfigParser
|
| 16 |
+
from marker.config.printer import CustomClickPrinter
|
| 17 |
from marker.converters.pdf import PdfConverter
|
| 18 |
from marker.logger import configure_logging
|
| 19 |
from marker.models import create_model_dict
|
|
|
|
| 60 |
print(traceback.format_exc())
|
| 61 |
|
| 62 |
|
| 63 |
+
@click.command(cls=CustomClickPrinter)
|
| 64 |
@click.argument("in_folder", type=str)
|
| 65 |
@ConfigParser.common_options
|
| 66 |
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
marker/builders/document.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from
|
|
|
|
| 2 |
from marker.builders import BaseBuilder
|
| 3 |
from marker.builders.layout import LayoutBuilder
|
| 4 |
from marker.builders.ocr import OcrBuilder
|
|
@@ -12,18 +13,15 @@ from marker.schema.registry import get_block_class
|
|
| 12 |
class DocumentBuilder(BaseBuilder):
|
| 13 |
"""
|
| 14 |
Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
|
| 15 |
-
|
| 16 |
-
Attributes:
|
| 17 |
-
lowres_image_dpi (int):
|
| 18 |
-
DPI setting for low-resolution page images used for Layout and Line Detection.
|
| 19 |
-
Default is 96.
|
| 20 |
-
|
| 21 |
-
highres_image_dpi (int):
|
| 22 |
-
DPI setting for high-resolution page images used for OCR.
|
| 23 |
-
Default is 192.
|
| 24 |
"""
|
| 25 |
-
lowres_image_dpi:
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 29 |
document = self.build_document(provider)
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.builders.layout import LayoutBuilder
|
| 5 |
from marker.builders.ocr import OcrBuilder
|
|
|
|
| 13 |
class DocumentBuilder(BaseBuilder):
|
| 14 |
"""
|
| 15 |
Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
+
lowres_image_dpi: Annotated[
|
| 18 |
+
int,
|
| 19 |
+
"DPI setting for low-resolution page images used for Layout and Line Detection.",
|
| 20 |
+
] = 96
|
| 21 |
+
highres_image_dpi: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"DPI setting for high-resolution page images used for OCR.",
|
| 24 |
+
] = 192
|
| 25 |
|
| 26 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 27 |
document = self.build_document(provider)
|
marker/builders/layout.py
CHANGED
|
@@ -1,15 +1,12 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
from surya.layout import batch_layout_detection
|
| 5 |
-
from surya.schema import LayoutResult
|
| 6 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 7 |
-
|
| 8 |
-
from surya.ocr_error import batch_ocr_error_detection
|
| 9 |
-
from surya.schema import OCRErrorDetectionResult
|
| 10 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
from marker.settings import settings
|
| 13 |
from marker.builders import BaseBuilder
|
| 14 |
from marker.providers import ProviderOutput, ProviderPageLines
|
| 15 |
from marker.providers.pdf import PdfProvider
|
|
@@ -18,40 +15,42 @@ from marker.schema.document import Document
|
|
| 18 |
from marker.schema.groups.page import PageGroup
|
| 19 |
from marker.schema.polygon import PolygonBox
|
| 20 |
from marker.schema.registry import get_block_class
|
|
|
|
| 21 |
from marker.util import matrix_intersection_area
|
| 22 |
|
| 23 |
|
| 24 |
class LayoutBuilder(BaseBuilder):
|
| 25 |
"""
|
| 26 |
A builder for performing layout detection on PDF pages and merging the results into the document.
|
| 27 |
-
|
| 28 |
-
Attributes:
|
| 29 |
-
batch_size (int):
|
| 30 |
-
The batch size to use for the layout model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
layout_coverage_min_lines (int):
|
| 34 |
-
The minimum number of PdfProvider lines that must be covered by the layout model
|
| 35 |
-
to consider the lines from the PdfProvider valid. Default is 1.
|
| 36 |
-
|
| 37 |
-
layout_coverage_threshold (float):
|
| 38 |
-
The minimum coverage ratio required for the layout model to consider
|
| 39 |
-
the lines from the PdfProvider valid. Default is 0.3.
|
| 40 |
-
|
| 41 |
-
document_ocr_threshold (float):
|
| 42 |
-
The minimum ratio of pages that must pass the layout coverage check
|
| 43 |
-
to avoid OCR. Default is 0.8.
|
| 44 |
-
|
| 45 |
-
error_model_segment_length (int):
|
| 46 |
-
The maximum number of characters to send to the OCR error model.
|
| 47 |
-
Default is 1024.
|
| 48 |
"""
|
| 49 |
-
batch_size
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
|
| 57 |
self.layout_model = layout_model
|
|
@@ -81,7 +80,7 @@ class LayoutBuilder(BaseBuilder):
|
|
| 81 |
)
|
| 82 |
return layout_results
|
| 83 |
|
| 84 |
-
def surya_ocr_error_detection(self, pages:List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
|
| 85 |
page_texts = []
|
| 86 |
for document_page in pages:
|
| 87 |
page_text = ''
|
|
@@ -102,7 +101,7 @@ class LayoutBuilder(BaseBuilder):
|
|
| 102 |
page_texts,
|
| 103 |
self.ocr_error_model,
|
| 104 |
self.ocr_error_model.tokenizer,
|
| 105 |
-
batch_size=int(self.get_batch_size())
|
| 106 |
)
|
| 107 |
return ocr_error_detection_results
|
| 108 |
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional, Tuple
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
from surya.layout import batch_layout_detection
|
|
|
|
| 5 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
|
|
|
|
|
|
|
|
|
| 6 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 7 |
+
from surya.ocr_error import batch_ocr_error_detection
|
| 8 |
+
from surya.schema import LayoutResult, OCRErrorDetectionResult
|
| 9 |
|
|
|
|
| 10 |
from marker.builders import BaseBuilder
|
| 11 |
from marker.providers import ProviderOutput, ProviderPageLines
|
| 12 |
from marker.providers.pdf import PdfProvider
|
|
|
|
| 15 |
from marker.schema.groups.page import PageGroup
|
| 16 |
from marker.schema.polygon import PolygonBox
|
| 17 |
from marker.schema.registry import get_block_class
|
| 18 |
+
from marker.settings import settings
|
| 19 |
from marker.util import matrix_intersection_area
|
| 20 |
|
| 21 |
|
| 22 |
class LayoutBuilder(BaseBuilder):
|
| 23 |
"""
|
| 24 |
A builder for performing layout detection on PDF pages and merging the results into the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
+
batch_size: Annotated[
|
| 27 |
+
Optional[int],
|
| 28 |
+
"The batch size to use for the layout model.",
|
| 29 |
+
"Default is None, which will use the default batch size for the model."
|
| 30 |
+
] = None
|
| 31 |
+
layout_coverage_min_lines: Annotated[
|
| 32 |
+
int,
|
| 33 |
+
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
| 34 |
+
"to consider the lines from the PdfProvider valid.",
|
| 35 |
+
] = 1
|
| 36 |
+
layout_coverage_threshold: Annotated[
|
| 37 |
+
float,
|
| 38 |
+
"The minimum coverage ratio required for the layout model to consider",
|
| 39 |
+
"the lines from the PdfProvider valid.",
|
| 40 |
+
] = .1
|
| 41 |
+
document_ocr_threshold: Annotated[
|
| 42 |
+
float,
|
| 43 |
+
"The minimum ratio of pages that must pass the layout coverage check",
|
| 44 |
+
"to avoid OCR.",
|
| 45 |
+
] = .8
|
| 46 |
+
error_model_segment_length: Annotated[
|
| 47 |
+
int,
|
| 48 |
+
"The maximum number of characters to send to the OCR error model.",
|
| 49 |
+
] = 512
|
| 50 |
+
excluded_for_coverage: Annotated[
|
| 51 |
+
Tuple[BlockTypes],
|
| 52 |
+
"A list of block types to exclude from the layout coverage check.",
|
| 53 |
+
] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
|
| 54 |
|
| 55 |
def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
|
| 56 |
self.layout_model = layout_model
|
|
|
|
| 80 |
)
|
| 81 |
return layout_results
|
| 82 |
|
| 83 |
+
def surya_ocr_error_detection(self, pages: List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
|
| 84 |
page_texts = []
|
| 85 |
for document_page in pages:
|
| 86 |
page_text = ''
|
|
|
|
| 101 |
page_texts,
|
| 102 |
self.ocr_error_model,
|
| 103 |
self.ocr_error_model.tokenizer,
|
| 104 |
+
batch_size=int(self.get_batch_size()) # TODO Better Multiplier
|
| 105 |
)
|
| 106 |
return ocr_error_detection_results
|
| 107 |
|
marker/builders/llm_layout.py
CHANGED
|
@@ -1,13 +1,8 @@
|
|
| 1 |
import json
|
| 2 |
-
import time
|
| 3 |
-
import traceback
|
| 4 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 5 |
-
from typing import Optional
|
| 6 |
|
| 7 |
-
import google.generativeai as genai
|
| 8 |
-
import PIL
|
| 9 |
from google.ai.generativelanguage_v1beta.types import content
|
| 10 |
-
from google.api_core.exceptions import ResourceExhausted
|
| 11 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 12 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 13 |
from tqdm import tqdm
|
|
@@ -26,45 +21,41 @@ from marker.settings import settings
|
|
| 26 |
class LLMLayoutBuilder(LayoutBuilder):
|
| 27 |
"""
|
| 28 |
A builder for relabelling blocks to improve the quality of the layout.
|
| 29 |
-
|
| 30 |
-
Attributes:
|
| 31 |
-
google_api_key (str):
|
| 32 |
-
The Google API key to use for the Gemini model.
|
| 33 |
-
Default is None.
|
| 34 |
-
confidence_threshold (float):
|
| 35 |
-
The confidence threshold to use for relabeling.
|
| 36 |
-
Default is 0.75.
|
| 37 |
-
picture_height_threshold (float):
|
| 38 |
-
The height threshold for pictures that may actually be complex regions.
|
| 39 |
-
model_name (str):
|
| 40 |
-
The name of the Gemini model to use.
|
| 41 |
-
Default is "gemini-1.5-flash".
|
| 42 |
-
max_retries (int):
|
| 43 |
-
The maximum number of retries to use for the Gemini model.
|
| 44 |
-
Default is 3.
|
| 45 |
-
max_concurrency (int):
|
| 46 |
-
The maximum number of concurrent requests to make to the Gemini model.
|
| 47 |
-
Default is 3.
|
| 48 |
-
timeout (int):
|
| 49 |
-
The timeout for requests to the Gemini model.
|
| 50 |
-
Default is 60 seconds.
|
| 51 |
-
topk_relabelling_prompt (str):
|
| 52 |
-
The prompt to use for relabelling blocks.
|
| 53 |
-
Default is a string containing the Gemini relabelling prompt.
|
| 54 |
-
complex_relabeling_prompt (str):
|
| 55 |
-
The prompt to use for complex relabelling blocks.
|
| 56 |
-
Default is a string containing the complex relabelling prompt.
|
| 57 |
"""
|
| 58 |
|
| 59 |
-
google_api_key:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 69 |
You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
|
| 70 |
Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
|
|
@@ -75,7 +66,11 @@ Choose the label you believe is the most accurate representation of the layout b
|
|
| 75 |
Here are the top k predictions from the model followed by the image:
|
| 76 |
|
| 77 |
"""
|
| 78 |
-
complex_relabeling_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 80 |
You will be provided with an image of a layout block and some potential labels.
|
| 81 |
Your job is to analyze the image and choose the single most appropriate label from the provided labels.
|
|
@@ -140,7 +135,6 @@ Here is the image of the layout block:
|
|
| 140 |
complex_prompt = self.complex_relabeling_prompt
|
| 141 |
return self.process_block_relabeling(page, block, complex_prompt)
|
| 142 |
|
| 143 |
-
|
| 144 |
def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
|
| 145 |
image = self.extract_image(page, block)
|
| 146 |
response_schema = content.Schema(
|
|
@@ -174,4 +168,4 @@ Here is the image of the layout block:
|
|
| 174 |
.rescale(page.polygon.size, page_img.size)\
|
| 175 |
.expand(expand, expand)
|
| 176 |
cropped = page_img.crop(image_box.bbox)
|
| 177 |
-
return cropped
|
|
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 3 |
+
from typing import Annotated, Optional
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from google.ai.generativelanguage_v1beta.types import content
|
|
|
|
| 6 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 7 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 8 |
from tqdm import tqdm
|
|
|
|
| 21 |
class LLMLayoutBuilder(LayoutBuilder):
|
| 22 |
"""
|
| 23 |
A builder for relabelling blocks to improve the quality of the layout.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
google_api_key: Annotated[
|
| 27 |
+
Optional[str],
|
| 28 |
+
"The Google API key to use for the Gemini model.",
|
| 29 |
+
] = settings.GOOGLE_API_KEY
|
| 30 |
+
confidence_threshold: Annotated[
|
| 31 |
+
float,
|
| 32 |
+
"The confidence threshold to use for relabeling.",
|
| 33 |
+
] = 0.75
|
| 34 |
+
picture_height_threshold: Annotated[
|
| 35 |
+
float,
|
| 36 |
+
"The height threshold for pictures that may actually be complex regions.",
|
| 37 |
+
] = 0.8
|
| 38 |
+
model_name: Annotated[
|
| 39 |
+
str,
|
| 40 |
+
"The name of the Gemini model to use.",
|
| 41 |
+
] = "gemini-1.5-flash"
|
| 42 |
+
max_retries: Annotated[
|
| 43 |
+
int,
|
| 44 |
+
"The maximum number of retries to use for the Gemini model.",
|
| 45 |
+
] = 3
|
| 46 |
+
max_concurrency: Annotated[
|
| 47 |
+
int,
|
| 48 |
+
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 49 |
+
] = 3
|
| 50 |
+
timeout: Annotated[
|
| 51 |
+
int,
|
| 52 |
+
"The timeout for requests to the Gemini model.",
|
| 53 |
+
] = 60
|
| 54 |
+
topk_relabelling_prompt: Annotated[
|
| 55 |
+
str,
|
| 56 |
+
"The prompt to use for relabelling blocks.",
|
| 57 |
+
"Default is a string containing the Gemini relabelling prompt."
|
| 58 |
+
] = """You are a layout expert specializing in document analysis.
|
| 59 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 60 |
You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
|
| 61 |
Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
|
|
|
|
| 66 |
Here are the top k predictions from the model followed by the image:
|
| 67 |
|
| 68 |
"""
|
| 69 |
+
complex_relabeling_prompt: Annotated[
|
| 70 |
+
str,
|
| 71 |
+
"The prompt to use for complex relabelling blocks.",
|
| 72 |
+
"Default is a string containing the complex relabelling prompt."
|
| 73 |
+
] = """You are a layout expert specializing in document analysis.
|
| 74 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 75 |
You will be provided with an image of a layout block and some potential labels.
|
| 76 |
Your job is to analyze the image and choose the single most appropriate label from the provided labels.
|
|
|
|
| 135 |
complex_prompt = self.complex_relabeling_prompt
|
| 136 |
return self.process_block_relabeling(page, block, complex_prompt)
|
| 137 |
|
|
|
|
| 138 |
def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
|
| 139 |
image = self.extract_image(page, block)
|
| 140 |
response_schema = content.Schema(
|
|
|
|
| 168 |
.rescale(page.polygon.size, page_img.size)\
|
| 169 |
.expand(expand, expand)
|
| 170 |
cropped = page_img.crop(image_box.bbox)
|
| 171 |
+
return cropped
|
marker/builders/ocr.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from ftfy import fix_text
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
@@ -20,22 +20,22 @@ from marker.settings import settings
|
|
| 20 |
class OcrBuilder(BaseBuilder):
|
| 21 |
"""
|
| 22 |
A builder for performing OCR on PDF pages and merging the results into the document.
|
| 23 |
-
|
| 24 |
-
Attributes:
|
| 25 |
-
detection_batch_size (int):
|
| 26 |
-
The batch size to use for the detection model.
|
| 27 |
-
Default is None, which will use the default batch size for the model.
|
| 28 |
-
|
| 29 |
-
recognition_batch_size (int):
|
| 30 |
-
The batch size to use for the recognition model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
languages (List[str]):
|
| 34 |
-
A list of languages to use for OCR. Default is None.
|
| 35 |
"""
|
| 36 |
-
recognition_batch_size:
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
|
| 41 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional
|
| 2 |
|
| 3 |
from ftfy import fix_text
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
|
|
| 20 |
class OcrBuilder(BaseBuilder):
|
| 21 |
"""
|
| 22 |
A builder for performing OCR on PDF pages and merging the results into the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
+
recognition_batch_size: Annotated[
|
| 25 |
+
Optional[int],
|
| 26 |
+
"The batch size to use for the recognition model.",
|
| 27 |
+
"Default is None, which will use the default batch size for the model."
|
| 28 |
+
] = None
|
| 29 |
+
detection_batch_size: Annotated[
|
| 30 |
+
Optional[int],
|
| 31 |
+
"The batch size to use for the detection model.",
|
| 32 |
+
"Default is None, which will use the default batch size for the model."
|
| 33 |
+
] = None
|
| 34 |
+
languages: Annotated[
|
| 35 |
+
Optional[List[str]],
|
| 36 |
+
"A list of languages to use for OCR.",
|
| 37 |
+
"Default is None."
|
| 38 |
+
] = None
|
| 39 |
|
| 40 |
def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
|
| 41 |
super().__init__(config)
|
marker/builders/structure.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.builders import BaseBuilder
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
|
@@ -9,18 +11,15 @@ from marker.schema.registry import get_block_class
|
|
| 9 |
class StructureBuilder(BaseBuilder):
|
| 10 |
"""
|
| 11 |
A builder for grouping blocks together based on their structure.
|
| 12 |
-
|
| 13 |
-
Attributes:
|
| 14 |
-
gap_threshold (float):
|
| 15 |
-
The minimum gap between blocks to consider them part of the same group.
|
| 16 |
-
Default is 0.05.
|
| 17 |
-
|
| 18 |
-
list_gap_threshold (float):
|
| 19 |
-
The minimum gap between list items to consider them part of the same group.
|
| 20 |
-
Default is 0.1.
|
| 21 |
"""
|
| 22 |
-
gap_threshold:
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def __init__(self, config=None):
|
| 26 |
super().__init__(config)
|
|
@@ -58,8 +57,8 @@ class StructureBuilder(BaseBuilder):
|
|
| 58 |
selected_polygons.append(prev_block.polygon)
|
| 59 |
|
| 60 |
if next_block and \
|
| 61 |
-
|
| 62 |
-
|
| 63 |
block_structure.append(next_block.id)
|
| 64 |
selected_polygons.append(next_block.polygon)
|
| 65 |
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
|
|
|
| 11 |
class StructureBuilder(BaseBuilder):
|
| 12 |
"""
|
| 13 |
A builder for grouping blocks together based on their structure.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
+
gap_threshold: Annotated[
|
| 16 |
+
float,
|
| 17 |
+
"The minimum gap between blocks to consider them part of the same group.",
|
| 18 |
+
] = 0.05
|
| 19 |
+
list_gap_threshold: Annotated[
|
| 20 |
+
float,
|
| 21 |
+
"The minimum gap between list items to consider them part of the same group.",
|
| 22 |
+
] = 0.1
|
| 23 |
|
| 24 |
def __init__(self, config=None):
|
| 25 |
super().__init__(config)
|
|
|
|
| 57 |
selected_polygons.append(prev_block.polygon)
|
| 58 |
|
| 59 |
if next_block and \
|
| 60 |
+
next_block.block_type in caption_types and \
|
| 61 |
+
next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
|
| 62 |
block_structure.append(next_block.id)
|
| 63 |
selected_polygons.append(next_block.polygon)
|
| 64 |
|
marker/config/crawler.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
import inspect
|
| 3 |
+
import pkgutil
|
| 4 |
+
from functools import cached_property
|
| 5 |
+
from typing import Annotated, Dict, Set, Type, get_args, get_origin
|
| 6 |
+
|
| 7 |
+
from marker.builders import BaseBuilder
|
| 8 |
+
from marker.converters import BaseConverter
|
| 9 |
+
from marker.processors import BaseProcessor
|
| 10 |
+
from marker.providers import BaseProvider
|
| 11 |
+
from marker.renderers import BaseRenderer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConfigCrawler:
    """Discover marker's configurable classes and their annotated settings.

    Walks every subclass of the given base classes (builders, processors,
    converters, providers, renderers) and records each annotated class
    attribute as a config entry: (type, formatted type string, default value,
    description metadata). The result is exposed via ``class_config_map`` and
    the derived ``attr_counts`` / ``attr_set`` properties.
    """

    def __init__(self, base_classes=(BaseBuilder, BaseProcessor, BaseConverter, BaseProvider, BaseRenderer)):
        self.base_classes = base_classes
        # {base_type: {class_name: {'class_type': cls,
        #                           'config': {attr: (type, formatted_type, default, metadata)}}}}
        self.class_config_map = {}

        self._crawl_config()

    def _crawl_config(self):
        """Populate ``class_config_map`` from every discovered subclass."""
        for base in self.base_classes:
            base_class_type = base.__name__.removeprefix('Base')
            self.class_config_map.setdefault(base_class_type, {})
            for class_name, class_type in self._find_subclasses(base).items():
                if class_name.startswith('Base'):
                    continue

                self.class_config_map[base_class_type].setdefault(class_name, {
                    'class_type': class_type,
                    'config': {}
                })
                for attr, attr_type in self._gather_super_annotations(class_type).items():
                    # Sentinel default: an attribute may be annotated without an
                    # assigned value; plain getattr would raise AttributeError.
                    default = getattr(class_type, attr, None)
                    metadata = (f"Default is {default}.",)

                    if get_origin(attr_type) is Annotated:
                        # Annotated metadata strings carry the description; only
                        # append the synthesized default note when the author did
                        # not already document a default.
                        if any('Default' in desc for desc in attr_type.__metadata__):
                            metadata = attr_type.__metadata__
                        else:
                            metadata = attr_type.__metadata__ + metadata
                        attr_type = get_args(attr_type)[0]

                    formatted_type = self._format_type(attr_type)
                    self.class_config_map[base_class_type][class_name]['config'][attr] = (attr_type, formatted_type, default, metadata)

    def _gather_super_annotations(self, cls: Type) -> Dict[str, Type]:
        """
        Collect all annotated attributes from `cls` and its superclasses, bottom-up.
        Subclass attributes overwrite superclass attributes with the same name.
        """
        # Walk the MRO from base -> derived so subclass annotations overwrite
        # same-named annotations from superclasses.
        annotations: Dict[str, Type] = {}
        for base in reversed(cls.__mro__):
            if base is object:
                continue
            if hasattr(base, "__annotations__"):
                for name, annotation in base.__annotations__.items():
                    annotations[name] = annotation
        return annotations

    @cached_property
    def attr_counts(self) -> Dict[str, int]:
        # Number of classes declaring each attribute name; lets callers tell
        # shared options apart from class-specific ones.
        counts: Dict[str, int] = {}
        for base_type_dict in self.class_config_map.values():
            for class_map in base_type_dict.values():
                for attr in class_map['config'].keys():
                    counts[attr] = counts.get(attr, 0) + 1
        return counts

    @cached_property
    def attr_set(self) -> Set[str]:
        # Every accepted option name: the bare attribute plus the
        # class-scoped "<ClassName>_<attr>" override form.
        attr_set: Set[str] = set()
        for base_type_dict in self.class_config_map.values():
            for class_name, class_map in base_type_dict.items():
                for attr in class_map['config'].keys():
                    attr_set.add(attr)
                    attr_set.add(f"{class_name}_{attr}")
        return attr_set

    def _find_subclasses(self, base_class):
        """Import all modules under `base_class`'s package and return a
        {name: class} mapping of every subclass found."""
        subclasses = {}
        module_name = base_class.__module__
        package = importlib.import_module(module_name)
        if hasattr(package, '__path__'):
            for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
                try:
                    module = importlib.import_module(module_name)
                    for name, obj in inspect.getmembers(module, inspect.isclass):
                        if issubclass(obj, base_class) and obj is not base_class:
                            subclasses[name] = obj
                except ImportError:
                    # Modules with missing optional dependencies are skipped
                    # rather than aborting the crawl.
                    pass
        return subclasses

    def _format_type(self, t: Type) -> str:
        """Format a typing type like Optional[int] into a readable string."""

        if get_origin(t):  # Handle Optional and types with origins separately
            return f"{t}".removeprefix('typing.')
        else:  # Regular types like int, str
            return t.__name__


crawler = ConfigCrawler()
|
marker/config/parser.py
CHANGED
|
@@ -4,11 +4,12 @@ from typing import Dict
|
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
|
|
|
| 7 |
from marker.renderers.html import HTMLRenderer
|
| 8 |
-
from marker.settings import settings
|
| 9 |
-
from marker.util import parse_range_str, strings_to_classes, classes_to_strings
|
| 10 |
-
from marker.renderers.markdown import MarkdownRenderer
|
| 11 |
from marker.renderers.json import JSONRenderer
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class ConfigParser:
|
|
@@ -22,20 +23,22 @@ class ConfigParser:
|
|
| 22 |
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
|
| 23 |
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
|
| 24 |
help="Format to output results in.")(fn)
|
| 25 |
-
fn = click.option("--page_range", type=str, default=None,
|
| 26 |
-
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
|
| 27 |
-
fn)
|
| 28 |
-
fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
|
| 29 |
fn = click.option("--processors", type=str, default=None,
|
| 30 |
help="Comma separated list of processors to use. Must use full module path.")(fn)
|
| 31 |
fn = click.option("--config_json", type=str, default=None,
|
| 32 |
help="Path to JSON file with additional configuration.")(fn)
|
| 33 |
-
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
|
| 34 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
| 35 |
-
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
|
| 36 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
|
| 38 |
-
fn = click.option("--strip_existing_ocr", is_flag=True, default=False, help="Strip existing OCR text from the PDF.")(fn)
|
| 39 |
return fn
|
| 40 |
|
| 41 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
@@ -53,8 +56,6 @@ class ConfigParser:
|
|
| 53 |
config["debug_data_folder"] = output_dir
|
| 54 |
case "page_range":
|
| 55 |
config["page_range"] = parse_range_str(v)
|
| 56 |
-
case "force_ocr":
|
| 57 |
-
config["force_ocr"] = True
|
| 58 |
case "languages":
|
| 59 |
config["languages"] = v.split(",")
|
| 60 |
case "config_json":
|
|
@@ -62,14 +63,11 @@ class ConfigParser:
|
|
| 62 |
config.update(json.load(f))
|
| 63 |
case "disable_multiprocessing":
|
| 64 |
config["pdftext_workers"] = 1
|
| 65 |
-
case "paginate_output":
|
| 66 |
-
config["paginate_output"] = True
|
| 67 |
case "disable_image_extraction":
|
| 68 |
config["extract_images"] = False
|
| 69 |
-
case
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
config["strip_existing_ocr"] = True
|
| 73 |
return config
|
| 74 |
|
| 75 |
def get_renderer(self):
|
|
|
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
| 7 |
+
from marker.config.crawler import crawler
|
| 8 |
from marker.renderers.html import HTMLRenderer
|
|
|
|
|
|
|
|
|
|
| 9 |
from marker.renderers.json import JSONRenderer
|
| 10 |
+
from marker.renderers.markdown import MarkdownRenderer
|
| 11 |
+
from marker.settings import settings
|
| 12 |
+
from marker.util import classes_to_strings, parse_range_str, strings_to_classes
|
| 13 |
|
| 14 |
|
| 15 |
class ConfigParser:
|
|
|
|
| 23 |
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
|
| 24 |
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
|
| 25 |
help="Format to output results in.")(fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
fn = click.option("--processors", type=str, default=None,
|
| 27 |
help="Comma separated list of processors to use. Must use full module path.")(fn)
|
| 28 |
fn = click.option("--config_json", type=str, default=None,
|
| 29 |
help="Path to JSON file with additional configuration.")(fn)
|
|
|
|
| 30 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
|
|
|
| 31 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
| 32 |
+
|
| 33 |
+
# these are options that need a list transformation, i.e splitting/parsing a string
|
| 34 |
+
fn = click.option("--page_range", type=str, default=None,
|
| 35 |
+
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
|
| 36 |
+
fn)
|
| 37 |
+
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
|
| 38 |
+
|
| 39 |
+
# we put common options here
|
| 40 |
+
fn = click.option("--google_api_key", type=str, default=None, help="Google API key for using LLMs.")(fn)
|
| 41 |
fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
|
|
|
|
| 42 |
return fn
|
| 43 |
|
| 44 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
|
|
| 56 |
config["debug_data_folder"] = output_dir
|
| 57 |
case "page_range":
|
| 58 |
config["page_range"] = parse_range_str(v)
|
|
|
|
|
|
|
| 59 |
case "languages":
|
| 60 |
config["languages"] = v.split(",")
|
| 61 |
case "config_json":
|
|
|
|
| 63 |
config.update(json.load(f))
|
| 64 |
case "disable_multiprocessing":
|
| 65 |
config["pdftext_workers"] = 1
|
|
|
|
|
|
|
| 66 |
case "disable_image_extraction":
|
| 67 |
config["extract_images"] = False
|
| 68 |
+
case _:
|
| 69 |
+
if k in crawler.attr_set:
|
| 70 |
+
config[k] = v
|
|
|
|
| 71 |
return config
|
| 72 |
|
| 73 |
def get_renderer(self):
|
marker/config/printer.py
CHANGED
|
@@ -1,32 +1,8 @@
|
|
| 1 |
-
import
|
| 2 |
-
import inspect
|
| 3 |
-
import pkgutil
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.converters import BaseConverter
|
| 9 |
-
from marker.processors import BaseProcessor
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def find_subclasses(base_class):
|
| 13 |
-
"""
|
| 14 |
-
Dynamically find all subclasses of a base class in the module where the base class is defined
|
| 15 |
-
and its submodules.
|
| 16 |
-
"""
|
| 17 |
-
subclasses = {}
|
| 18 |
-
module_name = base_class.__module__
|
| 19 |
-
package = importlib.import_module(module_name)
|
| 20 |
-
if hasattr(package, '__path__'):
|
| 21 |
-
for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
|
| 22 |
-
try:
|
| 23 |
-
module = importlib.import_module(module_name)
|
| 24 |
-
for name, obj in inspect.getmembers(module, inspect.isclass):
|
| 25 |
-
if issubclass(obj, base_class) and obj is not base_class:
|
| 26 |
-
subclasses[name] = obj
|
| 27 |
-
except ImportError:
|
| 28 |
-
pass
|
| 29 |
-
return subclasses
|
| 30 |
|
| 31 |
|
| 32 |
class CustomClickPrinter(click.Command):
|
|
@@ -39,16 +15,41 @@ class CustomClickPrinter(click.Command):
|
|
| 39 |
click.echo(help_text)
|
| 40 |
|
| 41 |
def parse_args(self, ctx, args):
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
ctx.exit()
|
|
|
|
| 54 |
super().parse_args(ctx, args)
|
|
|
|
| 1 |
+
from typing import Optional
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import click
|
| 4 |
|
| 5 |
+
from marker.config.crawler import crawler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
|
|
|
| 15 |
click.echo(help_text)
|
| 16 |
|
| 17 |
def parse_args(self, ctx, args):
|
| 18 |
+
display_help = 'config' in args and '--help' in args
|
| 19 |
+
if display_help:
|
| 20 |
+
click.echo("Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
|
| 21 |
+
|
| 22 |
+
for base_type, base_type_dict in crawler.class_config_map.items():
|
| 23 |
+
if display_help:
|
| 24 |
+
click.echo(f"{base_type}s:")
|
| 25 |
+
for class_name, class_map in base_type_dict.items():
|
| 26 |
+
if display_help and class_map['config']:
|
| 27 |
+
click.echo(f"\n {class_name}: {class_map['class_type'].__doc__ or ''}")
|
| 28 |
+
click.echo(" " * 4 + "Attributes:")
|
| 29 |
+
for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
|
| 30 |
+
class_name_attr = class_name + "_" + attr
|
| 31 |
+
|
| 32 |
+
if display_help:
|
| 33 |
+
click.echo(" " * 8 + f"{attr} ({formatted_type}):")
|
| 34 |
+
click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
|
| 35 |
+
if attr_type in [str, int, float, bool, Optional[int], Optional[float], Optional[str]]:
|
| 36 |
+
is_flag = attr_type in [bool, Optional[bool]] and not default
|
| 37 |
+
if crawler.attr_counts.get(attr) > 1:
|
| 38 |
+
options = ["--" + class_name_attr]
|
| 39 |
+
else:
|
| 40 |
+
options = ["--" + attr, "--" + class_name_attr]
|
| 41 |
+
options.append(class_name_attr)
|
| 42 |
+
ctx.command.params.append(
|
| 43 |
+
click.Option(
|
| 44 |
+
options,
|
| 45 |
+
type=attr_type,
|
| 46 |
+
help=" ".join(metadata),
|
| 47 |
+
default=default,
|
| 48 |
+
is_flag=is_flag,
|
| 49 |
+
)
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
if display_help:
|
| 53 |
ctx.exit()
|
| 54 |
+
|
| 55 |
super().parse_args(ctx, args)
|
marker/converters/pdf.py
CHANGED
|
@@ -1,30 +1,31 @@
|
|
| 1 |
import os
|
| 2 |
-
|
|
|
|
| 3 |
|
| 4 |
import inspect
|
| 5 |
from collections import defaultdict
|
| 6 |
-
from typing import Any, Dict, List, Type
|
| 7 |
|
| 8 |
from marker.builders.document import DocumentBuilder
|
| 9 |
-
from marker.builders.llm_layout import LLMLayoutBuilder
|
| 10 |
from marker.builders.layout import LayoutBuilder
|
|
|
|
| 11 |
from marker.builders.ocr import OcrBuilder
|
| 12 |
from marker.builders.structure import StructureBuilder
|
| 13 |
from marker.converters import BaseConverter
|
| 14 |
-
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
|
| 15 |
from marker.processors.blockquote import BlockquoteProcessor
|
| 16 |
from marker.processors.code import CodeProcessor
|
| 17 |
from marker.processors.debug import DebugProcessor
|
| 18 |
from marker.processors.document_toc import DocumentTOCProcessor
|
| 19 |
from marker.processors.equation import EquationProcessor
|
| 20 |
from marker.processors.footnote import FootnoteProcessor
|
| 21 |
-
from marker.processors.llm.llm_form import LLMFormProcessor
|
| 22 |
-
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 23 |
-
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 24 |
-
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
|
| 25 |
from marker.processors.ignoretext import IgnoreTextProcessor
|
| 26 |
from marker.processors.line_numbers import LineNumbersProcessor
|
| 27 |
from marker.processors.list import ListProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from marker.processors.page_header import PageHeaderProcessor
|
| 29 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 30 |
from marker.processors.table import TableProcessor
|
|
@@ -40,18 +41,20 @@ from marker.util import strings_to_classes
|
|
| 40 |
class PdfConverter(BaseConverter):
|
| 41 |
"""
|
| 42 |
A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
|
| 43 |
-
|
| 44 |
-
Attributes:
|
| 45 |
-
override_map (Dict[BlockTypes, Type[Block]]):
|
| 46 |
-
A mapping to override the default block classes for specific block types.
|
| 47 |
-
The keys are `BlockTypes` enum values, representing the types of blocks,
|
| 48 |
-
and the values are corresponding `Block` class implementations to use
|
| 49 |
-
instead of the defaults.
|
| 50 |
"""
|
| 51 |
-
override_map:
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
super().__init__(config)
|
| 56 |
|
| 57 |
for block_type, override_block_type in self.override_map.items():
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 4 |
|
| 5 |
import inspect
|
| 6 |
from collections import defaultdict
|
| 7 |
+
from typing import Annotated, Any, Dict, List, Optional, Type
|
| 8 |
|
| 9 |
from marker.builders.document import DocumentBuilder
|
|
|
|
| 10 |
from marker.builders.layout import LayoutBuilder
|
| 11 |
+
from marker.builders.llm_layout import LLMLayoutBuilder
|
| 12 |
from marker.builders.ocr import OcrBuilder
|
| 13 |
from marker.builders.structure import StructureBuilder
|
| 14 |
from marker.converters import BaseConverter
|
|
|
|
| 15 |
from marker.processors.blockquote import BlockquoteProcessor
|
| 16 |
from marker.processors.code import CodeProcessor
|
| 17 |
from marker.processors.debug import DebugProcessor
|
| 18 |
from marker.processors.document_toc import DocumentTOCProcessor
|
| 19 |
from marker.processors.equation import EquationProcessor
|
| 20 |
from marker.processors.footnote import FootnoteProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
from marker.processors.ignoretext import IgnoreTextProcessor
|
| 22 |
from marker.processors.line_numbers import LineNumbersProcessor
|
| 23 |
from marker.processors.list import ListProcessor
|
| 24 |
+
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
|
| 25 |
+
from marker.processors.llm.llm_form import LLMFormProcessor
|
| 26 |
+
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
|
| 27 |
+
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 28 |
+
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 29 |
from marker.processors.page_header import PageHeaderProcessor
|
| 30 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 31 |
from marker.processors.table import TableProcessor
|
|
|
|
| 41 |
class PdfConverter(BaseConverter):
|
| 42 |
"""
|
| 43 |
A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
+
override_map: Annotated[
|
| 46 |
+
Dict[BlockTypes, Type[Block]],
|
| 47 |
+
"A mapping to override the default block classes for specific block types.",
|
| 48 |
+
"The keys are `BlockTypes` enum values, representing the types of blocks,",
|
| 49 |
+
"and the values are corresponding `Block` class implementations to use",
|
| 50 |
+
"instead of the defaults."
|
| 51 |
+
] = defaultdict()
|
| 52 |
+
use_llm: Annotated[
|
| 53 |
+
bool,
|
| 54 |
+
"Enable higher quality processing with LLMs.",
|
| 55 |
+
] = False
|
| 56 |
+
|
| 57 |
+
def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, config=None):
|
| 58 |
super().__init__(config)
|
| 59 |
|
| 60 |
for block_type, override_block_type in self.override_map.items():
|
marker/processors/blockquote.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
|
@@ -5,12 +7,27 @@ from marker.schema.document import Document
|
|
| 5 |
|
| 6 |
class BlockquoteProcessor(BaseProcessor):
|
| 7 |
"""
|
| 8 |
-
A processor for tagging blockquotes
|
| 9 |
"""
|
| 10 |
-
block_types
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def __init__(self, config):
|
| 16 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, Tuple
|
| 2 |
+
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
|
|
|
| 7 |
|
| 8 |
class BlockquoteProcessor(BaseProcessor):
|
| 9 |
"""
|
| 10 |
+
A processor for tagging blockquotes.
|
| 11 |
"""
|
| 12 |
+
block_types: Annotated[
|
| 13 |
+
Tuple[BlockTypes],
|
| 14 |
+
"The block types to process.",
|
| 15 |
+
] = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 16 |
+
min_x_indent: Annotated[
|
| 17 |
+
float,
|
| 18 |
+
"The minimum horizontal indentation required to consider a block as part of a blockquote.",
|
| 19 |
+
"Expressed as a percentage of the block width.",
|
| 20 |
+
] = 0.05
|
| 21 |
+
x_start_tolerance: Annotated[
|
| 22 |
+
float,
|
| 23 |
+
"The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
|
| 24 |
+
"Expressed as a percentage of the block width.",
|
| 25 |
+
] = 0.01
|
| 26 |
+
x_end_tolerance: Annotated[
|
| 27 |
+
float,
|
| 28 |
+
"The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
|
| 29 |
+
"Expressed as a percentage of the block width.",
|
| 30 |
+
] = 0.01
|
| 31 |
|
| 32 |
def __init__(self, config):
|
| 33 |
super().__init__(config)
|
marker/processors/debug.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
|
| 4 |
import requests
|
| 5 |
from PIL import Image, ImageDraw, ImageFont
|
|
@@ -13,39 +14,36 @@ from marker.settings import settings
|
|
| 13 |
class DebugProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for debugging the document.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
debug_data_folder (str):
|
| 19 |
-
The folder to dump debug data to.
|
| 20 |
-
Default is "debug_data".
|
| 21 |
-
|
| 22 |
-
debug_layout_images (bool):
|
| 23 |
-
Whether to dump layout debug images.
|
| 24 |
-
Default is False.
|
| 25 |
-
|
| 26 |
-
debug_pdf_images (bool):
|
| 27 |
-
Whether to dump PDF debug images.
|
| 28 |
-
Default is False.
|
| 29 |
-
|
| 30 |
-
debug_json (bool):
|
| 31 |
-
Whether to dump block debug data.
|
| 32 |
-
Default is False.
|
| 33 |
-
|
| 34 |
-
render_font (str):
|
| 35 |
-
The path to the font to use for rendering debug images.
|
| 36 |
-
Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder.
|
| 37 |
-
|
| 38 |
-
font_dl_path (str):
|
| 39 |
-
The path to download the font from.
|
| 40 |
-
Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0".
|
| 41 |
"""
|
| 42 |
-
block_types
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def __call__(self, document: Document):
|
| 51 |
# Remove extension from doc name
|
|
@@ -90,7 +88,6 @@ class DebugProcessor(BaseProcessor):
|
|
| 90 |
debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
|
| 91 |
png_image.save(debug_file)
|
| 92 |
|
| 93 |
-
|
| 94 |
def draw_layout_debug_images(self, document: Document, pdf_mode=False):
|
| 95 |
for page in document.pages:
|
| 96 |
img_size = page.highres_image.size
|
|
@@ -113,7 +110,6 @@ class DebugProcessor(BaseProcessor):
|
|
| 113 |
debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
|
| 114 |
png_image.save(debug_file)
|
| 115 |
|
| 116 |
-
|
| 117 |
def render_layout_boxes(self, page, png_image):
|
| 118 |
layout_bboxes = []
|
| 119 |
layout_labels = []
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
+
from typing import Annotated
|
| 4 |
|
| 5 |
import requests
|
| 6 |
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
| 14 |
class DebugProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for debugging the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
+
block_types: Annotated[
|
| 19 |
+
tuple,
|
| 20 |
+
"The block types to process.",
|
| 21 |
+
"Default is an empty tuple."
|
| 22 |
+
] = tuple()
|
| 23 |
+
debug_data_folder: Annotated[
|
| 24 |
+
str,
|
| 25 |
+
"The folder to dump debug data to.",
|
| 26 |
+
] = "debug_data"
|
| 27 |
+
debug_layout_images: Annotated[
|
| 28 |
+
bool,
|
| 29 |
+
"Whether to dump layout debug images.",
|
| 30 |
+
] = False
|
| 31 |
+
debug_pdf_images: Annotated[
|
| 32 |
+
bool,
|
| 33 |
+
"Whether to dump PDF debug images.",
|
| 34 |
+
] = False
|
| 35 |
+
debug_json: Annotated[
|
| 36 |
+
bool,
|
| 37 |
+
"Whether to dump block debug data.",
|
| 38 |
+
] = False
|
| 39 |
+
render_font: Annotated[
|
| 40 |
+
str,
|
| 41 |
+
"The path to the font to use for rendering debug images.",
|
| 42 |
+
] = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
|
| 43 |
+
font_dl_path: Annotated[
|
| 44 |
+
str,
|
| 45 |
+
"The path to download the font from.",
|
| 46 |
+
] = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
|
| 47 |
|
| 48 |
def __call__(self, document: Document):
|
| 49 |
# Remove extension from doc name
|
|
|
|
| 88 |
debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
|
| 89 |
png_image.save(debug_file)
|
| 90 |
|
|
|
|
| 91 |
def draw_layout_debug_images(self, document: Document, pdf_mode=False):
|
| 92 |
for page in document.pages:
|
| 93 |
img_size = page.highres_image.size
|
|
|
|
| 110 |
debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
|
| 111 |
png_image.save(debug_file)
|
| 112 |
|
|
|
|
| 113 |
def render_layout_boxes(self, page, png_image):
|
| 114 |
layout_bboxes = []
|
| 115 |
layout_labels = []
|
marker/processors/equation.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from texify.inference import batch_inference
|
| 4 |
from texify.model.model import GenerateVisionEncoderDecoderModel
|
|
@@ -13,24 +13,24 @@ from marker.settings import settings
|
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for recognizing equations in the document.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
model_max_length (int):
|
| 19 |
-
The maximum number of tokens to allow for the Texify model.
|
| 20 |
-
Default is 384.
|
| 21 |
-
|
| 22 |
-
batch_size (int):
|
| 23 |
-
The batch size to use for the Texify model.
|
| 24 |
-
Default is None, which will use the default batch size for the model.
|
| 25 |
-
|
| 26 |
-
token_buffer (int):
|
| 27 |
-
The number of tokens to buffer above max for the Texify model.
|
| 28 |
-
Default is 256.
|
| 29 |
"""
|
| 30 |
-
block_types
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
|
| 36 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional, Tuple
|
| 2 |
|
| 3 |
from texify.inference import batch_inference
|
| 4 |
from texify.model.model import GenerateVisionEncoderDecoderModel
|
|
|
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for recognizing equations in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
+
block_types: Annotated[
|
| 18 |
+
Tuple[BlockTypes],
|
| 19 |
+
"The block types to process.",
|
| 20 |
+
] = (BlockTypes.Equation,)
|
| 21 |
+
model_max_length: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"The maximum number of tokens to allow for the Texify model.",
|
| 24 |
+
] = 384
|
| 25 |
+
texify_batch_size: Annotated[
|
| 26 |
+
Optional[int],
|
| 27 |
+
"The batch size to use for the Texify model.",
|
| 28 |
+
"Default is None, which will use the default batch size for the model."
|
| 29 |
+
] = None
|
| 30 |
+
token_buffer: Annotated[
|
| 31 |
+
int,
|
| 32 |
+
"The number of tokens to buffer above max for the Texify model.",
|
| 33 |
+
] = 256
|
| 34 |
|
| 35 |
def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
|
| 36 |
super().__init__(config)
|
marker/processors/footnote.py
CHANGED
|
@@ -1,27 +1,12 @@
|
|
| 1 |
-
from statistics import mean
|
| 2 |
-
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
-
from marker.schema.blocks import Footnote
|
| 6 |
from marker.schema.document import Document
|
| 7 |
-
|
| 8 |
-
from rapidfuzz import fuzz
|
| 9 |
-
|
| 10 |
from marker.schema.groups import PageGroup
|
| 11 |
|
| 12 |
|
| 13 |
class FootnoteProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
page_bottom_threshold (float):
|
| 19 |
-
The fraction of page height that is considered the bottom.
|
| 20 |
-
Default is .8
|
| 21 |
-
|
| 22 |
-
line_height_scaler (float):
|
| 23 |
-
The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
|
| 24 |
-
Default is .99
|
| 25 |
"""
|
| 26 |
block_types = (BlockTypes.Footnote,)
|
| 27 |
|
|
@@ -29,7 +14,6 @@ class FootnoteProcessor(BaseProcessor):
|
|
| 29 |
for page in document.pages:
|
| 30 |
self.push_footnotes_to_bottom(page, document)
|
| 31 |
|
| 32 |
-
|
| 33 |
def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
|
| 34 |
footnote_blocks = page.contained_blocks(document, self.block_types)
|
| 35 |
|
|
@@ -39,4 +23,4 @@ class FootnoteProcessor(BaseProcessor):
|
|
| 39 |
if block.id in page.structure:
|
| 40 |
# Move to bottom if it is
|
| 41 |
page.structure.remove(block.id)
|
| 42 |
-
page.add_structure(block)
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
|
|
|
| 3 |
from marker.schema.document import Document
|
|
|
|
|
|
|
|
|
|
| 4 |
from marker.schema.groups import PageGroup
|
| 5 |
|
| 6 |
|
| 7 |
class FootnoteProcessor(BaseProcessor):
|
| 8 |
"""
|
| 9 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
block_types = (BlockTypes.Footnote,)
|
| 12 |
|
|
|
|
| 14 |
for page in document.pages:
|
| 15 |
self.push_footnotes_to_bottom(page, document)
|
| 16 |
|
|
|
|
| 17 |
def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
|
| 18 |
footnote_blocks = page.contained_blocks(document, self.block_types)
|
| 19 |
|
|
|
|
| 23 |
if block.id in page.structure:
|
| 24 |
# Move to bottom if it is
|
| 25 |
page.structure.remove(block.id)
|
| 26 |
+
page.add_structure(block)
|
marker/processors/ignoretext.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
from collections import Counter
|
| 3 |
from itertools import groupby
|
| 4 |
-
from typing import List
|
| 5 |
|
| 6 |
from rapidfuzz import fuzz
|
| 7 |
|
|
@@ -13,22 +13,34 @@ from marker.schema.document import Document
|
|
| 13 |
|
| 14 |
class IgnoreTextProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
-
A processor for ignoring text blocks
|
| 17 |
-
|
| 18 |
-
Attributes:
|
| 19 |
-
common_element_threshold (float):
|
| 20 |
-
The minimum fraction of pages that a block must appear in to be considered a common element.
|
| 21 |
-
Default is 0.6.
|
| 22 |
"""
|
| 23 |
block_types = (
|
| 24 |
-
BlockTypes.Text, BlockTypes.PageHeader,
|
| 25 |
BlockTypes.PageFooter, BlockTypes.SectionHeader,
|
| 26 |
BlockTypes.TextInlineMath
|
| 27 |
)
|
| 28 |
-
common_element_threshold
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def __call__(self, document: Document):
|
| 34 |
first_blocks = []
|
|
@@ -55,8 +67,8 @@ class IgnoreTextProcessor(BaseProcessor):
|
|
| 55 |
@staticmethod
|
| 56 |
def clean_text(text):
|
| 57 |
text = text.replace("\n", "").strip()
|
| 58 |
-
text = re.sub(r"^\d+\s*", "", text)
|
| 59 |
-
text = re.sub(r"\s*\d+$", "", text)
|
| 60 |
return text
|
| 61 |
|
| 62 |
def filter_common_elements(self, document, blocks: List[Block]):
|
|
@@ -74,7 +86,7 @@ class IgnoreTextProcessor(BaseProcessor):
|
|
| 74 |
common = [
|
| 75 |
k for k, v in counter.items()
|
| 76 |
if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
|
| 77 |
-
|
| 78 |
]
|
| 79 |
if len(common) == 0:
|
| 80 |
return
|
|
|
|
| 1 |
import re
|
| 2 |
from collections import Counter
|
| 3 |
from itertools import groupby
|
| 4 |
+
from typing import Annotated, List
|
| 5 |
|
| 6 |
from rapidfuzz import fuzz
|
| 7 |
|
|
|
|
| 13 |
|
| 14 |
class IgnoreTextProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
+
A processor for identifying and ignoring common text blocks in a document.
|
| 17 |
+
These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
block_types = (
|
| 20 |
+
BlockTypes.Text, BlockTypes.PageHeader,
|
| 21 |
BlockTypes.PageFooter, BlockTypes.SectionHeader,
|
| 22 |
BlockTypes.TextInlineMath
|
| 23 |
)
|
| 24 |
+
common_element_threshold: Annotated[
|
| 25 |
+
float,
|
| 26 |
+
"The minimum ratio of pages a text block must appear on to be considered a common element.",
|
| 27 |
+
"Blocks that meet or exceed this threshold are marked as common elements.",
|
| 28 |
+
] = 0.2
|
| 29 |
+
common_element_min_blocks: Annotated[
|
| 30 |
+
int,
|
| 31 |
+
"The minimum number of occurrences of a text block within a document to consider it a common element.",
|
| 32 |
+
"This ensures that rare blocks are not mistakenly flagged.",
|
| 33 |
+
] = 3
|
| 34 |
+
max_streak: Annotated[
|
| 35 |
+
int,
|
| 36 |
+
"The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.",
|
| 37 |
+
"Helps to identify patterns like repeated headers or footers.",
|
| 38 |
+
] = 3
|
| 39 |
+
text_match_threshold: Annotated[
|
| 40 |
+
int,
|
| 41 |
+
"The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.",
|
| 42 |
+
"Higher values enforce stricter matching.",
|
| 43 |
+
] = 90
|
| 44 |
|
| 45 |
def __call__(self, document: Document):
|
| 46 |
first_blocks = []
|
|
|
|
| 67 |
@staticmethod
|
| 68 |
def clean_text(text):
|
| 69 |
text = text.replace("\n", "").strip()
|
| 70 |
+
text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
|
| 71 |
+
text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
|
| 72 |
return text
|
| 73 |
|
| 74 |
def filter_common_elements(self, document, blocks: List[Block]):
|
|
|
|
| 86 |
common = [
|
| 87 |
k for k, v in counter.items()
|
| 88 |
if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
|
| 89 |
+
and v > self.common_element_min_blocks
|
| 90 |
]
|
| 91 |
if len(common) == 0:
|
| 92 |
return
|
marker/processors/line_numbers.py
CHANGED
|
@@ -1,13 +1,29 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
| 4 |
|
| 5 |
|
| 6 |
class LineNumbersProcessor(BaseProcessor):
|
|
|
|
|
|
|
|
|
|
| 7 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 8 |
-
strip_numbers_threshold:
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def __init__(self, config):
|
| 13 |
super().__init__(config)
|
|
@@ -27,11 +43,10 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 27 |
tokens_are_numbers = [token.isdigit() for token in tokens]
|
| 28 |
if all([
|
| 29 |
sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
|
| 30 |
-
block.polygon.height > block.polygon.width
|
| 31 |
]):
|
| 32 |
block.ignore_for_output = True
|
| 33 |
|
| 34 |
-
|
| 35 |
def ignore_line_starts_ends(self, document: Document):
|
| 36 |
for page in document.pages:
|
| 37 |
for block in page.contained_blocks(document, self.block_types):
|
|
@@ -57,7 +72,7 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 57 |
len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
|
| 58 |
])
|
| 59 |
|
| 60 |
-
ends= all([
|
| 61 |
spans[-1].text.strip().isdigit(),
|
| 62 |
len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
|
| 63 |
])
|
|
@@ -76,4 +91,3 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 76 |
if ends:
|
| 77 |
span = page.get_block(line.structure[-1])
|
| 78 |
span.ignore_for_output = True
|
| 79 |
-
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
| 6 |
|
| 7 |
|
| 8 |
class LineNumbersProcessor(BaseProcessor):
|
| 9 |
+
"""
|
| 10 |
+
A processor for ignoring line numbers.
|
| 11 |
+
"""
|
| 12 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 13 |
+
strip_numbers_threshold: Annotated[
|
| 14 |
+
float,
|
| 15 |
+
"The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
|
| 16 |
+
] = 0.6
|
| 17 |
+
min_lines_in_block: Annotated[
|
| 18 |
+
int,
|
| 19 |
+
"The minimum number of lines required in a block for it to be considered during processing.",
|
| 20 |
+
"Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
|
| 21 |
+
] = 4
|
| 22 |
+
min_line_length: Annotated[
|
| 23 |
+
int,
|
| 24 |
+
"The minimum length of a line (in characters) to consider it significant when checking for",
|
| 25 |
+
"numeric prefixes or suffixes. Prevents false positives for short lines.",
|
| 26 |
+
] = 10
|
| 27 |
|
| 28 |
def __init__(self, config):
|
| 29 |
super().__init__(config)
|
|
|
|
| 43 |
tokens_are_numbers = [token.isdigit() for token in tokens]
|
| 44 |
if all([
|
| 45 |
sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
|
| 46 |
+
block.polygon.height > block.polygon.width # Ensure block is taller than it is wide, like vertical page numbers
|
| 47 |
]):
|
| 48 |
block.ignore_for_output = True
|
| 49 |
|
|
|
|
| 50 |
def ignore_line_starts_ends(self, document: Document):
|
| 51 |
for page in document.pages:
|
| 52 |
for block in page.contained_blocks(document, self.block_types):
|
|
|
|
| 72 |
len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
|
| 73 |
])
|
| 74 |
|
| 75 |
+
ends = all([
|
| 76 |
spans[-1].text.strip().isdigit(),
|
| 77 |
len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
|
| 78 |
])
|
|
|
|
| 91 |
if ends:
|
| 92 |
span = page.get_block(line.structure[-1])
|
| 93 |
span.ignore_for_output = True
|
|
|
marker/processors/list.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
|
@@ -11,8 +11,14 @@ class ListProcessor(BaseProcessor):
|
|
| 11 |
A processor for merging lists across pages and columns
|
| 12 |
"""
|
| 13 |
block_types = (BlockTypes.ListGroup,)
|
| 14 |
-
ignored_block_types
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def __init__(self, config):
|
| 18 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Tuple
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
|
|
|
| 11 |
A processor for merging lists across pages and columns
|
| 12 |
"""
|
| 13 |
block_types = (BlockTypes.ListGroup,)
|
| 14 |
+
ignored_block_types: Annotated[
|
| 15 |
+
Tuple[BlockTypes],
|
| 16 |
+
"The list of block types to ignore when merging lists.",
|
| 17 |
+
] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 18 |
+
min_x_indent: Annotated[
|
| 19 |
+
float, "The minimum horizontal indentation required to consider a block as a nested list item.",
|
| 20 |
+
"This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
|
| 21 |
+
] = 0.01
|
| 22 |
|
| 23 |
def __init__(self, config):
|
| 24 |
super().__init__(config)
|
marker/processors/llm/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
-
from typing import Optional
|
| 3 |
|
| 4 |
from tqdm import tqdm
|
| 5 |
|
|
@@ -14,37 +14,40 @@ from marker.settings import settings
|
|
| 14 |
class BaseLLMProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for using LLMs to convert blocks.
|
| 17 |
-
Attributes:
|
| 18 |
-
google_api_key (str):
|
| 19 |
-
The Google API key to use for the Gemini model.
|
| 20 |
-
Default is None.
|
| 21 |
-
model_name (str):
|
| 22 |
-
The name of the Gemini model to use.
|
| 23 |
-
Default is "gemini-1.5-flash".
|
| 24 |
-
max_retries (int):
|
| 25 |
-
The maximum number of retries to use for the Gemini model.
|
| 26 |
-
Default is 3.
|
| 27 |
-
max_concurrency (int):
|
| 28 |
-
The maximum number of concurrent requests to make to the Gemini model.
|
| 29 |
-
Default is 3.
|
| 30 |
-
timeout (int):
|
| 31 |
-
The timeout for requests to the Gemini model.
|
| 32 |
-
gemini_rewriting_prompt (str):
|
| 33 |
-
The prompt to use for rewriting text.
|
| 34 |
-
Default is a string containing the Gemini rewriting prompt.
|
| 35 |
-
use_llm (bool):
|
| 36 |
-
Whether to use the LLM model.
|
| 37 |
-
Default is False.
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
block_types = None
|
| 49 |
|
| 50 |
def __init__(self, config=None):
|
|
@@ -87,4 +90,4 @@ class BaseLLMProcessor(BaseProcessor):
|
|
| 87 |
.rescale(page.polygon.size, page_img.size)\
|
| 88 |
.expand(self.image_expansion_ratio, self.image_expansion_ratio)
|
| 89 |
cropped = page_img.crop(image_box.bbox)
|
| 90 |
-
return cropped
|
|
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
+
from typing import Annotated, Optional
|
| 3 |
|
| 4 |
from tqdm import tqdm
|
| 5 |
|
|
|
|
| 14 |
class BaseLLMProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for using LLMs to convert blocks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
+
google_api_key: Annotated[
|
| 19 |
+
Optional[str],
|
| 20 |
+
"The Google API key to use for the Gemini model.",
|
| 21 |
+
] = settings.GOOGLE_API_KEY
|
| 22 |
+
model_name: Annotated[
|
| 23 |
+
str,
|
| 24 |
+
"The name of the Gemini model to use.",
|
| 25 |
+
] = "gemini-1.5-flash"
|
| 26 |
+
max_retries: Annotated[
|
| 27 |
+
int,
|
| 28 |
+
"The maximum number of retries to use for the Gemini model.",
|
| 29 |
+
] = 3
|
| 30 |
+
max_concurrency: Annotated[
|
| 31 |
+
int,
|
| 32 |
+
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 33 |
+
] = 3
|
| 34 |
+
timeout: Annotated[
|
| 35 |
+
int,
|
| 36 |
+
"The timeout for requests to the Gemini model.",
|
| 37 |
+
] = 60
|
| 38 |
+
image_expansion_ratio: Annotated[
|
| 39 |
+
float,
|
| 40 |
+
"The ratio to expand the image by when cropping.",
|
| 41 |
+
] = 0.01
|
| 42 |
+
gemini_rewriting_prompt: Annotated[
|
| 43 |
+
str,
|
| 44 |
+
"The prompt to use for rewriting text.",
|
| 45 |
+
"Default is a string containing the Gemini rewriting prompt."
|
| 46 |
+
] = ''
|
| 47 |
+
use_llm: Annotated[
|
| 48 |
+
bool,
|
| 49 |
+
"Whether to use the LLM model.",
|
| 50 |
+
] = False
|
| 51 |
block_types = None
|
| 52 |
|
| 53 |
def __init__(self, config=None):
|
|
|
|
| 90 |
.rescale(page.polygon.size, page_img.size)\
|
| 91 |
.expand(self.image_expansion_ratio, self.image_expansion_ratio)
|
| 92 |
cropped = page_img.crop(image_box.bbox)
|
| 93 |
+
return cropped
|
marker/processors/llm/llm_image_description.py
CHANGED
|
@@ -7,11 +7,20 @@ from marker.schema.blocks import Block
|
|
| 7 |
from marker.schema.document import Document
|
| 8 |
from marker.schema.groups.page import PageGroup
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class LLMImageDescriptionProcessor(BaseLLMProcessor):
|
| 12 |
block_types = (BlockTypes.Picture, BlockTypes.Figure,)
|
| 13 |
-
extract_images:
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
You will receive an image of a picture or figure. Your job will be to create a short description of the image.
|
| 16 |
**Instructions:**
|
| 17 |
1. Carefully examine the provided image.
|
|
|
|
| 7 |
from marker.schema.document import Document
|
| 8 |
from marker.schema.groups.page import PageGroup
|
| 9 |
|
| 10 |
+
from typing import Annotated
|
| 11 |
+
|
| 12 |
|
| 13 |
class LLMImageDescriptionProcessor(BaseLLMProcessor):
|
| 14 |
block_types = (BlockTypes.Picture, BlockTypes.Figure,)
|
| 15 |
+
extract_images: Annotated[
|
| 16 |
+
bool,
|
| 17 |
+
"Extract images from the document."
|
| 18 |
+
] = True
|
| 19 |
+
image_description_prompt: Annotated[
|
| 20 |
+
str,
|
| 21 |
+
"The prompt to use for generating image descriptions.",
|
| 22 |
+
"Default is a string containing the Gemini prompt."
|
| 23 |
+
] = """You are a document analysis expert who specializes in creating text descriptions for images.
|
| 24 |
You will receive an image of a picture or figure. Your job will be to create a short description of the image.
|
| 25 |
**Instructions:**
|
| 26 |
1. Carefully examine the provided image.
|
marker/processors/llm/llm_table.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
-
from marker.processors.llm import BaseLLMProcessor
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
-
from typing import List
|
| 6 |
-
|
| 7 |
from google.ai.generativelanguage_v1beta.types import content
|
| 8 |
from tabled.formats import html_format
|
|
|
|
| 9 |
|
|
|
|
| 10 |
from marker.schema import BlockTypes
|
| 11 |
from marker.schema.blocks import Block
|
| 12 |
from marker.schema.document import Document
|
|
@@ -15,8 +14,15 @@ from marker.schema.polygon import PolygonBox
|
|
| 15 |
|
| 16 |
|
| 17 |
class LLMTableProcessor(BaseLLMProcessor):
|
| 18 |
-
block_types
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
You will receive an image of a text block and an html representation of the table in the image.
|
| 21 |
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
|
| 22 |
**Instructions:**
|
|
@@ -92,10 +98,8 @@ No corrections needed.
|
|
| 92 |
block.update_metadata(llm_error_count=1)
|
| 93 |
return
|
| 94 |
|
| 95 |
-
|
| 96 |
block.cells = parsed_cells
|
| 97 |
|
| 98 |
-
|
| 99 |
def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
|
| 100 |
soup = BeautifulSoup(html_text, 'html.parser')
|
| 101 |
table = soup.find('table')
|
|
@@ -151,5 +155,4 @@ No corrections needed.
|
|
| 151 |
cells.append(cell_obj)
|
| 152 |
cur_col += colspan
|
| 153 |
|
| 154 |
-
|
| 155 |
return cells
|
|
|
|
| 1 |
+
from typing import Annotated, List, Tuple
|
| 2 |
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 4 |
from google.ai.generativelanguage_v1beta.types import content
|
| 5 |
from tabled.formats import html_format
|
| 6 |
+
from tabled.schema import SpanTableCell
|
| 7 |
|
| 8 |
+
from marker.processors.llm import BaseLLMProcessor
|
| 9 |
from marker.schema import BlockTypes
|
| 10 |
from marker.schema.blocks import Block
|
| 11 |
from marker.schema.document import Document
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class LLMTableProcessor(BaseLLMProcessor):
|
| 17 |
+
block_types: Annotated[
|
| 18 |
+
Tuple[BlockTypes],
|
| 19 |
+
"The block types to process.",
|
| 20 |
+
] = (BlockTypes.Table,)
|
| 21 |
+
gemini_rewriting_prompt: Annotated[
|
| 22 |
+
str,
|
| 23 |
+
"The prompt to use for rewriting text.",
|
| 24 |
+
"Default is a string containing the Gemini rewriting prompt."
|
| 25 |
+
] = """You are a text correction expert specializing in accurately reproducing text from images.
|
| 26 |
You will receive an image of a text block and an html representation of the table in the image.
|
| 27 |
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
|
| 28 |
**Instructions:**
|
|
|
|
| 98 |
block.update_metadata(llm_error_count=1)
|
| 99 |
return
|
| 100 |
|
|
|
|
| 101 |
block.cells = parsed_cells
|
| 102 |
|
|
|
|
| 103 |
def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
|
| 104 |
soup = BeautifulSoup(html_text, 'html.parser')
|
| 105 |
table = soup.find('table')
|
|
|
|
| 155 |
cells.append(cell_obj)
|
| 156 |
cur_col += colspan
|
| 157 |
|
|
|
|
| 158 |
return cells
|
marker/processors/sectionheader.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import warnings
|
| 2 |
-
from typing import Dict, List
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.cluster import KMeans
|
|
@@ -16,29 +16,24 @@ warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
|
| 16 |
class SectionHeaderProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing section headers in the document.
|
| 19 |
-
|
| 20 |
-
Attributes:
|
| 21 |
-
level_count (int):
|
| 22 |
-
The number of levels to use for headings.
|
| 23 |
-
Default is 4.
|
| 24 |
-
|
| 25 |
-
merge_threshold (float):
|
| 26 |
-
The minimum gap between headings to consider them part of the same group.
|
| 27 |
-
Default is 0.25.
|
| 28 |
-
|
| 29 |
-
default_level (int):
|
| 30 |
-
The default heading level to use if no heading level is detected.
|
| 31 |
-
Default is 2.
|
| 32 |
-
|
| 33 |
-
height_tolerance (float):
|
| 34 |
-
The minimum height of a heading to consider it a heading.
|
| 35 |
-
Default is 0.99.
|
| 36 |
"""
|
| 37 |
block_types = (BlockTypes.SectionHeader, )
|
| 38 |
-
level_count
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def __call__(self, document: Document):
|
| 44 |
line_heights: Dict[int, List[float]] = {}
|
|
@@ -48,7 +43,7 @@ class SectionHeaderProcessor(BaseProcessor):
|
|
| 48 |
line_heights[block.id] = block.line_height(document)
|
| 49 |
else:
|
| 50 |
line_heights[block.id] = 0
|
| 51 |
-
block.ignore_for_output = True
|
| 52 |
|
| 53 |
flat_line_heights = list(line_heights.values())
|
| 54 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
|
|
|
| 1 |
import warnings
|
| 2 |
+
from typing import Annotated, Dict, List
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.cluster import KMeans
|
|
|
|
| 16 |
class SectionHeaderProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing section headers in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
block_types = (BlockTypes.SectionHeader, )
|
| 21 |
+
level_count: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"The number of levels to use for headings.",
|
| 24 |
+
] = 4
|
| 25 |
+
merge_threshold: Annotated[
|
| 26 |
+
float,
|
| 27 |
+
"The minimum gap between headings to consider them part of the same group.",
|
| 28 |
+
] = 0.25
|
| 29 |
+
default_level: Annotated[
|
| 30 |
+
int,
|
| 31 |
+
"The default heading level to use if no heading level is detected.",
|
| 32 |
+
] = 2
|
| 33 |
+
height_tolerance: Annotated[
|
| 34 |
+
float,
|
| 35 |
+
"The minimum height of a heading to consider it a heading.",
|
| 36 |
+
] = 0.99
|
| 37 |
|
| 38 |
def __call__(self, document: Document):
|
| 39 |
line_heights: Dict[int, List[float]] = {}
|
|
|
|
| 43 |
line_heights[block.id] = block.line_height(document)
|
| 44 |
else:
|
| 45 |
line_heights[block.id] = 0
|
| 46 |
+
block.ignore_for_output = True # Don't output an empty section header
|
| 47 |
|
| 48 |
flat_line_heights = list(line_heights.values())
|
| 49 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
marker/processors/table.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
from ftfy import fix_text
|
| 3 |
from surya.input.pdflines import get_page_text_lines
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
@@ -16,29 +18,27 @@ from marker.settings import settings
|
|
| 16 |
class TableProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing tables in the document.
|
| 19 |
-
|
| 20 |
-
Attributes:
|
| 21 |
-
detect_boxes (bool):
|
| 22 |
-
Whether to detect boxes for the table recognition model.
|
| 23 |
-
Default is False.
|
| 24 |
-
|
| 25 |
-
detector_batch_size (int):
|
| 26 |
-
The batch size to use for the table detection model.
|
| 27 |
-
Default is None, which will use the default batch size for the model.
|
| 28 |
-
|
| 29 |
-
table_rec_batch_size (int):
|
| 30 |
-
The batch size to use for the table recognition model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
recognition_batch_size (int):
|
| 34 |
-
The batch size to use for the table recognition model.
|
| 35 |
-
Default is None, which will use the default batch size for the model.
|
| 36 |
"""
|
| 37 |
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
|
| 38 |
-
detect_boxes
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def __init__(
|
| 44 |
self,
|
|
|
|
| 1 |
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
|
| 4 |
from ftfy import fix_text
|
| 5 |
from surya.input.pdflines import get_page_text_lines
|
| 6 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
|
|
| 18 |
class TableProcessor(BaseProcessor):
|
| 19 |
"""
|
| 20 |
A processor for recognizing tables in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
|
| 23 |
+
detect_boxes: Annotated[
|
| 24 |
+
bool,
|
| 25 |
+
"Whether to detect boxes for the table recognition model.",
|
| 26 |
+
] = False
|
| 27 |
+
detector_batch_size: Annotated[
|
| 28 |
+
int,
|
| 29 |
+
"The batch size to use for the table detection model.",
|
| 30 |
+
"Default is None, which will use the default batch size for the model."
|
| 31 |
+
] = None
|
| 32 |
+
table_rec_batch_size: Annotated[
|
| 33 |
+
int,
|
| 34 |
+
"The batch size to use for the table recognition model.",
|
| 35 |
+
"Default is None, which will use the default batch size for the model."
|
| 36 |
+
] = None
|
| 37 |
+
recognition_batch_size: Annotated[
|
| 38 |
+
int,
|
| 39 |
+
"The batch size to use for the table recognition model.",
|
| 40 |
+
"Default is None, which will use the default batch size for the model."
|
| 41 |
+
] = None
|
| 42 |
|
| 43 |
def __init__(
|
| 44 |
self,
|
marker/processors/text.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import math
|
| 2 |
-
from typing import List
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
|
@@ -12,15 +12,13 @@ from marker.schema.text.line import Line
|
|
| 12 |
class TextProcessor(BaseProcessor):
|
| 13 |
"""
|
| 14 |
A processor for merging text across pages and columns.
|
| 15 |
-
|
| 16 |
-
Attributes:
|
| 17 |
-
column_gap_ratio (float):
|
| 18 |
-
The minimum ratio of the page width to the column gap to consider a column break.
|
| 19 |
-
Default is 0.02.
|
| 20 |
"""
|
| 21 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 22 |
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 23 |
-
column_gap_ratio
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def __init__(self, config):
|
| 26 |
super().__init__(config)
|
|
@@ -35,14 +33,14 @@ class TextProcessor(BaseProcessor):
|
|
| 35 |
continue
|
| 36 |
|
| 37 |
next_block = document.get_next_block(block, self.ignored_block_types)
|
| 38 |
-
if next_block is None:
|
| 39 |
continue
|
| 40 |
if next_block.block_type not in self.block_types:
|
| 41 |
-
continue
|
| 42 |
if next_block.structure is None:
|
| 43 |
continue # This is odd though, why do we have text blocks with no structure?
|
| 44 |
if next_block.ignore_for_output:
|
| 45 |
-
continue
|
| 46 |
|
| 47 |
column_gap = block.polygon.width * self.column_gap_ratio
|
| 48 |
|
|
@@ -53,7 +51,7 @@ class TextProcessor(BaseProcessor):
|
|
| 53 |
last_line_is_hyphentated = False
|
| 54 |
new_block_lines = []
|
| 55 |
|
| 56 |
-
if next_block.page_id == block.page_id:
|
| 57 |
# we check for a column break
|
| 58 |
column_break = (
|
| 59 |
math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
|
|
@@ -63,11 +61,11 @@ class TextProcessor(BaseProcessor):
|
|
| 63 |
page_break = True
|
| 64 |
next_page = document.get_page(next_block.page_id)
|
| 65 |
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
|
| 66 |
-
|
| 67 |
|
| 68 |
if not (column_break or page_break):
|
| 69 |
continue
|
| 70 |
-
|
| 71 |
new_block_lines = next_block.structure_blocks(document)
|
| 72 |
|
| 73 |
# we check for next_block indentation
|
|
|
|
| 1 |
import math
|
| 2 |
+
from typing import Annotated, List
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
|
|
|
| 12 |
class TextProcessor(BaseProcessor):
|
| 13 |
"""
|
| 14 |
A processor for merging text across pages and columns.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 17 |
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 18 |
+
column_gap_ratio: Annotated[
|
| 19 |
+
float,
|
| 20 |
+
"The minimum ratio of the page width to the column gap to consider a column break.",
|
| 21 |
+
] = 0.02
|
| 22 |
|
| 23 |
def __init__(self, config):
|
| 24 |
super().__init__(config)
|
|
|
|
| 33 |
continue
|
| 34 |
|
| 35 |
next_block = document.get_next_block(block, self.ignored_block_types)
|
| 36 |
+
if next_block is None: # we've reached the end of the document
|
| 37 |
continue
|
| 38 |
if next_block.block_type not in self.block_types:
|
| 39 |
+
continue # we found a non-text block
|
| 40 |
if next_block.structure is None:
|
| 41 |
continue # This is odd though, why do we have text blocks with no structure?
|
| 42 |
if next_block.ignore_for_output:
|
| 43 |
+
continue # skip ignored blocks
|
| 44 |
|
| 45 |
column_gap = block.polygon.width * self.column_gap_ratio
|
| 46 |
|
|
|
|
| 51 |
last_line_is_hyphentated = False
|
| 52 |
new_block_lines = []
|
| 53 |
|
| 54 |
+
if next_block.page_id == block.page_id: # block on the same page
|
| 55 |
# we check for a column break
|
| 56 |
column_break = (
|
| 57 |
math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
|
|
|
|
| 61 |
page_break = True
|
| 62 |
next_page = document.get_page(next_block.page_id)
|
| 63 |
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
|
| 64 |
+
(next_block.polygon.y_start < next_page.polygon.height // 2)
|
| 65 |
|
| 66 |
if not (column_break or page_break):
|
| 67 |
continue
|
| 68 |
+
|
| 69 |
new_block_lines = next_block.structure_blocks(document)
|
| 70 |
|
| 71 |
# we check for next_block indentation
|
marker/providers/pdf.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
-
from typing import List, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
|
@@ -19,16 +19,51 @@ from marker.schema.text.span import Span
|
|
| 19 |
|
| 20 |
|
| 21 |
class PdfProvider(BaseProvider):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def __init__(self, filepath: str, config=None):
|
| 34 |
super().__init__(filepath, config)
|
|
@@ -57,7 +92,7 @@ class PdfProvider(BaseProvider):
|
|
| 57 |
if self.doc is not None:
|
| 58 |
self.doc.close()
|
| 59 |
|
| 60 |
-
def font_flags_to_format(self, flags: int
|
| 61 |
if flags is None:
|
| 62 |
return {"plain"}
|
| 63 |
|
|
@@ -188,35 +223,33 @@ class PdfProvider(BaseProvider):
|
|
| 188 |
if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
|
| 189 |
return False
|
| 190 |
|
| 191 |
-
if
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
return False
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
font_name = self.get_fontname(font)
|
| 205 |
-
|
| 206 |
-
# we also skip pages without embedded fonts and fonts without names
|
| 207 |
-
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
|
| 208 |
-
empty_fonts.append(not font_name or font_name == "GlyphLessFont")
|
| 209 |
-
if font_name not in font_map:
|
| 210 |
-
font_map[font_name or 'Unknown'] = font
|
| 211 |
-
|
| 212 |
-
if all(non_embedded_fonts) or all(empty_fonts):
|
| 213 |
-
return False
|
| 214 |
-
|
| 215 |
-
# if we see very large images covering most of the page, we can skip this page
|
| 216 |
-
for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
|
| 217 |
-
img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
|
| 218 |
-
if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
|
| 219 |
-
return False
|
| 220 |
|
| 221 |
return True
|
| 222 |
|
|
@@ -265,8 +298,8 @@ class PdfProvider(BaseProvider):
|
|
| 265 |
|
| 266 |
def get_fontname(self, font) -> str:
|
| 267 |
font_name = ""
|
| 268 |
-
buffer_size = 256
|
| 269 |
-
|
| 270 |
try:
|
| 271 |
font_name_buffer = ctypes.create_string_buffer(buffer_size)
|
| 272 |
length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
|
|
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
+
from typing import Annotated, List, Optional, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class PdfProvider(BaseProvider):
|
| 22 |
+
"""
|
| 23 |
+
A provider for PDF files.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
page_range: Annotated[
|
| 27 |
+
Optional[List[int]],
|
| 28 |
+
"The range of pages to process.",
|
| 29 |
+
"Default is None, which will process all pages."
|
| 30 |
+
] = None
|
| 31 |
+
pdftext_workers: Annotated[
|
| 32 |
+
int,
|
| 33 |
+
"The number of workers to use for pdftext.",
|
| 34 |
+
] = 4
|
| 35 |
+
flatten_pdf: Annotated[
|
| 36 |
+
bool,
|
| 37 |
+
"Whether to flatten the PDF structure.",
|
| 38 |
+
] = True
|
| 39 |
+
force_ocr: Annotated[
|
| 40 |
+
bool,
|
| 41 |
+
"Whether to force OCR on the whole document.",
|
| 42 |
+
] = False
|
| 43 |
+
ocr_invalid_chars: Annotated[
|
| 44 |
+
tuple,
|
| 45 |
+
"The characters to consider invalid for OCR.",
|
| 46 |
+
] = (chr(0xfffd), "�")
|
| 47 |
+
ocr_space_threshold: Annotated[
|
| 48 |
+
float,
|
| 49 |
+
"The minimum ratio of spaces to non-spaces to detect bad text.",
|
| 50 |
+
] = .7
|
| 51 |
+
ocr_newline_threshold: Annotated[
|
| 52 |
+
float,
|
| 53 |
+
"The minimum ratio of newlines to non-newlines to detect bad text.",
|
| 54 |
+
] = .6
|
| 55 |
+
ocr_alphanum_threshold: Annotated[
|
| 56 |
+
float,
|
| 57 |
+
"The minimum ratio of alphanumeric characters to non-alphanumeric characters to consider an alphanumeric character.",
|
| 58 |
+
] = .3
|
| 59 |
+
image_threshold: Annotated[
|
| 60 |
+
float,
|
| 61 |
+
"The minimum coverage ratio of the image to the page to consider skipping the page.",
|
| 62 |
+
] = .65
|
| 63 |
+
strip_existing_ocr: Annotated[
|
| 64 |
+
bool,
|
| 65 |
+
"Whether to strip existing OCR text from the PDF.",
|
| 66 |
+
] = False
|
| 67 |
|
| 68 |
def __init__(self, filepath: str, config=None):
|
| 69 |
super().__init__(filepath, config)
|
|
|
|
| 92 |
if self.doc is not None:
|
| 93 |
self.doc.close()
|
| 94 |
|
| 95 |
+
def font_flags_to_format(self, flags: Optional[int]) -> Set[str]:
|
| 96 |
if flags is None:
|
| 97 |
return {"plain"}
|
| 98 |
|
|
|
|
| 223 |
if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
|
| 224 |
return False
|
| 225 |
|
| 226 |
+
if self.strip_existing_ocr:
|
| 227 |
+
# If any text objects on the page are in invisible render mode, skip this page
|
| 228 |
+
for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
|
| 229 |
+
if pdfium_c.FPDFTextObj_GetTextRenderMode(text_obj) in [pdfium_c.FPDF_TEXTRENDERMODE_INVISIBLE, pdfium_c.FPDF_TEXTRENDERMODE_UNKNOWN]:
|
| 230 |
+
return False
|
| 231 |
+
|
| 232 |
+
non_embedded_fonts = []
|
| 233 |
+
empty_fonts = []
|
| 234 |
+
font_map = {}
|
| 235 |
+
for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
|
| 236 |
+
font = pdfium_c.FPDFTextObj_GetFont(text_obj)
|
| 237 |
+
font_name = self.get_fontname(font)
|
| 238 |
+
|
| 239 |
+
# we also skip pages without embedded fonts and fonts without names
|
| 240 |
+
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
|
| 241 |
+
empty_fonts.append(not font_name or font_name == "GlyphLessFont")
|
| 242 |
+
if font_name not in font_map:
|
| 243 |
+
font_map[font_name or 'Unknown'] = font
|
| 244 |
+
|
| 245 |
+
if all(non_embedded_fonts) or all(empty_fonts):
|
| 246 |
return False
|
| 247 |
|
| 248 |
+
# if we see very large images covering most of the page, we can skip this page
|
| 249 |
+
for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
|
| 250 |
+
img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
|
| 251 |
+
if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
|
| 252 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
return True
|
| 255 |
|
|
|
|
| 298 |
|
| 299 |
def get_fontname(self, font) -> str:
|
| 300 |
font_name = ""
|
| 301 |
+
buffer_size = 256
|
| 302 |
+
|
| 303 |
try:
|
| 304 |
font_name_buffer = ctypes.create_string_buffer(buffer_size)
|
| 305 |
length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
|
marker/renderers/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import base64
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
from collections import Counter
|
| 5 |
-
from typing import Optional
|
| 6 |
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
|
@@ -15,9 +15,9 @@ from marker.util import assign_config
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
| 18 |
-
remove_blocks:
|
| 19 |
-
image_blocks:
|
| 20 |
-
extract_images: bool = True
|
| 21 |
|
| 22 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 23 |
assign_config(self, config)
|
|
@@ -71,7 +71,7 @@ class BaseRenderer:
|
|
| 71 |
return page_stats
|
| 72 |
|
| 73 |
def generate_document_metadata(self, document: Document, document_output):
|
| 74 |
-
metadata =
|
| 75 |
"table_of_contents": document.table_of_contents,
|
| 76 |
"page_stats": self.generate_page_stats(document, document_output),
|
| 77 |
}
|
|
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
from collections import Counter
|
| 5 |
+
from typing import Annotated, Optional, Tuple
|
| 6 |
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
| 18 |
+
remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 19 |
+
image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 20 |
+
extract_images: Annotated[bool, "Extract images from the document."] = True
|
| 21 |
|
| 22 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 23 |
assign_config(self, config)
|
|
|
|
| 71 |
return page_stats
|
| 72 |
|
| 73 |
def generate_document_metadata(self, document: Document, document_output):
|
| 74 |
+
metadata = {
|
| 75 |
"table_of_contents": document.table_of_contents,
|
| 76 |
"page_stats": self.generate_page_stats(document, document_output),
|
| 77 |
}
|
marker/renderers/html.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from
|
|
|
|
| 2 |
|
| 3 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 4 |
from pydantic import BaseModel
|
|
@@ -13,7 +14,6 @@ import warnings
|
|
| 13 |
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
| 14 |
|
| 15 |
# Suppress DecompressionBombError
|
| 16 |
-
from PIL import Image
|
| 17 |
Image.MAX_IMAGE_PIXELS = None
|
| 18 |
|
| 19 |
|
|
@@ -24,9 +24,21 @@ class HTMLOutput(BaseModel):
|
|
| 24 |
|
| 25 |
|
| 26 |
class HTMLRenderer(BaseRenderer):
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def extract_image(self, document, image_id):
|
| 32 |
image_block = document.get_block(image_id)
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
from typing import Annotated, Literal, Tuple
|
| 3 |
|
| 4 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 5 |
from pydantic import BaseModel
|
|
|
|
| 14 |
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
| 15 |
|
| 16 |
# Suppress DecompressionBombError
|
|
|
|
| 17 |
Image.MAX_IMAGE_PIXELS = None
|
| 18 |
|
| 19 |
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class HTMLRenderer(BaseRenderer):
|
| 27 |
+
"""
|
| 28 |
+
A renderer for HTML output.
|
| 29 |
+
"""
|
| 30 |
+
page_blocks: Annotated[
|
| 31 |
+
Tuple[BlockTypes],
|
| 32 |
+
"The block types to consider as pages.",
|
| 33 |
+
] = (BlockTypes.Page,)
|
| 34 |
+
paginate_output: Annotated[
|
| 35 |
+
bool,
|
| 36 |
+
"Whether to paginate the output.",
|
| 37 |
+
] = False
|
| 38 |
+
image_extraction_mode: Annotated[
|
| 39 |
+
Literal["lowres", "highres"],
|
| 40 |
+
"The mode to use for extracting images.",
|
| 41 |
+
] = "highres"
|
| 42 |
|
| 43 |
def extract_image(self, document, image_id):
|
| 44 |
image_block = document.get_block(image_id)
|
marker/renderers/json.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
-
|
| 3 |
-
from typing import Dict, List
|
| 4 |
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
|
@@ -16,7 +14,7 @@ class JSONBlockOutput(BaseModel):
|
|
| 16 |
block_type: str
|
| 17 |
html: str
|
| 18 |
polygon: List[List[float]]
|
| 19 |
-
children: List[JSONBlockOutput] | None = None
|
| 20 |
section_hierarchy: Dict[int, str] | None = None
|
| 21 |
images: dict | None = None
|
| 22 |
|
|
@@ -35,8 +33,17 @@ def reformat_section_hierarchy(section_hierarchy):
|
|
| 35 |
|
| 36 |
|
| 37 |
class JSONRenderer(BaseRenderer):
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def extract_json(self, document: Document, block_output: BlockOutput):
|
| 42 |
cls = get_block_class(block_output.id.block_type)
|
|
|
|
| 1 |
+
from typing import Annotated, Dict, List, Tuple
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
|
|
| 14 |
block_type: str
|
| 15 |
html: str
|
| 16 |
polygon: List[List[float]]
|
| 17 |
+
children: List['JSONBlockOutput'] | None = None
|
| 18 |
section_hierarchy: Dict[int, str] | None = None
|
| 19 |
images: dict | None = None
|
| 20 |
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
class JSONRenderer(BaseRenderer):
|
| 36 |
+
"""
|
| 37 |
+
A renderer for JSON output.
|
| 38 |
+
"""
|
| 39 |
+
image_blocks: Annotated[
|
| 40 |
+
Tuple[BlockTypes],
|
| 41 |
+
"The list of block types to consider as images.",
|
| 42 |
+
] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 43 |
+
page_blocks: Annotated[
|
| 44 |
+
Tuple[BlockTypes],
|
| 45 |
+
"The list of block types to consider as pages.",
|
| 46 |
+
] = (BlockTypes.Page,)
|
| 47 |
|
| 48 |
def extract_json(self, document: Document, block_output: BlockOutput):
|
| 49 |
cls = get_block_class(block_output.id.block_type)
|
marker/renderers/markdown.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import re
|
| 2 |
-
from typing import
|
| 3 |
|
| 4 |
import regex
|
| 5 |
from markdownify import MarkdownConverter
|
|
@@ -62,7 +62,6 @@ class Markdownify(MarkdownConverter):
|
|
| 62 |
return super().convert_th(el, text, convert_as_inline)
|
| 63 |
|
| 64 |
|
| 65 |
-
|
| 66 |
class MarkdownOutput(BaseModel):
|
| 67 |
markdown: str
|
| 68 |
images: dict
|
|
@@ -70,9 +69,9 @@ class MarkdownOutput(BaseModel):
|
|
| 70 |
|
| 71 |
|
| 72 |
class MarkdownRenderer(HTMLRenderer):
|
| 73 |
-
page_separator: str = "-" * 48
|
| 74 |
-
inline_math_delimiters:
|
| 75 |
-
block_math_delimiters:
|
| 76 |
|
| 77 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 78 |
document_output = document.render()
|
|
|
|
| 1 |
import re
|
| 2 |
+
from typing import Annotated, Tuple
|
| 3 |
|
| 4 |
import regex
|
| 5 |
from markdownify import MarkdownConverter
|
|
|
|
| 62 |
return super().convert_th(el, text, convert_as_inline)
|
| 63 |
|
| 64 |
|
|
|
|
| 65 |
class MarkdownOutput(BaseModel):
|
| 66 |
markdown: str
|
| 67 |
images: dict
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
class MarkdownRenderer(HTMLRenderer):
|
| 72 |
+
page_separator: Annotated[str, "The separator to use between pages.", "Default is '-' * 48."] = "-" * 48
|
| 73 |
+
inline_math_delimiters: Annotated[Tuple[str], "The delimiters to use for inline math."] = ("$", "$")
|
| 74 |
+
block_math_delimiters: Annotated[Tuple[str], "The delimiters to use for block math."] = ("$$", "$$")
|
| 75 |
|
| 76 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 77 |
document_output = document.render()
|
marker/schema/blocks/base.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
from typing import TYPE_CHECKING, List, Literal, Optional,
|
| 4 |
|
| 5 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 6 |
|
|
@@ -33,7 +33,7 @@ class BlockOutput(BaseModel):
|
|
| 33 |
|
| 34 |
class BlockId(BaseModel):
|
| 35 |
page_id: int
|
| 36 |
-
block_id: int
|
| 37 |
block_type: BlockTypes | None = None
|
| 38 |
|
| 39 |
def __str__(self):
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence
|
| 4 |
|
| 5 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 6 |
|
|
|
|
| 33 |
|
| 34 |
class BlockId(BaseModel):
|
| 35 |
page_id: int
|
| 36 |
+
block_id: Optional[int] = None
|
| 37 |
block_type: BlockTypes | None = None
|
| 38 |
|
| 39 |
def __str__(self):
|
marker/schema/blocks/sectionheader.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.schema import BlockTypes
|
| 2 |
from marker.schema.blocks import Block
|
| 3 |
|
| 4 |
|
| 5 |
class SectionHeader(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.SectionHeader
|
| 7 |
-
heading_level: int
|
| 8 |
|
| 9 |
def assemble_html(self, child_blocks, parent_structure):
|
| 10 |
if self.ignore_for_output:
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
from marker.schema import BlockTypes
|
| 4 |
from marker.schema.blocks import Block
|
| 5 |
|
| 6 |
|
| 7 |
class SectionHeader(Block):
|
| 8 |
block_type: BlockTypes = BlockTypes.SectionHeader
|
| 9 |
+
heading_level: Optional[int] = None
|
| 10 |
|
| 11 |
def assemble_html(self, child_blocks, parent_structure):
|
| 12 |
if self.ignore_for_output:
|
marker/util.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import inspect
|
|
|
|
| 2 |
from importlib import import_module
|
| 3 |
from typing import List
|
| 4 |
|
|
@@ -56,7 +57,7 @@ def parse_range_str(range_str: str) -> List[int]:
|
|
| 56 |
page_lst += list(range(int(start), int(end) + 1))
|
| 57 |
else:
|
| 58 |
page_lst.append(int(i))
|
| 59 |
-
page_lst = sorted(list(set(page_lst)))
|
| 60 |
return page_lst
|
| 61 |
|
| 62 |
|
|
|
|
| 1 |
import inspect
|
| 2 |
+
import re
|
| 3 |
from importlib import import_module
|
| 4 |
from typing import List
|
| 5 |
|
|
|
|
| 57 |
page_lst += list(range(int(start), int(end) + 1))
|
| 58 |
else:
|
| 59 |
page_lst.append(int(i))
|
| 60 |
+
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
|
| 61 |
return page_lst
|
| 62 |
|
| 63 |
|