Merge remote-tracking branch 'origin/dev' into vik_dev
Browse files

Files changed:
- convert.py +2 -1
- marker/builders/document.py +10 -12
- marker/builders/layout.py +34 -35
- marker/builders/llm_layout.py +40 -46
- marker/builders/ocr.py +16 -16
- marker/builders/structure.py +12 -13
- marker/config/crawler.py +106 -0
- marker/config/parser.py +16 -18
- marker/config/printer.py +38 -37
- marker/converters/pdf.py +22 -19
- marker/processors/blockquote.py +22 -5
- marker/processors/debug.py +30 -34
- marker/processors/equation.py +18 -18
- marker/processors/footnote.py +1 -17
- marker/processors/ignoretext.py +27 -15
- marker/processors/line_numbers.py +21 -7
- marker/processors/list.py +9 -3
- marker/processors/llm/__init__.py +35 -32
- marker/processors/llm/llm_image_description.py +11 -2
- marker/processors/llm/llm_table.py +12 -9
- marker/processors/sectionheader.py +18 -23
- marker/processors/table.py +21 -21
- marker/processors/text.py +11 -13
- marker/providers/pdf.py +74 -41
- marker/renderers/__init__.py +5 -5
- marker/renderers/html.py +17 -5
- marker/renderers/json.py +13 -6
- marker/renderers/markdown.py +4 -5
- marker/schema/blocks/base.py +2 -2
- marker/schema/blocks/sectionheader.py +3 -1
- marker/util.py +2 -1
convert.py
CHANGED
|
@@ -13,6 +13,7 @@ import torch.multiprocessing as mp
|
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
from marker.config.parser import ConfigParser
|
|
|
|
| 16 |
from marker.converters.pdf import PdfConverter
|
| 17 |
from marker.logger import configure_logging
|
| 18 |
from marker.models import create_model_dict
|
|
@@ -59,7 +60,7 @@ def process_single_pdf(args):
|
|
| 59 |
print(traceback.format_exc())
|
| 60 |
|
| 61 |
|
| 62 |
-
@click.command()
|
| 63 |
@click.argument("in_folder", type=str)
|
| 64 |
@ConfigParser.common_options
|
| 65 |
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
|
|
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
from marker.config.parser import ConfigParser
|
| 16 |
+
from marker.config.printer import CustomClickPrinter
|
| 17 |
from marker.converters.pdf import PdfConverter
|
| 18 |
from marker.logger import configure_logging
|
| 19 |
from marker.models import create_model_dict
|
|
|
|
| 60 |
print(traceback.format_exc())
|
| 61 |
|
| 62 |
|
| 63 |
+
@click.command(cls=CustomClickPrinter)
|
| 64 |
@click.argument("in_folder", type=str)
|
| 65 |
@ConfigParser.common_options
|
| 66 |
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
|
marker/builders/document.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from
|
|
|
|
| 2 |
from marker.builders import BaseBuilder
|
| 3 |
from marker.builders.layout import LayoutBuilder
|
| 4 |
from marker.builders.ocr import OcrBuilder
|
|
@@ -12,18 +13,15 @@ from marker.schema.registry import get_block_class
|
|
| 12 |
class DocumentBuilder(BaseBuilder):
|
| 13 |
"""
|
| 14 |
Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
|
| 15 |
-
|
| 16 |
-
Attributes:
|
| 17 |
-
lowres_image_dpi (int):
|
| 18 |
-
DPI setting for low-resolution page images used for Layout and Line Detection.
|
| 19 |
-
Default is 96.
|
| 20 |
-
|
| 21 |
-
highres_image_dpi (int):
|
| 22 |
-
DPI setting for high-resolution page images used for OCR.
|
| 23 |
-
Default is 192.
|
| 24 |
"""
|
| 25 |
-
lowres_image_dpi:
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 29 |
document = self.build_document(provider)
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.builders.layout import LayoutBuilder
|
| 5 |
from marker.builders.ocr import OcrBuilder
|
|
|
|
| 13 |
class DocumentBuilder(BaseBuilder):
|
| 14 |
"""
|
| 15 |
Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
+
lowres_image_dpi: Annotated[
|
| 18 |
+
int,
|
| 19 |
+
"DPI setting for low-resolution page images used for Layout and Line Detection.",
|
| 20 |
+
] = 96
|
| 21 |
+
highres_image_dpi: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"DPI setting for high-resolution page images used for OCR.",
|
| 24 |
+
] = 192
|
| 25 |
|
| 26 |
def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, ocr_builder: OcrBuilder):
|
| 27 |
document = self.build_document(provider)
|
marker/builders/layout.py
CHANGED
|
@@ -1,15 +1,12 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
from surya.layout import batch_layout_detection
|
| 5 |
-
from surya.schema import LayoutResult
|
| 6 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 7 |
-
|
| 8 |
-
from surya.ocr_error import batch_ocr_error_detection
|
| 9 |
-
from surya.schema import OCRErrorDetectionResult
|
| 10 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
from marker.settings import settings
|
| 13 |
from marker.builders import BaseBuilder
|
| 14 |
from marker.providers import ProviderOutput, ProviderPageLines
|
| 15 |
from marker.providers.pdf import PdfProvider
|
|
@@ -18,40 +15,42 @@ from marker.schema.document import Document
|
|
| 18 |
from marker.schema.groups.page import PageGroup
|
| 19 |
from marker.schema.polygon import PolygonBox
|
| 20 |
from marker.schema.registry import get_block_class
|
|
|
|
| 21 |
from marker.util import matrix_intersection_area
|
| 22 |
|
| 23 |
|
| 24 |
class LayoutBuilder(BaseBuilder):
|
| 25 |
"""
|
| 26 |
A builder for performing layout detection on PDF pages and merging the results into the document.
|
| 27 |
-
|
| 28 |
-
Attributes:
|
| 29 |
-
batch_size (int):
|
| 30 |
-
The batch size to use for the layout model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
layout_coverage_min_lines (int):
|
| 34 |
-
The minimum number of PdfProvider lines that must be covered by the layout model
|
| 35 |
-
to consider the lines from the PdfProvider valid. Default is 1.
|
| 36 |
-
|
| 37 |
-
layout_coverage_threshold (float):
|
| 38 |
-
The minimum coverage ratio required for the layout model to consider
|
| 39 |
-
the lines from the PdfProvider valid. Default is 0.3.
|
| 40 |
-
|
| 41 |
-
document_ocr_threshold (float):
|
| 42 |
-
The minimum ratio of pages that must pass the layout coverage check
|
| 43 |
-
to avoid OCR. Default is 0.8.
|
| 44 |
-
|
| 45 |
-
error_model_segment_length (int):
|
| 46 |
-
The maximum number of characters to send to the OCR error model.
|
| 47 |
-
Default is 1024.
|
| 48 |
"""
|
| 49 |
-
batch_size
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
|
| 57 |
self.layout_model = layout_model
|
|
@@ -81,7 +80,7 @@ class LayoutBuilder(BaseBuilder):
|
|
| 81 |
)
|
| 82 |
return layout_results
|
| 83 |
|
| 84 |
-
def surya_ocr_error_detection(self, pages:List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
|
| 85 |
page_texts = []
|
| 86 |
for document_page in pages:
|
| 87 |
page_text = ''
|
|
@@ -102,7 +101,7 @@ class LayoutBuilder(BaseBuilder):
|
|
| 102 |
page_texts,
|
| 103 |
self.ocr_error_model,
|
| 104 |
self.ocr_error_model.tokenizer,
|
| 105 |
-
batch_size=int(self.get_batch_size())
|
| 106 |
)
|
| 107 |
return ocr_error_detection_results
|
| 108 |
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional, Tuple
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
from surya.layout import batch_layout_detection
|
|
|
|
| 5 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
|
|
|
|
|
|
|
|
|
| 6 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 7 |
+
from surya.ocr_error import batch_ocr_error_detection
|
| 8 |
+
from surya.schema import LayoutResult, OCRErrorDetectionResult
|
| 9 |
|
|
|
|
| 10 |
from marker.builders import BaseBuilder
|
| 11 |
from marker.providers import ProviderOutput, ProviderPageLines
|
| 12 |
from marker.providers.pdf import PdfProvider
|
|
|
|
| 15 |
from marker.schema.groups.page import PageGroup
|
| 16 |
from marker.schema.polygon import PolygonBox
|
| 17 |
from marker.schema.registry import get_block_class
|
| 18 |
+
from marker.settings import settings
|
| 19 |
from marker.util import matrix_intersection_area
|
| 20 |
|
| 21 |
|
| 22 |
class LayoutBuilder(BaseBuilder):
|
| 23 |
"""
|
| 24 |
A builder for performing layout detection on PDF pages and merging the results into the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
+
batch_size: Annotated[
|
| 27 |
+
Optional[int],
|
| 28 |
+
"The batch size to use for the layout model.",
|
| 29 |
+
"Default is None, which will use the default batch size for the model."
|
| 30 |
+
] = None
|
| 31 |
+
layout_coverage_min_lines: Annotated[
|
| 32 |
+
int,
|
| 33 |
+
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
| 34 |
+
"to consider the lines from the PdfProvider valid.",
|
| 35 |
+
] = 1
|
| 36 |
+
layout_coverage_threshold: Annotated[
|
| 37 |
+
float,
|
| 38 |
+
"The minimum coverage ratio required for the layout model to consider",
|
| 39 |
+
"the lines from the PdfProvider valid.",
|
| 40 |
+
] = .1
|
| 41 |
+
document_ocr_threshold: Annotated[
|
| 42 |
+
float,
|
| 43 |
+
"The minimum ratio of pages that must pass the layout coverage check",
|
| 44 |
+
"to avoid OCR.",
|
| 45 |
+
] = .8
|
| 46 |
+
error_model_segment_length: Annotated[
|
| 47 |
+
int,
|
| 48 |
+
"The maximum number of characters to send to the OCR error model.",
|
| 49 |
+
] = 512
|
| 50 |
+
excluded_for_coverage: Annotated[
|
| 51 |
+
Tuple[BlockTypes],
|
| 52 |
+
"A list of block types to exclude from the layout coverage check.",
|
| 53 |
+
] = (BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table, BlockTypes.FigureGroup, BlockTypes.TableGroup, BlockTypes.PictureGroup)
|
| 54 |
|
| 55 |
def __init__(self, layout_model: SuryaLayoutModel, ocr_error_model: DistilBertForSequenceClassification, config=None):
|
| 56 |
self.layout_model = layout_model
|
|
|
|
| 80 |
)
|
| 81 |
return layout_results
|
| 82 |
|
| 83 |
+
def surya_ocr_error_detection(self, pages: List[PageGroup], provider_page_lines: ProviderPageLines) -> OCRErrorDetectionResult:
|
| 84 |
page_texts = []
|
| 85 |
for document_page in pages:
|
| 86 |
page_text = ''
|
|
|
|
| 101 |
page_texts,
|
| 102 |
self.ocr_error_model,
|
| 103 |
self.ocr_error_model.tokenizer,
|
| 104 |
+
batch_size=int(self.get_batch_size()) # TODO Better Multiplier
|
| 105 |
)
|
| 106 |
return ocr_error_detection_results
|
| 107 |
|
marker/builders/llm_layout.py
CHANGED
|
@@ -1,13 +1,8 @@
|
|
| 1 |
import json
|
| 2 |
-
import time
|
| 3 |
-
import traceback
|
| 4 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 5 |
-
from typing import Optional
|
| 6 |
|
| 7 |
-
import google.generativeai as genai
|
| 8 |
-
import PIL
|
| 9 |
from google.ai.generativelanguage_v1beta.types import content
|
| 10 |
-
from google.api_core.exceptions import ResourceExhausted
|
| 11 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 12 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 13 |
from tqdm import tqdm
|
|
@@ -26,45 +21,41 @@ from marker.settings import settings
|
|
| 26 |
class LLMLayoutBuilder(LayoutBuilder):
|
| 27 |
"""
|
| 28 |
A builder for relabelling blocks to improve the quality of the layout.
|
| 29 |
-
|
| 30 |
-
Attributes:
|
| 31 |
-
google_api_key (str):
|
| 32 |
-
The Google API key to use for the Gemini model.
|
| 33 |
-
Default is None.
|
| 34 |
-
confidence_threshold (float):
|
| 35 |
-
The confidence threshold to use for relabeling.
|
| 36 |
-
Default is 0.75.
|
| 37 |
-
picture_height_threshold (float):
|
| 38 |
-
The height threshold for pictures that may actually be complex regions.
|
| 39 |
-
model_name (str):
|
| 40 |
-
The name of the Gemini model to use.
|
| 41 |
-
Default is "gemini-1.5-flash".
|
| 42 |
-
max_retries (int):
|
| 43 |
-
The maximum number of retries to use for the Gemini model.
|
| 44 |
-
Default is 3.
|
| 45 |
-
max_concurrency (int):
|
| 46 |
-
The maximum number of concurrent requests to make to the Gemini model.
|
| 47 |
-
Default is 3.
|
| 48 |
-
timeout (int):
|
| 49 |
-
The timeout for requests to the Gemini model.
|
| 50 |
-
Default is 60 seconds.
|
| 51 |
-
topk_relabelling_prompt (str):
|
| 52 |
-
The prompt to use for relabelling blocks.
|
| 53 |
-
Default is a string containing the Gemini relabelling prompt.
|
| 54 |
-
complex_relabeling_prompt (str):
|
| 55 |
-
The prompt to use for complex relabelling blocks.
|
| 56 |
-
Default is a string containing the complex relabelling prompt.
|
| 57 |
"""
|
| 58 |
|
| 59 |
-
google_api_key:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 69 |
You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
|
| 70 |
Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
|
|
@@ -75,7 +66,11 @@ Choose the label you believe is the most accurate representation of the layout b
|
|
| 75 |
Here are the top k predictions from the model followed by the image:
|
| 76 |
|
| 77 |
"""
|
| 78 |
-
complex_relabeling_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 80 |
You will be provided with an image of a layout block and some potential labels.
|
| 81 |
Your job is to analyze the image and choose the single most appropriate label from the provided labels.
|
|
@@ -140,7 +135,6 @@ Here is the image of the layout block:
|
|
| 140 |
complex_prompt = self.complex_relabeling_prompt
|
| 141 |
return self.process_block_relabeling(page, block, complex_prompt)
|
| 142 |
|
| 143 |
-
|
| 144 |
def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
|
| 145 |
image = self.extract_image(page, block)
|
| 146 |
response_schema = content.Schema(
|
|
@@ -174,4 +168,4 @@ Here is the image of the layout block:
|
|
| 174 |
.rescale(page.polygon.size, page_img.size)\
|
| 175 |
.expand(expand, expand)
|
| 176 |
cropped = page_img.crop(image_box.bbox)
|
| 177 |
-
return cropped
|
|
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 3 |
+
from typing import Annotated, Optional
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from google.ai.generativelanguage_v1beta.types import content
|
|
|
|
| 6 |
from surya.model.layout.encoderdecoder import SuryaLayoutModel
|
| 7 |
from surya.model.ocr_error.model import DistilBertForSequenceClassification
|
| 8 |
from tqdm import tqdm
|
|
|
|
| 21 |
class LLMLayoutBuilder(LayoutBuilder):
|
| 22 |
"""
|
| 23 |
A builder for relabelling blocks to improve the quality of the layout.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
google_api_key: Annotated[
|
| 27 |
+
Optional[str],
|
| 28 |
+
"The Google API key to use for the Gemini model.",
|
| 29 |
+
] = settings.GOOGLE_API_KEY
|
| 30 |
+
confidence_threshold: Annotated[
|
| 31 |
+
float,
|
| 32 |
+
"The confidence threshold to use for relabeling.",
|
| 33 |
+
] = 0.75
|
| 34 |
+
picture_height_threshold: Annotated[
|
| 35 |
+
float,
|
| 36 |
+
"The height threshold for pictures that may actually be complex regions.",
|
| 37 |
+
] = 0.8
|
| 38 |
+
model_name: Annotated[
|
| 39 |
+
str,
|
| 40 |
+
"The name of the Gemini model to use.",
|
| 41 |
+
] = "gemini-1.5-flash"
|
| 42 |
+
max_retries: Annotated[
|
| 43 |
+
int,
|
| 44 |
+
"The maximum number of retries to use for the Gemini model.",
|
| 45 |
+
] = 3
|
| 46 |
+
max_concurrency: Annotated[
|
| 47 |
+
int,
|
| 48 |
+
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 49 |
+
] = 3
|
| 50 |
+
timeout: Annotated[
|
| 51 |
+
int,
|
| 52 |
+
"The timeout for requests to the Gemini model.",
|
| 53 |
+
] = 60
|
| 54 |
+
topk_relabelling_prompt: Annotated[
|
| 55 |
+
str,
|
| 56 |
+
"The prompt to use for relabelling blocks.",
|
| 57 |
+
"Default is a string containing the Gemini relabelling prompt."
|
| 58 |
+
] = """You are a layout expert specializing in document analysis.
|
| 59 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 60 |
You will be provided with an image of a layout block and the top k predictions from the current model, along with their confidence scores.
|
| 61 |
Your job is to analyze the image and choose the single most appropriate label from the provided top k predictions.
|
|
|
|
| 66 |
Here are the top k predictions from the model followed by the image:
|
| 67 |
|
| 68 |
"""
|
| 69 |
+
complex_relabeling_prompt: Annotated[
|
| 70 |
+
str,
|
| 71 |
+
"The prompt to use for complex relabelling blocks.",
|
| 72 |
+
"Default is a string containing the complex relabelling prompt."
|
| 73 |
+
] = """You are a layout expert specializing in document analysis.
|
| 74 |
Your task is to relabel layout blocks in images to improve the accuracy of an existing layout model.
|
| 75 |
You will be provided with an image of a layout block and some potential labels.
|
| 76 |
Your job is to analyze the image and choose the single most appropriate label from the provided labels.
|
|
|
|
| 135 |
complex_prompt = self.complex_relabeling_prompt
|
| 136 |
return self.process_block_relabeling(page, block, complex_prompt)
|
| 137 |
|
|
|
|
| 138 |
def process_block_relabeling(self, page: PageGroup, block: Block, prompt: str):
|
| 139 |
image = self.extract_image(page, block)
|
| 140 |
response_schema = content.Schema(
|
|
|
|
| 168 |
.rescale(page.polygon.size, page_img.size)\
|
| 169 |
.expand(expand, expand)
|
| 170 |
cropped = page_img.crop(image_box.bbox)
|
| 171 |
+
return cropped
|
marker/builders/ocr.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from ftfy import fix_text
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
@@ -20,22 +20,22 @@ from marker.settings import settings
|
|
| 20 |
class OcrBuilder(BaseBuilder):
|
| 21 |
"""
|
| 22 |
A builder for performing OCR on PDF pages and merging the results into the document.
|
| 23 |
-
|
| 24 |
-
Attributes:
|
| 25 |
-
detection_batch_size (int):
|
| 26 |
-
The batch size to use for the detection model.
|
| 27 |
-
Default is None, which will use the default batch size for the model.
|
| 28 |
-
|
| 29 |
-
recognition_batch_size (int):
|
| 30 |
-
The batch size to use for the recognition model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
languages (List[str]):
|
| 34 |
-
A list of languages to use for OCR. Default is None.
|
| 35 |
"""
|
| 36 |
-
recognition_batch_size:
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
|
| 41 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional
|
| 2 |
|
| 3 |
from ftfy import fix_text
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
|
|
| 20 |
class OcrBuilder(BaseBuilder):
|
| 21 |
"""
|
| 22 |
A builder for performing OCR on PDF pages and merging the results into the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
+
recognition_batch_size: Annotated[
|
| 25 |
+
Optional[int],
|
| 26 |
+
"The batch size to use for the recognition model.",
|
| 27 |
+
"Default is None, which will use the default batch size for the model."
|
| 28 |
+
] = None
|
| 29 |
+
detection_batch_size: Annotated[
|
| 30 |
+
Optional[int],
|
| 31 |
+
"The batch size to use for the detection model.",
|
| 32 |
+
"Default is None, which will use the default batch size for the model."
|
| 33 |
+
] = None
|
| 34 |
+
languages: Annotated[
|
| 35 |
+
Optional[List[str]],
|
| 36 |
+
"A list of languages to use for OCR.",
|
| 37 |
+
"Default is None."
|
| 38 |
+
] = None
|
| 39 |
|
| 40 |
def __init__(self, detection_model: EfficientViTForSemanticSegmentation, recognition_model: OCREncoderDecoderModel, config=None):
|
| 41 |
super().__init__(config)
|
marker/builders/structure.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.builders import BaseBuilder
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
|
@@ -9,18 +11,15 @@ from marker.schema.registry import get_block_class
|
|
| 9 |
class StructureBuilder(BaseBuilder):
|
| 10 |
"""
|
| 11 |
A builder for grouping blocks together based on their structure.
|
| 12 |
-
|
| 13 |
-
Attributes:
|
| 14 |
-
gap_threshold (float):
|
| 15 |
-
The minimum gap between blocks to consider them part of the same group.
|
| 16 |
-
Default is 0.05.
|
| 17 |
-
|
| 18 |
-
list_gap_threshold (float):
|
| 19 |
-
The minimum gap between list items to consider them part of the same group.
|
| 20 |
-
Default is 0.1.
|
| 21 |
"""
|
| 22 |
-
gap_threshold:
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def __init__(self, config=None):
|
| 26 |
super().__init__(config)
|
|
@@ -58,8 +57,8 @@ class StructureBuilder(BaseBuilder):
|
|
| 58 |
selected_polygons.append(prev_block.polygon)
|
| 59 |
|
| 60 |
if next_block and \
|
| 61 |
-
|
| 62 |
-
|
| 63 |
block_structure.append(next_block.id)
|
| 64 |
selected_polygons.append(next_block.polygon)
|
| 65 |
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.builders import BaseBuilder
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
|
|
|
| 11 |
class StructureBuilder(BaseBuilder):
|
| 12 |
"""
|
| 13 |
A builder for grouping blocks together based on their structure.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
+
gap_threshold: Annotated[
|
| 16 |
+
float,
|
| 17 |
+
"The minimum gap between blocks to consider them part of the same group.",
|
| 18 |
+
] = 0.05
|
| 19 |
+
list_gap_threshold: Annotated[
|
| 20 |
+
float,
|
| 21 |
+
"The minimum gap between list items to consider them part of the same group.",
|
| 22 |
+
] = 0.1
|
| 23 |
|
| 24 |
def __init__(self, config=None):
|
| 25 |
super().__init__(config)
|
|
|
|
| 57 |
selected_polygons.append(prev_block.polygon)
|
| 58 |
|
| 59 |
if next_block and \
|
| 60 |
+
next_block.block_type in caption_types and \
|
| 61 |
+
next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
|
| 62 |
block_structure.append(next_block.id)
|
| 63 |
selected_polygons.append(next_block.polygon)
|
| 64 |
|
marker/config/crawler.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
import inspect
|
| 3 |
+
import pkgutil
|
| 4 |
+
from functools import cached_property
|
| 5 |
+
from typing import Annotated, Dict, Set, Type, get_args, get_origin
|
| 6 |
+
|
| 7 |
+
from marker.builders import BaseBuilder
|
| 8 |
+
from marker.converters import BaseConverter
|
| 9 |
+
from marker.processors import BaseProcessor
|
| 10 |
+
from marker.providers import BaseProvider
|
| 11 |
+
from marker.renderers import BaseRenderer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConfigCrawler:
    """Discover marker's configurable classes and their annotated settings.

    Walks every subclass of the given base classes (builders, processors,
    converters, providers, renderers) and records each annotated class
    attribute as a config entry: (type, formatted type string, default value,
    description metadata). The result is exposed via ``class_config_map`` and
    the derived ``attr_counts`` / ``attr_set`` properties.
    """

    def __init__(self, base_classes=(BaseBuilder, BaseProcessor, BaseConverter, BaseProvider, BaseRenderer)):
        self.base_classes = base_classes
        # {base_type: {class_name: {'class_type': cls,
        #                           'config': {attr: (type, formatted_type, default, metadata)}}}}
        self.class_config_map = {}

        self._crawl_config()

    def _crawl_config(self):
        """Populate ``class_config_map`` from every discovered subclass."""
        for base in self.base_classes:
            base_class_type = base.__name__.removeprefix('Base')
            self.class_config_map.setdefault(base_class_type, {})
            for class_name, class_type in self._find_subclasses(base).items():
                if class_name.startswith('Base'):
                    continue

                self.class_config_map[base_class_type].setdefault(class_name, {
                    'class_type': class_type,
                    'config': {}
                })
                for attr, attr_type in self._gather_super_annotations(class_type).items():
                    # Sentinel default: an attribute may be annotated without an
                    # assigned value; plain getattr would raise AttributeError.
                    default = getattr(class_type, attr, None)
                    metadata = (f"Default is {default}.",)

                    if get_origin(attr_type) is Annotated:
                        # Annotated metadata strings carry the description; only
                        # append the synthesized default note when the author did
                        # not already document a default.
                        if any('Default' in desc for desc in attr_type.__metadata__):
                            metadata = attr_type.__metadata__
                        else:
                            metadata = attr_type.__metadata__ + metadata
                        attr_type = get_args(attr_type)[0]

                    formatted_type = self._format_type(attr_type)
                    self.class_config_map[base_class_type][class_name]['config'][attr] = (attr_type, formatted_type, default, metadata)

    def _gather_super_annotations(self, cls: Type) -> Dict[str, Type]:
        """
        Collect all annotated attributes from `cls` and its superclasses, bottom-up.
        Subclass attributes overwrite superclass attributes with the same name.
        """
        # Walk the MRO from base -> derived so subclass annotations overwrite
        # same-named annotations from superclasses.
        annotations: Dict[str, Type] = {}
        for base in reversed(cls.__mro__):
            if base is object:
                continue
            if hasattr(base, "__annotations__"):
                for name, annotation in base.__annotations__.items():
                    annotations[name] = annotation
        return annotations

    @cached_property
    def attr_counts(self) -> Dict[str, int]:
        # Number of classes declaring each attribute name; lets callers tell
        # shared options apart from class-specific ones.
        counts: Dict[str, int] = {}
        for base_type_dict in self.class_config_map.values():
            for class_map in base_type_dict.values():
                for attr in class_map['config'].keys():
                    counts[attr] = counts.get(attr, 0) + 1
        return counts

    @cached_property
    def attr_set(self) -> Set[str]:
        # Every accepted option name: the bare attribute plus the
        # class-scoped "<ClassName>_<attr>" override form.
        attr_set: Set[str] = set()
        for base_type_dict in self.class_config_map.values():
            for class_name, class_map in base_type_dict.items():
                for attr in class_map['config'].keys():
                    attr_set.add(attr)
                    attr_set.add(f"{class_name}_{attr}")
        return attr_set

    def _find_subclasses(self, base_class):
        """Import all modules under `base_class`'s package and return a
        {name: class} mapping of every subclass found."""
        subclasses = {}
        module_name = base_class.__module__
        package = importlib.import_module(module_name)
        if hasattr(package, '__path__'):
            for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
                try:
                    module = importlib.import_module(module_name)
                    for name, obj in inspect.getmembers(module, inspect.isclass):
                        if issubclass(obj, base_class) and obj is not base_class:
                            subclasses[name] = obj
                except ImportError:
                    # Modules with missing optional dependencies are skipped
                    # rather than aborting the crawl.
                    pass
        return subclasses

    def _format_type(self, t: Type) -> str:
        """Format a typing type like Optional[int] into a readable string."""

        if get_origin(t):  # Handle Optional and types with origins separately
            return f"{t}".removeprefix('typing.')
        else:  # Regular types like int, str
            return t.__name__


crawler = ConfigCrawler()
|
marker/config/parser.py
CHANGED
|
@@ -4,11 +4,12 @@ from typing import Dict
|
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
|
|
|
| 7 |
from marker.renderers.html import HTMLRenderer
|
| 8 |
-
from marker.settings import settings
|
| 9 |
-
from marker.util import parse_range_str, strings_to_classes, classes_to_strings
|
| 10 |
-
from marker.renderers.markdown import MarkdownRenderer
|
| 11 |
from marker.renderers.json import JSONRenderer
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class ConfigParser:
|
|
@@ -22,20 +23,22 @@ class ConfigParser:
|
|
| 22 |
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
|
| 23 |
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
|
| 24 |
help="Format to output results in.")(fn)
|
| 25 |
-
fn = click.option("--page_range", type=str, default=None,
|
| 26 |
-
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
|
| 27 |
-
fn)
|
| 28 |
-
fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
|
| 29 |
fn = click.option("--processors", type=str, default=None,
|
| 30 |
help="Comma separated list of processors to use. Must use full module path.")(fn)
|
| 31 |
fn = click.option("--config_json", type=str, default=None,
|
| 32 |
help="Path to JSON file with additional configuration.")(fn)
|
| 33 |
-
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
|
| 34 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
| 35 |
-
fn = click.option("--paginate_output", is_flag=True, default=False, help="Paginate output.")(fn)
|
| 36 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
|
| 38 |
-
fn = click.option("--strip_existing_ocr", is_flag=True, default=False, help="Strip existing OCR text from the PDF.")(fn)
|
| 39 |
return fn
|
| 40 |
|
| 41 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
@@ -53,8 +56,6 @@ class ConfigParser:
|
|
| 53 |
config["debug_data_folder"] = output_dir
|
| 54 |
case "page_range":
|
| 55 |
config["page_range"] = parse_range_str(v)
|
| 56 |
-
case "force_ocr":
|
| 57 |
-
config["force_ocr"] = True
|
| 58 |
case "languages":
|
| 59 |
config["languages"] = v.split(",")
|
| 60 |
case "config_json":
|
|
@@ -62,14 +63,11 @@ class ConfigParser:
|
|
| 62 |
config.update(json.load(f))
|
| 63 |
case "disable_multiprocessing":
|
| 64 |
config["pdftext_workers"] = 1
|
| 65 |
-
case "paginate_output":
|
| 66 |
-
config["paginate_output"] = True
|
| 67 |
case "disable_image_extraction":
|
| 68 |
config["extract_images"] = False
|
| 69 |
-
case
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
config["strip_existing_ocr"] = True
|
| 73 |
return config
|
| 74 |
|
| 75 |
def get_renderer(self):
|
|
|
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
| 7 |
+
from marker.config.crawler import crawler
|
| 8 |
from marker.renderers.html import HTMLRenderer
|
|
|
|
|
|
|
|
|
|
| 9 |
from marker.renderers.json import JSONRenderer
|
| 10 |
+
from marker.renderers.markdown import MarkdownRenderer
|
| 11 |
+
from marker.settings import settings
|
| 12 |
+
from marker.util import classes_to_strings, parse_range_str, strings_to_classes
|
| 13 |
|
| 14 |
|
| 15 |
class ConfigParser:
|
|
|
|
| 23 |
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
|
| 24 |
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
|
| 25 |
help="Format to output results in.")(fn)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
fn = click.option("--processors", type=str, default=None,
|
| 27 |
help="Comma separated list of processors to use. Must use full module path.")(fn)
|
| 28 |
fn = click.option("--config_json", type=str, default=None,
|
| 29 |
help="Path to JSON file with additional configuration.")(fn)
|
|
|
|
| 30 |
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
|
|
|
|
| 31 |
fn = click.option("--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.")(fn)
|
| 32 |
+
|
| 33 |
+
# these are options that need a list transformation, i.e splitting/parsing a string
|
| 34 |
+
fn = click.option("--page_range", type=str, default=None,
|
| 35 |
+
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
|
| 36 |
+
fn)
|
| 37 |
+
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
|
| 38 |
+
|
| 39 |
+
# we put common options here
|
| 40 |
+
fn = click.option("--google_api_key", type=str, default=None, help="Google API key for using LLMs.")(fn)
|
| 41 |
fn = click.option("--use_llm", is_flag=True, default=False, help="Enable higher quality processing with LLMs.")(fn)
|
|
|
|
| 42 |
return fn
|
| 43 |
|
| 44 |
def generate_config_dict(self) -> Dict[str, any]:
|
|
|
|
| 56 |
config["debug_data_folder"] = output_dir
|
| 57 |
case "page_range":
|
| 58 |
config["page_range"] = parse_range_str(v)
|
|
|
|
|
|
|
| 59 |
case "languages":
|
| 60 |
config["languages"] = v.split(",")
|
| 61 |
case "config_json":
|
|
|
|
| 63 |
config.update(json.load(f))
|
| 64 |
case "disable_multiprocessing":
|
| 65 |
config["pdftext_workers"] = 1
|
|
|
|
|
|
|
| 66 |
case "disable_image_extraction":
|
| 67 |
config["extract_images"] = False
|
| 68 |
+
case _:
|
| 69 |
+
if k in crawler.attr_set:
|
| 70 |
+
config[k] = v
|
|
|
|
| 71 |
return config
|
| 72 |
|
| 73 |
def get_renderer(self):
|
marker/config/printer.py
CHANGED
|
@@ -1,32 +1,8 @@
|
|
| 1 |
-
import
|
| 2 |
-
import inspect
|
| 3 |
-
import pkgutil
|
| 4 |
|
| 5 |
import click
|
| 6 |
|
| 7 |
-
from marker.
|
| 8 |
-
from marker.converters import BaseConverter
|
| 9 |
-
from marker.processors import BaseProcessor
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def find_subclasses(base_class):
|
| 13 |
-
"""
|
| 14 |
-
Dynamically find all subclasses of a base class in the module where the base class is defined
|
| 15 |
-
and its submodules.
|
| 16 |
-
"""
|
| 17 |
-
subclasses = {}
|
| 18 |
-
module_name = base_class.__module__
|
| 19 |
-
package = importlib.import_module(module_name)
|
| 20 |
-
if hasattr(package, '__path__'):
|
| 21 |
-
for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."):
|
| 22 |
-
try:
|
| 23 |
-
module = importlib.import_module(module_name)
|
| 24 |
-
for name, obj in inspect.getmembers(module, inspect.isclass):
|
| 25 |
-
if issubclass(obj, base_class) and obj is not base_class:
|
| 26 |
-
subclasses[name] = obj
|
| 27 |
-
except ImportError:
|
| 28 |
-
pass
|
| 29 |
-
return subclasses
|
| 30 |
|
| 31 |
|
| 32 |
class CustomClickPrinter(click.Command):
|
|
@@ -39,16 +15,41 @@ class CustomClickPrinter(click.Command):
|
|
| 39 |
click.echo(help_text)
|
| 40 |
|
| 41 |
def parse_args(self, ctx, args):
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
ctx.exit()
|
|
|
|
| 54 |
super().parse_args(ctx, args)
|
|
|
|
| 1 |
+
from typing import Optional
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import click
|
| 4 |
|
| 5 |
+
from marker.config.crawler import crawler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
class CustomClickPrinter(click.Command):
|
|
|
|
| 15 |
click.echo(help_text)
|
| 16 |
|
| 17 |
def parse_args(self, ctx, args):
|
| 18 |
+
display_help = 'config' in args and '--help' in args
|
| 19 |
+
if display_help:
|
| 20 |
+
click.echo("Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:")
|
| 21 |
+
|
| 22 |
+
for base_type, base_type_dict in crawler.class_config_map.items():
|
| 23 |
+
if display_help:
|
| 24 |
+
click.echo(f"{base_type}s:")
|
| 25 |
+
for class_name, class_map in base_type_dict.items():
|
| 26 |
+
if display_help and class_map['config']:
|
| 27 |
+
click.echo(f"\n {class_name}: {class_map['class_type'].__doc__ or ''}")
|
| 28 |
+
click.echo(" " * 4 + "Attributes:")
|
| 29 |
+
for attr, (attr_type, formatted_type, default, metadata) in class_map['config'].items():
|
| 30 |
+
class_name_attr = class_name + "_" + attr
|
| 31 |
+
|
| 32 |
+
if display_help:
|
| 33 |
+
click.echo(" " * 8 + f"{attr} ({formatted_type}):")
|
| 34 |
+
click.echo("\n".join([f'{" " * 12}' + desc for desc in metadata]))
|
| 35 |
+
if attr_type in [str, int, float, bool, Optional[int], Optional[float], Optional[str]]:
|
| 36 |
+
is_flag = attr_type in [bool, Optional[bool]] and not default
|
| 37 |
+
if crawler.attr_counts.get(attr) > 1:
|
| 38 |
+
options = ["--" + class_name_attr]
|
| 39 |
+
else:
|
| 40 |
+
options = ["--" + attr, "--" + class_name_attr]
|
| 41 |
+
options.append(class_name_attr)
|
| 42 |
+
ctx.command.params.append(
|
| 43 |
+
click.Option(
|
| 44 |
+
options,
|
| 45 |
+
type=attr_type,
|
| 46 |
+
help=" ".join(metadata),
|
| 47 |
+
default=default,
|
| 48 |
+
is_flag=is_flag,
|
| 49 |
+
)
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
if display_help:
|
| 53 |
ctx.exit()
|
| 54 |
+
|
| 55 |
super().parse_args(ctx, args)
|
marker/converters/pdf.py
CHANGED
|
@@ -1,30 +1,31 @@
|
|
| 1 |
import os
|
| 2 |
-
|
|
|
|
| 3 |
|
| 4 |
import inspect
|
| 5 |
from collections import defaultdict
|
| 6 |
-
from typing import Any, Dict, List, Type
|
| 7 |
|
| 8 |
from marker.builders.document import DocumentBuilder
|
| 9 |
-
from marker.builders.llm_layout import LLMLayoutBuilder
|
| 10 |
from marker.builders.layout import LayoutBuilder
|
|
|
|
| 11 |
from marker.builders.ocr import OcrBuilder
|
| 12 |
from marker.builders.structure import StructureBuilder
|
| 13 |
from marker.converters import BaseConverter
|
| 14 |
-
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
|
| 15 |
from marker.processors.blockquote import BlockquoteProcessor
|
| 16 |
from marker.processors.code import CodeProcessor
|
| 17 |
from marker.processors.debug import DebugProcessor
|
| 18 |
from marker.processors.document_toc import DocumentTOCProcessor
|
| 19 |
from marker.processors.equation import EquationProcessor
|
| 20 |
from marker.processors.footnote import FootnoteProcessor
|
| 21 |
-
from marker.processors.llm.llm_form import LLMFormProcessor
|
| 22 |
-
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 23 |
-
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 24 |
-
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
|
| 25 |
from marker.processors.ignoretext import IgnoreTextProcessor
|
| 26 |
from marker.processors.line_numbers import LineNumbersProcessor
|
| 27 |
from marker.processors.list import ListProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from marker.processors.page_header import PageHeaderProcessor
|
| 29 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 30 |
from marker.processors.table import TableProcessor
|
|
@@ -40,18 +41,20 @@ from marker.util import strings_to_classes
|
|
| 40 |
class PdfConverter(BaseConverter):
|
| 41 |
"""
|
| 42 |
A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
|
| 43 |
-
|
| 44 |
-
Attributes:
|
| 45 |
-
override_map (Dict[BlockTypes, Type[Block]]):
|
| 46 |
-
A mapping to override the default block classes for specific block types.
|
| 47 |
-
The keys are `BlockTypes` enum values, representing the types of blocks,
|
| 48 |
-
and the values are corresponding `Block` class implementations to use
|
| 49 |
-
instead of the defaults.
|
| 50 |
"""
|
| 51 |
-
override_map:
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
super().__init__(config)
|
| 56 |
|
| 57 |
for block_type, override_block_type in self.override_map.items():
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 4 |
|
| 5 |
import inspect
|
| 6 |
from collections import defaultdict
|
| 7 |
+
from typing import Annotated, Any, Dict, List, Optional, Type
|
| 8 |
|
| 9 |
from marker.builders.document import DocumentBuilder
|
|
|
|
| 10 |
from marker.builders.layout import LayoutBuilder
|
| 11 |
+
from marker.builders.llm_layout import LLMLayoutBuilder
|
| 12 |
from marker.builders.ocr import OcrBuilder
|
| 13 |
from marker.builders.structure import StructureBuilder
|
| 14 |
from marker.converters import BaseConverter
|
|
|
|
| 15 |
from marker.processors.blockquote import BlockquoteProcessor
|
| 16 |
from marker.processors.code import CodeProcessor
|
| 17 |
from marker.processors.debug import DebugProcessor
|
| 18 |
from marker.processors.document_toc import DocumentTOCProcessor
|
| 19 |
from marker.processors.equation import EquationProcessor
|
| 20 |
from marker.processors.footnote import FootnoteProcessor
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
from marker.processors.ignoretext import IgnoreTextProcessor
|
| 22 |
from marker.processors.line_numbers import LineNumbersProcessor
|
| 23 |
from marker.processors.list import ListProcessor
|
| 24 |
+
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
|
| 25 |
+
from marker.processors.llm.llm_form import LLMFormProcessor
|
| 26 |
+
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
|
| 27 |
+
from marker.processors.llm.llm_table import LLMTableProcessor
|
| 28 |
+
from marker.processors.llm.llm_text import LLMTextProcessor
|
| 29 |
from marker.processors.page_header import PageHeaderProcessor
|
| 30 |
from marker.processors.sectionheader import SectionHeaderProcessor
|
| 31 |
from marker.processors.table import TableProcessor
|
|
|
|
| 41 |
class PdfConverter(BaseConverter):
|
| 42 |
"""
|
| 43 |
A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
+
override_map: Annotated[
|
| 46 |
+
Dict[BlockTypes, Type[Block]],
|
| 47 |
+
"A mapping to override the default block classes for specific block types.",
|
| 48 |
+
"The keys are `BlockTypes` enum values, representing the types of blocks,",
|
| 49 |
+
"and the values are corresponding `Block` class implementations to use",
|
| 50 |
+
"instead of the defaults."
|
| 51 |
+
] = defaultdict()
|
| 52 |
+
use_llm: Annotated[
|
| 53 |
+
bool,
|
| 54 |
+
"Enable higher quality processing with LLMs.",
|
| 55 |
+
] = False
|
| 56 |
+
|
| 57 |
+
def __init__(self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, config=None):
|
| 58 |
super().__init__(config)
|
| 59 |
|
| 60 |
for block_type, override_block_type in self.override_map.items():
|
marker/processors/blockquote.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
|
@@ -5,12 +7,27 @@ from marker.schema.document import Document
|
|
| 5 |
|
| 6 |
class BlockquoteProcessor(BaseProcessor):
|
| 7 |
"""
|
| 8 |
-
A processor for tagging blockquotes
|
| 9 |
"""
|
| 10 |
-
block_types
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def __init__(self, config):
|
| 16 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, Tuple
|
| 2 |
+
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
|
|
|
| 7 |
|
| 8 |
class BlockquoteProcessor(BaseProcessor):
|
| 9 |
"""
|
| 10 |
+
A processor for tagging blockquotes.
|
| 11 |
"""
|
| 12 |
+
block_types: Annotated[
|
| 13 |
+
Tuple[BlockTypes],
|
| 14 |
+
"The block types to process.",
|
| 15 |
+
] = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 16 |
+
min_x_indent: Annotated[
|
| 17 |
+
float,
|
| 18 |
+
"The minimum horizontal indentation required to consider a block as part of a blockquote.",
|
| 19 |
+
"Expressed as a percentage of the block width.",
|
| 20 |
+
] = 0.05
|
| 21 |
+
x_start_tolerance: Annotated[
|
| 22 |
+
float,
|
| 23 |
+
"The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
|
| 24 |
+
"Expressed as a percentage of the block width.",
|
| 25 |
+
] = 0.01
|
| 26 |
+
x_end_tolerance: Annotated[
|
| 27 |
+
float,
|
| 28 |
+
"The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
|
| 29 |
+
"Expressed as a percentage of the block width.",
|
| 30 |
+
] = 0.01
|
| 31 |
|
| 32 |
def __init__(self, config):
|
| 33 |
super().__init__(config)
|
marker/processors/debug.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
|
| 4 |
import requests
|
| 5 |
from PIL import Image, ImageDraw, ImageFont
|
|
@@ -13,39 +14,36 @@ from marker.settings import settings
|
|
| 13 |
class DebugProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for debugging the document.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
debug_data_folder (str):
|
| 19 |
-
The folder to dump debug data to.
|
| 20 |
-
Default is "debug_data".
|
| 21 |
-
|
| 22 |
-
debug_layout_images (bool):
|
| 23 |
-
Whether to dump layout debug images.
|
| 24 |
-
Default is False.
|
| 25 |
-
|
| 26 |
-
debug_pdf_images (bool):
|
| 27 |
-
Whether to dump PDF debug images.
|
| 28 |
-
Default is False.
|
| 29 |
-
|
| 30 |
-
debug_json (bool):
|
| 31 |
-
Whether to dump block debug data.
|
| 32 |
-
Default is False.
|
| 33 |
-
|
| 34 |
-
render_font (str):
|
| 35 |
-
The path to the font to use for rendering debug images.
|
| 36 |
-
Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder.
|
| 37 |
-
|
| 38 |
-
font_dl_path (str):
|
| 39 |
-
The path to download the font from.
|
| 40 |
-
Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0".
|
| 41 |
"""
|
| 42 |
-
block_types
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def __call__(self, document: Document):
|
| 51 |
# Remove extension from doc name
|
|
@@ -90,7 +88,6 @@ class DebugProcessor(BaseProcessor):
|
|
| 90 |
debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
|
| 91 |
png_image.save(debug_file)
|
| 92 |
|
| 93 |
-
|
| 94 |
def draw_layout_debug_images(self, document: Document, pdf_mode=False):
|
| 95 |
for page in document.pages:
|
| 96 |
img_size = page.highres_image.size
|
|
@@ -113,7 +110,6 @@ class DebugProcessor(BaseProcessor):
|
|
| 113 |
debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
|
| 114 |
png_image.save(debug_file)
|
| 115 |
|
| 116 |
-
|
| 117 |
def render_layout_boxes(self, page, png_image):
|
| 118 |
layout_bboxes = []
|
| 119 |
layout_labels = []
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
+
from typing import Annotated
|
| 4 |
|
| 5 |
import requests
|
| 6 |
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
| 14 |
class DebugProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for debugging the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
+
block_types: Annotated[
|
| 19 |
+
tuple,
|
| 20 |
+
"The block types to process.",
|
| 21 |
+
"Default is an empty tuple."
|
| 22 |
+
] = tuple()
|
| 23 |
+
debug_data_folder: Annotated[
|
| 24 |
+
str,
|
| 25 |
+
"The folder to dump debug data to.",
|
| 26 |
+
] = "debug_data"
|
| 27 |
+
debug_layout_images: Annotated[
|
| 28 |
+
bool,
|
| 29 |
+
"Whether to dump layout debug images.",
|
| 30 |
+
] = False
|
| 31 |
+
debug_pdf_images: Annotated[
|
| 32 |
+
bool,
|
| 33 |
+
"Whether to dump PDF debug images.",
|
| 34 |
+
] = False
|
| 35 |
+
debug_json: Annotated[
|
| 36 |
+
bool,
|
| 37 |
+
"Whether to dump block debug data.",
|
| 38 |
+
] = False
|
| 39 |
+
render_font: Annotated[
|
| 40 |
+
str,
|
| 41 |
+
"The path to the font to use for rendering debug images.",
|
| 42 |
+
] = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
|
| 43 |
+
font_dl_path: Annotated[
|
| 44 |
+
str,
|
| 45 |
+
"The path to download the font from.",
|
| 46 |
+
] = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
|
| 47 |
|
| 48 |
def __call__(self, document: Document):
|
| 49 |
# Remove extension from doc name
|
|
|
|
| 88 |
debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png")
|
| 89 |
png_image.save(debug_file)
|
| 90 |
|
|
|
|
| 91 |
def draw_layout_debug_images(self, document: Document, pdf_mode=False):
|
| 92 |
for page in document.pages:
|
| 93 |
img_size = page.highres_image.size
|
|
|
|
| 110 |
debug_file = os.path.join(self.debug_folder, f"layout_page_{page.page_id}.png")
|
| 111 |
png_image.save(debug_file)
|
| 112 |
|
|
|
|
| 113 |
def render_layout_boxes(self, page, png_image):
|
| 114 |
layout_bboxes = []
|
| 115 |
layout_labels = []
|
marker/processors/equation.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from texify.inference import batch_inference
|
| 4 |
from texify.model.model import GenerateVisionEncoderDecoderModel
|
|
@@ -13,24 +13,24 @@ from marker.settings import settings
|
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for recognizing equations in the document.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
model_max_length (int):
|
| 19 |
-
The maximum number of tokens to allow for the Texify model.
|
| 20 |
-
Default is 384.
|
| 21 |
-
|
| 22 |
-
batch_size (int):
|
| 23 |
-
The batch size to use for the Texify model.
|
| 24 |
-
Default is None, which will use the default batch size for the model.
|
| 25 |
-
|
| 26 |
-
token_buffer (int):
|
| 27 |
-
The number of tokens to buffer above max for the Texify model.
|
| 28 |
-
Default is 256.
|
| 29 |
"""
|
| 30 |
-
block_types
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
|
| 36 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Optional, Tuple
|
| 2 |
|
| 3 |
from texify.inference import batch_inference
|
| 4 |
from texify.model.model import GenerateVisionEncoderDecoderModel
|
|
|
|
| 13 |
class EquationProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for recognizing equations in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
+
block_types: Annotated[
|
| 18 |
+
Tuple[BlockTypes],
|
| 19 |
+
"The block types to process.",
|
| 20 |
+
] = (BlockTypes.Equation,)
|
| 21 |
+
model_max_length: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"The maximum number of tokens to allow for the Texify model.",
|
| 24 |
+
] = 384
|
| 25 |
+
texify_batch_size: Annotated[
|
| 26 |
+
Optional[int],
|
| 27 |
+
"The batch size to use for the Texify model.",
|
| 28 |
+
"Default is None, which will use the default batch size for the model."
|
| 29 |
+
] = None
|
| 30 |
+
token_buffer: Annotated[
|
| 31 |
+
int,
|
| 32 |
+
"The number of tokens to buffer above max for the Texify model.",
|
| 33 |
+
] = 256
|
| 34 |
|
| 35 |
def __init__(self, texify_model: GenerateVisionEncoderDecoderModel, config=None):
|
| 36 |
super().__init__(config)
|
marker/processors/footnote.py
CHANGED
|
@@ -1,27 +1,12 @@
|
|
| 1 |
-
from statistics import mean
|
| 2 |
-
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
-
from marker.schema.blocks import Footnote
|
| 6 |
from marker.schema.document import Document
|
| 7 |
-
|
| 8 |
-
from rapidfuzz import fuzz
|
| 9 |
-
|
| 10 |
from marker.schema.groups import PageGroup
|
| 11 |
|
| 12 |
|
| 13 |
class FootnoteProcessor(BaseProcessor):
|
| 14 |
"""
|
| 15 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
| 16 |
-
|
| 17 |
-
Attributes:
|
| 18 |
-
page_bottom_threshold (float):
|
| 19 |
-
The fraction of page height that is considered the bottom.
|
| 20 |
-
Default is .8
|
| 21 |
-
|
| 22 |
-
line_height_scaler (float):
|
| 23 |
-
The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
|
| 24 |
-
Default is .99
|
| 25 |
"""
|
| 26 |
block_types = (BlockTypes.Footnote,)
|
| 27 |
|
|
@@ -29,7 +14,6 @@ class FootnoteProcessor(BaseProcessor):
|
|
| 29 |
for page in document.pages:
|
| 30 |
self.push_footnotes_to_bottom(page, document)
|
| 31 |
|
| 32 |
-
|
| 33 |
def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
|
| 34 |
footnote_blocks = page.contained_blocks(document, self.block_types)
|
| 35 |
|
|
@@ -39,4 +23,4 @@ class FootnoteProcessor(BaseProcessor):
|
|
| 39 |
if block.id in page.structure:
|
| 40 |
# Move to bottom if it is
|
| 41 |
page.structure.remove(block.id)
|
| 42 |
-
page.add_structure(block)
|
|
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
|
|
|
| 3 |
from marker.schema.document import Document
|
|
|
|
|
|
|
|
|
|
| 4 |
from marker.schema.groups import PageGroup
|
| 5 |
|
| 6 |
|
| 7 |
class FootnoteProcessor(BaseProcessor):
|
| 8 |
"""
|
| 9 |
A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
block_types = (BlockTypes.Footnote,)
|
| 12 |
|
|
|
|
| 14 |
for page in document.pages:
|
| 15 |
self.push_footnotes_to_bottom(page, document)
|
| 16 |
|
|
|
|
| 17 |
def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
|
| 18 |
footnote_blocks = page.contained_blocks(document, self.block_types)
|
| 19 |
|
|
|
|
| 23 |
if block.id in page.structure:
|
| 24 |
# Move to bottom if it is
|
| 25 |
page.structure.remove(block.id)
|
| 26 |
+
page.add_structure(block)
|
marker/processors/ignoretext.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
from collections import Counter
|
| 3 |
from itertools import groupby
|
| 4 |
-
from typing import List
|
| 5 |
|
| 6 |
from rapidfuzz import fuzz
|
| 7 |
|
|
@@ -13,22 +13,34 @@ from marker.schema.document import Document
|
|
| 13 |
|
| 14 |
class IgnoreTextProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
-
A processor for ignoring text blocks
|
| 17 |
-
|
| 18 |
-
Attributes:
|
| 19 |
-
common_element_threshold (float):
|
| 20 |
-
The minimum fraction of pages that a block must appear in to be considered a common element.
|
| 21 |
-
Default is 0.6.
|
| 22 |
"""
|
| 23 |
block_types = (
|
| 24 |
-
BlockTypes.Text, BlockTypes.PageHeader,
|
| 25 |
BlockTypes.PageFooter, BlockTypes.SectionHeader,
|
| 26 |
BlockTypes.TextInlineMath
|
| 27 |
)
|
| 28 |
-
common_element_threshold
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def __call__(self, document: Document):
|
| 34 |
first_blocks = []
|
|
@@ -55,8 +67,8 @@ class IgnoreTextProcessor(BaseProcessor):
|
|
| 55 |
@staticmethod
|
| 56 |
def clean_text(text):
|
| 57 |
text = text.replace("\n", "").strip()
|
| 58 |
-
text = re.sub(r"^\d+\s*", "", text)
|
| 59 |
-
text = re.sub(r"\s*\d+$", "", text)
|
| 60 |
return text
|
| 61 |
|
| 62 |
def filter_common_elements(self, document, blocks: List[Block]):
|
|
@@ -74,7 +86,7 @@ class IgnoreTextProcessor(BaseProcessor):
|
|
| 74 |
common = [
|
| 75 |
k for k, v in counter.items()
|
| 76 |
if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
|
| 77 |
-
|
| 78 |
]
|
| 79 |
if len(common) == 0:
|
| 80 |
return
|
|
|
|
| 1 |
import re
|
| 2 |
from collections import Counter
|
| 3 |
from itertools import groupby
|
| 4 |
+
from typing import Annotated, List
|
| 5 |
|
| 6 |
from rapidfuzz import fuzz
|
| 7 |
|
|
|
|
| 13 |
|
| 14 |
class IgnoreTextProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
+
A processor for identifying and ignoring common text blocks in a document.
|
| 17 |
+
These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
block_types = (
|
| 20 |
+
BlockTypes.Text, BlockTypes.PageHeader,
|
| 21 |
BlockTypes.PageFooter, BlockTypes.SectionHeader,
|
| 22 |
BlockTypes.TextInlineMath
|
| 23 |
)
|
| 24 |
+
common_element_threshold: Annotated[
|
| 25 |
+
float,
|
| 26 |
+
"The minimum ratio of pages a text block must appear on to be considered a common element.",
|
| 27 |
+
"Blocks that meet or exceed this threshold are marked as common elements.",
|
| 28 |
+
] = 0.2
|
| 29 |
+
common_element_min_blocks: Annotated[
|
| 30 |
+
int,
|
| 31 |
+
"The minimum number of occurrences of a text block within a document to consider it a common element.",
|
| 32 |
+
"This ensures that rare blocks are not mistakenly flagged.",
|
| 33 |
+
] = 3
|
| 34 |
+
max_streak: Annotated[
|
| 35 |
+
int,
|
| 36 |
+
"The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.",
|
| 37 |
+
"Helps to identify patterns like repeated headers or footers.",
|
| 38 |
+
] = 3
|
| 39 |
+
text_match_threshold: Annotated[
|
| 40 |
+
int,
|
| 41 |
+
"The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.",
|
| 42 |
+
"Higher values enforce stricter matching.",
|
| 43 |
+
] = 90
|
| 44 |
|
| 45 |
def __call__(self, document: Document):
|
| 46 |
first_blocks = []
|
|
|
|
| 67 |
@staticmethod
|
| 68 |
def clean_text(text):
|
| 69 |
text = text.replace("\n", "").strip()
|
| 70 |
+
text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
|
| 71 |
+
text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
|
| 72 |
return text
|
| 73 |
|
| 74 |
def filter_common_elements(self, document, blocks: List[Block]):
|
|
|
|
| 86 |
common = [
|
| 87 |
k for k, v in counter.items()
|
| 88 |
if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
|
| 89 |
+
and v > self.common_element_min_blocks
|
| 90 |
]
|
| 91 |
if len(common) == 0:
|
| 92 |
return
|
marker/processors/line_numbers.py
CHANGED
|
@@ -1,13 +1,29 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.processors import BaseProcessor
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
| 4 |
|
| 5 |
|
| 6 |
class LineNumbersProcessor(BaseProcessor):
|
|
|
|
|
|
|
|
|
|
| 7 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 8 |
-
strip_numbers_threshold:
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def __init__(self, config):
|
| 13 |
super().__init__(config)
|
|
@@ -27,11 +43,10 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 27 |
tokens_are_numbers = [token.isdigit() for token in tokens]
|
| 28 |
if all([
|
| 29 |
sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
|
| 30 |
-
block.polygon.height > block.polygon.width
|
| 31 |
]):
|
| 32 |
block.ignore_for_output = True
|
| 33 |
|
| 34 |
-
|
| 35 |
def ignore_line_starts_ends(self, document: Document):
|
| 36 |
for page in document.pages:
|
| 37 |
for block in page.contained_blocks(document, self.block_types):
|
|
@@ -57,7 +72,7 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 57 |
len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
|
| 58 |
])
|
| 59 |
|
| 60 |
-
ends= all([
|
| 61 |
spans[-1].text.strip().isdigit(),
|
| 62 |
len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
|
| 63 |
])
|
|
@@ -76,4 +91,3 @@ class LineNumbersProcessor(BaseProcessor):
|
|
| 76 |
if ends:
|
| 77 |
span = page.get_block(line.structure[-1])
|
| 78 |
span.ignore_for_output = True
|
| 79 |
-
|
|
|
|
| 1 |
+
from typing import Annotated
|
| 2 |
+
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
| 5 |
from marker.schema.document import Document
|
| 6 |
|
| 7 |
|
| 8 |
class LineNumbersProcessor(BaseProcessor):
|
| 9 |
+
"""
|
| 10 |
+
A processor for ignoring line numbers.
|
| 11 |
+
"""
|
| 12 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 13 |
+
strip_numbers_threshold: Annotated[
|
| 14 |
+
float,
|
| 15 |
+
"The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
|
| 16 |
+
] = 0.6
|
| 17 |
+
min_lines_in_block: Annotated[
|
| 18 |
+
int,
|
| 19 |
+
"The minimum number of lines required in a block for it to be considered during processing.",
|
| 20 |
+
"Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
|
| 21 |
+
] = 4
|
| 22 |
+
min_line_length: Annotated[
|
| 23 |
+
int,
|
| 24 |
+
"The minimum length of a line (in characters) to consider it significant when checking for",
|
| 25 |
+
"numeric prefixes or suffixes. Prevents false positives for short lines.",
|
| 26 |
+
] = 10
|
| 27 |
|
| 28 |
def __init__(self, config):
|
| 29 |
super().__init__(config)
|
|
|
|
| 43 |
tokens_are_numbers = [token.isdigit() for token in tokens]
|
| 44 |
if all([
|
| 45 |
sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
|
| 46 |
+
block.polygon.height > block.polygon.width # Ensure block is taller than it is wide, like vertical page numbers
|
| 47 |
]):
|
| 48 |
block.ignore_for_output = True
|
| 49 |
|
|
|
|
| 50 |
def ignore_line_starts_ends(self, document: Document):
|
| 51 |
for page in document.pages:
|
| 52 |
for block in page.contained_blocks(document, self.block_types):
|
|
|
|
| 72 |
len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
|
| 73 |
])
|
| 74 |
|
| 75 |
+
ends = all([
|
| 76 |
spans[-1].text.strip().isdigit(),
|
| 77 |
len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
|
| 78 |
])
|
|
|
|
| 91 |
if ends:
|
| 92 |
span = page.get_block(line.structure[-1])
|
| 93 |
span.ignore_for_output = True
|
|
|
marker/processors/list.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
|
@@ -11,8 +11,14 @@ class ListProcessor(BaseProcessor):
|
|
| 11 |
A processor for merging lists across pages and columns
|
| 12 |
"""
|
| 13 |
block_types = (BlockTypes.ListGroup,)
|
| 14 |
-
ignored_block_types
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def __init__(self, config):
|
| 18 |
super().__init__(config)
|
|
|
|
| 1 |
+
from typing import Annotated, List, Tuple
|
| 2 |
|
| 3 |
from marker.processors import BaseProcessor
|
| 4 |
from marker.schema import BlockTypes
|
|
|
|
| 11 |
A processor for merging lists across pages and columns
|
| 12 |
"""
|
| 13 |
block_types = (BlockTypes.ListGroup,)
|
| 14 |
+
ignored_block_types: Annotated[
|
| 15 |
+
Tuple[BlockTypes],
|
| 16 |
+
"The list of block types to ignore when merging lists.",
|
| 17 |
+
] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 18 |
+
min_x_indent: Annotated[
|
| 19 |
+
float, "The minimum horizontal indentation required to consider a block as a nested list item.",
|
| 20 |
+
"This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
|
| 21 |
+
] = 0.01
|
| 22 |
|
| 23 |
def __init__(self, config):
|
| 24 |
super().__init__(config)
|
marker/processors/llm/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
-
from typing import Optional
|
| 3 |
|
| 4 |
from tqdm import tqdm
|
| 5 |
|
|
@@ -14,37 +14,40 @@ from marker.settings import settings
|
|
| 14 |
class BaseLLMProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for using LLMs to convert blocks.
|
| 17 |
-
Attributes:
|
| 18 |
-
google_api_key (str):
|
| 19 |
-
The Google API key to use for the Gemini model.
|
| 20 |
-
Default is None.
|
| 21 |
-
model_name (str):
|
| 22 |
-
The name of the Gemini model to use.
|
| 23 |
-
Default is "gemini-1.5-flash".
|
| 24 |
-
max_retries (int):
|
| 25 |
-
The maximum number of retries to use for the Gemini model.
|
| 26 |
-
Default is 3.
|
| 27 |
-
max_concurrency (int):
|
| 28 |
-
The maximum number of concurrent requests to make to the Gemini model.
|
| 29 |
-
Default is 3.
|
| 30 |
-
timeout (int):
|
| 31 |
-
The timeout for requests to the Gemini model.
|
| 32 |
-
gemini_rewriting_prompt (str):
|
| 33 |
-
The prompt to use for rewriting text.
|
| 34 |
-
Default is a string containing the Gemini rewriting prompt.
|
| 35 |
-
use_llm (bool):
|
| 36 |
-
Whether to use the LLM model.
|
| 37 |
-
Default is False.
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
block_types = None
|
| 49 |
|
| 50 |
def __init__(self, config=None):
|
|
@@ -87,4 +90,4 @@ class BaseLLMProcessor(BaseProcessor):
|
|
| 87 |
.rescale(page.polygon.size, page_img.size)\
|
| 88 |
.expand(self.image_expansion_ratio, self.image_expansion_ratio)
|
| 89 |
cropped = page_img.crop(image_box.bbox)
|
| 90 |
-
return cropped
|
|
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
+
from typing import Annotated, Optional
|
| 3 |
|
| 4 |
from tqdm import tqdm
|
| 5 |
|
|
|
|
| 14 |
class BaseLLMProcessor(BaseProcessor):
|
| 15 |
"""
|
| 16 |
A processor for using LLMs to convert blocks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
+
google_api_key: Annotated[
|
| 19 |
+
Optional[str],
|
| 20 |
+
"The Google API key to use for the Gemini model.",
|
| 21 |
+
] = settings.GOOGLE_API_KEY
|
| 22 |
+
model_name: Annotated[
|
| 23 |
+
str,
|
| 24 |
+
"The name of the Gemini model to use.",
|
| 25 |
+
] = "gemini-1.5-flash"
|
| 26 |
+
max_retries: Annotated[
|
| 27 |
+
int,
|
| 28 |
+
"The maximum number of retries to use for the Gemini model.",
|
| 29 |
+
] = 3
|
| 30 |
+
max_concurrency: Annotated[
|
| 31 |
+
int,
|
| 32 |
+
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 33 |
+
] = 3
|
| 34 |
+
timeout: Annotated[
|
| 35 |
+
int,
|
| 36 |
+
"The timeout for requests to the Gemini model.",
|
| 37 |
+
] = 60
|
| 38 |
+
image_expansion_ratio: Annotated[
|
| 39 |
+
float,
|
| 40 |
+
"The ratio to expand the image by when cropping.",
|
| 41 |
+
] = 0.01
|
| 42 |
+
gemini_rewriting_prompt: Annotated[
|
| 43 |
+
str,
|
| 44 |
+
"The prompt to use for rewriting text.",
|
| 45 |
+
"Default is a string containing the Gemini rewriting prompt."
|
| 46 |
+
] = ''
|
| 47 |
+
use_llm: Annotated[
|
| 48 |
+
bool,
|
| 49 |
+
"Whether to use the LLM model.",
|
| 50 |
+
] = False
|
| 51 |
block_types = None
|
| 52 |
|
| 53 |
def __init__(self, config=None):
|
|
|
|
| 90 |
.rescale(page.polygon.size, page_img.size)\
|
| 91 |
.expand(self.image_expansion_ratio, self.image_expansion_ratio)
|
| 92 |
cropped = page_img.crop(image_box.bbox)
|
| 93 |
+
return cropped
|
marker/processors/llm/llm_image_description.py
CHANGED
|
@@ -7,11 +7,20 @@ from marker.schema.blocks import Block
|
|
| 7 |
from marker.schema.document import Document
|
| 8 |
from marker.schema.groups.page import PageGroup
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class LLMImageDescriptionProcessor(BaseLLMProcessor):
|
| 12 |
block_types = (BlockTypes.Picture, BlockTypes.Figure,)
|
| 13 |
-
extract_images:
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
You will receive an image of a picture or figure. Your job will be to create a short description of the image.
|
| 16 |
**Instructions:**
|
| 17 |
1. Carefully examine the provided image.
|
|
|
|
| 7 |
from marker.schema.document import Document
|
| 8 |
from marker.schema.groups.page import PageGroup
|
| 9 |
|
| 10 |
+
from typing import Annotated
|
| 11 |
+
|
| 12 |
|
| 13 |
class LLMImageDescriptionProcessor(BaseLLMProcessor):
|
| 14 |
block_types = (BlockTypes.Picture, BlockTypes.Figure,)
|
| 15 |
+
extract_images: Annotated[
|
| 16 |
+
bool,
|
| 17 |
+
"Extract images from the document."
|
| 18 |
+
] = True
|
| 19 |
+
image_description_prompt: Annotated[
|
| 20 |
+
str,
|
| 21 |
+
"The prompt to use for generating image descriptions.",
|
| 22 |
+
"Default is a string containing the Gemini prompt."
|
| 23 |
+
] = """You are a document analysis expert who specializes in creating text descriptions for images.
|
| 24 |
You will receive an image of a picture or figure. Your job will be to create a short description of the image.
|
| 25 |
**Instructions:**
|
| 26 |
1. Carefully examine the provided image.
|
marker/processors/llm/llm_table.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
from
|
| 2 |
|
| 3 |
-
from marker.processors.llm import BaseLLMProcessor
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
-
from typing import List
|
| 6 |
-
|
| 7 |
from google.ai.generativelanguage_v1beta.types import content
|
| 8 |
from tabled.formats import html_format
|
|
|
|
| 9 |
|
|
|
|
| 10 |
from marker.schema import BlockTypes
|
| 11 |
from marker.schema.blocks import Block
|
| 12 |
from marker.schema.document import Document
|
|
@@ -15,8 +14,15 @@ from marker.schema.polygon import PolygonBox
|
|
| 15 |
|
| 16 |
|
| 17 |
class LLMTableProcessor(BaseLLMProcessor):
|
| 18 |
-
block_types
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
You will receive an image of a text block and an html representation of the table in the image.
|
| 21 |
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
|
| 22 |
**Instructions:**
|
|
@@ -92,10 +98,8 @@ No corrections needed.
|
|
| 92 |
block.update_metadata(llm_error_count=1)
|
| 93 |
return
|
| 94 |
|
| 95 |
-
|
| 96 |
block.cells = parsed_cells
|
| 97 |
|
| 98 |
-
|
| 99 |
def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
|
| 100 |
soup = BeautifulSoup(html_text, 'html.parser')
|
| 101 |
table = soup.find('table')
|
|
@@ -151,5 +155,4 @@ No corrections needed.
|
|
| 151 |
cells.append(cell_obj)
|
| 152 |
cur_col += colspan
|
| 153 |
|
| 154 |
-
|
| 155 |
return cells
|
|
|
|
| 1 |
+
from typing import Annotated, List, Tuple
|
| 2 |
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 4 |
from google.ai.generativelanguage_v1beta.types import content
|
| 5 |
from tabled.formats import html_format
|
| 6 |
+
from tabled.schema import SpanTableCell
|
| 7 |
|
| 8 |
+
from marker.processors.llm import BaseLLMProcessor
|
| 9 |
from marker.schema import BlockTypes
|
| 10 |
from marker.schema.blocks import Block
|
| 11 |
from marker.schema.document import Document
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class LLMTableProcessor(BaseLLMProcessor):
|
| 17 |
+
block_types: Annotated[
|
| 18 |
+
Tuple[BlockTypes],
|
| 19 |
+
"The block types to process.",
|
| 20 |
+
] = (BlockTypes.Table,)
|
| 21 |
+
gemini_rewriting_prompt: Annotated[
|
| 22 |
+
str,
|
| 23 |
+
"The prompt to use for rewriting text.",
|
| 24 |
+
"Default is a string containing the Gemini rewriting prompt."
|
| 25 |
+
] = """You are a text correction expert specializing in accurately reproducing text from images.
|
| 26 |
You will receive an image of a text block and an html representation of the table in the image.
|
| 27 |
Your task is to correct any errors in the html representation. The html representation should be as faithful to the original table as possible.
|
| 28 |
**Instructions:**
|
|
|
|
| 98 |
block.update_metadata(llm_error_count=1)
|
| 99 |
return
|
| 100 |
|
|
|
|
| 101 |
block.cells = parsed_cells
|
| 102 |
|
|
|
|
| 103 |
def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]:
|
| 104 |
soup = BeautifulSoup(html_text, 'html.parser')
|
| 105 |
table = soup.find('table')
|
|
|
|
| 155 |
cells.append(cell_obj)
|
| 156 |
cur_col += colspan
|
| 157 |
|
|
|
|
| 158 |
return cells
|
marker/processors/sectionheader.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import warnings
|
| 2 |
-
from typing import Dict, List
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.cluster import KMeans
|
|
@@ -16,29 +16,24 @@ warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
|
| 16 |
class SectionHeaderProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing section headers in the document.
|
| 19 |
-
|
| 20 |
-
Attributes:
|
| 21 |
-
level_count (int):
|
| 22 |
-
The number of levels to use for headings.
|
| 23 |
-
Default is 4.
|
| 24 |
-
|
| 25 |
-
merge_threshold (float):
|
| 26 |
-
The minimum gap between headings to consider them part of the same group.
|
| 27 |
-
Default is 0.25.
|
| 28 |
-
|
| 29 |
-
default_level (int):
|
| 30 |
-
The default heading level to use if no heading level is detected.
|
| 31 |
-
Default is 2.
|
| 32 |
-
|
| 33 |
-
height_tolerance (float):
|
| 34 |
-
The minimum height of a heading to consider it a heading.
|
| 35 |
-
Default is 0.99.
|
| 36 |
"""
|
| 37 |
block_types = (BlockTypes.SectionHeader, )
|
| 38 |
-
level_count
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def __call__(self, document: Document):
|
| 44 |
line_heights: Dict[int, List[float]] = {}
|
|
@@ -48,7 +43,7 @@ class SectionHeaderProcessor(BaseProcessor):
|
|
| 48 |
line_heights[block.id] = block.line_height(document)
|
| 49 |
else:
|
| 50 |
line_heights[block.id] = 0
|
| 51 |
-
block.ignore_for_output = True
|
| 52 |
|
| 53 |
flat_line_heights = list(line_heights.values())
|
| 54 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
|
|
|
| 1 |
import warnings
|
| 2 |
+
from typing import Annotated, Dict, List
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
from sklearn.cluster import KMeans
|
|
|
|
| 16 |
class SectionHeaderProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing section headers in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
block_types = (BlockTypes.SectionHeader, )
|
| 21 |
+
level_count: Annotated[
|
| 22 |
+
int,
|
| 23 |
+
"The number of levels to use for headings.",
|
| 24 |
+
] = 4
|
| 25 |
+
merge_threshold: Annotated[
|
| 26 |
+
float,
|
| 27 |
+
"The minimum gap between headings to consider them part of the same group.",
|
| 28 |
+
] = 0.25
|
| 29 |
+
default_level: Annotated[
|
| 30 |
+
int,
|
| 31 |
+
"The default heading level to use if no heading level is detected.",
|
| 32 |
+
] = 2
|
| 33 |
+
height_tolerance: Annotated[
|
| 34 |
+
float,
|
| 35 |
+
"The minimum height of a heading to consider it a heading.",
|
| 36 |
+
] = 0.99
|
| 37 |
|
| 38 |
def __call__(self, document: Document):
|
| 39 |
line_heights: Dict[int, List[float]] = {}
|
|
|
|
| 43 |
line_heights[block.id] = block.line_height(document)
|
| 44 |
else:
|
| 45 |
line_heights[block.id] = 0
|
| 46 |
+
block.ignore_for_output = True # Don't output an empty section header
|
| 47 |
|
| 48 |
flat_line_heights = list(line_heights.values())
|
| 49 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
marker/processors/table.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
from ftfy import fix_text
|
| 3 |
from surya.input.pdflines import get_page_text_lines
|
| 4 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
@@ -16,29 +18,27 @@ from marker.settings import settings
|
|
| 16 |
class TableProcessor(BaseProcessor):
|
| 17 |
"""
|
| 18 |
A processor for recognizing tables in the document.
|
| 19 |
-
|
| 20 |
-
Attributes:
|
| 21 |
-
detect_boxes (bool):
|
| 22 |
-
Whether to detect boxes for the table recognition model.
|
| 23 |
-
Default is False.
|
| 24 |
-
|
| 25 |
-
detector_batch_size (int):
|
| 26 |
-
The batch size to use for the table detection model.
|
| 27 |
-
Default is None, which will use the default batch size for the model.
|
| 28 |
-
|
| 29 |
-
table_rec_batch_size (int):
|
| 30 |
-
The batch size to use for the table recognition model.
|
| 31 |
-
Default is None, which will use the default batch size for the model.
|
| 32 |
-
|
| 33 |
-
recognition_batch_size (int):
|
| 34 |
-
The batch size to use for the table recognition model.
|
| 35 |
-
Default is None, which will use the default batch size for the model.
|
| 36 |
"""
|
| 37 |
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
|
| 38 |
-
detect_boxes
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def __init__(
|
| 44 |
self,
|
|
|
|
| 1 |
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
|
| 4 |
from ftfy import fix_text
|
| 5 |
from surya.input.pdflines import get_page_text_lines
|
| 6 |
from surya.model.detection.model import EfficientViTForSemanticSegmentation
|
|
|
|
| 18 |
class TableProcessor(BaseProcessor):
|
| 19 |
"""
|
| 20 |
A processor for recognizing tables in the document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
|
| 23 |
+
detect_boxes: Annotated[
|
| 24 |
+
bool,
|
| 25 |
+
"Whether to detect boxes for the table recognition model.",
|
| 26 |
+
] = False
|
| 27 |
+
detector_batch_size: Annotated[
|
| 28 |
+
int,
|
| 29 |
+
"The batch size to use for the table detection model.",
|
| 30 |
+
"Default is None, which will use the default batch size for the model."
|
| 31 |
+
] = None
|
| 32 |
+
table_rec_batch_size: Annotated[
|
| 33 |
+
int,
|
| 34 |
+
"The batch size to use for the table recognition model.",
|
| 35 |
+
"Default is None, which will use the default batch size for the model."
|
| 36 |
+
] = None
|
| 37 |
+
recognition_batch_size: Annotated[
|
| 38 |
+
int,
|
| 39 |
+
"The batch size to use for the table recognition model.",
|
| 40 |
+
"Default is None, which will use the default batch size for the model."
|
| 41 |
+
] = None
|
| 42 |
|
| 43 |
def __init__(
|
| 44 |
self,
|
marker/processors/text.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import math
|
| 2 |
-
from typing import List
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
|
@@ -12,15 +12,13 @@ from marker.schema.text.line import Line
|
|
| 12 |
class TextProcessor(BaseProcessor):
|
| 13 |
"""
|
| 14 |
A processor for merging text across pages and columns.
|
| 15 |
-
|
| 16 |
-
Attributes:
|
| 17 |
-
column_gap_ratio (float):
|
| 18 |
-
The minimum ratio of the page width to the column gap to consider a column break.
|
| 19 |
-
Default is 0.02.
|
| 20 |
"""
|
| 21 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 22 |
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 23 |
-
column_gap_ratio
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def __init__(self, config):
|
| 26 |
super().__init__(config)
|
|
@@ -35,14 +33,14 @@ class TextProcessor(BaseProcessor):
|
|
| 35 |
continue
|
| 36 |
|
| 37 |
next_block = document.get_next_block(block, self.ignored_block_types)
|
| 38 |
-
if next_block is None:
|
| 39 |
continue
|
| 40 |
if next_block.block_type not in self.block_types:
|
| 41 |
-
continue
|
| 42 |
if next_block.structure is None:
|
| 43 |
continue # This is odd though, why do we have text blocks with no structure?
|
| 44 |
if next_block.ignore_for_output:
|
| 45 |
-
continue
|
| 46 |
|
| 47 |
column_gap = block.polygon.width * self.column_gap_ratio
|
| 48 |
|
|
@@ -53,7 +51,7 @@ class TextProcessor(BaseProcessor):
|
|
| 53 |
last_line_is_hyphentated = False
|
| 54 |
new_block_lines = []
|
| 55 |
|
| 56 |
-
if next_block.page_id == block.page_id:
|
| 57 |
# we check for a column break
|
| 58 |
column_break = (
|
| 59 |
math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
|
|
@@ -63,11 +61,11 @@ class TextProcessor(BaseProcessor):
|
|
| 63 |
page_break = True
|
| 64 |
next_page = document.get_page(next_block.page_id)
|
| 65 |
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
|
| 66 |
-
|
| 67 |
|
| 68 |
if not (column_break or page_break):
|
| 69 |
continue
|
| 70 |
-
|
| 71 |
new_block_lines = next_block.structure_blocks(document)
|
| 72 |
|
| 73 |
# we check for next_block indentation
|
|
|
|
| 1 |
import math
|
| 2 |
+
from typing import Annotated, List
|
| 3 |
|
| 4 |
import regex
|
| 5 |
|
|
|
|
| 12 |
class TextProcessor(BaseProcessor):
|
| 13 |
"""
|
| 14 |
A processor for merging text across pages and columns.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
|
| 17 |
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 18 |
+
column_gap_ratio: Annotated[
|
| 19 |
+
float,
|
| 20 |
+
"The minimum ratio of the page width to the column gap to consider a column break.",
|
| 21 |
+
] = 0.02
|
| 22 |
|
| 23 |
def __init__(self, config):
|
| 24 |
super().__init__(config)
|
|
|
|
| 33 |
continue
|
| 34 |
|
| 35 |
next_block = document.get_next_block(block, self.ignored_block_types)
|
| 36 |
+
if next_block is None: # we've reached the end of the document
|
| 37 |
continue
|
| 38 |
if next_block.block_type not in self.block_types:
|
| 39 |
+
continue # we found a non-text block
|
| 40 |
if next_block.structure is None:
|
| 41 |
continue # This is odd though, why do we have text blocks with no structure?
|
| 42 |
if next_block.ignore_for_output:
|
| 43 |
+
continue # skip ignored blocks
|
| 44 |
|
| 45 |
column_gap = block.polygon.width * self.column_gap_ratio
|
| 46 |
|
|
|
|
| 51 |
last_line_is_hyphentated = False
|
| 52 |
new_block_lines = []
|
| 53 |
|
| 54 |
+
if next_block.page_id == block.page_id: # block on the same page
|
| 55 |
# we check for a column break
|
| 56 |
column_break = (
|
| 57 |
math.floor(next_block.polygon.y_start) <= math.ceil(block.polygon.y_start) and
|
|
|
|
| 61 |
page_break = True
|
| 62 |
next_page = document.get_page(next_block.page_id)
|
| 63 |
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
|
| 64 |
+
(next_block.polygon.y_start < next_page.polygon.height // 2)
|
| 65 |
|
| 66 |
if not (column_break or page_break):
|
| 67 |
continue
|
| 68 |
+
|
| 69 |
new_block_lines = next_block.structure_blocks(document)
|
| 70 |
|
| 71 |
# we check for next_block indentation
|
marker/providers/pdf.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
-
from typing import List, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
|
@@ -19,16 +19,51 @@ from marker.schema.text.span import Span
|
|
| 19 |
|
| 20 |
|
| 21 |
class PdfProvider(BaseProvider):
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def __init__(self, filepath: str, config=None):
|
| 34 |
super().__init__(filepath, config)
|
|
@@ -57,7 +92,7 @@ class PdfProvider(BaseProvider):
|
|
| 57 |
if self.doc is not None:
|
| 58 |
self.doc.close()
|
| 59 |
|
| 60 |
-
def font_flags_to_format(self, flags: int
|
| 61 |
if flags is None:
|
| 62 |
return {"plain"}
|
| 63 |
|
|
@@ -188,35 +223,33 @@ class PdfProvider(BaseProvider):
|
|
| 188 |
if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
|
| 189 |
return False
|
| 190 |
|
| 191 |
-
if
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
return False
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
font_name = self.get_fontname(font)
|
| 205 |
-
|
| 206 |
-
# we also skip pages without embedded fonts and fonts without names
|
| 207 |
-
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
|
| 208 |
-
empty_fonts.append(not font_name or font_name == "GlyphLessFont")
|
| 209 |
-
if font_name not in font_map:
|
| 210 |
-
font_map[font_name or 'Unknown'] = font
|
| 211 |
-
|
| 212 |
-
if all(non_embedded_fonts) or all(empty_fonts):
|
| 213 |
-
return False
|
| 214 |
-
|
| 215 |
-
# if we see very large images covering most of the page, we can skip this page
|
| 216 |
-
for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
|
| 217 |
-
img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
|
| 218 |
-
if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
|
| 219 |
-
return False
|
| 220 |
|
| 221 |
return True
|
| 222 |
|
|
@@ -265,8 +298,8 @@ class PdfProvider(BaseProvider):
|
|
| 265 |
|
| 266 |
def get_fontname(self, font) -> str:
|
| 267 |
font_name = ""
|
| 268 |
-
buffer_size = 256
|
| 269 |
-
|
| 270 |
try:
|
| 271 |
font_name_buffer = ctypes.create_string_buffer(buffer_size)
|
| 272 |
length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
|
|
|
|
| 1 |
import atexit
|
| 2 |
import ctypes
|
| 3 |
import re
|
| 4 |
+
from typing import Annotated, List, Optional, Set
|
| 5 |
|
| 6 |
import pypdfium2 as pdfium
|
| 7 |
import pypdfium2.raw as pdfium_c
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class PdfProvider(BaseProvider):
|
| 22 |
+
"""
|
| 23 |
+
A provider for PDF files.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
page_range: Annotated[
|
| 27 |
+
Optional[List[int]],
|
| 28 |
+
"The range of pages to process.",
|
| 29 |
+
"Default is None, which will process all pages."
|
| 30 |
+
] = None
|
| 31 |
+
pdftext_workers: Annotated[
|
| 32 |
+
int,
|
| 33 |
+
"The number of workers to use for pdftext.",
|
| 34 |
+
] = 4
|
| 35 |
+
flatten_pdf: Annotated[
|
| 36 |
+
bool,
|
| 37 |
+
"Whether to flatten the PDF structure.",
|
| 38 |
+
] = True
|
| 39 |
+
force_ocr: Annotated[
|
| 40 |
+
bool,
|
| 41 |
+
"Whether to force OCR on the whole document.",
|
| 42 |
+
] = False
|
| 43 |
+
ocr_invalid_chars: Annotated[
|
| 44 |
+
tuple,
|
| 45 |
+
"The characters to consider invalid for OCR.",
|
| 46 |
+
] = (chr(0xfffd), "�")
|
| 47 |
+
ocr_space_threshold: Annotated[
|
| 48 |
+
float,
|
| 49 |
+
"The minimum ratio of spaces to non-spaces to detect bad text.",
|
| 50 |
+
] = .7
|
| 51 |
+
ocr_newline_threshold: Annotated[
|
| 52 |
+
float,
|
| 53 |
+
"The minimum ratio of newlines to non-newlines to detect bad text.",
|
| 54 |
+
] = .6
|
| 55 |
+
ocr_alphanum_threshold: Annotated[
|
| 56 |
+
float,
|
| 57 |
+
"The minimum ratio of alphanumeric characters to non-alphanumeric characters to consider an alphanumeric character.",
|
| 58 |
+
] = .3
|
| 59 |
+
image_threshold: Annotated[
|
| 60 |
+
float,
|
| 61 |
+
"The minimum coverage ratio of the image to the page to consider skipping the page.",
|
| 62 |
+
] = .65
|
| 63 |
+
strip_existing_ocr: Annotated[
|
| 64 |
+
bool,
|
| 65 |
+
"Whether to strip existing OCR text from the PDF.",
|
| 66 |
+
] = False
|
| 67 |
|
| 68 |
def __init__(self, filepath: str, config=None):
|
| 69 |
super().__init__(filepath, config)
|
|
|
|
| 92 |
if self.doc is not None:
|
| 93 |
self.doc.close()
|
| 94 |
|
| 95 |
+
def font_flags_to_format(self, flags: Optional[int]) -> Set[str]:
|
| 96 |
if flags is None:
|
| 97 |
return {"plain"}
|
| 98 |
|
|
|
|
| 223 |
if not any([obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT for obj in page_objs]):
|
| 224 |
return False
|
| 225 |
|
| 226 |
+
if self.strip_existing_ocr:
|
| 227 |
+
# If any text objects on the page are in invisible render mode, skip this page
|
| 228 |
+
for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
|
| 229 |
+
if pdfium_c.FPDFTextObj_GetTextRenderMode(text_obj) in [pdfium_c.FPDF_TEXTRENDERMODE_INVISIBLE, pdfium_c.FPDF_TEXTRENDERMODE_UNKNOWN]:
|
| 230 |
+
return False
|
| 231 |
+
|
| 232 |
+
non_embedded_fonts = []
|
| 233 |
+
empty_fonts = []
|
| 234 |
+
font_map = {}
|
| 235 |
+
for text_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_TEXT, page_objs):
|
| 236 |
+
font = pdfium_c.FPDFTextObj_GetFont(text_obj)
|
| 237 |
+
font_name = self.get_fontname(font)
|
| 238 |
+
|
| 239 |
+
# we also skip pages without embedded fonts and fonts without names
|
| 240 |
+
non_embedded_fonts.append(pdfium_c.FPDFFont_GetIsEmbedded(font) == 0)
|
| 241 |
+
empty_fonts.append(not font_name or font_name == "GlyphLessFont")
|
| 242 |
+
if font_name not in font_map:
|
| 243 |
+
font_map[font_name or 'Unknown'] = font
|
| 244 |
+
|
| 245 |
+
if all(non_embedded_fonts) or all(empty_fonts):
|
| 246 |
return False
|
| 247 |
|
| 248 |
+
# if we see very large images covering most of the page, we can skip this page
|
| 249 |
+
for img_obj in filter(lambda obj: obj.type == pdfium_c.FPDF_PAGEOBJ_IMAGE, page_objs):
|
| 250 |
+
img_bbox = PolygonBox.from_bbox(img_obj.get_pos())
|
| 251 |
+
if page_bbox.intersection_pct(img_bbox) >= self.image_threshold:
|
| 252 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
return True
|
| 255 |
|
|
|
|
| 298 |
|
| 299 |
def get_fontname(self, font) -> str:
|
| 300 |
font_name = ""
|
| 301 |
+
buffer_size = 256
|
| 302 |
+
|
| 303 |
try:
|
| 304 |
font_name_buffer = ctypes.create_string_buffer(buffer_size)
|
| 305 |
length = pdfium_c.FPDFFont_GetBaseFontName(font, font_name_buffer, buffer_size)
|
marker/renderers/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ import base64
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
from collections import Counter
|
| 5 |
-
from typing import Optional
|
| 6 |
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
|
@@ -15,9 +15,9 @@ from marker.util import assign_config
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
| 18 |
-
remove_blocks:
|
| 19 |
-
image_blocks:
|
| 20 |
-
extract_images: bool = True
|
| 21 |
|
| 22 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 23 |
assign_config(self, config)
|
|
@@ -71,7 +71,7 @@ class BaseRenderer:
|
|
| 71 |
return page_stats
|
| 72 |
|
| 73 |
def generate_document_metadata(self, document: Document, document_output):
|
| 74 |
-
metadata =
|
| 75 |
"table_of_contents": document.table_of_contents,
|
| 76 |
"page_stats": self.generate_page_stats(document, document_output),
|
| 77 |
}
|
|
|
|
| 2 |
import io
|
| 3 |
import re
|
| 4 |
from collections import Counter
|
| 5 |
+
from typing import Annotated, Optional, Tuple
|
| 6 |
|
| 7 |
from bs4 import BeautifulSoup
|
| 8 |
from pydantic import BaseModel
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class BaseRenderer:
|
| 18 |
+
remove_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to ignore while rendering."] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
|
| 19 |
+
image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 20 |
+
extract_images: Annotated[bool, "Extract images from the document."] = True
|
| 21 |
|
| 22 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 23 |
assign_config(self, config)
|
|
|
|
| 71 |
return page_stats
|
| 72 |
|
| 73 |
def generate_document_metadata(self, document: Document, document_output):
|
| 74 |
+
metadata = {
|
| 75 |
"table_of_contents": document.table_of_contents,
|
| 76 |
"page_stats": self.generate_page_stats(document, document_output),
|
| 77 |
}
|
marker/renderers/html.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from
|
|
|
|
| 2 |
|
| 3 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 4 |
from pydantic import BaseModel
|
|
@@ -13,7 +14,6 @@ import warnings
|
|
| 13 |
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
| 14 |
|
| 15 |
# Suppress DecompressionBombError
|
| 16 |
-
from PIL import Image
|
| 17 |
Image.MAX_IMAGE_PIXELS = None
|
| 18 |
|
| 19 |
|
|
@@ -24,9 +24,21 @@ class HTMLOutput(BaseModel):
|
|
| 24 |
|
| 25 |
|
| 26 |
class HTMLRenderer(BaseRenderer):
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def extract_image(self, document, image_id):
|
| 32 |
image_block = document.get_block(image_id)
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
from typing import Annotated, Literal, Tuple
|
| 3 |
|
| 4 |
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
|
| 5 |
from pydantic import BaseModel
|
|
|
|
| 14 |
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
| 15 |
|
| 16 |
# Suppress DecompressionBombError
|
|
|
|
| 17 |
Image.MAX_IMAGE_PIXELS = None
|
| 18 |
|
| 19 |
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class HTMLRenderer(BaseRenderer):
|
| 27 |
+
"""
|
| 28 |
+
A renderer for HTML output.
|
| 29 |
+
"""
|
| 30 |
+
page_blocks: Annotated[
|
| 31 |
+
Tuple[BlockTypes],
|
| 32 |
+
"The block types to consider as pages.",
|
| 33 |
+
] = (BlockTypes.Page,)
|
| 34 |
+
paginate_output: Annotated[
|
| 35 |
+
bool,
|
| 36 |
+
"Whether to paginate the output.",
|
| 37 |
+
] = False
|
| 38 |
+
image_extraction_mode: Annotated[
|
| 39 |
+
Literal["lowres", "highres"],
|
| 40 |
+
"The mode to use for extracting images.",
|
| 41 |
+
] = "highres"
|
| 42 |
|
| 43 |
def extract_image(self, document, image_id):
|
| 44 |
image_block = document.get_block(image_id)
|
marker/renderers/json.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
-
from
|
| 2 |
-
|
| 3 |
-
from typing import Dict, List
|
| 4 |
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
|
@@ -16,7 +14,7 @@ class JSONBlockOutput(BaseModel):
|
|
| 16 |
block_type: str
|
| 17 |
html: str
|
| 18 |
polygon: List[List[float]]
|
| 19 |
-
children: List[JSONBlockOutput] | None = None
|
| 20 |
section_hierarchy: Dict[int, str] | None = None
|
| 21 |
images: dict | None = None
|
| 22 |
|
|
@@ -35,8 +33,17 @@ def reformat_section_hierarchy(section_hierarchy):
|
|
| 35 |
|
| 36 |
|
| 37 |
class JSONRenderer(BaseRenderer):
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def extract_json(self, document: Document, block_output: BlockOutput):
|
| 42 |
cls = get_block_class(block_output.id.block_type)
|
|
|
|
| 1 |
+
from typing import Annotated, Dict, List, Tuple
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel
|
| 4 |
|
|
|
|
| 14 |
block_type: str
|
| 15 |
html: str
|
| 16 |
polygon: List[List[float]]
|
| 17 |
+
children: List['JSONBlockOutput'] | None = None
|
| 18 |
section_hierarchy: Dict[int, str] | None = None
|
| 19 |
images: dict | None = None
|
| 20 |
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
class JSONRenderer(BaseRenderer):
|
| 36 |
+
"""
|
| 37 |
+
A renderer for JSON output.
|
| 38 |
+
"""
|
| 39 |
+
image_blocks: Annotated[
|
| 40 |
+
Tuple[BlockTypes],
|
| 41 |
+
"The list of block types to consider as images.",
|
| 42 |
+
] = (BlockTypes.Picture, BlockTypes.Figure)
|
| 43 |
+
page_blocks: Annotated[
|
| 44 |
+
Tuple[BlockTypes],
|
| 45 |
+
"The list of block types to consider as pages.",
|
| 46 |
+
] = (BlockTypes.Page,)
|
| 47 |
|
| 48 |
def extract_json(self, document: Document, block_output: BlockOutput):
|
| 49 |
cls = get_block_class(block_output.id.block_type)
|
marker/renderers/markdown.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import re
|
| 2 |
-
from typing import
|
| 3 |
|
| 4 |
import regex
|
| 5 |
from markdownify import MarkdownConverter
|
|
@@ -62,7 +62,6 @@ class Markdownify(MarkdownConverter):
|
|
| 62 |
return super().convert_th(el, text, convert_as_inline)
|
| 63 |
|
| 64 |
|
| 65 |
-
|
| 66 |
class MarkdownOutput(BaseModel):
|
| 67 |
markdown: str
|
| 68 |
images: dict
|
|
@@ -70,9 +69,9 @@ class MarkdownOutput(BaseModel):
|
|
| 70 |
|
| 71 |
|
| 72 |
class MarkdownRenderer(HTMLRenderer):
|
| 73 |
-
page_separator: str = "-" * 48
|
| 74 |
-
inline_math_delimiters:
|
| 75 |
-
block_math_delimiters:
|
| 76 |
|
| 77 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 78 |
document_output = document.render()
|
|
|
|
| 1 |
import re
|
| 2 |
+
from typing import Annotated, Tuple
|
| 3 |
|
| 4 |
import regex
|
| 5 |
from markdownify import MarkdownConverter
|
|
|
|
| 62 |
return super().convert_th(el, text, convert_as_inline)
|
| 63 |
|
| 64 |
|
|
|
|
| 65 |
class MarkdownOutput(BaseModel):
|
| 66 |
markdown: str
|
| 67 |
images: dict
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
class MarkdownRenderer(HTMLRenderer):
|
| 72 |
+
page_separator: Annotated[str, "The separator to use between pages.", "Default is '-' * 48."] = "-" * 48
|
| 73 |
+
inline_math_delimiters: Annotated[Tuple[str], "The delimiters to use for inline math."] = ("$", "$")
|
| 74 |
+
block_math_delimiters: Annotated[Tuple[str], "The delimiters to use for block math."] = ("$$", "$$")
|
| 75 |
|
| 76 |
def __call__(self, document: Document) -> MarkdownOutput:
|
| 77 |
document_output = document.render()
|
marker/schema/blocks/base.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
-
from typing import TYPE_CHECKING, List, Literal, Optional,
|
| 4 |
|
| 5 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 6 |
|
|
@@ -33,7 +33,7 @@ class BlockOutput(BaseModel):
|
|
| 33 |
|
| 34 |
class BlockId(BaseModel):
|
| 35 |
page_id: int
|
| 36 |
-
block_id: int
|
| 37 |
block_type: BlockTypes | None = None
|
| 38 |
|
| 39 |
def __str__(self):
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence
|
| 4 |
|
| 5 |
from pydantic import BaseModel, ConfigDict, field_validator
|
| 6 |
|
|
|
|
| 33 |
|
| 34 |
class BlockId(BaseModel):
|
| 35 |
page_id: int
|
| 36 |
+
block_id: Optional[int] = None
|
| 37 |
block_type: BlockTypes | None = None
|
| 38 |
|
| 39 |
def __str__(self):
|
marker/schema/blocks/sectionheader.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
|
|
|
|
|
| 1 |
from marker.schema import BlockTypes
|
| 2 |
from marker.schema.blocks import Block
|
| 3 |
|
| 4 |
|
| 5 |
class SectionHeader(Block):
|
| 6 |
block_type: BlockTypes = BlockTypes.SectionHeader
|
| 7 |
-
heading_level: int
|
| 8 |
|
| 9 |
def assemble_html(self, child_blocks, parent_structure):
|
| 10 |
if self.ignore_for_output:
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
from marker.schema import BlockTypes
|
| 4 |
from marker.schema.blocks import Block
|
| 5 |
|
| 6 |
|
| 7 |
class SectionHeader(Block):
|
| 8 |
block_type: BlockTypes = BlockTypes.SectionHeader
|
| 9 |
+
heading_level: Optional[int] = None
|
| 10 |
|
| 11 |
def assemble_html(self, child_blocks, parent_structure):
|
| 12 |
if self.ignore_for_output:
|
marker/util.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import inspect
|
|
|
|
| 2 |
from importlib import import_module
|
| 3 |
from typing import List
|
| 4 |
|
|
@@ -56,7 +57,7 @@ def parse_range_str(range_str: str) -> List[int]:
|
|
| 56 |
page_lst += list(range(int(start), int(end) + 1))
|
| 57 |
else:
|
| 58 |
page_lst.append(int(i))
|
| 59 |
-
page_lst = sorted(list(set(page_lst)))
|
| 60 |
return page_lst
|
| 61 |
|
| 62 |
|
|
|
|
| 1 |
import inspect
|
| 2 |
+
import re
|
| 3 |
from importlib import import_module
|
| 4 |
from typing import List
|
| 5 |
|
|
|
|
| 57 |
page_lst += list(range(int(start), int(end) + 1))
|
| 58 |
else:
|
| 59 |
page_lst.append(int(i))
|
| 60 |
+
page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order
|
| 61 |
return page_lst
|
| 62 |
|
| 63 |
|