Vik Paruchuri committed on
Commit · ae2424e
Parent(s): 2ad7f6b

Add in surya OCR

Browse files:
- README.md +5 -3
- benchmark.py +1 -1
- convert.py +7 -1
- marker/cleaners/code.py +2 -1
- marker/cleaners/equations.py +7 -18
- marker/cleaners/headers.py +2 -1
- marker/cleaners/table.py +3 -2
- marker/convert.py +42 -65
- marker/debug/data.py +1 -2
- marker/layout/layout.py +0 -0
- marker/layout/order.py +0 -0
- marker/markdown.py +2 -1
- marker/models.py +51 -8
- marker/ocr/detection.py +22 -0
- marker/ocr/heuristics.py +71 -0
- marker/ocr/lang.py +14 -0
- marker/ocr/page.py +0 -75
- marker/ocr/recognition.py +136 -0
- marker/ocr/utils.py +0 -36
- marker/ordering.py +1 -4
- marker/{extract_text.py → pdf/extract_text.py} +17 -40
- marker/pdf/filetype.py +21 -0
- marker/{bbox.py → schema/bbox.py} +43 -14
- marker/schema/page.py +68 -0
- marker/{schema.py → schema/schema.py} +2 -103
- marker/segmentation.py +3 -4
- marker/settings.py +25 -29
README.md
CHANGED
@@ -123,17 +123,19 @@ python convert.py /path/to/input/folder /path/to/output/folder --workers 10 --ma

 - `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK` if you're using GPU.
 - `--max` is the maximum number of pdfs to convert. Omit this to convert all pdfs in the folder.
-- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. If not, `DEFAULT_LANG` will be used. The format is:
 - `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
+- `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. If not, `DEFAULT_LANG` will be used. The format is:

 ```
 {
-  "pdf1.pdf": {"
-  "pdf2.pdf": {"
+  "pdf1.pdf": {"languages": ["English"]},
+  "pdf2.pdf": {"languages": ["Spanish", "Russian"]},
 ...
 }
 ```

+You can use language names or codes. See [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py) for a full list.
+
 ## Convert multiple files on multiple GPUs

 Run `chunk_convert.sh`, like this:
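The metadata file is plain JSON, so it can be generated programmatically. A minimal sketch (file names and the output path are hypothetical):

```
# Build a metadata file in the format shown above, then pass it to
# convert.py via --metadata_file.
import json

metadata = {
    "pdf1.pdf": {"languages": ["English"]},
    "pdf2.pdf": {"languages": ["Spanish", "Russian"]},
}
with open("metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)
# then: python convert.py in_folder out_folder --metadata_file metadata.json
```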
benchmark.py
CHANGED
@@ -9,7 +9,7 @@ from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
 from marker.models import load_all_models
 from marker.benchmark.scoring import score_text
-from marker.extract_text import naive_get_text
+from marker.pdf.extract_text import naive_get_text
 import json
 import os
 import subprocess
convert.py
CHANGED
@@ -6,7 +6,9 @@ import ray
 from tqdm import tqdm
 import math

-from marker.convert import convert_single_pdf
+from marker.convert import convert_single_pdf
+from marker.pdf.filetype import find_filetype
+from marker.pdf.extract_text import get_length_of_text
 from marker.models import load_all_models
 from marker.settings import settings
 from marker.logger import configure_logging

@@ -28,6 +30,10 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
     # This can indicate that they were scanned, and not OCRed properly
     # Usually these files are not recent/high-quality
     if min_length:
+        filetype = find_filetype(fname)
+        if filetype == "other":
+            return 0
+
         length = get_length_of_text(fname)
         if length < min_length:
             return
marker/cleaners/code.py
CHANGED
@@ -1,4 +1,5 @@
-from marker.schema import Span, Line
+from marker.schema.schema import Span, Line
+from marker.schema.page import Page
 import re
 from typing import List

marker/cleaners/equations.py
CHANGED
@@ -1,31 +1,20 @@
-import io
 from copy import deepcopy
-from functools import partial
 from typing import List

-import torch
 from texify.inference import batch_inference
-
-from texify.model.processor import load_processor
-import re
 from PIL import Image, ImageDraw

-from marker.bbox import should_merge_blocks, merge_boxes
+from marker.schema.bbox import should_merge_blocks, merge_boxes
 from marker.debug.data import dump_equation_debug_data
 from marker.pdf.images import render_image
 from marker.settings import settings
-from marker.schema import
+from marker.schema.schema import Span, Line, Block, BlockType
+from marker.schema.page import Page
 import os

 os.environ["TOKENIZERS_PARALLELISM"] = "false"

-processor = load_processor()
-
-
-def load_texify_model():
-    texify_model = load_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
-    return texify_model
-

 def mask_bbox(png_image, bbox, selected_bboxes):
     mask = Image.new('L', png_image.size, 0)  # 'L' mode for grayscale

@@ -72,10 +61,10 @@ def get_latex_batched(images, reformat_region_lens, texify_model, batch_size):
     max_length = min(max_length, settings.TEXIFY_MODEL_MAX)
     max_length += settings.TEXIFY_TOKEN_BUFFER

-    model_output = batch_inference(images[min_idx:max_idx], texify_model, processor, max_tokens=max_length)
+    model_output = batch_inference(images[min_idx:max_idx], texify_model, texify_model.processor, max_tokens=max_length)

     for j, output in enumerate(model_output):
-        token_count = get_total_texify_tokens(output)
+        token_count = get_total_texify_tokens(output, texify_model.processor)
         if token_count >= max_length - 1:
             output = ""

@@ -84,7 +73,7 @@ def get_latex_batched(images, reformat_region_lens, texify_model, batch_size):
     return predictions


-def get_total_texify_tokens(text):
+def get_total_texify_tokens(text, processor):
     tokenizer = processor.tokenizer
     tokens = tokenizer(text)
     return len(tokens["input_ids"])
marker/cleaners/headers.py
CHANGED
@@ -6,7 +6,8 @@ from rapidfuzz import fuzz
 from sklearn.cluster import DBSCAN
 import numpy as np

-from marker.schema import
+from marker.schema.schema import FullyMergedBlock
+from marker.schema.page import Page
 from typing import List, Tuple

marker/cleaners/table.py
CHANGED
@@ -1,5 +1,6 @@
-from marker.bbox import merge_boxes
-from marker.schema import Line, Span, Block
+from marker.schema.bbox import merge_boxes
+from marker.schema.schema import Line, Span, Block
+from marker.schema.page import Page
 from copy import deepcopy
 from tabulate import tabulate
 from typing import List
marker/convert.py
CHANGED
@@ -2,61 +2,32 @@ import pypdfium2 as pdfium

 from marker.cleaners.table import merge_table_blocks, create_new_tables
 from marker.debug.data import dump_bbox_debug_data
-from marker.
+from marker.ocr.lang import replace_langs_with_codes, validate_langs
+from marker.ocr.detection import surya_detection
+from marker.ocr.recognition import run_ocr
+from marker.pdf.extract_text import get_text_blocks
 from marker.cleaners.headers import filter_header_footer, filter_common_titles
 from marker.cleaners.equations import replace_equations
 from marker.ordering import order_blocks
+from marker.pdf.filetype import find_filetype
 from marker.postprocessors.editor import edit_full_text
 from marker.segmentation import detect_document_block_types
 from marker.cleaners.code import identify_code_blocks, indent_blocks
 from marker.cleaners.bullets import replace_bullets
 from marker.markdown import merge_spans, merge_lines, get_full_text
-from marker.schema import
+from marker.schema.schema import BlockType
+from marker.schema.page import Page
 from typing import List, Dict, Tuple, Optional
 import re
-import magic
 from marker.settings import settings


-def find_filetype(fpath):
-    mimetype = magic.from_file(fpath).lower()
-
-    # Get extensions from mimetype
-    # The mimetype is not always consistent, so use in to check the most common formats
-    if "pdf" in mimetype:
-        return "pdf"
-    #elif "epub" in mimetype:
-    #    return "epub"
-    #elif "mobi" in mimetype:
-    #    return "mobi"
-    elif mimetype in settings.SUPPORTED_FILETYPES:
-        return settings.SUPPORTED_FILETYPES[mimetype]
-    else:
-        print(f"Found nonstandard filetype {mimetype}")
-        return "other"
-
-
 def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
     for i, page in enumerate(blocks):
         page_block_types = block_types[i]
         page.add_block_types(page_block_types)


-def get_length_of_text(fname: str) -> int:
-    filetype = find_filetype(fname)
-    if filetype == "other":
-        return 0
-
-    doc = pdfium.PdfDocument(fname)
-    full_text = ""
-    for page_idx in range(len(doc)):
-        page = doc.get_page(page_idx)
-        text_page = page.get_textpage()
-        full_text += text_page.get_text_bounded()
-
-    return len(full_text)
-
-
 def convert_single_pdf(
     fname: str,
     model_lst: List,

@@ -64,63 +35,69 @@ def convert_single_pdf(
     metadata: Optional[Dict]=None,
     parallel_factor: int = 1
 ) -> Tuple[str, Dict]:
+    # Set language needed for OCR
+    langs = [settings.DEFAULT_LANG]
     if metadata:
+        langs = metadata.get("languages", langs)

-    spell_lang = settings.SPELLCHECK_LANGUAGES.get(lang, None)
-    if "eng" not in tess_lang:
-        tess_lang = f"eng+{tess_lang}"
-
-    # Output metadata
-    out_meta = {"language": lang}
+    langs = replace_langs_with_codes(langs)
+    validate_langs(langs)

+    # Find the filetype
     filetype = find_filetype(fname)
-    if filetype == "other":
-        return "", out_meta

+    # Setup output metadata
+    out_meta = {
+        "languages": langs,
+        "filetype": filetype,
+    }
+
+    if filetype == "other": # We can't process this file
+        return "", out_meta
+
+    # Get initial text blocks from the pdf
     doc = pdfium.PdfDocument(fname)
+    pages, toc = get_text_blocks(
         doc,
-        tess_lang,
-        spell_lang,
         max_pages=max_pages,
-        parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
     )
+    out_meta.update({
+        "toc": toc,
+        "pages": len(pages),
+    })
+
+    # Unpack models from list
+    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
+
+    # Identify text lines on pages
+    surya_detection(doc, pages, detection_model)
+
+    # OCR pages as needed
+    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, parallel_factor)

-    if len([b for p in
+    if len([b for p in pages for b in p.blocks]) == 0:
         print(f"Could not extract any text blocks for {fname}")
         return "", out_meta

-    # Unpack models from list
-    texify_model, layoutlm_model, order_model, edit_model = model_lst
-
     block_types = detect_document_block_types(
         doc,
+        pages,
         layoutlm_model,
         batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
     )

     # Find headers and footers
-    bad_span_ids = filter_header_footer(
+    bad_span_ids = filter_header_footer(pages)
     out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}

-    annotate_spans(
+    annotate_spans(pages, block_types)

     # Dump debug data if flags are set
-    dump_bbox_debug_data(doc,
+    dump_bbox_debug_data(doc, pages)

     blocks = order_blocks(
         doc,
+        pages,
         order_model,
         batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
     )
marker/debug/data.py
CHANGED
@@ -1,11 +1,10 @@
 import base64
 import json
 import os
-import zlib
 from typing import List

 from marker.pdf.images import render_image
-from marker.schema import Page
+from marker.schema.page import Page
 from marker.settings import settings
 from PIL import Image
 import io
marker/layout/layout.py
ADDED
File without changes

marker/layout/order.py
ADDED
File without changes
marker/markdown.py
CHANGED
@@ -1,4 +1,5 @@
-from marker.schema import MergedLine, MergedBlock, FullyMergedBlock
+from marker.schema.schema import MergedLine, MergedBlock, FullyMergedBlock
+from marker.schema.page import Page
 import re
 from typing import List

marker/models.py
CHANGED
@@ -1,13 +1,56 @@
-from marker.cleaners.equations import load_texify_model
-from marker.ordering import load_ordering_model
 from marker.postprocessors.editor import load_editing_model
-from
+from surya.model.detection import segformer
+from texify.model.model import load_model as load_texify_model
+from texify.model.processor import load_processor as load_texify_processor
+from marker.settings import settings
+from surya.model.recognition.model import load_model as load_recognition_model
+from surya.model.recognition.processor import load_processor as load_recognition_processor
+from surya.model.ordering.model import load_model as load_order_model
+from surya.model.ordering.processor import load_processor as load_order_processor


-def
+def setup_recognition_model(langs):
+    rec_model = load_recognition_model(langs=langs)
+    rec_processor = load_recognition_processor()
+    rec_model.processor = rec_processor
+    return rec_model
+
+
+def setup_detection_model():
+    model = segformer.load_model()
+    processor = segformer.load_processor()
+    model.processor = processor
+    return model
+
+
+def setup_texify_model():
+    texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
+    texify_processor = load_texify_processor()
+    texify_model.processor = texify_processor
+    return texify_model
+
+
+def setup_layout_model():
+    model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
+    processor = segformer.load_processor()
+    model.processor = processor
+    return model
+
+
+def setup_order_model():
+    model = load_order_model()
+    processor = load_order_processor()
+    model.processor = processor
+    return model
+
+
+def load_all_models(langs=None):
+    # langs is optional list of languages to prune from recognition MoE model
+    detection = setup_detection_model()
+    layout = setup_layout_model()
+    order = setup_order_model()
     edit = load_editing_model()
-    model_lst = [texify, layout, order, edit]
+    ocr = setup_recognition_model(langs)
+    texify = setup_texify_model()
+    model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst
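For orientation, a sketch of how the new six-element model list is produced and consumed, based on load_all_models above and the unpacking in marker/convert.py from this commit (the pdf path is hypothetical, and the remaining keyword arguments of convert_single_pdf are assumed to keep their defaults):

```
from marker.models import load_all_models
from marker.convert import convert_single_pdf

# Passing langs prunes the recognition MoE model to those languages.
model_lst = load_all_models(langs=["en"])
texify, layout, order, edit, detection, ocr = model_lst  # new 6-model order

full_text, out_meta = convert_single_pdf(
    "example.pdf", model_lst, metadata={"languages": ["English"]}
)
```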
marker/ocr/detection.py
ADDED
@@ -0,0 +1,22 @@
+from typing import List
+
+from pypdfium2 import PdfDocument
+from surya.detection import batch_text_detection
+
+from marker.pdf.images import render_image
+from marker.schema.page import Page
+from marker.settings import settings
+
+
+def surya_detection(doc: PdfDocument, pages: List[Page], det_model):
+    processor = det_model.processor
+    max_len = min(len(pages), len(doc))
+    images = [render_image(doc[pnum], dpi=settings.SURYA_DETECTOR_DPI) for pnum in range(max_len)]
+
+    predictions = batch_text_detection(images, det_model, processor)
+    for (page, pred) in zip(pages, predictions):
+        page.text_lines = pred
+
+
+
+
marker/ocr/heuristics.py
ADDED
@@ -0,0 +1,71 @@
+import re
+from typing import List
+
+from nltk import wordpunct_tokenize
+
+from marker.ocr.utils import alphanum_ratio
+from marker.schema.page import Page
+from marker.settings import settings
+
+
+def should_ocr_page(page: Page, no_text: bool):
+    detected_lines_found = detected_line_coverage(page)
+
+    # OCR page if we got minimal text, or if we got too many spaces
+    conditions = [
+        no_text, # Full doc has no text, and needs full OCR
+        (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)), # Bad OCR
+        detected_lines_found is False, # didn't extract text for all detected lines
+    ]
+
+    return any(conditions) or settings.OCR_ALL_PAGES
+
+
+def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
+    if len(text) == 0:
+        # Assume OCR failed if we have no text
+        return True
+
+    words = wordpunct_tokenize(text)
+    words = [w for w in words if w.strip()]
+    alpha_words = [word for word in words if word.isalnum()]
+
+    spaces = len(re.findall(r'\s+', text))
+    alpha_chars = len(re.sub(r'\s+', '', text))
+    if spaces / (alpha_chars + spaces) > space_threshold:
+        return True
+
+    newlines = len(re.findall(r'\n+', text))
+    non_newlines = len(re.sub(r'\n+', '', text))
+    if newlines / (newlines + non_newlines) > newline_threshold:
+        return True
+
+    if alphanum_ratio(text) < alphanum_threshold: # Garbled text
+        return True
+
+    invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
+    if invalid_chars > max(3.0, len(text) * .02):
+        return True
+
+    return False
+
+
+def no_text_found(pages: List[Page]):
+    full_text = ""
+    for page in pages:
+        full_text += page.text
+    return len(full_text.strip()) < 10
+
+
+def detected_line_coverage(page: Page, intersect_thresh=.6, detection_thresh=.5):
+    found_lines = 0
+    total_lines = 0
+    for detected_line in page.text_lines.bboxes:
+        detected_bbox = detected_line.bbox
+        for block in page.blocks:
+            for line in block.lines:
+                intersection_pct = line.intersection_pct(detected_bbox)
+                if intersection_pct > intersect_thresh:
+                    found_lines += 1
+        total_lines += 1
+    return found_lines / total_lines > detection_thresh
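A quick illustration of detect_bad_ocr as defined above (alphanum_ratio and settings.INVALID_CHARS live elsewhere in the package, so only checks that don't depend on them are exercised here):

```
from marker.ocr.heuristics import detect_bad_ocr

# Empty text is treated as failed OCR outright.
assert detect_bad_ocr("") is True

# Whitespace-dominated text trips the space-ratio check:
# 2 whitespace runs vs. 1 non-whitespace char -> 2/3 > 0.6 threshold.
assert detect_bad_ocr("\n\na\n\n") is True
```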
marker/ocr/lang.py
ADDED
@@ -0,0 +1,14 @@
+from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
+
+
+def replace_langs_with_codes(langs):
+    for i, lang in enumerate(langs):
+        if lang in LANGUAGE_TO_CODE:
+            langs[i] = LANGUAGE_TO_CODE[lang]
+    return langs
+
+
+def validate_langs(langs):
+    for lang in langs:
+        if lang not in CODE_TO_LANGUAGE:
+            raise ValueError(f"Invalid language code {lang}")
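Usage sketch for these helpers; the exact code strings come from surya's LANGUAGE_TO_CODE table, so the values shown in the comments are assumptions:

```
from marker.ocr.lang import replace_langs_with_codes, validate_langs

langs = replace_langs_with_codes(["English", "Spanish"])  # e.g. ["en", "es"]
validate_langs(langs)           # silent when every entry is a known code
validate_langs(["not-a-code"])  # raises ValueError: Invalid language code not-a-code
```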
marker/ocr/page.py
DELETED
@@ -1,75 +0,0 @@
-import io
-from typing import List, Optional
-
-import ocrmypdf
-
-from marker.ocr.utils import detect_bad_ocr
-from marker.schema import Block
-from marker.settings import settings
-
-ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
-
-
-def ocr_entire_page(page, lang: str) -> List[Block]:
-    if settings.OCR_ENGINE == "tesseract":
-        return ocr_entire_page_tess(page, lang)
-    elif settings.OCR_ENGINE == "ocrmypdf":
-        return ocr_entire_page_ocrmp(page, lang)
-    else:
-        raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
-
-
-def ocr_entire_page_tess(page, lang: str) -> List[Block]:
-    try:
-        full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
-        blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
-        full_text = page.get_text("text", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)
-
-        if len(full_text) == 0:
-            return []
-
-        # Check if OCR worked. If it didn't, return empty list
-        # OCR can fail if there is a scanned blank page with some faint text impressions, for example
-        if detect_bad_ocr(full_text):
-            return []
-    except RuntimeError:
-        return []
-    return blocks
-
-
-def ocr_entire_page_ocrmp(page, lang: str) -> List[Block]:
-    # Use ocrmypdf to get OCR text for the whole page
-    src = page.parent  # the page's document
-    blank_doc = pymupdf.open()  # make temporary 1-pager
-    blank_doc.insert_pdf(src, from_page=page.number, to_page=page.number, annots=False, links=False)
-    pdfbytes = blank_doc.tobytes()
-    inbytes = io.BytesIO(pdfbytes)  # transform to BytesIO object
-    outbytes = io.BytesIO()  # let ocrmypdf store its result pdf here
-    ocrmypdf.ocr(
-        inbytes,
-        outbytes,
-        language=lang,
-        output_type="pdf",
-        redo_ocr=None if settings.OCR_ALL_PAGES else True,
-        force_ocr=True if settings.OCR_ALL_PAGES else None,
-        progress_bar=False,
-        optimize=False,
-        fast_web_view=1e6,
-        skip_big=15,  # skip images larger than 15 megapixels
-        tesseract_timeout=settings.TESSERACT_TIMEOUT,
-        tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
-    )
-    ocr_pdf = pymupdf.open("pdf", outbytes.getvalue())  # read output as fitz PDF
-    blocks = ocr_pdf[0].get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]
-    full_text = ocr_pdf[0].get_text("text", sort=True, flags=settings.TEXT_FLAGS)
-
-    # Make sure the original pdf/epub/mobi bbox and the ocr pdf bbox are the same
-    assert page.bound() == ocr_pdf[0].bound()
-
-    if len(full_text) == 0:
-        return []
-
-    if detect_bad_ocr(full_text):
-        return []
-
-    return blocks
marker/ocr/recognition.py
ADDED
@@ -0,0 +1,136 @@
+from itertools import repeat
+from typing import List, Optional, Dict
+
+import ocrmypdf
+import pypdfium2 as pdfium
+import io
+from concurrent.futures import ThreadPoolExecutor
+
+from surya.ocr import run_recognition
+
+from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
+from marker.pdf.images import render_image
+from marker.schema.page import Page
+from marker.schema.schema import Block, Line, Span
+from marker.settings import settings
+from marker.pdf.extract_text import get_text_blocks
+
+
+def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor) -> (List[Page], Dict):
+    ocr_pages = 0
+    ocr_success = 0
+    ocr_failed = 0
+    no_text = no_text_found(pages)
+    ocr_idxs = []
+    for pnum, page in enumerate(pages):
+        ocr_needed = should_ocr_page(page, no_text)
+        if ocr_needed:
+            ocr_idxs.append(pnum)
+            ocr_pages += 1
+
+    ocr_method = settings.OCR_ENGINE_INTERNAL
+    if ocr_method == "surya":
+        new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
+    else:
+        new_pages = tesseract_recognition(doc, ocr_idxs, langs)
+
+    for orig_idx, page in zip(ocr_idxs, new_pages):
+        if detect_bad_ocr(page) or len(page.prelim_text) == 0:
+            ocr_failed += 1
+        else:
+            ocr_success += 1
+            pages[orig_idx] = page
+
+    return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
+
+
+def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
+    images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
+    processor = rec_model.processor
+    selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]
+
+    surya_langs = [langs] * len(page_idxs)
+    detection_results = [p.text_lines.bboxes for p in selected_pages]
+    polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
+
+    results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons)
+
+    new_pages = []
+    for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):
+        text_lines = old_page.text_lines
+        ocr_results = result.text_lines
+        blocks = []
+        for i, line in enumerate(ocr_results):
+            block = Block(
+                bbox=line.bbox,
+                pnum=page_idx,
+                lines=[Line(
+                    bbox=line.bbox,
+                    spans=[Span(
+                        text=line.text,
+                        bbox=line.bbox,
+                        span_id=f"{page_idx}_{i}",
+                        font="",
+                        font_weight=0,
+                        font_size=0,
+                    )
+                    ]
+                )]
+            )
+            blocks.append(block)
+        page = Page(
+            blocks=blocks,
+            pnum=page_idx,
+            bbox=old_page.bbox,
+            rotation=old_page.rotation,
+            text_lines=text_lines
+        )
+        new_pages.append(page)
+    return new_pages
+
+
+def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
+    pdf_pages = generate_single_page_pdfs(doc, page_idxs)
+    with ThreadPoolExecutor(max_workers=settings.OCR_THREADS) as executor:
+        pages = list(executor.map(_tesseract_recognition, pdf_pages, repeat(langs, len(pdf_pages))))
+
+    return pages
+
+
+def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
+    pdf_pages = []
+    for page_idx in page_idxs:
+        blank_doc = pdfium.PdfDocument.new()
+        blank_doc.import_pages(doc, pages=[page_idx])
+        assert len(blank_doc) == 1, "Failed to import page"
+
+        in_pdf = io.BytesIO()
+        blank_doc.save(in_pdf)
+        in_pdf.seek(0)
+        pdf_pages.append(in_pdf)
+    return pdf_pages
+
+
+def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
+    out_pdf = io.BytesIO()
+
+    ocrmypdf.ocr(
+        in_pdf,
+        out_pdf,
+        language=langs[0],
+        output_type="pdf",
+        redo_ocr=None if settings.OCR_ALL_PAGES else True,
+        force_ocr=True if settings.OCR_ALL_PAGES else None,
+        progress_bar=False,
+        optimize=False,
+        fast_web_view=1e6,
+        skip_big=15,  # skip images larger than 15 megapixels
+        tesseract_timeout=settings.TESSERACT_TIMEOUT,
+        tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
+    )
+
+    new_doc = pdfium.PdfDocument(out_pdf.getvalue())
+
+    blocks, _ = get_text_blocks(new_doc, max_pages=1)
+    page = blocks[0]
+    return page
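Putting the new OCR pieces together: detection must populate page.text_lines before run_ocr is called, since both the should_ocr_page heuristic and surya recognition read from it. A sketch under that assumption (model loading as in marker/models.py; the pdf path is hypothetical):

```
import pypdfium2 as pdfium

from marker.models import load_all_models
from marker.pdf.extract_text import get_text_blocks
from marker.ocr.detection import surya_detection
from marker.ocr.recognition import run_ocr

texify, layout, order, edit, detection, ocr = load_all_models(langs=["en"])
doc = pdfium.PdfDocument("scanned.pdf")
pages, toc = get_text_blocks(doc)

surya_detection(doc, pages, detection)  # attaches text_lines to each page
pages, ocr_stats = run_ocr(doc, pages, ["en"], ocr, parallel_factor=1)
print(ocr_stats)  # {"ocr_pages": ..., "ocr_failed": ..., "ocr_success": ...}
```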
marker/ocr/utils.py
CHANGED
@@ -1,39 +1,3 @@
-from typing import Optional
-
-from nltk import wordpunct_tokenize
-from marker.settings import settings
-import re
-
-
-def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
-    if len(text) == 0:
-        # Assume OCR failed if we have no text
-        return True
-
-    words = wordpunct_tokenize(text)
-    words = [w for w in words if w.strip()]
-    alpha_words = [word for word in words if word.isalnum()]
-
-    spaces = len(re.findall(r'\s+', text))
-    alpha_chars = len(re.sub(r'\s+', '', text))
-    if spaces / (alpha_chars + spaces) > space_threshold:
-        return True
-
-    newlines = len(re.findall(r'\n+', text))
-    non_newlines = len(re.sub(r'\n+', '', text))
-    if newlines / (newlines + non_newlines) > newline_threshold:
-        return True
-
-    if alphanum_ratio(text) < alphanum_threshold: # Garbled text
-        return True
-
-    invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
-    if invalid_chars > max(3.0, len(text) * .02):
-        return True
-
-    return False
-
-
 def font_flags_decomposer(flags):
     flags = int(flags)

marker/ordering.py
CHANGED
@@ -2,14 +2,11 @@ from copy import deepcopy
 from typing import List

 import torch
-import sys, os

 from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
-from PIL import Image
-import io

 from marker.pdf.images import render_image
-from marker.schema import Page
+from marker.schema.page import Page
 from marker.settings import settings

 processor = LayoutLMv3Processor.from_pretrained(settings.ORDERER_MODEL_NAME)
marker/{extract_text.py → pdf/extract_text.py}
RENAMED
@@ -1,17 +1,21 @@
 import os
-from typing import List, Optional
+from typing import List, Optional, Dict

+import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i

-from marker.
+from marker.pdf.filetype import find_filetype
+from marker.ocr.utils import font_flags_decomposer
+from marker.ocr.heuristics import detect_bad_ocr
 from marker.settings import settings
-from marker.schema import Span, Line, Block
+from marker.schema.schema import Span, Line, Block
+from marker.schema.page import Page
 from pdftext.extraction import dictionary_output

 os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX


-def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
+def pdftext_format_to_blocks(page, pnum: int) -> Page:
     page_blocks = []
     span_id = 0
     for block_idx, block in enumerate(page["blocks"]):

@@ -54,42 +58,8 @@ def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
     return out_page


-def
-    ocr_pages = 0
-    ocr_success = 0
-    ocr_failed = 0
-    page_bbox = doc[pnum].bound()
-
-    blocks = get_single_page_blocks(doc, pnum, tess_lang)
-    page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
-
-    # OCR page if we got minimal text, or if we got too many spaces
-    conditions = [
-        (
-            no_text  # Full doc has no text, and needs full OCR
-            or
-            (len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text))  # Bad OCR
-        ),
-        min_ocr_page < pnum < len(doc) - 1,
-        not disable_ocr
-    ]
-    if all(conditions) or settings.OCR_ALL_PAGES:
-        page = doc[pnum]
-        blocks = get_single_page_blocks(doc, pnum, tess_lang, ocr=True)
-        page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
-        ocr_pages = 1
-        if len(blocks) == 0:
-            ocr_failed = 1
-        else:
-            ocr_success = 1
-    return page_obj, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
-
-
-def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
+def get_text_blocks(doc, max_pages: Optional[int] = None) -> (List[Page], Dict):
     toc = get_toc(doc)
-    ocr_pages = 0
-    ocr_failed = 0
-    ocr_success = 0

     page_range = range(len(doc))
     if max_pages:

@@ -99,7 +69,7 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: O
     all_blocks = dictionary_output(doc, page_range=page_range)
     all_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(all_blocks)]

-    return all_blocks, toc
+    return all_blocks, toc

 def naive_get_text(doc):

@@ -126,3 +96,10 @@ def get_toc(doc, max_depth=15):
     }
     toc_list.append(list_item)
     return toc_list
+
+
+def get_length_of_text(fname: str) -> int:
+    doc = pdfium.PdfDocument(fname)
+    text = naive_get_text(doc).strip()
+
+    return len(text)
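get_length_of_text is what convert.py now uses for the --min_length check; a usage sketch (the path and threshold are hypothetical):

```
from marker.pdf.extract_text import get_length_of_text

# Number of characters pdftext can extract without OCR; convert.py skips
# the file when this falls below --min_length.
if get_length_of_text("scan.pdf") < 10000:
    print("mostly images - would be skipped at min_length=10000")
```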
marker/pdf/filetype.py
ADDED
@@ -0,0 +1,21 @@
+import magic
+
+from marker.settings import settings
+
+
+def find_filetype(fpath):
+    mimetype = magic.from_file(fpath).lower()
+
+    # Get extensions from mimetype
+    # The mimetype is not always consistent, so use in to check the most common formats
+    if "pdf" in mimetype:
+        return "pdf"
+    #elif "epub" in mimetype:
+    #    return "epub"
+    #elif "mobi" in mimetype:
+    #    return "mobi"
+    elif mimetype in settings.SUPPORTED_FILETYPES:
+        return settings.SUPPORTED_FILETYPES[mimetype]
+    else:
+        print(f"Found nonstandard filetype {mimetype}")
+        return "other"
marker/{bbox.py → schema/bbox.py}
RENAMED
@@ -1,3 +1,8 @@
+from typing import List
+
+from pydantic import BaseModel, field_validator
+
+
 def should_merge_blocks(box1, box2, tol=5):
     # Within tol y px, and to the right within tol px
     merge = [

@@ -18,7 +23,7 @@ def boxes_intersect(box1, box2):
     return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]


-def boxes_intersect_pct(box1, box2, pct=.9):
+def box_intersection_pct(box1, box2):
     # determine the coordinates of the intersection rectangle
     x_left = max(box1[0], box2[0])
     y_top = max(box1[1], box2[1])

@@ -28,16 +33,11 @@ def boxes_intersect_pct(box1, box2, pct=.9):
     if x_right < x_left or y_bottom < y_top:
         return 0.0

-    # The intersection of two axis-aligned bounding boxes is always an
-    # axis-aligned bounding box
     intersection_area = (x_right - x_left) * (y_bottom - y_top)
-
-    # compute the area of both AABBs
     bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
-    bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

-    iou = intersection_area /
-    return iou
+    iou = intersection_area / bb1_area
+    return iou


 def multiple_boxes_intersect(box1, boxes):

@@ -47,15 +47,44 @@ def multiple_boxes_intersect(box1, boxes):
     return False


-def box_contained(box1, box2):
-    # Box1 inside box2
-    return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
-
-
 def unnormalize_box(bbox, width, height):
     return [
         width * (bbox[0] / 1000),
         height * (bbox[1] / 1000),
         width * (bbox[2] / 1000),
         height * (bbox[3] / 1000),
-    ]
+    ]
+
+
+class BboxElement(BaseModel):
+    bbox: List[float]
+
+    @field_validator('bbox')
+    @classmethod
+    def check_4_elements(cls, v: List[float]) -> List[float]:
+        if len(v) != 4:
+            raise ValueError('bbox must have 4 elements')
+        return v
+
+    @property
+    def height(self):
+        return self.bbox[3] - self.bbox[1]
+
+    @property
+    def width(self):
+        return self.bbox[2] - self.bbox[0]
+
+    @property
+    def x_start(self):
+        return self.bbox[0]
+
+    @property
+    def y_start(self):
+        return self.bbox[1]
+
+    @property
+    def area(self):
+        return self.width * self.height
+
+    def intersection_pct(self, other_bbox: List[float]):
+        return box_intersection_pct(self.bbox, other_bbox)
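Note that box_intersection_pct is asymmetric: it divides by box1's area (despite the iou variable name), so it measures how much of box1 is covered by box2 rather than true IoU. A small check:

```
from marker.schema.bbox import box_intersection_pct

box1 = [0, 0, 10, 10]
box2 = [0, 0, 5, 10]
print(box_intersection_pct(box1, box2))  # 0.5 - half of box1 overlaps box2
print(box_intersection_pct(box2, box1))  # 1.0 - box2 lies entirely inside box1
```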
marker/schema/page.py
ADDED
@@ -0,0 +1,68 @@
+from collections import Counter
+from typing import List, Optional
+
+from marker.schema.bbox import BboxElement
+from marker.schema.schema import Block, Span
+from surya.schema import TextDetectionResult
+
+
+class Page(BboxElement):
+    blocks: List[Block]
+    pnum: int
+    column_count: Optional[int] = None
+    rotation: Optional[int] = None # Rotation degrees of the page
+    text_lines: Optional[TextDetectionResult] = None
+
+    def get_nonblank_lines(self):
+        lines = self.get_all_lines()
+        nonblank_lines = [l for l in lines if l.prelim_text.strip()]
+        return nonblank_lines
+
+    def get_all_lines(self):
+        lines = [l for b in self.blocks for l in b.lines]
+        return lines
+
+    def get_nonblank_spans(self) -> List[Span]:
+        lines = [l for b in self.blocks for l in b.lines]
+        spans = [s for l in lines for s in l.spans if s.text.strip()]
+        return spans
+
+    def add_block_types(self, page_block_types):
+        if len(page_block_types) != len(self.get_all_lines()):
+            print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
+
+        i = 0
+        for block in self.blocks:
+            for line in block.lines:
+                if i < len(page_block_types):
+                    line_block_type = page_block_types[i].block_type
+                else:
+                    line_block_type = "Text"
+                i += 1
+                for span in line.spans:
+                    span.block_type = line_block_type
+
+    def get_font_stats(self):
+        fonts = [s.font for s in self.get_nonblank_spans()]
+        font_counts = Counter(fonts)
+        return font_counts
+
+    def get_line_height_stats(self):
+        heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
+        height_counts = Counter(heights)
+        return height_counts
+
+    def get_line_start_stats(self):
+        starts = [l.bbox[0] for l in self.get_nonblank_lines()]
+        start_counts = Counter(starts)
+        return start_counts
+
+    def get_min_line_start(self):
+        starts = [l.bbox[0] for l in self.get_nonblank_lines() if l.spans[0].block_type == "Text"]
+        if len(starts) == 0:
+            raise IndexError("No lines found")
+        return min(starts)
+
+    @property
+    def prelim_text(self):
+        return "\n".join([b.prelim_text for b in self.blocks])
marker/{schema.py → schema/schema.py}
RENAMED
@@ -1,53 +1,13 @@
 from collections import Counter
-from typing import List, Optional
+from typing import List, Optional

 from pydantic import BaseModel, field_validator
 import ftfy

-from marker.bbox import
+from marker.schema.bbox import multiple_boxes_intersect, BboxElement
 from marker.settings import settings


-def find_span_type(span, page_blocks):
-    block_type = "Text"
-    for block in page_blocks:
-        if boxes_intersect_pct(span.bbox, block.bbox):
-            block_type = block.block_type
-            break
-    return block_type
-
-
-class BboxElement(BaseModel):
-    bbox: List[float]
-
-    @field_validator('bbox')
-    @classmethod
-    def check_4_elements(cls, v: List[float]) -> List[float]:
-        if len(v) != 4:
-            raise ValueError('bbox must have 4 elements')
-        return v
-
-    @property
-    def height(self):
-        return self.bbox[3] - self.bbox[1]
-
-    @property
-    def width(self):
-        return self.bbox[2] - self.bbox[0]
-
-    @property
-    def x_start(self):
-        return self.bbox[0]
-
-    @property
-    def y_start(self):
-        return self.bbox[1]
-
-    @property
-    def area(self):
-        return self.width * self.height
-
-
 class BlockType(BboxElement):
     block_type: str

@@ -59,7 +19,6 @@ class Span(BboxElement):
     font_weight: float
     font_size: float
     block_type: Optional[str] = None
-    selected: bool = True


     @field_validator('text')

@@ -128,66 +87,6 @@ class Block(BboxElement):
         span.block_type = block_type


-class Page(BboxElement):
-    blocks: List[Block]
-    pnum: int
-    column_count: Optional[int] = None
-    rotation: Optional[int] = None # Rotation degrees of the page
-
-    def get_nonblank_lines(self):
-        lines = self.get_all_lines()
-        nonblank_lines = [l for l in lines if l.prelim_text.strip()]
-        return nonblank_lines
-
-    def get_all_lines(self):
-        lines = [l for b in self.blocks for l in b.lines]
-        return lines
-
-    def get_nonblank_spans(self) -> List[Span]:
-        lines = [l for b in self.blocks for l in b.lines]
-        spans = [s for l in lines for s in l.spans if s.text.strip()]
-        return spans
-
-    def add_block_types(self, page_block_types):
-        if len(page_block_types) != len(self.get_all_lines()):
-            print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
-
-        i = 0
-        for block in self.blocks:
-            for line in block.lines:
-                if i < len(page_block_types):
-                    line_block_type = page_block_types[i].block_type
-                else:
-                    line_block_type = "Text"
-                i += 1
-                for span in line.spans:
-                    span.block_type = line_block_type
-
-    def get_font_stats(self):
-        fonts = [s.font for s in self.get_nonblank_spans()]
-        font_counts = Counter(fonts)
-        return font_counts
-
-    def get_line_height_stats(self):
-        heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
-        height_counts = Counter(heights)
-        return height_counts
-
-    def get_line_start_stats(self):
-        starts = [l.bbox[0] for l in self.get_nonblank_lines()]
-        start_counts = Counter(starts)
-        return start_counts
-
-    def get_min_line_start(self):
-        starts = [l.bbox[0] for l in self.get_nonblank_lines() if l.spans[0].block_type == "Text"]
-        if len(starts) == 0:
-            raise IndexError("No lines found")
-        return min(starts)
-
-    @property
-    def prelim_text(self):
-        return "\n".join([b.prelim_text for b in self.blocks])
-
 class MergedLine(BboxElement):
     text: str
     fonts: List[str]
marker/segmentation.py
CHANGED
|
@@ -1,18 +1,17 @@
|
|
| 1 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
from transformers import LayoutLMv3ForTokenClassification
|
| 5 |
|
| 6 |
-
from marker.bbox import unnormalize_box
|
| 7 |
from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
|
| 8 |
-
import io
|
| 9 |
from PIL import Image
|
| 10 |
from transformers import LayoutLMv3Processor
|
| 11 |
import numpy as np
|
| 12 |
|
| 13 |
from marker.pdf.images import render_image
|
| 14 |
from marker.settings import settings
|
| 15 |
-
from marker.schema import
|
|
|
|
| 16 |
import torch
|
| 17 |
from math import isclose
|
| 18 |
|
|
|
|
|
|
|
| 1 |
from typing import List
|
| 2 |
|
| 3 |
from transformers import LayoutLMv3ForTokenClassification
|
| 4 |
|
| 5 |
+
from marker.schema.bbox import unnormalize_box
|
| 6 |
from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
|
|
|
|
| 7 |
from PIL import Image
|
| 8 |
from transformers import LayoutLMv3Processor
|
| 9 |
import numpy as np
|
| 10 |
|
| 11 |
from marker.pdf.images import render_image
|
| 12 |
from marker.settings import settings
|
| 13 |
+
from marker.schema.schema import BlockType
|
| 14 |
+
from marker.schema.page import Page
|
| 15 |
import torch
|
| 16 |
from math import isclose
|
| 17 |
|
marker/settings.py
CHANGED
|
@@ -37,38 +37,34 @@ class Settings(BaseSettings):
|
|
| 37 |
#"application/x-fictionbook+xml": "fb2"
|
| 38 |
}
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# OCR
|
| 41 |
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
|
| 42 |
-
|
| 43 |
-
TESSDATA_PREFIX: str = ""
|
| 44 |
-
TESSERACT_LANGUAGES: Dict = {
|
| 45 |
-
"English": "eng",
|
| 46 |
-
"Spanish": "spa",
|
| 47 |
-
"Portuguese": "por",
|
| 48 |
-
"French": "fra",
|
| 49 |
-
"German": "deu",
|
| 50 |
-
"Russian": "rus",
|
| 51 |
-
"Chinese": "chi_sim",
|
| 52 |
-
"Japanese": "jpn",
|
| 53 |
-
"Korean": "kor",
|
| 54 |
-
"Hindi": "hin",
|
| 55 |
-
}
|
| 56 |
-
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
|
| 57 |
-
SPELLCHECK_LANGUAGES: Dict = {
|
| 58 |
-
"English": "en",
|
| 59 |
-
"Spanish": "es",
|
| 60 |
-
"Portuguese": "pt",
|
| 61 |
-
"French": "fr",
|
| 62 |
-
"German": "de",
|
| 63 |
-
"Russian": "ru",
|
| 64 |
-
"Chinese": None,
|
| 65 |
-
"Japanese": None,
|
| 66 |
-
"Korean": None,
|
| 67 |
-
"Hindi": None,
|
| 68 |
-
}
|
| 69 |
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# Texify model
|
| 74 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
|
@@ -82,7 +78,7 @@ class Settings(BaseSettings):
|
|
| 82 |
LAYOUT_MODEL_MAX: int = 512
|
| 83 |
LAYOUT_CHUNK_OVERLAP: int = 64
|
| 84 |
LAYOUT_DPI: int = 96
|
| 85 |
-
|
| 86 |
LAYOUT_BATCH_SIZE: int = 8 # Max 512 tokens means high batch size
|
| 87 |
|
| 88 |
# Ordering model
|
|
|
|
| 37 |
#"application/x-fictionbook+xml": "fb2"
|
| 38 |
}
|
| 39 |
|
| 40 |
+
# Text line Detection
|
| 41 |
+
DETECTOR_BATCH_SIZE: Optional[int] = None
|
| 42 |
+
SURYA_DETECTOR_DPI: int = 96
|
| 43 |
+
DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4
|
| 44 |
+
|
| 45 |
# OCR
|
| 46 |
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
|
| 47 |
+
OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
|
| 49 |
+
|
| 50 |
+
## Surya
|
| 51 |
+
SURYA_OCR_DPI: int = 96
|
| 52 |
+
RECOGNITION_BATCH_SIZE: Optional[int] = None # Batch size for surya OCR
|
| 53 |
+
|
| 54 |
+
## Tesseract
|
| 55 |
OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
|
| 56 |
+
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
|
| 57 |
+
|
| 58 |
+
@computed_field
|
| 59 |
+
def OCR_ENGINE_INTERNAL(self) -> str:
|
| 60 |
+
if self.OCR_ENGINE is not None:
|
| 61 |
+
return self.OCR_ENGINE
|
| 62 |
+
|
| 63 |
+
# Does not work with mps
|
| 64 |
+
if torch.cuda.is_available():
|
| 65 |
+
return "surya"
|
| 66 |
+
|
| 67 |
+
return "ocrmypdf"
|
| 68 |
|
| 69 |
# Texify model
|
| 70 |
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
|
|
|
|
| 78 |
LAYOUT_MODEL_MAX: int = 512
|
| 79 |
LAYOUT_CHUNK_OVERLAP: int = 64
|
| 80 |
LAYOUT_DPI: int = 96
|
| 81 |
+
LAYOUT_MODEL_CHECKPOINT: str = "vikp/layout_segmenter"
|
| 82 |
LAYOUT_BATCH_SIZE: int = 8 # Max 512 tokens means high batch size
|
| 83 |
|
| 84 |
# Ordering model
|