Vik Paruchuri committed
Commit ae2424e · 1 Parent(s): 2ad7f6b

Add in surya OCR
README.md CHANGED
@@ -123,17 +123,19 @@ python convert.py /path/to/input/folder /path/to/output/folder --workers 10 --ma
 
 - `--workers` is the number of pdfs to convert at once. This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK` if you're using GPU.
 - `--max` is the maximum number of pdfs to convert. Omit this to convert all pdfs in the folder.
- - `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. If not, `DEFAULT_LANG` will be used. The format is:
 - `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
+ - `--metadata_file` is an optional path to a json file with metadata about the pdfs. If you provide it, it will be used to set the language for each pdf. If not, `DEFAULT_LANG` will be used. The format is:
 
 ```
 {
- "pdf1.pdf": {"language": "English"},
- "pdf2.pdf": {"language": "Spanish"},
+ "pdf1.pdf": {"languages": ["English"]},
+ "pdf2.pdf": {"languages": ["Spanish", "Russian"]},
 ...
 }
 ```
 
+ You can use language names or codes. See [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py) for a full list.
+
 ## Convert multiple files on multiple GPUs
 
 Run `chunk_convert.sh`, like this:
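For reference, a minimal sketch of how the new per-pdf language metadata can be prepared before running the converter (the file and folder names here are hypothetical, not part of the commit):

```python
import json

# Language names or surya language codes both work after this change
metadata = {
    "pdf1.pdf": {"languages": ["English"]},
    "pdf2.pdf": {"languages": ["Spanish", "Russian"]},
}

with open("metadata.json", "w") as f:
    json.dump(metadata, f)

# Then pass it to the converter, e.g.:
# python convert.py /path/to/input/folder /path/to/output/folder --metadata_file metadata.json
```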
benchmark.py CHANGED
@@ -9,7 +9,7 @@ from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
 from marker.models import load_all_models
 from marker.benchmark.scoring import score_text
- from marker.extract_text import naive_get_text
+ from marker.pdf.extract_text import naive_get_text
 import json
 import os
 import subprocess
convert.py CHANGED
@@ -6,7 +6,9 @@ import ray
 from tqdm import tqdm
 import math
 
- from marker.convert import convert_single_pdf, get_length_of_text
+ from marker.convert import convert_single_pdf
+ from marker.pdf.filetype import find_filetype
+ from marker.pdf.extract_text import get_length_of_text
 from marker.models import load_all_models
 from marker.settings import settings
 from marker.logger import configure_logging
@@ -28,6 +30,10 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
     # This can indicate that they were scanned, and not OCRed properly
     # Usually these files are not recent/high-quality
     if min_length:
+         filetype = find_filetype(fname)
+         if filetype == "other":
+             return 0
+
         length = get_length_of_text(fname)
         if length < min_length:
             return
marker/cleaners/code.py CHANGED
@@ -1,4 +1,5 @@
- from marker.schema import Span, Line, Page
+ from marker.schema.schema import Span, Line
+ from marker.schema.page import Page
 import re
 from typing import List
 
marker/cleaners/equations.py CHANGED
@@ -1,31 +1,20 @@
- import io
 from copy import deepcopy
- from functools import partial
 from typing import List
 
- import torch
 from texify.inference import batch_inference
- from texify.model.model import load_model
- from texify.model.processor import load_processor
- import re
+
 from PIL import Image, ImageDraw
 
- from marker.bbox import should_merge_blocks, merge_boxes
+ from marker.schema.bbox import should_merge_blocks, merge_boxes
 from marker.debug.data import dump_equation_debug_data
 from marker.pdf.images import render_image
 from marker.settings import settings
- from marker.schema import Page, Span, Line, Block, BlockType
+ from marker.schema.schema import Span, Line, Block, BlockType
+ from marker.schema.page import Page
 import os
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
- processor = load_processor()
-
-
- def load_texify_model():
-     texify_model = load_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
-     return texify_model
-
 
 def mask_bbox(png_image, bbox, selected_bboxes):
     mask = Image.new('L', png_image.size, 0)  # 'L' mode for grayscale
@@ -72,10 +61,10 @@ def get_latex_batched(images, reformat_region_lens, texify_model, batch_size):
     max_length = min(max_length, settings.TEXIFY_MODEL_MAX)
     max_length += settings.TEXIFY_TOKEN_BUFFER
 
-     model_output = batch_inference(images[min_idx:max_idx], texify_model, processor, max_tokens=max_length)
+     model_output = batch_inference(images[min_idx:max_idx], texify_model, texify_model.processor, max_tokens=max_length)
 
     for j, output in enumerate(model_output):
-         token_count = get_total_texify_tokens(output)
+         token_count = get_total_texify_tokens(output, texify_model.processor)
         if token_count >= max_length - 1:
             output = ""
 
@@ -84,7 +73,7 @@ def get_latex_batched(images, reformat_region_lens, texify_model, batch_size):
     return predictions
 
 
- def get_total_texify_tokens(text):
+ def get_total_texify_tokens(text, processor):
     tokenizer = processor.tokenizer
     tokens = tokenizer(text)
     return len(tokens["input_ids"])
marker/cleaners/headers.py CHANGED
@@ -6,7 +6,8 @@ from rapidfuzz import fuzz
 from sklearn.cluster import DBSCAN
 import numpy as np
 
- from marker.schema import Page, FullyMergedBlock
+ from marker.schema.schema import FullyMergedBlock
+ from marker.schema.page import Page
 from typing import List, Tuple
 
 
marker/cleaners/table.py CHANGED
@@ -1,5 +1,6 @@
- from marker.bbox import merge_boxes
- from marker.schema import Line, Span, Block, Page
+ from marker.schema.bbox import merge_boxes
+ from marker.schema.schema import Line, Span, Block
+ from marker.schema.page import Page
 from copy import deepcopy
 from tabulate import tabulate
 from typing import List
marker/convert.py CHANGED
@@ -2,61 +2,32 @@ import pypdfium2 as pdfium
 
 from marker.cleaners.table import merge_table_blocks, create_new_tables
 from marker.debug.data import dump_bbox_debug_data
- from marker.extract_text import get_text_blocks
+ from marker.ocr.lang import replace_langs_with_codes, validate_langs
+ from marker.ocr.detection import surya_detection
+ from marker.ocr.recognition import run_ocr
+ from marker.pdf.extract_text import get_text_blocks
 from marker.cleaners.headers import filter_header_footer, filter_common_titles
 from marker.cleaners.equations import replace_equations
 from marker.ordering import order_blocks
+ from marker.pdf.filetype import find_filetype
 from marker.postprocessors.editor import edit_full_text
 from marker.segmentation import detect_document_block_types
 from marker.cleaners.code import identify_code_blocks, indent_blocks
 from marker.cleaners.bullets import replace_bullets
 from marker.markdown import merge_spans, merge_lines, get_full_text
- from marker.schema import Page, BlockType
+ from marker.schema.schema import BlockType
+ from marker.schema.page import Page
 from typing import List, Dict, Tuple, Optional
 import re
- import magic
 from marker.settings import settings
 
 
- def find_filetype(fpath):
-     mimetype = magic.from_file(fpath).lower()
-
-     # Get extensions from mimetype
-     # The mimetype is not always consistent, so use in to check the most common formats
-     if "pdf" in mimetype:
-         return "pdf"
-     #elif "epub" in mimetype:
-     #    return "epub"
-     #elif "mobi" in mimetype:
-     #    return "mobi"
-     elif mimetype in settings.SUPPORTED_FILETYPES:
-         return settings.SUPPORTED_FILETYPES[mimetype]
-     else:
-         print(f"Found nonstandard filetype {mimetype}")
-         return "other"
-
-
 def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
     for i, page in enumerate(blocks):
         page_block_types = block_types[i]
         page.add_block_types(page_block_types)
 
 
- def get_length_of_text(fname: str) -> int:
-     filetype = find_filetype(fname)
-     if filetype == "other":
-         return 0
-
-     doc = pdfium.PdfDocument(fname)
-     full_text = ""
-     for page_idx in range(len(doc)):
-         page = doc.get_page(page_idx)
-         text_page = page.get_textpage()
-         full_text += text_page.get_text_bounded()
-
-     return len(full_text)
-
-
 def convert_single_pdf(
     fname: str,
     model_lst: List,
@@ -64,63 +35,69 @@ def convert_single_pdf(
     metadata: Optional[Dict]=None,
     parallel_factor: int = 1
 ) -> Tuple[str, Dict]:
-     lang = settings.DEFAULT_LANG
+     # Set language needed for OCR
+     langs = [settings.DEFAULT_LANG]
     if metadata:
-         lang = metadata.get("language", settings.DEFAULT_LANG)
+         langs = metadata.get("languages", langs)
 
-     # Use tesseract language if available
-     tess_lang = settings.TESSERACT_LANGUAGES.get(lang, "eng")
-     spell_lang = settings.SPELLCHECK_LANGUAGES.get(lang, None)
-     if "eng" not in tess_lang:
-         tess_lang = f"eng+{tess_lang}"
-
-     # Output metadata
-     out_meta = {"language": lang}
+     langs = replace_langs_with_codes(langs)
+     validate_langs(langs)
 
+     # Find the filetype
     filetype = find_filetype(fname)
-     if filetype == "other":
-         return "", out_meta
 
-     out_meta["filetype"] = filetype
+     # Setup output metadata
+     out_meta = {
+         "languages": langs,
+         "filetype": filetype,
+     }
 
+     if filetype == "other": # We can't process this file
+         return "", out_meta
+
+     # Get initial text blocks from the pdf
     doc = pdfium.PdfDocument(fname)
-     blocks, toc, ocr_stats = get_text_blocks(
+     pages, toc = get_text_blocks(
         doc,
-         tess_lang,
-         spell_lang,
         max_pages=max_pages,
-         parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
     )
+     out_meta.update({
+         "toc": toc,
+         "pages": len(pages),
+     })
 
-     out_meta["toc"] = toc
-     out_meta["pages"] = len(blocks)
-     out_meta["ocr_stats"] = ocr_stats
-     if len([b for p in blocks for b in p.blocks]) == 0:
+     # Unpack models from list
+     texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
+
+     # Identify text lines on pages
+     surya_detection(doc, pages, detection_model)
+
+     # OCR pages as needed
+     pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, parallel_factor)
+
+     if len([b for p in pages for b in p.blocks]) == 0:
         print(f"Could not extract any text blocks for {fname}")
         return "", out_meta
 
-     # Unpack models from list
-     texify_model, layoutlm_model, order_model, edit_model = model_lst
-
     block_types = detect_document_block_types(
         doc,
-         blocks,
+         pages,
         layoutlm_model,
         batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
     )
 
     # Find headers and footers
-     bad_span_ids = filter_header_footer(blocks)
+     bad_span_ids = filter_header_footer(pages)
     out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}
 
-     annotate_spans(blocks, block_types)
+     annotate_spans(pages, block_types)
 
     # Dump debug data if flags are set
-     dump_bbox_debug_data(doc, blocks)
+     dump_bbox_debug_data(doc, pages)
 
     blocks = order_blocks(
         doc,
-         blocks,
+         pages,
         order_model,
         batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
     )
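A minimal sketch of the reworked call path, assuming a local file named example.pdf (the path is hypothetical); the model list unpacking order matches the code above:

```python
from marker.convert import convert_single_pdf
from marker.models import load_all_models

# load_all_models() now returns six models: texify, layout, order, edit, detection, ocr
model_lst = load_all_models()

# Languages are passed through metadata and validated against surya's language list
full_text, out_meta = convert_single_pdf(
    "example.pdf",
    model_lst,
    metadata={"languages": ["English"]},
)
print(out_meta["languages"], out_meta["filetype"], out_meta["pages"])
```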
marker/debug/data.py CHANGED
@@ -1,11 +1,10 @@
 import base64
 import json
 import os
- import zlib
 from typing import List
 
 from marker.pdf.images import render_image
- from marker.schema import Page
+ from marker.schema.page import Page
 from marker.settings import settings
 from PIL import Image
 import io
marker/layout/layout.py ADDED
File without changes
marker/layout/order.py ADDED
File without changes
marker/markdown.py CHANGED
@@ -1,4 +1,5 @@
- from marker.schema import MergedLine, MergedBlock, FullyMergedBlock, Page
+ from marker.schema.schema import MergedLine, MergedBlock, FullyMergedBlock
+ from marker.schema.page import Page
 import re
 from typing import List
 
marker/models.py CHANGED
@@ -1,13 +1,56 @@
- from marker.cleaners.equations import load_texify_model
- from marker.ordering import load_ordering_model
 from marker.postprocessors.editor import load_editing_model
- from marker.segmentation import load_layout_model
+ from surya.model.detection import segformer
+ from texify.model.model import load_model as load_texify_model
+ from texify.model.processor import load_processor as load_texify_processor
+ from marker.settings import settings
+ from surya.model.recognition.model import load_model as load_recognition_model
+ from surya.model.recognition.processor import load_processor as load_recognition_processor
+ from surya.model.ordering.model import load_model as load_order_model
+ from surya.model.ordering.processor import load_processor as load_order_processor
 
 
- def load_all_models():
+ def setup_recognition_model(langs):
+     rec_model = load_recognition_model(langs=langs)
+     rec_processor = load_recognition_processor()
+     rec_model.processor = rec_processor
+     return rec_model
+
+
+ def setup_detection_model():
+     model = segformer.load_model()
+     processor = segformer.load_processor()
+     model.processor = processor
+     return model
+
+
+ def setup_texify_model():
+     texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
+     texify_processor = load_texify_processor()
+     texify_model.processor = texify_processor
+     return texify_model
+
+
+ def setup_layout_model():
+     model = segformer.load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
+     processor = segformer.load_processor()
+     model.processor = processor
+     return model
+
+
+ def setup_order_model():
+     model = load_order_model()
+     processor = load_order_processor()
+     model.processor = processor
+     return model
+
+
+ def load_all_models(langs=None):
+     # langs is optional list of languages to prune from recognition MoE model
+     detection = setup_detection_model()
+     layout = setup_layout_model()
+     order = setup_order_model()
     edit = load_editing_model()
-     order = load_ordering_model()
-     layout = load_layout_model()
-     texify = load_texify_model()
-     model_lst = [texify, layout, order, edit]
+     ocr = setup_recognition_model(langs)
+     texify = setup_texify_model()
+     model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst
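For illustration, a hedged sketch of loading the models with a pruned recognition vocabulary (assuming surya language codes such as "en" and "es") and unpacking them in the order convert_single_pdf expects:

```python
from marker.models import load_all_models

# Optionally pass language codes so the recognition MoE model can be pruned
model_lst = load_all_models(langs=["en", "es"])

# Unpacking order used by convert_single_pdf
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
```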
marker/ocr/detection.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List
+
+ from pypdfium2 import PdfDocument
+ from surya.detection import batch_text_detection
+
+ from marker.pdf.images import render_image
+ from marker.schema.page import Page
+ from marker.settings import settings
+
+
+ def surya_detection(doc: PdfDocument, pages: List[Page], det_model):
+     processor = det_model.processor
+     max_len = min(len(pages), len(doc))
+     images = [render_image(doc[pnum], dpi=settings.SURYA_DETECTOR_DPI) for pnum in range(max_len)]
+
+     predictions = batch_text_detection(images, det_model, processor)
+     for (page, pred) in zip(pages, predictions):
+         page.text_lines = pred
marker/ocr/heuristics.py ADDED
@@ -0,0 +1,71 @@
+ import re
+ from typing import List
+
+ from nltk import wordpunct_tokenize
+
+ from marker.ocr.utils import alphanum_ratio
+ from marker.schema.page import Page
+ from marker.settings import settings
+
+
+ def should_ocr_page(page: Page, no_text: bool):
+     detected_lines_found = detected_line_coverage(page)
+
+     # OCR page if we got minimal text, or if we got too many spaces
+     conditions = [
+         no_text,  # Full doc has no text, and needs full OCR
+         (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)),  # Bad OCR
+         detected_lines_found is False,  # didn't extract text for all detected lines
+     ]
+
+     return any(conditions) or settings.OCR_ALL_PAGES
+
+
+ def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
+     if len(text) == 0:
+         # Assume OCR failed if we have no text
+         return True
+
+     words = wordpunct_tokenize(text)
+     words = [w for w in words if w.strip()]
+     alpha_words = [word for word in words if word.isalnum()]
+
+     spaces = len(re.findall(r'\s+', text))
+     alpha_chars = len(re.sub(r'\s+', '', text))
+     if spaces / (alpha_chars + spaces) > space_threshold:
+         return True
+
+     newlines = len(re.findall(r'\n+', text))
+     non_newlines = len(re.sub(r'\n+', '', text))
+     if newlines / (newlines + non_newlines) > newline_threshold:
+         return True
+
+     if alphanum_ratio(text) < alphanum_threshold:  # Garbled text
+         return True
+
+     invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
+     if invalid_chars > max(3.0, len(text) * .02):
+         return True
+
+     return False
+
+
+ def no_text_found(pages: List[Page]):
+     full_text = ""
+     for page in pages:
+         full_text += page.text
+     return len(full_text.strip()) < 10
+
+
+ def detected_line_coverage(page: Page, intersect_thresh=.6, detection_thresh=.5):
+     found_lines = 0
+     total_lines = 0
+     for detected_line in page.text_lines.bboxes:
+         detected_bbox = detected_line.bbox
+         for block in page.blocks:
+             for line in block.lines:
+                 intersection_pct = line.intersection_pct(detected_bbox)
+                 if intersection_pct > intersect_thresh:
+                     found_lines += 1
+         total_lines += 1
+     return found_lines / total_lines > detection_thresh
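A small sketch of the heuristics in isolation; the empty-text case is asserted directly by the code above, and the garbled sample leans on the invalid-character and alphanumeric-ratio checks (the exact definition of alphanum_ratio lives in marker.ocr.utils and is assumed here to be the fraction of alphanumeric characters):

```python
from marker.ocr.heuristics import detect_bad_ocr

# An empty extraction is always treated as failed OCR
assert detect_bad_ocr("") is True

# A run of replacement characters trips the invalid-char / garbled-text checks
garbled = "\ufffd" * 20
assert detect_bad_ocr(garbled) is True
```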
marker/ocr/lang.py ADDED
@@ -0,0 +1,14 @@
+ from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
+
+
+ def replace_langs_with_codes(langs):
+     for i, lang in enumerate(langs):
+         if lang in LANGUAGE_TO_CODE:
+             langs[i] = LANGUAGE_TO_CODE[lang]
+     return langs
+
+
+ def validate_langs(langs):
+     for lang in langs:
+         if lang not in CODE_TO_LANGUAGE:
+             raise ValueError(f"Invalid language code {lang}")
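A quick sketch of the intended use (assuming surya's language table maps the name "English" to the code "en"):

```python
from marker.ocr.lang import replace_langs_with_codes, validate_langs

# Mixed names and codes are normalized to codes
langs = replace_langs_with_codes(["English", "es"])

# Raises ValueError on anything surya does not recognize
validate_langs(langs)
```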
marker/ocr/page.py DELETED
@@ -1,75 +0,0 @@
- import io
- from typing import List, Optional
-
- import ocrmypdf
-
- from marker.ocr.utils import detect_bad_ocr
- from marker.schema import Block
- from marker.settings import settings
-
- ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
-
-
- def ocr_entire_page(page, lang: str) -> List[Block]:
-     if settings.OCR_ENGINE == "tesseract":
-         return ocr_entire_page_tess(page, lang)
-     elif settings.OCR_ENGINE == "ocrmypdf":
-         return ocr_entire_page_ocrmp(page, lang)
-     else:
-         raise ValueError(f"Unknown OCR engine {settings.OCR_ENGINE}")
-
-
- def ocr_entire_page_tess(page, lang: str) -> List[Block]:
-     try:
-         full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)
-         blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]
-         full_text = page.get_text("text", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)
-
-         if len(full_text) == 0:
-             return []
-
-         # Check if OCR worked. If it didn't, return empty list
-         # OCR can fail if there is a scanned blank page with some faint text impressions, for example
-         if detect_bad_ocr(full_text):
-             return []
-     except RuntimeError:
-         return []
-     return blocks
-
-
- def ocr_entire_page_ocrmp(page, lang: str) -> List[Block]:
-     # Use ocrmypdf to get OCR text for the whole page
-     src = page.parent  # the page's document
-     blank_doc = pymupdf.open()  # make temporary 1-pager
-     blank_doc.insert_pdf(src, from_page=page.number, to_page=page.number, annots=False, links=False)
-     pdfbytes = blank_doc.tobytes()
-     inbytes = io.BytesIO(pdfbytes)  # transform to BytesIO object
-     outbytes = io.BytesIO()  # let ocrmypdf store its result pdf here
-     ocrmypdf.ocr(
-         inbytes,
-         outbytes,
-         language=lang,
-         output_type="pdf",
-         redo_ocr=None if settings.OCR_ALL_PAGES else True,
-         force_ocr=True if settings.OCR_ALL_PAGES else None,
-         progress_bar=False,
-         optimize=False,
-         fast_web_view=1e6,
-         skip_big=15,  # skip images larger than 15 megapixels
-         tesseract_timeout=settings.TESSERACT_TIMEOUT,
-         tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
-     )
-     ocr_pdf = pymupdf.open("pdf", outbytes.getvalue())  # read output as fitz PDF
-     blocks = ocr_pdf[0].get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]
-     full_text = ocr_pdf[0].get_text("text", sort=True, flags=settings.TEXT_FLAGS)
-
-     # Make sure the original pdf/epub/mobi bbox and the ocr pdf bbox are the same
-     assert page.bound() == ocr_pdf[0].bound()
-
-     if len(full_text) == 0:
-         return []
-
-     if detect_bad_ocr(full_text):
-         return []
-
-     return blocks
marker/ocr/recognition.py ADDED
@@ -0,0 +1,136 @@
+ from itertools import repeat
+ from typing import List, Optional, Dict
+
+ import ocrmypdf
+ import pypdfium2 as pdfium
+ import io
+ from concurrent.futures import ThreadPoolExecutor
+
+ from surya.ocr import run_recognition
+
+ from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
+ from marker.pdf.images import render_image
+ from marker.schema.page import Page
+ from marker.schema.schema import Block, Line, Span
+ from marker.settings import settings
+ from marker.pdf.extract_text import get_text_blocks
+
+
+ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor) -> (List[Page], Dict):
+     ocr_pages = 0
+     ocr_success = 0
+     ocr_failed = 0
+     no_text = no_text_found(pages)
+     ocr_idxs = []
+     for pnum, page in enumerate(pages):
+         ocr_needed = should_ocr_page(page, no_text)
+         if ocr_needed:
+             ocr_idxs.append(pnum)
+             ocr_pages += 1
+
+     ocr_method = settings.OCR_ENGINE_INTERNAL
+     if ocr_method == "surya":
+         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
+     else:
+         new_pages = tesseract_recognition(doc, ocr_idxs, langs)
+
+     for orig_idx, page in zip(ocr_idxs, new_pages):
+         if detect_bad_ocr(page) or len(page.prelim_text) == 0:
+             ocr_failed += 1
+         else:
+             ocr_success += 1
+             pages[orig_idx] = page
+
+     return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
+
+
+ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page]) -> List[Optional[Page]]:
+     images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
+     processor = rec_model.processor
+     selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]
+
+     surya_langs = [langs] * len(page_idxs)
+     detection_results = [p.text_lines.bboxes for p in selected_pages]
+     polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
+
+     results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons)
+
+     new_pages = []
+     for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):
+         text_lines = old_page.text_lines
+         ocr_results = result.text_lines
+         blocks = []
+         for i, line in enumerate(ocr_results):
+             block = Block(
+                 bbox=line.bbox,
+                 pnum=page_idx,
+                 lines=[Line(
+                     bbox=line.bbox,
+                     spans=[Span(
+                         text=line.text,
+                         bbox=line.bbox,
+                         span_id=f"{page_idx}_{i}",
+                         font="",
+                         font_weight=0,
+                         font_size=0,
+                     )]
+                 )]
+             )
+             blocks.append(block)
+         page = Page(
+             blocks=blocks,
+             pnum=page_idx,
+             bbox=old_page.bbox,
+             rotation=old_page.rotation,
+             text_lines=text_lines
+         )
+         new_pages.append(page)
+     return new_pages
+
+
+ def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
+     pdf_pages = generate_single_page_pdfs(doc, page_idxs)
+     with ThreadPoolExecutor(max_workers=settings.OCR_THREADS) as executor:
+         pages = list(executor.map(_tesseract_recognition, pdf_pages, repeat(langs, len(pdf_pages))))
+
+     return pages
+
+
+ def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
+     pdf_pages = []
+     for page_idx in page_idxs:
+         blank_doc = pdfium.PdfDocument.new()
+         blank_doc.import_pages(doc, pages=[page_idx])
+         assert len(blank_doc) == 1, "Failed to import page"
+
+         in_pdf = io.BytesIO()
+         blank_doc.save(in_pdf)
+         in_pdf.seek(0)
+         pdf_pages.append(in_pdf)
+     return pdf_pages
+
+
+ def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
+     out_pdf = io.BytesIO()
+
+     ocrmypdf.ocr(
+         in_pdf,
+         out_pdf,
+         language=langs[0],
+         output_type="pdf",
+         redo_ocr=None if settings.OCR_ALL_PAGES else True,
+         force_ocr=True if settings.OCR_ALL_PAGES else None,
+         progress_bar=False,
+         optimize=False,
+         fast_web_view=1e6,
+         skip_big=15,  # skip images larger than 15 megapixels
+         tesseract_timeout=settings.TESSERACT_TIMEOUT,
+         tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
+     )
+
+     new_doc = pdfium.PdfDocument(out_pdf.getvalue())
+
+     blocks, _ = get_text_blocks(new_doc, max_pages=1)
+     page = blocks[0]
+     return page
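For context, a hedged sketch of how this module is driven end to end (example.pdf is a hypothetical path; the stats keys match run_ocr's return value above):

```python
import pypdfium2 as pdfium

from marker.models import load_all_models
from marker.ocr.detection import surya_detection
from marker.ocr.recognition import run_ocr
from marker.pdf.extract_text import get_text_blocks

model_lst = load_all_models()
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

doc = pdfium.PdfDocument("example.pdf")
pages, toc = get_text_blocks(doc)
surya_detection(doc, pages, detection_model)  # populate page.text_lines
pages, ocr_stats = run_ocr(doc, pages, ["en"], ocr_model, parallel_factor=1)
print(ocr_stats)  # {"ocr_pages": ..., "ocr_failed": ..., "ocr_success": ...}
```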
marker/ocr/utils.py CHANGED
@@ -1,39 +1,3 @@
- from typing import Optional
-
- from nltk import wordpunct_tokenize
- from marker.settings import settings
- import re
-
-
- def detect_bad_ocr(text, space_threshold=.6, newline_threshold=.5, alphanum_threshold=.4):
-     if len(text) == 0:
-         # Assume OCR failed if we have no text
-         return True
-
-     words = wordpunct_tokenize(text)
-     words = [w for w in words if w.strip()]
-     alpha_words = [word for word in words if word.isalnum()]
-
-     spaces = len(re.findall(r'\s+', text))
-     alpha_chars = len(re.sub(r'\s+', '', text))
-     if spaces / (alpha_chars + spaces) > space_threshold:
-         return True
-
-     newlines = len(re.findall(r'\n+', text))
-     non_newlines = len(re.sub(r'\n+', '', text))
-     if newlines / (newlines + non_newlines) > newline_threshold:
-         return True
-
-     if alphanum_ratio(text) < alphanum_threshold:  # Garbled text
-         return True
-
-     invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
-     if invalid_chars > max(3.0, len(text) * .02):
-         return True
-
-     return False
-
-
 def font_flags_decomposer(flags):
     flags = int(flags)
 
marker/ordering.py CHANGED
@@ -2,14 +2,11 @@ from copy import deepcopy
 from typing import List
 
 import torch
- import sys, os
 
 from transformers import LayoutLMv3ForSequenceClassification, LayoutLMv3Processor
- from PIL import Image
- import io
 
 from marker.pdf.images import render_image
- from marker.schema import Page
+ from marker.schema.page import Page
 from marker.settings import settings
 
 processor = LayoutLMv3Processor.from_pretrained(settings.ORDERER_MODEL_NAME)
marker/{extract_text.py → pdf/extract_text.py} RENAMED
@@ -1,17 +1,21 @@
 import os
- from typing import List, Optional
+ from typing import List, Optional, Dict
 
+ import pypdfium2 as pdfium
 import pypdfium2.internal as pdfium_i
 
- from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
+ from marker.pdf.filetype import find_filetype
+ from marker.ocr.utils import font_flags_decomposer
+ from marker.ocr.heuristics import detect_bad_ocr
 from marker.settings import settings
- from marker.schema import Span, Line, Block, Page
+ from marker.schema.schema import Span, Line, Block
+ from marker.schema.page import Page
 from pdftext.extraction import dictionary_output
 
 os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
 
 
- def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
+ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     page_blocks = []
     span_id = 0
     for block_idx, block in enumerate(page["blocks"]):
@@ -54,42 +58,8 @@ def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
     return out_page
 
 
- def ocr_page(doc, pnum, page: Page, tess_lang: str):
-     ocr_pages = 0
-     ocr_success = 0
-     ocr_failed = 0
-     page_bbox = doc[pnum].bound()
-
-     blocks = get_single_page_blocks(doc, pnum, tess_lang)
-     page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
-
-     # OCR page if we got minimal text, or if we got too many spaces
-     conditions = [
-         (
-             no_text  # Full doc has no text, and needs full OCR
-             or
-             (len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text))  # Bad OCR
-         ),
-         min_ocr_page < pnum < len(doc) - 1,
-         not disable_ocr
-     ]
-     if all(conditions) or settings.OCR_ALL_PAGES:
-         page = doc[pnum]
-         blocks = get_single_page_blocks(doc, pnum, tess_lang, ocr=True)
-         page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
-         ocr_pages = 1
-         if len(blocks) == 0:
-             ocr_failed = 1
-         else:
-             ocr_success = 1
-     return page_obj, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
-
-
- def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
+ def get_text_blocks(doc, max_pages: Optional[int] = None) -> (List[Page], Dict):
     toc = get_toc(doc)
-     ocr_pages = 0
-     ocr_failed = 0
-     ocr_success = 0
 
     page_range = range(len(doc))
     if max_pages:
@@ -99,7 +69,7 @@ def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: O
     all_blocks = dictionary_output(doc, page_range=page_range)
     all_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(all_blocks)]
 
-     return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
+     return all_blocks, toc
 
 
 def naive_get_text(doc):
@@ -126,3 +96,10 @@ def get_toc(doc, max_depth=15):
         }
         toc_list.append(list_item)
     return toc_list
+
+
+ def get_length_of_text(fname: str) -> int:
+     doc = pdfium.PdfDocument(fname)
+     text = naive_get_text(doc).strip()
+
+     return len(text)
marker/pdf/filetype.py ADDED
@@ -0,0 +1,21 @@
+ import magic
+
+ from marker.settings import settings
+
+
+ def find_filetype(fpath):
+     mimetype = magic.from_file(fpath).lower()
+
+     # Get extensions from mimetype
+     # The mimetype is not always consistent, so use in to check the most common formats
+     if "pdf" in mimetype:
+         return "pdf"
+     #elif "epub" in mimetype:
+     #    return "epub"
+     #elif "mobi" in mimetype:
+     #    return "mobi"
+     elif mimetype in settings.SUPPORTED_FILETYPES:
+         return settings.SUPPORTED_FILETYPES[mimetype]
+     else:
+         print(f"Found nonstandard filetype {mimetype}")
+         return "other"
marker/{bbox.py → schema/bbox.py} RENAMED
@@ -1,3 +1,8 @@
+ from typing import List
+
+ from pydantic import BaseModel, field_validator
+
+
 def should_merge_blocks(box1, box2, tol=5):
     # Within tol y px, and to the right within tol px
     merge = [
@@ -18,7 +23,7 @@ def boxes_intersect(box1, box2):
     return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
 
 
- def boxes_intersect_pct(box1, box2, pct=.9):
+ def box_intersection_pct(box1, box2):
     # determine the coordinates of the intersection rectangle
     x_left = max(box1[0], box2[0])
     y_top = max(box1[1], box2[1])
@@ -28,16 +33,11 @@ def boxes_intersect_pct(box1, box2, pct=.9):
     if x_right < x_left or y_bottom < y_top:
         return 0.0
 
-     # The intersection of two axis-aligned bounding boxes is always an
-     # axis-aligned bounding box
     intersection_area = (x_right - x_left) * (y_bottom - y_top)
-
-     # compute the area of both AABBs
     bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
-     bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
 
-     iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
-     return iou > pct
+     iou = intersection_area / bb1_area
+     return iou
 
 
 def multiple_boxes_intersect(box1, boxes):
@@ -47,15 +47,44 @@ def multiple_boxes_intersect(box1, boxes):
     return False
 
 
- def box_contained(box1, box2):
-     # Box1 inside box2
-     return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
-
-
 def unnormalize_box(bbox, width, height):
     return [
         width * (bbox[0] / 1000),
         height * (bbox[1] / 1000),
         width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
-     ]
+     ]
+
+
+ class BboxElement(BaseModel):
+     bbox: List[float]
+
+     @field_validator('bbox')
+     @classmethod
+     def check_4_elements(cls, v: List[float]) -> List[float]:
+         if len(v) != 4:
+             raise ValueError('bbox must have 4 elements')
+         return v
+
+     @property
+     def height(self):
+         return self.bbox[3] - self.bbox[1]
+
+     @property
+     def width(self):
+         return self.bbox[2] - self.bbox[0]
+
+     @property
+     def x_start(self):
+         return self.bbox[0]
+
+     @property
+     def y_start(self):
+         return self.bbox[1]
+
+     @property
+     def area(self):
+         return self.width * self.height
+
+     def intersection_pct(self, other_bbox: List[float]):
+         return box_intersection_pct(self.bbox, other_bbox)
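A small worked example of the new containment-style metric (note it divides by the first box's area only, so it is not a symmetric IoU):

```python
from marker.schema.bbox import BboxElement, box_intersection_pct

a = BboxElement(bbox=[0, 0, 10, 10])
print(a.area)  # 100.0

# Half of box a overlaps [5, 0, 15, 10], so the percentage is 0.5
print(box_intersection_pct([0, 0, 10, 10], [5, 0, 15, 10]))  # 0.5
print(a.intersection_pct([5, 0, 15, 10]))  # 0.5
```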
marker/schema/page.py ADDED
@@ -0,0 +1,68 @@
+ from collections import Counter
+ from typing import List, Optional
+
+ from marker.schema.bbox import BboxElement
+ from marker.schema.schema import Block, Span
+ from surya.schema import TextDetectionResult
+
+
+ class Page(BboxElement):
+     blocks: List[Block]
+     pnum: int
+     column_count: Optional[int] = None
+     rotation: Optional[int] = None  # Rotation degrees of the page
+     text_lines: Optional[TextDetectionResult] = None
+
+     def get_nonblank_lines(self):
+         lines = self.get_all_lines()
+         nonblank_lines = [l for l in lines if l.prelim_text.strip()]
+         return nonblank_lines
+
+     def get_all_lines(self):
+         lines = [l for b in self.blocks for l in b.lines]
+         return lines
+
+     def get_nonblank_spans(self) -> List[Span]:
+         lines = [l for b in self.blocks for l in b.lines]
+         spans = [s for l in lines for s in l.spans if s.text.strip()]
+         return spans
+
+     def add_block_types(self, page_block_types):
+         if len(page_block_types) != len(self.get_all_lines()):
+             print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
+
+         i = 0
+         for block in self.blocks:
+             for line in block.lines:
+                 if i < len(page_block_types):
+                     line_block_type = page_block_types[i].block_type
+                 else:
+                     line_block_type = "Text"
+                 i += 1
+                 for span in line.spans:
+                     span.block_type = line_block_type
+
+     def get_font_stats(self):
+         fonts = [s.font for s in self.get_nonblank_spans()]
+         font_counts = Counter(fonts)
+         return font_counts
+
+     def get_line_height_stats(self):
+         heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
+         height_counts = Counter(heights)
+         return height_counts
+
+     def get_line_start_stats(self):
+         starts = [l.bbox[0] for l in self.get_nonblank_lines()]
+         start_counts = Counter(starts)
+         return start_counts
+
+     def get_min_line_start(self):
+         starts = [l.bbox[0] for l in self.get_nonblank_lines() if l.spans[0].block_type == "Text"]
+         if len(starts) == 0:
+             raise IndexError("No lines found")
+         return min(starts)
+
+     @property
+     def prelim_text(self):
+         return "\n".join([b.prelim_text for b in self.blocks])
marker/{schema.py → schema/schema.py} RENAMED
@@ -1,53 +1,13 @@
 from collections import Counter
- from typing import List, Optional, Tuple
+ from typing import List, Optional
 
 from pydantic import BaseModel, field_validator
 import ftfy
 
- from marker.bbox import boxes_intersect_pct, multiple_boxes_intersect
+ from marker.schema.bbox import multiple_boxes_intersect, BboxElement
 from marker.settings import settings
 
 
- def find_span_type(span, page_blocks):
-     block_type = "Text"
-     for block in page_blocks:
-         if boxes_intersect_pct(span.bbox, block.bbox):
-             block_type = block.block_type
-             break
-     return block_type
-
-
- class BboxElement(BaseModel):
-     bbox: List[float]
-
-     @field_validator('bbox')
-     @classmethod
-     def check_4_elements(cls, v: List[float]) -> List[float]:
-         if len(v) != 4:
-             raise ValueError('bbox must have 4 elements')
-         return v
-
-     @property
-     def height(self):
-         return self.bbox[3] - self.bbox[1]
-
-     @property
-     def width(self):
-         return self.bbox[2] - self.bbox[0]
-
-     @property
-     def x_start(self):
-         return self.bbox[0]
-
-     @property
-     def y_start(self):
-         return self.bbox[1]
-
-     @property
-     def area(self):
-         return self.width * self.height
-
-
 class BlockType(BboxElement):
     block_type: str
 
@@ -59,7 +19,6 @@ class Span(BboxElement):
     font_weight: float
     font_size: float
     block_type: Optional[str] = None
-     selected: bool = True
 
 
     @field_validator('text')
@@ -128,66 +87,6 @@ class Block(BboxElement):
             span.block_type = block_type
 
 
- class Page(BboxElement):
-     blocks: List[Block]
-     pnum: int
-     column_count: Optional[int] = None
-     rotation: Optional[int] = None  # Rotation degrees of the page
-
-     def get_nonblank_lines(self):
-         lines = self.get_all_lines()
-         nonblank_lines = [l for l in lines if l.prelim_text.strip()]
-         return nonblank_lines
-
-     def get_all_lines(self):
-         lines = [l for b in self.blocks for l in b.lines]
-         return lines
-
-     def get_nonblank_spans(self) -> List[Span]:
-         lines = [l for b in self.blocks for l in b.lines]
-         spans = [s for l in lines for s in l.spans if s.text.strip()]
-         return spans
-
-     def add_block_types(self, page_block_types):
-         if len(page_block_types) != len(self.get_all_lines()):
-             print(f"Warning: Number of detected lines {len(page_block_types)} does not match number of lines {len(self.get_all_lines())}")
-
-         i = 0
-         for block in self.blocks:
-             for line in block.lines:
-                 if i < len(page_block_types):
-                     line_block_type = page_block_types[i].block_type
-                 else:
-                     line_block_type = "Text"
-                 i += 1
-                 for span in line.spans:
-                     span.block_type = line_block_type
-
-     def get_font_stats(self):
-         fonts = [s.font for s in self.get_nonblank_spans()]
-         font_counts = Counter(fonts)
-         return font_counts
-
-     def get_line_height_stats(self):
-         heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
-         height_counts = Counter(heights)
-         return height_counts
-
-     def get_line_start_stats(self):
-         starts = [l.bbox[0] for l in self.get_nonblank_lines()]
-         start_counts = Counter(starts)
-         return start_counts
-
-     def get_min_line_start(self):
-         starts = [l.bbox[0] for l in self.get_nonblank_lines() if l.spans[0].block_type == "Text"]
-         if len(starts) == 0:
-             raise IndexError("No lines found")
-         return min(starts)
-
-     @property
-     def prelim_text(self):
-         return "\n".join([b.prelim_text for b in self.blocks])
-
 class MergedLine(BboxElement):
     text: str
     fonts: List[str]
marker/segmentation.py CHANGED
@@ -1,18 +1,17 @@
- from concurrent.futures import ThreadPoolExecutor
 from typing import List
 
 from transformers import LayoutLMv3ForTokenClassification
 
- from marker.bbox import unnormalize_box
+ from marker.schema.bbox import unnormalize_box
 from transformers.models.layoutlmv3.image_processing_layoutlmv3 import normalize_box
- import io
 from PIL import Image
 from transformers import LayoutLMv3Processor
 import numpy as np
 
 from marker.pdf.images import render_image
 from marker.settings import settings
- from marker.schema import Page, BlockType
+ from marker.schema.schema import BlockType
+ from marker.schema.page import Page
 import torch
 from math import isclose
 
marker/settings.py CHANGED
@@ -37,38 +37,34 @@ class Settings(BaseSettings):
     #"application/x-fictionbook+xml": "fb2"
     }
 
+     # Text line Detection
+     DETECTOR_BATCH_SIZE: Optional[int] = None
+     SURYA_DETECTOR_DPI: int = 96
+     DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4
+
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
-     OCR_DPI: int = 400
-     TESSDATA_PREFIX: str = ""
-     TESSERACT_LANGUAGES: Dict = {
-         "English": "eng",
-         "Spanish": "spa",
-         "Portuguese": "por",
-         "French": "fra",
-         "German": "deu",
-         "Russian": "rus",
-         "Chinese": "chi_sim",
-         "Japanese": "jpn",
-         "Korean": "kor",
-         "Hindi": "hin",
-     }
-     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
-     SPELLCHECK_LANGUAGES: Dict = {
-         "English": "en",
-         "Spanish": "es",
-         "Portuguese": "pt",
-         "French": "fr",
-         "German": "de",
-         "Russian": "ru",
-         "Chinese": None,
-         "Japanese": None,
-         "Korean": None,
-         "Hindi": None,
-     }
+     OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU.
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
+
+     ## Surya
+     SURYA_OCR_DPI: int = 96
+     RECOGNITION_BATCH_SIZE: Optional[int] = None # Batch size for surya OCR
+
+     ## Tesseract
     OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
-     OCR_ENGINE: str = "ocrmypdf" # Which OCR engine to use, either "tesseract" or "ocrmypdf". Ocrmypdf is higher quality, but slower.
+     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
+
+     @computed_field
+     def OCR_ENGINE_INTERNAL(self) -> str:
+         if self.OCR_ENGINE is not None:
+             return self.OCR_ENGINE
+
+         # Does not work with mps
+         if torch.cuda.is_available():
+             return "surya"
+
+         return "ocrmypdf"
 
     # Texify model
     TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
@@ -82,7 +78,7 @@ class Settings(BaseSettings):
     LAYOUT_MODEL_MAX: int = 512
     LAYOUT_CHUNK_OVERLAP: int = 64
     LAYOUT_DPI: int = 96
-     LAYOUT_MODEL_NAME: str = "vikp/layout_segmenter"
+     LAYOUT_MODEL_CHECKPOINT: str = "vikp/layout_segmenter"
     LAYOUT_BATCH_SIZE: int = 8 # Max 512 tokens means high batch size
 
     # Ordering model
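For reference, a sketch of how the engine selection above behaves; that OCR_ENGINE can also be set through an environment variable is an assumption about the existing pydantic BaseSettings setup rather than something added in this commit:

```python
from marker.settings import settings

# With OCR_ENGINE unset, the computed field picks "surya" on CUDA and "ocrmypdf" otherwise
print(settings.OCR_ENGINE_INTERNAL)

# Forcing a specific engine
settings.OCR_ENGINE = "ocrmypdf"
print(settings.OCR_ENGINE_INTERNAL)  # "ocrmypdf"
```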