Vik Paruchuri committed
Commit 10b0dcd · 1 Parent(s): a79daf8

Add postprocessor

README.md CHANGED
@@ -1,6 +1,6 @@
 # Marker
 
-Marker converts PDF, EPUB, and MOBI to Markdown. It is up to 10x faster than nougat, works across many types of documents, and minimizes the risk of hallucinations significantly.
+Marker converts PDF, EPUB, and MOBI to Markdown. It is 12x faster than nougat, works across many types of documents, and significantly reduces the risk of hallucinations.
 
 Features:
 
@@ -115,18 +115,33 @@ METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=35 bash chunk_convert.s
 
 # Benchmarks
 
-Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I can then convert the latex to text, and compare it to the output of marker using edit distance.
+Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a PDF version and a LaTeX source. I can then convert the LaTeX to text, and compare the reference to the output of text extraction methods.
 
-Benchmarks show that marker is up to 10x faster than nougat, and more accurate outside arXiv (nougat is better inside arXiv):
+Benchmarks show that marker is 12x faster than nougat, and more accurate outside arXiv (nougat was trained on arXiv data).
+
+**Speed**
+
+Method    Average Score    Time per doc
+--------  ---------------  --------------
+naive     0.287605         0.149704
+marker    0.62978          33.9778
+nougat    0.63989          395.091
+
+**Accuracy**
+
+First 3 are non-arXiv books, last 3 are arXiv papers.
+
+Method    thinkos.pdf    thinkdsp.pdf    thinkpython.pdf    switch_trans.pdf    crowd.pdf    multicolcnn.pdf
+--------  -------------  --------------  -----------------  ------------------  -----------  -----------------
+naive     0.366817       0.412014        0.468147           0.244739            0.14489      0.0890217
+marker    0.753291       0.787938        0.779262           0.478387            0.446068     0.533737
+nougat    0.638434       0.632723       0.637626            0.690028            0.540994     0.699539
 
 Peak GPU memory usage during the benchmark is `3.3GB` for nougat, and `3.7GB` for marker.
 
 ## Running your own benchmarks
 
-You can benchmark the performance of marker on your machine. The benchmark consists of 3 scientific papers from arXiv, and 3 textbooks.
+You can benchmark the performance of marker on your machine.
 
 Run `benchmark.py` like this:
 
@@ -134,7 +149,7 @@ Run `benchmark.py` like this:
 python benchmark.py benchmark_data/pdfs benchmark_data/references report.json --nougat
 ```
 
-This will benchmark marker against other text extraction methods. It sets up batch sizes for nougat and marker to use a similar amount of GPU RAM for each (4GB).
+This will benchmark marker against other text extraction methods. It sets up batch sizes for nougat and marker to use a similar amount of GPU RAM for each.
 
 Omit `--nougat` to exclude nougat from the benchmark. I don't recommend running nougat on CPU, since it is very slow.
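The README doesn't show how the edit-distance scores above are computed. As an illustration only, a normalized Levenshtein-style similarity via the `rapidfuzz` library (which may differ from what `benchmark.py` actually does) could look like:

```python
# Hypothetical sketch of scoring extracted text against a LaTeX-derived
# reference; marker's real scoring method may differ.
from rapidfuzz import fuzz

def score_alignment(reference: str, hypothesis: str) -> float:
    # fuzz.ratio returns 0-100; normalize to 0-1 like the tables above
    return fuzz.ratio(reference, hypothesis) / 100

print(score_alignment("hello world", "helo world"))  # ~0.95
```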
benchmark.py CHANGED
@@ -7,6 +7,7 @@ from tqdm import tqdm
 
 from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
+from marker.models import load_all_models
 from marker.ordering import load_ordering_model
 from marker.segmentation import load_layout_model
 from marker.cleaners.equations import load_nougat_model
@@ -48,9 +49,7 @@ if __name__ == "__main__":
     if args.nougat:
         methods.append("nougat")
 
-    layoutlm_model = load_layout_model()
-    nougat_model = load_nougat_model()
-    order_model = load_ordering_model()
+    model_lst = load_all_models()
 
     scores = defaultdict(dict)
     benchmark_files = os.listdir(args.in_folder)
@@ -70,7 +69,7 @@ if __name__ == "__main__":
         for method in methods:
             start = time.time()
             if method == "marker":
-                full_text, out_meta = convert_single_pdf(pdf_filename, layoutlm_model, nougat_model, order_model, parallel=args.marker_parallel)
+                full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel=args.marker_parallel)
             elif method == "nougat":
                 full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
             elif method == "naive":
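The `naive` method referenced above isn't shown in this diff. As an illustration only, a plain text-layer baseline with pymupdf (`fitz`), which marker already depends on, might look like this; the actual implementation in `benchmark.py` may differ:

```python
# Illustrative only -- not necessarily the naive method benchmark.py uses.
import fitz  # pymupdf

def naive_get_text(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    # Concatenate the embedded text layer of each page, no layout analysis
    return "\n".join(page.get_text() for page in doc)
```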
convert.py CHANGED
@@ -8,6 +8,7 @@ from tqdm import tqdm
 import math
 
 from marker.convert import convert_single_pdf, get_length_of_text
+from marker.models import load_all_models
 from marker.ordering import load_ordering_model
 from marker.segmentation import load_layout_model
 from marker.cleaners.equations import load_nougat_model
@@ -20,7 +21,7 @@ configure_logging()
 
 
 @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
-def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model, order_model, metadata: Dict | None=None, min_length: int | None = None):
+def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Dict | None=None, min_length: int | None = None):
     out_filename = fname.rsplit(".", 1)[0] + ".md"
     out_filename = os.path.join(out_folder, os.path.basename(out_filename))
     out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
@@ -35,7 +36,7 @@ def process_single_pdf(fname: str, out_folder: str, nougat_model, layout_model,
     if length < min_length:
         return
 
-    full_text, out_metadata = convert_single_pdf(fname, layout_model, nougat_model, order_model, metadata=metadata)
+    full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
     if len(full_text.strip()) > 0:
         with open(out_filename, "w+") as f:
             f.write(full_text)
@@ -94,13 +95,8 @@ if __name__ == "__main__":
         log_to_driver=False
     )
 
-    nougat_model = load_nougat_model()
-    layoutlm_model = load_layout_model()
-    order_model = load_ordering_model()
-
-    nougat_ref = ray.put(nougat_model)
-    layoutlm_ref = ray.put(layoutlm_model)
-    order_ref = ray.put(order_model)
+    model_lst = load_all_models()
+    model_refs = [ray.put(m) if m else None for m in model_lst]
 
     # Dynamically set GPU allocation per task based on GPU ram
     gpu_frac = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0
@@ -110,9 +106,7 @@ if __name__ == "__main__":
         process_single_pdf.options(num_gpus=gpu_frac).remote(
             filename,
             out_folder,
-            nougat_ref,
-            layoutlm_ref,
-            order_ref,
+            model_refs,
             metadata=metadata.get(os.path.basename(filename)),
             min_length=args.min_length
         ) for filename in files_to_convert
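For context on the change above: `ray.put` stores an object once in Ray's shared object store and hands back a reference, so worker tasks share one copy of each model instead of re-serializing it per task; the commit just collapses three separate `put` calls into one list. A self-contained sketch of the pattern (toy payload, illustrative names):

```python
import ray

ray.init()

model = {"weights": list(range(1_000_000))}  # stand-in for a loaded model
model_ref = ray.put(model)  # stored once in the object store

@ray.remote
def run_task(model, task_id):
    # Ray resolves the object ref to the shared copy before the call
    return (task_id, len(model["weights"]))

print(ray.get([run_task.remote(model_ref, i) for i in range(4)]))
```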
convert_single.py CHANGED
@@ -2,6 +2,7 @@ import argparse
 
 from marker.convert import convert_single_pdf
 from marker.logger import configure_logging
+from marker.models import load_all_models
 from marker.ordering import load_ordering_model
 from marker.segmentation import load_layout_model
 from marker.cleaners.equations import load_nougat_model
@@ -19,10 +20,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     fname = args.filename
-    layoutlm_model = load_layout_model()
-    nougat_model = load_nougat_model()
-    order_model = load_ordering_model()
-    full_text, out_meta = convert_single_pdf(fname, layoutlm_model, nougat_model, order_model, max_pages=args.max_pages, parallel=args.workers)
+    model_lst = load_all_models()
+    full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel=args.workers)
 
     with open(args.output, "w+") as f:
         f.write(full_text)
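After this change the single-file CLI only needs one loader call. A usage example, with flag spellings inferred from `args.max_pages` / `args.workers` (the argparse setup isn't in this diff, so treat the flags as assumptions):

```
python convert_single.py input.pdf output.md --max_pages 10 --workers 2
```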
marker/convert.py CHANGED
@@ -5,6 +5,7 @@ from marker.extract_text import get_text_blocks
 from marker.cleaners.headers import filter_header_footer, filter_common_titles
 from marker.cleaners.equations import replace_equations
 from marker.ordering import order_blocks
+from marker.postprocessors.editor import edit_full_text
 from marker.segmentation import detect_all_block_types
 from marker.cleaners.code import identify_code_blocks, indent_blocks
 from marker.cleaners.bullets import replace_bullets
@@ -56,9 +57,7 @@ def get_length_of_text(fname: str) -> int:
 
 def convert_single_pdf(
         fname: str,
-        layoutlm_model,
-        nougat_model,
-        order_model,
+        model_lst: List,
         max_pages=None,
         metadata: Dict | None=None,
         parallel: int = 1
@@ -96,6 +95,9 @@ def convert_single_pdf(
         print(f"Could not extract any text blocks for {fname}")
         return "", out_meta
 
+    # Unpack models from list
+    nougat_model, layoutlm_model, order_model, edit_model = model_lst
+
     block_types = detect_all_block_types(doc, blocks, layoutlm_model, parallel=parallel)
 
     # Find headers and footers
@@ -135,5 +137,7 @@ def convert_single_pdf(
 
     # Replace bullet characters with a -
     full_text = replace_bullets(full_text)
+    full_text, edit_stats = edit_full_text(full_text, edit_model)
+    out_meta["postprocess_stats"] = {"edit": edit_stats}
 
     return full_text, out_meta
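Putting the pieces of this commit together, the new calling convention looks roughly like this (file name illustrative; the functions are the ones in the diff):

```python
from marker.models import load_all_models
from marker.convert import convert_single_pdf

model_lst = load_all_models()  # [nougat, layout, order, edit]
full_text, out_meta = convert_single_pdf("example.pdf", model_lst)

# The new postprocessor records how many edits of each type it made
print(out_meta["postprocess_stats"]["edit"])
```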
marker/models.py ADDED
@@ -0,0 +1,13 @@
+from marker.cleaners.equations import load_nougat_model
+from marker.ordering import load_ordering_model
+from marker.postprocessors.editor import load_editing_model
+from marker.segmentation import load_layout_model
+
+
+def load_all_models():
+    edit = load_editing_model()
+    order = load_ordering_model()
+    layout = load_layout_model()
+    nougat = load_nougat_model()
+    model_lst = [nougat, layout, order, edit]
+    return model_lst
marker/postprocessors/editor.py ADDED
@@ -0,0 +1,130 @@
+from collections import defaultdict, Counter
+from itertools import chain
+from typing import Optional
+import re
+
+from transformers import BloomForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
+from marker.settings import settings
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained(settings.EDITOR_MODEL_NAME)
+
+
+def load_editing_model(disable_editor=False):
+    if disable_editor:
+        return None
+
+    if not settings.CUDA:
+        # Don't postprocess on CPU to save time
+        return None
+
+    model = BloomForTokenClassification.from_pretrained(
+        settings.EDITOR_MODEL_NAME,
+        load_in_4bit=True,
+        torch_dtype=torch.bfloat16,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_quant_type="nf4",
+        device_map="sequential"
+    )
+
+    model.config.label2id = {
+        "equal": 0,
+        "delete": 1,
+        "delete_trailing_newline": 2,
+        "delete_leading_space": 3,
+        "leading_space_to_newline": 4,
+        "newline-1": 5,
+        "space-1": 6,
+    }
+    model.config.id2label = {v: k for k, v in model.config.label2id.items()}
+    return model
+
+
+def edit_full_text(text: str, model: Optional[BloomForTokenClassification]):
+    if not model:
+        return text, {}
+
+    tokenized = tokenizer(
+        text,
+        truncation=True,
+        max_length=settings.EDITOR_MAX_LENGTH,
+        return_overflowing_tokens=True,
+        padding="max_length",
+    )
+    input_ids = tokenized["input_ids"]
+
+    # Tokenize, and make sure reverse tokenization works
+    model_tokens = [tokenizer.convert_ids_to_tokens(t, skip_special_tokens=True) for t in input_ids]
+    model_str_tokens = [tokenizer.convert_tokens_to_string(t) for t in model_tokens]
+    full_text = "".join(model_str_tokens)
+    assert full_text == text
+
+    # Long list of all tokens
+    model_tokens = [tokenizer.convert_ids_to_tokens(t) for t in input_ids]
+    flat_tokens = list(chain.from_iterable(model_tokens))
+    flat_str_tokens = [tokenizer.convert_tokens_to_string([t]) for t in flat_tokens]
+
+    # Run model
+    token_masks = []
+    for i in range(0, len(input_ids), settings.EDITOR_BATCH_SIZE):
+        batch_input_ids = tokenized["input_ids"][i: i + settings.EDITOR_BATCH_SIZE]
+        batch_input_ids = torch.tensor(batch_input_ids, device=model.device)
+        batch_attention_mask = tokenized["attention_mask"][i: i + settings.EDITOR_BATCH_SIZE]
+        batch_attention_mask = torch.tensor(batch_attention_mask, device=model.device)
+        with torch.inference_mode():
+            predictions = model(batch_input_ids, attention_mask=batch_attention_mask)
+
+        logits = predictions.logits.cpu()
+
+        labels = logits.argmax(-1).squeeze().tolist()
+        labels = list(chain.from_iterable(labels))
+        token_masks.extend(labels)
+
+    assert len(token_masks) == len(flat_tokens) == len(flat_str_tokens)
+
+    edit_stats = defaultdict(int)
+    out_tokens = []
+    for i, (token, str_token, mask) in enumerate(zip(flat_tokens, flat_str_tokens, token_masks)):
+        label = model.config.id2label[mask]
+
+        match label:
+            case "equal":
+                out_tokens.append(str_token)
+                edit_stats[label] += 1
+            case "delete":
+                # If we delete whitespace, roll with it, otherwise ignore
+                if str_token.strip():
+                    out_tokens.append(str_token)
+                edit_stats[label] += 1
+            case "delete_trailing_newline":
+                if str_token.endswith("\n"):
+                    str_token = re.sub(r"\n+$", "", str_token)
+                    edit_stats[label] += 1
+                out_tokens.append(str_token)
+            case "delete_leading_space":
+                if str_token.startswith(" "):
+                    str_token = re.sub(r"^ +", "", str_token)
+                    edit_stats[label] += 1
+                out_tokens.append(str_token)
+            case "leading_space_to_newline":
+                if str_token.startswith(" "):
+                    str_token = "\n" + str_token[1:]
+                    edit_stats[label] += 1
+                out_tokens.append(str_token)
+            case "newline-1":
+                out_tokens.append("\n")
+                out_tokens.append(str_token)
+                edit_stats[label] += 1
+            case "space-1":
+                out_tokens.append(" ")
+                out_tokens.append(str_token)
+                edit_stats[label] += 1
+
+    return "".join(out_tokens), edit_stats
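The core idea of the new postprocessor is token classification: every token gets one of the seven labels above, and the labels are then replayed as string edits. A dependency-free sketch of just the replay step (the tokens and predicted labels here are made up; the real code gets them from the Bloom model):

```python
import re

str_tokens = ["Hello", " wor", "ld\n\n", "there", "."]
labels = ["equal", "equal", "delete_trailing_newline", "space-1", "equal"]

out = []
for tok, label in zip(str_tokens, labels):
    if label == "delete_trailing_newline" and tok.endswith("\n"):
        out.append(re.sub(r"\n+$", "", tok))  # strip a spurious line break
    elif label == "space-1":
        out.extend([" ", tok])  # model says a space is missing before tok
    else:  # "equal"; the remaining labels are omitted for brevity
        out.append(tok)

print("".join(out))  # -> "Hello world there."
```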
marker/settings.py CHANGED
@@ -67,6 +67,11 @@ class Settings(BaseSettings):
     ORDERER_BATCH_SIZE: int = 16  # This can be high, because max token count is 128
     ORDERER_MODEL_NAME: str = "vikp/column_detector"
 
+    # Final editing model
+    EDITOR_BATCH_SIZE: int = 4
+    EDITOR_MAX_LENGTH: int = 1024
+    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor"
+
     # Ray
     RAY_CACHE_PATH: Optional[str] = None  # Where to save ray cache
     RAY_DASHBOARD_HOST: str = "127.0.0.1"
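Since `Settings` extends pydantic's `BaseSettings` (per the hunk header above), the new editor fields should be overridable through environment variables; a sketch of that assumed mechanism:

```python
import os

# Assumed override path via pydantic BaseSettings env parsing
os.environ["EDITOR_BATCH_SIZE"] = "8"

from marker.settings import Settings
print(Settings().EDITOR_BATCH_SIZE)  # -> 8
```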
poetry.lock CHANGED
@@ -361,6 +361,17 @@ soupsieve = ">1.2"
 html5lib = ["html5lib"]
 lxml = ["lxml"]
 
+[[package]]
+name = "bitsandbytes"
+version = "0.41.2.post2"
+description = "k-bit optimizers and matrix multiplication routines."
+optional = false
+python-versions = "*"
+files = [
+    {file = "bitsandbytes-0.41.2.post2-py3-none-any.whl", hash = "sha256:98e5e1979aea3d481ed06181c689f3a154d7f5dc1af770c5173485bc54cf7b72"},
+    {file = "bitsandbytes-0.41.2.post2.tar.gz", hash = "sha256:d374da4700651f36a285ed53e012ee527736109614e3f5c0249985d41027136d"},
+]
+
 [[package]]
 name = "bleach"
 version = "6.1.0"
@@ -5484,4 +5495,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "2fb598c6fe9ac11179d892a4c57a776a3274f2a26b24e4db308e44871ed2f8d3"
+content-hash = "867abbd491c21af26d74884792e63116aab25a1a362e1c719dfe145c6cc3c2bd"
pyproject.toml CHANGED
@@ -28,6 +28,7 @@ pyspellchecker = "^0.7.2"
 ftfy = "^6.1.1"
 nltk = "^3.8.1"
 ocrmypdf = "^15.4.0"
+bitsandbytes = "^0.41.2.post2"
 
 [tool.poetry.group.dev.dependencies]