Vik Paruchuri committed on
Commit
c5a5454
·
1 Parent(s): 5510f81

Fix deployment bugs

Browse files
README.md CHANGED
@@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
141
 
142
  Note that the env variables above are specific to this script, and cannot be set in `local.env`.
143
 
 
 
 
 
 
 
 
 
 
 
 
144
  # Benchmarks
145
 
146
  Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
@@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers.
163
  | marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
164
  | nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |
165
 
166
- Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `5.1GB` for marker. Benchmarks were run on an A6000 Ada.
167
 
168
  **Throughput**
169
 
 
141
 
142
  Note that the env variables above are specific to this script, and cannot be set in `local.env`.
143
 
144
+ # Important settings/Troubleshooting
145
+
146
+ There are some settings that you may find especially useful if things aren't working the way you expect:
147
+
148
+ `OCR_ALL_PAGES` - set this to true to force OCR on all pages. This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text.
149
+ - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
150
+ - `OCR_ENGINE` - can set this to `surya` or `ocrmypdf`.
151
+ `DEBUG` - setting this to `True` shows Ray logs when converting multiple PDFs.
152
+
153
+ In general, if output is not what you expect, trying to OCR the PDF is a good first step.
154
+
155
  # Benchmarks
156
 
157
  Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
 
174
  | marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
175
  | nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |
176
 
177
+ Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada.
178
 
179
  **Throughput**
180
 
convert.py CHANGED
@@ -20,7 +20,8 @@ configure_logging()
20
 
21
 
22
  @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
23
- def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
 
24
  if markdown_exists(out_folder, fname):
25
  return
26
  try:
@@ -28,21 +29,21 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
28
  # This can indicate that they were scanned, and not OCRed properly
29
  # Usually these files are not recent/high-quality
30
  if min_length:
31
- filetype = find_filetype(fname)
32
  if filetype == "other":
33
  return 0
34
 
35
- length = get_length_of_text(fname)
36
  if length < min_length:
37
  return
38
 
39
- full_text, images, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
40
  if len(full_text.strip()) > 0:
41
  save_markdown(out_folder, fname, full_text, images, out_metadata)
42
  else:
43
- print(f"Empty file: {fname}. Could not convert.")
44
  except Exception as e:
45
- print(f"Error converting {fname}: {e}")
46
  print(traceback.format_exc())
47
 
48
 
@@ -62,6 +63,7 @@ def main():
62
  in_folder = os.path.abspath(args.in_folder)
63
  out_folder = os.path.abspath(args.out_folder)
64
  files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
 
65
  os.makedirs(out_folder, exist_ok=True)
66
 
67
  # Handle chunks if we're processing in parallel
@@ -100,12 +102,12 @@ def main():
100
  print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
101
  futures = [
102
  process_single_pdf.options(num_gpus=gpu_frac).remote(
103
- filename,
104
  out_folder,
105
  model_refs,
106
- metadata=metadata.get(os.path.basename(filename)),
107
  min_length=args.min_length
108
- ) for filename in files_to_convert
109
  ]
110
 
111
  # Run all ray conversion tasks
 
20
 
21
 
22
  @ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
23
+ def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
24
+ fname = os.path.basename(filepath)
25
  if markdown_exists(out_folder, fname):
26
  return
27
  try:
 
29
  # This can indicate that they were scanned, and not OCRed properly
30
  # Usually these files are not recent/high-quality
31
  if min_length:
32
+ filetype = find_filetype(filepath)
33
  if filetype == "other":
34
  return 0
35
 
36
+ length = get_length_of_text(filepath)
37
  if length < min_length:
38
  return
39
 
40
+ full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
41
  if len(full_text.strip()) > 0:
42
  save_markdown(out_folder, fname, full_text, images, out_metadata)
43
  else:
44
+ print(f"Empty file: {filepath}. Could not convert.")
45
  except Exception as e:
46
+ print(f"Error converting {filepath}: {e}")
47
  print(traceback.format_exc())
48
 
49
 
 
63
  in_folder = os.path.abspath(args.in_folder)
64
  out_folder = os.path.abspath(args.out_folder)
65
  files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
66
+ files = [f for f in files if os.path.isfile(f)]
67
  os.makedirs(out_folder, exist_ok=True)
68
 
69
  # Handle chunks if we're processing in parallel
 
102
  print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
103
  futures = [
104
  process_single_pdf.options(num_gpus=gpu_frac).remote(
105
+ filepath,
106
  out_folder,
107
  model_refs,
108
+ metadata=metadata.get(os.path.basename(filepath)),
109
  min_length=args.min_length
110
+ ) for filepath in files_to_convert
111
  ]
112
 
113
  # Run all ray conversion tasks
marker/convert.py CHANGED
@@ -1,12 +1,10 @@
1
  import warnings
2
-
3
- from marker.utils import flush_cuda_memory
4
-
5
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
6
 
7
  import pypdfium2 as pdfium
8
  from PIL import Image
9
 
 
10
  from marker.tables.table import format_tables
11
  from marker.debug.data import dump_bbox_debug_data
12
  from marker.layout.layout import surya_layout, annotate_block_types
 
1
  import warnings
 
 
 
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
4
  import pypdfium2 as pdfium
5
  from PIL import Image
6
 
7
+ from marker.utils import flush_cuda_memory
8
  from marker.tables.table import format_tables
9
  from marker.debug.data import dump_bbox_debug_data
10
  from marker.layout.layout import surya_layout, annotate_block_types
marker/layout/order.py CHANGED
@@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
26
  # Get bboxes for all pages
27
  bboxes = []
28
  for page in pages:
29
- bbox = [b.bbox for b in page.layout.bboxes]
30
  bboxes.append(bbox)
31
 
32
  processor = order_model.processor
 
26
  # Get bboxes for all pages
27
  bboxes = []
28
  for page in pages:
29
+ bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
30
  bboxes.append(bbox)
31
 
32
  processor = order_model.processor
marker/models.py CHANGED
@@ -50,7 +50,9 @@ def load_all_models(langs=None):
50
  layout = setup_layout_model()
51
  order = setup_order_model()
52
  edit = load_editing_model()
53
- ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
 
 
54
  texify = setup_texify_model()
55
  model_lst = [texify, layout, order, edit, detection, ocr]
56
- return model_lst
 
50
  layout = setup_layout_model()
51
  order = setup_order_model()
52
  edit = load_editing_model()
53
+
54
+ # Only load recognition model if we'll need it for all pdfs
55
+ ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None
56
  texify = setup_texify_model()
57
  model_lst = [texify, layout, order, edit, detection, ocr]
58
+ return model_lst
marker/ocr/lang.py CHANGED
@@ -1,14 +1,23 @@
 
 
1
  from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
 
2
 
3
  from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
4
  from marker.settings import settings
5
 
6
 
 
 
 
 
 
 
7
  def replace_langs_with_codes(langs):
8
  if settings.OCR_ENGINE == "surya":
9
  for i, lang in enumerate(langs):
10
- if lang in LANGUAGE_TO_CODE:
11
- langs[i] = LANGUAGE_TO_CODE[lang]
12
  else:
13
  for i, lang in enumerate(langs):
14
  if lang in LANGUAGE_TO_CODE:
 
1
+ from typing import List
2
+
3
  from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
4
+ from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
5
 
6
  from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
7
  from marker.settings import settings
8
 
9
 
10
+ def langs_to_ids(langs: List[str]):
11
+ unique_langs = list(set(langs))
12
+ _, lang_tokens = lang_tokenize("", unique_langs)
13
+ return lang_tokens
14
+
15
+
16
  def replace_langs_with_codes(langs):
17
  if settings.OCR_ENGINE == "surya":
18
  for i, lang in enumerate(langs):
19
+ if lang.title() in LANGUAGE_TO_CODE:
20
+ langs[i] = LANGUAGE_TO_CODE[lang.title()]
21
  else:
22
  for i, lang in enumerate(langs):
23
  if lang in LANGUAGE_TO_CODE:
marker/ocr/recognition.py CHANGED
@@ -7,7 +7,9 @@ from concurrent.futures import ThreadPoolExecutor
7
 
8
  from surya.ocr import run_recognition
9
 
 
10
  from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
 
11
  from marker.pdf.images import render_image
12
  from marker.schema.page import Page
13
  from marker.schema.block import Block, Line, Span
@@ -19,7 +21,7 @@ def get_batch_size():
19
  if settings.RECOGNITION_BATCH_SIZE is not None:
20
  return settings.RECOGNITION_BATCH_SIZE
21
  elif settings.TORCH_DEVICE_MODEL == "cuda":
22
- return 64
23
  elif settings.TORCH_DEVICE_MODEL == "mps":
24
  return 32
25
  return 32
@@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
37
  ocr_idxs.append(pnum)
38
  ocr_pages += 1
39
 
 
 
 
 
40
  ocr_method = settings.OCR_ENGINE
41
  if ocr_method is None:
42
  return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
43
  elif ocr_method == "surya":
 
 
 
 
 
 
 
44
  new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
 
 
 
45
  elif ocr_method == "ocrmypdf":
46
  new_pages = tesseract_recognition(doc, ocr_idxs, langs)
47
  else:
 
7
 
8
  from surya.ocr import run_recognition
9
 
10
+ from marker.models import setup_recognition_model
11
  from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
12
+ from marker.ocr.lang import langs_to_ids
13
  from marker.pdf.images import render_image
14
  from marker.schema.page import Page
15
  from marker.schema.block import Block, Line, Span
 
21
  if settings.RECOGNITION_BATCH_SIZE is not None:
22
  return settings.RECOGNITION_BATCH_SIZE
23
  elif settings.TORCH_DEVICE_MODEL == "cuda":
24
+ return 32
25
  elif settings.TORCH_DEVICE_MODEL == "mps":
26
  return 32
27
  return 32
 
39
  ocr_idxs.append(pnum)
40
  ocr_pages += 1
41
 
42
+ # No pages need OCR
43
+ if ocr_pages == 0:
44
+ return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
45
+
46
  ocr_method = settings.OCR_ENGINE
47
  if ocr_method is None:
48
  return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
49
  elif ocr_method == "surya":
50
+ # Load model just in time if we're not OCRing everything
51
+ del_rec_model = False
52
+ if rec_model is None:
53
+ lang_tokens = langs_to_ids(langs)
54
+ rec_model = setup_recognition_model(lang_tokens)
55
+ del_rec_model = True
56
+
57
  new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
58
+
59
+ if del_rec_model:
60
+ del rec_model
61
  elif ocr_method == "ocrmypdf":
62
  new_pages = tesseract_recognition(doc, ocr_idxs, langs)
63
  else:
marker/settings.py CHANGED
@@ -70,6 +70,7 @@ class Settings(BaseSettings):
70
  # Ordering model
71
  SURYA_ORDER_DPI: int = 96
72
  ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
 
73
 
74
  # Final editing model
75
  EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
 
70
  # Ordering model
71
  SURYA_ORDER_DPI: int = 96
72
  ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
73
+ ORDER_MAX_BBOXES: int = 255
74
 
75
  # Final editing model
76
  EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "0.2.0"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "0.2.2"
4
  description = "Convert PDF to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"