Vik Paruchuri
committed on
Commit
·
c5a5454
1
Parent(s):
5510f81
Fix deployment bugs
Browse files- README.md +12 -1
- convert.py +11 -9
- marker/convert.py +1 -3
- marker/layout/order.py +1 -1
- marker/models.py +4 -2
- marker/ocr/lang.py +11 -2
- marker/ocr/recognition.py +17 -1
- marker/settings.py +1 -0
- pyproject.toml +1 -1
README.md
CHANGED
|
@@ -141,6 +141,17 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
|
|
| 141 |
|
| 142 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# Benchmarks
|
| 145 |
|
| 146 |
Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
|
|
@@ -163,7 +174,7 @@ First 3 are non-arXiv books, last 3 are arXiv papers.
|
|
| 163 |
| marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
|
| 164 |
| nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |
|
| 165 |
|
| 166 |
-
Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `
|
| 167 |
|
| 168 |
**Throughput**
|
| 169 |
|
|
|
|
| 141 |
|
| 142 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
| 143 |
|
| 144 |
+
# Important settings/Troubleshooting
|
| 145 |
+
|
| 146 |
+
There are some settings that you may find especially useful if things aren't working the way you expect:
|
| 147 |
+
|
| 148 |
+
- `OCR_ALL_PAGES` - set this to true to force OCR all pages. This can be very useful if the table layouts aren't recognized properly by default, or if there is garbled text.
|
| 149 |
+
- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
|
| 150 |
+
- `OCR_ENGINE` - can set this to `surya` or `ocrmypdf`.
|
| 151 |
+
- `DEBUG` - setting this to `True` shows ray logs when converting multiple pdfs
|
| 152 |
+
|
| 153 |
+
In general, if output is not what you expect, trying to OCR the PDF is a good first step.
|
| 154 |
+
|
| 155 |
# Benchmarks
|
| 156 |
|
| 157 |
Benchmarking PDF extraction quality is hard. I've created a test set by finding books and scientific papers that have a pdf version and a latex source. I convert the latex to text, and compare the reference to the output of text extraction methods. It's noisy, but at least directionally correct.
|
|
|
|
| 174 |
| marker | 0.536176 | 0.516833 | 0.70515 | 0.710657 | 0.690042 | 0.523467 |
|
| 175 |
| nougat | 0.44009 | 0.588973 | 0.322706 | 0.401342 | 0.160842 | 0.525663 |
|
| 176 |
|
| 177 |
+
Peak GPU memory usage during the benchmark is `4.2GB` for nougat, and `4.1GB` for marker. Benchmarks were run on an A6000 Ada.
|
| 178 |
|
| 179 |
**Throughput**
|
| 180 |
|
convert.py
CHANGED
|
@@ -20,7 +20,8 @@ configure_logging()
|
|
| 20 |
|
| 21 |
|
| 22 |
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
|
| 23 |
-
def process_single_pdf(
|
|
|
|
| 24 |
if markdown_exists(out_folder, fname):
|
| 25 |
return
|
| 26 |
try:
|
|
@@ -28,21 +29,21 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
|
|
| 28 |
# This can indicate that they were scanned, and not OCRed properly
|
| 29 |
# Usually these files are not recent/high-quality
|
| 30 |
if min_length:
|
| 31 |
-
filetype = find_filetype(
|
| 32 |
if filetype == "other":
|
| 33 |
return 0
|
| 34 |
|
| 35 |
-
length = get_length_of_text(
|
| 36 |
if length < min_length:
|
| 37 |
return
|
| 38 |
|
| 39 |
-
full_text, images, out_metadata = convert_single_pdf(
|
| 40 |
if len(full_text.strip()) > 0:
|
| 41 |
save_markdown(out_folder, fname, full_text, images, out_metadata)
|
| 42 |
else:
|
| 43 |
-
print(f"Empty file: {
|
| 44 |
except Exception as e:
|
| 45 |
-
print(f"Error converting {
|
| 46 |
print(traceback.format_exc())
|
| 47 |
|
| 48 |
|
|
@@ -62,6 +63,7 @@ def main():
|
|
| 62 |
in_folder = os.path.abspath(args.in_folder)
|
| 63 |
out_folder = os.path.abspath(args.out_folder)
|
| 64 |
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
|
|
|
|
| 65 |
os.makedirs(out_folder, exist_ok=True)
|
| 66 |
|
| 67 |
# Handle chunks if we're processing in parallel
|
|
@@ -100,12 +102,12 @@ def main():
|
|
| 100 |
print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
|
| 101 |
futures = [
|
| 102 |
process_single_pdf.options(num_gpus=gpu_frac).remote(
|
| 103 |
-
|
| 104 |
out_folder,
|
| 105 |
model_refs,
|
| 106 |
-
metadata=metadata.get(os.path.basename(
|
| 107 |
min_length=args.min_length
|
| 108 |
-
) for
|
| 109 |
]
|
| 110 |
|
| 111 |
# Run all ray conversion tasks
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
|
| 23 |
+
def process_single_pdf(filepath: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
|
| 24 |
+
fname = os.path.basename(filepath)
|
| 25 |
if markdown_exists(out_folder, fname):
|
| 26 |
return
|
| 27 |
try:
|
|
|
|
| 29 |
# This can indicate that they were scanned, and not OCRed properly
|
| 30 |
# Usually these files are not recent/high-quality
|
| 31 |
if min_length:
|
| 32 |
+
filetype = find_filetype(filepath)
|
| 33 |
if filetype == "other":
|
| 34 |
return 0
|
| 35 |
|
| 36 |
+
length = get_length_of_text(filepath)
|
| 37 |
if length < min_length:
|
| 38 |
return
|
| 39 |
|
| 40 |
+
full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
|
| 41 |
if len(full_text.strip()) > 0:
|
| 42 |
save_markdown(out_folder, fname, full_text, images, out_metadata)
|
| 43 |
else:
|
| 44 |
+
print(f"Empty file: {filepath}. Could not convert.")
|
| 45 |
except Exception as e:
|
| 46 |
+
print(f"Error converting {filepath}: {e}")
|
| 47 |
print(traceback.format_exc())
|
| 48 |
|
| 49 |
|
|
|
|
| 63 |
in_folder = os.path.abspath(args.in_folder)
|
| 64 |
out_folder = os.path.abspath(args.out_folder)
|
| 65 |
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
|
| 66 |
+
files = [f for f in files if os.path.isfile(f)]
|
| 67 |
os.makedirs(out_folder, exist_ok=True)
|
| 68 |
|
| 69 |
# Handle chunks if we're processing in parallel
|
|
|
|
| 102 |
print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
|
| 103 |
futures = [
|
| 104 |
process_single_pdf.options(num_gpus=gpu_frac).remote(
|
| 105 |
+
filepath,
|
| 106 |
out_folder,
|
| 107 |
model_refs,
|
| 108 |
+
metadata=metadata.get(os.path.basename(filepath)),
|
| 109 |
min_length=args.min_length
|
| 110 |
+
) for filepath in files_to_convert
|
| 111 |
]
|
| 112 |
|
| 113 |
# Run all ray conversion tasks
|
marker/convert.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
import warnings
|
| 2 |
-
|
| 3 |
-
from marker.utils import flush_cuda_memory
|
| 4 |
-
|
| 5 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 6 |
|
| 7 |
import pypdfium2 as pdfium
|
| 8 |
from PIL import Image
|
| 9 |
|
|
|
|
| 10 |
from marker.tables.table import format_tables
|
| 11 |
from marker.debug.data import dump_bbox_debug_data
|
| 12 |
from marker.layout.layout import surya_layout, annotate_block_types
|
|
|
|
| 1 |
import warnings
|
|
|
|
|
|
|
|
|
|
| 2 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 3 |
|
| 4 |
import pypdfium2 as pdfium
|
| 5 |
from PIL import Image
|
| 6 |
|
| 7 |
+
from marker.utils import flush_cuda_memory
|
| 8 |
from marker.tables.table import format_tables
|
| 9 |
from marker.debug.data import dump_bbox_debug_data
|
| 10 |
from marker.layout.layout import surya_layout, annotate_block_types
|
marker/layout/order.py
CHANGED
|
@@ -26,7 +26,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
|
|
| 26 |
# Get bboxes for all pages
|
| 27 |
bboxes = []
|
| 28 |
for page in pages:
|
| 29 |
-
bbox = [b.bbox for b in page.layout.bboxes]
|
| 30 |
bboxes.append(bbox)
|
| 31 |
|
| 32 |
processor = order_model.processor
|
|
|
|
| 26 |
# Get bboxes for all pages
|
| 27 |
bboxes = []
|
| 28 |
for page in pages:
|
| 29 |
+
bbox = [b.bbox for b in page.layout.bboxes][:settings.ORDER_MAX_BBOXES]
|
| 30 |
bboxes.append(bbox)
|
| 31 |
|
| 32 |
processor = order_model.processor
|
marker/models.py
CHANGED
|
@@ -50,7 +50,9 @@ def load_all_models(langs=None):
|
|
| 50 |
layout = setup_layout_model()
|
| 51 |
order = setup_order_model()
|
| 52 |
edit = load_editing_model()
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
texify = setup_texify_model()
|
| 55 |
model_lst = [texify, layout, order, edit, detection, ocr]
|
| 56 |
-
return model_lst
|
|
|
|
| 50 |
layout = setup_layout_model()
|
| 51 |
order = setup_order_model()
|
| 52 |
edit = load_editing_model()
|
| 53 |
+
|
| 54 |
+
# Only load recognition model if we'll need it for all pdfs
|
| 55 |
+
ocr = setup_recognition_model(langs) if (settings.OCR_ENGINE == "surya" and settings.OCR_ALL_PAGES) else None
|
| 56 |
texify = setup_texify_model()
|
| 57 |
model_lst = [texify, layout, order, edit, detection, ocr]
|
| 58 |
+
return model_lst
|
marker/ocr/lang.py
CHANGED
|
@@ -1,14 +1,23 @@
|
|
|
|
|
|
|
|
| 1 |
from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
|
|
|
|
| 2 |
|
| 3 |
from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
|
| 4 |
from marker.settings import settings
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
def replace_langs_with_codes(langs):
|
| 8 |
if settings.OCR_ENGINE == "surya":
|
| 9 |
for i, lang in enumerate(langs):
|
| 10 |
-
if lang in LANGUAGE_TO_CODE:
|
| 11 |
-
langs[i] = LANGUAGE_TO_CODE[lang]
|
| 12 |
else:
|
| 13 |
for i, lang in enumerate(langs):
|
| 14 |
if lang in LANGUAGE_TO_CODE:
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE
|
| 4 |
+
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize
|
| 5 |
|
| 6 |
from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE
|
| 7 |
from marker.settings import settings
|
| 8 |
|
| 9 |
|
| 10 |
+
def langs_to_ids(langs: List[str]):
|
| 11 |
+
unique_langs = list(set(langs))
|
| 12 |
+
_, lang_tokens = lang_tokenize("", unique_langs)
|
| 13 |
+
return lang_tokens
|
| 14 |
+
|
| 15 |
+
|
| 16 |
def replace_langs_with_codes(langs):
|
| 17 |
if settings.OCR_ENGINE == "surya":
|
| 18 |
for i, lang in enumerate(langs):
|
| 19 |
+
if lang.title() in LANGUAGE_TO_CODE:
|
| 20 |
+
langs[i] = LANGUAGE_TO_CODE[lang.title()]
|
| 21 |
else:
|
| 22 |
for i, lang in enumerate(langs):
|
| 23 |
if lang in LANGUAGE_TO_CODE:
|
marker/ocr/recognition.py
CHANGED
|
@@ -7,7 +7,9 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 7 |
|
| 8 |
from surya.ocr import run_recognition
|
| 9 |
|
|
|
|
| 10 |
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
|
|
|
|
| 11 |
from marker.pdf.images import render_image
|
| 12 |
from marker.schema.page import Page
|
| 13 |
from marker.schema.block import Block, Line, Span
|
|
@@ -19,7 +21,7 @@ def get_batch_size():
|
|
| 19 |
if settings.RECOGNITION_BATCH_SIZE is not None:
|
| 20 |
return settings.RECOGNITION_BATCH_SIZE
|
| 21 |
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 22 |
-
return
|
| 23 |
elif settings.TORCH_DEVICE_MODEL == "mps":
|
| 24 |
return 32
|
| 25 |
return 32
|
|
@@ -37,11 +39,25 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
|
|
| 37 |
ocr_idxs.append(pnum)
|
| 38 |
ocr_pages += 1
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
ocr_method = settings.OCR_ENGINE
|
| 41 |
if ocr_method is None:
|
| 42 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 43 |
elif ocr_method == "surya":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
|
|
|
|
|
|
|
|
|
|
| 45 |
elif ocr_method == "ocrmypdf":
|
| 46 |
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
|
| 47 |
else:
|
|
|
|
| 7 |
|
| 8 |
from surya.ocr import run_recognition
|
| 9 |
|
| 10 |
+
from marker.models import setup_recognition_model
|
| 11 |
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
|
| 12 |
+
from marker.ocr.lang import langs_to_ids
|
| 13 |
from marker.pdf.images import render_image
|
| 14 |
from marker.schema.page import Page
|
| 15 |
from marker.schema.block import Block, Line, Span
|
|
|
|
| 21 |
if settings.RECOGNITION_BATCH_SIZE is not None:
|
| 22 |
return settings.RECOGNITION_BATCH_SIZE
|
| 23 |
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 24 |
+
return 32
|
| 25 |
elif settings.TORCH_DEVICE_MODEL == "mps":
|
| 26 |
return 32
|
| 27 |
return 32
|
|
|
|
| 39 |
ocr_idxs.append(pnum)
|
| 40 |
ocr_pages += 1
|
| 41 |
|
| 42 |
+
# No pages need OCR
|
| 43 |
+
if ocr_pages == 0:
|
| 44 |
+
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 45 |
+
|
| 46 |
ocr_method = settings.OCR_ENGINE
|
| 47 |
if ocr_method is None:
|
| 48 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 49 |
elif ocr_method == "surya":
|
| 50 |
+
# Load model just in time if we're not OCRing everything
|
| 51 |
+
del_rec_model = False
|
| 52 |
+
if rec_model is None:
|
| 53 |
+
lang_tokens = langs_to_ids(langs)
|
| 54 |
+
rec_model = setup_recognition_model(lang_tokens)
|
| 55 |
+
del_rec_model = True
|
| 56 |
+
|
| 57 |
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
|
| 58 |
+
|
| 59 |
+
if del_rec_model:
|
| 60 |
+
del rec_model
|
| 61 |
elif ocr_method == "ocrmypdf":
|
| 62 |
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
|
| 63 |
else:
|
marker/settings.py
CHANGED
|
@@ -70,6 +70,7 @@ class Settings(BaseSettings):
|
|
| 70 |
# Ordering model
|
| 71 |
SURYA_ORDER_DPI: int = 96
|
| 72 |
ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
|
|
|
|
| 73 |
|
| 74 |
# Final editing model
|
| 75 |
EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
|
|
|
|
| 70 |
# Ordering model
|
| 71 |
SURYA_ORDER_DPI: int = 96
|
| 72 |
ORDER_BATCH_SIZE: Optional[int] = None # Defaults to 12 for cuda, 6 otherwise
|
| 73 |
+
ORDER_MAX_BBOXES: int = 255
|
| 74 |
|
| 75 |
# Final editing model
|
| 76 |
EDITOR_BATCH_SIZE: Optional[int] = None # Defaults to 6 for cuda, 12 otherwise
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "0.2.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "0.2.2"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|