Vik Paruchuri
committed on
Commit
·
fcfc3c0
1
Parent(s):
a6bdfaa
Add interactive app
Browse files- README.md +11 -1
- convert_single.py +2 -1
- marker/convert.py +5 -2
- marker/ocr/heuristics.py +2 -2
- marker/ocr/recognition.py +2 -2
- marker_app.py +118 -0
- pyproject.toml +3 -0
- run_marker_app.py +14 -0
README.md
CHANGED
|
@@ -90,6 +90,15 @@ First, some configuration:
|
|
| 90 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 91 |
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
## Convert a single file
|
| 94 |
|
| 95 |
```shell
|
|
@@ -98,7 +107,8 @@ marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --ma
|
|
| 98 |
|
| 99 |
- `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
|
| 100 |
- `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
|
| 101 |
-
- `--langs` is
|
|
|
|
| 102 |
|
| 103 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
|
| 104 |
|
|
|
|
| 90 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 91 |
- By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
|
| 92 |
|
| 93 |
+
## Interactive App
|
| 94 |
+
|
| 95 |
+
I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
|
| 96 |
+
|
| 97 |
+
```shell
|
| 98 |
+
pip install streamlit
|
| 99 |
+
marker_gui
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
## Convert a single file
|
| 103 |
|
| 104 |
```shell
|
|
|
|
| 107 |
|
| 108 |
- `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
|
| 109 |
- `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
|
| 110 |
+
- `--langs` is a comma-separated list of the languages in the document, used for OCR. It is optional by default, but required if you use tesseract.
|
| 111 |
+
- `--ocr_all_pages` is an optional flag that forces OCR on all pages of the PDF. OCR is forced if either this flag or the env var `OCR_ALL_PAGES` is true.
|
| 112 |
|
| 113 |
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
|
| 114 |
|
convert_single.py
CHANGED
|
@@ -23,6 +23,7 @@ def main():
|
|
| 23 |
parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
|
| 24 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 25 |
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
|
|
|
|
| 26 |
args = parser.parse_args()
|
| 27 |
|
| 28 |
langs = args.langs.split(",") if args.langs else None
|
|
@@ -30,7 +31,7 @@ def main():
|
|
| 30 |
fname = args.filename
|
| 31 |
model_lst = load_all_models()
|
| 32 |
start = time.time()
|
| 33 |
-
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
|
| 34 |
|
| 35 |
fname = os.path.basename(fname)
|
| 36 |
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
|
|
|
|
| 23 |
parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
|
| 24 |
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
|
| 25 |
parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
|
| 26 |
+
parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
|
| 27 |
args = parser.parse_args()
|
| 28 |
|
| 29 |
langs = args.langs.split(",") if args.langs else None
|
|
|
|
| 31 |
fname = args.filename
|
| 32 |
model_lst = load_all_models()
|
| 33 |
start = time.time()
|
| 34 |
+
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page, ocr_all_pages=args.ocr_all_pages)
|
| 35 |
|
| 36 |
fname = os.path.basename(fname)
|
| 37 |
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
|
marker/convert.py
CHANGED
|
@@ -41,8 +41,11 @@ def convert_single_pdf(
|
|
| 41 |
start_page: int = None,
|
| 42 |
metadata: Optional[Dict] = None,
|
| 43 |
langs: Optional[List[str]] = None,
|
| 44 |
-
batch_multiplier: int = 1
|
|
|
|
| 45 |
) -> Tuple[str, Dict[str, Image.Image], Dict]:
|
|
|
|
|
|
|
| 46 |
if metadata:
|
| 47 |
langs = metadata.get("languages", langs)
|
| 48 |
|
|
@@ -87,7 +90,7 @@ def convert_single_pdf(
|
|
| 87 |
flush_cuda_memory()
|
| 88 |
|
| 89 |
# OCR pages as needed
|
| 90 |
-
pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
|
| 91 |
flush_cuda_memory()
|
| 92 |
|
| 93 |
out_meta["ocr_stats"] = ocr_stats
|
|
|
|
| 41 |
start_page: int = None,
|
| 42 |
metadata: Optional[Dict] = None,
|
| 43 |
langs: Optional[List[str]] = None,
|
| 44 |
+
batch_multiplier: int = 1,
|
| 45 |
+
ocr_all_pages: bool = False
|
| 46 |
) -> Tuple[str, Dict[str, Image.Image], Dict]:
|
| 47 |
+
ocr_all_pages = ocr_all_pages or settings.OCR_ALL_PAGES
|
| 48 |
+
|
| 49 |
if metadata:
|
| 50 |
langs = metadata.get("languages", langs)
|
| 51 |
|
|
|
|
| 90 |
flush_cuda_memory()
|
| 91 |
|
| 92 |
# OCR pages as needed
|
| 93 |
+
pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
|
| 94 |
flush_cuda_memory()
|
| 95 |
|
| 96 |
out_meta["ocr_stats"] = ocr_stats
|
marker/ocr/heuristics.py
CHANGED
|
@@ -7,7 +7,7 @@ from marker.schema.page import Page
|
|
| 7 |
from marker.settings import settings
|
| 8 |
|
| 9 |
|
| 10 |
-
def should_ocr_page(page: Page, no_text: bool):
|
| 11 |
detected_lines_found, total_lines = detected_line_coverage(page)
|
| 12 |
|
| 13 |
# No reason to OCR page if it has no text lines
|
|
@@ -21,7 +21,7 @@ def should_ocr_page(page: Page, no_text: bool):
|
|
| 21 |
detected_lines_found is False, # didn't extract text for all detected lines
|
| 22 |
]
|
| 23 |
|
| 24 |
-
return any(conditions) or
|
| 25 |
|
| 26 |
|
| 27 |
def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
|
|
|
|
| 7 |
from marker.settings import settings
|
| 8 |
|
| 9 |
|
| 10 |
+
def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
|
| 11 |
detected_lines_found, total_lines = detected_line_coverage(page)
|
| 12 |
|
| 13 |
# No reason to OCR page if it has no text lines
|
|
|
|
| 21 |
detected_lines_found is False, # didn't extract text for all detected lines
|
| 22 |
]
|
| 23 |
|
| 24 |
+
return any(conditions) or ocr_all_pages
|
| 25 |
|
| 26 |
|
| 27 |
def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
|
marker/ocr/recognition.py
CHANGED
|
@@ -28,14 +28,14 @@ def get_batch_size():
|
|
| 28 |
return 32
|
| 29 |
|
| 30 |
|
| 31 |
-
def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1) -> (List[Page], Dict):
|
| 32 |
ocr_pages = 0
|
| 33 |
ocr_success = 0
|
| 34 |
ocr_failed = 0
|
| 35 |
no_text = no_text_found(pages)
|
| 36 |
ocr_idxs = []
|
| 37 |
for pnum, page in enumerate(pages):
|
| 38 |
-
ocr_needed = should_ocr_page(page, no_text)
|
| 39 |
if ocr_needed:
|
| 40 |
ocr_idxs.append(pnum)
|
| 41 |
ocr_pages += 1
|
|
|
|
| 28 |
return 32
|
| 29 |
|
| 30 |
|
| 31 |
+
def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1, ocr_all_pages=False) -> (List[Page], Dict):
|
| 32 |
ocr_pages = 0
|
| 33 |
ocr_success = 0
|
| 34 |
ocr_failed = 0
|
| 35 |
no_text = no_text_found(pages)
|
| 36 |
ocr_idxs = []
|
| 37 |
for pnum, page in enumerate(pages):
|
| 38 |
+
ocr_needed = should_ocr_page(page, no_text, ocr_all_pages=ocr_all_pages)
|
| 39 |
if ocr_needed:
|
| 40 |
ocr_idxs.append(pnum)
|
| 41 |
ocr_pages += 1
|
marker_app.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
| 3 |
+
os.environ["IN_STREAMLIT"] = "true"
|
| 4 |
+
|
| 5 |
+
import base64
|
| 6 |
+
import io
|
| 7 |
+
import re
|
| 8 |
+
import tempfile
|
| 9 |
+
from typing import List, Any, Dict
|
| 10 |
+
|
| 11 |
+
import pypdfium2
|
| 12 |
+
import streamlit as st
|
| 13 |
+
|
| 14 |
+
from marker.convert import convert_single_pdf
|
| 15 |
+
from marker.models import load_all_models
|
| 16 |
+
from surya.languages import CODE_TO_LANGUAGE
|
| 17 |
+
|
| 18 |
+
@st.cache_resource()
def load_models():
    """Return the result of ``load_all_models()``, wrapped in ``st.cache_resource``."""
    models = load_all_models()
    return models
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def convert_pdf(fname: str, langs: List[str] | None, max_pages: int | None, ocr_all_pages: bool) -> tuple[str, Dict[str, Any], dict]:
    """Convert the PDF at ``fname`` with ``convert_single_pdf``.

    Uses the module-level ``model_lst`` as the model list.

    Args:
        fname: Path of the PDF file to convert.
        langs: Languages forwarded as ``langs``; may be ``None``.
        max_pages: Forwarded as ``max_pages``; may be ``None``.
        ocr_all_pages: Forwarded as ``ocr_all_pages``.

    Returns:
        The ``(full_text, images, out_meta)`` triple returned by
        ``convert_single_pdf``.
    """
    return convert_single_pdf(
        fname,
        model_lst,
        max_pages=max_pages,
        langs=langs,
        ocr_all_pages=ocr_all_pages,
    )
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def open_pdf(pdf_file):
    """Open an in-memory PDF (an object exposing ``getvalue()``) as a ``pypdfium2.PdfDocument``."""
    return pypdfium2.PdfDocument(io.BytesIO(pdf_file.getvalue()))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def img_to_html(img, img_alt):
    """Render an image as an inline HTML ``<img>`` tag with a base64 PNG data URI.

    Args:
        img: Image object exposing ``save(file_obj, format=...)``.
        img_alt: Alt text for the tag. It is HTML-escaped so quotes or angle
            brackets cannot break out of the ``alt`` attribute.

    Returns:
        The ``<img>`` tag as a string.
    """
    import html  # local import: only this function needs it

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()
    safe_alt = html.escape(str(img_alt), quote=True)
    return f'<img src="data:image/png;base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def markdown_insert_images(markdown, images):
    """Replace markdown image references with inline HTML ``<img>`` tags.

    Args:
        markdown: Markdown text that may contain ``![alt](path)`` references,
            optionally followed by a title inside the parentheses.
        images: Mapping of image path to an image object accepted by
            ``img_to_html``.

    Returns:
        The markdown with every reference whose path is a key of ``images``
        replaced by the output of ``img_to_html``. Other references are
        returned unchanged.
    """
    # The alt text may be empty (``![](path)``), hence ``*`` rather than ``+``.
    image_pattern = r'!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\)'

    def _inline(match):
        image_path = match.group("image_path")
        if image_path not in images:
            # Unknown image: keep the original markdown untouched.
            return match.group(0)
        return img_to_html(images[image_path], match.group("image_title"))

    # One pass over the document instead of one full-string replace per match.
    return re.sub(image_pattern, _inline, markdown)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
    """Render a single page of ``pdf_file`` and return it as an RGB image.

    Args:
        pdf_file: Object accepted by ``open_pdf``.
        page_num: Page to render; ``page_num - 1`` is used as the page index.
        dpi: Rendering resolution; the render scale is ``dpi / 72``.

    Returns:
        The rendered page after ``convert("RGB")``.
    """
    document = open_pdf(pdf_file)
    zero_based_index = page_num - 1
    rendered_pages = list(
        document.render(
            pypdfium2.PdfBitmap.to_pil,
            page_indices=[zero_based_index],
            scale=dpi / 72,
        )
    )
    # Only one index was requested, so the first result is the page.
    return rendered_pages[0].convert("RGB")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@st.cache_data()
def page_count(pdf_file):
    """Return ``len()`` of the document that ``open_pdf`` creates from ``pdf_file``."""
    return len(open_pdf(pdf_file))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# --- Streamlit page: layout, sidebar options, PDF preview and conversion ---
st.set_page_config(layout="wide")
col1, col2 = st.columns([.5, .5])

model_lst = load_models()


st.markdown("""
# Marker Demo

This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.

Find the project [here](https://github.com/VikParuchuri/marker).
""")

in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
languages = st.sidebar.multiselect("Languages", sorted(CODE_TO_LANGUAGE.values()), default=[], max_selections=4, help="Select the languages in the pdf (if known) to improve OCR accuracy. Optional.")
max_pages = st.sidebar.number_input("Max pages to parse", min_value=1, value=10, help="Optional maximum number of pages to convert")
ocr_all_pages = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)

# Nothing to preview or convert until a file has been uploaded.
if in_file is None:
    st.stop()

filetype = in_file.type

with col1:
    # Keep the count in its own name so the cached `page_count` function is
    # not overwritten by an int.
    total_pages = page_count(in_file)
    page_number = st.number_input(f"Page number out of {total_pages}:", min_value=1, value=1, max_value=total_pages)
    pil_image = get_page_image(in_file, page_number)

    st.image(pil_image, caption="PDF file (preview)", use_column_width=True)

run_marker = st.sidebar.button("Run Marker")

if not run_marker:
    st.stop()

# Run Marker
# Write the upload to a real path inside a temporary directory: a
# NamedTemporaryFile that is still open cannot be reopened by name on Windows.
with tempfile.TemporaryDirectory() as temp_dir:
    filename = os.path.join(temp_dir, "upload.pdf")
    with open(filename, "wb") as temp_pdf:
        temp_pdf.write(in_file.getvalue())
    md_text, images, out_meta = convert_pdf(filename, languages, max_pages, ocr_all_pages)

md_text = markdown_insert_images(md_text, images)
with col2:
    st.markdown(md_text, unsafe_allow_html=True)
|
| 118 |
+
|
pyproject.toml
CHANGED
|
@@ -15,6 +15,8 @@ include = [
|
|
| 15 |
"convert_single.py",
|
| 16 |
"chunk_convert.sh",
|
| 17 |
"chunk_convert.py",
|
|
|
|
|
|
|
| 18 |
]
|
| 19 |
|
| 20 |
[tool.poetry.dependencies]
|
|
@@ -45,6 +47,7 @@ jupyter = "^1.0.0"
|
|
| 45 |
marker = "convert:main"
|
| 46 |
marker_single = "convert_single:main"
|
| 47 |
marker_chunk_convert = "chunk_convert:main"
|
|
|
|
| 48 |
|
| 49 |
[build-system]
|
| 50 |
requires = ["poetry-core"]
|
|
|
|
| 15 |
"convert_single.py",
|
| 16 |
"chunk_convert.sh",
|
| 17 |
"chunk_convert.py",
|
| 18 |
+
"marker_app.py",
|
| 19 |
+
"run_marker_app.py"
|
| 20 |
]
|
| 21 |
|
| 22 |
[tool.poetry.dependencies]
|
|
|
|
| 47 |
marker = "convert:main"
|
| 48 |
marker_single = "convert_single:main"
|
| 49 |
marker_chunk_convert = "chunk_convert:main"
|
| 50 |
+
marker_gui = "run_marker_app:run_app"
|
| 51 |
|
| 52 |
[build-system]
|
| 53 |
requires = ["poetry-core"]
|
run_marker_app.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import subprocess
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def run_app():
    """Launch ``marker_app.py`` (located next to this file) via ``streamlit run``.

    The child process gets the current environment plus ``IN_STREAMLIT=true``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    child_env = dict(os.environ)
    child_env["IN_STREAMLIT"] = "true"
    subprocess.run(
        ["streamlit", "run", os.path.join(here, "marker_app.py")],
        env=child_env,
    )
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
if __name__ == "__main__":
|
| 14 |
+
run_app()
|