Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Aug 19, 2024

Commit

fcfc3c0

1 Parent(s): a6bdfaa

Add interactive app

Browse files

Files changed (8) hide show

README.md +11 -1
convert_single.py +2 -1
marker/convert.py +5 -2
marker/ocr/heuristics.py +2 -2
marker/ocr/recognition.py +2 -2
marker_app.py +118 -0
pyproject.toml +3 -0
run_marker_app.py +14 -0

README.md CHANGED Viewed

@@ -90,6 +90,15 @@ First, some configuration:
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
 - By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  It also doesn't require you to specify the languages in the document.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).  If you don't want OCR at all, set `OCR_ENGINE` to `None`.
 ## Convert a single file
 ```shell
@@ -98,7 +107,8 @@ marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --ma
 - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM.  Higher numbers will take more VRAM, but process faster.  Set to 2 by default.  The default batch sizes will take ~3GB of VRAM.
 - `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.
-- `--langs` is am optional comma separated list of the languages in the document, for OCR.  Optional by default, required if you use tesseract.
 The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`.  If you don't need OCR, marker can work with any language.

 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
 - By default, marker will use `surya` for OCR.  Surya is slower on CPU, but more accurate than tesseract.  It also doesn't require you to specify the languages in the document.  If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above).  If you don't want OCR at all, set `OCR_ENGINE` to `None`.
+## Interactive App
+I've included a streamlit app that lets you interactively try marker with some basic options.  Run it with:
+```shell
+pip install streamlit
+marker_gui
+```
 ## Convert a single file
 ```shell
 - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM.  Higher numbers will take more VRAM, but process faster.  Set to 2 by default.  The default batch sizes will take ~3GB of VRAM.
 - `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.
+- `--langs` is an optional comma separated list of the languages in the document, for OCR.  Optional by default, required if you use tesseract.
+- `--ocr_all_pages` is an optional argument to force OCR on all pages of the PDF.  If either this flag or the env var `OCR_ALL_PAGES` is true, OCR will be forced.
 The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py).  If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`.  If you don't need OCR, marker can work with any language.

convert_single.py CHANGED Viewed

@@ -23,6 +23,7 @@ def main():
     parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
     parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
     args = parser.parse_args()
     langs = args.langs.split(",") if args.langs else None
@@ -30,7 +31,7 @@ def main():
     fname = args.filename
     model_lst = load_all_models()
     start = time.time()
-    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
     fname = os.path.basename(fname)
     subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

     parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
     parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
     parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
+    parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
     args = parser.parse_args()
     langs = args.langs.split(",") if args.langs else None
     fname = args.filename
     model_lst = load_all_models()
     start = time.time()
+    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page, ocr_all_pages=args.ocr_all_pages)
     fname = os.path.basename(fname)
     subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

marker/convert.py CHANGED Viewed

@@ -41,8 +41,11 @@ def convert_single_pdf(
         start_page: int = None,
         metadata: Optional[Dict] = None,
         langs: Optional[List[str]] = None,
-        batch_multiplier: int = 1
 ) -> Tuple[str, Dict[str, Image.Image], Dict]:
     if metadata:
         langs = metadata.get("languages", langs)
@@ -87,7 +90,7 @@ def convert_single_pdf(
     flush_cuda_memory()
     # OCR pages as needed
-    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
     flush_cuda_memory()
     out_meta["ocr_stats"] = ocr_stats

         start_page: int = None,
         metadata: Optional[Dict] = None,
         langs: Optional[List[str]] = None,
+        batch_multiplier: int = 1,
+        ocr_all_pages: bool = False
 ) -> Tuple[str, Dict[str, Image.Image], Dict]:
+    ocr_all_pages = ocr_all_pages or settings.OCR_ALL_PAGES
     if metadata:
         langs = metadata.get("languages", langs)
     flush_cuda_memory()
     # OCR pages as needed
+    pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
     flush_cuda_memory()
     out_meta["ocr_stats"] = ocr_stats

marker/ocr/heuristics.py CHANGED Viewed

@@ -7,7 +7,7 @@ from marker.schema.page import Page
 from marker.settings import settings
-def should_ocr_page(page: Page, no_text: bool):
     detected_lines_found, total_lines = detected_line_coverage(page)
     # No reason to OCR page if it has no text lines
@@ -21,7 +21,7 @@ def should_ocr_page(page: Page, no_text: bool):
         detected_lines_found is False, # didn't extract text for all detected lines
     ]
-    return any(conditions) or settings.OCR_ALL_PAGES
 def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):

 from marker.settings import settings
+def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
     detected_lines_found, total_lines = detected_line_coverage(page)
     # No reason to OCR page if it has no text lines
         detected_lines_found is False, # didn't extract text for all detected lines
     ]
+    return any(conditions) or ocr_all_pages
 def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):

marker/ocr/recognition.py CHANGED Viewed

@@ -28,14 +28,14 @@ def get_batch_size():
     return 32
-def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1) -> (List[Page], Dict):
     ocr_pages = 0
     ocr_success = 0
     ocr_failed = 0
     no_text = no_text_found(pages)
     ocr_idxs = []
     for pnum, page in enumerate(pages):
-        ocr_needed = should_ocr_page(page, no_text)
         if ocr_needed:
             ocr_idxs.append(pnum)
             ocr_pages += 1

     return 32
+def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1, ocr_all_pages=False) -> (List[Page], Dict):
     ocr_pages = 0
     ocr_success = 0
     ocr_failed = 0
     no_text = no_text_found(pages)
     ocr_idxs = []
     for pnum, page in enumerate(pages):
+        ocr_needed = should_ocr_page(page, no_text, ocr_all_pages=ocr_all_pages)
         if ocr_needed:
             ocr_idxs.append(pnum)
             ocr_pages += 1

marker_app.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import os
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+os.environ["IN_STREAMLIT"] = "true"
+import base64
+import io
+import re
+import tempfile
+from typing import List, Any, Dict
+import pypdfium2
+import streamlit as st
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+from surya.languages import CODE_TO_LANGUAGE
+@st.cache_resource()
+def load_models():
+    return load_all_models()
+def convert_pdf(fname: str, langs: List[str] | None, max_pages: int | None, ocr_all_pages: bool) -> (str, Dict[str, Any], dict):
+    full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=max_pages, langs=langs, ocr_all_pages=ocr_all_pages)
+    return full_text, images, out_meta
+def open_pdf(pdf_file):
+    stream = io.BytesIO(pdf_file.getvalue())
+    return pypdfium2.PdfDocument(stream)
+def img_to_html(img, img_alt):
+    img_bytes = io.BytesIO()
+    img.save(img_bytes, format="PNG")
+    img_bytes = img_bytes.getvalue()
+    encoded = base64.b64encode(img_bytes).decode()
+    img_html = f'<img src="data:image/png;base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
+    return img_html
+def markdown_insert_images(markdown, images):
+    image_tags = re.findall(r'(!\[(?P<image_title>[^\]]+)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
+    for image in image_tags:
+        image_markdown = image[0]
+        image_alt = image[1]
+        image_path = image[2]
+        if image_path in images:
+            markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
+    return markdown
+@st.cache_data()
+def get_page_image(pdf_file, page_num, dpi=96):
+    doc = open_pdf(pdf_file)
+    renderer = doc.render(
+        pypdfium2.PdfBitmap.to_pil,
+        page_indices=[page_num - 1],
+        scale=dpi / 72,
+    )
+    png = list(renderer)[0]
+    png_image = png.convert("RGB")
+    return png_image
+@st.cache_data()
+def page_count(pdf_file):
+    doc = open_pdf(pdf_file)
+    return len(doc)
+st.set_page_config(layout="wide")
+col1, col2 = st.columns([.5, .5])
+model_lst = load_models()
+st.markdown("""
+# Marker Demo
+This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.
+Find the project [here](https://github.com/VikParuchuri/marker).
+""")
+in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
+languages = st.sidebar.multiselect("Languages", sorted(list(CODE_TO_LANGUAGE.values())), default=[], max_selections=4, help="Select the languages in the pdf (if known) to improve OCR accuracy.  Optional.")
+max_pages = st.sidebar.number_input("Max pages to parse", min_value=1, value=10, help="Optional maximum number of pages to convert")
+ocr_all_pages = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)
+if in_file is None:
+    st.stop()
+filetype = in_file.type
+with col1:
+    page_count = page_count(in_file)
+    page_number = st.number_input(f"Page number out of {page_count}:", min_value=1, value=1, max_value=page_count)
+    pil_image = get_page_image(in_file, page_number)
+    st.image(pil_image, caption="PDF file (preview)", use_column_width=True)
+run_marker = st.sidebar.button("Run Marker")
+if not run_marker:
+    st.stop()
+# Run Marker
+with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
+    temp_pdf.write(in_file.getvalue())
+    temp_pdf.seek(0)
+    filename = temp_pdf.name
+    md_text, images, out_meta = convert_pdf(filename, languages, max_pages, ocr_all_pages)
+md_text = markdown_insert_images(md_text, images)
+with col2:
+    st.markdown(md_text, unsafe_allow_html=True)

pyproject.toml CHANGED Viewed

@@ -15,6 +15,8 @@ include = [
     "convert_single.py",
     "chunk_convert.sh",
     "chunk_convert.py",
 ]
 [tool.poetry.dependencies]
@@ -45,6 +47,7 @@ jupyter = "^1.0.0"
 marker = "convert:main"
 marker_single = "convert_single:main"
 marker_chunk_convert = "chunk_convert:main"
 [build-system]
 requires = ["poetry-core"]

     "convert_single.py",
     "chunk_convert.sh",
     "chunk_convert.py",
+    "marker_app.py",
+    "run_marker_app.py"
 ]
 [tool.poetry.dependencies]
 marker = "convert:main"
 marker_single = "convert_single:main"
 marker_chunk_convert = "chunk_convert:main"
+marker_gui = "run_marker_app:run_app"
 [build-system]
 requires = ["poetry-core"]

run_marker_app.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import argparse
+import subprocess
+import os
+def run_app():
+    cur_dir = os.path.dirname(os.path.abspath(__file__))
+    app_path = os.path.join(cur_dir, "marker_app.py")
+    cmd = ["streamlit", "run", app_path]
+    subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})
+if __name__ == "__main__":
+    run_app()