Vik Paruchuri committed on
Commit
fcfc3c0
·
1 Parent(s): a6bdfaa

Add interactive app

Browse files
README.md CHANGED
@@ -90,6 +90,15 @@ First, some configuration:
90
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
91
  - By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
92
 
 
 
 
 
 
 
 
 
 
93
  ## Convert a single file
94
 
95
  ```shell
@@ -98,7 +107,8 @@ marker_single /path/to/file.pdf /path/to/output/folder --batch_multiplier 2 --ma
98
 
99
  - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
100
  - `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
101
- - `--langs` is am optional comma separated list of the languages in the document, for OCR. Optional by default, required if you use tesseract.
 
102
 
103
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
104
 
 
90
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
91
  - By default, marker will use `surya` for OCR. Surya is slower on CPU, but more accurate than tesseract. It also doesn't require you to specify the languages in the document. If you want faster OCR, set `OCR_ENGINE` to `ocrmypdf`. This also requires external dependencies (see above). If you don't want OCR at all, set `OCR_ENGINE` to `None`.
92
 
93
+ ## Interactive App
94
+
95
+ I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
96
+
97
+ ```shell
98
+ pip install streamlit
99
+ marker_gui
100
+ ```
101
+
102
  ## Convert a single file
103
 
104
  ```shell
 
107
 
108
  - `--batch_multiplier` is how much to multiply default batch sizes by if you have extra VRAM. Higher numbers will take more VRAM, but process faster. Set to 2 by default. The default batch sizes will take ~3GB of VRAM.
109
  - `--max_pages` is the maximum number of pages to process. Omit this to convert the entire document.
110
+ - `--langs` is an optional comma separated list of the languages in the document, for OCR. Optional by default, required if you use tesseract.
111
+ - `--ocr_all_pages` is an optional flag that forces OCR on all pages of the PDF. OCR is forced if either this flag or the env var `OCR_ALL_PAGES` is true.
112
 
113
  The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py). If you need more languages, you can use any language supported by [Tesseract](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) if you set `OCR_ENGINE` to `ocrmypdf`. If you don't need OCR, marker can work with any language.
114
 
convert_single.py CHANGED
@@ -23,6 +23,7 @@ def main():
23
  parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
24
  parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
25
  parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
 
26
  args = parser.parse_args()
27
 
28
  langs = args.langs.split(",") if args.langs else None
@@ -30,7 +31,7 @@ def main():
30
  fname = args.filename
31
  model_lst = load_all_models()
32
  start = time.time()
33
- full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)
34
 
35
  fname = os.path.basename(fname)
36
  subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
 
23
  parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
24
  parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
25
  parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
26
+ parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
27
  args = parser.parse_args()
28
 
29
  langs = args.langs.split(",") if args.langs else None
 
31
  fname = args.filename
32
  model_lst = load_all_models()
33
  start = time.time()
34
+ full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page, ocr_all_pages=args.ocr_all_pages)
35
 
36
  fname = os.path.basename(fname)
37
  subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
marker/convert.py CHANGED
@@ -41,8 +41,11 @@ def convert_single_pdf(
41
  start_page: int = None,
42
  metadata: Optional[Dict] = None,
43
  langs: Optional[List[str]] = None,
44
- batch_multiplier: int = 1
 
45
  ) -> Tuple[str, Dict[str, Image.Image], Dict]:
 
 
46
  if metadata:
47
  langs = metadata.get("languages", langs)
48
 
@@ -87,7 +90,7 @@ def convert_single_pdf(
87
  flush_cuda_memory()
88
 
89
  # OCR pages as needed
90
- pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier)
91
  flush_cuda_memory()
92
 
93
  out_meta["ocr_stats"] = ocr_stats
 
41
  start_page: int = None,
42
  metadata: Optional[Dict] = None,
43
  langs: Optional[List[str]] = None,
44
+ batch_multiplier: int = 1,
45
+ ocr_all_pages: bool = False
46
  ) -> Tuple[str, Dict[str, Image.Image], Dict]:
47
+ ocr_all_pages = ocr_all_pages or settings.OCR_ALL_PAGES
48
+
49
  if metadata:
50
  langs = metadata.get("languages", langs)
51
 
 
90
  flush_cuda_memory()
91
 
92
  # OCR pages as needed
93
+ pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
94
  flush_cuda_memory()
95
 
96
  out_meta["ocr_stats"] = ocr_stats
marker/ocr/heuristics.py CHANGED
@@ -7,7 +7,7 @@ from marker.schema.page import Page
7
  from marker.settings import settings
8
 
9
 
10
- def should_ocr_page(page: Page, no_text: bool):
11
  detected_lines_found, total_lines = detected_line_coverage(page)
12
 
13
  # No reason to OCR page if it has no text lines
@@ -21,7 +21,7 @@ def should_ocr_page(page: Page, no_text: bool):
21
  detected_lines_found is False, # didn't extract text for all detected lines
22
  ]
23
 
24
- return any(conditions) or settings.OCR_ALL_PAGES
25
 
26
 
27
  def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
 
7
  from marker.settings import settings
8
 
9
 
10
+ def should_ocr_page(page: Page, no_text: bool, ocr_all_pages=False):
11
  detected_lines_found, total_lines = detected_line_coverage(page)
12
 
13
  # No reason to OCR page if it has no text lines
 
21
  detected_lines_found is False, # didn't extract text for all detected lines
22
  ]
23
 
24
+ return any(conditions) or ocr_all_pages
25
 
26
 
27
  def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_threshold=.3):
marker/ocr/recognition.py CHANGED
@@ -28,14 +28,14 @@ def get_batch_size():
28
  return 32
29
 
30
 
31
- def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1) -> (List[Page], Dict):
32
  ocr_pages = 0
33
  ocr_success = 0
34
  ocr_failed = 0
35
  no_text = no_text_found(pages)
36
  ocr_idxs = []
37
  for pnum, page in enumerate(pages):
38
- ocr_needed = should_ocr_page(page, no_text)
39
  if ocr_needed:
40
  ocr_idxs.append(pnum)
41
  ocr_pages += 1
 
28
  return 32
29
 
30
 
31
+ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1, ocr_all_pages=False) -> (List[Page], Dict):
32
  ocr_pages = 0
33
  ocr_success = 0
34
  ocr_failed = 0
35
  no_text = no_text_found(pages)
36
  ocr_idxs = []
37
  for pnum, page in enumerate(pages):
38
+ ocr_needed = should_ocr_page(page, no_text, ocr_all_pages=ocr_all_pages)
39
  if ocr_needed:
40
  ocr_idxs.append(pnum)
41
  ocr_pages += 1
marker_app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
3
+ os.environ["IN_STREAMLIT"] = "true"
4
+
5
+ import base64
6
+ import io
7
+ import re
8
+ import tempfile
9
+ from typing import List, Any, Dict
10
+
11
+ import pypdfium2
12
+ import streamlit as st
13
+
14
+ from marker.convert import convert_single_pdf
15
+ from marker.models import load_all_models
16
+ from surya.languages import CODE_TO_LANGUAGE
17
+
18
@st.cache_resource()
def load_models():
    """Return the model list produced by ``load_all_models()``.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the loaded models instead of calling ``load_all_models()`` again.
    """
    models = load_all_models()
    return models
21
+
22
+
23
def convert_pdf(
    fname: str,
    langs: List[str] | None,
    max_pages: int | None,
    ocr_all_pages: bool,
) -> tuple[str, Dict[str, Any], dict]:
    """Convert the PDF at ``fname`` to markdown with the module-level models.

    Thin wrapper around ``convert_single_pdf`` that supplies the globally
    loaded ``model_lst``.

    Args:
        fname: Path of the PDF file on disk.
        langs: Languages to pass through for OCR, or ``None``.
        max_pages: Maximum number of pages to convert, or ``None``.
        ocr_all_pages: Forwarded as ``ocr_all_pages`` to force OCR.

    Returns:
        The ``(full_text, images, out_meta)`` triple returned by
        ``convert_single_pdf``.
    """
    # The original annotation was a tuple literal of types, which is not a
    # valid type expression; ``tuple[...]`` states the same intent correctly.
    full_text, images, out_meta = convert_single_pdf(
        fname,
        model_lst,
        max_pages=max_pages,
        langs=langs,
        ocr_all_pages=ocr_all_pages,
    )
    return full_text, images, out_meta
26
+
27
+
28
def open_pdf(pdf_file):
    """Open ``pdf_file`` as a ``pypdfium2.PdfDocument``.

    ``pdf_file`` must provide ``getvalue()``; its result is wrapped in an
    in-memory ``io.BytesIO`` stream and handed to pypdfium2.
    """
    raw_bytes = pdf_file.getvalue()
    return pypdfium2.PdfDocument(io.BytesIO(raw_bytes))
31
+
32
+
33
def img_to_html(img, img_alt):
    """Serialize ``img`` as PNG and return an inline ``<img>`` HTML tag.

    Args:
        img: Object with a PIL-style ``save(fp, format=...)`` method.
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute.

    Returns:
        An ``<img>`` tag whose ``src`` is a base64 ``data:image/png`` URI.
    """
    import html  # local import: only this function needs it

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # The alt text originates from document markdown and the result is
    # rendered as raw HTML, so quotes/angle brackets must not be allowed to
    # break out of the attribute.
    safe_alt = html.escape(str(img_alt), quote=True)
    return f'<img src="data:image/png;base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
40
+
41
+
42
def markdown_insert_images(markdown, images):
    """Replace markdown image tags with inline HTML for known images.

    Args:
        markdown: Markdown text that may contain ``![alt](path)`` tags.
        images: Mapping from image path (as written in the markdown) to an
            image object accepted by ``img_to_html``.

    Returns:
        The markdown with every image tag whose path is a key of ``images``
        replaced by the HTML from ``img_to_html``; other tags are untouched.
    """
    image_tag = re.compile(r'(!\[(?P<image_title>[^\]]+)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))')

    def _inline(match):
        # Use the named groups the pattern declares rather than positional
        # indexes into a findall tuple.
        path = match.group("image_path")
        if path not in images:
            return match.group(0)  # leave unknown images as plain markdown
        return img_to_html(images[path], match.group("image_title"))

    # One pass over the text instead of one whole-string replace per tag.
    return image_tag.sub(_inline, markdown)
52
+
53
+
54
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
    """Render a single page of ``pdf_file`` and return it as an RGB image.

    ``page_num`` is decremented by one before being passed as the page
    index, and ``dpi`` is converted to a render scale by dividing by 72.
    """
    page_index = page_num - 1
    render_scale = dpi / 72
    rendered_pages = list(
        open_pdf(pdf_file).render(
            pypdfium2.PdfBitmap.to_pil,
            page_indices=[page_index],
            scale=render_scale,
        )
    )
    # Only one index was requested, so the first result is the page.
    return rendered_pages[0].convert("RGB")
65
+
66
+
67
@st.cache_data()
def page_count(pdf_file):
    """Return the number of pages in ``pdf_file`` (``len`` of the opened document)."""
    return len(open_pdf(pdf_file))
71
+
72
+
73
# --- Streamlit page: layout, sidebar controls, preview, conversion ---------
st.set_page_config(layout="wide")
col1, col2 = st.columns([.5, .5])

# Module-level model list; convert_pdf() reads this global.
model_lst = load_models()


st.markdown("""
# Marker Demo

This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.

Find the project [here](https://github.com/VikParuchuri/marker).
""")

in_file = st.sidebar.file_uploader("PDF file:", type=["pdf"])
languages = st.sidebar.multiselect("Languages", sorted(list(CODE_TO_LANGUAGE.values())), default=[], max_selections=4, help="Select the languages in the pdf (if known) to improve OCR accuracy. Optional.")
max_pages = st.sidebar.number_input("Max pages to parse", min_value=1, value=10, help="Optional maximum number of pages to convert")
ocr_all_pages = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)

# Nothing to preview or convert until a file has been uploaded.
if in_file is None:
    st.stop()

with col1:
    # Use a distinct name so the page_count() function is not rebound to an int.
    num_pages = page_count(in_file)
    page_number = st.number_input(f"Page number out of {num_pages}:", min_value=1, value=1, max_value=num_pages)
    pil_image = get_page_image(in_file, page_number)

    st.image(pil_image, caption="PDF file (preview)", use_column_width=True)

run_marker = st.sidebar.button("Run Marker")

if not run_marker:
    st.stop()

# Run Marker
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
    temp_pdf.write(in_file.getvalue())
    # The converter reopens the file by name, so buffered bytes must be on
    # disk before it runs.
    temp_pdf.flush()
    filename = temp_pdf.name
    # Convert inside the ``with`` block: the temp file is removed on exit.
    md_text, images, out_meta = convert_pdf(filename, languages, max_pages, ocr_all_pages)

md_text = markdown_insert_images(md_text, images)
with col2:
    # Inlined <img> tags are HTML, so raw HTML rendering must be enabled.
    st.markdown(md_text, unsafe_allow_html=True)
118
+
pyproject.toml CHANGED
@@ -15,6 +15,8 @@ include = [
15
  "convert_single.py",
16
  "chunk_convert.sh",
17
  "chunk_convert.py",
 
 
18
  ]
19
 
20
  [tool.poetry.dependencies]
@@ -45,6 +47,7 @@ jupyter = "^1.0.0"
45
  marker = "convert:main"
46
  marker_single = "convert_single:main"
47
  marker_chunk_convert = "chunk_convert:main"
 
48
 
49
  [build-system]
50
  requires = ["poetry-core"]
 
15
  "convert_single.py",
16
  "chunk_convert.sh",
17
  "chunk_convert.py",
18
+ "marker_app.py",
19
+ "run_marker_app.py"
20
  ]
21
 
22
  [tool.poetry.dependencies]
 
47
  marker = "convert:main"
48
  marker_single = "convert_single:main"
49
  marker_chunk_convert = "chunk_convert:main"
50
+ marker_gui = "run_marker_app:run_app"
51
 
52
  [build-system]
53
  requires = ["poetry-core"]
run_marker_app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import subprocess
3
+ import os
4
+
5
+
6
def run_app():
    """Start the Streamlit app stored as ``marker_app.py`` beside this file.

    Runs ``streamlit run <path>`` as a subprocess with the current
    environment plus ``IN_STREAMLIT=true``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    app_path = os.path.join(here, "marker_app.py")

    child_env = dict(os.environ)
    child_env["IN_STREAMLIT"] = "true"

    subprocess.run(["streamlit", "run", app_path], env=child_env)


if __name__ == "__main__":
    run_app()