Vik Paruchuri committed on
Commit
250c0ca
·
1 Parent(s): 116af39

Fix README for doc formats

Browse files
README.md CHANGED
@@ -48,9 +48,11 @@ The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that
48
 
49
  There's a hosted API for marker available [here](https://www.datalab.to/):
50
 
51
- - Supports PDFs, word documents, and powerpoints
52
  - 1/4th the price of leading cloud-based competitors
53
- - High uptime (99.99%), quality, and speed (around 15 seconds to convert a 250 page PDF)
 
 
54
 
55
  # Community
56
 
 
48
 
49
  There's a hosted API for marker available [here](https://www.datalab.to/):
50
 
51
+ - Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files
52
  - 1/4th the price of leading cloud-based competitors
53
+ - Fast - ~15s for a 250 page PDF
54
+ - Supports LLM mode
55
+ - High uptime (99.99%)
56
 
57
  # Community
58
 
marker/providers/registry.py CHANGED
@@ -19,9 +19,9 @@ DOCTYPE_MATCHERS = {
19
  "epub": [
20
  archive.Epub,
21
  ],
22
- "doc": [document.Doc, document.Docx, document.Odt],
23
- "xls": [document.Xls, document.Xlsx, document.Ods],
24
- "ppt": [document.Ppt, document.Pptx, document.Odp],
25
  }
26
 
27
 
@@ -71,10 +71,11 @@ def provider_from_filepath(filepath: str):
71
  return PowerPointProvider
72
 
73
  try:
74
- soup = BeautifulSoup(open(filepath, "r").read(), "html.parser")
75
- # Check if there are any HTML tags
76
- if bool(soup.find()):
77
- return HTMLProvider
 
78
  except Exception:
79
  pass
80
 
 
19
  "epub": [
20
  archive.Epub,
21
  ],
22
+ "doc": [document.Docx],
23
+ "xls": [document.Xlsx],
24
+ "ppt": [document.Pptx],
25
  }
26
 
27
 
 
71
  return PowerPointProvider
72
 
73
  try:
74
+ with open(filepath, "r", encoding="utf-8") as f:
75
+ soup = BeautifulSoup(f.read(), "html.parser")
76
+ # Check if there are any HTML tags
77
+ if bool(soup.find()):
78
+ return HTMLProvider
79
  except Exception:
80
  pass
81
 
marker/scripts/streamlit_app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
4
  os.environ["IN_STREAMLIT"] = "true"
5
 
@@ -37,11 +38,12 @@ COLORS = [
37
  "#af7aa1",
38
  "#ff9da7",
39
  "#9c755f",
40
- "#bab0ab"
41
  ]
42
 
43
  with open(
44
- os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html"), encoding="utf-8"
 
45
  ) as f:
46
  BLOCKS_VIZ_TMPL = string.Template(f.read())
47
 
@@ -54,7 +56,7 @@ def parse_args():
54
  pass
55
 
56
  def extract_click_params(decorated_function):
57
- if hasattr(decorated_function, '__click_params__'):
58
  return decorated_function.__click_params__
59
  return []
60
 
@@ -69,6 +71,7 @@ def parse_args():
69
  except click.exceptions.ClickException as e:
70
  return {"error": str(e)}
71
 
 
72
  @st.cache_resource()
73
  def load_models():
74
  return create_model_dict()
@@ -83,7 +86,7 @@ def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any]
83
  artifact_dict=model_dict,
84
  processor_list=config_parser.get_processors(),
85
  renderer=config_parser.get_renderer(),
86
- llm_service=config_parser.get_llm_service()
87
  )
88
  return converter(fname)
89
 
@@ -103,14 +106,19 @@ def img_to_html(img, img_alt):
103
 
104
 
105
  def markdown_insert_images(markdown, images):
106
- image_tags = re.findall(r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))', markdown)
 
 
 
107
 
108
  for image in image_tags:
109
  image_markdown = image[0]
110
  image_alt = image[1]
111
  image_path = image[2]
112
  if image_path in images:
113
- markdown = markdown.replace(image_markdown, img_to_html(images[image_path], image_alt))
 
 
114
  return markdown
115
 
116
 
@@ -119,9 +127,13 @@ def get_page_image(pdf_file, page_num, dpi=96):
119
  if "pdf" in pdf_file.type:
120
  doc = open_pdf(pdf_file)
121
  page = doc[page_num]
122
- png_image = page.render(
123
- scale=dpi / 72,
124
- ).to_pil().convert("RGB")
 
 
 
 
125
  else:
126
  png_image = Image.open(pdf_file).convert("RGB")
127
  return png_image
@@ -146,31 +158,27 @@ def block_display(image: Image, blocks: dict | None = None, dpi=96):
146
  if blocks is None:
147
  blocks = {}
148
 
149
- image_data_url = (
150
- 'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
151
- )
152
 
153
  template_values = {
154
  "image_data_url": image_data_url,
155
- "image_width": image.width, "image_height": image.height,
156
- "blocks_json": blocks, "colors_json": json.dumps(COLORS),
157
- "block_types_json": json.dumps({
158
- bt.name: i for i, bt in enumerate(BlockTypes)
159
- })
160
  }
161
  return components.html(
162
- BLOCKS_VIZ_TMPL.substitute(**template_values),
163
- height=image.height
164
  )
165
 
166
 
167
  st.set_page_config(layout="wide")
168
- col1, col2 = st.columns([.5, .5])
169
 
170
  model_dict = load_models()
171
  cli_options = parse_args()
172
 
173
-
174
  st.markdown("""
175
  # Marker Demo
176
 
@@ -179,7 +187,10 @@ This app will let you try marker, a PDF or image -> Markdown, HTML, JSON convert
179
  Find the project [here](https://github.com/VikParuchuri/marker).
180
  """)
181
 
182
- in_file: UploadedFile = st.sidebar.file_uploader("PDF, document, or image file:", type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"])
 
 
 
183
 
184
  if in_file is None:
185
  st.stop()
@@ -188,49 +199,63 @@ filetype = in_file.type
188
 
189
  with col1:
190
  page_count = page_count(in_file)
191
- page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
 
 
192
  pil_image = get_page_image(in_file, page_number)
193
  image_placeholder = st.empty()
194
 
195
  with image_placeholder:
196
  block_display(pil_image)
197
 
198
-
199
- page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
200
- output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 
 
 
 
201
  run_marker = st.sidebar.button("Run Marker")
202
 
203
- use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
204
- show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json")
 
 
 
 
 
 
 
205
  force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
206
- strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
 
 
 
 
207
  debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
208
- fix_lines = st.sidebar.checkbox("Fix lines", help="Fix line formats and math in the document", value=False)
209
 
210
  if not run_marker:
211
  st.stop()
212
 
213
  # Run Marker
214
  with tempfile.TemporaryDirectory() as tmp_dir:
215
- temp_pdf = os.path.join(tmp_dir, 'temp.pdf')
216
- with open(temp_pdf, 'wb') as f:
217
  f.write(in_file.getvalue())
218
-
219
- cli_options.update({
220
- "output_format": output_format,
221
- "page_range": page_range,
222
- "force_ocr": force_ocr,
223
- "debug": debug,
224
- "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
225
- "use_llm": use_llm,
226
- "strip_existing_ocr": strip_existing_ocr,
227
- "fix_lines": fix_lines
228
- })
229
- config_parser = ConfigParser(cli_options)
230
- rendered = convert_pdf(
231
- temp_pdf,
232
- config_parser
233
  )
 
 
234
  page_range = config_parser.generate_config_dict()["page_range"]
235
  first_page = page_range[0] if page_range else 0
236
 
@@ -242,7 +267,7 @@ with col2:
242
  elif output_format == "json":
243
  st.json(text)
244
  elif output_format == "html":
245
- st.markdown(text, unsafe_allow_html=True)
246
 
247
  if output_format == "json" and show_blocks:
248
  with image_placeholder:
@@ -255,7 +280,9 @@ if debug:
255
  pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
256
  img = Image.open(pdf_image_path)
257
  st.image(img, caption="PDF debug image", use_container_width=True)
258
- layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
 
 
259
  img = Image.open(layout_image_path)
260
  st.image(img, caption="Layout debug image", use_container_width=True)
261
  st.write("Raw output:")
 
1
  import os
2
  import sys
3
+
4
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
5
  os.environ["IN_STREAMLIT"] = "true"
6
 
 
38
  "#af7aa1",
39
  "#ff9da7",
40
  "#9c755f",
41
+ "#bab0ab",
42
  ]
43
 
44
  with open(
45
+ os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html"),
46
+ encoding="utf-8",
47
  ) as f:
48
  BLOCKS_VIZ_TMPL = string.Template(f.read())
49
 
 
56
  pass
57
 
58
  def extract_click_params(decorated_function):
59
+ if hasattr(decorated_function, "__click_params__"):
60
  return decorated_function.__click_params__
61
  return []
62
 
 
71
  except click.exceptions.ClickException as e:
72
  return {"error": str(e)}
73
 
74
+
75
  @st.cache_resource()
76
  def load_models():
77
  return create_model_dict()
 
86
  artifact_dict=model_dict,
87
  processor_list=config_parser.get_processors(),
88
  renderer=config_parser.get_renderer(),
89
+ llm_service=config_parser.get_llm_service(),
90
  )
91
  return converter(fname)
92
 
 
106
 
107
 
108
  def markdown_insert_images(markdown, images):
109
+ image_tags = re.findall(
110
+ r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))',
111
+ markdown,
112
+ )
113
 
114
  for image in image_tags:
115
  image_markdown = image[0]
116
  image_alt = image[1]
117
  image_path = image[2]
118
  if image_path in images:
119
+ markdown = markdown.replace(
120
+ image_markdown, img_to_html(images[image_path], image_alt)
121
+ )
122
  return markdown
123
 
124
 
 
127
  if "pdf" in pdf_file.type:
128
  doc = open_pdf(pdf_file)
129
  page = doc[page_num]
130
+ png_image = (
131
+ page.render(
132
+ scale=dpi / 72,
133
+ )
134
+ .to_pil()
135
+ .convert("RGB")
136
+ )
137
  else:
138
  png_image = Image.open(pdf_file).convert("RGB")
139
  return png_image
 
158
  if blocks is None:
159
  blocks = {}
160
 
161
+ image_data_url = "data:image/jpeg;base64," + pillow_image_to_base64_string(image)
 
 
162
 
163
  template_values = {
164
  "image_data_url": image_data_url,
165
+ "image_width": image.width,
166
+ "image_height": image.height,
167
+ "blocks_json": blocks,
168
+ "colors_json": json.dumps(COLORS),
169
+ "block_types_json": json.dumps({bt.name: i for i, bt in enumerate(BlockTypes)}),
170
  }
171
  return components.html(
172
+ BLOCKS_VIZ_TMPL.substitute(**template_values), height=image.height * 1.5
 
173
  )
174
 
175
 
176
  st.set_page_config(layout="wide")
177
+ col1, col2 = st.columns([0.5, 0.5])
178
 
179
  model_dict = load_models()
180
  cli_options = parse_args()
181
 
 
182
  st.markdown("""
183
  # Marker Demo
184
 
 
187
  Find the project [here](https://github.com/VikParuchuri/marker).
188
  """)
189
 
190
+ in_file: UploadedFile = st.sidebar.file_uploader(
191
+ "PDF, document, or image file:",
192
+ type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
193
+ )
194
 
195
  if in_file is None:
196
  st.stop()
 
199
 
200
  with col1:
201
  page_count = page_count(in_file)
202
+ page_number = st.number_input(
203
+ f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count
204
+ )
205
  pil_image = get_page_image(in_file, page_number)
206
  image_placeholder = st.empty()
207
 
208
  with image_placeholder:
209
  block_display(pil_image)
210
 
211
+ page_range = st.sidebar.text_input(
212
+ "Page range to parse, comma separated like 0,5-10,20",
213
+ value=f"{page_number}-{page_number}",
214
+ )
215
+ output_format = st.sidebar.selectbox(
216
+ "Output format", ["markdown", "json", "html"], index=0
217
+ )
218
  run_marker = st.sidebar.button("Run Marker")
219
 
220
+ use_llm = st.sidebar.checkbox(
221
+ "Use LLM", help="Use LLM for higher quality processing", value=False
222
+ )
223
+ show_blocks = st.sidebar.checkbox(
224
+ "Show Blocks",
225
+ help="Display detected blocks, only when output is JSON",
226
+ value=False,
227
+ disabled=output_format != "json",
228
+ )
229
  force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
230
+ strip_existing_ocr = st.sidebar.checkbox(
231
+ "Strip existing OCR",
232
+ help="Strip existing OCR text from the PDF and re-OCR.",
233
+ value=False,
234
+ )
235
  debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
 
236
 
237
  if not run_marker:
238
  st.stop()
239
 
240
  # Run Marker
241
  with tempfile.TemporaryDirectory() as tmp_dir:
242
+ temp_pdf = os.path.join(tmp_dir, "temp.pdf")
243
+ with open(temp_pdf, "wb") as f:
244
  f.write(in_file.getvalue())
245
+
246
+ cli_options.update(
247
+ {
248
+ "output_format": output_format,
249
+ "page_range": page_range,
250
+ "force_ocr": force_ocr,
251
+ "debug": debug,
252
+ "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
253
+ "use_llm": use_llm,
254
+ "strip_existing_ocr": strip_existing_ocr,
255
+ }
 
 
 
 
256
  )
257
+ config_parser = ConfigParser(cli_options)
258
+ rendered = convert_pdf(temp_pdf, config_parser)
259
  page_range = config_parser.generate_config_dict()["page_range"]
260
  first_page = page_range[0] if page_range else 0
261
 
 
267
  elif output_format == "json":
268
  st.json(text)
269
  elif output_format == "html":
270
+ st.html(text)
271
 
272
  if output_format == "json" and show_blocks:
273
  with image_placeholder:
 
280
  pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
281
  img = Image.open(pdf_image_path)
282
  st.image(img, caption="PDF debug image", use_container_width=True)
283
+ layout_image_path = os.path.join(
284
+ debug_data_path, f"layout_page_{first_page}.png"
285
+ )
286
  img = Image.open(layout_image_path)
287
  st.image(img, caption="Layout debug image", use_container_width=True)
288
  st.write("Raw output:")
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "1.6.2"
4
  description = "Convert documents to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "1.7.0"
4
  description = "Convert documents to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"