Vik Paruchuri committed on
Commit
906ae67
·
1 Parent(s): 2b12856

Add debug mode to marker app

Browse files
marker/processors/code.py CHANGED
@@ -47,4 +47,4 @@ class CodeProcessor(BaseProcessor):
47
  code_text += text
48
  is_new_line = text.endswith("\n")
49
 
50
- block.code = code_text
 
47
  code_text += text
48
  is_new_line = text.endswith("\n")
49
 
50
+ block.code = code_text.rstrip()
marker/processors/debug.py CHANGED
@@ -54,6 +54,8 @@ class DebugProcessor(BaseProcessor):
54
  if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
55
  os.makedirs(self.debug_folder, exist_ok=True)
56
 
 
 
57
  if self.debug_layout_images:
58
  self.draw_layout_debug_images(document)
59
  print(f"Dumped layout debug images to {self.debug_data_folder}")
 
54
  if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
55
  os.makedirs(self.debug_folder, exist_ok=True)
56
 
57
+ document.debug_data_path = self.debug_folder
58
+
59
  if self.debug_layout_images:
60
  self.draw_layout_debug_images(document)
61
  print(f"Dumped layout debug images to {self.debug_data_folder}")
marker/processors/footnote.py CHANGED
@@ -19,15 +19,15 @@ class FootnoteProcessor(BaseProcessor):
19
  Attributes:
20
  page_bottom_threshold (float):
21
  The fraction of page height that is considered the bottom.
22
- Default is .75
23
 
24
  line_height_scaler (float):
25
- The amount to scale line height by to consider a block a footnote.
26
- Default is .5
27
  """
28
  block_types = (BlockTypes.Footnote,)
29
  page_bottom_threshold = .75
30
- line_height_scaler = .85
31
 
32
 
33
  def __call__(self, document: Document):
 
19
  Attributes:
20
  page_bottom_threshold (float):
21
  The fraction of page height that is considered the bottom.
22
+ Default is .8
23
 
24
  line_height_scaler (float):
25
+ The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
26
+ Default is .99
27
  """
28
  block_types = (BlockTypes.Footnote,)
29
  page_bottom_threshold = .75
30
+ line_height_scaler = .99
31
 
32
 
33
  def __call__(self, document: Document):
marker/renderers/__init__.py CHANGED
@@ -63,10 +63,14 @@ class BaseRenderer:
63
  return page_stats
64
 
65
  def generate_document_metadata(self, document, document_output):
66
- return {
67
  "table_of_contents": document.table_of_contents,
68
- "page_stats": self.generate_page_stats(document, document_output)
69
  }
 
 
 
 
70
 
71
  def extract_block_html(self, document, block_output):
72
  soup = BeautifulSoup(block_output.html, 'html.parser')
 
63
  return page_stats
64
 
65
  def generate_document_metadata(self, document, document_output):
66
+ metadata = {
67
  "table_of_contents": document.table_of_contents,
68
+ "page_stats": self.generate_page_stats(document, document_output),
69
  }
70
+ if document.debug_data_path is not None:
71
+ metadata["debug_data_path"] = document.debug_data_path
72
+
73
+ return metadata
74
 
75
  def extract_block_html(self, document, block_output):
76
  soup = BeautifulSoup(block_output.html, 'html.parser')
marker/schema/document.py CHANGED
@@ -27,6 +27,7 @@ class Document(BaseModel):
27
  pages: List[PageGroup]
28
  block_type: BlockTypes = BlockTypes.Document
29
  table_of_contents: List[TocItem] | None = None
 
30
 
31
  def get_block(self, block_id: BlockId):
32
  page = self.get_page(block_id.page_id)
 
27
  pages: List[PageGroup]
28
  block_type: BlockTypes = BlockTypes.Document
29
  table_of_contents: List[TocItem] | None = None
30
+ debug_data_path: str | None = None # Path that debug data was saved to
31
 
32
  def get_block(self, block_id: BlockId):
33
  page = self.get_page(block_id.page_id)
marker/settings.py CHANGED
@@ -12,6 +12,7 @@ class Settings(BaseSettings):
12
  BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13
  OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
14
  FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
 
15
 
16
  # General models
17
  TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
 
12
  BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13
  OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
14
  FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
15
+ DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
16
 
17
  # General models
18
  TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
marker_app.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
 
 
 
3
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
4
  os.environ["IN_STREAMLIT"] = "true"
5
 
@@ -11,6 +13,7 @@ from typing import Any, Dict
11
 
12
  import pypdfium2
13
  import streamlit as st
 
14
 
15
  from marker.converters.pdf import PdfConverter
16
  from marker.models import create_model_dict
@@ -109,10 +112,12 @@ with col1:
109
  st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
110
 
111
  page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
112
- force_ocr = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)
113
  output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
114
  run_marker = st.sidebar.button("Run Marker")
115
 
 
 
 
116
  if not run_marker:
117
  st.stop()
118
 
@@ -121,7 +126,14 @@ with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
121
  temp_pdf.write(in_file.getvalue())
122
  temp_pdf.seek(0)
123
  filename = temp_pdf.name
124
- rendered = convert_pdf(filename, page_range=page_range, force_ocr=force_ocr, output_format=output_format)
 
 
 
 
 
 
 
125
 
126
  text, ext, images = text_from_rendered(rendered)
127
  with col2:
@@ -133,3 +145,14 @@ with col2:
133
  elif output_format == "html":
134
  st.html(text)
135
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
+ from marker.settings import settings
4
+
5
  os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
6
  os.environ["IN_STREAMLIT"] = "true"
7
 
 
13
 
14
  import pypdfium2
15
  import streamlit as st
16
+ from PIL import Image
17
 
18
  from marker.converters.pdf import PdfConverter
19
  from marker.models import create_model_dict
 
112
  st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
113
 
114
  page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
 
115
  output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
116
  run_marker = st.sidebar.button("Run Marker")
117
 
118
+ force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
119
+ debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
120
+
121
  if not run_marker:
122
  st.stop()
123
 
 
126
  temp_pdf.write(in_file.getvalue())
127
  temp_pdf.seek(0)
128
  filename = temp_pdf.name
129
+ rendered = convert_pdf(
130
+ filename,
131
+ page_range=page_range,
132
+ force_ocr=force_ocr,
133
+ output_format=output_format,
134
+ output_dir=settings.DEBUG_DATA_FOLDER if debug else None,
135
+ debug=debug
136
+ )
137
 
138
  text, ext, images = text_from_rendered(rendered)
139
  with col2:
 
145
  elif output_format == "html":
146
  st.html(text)
147
 
148
+ if debug:
149
+ with col1:
150
+ debug_data_path = rendered.metadata.get("debug_data_path")
151
+ if debug_data_path:
152
+ pdf_image_path = os.path.join(debug_data_path, f"pdf_page_0.png")
153
+ img = Image.open(pdf_image_path)
154
+ st.image(img, caption="PDF debug image", use_container_width=True)
155
+ layout_image_path = os.path.join(debug_data_path, f"layout_page_0.png")
156
+ img = Image.open(layout_image_path)
157
+ st.image(img, caption="Layout debug image", use_container_width=True)
158
+