Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Nov 25, 2024

Commit

906ae67

1 Parent(s): 2b12856

Add debug mode to marker app

Browse files

Files changed (7) hide show

marker/processors/code.py +1 -1
marker/processors/debug.py +2 -0
marker/processors/footnote.py +4 -4
marker/renderers/__init__.py +6 -2
marker/schema/document.py +1 -0
marker/settings.py +1 -0
marker_app.py +25 -2

marker/processors/code.py CHANGED Viewed

@@ -47,4 +47,4 @@ class CodeProcessor(BaseProcessor):
             code_text += text
             is_new_line = text.endswith("\n")
-        block.code = code_text

             code_text += text
             is_new_line = text.endswith("\n")
+        block.code = code_text.rstrip()

marker/processors/debug.py CHANGED Viewed

@@ -54,6 +54,8 @@ class DebugProcessor(BaseProcessor):
         if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
             os.makedirs(self.debug_folder, exist_ok=True)
         if self.debug_layout_images:
             self.draw_layout_debug_images(document)
             print(f"Dumped layout debug images to {self.debug_data_folder}")

         if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
             os.makedirs(self.debug_folder, exist_ok=True)
+        document.debug_data_path = self.debug_folder
         if self.debug_layout_images:
             self.draw_layout_debug_images(document)
             print(f"Dumped layout debug images to {self.debug_data_folder}")

marker/processors/footnote.py CHANGED Viewed

@@ -19,15 +19,15 @@ class FootnoteProcessor(BaseProcessor):
     Attributes:
         page_bottom_threshold (float):
             The fraction of page height that is considered the bottom.
-            Default is .75
         line_height_scaler (float):
-            The amount to scale line height by to consider a block a footnote.
-            Default is .5
     """
     block_types = (BlockTypes.Footnote,)
     page_bottom_threshold = .75
-    line_height_scaler = .85
     def __call__(self, document: Document):

     Attributes:
         page_bottom_threshold (float):
             The fraction of page height that is considered the bottom.
+            Default is .8
         line_height_scaler (float):
+            The amount to scale line height by to consider a block a footnote. (from N to 1+(1-N))
+            Default is .99
     """
     block_types = (BlockTypes.Footnote,)
     page_bottom_threshold = .75
+    line_height_scaler = .99
     def __call__(self, document: Document):

marker/renderers/__init__.py CHANGED Viewed

@@ -63,10 +63,14 @@ class BaseRenderer:
         return page_stats
     def generate_document_metadata(self, document, document_output):
-        return {
             "table_of_contents": document.table_of_contents,
-            "page_stats": self.generate_page_stats(document, document_output)
         }
     def extract_block_html(self, document, block_output):
         soup = BeautifulSoup(block_output.html, 'html.parser')

         return page_stats
     def generate_document_metadata(self, document, document_output):
+        metadata =  {
             "table_of_contents": document.table_of_contents,
+            "page_stats": self.generate_page_stats(document, document_output),
         }
+        if document.debug_data_path is not None:
+            metadata["debug_data_path"] = document.debug_data_path
+        return metadata
     def extract_block_html(self, document, block_output):
         soup = BeautifulSoup(block_output.html, 'html.parser')

marker/schema/document.py CHANGED Viewed

@@ -27,6 +27,7 @@ class Document(BaseModel):
     pages: List[PageGroup]
     block_type: BlockTypes = BlockTypes.Document
     table_of_contents: List[TocItem] | None = None
     def get_block(self, block_id: BlockId):
         page = self.get_page(block_id.page_id)

     pages: List[PageGroup]
     block_type: BlockTypes = BlockTypes.Document
     table_of_contents: List[TocItem] | None = None
+    debug_data_path: str | None = None # Path that debug data was saved to
     def get_block(self, block_id: BlockId):
         page = self.get_page(block_id.page_id)

marker/settings.py CHANGED Viewed

@@ -12,6 +12,7 @@ class Settings(BaseSettings):
     BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
     FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
     # General models
     TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU

     BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
     FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
+    DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
     # General models
     TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU

marker_app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["IN_STREAMLIT"] = "true"
@@ -11,6 +13,7 @@ from typing import Any, Dict
 import pypdfium2
 import streamlit as st
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
@@ -109,10 +112,12 @@ with col1:
     st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
 page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
-force_ocr = st.sidebar.checkbox("Force OCR on all pages", help="Force OCR on all pages, even if they are images", value=False)
 output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 run_marker = st.sidebar.button("Run Marker")
 if not run_marker:
     st.stop()
@@ -121,7 +126,14 @@ with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
     temp_pdf.write(in_file.getvalue())
     temp_pdf.seek(0)
     filename = temp_pdf.name
-    rendered = convert_pdf(filename, page_range=page_range, force_ocr=force_ocr, output_format=output_format)
 text, ext, images = text_from_rendered(rendered)
 with col2:
@@ -133,3 +145,14 @@ with col2:
     elif output_format == "html":
         st.html(text)

 import os
+from marker.settings import settings
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["IN_STREAMLIT"] = "true"
 import pypdfium2
 import streamlit as st
+from PIL import Image
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
     st.image(pil_image, caption="PDF file (preview)", use_container_width=True)
 page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
 output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 run_marker = st.sidebar.button("Run Marker")
+force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
+debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
 if not run_marker:
     st.stop()
     temp_pdf.write(in_file.getvalue())
     temp_pdf.seek(0)
     filename = temp_pdf.name
+    rendered = convert_pdf(
+        filename,
+        page_range=page_range,
+        force_ocr=force_ocr,
+        output_format=output_format,
+        output_dir=settings.DEBUG_DATA_FOLDER if debug else None,
+        debug=debug
+    )
 text, ext, images = text_from_rendered(rendered)
 with col2:
     elif output_format == "html":
         st.html(text)
+if debug:
+    with col1:
+        debug_data_path = rendered.metadata.get("debug_data_path")
+        if debug_data_path:
+            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_0.png")
+            img = Image.open(pdf_image_path)
+            st.image(img, caption="PDF debug image", use_container_width=True)
+            layout_image_path = os.path.join(debug_data_path, f"layout_page_0.png")
+            img = Image.open(layout_image_path)
+            st.image(img, caption="Layout debug image", use_container_width=True)