Spaces:

rt4u
/

marker

Sleeping

Vik Paruchuri committed on Oct 17, 2024

Commit

a97dc07

1 Parent(s): d8cea21

Add computed table of contents

Files changed (5) hide show

README.md CHANGED Viewed

@@ -147,6 +147,36 @@ METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert
 Note that the env variables above are specific to this script, and cannot be set in `local.env`.
 # Troubleshooting
 There are some settings that you may find useful if things aren't working the way you expect:

 Note that the env variables above are specific to this script, and cannot be set in `local.env`.
+# Output format
+The output will be a markdown file, but there will also be a metadata JSON file that gives information about the conversion process. It has these fields:
+```json
+{
+    "languages": null, // any languages that were passed in
+    "filetype": "pdf", // type of the file
+    "pdf_toc": [], // the table of contents from the pdf
+    "computed_toc": [], // the computed table of contents
+    "pages": 10, // page count
+    "ocr_stats": {
+        "ocr_pages": 0, // number of pages OCRed
+        "ocr_failed": 0, // number of pages where OCR failed
+        "ocr_success": 0,
+        "ocr_engine": "none"
+    },
+    "block_stats": {
+        "header_footer": 0,
+        "code": 0, // number of code blocks
+        "table": 2, // number of tables
+        "equations": {
+            "successful_ocr": 0,
+            "unsuccessful_ocr": 0,
+            "equations": 0
+        }
+    }
+}
+```
 # Troubleshooting
 There are some settings that you may find useful if things aren't working the way you expect:

marker/cleaners/headings.py CHANGED Viewed

@@ -120,3 +120,4 @@ def infer_heading_levels(pages: List[Page], height_tol=.99):
             if block.heading_level is None:
                 block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)


120
121	if block.heading_level is None:
122	block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
123	+

marker/cleaners/toc.py ADDED Viewed

+from typing import List
+from marker.schema.page import Page
+def get_pdf_toc(doc, max_depth=15):
+    toc = doc.get_toc(max_depth=max_depth)
+    toc_list = []
+    for item in toc:
+        list_item = {
+            "title": item.title,
+            "level": item.level,
+            "page": item.page_index,
+        }
+        toc_list.append(list_item)
+    return toc_list
+def compute_toc(pages: List[Page]):
+    toc = []
+    for page in pages:
+        for block in page.blocks:
+            if block.block_type in ["Title", "Section-header"]:
+                toc.append({
+                    "title": block.prelim_text,
+                    "level": block.heading_level,
+                    "page": page.pnum
+                })
+    return toc

marker/convert.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 import os
@@ -72,7 +75,7 @@ def convert_single_pdf(
         start_page=start_page
     )
     out_meta.update({
-        "toc": toc,
         "pages": len(pages),
     })
@@ -149,6 +152,9 @@ def convert_single_pdf(
     infer_heading_levels(pages)
     find_bold_italic(pages)
     # Copy to avoid changing original data
     merged_lines = merge_spans(filtered)
     text_blocks = merge_lines(merged_lines)

 import warnings
+from marker.cleaners.toc import compute_toc
 warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
 import os
         start_page=start_page
     )
     out_meta.update({
+        "pdf_toc": toc,
         "pages": len(pages),
     })
     infer_heading_levels(pages)
     find_bold_italic(pages)
+    # Use headers to compute a table of contents
+    out_meta["computed_toc"] = compute_toc(pages)
     # Copy to avoid changing original data
     merged_lines = merge_spans(filtered)
     text_blocks = merge_lines(merged_lines)

marker/pdf/extract_text.py CHANGED Viewed

@@ -2,8 +2,8 @@ import os
 from typing import List, Optional, Dict
 import pypdfium2 as pdfium
-import pypdfium2.internal as pdfium_i
 from marker.pdf.utils import font_flags_decomposer
 from marker.settings import settings
 from marker.schema.block import Span, Line, Block
@@ -77,7 +77,7 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
 def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
-    toc = get_toc(doc)
     if start_page:
         assert start_page < len(doc)
@@ -107,23 +107,6 @@ def naive_get_text(doc):
     return full_text
-def get_toc(doc, max_depth=15):
-    toc = doc.get_toc(max_depth=max_depth)
-    toc_list = []
-    for item in toc:
-        list_item = {
-            "title": item.title,
-            "level": item.level,
-            "is_closed": item.is_closed,
-            "n_kids": item.n_kids,
-            "page_index": item.page_index,
-            "view_mode": pdfium_i.ViewmodeToStr.get(item.view_mode),
-            "view_pos": item.view_pos,
-        }
-        toc_list.append(list_item)
-    return toc_list
 def get_length_of_text(fname: str) -> int:
     doc = pdfium.PdfDocument(fname)
     text = naive_get_text(doc).strip()

 from typing import List, Optional, Dict
 import pypdfium2 as pdfium
+from marker.cleaners.toc import get_pdf_toc
 from marker.pdf.utils import font_flags_decomposer
 from marker.settings import settings
 from marker.schema.block import Span, Line, Block
 def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
+    toc = get_pdf_toc(doc)
     if start_page:
         assert start_page < len(doc)
     return full_text
 def get_length_of_text(fname: str) -> int:
     doc = pdfium.PdfDocument(fname)
     text = naive_get_text(doc).strip()