Vik Paruchuri committed on
Commit
a97dc07
·
1 Parent(s): d8cea21

Add computed table of contents

Browse files
README.md CHANGED
@@ -147,6 +147,36 @@ METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert
147
 
148
  Note that the env variables above are specific to this script, and cannot be set in `local.env`.
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # Troubleshooting
151
 
152
  There are some settings that you may find useful if things aren't working the way you expect:
 
147
 
148
  Note that the env variables above are specific to this script, and cannot be set in `local.env`.
149
 
150
+ # Output format
151
+
152
+ The output will be a Markdown file, but there will also be a metadata JSON file that gives information about the conversion process. It has these fields:
153
+
154
+ ```json
155
+ {
156
+ "languages": null, // any languages that were passed in
157
+ "filetype": "pdf", // type of the file
158
+ "pdf_toc": [], // the table of contents from the pdf
159
+ "computed_toc": [], // the computed table of contents
160
+ "pages": 10, // page count
161
+ "ocr_stats": {
162
+ "ocr_pages": 0, // number of pages OCRed
163
+ "ocr_failed": 0, // number of pages where OCR failed
164
+ "ocr_success": 0,
165
+ "ocr_engine": "none"
166
+ },
167
+ "block_stats": {
168
+ "header_footer": 0,
169
+ "code": 0, // number of code blocks
170
+ "table": 2, // number of tables
171
+ "equations": {
172
+ "successful_ocr": 0,
173
+ "unsuccessful_ocr": 0,
174
+ "equations": 0
175
+ }
176
+ }
177
+ }
178
+ ```
179
+
180
  # Troubleshooting
181
 
182
  There are some settings that you may find useful if things aren't working the way you expect:
marker/cleaners/headings.py CHANGED
@@ -120,3 +120,4 @@ def infer_heading_levels(pages: List[Page], height_tol=.99):
120
 
121
  if block.heading_level is None:
122
  block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
 
 
120
 
121
  if block.heading_level is None:
122
  block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
123
+
marker/cleaners/toc.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from marker.schema.page import Page
4
+
5
+
6
+ def get_pdf_toc(doc, max_depth=15):
7
+ toc = doc.get_toc(max_depth=max_depth)
8
+ toc_list = []
9
+ for item in toc:
10
+ list_item = {
11
+ "title": item.title,
12
+ "level": item.level,
13
+ "page": item.page_index,
14
+ }
15
+ toc_list.append(list_item)
16
+ return toc_list
17
+
18
+
19
+ def compute_toc(pages: List[Page]):
20
+ toc = []
21
+ for page in pages:
22
+ for block in page.blocks:
23
+ if block.block_type in ["Title", "Section-header"]:
24
+ toc.append({
25
+ "title": block.prelim_text,
26
+ "level": block.heading_level,
27
+ "page": page.pnum
28
+ })
29
+ return toc
marker/convert.py CHANGED
@@ -1,4 +1,7 @@
1
  import warnings
 
 
 
2
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
3
 
4
  import os
@@ -72,7 +75,7 @@ def convert_single_pdf(
72
  start_page=start_page
73
  )
74
  out_meta.update({
75
- "toc": toc,
76
  "pages": len(pages),
77
  })
78
 
@@ -149,6 +152,9 @@ def convert_single_pdf(
149
  infer_heading_levels(pages)
150
  find_bold_italic(pages)
151
 
 
 
 
152
  # Copy to avoid changing original data
153
  merged_lines = merge_spans(filtered)
154
  text_blocks = merge_lines(merged_lines)
 
1
  import warnings
2
+
3
+ from marker.cleaners.toc import compute_toc
4
+
5
  warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
6
 
7
  import os
 
75
  start_page=start_page
76
  )
77
  out_meta.update({
78
+ "pdf_toc": toc,
79
  "pages": len(pages),
80
  })
81
 
 
152
  infer_heading_levels(pages)
153
  find_bold_italic(pages)
154
 
155
+ # Use headers to compute a table of contents
156
+ out_meta["computed_toc"] = compute_toc(pages)
157
+
158
  # Copy to avoid changing original data
159
  merged_lines = merge_spans(filtered)
160
  text_blocks = merge_lines(merged_lines)
marker/pdf/extract_text.py CHANGED
@@ -2,8 +2,8 @@ import os
2
  from typing import List, Optional, Dict
3
 
4
  import pypdfium2 as pdfium
5
- import pypdfium2.internal as pdfium_i
6
 
 
7
  from marker.pdf.utils import font_flags_decomposer
8
  from marker.settings import settings
9
  from marker.schema.block import Span, Line, Block
@@ -77,7 +77,7 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
77
 
78
 
79
  def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
80
- toc = get_toc(doc)
81
 
82
  if start_page:
83
  assert start_page < len(doc)
@@ -107,23 +107,6 @@ def naive_get_text(doc):
107
  return full_text
108
 
109
 
110
- def get_toc(doc, max_depth=15):
111
- toc = doc.get_toc(max_depth=max_depth)
112
- toc_list = []
113
- for item in toc:
114
- list_item = {
115
- "title": item.title,
116
- "level": item.level,
117
- "is_closed": item.is_closed,
118
- "n_kids": item.n_kids,
119
- "page_index": item.page_index,
120
- "view_mode": pdfium_i.ViewmodeToStr.get(item.view_mode),
121
- "view_pos": item.view_pos,
122
- }
123
- toc_list.append(list_item)
124
- return toc_list
125
-
126
-
127
  def get_length_of_text(fname: str) -> int:
128
  doc = pdfium.PdfDocument(fname)
129
  text = naive_get_text(doc).strip()
 
2
  from typing import List, Optional, Dict
3
 
4
  import pypdfium2 as pdfium
 
5
 
6
+ from marker.cleaners.toc import get_pdf_toc
7
  from marker.pdf.utils import font_flags_decomposer
8
  from marker.settings import settings
9
  from marker.schema.block import Span, Line, Block
 
77
 
78
 
79
  def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
80
+ toc = get_pdf_toc(doc)
81
 
82
  if start_page:
83
  assert start_page < len(doc)
 
107
  return full_text
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  def get_length_of_text(fname: str) -> int:
111
  doc = pdfium.PdfDocument(fname)
112
  text = naive_get_text(doc).strip()