Vik Paruchuri
committed on
Commit
·
a97dc07
1
Parent(s):
d8cea21
Add computed table of contents
Browse files- README.md +30 -0
- marker/cleaners/headings.py +1 -0
- marker/cleaners/toc.py +29 -0
- marker/convert.py +7 -1
- marker/pdf/extract_text.py +2 -19
README.md
CHANGED
|
@@ -147,6 +147,36 @@ METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert
|
|
| 147 |
|
| 148 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
# Troubleshooting
|
| 151 |
|
| 152 |
There are some settings that you may find useful if things aren't working the way you expect:
|
|
|
|
| 147 |
|
| 148 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
| 149 |
|
| 150 |
+
# Output format
|
| 151 |
+
|
| 152 |
+
The output will be a markdown file, accompanied by a metadata JSON file that gives information about the conversion process. The metadata file has these fields:
|
| 153 |
+
|
| 154 |
+
```json
|
| 155 |
+
{
|
| 156 |
+
"languages": null, // any languages that were passed in
|
| 157 |
+
"filetype": "pdf", // type of the file
|
| 158 |
+
"pdf_toc": [], // the table of contents from the pdf
|
| 159 |
+
"computed_toc": [], // the computed table of contents
|
| 160 |
+
"pages": 10, // page count
|
| 161 |
+
"ocr_stats": {
|
| 162 |
+
"ocr_pages": 0, // number of pages OCRed
|
| 163 |
+
"ocr_failed": 0, // number of pages where OCR failed
|
| 164 |
+
"ocr_success": 0,
|
| 165 |
+
"ocr_engine": "none"
|
| 166 |
+
},
|
| 167 |
+
"block_stats": {
|
| 168 |
+
"header_footer": 0,
|
| 169 |
+
"code": 0, // number of code blocks
|
| 170 |
+
"table": 2, // number of tables
|
| 171 |
+
"equations": {
|
| 172 |
+
"successful_ocr": 0,
|
| 173 |
+
"unsuccessful_ocr": 0,
|
| 174 |
+
"equations": 0
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
# Troubleshooting
|
| 181 |
|
| 182 |
There are some settings that you may find useful if things aren't working the way you expect:
|
marker/cleaners/headings.py
CHANGED
|
@@ -120,3 +120,4 @@ def infer_heading_levels(pages: List[Page], height_tol=.99):
|
|
| 120 |
|
| 121 |
if block.heading_level is None:
|
| 122 |
block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
|
|
|
|
|
|
| 120 |
|
| 121 |
if block.heading_level is None:
|
| 122 |
block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
|
| 123 |
+
|
marker/cleaners/toc.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
from marker.schema.page import Page
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_pdf_toc(doc, max_depth=15):
    """Return the document's embedded table of contents as a list of dicts.

    Args:
        doc: Object exposing ``get_toc(max_depth=...)``. Each item it yields
            must provide ``title``, ``level`` and ``page_index`` attributes
            (presumably a pypdfium2 ``PdfDocument`` -- confirm against caller).
        max_depth: Forwarded unchanged to ``doc.get_toc``.

    Returns:
        One dict per TOC item, in the order yielded, with the keys
        ``"title"``, ``"level"`` and ``"page"`` (the item's ``page_index``).
    """
    return [
        {
            "title": item.title,
            "level": item.level,
            "page": item.page_index,
        }
        for item in doc.get_toc(max_depth=max_depth)
    ]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def compute_toc(pages: List[Page]):
    """Build a table of contents from the heading blocks on each page.

    Pages are walked in order, and within each page the blocks are walked in
    order. Every block whose ``block_type`` is ``"Title"`` or
    ``"Section-header"`` contributes one entry.

    Args:
        pages: Pages to scan. Each page must expose ``blocks`` and ``pnum``;
            each block must expose ``block_type``, ``prelim_text`` and
            ``heading_level``.

    Returns:
        A list of dicts with the keys ``"title"`` (the block's
        ``prelim_text``), ``"level"`` (the block's ``heading_level``) and
        ``"page"`` (the owning page's ``pnum``).
    """
    heading_types = ("Title", "Section-header")
    toc = []
    for page in pages:
        toc.extend(
            {
                "title": block.prelim_text,
                "level": block.heading_level,
                "page": page.pnum,
            }
            for block in page.blocks
            if block.block_type in heading_types
        )
    return toc
|
marker/convert.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
import warnings
|
|
|
|
|
|
|
|
|
|
| 2 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 3 |
|
| 4 |
import os
|
|
@@ -72,7 +75,7 @@ def convert_single_pdf(
|
|
| 72 |
start_page=start_page
|
| 73 |
)
|
| 74 |
out_meta.update({
|
| 75 |
-
"toc": toc,
|
| 76 |
"pages": len(pages),
|
| 77 |
})
|
| 78 |
|
|
@@ -149,6 +152,9 @@ def convert_single_pdf(
|
|
| 149 |
infer_heading_levels(pages)
|
| 150 |
find_bold_italic(pages)
|
| 151 |
|
|
|
|
|
|
|
|
|
|
| 152 |
# Copy to avoid changing original data
|
| 153 |
merged_lines = merge_spans(filtered)
|
| 154 |
text_blocks = merge_lines(merged_lines)
|
|
|
|
| 1 |
import warnings
|
| 2 |
+
|
| 3 |
+
from marker.cleaners.toc import compute_toc
|
| 4 |
+
|
| 5 |
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings
|
| 6 |
|
| 7 |
import os
|
|
|
|
| 75 |
start_page=start_page
|
| 76 |
)
|
| 77 |
out_meta.update({
|
| 78 |
+
"pdf_toc": toc,
|
| 79 |
"pages": len(pages),
|
| 80 |
})
|
| 81 |
|
|
|
|
| 152 |
infer_heading_levels(pages)
|
| 153 |
find_bold_italic(pages)
|
| 154 |
|
| 155 |
+
# Use headers to compute a table of contents
|
| 156 |
+
out_meta["computed_toc"] = compute_toc(pages)
|
| 157 |
+
|
| 158 |
# Copy to avoid changing original data
|
| 159 |
merged_lines = merge_spans(filtered)
|
| 160 |
text_blocks = merge_lines(merged_lines)
|
marker/pdf/extract_text.py
CHANGED
|
@@ -2,8 +2,8 @@ import os
|
|
| 2 |
from typing import List, Optional, Dict
|
| 3 |
|
| 4 |
import pypdfium2 as pdfium
|
| 5 |
-
import pypdfium2.internal as pdfium_i
|
| 6 |
|
|
|
|
| 7 |
from marker.pdf.utils import font_flags_decomposer
|
| 8 |
from marker.settings import settings
|
| 9 |
from marker.schema.block import Span, Line, Block
|
|
@@ -77,7 +77,7 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
|
|
| 77 |
|
| 78 |
|
| 79 |
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
|
| 80 |
-
toc = get_toc(doc)
|
| 81 |
|
| 82 |
if start_page:
|
| 83 |
assert start_page < len(doc)
|
|
@@ -107,23 +107,6 @@ def naive_get_text(doc):
|
|
| 107 |
return full_text
|
| 108 |
|
| 109 |
|
| 110 |
-
def get_toc(doc, max_depth=15):
|
| 111 |
-
toc = doc.get_toc(max_depth=max_depth)
|
| 112 |
-
toc_list = []
|
| 113 |
-
for item in toc:
|
| 114 |
-
list_item = {
|
| 115 |
-
"title": item.title,
|
| 116 |
-
"level": item.level,
|
| 117 |
-
"is_closed": item.is_closed,
|
| 118 |
-
"n_kids": item.n_kids,
|
| 119 |
-
"page_index": item.page_index,
|
| 120 |
-
"view_mode": pdfium_i.ViewmodeToStr.get(item.view_mode),
|
| 121 |
-
"view_pos": item.view_pos,
|
| 122 |
-
}
|
| 123 |
-
toc_list.append(list_item)
|
| 124 |
-
return toc_list
|
| 125 |
-
|
| 126 |
-
|
| 127 |
def get_length_of_text(fname: str) -> int:
|
| 128 |
doc = pdfium.PdfDocument(fname)
|
| 129 |
text = naive_get_text(doc).strip()
|
|
|
|
| 2 |
from typing import List, Optional, Dict
|
| 3 |
|
| 4 |
import pypdfium2 as pdfium
|
|
|
|
| 5 |
|
| 6 |
+
from marker.cleaners.toc import get_pdf_toc
|
| 7 |
from marker.pdf.utils import font_flags_decomposer
|
| 8 |
from marker.settings import settings
|
| 9 |
from marker.schema.block import Span, Line, Block
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
|
| 80 |
+
toc = get_pdf_toc(doc)
|
| 81 |
|
| 82 |
if start_page:
|
| 83 |
assert start_page < len(doc)
|
|
|
|
| 107 |
return full_text
|
| 108 |
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
def get_length_of_text(fname: str) -> int:
|
| 111 |
doc = pdfium.PdfDocument(fname)
|
| 112 |
text = naive_get_text(doc).strip()
|