pdf.tocgen.split / pdftocio /tocparser.py
adelevett's picture
Upload 76 files
046e3b8 verified
"""Parser for table of content csv file"""
import csv
import sys
from typing import IO, List
from fitzutils import ToCEntry
from itertools import takewhile
def parse_entry(entry: List) -> "ToCEntry":
    """Parse one row of the toc csv file into a toc entry.

    A row is expected to look like ``['', ..., '', heading, pagenum, *rest]``:
    the leading empty strings encode the indentation, and everything after
    the page number (e.g. the vertical position) is forwarded to ``ToCEntry``
    unchanged.

    Arguments
      entry: a row as produced by the csv reader
    Returns
      the toc entry described by the row
    Raises
      IndexError: the heading or the page number is missing
      ValueError, TypeError: the page number cannot be converted to an int
    """
    # A somewhat weird hack: the csv reader reads each space as an empty '',
    # so we only need to count the number of '' before an entry to determine
    # the heading level.
    indent = len(list(takewhile(lambda x: x == '', entry)))

    try:
        heading = entry[indent]
        raw_pagenum = entry[indent + 1]
    except IndexError:
        print(f"Unable to parse toc entry {entry};",
              f"Need at least {indent + 2} parts but only have {len(entry)}.",
              "Make sure the page number is present.",
              file=sys.stderr)
        raise

    try:
        pagenum = int(raw_pagenum)
    except (TypeError, ValueError):
        # Report which row is at fault before propagating; otherwise the
        # caller only sees a bare conversion error with no context.
        print(f"Unable to parse toc entry {entry};",
              f"page number {raw_pagenum!r} is not a valid integer.",
              file=sys.stderr)
        raise

    return ToCEntry(
        indent // 4 + 1,       # 4 spaces = 1 level
        heading,
        pagenum,
        *entry[indent + 2:]    # vpos
    )
def parse_toc(file: IO) -> "List[ToCEntry]":
    """Parse a toc file to a list of toc entries.

    The file is read as space-delimited csv with ``QUOTE_NONNUMERIC``
    quoting. Each row that holds at least one non-empty field is handed to
    ``parse_entry``.

    Arguments
      file: an open, readable toc file
    Returns
      the parsed toc entries, in file order
    Raises
      whatever ``parse_entry`` or the csv reader raises on a malformed row
    """
    reader = csv.reader(file, lineterminator='\n',
                        delimiter=' ', quoting=csv.QUOTE_NONNUMERIC)
    # An empty line is read as [] and a line made only of spaces as a row of
    # '' fields. Neither describes an entry, so skip them instead of letting
    # parse_entry abort the whole parse with an IndexError.
    return [parse_entry(row) for row in reader
            if any(field != '' for field in row)]