File size: 1,312 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""Parser for table of content csv file"""

import csv
import sys

from typing import IO, List
from fitzutils import ToCEntry
from itertools import takewhile


def parse_entry(entry: List) -> ToCEntry:
    """Parse a row in csv to a toc entry

    Arguments
      entry: the fields of one row as yielded by the csv reader: zero or more
             leading '' fields (one per leading space), then the heading, then
             the page number, then any remaining fields (passed through as
             extra positional arguments, labelled vpos)
    Returns
      the ToCEntry built from the row
    Raises
      IndexError: the heading or the page number is missing
      ValueError: the page number cannot be converted to an integer
    """

    # a somewhat weird hack, csv reader would read spaces as an empty '', so we
    # only need to count the number of '' before an entry to determine the
    # heading level
    indent = sum(1 for _ in takewhile(lambda x: x == '', entry))

    # only the field extraction is guarded, so the diagnostics below are never
    # printed for an unrelated error raised while building the ToCEntry
    try:
        heading = entry[indent]
        pagenum = int(entry[indent + 1])
    except IndexError:
        print(f"Unable to parse toc entry {entry};",
              f"Need at least {indent + 2} parts but only have {len(entry)}.",
              "Make sure the page number is present.",
              file=sys.stderr)
        raise
    except ValueError:
        # entry[indent + 1] is known to exist here, int() is what failed
        print(f"Unable to parse toc entry {entry};",
              f"Page number {entry[indent + 1]!r} is not an integer.",
              file=sys.stderr)
        raise

    return ToCEntry(
        indent // 4 + 1,      # 4 spaces = 1 level
        heading,
        pagenum,
        *entry[indent + 2:]   # vpos
    )


def parse_toc(file: IO) -> List[ToCEntry]:
    """Read a space-delimited toc file and convert every row to a toc entry

    Arguments
      file: an open text file (or any iterable of lines) holding the toc
    Returns
      one ToCEntry per row, in file order
    """
    rows = csv.reader(
        file,
        delimiter=' ',
        lineterminator='\n',
        # unquoted fields are converted to float by the reader
        quoting=csv.QUOTE_NONNUMERIC,
    )
    return [parse_entry(row) for row in rows]