File size: 3,057 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import csv
import io
import sys
from contextlib import contextmanager
from dataclasses import dataclass
from typing import ContextManager, Iterator, List, Optional, Tuple

import chardet
import fitz
from fitz import Document


@contextmanager
def open_pdf(path: str,
             exit_on_error: bool = True
             ) -> Iterator[Optional[Document]]:
    """A context manager for fitz Document

    This context manager will take care of the error handling when creating a
    fitz Document, and closes the document when the `with` block is left.

    Arguments
      path: the path of the pdf file
      exit_on_error: if true, print the error to stderr and exit with error
                     code 1 when the document cannot be opened; if false,
                     yield None instead
    Yields
      the opened document, or None if opening failed and exit_on_error is
      false
    """
    try:
        doc = fitz.open(path)
    except Exception as e:
        if exit_on_error:
            print(f"error: fail to open {path}", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        doc = None

    if doc is None:
        # Yield outside the except handler: an exception raised by the
        # caller's `with` body must not be chained onto the open failure,
        # and the caught exception should not stay alive during the body.
        yield None
        return

    try:
        yield doc
    finally:
        # always release the document, even if the `with` body raised
        doc.close()


@dataclass
class ToCEntry:
    """One entry of a table of contents"""
    level: int
    title: str
    pagenum: int
    # vpos == bbox.top, used for sorting; None when it is not known
    vpos: Optional[float] = None

    @staticmethod
    def key(e) -> Tuple[int, float]:
        """Sort key: page number first, then vertical position

        An entry without a vertical position is keyed as if it were at 0.
        """
        vertical = e.vpos if e.vpos is not None else 0
        return (e.pagenum, vertical)

    def to_fitz_entry(self) -> list:
        """Return the entry as a list [level, title, pagenum(, vpos)]

        The vertical position is appended only when it is set.
        """
        fitz_row = [self.level, self.title, self.pagenum]
        if self.vpos is not None:
            fitz_row.append(self.vpos)
        return fitz_row


def dump_toc(entries: List[ToCEntry], dump_vpos: bool = False) -> str:
    """Serialize a table of contents into a CSV-like text

    Every entry becomes one space-delimited row holding the quoted title and
    the page number. The level of an entry is expressed by indenting its row
    with four spaces per level below the first.

    Argument
      entries: a list of ToC entries
      dump_vpos: if true, the vertical position of an entry is appended to
                 its row whenever the entry has one
    Returns
      a multiline string
    """
    with io.StringIO(newline='\n') as buffer:
        row_writer = csv.writer(buffer,
                                delimiter=' ',
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        for item in entries:
            row = [item.title, item.pagenum]
            if dump_vpos and item.vpos is not None:
                row.append(item.vpos)
            # the indentation is written raw so the csv writer never quotes it
            indent = '    ' * (item.level - 1)
            buffer.write(indent)
            row_writer.writerow(row)
        text = buffer.getvalue()
    return text


def pprint_toc(entries: List[ToCEntry]) -> str:
    """Pretty print table of contents

    Each entry is rendered on its own line as the title, three middle dots
    (U+00B7) and the page number, indented by four spaces per level below
    the first.

    Argument
      entries: a list of ToC entries
    Returns
      a multiline string (empty when there are no entries)
    """
    # Spelled with escapes so the middle dots survive the file being re-saved
    # in a different encoding.
    separator = " \u00b7\u00b7\u00b7 "
    return '\n'.join(
        f"{(entry.level - 1) * '    '}{entry.title}{separator}{entry.pagenum}"
        for entry in entries
    )


def get_file_encoding(path: str) -> str:
    """Get encoding of file

    The file is read as bytes and its encoding is guessed with chardet. This
    is best effort: 'utf-8' is returned when the file cannot be read or when
    chardet is unable to make a guess.

    Argument
      path: file path
    Returns
      encoding string
    """
    fallback = 'utf-8'
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except (OSError, ValueError):
        # unreadable file or malformed path: keep the best-effort default
        return fallback
    # chardet.detect returns a dict (not an object with attributes); its
    # 'encoding' value is None when no guess could be made
    detected = chardet.detect(raw).get('encoding')
    return detected or fallback