from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional, ContextManager, List, Tuple
from fitz import Document
import sys
import fitz
import io
import csv
import chardet
@contextmanager
def open_pdf(path: str,
             exit_on_error: bool = True
             ) -> ContextManager[Optional["Document"]]:
    """A context manager for fitz Document

    This context manager takes care of the error handling when creating a
    fitz Document, and closes the document when the ``with`` block exits.

    Arguments
      path: the path of the pdf file
      exit_on_error: if true, print the error to stderr and exit with error
                     code 1 when the file cannot be opened; if false, yield
                     ``None`` instead

    Yields
      the opened document, or ``None`` when opening failed and
      ``exit_on_error`` is false
    """
    try:
        doc = fitz.open(path)
    except Exception as e:
        if exit_on_error:
            print(f"error: fail to open {path}", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        doc = None

    # Yield outside the ``except`` handler: an exception raised inside the
    # caller's ``with`` body is thrown back in at the yield point, and it
    # must not land in a still-active handler where it would be implicitly
    # chained to the unrelated open failure.
    try:
        yield doc
    finally:
        if doc is not None:
            doc.close()
@dataclass
class ToCEntry:
    """One entry (heading) of a table of contents"""
    # nesting depth of the heading
    level: int
    # heading text
    title: str
    # page the heading points to
    pagenum: int
    # vpos == bbox.top; optional, only consulted when ordering entries
    vpos: Optional[float] = None

    @staticmethod
    def key(e) -> Tuple[int, float]:
        """Sort key: order by page first, then by vertical position

        An entry without a vertical position is treated as if it were at 0.
        """
        if e.vpos is None:
            return (e.pagenum, 0)
        return (e.pagenum, e.vpos)

    def to_fitz_entry(self) -> list:
        """Return the entry as the list ``[level, title, pagenum]``

        The vertical position is appended as a fourth item when it is set.
        """
        fitz_entry = [self.level, self.title, self.pagenum]
        if self.vpos is not None:
            fitz_entry.append(self.vpos)
        return fitz_entry
def dump_toc(entries: List["ToCEntry"], dump_vpos: bool = False) -> str:
    """Dump table of contents as a CSV dialect

    Indentation represents the level of each entry; other than that,
    everything is similar to normal CSV (space-delimited, with every
    non-numeric field quoted).

    Argument
      entries: a list of ToC entries
      dump_vpos: if true, the vertical position of an entry is also dumped
                 whenever the entry has one
    Returns
      a multiline string
    """
    with io.StringIO(newline='\n') as out:
        writer = csv.writer(out, lineterminator='\n',
                            delimiter=' ', quoting=csv.QUOTE_NONNUMERIC)
        for entry in entries:
            # The indent goes straight to the buffer, bypassing the csv
            # writer, so it is not quoted as if it were a field.
            out.write((entry.level - 1) * ' ')
            row = [entry.title, entry.pagenum]
            # An explicit test instead of multiplying a list by the flag:
            # that trick raises TypeError for a falsy non-bool such as None.
            if dump_vpos and entry.vpos is not None:
                row.append(entry.vpos)
            writer.writerow(row)
        return out.getvalue()
def pprint_toc(entries: List["ToCEntry"]) -> str:
    """Pretty print table of contents

    Each entry becomes one line: an indent that grows with the entry's
    level, the title, a dotted leader, then the page number.

    Argument
      entries: a list of ToC entries
    Returns
      a multiline string
    """
    # Three middle dots (U+00B7). Written as escapes so the separator cannot
    # be corrupted again by a wrong-encoding round trip of this file.
    leader = "\u00b7" * 3
    return '\n'.join(
        f"{(entry.level - 1) * ' '}{entry.title} {leader} {entry.pagenum}"
        for entry in entries
    )
def get_file_encoding(path: str) -> str:
    """Get encoding of file

    The encoding is guessed by chardet from the raw bytes of the file. This
    is best-effort: if the file cannot be read, detection fails, or no guess
    can be made, 'utf-8' is returned.

    Argument
      path: file path
    Returns
      encoding string
    """
    try:
        with open(path, "rb") as f:
            # chardet.detect returns a dict, so the guess is read by key;
            # attribute access here would always raise AttributeError.
            enc = chardet.detect(f.read())["encoding"]
    except Exception:
        # Deliberate fallback, but KeyboardInterrupt and SystemExit are no
        # longer swallowed as they were by a bare ``except:``.
        enc = None
    # The reported encoding is None when chardet cannot make a guess.
    return enc or 'utf-8'
|