Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

pdf.tocgen.split

File size: 3,057 Bytes

046e3b8

import csv
import io
import sys
from contextlib import contextmanager
from dataclasses import dataclass
from typing import ContextManager, Iterator, List, Optional, Tuple

import chardet
import fitz
from fitz import Document


@contextmanager
def open_pdf(path: str,
             exit_on_error: bool = True
             ) -> Iterator[Optional["Document"]]:
    """A context manager for fitz Document

    This context manager will take care of the error handling when creating a
    fitz Document, and closes the document when the with-block is left.

    Arguments
      path: the path of the pdf file
      exit_on_error: if true, print the error to stderr and exit with error
                     code 1 when the document cannot be opened; if false,
                     None is yielded instead of a document
    Yields
      the opened Document, or None when opening failed and exit_on_error is
      false
    """
    try:
        doc = fitz.open(path)
    except Exception as e:
        if exit_on_error:
            print(f"error: fail to open {path}", file=sys.stderr)
            print(e, file=sys.stderr)
            sys.exit(1)
        doc = None

    if doc is None:
        # Yield outside of the except block: otherwise an exception raised in
        # the caller's with-body would be thrown into this generator while
        # the open error is still being handled, and would end up implicitly
        # chained to that unrelated error.
        yield None
        return

    try:
        yield doc
    finally:
        # always release the document, even if the with-body raised
        doc.close()


@dataclass
class ToCEntry:
    """One row of a table of contents: a heading level, a title and a page"""
    level: int
    title: str
    pagenum: int
    # vpos == bbox.top, used for sorting
    vpos: Optional[float] = None

    @staticmethod
    def key(e) -> Tuple[int, float]:
        """Sort key: order by page number first, then by vertical position

        An entry without a vertical position is treated as if it were at 0.
        """
        vertical = e.vpos if e.vpos is not None else 0
        return (e.pagenum, vertical)

    def to_fitz_entry(self) -> list:
        """Convert the entry to a plain list

        Returns
          [level, title, pagenum], with vpos appended as a fourth item only
          when it is set
        """
        fitz_entry = [self.level, self.title, self.pagenum]
        if self.vpos is not None:
            fitz_entry.append(self.vpos)
        return fitz_entry


def dump_toc(entries: List[ToCEntry], dump_vpos: bool = False) -> str:
    """Serialize a table of contents to a CSV-like text format

    Every entry becomes one line. The line is indented by four spaces for
    each level below the first, and then holds the title and the page number
    as a space-delimited CSV row in which non-numeric fields are quoted.

    Argument
      entries: a list of ToC entries
      dump_vpos: if true, the vertical position of an entry is written as an
                 extra field whenever the entry has one
    Returns
      a multiline string
    """
    with io.StringIO(newline='\n') as buffer:
        row_writer = csv.writer(buffer,
                                delimiter=' ',
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        for item in entries:
            # the indentation is written directly to the buffer so that it
            # stays outside of the quoted title field
            indent = '    ' * (item.level - 1)
            buffer.write(indent)

            # a bool repeat count keeps either zero or one copy of vpos
            with_vpos = dump_vpos and item.vpos is not None
            fields = [item.title, item.pagenum] + [item.vpos] * with_vpos
            row_writer.writerow(fields)
        return buffer.getvalue()


def pprint_toc(entries: List[ToCEntry]) -> str:
    """Render a table of contents in a human readable form

    Each entry is put on its own line as "<title> ··· <pagenum>", indented by
    four spaces for each level below the first.

    Argument
      entries: a list of ToC entries
    Returns
      a multiline string
    """
    lines = []
    for item in entries:
        indent = '    ' * (item.level - 1)
        lines.append(f"{indent}{item.title} ··· {item.pagenum}")
    return '\n'.join(lines)


def get_file_encoding(path: str) -> str:
    """Get encoding of file

    The whole file is read as bytes and handed to chardet for detection.

    Argument
      path: file path
    Returns
      the detected encoding name, or 'utf-8' when the file cannot be read or
      no encoding could be detected
    """
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError:
        # best effort: an unreadable file falls back to the default encoding
        return 'utf-8'

    # chardet.detect returns a dict (not an object with attributes), and its
    # 'encoding' value is None when nothing could be detected
    return chardet.detect(raw).get('encoding') or 'utf-8'