File size: 836 Bytes
7d2fea2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re
import tempfile
import unicodedata
from pathlib import Path

import fitz


def read_pdf(
    source: bytes | bytearray | memoryview | str | Path,
) -> tuple[str, int]:
    """Extract text from a PDF supplied as in-memory bytes or as a file path.

    Args:
        source: Either the raw PDF content as a bytes-like object, or the
            location of a PDF file given as a ``str`` or ``Path``.

    Returns:
        The ``(text, page_count)`` tuple produced by ``_read_pdf_path``.

    Raises:
        ValueError: Propagated from ``_read_pdf_path`` when the PDF yields
            no extractable text.
    """
    if isinstance(source, (bytes, bytearray, memoryview)):
        # Write to a regular file inside a temporary directory and close it
        # before reading. Re-opening a still-open NamedTemporaryFile by name
        # fails on Windows, so the handle must be released first.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / "document.pdf"
            tmp_path.write_bytes(source)
            return _read_pdf_path(tmp_path)
    return _read_pdf_path(Path(source))


def _read_pdf_path(path: Path) -> tuple[str, int]:
    """Read every page of the PDF at *path* and return its cleaned-up text.

    The per-page text is joined with blank lines, NFKC-normalized, and has
    its whitespace tidied: runs of spaces/tabs become one space, trailing
    spaces on lines are removed, and three or more consecutive newlines are
    reduced to two.

    Args:
        path: Location of the PDF file to open.

    Returns:
        A ``(text, page_count)`` tuple, where ``page_count`` is the number of
        pages iterated in the document.

    Raises:
        ValueError: If the cleaned text is empty.
    """
    with fitz.open(path) as pdf:
        # ``or ""`` guards against a falsy result for a page without text.
        pages = [page.get_text("text") or "" for page in pdf]

    text = unicodedata.normalize("NFKC", "\n\n".join(pages))
    text = re.sub(r"[ \t]+", " ", text)
    # Whitespace runs are now a single space; drop the one that may precede a
    # newline so whitespace-only lines count as blank for the collapse below.
    text = text.replace(" \n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text).strip()

    if not text:
        raise ValueError("No extractable text found in PDF. The file may be scanned.")
    return text, len(pages)