Spaces:
Running
Running
File size: 836 Bytes
7d2fea2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | import re
import tempfile
import unicodedata
from pathlib import Path
import fitz
def read_pdf(source: bytes | bytearray | memoryview | str | Path) -> tuple[str, int]:
    """Extract text from a PDF given either as raw content or as a file path.

    Args:
        source: The PDF content as a bytes-like object, or the location of a
            PDF file as a ``str`` or ``Path``.

    Returns:
        The ``(text, page_count)`` tuple produced by ``_read_pdf_path``.

    Raises:
        Whatever ``_read_pdf_path`` raises for the resulting file.
    """
    if isinstance(source, (bytes, bytearray, memoryview)):
        # Write the data to a regular, already-closed file inside a temporary
        # directory. Passing the name of a still-open NamedTemporaryFile to
        # another opener is not portable: on Windows the file cannot be opened
        # a second time while the first handle is open.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "input.pdf"
            pdf_path.write_bytes(source)
            return _read_pdf_path(pdf_path)
    return _read_pdf_path(Path(source))
def _read_pdf_path(path: Path) -> tuple[str, int]:
    """Read the PDF at *path* and return its cleaned-up text and page count.

    The text of every page is extracted, the pages are joined with a blank
    line between them, and the result is NFKC-normalized. Runs of spaces and
    tabs are then collapsed to a single space, runs of three or more newlines
    are reduced to two, and surrounding whitespace is stripped.

    Args:
        path: Location of the PDF file to open.

    Returns:
        A ``(text, page_count)`` tuple.

    Raises:
        ValueError: If no text remains after cleaning.
    """
    page_texts = []
    with fitz.open(path) as document:
        for page in document:
            extracted = page.get_text("text")
            # Substitute an empty string for any falsy extraction result so
            # the join below always receives strings.
            page_texts.append(extracted if extracted else "")

    combined = "\n\n".join(page_texts)
    normalized = unicodedata.normalize("NFKC", combined)
    single_spaced = re.sub(r"[ \t]+", " ", normalized)
    cleaned = re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

    if cleaned:
        return cleaned, len(page_texts)
    raise ValueError("No extractable text found in PDF. The file may be scanned.")
|