| from pathlib import Path | |
| import fitz | |
| import PIL.Image | |
| def extract_pages(path: Path) -> list[dict]: | |
| doc = fitz.open(str(path)) | |
| pages = [] | |
| for i, page in enumerate(doc): | |
| text = page.get_text("text") | |
| pages.append({"page": i + 1, "text": text}) | |
| doc.close() | |
| return pages | |
| def is_text_pdf(path: Path) -> bool: | |
| doc = fitz.open(str(path)) | |
| if not doc.page_count: | |
| doc.close() | |
| return False | |
| total_chars = sum(len(page.get_text("text")) for page in doc) | |
| avg = total_chars / doc.page_count | |
| doc.close() | |
| return avg >= 50 | |
| def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image: | |
| doc = fitz.open(str(path)) | |
| page = doc[page_no - 1] | |
| mat = fitz.Matrix(dpi / 72, dpi / 72) | |
| pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB) | |
| img = PIL.Image.frombytes("RGB", (pix.width, pix.height), pix.samples) | |
| doc.close() | |
| return img | |