"""Per-page text extraction from PDF files, with an OCR fallback.

Each page's native text layer is read with PyMuPDF; when that text is shorter
than a configurable threshold the page is rendered to an image and passed to
Tesseract (via pytesseract) instead.
"""

from __future__ import annotations

import io
from dataclasses import dataclass
from typing import List, Optional, Tuple

import fitz  # PyMuPDF
import pytesseract
from PIL import Image


@dataclass
class PageText:
    """Text collected for a single PDF page."""

    page_index: int  # 0-based
    extracted_text: str  # cleaned native text layer (kept even when OCR ran)
    ocr_text: str  # cleaned OCR output; "" when OCR was not used
    used_ocr: bool  # True when the page went through the OCR branch


def _safe_text(s: Optional[str]) -> str:
    """Return *s* with NUL characters replaced by spaces and ends stripped.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    return (s or "").replace("\x00", " ").strip()


def is_likely_scanned(extracted_text: str, min_text_chars_for_digital: int) -> bool:
    """Return True when the native text is too short to trust.

    Simple heuristic: the text is cleaned with ``_safe_text`` and the page is
    considered scanned if fewer than ``min_text_chars_for_digital`` characters
    remain.
    """
    return len(_safe_text(extracted_text)) < min_text_chars_for_digital


def _render_page(page: fitz.Page, dpi: int) -> Image.Image:
    """Rasterise an already-loaded page to an RGB PIL image at *dpi*."""
    # The render matrix is expressed relative to 72 units per inch, so the
    # scale factor for the requested resolution is dpi / 72.
    zoom = dpi / 72.0
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    # Hand the pixmap to Pillow through an in-memory PNG.
    return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")


def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int = 200) -> Image.Image:
    """Render page *page_index* of an open document to an RGB PIL image.

    Args:
        doc: An open PyMuPDF document; it is not closed by this function.
        page_index: Index of the page, passed straight to ``doc.load_page``.
        dpi: Target resolution of the rendered image.

    Returns:
        The rendered page as an ``RGB`` image.
    """
    return _render_page(doc.load_page(page_index), dpi)


def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
    """Run Tesseract on *img* and return the cleaned recognised text.

    Args:
        img: Image to recognise.
        lang: Tesseract language code forwarded to pytesseract.
    """
    # You can also add config like "--psm 6" if needed.
    txt = pytesseract.image_to_string(img, lang=lang)
    return _safe_text(txt)


def extract_texts_from_pdf(
    pdf_path: str,
    dpi: int = 200,
    ocr_lang: str = "eng",
    min_text_chars_for_digital: int = 80,
) -> Tuple[List[PageText], int]:
    """Extract text from every page of a PDF, using OCR where needed.

    For each page the native text layer is read first. If
    ``is_likely_scanned`` judges it too short, the page is rendered at *dpi*
    and OCR'd with *ocr_lang*; otherwise ``ocr_text`` is left empty.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Render resolution used for pages that go through OCR.
        ocr_lang: Tesseract language code for OCR.
        min_text_chars_for_digital: Minimum cleaned native-text length for a
            page to be treated as digital (no OCR).

    Returns:
        A ``(pages, page_count)`` tuple: one ``PageText`` per page in page
        order, and the document's page count.
    """
    results: List[PageText] = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        page_count = doc.page_count
        for i in range(page_count):
            page = doc.load_page(i)
            extracted = _safe_text(page.get_text("text"))

            if is_likely_scanned(extracted, min_text_chars_for_digital):
                # Reuse the page loaded above instead of loading it again.
                ocr_txt = ocr_pil_image(_render_page(page, dpi), lang=ocr_lang)
                results.append(
                    PageText(i, extracted_text=extracted, ocr_text=ocr_txt, used_ocr=True)
                )
            else:
                results.append(
                    PageText(i, extracted_text=extracted, ocr_text="", used_ocr=False)
                )

    return results, page_count


def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int = 200) -> List[bytes]:
    """Render selected pages of a PDF and return each one as PNG bytes.

    Args:
        pdf_path: Path of the PDF to open.
        page_indices: Page indices to render; output order matches this list.
        dpi: Render resolution.

    Returns:
        One PNG-encoded ``bytes`` object per requested page.
    """
    out: List[bytes] = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as doc:
        for p in page_indices:
            img = render_page_to_pil(doc, p, dpi=dpi)
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out.append(buf.getvalue())
    return out