Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import List, Tuple | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| import io | |
| import pytesseract | |
@dataclass
class PageText:
    """Text recovered from a single PDF page.

    Declared as a dataclass so that instances can be built with positional
    or keyword arguments, e.g.
    ``PageText(0, extracted_text="...", ocr_text="", used_ocr=False)``.
    """

    # Position of the page inside the document (0-based).
    page_index: int
    # Text taken from the PDF's native text layer.
    extracted_text: str
    # Text produced by OCR; empty string when OCR was not run for the page.
    ocr_text: str
    # True when the page was rendered and passed through OCR.
    used_ocr: bool
def _safe_text(s: str) -> str:
    """Return *s* with NUL characters turned into spaces and the ends trimmed.

    A falsy argument (``None`` or ``""``) yields the empty string.
    """
    if not s:
        return ""
    without_nul = s.replace("\x00", " ")
    return without_nul.strip()
def is_likely_scanned(extracted_text: str, min_text_chars_for_digital: int) -> bool:
    """Guess whether a page is a scan from the amount of native text it has.

    The text is cleaned with ``_safe_text`` first; the page is considered
    scanned when the cleaned text is shorter than
    ``min_text_chars_for_digital`` characters.
    """
    usable_chars = len(_safe_text(extracted_text))
    return usable_chars < min_text_chars_for_digital
def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int = 200) -> Image.Image:
    """Rasterise one page of *doc* and return it as an RGB PIL image.

    The page is rendered without an alpha channel through a uniform scaling
    matrix of ``dpi / 72.0``, encoded to PNG in memory and reopened with
    Pillow.
    """
    # NOTE(review): 72 is presumably the renderer's base resolution in dpi.
    scale = dpi / 72.0
    pixmap = doc.load_page(page_index).get_pixmap(
        matrix=fitz.Matrix(scale, scale),
        alpha=False,
    )
    png_stream = io.BytesIO(pixmap.tobytes("png"))
    return Image.open(png_stream).convert("RGB")
def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
    """Run Tesseract on *img* for language *lang* and return the cleaned text.

    The raw OCR output is passed through ``_safe_text`` before being returned.
    """
    # You can also add config like "--psm 6" if needed.
    raw_text = pytesseract.image_to_string(img, lang=lang)
    cleaned = _safe_text(raw_text)
    return cleaned
def extract_texts_from_pdf(
    pdf_path: str,
    dpi: int = 200,
    ocr_lang: str = "eng",
    min_text_chars_for_digital: int = 80,
) -> Tuple[List[PageText], int]:
    """Extract text from every page of a PDF, using OCR for text-poor pages.

    For each page the native text layer is read first. When
    ``is_likely_scanned`` judges that text too short, the page is rendered
    at ``dpi`` and run through OCR with language ``ocr_lang``.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Resolution used when rendering a page for OCR.
        ocr_lang: Language code handed to the OCR step.
        min_text_chars_for_digital: Pages whose cleaned native text is
            shorter than this are treated as scanned.

    Returns:
        A tuple ``(pages, page_count)``: one ``PageText`` per page in
        document order, and the number of pages in the document.
    """
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document is closed even when text
    # extraction, rendering or OCR raises part-way through.
    try:
        page_count = doc.page_count
        results: List[PageText] = []
        for i in range(page_count):
            extracted = _safe_text(doc.load_page(i).get_text("text"))
            if is_likely_scanned(extracted, min_text_chars_for_digital):
                img = render_page_to_pil(doc, i, dpi=dpi)
                ocr_txt = ocr_pil_image(img, lang=ocr_lang)
                results.append(
                    PageText(i, extracted_text=extracted, ocr_text=ocr_txt, used_ocr=True)
                )
            else:
                results.append(
                    PageText(i, extracted_text=extracted, ocr_text="", used_ocr=False)
                )
    finally:
        doc.close()
    return results, page_count
def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int = 200) -> List[bytes]:
    """Render selected pages of a PDF and return each one as PNG bytes.

    Args:
        pdf_path: Path of the PDF file to open.
        page_indices: Page indices to render, in the order wanted.
        dpi: Resolution used when rendering each page.

    Returns:
        A list of PNG-encoded images, one per entry of ``page_indices`` and
        in the same order.
    """
    doc = fitz.open(pdf_path)
    out: List[bytes] = []
    # try/finally guarantees the document is closed even when rendering or
    # PNG encoding raises (for example on an out-of-range page index).
    try:
        for p in page_indices:
            img = render_page_to_pil(doc, p, dpi=dpi)
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out.append(buf.getvalue())
    finally:
        doc.close()
    return out