finalyze / pdf_io.py
FridayCodehhr's picture
Upload 10 files
a9d5e1b verified
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Tuple
import fitz # PyMuPDF
from PIL import Image
import io
import pytesseract
@dataclass
class PageText:
    """Text recovered from one PDF page.

    Carries both the natively extracted text and any OCR output, together
    with a flag recording whether OCR was run for the page.
    """

    # Zero-based position of the page within the document.
    page_index: int
    # Text read from the PDF's own text layer.
    extracted_text: str
    # Text produced by OCR for this page.
    ocr_text: str
    # True when OCR was performed for this page.
    used_ocr: bool
def _safe_text(s: str) -> str:
return (s or "").replace("\x00", " ").strip()
def is_likely_scanned(extracted_text: str, min_text_chars_for_digital: int) -> bool:
    """Guess whether a page is a scan from how little native text it has.

    The text is cleaned with ``_safe_text`` first. The page counts as likely
    scanned when the cleaned text is shorter than
    ``min_text_chars_for_digital`` characters.
    """
    usable_char_count = len(_safe_text(extracted_text))
    return min_text_chars_for_digital > usable_char_count
def render_page_to_pil(doc: fitz.Document, page_index: int, dpi: int = 200) -> Image.Image:
    """Rasterize one page of *doc* and return it as an RGB PIL image.

    Args:
        doc: Open PyMuPDF document to render from.
        page_index: Index of the page passed to ``doc.load_page``.
        dpi: Requested resolution; the page is scaled by ``dpi / 72``.
    """
    target_page = doc.load_page(page_index)
    # Same scale factor on both axes, relative to a 72-dpi baseline.
    scale = dpi / 72.0
    pixmap = target_page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    # Hand the rendering to PIL through an in-memory PNG.
    png_stream = io.BytesIO(pixmap.tobytes("png"))
    return Image.open(png_stream).convert("RGB")
def ocr_pil_image(img: Image.Image, lang: str = "eng") -> str:
    """Run pytesseract OCR on *img* and return the cleaned text.

    Args:
        img: Image to recognise.
        lang: Language code forwarded to ``pytesseract.image_to_string``.
    """
    # Extra options such as "--psm 6" could be supplied through ``config`` if needed.
    return _safe_text(pytesseract.image_to_string(img, lang=lang))
def extract_texts_from_pdf(
    pdf_path: str,
    dpi: int = 200,
    ocr_lang: str = "eng",
    min_text_chars_for_digital: int = 80,
) -> Tuple[List[PageText], int]:
    """Extract text from every page of a PDF, using OCR for sparse pages.

    Each page's native text is read first. When ``is_likely_scanned`` judges
    that text too short, the page is rendered at ``dpi`` and OCR'd.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Rendering resolution used for pages that are OCR'd.
        ocr_lang: Language code passed to the OCR step.
        min_text_chars_for_digital: Minimum number of cleaned native-text
            characters for a page to be treated as digital (no OCR).

    Returns:
        A tuple ``(pages, page_count)``: one ``PageText`` per page in
        document order, and the number of pages in the document.
    """
    doc = fitz.open(pdf_path)
    # try/finally ensures the document is closed even when text extraction,
    # rendering, or OCR raises part-way through the page loop.
    try:
        page_count = doc.page_count
        results: List[PageText] = []
        for i in range(page_count):
            extracted = _safe_text(doc.load_page(i).get_text("text"))
            used_ocr = is_likely_scanned(extracted, min_text_chars_for_digital)
            ocr_txt = ""
            if used_ocr:
                img = render_page_to_pil(doc, i, dpi=dpi)
                ocr_txt = ocr_pil_image(img, lang=ocr_lang)
            results.append(
                PageText(i, extracted_text=extracted, ocr_text=ocr_txt, used_ocr=used_ocr)
            )
        return results, page_count
    finally:
        doc.close()
def render_pages_to_png_bytes(pdf_path: str, page_indices: List[int], dpi: int = 200) -> List[bytes]:
    """Render selected pages of a PDF and return each as PNG-encoded bytes.

    Args:
        pdf_path: Path of the PDF to open.
        page_indices: Page indices to render; the output follows this order.
        dpi: Rendering resolution passed to ``render_page_to_pil``.

    Returns:
        One PNG byte string per entry in ``page_indices``.
    """
    doc = fitz.open(pdf_path)
    # try/finally ensures the document is closed even when a page index is
    # invalid or rendering/encoding fails for one of the pages.
    try:
        out: List[bytes] = []
        for p in page_indices:
            img = render_page_to_pil(doc, p, dpi=dpi)
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            out.append(buf.getvalue())
        return out
    finally:
        doc.close()