from __future__ import annotations

from itertools import islice
from typing import List

import fitz  # PyMuPDF
import pdfplumber


def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF to PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution; the render is scaled by ``dpi / 72``.
        max_pages: Maximum number of pages to render, counted from the
            first page. Values of zero or less produce an empty list.

    Returns:
        One PNG byte string per rendered page, in page order.
    """
    # The scale matrix does not depend on the page, so build it once
    # instead of once per iteration.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        # islice rejects negative stops, hence the clamp to zero.
        for page in islice(doc, max(max_pages, 0)):
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            images.append(pixmap.tobytes("png"))
    return images


def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages: int = 3) -> str:
    """Extract text from the leading pages of a PDF.

    Each page that yields non-blank text contributes a chunk of the form
    ``"[page N]\\n<text>"``; chunks are joined with a blank line and the
    result is truncated to ``max_chars`` characters.

    Args:
        pdf_path: Path of the PDF file to open.
        max_chars: Upper bound on the length of the returned string.
            Values of zero or less produce an empty string.
        pages: Maximum number of pages to read, counted from the first
            page. Values of zero or less read no pages.

    Returns:
        The concatenated page text, at most ``max_chars`` characters long.
    """
    # Clamp both limits: a negative slice bound would count from the end
    # (dropping trailing pages / trailing characters) rather than mean
    # "nothing".
    page_limit = max(pages, 0)
    char_limit = max(max_chars, 0)

    chunks: List[str] = []
    collected = 0  # running length of all chunks, excluding separators
    with pdfplumber.open(pdf_path) as pdf:
        for number, page in enumerate(pdf.pages[:page_limit], start=1):
            # extract_text() may return a falsy value; treat it as empty.
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            chunk = f"[page {number}]\n{text}"
            chunks.append(chunk)
            collected += len(chunk)
            # Stop reading further pages once the budget is exceeded; the
            # final slice below performs the exact truncation.
            if collected > char_limit:
                break
    return "\n\n".join(chunks)[:char_limit]