# Score/core/pdf_io.py
# Revision 2394322 ("Update core/pdf_io.py", by Corin1998), 923 bytes.
from __future__ import annotations
from typing import List
import fitz # PyMuPDF
import pdfplumber
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF to PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        dpi: Target rendering resolution in dots per inch.
        max_pages: Upper bound on the number of pages rendered, counted from
            the first page. Values <= 0 yield an empty list.

    Returns:
        One ``bytes`` object per rendered page, in page order, each produced
        by ``Pixmap.tobytes("png")``.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the scale factor.
    # The matrix is identical for every page, so build it once up front.
    zoom = dpi / 72.0
    scale = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        # zip() stops as soon as the range is exhausted, *before* asking the
        # document for another page, so page ``max_pages + 1`` is never
        # loaded just to be discarded.
        for _, page in zip(range(max_pages), doc):
            pix = page.get_pixmap(matrix=scale, alpha=False)
            images.append(pix.tobytes("png"))
    return images
def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages: int = 3) -> str:
    """Extract text from the leading pages of a PDF, capped in length.

    Each page that yields non-blank text contributes a chunk of the form
    ``"[page N]\\n<text>"`` (N is the 1-based page number); chunks are joined
    with a blank line and the result is truncated to ``max_chars``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.
        max_chars: Maximum length of the returned string. Values <= 0 yield
            an empty string.
        pages: Maximum number of leading pages to inspect. Values <= 0 yield
            an empty string.

    Returns:
        The concatenated page chunks, at most ``max_chars`` characters long.
        Empty when no inspected page produced any text.
    """
    # Clamp the limits: a negative value used as a slice bound would count
    # from the end (dropping the last page / last characters) instead of
    # meaning "nothing".
    char_limit = max(max_chars, 0)
    page_limit = max(pages, 0)

    chunks: List[str] = []
    collected = 0  # running length of all chunks, excluding separators
    with pdfplumber.open(pdf_path) as pdf:
        for number, page in enumerate(pdf.pages[:page_limit], start=1):
            # extract_text() may return None for pages without a text layer.
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            chunk = f"[page {number}]\n{text}"
            chunks.append(chunk)
            collected += len(chunk)
            # Stop reading further pages once the budget is exceeded; the
            # final slice below enforces the exact limit.
            if collected > char_limit:
                break
    return "\n\n".join(chunks)[:char_limit]