|
|
from __future__ import annotations |
|
|
from typing import List |
|
|
import fitz |
|
|
import pdfplumber |
|
|
|
|
|
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF as PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        dpi: Requested resolution. It is turned into a zoom factor of
            ``dpi / 72`` that is applied on both axes.
        max_pages: Upper bound on the number of pages rendered. Zero or a
            negative value yields an empty list.

    Returns:
        One PNG byte string per rendered page, in document order.
    """
    # The zoom factor depends only on ``dpi``, so the matrix is built once
    # instead of once per page.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        # zip() draws from range() first, so iteration stops as soon as the
        # page budget is used up, without asking the document for a page
        # that would only be thrown away.
        for _, page in zip(range(max_pages), doc):
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            images.append(pixmap.tobytes("png"))
    return images
|
|
|
|
|
def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages: int = 3) -> str:
    """Extract text from the leading pages of a PDF, capped at ``max_chars``.

    Each page that yields non-empty text contributes one chunk of the form
    ``"[page N]\\n<text>"`` (N is the 1-based position of the page); chunks
    are joined with a blank line between them.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        max_chars: Maximum length of the returned string. Values below zero
            are treated as zero.
        pages: Slice bound applied to ``pdf.pages`` (``pdf.pages[:pages]``).

    Returns:
        The joined page chunks, truncated to at most ``max_chars`` characters.
    """
    # A negative limit would make the final slice drop characters from the
    # END of the text rather than return nothing, so clamp it.
    limit = max(max_chars, 0)

    chunks: List[str] = []
    collected = 0  # running total of chunk lengths (separators not counted)
    with pdfplumber.open(pdf_path) as pdf:
        for index, page in enumerate(pdf.pages[:pages]):
            # Once the chunks alone fill the limit, later pages cannot
            # change the truncated result, so skip extracting them.
            if collected >= limit:
                break
            text = (page.extract_text() or "").strip()
            if text:
                chunk = f"[page {index + 1}]\n{text}"
                chunks.append(chunk)
                collected += len(chunk)
    return "\n\n".join(chunks)[:limit]
|
|
|