File size: 923 Bytes
2c42056
 
 
 
 
 
 
 
 
2394322
 
2c42056
 
 
 
 
2394322
2c42056
 
2394322
2c42056
2394322
 
2c42056
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from __future__ import annotations
from typing import List
import fitz  # PyMuPDF
import pdfplumber

def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF as PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file; handed to ``fitz.open``.
        dpi: Requested resolution. Each page is scaled by ``dpi / 72`` on
            both axes before rasterising.
        max_pages: Upper bound on the number of pages rendered. A value of
            zero or less produces an empty list.

    Returns:
        One ``bytes`` object of PNG data per rendered page, in page order.
    """
    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        # The scale depends only on ``dpi``, so build the matrix once
        # instead of once per page.
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            # alpha=False is passed so the pixmap is rendered without an
            # alpha channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            images.append(pix.tobytes("png"))
    return images

def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages: int = 3) -> str:
    """Extract text from the leading pages of a PDF, capped at ``max_chars``.

    Each page that yields non-empty text contributes one chunk of the form
    ``"[page N]\\n<text>"`` (N is the 1-based page number). Chunks are joined
    with a blank line and the result is sliced to ``max_chars`` characters.

    Args:
        pdf_path: Path of the PDF file; handed to ``pdfplumber.open``.
        max_chars: Character budget applied to the joined result via
            ``[:max_chars]``.
        pages: Number of leading pages to consider (``pdf.pages[:pages]``).

    Returns:
        The joined, truncated text; an empty string when no page yields text.
    """
    separator = "\n\n"
    chunks: List[str] = []
    # Length the joined output would have so far. Tracking it incrementally
    # avoids re-summing every chunk on every page.
    joined_len = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages[:pages], start=1):
            # extract_text() may return None; treat that as empty text.
            text = (page.extract_text() or "").strip()
            if text:
                chunk = f"[page {page_no}]\n{text}"
                if chunks:
                    joined_len += len(separator)
                joined_len += len(chunk)
                chunks.append(chunk)
            # Once the budget is filled, further pages could only add text
            # that the final slice would discard, so stop extracting.
            if joined_len >= max_chars:
                break
    return separator.join(chunks)[:max_chars]