File size: 2,754 Bytes
e1ced8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
from __future__ import annotations

from pathlib import Path

import fitz  # PyMuPDF

from config import PDF_RENDER_DPI


def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        The page count as reported by ``len()`` on the opened document.
    """
    # The context manager closes the document even if counting raises.
    with fitz.open(pdf_path) as doc:
        return len(doc)


def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
    """Render every PDF page as a PNG image.

    This is the primary rendering method, called once during PDF ingestion
    to pre-render all pages at the configured DPI. Images are written to
    ``output_dir`` as ``page_<index>.png`` where ``<index>`` is the 0-based
    page number.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the PNG files; created
            (including parents) if it does not exist.
        dpi: Target resolution used to derive the render scale.

    Returns:
        The number of pages in the document (one PNG is written per page).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Scale factor relative to the 72-units-per-inch PDF coordinate space.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager guarantees the document is closed even when a
    # page fails to render or a file write raises part-way through.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
        for page_num in range(num_pages):
            pix = doc.load_page(page_num).get_pixmap(matrix=matrix)
            (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))

    return num_pages


def render_single_page(
    pdf_path: str,
    page_num: int,
    output_dir: str,
    dpi: int = PDF_RENDER_DPI,
) -> None:
    """Render a single PDF page as a PNG and save it to disk.

    The image is written to ``output_dir`` as ``page_<page_num>.png``.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Index of the page to render, passed to ``load_page``.
        output_dir: Directory that receives the PNG file; created
            (including parents) if it does not exist.
        dpi: Target resolution used to derive the render scale.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Scale factor relative to the 72-units-per-inch PDF coordinate space.
    zoom = dpi / 72.0

    # The context manager closes the document even if the page index is
    # invalid or rendering/writing fails.
    with fitz.open(pdf_path) as doc:
        pix = doc.load_page(page_num).get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))


def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

    Args:
        pdf_path: Path to the source PDF.
        start: First page index (0-indexed, inclusive).
        end: Last page index (0-indexed, inclusive).

    Returns:
        Raw bytes of a new PDF containing only the specified pages.
    """
    # Both documents are context-managed so neither handle leaks when the
    # copy or serialization raises; the new (empty) PDF is closed first,
    # then the source.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        dst.insert_pdf(src, from_page=start, to_page=end)
        return dst.tobytes()


def get_page_image_bytes(page_image_dir: str, page_num: int) -> bytes:
    """Read the PNG for one page from disk and return its raw bytes.

    The file is looked up as ``page_<page_num>.png`` inside
    ``page_image_dir``; pages are expected to already exist from the
    upfront bulk render performed during PDF ingestion.
    """
    image_file = Path(page_image_dir).joinpath(f"page_{page_num}.png")
    with image_file.open("rb") as handle:
        return handle.read()