File size: 1,692 Bytes
c227628
127cc6f
 
 
ab82350
127cc6f
ffca6e7
127cc6f
 
 
 
ffca6e7
 
 
 
 
 
 
127cc6f
 
 
ffca6e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c227628
 
ffca6e7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB."""

from __future__ import annotations

import zipfile
from pathlib import Path
from multiprocessing import Pool, cpu_count

from pdf2image import convert_from_path


# Directory searched recursively for ``*.pdf`` files to convert.
ROOT_DIR = Path("./data")
# Rendering resolution handed to pdf2image's ``convert_from_path`` for every PDF.
DPI = 800  # you can lower this if files are huge / too slow


def process_pdf(pdf_path_str: str) -> None:
    """Render every page of one PDF to PNG, zip the PNGs, then delete the PDF.

    The archive is written next to the PDF as ``<stem>.png.zip``. A
    single-page PDF produces one member named ``<stem>.png``; a multi-page PDF
    produces ``<stem>-1.png``, ``<stem>-2.png``, ... with the page number
    zero-padded to a common width so members sort in page order.

    The intermediate PNG files are always removed. The source PDF is removed
    only after the archive has been written successfully, so a failed
    conversion never destroys the input.

    Args:
        pdf_path_str: Path to the PDF, as a string.

    Raises:
        ValueError: If rendering yields no pages; the PDF is left in place.
    """
    pdf_path = Path(pdf_path_str).resolve()
    zip_path = pdf_path.with_suffix(".png.zip")

    print(f"Converting {pdf_path}...")

    # Convert all pages of the PDF
    images = convert_from_path(str(pdf_path), dpi=DPI)
    if not images:
        raise ValueError(f"No pages rendered from {pdf_path}")

    # One PNG path per page. Single-page PDFs keep the plain "<stem>.png"
    # name so existing consumers of the archive see no change.
    page_count = len(images)
    if page_count == 1:
        png_paths = [pdf_path.with_suffix(".png")]
    else:
        width = len(str(page_count))
        png_paths = [
            pdf_path.with_name(f"{pdf_path.stem}-{page:0{width}d}.png")
            for page in range(1, page_count + 1)
        ]

    try:
        for image, png_path in zip(images, png_paths):
            image.save(png_path, "PNG")

        # Zip all PNGs into one archive
        with zipfile.ZipFile(
            zip_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            for png_path in png_paths:
                zipf.write(png_path, arcname=png_path.name)
    finally:
        # Intermediate PNGs are never wanted, whether or not zipping worked.
        for png_path in png_paths:
            png_path.unlink(missing_ok=True)

    # Reached only when the archive was written without error.
    pdf_path.unlink(missing_ok=True)


def main() -> None:
    """Convert every PDF found under ``ROOT_DIR`` using a process pool.

    Prints a notice and returns when no PDF is found. Otherwise starts a pool
    sized to the smaller of the CPU count and the number of PDFs and runs
    ``process_pdf`` once per file.
    """
    targets = list(map(str, ROOT_DIR.rglob("*.pdf")))
    total = len(targets)

    if total == 0:
        print("No PDFs found.")
        return

    # No point in starting more workers than there are files to convert.
    workers = min(cpu_count(), total)
    print(f"Found {total} PDFs. Using {workers} processes.")

    with Pool(processes=workers) as pool:
        # The iterator is lazy: consume it so each task's outcome (including
        # any exception raised in a worker) is collected before the pool exits.
        for _ in pool.imap_unordered(process_pdf, targets):
            pass


# Run the conversion only when executed as a script, not when this module is
# imported (multiprocessing may re-import the module in worker processes).
if __name__ == "__main__":
    main()