# leaderboard / data_pdfs_to_pngs.py
# NOTE: the lines below are Hugging Face page chrome captured by the scrape
# (author "LennartPurucker", commit "maint: new LB", ffca6e7, 1.69 kB);
# kept as comments so the module remains valid Python.
"""Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB."""
from __future__ import annotations
import zipfile
from pathlib import Path
from multiprocessing import Pool, cpu_count
from pdf2image import convert_from_path
ROOT_DIR = Path("./data")
DPI = 800 # you can lower this if files are huge / too slow
def process_pdf(pdf_path_str: str) -> None:
    """Convert one PDF into PNG page images, zip them, and delete the sources.

    Writes ``<pdf stem>.png.zip`` next to the PDF, containing one PNG per
    page, then removes the intermediate PNGs and the original PDF.

    Args:
        pdf_path_str: Path to the PDF file, as a string so it pickles
            cheaply across the multiprocessing boundary.
    """
    pdf_path = Path(pdf_path_str).resolve()
    zip_path = pdf_path.with_suffix(".png.zip")
    print(f"Converting {pdf_path}...")

    # Convert all pages of the PDF (one PIL image per page).
    images = convert_from_path(str(pdf_path), dpi=DPI)

    # Save EVERY page as a PNG. The previous code converted all pages but
    # saved only images[0], silently dropping pages 2+ of multi-page PDFs.
    png_paths: list[Path] = []
    for page_num, image in enumerate(images, start=1):
        if page_num == 1:
            # Keep the historical single-page name for page 1 so existing
            # consumers of the archives see an unchanged file name.
            png_path = pdf_path.with_suffix(".png")
        else:
            png_path = pdf_path.with_name(f"{pdf_path.stem}_page{page_num}.png")
        image.save(png_path, "PNG")
        png_paths.append(png_path)

    # Zip all PNGs into one archive.
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for png_path in png_paths:
            zipf.write(png_path, arcname=png_path.name)

    # Clean up intermediate PNGs and the original PDF.
    for png_path in png_paths:
        png_path.unlink(missing_ok=True)
    pdf_path.unlink(missing_ok=True)
def main() -> None:
    """Locate every PDF under ROOT_DIR and convert them in parallel."""
    pdf_paths = [str(path) for path in ROOT_DIR.rglob("*.pdf")]
    if not pdf_paths:
        print("No PDFs found.")
        return

    # Never spawn more workers than there is work for.
    worker_count = min(cpu_count(), len(pdf_paths))
    print(f"Found {len(pdf_paths)} PDFs. Using {worker_count} processes.")

    with Pool(processes=worker_count) as worker_pool:
        # Drain the iterator: imap_unordered streams completions, so each
        # worker's per-file progress print appears as soon as it finishes.
        for _ in worker_pool.imap_unordered(process_pdf, pdf_paths):
            pass
# Script entry point: run the conversion only when executed directly,
# not when imported (also required for multiprocessing on spawn platforms).
if __name__ == "__main__":
    main()