Spaces:
Running
Running
| """Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB.""" | |
| from __future__ import annotations | |
| import zipfile | |
| from pathlib import Path | |
| from multiprocessing import Pool, cpu_count | |
| from pdf2image import convert_from_path | |
# Directory that is searched recursively for PDFs, relative to the current
# working directory.
ROOT_DIR = Path("data")

# Rendering resolution passed to pdf2image; lower it if files are huge or
# conversion is too slow.
DPI: int = 800
def process_pdf(pdf_path_str: str) -> None:
    """Convert one PDF into a ``.png.zip`` archive, then delete the PDF.

    Every page is rendered at ``DPI`` and stored as a PNG inside
    ``<stem>.png.zip`` next to the source file. The first page is stored as
    ``<stem>.png`` (the entry name the archive has always contained); any
    further pages are stored as ``<stem>_page_<n>.png`` with ``n`` starting
    at 2, so multi-page PDFs no longer lose their later pages.

    The intermediate PNG files are always removed, and the source PDF is
    removed only after the archive has been written completely, so a failed
    conversion never destroys the original.

    Args:
        pdf_path_str: Path to the PDF file, given as a string.

    Raises:
        ValueError: If rendering the PDF produced no pages.
    """
    pdf_path = Path(pdf_path_str).resolve()
    zip_path = pdf_path.with_suffix(".png.zip")
    print(f"Converting {pdf_path}...")

    # Render all pages of the PDF.
    images = convert_from_path(str(pdf_path), dpi=DPI)
    if not images:
        # Fail loudly (and keep the PDF) instead of a bare IndexError.
        raise ValueError(f"No pages rendered from {pdf_path}")

    png_paths: list[Path] = []
    try:
        # Save each page as its own PNG next to the PDF.
        for page_number, image in enumerate(images, start=1):
            if page_number == 1:
                # Keep the historical name so existing consumers still find it.
                png_path = pdf_path.with_suffix(".png")
            else:
                png_path = pdf_path.with_name(
                    f"{pdf_path.stem}_page_{page_number}.png"
                )
            # Record the path before saving so a partially written file is
            # cleaned up as well.
            png_paths.append(png_path)
            image.save(png_path, "PNG")

        # Zip all PNGs into one archive.
        with zipfile.ZipFile(
            zip_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            for png_path in png_paths:
                zipf.write(png_path, arcname=png_path.name)
    finally:
        # The loose PNGs are only an intermediate product; remove them whether
        # or not the archive was written.
        for png_path in png_paths:
            png_path.unlink(missing_ok=True)

    # Reached only when the archive was written successfully.
    pdf_path.unlink(missing_ok=True)
def main() -> None:
    """Find every PDF under ``ROOT_DIR`` and convert them in parallel.

    Prints a notice and returns when no PDF is found. Otherwise starts a
    process pool with one worker per CPU (capped at the number of PDFs) and
    runs ``process_pdf`` on each path.
    """
    pdf_files = list(map(str, ROOT_DIR.rglob("*.pdf")))
    total = len(pdf_files)
    if total == 0:
        print("No PDFs found.")
        return

    # Never start more workers than there are files to convert.
    worker_count = min(cpu_count(), total)
    print(f"Found {total} PDFs. Using {worker_count} processes.")

    with Pool(processes=worker_count) as pool:
        # imap_unordered yields as each task finishes; draining the iterator
        # waits for all tasks and surfaces any worker exception here.
        finished = pool.imap_unordered(process_pdf, pdf_files)
        for _result in finished:
            continue


if __name__ == "__main__":
    main()