# leaderboard / data_pdfs_to_pngs.py
# NOTE: the lines below are Hugging Face page chrome captured by the scrape
# (author "LennartPurucker", commit "maint: new LB", ffca6e7, 1.69 kB);
# kept as comments so the module remains valid Python.
"""Helper script to go from PDF to PNG ZIP Files we can use in HTML on the LB."""
from __future__ import annotations
import zipfile
from pathlib import Path
from multiprocessing import Pool, cpu_count
from pdf2image import convert_from_path
ROOT_DIR = Path("./data")
DPI = 800 # you can lower this if files are huge / too slow
def process_pdf(pdf_path_str: str) -> None:
    """Convert one PDF into PNG page images, zip them, and delete the sources.

    Writes ``<pdf stem>.png.zip`` next to the PDF, containing one PNG per
    page, then removes the intermediate PNGs and the original PDF.

    Args:
        pdf_path_str: Path to the PDF file, as a string so it pickles
            cheaply across the multiprocessing boundary.
    """
    pdf_path = Path(pdf_path_str).resolve()
    zip_path = pdf_path.with_suffix(".png.zip")
    print(f"Converting {pdf_path}...")

    # Convert all pages of the PDF (one PIL image per page).
    images = convert_from_path(str(pdf_path), dpi=DPI)

    # Save EVERY page as a PNG. The previous code converted all pages but
    # saved only images[0], silently dropping pages 2+ of multi-page PDFs.
    png_paths: list[Path] = []
    for page_num, image in enumerate(images, start=1):
        if page_num == 1:
            # Keep the historical single-page name for page 1 so existing
            # consumers of the archives see an unchanged file name.
            png_path = pdf_path.with_suffix(".png")
        else:
            png_path = pdf_path.with_name(f"{pdf_path.stem}_page{page_num}.png")
        image.save(png_path, "PNG")
        png_paths.append(png_path)

    # Zip all PNGs into one archive.
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for png_path in png_paths:
            zipf.write(png_path, arcname=png_path.name)

    # Clean up intermediate PNGs and the original PDF.
    for png_path in png_paths:
        png_path.unlink(missing_ok=True)
    pdf_path.unlink(missing_ok=True)
def main() -> None:
    """Locate every PDF under ROOT_DIR and convert them in parallel."""
    pdf_paths = [str(path) for path in ROOT_DIR.rglob("*.pdf")]
    if not pdf_paths:
        print("No PDFs found.")
        return

    # Never spawn more workers than there is work for.
    worker_count = min(cpu_count(), len(pdf_paths))
    print(f"Found {len(pdf_paths)} PDFs. Using {worker_count} processes.")

    with Pool(processes=worker_count) as worker_pool:
        # Drain the iterator: imap_unordered streams completions, so each
        # worker's per-file progress print appears as soon as it finishes.
        for _ in worker_pool.imap_unordered(process_pdf, pdf_paths):
            pass
# Script entry point: run the conversion only when executed directly,
# not when imported (also required for multiprocessing on spawn platforms).
if __name__ == "__main__":
    main()