#!/usr/bin/env python3
"""Re-encode unified-clip images from PNG to JPEG@95 in place (reclaims disk).

    python scripts/reencode_jpeg.py --root data/unified/av2 --workers 20
"""
import argparse
import glob
import os
import sys
from multiprocessing import Pool

import imageio.v2 as imageio


def _conv(png):
    """Convert one PNG to a sibling ``.jpg`` (quality 95) and delete the PNG.

    The JPEG is first written to a temporary sibling file and only renamed to
    its final name once the write has completed, so a failed or interrupted
    write never leaves a truncated ``.jpg`` under the final name.

    Args:
        png: Path to the source image; expected to end in ``.png``.

    Returns:
        The size in bytes of the new JPEG on success, or a string of the form
        ``"ERR <path>: <exception>"`` on failure.  Errors are returned rather
        than raised so a single bad file does not abort the whole pool.
    """
    stem = png[:-4]  # strip the 4-character ".png" suffix
    jpg = stem + ".jpg"
    # Keep a ".jpg" extension on the temp file so the writer still picks the
    # JPEG format from the file name.
    tmp = stem + ".reencode_tmp.jpg"
    try:
        arr = imageio.imread(png)
        imageio.imwrite(tmp, arr, quality=95)
        os.replace(tmp, jpg)
        # Read the size before deleting the source so that a failure here
        # still leaves the PNG on disk.
        size = os.path.getsize(jpg)
        os.remove(png)
        return size
    except Exception as e:
        # Leave the PNG if anything goes wrong, but do not leave a partial
        # temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        return f"ERR {png}: {e}"


def main() -> int:
    """Re-encode every ``**/images/*.png`` under ``--root`` with a process pool.

    Prints a start line, a progress line every 20000 files, the first five
    error messages, and a final summary.

    Returns:
        ``0`` when every file converted, ``1`` when at least one failed, so
        callers and shell pipelines can detect a partial run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True)
    ap.add_argument("--workers", type=int, default=20)
    args = ap.parse_args()

    pngs = glob.glob(
        os.path.join(args.root, "**", "images", "*.png"), recursive=True
    )
    print(
        f"re-encoding {len(pngs)} PNGs under {args.root} with {args.workers} workers ...",
        flush=True,
    )

    errs = 0
    with Pool(args.workers) as pool:
        results = pool.imap_unordered(_conv, pngs, chunksize=64)
        for i, r in enumerate(results):
            # _conv signals failure by returning a message string instead of
            # a byte count.
            if isinstance(r, str):
                errs += 1
                if errs <= 5:  # cap error output
                    print(r, flush=True)
            if (i + 1) % 20000 == 0:
                print(f" {i+1}/{len(pngs)}", flush=True)

    print(f"done: {len(pngs)-errs} converted, {errs} errors", flush=True)
    return 1 if errs else 0


if __name__ == "__main__":
    sys.exit(main())