File size: 1,406 Bytes
8cf92b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
"""Re-encode unified-clip images from PNG to JPEG@95 in place (reclaims disk).

    python scripts/reencode_jpeg.py --root data/unified/av2 --workers 20
"""

import argparse, glob, os
from multiprocessing import Pool

import imageio.v2 as imageio


def _conv(png):
    """Convert one PNG to a quality-95 JPEG beside it, then delete the PNG.

    The JPEG is first written under a temporary ``<stem>.tmp.jpg`` name and
    renamed to ``<stem>.jpg`` only after the encoder has finished, so a failed
    write never leaves a truncated file under the final name.

    Args:
        png: Path of the PNG file to convert.

    Returns:
        The size in bytes of the new JPEG (int) on success, or a string of
        the form ``"ERR <png>: <exception>"`` on failure. Ordinary errors are
        reported through the return value rather than raised.
    """
    stem = os.path.splitext(png)[0]
    jpg = stem + ".jpg"
    # Keep a ".jpg" suffix on the temporary name: no explicit format is passed
    # to imwrite, so the output format is chosen from the file extension.
    tmp = stem + ".tmp.jpg"
    try:
        arr = imageio.imread(png)
        imageio.imwrite(tmp, arr, quality=95)
        os.replace(tmp, jpg)  # same-directory rename; atomic on POSIX
        size = os.path.getsize(jpg)
        os.remove(png)
        return size
    except Exception as e:  # leave the PNG if anything goes wrong
        # Best-effort cleanup of a partially written temporary file; it may
        # not exist at all if the failure happened before the write started.
        try:
            os.remove(tmp)
        except OSError:
            pass
        return f"ERR {png}: {e}"


def main():
    """Command-line entry point: re-encode every matching PNG under ``--root``.

    Collects each ``*.png`` that sits in an ``images`` directory at any depth
    below ``--root``, maps ``_conv`` over them with a pool of ``--workers``
    processes, and prints the first five error messages, a progress line every
    20000 files, and a final converted/error summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", required=True)
    parser.add_argument("--workers", type=int, default=20)
    args = parser.parse_args()

    # With recursive=True, "**" matches zero or more intermediate directories.
    pattern = os.path.join(args.root, "**", "images", "*.png")
    pngs = glob.glob(pattern, recursive=True)
    print(f"re-encoding {len(pngs)} PNGs under {args.root} with {args.workers} workers ...", flush=True)

    errs = 0
    with Pool(args.workers) as pool:
        outcomes = pool.imap_unordered(_conv, pngs, chunksize=64)
        for i, outcome in enumerate(outcomes):
            # _conv signals failure with a string; anything else is a success.
            if isinstance(outcome, str):
                errs += 1
                if errs <= 5:  # echo only the first few failures
                    print(outcome, flush=True)
            if (i + 1) % 20000 == 0:
                print(f"  {i+1}/{len(pngs)}", flush=True)
    print(f"done: {len(pngs)-errs} converted, {errs} errors", flush=True)


# Entry-point guard: run the conversion only when executed as a script, not on
# import. multiprocessing start methods that re-import the main module in each
# worker process (e.g. "spawn") rely on this guard.
if __name__ == "__main__":
    main()