#!/usr/bin/env python3
"""Extract the OSD image subset actually referenced by osd_dpo.jsonl.

Copies only the 16,657 images referenced by osd_dpo (~5.2 GB), skipping the
other ~31k unrelated images (full set is 45 GB).
"""
import argparse
import json
import os
import shutil
import sys
from pathlib import Path

# JSONL record fields that hold an image path.
_IMAGE_KEYS = ("chosen_image", "rejected_image")
# Emit a progress line every this many processed filenames.
_PROGRESS_EVERY = 1000


def _collect_names(jsonl_path):
    """Return the set of image basenames referenced by the JSONL file.

    Only the final path component of each reference is kept, so the images
    are looked up flat inside the source directory. Blank lines are skipped.
    """
    names = set()
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate empty / trailing lines instead of failing in json.loads.
                continue
            record = json.loads(line)
            for key in _IMAGE_KEYS:
                ref = record.get(key)
                if ref:
                    names.add(Path(ref).name)
    return names


def _copy_atomic(source, target):
    """Copy *source* to *target* without ever exposing a partial *target*.

    The data is written to a ``.part`` sibling first and then renamed, so an
    interrupted run cannot leave a truncated file that a later run would
    mistake for a finished copy.
    """
    partial = target.with_name(target.name + ".part")
    try:
        shutil.copy2(source, partial)
        os.replace(partial, target)
    finally:
        # After a successful replace the temp file is gone; this only fires
        # when the copy or the rename failed part-way.
        if partial.exists():
            partial.unlink()


def main(argv=None):
    """Copy (or hardlink) every image referenced by the JSONL into --dst.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default) argparse reads ``sys.argv[1:]``.

    Files already present in the destination are counted as ok and left
    untouched; files absent from the source are counted as missing. A summary
    and the total size of the regular files in the destination are printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--jsonl", required=True, help="osd_dpo.jsonl path")
    ap.add_argument("--src", required=True, help="OSD train image dir")
    ap.add_argument("--dst", required=True, help="output dir")
    ap.add_argument(
        "--hardlink",
        action="store_true",
        help="hardlink instead of copy (requires same filesystem; no extra space)",
    )
    args = ap.parse_args(argv)

    src = Path(args.src)
    dst = Path(args.dst)
    dst.mkdir(parents=True, exist_ok=True)

    # collect referenced filenames
    names = _collect_names(args.jsonl)
    print(f"unique image filenames in jsonl: {len(names)}")

    n_ok, n_miss = 0, 0
    # Sorted so the processing order (and the progress output) is reproducible.
    for done, name in enumerate(sorted(names), start=1):
        source = src / name
        target = dst / name
        if target.exists():
            n_ok += 1
        elif not source.exists():
            n_miss += 1
        else:
            if args.hardlink:
                os.link(source, target)
            else:
                _copy_atomic(source, target)
            n_ok += 1
        # Reported for every outcome; the counter is the position in the
        # list, not the number of files physically copied.
        if done % _PROGRESS_EVERY == 0:
            print(f"  processed {done}/{len(names)} ...")

    print(f"done. ok={n_ok}, missing={n_miss}, dst={dst}")
    total_bytes = sum(p.stat().st_size for p in dst.iterdir() if p.is_file())
    print(f"dst size: {total_bytes/1024**3:.2f} GB")


if __name__ == "__main__":
    main()