| |
"""Extract the OSD image subset actually referenced by osd_dpo.jsonl.

Copies only the 16,657 images referenced by osd_dpo (~5.2 GB),
skipping the other ~31k unrelated images (the full set is 45 GB).
"""
| import argparse, json, os, shutil, sys |
| from pathlib import Path |
|
|
def _collect_image_names(jsonl_path):
    """Return the set of image basenames referenced by the JSONL file.

    Each non-blank line is parsed as a JSON object; the "chosen_image" and
    "rejected_image" values, when present and non-empty, are reduced to
    their final path component.
    """
    names = set()
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at EOF),
            # which json.loads would reject and abort the whole run.
            if not line.strip():
                continue
            record = json.loads(line)
            for key in ("chosen_image", "rejected_image"):
                ref = record.get(key)
                if ref:
                    names.add(Path(ref).name)
    return names


def _place_image(source, target, hardlink):
    """Create *target* from *source* without ever exposing a partial file."""
    if hardlink:
        os.link(source, target)
        return
    # Copy under a temporary name and rename into place. main() treats any
    # existing destination file as already done, so a copy interrupted
    # half-way must not leave a truncated image under the final name.
    partial = target.with_name(target.name + ".part")
    shutil.copy2(source, partial)
    os.replace(partial, target)


def main():
    """Copy (or hardlink) only the images referenced by a DPO JSONL file.

    Command-line arguments:
        --jsonl     JSONL file whose records may carry "chosen_image" and
                    "rejected_image" paths; only the file name of each path
                    is used.
        --src       Directory that holds the source images.
        --dst       Output directory; created (with parents) if missing.
        --hardlink  Hardlink instead of copying.

    Files already present in --dst are kept and counted as ok, so an
    interrupted run can simply be restarted. Names that exist in neither
    --dst nor --src are counted as missing; they are not fatal.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--jsonl", required=True, help="osd_dpo.jsonl path")
    ap.add_argument("--src", required=True, help="OSD train image dir")
    ap.add_argument("--dst", required=True, help="output dir")
    ap.add_argument("--hardlink", action="store_true",
                    help="hardlink instead of copy (requires same filesystem; no extra space)")
    args = ap.parse_args()

    src = Path(args.src)
    dst = Path(args.dst)
    dst.mkdir(parents=True, exist_ok=True)

    names = _collect_image_names(args.jsonl)
    print(f"unique image filenames in jsonl: {len(names)}")

    n_ok, n_miss = 0, 0
    total = len(names)
    # Sorted so the processing order (and progress output) is reproducible.
    for i, name in enumerate(sorted(names), start=1):
        source = src / name
        target = dst / name
        if target.exists():
            n_ok += 1
        elif source.exists():
            _place_image(source, target, args.hardlink)
            n_ok += 1
        else:
            n_miss += 1
        # Report on every 1000th name, including skipped and missing ones;
        # the counter is the number of names processed, not files copied.
        if i % 1000 == 0:
            print(f"  processed {i}/{total} ...")

    print(f"done. ok={n_ok}, missing={n_miss}, dst={dst}")
    dst_bytes = sum(p.stat().st_size for p in dst.iterdir())
    print(f"dst size: {dst_bytes/1024**3:.2f} GB")
|
|
# Script entry point: run the extraction only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|
|