spatial-mllm-v11-kit / scripts /extract_osd_subset.py
Larer's picture
Add files using upload-large-folder tool
e1ee3c6 verified
Raw
History Blame Contribute Delete
1.78 kB
#!/usr/bin/env python3
"""Extract OSD image subset actually referenced by osd_dpo.jsonl.
Copies only the 16,657 images referenced by osd_dpo (~5.2 GB),
skipping the other ~31k unrelated images (full set is 45 GB).
"""
import argparse, json, os, shutil, sys
from pathlib import Path
def main():
    """Copy (or hardlink) the images referenced by a DPO JSONL file.

    Command-line arguments (read from ``sys.argv``):
        --jsonl     JSONL file; the ``chosen_image`` and ``rejected_image``
                    values of every record are taken as image paths.
        --src       directory the images are looked up in, by base filename.
        --dst       output directory; created (with parents) if absent.
        --hardlink  create hardlinks instead of copying.

    Files already present in ``--dst`` are counted as ok and left untouched,
    so an interrupted run can simply be restarted. Filenames that do not
    exist in ``--src`` are counted as missing and skipped. A summary and the
    total size of ``--dst`` are printed at the end.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        OSError: if the JSONL cannot be read or a copy/link operation fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--jsonl", required=True, help="osd_dpo.jsonl path")
    ap.add_argument("--src", required=True, help="OSD train image dir")
    ap.add_argument("--dst", required=True, help="output dir")
    ap.add_argument("--hardlink", action="store_true",
                    help="hardlink instead of copy (requires same filesystem; no extra space)")
    args = ap.parse_args()

    src = Path(args.src)
    dst = Path(args.dst)
    dst.mkdir(parents=True, exist_ok=True)

    # Collect referenced filenames. Only the base name is kept, so the
    # directory part of the paths stored in the JSONL is ignored.
    names = set()
    with open(args.jsonl, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not valid JSON
            # documents; skip them instead of crashing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            for k in ("chosen_image", "rejected_image"):
                p = record.get(k)
                if p:
                    names.add(Path(p).name)
    print(f"unique image filenames in jsonl: {len(names)}")

    n_ok, n_miss = 0, 0
    for i, name in enumerate(sorted(names)):
        source = src / name
        target = dst / name
        if target.exists():
            # Already transferred by an earlier run.
            n_ok += 1
            continue
        if not source.exists():
            n_miss += 1
            continue
        if args.hardlink:
            os.link(source, target)
        else:
            # Copy to a temporary sibling and rename into place, so an
            # interrupted copy never leaves a truncated file under the final
            # name (which the exists() check above would accept on resume).
            partial = target.with_name(name + ".part")
            shutil.copy2(source, partial)
            os.replace(partial, target)
        n_ok += 1
        if (i + 1) % 1000 == 0:
            print(f" copied {i+1}/{len(names)} ...")

    print(f"done. ok={n_ok}, missing={n_miss}, dst={dst}")
    print(f"dst size: {sum(p.stat().st_size for p in dst.iterdir())/1024**3:.2f} GB")
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()