Add files using upload-large-folder tool

e1ee3c6 verified about 1 month ago

1.78 kB

	#!/usr/bin/env python3
	"""Extract OSD image subset actually referenced by osd_dpo.jsonl.

	Copies only the 16,657 images referenced by osd_dpo (~5.2 GB),
	skipping the other ~31k unrelated images (full set is 45 GB).
	"""
	import argparse, json, os, shutil, sys
	from pathlib import Path

	def _referenced_names(jsonl_path):
	    """Return the set of image basenames referenced by the JSONL file.

	    Each non-blank line is parsed as a JSON object; the basenames of its
	    ``chosen_image`` and ``rejected_image`` values (when present and
	    non-empty) are collected.
	    """
	    names = set()
	    # Explicit encoding so parsing does not depend on the platform locale.
	    with open(jsonl_path, encoding="utf-8") as f:
	        for line in f:
	            if not line.strip():
	                continue  # tolerate blank / trailing empty lines
	            record = json.loads(line)
	            for key in ("chosen_image", "rejected_image"):
	                p = record.get(key)
	                if p:
	                    names.add(Path(p).name)
	    return names


	def _copy_atomic(source, target):
	    """Copy ``source`` to ``target`` via a temporary sibling and a rename.

	    The final name only appears once the copy has completed, so an
	    interrupted run cannot leave a truncated file that a later run would
	    mistake for an already-extracted image.
	    """
	    partial = target.with_name(target.name + ".part")
	    try:
	        shutil.copy2(source, partial)
	        os.replace(partial, target)
	    except BaseException:
	        # Clean up on Ctrl-C or I/O errors, then let the error propagate.
	        if partial.exists():
	            partial.unlink()
	        raise


	def main(argv=None):
	    """Copy (or hardlink) the images referenced by a JSONL file.

	    Args:
	        argv: Optional list of command-line arguments. When ``None``
	            (the default) the arguments are taken from ``sys.argv``,
	            which is the behaviour of the plain ``main()`` call.

	    Command-line options:
	        --jsonl     path of the JSONL file listing the images
	        --src       directory holding the source images
	        --dst       output directory (created if necessary)
	        --hardlink  create hard links instead of copying

	    Images already present in ``--dst`` are counted as ok and left
	    untouched; images absent from ``--src`` are counted as missing.

	    Raises:
	        json.JSONDecodeError: if a non-blank line is not valid JSON.
	        OSError: if a copy or hard link fails.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--jsonl", required=True, help="osd_dpo.jsonl path")
	    ap.add_argument("--src", required=True, help="OSD train image dir")
	    ap.add_argument("--dst", required=True, help="output dir")
	    ap.add_argument("--hardlink", action="store_true",
	                    help="hardlink instead of copy (requires same filesystem; no extra space)")
	    args = ap.parse_args(argv)

	    src = Path(args.src)
	    dst = Path(args.dst)
	    dst.mkdir(parents=True, exist_ok=True)

	    # collect referenced filenames
	    names = _referenced_names(args.jsonl)
	    print(f"unique image filenames in jsonl: {len(names)}")

	    n_ok, n_miss = 0, 0
	    # Sorted order keeps runs (and progress output) deterministic.
	    for i, name in enumerate(sorted(names)):
	        source = src / name
	        target = dst / name
	        if target.exists():
	            # Already extracted by a previous run.
	            n_ok += 1
	            continue
	        if not source.is_file():
	            n_miss += 1
	            continue
	        if args.hardlink:
	            os.link(source, target)
	        else:
	            _copy_atomic(source, target)
	        n_ok += 1
	        if (i + 1) % 1000 == 0:
	            print(f" copied {i+1}/{len(names)} ...")
	    print(f"done. ok={n_ok}, missing={n_miss}, dst={dst}")
	    # Only regular files contribute to the reported size.
	    total_bytes = sum(p.stat().st_size for p in dst.iterdir() if p.is_file())
	    print(f"dst size: {total_bytes/1024**3:.2f} GB")

	# Run the extraction only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()