visual-narrator-llm / fetch_mini_coco.py

feat: Visual Narrator 3B - Clean repository with professional benchmarks

d6e97b5 3 months ago

475 Bytes

	import os, shutil
	from datasets import load_dataset

	# Destination directory for the exported JPEG files; created if missing.
	out = "/home/ubuntu/data/coco/train2017"
	os.makedirs(out, exist_ok=True)

	# Number of leading rows of the train split to export. Kept in one place so
	# the split slice and the progress message cannot drift apart.
	NUM_IMAGES = 1000

	# Load only a small slice of the train split.
	ds = load_dataset("coco_captions", "2017", split=f"train[:{NUM_IMAGES}]")
	print(f"Downloading ~{NUM_IMAGES} images...")
	for i, row in enumerate(ds):
	    # row["image"] is expected to be a PIL image decoded by the datasets
	    # library -- TODO confirm against the dataset's feature schema.
	    img = row["image"]
	    # Pillow documents L, RGB and CMYK as the JPEG pixel modes; anything
	    # else (e.g. RGBA or palette images) would make save() raise and abort
	    # the export part-way, so normalise those to RGB first.
	    if img.mode not in ("L", "RGB", "CMYK"):
	        img = img.convert("RGB")
	    # NOTE: files are named by the zero-padded row index within the slice,
	    # not by any image id carried in the row.
	    fn = os.path.join(out, f"{i:012d}.jpg")
	    img.save(fn, quality=90)
	print("✅ Wrote images to:", out)