# File size: 475 Bytes

# d6e97b5

import os, shutil
from datasets import load_dataset

# Export a small slice of the COCO captions train split as JPEG files.
#
# Destination directory for the exported JPEGs; created if it is missing.
out = "/home/ubuntu/data/coco/train2017"
os.makedirs(out, exist_ok=True)

# Number of leading train-split rows to export. Kept in one place so the
# split slice and the progress message cannot drift apart.
num_images = 1000

# Modes Pillow's JPEG encoder writes directly. Any other mode (e.g. "RGBA"
# or palette "P") makes Image.save() raise OSError for a .jpg target.
jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

# Announce before load_dataset(): in the default (non-streaming) mode that
# call is where the data is fetched, so the message has to come first.
print(f"Downloading ~{num_images} images...")

# small slice of train split
ds = load_dataset("coco_captions", "2017", split=f"train[:{num_images}]")
for i, row in enumerate(ds):
    # row['image'] is a PIL image (HF auto-downloads the actual JPEGs)
    img = row["image"]
    if img.mode not in jpeg_modes:
        # Only convert when the save would otherwise fail, so images that
        # were already writable are encoded exactly as before.
        img = img.convert("RGB")
    # NOTE(review): files are named by row position, not by a COCO image id;
    # confirm nothing downstream expects the official COCO filenames.
    fn = os.path.join(out, f"{i:012d}.jpg")
    img.save(fn, quality=90)
print("✅ Wrote images to:", out)