"""Export DetailCaps-4870 images and reference captions to disk.

Each dataset row's image bytes are written to ``<save_dir>/<index>.png`` and
the three reference captions of every row are collected, keyed by row index,
into ``annotations.json`` in the current working directory.
"""
import json
import os

from datasets import load_dataset
from tqdm import tqdm

# The parquet file is loaded explicitly through the generic "parquet" builder
# (a directory-based load_dataset call was tried previously and disabled).
capture = load_dataset(
    "parquet",
    data_files={
        "test": "/group/40005/public_datasets/DetailCaps-4870/DetailCaps-4870.parquet"
    },
)['test']
print(len(capture))

save_dir = "/group/40005/auroraji/CAPTURE/samples"
# Create the output directory up front so the first image write cannot fail
# with FileNotFoundError on a fresh machine.
os.makedirs(save_dir, exist_ok=True)

anno = {}
# tqdm wraps the dataset itself (not the enumerate object) so it can read the
# length and display a total / ETA.
for i, instance in enumerate(tqdm(capture)):
    # NOTE(review): assumes 'binary' holds the raw encoded image bytes and
    # that they are PNG data -- the ".png" suffix is applied unconditionally.
    img_binary = instance['binary']
    anno[i] = [
        instance['GT_Caption_GPT4V'],
        instance['GT_Caption_GPT4O'],
        instance['GT_Caption_Gemini15Pro'],
    ]
    with open(f"{save_dir}/{i}.png", "wb") as f:
        f.write(img_binary)

# json.dump turns the integer row indices into string keys in the output file.
with open("annotations.json", "w") as f:
    json.dump(anno, f, indent=4)