Upload ./read.py with huggingface_hub
Browse files
read.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import os

from datasets import load_dataset
from tqdm import tqdm

# Export script: dumps every image of the DetailCaps-4870 test split to
# `save_dir` as "<index>.png" and writes the three reference captions of each
# sample to "annotations.json" in the current working directory.

# capture = load_dataset("/group/40005/public_datasets/DetailCaps-4870")
# Load the parquet file directly and expose it as the "test" split.
capture = load_dataset(
    "parquet",
    data_files={"test": "/group/40005/public_datasets/DetailCaps-4870/DetailCaps-4870.parquet"},
)['test']
print(len(capture))
save_dir = "/group/40005/auroraji/CAPTURE/samples"
# Make sure the output directory exists; otherwise the first image write
# fails with FileNotFoundError.
os.makedirs(save_dir, exist_ok=True)
anno = {}

# Wrap the dataset itself (not the enumerate generator) so tqdm can read its
# length and display a total / ETA.
for i, instance in enumerate(tqdm(capture)):
    # Raw image payload; written out unchanged below.
    # NOTE(review): the ".png" extension assumes the bytes are PNG-encoded - confirm.
    img_binary = instance['binary']
    # Reference captions, in the order: GPT-4V, GPT-4o, Gemini 1.5 Pro.
    anno[i] = [instance['GT_Caption_GPT4V'], instance['GT_Caption_GPT4O'], instance['GT_Caption_Gemini15Pro']]

    with open(f"{save_dir}/{i}.png", "wb") as f:
        f.write(img_binary)

# Integer keys are serialized as strings by json.dump ("0", "1", ...).
with open("annotations.json", "w") as f:
    json.dump(anno, f, indent=4)
|