File size: 1,637 Bytes
bb7f76d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import io
import json
import os

from datasets import Dataset, Features, Image, Sequence, Value
from PIL import Image as PILImage
# 1) Load the JSON file — a top-level list of record dicts.
#    Explicit UTF-8: JSON is defined as UTF-8 text, and relying on the
#    platform default encoding breaks on Windows (cp1252 etc.).
with open("Train_QA_10k_noFreeForm.json", "r", encoding="utf-8") as f:
    records = json.load(f)  # List[Dict]

# 2) Build a Hugging Face Dataset directly from the list of dicts.
ds = Dataset.from_list(records)
# 3) Attach each image's raw file contents to its record.
def read_image_bytes(example):
    """Read the file at ``example["path"]`` and store its raw bytes.

    Mutates *example* in place by adding an ``image_bytes`` key holding
    the file's contents (read in binary mode), then returns the same
    dict — the shape ``datasets.Dataset.map`` expects.
    """
    image_path = example["path"]
    with open(image_path, "rb") as handle:
        raw = handle.read()
    example["image_bytes"] = raw
    return example
# Keep all original columns and add "image_bytes" alongside them.
ds = ds.map(read_image_bytes, remove_columns=[])

# 4) Declare the schema, telling Arrow that image_bytes is raw binary.
features = Features({
    "problem_id": Value("int64"),
    "problem": Value("string"),
    "data_type": Value("string"),
    "problem_type": Value("string"),
    "options": Sequence(Value("string")),
    "solution": Value("string"),
    "data_source": Value("string"),
    # "prompt" column intentionally omitted from the schema:
    # "prompt": Value("string"),
    "answer": Value("string"),
    "path": Value("string"),
    "image_bytes": Value("binary"),  # raw bytes in Arrow
})
ds = ds.cast(features)

# 5) Rename, then cast the byte column to an Image feature so rows
#    decode to PIL images on access.
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))

# 6) Sanity-check: the first row should decode to a PIL image object.
img0 = ds[0]["images"]
print(img0)
# → PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384

# 7) Write out to Parquet (the image bytes are embedded in the file).
#    Create the output directory first — to_parquet does not create
#    missing parent directories and would raise FileNotFoundError.
out_dir = "./hf_data"
os.makedirs(out_dir, exist_ok=True)
ds.to_parquet(os.path.join(out_dir, "Train_QA_10k_noFreeForm.parquet"))
|