import json
import os

from datasets import Dataset, Features, Sequence, Value, Image

# 1️⃣ Load your JSON file (which is a top-level list of dicts)
with open("Train_QA_10k_noFreeForm.json", "r") as f:
    records = json.load(f)   # List[Dict]

# 2️⃣ Build an HF Dataset
ds = Dataset.from_list(records)
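# Note: from_list infers a schema from the records themselves; we override it
# with an explicit Features definition in step 4️⃣ below.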

# 3️⃣ Read each image file into raw bytes
def read_image_bytes(example):
    with open(example["path"], "rb") as img_f:
        example["image_bytes"] = img_f.read()
    return example

# map keeps all original columns by default and adds "image_bytes"
ds = ds.map(read_image_bytes)
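# For ~10k images this map is I/O-bound; a multiprocess map may speed it up
# (num_proc is a standard map() argument; the worker count here is a guess):
#   ds = ds.map(read_image_bytes, num_proc=4)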

# 4️⃣ Define your schema, telling HF that image_bytes is binary
features = Features({
    "problem_id":   Value("int64"),
    "problem":      Value("string"),
    "data_type":    Value("string"),
    "problem_type": Value("string"),
    "options":      Sequence(Value("string")),
    "solution":     Value("string"),
    "data_source":  Value("string"),
    # "prompt":       Value("string"),
    "answer":       Value("string"),
    "path":         Value("string"),
    "image_bytes":  Value("binary"),    # ← raw bytes in Arrow
})
ds = ds.cast(features)

# 5️⃣ Rename the byte column and cast it to an Image feature that decodes to PIL
ds = ds.rename_column("image_bytes", "images")
ds = ds.cast_column("images", Image(decode=True))
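# Under the hood, the Image feature stores an Arrow struct of {"bytes", "path"};
# with decode=True, accessing a row lazily decodes those bytes to a PIL image.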

# 6️⃣ Sanity-check
img0 = ds[0]["images"]
print(img0)  
# → PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384

# 7️⃣ Finally, write out to Parquet (the image bytes are embedded in the file)
os.makedirs("./hf_data", exist_ok=True)  # to_parquet does not create the directory
ds.to_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
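
# 8️⃣ Optional round-trip check (a minimal sketch): reload the Parquet file and
# confirm the "images" column still decodes. to_parquet embeds the feature
# metadata in the file, so from_parquet should restore the Image feature.
reloaded = Dataset.from_parquet("./hf_data/Train_QA_10k_noFreeForm.parquet")
print(reloaded.features["images"])  # expect an Image feature
print(reloaded[0]["images"])        # expect a PIL image, as in step 6️⃣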