File size: 663 Bytes
"""
Load the validation set from the CuneiformPhotosMSII dataset.
"""
from datasets import load_dataset, Dataset
from pathlib import Path
import json
# JSON file of validation tablet IDs: "data/val_tablet_ids.json" under the
# grandparent of the directory that contains this module.
VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"

# Loaded once at import time and kept as a set for constant-time membership tests.
# Path.read_text opens and closes the file itself (a bare open() would leave the
# handle for the garbage collector), and the explicit UTF-8 encoding keeps JSON
# parsing independent of the platform's locale default.
VAL_IDS = set(json.loads(VAL_IDS_PATH.read_text(encoding="utf-8")))
def load_val_dataset() -> Dataset:
    """Return the validation subset of the CuneiformPhotosMSII dataset.

    Loads the ``train`` split of ``boatbomber/CuneiformPhotosMSII`` (using
    ``num_proc=4``) and keeps only the rows whose ``hs_number`` value is a
    member of the module-level ``VAL_IDS`` set.

    Returns:
        A ``Dataset`` containing the matching rows, in their original order.
    """
    full_split = load_dataset("boatbomber/CuneiformPhotosMSII", split="train", num_proc=4)

    # Scan a view restricted to the ID column; per the original author's note this
    # avoids reading the image bytes while looking for matches.
    id_only_view = full_split.select_columns(["hs_number"])

    matching_positions = []
    for position, record in enumerate(id_only_view):
        if record["hs_number"] in VAL_IDS:
            matching_positions.append(position)

    # Positions from the ID-only view are used to index the full dataset.
    return full_split.select(matching_positions)
|