"""Load the validation set from the CuneiformPhotosMSII dataset."""

import json
from pathlib import Path

from datasets import Dataset, load_dataset

# JSON file holding the IDs that belong to the validation split, located in
# the "data" directory three levels above this module.
VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"

# Loaded once at import time. Reading the bytes through pathlib closes the
# file handle deterministically, and json.loads detects the JSON text
# encoding itself instead of relying on the platform's locale default.
# A set keeps the per-row membership test below O(1).
VAL_IDS = set(json.loads(VAL_IDS_PATH.read_bytes()))


def load_val_dataset() -> Dataset:
    """Return the rows of CuneiformPhotosMSII that belong to the validation set.

    The "train" split of ``boatbomber/CuneiformPhotosMSII`` is loaded (using
    4 worker processes) and narrowed to the rows whose ``hs_number`` value
    appears in ``VAL_IDS``.

    Returns:
        A ``Dataset`` containing only the matching rows, in their original
        order. It is empty when no ``hs_number`` matches.
    """
    ds = load_dataset("boatbomber/CuneiformPhotosMSII", split="train", num_proc=4)

    # First pass over the ID column only: projecting down to "hs_number"
    # means the scan reads just the ID strings and skips the image bytes.
    id_only = ds.select_columns(["hs_number"])
    indices = [
        index
        for index, row in enumerate(id_only)
        if row["hs_number"] in VAL_IDS
    ]

    # Second step: materialize the full rows for the matching positions.
    return ds.select(indices)