File size: 663 Bytes

3050f1b

"""
Load the validation set from the CuneiformPhotosMSII dataset.
"""

from datasets import load_dataset, Dataset

from pathlib import Path
import json

# JSON file holding the tablet IDs used for validation. Resolved relative to
# this module: two directories above the one containing this file, then
# data/val_tablet_ids.json.
VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"

# Path.read_text opens and closes the file itself, so no handle is leaked at
# import time (the previous bare open() was never closed). The encoding is
# pinned to UTF-8 so parsing does not depend on the platform's locale default.
# Stored as a set for O(1) membership checks.
VAL_IDS = set(json.loads(VAL_IDS_PATH.read_text(encoding="utf-8")))


def load_val_dataset() -> Dataset:
    """Return the validation rows of the CuneiformPhotosMSII dataset.

    Loads the ``train`` split of ``boatbomber/CuneiformPhotosMSII`` and keeps
    only the rows whose ``hs_number`` value is a member of ``VAL_IDS``.

    Returns:
        The result of ``Dataset.select`` on the matching row positions, in
        their original order.
    """
    full_split = load_dataset(
        "boatbomber/CuneiformPhotosMSII",
        split="train",
        num_proc=4,
    )

    # Scan a view restricted to the ID column; the intent (per the original
    # author's note) is that only the ID strings are read during this pass,
    # not the image bytes.
    id_only_view = full_split.select_columns(["hs_number"])

    keep_positions = []
    for position, record in enumerate(id_only_view):
        if record["hs_number"] in VAL_IDS:
            keep_positions.append(position)

    return full_split.select(keep_positions)