File size: 663 Bytes
3050f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
"""
Load the validation set from the CuneiformPhotosMSII dataset.
"""

from datasets import load_dataset, Dataset

from pathlib import Path
import json

# JSON file holding the validation tablet IDs. The path is resolved relative to
# this module: <two levels above this file's directory>/data/val_tablet_ids.json.
VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"

# Loaded once at import time and kept as a set for O(1) membership tests.
# Path.read_text opens and closes the file itself, so no handle is leaked, and
# the explicit encoding keeps decoding independent of the platform locale.
VAL_IDS = set(json.loads(VAL_IDS_PATH.read_text(encoding="utf-8")))


def load_val_dataset() -> Dataset:
    """Return the rows of CuneiformPhotosMSII whose tablet ID is in ``VAL_IDS``.

    The ``train`` split of ``boatbomber/CuneiformPhotosMSII`` is loaded, each
    row's ``hs_number`` is checked against the module-level ``VAL_IDS`` set,
    and the matching rows are returned as a new ``Dataset`` in their original
    order.
    """
    full_split = load_dataset(
        "boatbomber/CuneiformPhotosMSII", split="train", num_proc=4
    )

    # Scan a view restricted to the ID column so that iterating rows only
    # touches "hs_number" (per the original author's note, this avoids reading
    # the image bytes during the filtering pass).
    id_only_view = full_split.select_columns(["hs_number"])
    matching_positions = [
        position
        for position, record in enumerate(id_only_view)
        if record["hs_number"] in VAL_IDS
    ]

    # Positions index into the full split, so the result keeps every column.
    return full_split.select(matching_positions)