| """ |
| Load the validation set from the CuneiformPhotosMSII dataset. |
| """ |
|
|
| from datasets import load_dataset, Dataset |
|
|
| from pathlib import Path |
| import json |
|
|
# JSON file with the validation tablet IDs, resolved relative to this module:
# the "data" directory two levels above the directory containing this file.
VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"

# Loaded once at import time. Path.read_text opens *and closes* the file (the
# previous bare open() leaked the handle), and the encoding is pinned to UTF-8
# so parsing does not depend on the platform's default locale encoding.
# Stored as a set so membership checks are O(1).
VAL_IDS = set(json.loads(VAL_IDS_PATH.read_text(encoding="utf-8")))
|
|
|
|
def load_val_dataset() -> Dataset:
    """Return the validation subset of the CuneiformPhotosMSII dataset.

    Loads the ``train`` split of ``boatbomber/CuneiformPhotosMSII`` (using 4
    worker processes) and keeps only the rows whose ``hs_number`` value is a
    member of ``VAL_IDS``.

    Returns:
        A ``Dataset`` containing the matching rows, in the same relative
        order as they appear in the loaded split.
    """
    full_split = load_dataset(
        "boatbomber/CuneiformPhotosMSII", split="train", num_proc=4
    )

    # Scan a view restricted to the ID column only (presumably so the pass
    # over every row does not have to materialize the other columns).
    id_only_view = full_split.select_columns(["hs_number"])

    # Row positions (not IDs) of every record belonging to the validation set.
    val_positions = [
        position
        for position, record in enumerate(id_only_view)
        if record["hs_number"] in VAL_IDS
    ]

    return full_split.select(val_positions)
|
|