File size: 1,349 Bytes
aedd6ab | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | """Dataset name(s) -> merged {id, code, input, output} rows.
Add a converted dataset by running its folder's convert.py (saves ./data via
save_to_disk) and listing it in _LOCAL. cruxeval keeps its own Hub-fallback
loader and is held out entirely for eval (eval_cruxeval_*.py), never trained on.
"""
from __future__ import annotations
import os
from pathlib import Path
_LOCAL = {"mbpp": "MBPP", "humaneval": "HumanEval", "pyx": "PyX"} # name -> folder, data in ./data
def load_cruxeval():
"""Full CRUXEval-O (the held-out test set). Prefer a local save_to_disk copy;
the HF builder FileLock dies on NFS caches."""
local_dir = os.environ.get("CRUXEVAL_DIR")
if local_dir and os.path.isdir(local_dir):
from datasets import load_from_disk
return list(load_from_disk(local_dir))
from datasets import load_dataset
return list(load_dataset("cruxeval-org/cruxeval", split="test"))
def load_one(name: str) -> list[dict]:
key = name.strip().lower()
if key == "cruxeval":
return load_cruxeval()
if key in _LOCAL:
from datasets import load_from_disk
d = os.environ.get(key.upper() + "_DIR") or str(Path(__file__).parent / _LOCAL[key] / "data")
return list(load_from_disk(d))
raise ValueError(f"unknown data source {name!r}; pick from {['cruxeval', *_LOCAL]}")
|