File size: 989 Bytes
80b7188 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | from datasets import load_dataset
from rllm.data.dataset import DatasetRegistry
def prepare_math_data():
    """Load, reshape and register the math train/test datasets.

    The training data is the ``train`` split of
    ``agentica-org/DeepScaleR-Preview-Dataset``; the evaluation data is the
    ``train`` split of ``HuggingFaceH4/aime_2024`` (registered below under
    the ``"test"`` split name). Every example of both datasets is passed
    through the same mapping function, which emits ``question``,
    ``ground_truth`` and ``data_source`` fields.

    Returns:
        A ``(train, test)`` tuple holding whatever
        ``DatasetRegistry.register_dataset`` returns for the
        ``"deepscaler_math"`` and ``"aime2024"`` registrations.
    """

    def to_math_record(row, _position):
        # ``_position`` is only accepted because ``map`` is invoked with
        # ``with_indices=True``; the record itself does not depend on it.
        record = {}
        record["question"] = row["problem"]
        record["ground_truth"] = row["answer"]
        record["data_source"] = "math"
        return record

    # Fetch both raw datasets before any transformation takes place.
    raw_train = load_dataset("agentica-org/DeepScaleR-Preview-Dataset", split="train")
    raw_test = load_dataset("HuggingFaceH4/aime_2024", split="train")

    # Apply the shared per-example mapping to each dataset.
    mapped_train = raw_train.map(to_math_record, with_indices=True)
    mapped_test = raw_test.map(to_math_record, with_indices=True)

    # Register under the names/splits other code looks the datasets up by.
    registered_train = DatasetRegistry.register_dataset("deepscaler_math", mapped_train, "train")
    registered_test = DatasetRegistry.register_dataset("aime2024", mapped_test, "test")
    return registered_train, registered_test
if __name__ == "__main__":
    # Script entry point: build and register both datasets, then show the
    # registered train and test objects, one per line.
    train_dataset, test_dataset = prepare_math_data()
    print(train_dataset, test_dataset, sep="\n")
|