| from datasets import load_dataset | |
| from rllm.data.dataset import DatasetRegistry | |
def prepare_math_data():
    """Load the math train/test datasets, reshape their rows, and register them.

    The training data comes from the ``train`` split of
    ``agentica-org/DeepScaleR-Preview-Dataset``; the evaluation data comes
    from the ``train`` split of ``HuggingFaceH4/aime_2024``. Every row is
    passed through a mapping function that produces ``question`` (taken from
    ``problem``), ``ground_truth`` (taken from ``answer``) and a constant
    ``data_source`` of ``"math"``. The mapped datasets are then registered
    with ``DatasetRegistry`` as ``"deepscaler_math"`` / ``"train"`` and
    ``"aime2024"`` / ``"test"``.

    Returns:
        A ``(train, test)`` tuple holding the values returned by
        ``DatasetRegistry.register_dataset`` for the two datasets.
    """

    def to_math_record(row, _position):
        # ``map`` is called with ``with_indices=True`` below, so a second
        # positional argument is accepted here; it is not used.
        return dict(
            question=row["problem"],
            ground_truth=row["answer"],
            data_source="math",
        )

    raw_train = load_dataset("agentica-org/DeepScaleR-Preview-Dataset", split="train")
    raw_test = load_dataset("HuggingFaceH4/aime_2024", split="train")

    mapped_train = raw_train.map(to_math_record, with_indices=True)
    mapped_test = raw_test.map(to_math_record, with_indices=True)

    registered_train = DatasetRegistry.register_dataset("deepscaler_math", mapped_train, "train")
    registered_test = DatasetRegistry.register_dataset("aime2024", mapped_test, "test")
    return registered_train, registered_test
if __name__ == "__main__":
    # When run as a script, prepare both datasets and print the registered
    # training dataset followed by the registered test dataset.
    for registered in prepare_math_data():
        print(registered)