gensearcher-firered / vendor /rllm /examples /math_tool /prepare_math_data.py
JSCPPProgrammer's picture
Initial: GenSearcher workflow + FireRed /generate adapter + Gradio
80b7188 verified
from datasets import load_dataset
from rllm.data.dataset import DatasetRegistry
def prepare_math_data():
    """Load, reshape and register the math training and evaluation datasets.

    The training data is the ``train`` split of
    ``agentica-org/DeepScaleR-Preview-Dataset``; the evaluation data is the
    ``train`` split of ``HuggingFaceH4/aime_2024`` (registered below under the
    ``"test"`` split name). Both are mapped through the same row converter
    and then handed to ``DatasetRegistry.register_dataset``.

    Returns:
        A ``(train, test)`` tuple holding whatever
        ``DatasetRegistry.register_dataset`` returned for the
        ``"deepscaler_math"`` and ``"aime2024"`` registrations respectively.
    """

    def _to_math_record(row, _row_index):
        # The index is unused, but ``with_indices=True`` passes it in as the
        # second positional argument, so the parameter has to be accepted.
        record = {}
        record["question"] = row["problem"]
        record["ground_truth"] = row["answer"]
        record["data_source"] = "math"
        return record

    # Fetch both raw datasets first, so a failed download happens before any
    # mapping or registration work.
    raw_train = load_dataset(
        "agentica-org/DeepScaleR-Preview-Dataset",
        split="train",
    )
    raw_test = load_dataset("HuggingFaceH4/aime_2024", split="train")

    # Apply the same row conversion to both datasets.
    mapped_train = raw_train.map(_to_math_record, with_indices=True)
    mapped_test = raw_test.map(_to_math_record, with_indices=True)

    # Register under the names/splits the rest of the project looks up.
    registered_train = DatasetRegistry.register_dataset(
        "deepscaler_math", mapped_train, "train"
    )
    registered_test = DatasetRegistry.register_dataset(
        "aime2024", mapped_test, "test"
    )
    return registered_train, registered_test
if __name__ == "__main__":
    # Script entry point: build/register both datasets, then show each one
    # on its own line (train first, test second).
    train_split, test_split = prepare_math_data()
    print(train_split, test_split, sep="\n")