"""Export math datasets to parquet files with ``question``/``answer`` columns.

Each dataset is loaded with Hugging Face ``datasets``, converted to a pandas
DataFrame, reduced to its question and answer columns (renamed to the common
names ``question`` and ``answer``), and written to a parquet file.
"""

from pathlib import Path
from typing import Optional

import pandas as pd
from datasets import load_dataset


def export_question_answer_parquet(
    dataset_path: str,
    output_path: str,
    *,
    config: Optional[str] = None,
    split: str = "train",
    question_column: str = "problem",
    answer_column: str = "answer",
) -> pd.DataFrame:
    """Write the question/answer columns of a dataset to a parquet file.

    Args:
        dataset_path: Hub identifier or local directory passed to
            ``load_dataset``.
        output_path: Destination parquet file. Missing parent directories
            are created.
        config: Optional dataset configuration name (``name`` argument of
            ``load_dataset``).
        split: Dataset split to load.
        question_column: Source column that becomes ``question``.
        answer_column: Source column that becomes ``answer``.

    Returns:
        The two-column DataFrame that was written.

    Raises:
        KeyError: If either source column is absent from the dataset.
    """
    ds = load_dataset(dataset_path, name=config, split=split)
    df = ds.to_pandas()

    # Fail with the *source* column name; a plain rename would silently skip
    # a missing column and the later selection would complain about the
    # target name instead.
    missing = [
        column
        for column in (question_column, answer_column)
        if column not in df.columns
    ]
    if missing:
        raise KeyError(
            f"column(s) {missing} not found in {dataset_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    # Select before renaming so a pre-existing "question"/"answer" column in
    # the dataset cannot end up duplicated in the output.
    df = df[[question_column, answer_column]].rename(
        columns={question_column: "question", answer_column: "answer"}
    )

    # to_parquet does not create directories, so make sure the target exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=True)
    print(df.head())
    return df


# Earlier exports, kept for reference; uncomment to regenerate them.
# export_question_answer_parquet(
#     "open-r1/DAPO-Math-17k-Processed",
#     "data/Open-R1/DAPO_Math17k.parquet",
#     config="en",
#     question_column="prompt",
#     answer_column="solution",
# )
# export_question_answer_parquet(
#     "open-r1/Big-Math-RL-Verified-Processed",
#     "data/Open-R1/Big-Math-RL-Verified-Processed.parquet",
#     config="all",
#     question_column="prompt",
#     answer_column="solution",
# )

# Runs whenever the file is executed (or imported), as the original flat
# script did.
export_question_answer_parquet(
    "/storage/group/renkan/zhengz/deepseek/dataset/open-r1/OpenR1-Math-220k",
    "data/Open-R1/OpenR1-Math-220k.parquet",
    question_column="problem",
    answer_column="answer",
)