"""Export Open-R1 math datasets as two-column (question, answer) parquet files.

Each dataset's ``train`` split is loaded, its columns are renamed to
``question`` / ``answer``, every other column is dropped, and the result is
written to ``data/Open-R1/<name>.parquet``.

Running the file as a script converts only the dataset named by
``ACTIVE_DATASET``; change that constant to convert one of the others.
"""

from pathlib import Path

from datasets import load_dataset
import pandas as pd

# Directory (relative to the current working directory) that receives the
# parquet files.
OUTPUT_DIR = Path("data/Open-R1")

# One entry per dataset this script has been used to convert. The key doubles
# as the output file stem.
#   path   -- first argument to ``load_dataset`` (hub id or local directory)
#   name   -- dataset configuration name, or None when none is passed
#   rename -- source column -> target column mapping
DATASETS = {
    "DAPO_Math17k": {
        "path": "open-r1/DAPO-Math-17k-Processed",
        "name": "en",
        "rename": {"prompt": "question", "solution": "answer"},
    },
    "Big-Math-RL-Verified-Processed": {
        "path": "open-r1/Big-Math-RL-Verified-Processed",
        "name": "all",
        "rename": {"prompt": "question", "solution": "answer"},
    },
    "OpenR1-Math-220k": {
        "path": "/storage/group/renkan/zhengz/deepseek/dataset/open-r1/OpenR1-Math-220k",
        "name": None,
        # This dataset's answer column is already called "answer", so only
        # the question column needs renaming.
        "rename": {"problem": "question"},
    },
}

# Dataset converted when the file is run as a script.
ACTIVE_DATASET = "OpenR1-Math-220k"


def export_question_answer(key: str) -> pd.DataFrame:
    """Convert one configured dataset to a (question, answer) parquet file.

    Args:
        key: A key of ``DATASETS``; also used as the output file stem.

    Returns:
        The two-column DataFrame that was written to disk.

    Raises:
        KeyError: If ``key`` is not in ``DATASETS``, or if the renamed frame
            lacks a ``question`` or ``answer`` column.
    """
    spec = DATASETS[key]
    ds = load_dataset(spec["path"], spec["name"], split="train")

    df = ds.to_pandas().rename(columns=spec["rename"])
    df = df[["question", "answer"]]

    out_path = OUTPUT_DIR / f"{key}.parquet"
    # to_parquet does not create missing parent directories, so make sure the
    # output directory exists before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_path, index=True)
    return df


if __name__ == "__main__":
    print(export_question_answer(ACTIVE_DATASET).head())