BudgetThinker_backup / easyr1 /cut_dataset.py
Xin-Rui's picture
Upload folder using huggingface_hub
7155cf2 verified
import pandas as pd
def cut_data():
file_path = "datasets/train_first_half.parquet"
data = pd.read_parquet(file_path)
print(data['problem'][0])
half_size = len(data) // 2
data_first_half = data.iloc[:half_size]
data_second_half = data.iloc[half_size:]
print(f"First half length: {len(data_first_half)}")
print(f"Second half length: {len(data_second_half)}")
data_first_half.to_parquet("datasets/train_1_in_4.parquet", index=False)
data_second_half.to_parquet("datasets/train_2_in_4.parquet", index=False)
def formatted_data():
file_path = "datasets/train_first_half.parquet"
data = pd.read_parquet(file_path)
data['problem'] = data['problem'].apply(lambda x: "Return your final response within \\boxed{}. " + x)
print(data['problem'][0])
target_path = file_path.replace(".parquet", "_formatted.parquet")
data.to_parquet(target_path, index=False)
def visualize_data():
# 定义文件路径
file_path = "datasets/train-00000-of-00001_formatted.parquet"
# 读取数据
data = pd.read_parquet(file_path)
print(data.head())
if __name__ == "__main__":
formatted_data()
visualize_data()
cut_data()