"""Filter a parquet dataset down to the rows whose chosen response is "safe".

Reads the raw parquet file, keeps only the rows where the ``chosen_label``
column equals ``"safe"``, and writes the result to a new parquet file.
"""

import pandas as pd

# Source parquet file with the full labelled dataset.
INPUT_PATH = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round.parquet"
# Destination parquet file for the filtered subset.
OUTPUT_PATH = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round_chosensafe.parquet"

# Column inspected by the filter and the value a row must have to be kept.
LABEL_COLUMN = "chosen_label"
SAFE_LABEL = "safe"


def filter_chosen_safe(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose ``chosen_label`` equals ``"safe"``.

    Args:
        df: Input frame; must contain a ``chosen_label`` column.

    Returns:
        A new frame with only the matching rows. The original row index
        values are retained (they are dropped later when writing to disk).

    Raises:
        KeyError: If *df* has no ``chosen_label`` column.
    """
    if LABEL_COLUMN not in df.columns:
        # Same exception type plain indexing would raise, with a clearer message.
        raise KeyError(f"required column {LABEL_COLUMN!r} not found in input data")
    return df[df[LABEL_COLUMN] == SAFE_LABEL]


def main(input_path: str = INPUT_PATH, output_path: str = OUTPUT_PATH) -> None:
    """Load the parquet at *input_path*, filter it, and save to *output_path*.

    Args:
        input_path: Path of the parquet file to read.
        output_path: Path of the parquet file to write.
    """
    # Load the data.
    df = pd.read_parquet(input_path)

    # Keep only the samples whose chosen_label is "safe".
    df_safe = filter_chosen_safe(df)

    # Save as a new parquet file; the filtered index is not persisted.
    df_safe.to_parquet(output_path, index=False)

    print(f"筛选完成,共保留 {len(df_safe)} 条样本,已保存到 {output_path}")


if __name__ == "__main__":
    main()