"""Filter a parquet dataset down to the rows whose ``chosen_label`` is "safe".

Reads the parquet file at ``input_path``, keeps only the rows where the
``chosen_label`` column equals the string ``"safe"``, writes the result to
``output_path`` and prints how many rows were kept.
"""

import pandas as pd

# Source parquet file and the destination for the filtered copy.
input_path = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round.parquet"
output_path = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round_chosensafe.parquet"

# Load the whole dataset into memory.
df = pd.read_parquet(input_path)

# Keep only the samples whose "chosen_label" column equals "safe".
df_safe = df[df["chosen_label"] == "safe"]

# Save the filtered rows as a new parquet file; index=False keeps the
# DataFrame index out of the output file.
df_safe.to_parquet(output_path, index=False)
print(f"筛选完成,共保留 {len(df_safe)} 条样本,已保存到 {output_path}")