rm_code / safe_filter.py
hahayang012's picture
Upload folder using huggingface_hub
d8a76be verified
import pandas as pd
# Source parquet file, and the destination path for the filtered copy.
input_path = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round.parquet"
output_path = "/home/data/raw/test/4201_2355_full_label_1000-8192_sys3round_chosensafe.parquet"

# Read the whole source file into a DataFrame.
df = pd.read_parquet(input_path)

# Row mask: True where the "chosen_label" column equals "safe".
# Rows with any other value (including missing values) are dropped.
is_safe = df["chosen_label"] == "safe"
df_safe = df.loc[is_safe]

# Persist the surviving rows as a new parquet file, without the DataFrame
# index, then report how many rows were kept and where they were written.
df_safe.to_parquet(output_path, index=False)
print(f"筛选完成,共保留 {len(df_safe)} 条样本,已保存到 {output_path}")