# rm_code / win_rate.py
# hahayang012's picture
# Upload folder using huggingface_hub
# d8a76be verified
# -*- coding: utf-8 -*-
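"""
Report safety-label statistics for the parquet file at PATH.

Prints:
  * the share of all rows whose chosen_label is 'safe';
  * for each (chosen_label, reject_label) pair in
    (safe, safe), (unsafe, safe), (unsafe, unsafe):
    the fraction of matching rows whose chosen_model equals ID,
    together with the raw count and the pair total.

Dependencies: pandas, plus pyarrow (or fastparquet) for parquet support.
    pip install pandas pyarrow
"""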
import pandas as pd
# Parquet file that main() loads with pandas.read_parquet.
PATH = "/home/data/raw/test/1159-L6_format_full_label.parquet"
# Value that the numeric form of the `chosen_model` column is compared against.
ID = 2159
def norm_label(x) -> str:
    """Normalise a raw label value for comparison.

    Missing values (anything ``pd.isna`` reports as NA) become the empty
    string; every other value is converted with ``str``, stripped of
    surrounding whitespace and lower-cased.
    """
    text = "" if pd.isna(x) else str(x)
    return text.strip().lower()
def main(path=None, model_id=None) -> None:
    """Print the safe rate and the per-label-pair model ratios of a parquet file.

    Two lines are printed: the share of rows whose normalised
    ``chosen_label`` is ``"safe"``, and, for the label pairs
    (chosen, reject) = (safe, safe), (unsafe, safe), (unsafe, unsafe),
    the fraction of rows in that pair whose ``chosen_model`` equals
    ``model_id``. A ratio with an empty denominator is reported as 0.0.

    Args:
        path: Parquet file to read. Defaults to the module-level ``PATH``.
        model_id: Numeric ``chosen_model`` value to count. Defaults to the
            module-level ``ID``.

    Raises:
        KeyError: If ``chosen_label``, ``reject_label`` or ``chosen_model``
            is not a column of the file.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the no-argument invocation.
    path = PATH if path is None else path
    model_id = ID if model_id is None else model_id

    df = pd.read_parquet(path)

    # Fail loudly on a schema mismatch. DataFrame.get() would hand back None
    # for a missing column, which either crashes with an unrelated
    # AttributeError or silently produces all-zero counts.
    required = ("chosen_label", "reject_label", "chosen_model")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s): {missing}")

    # Normalise labels to stripped lower-case strings.
    chosen_label = df["chosen_label"].map(norm_label)
    reject_label = df["reject_label"].map(norm_label)

    # Convert chosen_model to numbers; unparseable values become NaN and
    # therefore never equal model_id.
    chosen_model_num = pd.to_numeric(df["chosen_model"], errors="coerce")
    is_target_model = chosen_model_num == model_id

    # (ratio, count, total) for each (chosen, reject) label pair, in print order.
    label_pairs = (("safe", "safe"), ("unsafe", "safe"), ("unsafe", "unsafe"))
    stats = []
    for chosen, reject in label_pairs:
        pair_mask = (chosen_label == chosen) & (reject_label == reject)
        total = int(pair_mask.sum())
        count = int((pair_mask & is_target_model).sum())
        ratio = (count / total) if total > 0 else 0.0
        stats.append((ratio, count, total))
    (ratio1, count1, total1), (ratio2, count2, total2), (ratio3, count3, total3) = stats

    # Overall share of rows whose chosen response is labelled safe.
    n_rows = len(df)
    safenum = int((chosen_label == "safe").sum())
    saferatio = (safenum / n_rows) if n_rows > 0 else 0.0

    print(f"安全率={saferatio:.6f} ({safenum}/{n_rows})")
    print(f"比率: {ratio1:.6f} ({count1}/{total1}),"
          f"{ratio2:.6f} ({count2}/{total2}),"
          f" {ratio3:.6f} ({count3}/{total3})")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()