# -*- coding: utf-8 -*-
"""Report label statistics for the preference dataset stored at ``PATH``.

The script reads the parquet file at ``PATH`` and prints two lines:

1. The share of rows whose normalized ``chosen_label`` is ``"safe"``.
2. For each (chosen_label, reject_label) pair among safe/safe, unsafe/safe
   and unsafe/unsafe: the fraction of rows with that label pair whose
   ``chosen_model`` equals ``ID``, together with the raw counts.

Labels are compared after stripping whitespace and lower-casing; missing
labels are treated as the empty string. ``chosen_model`` values that cannot
be parsed as numbers never match ``ID``.

Dependencies: pandas plus a parquet engine (pyarrow or fastparquet):

    pip install pandas pyarrow
"""

import pandas as pd

PATH = "/home/data/raw/test/1159-L6_format_full_label.parquet"
ID = 2159

# Columns the statistics are computed from.
_REQUIRED_COLUMNS = ("chosen_label", "reject_label", "chosen_model")

# (chosen_label, reject_label) pairs reported on the second output line,
# in output order.
_LABEL_PAIRS = (
    ("safe", "safe"),
    ("unsafe", "safe"),
    ("unsafe", "unsafe"),
)


def norm_label(x) -> str:
    """Return *x* as a stripped, lower-cased string; missing values become ""."""
    if pd.isna(x):
        return ""
    return str(x).strip().lower()


def _ratio(count: int, total: int) -> float:
    """Return ``count / total``, or 0.0 when *total* is not positive."""
    return (count / total) if total > 0 else 0.0


def compute_label_stats(df: pd.DataFrame, model_id: int = ID) -> dict:
    """Compute the label statistics that ``main`` prints.

    Args:
        df: Frame containing the ``chosen_label``, ``reject_label`` and
            ``chosen_model`` columns.
        model_id: Numeric model id that ``chosen_model`` is compared against.

    Returns:
        A dict with keys:
            ``total``      - number of rows in *df*;
            ``safe``       - rows whose normalized ``chosen_label`` is "safe";
            ``safe_ratio`` - ``safe / total`` (0.0 for an empty frame);
            ``pairs``      - list of ``(count, total, ratio)`` tuples, one per
                             entry of ``_LABEL_PAIRS``, where ``total`` is the
                             number of rows with that label pair and ``count``
                             the subset whose ``chosen_model`` equals
                             *model_id*.

    Raises:
        KeyError: If any required column is missing from *df*.
    """
    # Fail loudly on a wrong schema: df.get() would return None for a missing
    # column, which surfaces as an unrelated AttributeError for the labels
    # and can silently zero the per-model counts for chosen_model.
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"missing required column(s): {', '.join(missing)}")

    chosen = df["chosen_label"].map(norm_label)
    rejected = df["reject_label"].map(norm_label)
    # Non-numeric model ids become NaN, and NaN never equals model_id.
    is_model = pd.to_numeric(df["chosen_model"], errors="coerce") == model_id

    pairs = []
    for chosen_value, rejected_value in _LABEL_PAIRS:
        in_pair = (chosen == chosen_value) & (rejected == rejected_value)
        pair_total = int(in_pair.sum())
        pair_count = int((in_pair & is_model).sum())
        pairs.append((pair_count, pair_total, _ratio(pair_count, pair_total)))

    row_count = len(df)
    safe_count = int((chosen == "safe").sum())
    return {
        "total": row_count,
        "safe": safe_count,
        "safe_ratio": _ratio(safe_count, row_count),
        "pairs": pairs,
    }


def main() -> None:
    """Load the parquet file at ``PATH`` and print the label statistics."""
    stats = compute_label_stats(pd.read_parquet(PATH), ID)

    safenum = stats["safe"]
    total = stats["total"]
    saferatio = stats["safe_ratio"]
    (count1, total1, ratio1), (count2, total2, ratio2), (count3, total3, ratio3) = stats["pairs"]

    print(f"安全率={saferatio:.6f} ({safenum}/{total})")
    print(
        f"比率: {ratio1:.6f} ({count1}/{total1}),"
        f"{ratio2:.6f} ({count2}/{total2}),"
        f" {ratio3:.6f} ({count3}/{total3})"
    )


if __name__ == "__main__":
    main()