File size: 2,214 Bytes
d8a76be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# -*- coding: utf-8 -*-
"""
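Purpose:
Read the parquet file at PATH and report label statistics. Labels are
compared after stripping surrounding whitespace and lower-casing.
  * Safe rate: the share of all rows whose chosen_label is 'safe'.
  * For each (chosen_label, reject_label) pair -- ('safe', 'safe'),
    ('unsafe', 'safe') and ('unsafe', 'unsafe') -- the number of rows with
    that pair whose chosen_model equals ID, the number of rows with that
    pair over all models, and the ratio of the two.

Dependencies: pandas, pyarrow (or fastparquet)
pip install pandas pyarrow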
"""

import pandas as pd

# Parquet file analysed by main(), which reads its chosen_label, reject_label
# and chosen_model columns.
PATH = "/home/data/raw/test/1159-L6_format_full_label.parquet"
# Value the numeric form of chosen_model is compared against in main().
ID=2159
def norm_label(x) -> str:
    """Return a label in canonical form for comparison.

    Values that ``pd.isna`` reports as missing (for example ``None`` or NaN)
    become the empty string. Anything else is converted with ``str``, has its
    surrounding whitespace stripped and is lower-cased.
    """
    is_missing = pd.isna(x)
    canonical = "" if is_missing else str(x).strip().lower()
    return canonical

def _pair_counts(chosen_label, reject_label, is_target_model, chosen, rejected):
    """Count rows whose normalized labels equal the (chosen, rejected) pair.

    Returns a ``(model_count, pair_count)`` tuple: the rows with that label
    pair that are also flagged in ``is_target_model``, and the rows with that
    label pair regardless of model.
    """
    pair_mask = (chosen_label == chosen) & (reject_label == rejected)
    return int((pair_mask & is_target_model).sum()), int(pair_mask.sum())


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return (numerator / denominator) if denominator > 0 else 0.0


def main():
    """Print the safe rate and the per-label-pair model ratios for PATH.

    Reads the parquet file at ``PATH``, normalizes ``chosen_label`` and
    ``reject_label`` with ``norm_label`` and converts ``chosen_model`` to a
    number (values that cannot be converted become NaN and never match).

    Two lines are printed:
      * the share of all rows whose ``chosen_label`` is ``"safe"``;
      * for the label pairs (safe, safe), (unsafe, safe) and
        (unsafe, unsafe), in that order, the ratio of rows whose
        ``chosen_model`` equals ``ID`` to all rows with that pair.

    Every ratio is reported as 0.0 when its denominator is zero.

    Raises:
        KeyError: if the file lacks ``chosen_label``, ``reject_label`` or
            ``chosen_model``.
    """
    df = pd.read_parquet(PATH)

    # Fail early with a clear message. ``df.get`` would return None for a
    # missing column, which surfaces as an opaque AttributeError for the label
    # columns and as silently-zero counts for ``chosen_model``.
    required = ("chosen_label", "reject_label", "chosen_model")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{PATH} is missing required column(s): {missing}")

    # Normalize labels: lower-case, surrounding whitespace removed.
    chosen_label = df["chosen_label"].map(norm_label)
    reject_label = df["reject_label"].map(norm_label)

    # Convert chosen_model to numeric; unconvertible values become NaN.
    chosen_model_num = pd.to_numeric(df["chosen_model"], errors="coerce")
    is_target_model = chosen_model_num == ID

    count1, total1 = _pair_counts(
        chosen_label, reject_label, is_target_model, "safe", "safe"
    )
    count2, total2 = _pair_counts(
        chosen_label, reject_label, is_target_model, "unsafe", "safe"
    )
    count3, total3 = _pair_counts(
        chosen_label, reject_label, is_target_model, "unsafe", "unsafe"
    )

    ratio1 = _ratio(count1, total1)
    ratio2 = _ratio(count2, total2)
    ratio3 = _ratio(count3, total3)

    total_rows = len(df)
    safenum = int((chosen_label == "safe").sum())
    saferatio = _ratio(safenum, total_rows)

    print(f"安全率={saferatio:.6f}  ({safenum}/{total_rows})")
    print(f"比率: {ratio1:.6f}  ({count1}/{total1}),"
          f"{ratio2:.6f}  ({count2}/{total2}),"
          f" {ratio3:.6f}  ({count3}/{total3})")

# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()