File size: 8,987 Bytes
60ae732
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python3
"""
Verify the problem with the weighted-average calculation.

The script scores one model (Claude Haiku 3.5) in two ways and prints the
results side by side:

* "Current" algorithm: take the plain (unweighted) mean of the scores per
  domain / per capability, then weight those means by ``qa_ratio`` /
  ``ratio``.  The domain view and the capability view are printed together
  with their absolute difference.
* "Correct" algorithm: weight every (domain, capability) cell by its
  ``ratio_overall``.  The same overall figure is then recomputed through the
  domain dimension and through the capability dimension, and the three
  results are compared.

All user-facing output is printed in Chinese, exactly as in the original
script.
"""

from typing import Iterable, Tuple

# Capability (problem type) keys, in the order used by every report below.
CAPABILITIES = ("A", "B", "C", "D")

# Absolute tolerance used when checking that the three "correct" averages agree.
CONSISTENCY_TOLERANCE = 1e-6


def weighted_average(pairs: Iterable[Tuple[float, float]]) -> float:
    """Return ``sum(score * weight) / sum(weight)`` over ``(score, weight)`` pairs.

    The sums are accumulated with an explicit left-to-right loop rather than
    the built-in ``sum`` so the floating-point result does not depend on how
    the interpreter implements ``sum`` for floats.

    Raises:
        ZeroDivisionError: if the weights add up to zero (including when
            ``pairs`` is empty).
    """
    weighted_sum = 0
    weight_sum = 0
    for score, weight in pairs:
        weighted_sum += score * weight
        weight_sum += weight
    return weighted_sum / weight_sum


def consistency_mark(is_consistent: bool) -> str:
    """Return the marker printed next to a consistency verdict.

    A check mark is returned only when the check actually passed, so a failed
    check is never reported with a success marker.
    """
    return "✓" if is_consistent else "❌"


# Real data taken from qa_distribution.json
qa_distribution = {
    "overall_distribution": {
        "problem_types": {
            "A": {"ratio": 0.336058},  # Recall
            "B": {"ratio": 0.238782},  # Causal Inference
            "C": {"ratio": 0.259296},  # State Updating
            "D": {"ratio": 0.165865}   # State Abstraction
        }
    },
    "domain_distribution": {
        "TEXT2SQL": {
            "qa_ratio": 0.245192,
            "problem_types": {
                "A": {"ratio_in_domain": 0.364379, "ratio_overall": 0.089344},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.061298},
                "C": {"ratio_in_domain": 0.218954, "ratio_overall": 0.053686},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.040865}
            }
        },
        "SOFTWARE": {
            "qa_ratio": 0.173077,
            "problem_types": {
                "A": {"ratio_in_domain": 0.490741, "ratio_overall": 0.084936},
                "B": {"ratio_in_domain": 0.173611, "ratio_overall": 0.030048},
                "C": {"ratio_in_domain": 0.168981, "ratio_overall": 0.029247},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.028846}
            }
        },
        "WEB": {
            "qa_ratio": 0.149038,
            "problem_types": {
                "A": {"ratio_in_domain": 0.336022, "ratio_overall": 0.050080},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "D": {"ratio_in_domain": 0.163978, "ratio_overall": 0.024439}
            }
        },
        "GAME": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.333333, "ratio_overall": 0.048077},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038}
            }
        },
        "EMBODIED_AI": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.169444, "ratio_overall": 0.024439},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.416667, "ratio_overall": 0.060096},
                "D": {"ratio_in_domain": 0.163889, "ratio_overall": 0.023638}
            }
        },
        "OPENWORLD_QA": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.272222, "ratio_overall": 0.039263},
                "B": {"ratio_in_domain": 0.263889, "ratio_overall": 0.038062},
                "C": {"ratio_in_domain": 0.297222, "ratio_overall": 0.042868},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038}
            }
        }
    }
}

# Real scores of the first model: Claude Haiku 3.5
model_scores = {
    "GAME": {"A": 0.5, "B": 0.458, "C": 0.564, "D": 0.583},
    "EMBODIED_AI": {"A": 0.3934, "B": 0.4667, "C": 0.34, "D": 0.0},
    "WEB": {"A": 0.4711, "B": 0.5889, "C": 0.5222, "D": 0.5932},
    "TEXT2SQL": {"A": 0.6233, "B": 0.1961, "C": 0.4328, "D": 0.1569},
    "OPENWORLD_QA": {"A": 0.6596, "B": 0.7333, "C": 0.5625, "D": 0.5},
    "SOFTWARE": {"A": 0.26, "B": 0.4366, "C": 0.1739, "D": 0.1324}
}

# Short aliases for the two sub-tables that are looked up repeatedly.
domain_distribution = qa_distribution["domain_distribution"]
overall_problem_types = qa_distribution["overall_distribution"]["problem_types"]

print("=" * 80)
print("测试:Claude Haiku 3.5 的平均分计算")
print("=" * 80)

# ========== Current algorithm (wrong) ==========
print("\n【当前算法 - 简单平均】")

# 1. Plain (unweighted) mean of the four capability scores in each domain.
domain_scores_simple = {}
for domain, scores in model_scores.items():
    avg = sum(scores.values()) / len(scores)
    domain_scores_simple[domain] = avg
    print(f"  {domain}: ({scores['A']:.4f} + {scores['B']:.4f} + {scores['C']:.4f} + {scores['D']:.4f}) / 4 = {avg:.6f}")

# 2. Overall mean of those domain scores, weighted by each domain's qa_ratio.
current_domain_avg = weighted_average(
    (score, domain_distribution[domain]["qa_ratio"])
    for domain, score in domain_scores_simple.items()
)
print(f"\n  Domain加权平均: {current_domain_avg:.6f}")

# 3. Plain (unweighted) mean of each capability's scores across all domains.
capability_scores_simple = {}
for cap in CAPABILITIES:
    per_domain_scores = [scores[cap] for scores in model_scores.values()]
    avg = sum(per_domain_scores) / len(per_domain_scores)
    capability_scores_simple[cap] = avg
    print(f"  Capability {cap}: 平均 = {avg:.6f}")

# 4. Overall mean of those capability scores, weighted by the overall ratio.
current_cap_avg = weighted_average(
    (score, overall_problem_types[cap]["ratio"])
    for cap, score in capability_scores_simple.items()
)
print(f"\n  Capability加权平均: {current_cap_avg:.6f}")

print(f"\n  ❌ 差异: {abs(current_domain_avg - current_cap_avg):.6f}")

# ========== Correct algorithm ==========
print("\n" + "=" * 80)
print("【正确算法 - 使用 ratio_overall】")

# Method 1: weight every (domain, capability) cell directly by its
# ratio_overall.  The loop is kept explicit because each term is printed and
# the total weight is reported afterwards.
total_weighted_sum = 0
total_weight_sum = 0

for domain, scores in model_scores.items():
    for cap in CAPABILITIES:
        score = scores[cap]
        weight = domain_distribution[domain]["problem_types"][cap]["ratio_overall"]
        total_weighted_sum += score * weight
        total_weight_sum += weight
        print(f"  {domain}-{cap}: {score:.4f} * {weight:.6f} = {score * weight:.6f}")

correct_avg = total_weighted_sum / total_weight_sum
print(f"\n  ✓ 正确的总平均: {correct_avg:.6f}")
print(f"  权重总和: {total_weight_sum:.6f} (应该等于1.0)")

# ========== Verification: domain-weighted average (ratio_in_domain inside each domain) ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Domain维度】")

# 1. Weighted mean inside each domain, using ratio_in_domain as the weight.
domain_scores_weighted = {}
for domain, scores in model_scores.items():
    domain_problem_types = domain_distribution[domain]["problem_types"]
    domain_avg = weighted_average(
        (scores[cap], domain_problem_types[cap]["ratio_in_domain"])
        for cap in CAPABILITIES
    )
    domain_scores_weighted[domain] = domain_avg
    print(f"  {domain}: {domain_avg:.6f}")

# 2. Combine the per-domain means, weighted by qa_ratio.
correct_domain_avg = weighted_average(
    (score, domain_distribution[domain]["qa_ratio"])
    for domain, score in domain_scores_weighted.items()
)
print(f"\n  ✓ Domain维度的正确平均: {correct_domain_avg:.6f}")

# ========== Verification: capability-weighted average ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Capability维度】")

# 1. Weighted mean of each capability across domains, using ratio_overall.
capability_scores_weighted = {}
for cap in CAPABILITIES:
    capability_scores_weighted[cap] = weighted_average(
        (
            model_scores[domain][cap],
            domain_distribution[domain]["problem_types"][cap]["ratio_overall"],
        )
        for domain in model_scores
    )
    print(f"  Capability {cap}: {capability_scores_weighted[cap]:.6f}")

# 2. Combine the per-capability means, weighted by the overall ratio.
correct_cap_avg = weighted_average(
    (score, overall_problem_types[cap]["ratio"])
    for cap, score in capability_scores_weighted.items()
)
print(f"\n  ✓ Capability维度的正确平均: {correct_cap_avg:.6f}")

# ========== Summary ==========
print("\n" + "=" * 80)
print("【总结】")
print("=" * 80)
print(f"当前算法 - Domain平均:     {current_domain_avg:.6f}")
print(f"当前算法 - Capability平均: {current_cap_avg:.6f}")
print(f"差异:                       {abs(current_domain_avg - current_cap_avg):.6f} ❌")
print()
print(f"正确算法 - 直接计算:        {correct_avg:.6f}")
print(f"正确算法 - Domain维度:     {correct_domain_avg:.6f}")
print(f"正确算法 - Capability维度: {correct_cap_avg:.6f}")

# The three "correct" figures are compared with an absolute tolerance.  The
# marker is derived from the result so a failed check is not printed with a
# success mark.  NOTE(review): the hard-coded ratios are rounded to six
# decimals, so the three figures may differ by more than the tolerance --
# confirm against the unrounded source data.
all_consistent = (
    abs(correct_avg - correct_domain_avg) < CONSISTENCY_TOLERANCE
    and abs(correct_avg - correct_cap_avg) < CONSISTENCY_TOLERANCE
)
print(f"三者是否一致:               {all_consistent} {consistency_mark(all_consistent)}")