Spaces:
Running
Running
File size: 8,987 Bytes
60ae732 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | #!/usr/bin/env python3
"""
验证加权平均计算的问题
"""
# Real figures copied from qa_distribution.json.
#
# Layout:
#   overall_distribution.problem_types[T]["ratio"]
#       ratio of problem type T over the whole benchmark.
#   domain_distribution[D]["qa_ratio"]
#       ratio of domain D over the whole benchmark.
#   domain_distribution[D]["problem_types"][T]
#       "ratio_in_domain" and "ratio_overall" for type T inside domain D.
#       Numerically, ratio_overall is approximately qa_ratio * ratio_in_domain
#       (all figures are rounded to six decimals).
qa_distribution = {
    "overall_distribution": {
        "problem_types": {
            "A": {"ratio": 0.336058},  # Recall
            "B": {"ratio": 0.238782},  # Causal Inference
            "C": {"ratio": 0.259296},  # State Updating
            "D": {"ratio": 0.165865},  # State Abstraction
        },
    },
    "domain_distribution": {
        "TEXT2SQL": {
            "qa_ratio": 0.245192,
            "problem_types": {
                "A": {"ratio_in_domain": 0.364379, "ratio_overall": 0.089344},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.061298},
                "C": {"ratio_in_domain": 0.218954, "ratio_overall": 0.053686},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.040865},
            },
        },
        "SOFTWARE": {
            "qa_ratio": 0.173077,
            "problem_types": {
                "A": {"ratio_in_domain": 0.490741, "ratio_overall": 0.084936},
                "B": {"ratio_in_domain": 0.173611, "ratio_overall": 0.030048},
                "C": {"ratio_in_domain": 0.168981, "ratio_overall": 0.029247},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.028846},
            },
        },
        "WEB": {
            "qa_ratio": 0.149038,
            "problem_types": {
                "A": {"ratio_in_domain": 0.336022, "ratio_overall": 0.050080},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "D": {"ratio_in_domain": 0.163978, "ratio_overall": 0.024439},
            },
        },
        "GAME": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.333333, "ratio_overall": 0.048077},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
        "EMBODIED_AI": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.169444, "ratio_overall": 0.024439},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.416667, "ratio_overall": 0.060096},
                "D": {"ratio_in_domain": 0.163889, "ratio_overall": 0.023638},
            },
        },
        "OPENWORLD_QA": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.272222, "ratio_overall": 0.039263},
                "B": {"ratio_in_domain": 0.263889, "ratio_overall": 0.038062},
                "C": {"ratio_in_domain": 0.297222, "ratio_overall": 0.042868},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
    },
}
# Real scores for the first model: Claude Haiku 3.5.
# Outer key: domain name; inner key: problem type (A-D); value: score.
model_scores = {
    "GAME": {
        "A": 0.5,
        "B": 0.458,
        "C": 0.564,
        "D": 0.583,
    },
    "EMBODIED_AI": {
        "A": 0.3934,
        "B": 0.4667,
        "C": 0.34,
        "D": 0.0,
    },
    "WEB": {
        "A": 0.4711,
        "B": 0.5889,
        "C": 0.5222,
        "D": 0.5932,
    },
    "TEXT2SQL": {
        "A": 0.6233,
        "B": 0.1961,
        "C": 0.4328,
        "D": 0.1569,
    },
    "OPENWORLD_QA": {
        "A": 0.6596,
        "B": 0.7333,
        "C": 0.5625,
        "D": 0.5,
    },
    "SOFTWARE": {
        "A": 0.26,
        "B": 0.4366,
        "C": 0.1739,
        "D": 0.1324,
    },
}
print("=" * 80)
print("测试:Claude Haiku 3.5 的平均分计算")
print("=" * 80)

# ========== Current algorithm (flawed) ==========
# Both roll-ups below start from *unweighted* means, so the domain-based total
# and the capability-based total are not guaranteed to agree. The gap between
# them is printed at the end of this section.
print("\n【当前算法 - 简单平均】")

# Step 1: plain mean of the per-type scores inside each domain.
domain_scores_simple = {
    name: sum(per_type.values()) / len(per_type)
    for name, per_type in model_scores.items()
}
for domain, scores in model_scores.items():
    avg = domain_scores_simple[domain]
    print(f" {domain}: ({scores['A']:.4f} + {scores['B']:.4f} + {scores['C']:.4f} + {scores['D']:.4f}) / 4 = {avg:.6f}")

# Step 2: combine the domain means, weighting each by the domain's qa_ratio.
domain_table = qa_distribution["domain_distribution"]
domain_weighted_sum = 0
domain_weight_sum = 0
for domain, domain_mean in domain_scores_simple.items():
    share = domain_table[domain]["qa_ratio"]
    domain_weighted_sum += domain_mean * share
    domain_weight_sum += share
current_domain_avg = domain_weighted_sum / domain_weight_sum
print(f"\n Domain加权平均: {current_domain_avg:.6f}")

# Step 3: plain mean of each capability's scores across all domains.
capability_scores_simple = {}
for cap in ["A", "B", "C", "D"]:
    across_domains = [per_type[cap] for per_type in model_scores.values()]
    avg = sum(across_domains) / len(across_domains)
    capability_scores_simple[cap] = avg
    print(f" Capability {cap}: 平均 = {avg:.6f}")

# Step 4: combine the capability means, weighting each by its overall ratio.
type_table = qa_distribution["overall_distribution"]["problem_types"]
cap_weighted_sum = 0
cap_weight_sum = 0
for cap, cap_mean in capability_scores_simple.items():
    share = type_table[cap]["ratio"]
    cap_weighted_sum += cap_mean * share
    cap_weight_sum += share
current_cap_avg = cap_weighted_sum / cap_weight_sum
print(f"\n Capability加权平均: {current_cap_avg:.6f}")

print(f"\n ❌ 差异: {abs(current_domain_avg - current_cap_avg):.6f}")
# ========== Correct algorithm ==========
print("\n" + "=" * 80)
print("【正确算法 - 使用 ratio_overall】")

# Method 1: weight every (domain, problem type) score directly by that
# cell's ratio_overall, then normalise by the total weight.
total_weighted_sum = 0
total_weight_sum = 0
for domain, scores in model_scores.items():
    cell_table = qa_distribution["domain_distribution"][domain]["problem_types"]
    for cap in ("A", "B", "C", "D"):
        score = scores[cap]
        weight = cell_table[cap]["ratio_overall"]
        contribution = score * weight
        total_weighted_sum += contribution
        total_weight_sum += weight
        print(f" {domain}-{cap}: {score:.4f} * {weight:.6f} = {contribution:.6f}")

correct_avg = total_weighted_sum / total_weight_sum
print(f"\n ✓ 正确的总平均: {correct_avg:.6f}")
# The printed message says the weight total is expected to equal 1.0.
print(f" 权重总和: {total_weight_sum:.6f} (应该等于1.0)")
# ========== Check: domain-weighted average (ratio_in_domain inside each domain) ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Domain维度】")

# Step 1: per-domain score, weighting each problem type by ratio_in_domain.
domain_scores_weighted = {}
for domain, scores in model_scores.items():
    type_table = qa_distribution["domain_distribution"][domain]["problem_types"]
    weighted_sum = 0
    weight_sum = 0
    for cap in ("A", "B", "C", "D"):
        share = type_table[cap]["ratio_in_domain"]
        weighted_sum += scores[cap] * share
        weight_sum += share
    domain_avg = weighted_sum / weight_sum
    domain_scores_weighted[domain] = domain_avg
    print(f" {domain}: {domain_avg:.6f}")

# Step 2: combine the per-domain scores, weighting each by the domain's qa_ratio.
domain_final_sum = 0
domain_final_weight = 0
for domain, domain_score in domain_scores_weighted.items():
    share = qa_distribution["domain_distribution"][domain]["qa_ratio"]
    domain_final_sum += domain_score * share
    domain_final_weight += share
correct_domain_avg = domain_final_sum / domain_final_weight
print(f"\n ✓ Domain维度的正确平均: {correct_domain_avg:.6f}")
# ========== Check: capability-weighted average ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Capability维度】")

# Step 1: per-capability score across domains, weighting each domain's score
# by that (domain, capability) cell's ratio_overall.
capability_scores_weighted = {}
for cap in ("A", "B", "C", "D"):
    weighted_sum = 0
    weight_sum = 0
    for domain, per_type in model_scores.items():
        share = qa_distribution["domain_distribution"][domain]["problem_types"][cap]["ratio_overall"]
        weighted_sum += per_type[cap] * share
        weight_sum += share
    capability_scores_weighted[cap] = weighted_sum / weight_sum
    print(f" Capability {cap}: {capability_scores_weighted[cap]:.6f}")

# Step 2: combine the capability scores, weighting each by its overall ratio.
overall_types = qa_distribution["overall_distribution"]["problem_types"]
cap_final_sum = 0
cap_final_weight = 0
for cap, cap_score in capability_scores_weighted.items():
    share = overall_types[cap]["ratio"]
    cap_final_sum += cap_score * share
    cap_final_weight += share
correct_cap_avg = cap_final_sum / cap_final_weight
print(f"\n ✓ Capability维度的正确平均: {correct_cap_avg:.6f}")
# ========== Summary ==========
# Prints the two "current algorithm" totals and their gap, followed by the
# three "correct algorithm" totals and whether those three agree.
print("\n" + "=" * 80)
print("【总结】")
print("=" * 80)
print(f"当前算法 - Domain平均: {current_domain_avg:.6f}")
print(f"当前算法 - Capability平均: {current_cap_avg:.6f}")
print(f"差异: {abs(current_domain_avg - current_cap_avg):.6f} ❌")
print()
print(f"正确算法 - 直接计算: {correct_avg:.6f}")
print(f"正确算法 - Domain维度: {correct_domain_avg:.6f}")
print(f"正确算法 - Capability维度: {correct_cap_avg:.6f}")

# The three totals count as consistent when both alternative roll-ups are
# within 1e-6 of the direct computation.
# NOTE(review): the input ratios are rounded to six decimals, so 1e-6 may be
# tighter than the data can guarantee - confirm the intended tolerance.
all_consistent = (
    abs(correct_avg - correct_domain_avg) < 1e-6
    and abs(correct_avg - correct_cap_avg) < 1e-6
)
# Choose the marker from the actual result so that a failed check is not
# reported with a check mark.
consistency_marker = "✓" if all_consistent else "❌"
print(f"三者是否一致: {all_consistent} {consistency_marker}")
|