AMA-bench-Leaderboard / test_average_calculation.py
NorahYujieZhao
fix the avg score but
60ae732
#!/usr/bin/env python3
"""Check how the leaderboard average score should be weighted.

The script compares the current "simple average" aggregation with an
aggregation weighted by ``ratio_overall`` for one model's per-domain,
per-capability scores, then reports whether the direct, domain-wise and
capability-wise computations agree.
"""

# Capability (problem type) keys, in the order they are reported.
CAPABILITIES = ("A", "B", "C", "D")

# Largest difference at which two aggregate scores still count as "the same".
# The ratios below carry only six decimal places, and the overall A/C ratios
# differ from the column sums of ``ratio_overall`` by about 8e-5, so the
# aggregates cannot be expected to agree more tightly than roughly 1e-5.
CONSISTENCY_TOLERANCE = 1e-5


def weighted_average(pairs):
    """Return the weighted mean of ``(score, weight)`` pairs.

    Args:
        pairs: Iterable of ``(score, weight)`` tuples.

    Returns:
        ``sum(score * weight) / sum(weight)``.

    Raises:
        ZeroDivisionError: If the weights sum to zero (including empty input).
    """
    weighted_sum = 0
    weight_sum = 0
    for score, weight in pairs:
        weighted_sum += score * weight
        weight_sum += weight
    return weighted_sum / weight_sum


# Real data taken from qa_distribution.json.
qa_distribution = {
    # NOTE(review): the A and C ratios here (0.336058 / 0.259296) do not equal
    # the sums of the per-domain ``ratio_overall`` values below
    # (0.336139 / 0.259215) -- confirm against the source JSON.
    "overall_distribution": {
        "problem_types": {
            "A": {"ratio": 0.336058},  # Recall
            "B": {"ratio": 0.238782},  # Causal Inference
            "C": {"ratio": 0.259296},  # State Updating
            "D": {"ratio": 0.165865},  # State Abstraction
        }
    },
    "domain_distribution": {
        "TEXT2SQL": {
            "qa_ratio": 0.245192,
            "problem_types": {
                "A": {"ratio_in_domain": 0.364379, "ratio_overall": 0.089344},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.061298},
                "C": {"ratio_in_domain": 0.218954, "ratio_overall": 0.053686},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.040865},
            },
        },
        "SOFTWARE": {
            "qa_ratio": 0.173077,
            "problem_types": {
                "A": {"ratio_in_domain": 0.490741, "ratio_overall": 0.084936},
                "B": {"ratio_in_domain": 0.173611, "ratio_overall": 0.030048},
                "C": {"ratio_in_domain": 0.168981, "ratio_overall": 0.029247},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.028846},
            },
        },
        "WEB": {
            "qa_ratio": 0.149038,
            "problem_types": {
                "A": {"ratio_in_domain": 0.336022, "ratio_overall": 0.050080},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "D": {"ratio_in_domain": 0.163978, "ratio_overall": 0.024439},
            },
        },
        "GAME": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.333333, "ratio_overall": 0.048077},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
        "EMBODIED_AI": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.169444, "ratio_overall": 0.024439},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.416667, "ratio_overall": 0.060096},
                "D": {"ratio_in_domain": 0.163889, "ratio_overall": 0.023638},
            },
        },
        "OPENWORLD_QA": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.272222, "ratio_overall": 0.039263},
                "B": {"ratio_in_domain": 0.263889, "ratio_overall": 0.038062},
                "C": {"ratio_in_domain": 0.297222, "ratio_overall": 0.042868},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
    },
}

# Real scores of the first model: Claude Haiku 3.5.
model_scores = {
    "GAME": {"A": 0.5, "B": 0.458, "C": 0.564, "D": 0.583},
    "EMBODIED_AI": {"A": 0.3934, "B": 0.4667, "C": 0.34, "D": 0.0},
    "WEB": {"A": 0.4711, "B": 0.5889, "C": 0.5222, "D": 0.5932},
    "TEXT2SQL": {"A": 0.6233, "B": 0.1961, "C": 0.4328, "D": 0.1569},
    "OPENWORLD_QA": {"A": 0.6596, "B": 0.7333, "C": 0.5625, "D": 0.5},
    "SOFTWARE": {"A": 0.26, "B": 0.4366, "C": 0.1739, "D": 0.1324},
}

# Shorthand for the two weight tables consulted throughout.
domain_distribution = qa_distribution["domain_distribution"]
overall_problem_types = qa_distribution["overall_distribution"]["problem_types"]

print("=" * 80)
print("测试:Claude Haiku 3.5 的平均分计算")
print("=" * 80)

# ========== Current algorithm (incorrect) ==========
print("\n【当前算法 - 简单平均】")

# 1. Unweighted mean of each domain's four capability scores.
domain_scores_simple = {}
for domain, scores in model_scores.items():
    avg = sum(scores.values()) / len(scores)
    domain_scores_simple[domain] = avg
    print(f" {domain}: ({scores['A']:.4f} + {scores['B']:.4f} + {scores['C']:.4f} + {scores['D']:.4f}) / 4 = {avg:.6f}")

# 2. Overall average, weighting each domain by its qa_ratio.
current_domain_avg = weighted_average(
    (score, domain_distribution[domain]["qa_ratio"])
    for domain, score in domain_scores_simple.items()
)
print(f"\n Domain加权平均: {current_domain_avg:.6f}")

# 3. Unweighted mean of each capability's scores across domains.
capability_scores_simple = {}
for cap in CAPABILITIES:
    per_domain = [scores[cap] for scores in model_scores.values()]
    avg = sum(per_domain) / len(per_domain)
    capability_scores_simple[cap] = avg
    print(f" Capability {cap}: 平均 = {avg:.6f}")

# 4. Overall average, weighting each capability by its overall ratio.
current_cap_avg = weighted_average(
    (score, overall_problem_types[cap]["ratio"])
    for cap, score in capability_scores_simple.items()
)
print(f"\n Capability加权平均: {current_cap_avg:.6f}")
print(f"\n ❌ 差异: {abs(current_domain_avg - current_cap_avg):.6f}")

# ========== Correct algorithm ==========
print("\n" + "=" * 80)
print("【正确算法 - 使用 ratio_overall】")

# Method 1: weight every (domain, capability) score by its ratio_overall.
# Kept as an explicit loop because each term and the weight total are printed.
total_weighted_sum = 0
total_weight_sum = 0
for domain, scores in model_scores.items():
    for cap in CAPABILITIES:
        score = scores[cap]
        weight = domain_distribution[domain]["problem_types"][cap]["ratio_overall"]
        total_weighted_sum += score * weight
        total_weight_sum += weight
        print(f" {domain}-{cap}: {score:.4f} * {weight:.6f} = {score * weight:.6f}")
correct_avg = total_weighted_sum / total_weight_sum
print(f"\n ✓ 正确的总平均: {correct_avg:.6f}")
print(f" 权重总和: {total_weight_sum:.6f} (应该等于1.0)")

# ========== Verification: domain-wise weighting (ratio_in_domain) ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Domain维度】")

# 1. Per-domain average weighted by ratio_in_domain.
domain_scores_weighted = {}
for domain, scores in model_scores.items():
    type_weights = domain_distribution[domain]["problem_types"]
    domain_avg = weighted_average(
        (scores[cap], type_weights[cap]["ratio_in_domain"])
        for cap in CAPABILITIES
    )
    domain_scores_weighted[domain] = domain_avg
    print(f" {domain}: {domain_avg:.6f}")

# 2. Combine the domains using qa_ratio as the weight.
correct_domain_avg = weighted_average(
    (score, domain_distribution[domain]["qa_ratio"])
    for domain, score in domain_scores_weighted.items()
)
print(f"\n ✓ Domain维度的正确平均: {correct_domain_avg:.6f}")

# ========== Verification: capability-wise weighting ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Capability维度】")

# 1. Per-capability average across domains, weighted by ratio_overall.
capability_scores_weighted = {}
for cap in CAPABILITIES:
    capability_scores_weighted[cap] = weighted_average(
        (
            model_scores[domain][cap],
            domain_distribution[domain]["problem_types"][cap]["ratio_overall"],
        )
        for domain in model_scores
    )
    print(f" Capability {cap}: {capability_scores_weighted[cap]:.6f}")

# 2. Combine the capabilities using the overall ratio as the weight.
correct_cap_avg = weighted_average(
    (score, overall_problem_types[cap]["ratio"])
    for cap, score in capability_scores_weighted.items()
)
print(f"\n ✓ Capability维度的正确平均: {correct_cap_avg:.6f}")

# ========== Summary ==========
print("\n" + "=" * 80)
print("【总结】")
print("=" * 80)
print(f"当前算法 - Domain平均: {current_domain_avg:.6f}")
print(f"当前算法 - Capability平均: {current_cap_avg:.6f}")
print(f"差异: {abs(current_domain_avg - current_cap_avg):.6f} ❌")
print()
print(f"正确算法 - 直接计算: {correct_avg:.6f}")
print(f"正确算法 - Domain维度: {correct_domain_avg:.6f}")
print(f"正确算法 - Capability维度: {correct_cap_avg:.6f}")

# Compare at the precision the input ratios support, and let the marker
# follow the outcome instead of always printing a check mark.
all_consistent = (
    abs(correct_avg - correct_domain_avg) < CONSISTENCY_TOLERANCE
    and abs(correct_avg - correct_cap_avg) < CONSISTENCY_TOLERANCE
)
print(f"三者是否一致: {all_consistent} {'✓' if all_consistent else '❌'}")