#!/usr/bin/env python3
"""Verify the problem with the weighted-average calculation.

For one model's per-domain / per-capability scores, this script compares:

* the "current" algorithm, which first takes plain (unweighted) means per
  domain and per capability and only then applies the distribution weights,
  giving two different overall averages; and
* the "correct" algorithm, which weights every (domain, capability) cell by
  its share of all questions, computed three ways (directly, domain-wise,
  capability-wise) that are then compared against each other.

Every intermediate value is printed. The script runs at module top level.
"""

# Real data taken from qa_distribution.json
qa_distribution = {
    "overall_distribution": {
        "problem_types": {
            "A": {"ratio": 0.336058},  # Recall
            "B": {"ratio": 0.238782},  # Causal Inference
            "C": {"ratio": 0.259296},  # State Updating
            "D": {"ratio": 0.165865},  # State Abstraction
        }
    },
    "domain_distribution": {
        "TEXT2SQL": {
            "qa_ratio": 0.245192,
            "problem_types": {
                "A": {"ratio_in_domain": 0.364379, "ratio_overall": 0.089344},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.061298},
                "C": {"ratio_in_domain": 0.218954, "ratio_overall": 0.053686},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.040865},
            },
        },
        "SOFTWARE": {
            "qa_ratio": 0.173077,
            "problem_types": {
                "A": {"ratio_in_domain": 0.490741, "ratio_overall": 0.084936},
                "B": {"ratio_in_domain": 0.173611, "ratio_overall": 0.030048},
                "C": {"ratio_in_domain": 0.168981, "ratio_overall": 0.029247},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.028846},
            },
        },
        "WEB": {
            "qa_ratio": 0.149038,
            "problem_types": {
                "A": {"ratio_in_domain": 0.336022, "ratio_overall": 0.050080},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "D": {"ratio_in_domain": 0.163978, "ratio_overall": 0.024439},
            },
        },
        "GAME": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.333333, "ratio_overall": 0.048077},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
        "EMBODIED_AI": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.169444, "ratio_overall": 0.024439},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.416667, "ratio_overall": 0.060096},
                "D": {"ratio_in_domain": 0.163889, "ratio_overall": 0.023638},
            },
        },
        "OPENWORLD_QA": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.272222, "ratio_overall": 0.039263},
                "B": {"ratio_in_domain": 0.263889, "ratio_overall": 0.038062},
                "C": {"ratio_in_domain": 0.297222, "ratio_overall": 0.042868},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
    },
}

# Real data for the first model: Claude Haiku 3.5
model_scores = {
    "GAME": {"A": 0.5, "B": 0.458, "C": 0.564, "D": 0.583},
    "EMBODIED_AI": {"A": 0.3934, "B": 0.4667, "C": 0.34, "D": 0.0},
    "WEB": {"A": 0.4711, "B": 0.5889, "C": 0.5222, "D": 0.5932},
    "TEXT2SQL": {"A": 0.6233, "B": 0.1961, "C": 0.4328, "D": 0.1569},
    "OPENWORLD_QA": {"A": 0.6596, "B": 0.7333, "C": 0.5625, "D": 0.5},
    "SOFTWARE": {"A": 0.26, "B": 0.4366, "C": 0.1739, "D": 0.1324},
}

# Capability (problem type) keys, in the order they are reported.
CAPABILITIES = ["A", "B", "C", "D"]

# Two averages closer than this are reported as equal.
# NOTE(review): the ratios above are written with six decimal places, so a
# 1e-6 tolerance may be tighter than the data can satisfy -- confirm the
# intended tolerance.
CONSISTENCY_TOLERANCE = 1e-6


def weighted_average(pairs):
    """Return sum(score * weight) / sum(weight) over ``(score, weight)`` pairs.

    The sums are accumulated left to right with plain ``+=`` so the floating
    point result does not depend on how the built-in ``sum`` adds floats.

    Raises:
        ZeroDivisionError: if ``pairs`` is empty or the weights sum to zero.
    """
    weighted_sum = 0
    weight_sum = 0
    for score, weight in pairs:
        weighted_sum += score * weight
        weight_sum += weight
    return weighted_sum / weight_sum


def status_mark(ok):
    """Return the check mark when ``ok`` is true, otherwise the cross mark."""
    return "✓" if ok else "❌"


domain_dist = qa_distribution["domain_distribution"]
overall_types = qa_distribution["overall_distribution"]["problem_types"]

print("=" * 80)
print("测试：Claude Haiku 3.5 的平均分计算")
print("=" * 80)

# ========== Current algorithm (wrong) ==========
print("\n【当前算法 - 简单平均】")

# 1. Plain (unweighted) mean score of each domain.
domain_scores_simple = {}
for domain, scores in model_scores.items():
    avg = sum(scores.values()) / len(scores)
    domain_scores_simple[domain] = avg
    print(f" {domain}: ({scores['A']:.4f} + {scores['B']:.4f} + {scores['C']:.4f} + {scores['D']:.4f}) / 4 = {avg:.6f}")

# 2. Overall average of those means, weighted by each domain's qa_ratio.
current_domain_avg = weighted_average(
    (score, domain_dist[domain]["qa_ratio"])
    for domain, score in domain_scores_simple.items()
)
print(f"\n Domain加权平均: {current_domain_avg:.6f}")

# 3. Plain (unweighted) mean score of each capability across domains.
capability_scores_simple = {}
for cap in CAPABILITIES:
    per_domain = [scores[cap] for scores in model_scores.values()]
    avg = sum(per_domain) / len(per_domain)
    capability_scores_simple[cap] = avg
    print(f" Capability {cap}: 平均 = {avg:.6f}")

# 4. Overall average of those means, weighted by each capability's ratio.
current_cap_avg = weighted_average(
    (score, overall_types[cap]["ratio"])
    for cap, score in capability_scores_simple.items()
)
print(f"\n Capability加权平均: {current_cap_avg:.6f}")

# The mark follows the comparison instead of being hard-coded.
current_diff = abs(current_domain_avg - current_cap_avg)
current_mark = status_mark(current_diff < CONSISTENCY_TOLERANCE)
print(f"\n {current_mark} 差异: {current_diff:.6f}")

# ========== Correct algorithm ==========
print("\n" + "=" * 80)
print("【正确算法 - 使用 ratio_overall】")

# Method 1: weight every (domain, capability) cell by its ratio_overall.
# Kept as an explicit loop because each term is printed.
total_weighted_sum = 0
total_weight_sum = 0
for domain, scores in model_scores.items():
    for cap in CAPABILITIES:
        score = scores[cap]
        weight = domain_dist[domain]["problem_types"][cap]["ratio_overall"]
        total_weighted_sum += score * weight
        total_weight_sum += weight
        print(f" {domain}-{cap}: {score:.4f} * {weight:.6f} = {score * weight:.6f}")

correct_avg = total_weighted_sum / total_weight_sum
print(f"\n ✓ 正确的总平均: {correct_avg:.6f}")
print(f" 权重总和: {total_weight_sum:.6f} (应该等于1.0)")

# ========== Check: domain-wise weighted average (ratio_in_domain) ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Domain维度】")

# 1. Weighted mean of each domain using ratio_in_domain.
domain_scores_weighted = {}
for domain, scores in model_scores.items():
    domain_avg = weighted_average(
        (scores[cap], domain_dist[domain]["problem_types"][cap]["ratio_in_domain"])
        for cap in CAPABILITIES
    )
    domain_scores_weighted[domain] = domain_avg
    print(f" {domain}: {domain_avg:.6f}")

# 2. Combine the domain means using qa_ratio.
correct_domain_avg = weighted_average(
    (score, domain_dist[domain]["qa_ratio"])
    for domain, score in domain_scores_weighted.items()
)
print(f"\n ✓ Domain维度的正确平均: {correct_domain_avg:.6f}")

# ========== Check: capability-wise weighted average ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Capability维度】")

# 1. Weighted mean of each capability across domains using ratio_overall.
capability_scores_weighted = {}
for cap in CAPABILITIES:
    capability_scores_weighted[cap] = weighted_average(
        (model_scores[domain][cap], domain_dist[domain]["problem_types"][cap]["ratio_overall"])
        for domain in model_scores
    )
    print(f" Capability {cap}: {capability_scores_weighted[cap]:.6f}")

# 2. Combine the capability means using the overall ratio.
correct_cap_avg = weighted_average(
    (score, overall_types[cap]["ratio"])
    for cap, score in capability_scores_weighted.items()
)
print(f"\n ✓ Capability维度的正确平均: {correct_cap_avg:.6f}")

# ========== Summary ==========
# The three "correct" averages agree when both the domain-wise and the
# capability-wise result are within tolerance of the direct one.
is_consistent = (
    abs(correct_avg - correct_domain_avg) < CONSISTENCY_TOLERANCE
    and abs(correct_avg - correct_cap_avg) < CONSISTENCY_TOLERANCE
)
consistency_mark = status_mark(is_consistent)

print("\n" + "=" * 80)
print("【总结】")
print("=" * 80)
print(f"当前算法 - Domain平均: {current_domain_avg:.6f}")
print(f"当前算法 - Capability平均: {current_cap_avg:.6f}")
print(f"差异: {current_diff:.6f} {current_mark}")
print()
print(f"正确算法 - 直接计算: {correct_avg:.6f}")
print(f"正确算法 - Domain维度: {correct_domain_avg:.6f}")
print(f"正确算法 - Capability维度: {correct_cap_avg:.6f}")
print(f"三者是否一致: {is_consistent} {consistency_mark}")