Spaces:
Running
Running
#!/usr/bin/env python3
"""Verify the weighted-average problem in the leaderboard score aggregation.

The script compares two ways of aggregating one model's per-(domain,
capability) scores into a single number:

* the "current" algorithm, which first takes *unweighted* means along one
  dimension and only then applies QA-count weights, so aggregating by domain
  and aggregating by capability give different totals;
* the "correct" algorithm, which weights every (domain, capability) cell by
  its share of all QA items, so that the direct, per-domain and
  per-capability aggregations agree.

All results are printed to stdout; the key intermediate values are also left
in module-level variables.
"""

# Real data taken from qa_distribution.json.
qa_distribution = {
    "overall_distribution": {
        "problem_types": {
            "A": {"ratio": 0.336058},  # Recall
            "B": {"ratio": 0.238782},  # Causal Inference
            "C": {"ratio": 0.259296},  # State Updating
            "D": {"ratio": 0.165865},  # State Abstraction
        }
    },
    "domain_distribution": {
        "TEXT2SQL": {
            "qa_ratio": 0.245192,
            "problem_types": {
                "A": {"ratio_in_domain": 0.364379, "ratio_overall": 0.089344},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.061298},
                "C": {"ratio_in_domain": 0.218954, "ratio_overall": 0.053686},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.040865},
            },
        },
        "SOFTWARE": {
            "qa_ratio": 0.173077,
            "problem_types": {
                "A": {"ratio_in_domain": 0.490741, "ratio_overall": 0.084936},
                "B": {"ratio_in_domain": 0.173611, "ratio_overall": 0.030048},
                "C": {"ratio_in_domain": 0.168981, "ratio_overall": 0.029247},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.028846},
            },
        },
        "WEB": {
            "qa_ratio": 0.149038,
            "problem_types": {
                "A": {"ratio_in_domain": 0.336022, "ratio_overall": 0.050080},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.037260},
                "D": {"ratio_in_domain": 0.163978, "ratio_overall": 0.024439},
            },
        },
        "GAME": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.333333, "ratio_overall": 0.048077},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
        "EMBODIED_AI": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.169444, "ratio_overall": 0.024439},
                "B": {"ratio_in_domain": 0.250000, "ratio_overall": 0.036058},
                "C": {"ratio_in_domain": 0.416667, "ratio_overall": 0.060096},
                "D": {"ratio_in_domain": 0.163889, "ratio_overall": 0.023638},
            },
        },
        "OPENWORLD_QA": {
            "qa_ratio": 0.144231,
            "problem_types": {
                "A": {"ratio_in_domain": 0.272222, "ratio_overall": 0.039263},
                "B": {"ratio_in_domain": 0.263889, "ratio_overall": 0.038062},
                "C": {"ratio_in_domain": 0.297222, "ratio_overall": 0.042868},
                "D": {"ratio_in_domain": 0.166667, "ratio_overall": 0.024038},
            },
        },
    },
}

# Real scores of the first model on the board: Claude Haiku 3.5.
model_scores = {
    "GAME": {"A": 0.5, "B": 0.458, "C": 0.564, "D": 0.583},
    "EMBODIED_AI": {"A": 0.3934, "B": 0.4667, "C": 0.34, "D": 0.0},
    "WEB": {"A": 0.4711, "B": 0.5889, "C": 0.5222, "D": 0.5932},
    "TEXT2SQL": {"A": 0.6233, "B": 0.1961, "C": 0.4328, "D": 0.1569},
    "OPENWORLD_QA": {"A": 0.6596, "B": 0.7333, "C": 0.5625, "D": 0.5},
    "SOFTWARE": {"A": 0.26, "B": 0.4366, "C": 0.1739, "D": 0.1324},
}

# Problem types (capabilities) in reporting order.
CAPABILITIES = ("A", "B", "C", "D")

# Two averages are treated as "the same" when they differ by less than this.
# The ratios above are rounded to six decimals and are not perfectly
# self-consistent (the per-domain ratio_overall values for type A sum to
# 0.336139 while the overall ratio is listed as 0.336058), so the three
# correct aggregations differ by a few 1e-6 on this data.  A 1e-6 tolerance
# therefore rejects them, while 1e-4 is still far below the gap produced by
# the flawed algorithm (about 0.025 on this data).
CONSISTENCY_TOLERANCE = 1e-4

# Short aliases for the two weight tables.
domain_dist = qa_distribution["domain_distribution"]
overall_types = qa_distribution["overall_distribution"]["problem_types"]


def weighted_mean(pairs):
    """Return the weighted mean of ``(value, weight)`` pairs.

    Args:
        pairs: Iterable of ``(value, weight)`` tuples.

    Returns:
        ``sum(value * weight) / sum(weight)``.

    Raises:
        ValueError: If the weights sum to zero (this includes empty input).
    """
    weighted_total = 0.0
    weight_total = 0.0
    for value, weight in pairs:
        weighted_total += value * weight
        weight_total += weight
    if weight_total == 0:
        raise ValueError("weights sum to zero; weighted mean is undefined")
    return weighted_total / weight_total


def status_mark(ok):
    """Return the check mark for a passed comparison, the cross otherwise."""
    return "✓" if ok else "❌"


print("=" * 80)
print("测试:Claude Haiku 3.5 的平均分计算")
print("=" * 80)

# ========== Current algorithm (flawed) ==========
print("\n【当前算法 - 简单平均】")

# 1. Unweighted mean of the capability scores inside each domain.
domain_scores_simple = {}
for domain, scores in model_scores.items():
    avg = sum(scores.values()) / len(scores)
    domain_scores_simple[domain] = avg
    terms = " + ".join(f"{scores[cap]:.4f}" for cap in CAPABILITIES)
    print(f" {domain}: ({terms}) / {len(scores)} = {avg:.6f}")

# 2. Combine the domain means, weighting each domain by its share of QA items.
current_domain_avg = weighted_mean(
    (score, domain_dist[domain]["qa_ratio"])
    for domain, score in domain_scores_simple.items()
)
print(f"\n Domain加权平均: {current_domain_avg:.6f}")

# 3. Unweighted mean of each capability's scores across domains.
capability_scores_simple = {}
for cap in CAPABILITIES:
    cap_scores = [scores[cap] for scores in model_scores.values()]
    avg = sum(cap_scores) / len(cap_scores)
    capability_scores_simple[cap] = avg
    print(f" Capability {cap}: 平均 = {avg:.6f}")

# 4. Combine the capability means, weighting by each type's overall ratio.
current_cap_avg = weighted_mean(
    (score, overall_types[cap]["ratio"])
    for cap, score in capability_scores_simple.items()
)
print(f"\n Capability加权平均: {current_cap_avg:.6f}")

# The marker reflects the actual comparison instead of being hard-coded.
current_gap = abs(current_domain_avg - current_cap_avg)
current_consistent = current_gap < CONSISTENCY_TOLERANCE
print(f"\n {status_mark(current_consistent)} 差异: {current_gap:.6f}")

# ========== Correct algorithm ==========
print("\n" + "=" * 80)
print("【正确算法 - 使用 ratio_overall】")

# Method 1: weight every (domain, capability) cell by its ratio_overall.
# Kept as an explicit loop because each term and the weight total are printed.
total_weighted_sum = 0
total_weight_sum = 0
for domain, scores in model_scores.items():
    cell_weights = domain_dist[domain]["problem_types"]
    for cap in CAPABILITIES:
        score = scores[cap]
        weight = cell_weights[cap]["ratio_overall"]
        total_weighted_sum += score * weight
        total_weight_sum += weight
        print(f" {domain}-{cap}: {score:.4f} * {weight:.6f} = {score * weight:.6f}")
correct_avg = total_weighted_sum / total_weight_sum
print(f"\n ✓ 正确的总平均: {correct_avg:.6f}")
print(f" 权重总和: {total_weight_sum:.6f} (应该等于1.0)")

# ========== Check: domain dimension (ratio_in_domain inside each domain) ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Domain维度】")

# 1. Weighted mean inside each domain, using ratio_in_domain.
domain_scores_weighted = {}
for domain, scores in model_scores.items():
    cell_weights = domain_dist[domain]["problem_types"]
    domain_avg = weighted_mean(
        (scores[cap], cell_weights[cap]["ratio_in_domain"])
        for cap in CAPABILITIES
    )
    domain_scores_weighted[domain] = domain_avg
    print(f" {domain}: {domain_avg:.6f}")

# 2. Combine the domains, weighting by qa_ratio.
correct_domain_avg = weighted_mean(
    (score, domain_dist[domain]["qa_ratio"])
    for domain, score in domain_scores_weighted.items()
)
print(f"\n ✓ Domain维度的正确平均: {correct_domain_avg:.6f}")

# ========== Check: capability dimension ==========
print("\n" + "=" * 80)
print("【正确算法验证 - Capability维度】")

# 1. Weighted mean of each capability across domains, using ratio_overall.
capability_scores_weighted = {}
for cap in CAPABILITIES:
    capability_scores_weighted[cap] = weighted_mean(
        (
            model_scores[domain][cap],
            domain_dist[domain]["problem_types"][cap]["ratio_overall"],
        )
        for domain in model_scores
    )
    print(f" Capability {cap}: {capability_scores_weighted[cap]:.6f}")

# 2. Combine the capabilities, weighting by the overall ratio.
correct_cap_avg = weighted_mean(
    (score, overall_types[cap]["ratio"])
    for cap, score in capability_scores_weighted.items()
)
print(f"\n ✓ Capability维度的正确平均: {correct_cap_avg:.6f}")

# ========== Summary ==========
print("\n" + "=" * 80)
print("【总结】")
print("=" * 80)
print(f"当前算法 - Domain平均: {current_domain_avg:.6f}")
print(f"当前算法 - Capability平均: {current_cap_avg:.6f}")
print(f"差异: {current_gap:.6f} {status_mark(current_consistent)}")
print()
print(f"正确算法 - 直接计算: {correct_avg:.6f}")
print(f"正确算法 - Domain维度: {correct_domain_avg:.6f}")
print(f"正确算法 - Capability维度: {correct_cap_avg:.6f}")

# All three correct aggregations must agree within the data's precision; the
# printed marker follows the boolean so it can never contradict it.
results_consistent = (
    abs(correct_avg - correct_domain_avg) < CONSISTENCY_TOLERANCE
    and abs(correct_avg - correct_cap_avg) < CONSISTENCY_TOLERANCE
)
print(f"三者是否一致: {results_consistent} {status_mark(results_consistent)}")