NorahYujieZhao committed on
Commit
31cd310
·
1 Parent(s): 60ae732

fix the avg score bug

Browse files
Files changed (1) hide show
  1. verify_fix.py +0 -127
verify_fix.py DELETED
@@ -1,127 +0,0 @@
#!/usr/bin/env python3
"""Verify the weighted-average score computation after the fix.

Loads the QA distribution and the first record of the model file, then
rolls that model's scores up in two ways:

* per domain: each capability score is weighted by ``ratio_in_domain``,
  then each domain score is weighted by ``qa_ratio``;
* per capability: each domain's capability score is weighted by
  ``ratio_overall``, then each capability average is weighted by the
  overall ``ratio``.

Exit status: 0 when the two overall averages agree within ``TOLERANCE``,
1 when they differ, 2 when the model file contains no records.
"""

import json
import sys

# Default input locations, relative to the working directory.
QA_DISTRIBUTION_PATH = "data/qa_distribution.json"
MODEL_PATH = "data/model.jsonl"

# Capability letter -> human-readable capability name used in the report.
CAPABILITY_MAPPING = {
    "A": "Recall",
    "B": "Causal Inference",
    "C": "State Updating",
    "D": "State Abstraction",
}

# Maximum absolute difference at which the two averages count as equal.
TOLERANCE = 1e-6


def _load_json(path: str):
    """Return the parsed content of the JSON file at *path*."""
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _load_jsonl(path: str) -> list:
    """Return one parsed object per non-blank line of the file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _merge_capability_scores(domain_score_list) -> dict:
    """Flatten a list of ``{capability_letter: score}`` dicts into one dict.

    When a letter appears more than once, the last occurrence wins.
    """
    merged = {}
    for score_dict in domain_score_list:
        merged.update(score_dict)
    return merged


def _ratio_or_zero(weighted_sum, weight_total):
    """Return ``weighted_sum / weight_total``, or 0 when there is no weight."""
    return weighted_sum / weight_total if weight_total > 0 else 0


def main(qa_path: str = QA_DISTRIBUTION_PATH, model_path: str = MODEL_PATH) -> int:
    """Run the consistency check and return the process exit status.

    Args:
        qa_path: Path of the QA distribution JSON file.
        model_path: Path of the model JSONL file; only the first record
            is checked.

    Returns:
        0 if the domain-wise and capability-wise overall averages agree
        within ``TOLERANCE``, 1 if they do not, 2 if *model_path* holds
        no records.

    Raises:
        KeyError: If a domain or capability letter found in the scores is
            missing from the distribution data.
    """
    qa_distribution = _load_json(qa_path)
    model_data = _load_jsonl(model_path)
    if not model_data:
        # Report the empty input explicitly rather than failing on [0].
        print(f"No model records found in {model_path}", file=sys.stderr)
        return 2

    # Simulate the new convert_jsonl_to_dict logic for the first model.
    model = model_data[0]
    name = model["model"]
    scores = model["Score"]
    domain_distribution = qa_distribution["domain_distribution"]

    print(f"验证模型: {name}")
    print("=" * 80)

    # Flatten each domain's scores once; both roll-ups below reuse them.
    scores_by_domain = {
        domain: _merge_capability_scores(domain_score_list)
        for domain, domain_score_list in scores.items()
    }

    # Domain scores: capabilities weighted by ratio_in_domain.
    print("\n【Domain分数计算 - 使用 ratio_in_domain】")
    domain_scores = {}
    for domain, capability_scores_for_domain in scores_by_domain.items():
        problem_types = domain_distribution[domain].get("problem_types", {})

        weighted_sum = 0
        weight_total = 0
        for cap_letter, score_value in capability_scores_for_domain.items():
            # Capabilities without distribution data carry no weight.
            if cap_letter in problem_types:
                weight = problem_types[cap_letter].get("ratio_in_domain", 0.0)
                weighted_sum += score_value * weight
                weight_total += weight

        avg_domain_score = _ratio_or_zero(weighted_sum, weight_total)
        domain_scores[domain] = avg_domain_score
        print(f" {domain:15s}: {avg_domain_score:.6f}")

    # Overall average across domains: domains weighted by qa_ratio.
    print("\n【Domain维度的总平均 - 使用 qa_ratio】")
    domain_weighted_sum = 0
    domain_weight_total = 0
    for domain, score in domain_scores.items():
        weight = domain_distribution[domain]["qa_ratio"]
        domain_weighted_sum += score * weight
        domain_weight_total += weight
        print(f" {domain:15s}: {score:.6f} * {weight:.6f} = {score * weight:.6f}")

    # Guarded like the per-domain average so empty scores do not crash.
    domain_avg = _ratio_or_zero(domain_weighted_sum, domain_weight_total)
    print(f"\n 总平均: {domain_avg:.6f}")

    # Capability scores: domains weighted by ratio_overall.
    print("\n【Capability分数计算 - 使用 ratio_overall】")
    capability_scores = dict.fromkeys(CAPABILITY_MAPPING, 0)
    capability_weights = dict.fromkeys(CAPABILITY_MAPPING, 0)

    for domain, capability_scores_for_domain in scores_by_domain.items():
        problem_types = domain_distribution[domain].get("problem_types", {})

        for cap_letter, score_value in capability_scores_for_domain.items():
            if cap_letter in problem_types:
                weight = problem_types[cap_letter].get("ratio_overall", 0.0)
                capability_scores[cap_letter] += score_value * weight
                capability_weights[cap_letter] += weight

    capability_avgs = {}
    for cap_letter, cap_name in CAPABILITY_MAPPING.items():
        avg = _ratio_or_zero(
            capability_scores[cap_letter], capability_weights[cap_letter]
        )
        capability_avgs[cap_letter] = avg
        print(f" {cap_name:20s} ({cap_letter}): {avg:.6f}")

    # Overall average across capabilities: weighted by the overall ratio.
    print("\n【Capability维度的总平均 - 使用 overall ratio】")
    overall_problem_types = qa_distribution["overall_distribution"]["problem_types"]
    cap_weighted_sum = 0
    cap_weight_total = 0
    for cap_letter, score in capability_avgs.items():
        weight = overall_problem_types[cap_letter]["ratio"]
        cap_weighted_sum += score * weight
        cap_weight_total += weight
        cap_name = CAPABILITY_MAPPING[cap_letter]
        print(f" {cap_name:20s}: {score:.6f} * {weight:.6f} = {score * weight:.6f}")

    cap_avg = _ratio_or_zero(cap_weighted_sum, cap_weight_total)
    print(f"\n 总平均: {cap_avg:.6f}")

    # Compare the two overall averages.
    difference = abs(domain_avg - cap_avg)
    print("\n" + "=" * 80)
    print("【结果对比】")
    print("=" * 80)
    print(f"Domain维度总平均: {domain_avg:.6f}")
    print(f"Capability维度总平均: {cap_avg:.6f}")
    print(f"差异: {difference:.10f}")

    if difference < TOLERANCE:
        print("\n✓ 成功!两个维度的计算结果一致!")
        return 0

    print(f"\n✗ 失败!两个维度的计算结果不一致,差异: {difference:.10f}")
    return 1


if __name__ == "__main__":
    sys.exit(main())