ChibuUkachi commited on
Commit
9e374b6
·
1 Parent(s): 7a95a38

results except chat

Browse files
every_eval_ever/aime25.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "aime25/inference-optimization/MiniMax-M2.5.w8a8/1777568692.163912",
4
+ "evaluation_timestamp": "7718166",
5
+ "retrieved_timestamp": "1777568692.163912",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w8a8",
18
+ "id": "inference-optimization/MiniMax-M2.5.w8a8",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "8"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "aime25",
38
+ "source_data": {
39
+ "dataset_name": "aime25",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "yentinglin/aime_2025",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "7721195",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.8833333333333333,
54
+ "details": {
55
+ "seed_scores": "[0.9, 0.9, 0.9, 0.9, 0.8666666666666667, 0.8333333333333334, 0.8666666666666667, 0.9]",
56
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.008908708063747477,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 8
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ },
82
+ {
83
+ "evaluation_name": "aime25",
84
+ "source_data": {
85
+ "dataset_name": "aime25",
86
+ "source_type": "hf_dataset",
87
+ "hf_repo": "yentinglin/aime_2025",
88
+ "hf_split": "train"
89
+ },
90
+ "evaluation_timestamp": "7721195",
91
+ "metric_config": {
92
+ "evaluation_description": "avg@n:n=1",
93
+ "lower_is_better": false,
94
+ "score_type": "continuous",
95
+ "min_score": 0.0,
96
+ "max_score": 1.0
97
+ },
98
+ "score_details": {
99
+ "score": 0.8833333333333333,
100
+ "details": {
101
+ "seed_scores": "[0.9, 0.9, 0.9, 0.9, 0.8666666666666667, 0.8333333333333334, 0.8666666666666667, 0.9]",
102
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
103
+ },
104
+ "uncertainty": {
105
+ "standard_error": {
106
+ "value": 0.008908708063747477,
107
+ "method": "across_seeds"
108
+ },
109
+ "num_samples": 8
110
+ }
111
+ },
112
+ "generation_config": {
113
+ "generation_args": {
114
+ "temperature": 1.0,
115
+ "top_p": 0.95,
116
+ "top_k": 40.0,
117
+ "max_tokens": 64000,
118
+ "max_attempts": 1
119
+ },
120
+ "additional_details": {
121
+ "repetition_penalty": "1.0",
122
+ "presence_penalty": "1.5",
123
+ "seed": "1234",
124
+ "min_p": "0.0"
125
+ }
126
+ }
127
+ }
128
+ ]
129
+ }
every_eval_ever/gpqa_diamond.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gpqa:diamond/inference-optimization/MiniMax-M2.5.w8a8/1777568737.410978",
4
+ "evaluation_timestamp": "7704068",
5
+ "retrieved_timestamp": "1777568737.410978",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w8a8",
18
+ "id": "inference-optimization/MiniMax-M2.5.w8a8",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "gpqa:diamond",
38
+ "source_data": {
39
+ "dataset_name": "gpqa:diamond",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "Idavidrein/gpqa",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "7708369",
45
+ "metric_config": {
46
+ "evaluation_description": "gpqa_pass@k:k=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.845117845117845,
54
+ "details": {
55
+ "seed_scores": "[0.8585858585858586, 0.8535353535353535, 0.8232323232323232]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.011039458795121212,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }
every_eval_ever/gsm8k_platinum_cot_llama.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gsm8k_platinum_cot_llama/inference-optimization/MiniMax-M2.5.w8a8/1777568615.837832",
4
+ "evaluation_timestamp": "1777483916",
5
+ "retrieved_timestamp": "1777568615.837832",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w8a8",
18
+ "id": "inference-optimization/MiniMax-M2.5.w8a8",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5.w8a8', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "gsm8k_platinum_cot_llama/strict-match",
29
+ "source_data": {
30
+ "dataset_name": "gsm8k_platinum_cot_llama",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "madrylab/gsm8k-platinum",
33
+ "hf_split": "test"
34
+ },
35
+ "evaluation_timestamp": "1777484287",
36
+ "metric_config": {
37
+ "evaluation_description": "exact_match (filter: strict-match)",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.9517507582023711,
45
+ "details": {
46
+ "seed_scores": "[0.9511993382961125, 0.9487179487179487, 0.9553349875930521]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.001929969671905174,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "do_sample": "true",
67
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "gsm8k_platinum_cot_llama/flexible-extract",
78
+ "source_data": {
79
+ "dataset_name": "gsm8k_platinum_cot_llama",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "madrylab/gsm8k-platinum",
82
+ "hf_split": "test"
83
+ },
84
+ "evaluation_timestamp": "1777484287",
85
+ "metric_config": {
86
+ "evaluation_description": "exact_match (filter: flexible-extract)",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.9567135373586987,
94
+ "details": {
95
+ "seed_scores": "[0.9536807278742763, 0.9561621174524401, 0.9602977667493796]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.0019299696719051397,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "do_sample": "true",
116
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ }
125
+ ]
126
+ }
every_eval_ever/ifeval.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "ifeval/inference-optimization/MiniMax-M2.5.w8a8/1777568653.331068",
4
+ "evaluation_timestamp": "1777485653",
5
+ "retrieved_timestamp": "1777568653.331068",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w8a8",
18
+ "id": "inference-optimization/MiniMax-M2.5.w8a8",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5.w8a8', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "ifeval",
29
+ "source_data": {
30
+ "dataset_name": "ifeval",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "google/IFEval",
33
+ "hf_split": "train"
34
+ },
35
+ "evaluation_timestamp": "1777488144",
36
+ "metric_config": {
37
+ "evaluation_description": "prompt_level_strict_acc",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.8644485520640789,
45
+ "details": {
46
+ "seed_scores": "[0.8595194085027726, 0.8576709796672828, 0.8761552680221811]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.005877629090677424,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "until": "[]",
67
+ "do_sample": "true",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "ifeval",
78
+ "source_data": {
79
+ "dataset_name": "ifeval",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "google/IFEval",
82
+ "hf_split": "train"
83
+ },
84
+ "evaluation_timestamp": "1777488144",
85
+ "metric_config": {
86
+ "evaluation_description": "inst_level_strict_acc",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.903277378097522,
94
+ "details": {
95
+ "seed_scores": "[0.8980815347721822, 0.8968824940047961, 0.9148681055155875]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.005805691065681036,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "until": "[]",
116
+ "do_sample": "true",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ },
125
+ {
126
+ "evaluation_name": "ifeval",
127
+ "source_data": {
128
+ "dataset_name": "ifeval",
129
+ "source_type": "hf_dataset",
130
+ "hf_repo": "google/IFEval",
131
+ "hf_split": "train"
132
+ },
133
+ "evaluation_timestamp": "1777488144",
134
+ "metric_config": {
135
+ "evaluation_description": "prompt_level_loose_acc",
136
+ "lower_is_better": false,
137
+ "score_type": "continuous",
138
+ "min_score": 0.0,
139
+ "max_score": 1.0
140
+ },
141
+ "score_details": {
142
+ "score": 0.895871842267406,
143
+ "details": {
144
+ "seed_scores": "[0.8817005545286506, 0.88909426987061, 0.9168207024029574]",
145
+ "seed_values": "[1234, 4158, 42]"
146
+ },
147
+ "uncertainty": {
148
+ "standard_error": {
149
+ "value": 0.010689680574798186,
150
+ "method": "across_seeds"
151
+ },
152
+ "num_samples": 3
153
+ }
154
+ },
155
+ "generation_config": {
156
+ "generation_args": {
157
+ "temperature": 1.0,
158
+ "top_p": 0.95,
159
+ "top_k": 40.0,
160
+ "max_tokens": 64000,
161
+ "max_attempts": 1
162
+ },
163
+ "additional_details": {
164
+ "until": "[]",
165
+ "do_sample": "true",
166
+ "min_p": "0.0",
167
+ "presence_penalty": "1.5",
168
+ "repetition_penalty": "1.0",
169
+ "seed": "1234",
170
+ "num_fewshot": "0"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "evaluation_name": "ifeval",
176
+ "source_data": {
177
+ "dataset_name": "ifeval",
178
+ "source_type": "hf_dataset",
179
+ "hf_repo": "google/IFEval",
180
+ "hf_split": "train"
181
+ },
182
+ "evaluation_timestamp": "1777488144",
183
+ "metric_config": {
184
+ "evaluation_description": "inst_level_loose_acc",
185
+ "lower_is_better": false,
186
+ "score_type": "continuous",
187
+ "min_score": 0.0,
188
+ "max_score": 1.0
189
+ },
190
+ "score_details": {
191
+ "score": 0.9240607513988809,
192
+ "details": {
193
+ "seed_scores": "[0.9136690647482014, 0.9172661870503597, 0.9412470023980816]",
194
+ "seed_values": "[1234, 4158, 42]"
195
+ },
196
+ "uncertainty": {
197
+ "standard_error": {
198
+ "value": 0.008655638620186958,
199
+ "method": "across_seeds"
200
+ },
201
+ "num_samples": 3
202
+ }
203
+ },
204
+ "generation_config": {
205
+ "generation_args": {
206
+ "temperature": 1.0,
207
+ "top_p": 0.95,
208
+ "top_k": 40.0,
209
+ "max_tokens": 64000,
210
+ "max_attempts": 1
211
+ },
212
+ "additional_details": {
213
+ "until": "[]",
214
+ "do_sample": "true",
215
+ "min_p": "0.0",
216
+ "presence_penalty": "1.5",
217
+ "repetition_penalty": "1.0",
218
+ "seed": "1234",
219
+ "num_fewshot": "0"
220
+ }
221
+ }
222
+ }
223
+ ]
224
+ }
every_eval_ever/math_500.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "math_500/inference-optimization/MiniMax-M2.5.w8a8/1777568712.013831",
4
+ "evaluation_timestamp": "7694693",
5
+ "retrieved_timestamp": "1777568712.013831",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w8a8",
18
+ "id": "inference-optimization/MiniMax-M2.5.w8a8",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "math_500",
38
+ "source_data": {
39
+ "dataset_name": "math_500",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "HuggingFaceH4/MATH-500",
42
+ "hf_split": "test"
43
+ },
44
+ "evaluation_timestamp": "7697322",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.8713333333333333,
54
+ "details": {
55
+ "seed_scores": "[0.872, 0.87, 0.872]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.0006666666666666673,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }