GitHub Actions commited on
Commit
2edd871
·
1 Parent(s): f4514c9

chore: sync EEE pipeline output [2026-03-28 04:56 UTC]

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. data/benchmarks/ace.json +120 -0
  2. data/benchmarks/apex-agents.json +218 -0
  3. data/benchmarks/apex-v1.json +93 -0
  4. data/benchmarks/appworld_test_normal.json +28 -0
  5. data/benchmarks/browsecompplus.json +28 -0
  6. data/benchmarks/global-mmlu-lite.json +706 -0
  7. data/benchmarks/helm_capabilities.json +797 -0
  8. data/benchmarks/helm_classic.json +1478 -0
  9. data/benchmarks/helm_instruct.json +60 -0
  10. data/benchmarks/helm_lite.json +1551 -0
  11. data/benchmarks/helm_mmlu.json +0 -0
  12. data/benchmarks/hfopenllm_v2.json +0 -0
  13. data/benchmarks/livecodebenchpro.json +274 -0
  14. data/benchmarks/reward-bench.json +0 -0
  15. data/benchmarks/swe-bench.json +28 -0
  16. data/benchmarks/tau-bench-2_airline.json +28 -0
  17. data/benchmarks/tau-bench-2_retail.json +28 -0
  18. data/benchmarks/tau-bench-2_telecom.json +28 -0
  19. data/benchmarks/terminal-bench-2.0.json +300 -0
  20. data/developers/0-hero.json +47 -0
  21. data/developers/01-ai.json +433 -0
  22. data/developers/1-800-LLMs.json +33 -0
  23. data/developers/1024m.json +33 -0
  24. data/developers/152334H.json +19 -0
  25. data/developers/1TuanPham.json +33 -0
  26. data/developers/3rd-Degree-Burn.json +61 -0
  27. data/developers/4season.json +19 -0
  28. data/developers/AALF.json +61 -0
  29. data/developers/AELLM.json +33 -0
  30. data/developers/AGI-0.json +47 -0
  31. data/developers/AI-MO.json +33 -0
  32. data/developers/AI-Sweden-Models.json +33 -0
  33. data/developers/AI4free.json +33 -0
  34. data/developers/AIDC-AI.json +19 -0
  35. data/developers/Aashraf995.json +61 -0
  36. data/developers/AbacusResearch.json +19 -0
  37. data/developers/Ahdoot.json +33 -0
  38. data/developers/Ahjeong.json +33 -0
  39. data/developers/AicoresSecurity.json +61 -0
  40. data/developers/Alepach.json +47 -0
  41. data/developers/AlephAlpha.json +59 -0
  42. data/developers/Alibaba-NLP.json +19 -0
  43. data/developers/Alibaba.json +58 -0
  44. data/developers/Alsebay.json +19 -0
  45. data/developers/Amaorynho.json +61 -0
  46. data/developers/Amu.json +33 -0
  47. data/developers/Anthropic.json +129 -0
  48. data/developers/ArliAI.json +33 -0
  49. data/developers/Arthur-LAGACHERIE.json +19 -0
  50. data/developers/Artples.json +33 -0
data/benchmarks/ace.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/Opus 4.1",
5
+ "name": "Opus 4.1",
6
+ "developer": "anthropic",
7
+ "scores": {
8
+ "Overall Score": 0.4,
9
+ "Gaming Score": 0.318
10
+ }
11
+ },
12
+ {
13
+ "model_id": "anthropic/Opus 4.5",
14
+ "name": "Opus 4.5",
15
+ "developer": "anthropic",
16
+ "scores": {
17
+ "Overall Score": 0.478,
18
+ "Gaming Score": 0.391
19
+ }
20
+ },
21
+ {
22
+ "model_id": "anthropic/Sonnet 4.5",
23
+ "name": "Sonnet 4.5",
24
+ "developer": "anthropic",
25
+ "scores": {
26
+ "Overall Score": 0.44,
27
+ "Gaming Score": 0.373
28
+ }
29
+ },
30
+ {
31
+ "model_id": "google/Gemini 2.5 Flash",
32
+ "name": "Gemini 2.5 Flash",
33
+ "developer": "google",
34
+ "scores": {
35
+ "Overall Score": 0.38,
36
+ "Gaming Score": 0.284
37
+ }
38
+ },
39
+ {
40
+ "model_id": "google/Gemini 2.5 Pro",
41
+ "name": "Gemini 2.5 Pro",
42
+ "developer": "google",
43
+ "scores": {
44
+ "Overall Score": 0.4,
45
+ "Gaming Score": 0.285
46
+ }
47
+ },
48
+ {
49
+ "model_id": "google/Gemini 3 Flash",
50
+ "name": "Gemini 3 Flash",
51
+ "developer": "google",
52
+ "scores": {
53
+ "Gaming Score": 0.415
54
+ }
55
+ },
56
+ {
57
+ "model_id": "google/Gemini 3 Pro",
58
+ "name": "Gemini 3 Pro",
59
+ "developer": "google",
60
+ "scores": {
61
+ "Overall Score": 0.47,
62
+ "Gaming Score": 0.509
63
+ }
64
+ },
65
+ {
66
+ "model_id": "openai/GPT 5",
67
+ "name": "GPT 5",
68
+ "developer": "openai",
69
+ "scores": {
70
+ "Overall Score": 0.561,
71
+ "DIY Score": 0.55,
72
+ "Food Score": 0.7,
73
+ "Gaming Score": 0.575
74
+ }
75
+ },
76
+ {
77
+ "model_id": "openai/GPT 5.1",
78
+ "name": "GPT 5.1",
79
+ "developer": "openai",
80
+ "scores": {
81
+ "Overall Score": 0.551,
82
+ "DIY Score": 0.56,
83
+ "Gaming Score": 0.61,
84
+ "Shopping Score": 0.45
85
+ }
86
+ },
87
+ {
88
+ "model_id": "openai/GPT 5.2",
89
+ "name": "GPT 5.2",
90
+ "developer": "openai",
91
+ "scores": {
92
+ "Overall Score": 0.515,
93
+ "Food Score": 0.65,
94
+ "Gaming Score": 0.578
95
+ }
96
+ },
97
+ {
98
+ "model_id": "openai/o3",
99
+ "name": "o3",
100
+ "developer": "openai",
101
+ "scores": {
102
+ "Overall Score": 0.529,
103
+ "Gaming Score": 0.585,
104
+ "Shopping Score": 0.45
105
+ }
106
+ },
107
+ {
108
+ "model_id": "openai/o3 Pro",
109
+ "name": "o3 Pro",
110
+ "developer": "openai",
111
+ "scores": {
112
+ "Overall Score": 0.552,
113
+ "DIY Score": 0.54,
114
+ "Food Score": 0.6,
115
+ "Gaming Score": 0.613,
116
+ "Shopping Score": 0.45
117
+ }
118
+ }
119
+ ]
120
+ }
data/benchmarks/apex-agents.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/Opus 4.5",
5
+ "name": "Opus 4.5",
6
+ "developer": "anthropic",
7
+ "scores": {
8
+ "Overall Pass@1": 0.184,
9
+ "Overall Pass@8": 0.34,
10
+ "Overall Mean Score": 0.348,
11
+ "Investment Banking Pass@1": 0.216,
12
+ "Management Consulting Pass@1": 0.132,
13
+ "Corporate Law Pass@1": 0.202,
14
+ "Corporate Lawyer Mean Score": 0.471
15
+ }
16
+ },
17
+ {
18
+ "model_id": "anthropic/Opus 4.6",
19
+ "name": "Opus 4.6",
20
+ "developer": "anthropic",
21
+ "scores": {
22
+ "Overall Pass@1": 0.298,
23
+ "Corporate Lawyer Mean Score": 0.502
24
+ }
25
+ },
26
+ {
27
+ "model_id": "applied-compute/Applied Compute: Small",
28
+ "name": "Applied Compute: Small",
29
+ "developer": "applied-compute",
30
+ "scores": {
31
+ "Overall Pass@1": 0.23,
32
+ "Overall Mean Score": 0.401,
33
+ "Corporate Law Pass@1": 0.266,
34
+ "Corporate Lawyer Mean Score": 0.548
35
+ }
36
+ },
37
+ {
38
+ "model_id": "google/Gemini 3 Flash",
39
+ "name": "Gemini 3 Flash",
40
+ "developer": "google",
41
+ "scores": {
42
+ "Overall Pass@1": 0.24,
43
+ "Overall Pass@8": 0.367,
44
+ "Overall Mean Score": 0.395,
45
+ "Investment Banking Pass@1": 0.267,
46
+ "Management Consulting Pass@1": 0.193,
47
+ "Corporate Law Pass@1": 0.259,
48
+ "Corporate Lawyer Mean Score": 0.524
49
+ }
50
+ },
51
+ {
52
+ "model_id": "google/Gemini 3 Pro",
53
+ "name": "Gemini 3 Pro",
54
+ "developer": "google",
55
+ "scores": {
56
+ "Overall Pass@1": 0.184,
57
+ "Overall Pass@8": 0.373,
58
+ "Overall Mean Score": 0.341,
59
+ "Investment Banking Pass@1": 0.188,
60
+ "Management Consulting Pass@1": 0.124,
61
+ "Corporate Law Pass@1": 0.239,
62
+ "Corporate Lawyer Mean Score": 0.487
63
+ }
64
+ },
65
+ {
66
+ "model_id": "google/Gemini 3.1 Pro",
67
+ "name": "Gemini 3.1 Pro",
68
+ "developer": "google",
69
+ "scores": {
70
+ "Overall Pass@1": 0.335,
71
+ "Corporate Lawyer Mean Score": 0.494
72
+ }
73
+ },
74
+ {
75
+ "model_id": "minimax/Minimax-2.5",
76
+ "name": "Minimax-2.5",
77
+ "developer": "minimax",
78
+ "scores": {
79
+ "Corporate Lawyer Mean Score": 0.339
80
+ }
81
+ },
82
+ {
83
+ "model_id": "moonshot/Kimi K2 Thinking",
84
+ "name": "Kimi K2 Thinking",
85
+ "developer": "moonshot",
86
+ "scores": {
87
+ "Overall Pass@1": 0.04,
88
+ "Overall Pass@8": 0.144,
89
+ "Overall Mean Score": 0.115,
90
+ "Investment Banking Pass@1": 0.012,
91
+ "Management Consulting Pass@1": 0.029,
92
+ "Corporate Law Pass@1": 0.08,
93
+ "Corporate Lawyer Mean Score": 0.223
94
+ }
95
+ },
96
+ {
97
+ "model_id": "moonshot/Kimi K2.5",
98
+ "name": "Kimi K2.5",
99
+ "developer": "moonshot",
100
+ "scores": {
101
+ "Corporate Lawyer Mean Score": 0.402
102
+ }
103
+ },
104
+ {
105
+ "model_id": "openai/GPT 5",
106
+ "name": "GPT 5",
107
+ "developer": "openai",
108
+ "scores": {
109
+ "Overall Pass@1": 0.183,
110
+ "Overall Pass@8": 0.31,
111
+ "Overall Mean Score": 0.329,
112
+ "Investment Banking Pass@1": 0.273,
113
+ "Management Consulting Pass@1": 0.123,
114
+ "Corporate Law Pass@1": 0.153,
115
+ "Corporate Lawyer Mean Score": 0.382
116
+ }
117
+ },
118
+ {
119
+ "model_id": "openai/GPT 5 Codex",
120
+ "name": "GPT 5 Codex",
121
+ "developer": "openai",
122
+ "scores": {
123
+ "Corporate Lawyer Mean Score": 0.362
124
+ }
125
+ },
126
+ {
127
+ "model_id": "openai/GPT 5.1",
128
+ "name": "GPT 5.1",
129
+ "developer": "openai",
130
+ "scores": {
131
+ "Corporate Lawyer Mean Score": 0.376
132
+ }
133
+ },
134
+ {
135
+ "model_id": "openai/GPT 5.1 Codex",
136
+ "name": "GPT 5.1 Codex",
137
+ "developer": "openai",
138
+ "scores": {
139
+ "Corporate Lawyer Mean Score": 0.366
140
+ }
141
+ },
142
+ {
143
+ "model_id": "openai/GPT 5.2",
144
+ "name": "GPT 5.2",
145
+ "developer": "openai",
146
+ "scores": {
147
+ "Overall Pass@1": 0.23,
148
+ "Overall Pass@8": 0.4,
149
+ "Overall Mean Score": 0.387,
150
+ "Investment Banking Pass@1": 0.273,
151
+ "Management Consulting Pass@1": 0.227,
152
+ "Corporate Law Pass@1": 0.189,
153
+ "Corporate Lawyer Mean Score": 0.443
154
+ }
155
+ },
156
+ {
157
+ "model_id": "openai/GPT 5.2 Codex",
158
+ "name": "GPT 5.2 Codex",
159
+ "developer": "openai",
160
+ "scores": {
161
+ "Overall Pass@1": 0.276,
162
+ "Corporate Lawyer Mean Score": 0.394
163
+ }
164
+ },
165
+ {
166
+ "model_id": "openai/GPT 5.3 Codex",
167
+ "name": "GPT 5.3 Codex",
168
+ "developer": "openai",
169
+ "scores": {
170
+ "Overall Pass@1": 0.317
171
+ }
172
+ },
173
+ {
174
+ "model_id": "openai/GPT OSS 120B",
175
+ "name": "GPT OSS 120B",
176
+ "developer": "openai",
177
+ "scores": {
178
+ "Overall Pass@1": 0.047,
179
+ "Overall Pass@8": 0.115,
180
+ "Overall Mean Score": 0.145,
181
+ "Investment Banking Pass@1": 0.027,
182
+ "Management Consulting Pass@1": 0.035,
183
+ "Corporate Law Pass@1": 0.078,
184
+ "Corporate Lawyer Mean Score": 0.269
185
+ }
186
+ },
187
+ {
188
+ "model_id": "xai/Grok 4",
189
+ "name": "Grok 4",
190
+ "developer": "xai",
191
+ "scores": {
192
+ "Overall Pass@1": 0.152,
193
+ "Overall Pass@8": 0.329,
194
+ "Overall Mean Score": 0.303,
195
+ "Investment Banking Pass@1": 0.17,
196
+ "Management Consulting Pass@1": 0.12,
197
+ "Corporate Law Pass@1": 0.165,
198
+ "Corporate Lawyer Mean Score": 0.41
199
+ }
200
+ },
201
+ {
202
+ "model_id": "zhipu/GLM 4.6",
203
+ "name": "GLM 4.6",
204
+ "developer": "zhipu",
205
+ "scores": {
206
+ "Corporate Lawyer Mean Score": 0.196
207
+ }
208
+ },
209
+ {
210
+ "model_id": "zhipu/GLM 4.7",
211
+ "name": "GLM 4.7",
212
+ "developer": "zhipu",
213
+ "scores": {
214
+ "Corporate Lawyer Mean Score": 0.147
215
+ }
216
+ }
217
+ ]
218
+ }
data/benchmarks/apex-v1.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/Opus 4.5",
5
+ "name": "Opus 4.5",
6
+ "developer": "anthropic",
7
+ "scores": {
8
+ "Medicine (MD) Score": 0.65
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/Gemini 2.5 Flash",
13
+ "name": "Gemini 2.5 Flash",
14
+ "developer": "google",
15
+ "scores": {
16
+ "Overall Score": 0.604
17
+ }
18
+ },
19
+ {
20
+ "model_id": "google/Gemini 3 Flash",
21
+ "name": "Gemini 3 Flash",
22
+ "developer": "google",
23
+ "scores": {
24
+ "Overall Score": 0.64,
25
+ "Consulting Score": 0.64
26
+ }
27
+ },
28
+ {
29
+ "model_id": "google/Gemini 3 Pro",
30
+ "name": "Gemini 3 Pro",
31
+ "developer": "google",
32
+ "scores": {
33
+ "Overall Score": 0.643,
34
+ "Consulting Score": 0.64,
35
+ "Investment Banking Score": 0.63
36
+ }
37
+ },
38
+ {
39
+ "model_id": "openai/GPT 4o",
40
+ "name": "GPT 4o",
41
+ "developer": "openai",
42
+ "scores": {
43
+ "Overall Score": 0.359
44
+ }
45
+ },
46
+ {
47
+ "model_id": "openai/GPT 5",
48
+ "name": "GPT 5",
49
+ "developer": "openai",
50
+ "scores": {
51
+ "Overall Score": 0.67,
52
+ "Big Law Score": 0.78,
53
+ "Medicine (MD) Score": 0.66,
54
+ "Investment Banking Score": 0.61
55
+ }
56
+ },
57
+ {
58
+ "model_id": "openai/GPT 5.1",
59
+ "name": "GPT 5.1",
60
+ "developer": "openai",
61
+ "scores": {
62
+ "Big Law Score": 0.77
63
+ }
64
+ },
65
+ {
66
+ "model_id": "openai/GPT 5.2 Pro",
67
+ "name": "GPT 5.2 Pro",
68
+ "developer": "openai",
69
+ "scores": {
70
+ "Overall Score": 0.668,
71
+ "Consulting Score": 0.64,
72
+ "Medicine (MD) Score": 0.65,
73
+ "Investment Banking Score": 0.64
74
+ }
75
+ },
76
+ {
77
+ "model_id": "openai/o3",
78
+ "name": "o3",
79
+ "developer": "openai",
80
+ "scores": {
81
+ "Big Law Score": 0.76
82
+ }
83
+ },
84
+ {
85
+ "model_id": "xai/Grok 4",
86
+ "name": "Grok 4",
87
+ "developer": "xai",
88
+ "scores": {
89
+ "Overall Score": 0.635
90
+ }
91
+ }
92
+ ]
93
+ }
data/benchmarks/appworld_test_normal.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "appworld/test_normal": 0.7
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "appworld/test_normal": 0.36
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "appworld/test_normal": 0.0
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/browsecompplus.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "browsecompplus": 0.61
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "browsecompplus": 0.57
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "browsecompplus": 0.46
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/global-mmlu-lite.json ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "alibaba/qwen3-235b-a22b-instruct-2507",
5
+ "name": "qwen3-235b-a22b-instruct-2507",
6
+ "developer": "alibaba",
7
+ "scores": {
8
+ "Global MMLU Lite": 0.8798,
9
+ "Culturally Sensitive": 0.8522,
10
+ "Culturally Agnostic": 0.9075,
11
+ "Arabic": 0.88,
12
+ "English": 0.89,
13
+ "Bengali": 0.8875,
14
+ "German": 0.885,
15
+ "French": 0.88,
16
+ "Hindi": 0.8775,
17
+ "Indonesian": 0.88,
18
+ "Italian": 0.88,
19
+ "Japanese": 0.88,
20
+ "Korean": 0.875,
21
+ "Portuguese": 0.8875,
22
+ "Spanish": 0.875,
23
+ "Swahili": 0.87,
24
+ "Yoruba": 0.8725,
25
+ "Chinese": 0.8775,
26
+ "Burmese": 0.88
27
+ }
28
+ },
29
+ {
30
+ "model_id": "anthropic/claude-3-5-haiku-20241022",
31
+ "name": "claude-3-5-haiku-20241022",
32
+ "developer": "anthropic",
33
+ "scores": {
34
+ "Global MMLU Lite": 0.6114,
35
+ "Culturally Sensitive": 0.5834,
36
+ "Culturally Agnostic": 0.6394,
37
+ "Arabic": 0.695,
38
+ "English": 0.485,
39
+ "Bengali": 0.675,
40
+ "German": 0.565,
41
+ "French": 0.61,
42
+ "Hindi": 0.6575,
43
+ "Indonesian": 0.5475,
44
+ "Italian": 0.48,
45
+ "Japanese": 0.655,
46
+ "Korean": 0.6575,
47
+ "Portuguese": 0.5225,
48
+ "Spanish": 0.485,
49
+ "Swahili": 0.69,
50
+ "Yoruba": 0.6675,
51
+ "Chinese": 0.69,
52
+ "Burmese": 0.7
53
+ }
54
+ },
55
+ {
56
+ "model_id": "anthropic/claude-3-7-sonnet-20250219",
57
+ "name": "claude-3-7-sonnet-20250219",
58
+ "developer": "anthropic",
59
+ "scores": {
60
+ "Global MMLU Lite": 0.8078,
61
+ "Culturally Sensitive": 0.7794,
62
+ "Culturally Agnostic": 0.8362,
63
+ "Arabic": 0.7925,
64
+ "English": 0.7625,
65
+ "Bengali": 0.825,
66
+ "German": 0.8125,
67
+ "French": 0.7675,
68
+ "Hindi": 0.805,
69
+ "Indonesian": 0.8175,
70
+ "Italian": 0.8225,
71
+ "Japanese": 0.8425,
72
+ "Korean": 0.83,
73
+ "Portuguese": 0.77,
74
+ "Spanish": 0.8075,
75
+ "Swahili": 0.8125,
76
+ "Yoruba": 0.81,
77
+ "Chinese": 0.835,
78
+ "Burmese": 0.8125
79
+ }
80
+ },
81
+ {
82
+ "model_id": "anthropic/claude-opus-4-1-20250805",
83
+ "name": "claude-opus-4-1-20250805",
84
+ "developer": "anthropic",
85
+ "scores": {
86
+ "Global MMLU Lite": 0.943,
87
+ "Culturally Sensitive": 0.9331,
88
+ "Culturally Agnostic": 0.9528,
89
+ "Arabic": 0.945,
90
+ "English": 0.9475,
91
+ "Bengali": 0.9425,
92
+ "German": 0.94,
93
+ "French": 0.945,
94
+ "Hindi": 0.9475,
95
+ "Indonesian": 0.9425,
96
+ "Italian": 0.94,
97
+ "Japanese": 0.94,
98
+ "Korean": 0.95,
99
+ "Portuguese": 0.945,
100
+ "Spanish": 0.945,
101
+ "Swahili": 0.93,
102
+ "Yoruba": 0.9375,
103
+ "Chinese": 0.945,
104
+ "Burmese": 0.945
105
+ }
106
+ },
107
+ {
108
+ "model_id": "anthropic/claude-sonnet-4-20250514",
109
+ "name": "claude-sonnet-4-20250514",
110
+ "developer": "anthropic",
111
+ "scores": {
112
+ "Global MMLU Lite": 0.9058,
113
+ "Culturally Sensitive": 0.8913,
114
+ "Culturally Agnostic": 0.9203,
115
+ "Arabic": 0.9125,
116
+ "English": 0.905,
117
+ "Bengali": 0.9075,
118
+ "German": 0.9125,
119
+ "French": 0.91,
120
+ "Hindi": 0.9,
121
+ "Indonesian": 0.9025,
122
+ "Italian": 0.9075,
123
+ "Japanese": 0.9,
124
+ "Korean": 0.9125,
125
+ "Portuguese": 0.91,
126
+ "Spanish": 0.9075,
127
+ "Swahili": 0.8975,
128
+ "Yoruba": 0.8975,
129
+ "Chinese": 0.9175,
130
+ "Burmese": 0.8925
131
+ }
132
+ },
133
+ {
134
+ "model_id": "cohere/aya-expanse-32b",
135
+ "name": "aya-expanse-32b",
136
+ "developer": "cohere",
137
+ "scores": {
138
+ "Global MMLU Lite": 0.7353,
139
+ "Culturally Sensitive": 0.6891,
140
+ "Culturally Agnostic": 0.7815,
141
+ "Arabic": 0.7425,
142
+ "English": 0.7544,
143
+ "Bengali": 0.7343,
144
+ "German": 0.7425,
145
+ "French": 0.7325,
146
+ "Hindi": 0.7375,
147
+ "Indonesian": 0.7594,
148
+ "Italian": 0.7305,
149
+ "Japanese": 0.7419,
150
+ "Korean": 0.7525,
151
+ "Portuguese": 0.7544,
152
+ "Spanish": 0.7362,
153
+ "Swahili": 0.7071,
154
+ "Yoruba": 0.6942,
155
+ "Chinese": 0.743,
156
+ "Burmese": 0.7025
157
+ }
158
+ },
159
+ {
160
+ "model_id": "cohere/command-a-03-2025",
161
+ "name": "command-a-03-2025",
162
+ "developer": "cohere",
163
+ "scores": {
164
+ "Global MMLU Lite": 0.8385,
165
+ "Culturally Sensitive": 0.7993,
166
+ "Culturally Agnostic": 0.8778,
167
+ "Arabic": 0.8425,
168
+ "English": 0.855,
169
+ "Bengali": 0.8225,
170
+ "German": 0.8425,
171
+ "French": 0.8375,
172
+ "Hindi": 0.8421,
173
+ "Indonesian": 0.8546,
174
+ "Italian": 0.8375,
175
+ "Japanese": 0.845,
176
+ "Korean": 0.85,
177
+ "Portuguese": 0.84,
178
+ "Spanish": 0.8525,
179
+ "Swahili": 0.8275,
180
+ "Yoruba": 0.815,
181
+ "Chinese": 0.835,
182
+ "Burmese": 0.8175
183
+ }
184
+ },
185
+ {
186
+ "model_id": "deepseek/deepseek-r1-0528",
187
+ "name": "deepseek-r1-0528",
188
+ "developer": "deepseek",
189
+ "scores": {
190
+ "Global MMLU Lite": 0.6744,
191
+ "Culturally Sensitive": 0.6672,
192
+ "Culturally Agnostic": 0.6816,
193
+ "Arabic": 0.6825,
194
+ "English": 0.715,
195
+ "Bengali": 0.655,
196
+ "German": 0.6375,
197
+ "French": 0.6925,
198
+ "Hindi": 0.6475,
199
+ "Indonesian": 0.655,
200
+ "Italian": 0.6775,
201
+ "Japanese": 0.7725,
202
+ "Korean": 0.6575,
203
+ "Portuguese": 0.635,
204
+ "Spanish": 0.7175,
205
+ "Swahili": 0.6775,
206
+ "Yoruba": 0.77,
207
+ "Chinese": 0.5075,
208
+ "Burmese": 0.69
209
+ }
210
+ },
211
+ {
212
+ "model_id": "deepseek/deepseek-v3.1",
213
+ "name": "deepseek-v3.1",
214
+ "developer": "deepseek",
215
+ "scores": {
216
+ "Global MMLU Lite": 0.8044,
217
+ "Culturally Sensitive": 0.7793,
218
+ "Culturally Agnostic": 0.8295,
219
+ "Arabic": 0.805,
220
+ "English": 0.825,
221
+ "Bengali": 0.8157,
222
+ "German": 0.7925,
223
+ "French": 0.8175,
224
+ "Hindi": 0.7569,
225
+ "Indonesian": 0.7764,
226
+ "Italian": 0.8075,
227
+ "Japanese": 0.8312,
228
+ "Korean": 0.8125,
229
+ "Portuguese": 0.8246,
230
+ "Spanish": 0.8125,
231
+ "Swahili": 0.801,
232
+ "Yoruba": 0.7831,
233
+ "Chinese": 0.8161,
234
+ "Burmese": 0.7925
235
+ }
236
+ },
237
+ {
238
+ "model_id": "google/gemini-2.5-flash",
239
+ "name": "gemini-2.5-flash",
240
+ "developer": "google",
241
+ "scores": {
242
+ "Global MMLU Lite": 0.9145,
243
+ "Culturally Sensitive": 0.9,
244
+ "Culturally Agnostic": 0.9291,
245
+ "Arabic": 0.9125,
246
+ "English": 0.9325,
247
+ "Bengali": 0.91,
248
+ "German": 0.9025,
249
+ "French": 0.91,
250
+ "Hindi": 0.925,
251
+ "Indonesian": 0.9075,
252
+ "Italian": 0.9225,
253
+ "Japanese": 0.9125,
254
+ "Korean": 0.915,
255
+ "Portuguese": 0.9125,
256
+ "Spanish": 0.9175,
257
+ "Swahili": 0.915,
258
+ "Yoruba": 0.9075,
259
+ "Chinese": 0.915,
260
+ "Burmese": 0.915
261
+ }
262
+ },
263
+ {
264
+ "model_id": "google/gemini-2.5-flash-preview-05-20",
265
+ "name": "gemini-2.5-flash-preview-05-20",
266
+ "developer": "google",
267
+ "scores": {
268
+ "Global MMLU Lite": 0.9092,
269
+ "Culturally Sensitive": 0.8925,
270
+ "Culturally Agnostic": 0.9259,
271
+ "Arabic": 0.905,
272
+ "English": 0.9225,
273
+ "Bengali": 0.91,
274
+ "German": 0.905,
275
+ "French": 0.925,
276
+ "Hindi": 0.9125,
277
+ "Indonesian": 0.9075,
278
+ "Italian": 0.89,
279
+ "Japanese": 0.9125,
280
+ "Korean": 0.9075,
281
+ "Portuguese": 0.915,
282
+ "Spanish": 0.915,
283
+ "Swahili": 0.905,
284
+ "Yoruba": 0.8825,
285
+ "Chinese": 0.93,
286
+ "Burmese": 0.9025
287
+ }
288
+ },
289
+ {
290
+ "model_id": "google/gemini-2.5-pro",
291
+ "name": "gemini-2.5-pro",
292
+ "developer": "google",
293
+ "scores": {
294
+ "Global MMLU Lite": 0.9323,
295
+ "Culturally Sensitive": 0.9241,
296
+ "Culturally Agnostic": 0.9406,
297
+ "Arabic": 0.9475,
298
+ "English": 0.9275,
299
+ "Bengali": 0.9275,
300
+ "German": 0.93,
301
+ "French": 0.9425,
302
+ "Hindi": 0.9275,
303
+ "Indonesian": 0.925,
304
+ "Italian": 0.935,
305
+ "Japanese": 0.9375,
306
+ "Korean": 0.9275,
307
+ "Portuguese": 0.93,
308
+ "Spanish": 0.94,
309
+ "Swahili": 0.9375,
310
+ "Yoruba": 0.925,
311
+ "Chinese": 0.9275,
312
+ "Burmese": 0.93
313
+ }
314
+ },
315
+ {
316
+ "model_id": "google/gemini-3-pro-preview",
317
+ "name": "gemini-3-pro-preview",
318
+ "developer": "Google",
319
+ "scores": {
320
+ "Global MMLU Lite": 0.9453,
321
+ "Culturally Sensitive": 0.9397,
322
+ "Culturally Agnostic": 0.9509,
323
+ "Arabic": 0.9475,
324
+ "English": 0.9425,
325
+ "Bengali": 0.9425,
326
+ "German": 0.94,
327
+ "French": 0.9575,
328
+ "Hindi": 0.9425,
329
+ "Indonesian": 0.955,
330
+ "Italian": 0.955,
331
+ "Japanese": 0.94,
332
+ "Korean": 0.94,
333
+ "Portuguese": 0.9425,
334
+ "Spanish": 0.9475,
335
+ "Swahili": 0.94,
336
+ "Yoruba": 0.9425,
337
+ "Chinese": 0.9475,
338
+ "Burmese": 0.9425
339
+ }
340
+ },
341
+ {
342
+ "model_id": "google/gemma-3-27b-it",
343
+ "name": "gemma-3-27b-it",
344
+ "developer": "google",
345
+ "scores": {
346
+ "Global MMLU Lite": 0.763,
347
+ "Culturally Sensitive": 0.7528,
348
+ "Culturally Agnostic": 0.7733,
349
+ "Arabic": 0.78,
350
+ "English": 0.7337,
351
+ "Bengali": 0.75,
352
+ "German": 0.775,
353
+ "French": 0.7481,
354
+ "Hindi": 0.7335,
355
+ "Indonesian": 0.7563,
356
+ "Italian": 0.75,
357
+ "Japanese": 0.7925,
358
+ "Korean": 0.798,
359
+ "Portuguese": 0.7481,
360
+ "Spanish": 0.7494,
361
+ "Swahili": 0.785,
362
+ "Yoruba": 0.7444,
363
+ "Chinese": 0.7925,
364
+ "Burmese": 0.7719
365
+ }
366
+ },
367
+ {
368
+ "model_id": "google/gemma-3-4b-it",
369
+ "name": "gemma-3-4b-it",
370
+ "developer": "google",
371
+ "scores": {
372
+ "Global MMLU Lite": 0.6511,
373
+ "Culturally Sensitive": 0.6116,
374
+ "Culturally Agnostic": 0.6906,
375
+ "Arabic": 0.6525,
376
+ "English": 0.67,
377
+ "Bengali": 0.68,
378
+ "German": 0.6525,
379
+ "French": 0.6575,
380
+ "Hindi": 0.6475,
381
+ "Indonesian": 0.6775,
382
+ "Italian": 0.6675,
383
+ "Japanese": 0.6325,
384
+ "Korean": 0.66,
385
+ "Portuguese": 0.68,
386
+ "Spanish": 0.6725,
387
+ "Swahili": 0.6075,
388
+ "Yoruba": 0.5825,
389
+ "Chinese": 0.6475,
390
+ "Burmese": 0.63
391
+ }
392
+ },
393
+ {
394
+ "model_id": "ibm/granite-4.0-h-small",
395
+ "name": "granite-4.0-h-small",
396
+ "developer": "ibm",
397
+ "scores": {
398
+ "Global MMLU Lite": 0.7503,
399
+ "Culturally Sensitive": 0.7182,
400
+ "Culturally Agnostic": 0.7826,
401
+ "Arabic": 0.7613,
402
+ "English": 0.77,
403
+ "Bengali": 0.7613,
404
+ "German": 0.755,
405
+ "French": 0.7594,
406
+ "Hindi": 0.7575,
407
+ "Indonesian": 0.7614,
408
+ "Italian": 0.7525,
409
+ "Japanese": 0.7406,
410
+ "Korean": 0.7525,
411
+ "Portuguese": 0.757,
412
+ "Spanish": 0.7638,
413
+ "Swahili": 0.7318,
414
+ "Yoruba": 0.6921,
415
+ "Chinese": 0.7475,
416
+ "Burmese": 0.7419
417
+ }
418
+ },
419
+ {
420
+ "model_id": "mistralai/mistral-medium-3",
421
+ "name": "mistral-medium-3",
422
+ "developer": "mistralai",
423
+ "scores": {
424
+ "Global MMLU Lite": 0.5511,
425
+ "Culturally Sensitive": 0.5391,
426
+ "Culturally Agnostic": 0.5631,
427
+ "Arabic": 0.455,
428
+ "English": 0.38,
429
+ "Bengali": 0.5175,
430
+ "German": 0.4775,
431
+ "French": 0.41,
432
+ "Hindi": 0.555,
433
+ "Indonesian": 0.515,
434
+ "Italian": 0.535,
435
+ "Japanese": 0.58,
436
+ "Korean": 0.595,
437
+ "Portuguese": 0.5175,
438
+ "Spanish": 0.5375,
439
+ "Swahili": 0.7075,
440
+ "Yoruba": 0.7675,
441
+ "Chinese": 0.535,
442
+ "Burmese": 0.7325
443
+ }
444
+ },
445
+ {
446
+ "model_id": "mistralai/mistral-small-2503",
447
+ "name": "mistral-small-2503",
448
+ "developer": "mistralai",
449
+ "scores": {
450
+ "Global MMLU Lite": 0.7852,
451
+ "Culturally Sensitive": 0.7537,
452
+ "Culturally Agnostic": 0.8166,
453
+ "Arabic": 0.7875,
454
+ "English": 0.8,
455
+ "Bengali": 0.7725,
456
+ "German": 0.7975,
457
+ "French": 0.8,
458
+ "Hindi": 0.795,
459
+ "Indonesian": 0.785,
460
+ "Italian": 0.805,
461
+ "Japanese": 0.77,
462
+ "Korean": 0.79,
463
+ "Portuguese": 0.7925,
464
+ "Spanish": 0.7825,
465
+ "Swahili": 0.775,
466
+ "Yoruba": 0.735,
467
+ "Chinese": 0.7925,
468
+ "Burmese": 0.7825
469
+ }
470
+ },
471
+ {
472
+ "model_id": "openai/gpt-4.1-2025-04-14",
473
+ "name": "gpt-4.1-2025-04-14",
474
+ "developer": "openai",
475
+ "scores": {
476
+ "Global MMLU Lite": 0.8755,
477
+ "Culturally Sensitive": 0.8541,
478
+ "Culturally Agnostic": 0.8969,
479
+ "Arabic": 0.88,
480
+ "English": 0.8825,
481
+ "Bengali": 0.8625,
482
+ "German": 0.875,
483
+ "French": 0.8875,
484
+ "Hindi": 0.8775,
485
+ "Indonesian": 0.885,
486
+ "Italian": 0.88,
487
+ "Japanese": 0.8725,
488
+ "Korean": 0.87,
489
+ "Portuguese": 0.875,
490
+ "Spanish": 0.885,
491
+ "Swahili": 0.8725,
492
+ "Yoruba": 0.875,
493
+ "Chinese": 0.87,
494
+ "Burmese": 0.8575
495
+ }
496
+ },
497
+ {
498
+ "model_id": "openai/gpt-5-2025-08-07",
499
+ "name": "gpt-5-2025-08-07",
500
+ "developer": "openai",
501
+ "scores": {
502
+ "Global MMLU Lite": 0.8895,
503
+ "Culturally Sensitive": 0.8913,
504
+ "Culturally Agnostic": 0.8878,
505
+ "Arabic": 0.8925,
506
+ "English": 0.8725,
507
+ "Bengali": 0.9,
508
+ "German": 0.91,
509
+ "French": 0.9075,
510
+ "Hindi": 0.865,
511
+ "Indonesian": 0.795,
512
+ "Italian": 0.9075,
513
+ "Japanese": 0.8875,
514
+ "Korean": 0.915,
515
+ "Portuguese": 0.8875,
516
+ "Spanish": 0.905,
517
+ "Swahili": 0.865,
518
+ "Yoruba": 0.9125,
519
+ "Chinese": 0.895,
520
+ "Burmese": 0.915
521
+ }
522
+ },
523
+ {
524
+ "model_id": "openai/o3-mini-2025-01-31",
525
+ "name": "o3-mini-2025-01-31",
526
+ "developer": "openai",
527
+ "scores": {
528
+ "Global MMLU Lite": 0.78,
529
+ "Culturally Sensitive": 0.765,
530
+ "Culturally Agnostic": 0.795,
531
+ "Arabic": 0.7725,
532
+ "English": 0.8025,
533
+ "Bengali": 0.77,
534
+ "German": 0.7525,
535
+ "French": 0.74,
536
+ "Hindi": 0.7525,
537
+ "Indonesian": 0.7425,
538
+ "Italian": 0.8,
539
+ "Japanese": 0.81,
540
+ "Korean": 0.8075,
541
+ "Portuguese": 0.7975,
542
+ "Spanish": 0.775,
543
+ "Swahili": 0.765,
544
+ "Yoruba": 0.7725,
545
+ "Chinese": 0.8125,
546
+ "Burmese": 0.8075
547
+ }
548
+ },
549
+ {
550
+ "model_id": "openai/o4-mini-2025-04-16",
551
+ "name": "o4-mini-2025-04-16",
552
+ "developer": "openai",
553
+ "scores": {
554
+ "Global MMLU Lite": 0.8705,
555
+ "Culturally Sensitive": 0.8503,
556
+ "Culturally Agnostic": 0.8906,
557
+ "Arabic": 0.865,
558
+ "English": 0.8675,
559
+ "Bengali": 0.8875,
560
+ "German": 0.8775,
561
+ "French": 0.87,
562
+ "Hindi": 0.87,
563
+ "Indonesian": 0.8675,
564
+ "Italian": 0.855,
565
+ "Japanese": 0.885,
566
+ "Korean": 0.88,
567
+ "Portuguese": 0.88,
568
+ "Spanish": 0.855,
569
+ "Swahili": 0.8525,
570
+ "Yoruba": 0.8525,
571
+ "Chinese": 0.89,
572
+ "Burmese": 0.8725
573
+ }
574
+ },
575
+ {
576
+ "model_id": "unknown/aya-expanse-32b",
577
+ "name": "aya-expanse-32b",
578
+ "developer": "unknown",
579
+ "scores": {
580
+ "Global MMLU Lite": 0.7353,
581
+ "Culturally Sensitive": 0.6891,
582
+ "Culturally Agnostic": 0.7815,
583
+ "Arabic": 0.7425,
584
+ "English": 0.7544,
585
+ "Bengali": 0.7343,
586
+ "German": 0.7425,
587
+ "French": 0.7325,
588
+ "Hindi": 0.7375,
589
+ "Indonesian": 0.7594,
590
+ "Italian": 0.7305,
591
+ "Japanese": 0.7419,
592
+ "Korean": 0.7525,
593
+ "Portuguese": 0.7544,
594
+ "Spanish": 0.7362,
595
+ "Swahili": 0.7071,
596
+ "Yoruba": 0.6942,
597
+ "Chinese": 0.743,
598
+ "Burmese": 0.7025
599
+ }
600
+ },
601
+ {
602
+ "model_id": "unknown/granite-4.0-h-small",
603
+ "name": "granite-4.0-h-small",
604
+ "developer": "unknown",
605
+ "scores": {
606
+ "Global MMLU Lite": 0.7503,
607
+ "Culturally Sensitive": 0.7182,
608
+ "Culturally Agnostic": 0.7826,
609
+ "Arabic": 0.7613,
610
+ "English": 0.77,
611
+ "Bengali": 0.7613,
612
+ "German": 0.755,
613
+ "French": 0.7594,
614
+ "Hindi": 0.7575,
615
+ "Indonesian": 0.7614,
616
+ "Italian": 0.7525,
617
+ "Japanese": 0.7406,
618
+ "Korean": 0.7525,
619
+ "Portuguese": 0.757,
620
+ "Spanish": 0.7638,
621
+ "Swahili": 0.7318,
622
+ "Yoruba": 0.6921,
623
+ "Chinese": 0.7475,
624
+ "Burmese": 0.7419
625
+ }
626
+ },
627
+ {
628
+ "model_id": "unknown/o4-mini-2025-04-16",
629
+ "name": "o4-mini-2025-04-16",
630
+ "developer": "unknown",
631
+ "scores": {
632
+ "Global MMLU Lite": 0.8705,
633
+ "Culturally Sensitive": 0.8503,
634
+ "Culturally Agnostic": 0.8906,
635
+ "Arabic": 0.865,
636
+ "English": 0.8675,
637
+ "Bengali": 0.8875,
638
+ "German": 0.8775,
639
+ "French": 0.87,
640
+ "Hindi": 0.87,
641
+ "Indonesian": 0.8675,
642
+ "Italian": 0.855,
643
+ "Japanese": 0.885,
644
+ "Korean": 0.88,
645
+ "Portuguese": 0.88,
646
+ "Spanish": 0.855,
647
+ "Swahili": 0.8525,
648
+ "Yoruba": 0.8525,
649
+ "Chinese": 0.89,
650
+ "Burmese": 0.8725
651
+ }
652
+ },
653
+ {
654
+ "model_id": "xai/grok-3-mini",
655
+ "name": "grok-3-mini",
656
+ "developer": "xai",
657
+ "scores": {
658
+ "Global MMLU Lite": 0.673,
659
+ "Culturally Sensitive": 0.6717,
660
+ "Culturally Agnostic": 0.6743,
661
+ "Arabic": 0.755,
662
+ "English": 0.5075,
663
+ "Bengali": 0.7355,
664
+ "German": 0.6591,
665
+ "French": 0.485,
666
+ "Hindi": 0.56,
667
+ "Indonesian": 0.725,
668
+ "Italian": 0.696,
669
+ "Japanese": 0.6575,
670
+ "Korean": 0.7325,
671
+ "Portuguese": 0.6275,
672
+ "Spanish": 0.61,
673
+ "Swahili": 0.7625,
674
+ "Yoruba": 0.8296,
675
+ "Chinese": 0.5564,
676
+ "Burmese": 0.8693
677
+ }
678
+ },
679
+ {
680
+ "model_id": "xai/grok-4-0709",
681
+ "name": "grok-4-0709",
682
+ "developer": "xai",
683
+ "scores": {
684
+ "Global MMLU Lite": 0.8881,
685
+ "Culturally Sensitive": 0.8862,
686
+ "Culturally Agnostic": 0.89,
687
+ "Arabic": 0.885,
688
+ "English": 0.905,
689
+ "Bengali": 0.8925,
690
+ "German": 0.8725,
691
+ "French": 0.875,
692
+ "Hindi": 0.8675,
693
+ "Indonesian": 0.89,
694
+ "Italian": 0.9025,
695
+ "Japanese": 0.87,
696
+ "Korean": 0.895,
697
+ "Portuguese": 0.8725,
698
+ "Spanish": 0.9075,
699
+ "Swahili": 0.91,
700
+ "Yoruba": 0.905,
701
+ "Chinese": 0.8525,
702
+ "Burmese": 0.9075
703
+ }
704
+ }
705
+ ]
706
+ }
data/benchmarks/helm_capabilities.json ADDED
@@ -0,0 +1,797 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "allenai/olmo-2-0325-32b-instruct",
5
+ "name": "OLMo 2 32B Instruct March 2025",
6
+ "developer": "allenai",
7
+ "scores": {
8
+ "Mean score": 0.475,
9
+ "MMLU-Pro": 0.414,
10
+ "GPQA": 0.287,
11
+ "IFEval": 0.78,
12
+ "WildBench": 0.734,
13
+ "Omni-MATH": 0.161
14
+ }
15
+ },
16
+ {
17
+ "model_id": "allenai/olmo-2-1124-13b-instruct",
18
+ "name": "OLMo 2 13B Instruct November 2024",
19
+ "developer": "allenai",
20
+ "scores": {
21
+ "Mean score": 0.44,
22
+ "MMLU-Pro": 0.31,
23
+ "GPQA": 0.316,
24
+ "IFEval": 0.73,
25
+ "WildBench": 0.689,
26
+ "Omni-MATH": 0.156
27
+ }
28
+ },
29
+ {
30
+ "model_id": "allenai/olmo-2-1124-7b-instruct",
31
+ "name": "OLMo 2 7B Instruct November 2024",
32
+ "developer": "allenai",
33
+ "scores": {
34
+ "Mean score": 0.405,
35
+ "MMLU-Pro": 0.292,
36
+ "GPQA": 0.296,
37
+ "IFEval": 0.693,
38
+ "WildBench": 0.628,
39
+ "Omni-MATH": 0.116
40
+ }
41
+ },
42
+ {
43
+ "model_id": "allenai/olmoe-1b-7b-0125-instruct",
44
+ "name": "OLMoE 1B-7B Instruct January 2025",
45
+ "developer": "allenai",
46
+ "scores": {
47
+ "Mean score": 0.332,
48
+ "MMLU-Pro": 0.169,
49
+ "GPQA": 0.22,
50
+ "IFEval": 0.628,
51
+ "WildBench": 0.551,
52
+ "Omni-MATH": 0.093
53
+ }
54
+ },
55
+ {
56
+ "model_id": "amazon/nova-lite-v1:0",
57
+ "name": "Amazon Nova Lite",
58
+ "developer": "amazon",
59
+ "scores": {
60
+ "Mean score": 0.551,
61
+ "MMLU-Pro": 0.6,
62
+ "GPQA": 0.397,
63
+ "IFEval": 0.776,
64
+ "WildBench": 0.75,
65
+ "Omni-MATH": 0.233
66
+ }
67
+ },
68
+ {
69
+ "model_id": "amazon/nova-micro-v1:0",
70
+ "name": "Amazon Nova Micro",
71
+ "developer": "amazon",
72
+ "scores": {
73
+ "Mean score": 0.522,
74
+ "MMLU-Pro": 0.511,
75
+ "GPQA": 0.383,
76
+ "IFEval": 0.76,
77
+ "WildBench": 0.743,
78
+ "Omni-MATH": 0.214
79
+ }
80
+ },
81
+ {
82
+ "model_id": "amazon/nova-premier-v1:0",
83
+ "name": "Amazon Nova Premier",
84
+ "developer": "amazon",
85
+ "scores": {
86
+ "Mean score": 0.637,
87
+ "MMLU-Pro": 0.726,
88
+ "GPQA": 0.518,
89
+ "IFEval": 0.803,
90
+ "WildBench": 0.788,
91
+ "Omni-MATH": 0.35
92
+ }
93
+ },
94
+ {
95
+ "model_id": "amazon/nova-pro-v1:0",
96
+ "name": "Amazon Nova Pro",
97
+ "developer": "amazon",
98
+ "scores": {
99
+ "Mean score": 0.591,
100
+ "MMLU-Pro": 0.673,
101
+ "GPQA": 0.446,
102
+ "IFEval": 0.815,
103
+ "WildBench": 0.777,
104
+ "Omni-MATH": 0.242
105
+ }
106
+ },
107
+ {
108
+ "model_id": "anthropic/claude-3-5-haiku-20241022",
109
+ "name": "claude-3-5-haiku-20241022",
110
+ "developer": "anthropic",
111
+ "scores": {
112
+ "Mean score": 0.549,
113
+ "MMLU-Pro": 0.605,
114
+ "GPQA": 0.363,
115
+ "IFEval": 0.792,
116
+ "WildBench": 0.76,
117
+ "Omni-MATH": 0.224
118
+ }
119
+ },
120
+ {
121
+ "model_id": "anthropic/claude-3-5-sonnet-20241022",
122
+ "name": "Claude 3.5 Sonnet 20241022",
123
+ "developer": "anthropic",
124
+ "scores": {
125
+ "Mean score": 0.653,
126
+ "MMLU-Pro": 0.777,
127
+ "GPQA": 0.565,
128
+ "IFEval": 0.856,
129
+ "WildBench": 0.792,
130
+ "Omni-MATH": 0.276
131
+ }
132
+ },
133
+ {
134
+ "model_id": "anthropic/claude-3-7-sonnet-20250219",
135
+ "name": "claude-3-7-sonnet-20250219",
136
+ "developer": "anthropic",
137
+ "scores": {
138
+ "Mean score": 0.674,
139
+ "MMLU-Pro": 0.784,
140
+ "GPQA": 0.608,
141
+ "IFEval": 0.834,
142
+ "WildBench": 0.814,
143
+ "Omni-MATH": 0.33
144
+ }
145
+ },
146
+ {
147
+ "model_id": "anthropic/claude-opus-4-20250514",
148
+ "name": "Claude 4 Opus 20250514",
149
+ "developer": "anthropic",
150
+ "scores": {
151
+ "Mean score": 0.757,
152
+ "MMLU-Pro": 0.859,
153
+ "GPQA": 0.666,
154
+ "IFEval": 0.918,
155
+ "WildBench": 0.833,
156
+ "Omni-MATH": 0.511
157
+ }
158
+ },
159
+ {
160
+ "model_id": "anthropic/claude-opus-4-20250514-thinking-10k",
161
+ "name": "Claude 4 Opus 20250514, extended thinking",
162
+ "developer": "anthropic",
163
+ "scores": {
164
+ "Mean score": 0.78,
165
+ "MMLU-Pro": 0.875,
166
+ "GPQA": 0.709,
167
+ "IFEval": 0.849,
168
+ "WildBench": 0.852,
169
+ "Omni-MATH": 0.616
170
+ }
171
+ },
172
+ {
173
+ "model_id": "anthropic/claude-sonnet-4-20250514",
174
+ "name": "claude-sonnet-4-20250514",
175
+ "developer": "anthropic",
176
+ "scores": {
177
+ "Mean score": 0.733,
178
+ "MMLU-Pro": 0.843,
179
+ "GPQA": 0.643,
180
+ "IFEval": 0.839,
181
+ "WildBench": 0.825,
182
+ "Omni-MATH": 0.512
183
+ }
184
+ },
185
+ {
186
+ "model_id": "anthropic/claude-sonnet-4-20250514-thinking-10k",
187
+ "name": "Claude 4 Sonnet 20250514, extended thinking",
188
+ "developer": "anthropic",
189
+ "scores": {
190
+ "Mean score": 0.766,
191
+ "MMLU-Pro": 0.843,
192
+ "GPQA": 0.706,
193
+ "IFEval": 0.84,
194
+ "WildBench": 0.838,
195
+ "Omni-MATH": 0.602
196
+ }
197
+ },
198
+ {
199
+ "model_id": "deepseek-ai/deepseek-r1-0528",
200
+ "name": "DeepSeek-R1-0528",
201
+ "developer": "deepseek-ai",
202
+ "scores": {
203
+ "Mean score": 0.699,
204
+ "MMLU-Pro": 0.793,
205
+ "GPQA": 0.666,
206
+ "IFEval": 0.784,
207
+ "WildBench": 0.828,
208
+ "Omni-MATH": 0.424
209
+ }
210
+ },
211
+ {
212
+ "model_id": "deepseek-ai/deepseek-v3",
213
+ "name": "DeepSeek v3",
214
+ "developer": "deepseek-ai",
215
+ "scores": {
216
+ "Mean score": 0.665,
217
+ "MMLU-Pro": 0.723,
218
+ "GPQA": 0.538,
219
+ "IFEval": 0.832,
220
+ "WildBench": 0.831,
221
+ "Omni-MATH": 0.403
222
+ }
223
+ },
224
+ {
225
+ "model_id": "google/gemini-1.5-flash-002",
226
+ "name": "Gemini 1.5 Flash 002",
227
+ "developer": "google",
228
+ "scores": {
229
+ "Mean score": 0.609,
230
+ "MMLU-Pro": 0.678,
231
+ "GPQA": 0.437,
232
+ "IFEval": 0.831,
233
+ "WildBench": 0.792,
234
+ "Omni-MATH": 0.305
235
+ }
236
+ },
237
+ {
238
+ "model_id": "google/gemini-1.5-pro-002",
239
+ "name": "Gemini 1.5 Pro 002",
240
+ "developer": "google",
241
+ "scores": {
242
+ "Mean score": 0.657,
243
+ "MMLU-Pro": 0.737,
244
+ "GPQA": 0.534,
245
+ "IFEval": 0.837,
246
+ "WildBench": 0.813,
247
+ "Omni-MATH": 0.364
248
+ }
249
+ },
250
+ {
251
+ "model_id": "google/gemini-2.0-flash-001",
252
+ "name": "Gemini 2.0 Flash",
253
+ "developer": "google",
254
+ "scores": {
255
+ "Mean score": 0.679,
256
+ "MMLU-Pro": 0.737,
257
+ "GPQA": 0.556,
258
+ "IFEval": 0.841,
259
+ "WildBench": 0.8,
260
+ "Omni-MATH": 0.459
261
+ }
262
+ },
263
+ {
264
+ "model_id": "google/gemini-2.0-flash-lite-preview-02-05",
265
+ "name": "Gemini 2.0 Flash Lite 02-05 preview",
266
+ "developer": "google",
267
+ "scores": {
268
+ "Mean score": 0.642,
269
+ "MMLU-Pro": 0.72,
270
+ "GPQA": 0.5,
271
+ "IFEval": 0.824,
272
+ "WildBench": 0.79,
273
+ "Omni-MATH": 0.374
274
+ }
275
+ },
276
+ {
277
+ "model_id": "google/gemini-2.5-flash-lite",
278
+ "name": "Gemini 2.5 Flash-Lite",
279
+ "developer": "google",
280
+ "scores": {
281
+ "Mean score": 0.591,
282
+ "MMLU-Pro": 0.537,
283
+ "GPQA": 0.309,
284
+ "IFEval": 0.81,
285
+ "WildBench": 0.818,
286
+ "Omni-MATH": 0.48
287
+ }
288
+ },
289
+ {
290
+ "model_id": "google/gemini-2.5-flash-preview-04-17",
291
+ "name": "Gemini 2.5 Flash 04-17 preview",
292
+ "developer": "google",
293
+ "scores": {
294
+ "Mean score": 0.626,
295
+ "MMLU-Pro": 0.639,
296
+ "GPQA": 0.39,
297
+ "IFEval": 0.898,
298
+ "WildBench": 0.817,
299
+ "Omni-MATH": 0.384
300
+ }
301
+ },
302
+ {
303
+ "model_id": "google/gemini-2.5-pro-preview-03-25",
304
+ "name": "Gemini 2.5 Pro 03-25 preview",
305
+ "developer": "google",
306
+ "scores": {
307
+ "Mean score": 0.745,
308
+ "MMLU-Pro": 0.863,
309
+ "GPQA": 0.749,
310
+ "IFEval": 0.84,
311
+ "WildBench": 0.857,
312
+ "Omni-MATH": 0.416
313
+ }
314
+ },
315
+ {
316
+ "model_id": "ibm/granite-3.3-8b-instruct",
317
+ "name": "IBM Granite 3.3 8B Instruct",
318
+ "developer": "ibm",
319
+ "scores": {
320
+ "Mean score": 0.463,
321
+ "MMLU-Pro": 0.343,
322
+ "GPQA": 0.325,
323
+ "IFEval": 0.729,
324
+ "WildBench": 0.741,
325
+ "Omni-MATH": 0.176
326
+ }
327
+ },
328
+ {
329
+ "model_id": "marin-community/marin-8b-instruct",
330
+ "name": "Marin 8B Instruct",
331
+ "developer": "marin-community",
332
+ "scores": {
333
+ "Mean score": 0.325,
334
+ "MMLU-Pro": 0.188,
335
+ "GPQA": 0.168,
336
+ "IFEval": 0.632,
337
+ "WildBench": 0.477,
338
+ "Omni-MATH": 0.16
339
+ }
340
+ },
341
+ {
342
+ "model_id": "meta/llama-3.1-405b-instruct-turbo",
343
+ "name": "Llama 3.1 Instruct Turbo 405B",
344
+ "developer": "meta",
345
+ "scores": {
346
+ "Mean score": 0.618,
347
+ "MMLU-Pro": 0.723,
348
+ "GPQA": 0.522,
349
+ "IFEval": 0.811,
350
+ "WildBench": 0.783,
351
+ "Omni-MATH": 0.249
352
+ }
353
+ },
354
+ {
355
+ "model_id": "meta/llama-3.1-70b-instruct-turbo",
356
+ "name": "Llama 3.1 Instruct Turbo 70B",
357
+ "developer": "meta",
358
+ "scores": {
359
+ "Mean score": 0.574,
360
+ "MMLU-Pro": 0.653,
361
+ "GPQA": 0.426,
362
+ "IFEval": 0.821,
363
+ "WildBench": 0.758,
364
+ "Omni-MATH": 0.21
365
+ }
366
+ },
367
+ {
368
+ "model_id": "meta/llama-3.1-8b-instruct-turbo",
369
+ "name": "Llama 3.1 Instruct Turbo 8B",
370
+ "developer": "meta",
371
+ "scores": {
372
+ "Mean score": 0.444,
373
+ "MMLU-Pro": 0.406,
374
+ "GPQA": 0.247,
375
+ "IFEval": 0.743,
376
+ "WildBench": 0.686,
377
+ "Omni-MATH": 0.137
378
+ }
379
+ },
380
+ {
381
+ "model_id": "meta/llama-4-maverick-17b-128e-instruct-fp8",
382
+ "name": "Llama 4 Maverick 17Bx128E Instruct FP8",
383
+ "developer": "meta",
384
+ "scores": {
385
+ "Mean score": 0.718,
386
+ "MMLU-Pro": 0.81,
387
+ "GPQA": 0.65,
388
+ "IFEval": 0.908,
389
+ "WildBench": 0.8,
390
+ "Omni-MATH": 0.422
391
+ }
392
+ },
393
+ {
394
+ "model_id": "meta/llama-4-scout-17b-16e-instruct",
395
+ "name": "Llama 4 Scout 17Bx16E Instruct",
396
+ "developer": "meta",
397
+ "scores": {
398
+ "Mean score": 0.644,
399
+ "MMLU-Pro": 0.742,
400
+ "GPQA": 0.507,
401
+ "IFEval": 0.818,
402
+ "WildBench": 0.779,
403
+ "Omni-MATH": 0.373
404
+ }
405
+ },
406
+ {
407
+ "model_id": "mistralai/mistral-7b-instruct-v0.3",
408
+ "name": "Mistral Instruct v0.3 7B",
409
+ "developer": "mistralai",
410
+ "scores": {
411
+ "Mean score": 0.376,
412
+ "MMLU-Pro": 0.277,
413
+ "GPQA": 0.303,
414
+ "IFEval": 0.567,
415
+ "WildBench": 0.66,
416
+ "Omni-MATH": 0.072
417
+ }
418
+ },
419
+ {
420
+ "model_id": "mistralai/mistral-large-2411",
421
+ "name": "Mistral Large 2411",
422
+ "developer": "mistralai",
423
+ "scores": {
424
+ "Mean score": 0.598,
425
+ "MMLU-Pro": 0.599,
426
+ "GPQA": 0.435,
427
+ "IFEval": 0.876,
428
+ "WildBench": 0.801,
429
+ "Omni-MATH": 0.281
430
+ }
431
+ },
432
+ {
433
+ "model_id": "mistralai/mistral-small-2503",
434
+ "name": "mistral-small-2503",
435
+ "developer": "mistralai",
436
+ "scores": {
437
+ "Mean score": 0.558,
438
+ "MMLU-Pro": 0.61,
439
+ "GPQA": 0.392,
440
+ "IFEval": 0.75,
441
+ "WildBench": 0.788,
442
+ "Omni-MATH": 0.248
443
+ }
444
+ },
445
+ {
446
+ "model_id": "mistralai/mixtral-8x22b-instruct-v0.1",
447
+ "name": "Mixtral Instruct 8x22B",
448
+ "developer": "mistralai",
449
+ "scores": {
450
+ "Mean score": 0.478,
451
+ "MMLU-Pro": 0.46,
452
+ "GPQA": 0.334,
453
+ "IFEval": 0.724,
454
+ "WildBench": 0.711,
455
+ "Omni-MATH": 0.163
456
+ }
457
+ },
458
+ {
459
+ "model_id": "mistralai/mixtral-8x7b-instruct-v0.1",
460
+ "name": "Mixtral Instruct 8x7B",
461
+ "developer": "mistralai",
462
+ "scores": {
463
+ "Mean score": 0.397,
464
+ "MMLU-Pro": 0.335,
465
+ "GPQA": 0.296,
466
+ "IFEval": 0.575,
467
+ "WildBench": 0.673,
468
+ "Omni-MATH": 0.105
469
+ }
470
+ },
471
+ {
472
+ "model_id": "moonshotai/kimi-k2-instruct",
473
+ "name": "Kimi K2 Instruct",
474
+ "developer": "moonshotai",
475
+ "scores": {
476
+ "Mean score": 0.768,
477
+ "MMLU-Pro": 0.819,
478
+ "GPQA": 0.652,
479
+ "IFEval": 0.85,
480
+ "WildBench": 0.862,
481
+ "Omni-MATH": 0.654
482
+ }
483
+ },
484
+ {
485
+ "model_id": "openai/gpt-4.1-2025-04-14",
486
+ "name": "gpt-4.1-2025-04-14",
487
+ "developer": "openai",
488
+ "scores": {
489
+ "Mean score": 0.727,
490
+ "MMLU-Pro": 0.811,
491
+ "GPQA": 0.659,
492
+ "IFEval": 0.838,
493
+ "WildBench": 0.854,
494
+ "Omni-MATH": 0.471
495
+ }
496
+ },
497
+ {
498
+ "model_id": "openai/gpt-4.1-mini-2025-04-14",
499
+ "name": "GPT-4.1 mini 2025-04-14",
500
+ "developer": "openai",
501
+ "scores": {
502
+ "Mean score": 0.726,
503
+ "MMLU-Pro": 0.783,
504
+ "GPQA": 0.614,
505
+ "IFEval": 0.904,
506
+ "WildBench": 0.838,
507
+ "Omni-MATH": 0.491
508
+ }
509
+ },
510
+ {
511
+ "model_id": "openai/gpt-4.1-nano-2025-04-14",
512
+ "name": "GPT-4.1 nano 2025-04-14",
513
+ "developer": "openai",
514
+ "scores": {
515
+ "Mean score": 0.616,
516
+ "MMLU-Pro": 0.55,
517
+ "GPQA": 0.507,
518
+ "IFEval": 0.843,
519
+ "WildBench": 0.811,
520
+ "Omni-MATH": 0.367
521
+ }
522
+ },
523
+ {
524
+ "model_id": "openai/gpt-4o-2024-11-20",
525
+ "name": "GPT-4o 2024-11-20",
526
+ "developer": "openai",
527
+ "scores": {
528
+ "Mean score": 0.634,
529
+ "MMLU-Pro": 0.713,
530
+ "GPQA": 0.52,
531
+ "IFEval": 0.817,
532
+ "WildBench": 0.828,
533
+ "Omni-MATH": 0.293
534
+ }
535
+ },
536
+ {
537
+ "model_id": "openai/gpt-4o-mini-2024-07-18",
538
+ "name": "GPT-4o mini 2024-07-18",
539
+ "developer": "openai",
540
+ "scores": {
541
+ "Mean score": 0.565,
542
+ "MMLU-Pro": 0.603,
543
+ "GPQA": 0.368,
544
+ "IFEval": 0.782,
545
+ "WildBench": 0.791,
546
+ "Omni-MATH": 0.28
547
+ }
548
+ },
549
+ {
550
+ "model_id": "openai/gpt-5-2025-08-07",
551
+ "name": "gpt-5-2025-08-07",
552
+ "developer": "openai",
553
+ "scores": {
554
+ "Mean score": 0.807,
555
+ "MMLU-Pro": 0.863,
556
+ "GPQA": 0.791,
557
+ "IFEval": 0.875,
558
+ "WildBench": 0.857,
559
+ "Omni-MATH": 0.647
560
+ }
561
+ },
562
+ {
563
+ "model_id": "openai/gpt-5-mini-2025-08-07",
564
+ "name": "GPT-5 mini 2025-08-07",
565
+ "developer": "openai",
566
+ "scores": {
567
+ "Mean score": 0.819,
568
+ "MMLU-Pro": 0.835,
569
+ "GPQA": 0.756,
570
+ "IFEval": 0.927,
571
+ "WildBench": 0.855,
572
+ "Omni-MATH": 0.722
573
+ }
574
+ },
575
+ {
576
+ "model_id": "openai/gpt-5-nano-2025-08-07",
577
+ "name": "GPT-5 nano 2025-08-07",
578
+ "developer": "openai",
579
+ "scores": {
580
+ "Mean score": 0.748,
581
+ "MMLU-Pro": 0.778,
582
+ "GPQA": 0.679,
583
+ "IFEval": 0.932,
584
+ "WildBench": 0.806,
585
+ "Omni-MATH": 0.547
586
+ }
587
+ },
588
+ {
589
+ "model_id": "openai/gpt-oss-120b",
590
+ "name": "gpt-oss-120b",
591
+ "developer": "openai",
592
+ "scores": {
593
+ "Mean score": 0.77,
594
+ "MMLU-Pro": 0.795,
595
+ "GPQA": 0.684,
596
+ "IFEval": 0.836,
597
+ "WildBench": 0.845,
598
+ "Omni-MATH": 0.688
599
+ }
600
+ },
601
+ {
602
+ "model_id": "openai/gpt-oss-20b",
603
+ "name": "gpt-oss-20b",
604
+ "developer": "openai",
605
+ "scores": {
606
+ "Mean score": 0.674,
607
+ "MMLU-Pro": 0.74,
608
+ "GPQA": 0.594,
609
+ "IFEval": 0.732,
610
+ "WildBench": 0.737,
611
+ "Omni-MATH": 0.565
612
+ }
613
+ },
614
+ {
615
+ "model_id": "openai/o3-2025-04-16",
616
+ "name": "o3 2025-04-16",
617
+ "developer": "openai",
618
+ "scores": {
619
+ "Mean score": 0.811,
620
+ "MMLU-Pro": 0.859,
621
+ "GPQA": 0.753,
622
+ "IFEval": 0.869,
623
+ "WildBench": 0.861,
624
+ "Omni-MATH": 0.714
625
+ }
626
+ },
627
+ {
628
+ "model_id": "openai/o4-mini-2025-04-16",
629
+ "name": "o4-mini-2025-04-16",
630
+ "developer": "openai",
631
+ "scores": {
632
+ "Mean score": 0.812,
633
+ "MMLU-Pro": 0.82,
634
+ "GPQA": 0.735,
635
+ "IFEval": 0.929,
636
+ "WildBench": 0.854,
637
+ "Omni-MATH": 0.72
638
+ }
639
+ },
640
+ {
641
+ "model_id": "qwen/qwen2.5-72b-instruct-turbo",
642
+ "name": "Qwen2.5 Instruct Turbo 72B",
643
+ "developer": "qwen",
644
+ "scores": {
645
+ "Mean score": 0.599,
646
+ "MMLU-Pro": 0.631,
647
+ "GPQA": 0.426,
648
+ "IFEval": 0.806,
649
+ "WildBench": 0.802,
650
+ "Omni-MATH": 0.33
651
+ }
652
+ },
653
+ {
654
+ "model_id": "qwen/qwen2.5-7b-instruct-turbo",
655
+ "name": "Qwen2.5 Instruct Turbo 7B",
656
+ "developer": "qwen",
657
+ "scores": {
658
+ "Mean score": 0.529,
659
+ "MMLU-Pro": 0.539,
660
+ "GPQA": 0.341,
661
+ "IFEval": 0.741,
662
+ "WildBench": 0.731,
663
+ "Omni-MATH": 0.294
664
+ }
665
+ },
666
+ {
667
+ "model_id": "qwen/qwen3-235b-a22b-fp8-tput",
668
+ "name": "Qwen3 235B A22B FP8 Throughput",
669
+ "developer": "qwen",
670
+ "scores": {
671
+ "Mean score": 0.726,
672
+ "MMLU-Pro": 0.817,
673
+ "GPQA": 0.623,
674
+ "IFEval": 0.816,
675
+ "WildBench": 0.828,
676
+ "Omni-MATH": 0.548
677
+ }
678
+ },
679
+ {
680
+ "model_id": "qwen/qwen3-235b-a22b-instruct-2507-fp8",
681
+ "name": "Qwen3 235B A22B Instruct 2507 FP8",
682
+ "developer": "qwen",
683
+ "scores": {
684
+ "Mean score": 0.798,
685
+ "MMLU-Pro": 0.844,
686
+ "GPQA": 0.726,
687
+ "IFEval": 0.835,
688
+ "WildBench": 0.866,
689
+ "Omni-MATH": 0.718
690
+ }
691
+ },
692
+ {
693
+ "model_id": "writer/palmyra-fin",
694
+ "name": "Palmyra Fin",
695
+ "developer": "writer",
696
+ "scores": {
697
+ "Mean score": 0.577,
698
+ "MMLU-Pro": 0.591,
699
+ "GPQA": 0.422,
700
+ "IFEval": 0.793,
701
+ "WildBench": 0.783,
702
+ "Omni-MATH": 0.295
703
+ }
704
+ },
705
+ {
706
+ "model_id": "writer/palmyra-med",
707
+ "name": "Palmyra Med",
708
+ "developer": "writer",
709
+ "scores": {
710
+ "Mean score": 0.476,
711
+ "MMLU-Pro": 0.411,
712
+ "GPQA": 0.368,
713
+ "IFEval": 0.767,
714
+ "WildBench": 0.676,
715
+ "Omni-MATH": 0.156
716
+ }
717
+ },
718
+ {
719
+ "model_id": "writer/palmyra-x-004",
720
+ "name": "Palmyra-X-004",
721
+ "developer": "writer",
722
+ "scores": {
723
+ "Mean score": 0.609,
724
+ "MMLU-Pro": 0.657,
725
+ "GPQA": 0.395,
726
+ "IFEval": 0.872,
727
+ "WildBench": 0.802,
728
+ "Omni-MATH": 0.32
729
+ }
730
+ },
731
+ {
732
+ "model_id": "writer/palmyra-x5",
733
+ "name": "Palmyra X5",
734
+ "developer": "writer",
735
+ "scores": {
736
+ "Mean score": 0.696,
737
+ "MMLU-Pro": 0.804,
738
+ "GPQA": 0.661,
739
+ "IFEval": 0.823,
740
+ "WildBench": 0.78,
741
+ "Omni-MATH": 0.414
742
+ }
743
+ },
744
+ {
745
+ "model_id": "xai/grok-3-beta",
746
+ "name": "Grok 3 Beta",
747
+ "developer": "xai",
748
+ "scores": {
749
+ "Mean score": 0.727,
750
+ "MMLU-Pro": 0.788,
751
+ "GPQA": 0.65,
752
+ "IFEval": 0.884,
753
+ "WildBench": 0.849,
754
+ "Omni-MATH": 0.464
755
+ }
756
+ },
757
+ {
758
+ "model_id": "xai/grok-3-mini-beta",
759
+ "name": "Grok 3 mini Beta",
760
+ "developer": "xai",
761
+ "scores": {
762
+ "Mean score": 0.679,
763
+ "MMLU-Pro": 0.799,
764
+ "GPQA": 0.675,
765
+ "IFEval": 0.951,
766
+ "WildBench": 0.651,
767
+ "Omni-MATH": 0.318
768
+ }
769
+ },
770
+ {
771
+ "model_id": "xai/grok-4-0709",
772
+ "name": "grok-4-0709",
773
+ "developer": "xai",
774
+ "scores": {
775
+ "Mean score": 0.785,
776
+ "MMLU-Pro": 0.851,
777
+ "GPQA": 0.726,
778
+ "IFEval": 0.949,
779
+ "WildBench": 0.797,
780
+ "Omni-MATH": 0.603
781
+ }
782
+ },
783
+ {
784
+ "model_id": "zai-org/glm-4.5-air-fp8",
785
+ "name": "GLM-4.5-Air-FP8",
786
+ "developer": "zai-org",
787
+ "scores": {
788
+ "Mean score": 0.67,
789
+ "MMLU-Pro": 0.762,
790
+ "GPQA": 0.594,
791
+ "IFEval": 0.812,
792
+ "WildBench": 0.789,
793
+ "Omni-MATH": 0.391
794
+ }
795
+ }
796
+ ]
797
+ }
data/benchmarks/helm_classic.json ADDED
@@ -0,0 +1,1478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "Anthropic-LM-v4-s3-52B",
5
+ "name": "Anthropic-LM v4-s3 52B",
6
+ "developer": "unknown",
7
+ "scores": {
8
+ "Mean win rate": 0.78,
9
+ "MMLU": 0.481,
10
+ "BoolQ": 0.815,
11
+ "NarrativeQA": 0.728,
12
+ "NaturalQuestions (open-book)": 0.686,
13
+ "QuAC": 0.431,
14
+ "HellaSwag": 0.807,
15
+ "OpenbookQA": 0.558,
16
+ "TruthfulQA": 0.368,
17
+ "MS MARCO (TREC)": -1.0,
18
+ "CNN/DailyMail": 0.154,
19
+ "XSUM": 0.134,
20
+ "IMDB": 0.934,
21
+ "CivilComments": 0.61,
22
+ "RAFT": 0.699
23
+ }
24
+ },
25
+ {
26
+ "model_id": "ai21/J1-Grande-v1-17B",
27
+ "name": "J1-Grande v1 17B",
28
+ "developer": "ai21",
29
+ "scores": {
30
+ "Mean win rate": 0.433,
31
+ "MMLU": 0.27,
32
+ "BoolQ": 0.722,
33
+ "NarrativeQA": 0.672,
34
+ "NaturalQuestions (open-book)": 0.578,
35
+ "QuAC": 0.362,
36
+ "HellaSwag": 0.739,
37
+ "OpenbookQA": 0.52,
38
+ "TruthfulQA": 0.193,
39
+ "MS MARCO (TREC)": 0.341,
40
+ "CNN/DailyMail": 0.143,
41
+ "XSUM": 0.122,
42
+ "IMDB": 0.953,
43
+ "CivilComments": 0.529,
44
+ "RAFT": 0.658
45
+ }
46
+ },
47
+ {
48
+ "model_id": "ai21/J1-Grande-v2-beta-17B",
49
+ "name": "J1-Grande v2 beta 17B",
50
+ "developer": "ai21",
51
+ "scores": {
52
+ "Mean win rate": 0.706,
53
+ "MMLU": 0.445,
54
+ "BoolQ": 0.812,
55
+ "NarrativeQA": 0.725,
56
+ "NaturalQuestions (open-book)": 0.625,
57
+ "QuAC": 0.392,
58
+ "HellaSwag": 0.764,
59
+ "OpenbookQA": 0.56,
60
+ "TruthfulQA": 0.306,
61
+ "MS MARCO (TREC)": 0.46,
62
+ "CNN/DailyMail": 0.146,
63
+ "XSUM": 0.152,
64
+ "IMDB": 0.957,
65
+ "CivilComments": 0.546,
66
+ "RAFT": 0.679
67
+ }
68
+ },
69
+ {
70
+ "model_id": "ai21/J1-Jumbo-v1-178B",
71
+ "name": "J1-Jumbo v1 178B",
72
+ "developer": "ai21",
73
+ "scores": {
74
+ "Mean win rate": 0.517,
75
+ "MMLU": 0.259,
76
+ "BoolQ": 0.776,
77
+ "NarrativeQA": 0.695,
78
+ "NaturalQuestions (open-book)": 0.595,
79
+ "QuAC": 0.358,
80
+ "HellaSwag": 0.765,
81
+ "OpenbookQA": 0.534,
82
+ "TruthfulQA": 0.175,
83
+ "MS MARCO (TREC)": 0.363,
84
+ "CNN/DailyMail": 0.144,
85
+ "XSUM": 0.129,
86
+ "IMDB": 0.943,
87
+ "CivilComments": 0.553,
88
+ "RAFT": 0.681
89
+ }
90
+ },
91
+ {
92
+ "model_id": "ai21/J1-Large-v1-7.5B",
93
+ "name": "J1-Large v1 7.5B",
94
+ "developer": "ai21",
95
+ "scores": {
96
+ "Mean win rate": 0.285,
97
+ "MMLU": 0.241,
98
+ "BoolQ": 0.683,
99
+ "NarrativeQA": 0.623,
100
+ "NaturalQuestions (open-book)": 0.532,
101
+ "QuAC": 0.328,
102
+ "HellaSwag": 0.7,
103
+ "OpenbookQA": 0.514,
104
+ "TruthfulQA": 0.197,
105
+ "MS MARCO (TREC)": 0.292,
106
+ "CNN/DailyMail": 0.134,
107
+ "XSUM": 0.102,
108
+ "IMDB": 0.956,
109
+ "CivilComments": 0.532,
110
+ "RAFT": 0.545
111
+ }
112
+ },
113
+ {
114
+ "model_id": "ai21/Jurassic-2-Grande-17B",
115
+ "name": "Jurassic-2 Grande 17B",
116
+ "developer": "ai21",
117
+ "scores": {
118
+ "Mean win rate": 0.743,
119
+ "MMLU": 0.475,
120
+ "BoolQ": 0.826,
121
+ "NarrativeQA": 0.737,
122
+ "NaturalQuestions (open-book)": 0.639,
123
+ "QuAC": 0.418,
124
+ "HellaSwag": 0.781,
125
+ "OpenbookQA": 0.542,
126
+ "TruthfulQA": 0.348,
127
+ "MS MARCO (TREC)": 0.514,
128
+ "CNN/DailyMail": 0.144,
129
+ "XSUM": 0.167,
130
+ "IMDB": 0.938,
131
+ "CivilComments": 0.547,
132
+ "RAFT": 0.712
133
+ }
134
+ },
135
+ {
136
+ "model_id": "ai21/Jurassic-2-Jumbo-178B",
137
+ "name": "Jurassic-2 Jumbo 178B",
138
+ "developer": "ai21",
139
+ "scores": {
140
+ "Mean win rate": 0.824,
141
+ "MMLU": 0.48,
142
+ "BoolQ": 0.829,
143
+ "NarrativeQA": 0.733,
144
+ "NaturalQuestions (open-book)": 0.669,
145
+ "QuAC": 0.435,
146
+ "HellaSwag": 0.788,
147
+ "OpenbookQA": 0.558,
148
+ "TruthfulQA": 0.437,
149
+ "MS MARCO (TREC)": 0.661,
150
+ "CNN/DailyMail": 0.149,
151
+ "XSUM": 0.182,
152
+ "IMDB": 0.938,
153
+ "CivilComments": 0.57,
154
+ "RAFT": 0.746
155
+ }
156
+ },
157
+ {
158
+ "model_id": "ai21/Jurassic-2-Large-7.5B",
159
+ "name": "Jurassic-2 Large 7.5B",
160
+ "developer": "ai21",
161
+ "scores": {
162
+ "Mean win rate": 0.553,
163
+ "MMLU": 0.339,
164
+ "BoolQ": 0.742,
165
+ "NarrativeQA": -1.0,
166
+ "NaturalQuestions (open-book)": 0.589,
167
+ "QuAC": -1.0,
168
+ "HellaSwag": 0.729,
169
+ "OpenbookQA": 0.53,
170
+ "TruthfulQA": 0.245,
171
+ "MS MARCO (TREC)": 0.464,
172
+ "CNN/DailyMail": 0.136,
173
+ "XSUM": 0.142,
174
+ "IMDB": 0.956,
175
+ "CivilComments": 0.57,
176
+ "RAFT": 0.622
177
+ }
178
+ },
179
+ {
180
+ "model_id": "aleph-alpha/Luminous-Base-13B",
181
+ "name": "Luminous Base 13B",
182
+ "developer": "aleph-alpha",
183
+ "scores": {
184
+ "Mean win rate": 0.315,
185
+ "MMLU": 0.27,
186
+ "BoolQ": 0.719,
187
+ "NarrativeQA": 0.605,
188
+ "NaturalQuestions (open-book)": 0.568,
189
+ "QuAC": 0.334,
190
+ "HellaSwag": -1.0,
191
+ "OpenbookQA": -1.0,
192
+ "TruthfulQA": 0.182,
193
+ "MS MARCO (TREC)": -1.0,
194
+ "CNN/DailyMail": 0.11,
195
+ "XSUM": 0.105,
196
+ "IMDB": 0.939,
197
+ "CivilComments": 0.544,
198
+ "RAFT": 0.473
199
+ }
200
+ },
201
+ {
202
+ "model_id": "aleph-alpha/Luminous-Extended-30B",
203
+ "name": "Luminous Extended 30B",
204
+ "developer": "aleph-alpha",
205
+ "scores": {
206
+ "Mean win rate": 0.485,
207
+ "MMLU": 0.321,
208
+ "BoolQ": 0.767,
209
+ "NarrativeQA": 0.665,
210
+ "NaturalQuestions (open-book)": 0.609,
211
+ "QuAC": 0.349,
212
+ "HellaSwag": -1.0,
213
+ "OpenbookQA": -1.0,
214
+ "TruthfulQA": 0.221,
215
+ "MS MARCO (TREC)": -1.0,
216
+ "CNN/DailyMail": 0.139,
217
+ "XSUM": 0.124,
218
+ "IMDB": 0.947,
219
+ "CivilComments": 0.524,
220
+ "RAFT": 0.523
221
+ }
222
+ },
223
+ {
224
+ "model_id": "aleph-alpha/Luminous-Supreme-70B",
225
+ "name": "Luminous Supreme 70B",
226
+ "developer": "aleph-alpha",
227
+ "scores": {
228
+ "Mean win rate": 0.662,
229
+ "MMLU": 0.38,
230
+ "BoolQ": 0.775,
231
+ "NarrativeQA": 0.711,
232
+ "NaturalQuestions (open-book)": 0.649,
233
+ "QuAC": 0.37,
234
+ "HellaSwag": -1.0,
235
+ "OpenbookQA": -1.0,
236
+ "TruthfulQA": 0.222,
237
+ "MS MARCO (TREC)": -1.0,
238
+ "CNN/DailyMail": 0.15,
239
+ "XSUM": 0.136,
240
+ "IMDB": 0.959,
241
+ "CivilComments": 0.562,
242
+ "RAFT": 0.653
243
+ }
244
+ },
245
+ {
246
+ "model_id": "bigscience/BLOOM-176B",
247
+ "name": "BLOOM 176B",
248
+ "developer": "bigscience",
249
+ "scores": {
250
+ "Mean win rate": 0.446,
251
+ "MMLU": 0.299,
252
+ "BoolQ": 0.704,
253
+ "NarrativeQA": 0.662,
254
+ "NaturalQuestions (open-book)": 0.621,
255
+ "QuAC": 0.361,
256
+ "HellaSwag": 0.744,
257
+ "OpenbookQA": 0.534,
258
+ "TruthfulQA": 0.205,
259
+ "MS MARCO (TREC)": 0.386,
260
+ "CNN/DailyMail": 0.08,
261
+ "XSUM": 0.03,
262
+ "IMDB": 0.945,
263
+ "CivilComments": 0.62,
264
+ "RAFT": 0.592
265
+ }
266
+ },
267
+ {
268
+ "model_id": "bigscience/T0pp-11B",
269
+ "name": "T0pp 11B",
270
+ "developer": "bigscience",
271
+ "scores": {
272
+ "Mean win rate": 0.197,
273
+ "MMLU": 0.407,
274
+ "BoolQ": 0.0,
275
+ "NarrativeQA": 0.151,
276
+ "NaturalQuestions (open-book)": 0.19,
277
+ "QuAC": 0.121,
278
+ "HellaSwag": -1.0,
279
+ "OpenbookQA": -1.0,
280
+ "TruthfulQA": 0.377,
281
+ "MS MARCO (TREC)": -1.0,
282
+ "CNN/DailyMail": 0.122,
283
+ "XSUM": 0.09,
284
+ "IMDB": 0.207,
285
+ "CivilComments": 0.234,
286
+ "RAFT": 0.118
287
+ }
288
+ },
289
+ {
290
+ "model_id": "cohere/Cohere-Command-beta-52.4B",
291
+ "name": "Cohere Command beta 52.4B",
292
+ "developer": "cohere",
293
+ "scores": {
294
+ "Mean win rate": 0.874,
295
+ "MMLU": 0.452,
296
+ "BoolQ": 0.856,
297
+ "NarrativeQA": 0.752,
298
+ "NaturalQuestions (open-book)": 0.76,
299
+ "QuAC": 0.432,
300
+ "HellaSwag": 0.811,
301
+ "OpenbookQA": 0.582,
302
+ "TruthfulQA": 0.269,
303
+ "MS MARCO (TREC)": 0.762,
304
+ "CNN/DailyMail": 0.161,
305
+ "XSUM": 0.152,
306
+ "IMDB": 0.96,
307
+ "CivilComments": 0.601,
308
+ "RAFT": 0.667
309
+ }
310
+ },
311
+ {
312
+ "model_id": "cohere/Cohere-Command-beta-6.1B",
313
+ "name": "Cohere Command beta 6.1B",
314
+ "developer": "cohere",
315
+ "scores": {
316
+ "Mean win rate": 0.675,
317
+ "MMLU": 0.406,
318
+ "BoolQ": 0.798,
319
+ "NarrativeQA": 0.709,
320
+ "NaturalQuestions (open-book)": 0.717,
321
+ "QuAC": 0.375,
322
+ "HellaSwag": 0.752,
323
+ "OpenbookQA": 0.55,
324
+ "TruthfulQA": 0.203,
325
+ "MS MARCO (TREC)": 0.709,
326
+ "CNN/DailyMail": 0.153,
327
+ "XSUM": 0.122,
328
+ "IMDB": 0.961,
329
+ "CivilComments": 0.54,
330
+ "RAFT": 0.634
331
+ }
332
+ },
333
+ {
334
+ "model_id": "cohere/Cohere-large-v20220720-13.1B",
335
+ "name": "Cohere large v20220720 13.1B",
336
+ "developer": "cohere",
337
+ "scores": {
338
+ "Mean win rate": 0.372,
339
+ "MMLU": 0.324,
340
+ "BoolQ": 0.725,
341
+ "NarrativeQA": 0.625,
342
+ "NaturalQuestions (open-book)": 0.573,
343
+ "QuAC": 0.338,
344
+ "HellaSwag": 0.736,
345
+ "OpenbookQA": 0.542,
346
+ "TruthfulQA": 0.181,
347
+ "MS MARCO (TREC)": 0.33,
348
+ "CNN/DailyMail": 0.126,
349
+ "XSUM": 0.108,
350
+ "IMDB": 0.933,
351
+ "CivilComments": 0.507,
352
+ "RAFT": 0.596
353
+ }
354
+ },
355
+ {
356
+ "model_id": "cohere/Cohere-medium-v20220720-6.1B",
357
+ "name": "Cohere medium v20220720 6.1B",
358
+ "developer": "cohere",
359
+ "scores": {
360
+ "Mean win rate": 0.23,
361
+ "MMLU": 0.279,
362
+ "BoolQ": 0.659,
363
+ "NarrativeQA": 0.559,
364
+ "NaturalQuestions (open-book)": 0.504,
365
+ "QuAC": 0.279,
366
+ "HellaSwag": 0.706,
367
+ "OpenbookQA": 0.496,
368
+ "TruthfulQA": 0.19,
369
+ "MS MARCO (TREC)": 0.374,
370
+ "CNN/DailyMail": 0.077,
371
+ "XSUM": 0.087,
372
+ "IMDB": 0.935,
373
+ "CivilComments": 0.504,
374
+ "RAFT": 0.52
375
+ }
376
+ },
377
+ {
378
+ "model_id": "cohere/Cohere-medium-v20221108-6.1B",
379
+ "name": "Cohere medium v20221108 6.1B",
380
+ "developer": "cohere",
381
+ "scores": {
382
+ "Mean win rate": 0.312,
383
+ "MMLU": 0.254,
384
+ "BoolQ": 0.7,
385
+ "NarrativeQA": 0.61,
386
+ "NaturalQuestions (open-book)": 0.517,
387
+ "QuAC": 0.314,
388
+ "HellaSwag": 0.726,
389
+ "OpenbookQA": 0.538,
390
+ "TruthfulQA": 0.215,
391
+ "MS MARCO (TREC)": 0.373,
392
+ "CNN/DailyMail": 0.121,
393
+ "XSUM": 0.099,
394
+ "IMDB": 0.935,
395
+ "CivilComments": 0.5,
396
+ "RAFT": 0.591
397
+ }
398
+ },
399
+ {
400
+ "model_id": "cohere/Cohere-small-v20220720-410M",
401
+ "name": "Cohere small v20220720 410M",
402
+ "developer": "cohere",
403
+ "scores": {
404
+ "Mean win rate": 0.109,
405
+ "MMLU": 0.264,
406
+ "BoolQ": 0.457,
407
+ "NarrativeQA": 0.294,
408
+ "NaturalQuestions (open-book)": 0.309,
409
+ "QuAC": 0.219,
410
+ "HellaSwag": 0.483,
411
+ "OpenbookQA": 0.348,
412
+ "TruthfulQA": 0.217,
413
+ "MS MARCO (TREC)": 0.304,
414
+ "CNN/DailyMail": 0.063,
415
+ "XSUM": 0.033,
416
+ "IMDB": 0.578,
417
+ "CivilComments": 0.501,
418
+ "RAFT": 0.492
419
+ }
420
+ },
421
+ {
422
+ "model_id": "cohere/Cohere-xlarge-v20220609-52.4B",
423
+ "name": "Cohere xlarge v20220609 52.4B",
424
+ "developer": "cohere",
425
+ "scores": {
426
+ "Mean win rate": 0.56,
427
+ "MMLU": 0.353,
428
+ "BoolQ": 0.718,
429
+ "NarrativeQA": 0.65,
430
+ "NaturalQuestions (open-book)": 0.595,
431
+ "QuAC": 0.361,
432
+ "HellaSwag": 0.811,
433
+ "OpenbookQA": 0.55,
434
+ "TruthfulQA": 0.198,
435
+ "MS MARCO (TREC)": 0.459,
436
+ "CNN/DailyMail": 0.144,
437
+ "XSUM": 0.129,
438
+ "IMDB": 0.956,
439
+ "CivilComments": 0.532,
440
+ "RAFT": 0.633
441
+ }
442
+ },
443
+ {
444
+ "model_id": "cohere/Cohere-xlarge-v20221108-52.4B",
445
+ "name": "Cohere xlarge v20221108 52.4B",
446
+ "developer": "cohere",
447
+ "scores": {
448
+ "Mean win rate": 0.664,
449
+ "MMLU": 0.382,
450
+ "BoolQ": 0.762,
451
+ "NarrativeQA": 0.672,
452
+ "NaturalQuestions (open-book)": 0.628,
453
+ "QuAC": 0.374,
454
+ "HellaSwag": 0.81,
455
+ "OpenbookQA": 0.588,
456
+ "TruthfulQA": 0.169,
457
+ "MS MARCO (TREC)": 0.55,
458
+ "CNN/DailyMail": 0.153,
459
+ "XSUM": 0.153,
460
+ "IMDB": 0.956,
461
+ "CivilComments": 0.524,
462
+ "RAFT": 0.624
463
+ }
464
+ },
465
+ {
466
+ "model_id": "eleutherai/Pythia-12B",
467
+ "name": "Pythia 12B",
468
+ "developer": "eleutherai",
469
+ "scores": {
470
+ "Mean win rate": 0.257,
471
+ "MMLU": 0.274,
472
+ "BoolQ": 0.662,
473
+ "NarrativeQA": 0.596,
474
+ "NaturalQuestions (open-book)": 0.581,
475
+ "QuAC": 0.313,
476
+ "HellaSwag": -1.0,
477
+ "OpenbookQA": -1.0,
478
+ "TruthfulQA": 0.177,
479
+ "MS MARCO (TREC)": -1.0,
480
+ "CNN/DailyMail": -1.0,
481
+ "XSUM": -1.0,
482
+ "IMDB": 0.931,
483
+ "CivilComments": 0.531,
484
+ "RAFT": 0.514
485
+ }
486
+ },
487
+ {
488
+ "model_id": "eleutherai/Pythia-6.9B",
489
+ "name": "Pythia 6.9B",
490
+ "developer": "eleutherai",
491
+ "scores": {
492
+ "Mean win rate": 0.196,
493
+ "MMLU": 0.236,
494
+ "BoolQ": 0.631,
495
+ "NarrativeQA": 0.528,
496
+ "NaturalQuestions (open-book)": 0.539,
497
+ "QuAC": 0.296,
498
+ "HellaSwag": -1.0,
499
+ "OpenbookQA": -1.0,
500
+ "TruthfulQA": 0.213,
501
+ "MS MARCO (TREC)": -1.0,
502
+ "CNN/DailyMail": -1.0,
503
+ "XSUM": -1.0,
504
+ "IMDB": 0.928,
505
+ "CivilComments": 0.511,
506
+ "RAFT": 0.502
507
+ }
508
+ },
509
+ {
510
+ "model_id": "google/Palmyra-X-43B",
511
+ "name": "Palmyra X 43B",
512
+ "developer": "google",
513
+ "scores": {
514
+ "Mean win rate": 0.732,
515
+ "MMLU": 0.609,
516
+ "BoolQ": 0.896,
517
+ "NarrativeQA": 0.742,
518
+ "NaturalQuestions (open-book)": -1.0,
519
+ "QuAC": 0.473,
520
+ "HellaSwag": -1.0,
521
+ "OpenbookQA": -1.0,
522
+ "TruthfulQA": 0.616,
523
+ "MS MARCO (TREC)": -1.0,
524
+ "CNN/DailyMail": 0.049,
525
+ "XSUM": 0.149,
526
+ "IMDB": 0.935,
527
+ "CivilComments": 0.008,
528
+ "RAFT": 0.701
529
+ }
530
+ },
531
+ {
532
+ "model_id": "google/T5-11B",
533
+ "name": "T5 11B",
534
+ "developer": "google",
535
+ "scores": {
536
+ "Mean win rate": 0.131,
537
+ "MMLU": 0.29,
538
+ "BoolQ": 0.761,
539
+ "NarrativeQA": 0.086,
540
+ "NaturalQuestions (open-book)": 0.477,
541
+ "QuAC": 0.116,
542
+ "HellaSwag": -1.0,
543
+ "OpenbookQA": -1.0,
544
+ "TruthfulQA": 0.133,
545
+ "MS MARCO (TREC)": -1.0,
546
+ "CNN/DailyMail": 0.043,
547
+ "XSUM": 0.015,
548
+ "IMDB": 0.379,
549
+ "CivilComments": 0.509,
550
+ "RAFT": 0.37
551
+ }
552
+ },
553
+ {
554
+ "model_id": "google/UL2-20B",
555
+ "name": "UL2 20B",
556
+ "developer": "google",
557
+ "scores": {
558
+ "Mean win rate": 0.167,
559
+ "MMLU": 0.291,
560
+ "BoolQ": 0.746,
561
+ "NarrativeQA": 0.083,
562
+ "NaturalQuestions (open-book)": 0.349,
563
+ "QuAC": 0.144,
564
+ "HellaSwag": -1.0,
565
+ "OpenbookQA": -1.0,
566
+ "TruthfulQA": 0.193,
567
+ "MS MARCO (TREC)": -1.0,
568
+ "CNN/DailyMail": 0.03,
569
+ "XSUM": 0.058,
570
+ "IMDB": 0.337,
571
+ "CivilComments": 0.521,
572
+ "RAFT": 0.404
573
+ }
574
+ },
575
+ {
576
+ "model_id": "lmsys/Vicuna-v1.3-13B",
577
+ "name": "Vicuna v1.3 13B",
578
+ "developer": "lmsys",
579
+ "scores": {
580
+ "Mean win rate": 0.706,
581
+ "MMLU": 0.462,
582
+ "BoolQ": 0.808,
583
+ "NarrativeQA": 0.691,
584
+ "NaturalQuestions (open-book)": 0.686,
585
+ "QuAC": 0.403,
586
+ "HellaSwag": -1.0,
587
+ "OpenbookQA": -1.0,
588
+ "TruthfulQA": 0.385,
589
+ "MS MARCO (TREC)": -1.0,
590
+ "CNN/DailyMail": -1.0,
591
+ "XSUM": -1.0,
592
+ "IMDB": 0.762,
593
+ "CivilComments": 0.645,
594
+ "RAFT": 0.657
595
+ }
596
+ },
597
+ {
598
+ "model_id": "lmsys/Vicuna-v1.3-7B",
599
+ "name": "Vicuna v1.3 7B",
600
+ "developer": "lmsys",
601
+ "scores": {
602
+ "Mean win rate": 0.625,
603
+ "MMLU": 0.434,
604
+ "BoolQ": 0.76,
605
+ "NarrativeQA": 0.643,
606
+ "NaturalQuestions (open-book)": 0.634,
607
+ "QuAC": 0.392,
608
+ "HellaSwag": -1.0,
609
+ "OpenbookQA": -1.0,
610
+ "TruthfulQA": 0.292,
611
+ "MS MARCO (TREC)": -1.0,
612
+ "CNN/DailyMail": -1.0,
613
+ "XSUM": -1.0,
614
+ "IMDB": 0.916,
615
+ "CivilComments": 0.62,
616
+ "RAFT": 0.693
617
+ }
618
+ },
619
+ {
620
+ "model_id": "meta/LLaMA-13B",
621
+ "name": "LLaMA 13B",
622
+ "developer": "meta",
623
+ "scores": {
624
+ "Mean win rate": 0.595,
625
+ "MMLU": 0.422,
626
+ "BoolQ": 0.714,
627
+ "NarrativeQA": 0.711,
628
+ "NaturalQuestions (open-book)": 0.614,
629
+ "QuAC": 0.347,
630
+ "HellaSwag": -1.0,
631
+ "OpenbookQA": -1.0,
632
+ "TruthfulQA": 0.324,
633
+ "MS MARCO (TREC)": -1.0,
634
+ "CNN/DailyMail": -1.0,
635
+ "XSUM": -1.0,
636
+ "IMDB": 0.928,
637
+ "CivilComments": 0.6,
638
+ "RAFT": 0.643
639
+ }
640
+ },
641
+ {
642
+ "model_id": "meta/LLaMA-30B",
643
+ "name": "LLaMA 30B",
644
+ "developer": "meta",
645
+ "scores": {
646
+ "Mean win rate": 0.781,
647
+ "MMLU": 0.531,
648
+ "BoolQ": 0.861,
649
+ "NarrativeQA": 0.752,
650
+ "NaturalQuestions (open-book)": 0.666,
651
+ "QuAC": 0.39,
652
+ "HellaSwag": -1.0,
653
+ "OpenbookQA": -1.0,
654
+ "TruthfulQA": 0.344,
655
+ "MS MARCO (TREC)": -1.0,
656
+ "CNN/DailyMail": -1.0,
657
+ "XSUM": -1.0,
658
+ "IMDB": 0.927,
659
+ "CivilComments": 0.549,
660
+ "RAFT": 0.752
661
+ }
662
+ },
663
+ {
664
+ "model_id": "meta/LLaMA-65B",
665
+ "name": "LLaMA 65B",
666
+ "developer": "meta",
667
+ "scores": {
668
+ "Mean win rate": 0.908,
669
+ "MMLU": 0.584,
670
+ "BoolQ": 0.871,
671
+ "NarrativeQA": 0.755,
672
+ "NaturalQuestions (open-book)": 0.672,
673
+ "QuAC": 0.401,
674
+ "HellaSwag": -1.0,
675
+ "OpenbookQA": -1.0,
676
+ "TruthfulQA": 0.508,
677
+ "MS MARCO (TREC)": -1.0,
678
+ "CNN/DailyMail": -1.0,
679
+ "XSUM": -1.0,
680
+ "IMDB": 0.962,
681
+ "CivilComments": 0.655,
682
+ "RAFT": 0.702
683
+ }
684
+ },
685
+ {
686
+ "model_id": "meta/LLaMA-7B",
687
+ "name": "LLaMA 7B",
688
+ "developer": "meta",
689
+ "scores": {
690
+ "Mean win rate": 0.533,
691
+ "MMLU": 0.321,
692
+ "BoolQ": 0.756,
693
+ "NarrativeQA": 0.669,
694
+ "NaturalQuestions (open-book)": 0.589,
695
+ "QuAC": 0.338,
696
+ "HellaSwag": -1.0,
697
+ "OpenbookQA": -1.0,
698
+ "TruthfulQA": 0.28,
699
+ "MS MARCO (TREC)": -1.0,
700
+ "CNN/DailyMail": -1.0,
701
+ "XSUM": -1.0,
702
+ "IMDB": 0.947,
703
+ "CivilComments": 0.563,
704
+ "RAFT": 0.573
705
+ }
706
+ },
707
+ {
708
+ "model_id": "meta/Llama-2-13B",
709
+ "name": "Llama 2 13B",
710
+ "developer": "meta",
711
+ "scores": {
712
+ "Mean win rate": 0.823,
713
+ "MMLU": 0.507,
714
+ "BoolQ": 0.811,
715
+ "NarrativeQA": 0.744,
716
+ "NaturalQuestions (open-book)": 0.637,
717
+ "QuAC": 0.424,
718
+ "HellaSwag": -1.0,
719
+ "OpenbookQA": -1.0,
720
+ "TruthfulQA": 0.33,
721
+ "MS MARCO (TREC)": -1.0,
722
+ "CNN/DailyMail": -1.0,
723
+ "XSUM": -1.0,
724
+ "IMDB": 0.962,
725
+ "CivilComments": 0.588,
726
+ "RAFT": 0.707
727
+ }
728
+ },
729
+ {
730
+ "model_id": "meta/Llama-2-70B",
731
+ "name": "Llama 2 70B",
732
+ "developer": "meta",
733
+ "scores": {
734
+ "Mean win rate": 0.944,
735
+ "MMLU": 0.582,
736
+ "BoolQ": 0.886,
737
+ "NarrativeQA": 0.77,
738
+ "NaturalQuestions (open-book)": 0.674,
739
+ "QuAC": 0.484,
740
+ "HellaSwag": -1.0,
741
+ "OpenbookQA": -1.0,
742
+ "TruthfulQA": 0.554,
743
+ "MS MARCO (TREC)": -1.0,
744
+ "CNN/DailyMail": -1.0,
745
+ "XSUM": -1.0,
746
+ "IMDB": 0.961,
747
+ "CivilComments": 0.652,
748
+ "RAFT": 0.727
749
+ }
750
+ },
751
+ {
752
+ "model_id": "meta/Llama-2-7B",
753
+ "name": "Llama 2 7B",
754
+ "developer": "meta",
755
+ "scores": {
756
+ "Mean win rate": 0.607,
757
+ "MMLU": 0.431,
758
+ "BoolQ": 0.762,
759
+ "NarrativeQA": 0.691,
760
+ "NaturalQuestions (open-book)": 0.611,
761
+ "QuAC": 0.406,
762
+ "HellaSwag": -1.0,
763
+ "OpenbookQA": -1.0,
764
+ "TruthfulQA": 0.272,
765
+ "MS MARCO (TREC)": -1.0,
766
+ "CNN/DailyMail": -1.0,
767
+ "XSUM": -1.0,
768
+ "IMDB": 0.907,
769
+ "CivilComments": 0.562,
770
+ "RAFT": 0.643
771
+ }
772
+ },
773
+ {
774
+ "model_id": "meta/OPT-175B",
775
+ "name": "OPT 175B",
776
+ "developer": "meta",
777
+ "scores": {
778
+ "Mean win rate": 0.609,
779
+ "MMLU": 0.318,
780
+ "BoolQ": 0.793,
781
+ "NarrativeQA": 0.671,
782
+ "NaturalQuestions (open-book)": 0.615,
783
+ "QuAC": 0.36,
784
+ "HellaSwag": 0.791,
785
+ "OpenbookQA": 0.586,
786
+ "TruthfulQA": 0.25,
787
+ "MS MARCO (TREC)": 0.448,
788
+ "CNN/DailyMail": 0.146,
789
+ "XSUM": 0.155,
790
+ "IMDB": 0.947,
791
+ "CivilComments": 0.505,
792
+ "RAFT": 0.606
793
+ }
794
+ },
795
+ {
796
+ "model_id": "meta/OPT-66B",
797
+ "name": "OPT 66B",
798
+ "developer": "meta",
799
+ "scores": {
800
+ "Mean win rate": 0.448,
801
+ "MMLU": 0.276,
802
+ "BoolQ": 0.76,
803
+ "NarrativeQA": 0.638,
804
+ "NaturalQuestions (open-book)": 0.596,
805
+ "QuAC": 0.357,
806
+ "HellaSwag": 0.745,
807
+ "OpenbookQA": 0.534,
808
+ "TruthfulQA": 0.201,
809
+ "MS MARCO (TREC)": 0.482,
810
+ "CNN/DailyMail": 0.136,
811
+ "XSUM": 0.126,
812
+ "IMDB": 0.917,
813
+ "CivilComments": 0.506,
814
+ "RAFT": 0.557
815
+ }
816
+ },
817
+ {
818
+ "model_id": "microsoft/TNLG-v2-530B",
819
+ "name": "TNLG v2 530B",
820
+ "developer": "microsoft",
821
+ "scores": {
822
+ "Mean win rate": 0.787,
823
+ "MMLU": 0.469,
824
+ "BoolQ": 0.809,
825
+ "NarrativeQA": 0.722,
826
+ "NaturalQuestions (open-book)": 0.642,
827
+ "QuAC": 0.39,
828
+ "HellaSwag": 0.799,
829
+ "OpenbookQA": 0.562,
830
+ "TruthfulQA": 0.251,
831
+ "MS MARCO (TREC)": 0.643,
832
+ "CNN/DailyMail": 0.161,
833
+ "XSUM": 0.169,
834
+ "IMDB": 0.941,
835
+ "CivilComments": 0.601,
836
+ "RAFT": 0.679
837
+ }
838
+ },
839
+ {
840
+ "model_id": "microsoft/TNLG-v2-6.7B",
841
+ "name": "TNLG v2 6.7B",
842
+ "developer": "microsoft",
843
+ "scores": {
844
+ "Mean win rate": 0.309,
845
+ "MMLU": 0.242,
846
+ "BoolQ": 0.698,
847
+ "NarrativeQA": 0.631,
848
+ "NaturalQuestions (open-book)": 0.561,
849
+ "QuAC": 0.345,
850
+ "HellaSwag": 0.704,
851
+ "OpenbookQA": 0.478,
852
+ "TruthfulQA": 0.167,
853
+ "MS MARCO (TREC)": 0.332,
854
+ "CNN/DailyMail": 0.146,
855
+ "XSUM": 0.11,
856
+ "IMDB": 0.927,
857
+ "CivilComments": 0.532,
858
+ "RAFT": 0.525
859
+ }
860
+ },
861
+ {
862
+ "model_id": "mistralai/Mistral-v0.1-7B",
863
+ "name": "Mistral v0.1 7B",
864
+ "developer": "mistralai",
865
+ "scores": {
866
+ "Mean win rate": 0.884,
867
+ "MMLU": 0.572,
868
+ "BoolQ": 0.874,
869
+ "NarrativeQA": 0.716,
870
+ "NaturalQuestions (open-book)": 0.687,
871
+ "QuAC": 0.423,
872
+ "HellaSwag": -1.0,
873
+ "OpenbookQA": -1.0,
874
+ "TruthfulQA": 0.422,
875
+ "MS MARCO (TREC)": -1.0,
876
+ "CNN/DailyMail": -1.0,
877
+ "XSUM": -1.0,
878
+ "IMDB": 0.962,
879
+ "CivilComments": 0.624,
880
+ "RAFT": 0.707
881
+ }
882
+ },
883
+ {
884
+ "model_id": "mosaicml/MPT-30B",
885
+ "name": "MPT 30B",
886
+ "developer": "mosaicml",
887
+ "scores": {
888
+ "Mean win rate": 0.714,
889
+ "MMLU": 0.437,
890
+ "BoolQ": 0.704,
891
+ "NarrativeQA": 0.732,
892
+ "NaturalQuestions (open-book)": 0.673,
893
+ "QuAC": 0.393,
894
+ "HellaSwag": -1.0,
895
+ "OpenbookQA": -1.0,
896
+ "TruthfulQA": 0.231,
897
+ "MS MARCO (TREC)": -1.0,
898
+ "CNN/DailyMail": -1.0,
899
+ "XSUM": -1.0,
900
+ "IMDB": 0.959,
901
+ "CivilComments": 0.599,
902
+ "RAFT": 0.723
903
+ }
904
+ },
905
+ {
906
+ "model_id": "mosaicml/MPT-Instruct-30B",
907
+ "name": "MPT-Instruct 30B",
908
+ "developer": "mosaicml",
909
+ "scores": {
910
+ "Mean win rate": 0.716,
911
+ "MMLU": 0.444,
912
+ "BoolQ": 0.85,
913
+ "NarrativeQA": 0.733,
914
+ "NaturalQuestions (open-book)": 0.697,
915
+ "QuAC": 0.327,
916
+ "HellaSwag": -1.0,
917
+ "OpenbookQA": -1.0,
918
+ "TruthfulQA": 0.234,
919
+ "MS MARCO (TREC)": -1.0,
920
+ "CNN/DailyMail": -1.0,
921
+ "XSUM": -1.0,
922
+ "IMDB": 0.956,
923
+ "CivilComments": 0.573,
924
+ "RAFT": 0.68
925
+ }
926
+ },
927
+ {
928
+ "model_id": "openai/GPT-J-6B",
929
+ "name": "GPT-J 6B",
930
+ "developer": "openai",
931
+ "scores": {
932
+ "Mean win rate": 0.273,
933
+ "MMLU": 0.249,
934
+ "BoolQ": 0.649,
935
+ "NarrativeQA": 0.545,
936
+ "NaturalQuestions (open-book)": 0.559,
937
+ "QuAC": 0.33,
938
+ "HellaSwag": 0.663,
939
+ "OpenbookQA": 0.514,
940
+ "TruthfulQA": 0.199,
941
+ "MS MARCO (TREC)": 0.345,
942
+ "CNN/DailyMail": 0.131,
943
+ "XSUM": 0.096,
944
+ "IMDB": 0.939,
945
+ "CivilComments": 0.52,
946
+ "RAFT": 0.619
947
+ }
948
+ },
949
+ {
950
+ "model_id": "openai/GPT-NeoX-20B",
951
+ "name": "GPT-NeoX 20B",
952
+ "developer": "openai",
953
+ "scores": {
954
+ "Mean win rate": 0.351,
955
+ "MMLU": 0.276,
956
+ "BoolQ": 0.683,
957
+ "NarrativeQA": 0.599,
958
+ "NaturalQuestions (open-book)": 0.596,
959
+ "QuAC": 0.326,
960
+ "HellaSwag": 0.718,
961
+ "OpenbookQA": 0.524,
962
+ "TruthfulQA": 0.216,
963
+ "MS MARCO (TREC)": 0.398,
964
+ "CNN/DailyMail": 0.123,
965
+ "XSUM": 0.102,
966
+ "IMDB": 0.948,
967
+ "CivilComments": 0.516,
968
+ "RAFT": 0.505
969
+ }
970
+ },
971
+ {
972
+ "model_id": "openai/ada-350M",
973
+ "name": "ada 350M",
974
+ "developer": "openai",
975
+ "scores": {
976
+ "Mean win rate": 0.108,
977
+ "MMLU": 0.243,
978
+ "BoolQ": 0.581,
979
+ "NarrativeQA": 0.326,
980
+ "NaturalQuestions (open-book)": 0.365,
981
+ "QuAC": 0.242,
982
+ "HellaSwag": 0.435,
983
+ "OpenbookQA": 0.38,
984
+ "TruthfulQA": 0.215,
985
+ "MS MARCO (TREC)": 0.29,
986
+ "CNN/DailyMail": 0.09,
987
+ "XSUM": 0.022,
988
+ "IMDB": 0.849,
989
+ "CivilComments": 0.517,
990
+ "RAFT": 0.423
991
+ }
992
+ },
993
+ {
994
+ "model_id": "openai/babbage-1.3B",
995
+ "name": "babbage 1.3B",
996
+ "developer": "openai",
997
+ "scores": {
998
+ "Mean win rate": 0.114,
999
+ "MMLU": 0.235,
1000
+ "BoolQ": 0.574,
1001
+ "NarrativeQA": 0.491,
1002
+ "NaturalQuestions (open-book)": 0.451,
1003
+ "QuAC": 0.273,
1004
+ "HellaSwag": 0.555,
1005
+ "OpenbookQA": 0.438,
1006
+ "TruthfulQA": 0.188,
1007
+ "MS MARCO (TREC)": 0.317,
1008
+ "CNN/DailyMail": 0.079,
1009
+ "XSUM": 0.045,
1010
+ "IMDB": 0.597,
1011
+ "CivilComments": 0.519,
1012
+ "RAFT": 0.455
1013
+ }
1014
+ },
1015
+ {
1016
+ "model_id": "openai/curie-6.7B",
1017
+ "name": "curie 6.7B",
1018
+ "developer": "openai",
1019
+ "scores": {
1020
+ "Mean win rate": 0.247,
1021
+ "MMLU": 0.243,
1022
+ "BoolQ": 0.656,
1023
+ "NarrativeQA": 0.604,
1024
+ "NaturalQuestions (open-book)": 0.552,
1025
+ "QuAC": 0.321,
1026
+ "HellaSwag": 0.682,
1027
+ "OpenbookQA": 0.502,
1028
+ "TruthfulQA": 0.232,
1029
+ "MS MARCO (TREC)": 0.3,
1030
+ "CNN/DailyMail": 0.113,
1031
+ "XSUM": 0.091,
1032
+ "IMDB": 0.889,
1033
+ "CivilComments": 0.539,
1034
+ "RAFT": 0.49
1035
+ }
1036
+ },
1037
+ {
1038
+ "model_id": "openai/davinci-175B",
1039
+ "name": "davinci 175B",
1040
+ "developer": "openai",
1041
+ "scores": {
1042
+ "Mean win rate": 0.538,
1043
+ "MMLU": 0.422,
1044
+ "BoolQ": 0.722,
1045
+ "NarrativeQA": 0.687,
1046
+ "NaturalQuestions (open-book)": 0.625,
1047
+ "QuAC": 0.36,
1048
+ "HellaSwag": 0.775,
1049
+ "OpenbookQA": 0.586,
1050
+ "TruthfulQA": 0.194,
1051
+ "MS MARCO (TREC)": 0.378,
1052
+ "CNN/DailyMail": 0.127,
1053
+ "XSUM": 0.126,
1054
+ "IMDB": 0.933,
1055
+ "CivilComments": 0.532,
1056
+ "RAFT": 0.642
1057
+ }
1058
+ },
1059
+ {
1060
+ "model_id": "openai/gpt-3.5-turbo-0301",
1061
+ "name": "gpt-3.5-turbo-0301",
1062
+ "developer": "openai",
1063
+ "scores": {
1064
+ "Mean win rate": 0.76,
1065
+ "MMLU": 0.59,
1066
+ "BoolQ": 0.74,
1067
+ "NarrativeQA": 0.663,
1068
+ "NaturalQuestions (open-book)": 0.624,
1069
+ "QuAC": 0.512,
1070
+ "HellaSwag": -1.0,
1071
+ "OpenbookQA": -1.0,
1072
+ "TruthfulQA": 0.609,
1073
+ "MS MARCO (TREC)": -1.0,
1074
+ "CNN/DailyMail": -1.0,
1075
+ "XSUM": -1.0,
1076
+ "IMDB": 0.899,
1077
+ "CivilComments": 0.674,
1078
+ "RAFT": 0.768
1079
+ }
1080
+ },
1081
+ {
1082
+ "model_id": "openai/gpt-3.5-turbo-0613",
1083
+ "name": "gpt-3.5-turbo-0613",
1084
+ "developer": "openai",
1085
+ "scores": {
1086
+ "Mean win rate": 0.783,
1087
+ "MMLU": 0.391,
1088
+ "BoolQ": 0.87,
1089
+ "NarrativeQA": 0.625,
1090
+ "NaturalQuestions (open-book)": 0.675,
1091
+ "QuAC": 0.485,
1092
+ "HellaSwag": -1.0,
1093
+ "OpenbookQA": -1.0,
1094
+ "TruthfulQA": 0.339,
1095
+ "MS MARCO (TREC)": -1.0,
1096
+ "CNN/DailyMail": -1.0,
1097
+ "XSUM": -1.0,
1098
+ "IMDB": 0.943,
1099
+ "CivilComments": 0.696,
1100
+ "RAFT": 0.748
1101
+ }
1102
+ },
1103
+ {
1104
+ "model_id": "openai/text-ada-001",
1105
+ "name": "text-ada-001",
1106
+ "developer": "openai",
1107
+ "scores": {
1108
+ "Mean win rate": 0.107,
1109
+ "MMLU": 0.238,
1110
+ "BoolQ": 0.464,
1111
+ "NarrativeQA": 0.238,
1112
+ "NaturalQuestions (open-book)": 0.149,
1113
+ "QuAC": 0.176,
1114
+ "HellaSwag": 0.429,
1115
+ "OpenbookQA": 0.346,
1116
+ "TruthfulQA": 0.232,
1117
+ "MS MARCO (TREC)": 0.302,
1118
+ "CNN/DailyMail": 0.136,
1119
+ "XSUM": 0.034,
1120
+ "IMDB": 0.822,
1121
+ "CivilComments": 0.503,
1122
+ "RAFT": 0.406
1123
+ }
1124
+ },
1125
+ {
1126
+ "model_id": "openai/text-babbage-001",
1127
+ "name": "text-babbage-001",
1128
+ "developer": "openai",
1129
+ "scores": {
1130
+ "Mean win rate": 0.229,
1131
+ "MMLU": 0.229,
1132
+ "BoolQ": 0.451,
1133
+ "NarrativeQA": 0.429,
1134
+ "NaturalQuestions (open-book)": 0.33,
1135
+ "QuAC": 0.284,
1136
+ "HellaSwag": 0.561,
1137
+ "OpenbookQA": 0.452,
1138
+ "TruthfulQA": 0.233,
1139
+ "MS MARCO (TREC)": 0.449,
1140
+ "CNN/DailyMail": 0.151,
1141
+ "XSUM": 0.046,
1142
+ "IMDB": 0.913,
1143
+ "CivilComments": 0.499,
1144
+ "RAFT": 0.509
1145
+ }
1146
+ },
1147
+ {
1148
+ "model_id": "openai/text-curie-001",
1149
+ "name": "text-curie-001",
1150
+ "developer": "openai",
1151
+ "scores": {
1152
+ "Mean win rate": 0.36,
1153
+ "MMLU": 0.237,
1154
+ "BoolQ": 0.62,
1155
+ "NarrativeQA": 0.582,
1156
+ "NaturalQuestions (open-book)": 0.571,
1157
+ "QuAC": 0.358,
1158
+ "HellaSwag": 0.676,
1159
+ "OpenbookQA": 0.514,
1160
+ "TruthfulQA": 0.257,
1161
+ "MS MARCO (TREC)": 0.507,
1162
+ "CNN/DailyMail": 0.152,
1163
+ "XSUM": 0.076,
1164
+ "IMDB": 0.923,
1165
+ "CivilComments": 0.537,
1166
+ "RAFT": 0.489
1167
+ }
1168
+ },
1169
+ {
1170
+ "model_id": "openai/text-davinci-002",
1171
+ "name": "text-davinci-002",
1172
+ "developer": "openai",
1173
+ "scores": {
1174
+ "Mean win rate": 0.905,
1175
+ "MMLU": 0.568,
1176
+ "BoolQ": 0.877,
1177
+ "NarrativeQA": 0.727,
1178
+ "NaturalQuestions (open-book)": 0.713,
1179
+ "QuAC": 0.445,
1180
+ "HellaSwag": 0.815,
1181
+ "OpenbookQA": 0.594,
1182
+ "TruthfulQA": 0.61,
1183
+ "MS MARCO (TREC)": 0.664,
1184
+ "CNN/DailyMail": 0.153,
1185
+ "XSUM": 0.144,
1186
+ "IMDB": 0.948,
1187
+ "CivilComments": 0.668,
1188
+ "RAFT": 0.733
1189
+ }
1190
+ },
1191
+ {
1192
+ "model_id": "openai/text-davinci-003",
1193
+ "name": "text-davinci-003",
1194
+ "developer": "openai",
1195
+ "scores": {
1196
+ "Mean win rate": 0.872,
1197
+ "MMLU": 0.569,
1198
+ "BoolQ": 0.881,
1199
+ "NarrativeQA": 0.727,
1200
+ "NaturalQuestions (open-book)": 0.77,
1201
+ "QuAC": 0.525,
1202
+ "HellaSwag": 0.822,
1203
+ "OpenbookQA": 0.646,
1204
+ "TruthfulQA": 0.593,
1205
+ "MS MARCO (TREC)": 0.644,
1206
+ "CNN/DailyMail": 0.156,
1207
+ "XSUM": 0.124,
1208
+ "IMDB": 0.848,
1209
+ "CivilComments": 0.684,
1210
+ "RAFT": 0.759
1211
+ }
1212
+ },
1213
+ {
1214
+ "model_id": "stanford/Alpaca-7B",
1215
+ "name": "Alpaca 7B",
1216
+ "developer": "stanford",
1217
+ "scores": {
1218
+ "Mean win rate": 0.381,
1219
+ "MMLU": 0.385,
1220
+ "BoolQ": 0.778,
1221
+ "NarrativeQA": 0.396,
1222
+ "NaturalQuestions (open-book)": 0.592,
1223
+ "QuAC": 0.27,
1224
+ "HellaSwag": -1.0,
1225
+ "OpenbookQA": -1.0,
1226
+ "TruthfulQA": 0.243,
1227
+ "MS MARCO (TREC)": -1.0,
1228
+ "CNN/DailyMail": -1.0,
1229
+ "XSUM": -1.0,
1230
+ "IMDB": 0.738,
1231
+ "CivilComments": 0.566,
1232
+ "RAFT": 0.486
1233
+ }
1234
+ },
1235
+ {
1236
+ "model_id": "tiiuae/Falcon-40B",
1237
+ "name": "Falcon 40B",
1238
+ "developer": "tiiuae",
1239
+ "scores": {
1240
+ "Mean win rate": 0.729,
1241
+ "MMLU": 0.509,
1242
+ "BoolQ": 0.819,
1243
+ "NarrativeQA": 0.673,
1244
+ "NaturalQuestions (open-book)": 0.675,
1245
+ "QuAC": 0.307,
1246
+ "HellaSwag": -1.0,
1247
+ "OpenbookQA": -1.0,
1248
+ "TruthfulQA": 0.353,
1249
+ "MS MARCO (TREC)": -1.0,
1250
+ "CNN/DailyMail": -1.0,
1251
+ "XSUM": -1.0,
1252
+ "IMDB": 0.959,
1253
+ "CivilComments": 0.552,
1254
+ "RAFT": 0.661
1255
+ }
1256
+ },
1257
+ {
1258
+ "model_id": "tiiuae/Falcon-7B",
1259
+ "name": "Falcon 7B",
1260
+ "developer": "tiiuae",
1261
+ "scores": {
1262
+ "Mean win rate": 0.378,
1263
+ "MMLU": 0.286,
1264
+ "BoolQ": 0.753,
1265
+ "NarrativeQA": 0.621,
1266
+ "NaturalQuestions (open-book)": 0.579,
1267
+ "QuAC": 0.332,
1268
+ "HellaSwag": -1.0,
1269
+ "OpenbookQA": -1.0,
1270
+ "TruthfulQA": 0.234,
1271
+ "MS MARCO (TREC)": -1.0,
1272
+ "CNN/DailyMail": -1.0,
1273
+ "XSUM": -1.0,
1274
+ "IMDB": 0.836,
1275
+ "CivilComments": 0.514,
1276
+ "RAFT": 0.602
1277
+ }
1278
+ },
1279
+ {
1280
+ "model_id": "tiiuae/Falcon-Instruct-40B",
1281
+ "name": "Falcon-Instruct 40B",
1282
+ "developer": "tiiuae",
1283
+ "scores": {
1284
+ "Mean win rate": 0.727,
1285
+ "MMLU": 0.497,
1286
+ "BoolQ": 0.829,
1287
+ "NarrativeQA": 0.625,
1288
+ "NaturalQuestions (open-book)": 0.666,
1289
+ "QuAC": 0.371,
1290
+ "HellaSwag": -1.0,
1291
+ "OpenbookQA": -1.0,
1292
+ "TruthfulQA": 0.384,
1293
+ "MS MARCO (TREC)": -1.0,
1294
+ "CNN/DailyMail": -1.0,
1295
+ "XSUM": -1.0,
1296
+ "IMDB": 0.959,
1297
+ "CivilComments": 0.603,
1298
+ "RAFT": 0.586
1299
+ }
1300
+ },
1301
+ {
1302
+ "model_id": "tiiuae/Falcon-Instruct-7B",
1303
+ "name": "Falcon-Instruct 7B",
1304
+ "developer": "tiiuae",
1305
+ "scores": {
1306
+ "Mean win rate": 0.244,
1307
+ "MMLU": 0.275,
1308
+ "BoolQ": 0.72,
1309
+ "NarrativeQA": 0.476,
1310
+ "NaturalQuestions (open-book)": 0.449,
1311
+ "QuAC": 0.311,
1312
+ "HellaSwag": -1.0,
1313
+ "OpenbookQA": -1.0,
1314
+ "TruthfulQA": 0.213,
1315
+ "MS MARCO (TREC)": -1.0,
1316
+ "CNN/DailyMail": -1.0,
1317
+ "XSUM": -1.0,
1318
+ "IMDB": 0.852,
1319
+ "CivilComments": 0.511,
1320
+ "RAFT": 0.523
1321
+ }
1322
+ },
1323
+ {
1324
+ "model_id": "together/RedPajama-INCITE-Base-7B",
1325
+ "name": "RedPajama-INCITE-Base 7B",
1326
+ "developer": "together",
1327
+ "scores": {
1328
+ "Mean win rate": 0.378,
1329
+ "MMLU": 0.302,
1330
+ "BoolQ": 0.713,
1331
+ "NarrativeQA": 0.617,
1332
+ "NaturalQuestions (open-book)": 0.586,
1333
+ "QuAC": 0.336,
1334
+ "HellaSwag": -1.0,
1335
+ "OpenbookQA": -1.0,
1336
+ "TruthfulQA": 0.205,
1337
+ "MS MARCO (TREC)": -1.0,
1338
+ "CNN/DailyMail": -1.0,
1339
+ "XSUM": -1.0,
1340
+ "IMDB": 0.752,
1341
+ "CivilComments": 0.547,
1342
+ "RAFT": 0.648
1343
+ }
1344
+ },
1345
+ {
1346
+ "model_id": "together/RedPajama-INCITE-Base-v1-3B",
1347
+ "name": "RedPajama-INCITE-Base-v1 3B",
1348
+ "developer": "together",
1349
+ "scores": {
1350
+ "Mean win rate": 0.311,
1351
+ "MMLU": 0.263,
1352
+ "BoolQ": 0.685,
1353
+ "NarrativeQA": 0.555,
1354
+ "NaturalQuestions (open-book)": 0.52,
1355
+ "QuAC": 0.309,
1356
+ "HellaSwag": -1.0,
1357
+ "OpenbookQA": -1.0,
1358
+ "TruthfulQA": 0.277,
1359
+ "MS MARCO (TREC)": -1.0,
1360
+ "CNN/DailyMail": -1.0,
1361
+ "XSUM": -1.0,
1362
+ "IMDB": 0.907,
1363
+ "CivilComments": 0.549,
1364
+ "RAFT": 0.502
1365
+ }
1366
+ },
1367
+ {
1368
+ "model_id": "together/RedPajama-INCITE-Instruct-7B",
1369
+ "name": "RedPajama-INCITE-Instruct 7B",
1370
+ "developer": "together",
1371
+ "scores": {
1372
+ "Mean win rate": 0.524,
1373
+ "MMLU": 0.363,
1374
+ "BoolQ": 0.705,
1375
+ "NarrativeQA": 0.638,
1376
+ "NaturalQuestions (open-book)": 0.659,
1377
+ "QuAC": 0.26,
1378
+ "HellaSwag": -1.0,
1379
+ "OpenbookQA": -1.0,
1380
+ "TruthfulQA": 0.243,
1381
+ "MS MARCO (TREC)": -1.0,
1382
+ "CNN/DailyMail": -1.0,
1383
+ "XSUM": -1.0,
1384
+ "IMDB": 0.927,
1385
+ "CivilComments": 0.664,
1386
+ "RAFT": 0.695
1387
+ }
1388
+ },
1389
+ {
1390
+ "model_id": "together/RedPajama-INCITE-Instruct-v1-3B",
1391
+ "name": "RedPajama-INCITE-Instruct-v1 3B",
1392
+ "developer": "together",
1393
+ "scores": {
1394
+ "Mean win rate": 0.366,
1395
+ "MMLU": 0.257,
1396
+ "BoolQ": 0.677,
1397
+ "NarrativeQA": 0.638,
1398
+ "NaturalQuestions (open-book)": 0.637,
1399
+ "QuAC": 0.259,
1400
+ "HellaSwag": -1.0,
1401
+ "OpenbookQA": -1.0,
1402
+ "TruthfulQA": 0.208,
1403
+ "MS MARCO (TREC)": -1.0,
1404
+ "CNN/DailyMail": -1.0,
1405
+ "XSUM": -1.0,
1406
+ "IMDB": 0.894,
1407
+ "CivilComments": 0.549,
1408
+ "RAFT": 0.661
1409
+ }
1410
+ },
1411
+ {
1412
+ "model_id": "writer/InstructPalmyra-30B",
1413
+ "name": "InstructPalmyra 30B",
1414
+ "developer": "writer",
1415
+ "scores": {
1416
+ "Mean win rate": 0.568,
1417
+ "MMLU": 0.403,
1418
+ "BoolQ": 0.751,
1419
+ "NarrativeQA": 0.496,
1420
+ "NaturalQuestions (open-book)": 0.682,
1421
+ "QuAC": 0.433,
1422
+ "HellaSwag": -1.0,
1423
+ "OpenbookQA": -1.0,
1424
+ "TruthfulQA": 0.185,
1425
+ "MS MARCO (TREC)": -1.0,
1426
+ "CNN/DailyMail": 0.152,
1427
+ "XSUM": 0.104,
1428
+ "IMDB": 0.94,
1429
+ "CivilComments": 0.555,
1430
+ "RAFT": 0.652
1431
+ }
1432
+ },
1433
+ {
1434
+ "model_id": "yandex/YaLM-100B",
1435
+ "name": "YaLM 100B",
1436
+ "developer": "yandex",
1437
+ "scores": {
1438
+ "Mean win rate": 0.075,
1439
+ "MMLU": 0.243,
1440
+ "BoolQ": 0.634,
1441
+ "NarrativeQA": 0.252,
1442
+ "NaturalQuestions (open-book)": 0.227,
1443
+ "QuAC": 0.162,
1444
+ "HellaSwag": -1.0,
1445
+ "OpenbookQA": -1.0,
1446
+ "TruthfulQA": 0.202,
1447
+ "MS MARCO (TREC)": -1.0,
1448
+ "CNN/DailyMail": 0.017,
1449
+ "XSUM": 0.021,
1450
+ "IMDB": 0.836,
1451
+ "CivilComments": 0.49,
1452
+ "RAFT": 0.395
1453
+ }
1454
+ },
1455
+ {
1456
+ "model_id": "zhipu-ai/GLM-130B",
1457
+ "name": "GLM 130B",
1458
+ "developer": "zhipu-ai",
1459
+ "scores": {
1460
+ "Mean win rate": 0.512,
1461
+ "MMLU": 0.344,
1462
+ "BoolQ": 0.784,
1463
+ "NarrativeQA": 0.706,
1464
+ "NaturalQuestions (open-book)": 0.642,
1465
+ "QuAC": 0.272,
1466
+ "HellaSwag": -1.0,
1467
+ "OpenbookQA": -1.0,
1468
+ "TruthfulQA": 0.218,
1469
+ "MS MARCO (TREC)": -1.0,
1470
+ "CNN/DailyMail": 0.154,
1471
+ "XSUM": 0.132,
1472
+ "IMDB": 0.955,
1473
+ "CivilComments": 0.5,
1474
+ "RAFT": 0.598
1475
+ }
1476
+ }
1477
+ ]
1478
+ }
data/benchmarks/helm_instruct.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-v1.3",
5
+ "name": "Anthropic Claude v1.3",
6
+ "developer": "anthropic",
7
+ "scores": {
8
+ "Mean win rate": 0.611,
9
+ "Anthropic RLHF dataset": 4.965,
10
+ "Best ChatGPT Prompts": 4.995,
11
+ "Koala test dataset": 4.981,
12
+ "Open Assistant": 4.975,
13
+ "Self Instruct": 4.992,
14
+ "Vicuna": 4.989
15
+ }
16
+ },
17
+ {
18
+ "model_id": "cohere/command-xlarge-beta",
19
+ "name": "Cohere Command beta 52.4B",
20
+ "developer": "cohere",
21
+ "scores": {
22
+ "Mean win rate": 0.089,
23
+ "Anthropic RLHF dataset": 4.214,
24
+ "Best ChatGPT Prompts": 4.988,
25
+ "Koala test dataset": 4.969,
26
+ "Open Assistant": 4.967,
27
+ "Self Instruct": 4.971,
28
+ "Vicuna": 4.995
29
+ }
30
+ },
31
+ {
32
+ "model_id": "openai/gpt-3.5-turbo-0613",
33
+ "name": "gpt-3.5-turbo-0613",
34
+ "developer": "openai",
35
+ "scores": {
36
+ "Mean win rate": 0.689,
37
+ "Anthropic RLHF dataset": 4.964,
38
+ "Best ChatGPT Prompts": 4.986,
39
+ "Koala test dataset": 4.987,
40
+ "Open Assistant": 4.987,
41
+ "Self Instruct": 4.99,
42
+ "Vicuna": 4.992
43
+ }
44
+ },
45
+ {
46
+ "model_id": "openai/gpt-4-0314",
47
+ "name": "GPT-4 0314",
48
+ "developer": "openai",
49
+ "scores": {
50
+ "Mean win rate": 0.611,
51
+ "Anthropic RLHF dataset": 4.934,
52
+ "Best ChatGPT Prompts": 4.973,
53
+ "Koala test dataset": 4.966,
54
+ "Open Assistant": 4.986,
55
+ "Self Instruct": 4.976,
56
+ "Vicuna": 4.995
57
+ }
58
+ }
59
+ ]
60
+ }
data/benchmarks/helm_lite.json ADDED
@@ -0,0 +1,1551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "01-ai/yi-34b",
5
+ "name": "Yi 34B",
6
+ "developer": "01-ai",
7
+ "scores": {
8
+ "Mean win rate": 0.57,
9
+ "NarrativeQA": 0.782,
10
+ "NaturalQuestions (closed-book)": 0.443,
11
+ "OpenbookQA": 0.92,
12
+ "MMLU": 0.65,
13
+ "MATH": 0.375,
14
+ "GSM8K": 0.648,
15
+ "LegalBench": 0.618,
16
+ "MedQA": 0.656,
17
+ "WMT 2014": 0.172
18
+ }
19
+ },
20
+ {
21
+ "model_id": "01-ai/yi-6b",
22
+ "name": "Yi 6B",
23
+ "developer": "01-ai",
24
+ "scores": {
25
+ "Mean win rate": 0.253,
26
+ "NarrativeQA": 0.702,
27
+ "NaturalQuestions (closed-book)": 0.31,
28
+ "OpenbookQA": 0.8,
29
+ "MMLU": 0.53,
30
+ "MATH": 0.126,
31
+ "GSM8K": 0.375,
32
+ "LegalBench": 0.519,
33
+ "MedQA": 0.497,
34
+ "WMT 2014": 0.117
35
+ }
36
+ },
37
+ {
38
+ "model_id": "01-ai/yi-large-preview",
39
+ "name": "Yi Large Preview",
40
+ "developer": "01-ai",
41
+ "scores": {
42
+ "Mean win rate": 0.471,
43
+ "NarrativeQA": 0.373,
44
+ "NaturalQuestions (closed-book)": 0.428,
45
+ "OpenbookQA": 0.946,
46
+ "MMLU": 0.712,
47
+ "MATH": 0.712,
48
+ "GSM8K": 0.69,
49
+ "LegalBench": 0.519,
50
+ "MedQA": 0.66,
51
+ "WMT 2014": 0.176
52
+ }
53
+ },
54
+ {
55
+ "model_id": "AlephAlpha/luminous-base",
56
+ "name": "Luminous Base 13B",
57
+ "developer": "AlephAlpha",
58
+ "scores": {
59
+ "Mean win rate": 0.041,
60
+ "NarrativeQA": 0.633,
61
+ "NaturalQuestions (closed-book)": 0.197,
62
+ "OpenbookQA": 0.286,
63
+ "MMLU": 0.243,
64
+ "MATH": 0.026,
65
+ "GSM8K": 0.028,
66
+ "LegalBench": 0.332,
67
+ "MedQA": 0.26,
68
+ "WMT 2014": 0.066
69
+ }
70
+ },
71
+ {
72
+ "model_id": "AlephAlpha/luminous-extended",
73
+ "name": "Luminous Extended 30B",
74
+ "developer": "AlephAlpha",
75
+ "scores": {
76
+ "Mean win rate": 0.078,
77
+ "NarrativeQA": 0.684,
78
+ "NaturalQuestions (closed-book)": 0.253,
79
+ "OpenbookQA": 0.272,
80
+ "MMLU": 0.248,
81
+ "MATH": 0.04,
82
+ "GSM8K": 0.075,
83
+ "LegalBench": 0.421,
84
+ "MedQA": 0.276,
85
+ "WMT 2014": 0.083
86
+ }
87
+ },
88
+ {
89
+ "model_id": "AlephAlpha/luminous-supreme",
90
+ "name": "Luminous Supreme 70B",
91
+ "developer": "AlephAlpha",
92
+ "scores": {
93
+ "Mean win rate": 0.145,
94
+ "NarrativeQA": 0.743,
95
+ "NaturalQuestions (closed-book)": 0.299,
96
+ "OpenbookQA": 0.284,
97
+ "MMLU": 0.316,
98
+ "MATH": 0.078,
99
+ "GSM8K": 0.137,
100
+ "LegalBench": 0.452,
101
+ "MedQA": 0.276,
102
+ "WMT 2014": 0.102
103
+ }
104
+ },
105
+ {
106
+ "model_id": "ai21/j2-grande",
107
+ "name": "Jurassic-2 Grande 17B",
108
+ "developer": "ai21",
109
+ "scores": {
110
+ "Mean win rate": 0.172,
111
+ "NarrativeQA": 0.744,
112
+ "NaturalQuestions (closed-book)": 0.35,
113
+ "OpenbookQA": 0.614,
114
+ "MMLU": 0.471,
115
+ "MATH": 0.064,
116
+ "GSM8K": 0.159,
117
+ "LegalBench": 0.468,
118
+ "MedQA": 0.39,
119
+ "WMT 2014": 0.102
120
+ }
121
+ },
122
+ {
123
+ "model_id": "ai21/j2-jumbo",
124
+ "name": "Jurassic-2 Jumbo 178B",
125
+ "developer": "ai21",
126
+ "scores": {
127
+ "Mean win rate": 0.215,
128
+ "NarrativeQA": 0.728,
129
+ "NaturalQuestions (closed-book)": 0.385,
130
+ "OpenbookQA": 0.688,
131
+ "MMLU": 0.483,
132
+ "MATH": 0.103,
133
+ "GSM8K": 0.239,
134
+ "LegalBench": 0.533,
135
+ "MedQA": 0.431,
136
+ "WMT 2014": 0.114
137
+ }
138
+ },
139
+ {
140
+ "model_id": "ai21/jamba-1.5-large",
141
+ "name": "Jamba 1.5 Large",
142
+ "developer": "ai21",
143
+ "scores": {
144
+ "Mean win rate": 0.637,
145
+ "NarrativeQA": 0.664,
146
+ "NaturalQuestions (closed-book)": 0.394,
147
+ "OpenbookQA": 0.948,
148
+ "MMLU": 0.683,
149
+ "MATH": 0.692,
150
+ "GSM8K": 0.846,
151
+ "LegalBench": 0.675,
152
+ "MedQA": 0.698,
153
+ "WMT 2014": 0.203
154
+ }
155
+ },
156
+ {
157
+ "model_id": "ai21/jamba-1.5-mini",
158
+ "name": "Jamba 1.5 Mini",
159
+ "developer": "ai21",
160
+ "scores": {
161
+ "Mean win rate": 0.414,
162
+ "NarrativeQA": 0.746,
163
+ "NaturalQuestions (closed-book)": 0.388,
164
+ "OpenbookQA": 0.89,
165
+ "MMLU": 0.582,
166
+ "MATH": 0.318,
167
+ "GSM8K": 0.691,
168
+ "LegalBench": 0.503,
169
+ "MedQA": 0.632,
170
+ "WMT 2014": 0.179
171
+ }
172
+ },
173
+ {
174
+ "model_id": "ai21/jamba-instruct",
175
+ "name": "Jamba Instruct",
176
+ "developer": "ai21",
177
+ "scores": {
178
+ "Mean win rate": 0.287,
179
+ "NarrativeQA": 0.658,
180
+ "NaturalQuestions (closed-book)": 0.384,
181
+ "OpenbookQA": 0.796,
182
+ "MMLU": 0.582,
183
+ "MATH": 0.38,
184
+ "GSM8K": 0.67,
185
+ "LegalBench": 0.54,
186
+ "MedQA": 0.519,
187
+ "WMT 2014": 0.164
188
+ }
189
+ },
190
+ {
191
+ "model_id": "allenai/olmo-7b",
192
+ "name": "OLMo 7B",
193
+ "developer": "allenai",
194
+ "scores": {
195
+ "Mean win rate": 0.052,
196
+ "NarrativeQA": 0.597,
197
+ "NaturalQuestions (closed-book)": 0.259,
198
+ "OpenbookQA": 0.222,
199
+ "MMLU": 0.305,
200
+ "MATH": 0.029,
201
+ "GSM8K": 0.044,
202
+ "LegalBench": 0.341,
203
+ "MedQA": 0.229,
204
+ "WMT 2014": 0.097
205
+ }
206
+ },
207
+ {
208
+ "model_id": "amazon/nova-lite-v1:0",
209
+ "name": "Amazon Nova Lite",
210
+ "developer": "amazon",
211
+ "scores": {
212
+ "Mean win rate": 0.708,
213
+ "NarrativeQA": 0.768,
214
+ "NaturalQuestions (closed-book)": 0.352,
215
+ "OpenbookQA": 0.928,
216
+ "MMLU": 0.693,
217
+ "MATH": 0.779,
218
+ "GSM8K": 0.829,
219
+ "LegalBench": 0.659,
220
+ "MedQA": 0.696,
221
+ "WMT 2014": 0.204
222
+ }
223
+ },
224
+ {
225
+ "model_id": "amazon/nova-micro-v1:0",
226
+ "name": "Amazon Nova Micro",
227
+ "developer": "amazon",
228
+ "scores": {
229
+ "Mean win rate": 0.524,
230
+ "NarrativeQA": 0.744,
231
+ "NaturalQuestions (closed-book)": 0.285,
232
+ "OpenbookQA": 0.888,
233
+ "MMLU": 0.64,
234
+ "MATH": 0.76,
235
+ "GSM8K": 0.794,
236
+ "LegalBench": 0.615,
237
+ "MedQA": 0.608,
238
+ "WMT 2014": 0.192
239
+ }
240
+ },
241
+ {
242
+ "model_id": "amazon/nova-pro-v1:0",
243
+ "name": "Amazon Nova Pro",
244
+ "developer": "amazon",
245
+ "scores": {
246
+ "Mean win rate": 0.885,
247
+ "NarrativeQA": 0.791,
248
+ "NaturalQuestions (closed-book)": 0.405,
249
+ "OpenbookQA": 0.96,
250
+ "MMLU": 0.758,
251
+ "MATH": 0.821,
252
+ "GSM8K": 0.87,
253
+ "LegalBench": 0.736,
254
+ "MedQA": 0.811,
255
+ "WMT 2014": 0.229
256
+ }
257
+ },
258
+ {
259
+ "model_id": "anthropic/claude-2.0",
260
+ "name": "Claude 2.0",
261
+ "developer": "anthropic",
262
+ "scores": {
263
+ "Mean win rate": 0.489,
264
+ "NarrativeQA": 0.718,
265
+ "NaturalQuestions (closed-book)": 0.428,
266
+ "OpenbookQA": 0.862,
267
+ "MMLU": 0.639,
268
+ "MATH": 0.603,
269
+ "GSM8K": 0.583,
270
+ "LegalBench": 0.643,
271
+ "MedQA": 0.652,
272
+ "WMT 2014": 0.219
273
+ }
274
+ },
275
+ {
276
+ "model_id": "anthropic/claude-2.1",
277
+ "name": "Claude 2.1",
278
+ "developer": "anthropic",
279
+ "scores": {
280
+ "Mean win rate": 0.437,
281
+ "NarrativeQA": 0.677,
282
+ "NaturalQuestions (closed-book)": 0.375,
283
+ "OpenbookQA": 0.872,
284
+ "MMLU": 0.643,
285
+ "MATH": 0.632,
286
+ "GSM8K": 0.604,
287
+ "LegalBench": 0.643,
288
+ "MedQA": 0.644,
289
+ "WMT 2014": 0.204
290
+ }
291
+ },
292
+ {
293
+ "model_id": "anthropic/claude-3-5-haiku-20241022",
294
+ "name": "claude-3-5-haiku-20241022",
295
+ "developer": "anthropic",
296
+ "scores": {
297
+ "Mean win rate": 0.531,
298
+ "NarrativeQA": 0.763,
299
+ "NaturalQuestions (closed-book)": 0.344,
300
+ "OpenbookQA": 0.854,
301
+ "MMLU": 0.671,
302
+ "MATH": 0.872,
303
+ "GSM8K": 0.815,
304
+ "LegalBench": 0.631,
305
+ "MedQA": 0.722,
306
+ "WMT 2014": 0.135
307
+ }
308
+ },
309
+ {
310
+ "model_id": "anthropic/claude-3-5-sonnet-20240620",
311
+ "name": "Claude 3.5 Sonnet 20240620",
312
+ "developer": "anthropic",
313
+ "scores": {
314
+ "Mean win rate": 0.885,
315
+ "NarrativeQA": 0.746,
316
+ "NaturalQuestions (closed-book)": 0.502,
317
+ "OpenbookQA": 0.972,
318
+ "MMLU": 0.799,
319
+ "MATH": 0.813,
320
+ "GSM8K": 0.949,
321
+ "LegalBench": 0.707,
322
+ "MedQA": 0.825,
323
+ "WMT 2014": 0.229
324
+ }
325
+ },
326
+ {
327
+ "model_id": "anthropic/claude-3-5-sonnet-20241022",
328
+ "name": "Claude 3.5 Sonnet 20241022",
329
+ "developer": "anthropic",
330
+ "scores": {
331
+ "Mean win rate": 0.846,
332
+ "NarrativeQA": 0.77,
333
+ "NaturalQuestions (closed-book)": 0.467,
334
+ "OpenbookQA": 0.966,
335
+ "MMLU": 0.809,
336
+ "MATH": 0.904,
337
+ "GSM8K": 0.956,
338
+ "LegalBench": 0.647,
339
+ "MedQA": 0.859,
340
+ "WMT 2014": 0.226
341
+ }
342
+ },
343
+ {
344
+ "model_id": "anthropic/claude-3-haiku-20240307",
345
+ "name": "Claude 3 Haiku 20240307",
346
+ "developer": "anthropic",
347
+ "scores": {
348
+ "Mean win rate": 0.263,
349
+ "NarrativeQA": 0.244,
350
+ "NaturalQuestions (closed-book)": 0.144,
351
+ "OpenbookQA": 0.838,
352
+ "MMLU": 0.662,
353
+ "MATH": 0.131,
354
+ "GSM8K": 0.699,
355
+ "LegalBench": 0.46,
356
+ "MedQA": 0.702,
357
+ "WMT 2014": 0.148
358
+ }
359
+ },
360
+ {
361
+ "model_id": "anthropic/claude-3-opus-20240229",
362
+ "name": "Claude 3 Opus 20240229",
363
+ "developer": "anthropic",
364
+ "scores": {
365
+ "Mean win rate": 0.683,
366
+ "NarrativeQA": 0.351,
367
+ "NaturalQuestions (closed-book)": 0.441,
368
+ "OpenbookQA": 0.956,
369
+ "MMLU": 0.768,
370
+ "MATH": 0.76,
371
+ "GSM8K": 0.924,
372
+ "LegalBench": 0.662,
373
+ "MedQA": 0.775,
374
+ "WMT 2014": 0.24
375
+ }
376
+ },
377
+ {
378
+ "model_id": "anthropic/claude-3-sonnet-20240229",
379
+ "name": "Claude 3 Sonnet 20240229",
380
+ "developer": "anthropic",
381
+ "scores": {
382
+ "Mean win rate": 0.377,
383
+ "NarrativeQA": 0.111,
384
+ "NaturalQuestions (closed-book)": 0.028,
385
+ "OpenbookQA": 0.918,
386
+ "MMLU": 0.652,
387
+ "MATH": 0.084,
388
+ "GSM8K": 0.907,
389
+ "LegalBench": 0.49,
390
+ "MedQA": 0.684,
391
+ "WMT 2014": 0.218
392
+ }
393
+ },
394
+ {
395
+ "model_id": "anthropic/claude-instant-1.2",
396
+ "name": "Claude Instant 1.2",
397
+ "developer": "anthropic",
398
+ "scores": {
399
+ "Mean win rate": 0.399,
400
+ "NarrativeQA": 0.616,
401
+ "NaturalQuestions (closed-book)": 0.343,
402
+ "OpenbookQA": 0.844,
403
+ "MMLU": 0.631,
404
+ "MATH": 0.499,
405
+ "GSM8K": 0.721,
406
+ "LegalBench": 0.586,
407
+ "MedQA": 0.559,
408
+ "WMT 2014": 0.194
409
+ }
410
+ },
411
+ {
412
+ "model_id": "anthropic/claude-v1.3",
413
+ "name": "Anthropic Claude v1.3",
414
+ "developer": "anthropic",
415
+ "scores": {
416
+ "Mean win rate": 0.518,
417
+ "NarrativeQA": 0.723,
418
+ "NaturalQuestions (closed-book)": 0.409,
419
+ "OpenbookQA": 0.908,
420
+ "MMLU": 0.631,
421
+ "MATH": 0.54,
422
+ "GSM8K": 0.784,
423
+ "LegalBench": 0.629,
424
+ "MedQA": 0.618,
425
+ "WMT 2014": 0.219
426
+ }
427
+ },
428
+ {
429
+ "model_id": "cohere/command",
430
+ "name": "Command",
431
+ "developer": "cohere",
432
+ "scores": {
433
+ "Mean win rate": 0.327,
434
+ "NarrativeQA": 0.749,
435
+ "NaturalQuestions (closed-book)": 0.391,
436
+ "OpenbookQA": 0.774,
437
+ "MMLU": 0.525,
438
+ "MATH": 0.236,
439
+ "GSM8K": 0.452,
440
+ "LegalBench": 0.578,
441
+ "MedQA": 0.445,
442
+ "WMT 2014": 0.088
443
+ }
444
+ },
445
+ {
446
+ "model_id": "cohere/command-light",
447
+ "name": "Command Light",
448
+ "developer": "cohere",
449
+ "scores": {
450
+ "Mean win rate": 0.105,
451
+ "NarrativeQA": 0.629,
452
+ "NaturalQuestions (closed-book)": 0.195,
453
+ "OpenbookQA": 0.398,
454
+ "MMLU": 0.386,
455
+ "MATH": 0.098,
456
+ "GSM8K": 0.149,
457
+ "LegalBench": 0.397,
458
+ "MedQA": 0.312,
459
+ "WMT 2014": 0.023
460
+ }
461
+ },
462
+ {
463
+ "model_id": "cohere/command-r",
464
+ "name": "Command R",
465
+ "developer": "cohere",
466
+ "scores": {
467
+ "Mean win rate": 0.299,
468
+ "NarrativeQA": 0.742,
469
+ "NaturalQuestions (closed-book)": 0.352,
470
+ "OpenbookQA": 0.782,
471
+ "MMLU": 0.567,
472
+ "MATH": 0.266,
473
+ "GSM8K": 0.551,
474
+ "LegalBench": 0.507,
475
+ "MedQA": 0.555,
476
+ "WMT 2014": 0.149
477
+ }
478
+ },
479
+ {
480
+ "model_id": "cohere/command-r-plus",
481
+ "name": "Command R Plus",
482
+ "developer": "cohere",
483
+ "scores": {
484
+ "Mean win rate": 0.441,
485
+ "NarrativeQA": 0.735,
486
+ "NaturalQuestions (closed-book)": 0.343,
487
+ "OpenbookQA": 0.828,
488
+ "MMLU": 0.59,
489
+ "MATH": 0.403,
490
+ "GSM8K": 0.738,
491
+ "LegalBench": 0.672,
492
+ "MedQA": 0.567,
493
+ "WMT 2014": 0.203
494
+ }
495
+ },
496
+ {
497
+ "model_id": "databricks/dbrx-instruct",
498
+ "name": "DBRX Instruct",
499
+ "developer": "databricks",
500
+ "scores": {
501
+ "Mean win rate": 0.289,
502
+ "NarrativeQA": 0.488,
503
+ "NaturalQuestions (closed-book)": 0.284,
504
+ "OpenbookQA": 0.91,
505
+ "MMLU": 0.643,
506
+ "MATH": 0.358,
507
+ "GSM8K": 0.671,
508
+ "LegalBench": 0.426,
509
+ "MedQA": 0.694,
510
+ "WMT 2014": 0.131
511
+ }
512
+ },
513
+ {
514
+ "model_id": "deepseek-ai/deepseek-llm-67b-chat",
515
+ "name": "DeepSeek LLM Chat 67B",
516
+ "developer": "deepseek-ai",
517
+ "scores": {
518
+ "Mean win rate": 0.488,
519
+ "NarrativeQA": 0.581,
520
+ "NaturalQuestions (closed-book)": 0.412,
521
+ "OpenbookQA": 0.88,
522
+ "MMLU": 0.641,
523
+ "MATH": 0.615,
524
+ "GSM8K": 0.795,
525
+ "LegalBench": 0.637,
526
+ "MedQA": 0.628,
527
+ "WMT 2014": 0.186
528
+ }
529
+ },
530
+ {
531
+ "model_id": "deepseek-ai/deepseek-v3",
532
+ "name": "DeepSeek v3",
533
+ "developer": "deepseek-ai",
534
+ "scores": {
535
+ "Mean win rate": 0.908,
536
+ "NarrativeQA": 0.796,
537
+ "NaturalQuestions (closed-book)": 0.467,
538
+ "OpenbookQA": 0.954,
539
+ "MMLU": 0.803,
540
+ "MATH": 0.912,
541
+ "GSM8K": 0.94,
542
+ "LegalBench": 0.718,
543
+ "MedQA": 0.809,
544
+ "WMT 2014": 0.209
545
+ }
546
+ },
547
+ {
548
+ "model_id": "google/gemini-1.0-pro-002",
549
+ "name": "Gemini 1.0 Pro 002",
550
+ "developer": "google",
551
+ "scores": {
552
+ "Mean win rate": 0.422,
553
+ "NarrativeQA": 0.751,
554
+ "NaturalQuestions (closed-book)": 0.391,
555
+ "OpenbookQA": 0.788,
556
+ "MMLU": 0.534,
557
+ "MATH": 0.665,
558
+ "GSM8K": 0.816,
559
+ "LegalBench": 0.475,
560
+ "MedQA": 0.483,
561
+ "WMT 2014": 0.194
562
+ }
563
+ },
564
+ {
565
+ "model_id": "google/gemini-1.5-flash-001",
566
+ "name": "Gemini 1.5 Flash 001",
567
+ "developer": "google",
568
+ "scores": {
569
+ "Mean win rate": 0.667,
570
+ "NarrativeQA": 0.783,
571
+ "NaturalQuestions (closed-book)": 0.332,
572
+ "OpenbookQA": 0.928,
573
+ "MMLU": 0.703,
574
+ "MATH": 0.753,
575
+ "GSM8K": 0.785,
576
+ "LegalBench": 0.661,
577
+ "MedQA": 0.68,
578
+ "WMT 2014": 0.225
579
+ }
580
+ },
581
+ {
582
+ "model_id": "google/gemini-1.5-flash-002",
583
+ "name": "Gemini 1.5 Flash 002",
584
+ "developer": "google",
585
+ "scores": {
586
+ "Mean win rate": 0.573,
587
+ "NarrativeQA": 0.746,
588
+ "NaturalQuestions (closed-book)": 0.323,
589
+ "OpenbookQA": 0.914,
590
+ "MMLU": 0.679,
591
+ "MATH": 0.908,
592
+ "GSM8K": 0.328,
593
+ "LegalBench": 0.67,
594
+ "MedQA": 0.656,
595
+ "WMT 2014": 0.212
596
+ }
597
+ },
598
+ {
599
+ "model_id": "google/gemini-1.5-pro-001",
600
+ "name": "Gemini 1.5 Pro 001",
601
+ "developer": "google",
602
+ "scores": {
603
+ "Mean win rate": 0.739,
604
+ "NarrativeQA": 0.783,
605
+ "NaturalQuestions (closed-book)": 0.378,
606
+ "OpenbookQA": 0.902,
607
+ "MMLU": 0.772,
608
+ "MATH": 0.825,
609
+ "GSM8K": 0.836,
610
+ "LegalBench": 0.757,
611
+ "MedQA": 0.692,
612
+ "WMT 2014": 0.189
613
+ }
614
+ },
615
+ {
616
+ "model_id": "google/gemini-1.5-pro-002",
617
+ "name": "Gemini 1.5 Pro 002",
618
+ "developer": "google",
619
+ "scores": {
620
+ "Mean win rate": 0.842,
621
+ "NarrativeQA": 0.756,
622
+ "NaturalQuestions (closed-book)": 0.455,
623
+ "OpenbookQA": 0.952,
624
+ "MMLU": 0.795,
625
+ "MATH": 0.92,
626
+ "GSM8K": 0.817,
627
+ "LegalBench": 0.747,
628
+ "MedQA": 0.771,
629
+ "WMT 2014": 0.231
630
+ }
631
+ },
632
+ {
633
+ "model_id": "google/gemini-2.0-flash-exp",
634
+ "name": "Gemini 2.0 Flash Experimental",
635
+ "developer": "google",
636
+ "scores": {
637
+ "Mean win rate": 0.813,
638
+ "NarrativeQA": 0.783,
639
+ "NaturalQuestions (closed-book)": 0.443,
640
+ "OpenbookQA": 0.946,
641
+ "MMLU": 0.717,
642
+ "MATH": 0.901,
643
+ "GSM8K": 0.946,
644
+ "LegalBench": 0.674,
645
+ "MedQA": 0.73,
646
+ "WMT 2014": 0.212
647
+ }
648
+ },
649
+ {
650
+ "model_id": "google/gemma-2-27b-it",
651
+ "name": "Gemma 2 Instruct 27B",
652
+ "developer": "google",
653
+ "scores": {
654
+ "Mean win rate": 0.675,
655
+ "NarrativeQA": 0.79,
656
+ "NaturalQuestions (closed-book)": 0.353,
657
+ "OpenbookQA": 0.918,
658
+ "MMLU": 0.664,
659
+ "MATH": 0.746,
660
+ "GSM8K": 0.812,
661
+ "LegalBench": 0.7,
662
+ "MedQA": 0.684,
663
+ "WMT 2014": 0.214
664
+ }
665
+ },
666
+ {
667
+ "model_id": "google/gemma-2-9b-it",
668
+ "name": "Gemma 2 Instruct 9B",
669
+ "developer": "google",
670
+ "scores": {
671
+ "Mean win rate": 0.562,
672
+ "NarrativeQA": 0.768,
673
+ "NaturalQuestions (closed-book)": 0.328,
674
+ "OpenbookQA": 0.91,
675
+ "MMLU": 0.645,
676
+ "MATH": 0.724,
677
+ "GSM8K": 0.762,
678
+ "LegalBench": 0.639,
679
+ "MedQA": 0.63,
680
+ "WMT 2014": 0.201
681
+ }
682
+ },
683
+ {
684
+ "model_id": "google/gemma-7b",
685
+ "name": "Gemma 7B",
686
+ "developer": "google",
687
+ "scores": {
688
+ "Mean win rate": 0.336,
689
+ "NarrativeQA": 0.752,
690
+ "NaturalQuestions (closed-book)": 0.336,
691
+ "OpenbookQA": 0.808,
692
+ "MMLU": 0.571,
693
+ "MATH": 0.5,
694
+ "GSM8K": 0.559,
695
+ "LegalBench": 0.581,
696
+ "MedQA": 0.513,
697
+ "WMT 2014": 0.187
698
+ }
699
+ },
700
+ {
701
+ "model_id": "google/text-bison@001",
702
+ "name": "PaLM-2 Bison",
703
+ "developer": "google",
704
+ "scores": {
705
+ "Mean win rate": 0.526,
706
+ "NarrativeQA": 0.718,
707
+ "NaturalQuestions (closed-book)": 0.39,
708
+ "OpenbookQA": 0.878,
709
+ "MMLU": 0.608,
710
+ "MATH": 0.421,
711
+ "GSM8K": 0.61,
712
+ "LegalBench": 0.645,
713
+ "MedQA": 0.547,
714
+ "WMT 2014": 0.241
715
+ }
716
+ },
717
+ {
718
+ "model_id": "google/text-unicorn@001",
719
+ "name": "PaLM-2 Unicorn",
720
+ "developer": "google",
721
+ "scores": {
722
+ "Mean win rate": 0.644,
723
+ "NarrativeQA": 0.583,
724
+ "NaturalQuestions (closed-book)": 0.435,
725
+ "OpenbookQA": 0.938,
726
+ "MMLU": 0.702,
727
+ "MATH": 0.674,
728
+ "GSM8K": 0.831,
729
+ "LegalBench": 0.677,
730
+ "MedQA": 0.684,
731
+ "WMT 2014": 0.26
732
+ }
733
+ },
734
+ {
735
+ "model_id": "meta/llama-2-13b",
736
+ "name": "Llama 2 13B",
737
+ "developer": "meta",
738
+ "scores": {
739
+ "Mean win rate": 0.233,
740
+ "NarrativeQA": 0.741,
741
+ "NaturalQuestions (closed-book)": 0.371,
742
+ "OpenbookQA": 0.634,
743
+ "MMLU": 0.505,
744
+ "MATH": 0.102,
745
+ "GSM8K": 0.266,
746
+ "LegalBench": 0.591,
747
+ "MedQA": 0.392,
748
+ "WMT 2014": 0.167
749
+ }
750
+ },
751
+ {
752
+ "model_id": "meta/llama-2-70b",
753
+ "name": "Llama 2 70B",
754
+ "developer": "meta",
755
+ "scores": {
756
+ "Mean win rate": 0.482,
757
+ "NarrativeQA": 0.763,
758
+ "NaturalQuestions (closed-book)": 0.46,
759
+ "OpenbookQA": 0.838,
760
+ "MMLU": 0.58,
761
+ "MATH": 0.323,
762
+ "GSM8K": 0.567,
763
+ "LegalBench": 0.673,
764
+ "MedQA": 0.618,
765
+ "WMT 2014": 0.196
766
+ }
767
+ },
768
+ {
769
+ "model_id": "meta/llama-2-7b",
770
+ "name": "Llama 2 7B",
771
+ "developer": "meta",
772
+ "scores": {
773
+ "Mean win rate": 0.152,
774
+ "NarrativeQA": 0.686,
775
+ "NaturalQuestions (closed-book)": 0.333,
776
+ "OpenbookQA": 0.544,
777
+ "MMLU": 0.425,
778
+ "MATH": 0.097,
779
+ "GSM8K": 0.154,
780
+ "LegalBench": 0.502,
781
+ "MedQA": 0.392,
782
+ "WMT 2014": 0.144
783
+ }
784
+ },
785
+ {
786
+ "model_id": "meta/llama-3-70b",
787
+ "name": "Llama 3 70B",
788
+ "developer": "meta",
789
+ "scores": {
790
+ "Mean win rate": 0.793,
791
+ "NarrativeQA": 0.798,
792
+ "NaturalQuestions (closed-book)": 0.475,
793
+ "OpenbookQA": 0.934,
794
+ "MMLU": 0.695,
795
+ "MATH": 0.663,
796
+ "GSM8K": 0.805,
797
+ "LegalBench": 0.733,
798
+ "MedQA": 0.777,
799
+ "WMT 2014": 0.225
800
+ }
801
+ },
802
+ {
803
+ "model_id": "meta/llama-3-8b",
804
+ "name": "Llama 3 8B",
805
+ "developer": "meta",
806
+ "scores": {
807
+ "Mean win rate": 0.387,
808
+ "NarrativeQA": 0.754,
809
+ "NaturalQuestions (closed-book)": 0.378,
810
+ "OpenbookQA": 0.766,
811
+ "MMLU": 0.602,
812
+ "MATH": 0.391,
813
+ "GSM8K": 0.499,
814
+ "LegalBench": 0.637,
815
+ "MedQA": 0.581,
816
+ "WMT 2014": 0.183
817
+ }
818
+ },
819
+ {
820
+ "model_id": "meta/llama-3.1-405b-instruct-turbo",
821
+ "name": "Llama 3.1 Instruct Turbo 405B",
822
+ "developer": "meta",
823
+ "scores": {
824
+ "Mean win rate": 0.854,
825
+ "NarrativeQA": 0.749,
826
+ "NaturalQuestions (closed-book)": 0.456,
827
+ "OpenbookQA": 0.94,
828
+ "MMLU": 0.759,
829
+ "MATH": 0.827,
830
+ "GSM8K": 0.949,
831
+ "LegalBench": 0.707,
832
+ "MedQA": 0.805,
833
+ "WMT 2014": 0.238
834
+ }
835
+ },
836
+ {
837
+ "model_id": "meta/llama-3.1-70b-instruct-turbo",
838
+ "name": "Llama 3.1 Instruct Turbo 70B",
839
+ "developer": "meta",
840
+ "scores": {
841
+ "Mean win rate": 0.808,
842
+ "NarrativeQA": 0.772,
843
+ "NaturalQuestions (closed-book)": 0.452,
844
+ "OpenbookQA": 0.938,
845
+ "MMLU": 0.709,
846
+ "MATH": 0.783,
847
+ "GSM8K": 0.938,
848
+ "LegalBench": 0.687,
849
+ "MedQA": 0.769,
850
+ "WMT 2014": 0.223
851
+ }
852
+ },
853
+ {
854
+ "model_id": "meta/llama-3.1-8b-instruct-turbo",
855
+ "name": "Llama 3.1 Instruct Turbo 8B",
856
+ "developer": "meta",
857
+ "scores": {
858
+ "Mean win rate": 0.303,
859
+ "NarrativeQA": 0.756,
860
+ "NaturalQuestions (closed-book)": 0.209,
861
+ "OpenbookQA": 0.74,
862
+ "MMLU": 0.5,
863
+ "MATH": 0.703,
864
+ "GSM8K": 0.798,
865
+ "LegalBench": 0.342,
866
+ "MedQA": 0.245,
867
+ "WMT 2014": 0.181
868
+ }
869
+ },
870
+ {
871
+ "model_id": "meta/llama-3.2-11b-vision-instruct-turbo",
872
+ "name": "Llama 3.2 Vision Instruct Turbo 11B",
873
+ "developer": "meta",
874
+ "scores": {
875
+ "Mean win rate": 0.325,
876
+ "NarrativeQA": 0.756,
877
+ "NaturalQuestions (closed-book)": 0.234,
878
+ "OpenbookQA": 0.724,
879
+ "MMLU": 0.511,
880
+ "MATH": 0.739,
881
+ "GSM8K": 0.823,
882
+ "LegalBench": 0.435,
883
+ "MedQA": 0.27,
884
+ "WMT 2014": 0.179
885
+ }
886
+ },
887
+ {
888
+ "model_id": "meta/llama-3.2-90b-vision-instruct-turbo",
889
+ "name": "Llama 3.2 Vision Instruct Turbo 90B",
890
+ "developer": "meta",
891
+ "scores": {
892
+ "Mean win rate": 0.819,
893
+ "NarrativeQA": 0.777,
894
+ "NaturalQuestions (closed-book)": 0.457,
895
+ "OpenbookQA": 0.942,
896
+ "MMLU": 0.703,
897
+ "MATH": 0.791,
898
+ "GSM8K": 0.936,
899
+ "LegalBench": 0.68,
900
+ "MedQA": 0.769,
901
+ "WMT 2014": 0.224
902
+ }
903
+ },
904
+ {
905
+ "model_id": "meta/llama-3.3-70b-instruct-turbo",
906
+ "name": "Llama 3.3 Instruct Turbo 70B",
907
+ "developer": "meta",
908
+ "scores": {
909
+ "Mean win rate": 0.812,
910
+ "NarrativeQA": 0.791,
911
+ "NaturalQuestions (closed-book)": 0.431,
912
+ "OpenbookQA": 0.928,
913
+ "MMLU": 0.7,
914
+ "MATH": 0.808,
915
+ "GSM8K": 0.942,
916
+ "LegalBench": 0.725,
917
+ "MedQA": 0.761,
918
+ "WMT 2014": 0.219
919
+ }
920
+ },
921
+ {
922
+ "model_id": "meta/llama-65b",
923
+ "name": "LLaMA 65B",
924
+ "developer": "meta",
925
+ "scores": {
926
+ "Mean win rate": 0.345,
927
+ "NarrativeQA": 0.755,
928
+ "NaturalQuestions (closed-book)": 0.433,
929
+ "OpenbookQA": 0.754,
930
+ "MMLU": 0.584,
931
+ "MATH": 0.257,
932
+ "GSM8K": 0.489,
933
+ "LegalBench": 0.48,
934
+ "MedQA": 0.507,
935
+ "WMT 2014": 0.189
936
+ }
937
+ },
938
+ {
939
+ "model_id": "microsoft/phi-2",
940
+ "name": "Phi-2",
941
+ "developer": "microsoft",
942
+ "scores": {
943
+ "Mean win rate": 0.169,
944
+ "NarrativeQA": 0.703,
945
+ "NaturalQuestions (closed-book)": 0.155,
946
+ "OpenbookQA": 0.798,
947
+ "MMLU": 0.518,
948
+ "MATH": 0.255,
949
+ "GSM8K": 0.581,
950
+ "LegalBench": 0.334,
951
+ "MedQA": 0.41,
952
+ "WMT 2014": 0.038
953
+ }
954
+ },
955
+ {
956
+ "model_id": "microsoft/phi-3-medium-4k-instruct",
957
+ "name": "Phi-3 14B",
958
+ "developer": "microsoft",
959
+ "scores": {
960
+ "Mean win rate": 0.509,
961
+ "NarrativeQA": 0.724,
962
+ "NaturalQuestions (closed-book)": 0.278,
963
+ "OpenbookQA": 0.916,
964
+ "MMLU": 0.675,
965
+ "MATH": 0.611,
966
+ "GSM8K": 0.878,
967
+ "LegalBench": 0.593,
968
+ "MedQA": 0.696,
969
+ "WMT 2014": 0.17
970
+ }
971
+ },
972
+ {
973
+ "model_id": "microsoft/phi-3-small-8k-instruct",
974
+ "name": "Phi-3 7B",
975
+ "developer": "microsoft",
976
+ "scores": {
977
+ "Mean win rate": 0.473,
978
+ "NarrativeQA": 0.754,
979
+ "NaturalQuestions (closed-book)": 0.324,
980
+ "OpenbookQA": 0.912,
981
+ "MMLU": 0.659,
982
+ "MATH": 0.703,
983
+ "GSM8K": -1.0,
984
+ "LegalBench": 0.584,
985
+ "MedQA": 0.672,
986
+ "WMT 2014": 0.154
987
+ }
988
+ },
989
+ {
990
+ "model_id": "mistralai/mistral-7b-instruct-v0.3",
991
+ "name": "Mistral Instruct v0.3 7B",
992
+ "developer": "mistralai",
993
+ "scores": {
994
+ "Mean win rate": 0.196,
995
+ "NarrativeQA": 0.716,
996
+ "NaturalQuestions (closed-book)": 0.253,
997
+ "OpenbookQA": 0.79,
998
+ "MMLU": 0.51,
999
+ "MATH": 0.289,
1000
+ "GSM8K": 0.538,
1001
+ "LegalBench": 0.331,
1002
+ "MedQA": 0.517,
1003
+ "WMT 2014": 0.142
1004
+ }
1005
+ },
1006
+ {
1007
+ "model_id": "mistralai/mistral-7b-v0.1",
1008
+ "name": "Mistral v0.1 7B",
1009
+ "developer": "mistralai",
1010
+ "scores": {
1011
+ "Mean win rate": 0.292,
1012
+ "NarrativeQA": 0.716,
1013
+ "NaturalQuestions (closed-book)": 0.367,
1014
+ "OpenbookQA": 0.776,
1015
+ "MMLU": 0.584,
1016
+ "MATH": 0.297,
1017
+ "GSM8K": 0.377,
1018
+ "LegalBench": 0.58,
1019
+ "MedQA": 0.525,
1020
+ "WMT 2014": 0.16
1021
+ }
1022
+ },
1023
+ {
1024
+ "model_id": "mistralai/mistral-large-2402",
1025
+ "name": "Mistral Large 2402",
1026
+ "developer": "mistralai",
1027
+ "scores": {
1028
+ "Mean win rate": 0.328,
1029
+ "NarrativeQA": 0.454,
1030
+ "NaturalQuestions (closed-book)": 0.311,
1031
+ "OpenbookQA": 0.894,
1032
+ "MMLU": 0.638,
1033
+ "MATH": 0.75,
1034
+ "GSM8K": 0.694,
1035
+ "LegalBench": 0.479,
1036
+ "MedQA": 0.499,
1037
+ "WMT 2014": 0.182
1038
+ }
1039
+ },
1040
+ {
1041
+ "model_id": "mistralai/mistral-large-2407",
1042
+ "name": "Mistral Large 2 2407",
1043
+ "developer": "mistralai",
1044
+ "scores": {
1045
+ "Mean win rate": 0.744,
1046
+ "NarrativeQA": 0.779,
1047
+ "NaturalQuestions (closed-book)": 0.453,
1048
+ "OpenbookQA": 0.932,
1049
+ "MMLU": 0.725,
1050
+ "MATH": 0.677,
1051
+ "GSM8K": 0.912,
1052
+ "LegalBench": 0.646,
1053
+ "MedQA": 0.775,
1054
+ "WMT 2014": 0.192
1055
+ }
1056
+ },
1057
+ {
1058
+ "model_id": "mistralai/mistral-medium-2312",
1059
+ "name": "Mistral Medium 2312",
1060
+ "developer": "mistralai",
1061
+ "scores": {
1062
+ "Mean win rate": 0.268,
1063
+ "NarrativeQA": 0.449,
1064
+ "NaturalQuestions (closed-book)": 0.29,
1065
+ "OpenbookQA": 0.83,
1066
+ "MMLU": 0.618,
1067
+ "MATH": 0.565,
1068
+ "GSM8K": 0.706,
1069
+ "LegalBench": 0.452,
1070
+ "MedQA": 0.61,
1071
+ "WMT 2014": 0.169
1072
+ }
1073
+ },
1074
+ {
1075
+ "model_id": "mistralai/mistral-small-2402",
1076
+ "name": "Mistral Small 2402",
1077
+ "developer": "mistralai",
1078
+ "scores": {
1079
+ "Mean win rate": 0.288,
1080
+ "NarrativeQA": 0.519,
1081
+ "NaturalQuestions (closed-book)": 0.304,
1082
+ "OpenbookQA": 0.862,
1083
+ "MMLU": 0.593,
1084
+ "MATH": 0.621,
1085
+ "GSM8K": 0.734,
1086
+ "LegalBench": 0.389,
1087
+ "MedQA": 0.616,
1088
+ "WMT 2014": 0.169
1089
+ }
1090
+ },
1091
+ {
1092
+ "model_id": "mistralai/mixtral-8x22b",
1093
+ "name": "Mixtral 8x22B",
1094
+ "developer": "mistralai",
1095
+ "scores": {
1096
+ "Mean win rate": 0.705,
1097
+ "NarrativeQA": 0.779,
1098
+ "NaturalQuestions (closed-book)": 0.478,
1099
+ "OpenbookQA": 0.882,
1100
+ "MMLU": 0.701,
1101
+ "MATH": 0.656,
1102
+ "GSM8K": 0.8,
1103
+ "LegalBench": 0.708,
1104
+ "MedQA": 0.704,
1105
+ "WMT 2014": 0.209
1106
+ }
1107
+ },
1108
+ {
1109
+ "model_id": "mistralai/mixtral-8x7b-32kseqlen",
1110
+ "name": "Mixtral 8x7B 32K seqlen",
1111
+ "developer": "mistralai",
1112
+ "scores": {
1113
+ "Mean win rate": 0.51,
1114
+ "NarrativeQA": 0.767,
1115
+ "NaturalQuestions (closed-book)": 0.427,
1116
+ "OpenbookQA": 0.868,
1117
+ "MMLU": 0.649,
1118
+ "MATH": 0.494,
1119
+ "GSM8K": 0.622,
1120
+ "LegalBench": 0.63,
1121
+ "MedQA": 0.652,
1122
+ "WMT 2014": 0.19
1123
+ }
1124
+ },
1125
+ {
1126
+ "model_id": "mistralai/open-mistral-nemo-2407",
1127
+ "name": "Mistral NeMo 2402",
1128
+ "developer": "mistralai",
1129
+ "scores": {
1130
+ "Mean win rate": 0.333,
1131
+ "NarrativeQA": 0.731,
1132
+ "NaturalQuestions (closed-book)": 0.265,
1133
+ "OpenbookQA": 0.822,
1134
+ "MMLU": 0.604,
1135
+ "MATH": 0.668,
1136
+ "GSM8K": 0.782,
1137
+ "LegalBench": 0.415,
1138
+ "MedQA": 0.59,
1139
+ "WMT 2014": 0.177
1140
+ }
1141
+ },
1142
+ {
1143
+ "model_id": "openai/gpt-3.5-turbo-0613",
1144
+ "name": "gpt-3.5-turbo-0613",
1145
+ "developer": "openai",
1146
+ "scores": {
1147
+ "Mean win rate": 0.358,
1148
+ "NarrativeQA": 0.655,
1149
+ "NaturalQuestions (closed-book)": 0.335,
1150
+ "OpenbookQA": 0.838,
1151
+ "MMLU": 0.614,
1152
+ "MATH": 0.667,
1153
+ "GSM8K": 0.501,
1154
+ "LegalBench": 0.528,
1155
+ "MedQA": 0.622,
1156
+ "WMT 2014": 0.187
1157
+ }
1158
+ },
1159
+ {
1160
+ "model_id": "openai/gpt-4-0613",
1161
+ "name": "GPT-4 0613",
1162
+ "developer": "openai",
1163
+ "scores": {
1164
+ "Mean win rate": 0.867,
1165
+ "NarrativeQA": 0.768,
1166
+ "NaturalQuestions (closed-book)": 0.457,
1167
+ "OpenbookQA": 0.96,
1168
+ "MMLU": 0.735,
1169
+ "MATH": 0.802,
1170
+ "GSM8K": 0.932,
1171
+ "LegalBench": 0.713,
1172
+ "MedQA": 0.815,
1173
+ "WMT 2014": 0.211
1174
+ }
1175
+ },
1176
+ {
1177
+ "model_id": "openai/gpt-4-1106-preview",
1178
+ "name": "GPT-4 Turbo 1106 preview",
1179
+ "developer": "openai",
1180
+ "scores": {
1181
+ "Mean win rate": 0.698,
1182
+ "NarrativeQA": 0.727,
1183
+ "NaturalQuestions (closed-book)": 0.435,
1184
+ "OpenbookQA": 0.95,
1185
+ "MMLU": 0.699,
1186
+ "MATH": 0.857,
1187
+ "GSM8K": 0.668,
1188
+ "LegalBench": 0.626,
1189
+ "MedQA": 0.817,
1190
+ "WMT 2014": 0.205
1191
+ }
1192
+ },
1193
+ {
1194
+ "model_id": "openai/gpt-4-turbo-2024-04-09",
1195
+ "name": "GPT-4 Turbo 2024-04-09",
1196
+ "developer": "openai",
1197
+ "scores": {
1198
+ "Mean win rate": 0.864,
1199
+ "NarrativeQA": 0.761,
1200
+ "NaturalQuestions (closed-book)": 0.482,
1201
+ "OpenbookQA": 0.97,
1202
+ "MMLU": 0.711,
1203
+ "MATH": 0.833,
1204
+ "GSM8K": 0.824,
1205
+ "LegalBench": 0.727,
1206
+ "MedQA": 0.783,
1207
+ "WMT 2014": 0.218
1208
+ }
1209
+ },
1210
+ {
1211
+ "model_id": "openai/gpt-4o-2024-05-13",
1212
+ "name": "GPT-4o 2024-05-13",
1213
+ "developer": "openai",
1214
+ "scores": {
1215
+ "Mean win rate": 0.938,
1216
+ "NarrativeQA": 0.804,
1217
+ "NaturalQuestions (closed-book)": 0.501,
1218
+ "OpenbookQA": 0.966,
1219
+ "MMLU": 0.748,
1220
+ "MATH": 0.829,
1221
+ "GSM8K": 0.905,
1222
+ "LegalBench": 0.733,
1223
+ "MedQA": 0.857,
1224
+ "WMT 2014": 0.231
1225
+ }
1226
+ },
1227
+ {
1228
+ "model_id": "openai/gpt-4o-2024-08-06",
1229
+ "name": "GPT-4o 2024-08-06",
1230
+ "developer": "openai",
1231
+ "scores": {
1232
+ "Mean win rate": 0.928,
1233
+ "NarrativeQA": 0.795,
1234
+ "NaturalQuestions (closed-book)": 0.496,
1235
+ "OpenbookQA": 0.968,
1236
+ "MMLU": 0.738,
1237
+ "MATH": 0.853,
1238
+ "GSM8K": 0.909,
1239
+ "LegalBench": 0.721,
1240
+ "MedQA": 0.863,
1241
+ "WMT 2014": 0.225
1242
+ }
1243
+ },
1244
+ {
1245
+ "model_id": "openai/gpt-4o-mini-2024-07-18",
1246
+ "name": "GPT-4o mini 2024-07-18",
1247
+ "developer": "openai",
1248
+ "scores": {
1249
+ "Mean win rate": 0.701,
1250
+ "NarrativeQA": 0.768,
1251
+ "NaturalQuestions (closed-book)": 0.386,
1252
+ "OpenbookQA": 0.92,
1253
+ "MMLU": 0.668,
1254
+ "MATH": 0.802,
1255
+ "GSM8K": 0.843,
1256
+ "LegalBench": 0.653,
1257
+ "MedQA": 0.748,
1258
+ "WMT 2014": 0.206
1259
+ }
1260
+ },
1261
+ {
1262
+ "model_id": "openai/text-davinci-002",
1263
+ "name": "text-davinci-002",
1264
+ "developer": "openai",
1265
+ "scores": {
1266
+ "Mean win rate": 0.336,
1267
+ "NarrativeQA": 0.719,
1268
+ "NaturalQuestions (closed-book)": 0.394,
1269
+ "OpenbookQA": 0.796,
1270
+ "MMLU": 0.568,
1271
+ "MATH": 0.428,
1272
+ "GSM8K": 0.479,
1273
+ "LegalBench": 0.58,
1274
+ "MedQA": 0.525,
1275
+ "WMT 2014": 0.174
1276
+ }
1277
+ },
1278
+ {
1279
+ "model_id": "openai/text-davinci-003",
1280
+ "name": "text-davinci-003",
1281
+ "developer": "openai",
1282
+ "scores": {
1283
+ "Mean win rate": 0.439,
1284
+ "NarrativeQA": 0.731,
1285
+ "NaturalQuestions (closed-book)": 0.413,
1286
+ "OpenbookQA": 0.828,
1287
+ "MMLU": 0.555,
1288
+ "MATH": 0.449,
1289
+ "GSM8K": 0.615,
1290
+ "LegalBench": 0.622,
1291
+ "MedQA": 0.531,
1292
+ "WMT 2014": 0.191
1293
+ }
1294
+ },
1295
+ {
1296
+ "model_id": "qwen/qwen1.5-110b-chat",
1297
+ "name": "Qwen1.5 Chat 110B",
1298
+ "developer": "qwen",
1299
+ "scores": {
1300
+ "Mean win rate": 0.55,
1301
+ "NarrativeQA": 0.721,
1302
+ "NaturalQuestions (closed-book)": 0.35,
1303
+ "OpenbookQA": 0.922,
1304
+ "MMLU": 0.704,
1305
+ "MATH": 0.568,
1306
+ "GSM8K": 0.815,
1307
+ "LegalBench": 0.624,
1308
+ "MedQA": 0.64,
1309
+ "WMT 2014": 0.192
1310
+ }
1311
+ },
1312
+ {
1313
+ "model_id": "qwen/qwen1.5-14b",
1314
+ "name": "Qwen1.5 14B",
1315
+ "developer": "qwen",
1316
+ "scores": {
1317
+ "Mean win rate": 0.425,
1318
+ "NarrativeQA": 0.711,
1319
+ "NaturalQuestions (closed-book)": 0.3,
1320
+ "OpenbookQA": 0.862,
1321
+ "MMLU": 0.626,
1322
+ "MATH": 0.686,
1323
+ "GSM8K": 0.693,
1324
+ "LegalBench": 0.593,
1325
+ "MedQA": 0.515,
1326
+ "WMT 2014": 0.178
1327
+ }
1328
+ },
1329
+ {
1330
+ "model_id": "qwen/qwen1.5-32b",
1331
+ "name": "Qwen1.5 32B",
1332
+ "developer": "qwen",
1333
+ "scores": {
1334
+ "Mean win rate": 0.546,
1335
+ "NarrativeQA": 0.589,
1336
+ "NaturalQuestions (closed-book)": 0.353,
1337
+ "OpenbookQA": 0.932,
1338
+ "MMLU": 0.628,
1339
+ "MATH": 0.733,
1340
+ "GSM8K": 0.773,
1341
+ "LegalBench": 0.636,
1342
+ "MedQA": 0.656,
1343
+ "WMT 2014": 0.193
1344
+ }
1345
+ },
1346
+ {
1347
+ "model_id": "qwen/qwen1.5-72b",
1348
+ "name": "Qwen1.5 72B",
1349
+ "developer": "qwen",
1350
+ "scores": {
1351
+ "Mean win rate": 0.608,
1352
+ "NarrativeQA": 0.601,
1353
+ "NaturalQuestions (closed-book)": 0.417,
1354
+ "OpenbookQA": 0.93,
1355
+ "MMLU": 0.647,
1356
+ "MATH": 0.683,
1357
+ "GSM8K": 0.799,
1358
+ "LegalBench": 0.694,
1359
+ "MedQA": 0.67,
1360
+ "WMT 2014": 0.201
1361
+ }
1362
+ },
1363
+ {
1364
+ "model_id": "qwen/qwen1.5-7b",
1365
+ "name": "Qwen1.5 7B",
1366
+ "developer": "qwen",
1367
+ "scores": {
1368
+ "Mean win rate": 0.275,
1369
+ "NarrativeQA": 0.448,
1370
+ "NaturalQuestions (closed-book)": 0.27,
1371
+ "OpenbookQA": 0.806,
1372
+ "MMLU": 0.569,
1373
+ "MATH": 0.561,
1374
+ "GSM8K": 0.6,
1375
+ "LegalBench": 0.523,
1376
+ "MedQA": 0.479,
1377
+ "WMT 2014": 0.153
1378
+ }
1379
+ },
1380
+ {
1381
+ "model_id": "qwen/qwen2-72b-instruct",
1382
+ "name": "Qwen2 Instruct 72B",
1383
+ "developer": "qwen",
1384
+ "scores": {
1385
+ "Mean win rate": 0.77,
1386
+ "NarrativeQA": 0.727,
1387
+ "NaturalQuestions (closed-book)": 0.39,
1388
+ "OpenbookQA": 0.954,
1389
+ "MMLU": 0.769,
1390
+ "MATH": 0.79,
1391
+ "GSM8K": 0.92,
1392
+ "LegalBench": 0.712,
1393
+ "MedQA": 0.746,
1394
+ "WMT 2014": 0.207
1395
+ }
1396
+ },
1397
+ {
1398
+ "model_id": "qwen/qwen2.5-72b-instruct-turbo",
1399
+ "name": "Qwen2.5 Instruct Turbo 72B",
1400
+ "developer": "qwen",
1401
+ "scores": {
1402
+ "Mean win rate": 0.745,
1403
+ "NarrativeQA": 0.745,
1404
+ "NaturalQuestions (closed-book)": 0.359,
1405
+ "OpenbookQA": 0.962,
1406
+ "MMLU": 0.77,
1407
+ "MATH": 0.884,
1408
+ "GSM8K": 0.9,
1409
+ "LegalBench": 0.74,
1410
+ "MedQA": 0.753,
1411
+ "WMT 2014": 0.207
1412
+ }
1413
+ },
1414
+ {
1415
+ "model_id": "qwen/qwen2.5-7b-instruct-turbo",
1416
+ "name": "Qwen2.5 Instruct Turbo 7B",
1417
+ "developer": "qwen",
1418
+ "scores": {
1419
+ "Mean win rate": 0.488,
1420
+ "NarrativeQA": 0.742,
1421
+ "NaturalQuestions (closed-book)": 0.205,
1422
+ "OpenbookQA": 0.862,
1423
+ "MMLU": 0.658,
1424
+ "MATH": 0.835,
1425
+ "GSM8K": 0.83,
1426
+ "LegalBench": 0.632,
1427
+ "MedQA": 0.6,
1428
+ "WMT 2014": 0.155
1429
+ }
1430
+ },
1431
+ {
1432
+ "model_id": "snowflake/snowflake-arctic-instruct",
1433
+ "name": "Arctic Instruct",
1434
+ "developer": "snowflake",
1435
+ "scores": {
1436
+ "Mean win rate": 0.338,
1437
+ "NarrativeQA": 0.654,
1438
+ "NaturalQuestions (closed-book)": 0.39,
1439
+ "OpenbookQA": 0.828,
1440
+ "MMLU": 0.575,
1441
+ "MATH": 0.519,
1442
+ "GSM8K": 0.768,
1443
+ "LegalBench": 0.588,
1444
+ "MedQA": 0.581,
1445
+ "WMT 2014": 0.172
1446
+ }
1447
+ },
1448
+ {
1449
+ "model_id": "tiiuae/falcon-40b",
1450
+ "name": "Falcon 40B",
1451
+ "developer": "tiiuae",
1452
+ "scores": {
1453
+ "Mean win rate": 0.217,
1454
+ "NarrativeQA": 0.671,
1455
+ "NaturalQuestions (closed-book)": 0.392,
1456
+ "OpenbookQA": 0.662,
1457
+ "MMLU": 0.507,
1458
+ "MATH": 0.128,
1459
+ "GSM8K": 0.267,
1460
+ "LegalBench": 0.442,
1461
+ "MedQA": 0.419,
1462
+ "WMT 2014": 0.162
1463
+ }
1464
+ },
1465
+ {
1466
+ "model_id": "tiiuae/falcon-7b",
1467
+ "name": "Falcon 7B",
1468
+ "developer": "tiiuae",
1469
+ "scores": {
1470
+ "Mean win rate": 0.064,
1471
+ "NarrativeQA": 0.621,
1472
+ "NaturalQuestions (closed-book)": 0.285,
1473
+ "OpenbookQA": 0.26,
1474
+ "MMLU": 0.288,
1475
+ "MATH": 0.044,
1476
+ "GSM8K": 0.055,
1477
+ "LegalBench": 0.346,
1478
+ "MedQA": 0.254,
1479
+ "WMT 2014": 0.094
1480
+ }
1481
+ },
1482
+ {
1483
+ "model_id": "upstage/solar-pro-241126",
1484
+ "name": "Solar Pro",
1485
+ "developer": "upstage",
1486
+ "scores": {
1487
+ "Mean win rate": 0.602,
1488
+ "NarrativeQA": 0.753,
1489
+ "NaturalQuestions (closed-book)": 0.297,
1490
+ "OpenbookQA": 0.922,
1491
+ "MMLU": 0.679,
1492
+ "MATH": 0.567,
1493
+ "GSM8K": 0.871,
1494
+ "LegalBench": 0.67,
1495
+ "MedQA": 0.698,
1496
+ "WMT 2014": 0.169
1497
+ }
1498
+ },
1499
+ {
1500
+ "model_id": "writer/palmyra-x-004",
1501
+ "name": "Palmyra-X-004",
1502
+ "developer": "writer",
1503
+ "scores": {
1504
+ "Mean win rate": 0.808,
1505
+ "NarrativeQA": 0.773,
1506
+ "NaturalQuestions (closed-book)": 0.457,
1507
+ "OpenbookQA": 0.926,
1508
+ "MMLU": 0.739,
1509
+ "MATH": 0.767,
1510
+ "GSM8K": 0.905,
1511
+ "LegalBench": 0.73,
1512
+ "MedQA": 0.775,
1513
+ "WMT 2014": 0.203
1514
+ }
1515
+ },
1516
+ {
1517
+ "model_id": "writer/palmyra-x-v2",
1518
+ "name": "Palmyra X V2 33B",
1519
+ "developer": "writer",
1520
+ "scores": {
1521
+ "Mean win rate": 0.589,
1522
+ "NarrativeQA": 0.752,
1523
+ "NaturalQuestions (closed-book)": 0.428,
1524
+ "OpenbookQA": 0.878,
1525
+ "MMLU": 0.621,
1526
+ "MATH": 0.58,
1527
+ "GSM8K": 0.735,
1528
+ "LegalBench": 0.644,
1529
+ "MedQA": 0.598,
1530
+ "WMT 2014": 0.239
1531
+ }
1532
+ },
1533
+ {
1534
+ "model_id": "writer/palmyra-x-v3",
1535
+ "name": "Palmyra X V3 72B",
1536
+ "developer": "writer",
1537
+ "scores": {
1538
+ "Mean win rate": 0.679,
1539
+ "NarrativeQA": 0.706,
1540
+ "NaturalQuestions (closed-book)": 0.407,
1541
+ "OpenbookQA": 0.938,
1542
+ "MMLU": 0.702,
1543
+ "MATH": 0.723,
1544
+ "GSM8K": 0.831,
1545
+ "LegalBench": 0.709,
1546
+ "MedQA": 0.684,
1547
+ "WMT 2014": 0.262
1548
+ }
1549
+ }
1550
+ ]
1551
+ }
data/benchmarks/helm_mmlu.json ADDED
The diff for this file is too large to render. See raw diff
 
data/benchmarks/hfopenllm_v2.json ADDED
The diff for this file is too large to render. See raw diff
 
data/benchmarks/livecodebenchpro.json ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "alibaba/qwen3-235b-a22b-thinking-2507",
5
+ "name": "qwen3-235b-a22b-thinking-2507",
6
+ "developer": "Alibaba",
7
+ "scores": {
8
+ "Hard Problems": 0.0,
9
+ "Medium Problems": 0.1267605633802817,
10
+ "Easy Problems": 0.7605633802816901
11
+ }
12
+ },
13
+ {
14
+ "model_id": "alibaba/qwen3-30b-a3b",
15
+ "name": "qwen3-30b-a3b",
16
+ "developer": "Alibaba",
17
+ "scores": {
18
+ "Hard Problems": 0.0,
19
+ "Medium Problems": 0.028169014084507043,
20
+ "Easy Problems": 0.5774647887323944
21
+ }
22
+ },
23
+ {
24
+ "model_id": "alibaba/qwen3-max",
25
+ "name": "alibaba/qwen3-max",
26
+ "developer": "Alibaba",
27
+ "scores": {
28
+ "Hard Problems": 0.0,
29
+ "Medium Problems": 0.04225352112676056,
30
+ "Easy Problems": 0.36619718309859156
31
+ }
32
+ },
33
+ {
34
+ "model_id": "alibaba/qwen3-next-80b-a3b-thinking",
35
+ "name": "qwen3-next-80b-a3b-thinking",
36
+ "developer": "Alibaba",
37
+ "scores": {
38
+ "Hard Problems": 0.0,
39
+ "Medium Problems": 0.14084507042253522,
40
+ "Easy Problems": 0.7464788732394366
41
+ }
42
+ },
43
+ {
44
+ "model_id": "aliyun/qwen3-next-80b-a3b-thinking",
45
+ "name": "qwen3-next-80b-a3b-thinking",
46
+ "developer": "aliyun",
47
+ "scores": {
48
+ "Hard Problems": 0.0,
49
+ "Medium Problems": 0.0704,
50
+ "Easy Problems": 0.6901
51
+ }
52
+ },
53
+ {
54
+ "model_id": "anthropic/claude-3-7-sonnet-20250219",
55
+ "name": "claude-3-7-sonnet-20250219",
56
+ "developer": "anthropic",
57
+ "scores": {
58
+ "Hard Problems": 0.0,
59
+ "Medium Problems": 0.0,
60
+ "Easy Problems": 0.28169014084507044
61
+ }
62
+ },
63
+ {
64
+ "model_id": "anthropic/claude-3.7-sonnet",
65
+ "name": "anthropic/claude-3.7-sonnet",
66
+ "developer": "Anthropic",
67
+ "scores": {
68
+ "Hard Problems": 0.0,
69
+ "Medium Problems": 0.014084507042253521,
70
+ "Easy Problems": 0.15492957746478872
71
+ }
72
+ },
73
+ {
74
+ "model_id": "anthropic/claude-sonnet-4-5-20250929",
75
+ "name": "claude-sonnet-4-5-20250929",
76
+ "developer": "anthropic",
77
+ "scores": {
78
+ "Hard Problems": 0.0,
79
+ "Medium Problems": 0.0,
80
+ "Easy Problems": 0.5352
81
+ }
82
+ },
83
+ {
84
+ "model_id": "ark/ep-20250603132404-cgpjm",
85
+ "name": "ep-20250603132404-cgpjm",
86
+ "developer": "ark",
87
+ "scores": {
88
+ "Hard Problems": 0.0,
89
+ "Medium Problems": 0.0141,
90
+ "Easy Problems": 0.507
91
+ }
92
+ },
93
+ {
94
+ "model_id": "bytedance/doubao-seed-1-6-thinking-250615",
95
+ "name": "doubao-seed-1-6-thinking-250615",
96
+ "developer": "ByteDance",
97
+ "scores": {
98
+ "Hard Problems": 0.0,
99
+ "Medium Problems": 0.07042253521126761,
100
+ "Easy Problems": 0.5774647887323944
101
+ }
102
+ },
103
+ {
104
+ "model_id": "deepseek/chat-v3-0324",
105
+ "name": "deepseek/chat-v3-0324",
106
+ "developer": "DeepSeek",
107
+ "scores": {
108
+ "Hard Problems": 0.0,
109
+ "Medium Problems": 0.0,
110
+ "Easy Problems": 0.19718309859154928
111
+ }
112
+ },
113
+ {
114
+ "model_id": "deepseek/ep-20250214004308-p7n89",
115
+ "name": "ep-20250214004308-p7n89",
116
+ "developer": "DeepSeek",
117
+ "scores": {
118
+ "Hard Problems": 0.0,
119
+ "Medium Problems": 0.014084507042253521,
120
+ "Easy Problems": 0.4225352112676056
121
+ }
122
+ },
123
+ {
124
+ "model_id": "deepseek/ep-20250228232227-z44x5",
125
+ "name": "ep-20250228232227-z44x5",
126
+ "developer": "DeepSeek",
127
+ "scores": {
128
+ "Hard Problems": 0.0,
129
+ "Medium Problems": 0.0,
130
+ "Easy Problems": 0.1267605633802817
131
+ }
132
+ },
133
+ {
134
+ "model_id": "deepseek/ep-20250603132404-cgpjm",
135
+ "name": "ep-20250603132404-cgpjm",
136
+ "developer": "DeepSeek",
137
+ "scores": {
138
+ "Hard Problems": 0.0,
139
+ "Medium Problems": 0.08450704225352113,
140
+ "Easy Problems": 0.5774647887323944
141
+ }
142
+ },
143
+ {
144
+ "model_id": "google/gemini-2.5-flash",
145
+ "name": "gemini-2.5-flash",
146
+ "developer": "google",
147
+ "scores": {
148
+ "Hard Problems": 0.0,
149
+ "Medium Problems": 0.028169014084507043,
150
+ "Easy Problems": 0.38028169014084506
151
+ }
152
+ },
153
+ {
154
+ "model_id": "google/gemini-2.5-pro",
155
+ "name": "gemini-2.5-pro",
156
+ "developer": "google",
157
+ "scores": {
158
+ "Hard Problems": 0.014084507042253521,
159
+ "Medium Problems": 0.2112676056338028,
160
+ "Easy Problems": 0.7183098591549296
161
+ }
162
+ },
163
+ {
164
+ "model_id": "kuaishou/kwaipilot-40b-0604",
165
+ "name": "kwaipilot-40b-0604",
166
+ "developer": "Kuaishou",
167
+ "scores": {
168
+ "Hard Problems": 0.0,
169
+ "Medium Problems": 0.07042253521126761,
170
+ "Easy Problems": 0.056338028169014086
171
+ }
172
+ },
173
+ {
174
+ "model_id": "meta/llama-4-maverick",
175
+ "name": "meta/llama-4-maverick",
176
+ "developer": "Meta",
177
+ "scores": {
178
+ "Hard Problems": 0.0,
179
+ "Medium Problems": 0.0,
180
+ "Easy Problems": 0.09859154929577464
181
+ }
182
+ },
183
+ {
184
+ "model_id": "openai/gpt-4.1",
185
+ "name": "openai/gpt-4.1",
186
+ "developer": "OpenAI",
187
+ "scores": {
188
+ "Hard Problems": 0.0,
189
+ "Medium Problems": 0.0,
190
+ "Easy Problems": 0.19718309859154928
191
+ }
192
+ },
193
+ {
194
+ "model_id": "openai/gpt-4o-2024-11-20",
195
+ "name": "GPT-4o 2024-11-20",
196
+ "developer": "openai",
197
+ "scores": {
198
+ "Hard Problems": 0.0,
199
+ "Medium Problems": 0.0,
200
+ "Easy Problems": 0.07042253521126761
201
+ }
202
+ },
203
+ {
204
+ "model_id": "openai/gpt-5-2025-08-07",
205
+ "name": "gpt-5-2025-08-07",
206
+ "developer": "openai",
207
+ "scores": {
208
+ "Hard Problems": 0.0423,
209
+ "Medium Problems": 0.4085,
210
+ "Easy Problems": 0.9014
211
+ }
212
+ },
213
+ {
214
+ "model_id": "openai/gpt-5.2-2025-12-11",
215
+ "name": "gpt-5.2-2025-12-11",
216
+ "developer": "OpenAI",
217
+ "scores": {
218
+ "Hard Problems": 0.1594,
219
+ "Medium Problems": 0.5211,
220
+ "Easy Problems": 0.9014
221
+ }
222
+ },
223
+ {
224
+ "model_id": "openai/gpt-oss-120b",
225
+ "name": "gpt-oss-120b",
226
+ "developer": "openai",
227
+ "scores": {
228
+ "Hard Problems": 0.0,
229
+ "Medium Problems": 0.11267605633802817,
230
+ "Easy Problems": 0.6619718309859155
231
+ }
232
+ },
233
+ {
234
+ "model_id": "openai/gpt-oss-20b",
235
+ "name": "gpt-oss-20b",
236
+ "developer": "openai",
237
+ "scores": {
238
+ "Hard Problems": 0.0,
239
+ "Medium Problems": 0.056338028169014086,
240
+ "Easy Problems": 0.5070422535211268
241
+ }
242
+ },
243
+ {
244
+ "model_id": "openai/o3-2025-04-16",
245
+ "name": "o3 2025-04-16",
246
+ "developer": "openai",
247
+ "scores": {
248
+ "Hard Problems": 0.0,
249
+ "Medium Problems": 0.22535211267605634,
250
+ "Easy Problems": 0.7183098591549296
251
+ }
252
+ },
253
+ {
254
+ "model_id": "openai/o4-mini-2025-04-16",
255
+ "name": "o4-mini-2025-04-16",
256
+ "developer": "openai",
257
+ "scores": {
258
+ "Hard Problems": 0.014084507042253521,
259
+ "Medium Problems": 0.30985915492957744,
260
+ "Easy Problems": 0.8873239436619719
261
+ }
262
+ },
263
+ {
264
+ "model_id": "z-ai/glm-4.5",
265
+ "name": "z-ai/glm-4.5",
266
+ "developer": "Z.AI",
267
+ "scores": {
268
+ "Hard Problems": 0.0,
269
+ "Medium Problems": 0.028169014084507043,
270
+ "Easy Problems": 0.1267605633802817
271
+ }
272
+ }
273
+ ]
274
+ }
data/benchmarks/reward-bench.json ADDED
The diff for this file is too large to render. See raw diff
 
data/benchmarks/swe-bench.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "swe-bench": 0.6061
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "swe-bench": 0.7576
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "swe-bench": 0.57
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/tau-bench-2_airline.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "tau-bench-2/airline": 0.66
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "tau-bench-2/airline": 0.7
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "tau-bench-2/airline": 0.54
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/tau-bench-2_retail.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "tau-bench-2/retail": 0.83
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "tau-bench-2/retail": 0.7576
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "tau-bench-2/retail": 0.68
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/tau-bench-2_telecom.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "anthropic/claude-opus-4-5",
5
+ "name": "claude-opus-4-5",
6
+ "developer": "Anthropic",
7
+ "scores": {
8
+ "tau-bench-2/telecom": 0.76
9
+ }
10
+ },
11
+ {
12
+ "model_id": "google/gemini-3-pro-preview",
13
+ "name": "gemini-3-pro-preview",
14
+ "developer": "Google",
15
+ "scores": {
16
+ "tau-bench-2/telecom": 0.73
17
+ }
18
+ },
19
+ {
20
+ "model_id": "openai/gpt-5.2-2025-12-11",
21
+ "name": "gpt-5.2-2025-12-11",
22
+ "developer": "OpenAI",
23
+ "scores": {
24
+ "tau-bench-2/telecom": 0.5354
25
+ }
26
+ }
27
+ ]
28
+ }
data/benchmarks/terminal-bench-2.0.json ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "alibaba/qwen-3-coder-480b",
5
+ "name": "Qwen 3 Coder 480B",
6
+ "developer": "Alibaba",
7
+ "scores": {
8
+ "terminal-bench-2.0": 25.4
9
+ }
10
+ },
11
+ {
12
+ "model_id": "anthropic/claude-haiku-4.5",
13
+ "name": "Claude Haiku 4.5",
14
+ "developer": "Anthropic",
15
+ "scores": {
16
+ "terminal-bench-2.0": 29.8
17
+ }
18
+ },
19
+ {
20
+ "model_id": "anthropic/claude-opus-4.1",
21
+ "name": "Claude Opus 4.1",
22
+ "developer": "Anthropic",
23
+ "scores": {
24
+ "terminal-bench-2.0": 35.1
25
+ }
26
+ },
27
+ {
28
+ "model_id": "anthropic/claude-opus-4.5",
29
+ "name": "Claude Opus 4.5",
30
+ "developer": "Anthropic",
31
+ "scores": {
32
+ "terminal-bench-2.0": 59.1
33
+ }
34
+ },
35
+ {
36
+ "model_id": "anthropic/claude-opus-4.6",
37
+ "name": "Claude Opus 4.6",
38
+ "developer": "Anthropic",
39
+ "scores": {
40
+ "terminal-bench-2.0": 58.0
41
+ }
42
+ },
43
+ {
44
+ "model_id": "anthropic/claude-sonnet-4.5",
45
+ "name": "Claude Sonnet 4.5",
46
+ "developer": "Anthropic",
47
+ "scores": {
48
+ "terminal-bench-2.0": 46.5
49
+ }
50
+ },
51
+ {
52
+ "model_id": "deepseek/deepseek-v3.2",
53
+ "name": "DeepSeek-V3.2",
54
+ "developer": "DeepSeek",
55
+ "scores": {
56
+ "terminal-bench-2.0": 39.6
57
+ }
58
+ },
59
+ {
60
+ "model_id": "google/gemini-2.5-flash",
61
+ "name": "gemini-2.5-flash",
62
+ "developer": "google",
63
+ "scores": {
64
+ "terminal-bench-2.0": 17.1
65
+ }
66
+ },
67
+ {
68
+ "model_id": "google/gemini-2.5-pro",
69
+ "name": "gemini-2.5-pro",
70
+ "developer": "google",
71
+ "scores": {
72
+ "terminal-bench-2.0": 26.1
73
+ }
74
+ },
75
+ {
76
+ "model_id": "google/gemini-3-flash",
77
+ "name": "Gemini 3 Flash",
78
+ "developer": "Google",
79
+ "scores": {
80
+ "terminal-bench-2.0": 64.3
81
+ }
82
+ },
83
+ {
84
+ "model_id": "google/gemini-3-pro",
85
+ "name": "Gemini 3 Pro",
86
+ "developer": "Google",
87
+ "scores": {
88
+ "terminal-bench-2.0": 65.2
89
+ }
90
+ },
91
+ {
92
+ "model_id": "google/gemini-3.1-pro",
93
+ "name": "Gemini 3.1 Pro",
94
+ "developer": "Google",
95
+ "scores": {
96
+ "terminal-bench-2.0": 74.8
97
+ }
98
+ },
99
+ {
100
+ "model_id": "minimax/minimax-m2",
101
+ "name": "MiniMax M2",
102
+ "developer": "MiniMax",
103
+ "scores": {
104
+ "terminal-bench-2.0": 30.0
105
+ }
106
+ },
107
+ {
108
+ "model_id": "minimax/minimax-m2.1",
109
+ "name": "MiniMax M2.1",
110
+ "developer": "MiniMax",
111
+ "scores": {
112
+ "terminal-bench-2.0": 36.6
113
+ }
114
+ },
115
+ {
116
+ "model_id": "minimax/minimax-m2.5",
117
+ "name": "Minimax m2.5",
118
+ "developer": "Minimax",
119
+ "scores": {
120
+ "terminal-bench-2.0": 42.2
121
+ }
122
+ },
123
+ {
124
+ "model_id": "moonshot-ai/kimi-k2-instruct",
125
+ "name": "Kimi K2 Instruct",
126
+ "developer": "Moonshot AI",
127
+ "scores": {
128
+ "terminal-bench-2.0": 27.8
129
+ }
130
+ },
131
+ {
132
+ "model_id": "moonshot-ai/kimi-k2-thinking",
133
+ "name": "Kimi K2 Thinking",
134
+ "developer": "Moonshot AI",
135
+ "scores": {
136
+ "terminal-bench-2.0": 35.7
137
+ }
138
+ },
139
+ {
140
+ "model_id": "moonshot-ai/kimi-k2.5",
141
+ "name": "Kimi K2.5",
142
+ "developer": "Kimi",
143
+ "scores": {
144
+ "terminal-bench-2.0": 43.2
145
+ }
146
+ },
147
+ {
148
+ "model_id": "multiple/multiple",
149
+ "name": "Multiple",
150
+ "developer": "Multiple",
151
+ "scores": {
152
+ "terminal-bench-2.0": 59.1
153
+ }
154
+ },
155
+ {
156
+ "model_id": "openai/gpt-5",
157
+ "name": "GPT-5",
158
+ "developer": "OpenAI",
159
+ "scores": {
160
+ "terminal-bench-2.0": 33.9
161
+ }
162
+ },
163
+ {
164
+ "model_id": "openai/gpt-5-codex",
165
+ "name": "GPT-5-Codex",
166
+ "developer": "OpenAI",
167
+ "scores": {
168
+ "terminal-bench-2.0": 43.4
169
+ }
170
+ },
171
+ {
172
+ "model_id": "openai/gpt-5-mini",
173
+ "name": "GPT-5-Mini",
174
+ "developer": "OpenAI",
175
+ "scores": {
176
+ "terminal-bench-2.0": 29.2
177
+ }
178
+ },
179
+ {
180
+ "model_id": "openai/gpt-5-nano",
181
+ "name": "GPT-5-Nano",
182
+ "developer": "OpenAI",
183
+ "scores": {
184
+ "terminal-bench-2.0": 9.9
185
+ }
186
+ },
187
+ {
188
+ "model_id": "openai/gpt-5.1",
189
+ "name": "GPT-5.1",
190
+ "developer": "OpenAI",
191
+ "scores": {
192
+ "terminal-bench-2.0": 47.6
193
+ }
194
+ },
195
+ {
196
+ "model_id": "openai/gpt-5.1-codex",
197
+ "name": "GPT-5.1-Codex",
198
+ "developer": "OpenAI",
199
+ "scores": {
200
+ "terminal-bench-2.0": 57.8
201
+ }
202
+ },
203
+ {
204
+ "model_id": "openai/gpt-5.1-codex-max",
205
+ "name": "GPT-5.1-Codex-Max",
206
+ "developer": "OpenAI",
207
+ "scores": {
208
+ "terminal-bench-2.0": 60.4
209
+ }
210
+ },
211
+ {
212
+ "model_id": "openai/gpt-5.1-codex-mini",
213
+ "name": "GPT-5.1-Codex-Mini",
214
+ "developer": "OpenAI",
215
+ "scores": {
216
+ "terminal-bench-2.0": 43.1
217
+ }
218
+ },
219
+ {
220
+ "model_id": "openai/gpt-5.2",
221
+ "name": "GPT-5.2",
222
+ "developer": "OpenAI",
223
+ "scores": {
224
+ "terminal-bench-2.0": 54.0
225
+ }
226
+ },
227
+ {
228
+ "model_id": "openai/gpt-5.2-codex",
229
+ "name": "GPT-5.2-Codex",
230
+ "developer": "OpenAI",
231
+ "scores": {
232
+ "terminal-bench-2.0": 66.5
233
+ }
234
+ },
235
+ {
236
+ "model_id": "openai/gpt-5.3-codex",
237
+ "name": "GPT-5.3-Codex",
238
+ "developer": "OpenAI",
239
+ "scores": {
240
+ "terminal-bench-2.0": 70.3
241
+ }
242
+ },
243
+ {
244
+ "model_id": "openai/gpt-oss-120b",
245
+ "name": "gpt-oss-120b",
246
+ "developer": "openai",
247
+ "scores": {
248
+ "terminal-bench-2.0": 14.2
249
+ }
250
+ },
251
+ {
252
+ "model_id": "openai/gpt-oss-20b",
253
+ "name": "gpt-oss-20b",
254
+ "developer": "openai",
255
+ "scores": {
256
+ "terminal-bench-2.0": 3.1
257
+ }
258
+ },
259
+ {
260
+ "model_id": "xai/grok-4",
261
+ "name": "Grok 4",
262
+ "developer": "xAI",
263
+ "scores": {
264
+ "terminal-bench-2.0": 25.4
265
+ }
266
+ },
267
+ {
268
+ "model_id": "xai/grok-code-fast-1",
269
+ "name": "Grok Code Fast 1",
270
+ "developer": "xAI",
271
+ "scores": {
272
+ "terminal-bench-2.0": 25.8
273
+ }
274
+ },
275
+ {
276
+ "model_id": "zhipu-ai/glm-4.6",
277
+ "name": "GLM 4.6",
278
+ "developer": "Z.ai",
279
+ "scores": {
280
+ "terminal-bench-2.0": 24.5
281
+ }
282
+ },
283
+ {
284
+ "model_id": "zhipu-ai/glm-4.7",
285
+ "name": "GLM 4.7",
286
+ "developer": "Z-AI",
287
+ "scores": {
288
+ "terminal-bench-2.0": 33.4
289
+ }
290
+ },
291
+ {
292
+ "model_id": "zhipu-ai/glm-5",
293
+ "name": "GLM 5",
294
+ "developer": "Z-AI",
295
+ "scores": {
296
+ "terminal-bench-2.0": 52.4
297
+ }
298
+ }
299
+ ]
300
+ }
data/developers/0-hero.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "0-hero",
3
+ "models": [
4
+ {
5
+ "id": "0-hero/Matter-0.1-7B-DPO-preview",
6
+ "name": "0-hero/Matter-0.1-7B-DPO-preview",
7
+ "developer": "0-hero",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "reward-bench/Score": 0.7247,
11
+ "reward-bench/Chat": 0.8939,
12
+ "reward-bench/Chat Hard": 0.5768,
13
+ "reward-bench/Safety": 0.6378,
14
+ "reward-bench/Reasoning": 0.8854,
15
+ "reward-bench/Prior Sets (0.5 weight)": 0.5348
16
+ }
17
+ },
18
+ {
19
+ "id": "0-hero/Matter-0.1-7B-boost-DPO-preview",
20
+ "name": "0-hero/Matter-0.1-7B-boost-DPO-preview",
21
+ "developer": "0-hero",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "reward-bench/Score": 0.7448,
25
+ "reward-bench/Chat": 0.9106,
26
+ "reward-bench/Chat Hard": 0.6096,
27
+ "reward-bench/Safety": 0.7135,
28
+ "reward-bench/Reasoning": 0.8395,
29
+ "reward-bench/Prior Sets (0.5 weight)": 0.5566
30
+ }
31
+ },
32
+ {
33
+ "id": "0-hero/Matter-0.2-7B-DPO",
34
+ "name": "Matter-0.2-7B-DPO",
35
+ "developer": "0-hero",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.3303,
39
+ "hfopenllm_v2/BBH": 0.3596,
40
+ "hfopenllm_v2/MATH Level 5": 0.0144,
41
+ "hfopenllm_v2/GPQA": 0.2592,
42
+ "hfopenllm_v2/MUSR": 0.3814,
43
+ "hfopenllm_v2/MMLU-PRO": 0.1164
44
+ }
45
+ }
46
+ ]
47
+ }
data/developers/01-ai.json ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "01-ai",
3
+ "models": [
4
+ {
5
+ "id": "01-ai/Yi-1.5-34B",
6
+ "name": "Yi-1.5-34B",
7
+ "developer": "01-ai",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2841,
11
+ "hfopenllm_v2/BBH": 0.5976,
12
+ "hfopenllm_v2/MATH Level 5": 0.1533,
13
+ "hfopenllm_v2/GPQA": 0.3658,
14
+ "hfopenllm_v2/MUSR": 0.4236,
15
+ "hfopenllm_v2/MMLU-PRO": 0.4666
16
+ }
17
+ },
18
+ {
19
+ "id": "01-ai/Yi-1.5-34B-32K",
20
+ "name": "Yi-1.5-34B-32K",
21
+ "developer": "01-ai",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3119,
25
+ "hfopenllm_v2/BBH": 0.6016,
26
+ "hfopenllm_v2/MATH Level 5": 0.1541,
27
+ "hfopenllm_v2/GPQA": 0.3633,
28
+ "hfopenllm_v2/MUSR": 0.4398,
29
+ "hfopenllm_v2/MMLU-PRO": 0.4709
30
+ }
31
+ },
32
+ {
33
+ "id": "01-ai/Yi-1.5-34B-Chat",
34
+ "name": "Yi-1.5-34B-Chat",
35
+ "developer": "01-ai",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.6067,
39
+ "hfopenllm_v2/BBH": 0.6084,
40
+ "hfopenllm_v2/MATH Level 5": 0.2772,
41
+ "hfopenllm_v2/GPQA": 0.3649,
42
+ "hfopenllm_v2/MUSR": 0.4282,
43
+ "hfopenllm_v2/MMLU-PRO": 0.452
44
+ }
45
+ },
46
+ {
47
+ "id": "01-ai/Yi-1.5-34B-Chat-16K",
48
+ "name": "Yi-1.5-34B-Chat-16K",
49
+ "developer": "01-ai",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.4564,
53
+ "hfopenllm_v2/BBH": 0.61,
54
+ "hfopenllm_v2/MATH Level 5": 0.2137,
55
+ "hfopenllm_v2/GPQA": 0.3381,
56
+ "hfopenllm_v2/MUSR": 0.4398,
57
+ "hfopenllm_v2/MMLU-PRO": 0.4545
58
+ }
59
+ },
60
+ {
61
+ "id": "01-ai/Yi-1.5-6B",
62
+ "name": "Yi-1.5-6B",
63
+ "developer": "01-ai",
64
+ "evaluator_relationship": null,
65
+ "benchmark_scores": {
66
+ "hfopenllm_v2/IFEval": 0.2617,
67
+ "hfopenllm_v2/BBH": 0.4493,
68
+ "hfopenllm_v2/MATH Level 5": 0.0665,
69
+ "hfopenllm_v2/GPQA": 0.3138,
70
+ "hfopenllm_v2/MUSR": 0.4374,
71
+ "hfopenllm_v2/MMLU-PRO": 0.3144
72
+ }
73
+ },
74
+ {
75
+ "id": "01-ai/Yi-1.5-6B-Chat",
76
+ "name": "Yi-1.5-6B-Chat",
77
+ "developer": "01-ai",
78
+ "evaluator_relationship": null,
79
+ "benchmark_scores": {
80
+ "hfopenllm_v2/IFEval": 0.5145,
81
+ "hfopenllm_v2/BBH": 0.4571,
82
+ "hfopenllm_v2/MATH Level 5": 0.1624,
83
+ "hfopenllm_v2/GPQA": 0.302,
84
+ "hfopenllm_v2/MUSR": 0.4392,
85
+ "hfopenllm_v2/MMLU-PRO": 0.3193
86
+ }
87
+ },
88
+ {
89
+ "id": "01-ai/Yi-1.5-9B",
90
+ "name": "Yi-1.5-9B",
91
+ "developer": "01-ai",
92
+ "evaluator_relationship": null,
93
+ "benchmark_scores": {
94
+ "hfopenllm_v2/IFEval": 0.2936,
95
+ "hfopenllm_v2/BBH": 0.5143,
96
+ "hfopenllm_v2/MATH Level 5": 0.114,
97
+ "hfopenllm_v2/GPQA": 0.3792,
98
+ "hfopenllm_v2/MUSR": 0.4328,
99
+ "hfopenllm_v2/MMLU-PRO": 0.3916
100
+ }
101
+ },
102
+ {
103
+ "id": "01-ai/Yi-1.5-9B-32K",
104
+ "name": "Yi-1.5-9B-32K",
105
+ "developer": "01-ai",
106
+ "evaluator_relationship": null,
107
+ "benchmark_scores": {
108
+ "hfopenllm_v2/IFEval": 0.2303,
109
+ "hfopenllm_v2/BBH": 0.4963,
110
+ "hfopenllm_v2/MATH Level 5": 0.108,
111
+ "hfopenllm_v2/GPQA": 0.3591,
112
+ "hfopenllm_v2/MUSR": 0.4186,
113
+ "hfopenllm_v2/MMLU-PRO": 0.3765
114
+ }
115
+ },
116
+ {
117
+ "id": "01-ai/Yi-1.5-9B-Chat",
118
+ "name": "Yi-1.5-9B-Chat",
119
+ "developer": "01-ai",
120
+ "evaluator_relationship": null,
121
+ "benchmark_scores": {
122
+ "hfopenllm_v2/IFEval": 0.6046,
123
+ "hfopenllm_v2/BBH": 0.5559,
124
+ "hfopenllm_v2/MATH Level 5": 0.2258,
125
+ "hfopenllm_v2/GPQA": 0.3347,
126
+ "hfopenllm_v2/MUSR": 0.4259,
127
+ "hfopenllm_v2/MMLU-PRO": 0.3975
128
+ }
129
+ },
130
+ {
131
+ "id": "01-ai/Yi-1.5-9B-Chat-16K",
132
+ "name": "Yi-1.5-9B-Chat-16K",
133
+ "developer": "01-ai",
134
+ "evaluator_relationship": null,
135
+ "benchmark_scores": {
136
+ "hfopenllm_v2/IFEval": 0.4214,
137
+ "hfopenllm_v2/BBH": 0.5153,
138
+ "hfopenllm_v2/MATH Level 5": 0.1782,
139
+ "hfopenllm_v2/GPQA": 0.3087,
140
+ "hfopenllm_v2/MUSR": 0.4099,
141
+ "hfopenllm_v2/MMLU-PRO": 0.3994
142
+ }
143
+ },
144
+ {
145
+ "id": "01-ai/Yi-34B",
146
+ "name": "Yi-34B",
147
+ "developer": "01-ai",
148
+ "evaluator_relationship": null,
149
+ "benchmark_scores": {
150
+ "hfopenllm_v2/IFEval": 0.3046,
151
+ "hfopenllm_v2/BBH": 0.5457,
152
+ "hfopenllm_v2/MATH Level 5": 0.0514,
153
+ "hfopenllm_v2/GPQA": 0.3666,
154
+ "hfopenllm_v2/MUSR": 0.4119,
155
+ "hfopenllm_v2/MMLU-PRO": 0.4412
156
+ }
157
+ },
158
+ {
159
+ "id": "01-ai/Yi-34B-200K",
160
+ "name": "Yi-34B-200K",
161
+ "developer": "01-ai",
162
+ "evaluator_relationship": null,
163
+ "benchmark_scores": {
164
+ "hfopenllm_v2/IFEval": 0.1542,
165
+ "hfopenllm_v2/BBH": 0.5442,
166
+ "hfopenllm_v2/MATH Level 5": 0.0574,
167
+ "hfopenllm_v2/GPQA": 0.3565,
168
+ "hfopenllm_v2/MUSR": 0.3817,
169
+ "hfopenllm_v2/MMLU-PRO": 0.4535
170
+ }
171
+ },
172
+ {
173
+ "id": "01-ai/Yi-34B-Chat",
174
+ "name": "Yi-34B-Chat",
175
+ "developer": "01-ai",
176
+ "evaluator_relationship": null,
177
+ "benchmark_scores": {
178
+ "hfopenllm_v2/IFEval": 0.4699,
179
+ "hfopenllm_v2/BBH": 0.5561,
180
+ "hfopenllm_v2/MATH Level 5": 0.0627,
181
+ "hfopenllm_v2/GPQA": 0.3381,
182
+ "hfopenllm_v2/MUSR": 0.3978,
183
+ "hfopenllm_v2/MMLU-PRO": 0.4093
184
+ }
185
+ },
186
+ {
187
+ "id": "01-ai/Yi-6B",
188
+ "name": "Yi-6B",
189
+ "developer": "01-ai",
190
+ "evaluator_relationship": null,
191
+ "benchmark_scores": {
192
+ "hfopenllm_v2/IFEval": 0.2893,
193
+ "hfopenllm_v2/BBH": 0.4309,
194
+ "hfopenllm_v2/MATH Level 5": 0.0159,
195
+ "hfopenllm_v2/GPQA": 0.2693,
196
+ "hfopenllm_v2/MUSR": 0.3937,
197
+ "hfopenllm_v2/MMLU-PRO": 0.2991
198
+ }
199
+ },
200
+ {
201
+ "id": "01-ai/Yi-6B-200K",
202
+ "name": "Yi-6B-200K",
203
+ "developer": "01-ai",
204
+ "evaluator_relationship": null,
205
+ "benchmark_scores": {
206
+ "hfopenllm_v2/IFEval": 0.0843,
207
+ "hfopenllm_v2/BBH": 0.4289,
208
+ "hfopenllm_v2/MATH Level 5": 0.0181,
209
+ "hfopenllm_v2/GPQA": 0.2819,
210
+ "hfopenllm_v2/MUSR": 0.4587,
211
+ "hfopenllm_v2/MMLU-PRO": 0.2844
212
+ }
213
+ },
214
+ {
215
+ "id": "01-ai/Yi-6B-Chat",
216
+ "name": "Yi-6B-Chat",
217
+ "developer": "01-ai",
218
+ "evaluator_relationship": null,
219
+ "benchmark_scores": {
220
+ "hfopenllm_v2/IFEval": 0.3395,
221
+ "hfopenllm_v2/BBH": 0.4133,
222
+ "hfopenllm_v2/MATH Level 5": 0.0136,
223
+ "hfopenllm_v2/GPQA": 0.2945,
224
+ "hfopenllm_v2/MUSR": 0.3688,
225
+ "hfopenllm_v2/MMLU-PRO": 0.3061
226
+ }
227
+ },
228
+ {
229
+ "id": "01-ai/Yi-9B",
230
+ "name": "Yi-9B",
231
+ "developer": "01-ai",
232
+ "evaluator_relationship": null,
233
+ "benchmark_scores": {
234
+ "hfopenllm_v2/IFEval": 0.2709,
235
+ "hfopenllm_v2/BBH": 0.494,
236
+ "hfopenllm_v2/MATH Level 5": 0.0559,
237
+ "hfopenllm_v2/GPQA": 0.318,
238
+ "hfopenllm_v2/MUSR": 0.4054,
239
+ "hfopenllm_v2/MMLU-PRO": 0.3574
240
+ }
241
+ },
242
+ {
243
+ "id": "01-ai/Yi-9B-200K",
244
+ "name": "Yi-9B-200K",
245
+ "developer": "01-ai",
246
+ "evaluator_relationship": null,
247
+ "benchmark_scores": {
248
+ "hfopenllm_v2/IFEval": 0.2327,
249
+ "hfopenllm_v2/BBH": 0.4793,
250
+ "hfopenllm_v2/MATH Level 5": 0.0665,
251
+ "hfopenllm_v2/GPQA": 0.3154,
252
+ "hfopenllm_v2/MUSR": 0.4294,
253
+ "hfopenllm_v2/MMLU-PRO": 0.3622
254
+ }
255
+ },
256
+ {
257
+ "id": "01-ai/Yi-Coder-9B-Chat",
258
+ "name": "Yi-Coder-9B-Chat",
259
+ "developer": "01-ai",
260
+ "evaluator_relationship": null,
261
+ "benchmark_scores": {
262
+ "hfopenllm_v2/IFEval": 0.4817,
263
+ "hfopenllm_v2/BBH": 0.4814,
264
+ "hfopenllm_v2/MATH Level 5": 0.04,
265
+ "hfopenllm_v2/GPQA": 0.2475,
266
+ "hfopenllm_v2/MUSR": 0.3992,
267
+ "hfopenllm_v2/MMLU-PRO": 0.2425
268
+ }
269
+ },
270
+ {
271
+ "id": "01-ai/yi-34b",
272
+ "name": "Yi 34B",
273
+ "developer": "01-ai",
274
+ "evaluator_relationship": null,
275
+ "benchmark_scores": {
276
+ "helm_lite/Mean win rate": 0.57,
277
+ "helm_lite/NarrativeQA": 0.782,
278
+ "helm_lite/NaturalQuestions (closed-book)": 0.443,
279
+ "helm_lite/OpenbookQA": 0.92,
280
+ "helm_lite/MMLU": 0.65,
281
+ "helm_lite/MATH": 0.375,
282
+ "helm_lite/GSM8K": 0.648,
283
+ "helm_lite/LegalBench": 0.618,
284
+ "helm_lite/MedQA": 0.656,
285
+ "helm_lite/WMT 2014": 0.172,
286
+ "helm_mmlu/MMLU All Subjects": 0.762,
287
+ "helm_mmlu/Abstract Algebra": 0.4,
288
+ "helm_mmlu/Anatomy": 0.748,
289
+ "helm_mmlu/College Physics": 0.5,
290
+ "helm_mmlu/Computer Security": 0.83,
291
+ "helm_mmlu/Econometrics": 0.588,
292
+ "helm_mmlu/Global Facts": 0.53,
293
+ "helm_mmlu/Jurisprudence": 0.898,
294
+ "helm_mmlu/Philosophy": 0.82,
295
+ "helm_mmlu/Professional Psychology": 0.835,
296
+ "helm_mmlu/Us Foreign Policy": 0.91,
297
+ "helm_mmlu/Astronomy": 0.901,
298
+ "helm_mmlu/Business Ethics": 0.75,
299
+ "helm_mmlu/Clinical Knowledge": 0.8,
300
+ "helm_mmlu/Conceptual Physics": 0.77,
301
+ "helm_mmlu/Electrical Engineering": 0.779,
302
+ "helm_mmlu/Elementary Mathematics": 0.656,
303
+ "helm_mmlu/Formal Logic": 0.548,
304
+ "helm_mmlu/High School World History": 0.907,
305
+ "helm_mmlu/Human Sexuality": 0.87,
306
+ "helm_mmlu/International Law": 0.909,
307
+ "helm_mmlu/Logical Fallacies": 0.883,
308
+ "helm_mmlu/Machine Learning": 0.58,
309
+ "helm_mmlu/Management": 0.893,
310
+ "helm_mmlu/Marketing": 0.936,
311
+ "helm_mmlu/Medical Genetics": 0.87,
312
+ "helm_mmlu/Miscellaneous": 0.902,
313
+ "helm_mmlu/Moral Scenarios": 0.606,
314
+ "helm_mmlu/Nutrition": 0.869,
315
+ "helm_mmlu/Prehistory": 0.877,
316
+ "helm_mmlu/Public Relations": 0.745,
317
+ "helm_mmlu/Security Studies": 0.833,
318
+ "helm_mmlu/Sociology": 0.9,
319
+ "helm_mmlu/Virology": 0.572,
320
+ "helm_mmlu/World Religions": 0.877,
321
+ "helm_mmlu/Mean win rate": 0.315
322
+ }
323
+ },
324
+ {
325
+ "id": "01-ai/yi-6b",
326
+ "name": "Yi 6B",
327
+ "developer": "01-ai",
328
+ "evaluator_relationship": null,
329
+ "benchmark_scores": {
330
+ "helm_lite/Mean win rate": 0.253,
331
+ "helm_lite/NarrativeQA": 0.702,
332
+ "helm_lite/NaturalQuestions (closed-book)": 0.31,
333
+ "helm_lite/OpenbookQA": 0.8,
334
+ "helm_lite/MMLU": 0.53,
335
+ "helm_lite/MATH": 0.126,
336
+ "helm_lite/GSM8K": 0.375,
337
+ "helm_lite/LegalBench": 0.519,
338
+ "helm_lite/MedQA": 0.497,
339
+ "helm_lite/WMT 2014": 0.117,
340
+ "helm_mmlu/MMLU All Subjects": 0.64,
341
+ "helm_mmlu/Abstract Algebra": 0.3,
342
+ "helm_mmlu/Anatomy": 0.6,
343
+ "helm_mmlu/College Physics": 0.422,
344
+ "helm_mmlu/Computer Security": 0.73,
345
+ "helm_mmlu/Econometrics": 0.351,
346
+ "helm_mmlu/Global Facts": 0.43,
347
+ "helm_mmlu/Jurisprudence": 0.796,
348
+ "helm_mmlu/Philosophy": 0.678,
349
+ "helm_mmlu/Professional Psychology": 0.668,
350
+ "helm_mmlu/Us Foreign Policy": 0.87,
351
+ "helm_mmlu/Astronomy": 0.684,
352
+ "helm_mmlu/Business Ethics": 0.67,
353
+ "helm_mmlu/Clinical Knowledge": 0.66,
354
+ "helm_mmlu/Conceptual Physics": 0.621,
355
+ "helm_mmlu/Electrical Engineering": 0.662,
356
+ "helm_mmlu/Elementary Mathematics": 0.452,
357
+ "helm_mmlu/Formal Logic": 0.452,
358
+ "helm_mmlu/High School World History": 0.785,
359
+ "helm_mmlu/Human Sexuality": 0.763,
360
+ "helm_mmlu/International Law": 0.769,
361
+ "helm_mmlu/Logical Fallacies": 0.779,
362
+ "helm_mmlu/Machine Learning": 0.411,
363
+ "helm_mmlu/Management": 0.806,
364
+ "helm_mmlu/Marketing": 0.893,
365
+ "helm_mmlu/Medical Genetics": 0.77,
366
+ "helm_mmlu/Miscellaneous": 0.796,
367
+ "helm_mmlu/Moral Scenarios": 0.335,
368
+ "helm_mmlu/Nutrition": 0.739,
369
+ "helm_mmlu/Prehistory": 0.713,
370
+ "helm_mmlu/Public Relations": 0.718,
371
+ "helm_mmlu/Security Studies": 0.735,
372
+ "helm_mmlu/Sociology": 0.831,
373
+ "helm_mmlu/Virology": 0.452,
374
+ "helm_mmlu/World Religions": 0.836,
375
+ "helm_mmlu/Mean win rate": 0.651
376
+ }
377
+ },
378
+ {
379
+ "id": "01-ai/yi-large-preview",
380
+ "name": "Yi Large Preview",
381
+ "developer": "01-ai",
382
+ "evaluator_relationship": null,
383
+ "benchmark_scores": {
384
+ "helm_lite/Mean win rate": 0.471,
385
+ "helm_lite/NarrativeQA": 0.373,
386
+ "helm_lite/NaturalQuestions (closed-book)": 0.428,
387
+ "helm_lite/OpenbookQA": 0.946,
388
+ "helm_lite/MMLU": 0.712,
389
+ "helm_lite/MATH": 0.712,
390
+ "helm_lite/GSM8K": 0.69,
391
+ "helm_lite/LegalBench": 0.519,
392
+ "helm_lite/MedQA": 0.66,
393
+ "helm_lite/WMT 2014": 0.176,
394
+ "helm_mmlu/MMLU All Subjects": 0.793,
395
+ "helm_mmlu/Abstract Algebra": 0.6,
396
+ "helm_mmlu/Anatomy": 0.83,
397
+ "helm_mmlu/College Physics": 0.569,
398
+ "helm_mmlu/Computer Security": 0.86,
399
+ "helm_mmlu/Econometrics": 0.728,
400
+ "helm_mmlu/Global Facts": 0.52,
401
+ "helm_mmlu/Jurisprudence": 0.852,
402
+ "helm_mmlu/Philosophy": 0.842,
403
+ "helm_mmlu/Professional Psychology": 0.853,
404
+ "helm_mmlu/Us Foreign Policy": 0.85,
405
+ "helm_mmlu/Astronomy": 0.914,
406
+ "helm_mmlu/Business Ethics": 0.8,
407
+ "helm_mmlu/Clinical Knowledge": 0.857,
408
+ "helm_mmlu/Conceptual Physics": 0.864,
409
+ "helm_mmlu/Electrical Engineering": 0.779,
410
+ "helm_mmlu/Elementary Mathematics": 0.685,
411
+ "helm_mmlu/Formal Logic": 0.603,
412
+ "helm_mmlu/High School World History": 0.928,
413
+ "helm_mmlu/Human Sexuality": 0.901,
414
+ "helm_mmlu/International Law": 0.917,
415
+ "helm_mmlu/Logical Fallacies": 0.865,
416
+ "helm_mmlu/Machine Learning": 0.616,
417
+ "helm_mmlu/Management": 0.903,
418
+ "helm_mmlu/Marketing": 0.927,
419
+ "helm_mmlu/Medical Genetics": 0.83,
420
+ "helm_mmlu/Miscellaneous": 0.916,
421
+ "helm_mmlu/Moral Scenarios": 0.831,
422
+ "helm_mmlu/Nutrition": 0.846,
423
+ "helm_mmlu/Prehistory": 0.892,
424
+ "helm_mmlu/Public Relations": 0.827,
425
+ "helm_mmlu/Security Studies": 0.82,
426
+ "helm_mmlu/Sociology": 0.881,
427
+ "helm_mmlu/Virology": 0.59,
428
+ "helm_mmlu/World Religions": 0.871,
429
+ "helm_mmlu/Mean win rate": 0.258
430
+ }
431
+ }
432
+ ]
433
+ }
data/developers/1-800-LLMs.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "1-800-LLMs",
3
+ "models": [
4
+ {
5
+ "id": "1-800-LLMs/Qwen-2.5-14B-Hindi",
6
+ "name": "Qwen-2.5-14B-Hindi",
7
+ "developer": "1-800-LLMs",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.5826,
11
+ "hfopenllm_v2/BBH": 0.6524,
12
+ "hfopenllm_v2/MATH Level 5": 0.3331,
13
+ "hfopenllm_v2/GPQA": 0.3624,
14
+ "hfopenllm_v2/MUSR": 0.4489,
15
+ "hfopenllm_v2/MMLU-PRO": 0.5263
16
+ }
17
+ },
18
+ {
19
+ "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct",
20
+ "name": "Qwen-2.5-14B-Hindi-Custom-Instruct",
21
+ "developer": "1-800-LLMs",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3077,
25
+ "hfopenllm_v2/BBH": 0.6284,
26
+ "hfopenllm_v2/MATH Level 5": 0.3112,
27
+ "hfopenllm_v2/GPQA": 0.37,
28
+ "hfopenllm_v2/MUSR": 0.4491,
29
+ "hfopenllm_v2/MMLU-PRO": 0.5164
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/1024m.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "1024m",
3
+ "models": [
4
+ {
5
+ "id": "1024m/PHI-4-Hindi",
6
+ "name": "PHI-4-Hindi",
7
+ "developer": "1024m",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.0082,
11
+ "hfopenllm_v2/BBH": 0.671,
12
+ "hfopenllm_v2/MATH Level 5": 0.2334,
13
+ "hfopenllm_v2/GPQA": 0.3977,
14
+ "hfopenllm_v2/MUSR": 0.4914,
15
+ "hfopenllm_v2/MMLU-PRO": 0.5239
16
+ }
17
+ },
18
+ {
19
+ "id": "1024m/QWEN-14B-B100",
20
+ "name": "QWEN-14B-B100",
21
+ "developer": "1024m",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.7762,
25
+ "hfopenllm_v2/BBH": 0.6533,
26
+ "hfopenllm_v2/MATH Level 5": 0.5438,
27
+ "hfopenllm_v2/GPQA": 0.3507,
28
+ "hfopenllm_v2/MUSR": 0.41,
29
+ "hfopenllm_v2/MMLU-PRO": 0.5179
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/152334H.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "152334H",
3
+ "models": [
4
+ {
5
+ "id": "152334H/miqu-1-70b-sf",
6
+ "name": "miqu-1-70b-sf",
7
+ "developer": "152334H",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.5182,
11
+ "hfopenllm_v2/BBH": 0.6102,
12
+ "hfopenllm_v2/MATH Level 5": 0.1246,
13
+ "hfopenllm_v2/GPQA": 0.3507,
14
+ "hfopenllm_v2/MUSR": 0.4582,
15
+ "hfopenllm_v2/MMLU-PRO": 0.4228
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/1TuanPham.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "1TuanPham",
3
+ "models": [
4
+ {
5
+ "id": "1TuanPham/T-VisStar-7B-v0.1",
6
+ "name": "T-VisStar-7B-v0.1",
7
+ "developer": "1TuanPham",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3607,
11
+ "hfopenllm_v2/BBH": 0.5052,
12
+ "hfopenllm_v2/MATH Level 5": 0.0574,
13
+ "hfopenllm_v2/GPQA": 0.2852,
14
+ "hfopenllm_v2/MUSR": 0.4375,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3211
16
+ }
17
+ },
18
+ {
19
+ "id": "1TuanPham/T-VisStar-v0.1",
20
+ "name": "T-VisStar-v0.1",
21
+ "developer": "1TuanPham",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3607,
25
+ "hfopenllm_v2/BBH": 0.5052,
26
+ "hfopenllm_v2/MATH Level 5": 0.0574,
27
+ "hfopenllm_v2/GPQA": 0.2852,
28
+ "hfopenllm_v2/MUSR": 0.4375,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3211
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/3rd-Degree-Burn.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "3rd-Degree-Burn",
3
+ "models": [
4
+ {
5
+ "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B",
6
+ "name": "L-3.1-Science-Writer-8B",
7
+ "developer": "3rd-Degree-Burn",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.4263,
11
+ "hfopenllm_v2/BBH": 0.5041,
12
+ "hfopenllm_v2/MATH Level 5": 0.1035,
13
+ "hfopenllm_v2/GPQA": 0.2743,
14
+ "hfopenllm_v2/MUSR": 0.3959,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3649
16
+ }
17
+ },
18
+ {
19
+ "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot",
20
+ "name": "Llama-3.1-8B-Squareroot",
21
+ "developer": "3rd-Degree-Burn",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.2213,
25
+ "hfopenllm_v2/BBH": 0.3461,
26
+ "hfopenllm_v2/MATH Level 5": 0.2659,
27
+ "hfopenllm_v2/GPQA": 0.2567,
28
+ "hfopenllm_v2/MUSR": 0.3089,
29
+ "hfopenllm_v2/MMLU-PRO": 0.175
30
+ }
31
+ },
32
+ {
33
+ "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1",
34
+ "name": "Llama-3.1-8B-Squareroot-v1",
35
+ "developer": "3rd-Degree-Burn",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.2892,
39
+ "hfopenllm_v2/BBH": 0.3343,
40
+ "hfopenllm_v2/MATH Level 5": 0.0884,
41
+ "hfopenllm_v2/GPQA": 0.2559,
42
+ "hfopenllm_v2/MUSR": 0.3341,
43
+ "hfopenllm_v2/MMLU-PRO": 0.1127
44
+ }
45
+ },
46
+ {
47
+ "id": "3rd-Degree-Burn/Llama-Squared-8B",
48
+ "name": "Llama-Squared-8B",
49
+ "developer": "3rd-Degree-Burn",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.2755,
53
+ "hfopenllm_v2/BBH": 0.4431,
54
+ "hfopenllm_v2/MATH Level 5": 0.0574,
55
+ "hfopenllm_v2/GPQA": 0.2718,
56
+ "hfopenllm_v2/MUSR": 0.3089,
57
+ "hfopenllm_v2/MMLU-PRO": 0.2366
58
+ }
59
+ }
60
+ ]
61
+ }
data/developers/4season.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "4season",
3
+ "models": [
4
+ {
5
+ "id": "4season/final_model_test_v2",
6
+ "name": "final_model_test_v2",
7
+ "developer": "4season",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3191,
11
+ "hfopenllm_v2/BBH": 0.6342,
12
+ "hfopenllm_v2/MATH Level 5": 0.0838,
13
+ "hfopenllm_v2/GPQA": 0.3272,
14
+ "hfopenllm_v2/MUSR": 0.4314,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3528
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/AALF.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AALF",
3
+ "models": [
4
+ {
5
+ "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview",
6
+ "name": "FuseChat-Llama-3.1-8B-Instruct-preview",
7
+ "developer": "AALF",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.719,
11
+ "hfopenllm_v2/BBH": 0.512,
12
+ "hfopenllm_v2/MATH Level 5": 0.2477,
13
+ "hfopenllm_v2/GPQA": 0.3054,
14
+ "hfopenllm_v2/MUSR": 0.382,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3733
16
+ }
17
+ },
18
+ {
19
+ "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview",
20
+ "name": "FuseChat-Llama-3.1-8B-SFT-preview",
21
+ "developer": "AALF",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.7281,
25
+ "hfopenllm_v2/BBH": 0.524,
26
+ "hfopenllm_v2/MATH Level 5": 0.2251,
27
+ "hfopenllm_v2/GPQA": 0.3045,
28
+ "hfopenllm_v2/MUSR": 0.402,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3743
30
+ }
31
+ },
32
+ {
33
+ "id": "AALF/gemma-2-27b-it-SimPO-37K",
34
+ "name": "gemma-2-27b-it-SimPO-37K",
35
+ "developer": "AALF",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.2407,
39
+ "hfopenllm_v2/BBH": 0.3911,
40
+ "hfopenllm_v2/MATH Level 5": 0.0128,
41
+ "hfopenllm_v2/GPQA": 0.2802,
42
+ "hfopenllm_v2/MUSR": 0.3488,
43
+ "hfopenllm_v2/MMLU-PRO": 0.1971
44
+ }
45
+ },
46
+ {
47
+ "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps",
48
+ "name": "gemma-2-27b-it-SimPO-37K-100steps",
49
+ "developer": "AALF",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.2568,
53
+ "hfopenllm_v2/BBH": 0.3931,
54
+ "hfopenllm_v2/MATH Level 5": 0.0211,
55
+ "hfopenllm_v2/GPQA": 0.2886,
56
+ "hfopenllm_v2/MUSR": 0.3329,
57
+ "hfopenllm_v2/MMLU-PRO": 0.2125
58
+ }
59
+ }
60
+ ]
61
+ }
data/developers/AELLM.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AELLM",
3
+ "models": [
4
+ {
5
+ "id": "AELLM/gemma-2-aeria-infinity-9b",
6
+ "name": "gemma-2-aeria-infinity-9b",
7
+ "developer": "AELLM",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.7594,
11
+ "hfopenllm_v2/BBH": 0.5983,
12
+ "hfopenllm_v2/MATH Level 5": 0.2145,
13
+ "hfopenllm_v2/GPQA": 0.3339,
14
+ "hfopenllm_v2/MUSR": 0.402,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3862
16
+ }
17
+ },
18
+ {
19
+ "id": "AELLM/gemma-2-lyco-infinity-9b",
20
+ "name": "gemma-2-lyco-infinity-9b",
21
+ "developer": "AELLM",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.7316,
25
+ "hfopenllm_v2/BBH": 0.584,
26
+ "hfopenllm_v2/MATH Level 5": 0.1707,
27
+ "hfopenllm_v2/GPQA": 0.328,
28
+ "hfopenllm_v2/MUSR": 0.4006,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3787
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/AGI-0.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AGI-0",
3
+ "models": [
4
+ {
5
+ "id": "AGI-0/Art-v0-3B",
6
+ "name": "Art-v0-3B",
7
+ "developer": "AGI-0",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3192,
11
+ "hfopenllm_v2/BBH": 0.3401,
12
+ "hfopenllm_v2/MATH Level 5": 0.2462,
13
+ "hfopenllm_v2/GPQA": 0.2592,
14
+ "hfopenllm_v2/MUSR": 0.3768,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1179
16
+ }
17
+ },
18
+ {
19
+ "id": "AGI-0/Artificium-llama3.1-8B-001",
20
+ "name": "Artificium-llama3.1-8B-001",
21
+ "developer": "AGI-0",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.5248,
25
+ "hfopenllm_v2/BBH": 0.4256,
26
+ "hfopenllm_v2/MATH Level 5": 0.136,
27
+ "hfopenllm_v2/GPQA": 0.2659,
28
+ "hfopenllm_v2/MUSR": 0.3795,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3182
30
+ }
31
+ },
32
+ {
33
+ "id": "AGI-0/smartllama3.1-8B-001",
34
+ "name": "smartllama3.1-8B-001",
35
+ "developer": "AGI-0",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.3518,
39
+ "hfopenllm_v2/BBH": 0.467,
40
+ "hfopenllm_v2/MATH Level 5": 0.1299,
41
+ "hfopenllm_v2/GPQA": 0.3062,
42
+ "hfopenllm_v2/MUSR": 0.4386,
43
+ "hfopenllm_v2/MMLU-PRO": 0.3487
44
+ }
45
+ }
46
+ ]
47
+ }
data/developers/AI-MO.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AI-MO",
3
+ "models": [
4
+ {
5
+ "id": "AI-MO/NuminaMath-7B-CoT",
6
+ "name": "NuminaMath-7B-CoT",
7
+ "developer": "AI-MO",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2689,
11
+ "hfopenllm_v2/BBH": 0.4314,
12
+ "hfopenllm_v2/MATH Level 5": 0.2696,
13
+ "hfopenllm_v2/GPQA": 0.2659,
14
+ "hfopenllm_v2/MUSR": 0.3303,
15
+ "hfopenllm_v2/MMLU-PRO": 0.2868
16
+ }
17
+ },
18
+ {
19
+ "id": "AI-MO/NuminaMath-7B-TIR",
20
+ "name": "NuminaMath-7B-TIR",
21
+ "developer": "AI-MO",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.2756,
25
+ "hfopenllm_v2/BBH": 0.4144,
26
+ "hfopenllm_v2/MATH Level 5": 0.1609,
27
+ "hfopenllm_v2/GPQA": 0.2584,
28
+ "hfopenllm_v2/MUSR": 0.3509,
29
+ "hfopenllm_v2/MMLU-PRO": 0.2733
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/AI-Sweden-Models.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AI-Sweden-Models",
3
+ "models": [
4
+ {
5
+ "id": "AI-Sweden-Models/Llama-3-8B-instruct",
6
+ "name": "Llama-3-8B-instruct",
7
+ "developer": "AI-Sweden-Models",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2401,
11
+ "hfopenllm_v2/BBH": 0.4173,
12
+ "hfopenllm_v2/MATH Level 5": 0.0385,
13
+ "hfopenllm_v2/GPQA": 0.2659,
14
+ "hfopenllm_v2/MUSR": 0.4771,
15
+ "hfopenllm_v2/MMLU-PRO": 0.2597
16
+ }
17
+ },
18
+ {
19
+ "id": "AI-Sweden-Models/gpt-sw3-40b",
20
+ "name": "gpt-sw3-40b",
21
+ "developer": "AI-Sweden-Models",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.147,
25
+ "hfopenllm_v2/BBH": 0.3268,
26
+ "hfopenllm_v2/MATH Level 5": 0.0174,
27
+ "hfopenllm_v2/GPQA": 0.2349,
28
+ "hfopenllm_v2/MUSR": 0.3632,
29
+ "hfopenllm_v2/MMLU-PRO": 0.1276
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/AI4free.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AI4free",
3
+ "models": [
4
+ {
5
+ "id": "AI4free/Dhanishtha",
6
+ "name": "Dhanishtha",
7
+ "developer": "AI4free",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2451,
11
+ "hfopenllm_v2/BBH": 0.3404,
12
+ "hfopenllm_v2/MATH Level 5": 0.256,
13
+ "hfopenllm_v2/GPQA": 0.2525,
14
+ "hfopenllm_v2/MUSR": 0.3569,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1643
16
+ }
17
+ },
18
+ {
19
+ "id": "AI4free/t2",
20
+ "name": "t2",
21
+ "developer": "AI4free",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3867,
25
+ "hfopenllm_v2/BBH": 0.291,
26
+ "hfopenllm_v2/MATH Level 5": 0.1896,
27
+ "hfopenllm_v2/GPQA": 0.2576,
28
+ "hfopenllm_v2/MUSR": 0.3846,
29
+ "hfopenllm_v2/MMLU-PRO": 0.1144
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/AIDC-AI.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AIDC-AI",
3
+ "models": [
4
+ {
5
+ "id": "AIDC-AI/Marco-o1",
6
+ "name": "Marco-o1",
7
+ "developer": "AIDC-AI",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.4771,
11
+ "hfopenllm_v2/BBH": 0.5364,
12
+ "hfopenllm_v2/MATH Level 5": 0.3746,
13
+ "hfopenllm_v2/GPQA": 0.2592,
14
+ "hfopenllm_v2/MUSR": 0.4138,
15
+ "hfopenllm_v2/MMLU-PRO": 0.4117
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/Aashraf995.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Aashraf995",
3
+ "models": [
4
+ {
5
+ "id": "Aashraf995/Creative-7B-nerd",
6
+ "name": "Creative-7B-nerd",
7
+ "developer": "Aashraf995",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.4722,
11
+ "hfopenllm_v2/BBH": 0.5607,
12
+ "hfopenllm_v2/MATH Level 5": 0.3165,
13
+ "hfopenllm_v2/GPQA": 0.3263,
14
+ "hfopenllm_v2/MUSR": 0.4515,
15
+ "hfopenllm_v2/MMLU-PRO": 0.4492
16
+ }
17
+ },
18
+ {
19
+ "id": "Aashraf995/Gemma-Evo-10B",
20
+ "name": "Gemma-Evo-10B",
21
+ "developer": "Aashraf995",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.7332,
25
+ "hfopenllm_v2/BBH": 0.6044,
26
+ "hfopenllm_v2/MATH Level 5": 0.2228,
27
+ "hfopenllm_v2/GPQA": 0.354,
28
+ "hfopenllm_v2/MUSR": 0.4595,
29
+ "hfopenllm_v2/MMLU-PRO": 0.4275
30
+ }
31
+ },
32
+ {
33
+ "id": "Aashraf995/Qwen-Evo-7B",
34
+ "name": "Qwen-Evo-7B",
35
+ "developer": "Aashraf995",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.4757,
39
+ "hfopenllm_v2/BBH": 0.5709,
40
+ "hfopenllm_v2/MATH Level 5": 0.3142,
41
+ "hfopenllm_v2/GPQA": 0.3255,
42
+ "hfopenllm_v2/MUSR": 0.4541,
43
+ "hfopenllm_v2/MMLU-PRO": 0.4462
44
+ }
45
+ },
46
+ {
47
+ "id": "Aashraf995/QwenStock-14B",
48
+ "name": "QwenStock-14B",
49
+ "developer": "Aashraf995",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.5009,
53
+ "hfopenllm_v2/BBH": 0.655,
54
+ "hfopenllm_v2/MATH Level 5": 0.3573,
55
+ "hfopenllm_v2/GPQA": 0.3893,
56
+ "hfopenllm_v2/MUSR": 0.4793,
57
+ "hfopenllm_v2/MMLU-PRO": 0.5382
58
+ }
59
+ }
60
+ ]
61
+ }
data/developers/AbacusResearch.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AbacusResearch",
3
+ "models": [
4
+ {
5
+ "id": "AbacusResearch/Jallabi-34B",
6
+ "name": "Jallabi-34B",
7
+ "developer": "AbacusResearch",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3529,
11
+ "hfopenllm_v2/BBH": 0.6023,
12
+ "hfopenllm_v2/MATH Level 5": 0.0521,
13
+ "hfopenllm_v2/GPQA": 0.3389,
14
+ "hfopenllm_v2/MUSR": 0.4822,
15
+ "hfopenllm_v2/MMLU-PRO": 0.4682
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/Ahdoot.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Ahdoot",
3
+ "models": [
4
+ {
5
+ "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure",
6
+ "name": "StructuredThinker-v0.3-MoreStructure",
7
+ "developer": "Ahdoot",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.4193,
11
+ "hfopenllm_v2/BBH": 0.4838,
12
+ "hfopenllm_v2/MATH Level 5": 0.2908,
13
+ "hfopenllm_v2/GPQA": 0.297,
14
+ "hfopenllm_v2/MUSR": 0.4158,
15
+ "hfopenllm_v2/MMLU-PRO": 0.361
16
+ }
17
+ },
18
+ {
19
+ "id": "Ahdoot/Test_StealthThinker",
20
+ "name": "Test_StealthThinker",
21
+ "developer": "Ahdoot",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.422,
25
+ "hfopenllm_v2/BBH": 0.4647,
26
+ "hfopenllm_v2/MATH Level 5": 0.179,
27
+ "hfopenllm_v2/GPQA": 0.2961,
28
+ "hfopenllm_v2/MUSR": 0.428,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3597
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/Ahjeong.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Ahjeong",
3
+ "models": [
4
+ {
5
+ "id": "Ahjeong/MMPO_Gemma_7b",
6
+ "name": "Ahjeong/MMPO_Gemma_7b",
7
+ "developer": "Ahjeong",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "reward-bench/Score": 0.7587,
11
+ "reward-bench/Chat": 0.9693,
12
+ "reward-bench/Chat Hard": 0.614,
13
+ "reward-bench/Safety": 0.7135,
14
+ "reward-bench/Reasoning": 0.7756,
15
+ "reward-bench/Prior Sets (0.5 weight)": 0.6831
16
+ }
17
+ },
18
+ {
19
+ "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
20
+ "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
21
+ "developer": "Ahjeong",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "reward-bench/Score": 0.7652,
25
+ "reward-bench/Chat": 0.9721,
26
+ "reward-bench/Chat Hard": 0.6338,
27
+ "reward-bench/Safety": 0.7635,
28
+ "reward-bench/Reasoning": 0.7284,
29
+ "reward-bench/Prior Sets (0.5 weight)": 0.6913
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/AicoresSecurity.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AicoresSecurity",
3
+ "models": [
4
+ {
5
+ "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0",
6
+ "name": "Cybernet-Sec-3B-R1-V0",
7
+ "developer": "AicoresSecurity",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.6358,
11
+ "hfopenllm_v2/BBH": 0.4497,
12
+ "hfopenllm_v2/MATH Level 5": 0.1156,
13
+ "hfopenllm_v2/GPQA": 0.2634,
14
+ "hfopenllm_v2/MUSR": 0.3314,
15
+ "hfopenllm_v2/MMLU-PRO": 0.301
16
+ }
17
+ },
18
+ {
19
+ "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder",
20
+ "name": "Cybernet-Sec-3B-R1-V0-Coder",
21
+ "developer": "AicoresSecurity",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.7098,
25
+ "hfopenllm_v2/BBH": 0.4478,
26
+ "hfopenllm_v2/MATH Level 5": 0.1488,
27
+ "hfopenllm_v2/GPQA": 0.2718,
28
+ "hfopenllm_v2/MUSR": 0.3408,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3178
30
+ }
31
+ },
32
+ {
33
+ "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1",
34
+ "name": "Cybernet-Sec-3B-R1-V1",
35
+ "developer": "AicoresSecurity",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.6146,
39
+ "hfopenllm_v2/BBH": 0.4282,
40
+ "hfopenllm_v2/MATH Level 5": 0.1518,
41
+ "hfopenllm_v2/GPQA": 0.2609,
42
+ "hfopenllm_v2/MUSR": 0.3287,
43
+ "hfopenllm_v2/MMLU-PRO": 0.2876
44
+ }
45
+ },
46
+ {
47
+ "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1",
48
+ "name": "Cybernet-Sec-3B-R1-V1.1",
49
+ "developer": "AicoresSecurity",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.673,
53
+ "hfopenllm_v2/BBH": 0.4392,
54
+ "hfopenllm_v2/MATH Level 5": 0.176,
55
+ "hfopenllm_v2/GPQA": 0.271,
56
+ "hfopenllm_v2/MUSR": 0.3541,
57
+ "hfopenllm_v2/MMLU-PRO": 0.3088
58
+ }
59
+ }
60
+ ]
61
+ }
data/developers/Alepach.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Alepach",
3
+ "models": [
4
+ {
5
+ "id": "Alepach/notHumpback-M0",
6
+ "name": "notHumpback-M0",
7
+ "developer": "Alepach",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.235,
11
+ "hfopenllm_v2/BBH": 0.2785,
12
+ "hfopenllm_v2/MATH Level 5": 0.0189,
13
+ "hfopenllm_v2/GPQA": 0.2492,
14
+ "hfopenllm_v2/MUSR": 0.3552,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1119
16
+ }
17
+ },
18
+ {
19
+ "id": "Alepach/notHumpback-M1",
20
+ "name": "notHumpback-M1",
21
+ "developer": "Alepach",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.2207,
25
+ "hfopenllm_v2/BBH": 0.2882,
26
+ "hfopenllm_v2/MATH Level 5": 0.0159,
27
+ "hfopenllm_v2/GPQA": 0.2374,
28
+ "hfopenllm_v2/MUSR": 0.342,
29
+ "hfopenllm_v2/MMLU-PRO": 0.1091
30
+ }
31
+ },
32
+ {
33
+ "id": "Alepach/notHumpback-M1-v2",
34
+ "name": "notHumpback-M1-v2",
35
+ "developer": "Alepach",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.2277,
39
+ "hfopenllm_v2/BBH": 0.2776,
40
+ "hfopenllm_v2/MATH Level 5": 0.0219,
41
+ "hfopenllm_v2/GPQA": 0.2601,
42
+ "hfopenllm_v2/MUSR": 0.3473,
43
+ "hfopenllm_v2/MMLU-PRO": 0.1119
44
+ }
45
+ }
46
+ ]
47
+ }
data/developers/AlephAlpha.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "AlephAlpha",
3
+ "models": [
4
+ {
5
+ "id": "AlephAlpha/luminous-base",
6
+ "name": "Luminous Base 13B",
7
+ "developer": "AlephAlpha",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "helm_lite/Mean win rate": 0.041,
11
+ "helm_lite/NarrativeQA": 0.633,
12
+ "helm_lite/NaturalQuestions (closed-book)": 0.197,
13
+ "helm_lite/OpenbookQA": 0.286,
14
+ "helm_lite/MMLU": 0.243,
15
+ "helm_lite/MATH": 0.026,
16
+ "helm_lite/GSM8K": 0.028,
17
+ "helm_lite/LegalBench": 0.332,
18
+ "helm_lite/MedQA": 0.26,
19
+ "helm_lite/WMT 2014": 0.066
20
+ }
21
+ },
22
+ {
23
+ "id": "AlephAlpha/luminous-extended",
24
+ "name": "Luminous Extended 30B",
25
+ "developer": "AlephAlpha",
26
+ "evaluator_relationship": null,
27
+ "benchmark_scores": {
28
+ "helm_lite/Mean win rate": 0.078,
29
+ "helm_lite/NarrativeQA": 0.684,
30
+ "helm_lite/NaturalQuestions (closed-book)": 0.253,
31
+ "helm_lite/OpenbookQA": 0.272,
32
+ "helm_lite/MMLU": 0.248,
33
+ "helm_lite/MATH": 0.04,
34
+ "helm_lite/GSM8K": 0.075,
35
+ "helm_lite/LegalBench": 0.421,
36
+ "helm_lite/MedQA": 0.276,
37
+ "helm_lite/WMT 2014": 0.083
38
+ }
39
+ },
40
+ {
41
+ "id": "AlephAlpha/luminous-supreme",
42
+ "name": "Luminous Supreme 70B",
43
+ "developer": "AlephAlpha",
44
+ "evaluator_relationship": null,
45
+ "benchmark_scores": {
46
+ "helm_lite/Mean win rate": 0.145,
47
+ "helm_lite/NarrativeQA": 0.743,
48
+ "helm_lite/NaturalQuestions (closed-book)": 0.299,
49
+ "helm_lite/OpenbookQA": 0.284,
50
+ "helm_lite/MMLU": 0.316,
51
+ "helm_lite/MATH": 0.078,
52
+ "helm_lite/GSM8K": 0.137,
53
+ "helm_lite/LegalBench": 0.452,
54
+ "helm_lite/MedQA": 0.276,
55
+ "helm_lite/WMT 2014": 0.102
56
+ }
57
+ }
58
+ ]
59
+ }
data/developers/Alibaba-NLP.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Alibaba-NLP",
3
+ "models": [
4
+ {
5
+ "id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
6
+ "name": "gte-Qwen2-7B-instruct",
7
+ "developer": "Alibaba-NLP",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2255,
11
+ "hfopenllm_v2/BBH": 0.4495,
12
+ "hfopenllm_v2/MATH Level 5": 0.0642,
13
+ "hfopenllm_v2/GPQA": 0.245,
14
+ "hfopenllm_v2/MUSR": 0.3559,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3321
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/Alibaba.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Alibaba",
3
+ "models": [
4
+ {
5
+ "id": "alibaba/qwen-3-coder-480b",
6
+ "name": "Qwen 3 Coder 480B",
7
+ "developer": "Alibaba",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "terminal-bench-2.0/terminal-bench-2.0": 23.9
11
+ }
12
+ },
13
+ {
14
+ "id": "alibaba/qwen3-235b-a22b-thinking-2507",
15
+ "name": "qwen3-235b-a22b-thinking-2507",
16
+ "developer": "Alibaba",
17
+ "evaluator_relationship": null,
18
+ "benchmark_scores": {
19
+ "livecodebenchpro/Hard Problems": 0.0,
20
+ "livecodebenchpro/Medium Problems": 0.1267605633802817,
21
+ "livecodebenchpro/Easy Problems": 0.7605633802816901
22
+ }
23
+ },
24
+ {
25
+ "id": "alibaba/qwen3-30b-a3b",
26
+ "name": "qwen3-30b-a3b",
27
+ "developer": "Alibaba",
28
+ "evaluator_relationship": null,
29
+ "benchmark_scores": {
30
+ "livecodebenchpro/Hard Problems": 0.0,
31
+ "livecodebenchpro/Medium Problems": 0.028169014084507043,
32
+ "livecodebenchpro/Easy Problems": 0.5774647887323944
33
+ }
34
+ },
35
+ {
36
+ "id": "alibaba/qwen3-max",
37
+ "name": "alibaba/qwen3-max",
38
+ "developer": "Alibaba",
39
+ "evaluator_relationship": null,
40
+ "benchmark_scores": {
41
+ "livecodebenchpro/Hard Problems": 0.0,
42
+ "livecodebenchpro/Medium Problems": 0.04225352112676056,
43
+ "livecodebenchpro/Easy Problems": 0.36619718309859156
44
+ }
45
+ },
46
+ {
47
+ "id": "alibaba/qwen3-next-80b-a3b-thinking",
48
+ "name": "qwen3-next-80b-a3b-thinking",
49
+ "developer": "Alibaba",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "livecodebenchpro/Hard Problems": 0.0,
53
+ "livecodebenchpro/Medium Problems": 0.14084507042253522,
54
+ "livecodebenchpro/Easy Problems": 0.7464788732394366
55
+ }
56
+ }
57
+ ]
58
+ }
data/developers/Alsebay.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Alsebay",
3
+ "models": [
4
+ {
5
+ "id": "Alsebay/Qwen2.5-7B-test-novelist",
6
+ "name": "Qwen2.5-7B-test-novelist",
7
+ "developer": "Alsebay",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.5352,
11
+ "hfopenllm_v2/BBH": 0.5151,
12
+ "hfopenllm_v2/MATH Level 5": 0.2349,
13
+ "hfopenllm_v2/GPQA": 0.2911,
14
+ "hfopenllm_v2/MUSR": 0.4749,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3866
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/Amaorynho.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Amaorynho",
3
+ "models": [
4
+ {
5
+ "id": "Amaorynho/BBAI2006",
6
+ "name": "BBAI2006",
7
+ "developer": "Amaorynho",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.1467,
11
+ "hfopenllm_v2/BBH": 0.2704,
12
+ "hfopenllm_v2/MATH Level 5": 0.0,
13
+ "hfopenllm_v2/GPQA": 0.2525,
14
+ "hfopenllm_v2/MUSR": 0.3605,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1123
16
+ }
17
+ },
18
+ {
19
+ "id": "Amaorynho/BBAI270V4",
20
+ "name": "BBAI270V4",
21
+ "developer": "Amaorynho",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.199,
25
+ "hfopenllm_v2/BBH": 0.3071,
26
+ "hfopenllm_v2/MATH Level 5": 0.0083,
27
+ "hfopenllm_v2/GPQA": 0.2458,
28
+ "hfopenllm_v2/MUSR": 0.3314,
29
+ "hfopenllm_v2/MMLU-PRO": 0.1114
30
+ }
31
+ },
32
+ {
33
+ "id": "Amaorynho/BBAIIFEV1",
34
+ "name": "BBAIIFEV1",
35
+ "developer": "Amaorynho",
36
+ "evaluator_relationship": null,
37
+ "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.8047,
39
+ "hfopenllm_v2/BBH": 0.5292,
40
+ "hfopenllm_v2/MATH Level 5": 0.1934,
41
+ "hfopenllm_v2/GPQA": 0.3104,
42
+ "hfopenllm_v2/MUSR": 0.4185,
43
+ "hfopenllm_v2/MMLU-PRO": 0.3857
44
+ }
45
+ },
46
+ {
47
+ "id": "Amaorynho/BBAI_375",
48
+ "name": "BBAI_375",
49
+ "developer": "Amaorynho",
50
+ "evaluator_relationship": null,
51
+ "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.1467,
53
+ "hfopenllm_v2/BBH": 0.2704,
54
+ "hfopenllm_v2/MATH Level 5": 0.0,
55
+ "hfopenllm_v2/GPQA": 0.2525,
56
+ "hfopenllm_v2/MUSR": 0.3605,
57
+ "hfopenllm_v2/MMLU-PRO": 0.1123
58
+ }
59
+ }
60
+ ]
61
+ }
data/developers/Amu.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Amu",
3
+ "models": [
4
+ {
5
+ "id": "Amu/t1-1.5B",
6
+ "name": "t1-1.5B",
7
+ "developer": "Amu",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3394,
11
+ "hfopenllm_v2/BBH": 0.4008,
12
+ "hfopenllm_v2/MATH Level 5": 0.0514,
13
+ "hfopenllm_v2/GPQA": 0.2433,
14
+ "hfopenllm_v2/MUSR": 0.3517,
15
+ "hfopenllm_v2/MMLU-PRO": 0.2566
16
+ }
17
+ },
18
+ {
19
+ "id": "Amu/t1-3B",
20
+ "name": "t1-3B",
21
+ "developer": "Amu",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3328,
25
+ "hfopenllm_v2/BBH": 0.3999,
26
+ "hfopenllm_v2/MATH Level 5": 0.1375,
27
+ "hfopenllm_v2/GPQA": 0.2408,
28
+ "hfopenllm_v2/MUSR": 0.3435,
29
+ "hfopenllm_v2/MMLU-PRO": 0.1284
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/Anthropic.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Anthropic",
3
+ "models": [
4
+ {
5
+ "id": "Anthropic/claude-3-5-sonnet-20240620",
6
+ "name": "Anthropic/claude-3-5-sonnet-20240620",
7
+ "developer": "Anthropic",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "reward-bench/Score": 0.8417,
11
+ "reward-bench/Chat": 0.9637,
12
+ "reward-bench/Chat Hard": 0.7401,
13
+ "reward-bench/Safety": 0.8162,
14
+ "reward-bench/Reasoning": 0.8469
15
+ }
16
+ },
17
+ {
18
+ "id": "Anthropic/claude-3-haiku-20240307",
19
+ "name": "Anthropic/claude-3-haiku-20240307",
20
+ "developer": "Anthropic",
21
+ "evaluator_relationship": null,
22
+ "benchmark_scores": {
23
+ "reward-bench/Score": 0.7289,
24
+ "reward-bench/Chat": 0.9274,
25
+ "reward-bench/Chat Hard": 0.5197,
26
+ "reward-bench/Safety": 0.7953,
27
+ "reward-bench/Reasoning": 0.706,
28
+ "reward-bench/Prior Sets (0.5 weight)": 0.6635
29
+ }
30
+ },
31
+ {
32
+ "id": "Anthropic/claude-3-opus-20240229",
33
+ "name": "Anthropic/claude-3-opus-20240229",
34
+ "developer": "Anthropic",
35
+ "evaluator_relationship": null,
36
+ "benchmark_scores": {
37
+ "reward-bench/Score": 0.8008,
38
+ "reward-bench/Chat": 0.9469,
39
+ "reward-bench/Chat Hard": 0.6031,
40
+ "reward-bench/Safety": 0.8662,
41
+ "reward-bench/Reasoning": 0.7868
42
+ }
43
+ },
44
+ {
45
+ "id": "Anthropic/claude-3-sonnet-20240229",
46
+ "name": "Anthropic/claude-3-sonnet-20240229",
47
+ "developer": "Anthropic",
48
+ "evaluator_relationship": null,
49
+ "benchmark_scores": {
50
+ "reward-bench/Score": 0.7458,
51
+ "reward-bench/Chat": 0.9344,
52
+ "reward-bench/Chat Hard": 0.5658,
53
+ "reward-bench/Safety": 0.8169,
54
+ "reward-bench/Reasoning": 0.6907,
55
+ "reward-bench/Prior Sets (0.5 weight)": 0.6963
56
+ }
57
+ },
58
+ {
59
+ "id": "anthropic/claude-3.7-sonnet",
60
+ "name": "anthropic/claude-3.7-sonnet",
61
+ "developer": "Anthropic",
62
+ "evaluator_relationship": null,
63
+ "benchmark_scores": {
64
+ "livecodebenchpro/Hard Problems": 0.0,
65
+ "livecodebenchpro/Medium Problems": 0.014084507042253521,
66
+ "livecodebenchpro/Easy Problems": 0.15492957746478872
67
+ }
68
+ },
69
+ {
70
+ "id": "anthropic/claude-haiku-4.5",
71
+ "name": "Claude Haiku 4.5",
72
+ "developer": "Anthropic",
73
+ "evaluator_relationship": null,
74
+ "benchmark_scores": {
75
+ "terminal-bench-2.0/terminal-bench-2.0": 35.5
76
+ }
77
+ },
78
+ {
79
+ "id": "anthropic/claude-opus-4-5",
80
+ "name": "claude-opus-4-5",
81
+ "developer": "Anthropic",
82
+ "evaluator_relationship": null,
83
+ "benchmark_scores": {
84
+ "appworld_test_normal/appworld/test_normal": 0.66,
85
+ "browsecompplus/browsecompplus": 0.49,
86
+ "swe-bench/swe-bench": 0.65,
87
+ "tau-bench-2_airline/tau-bench-2/airline": 0.66,
88
+ "tau-bench-2_retail/tau-bench-2/retail": 0.85,
89
+ "tau-bench-2_telecom/tau-bench-2/telecom": 0.58
90
+ }
91
+ },
92
+ {
93
+ "id": "anthropic/claude-opus-4.1",
94
+ "name": "Claude Opus 4.1",
95
+ "developer": "Anthropic",
96
+ "evaluator_relationship": null,
97
+ "benchmark_scores": {
98
+ "terminal-bench-2.0/terminal-bench-2.0": 38.0
99
+ }
100
+ },
101
+ {
102
+ "id": "anthropic/claude-opus-4.5",
103
+ "name": "Claude Opus 4.5",
104
+ "developer": "Anthropic",
105
+ "evaluator_relationship": null,
106
+ "benchmark_scores": {
107
+ "terminal-bench-2.0/terminal-bench-2.0": 54.3
108
+ }
109
+ },
110
+ {
111
+ "id": "anthropic/claude-opus-4.6",
112
+ "name": "Claude Opus 4.6",
113
+ "developer": "Anthropic",
114
+ "evaluator_relationship": null,
115
+ "benchmark_scores": {
116
+ "terminal-bench-2.0/terminal-bench-2.0": 69.9
117
+ }
118
+ },
119
+ {
120
+ "id": "anthropic/claude-sonnet-4.5",
121
+ "name": "Claude Sonnet 4.5",
122
+ "developer": "Anthropic",
123
+ "evaluator_relationship": null,
124
+ "benchmark_scores": {
125
+ "terminal-bench-2.0/terminal-bench-2.0": 42.6
126
+ }
127
+ }
128
+ ]
129
+ }
data/developers/ArliAI.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "ArliAI",
3
+ "models": [
4
+ {
5
+ "id": "ArliAI/ArliAI-RPMax-12B-v1.1",
6
+ "name": "ArliAI-RPMax-12B-v1.1",
7
+ "developer": "ArliAI",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.5349,
11
+ "hfopenllm_v2/BBH": 0.4752,
12
+ "hfopenllm_v2/MATH Level 5": 0.1125,
13
+ "hfopenllm_v2/GPQA": 0.2819,
14
+ "hfopenllm_v2/MUSR": 0.3618,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3384
16
+ }
17
+ },
18
+ {
19
+ "id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1",
20
+ "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1",
21
+ "developer": "ArliAI",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.6359,
25
+ "hfopenllm_v2/BBH": 0.5016,
26
+ "hfopenllm_v2/MATH Level 5": 0.1314,
27
+ "hfopenllm_v2/GPQA": 0.2836,
28
+ "hfopenllm_v2/MUSR": 0.3577,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3551
30
+ }
31
+ }
32
+ ]
33
+ }
data/developers/Arthur-LAGACHERIE.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Arthur-LAGACHERIE",
3
+ "models": [
4
+ {
5
+ "id": "Arthur-LAGACHERIE/Precis-1B-Instruct",
6
+ "name": "Precis-1B-Instruct",
7
+ "developer": "Arthur-LAGACHERIE",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3671,
11
+ "hfopenllm_v2/BBH": 0.3224,
12
+ "hfopenllm_v2/MATH Level 5": 0.0038,
13
+ "hfopenllm_v2/GPQA": 0.2659,
14
+ "hfopenllm_v2/MUSR": 0.3436,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1426
16
+ }
17
+ }
18
+ ]
19
+ }
data/developers/Artples.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "Artples",
3
+ "models": [
4
+ {
5
+ "id": "Artples/L-MChat-7b",
6
+ "name": "L-MChat-7b",
7
+ "developer": "Artples",
8
+ "evaluator_relationship": null,
9
+ "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.5297,
11
+ "hfopenllm_v2/BBH": 0.46,
12
+ "hfopenllm_v2/MATH Level 5": 0.0921,
13
+ "hfopenllm_v2/GPQA": 0.3054,
14
+ "hfopenllm_v2/MUSR": 0.4029,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3299
16
+ }
17
+ },
18
+ {
19
+ "id": "Artples/L-MChat-Small",
20
+ "name": "L-MChat-Small",
21
+ "developer": "Artples",
22
+ "evaluator_relationship": null,
23
+ "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.3287,
25
+ "hfopenllm_v2/BBH": 0.4823,
26
+ "hfopenllm_v2/MATH Level 5": 0.0378,
27
+ "hfopenllm_v2/GPQA": 0.2676,
28
+ "hfopenllm_v2/MUSR": 0.3696,
29
+ "hfopenllm_v2/MMLU-PRO": 0.2464
30
+ }
31
+ }
32
+ ]
33
+ }