update with gpt5 results

#13
by JessicaOjo - opened
data/community_results/New Results - June2025.csv CHANGED
@@ -19,7 +19,7 @@ GPT-4.1 (April),sib,1.0,89.7,85.8,84.8,85.8,87.3,84.3,82.4,84.3,86.3,86.3,81.4,8
19
  GPT-4.1 (April),afrixnli,1.0,88.5,70.8,72.7,69.0,69.3,32.0,67.0,68.0,73.7,71.5,72.8,74.0,54.8,67.2,69.8,66.6
20
  GPT-4.1 (April),belebele,1.0,73.9,72.0,48.0,37.7,76.4,61.7,54.0,58.3,77.6,60.8,90.0,78.0,37.55,52.2,76.1,62.9
21
  GPT-4.1 (April),afrimmlu,1.0,71.4,55.2,55.4,56.2,63.8,60.2,50.2,53.0,65.2,57.0,76.8,66.6,37,53.6,69.8,58.6
22
- GPT-4.1 (April),afrimgsm,1.0,82.4,57.2,57.2,54.0,60.4,49.6,54.4,64.4,60.0,64.4,77.2,51.2,48.7,65.2,58.0,58.7
23
  GPT-4.1 (April),flores - en_xx,1.0,73.1,32.5,53.1,42.0,53.2,48.7,41.3,41.6,49.6,50.6,65.5,53.6,44.2,27.5,55.7,47.1
24
  LLaMa 4 405B,injongointent,4.0,88.9,84.8,80.5,78.8,63.8,65.3,63.0,62.3,77.0,59.4,91.4,84.2,-,75.3,74.5,73.9
25
  LLaMa 4 405B,sib,3.0,84.8,86.8,80.9,82.4,74.5,84.3,79.4,71.6,77.5,77.9,86.8,84.8,-,78.9,81.9,80.6
@@ -71,3 +71,10 @@ Gemini-2.5 Pro,afrimmlu,1.0,72.2,82.2,68.4,76.4,75.8,79.2,79.8,75.0,81.2,75.2,80
71
  Gemini-2.5 Pro,injongointent,4.0,87.9,91.7,94.1,92.0,80.9,90.5,85.5,85.3,93.8,76.6,93.8,90.5,84.2,89.2,83.8,88.0
72
  Gemini-2.5 Pro,sib,3.0,88.7,88.7,89.2,89.7,87.3,87.3,85.3,86.8,87.7,87.7,90.2,89.2,85.8,88.2,87.7,87.9
73
  Gemini-2.5 Pro,belebele,5.0,78.1,85.6,56.1,62.2,80.9,78.9,75.4,74.1,84.4,73.7,87.8,86.3,72.9,68.8,83.0,76.4
 
 
 
 
 
 
 
 
19
  GPT-4.1 (April),afrixnli,1.0,88.5,70.8,72.7,69.0,69.3,32.0,67.0,68.0,73.7,71.5,72.8,74.0,54.8,67.2,69.8,66.6
20
  GPT-4.1 (April),belebele,1.0,73.9,72.0,48.0,37.7,76.4,61.7,54.0,58.3,77.6,60.8,90.0,78.0,37.55,52.2,76.1,62.9
21
  GPT-4.1 (April),afrimmlu,1.0,71.4,55.2,55.4,56.2,63.8,60.2,50.2,53.0,65.2,57.0,76.8,66.6,37,53.6,69.8,58.6
22
+ GPT-4.1 (April),afrimgsm,1.0,82.4,57.2,57.2,54.0,60.4,49.6,54.4,64.4,60.0,64.4,77.2,51.2,28.8,65.2,58.0,58.7
23
  GPT-4.1 (April),flores - en_xx,1.0,73.1,32.5,53.1,42.0,53.2,48.7,41.3,41.6,49.6,50.6,65.5,53.6,44.2,27.5,55.7,47.1
24
  LLaMa 4 405B,injongointent,4.0,88.9,84.8,80.5,78.8,63.8,65.3,63.0,62.3,77.0,59.4,91.4,84.2,-,75.3,74.5,73.9
25
  LLaMa 4 405B,sib,3.0,84.8,86.8,80.9,82.4,74.5,84.3,79.4,71.6,77.5,77.9,86.8,84.8,-,78.9,81.9,80.6
 
71
  Gemini-2.5 Pro,injongointent,4.0,87.9,91.7,94.1,92.0,80.9,90.5,85.5,85.3,93.8,76.6,93.8,90.5,84.2,89.2,83.8,88.0
72
  Gemini-2.5 Pro,sib,3.0,88.7,88.7,89.2,89.7,87.3,87.3,85.3,86.8,87.7,87.7,90.2,89.2,85.8,88.2,87.7,87.9
73
  Gemini-2.5 Pro,belebele,5.0,78.1,85.6,56.1,62.2,80.9,78.9,75.4,74.1,84.4,73.7,87.8,86.3,72.9,68.8,83.0,76.4
74
+ GPT-5 (Aug),afrixnli,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
75
+ GPT-5 (Aug),afrimgsm,2.0,92.4,78.4,75.6,72.8,75.6,70.4,66.0,83.2,74.8,73.2,88.8,66.8,51.6,82.4,71.6,73.7
76
+ GPT-5 (Aug),flores - en_xx,3.0,66.8,35.5,49.3,41.6,51.2,47.0,41.6,42.5,47.3,47.3,60.0,50.2,29.8,28.6,54.7,44.8
77
+ GPT-5 (Aug),afrimmlu,1.0,90.6,85.8,82.8,84.6,82.0,82.6,82.0,86.4,83.0,85.2,90.0,88.4,63.4,81.4,88.2,83.3
78
+ GPT-5 (Aug),injongointent,4.0,88.3,90.9,95.8,89.2,79.8,87.3,87.8,83.8,94.7,77.2,93.3,89.4,80.6,87.5,85.0,87.3
79
+ GPT-5 (Aug),sib,3.0,91.2,89.2,90.2,90.2,89.7,88.2,85.8,88.2,88.7,90.2,91.2,90.7,79.9,87.3,89.7,88.5
80
+ GPT-5 (Aug),belebele,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
data/leaderboard_json/afrobench_lite.json CHANGED
@@ -22,7 +22,8 @@
22
  "Claude 3.7 Sonnet": 58.8,
23
  "Claude 4.5 Sonnet": 69.9,
24
  "Gemini-2.5 Flash": 69.3,
25
- "Gemini-2.5 Pro": 72.5
 
26
  }
27
  },
28
  "Intent": {
@@ -48,7 +49,8 @@
48
  "Claude 3.7 Sonnet": 72.2,
49
  "Claude 4.5 Sonnet": 79.3,
50
  "Gemini-2.5 Flash": 87.4,
51
- "Gemini-2.5 Pro": 88.0
 
52
  }
53
  },
54
  "MT(en/fr-xx)": {
@@ -74,7 +76,8 @@
74
  "Claude 3.7 Sonnet": 42.9,
75
  "Claude 4.5 Sonnet": 45.2,
76
  "Gemini-2.5 Flash": 45.3,
77
- "Gemini-2.5 Pro": 46.3
 
78
  }
79
  },
80
  "MMLU": {
@@ -100,7 +103,8 @@
100
  "Claude 3.7 Sonnet": 65.3,
101
  "Claude 4.5 Sonnet": 74.0,
102
  "Gemini-2.5 Flash": 67.3,
103
- "Gemini-2.5 Pro": 77.4
 
104
  }
105
  },
106
  "Math": {
@@ -126,7 +130,8 @@
126
  "Claude 3.7 Sonnet": 33.9,
127
  "Claude 4.5 Sonnet": 69.7,
128
  "Gemini-2.5 Flash": 69.3,
129
- "Gemini-2.5 Pro": 73.2
 
130
  }
131
  },
132
  "Topic": {
@@ -152,7 +157,8 @@
152
  "Claude 3.7 Sonnet": 84.5,
153
  "Claude 4.5 Sonnet": 83.3,
154
  "Gemini-2.5 Flash": 86.8,
155
- "Gemini-2.5 Pro": 87.9
 
156
  }
157
  },
158
  "RC": {
@@ -178,7 +184,8 @@
178
  "Claude 3.7 Sonnet": 64.2,
179
  "Claude 4.5 Sonnet": 72.8,
180
  "Gemini-2.5 Flash": 41.6,
181
- "Gemini-2.5 Pro": 76.4
 
182
  }
183
  }
184
  }
 
22
  "Claude 3.7 Sonnet": 58.8,
23
  "Claude 4.5 Sonnet": 69.9,
24
  "Gemini-2.5 Flash": 69.3,
25
+ "Gemini-2.5 Pro": 72.5,
26
+ "GPT-5 (Aug)": 83.3
27
  }
28
  },
29
  "Intent": {
 
49
  "Claude 3.7 Sonnet": 72.2,
50
  "Claude 4.5 Sonnet": 79.3,
51
  "Gemini-2.5 Flash": 87.4,
52
+ "Gemini-2.5 Pro": 88.0,
53
+ "GPT-5 (Aug)": 87.3
54
  }
55
  },
56
  "MT(en/fr-xx)": {
 
76
  "Claude 3.7 Sonnet": 42.9,
77
  "Claude 4.5 Sonnet": 45.2,
78
  "Gemini-2.5 Flash": 45.3,
79
+ "Gemini-2.5 Pro": 46.3,
80
+ "GPT-5 (Aug)": 44.8
81
  }
82
  },
83
  "MMLU": {
 
103
  "Claude 3.7 Sonnet": 65.3,
104
  "Claude 4.5 Sonnet": 74.0,
105
  "Gemini-2.5 Flash": 67.3,
106
+ "Gemini-2.5 Pro": 77.4,
107
+ "GPT-5 (Aug)": 83.3
108
  }
109
  },
110
  "Math": {
 
130
  "Claude 3.7 Sonnet": 33.9,
131
  "Claude 4.5 Sonnet": 69.7,
132
  "Gemini-2.5 Flash": 69.3,
133
+ "Gemini-2.5 Pro": 73.2,
134
+ "GPT-5 (Aug)": 73.7
135
  }
136
  },
137
  "Topic": {
 
157
  "Claude 3.7 Sonnet": 84.5,
158
  "Claude 4.5 Sonnet": 83.3,
159
  "Gemini-2.5 Flash": 86.8,
160
+ "Gemini-2.5 Pro": 87.9,
161
+ "GPT-5 (Aug)": 88.5
162
  }
163
  },
164
  "RC": {
 
184
  "Claude 3.7 Sonnet": 64.2,
185
  "Claude 4.5 Sonnet": 72.8,
186
  "Gemini-2.5 Flash": 41.6,
187
+ "Gemini-2.5 Pro": 76.4,
188
+ "GPT-5 (Aug)": 83.3
189
  }
190
  }
191
  }
data/leaderboard_json/lite_language_scores.json CHANGED
@@ -251,6 +251,7 @@
251
  "sot": 46.8,
252
  "swa": 59.7,
253
  "xho": 52.7,
 
254
  "yor": 47.1,
255
  "zul": 51.1
256
  },
@@ -266,6 +267,7 @@
266
  "sot": 21.0,
267
  "swa": 26.0,
268
  "xho": 21.4,
 
269
  "yor": 21.9,
270
  "zul": 20.6
271
  },
@@ -348,5 +350,21 @@
348
  "wol": 66.9,
349
  "yor": 71.8,
350
  "zul": 76.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  }
352
  }
 
251
  "sot": 46.8,
252
  "swa": 59.7,
253
  "xho": 52.7,
254
+ "wol": null,
255
  "yor": 47.1,
256
  "zul": 51.1
257
  },
 
267
  "sot": 21.0,
268
  "swa": 26.0,
269
  "xho": 21.4,
270
+ "wol": null,
271
  "yor": 21.9,
272
  "zul": 20.6
273
  },
 
350
  "wol": 66.9,
351
  "yor": 71.8,
352
  "zul": 76.7
353
+ },
354
+ "GPT-5 (Aug)": {
355
+ "amh": 80.1,
356
+ "hau": 79.8,
357
+ "ibo": 76.4,
358
+ "kin": 78.9,
359
+ "lin": 76.8,
360
+ "lug": 74.1,
361
+ "orm": 78.1,
362
+ "sna": 80.0,
363
+ "sot": 78.0,
364
+ "swa": 87.5,
365
+ "xho": 80.0,
366
+ "wol": 63.4,
367
+ "yor": 75.4,
368
+ "zul": 80.0
369
  }
370
  }