Spaces:
Running
Running
update with gpt5 results
#13
by
JessicaOjo - opened
data/community_results/New Results - June2025.csv
CHANGED
|
@@ -19,7 +19,7 @@ GPT-4.1 (April),sib,1.0,89.7,85.8,84.8,85.8,87.3,84.3,82.4,84.3,86.3,86.3,81.4,8
|
|
| 19 |
GPT-4.1 (April),afrixnli,1.0,88.5,70.8,72.7,69.0,69.3,32.0,67.0,68.0,73.7,71.5,72.8,74.0,54.8,67.2,69.8,66.6
|
| 20 |
GPT-4.1 (April),belebele,1.0,73.9,72.0,48.0,37.7,76.4,61.7,54.0,58.3,77.6,60.8,90.0,78.0,37.55,52.2,76.1,62.9
|
| 21 |
GPT-4.1 (April),afrimmlu,1.0,71.4,55.2,55.4,56.2,63.8,60.2,50.2,53.0,65.2,57.0,76.8,66.6,37,53.6,69.8,58.6
|
| 22 |
-
GPT-4.1 (April),afrimgsm,1.0,82.4,57.2,57.2,54.0,60.4,49.6,54.4,64.4,60.0,64.4,77.2,51.2,
|
| 23 |
GPT-4.1 (April),flores - en_xx,1.0,73.1,32.5,53.1,42.0,53.2,48.7,41.3,41.6,49.6,50.6,65.5,53.6,44.2,27.5,55.7,47.1
|
| 24 |
LLaMa 4 405B,injongointent,4.0,88.9,84.8,80.5,78.8,63.8,65.3,63.0,62.3,77.0,59.4,91.4,84.2,-,75.3,74.5,73.9
|
| 25 |
LLaMa 4 405B,sib,3.0,84.8,86.8,80.9,82.4,74.5,84.3,79.4,71.6,77.5,77.9,86.8,84.8,-,78.9,81.9,80.6
|
|
@@ -71,3 +71,10 @@ Gemini-2.5 Pro,afrimmlu,1.0,72.2,82.2,68.4,76.4,75.8,79.2,79.8,75.0,81.2,75.2,80
|
|
| 71 |
Gemini-2.5 Pro,injongointent,4.0,87.9,91.7,94.1,92.0,80.9,90.5,85.5,85.3,93.8,76.6,93.8,90.5,84.2,89.2,83.8,88.0
|
| 72 |
Gemini-2.5 Pro,sib,3.0,88.7,88.7,89.2,89.7,87.3,87.3,85.3,86.8,87.7,87.7,90.2,89.2,85.8,88.2,87.7,87.9
|
| 73 |
Gemini-2.5 Pro,belebele,5.0,78.1,85.6,56.1,62.2,80.9,78.9,75.4,74.1,84.4,73.7,87.8,86.3,72.9,68.8,83.0,76.4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
GPT-4.1 (April),afrixnli,1.0,88.5,70.8,72.7,69.0,69.3,32.0,67.0,68.0,73.7,71.5,72.8,74.0,54.8,67.2,69.8,66.6
|
| 20 |
GPT-4.1 (April),belebele,1.0,73.9,72.0,48.0,37.7,76.4,61.7,54.0,58.3,77.6,60.8,90.0,78.0,37.55,52.2,76.1,62.9
|
| 21 |
GPT-4.1 (April),afrimmlu,1.0,71.4,55.2,55.4,56.2,63.8,60.2,50.2,53.0,65.2,57.0,76.8,66.6,37,53.6,69.8,58.6
|
| 22 |
+
GPT-4.1 (April),afrimgsm,1.0,82.4,57.2,57.2,54.0,60.4,49.6,54.4,64.4,60.0,64.4,77.2,51.2,28.8,65.2,58.0,58.7
|
| 23 |
GPT-4.1 (April),flores - en_xx,1.0,73.1,32.5,53.1,42.0,53.2,48.7,41.3,41.6,49.6,50.6,65.5,53.6,44.2,27.5,55.7,47.1
|
| 24 |
LLaMa 4 405B,injongointent,4.0,88.9,84.8,80.5,78.8,63.8,65.3,63.0,62.3,77.0,59.4,91.4,84.2,-,75.3,74.5,73.9
|
| 25 |
LLaMa 4 405B,sib,3.0,84.8,86.8,80.9,82.4,74.5,84.3,79.4,71.6,77.5,77.9,86.8,84.8,-,78.9,81.9,80.6
|
|
|
|
| 71 |
Gemini-2.5 Pro,injongointent,4.0,87.9,91.7,94.1,92.0,80.9,90.5,85.5,85.3,93.8,76.6,93.8,90.5,84.2,89.2,83.8,88.0
|
| 72 |
Gemini-2.5 Pro,sib,3.0,88.7,88.7,89.2,89.7,87.3,87.3,85.3,86.8,87.7,87.7,90.2,89.2,85.8,88.2,87.7,87.9
|
| 73 |
Gemini-2.5 Pro,belebele,5.0,78.1,85.6,56.1,62.2,80.9,78.9,75.4,74.1,84.4,73.7,87.8,86.3,72.9,68.8,83.0,76.4
|
| 74 |
+
GPT-5 (Aug),afrixnli,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
|
| 75 |
+
GPT-5 (Aug),afrimgsm,2.0,92.4,78.4,75.6,72.8,75.6,70.4,66.0,83.2,74.8,73.2,88.8,66.8,51.6,82.4,71.6,73.7
|
| 76 |
+
GPT-5 (Aug),flores - en_xx,3.0,66.8,35.5,49.3,41.6,51.2,47.0,41.6,42.5,47.3,47.3,60.0,50.2,29.8,28.6,54.7,44.8
|
| 77 |
+
GPT-5 (Aug),afrimmlu,1.0,90.6,85.8,82.8,84.6,82.0,82.6,82.0,86.4,83.0,85.2,90.0,88.4,63.4,81.4,88.2,83.3
|
| 78 |
+
GPT-5 (Aug),injongointent,4.0,88.3,90.9,95.8,89.2,79.8,87.3,87.8,83.8,94.7,77.2,93.3,89.4,80.6,87.5,85.0,87.3
|
| 79 |
+
GPT-5 (Aug),sib,3.0,91.2,89.2,90.2,90.2,89.7,88.2,85.8,88.2,88.7,90.2,91.2,90.7,79.9,87.3,89.7,88.5
|
| 80 |
+
GPT-5 (Aug),belebele,5.0,89.2,90.3,82.3,78.3,86.9,81.2,77.6,81.3,85.6,86.4,94.6,87.3,69.1,80.4,85.3,83.3
|
data/leaderboard_json/afrobench_lite.json
CHANGED
|
@@ -22,7 +22,8 @@
|
|
| 22 |
"Claude 3.7 Sonnet": 58.8,
|
| 23 |
"Claude 4.5 Sonnet": 69.9,
|
| 24 |
"Gemini-2.5 Flash": 69.3,
|
| 25 |
-
"Gemini-2.5 Pro": 72.5
|
|
|
|
| 26 |
}
|
| 27 |
},
|
| 28 |
"Intent": {
|
|
@@ -48,7 +49,8 @@
|
|
| 48 |
"Claude 3.7 Sonnet": 72.2,
|
| 49 |
"Claude 4.5 Sonnet": 79.3,
|
| 50 |
"Gemini-2.5 Flash": 87.4,
|
| 51 |
-
"Gemini-2.5 Pro": 88.0
|
|
|
|
| 52 |
}
|
| 53 |
},
|
| 54 |
"MT(en/fr-xx)": {
|
|
@@ -74,7 +76,8 @@
|
|
| 74 |
"Claude 3.7 Sonnet": 42.9,
|
| 75 |
"Claude 4.5 Sonnet": 45.2,
|
| 76 |
"Gemini-2.5 Flash": 45.3,
|
| 77 |
-
"Gemini-2.5 Pro": 46.3
|
|
|
|
| 78 |
}
|
| 79 |
},
|
| 80 |
"MMLU": {
|
|
@@ -100,7 +103,8 @@
|
|
| 100 |
"Claude 3.7 Sonnet": 65.3,
|
| 101 |
"Claude 4.5 Sonnet": 74.0,
|
| 102 |
"Gemini-2.5 Flash": 67.3,
|
| 103 |
-
"Gemini-2.5 Pro": 77.4
|
|
|
|
| 104 |
}
|
| 105 |
},
|
| 106 |
"Math": {
|
|
@@ -126,7 +130,8 @@
|
|
| 126 |
"Claude 3.7 Sonnet": 33.9,
|
| 127 |
"Claude 4.5 Sonnet": 69.7,
|
| 128 |
"Gemini-2.5 Flash": 69.3,
|
| 129 |
-
"Gemini-2.5 Pro": 73.2
|
|
|
|
| 130 |
}
|
| 131 |
},
|
| 132 |
"Topic": {
|
|
@@ -152,7 +157,8 @@
|
|
| 152 |
"Claude 3.7 Sonnet": 84.5,
|
| 153 |
"Claude 4.5 Sonnet": 83.3,
|
| 154 |
"Gemini-2.5 Flash": 86.8,
|
| 155 |
-
"Gemini-2.5 Pro": 87.9
|
|
|
|
| 156 |
}
|
| 157 |
},
|
| 158 |
"RC": {
|
|
@@ -178,7 +184,8 @@
|
|
| 178 |
"Claude 3.7 Sonnet": 64.2,
|
| 179 |
"Claude 4.5 Sonnet": 72.8,
|
| 180 |
"Gemini-2.5 Flash": 41.6,
|
| 181 |
-
"Gemini-2.5 Pro": 76.4
|
|
|
|
| 182 |
}
|
| 183 |
}
|
| 184 |
}
|
|
|
|
| 22 |
"Claude 3.7 Sonnet": 58.8,
|
| 23 |
"Claude 4.5 Sonnet": 69.9,
|
| 24 |
"Gemini-2.5 Flash": 69.3,
|
| 25 |
+
"Gemini-2.5 Pro": 72.5,
|
| 26 |
+
"GPT-5 (Aug)": 83.3
|
| 27 |
}
|
| 28 |
},
|
| 29 |
"Intent": {
|
|
|
|
| 49 |
"Claude 3.7 Sonnet": 72.2,
|
| 50 |
"Claude 4.5 Sonnet": 79.3,
|
| 51 |
"Gemini-2.5 Flash": 87.4,
|
| 52 |
+
"Gemini-2.5 Pro": 88.0,
|
| 53 |
+
"GPT-5 (Aug)": 87.3
|
| 54 |
}
|
| 55 |
},
|
| 56 |
"MT(en/fr-xx)": {
|
|
|
|
| 76 |
"Claude 3.7 Sonnet": 42.9,
|
| 77 |
"Claude 4.5 Sonnet": 45.2,
|
| 78 |
"Gemini-2.5 Flash": 45.3,
|
| 79 |
+
"Gemini-2.5 Pro": 46.3,
|
| 80 |
+
"GPT-5 (Aug)": 44.8
|
| 81 |
}
|
| 82 |
},
|
| 83 |
"MMLU": {
|
|
|
|
| 103 |
"Claude 3.7 Sonnet": 65.3,
|
| 104 |
"Claude 4.5 Sonnet": 74.0,
|
| 105 |
"Gemini-2.5 Flash": 67.3,
|
| 106 |
+
"Gemini-2.5 Pro": 77.4,
|
| 107 |
+
"GPT-5 (Aug)": 83.3
|
| 108 |
}
|
| 109 |
},
|
| 110 |
"Math": {
|
|
|
|
| 130 |
"Claude 3.7 Sonnet": 33.9,
|
| 131 |
"Claude 4.5 Sonnet": 69.7,
|
| 132 |
"Gemini-2.5 Flash": 69.3,
|
| 133 |
+
"Gemini-2.5 Pro": 73.2,
|
| 134 |
+
"GPT-5 (Aug)": 73.7
|
| 135 |
}
|
| 136 |
},
|
| 137 |
"Topic": {
|
|
|
|
| 157 |
"Claude 3.7 Sonnet": 84.5,
|
| 158 |
"Claude 4.5 Sonnet": 83.3,
|
| 159 |
"Gemini-2.5 Flash": 86.8,
|
| 160 |
+
"Gemini-2.5 Pro": 87.9,
|
| 161 |
+
"GPT-5 (Aug)": 88.5
|
| 162 |
}
|
| 163 |
},
|
| 164 |
"RC": {
|
|
|
|
| 184 |
"Claude 3.7 Sonnet": 64.2,
|
| 185 |
"Claude 4.5 Sonnet": 72.8,
|
| 186 |
"Gemini-2.5 Flash": 41.6,
|
| 187 |
+
"Gemini-2.5 Pro": 76.4,
|
| 188 |
+
"GPT-5 (Aug)": 83.3
|
| 189 |
}
|
| 190 |
}
|
| 191 |
}
|
data/leaderboard_json/lite_language_scores.json
CHANGED
|
@@ -251,6 +251,7 @@
|
|
| 251 |
"sot": 46.8,
|
| 252 |
"swa": 59.7,
|
| 253 |
"xho": 52.7,
|
|
|
|
| 254 |
"yor": 47.1,
|
| 255 |
"zul": 51.1
|
| 256 |
},
|
|
@@ -266,6 +267,7 @@
|
|
| 266 |
"sot": 21.0,
|
| 267 |
"swa": 26.0,
|
| 268 |
"xho": 21.4,
|
|
|
|
| 269 |
"yor": 21.9,
|
| 270 |
"zul": 20.6
|
| 271 |
},
|
|
@@ -348,5 +350,21 @@
|
|
| 348 |
"wol": 66.9,
|
| 349 |
"yor": 71.8,
|
| 350 |
"zul": 76.7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
}
|
|
|
|
| 251 |
"sot": 46.8,
|
| 252 |
"swa": 59.7,
|
| 253 |
"xho": 52.7,
|
| 254 |
+
"wol": null,
|
| 255 |
"yor": 47.1,
|
| 256 |
"zul": 51.1
|
| 257 |
},
|
|
|
|
| 267 |
"sot": 21.0,
|
| 268 |
"swa": 26.0,
|
| 269 |
"xho": 21.4,
|
| 270 |
+
"wol": null,
|
| 271 |
"yor": 21.9,
|
| 272 |
"zul": 20.6
|
| 273 |
},
|
|
|
|
| 350 |
"wol": 66.9,
|
| 351 |
"yor": 71.8,
|
| 352 |
"zul": 76.7
|
| 353 |
+
},
|
| 354 |
+
"GPT-5 (Aug)": {
|
| 355 |
+
"amh": 80.1,
|
| 356 |
+
"hau": 79.8,
|
| 357 |
+
"ibo": 76.4,
|
| 358 |
+
"kin": 78.9,
|
| 359 |
+
"lin": 76.8,
|
| 360 |
+
"lug": 74.1,
|
| 361 |
+
"orm": 78.1,
|
| 362 |
+
"sna": 80.0,
|
| 363 |
+
"sot": 78.0,
|
| 364 |
+
"swa": 87.5,
|
| 365 |
+
"xho": 80.0,
|
| 366 |
+
"wol": 63.4,
|
| 367 |
+
"yor": 75.4,
|
| 368 |
+
"zul": 80.0
|
| 369 |
}
|
| 370 |
}
|