Update README.md
Browse files
README.md
CHANGED
|
@@ -150,9 +150,10 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 150 |
|
| 151 |
### Accuracy
|
| 152 |
|
| 153 |
-
#### Open LLM Leaderboard evaluation scores
|
| 154 |
<table>
|
| 155 |
<tr>
|
|
|
|
|
|
|
| 156 |
<td><strong>Benchmark</strong>
|
| 157 |
</td>
|
| 158 |
<td><strong>Meta-Llama-3.1-8B-Instruct </strong>
|
|
@@ -163,7 +164,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 163 |
</td>
|
| 164 |
</tr>
|
| 165 |
<tr>
|
| 166 |
-
<td><strong>
|
|
|
|
|
|
|
| 167 |
</td>
|
| 168 |
<td>25.8 (25.1 / 26.5)
|
| 169 |
</td>
|
|
@@ -173,10 +176,8 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 173 |
</td>
|
| 174 |
</tr>
|
| 175 |
<tr>
|
| 176 |
-
<td><strong>OpenLLM v1</strong>
|
| 177 |
</td>
|
| 178 |
-
</tr>
|
| 179 |
-
<tr>
|
| 180 |
<td>MMLU (5-shot)
|
| 181 |
</td>
|
| 182 |
<td>68.3
|
|
@@ -257,10 +258,8 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 257 |
</td>
|
| 258 |
</tr>
|
| 259 |
<tr>
|
| 260 |
-
<td><strong>OpenLLM v2</strong>
|
| 261 |
</td>
|
| 262 |
-
</tr>
|
| 263 |
-
<tr>
|
| 264 |
<td>MMLU-Pro (5-shot)
|
| 265 |
</td>
|
| 266 |
<td>30.8
|
|
@@ -291,7 +290,7 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 291 |
</td>
|
| 292 |
</tr>
|
| 293 |
<tr>
|
| 294 |
-
<td>Math
|
| 295 |
</td>
|
| 296 |
<td>15.7
|
| 297 |
</td>
|
|
@@ -331,10 +330,8 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 331 |
</td>
|
| 332 |
</tr>
|
| 333 |
<tr>
|
| 334 |
-
<td><strong>Coding</strong>
|
| 335 |
</td>
|
| 336 |
-
</tr>
|
| 337 |
-
<tr>
|
| 338 |
<td>HumanEval pass@1
|
| 339 |
</td>
|
| 340 |
<td>67.3
|
|
@@ -354,8 +351,81 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
|
|
| 354 |
<td>98.8%
|
| 355 |
</td>
|
| 356 |
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
</table>
|
| 358 |
|
|
|
|
| 359 |
### Reproduction
|
| 360 |
|
| 361 |
The results were obtained using the following commands:
|
|
@@ -447,6 +517,90 @@ lm_eval \
|
|
| 447 |
--batch_size auto
|
| 448 |
```
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
#### HumanEval and HumanEval+
|
| 451 |
##### Generation
|
| 452 |
```
|
|
|
|
| 150 |
|
| 151 |
### Accuracy
|
| 152 |
|
|
|
|
| 153 |
<table>
|
| 154 |
<tr>
|
| 155 |
+
<td><strong>Category</strong>
|
| 156 |
+
</td>
|
| 157 |
<td><strong>Benchmark</strong>
|
| 158 |
</td>
|
| 159 |
<td><strong>Meta-Llama-3.1-8B-Instruct </strong>
|
|
|
|
| 164 |
</td>
|
| 165 |
</tr>
|
| 166 |
<tr>
|
| 167 |
+
<td rowspan="1" ><strong>LLM as a judge</strong>
|
| 168 |
+
</td>
|
| 169 |
+
<td>Arena Hard
|
| 170 |
</td>
|
| 171 |
<td>25.8 (25.1 / 26.5)
|
| 172 |
</td>
|
|
|
|
| 176 |
</td>
|
| 177 |
</tr>
|
| 178 |
<tr>
|
| 179 |
+
<td rowspan="8" ><strong>OpenLLM v1</strong>
|
| 180 |
</td>
|
|
|
|
|
|
|
| 181 |
<td>MMLU (5-shot)
|
| 182 |
</td>
|
| 183 |
<td>68.3
|
|
|
|
| 258 |
</td>
|
| 259 |
</tr>
|
| 260 |
<tr>
|
| 261 |
+
<td rowspan="7" ><strong>OpenLLM v2</strong>
|
| 262 |
</td>
|
|
|
|
|
|
|
| 263 |
<td>MMLU-Pro (5-shot)
|
| 264 |
</td>
|
| 265 |
<td>30.8
|
|
|
|
| 290 |
</td>
|
| 291 |
</tr>
|
| 292 |
<tr>
|
| 293 |
+
<td>Math-lvl-5 (4-shot)
|
| 294 |
</td>
|
| 295 |
<td>15.7
|
| 296 |
</td>
|
|
|
|
| 330 |
</td>
|
| 331 |
</tr>
|
| 332 |
<tr>
|
| 333 |
+
<td rowspan="2" ><strong>Coding</strong>
|
| 334 |
</td>
|
|
|
|
|
|
|
| 335 |
<td>HumanEval pass@1
|
| 336 |
</td>
|
| 337 |
<td>67.3
|
|
|
|
| 351 |
<td>98.8%
|
| 352 |
</td>
|
| 353 |
</tr>
|
| 354 |
+
<tr>
|
| 355 |
+
<td rowspan="7"><strong>Multilingual</strong>
|
| 356 |
+
</td>
|
| 357 |
+
<td>Portuguese MMLU (5-shot)
|
| 358 |
+
</td>
|
| 359 |
+
<td>59.96
|
| 360 |
+
</td>
|
| 361 |
+
<td>59.36
|
| 362 |
+
</td>
|
| 363 |
+
<td>99.0%
|
| 364 |
+
</td>
|
| 365 |
+
</tr>
|
| 366 |
+
<tr>
|
| 367 |
+
<td>Spanish MMLU (5-shot)
|
| 368 |
+
</td>
|
| 369 |
+
<td>60.25
|
| 370 |
+
</td>
|
| 371 |
+
<td>59.77
|
| 372 |
+
</td>
|
| 373 |
+
<td>99.2%
|
| 374 |
+
</td>
|
| 375 |
+
</tr>
|
| 376 |
+
<tr>
|
| 377 |
+
<td>Italian MMLU (5-shot)
|
| 378 |
+
</td>
|
| 379 |
+
<td>59.23
|
| 380 |
+
</td>
|
| 381 |
+
<td>58.61
|
| 382 |
+
</td>
|
| 383 |
+
<td>99.0%
|
| 384 |
+
</td>
|
| 385 |
+
</tr>
|
| 386 |
+
<tr>
|
| 387 |
+
<td>German MMLU (5-shot)
|
| 388 |
+
</td>
|
| 389 |
+
<td>58.63
|
| 390 |
+
</td>
|
| 391 |
+
<td>58.23
|
| 392 |
+
</td>
|
| 393 |
+
<td>99.3%
|
| 394 |
+
</td>
|
| 395 |
+
</tr>
|
| 396 |
+
<tr>
|
| 397 |
+
<td>French MMLU (5-shot)
|
| 398 |
+
</td>
|
| 399 |
+
<td>59.65
|
| 400 |
+
</td>
|
| 401 |
+
<td>58.70
|
| 402 |
+
</td>
|
| 403 |
+
<td>98.4%
|
| 404 |
+
</td>
|
| 405 |
+
</tr>
|
| 406 |
+
<tr>
|
| 407 |
+
<td>Hindi MMLU (5-shot)
|
| 408 |
+
</td>
|
| 409 |
+
<td>50.10
|
| 410 |
+
</td>
|
| 411 |
+
<td>49.33
|
| 412 |
+
</td>
|
| 413 |
+
<td>98.5%
|
| 414 |
+
</td>
|
| 415 |
+
</tr>
|
| 416 |
+
<tr>
|
| 417 |
+
<td>Thai MMLU (5-shot)
|
| 418 |
+
</td>
|
| 419 |
+
<td>49.12
|
| 420 |
+
</td>
|
| 421 |
+
<td>48.09
|
| 422 |
+
</td>
|
| 423 |
+
<td>97.9%
|
| 424 |
+
</td>
|
| 425 |
+
</tr>
|
| 426 |
</table>
|
| 427 |
|
| 428 |
+
|
| 429 |
### Reproduction
|
| 430 |
|
| 431 |
The results were obtained using the following commands:
|
|
|
|
| 517 |
--batch_size auto
|
| 518 |
```
|
| 519 |
|
| 520 |
+
#### MMLU Portuguese
|
| 521 |
+
```
|
| 522 |
+
lm_eval \
|
| 523 |
+
--model vllm \
|
| 524 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 525 |
+
--tasks mmlu_pt_llama_3.1_instruct \
|
| 526 |
+
--fewshot_as_multiturn \
|
| 527 |
+
--apply_chat_template \
|
| 528 |
+
--num_fewshot 5 \
|
| 529 |
+
--batch_size auto
|
| 530 |
+
```
|
| 531 |
+
|
| 532 |
+
#### MMLU Spanish
|
| 533 |
+
```
|
| 534 |
+
lm_eval \
|
| 535 |
+
--model vllm \
|
| 536 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 537 |
+
--tasks mmlu_es_llama_3.1_instruct \
|
| 538 |
+
--fewshot_as_multiturn \
|
| 539 |
+
--apply_chat_template \
|
| 540 |
+
--num_fewshot 5 \
|
| 541 |
+
--batch_size auto
|
| 542 |
+
```
|
| 543 |
+
|
| 544 |
+
#### MMLU Italian
|
| 545 |
+
```
|
| 546 |
+
lm_eval \
|
| 547 |
+
--model vllm \
|
| 548 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 549 |
+
--tasks mmlu_it_llama_3.1_instruct \
|
| 550 |
+
--fewshot_as_multiturn \
|
| 551 |
+
--apply_chat_template \
|
| 552 |
+
--num_fewshot 5 \
|
| 553 |
+
--batch_size auto
|
| 554 |
+
```
|
| 555 |
+
|
| 556 |
+
#### MMLU German
|
| 557 |
+
```
|
| 558 |
+
lm_eval \
|
| 559 |
+
--model vllm \
|
| 560 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 561 |
+
--tasks mmlu_de_llama_3.1_instruct \
|
| 562 |
+
--fewshot_as_multiturn \
|
| 563 |
+
--apply_chat_template \
|
| 564 |
+
--num_fewshot 5 \
|
| 565 |
+
--batch_size auto
|
| 566 |
+
```
|
| 567 |
+
|
| 568 |
+
#### MMLU French
|
| 569 |
+
```
|
| 570 |
+
lm_eval \
|
| 571 |
+
--model vllm \
|
| 572 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 573 |
+
--tasks mmlu_fr_llama_3.1_instruct \
|
| 574 |
+
--fewshot_as_multiturn \
|
| 575 |
+
--apply_chat_template \
|
| 576 |
+
--num_fewshot 5 \
|
| 577 |
+
--batch_size auto
|
| 578 |
+
```
|
| 579 |
+
|
| 580 |
+
#### MMLU Hindi
|
| 581 |
+
```
|
| 582 |
+
lm_eval \
|
| 583 |
+
--model vllm \
|
| 584 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 585 |
+
--tasks mmlu_hi_llama_3.1_instruct \
|
| 586 |
+
--fewshot_as_multiturn \
|
| 587 |
+
--apply_chat_template \
|
| 588 |
+
--num_fewshot 5 \
|
| 589 |
+
--batch_size auto
|
| 590 |
+
```
|
| 591 |
+
|
| 592 |
+
#### MMLU Thai
|
| 593 |
+
```
|
| 594 |
+
lm_eval \
|
| 595 |
+
--model vllm \
|
| 596 |
+
--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
|
| 597 |
+
--tasks mmlu_th_llama_3.1_instruct \
|
| 598 |
+
--fewshot_as_multiturn \
|
| 599 |
+
--apply_chat_template \
|
| 600 |
+
--num_fewshot 5 \
|
| 601 |
+
--batch_size auto
|
| 602 |
+
```
|
| 603 |
+
|
| 604 |
#### HumanEval and HumanEval+
|
| 605 |
##### Generation
|
| 606 |
```
|