| Task , Accuracy , Centered | |
| hellaswag_zeroshot , 0.493328 , 0.324437 | |
| jeopardy , 0.143599 , 0.143599 | |
| bigbench_qa_wikidata , 0.519069 , 0.519069 | |
| arc_easy , 0.654040 , 0.538721 | |
| arc_challenge , 0.376280 , 0.168373 | |
| copa , 0.620000 , 0.240000 | |
| commonsense_qa , 0.218673 , 0.023342 | |
| piqa , 0.714363 , 0.428727 | |
| openbook_qa , 0.372000 , 0.162667 | |
| lambada_openai , 0.405977 , 0.405977 | |
| hellaswag , 0.498208 , 0.330943 | |
| winograd , 0.659341 , 0.318681 | |
| winogrande , 0.543804 , 0.087609 | |
| bigbench_dyck_languages , 0.171000 , 0.171000 | |
| agi_eval_lsat_ar , 0.217391 , 0.021739 | |
| bigbench_cs_algorithms , 0.428788 , 0.428788 | |
| bigbench_operators , 0.180952 , 0.180952 | |
| bigbench_repeat_copy_logic , 0.031250 , 0.031250 | |
| squad , 0.326301 , 0.326301 | |
| coqa , 0.217587 , 0.217587 | |
| boolq , 0.521713 , -0.258651 | |
| bigbench_language_identification , 0.260000 , 0.185919 | |
| CORE , , 0.227138 | |
| fwe_bpb , 0.758697 , | |
| sv2_bpb , 0.444557 , | |
| avg_bpb , 0.601627 , | |