| Task , Accuracy , Centered | |
| hellaswag_zeroshot , 0.296554 , 0.062073 | |
| jeopardy , 0.003307 , 0.003307 | |
| bigbench_qa_wikidata , 0.189016 , 0.189016 | |
| arc_easy , 0.435185 , 0.246914 | |
| arc_challenge , 0.246587 , -0.004551 | |
| copa , 0.500000 , 0.000000 | |
| commonsense_qa , 0.266175 , 0.082719 | |
| piqa , 0.597388 , 0.194777 | |
| openbook_qa , 0.298000 , 0.064000 | |
| lambada_openai , 0.207646 , 0.207646 | |
| hellaswag , 0.296156 , 0.061542 | |
| winograd , 0.556777 , 0.113553 | |
| winogrande , 0.520916 , 0.041831 | |
| bigbench_dyck_languages , 0.116000 , 0.116000 | |
| agi_eval_lsat_ar , 0.269565 , 0.086957 | |
| bigbench_cs_algorithms , 0.361364 , 0.361364 | |
| bigbench_operators , 0.104762 , 0.104762 | |
| bigbench_repeat_copy_logic , 0.000000 , 0.000000 | |
| squad , 0.004068 , 0.004068 | |
| coqa , 0.056495 , 0.056495 | |
| boolq , 0.545872 , -0.195075 | |
| bigbench_language_identification , 0.261400 , 0.187459 | |
| CORE , , 0.090221 | |