| Task , Accuracy , Centered | |
| hellaswag_zeroshot , 0.451504 , 0.268672 | |
| jeopardy , 0.121398 , 0.121398 | |
| bigbench_qa_wikidata , 0.527828 , 0.527828 | |
| arc_easy , 0.648569 , 0.531425 | |
| arc_challenge , 0.343857 , 0.125142 | |
| copa , 0.680000 , 0.360000 | |
| commonsense_qa , 0.291564 , 0.114455 | |
| piqa , 0.695865 , 0.391730 | |
| openbook_qa , 0.352000 , 0.136000 | |
| lambada_openai , 0.354939 , 0.354939 | |
| hellaswag , 0.447520 , 0.263361 | |
| winograd , 0.630037 , 0.260073 | |
| winogrande , 0.550908 , 0.101815 | |
| bigbench_dyck_languages , 0.108000 , 0.108000 | |
| agi_eval_lsat_ar , 0.308696 , 0.135870 | |
| bigbench_cs_algorithms , 0.371970 , 0.371970 | |
| bigbench_operators , 0.142857 , 0.142857 | |
| bigbench_repeat_copy_logic , 0.000000 , 0.000000 | |
| squad , 0.252791 , 0.252791 | |
| coqa , 0.193160 , 0.193160 | |
| boolq , 0.529969 , -0.236923 | |
| bigbench_language_identification , 0.251200 , 0.176238 | |
| CORE , , 0.213673 | |