| Task , Accuracy , Centered | |
| hellaswag_zeroshot , 0.444633 , 0.259510 | |
| jeopardy , 0.101559 , 0.101559 | |
| bigbench_qa_wikidata , 0.530830 , 0.530830 | |
| arc_easy , 0.634259 , 0.512346 | |
| arc_challenge , 0.339590 , 0.119454 | |
| copa , 0.650000 , 0.300000 | |
| commonsense_qa , 0.281736 , 0.102170 | |
| piqa , 0.687704 , 0.375408 | |
| openbook_qa , 0.330000 , 0.106667 | |
| lambada_openai , 0.366000 , 0.366000 | |
| hellaswag , 0.443637 , 0.258182 | |
| winograd , 0.604396 , 0.208791 | |
| winogrande , 0.528808 , 0.057616 | |
| bigbench_dyck_languages , 0.108000 , 0.108000 | |
| agi_eval_lsat_ar , 0.260870 , 0.076087 | |
| bigbench_cs_algorithms , 0.350758 , 0.350758 | |
| bigbench_operators , 0.185714 , 0.185714 | |
| bigbench_repeat_copy_logic , 0.000000 , 0.000000 | |
| squad , 0.230558 , 0.230558 | |
| coqa , 0.202054 , 0.202054 | |
| boolq , 0.537003 , -0.218413 | |
| bigbench_language_identification , 0.254900 , 0.180308 | |
| CORE , , 0.200618 | |