Task , Accuracy , Centered hellaswag_zeroshot , 0.445031 , 0.260041 jeopardy , 0.116202 , 0.116202 bigbench_qa_wikidata , 0.524039 , 0.524039 arc_easy , 0.651936 , 0.535915 arc_challenge , 0.353242 , 0.137656 copa , 0.640000 , 0.280000 commonsense_qa , 0.325962 , 0.157453 piqa , 0.686072 , 0.372144 openbook_qa , 0.332000 , 0.109333 lambada_openai , 0.368523 , 0.368523 hellaswag , 0.445927 , 0.261236 winograd , 0.663004 , 0.326007 winogrande , 0.546961 , 0.093923 bigbench_dyck_languages , 0.126000 , 0.126000 agi_eval_lsat_ar , 0.269565 , 0.086956 bigbench_cs_algorithms , 0.342424 , 0.342424 bigbench_operators , 0.161905 , 0.161905 bigbench_repeat_copy_logic , 0.000000 , 0.000000 squad , 0.237938 , 0.237938 coqa , 0.200676 , 0.200676 boolq , 0.490214 , -0.341542 bigbench_language_identification , 0.257400 , 0.183058 CORE , , 0.206359