| dataset,prompt,metric,value | |
| anli_dev_r1,GPT-3 style,accuracy,0.486 | |
| anli_dev_r1,MNLI crowdsource,accuracy,0.427 | |
| anli_dev_r1,can we infer,accuracy,0.474 | |
| anli_dev_r1,guaranteed/possible/impossible,accuracy,0.39 | |
| anli_dev_r1,justified in saying,accuracy,0.46 | |
| anli_dev_r1,median,accuracy,0.46 | |
| anli_dev_r2,GPT-3 style,accuracy,0.441 | |
| anli_dev_r2,MNLI crowdsource,accuracy,0.406 | |
| anli_dev_r2,can we infer,accuracy,0.426 | |
| anli_dev_r2,guaranteed/possible/impossible,accuracy,0.36 | |
| anli_dev_r2,justified in saying,accuracy,0.419 | |
| anli_dev_r2,median,accuracy,0.419 | |
| anli_dev_r3,GPT-3 style,accuracy,0.455 | |
| anli_dev_r3,MNLI crowdsource,accuracy,0.42 | |
| anli_dev_r3,can we infer,accuracy,0.445 | |
| anli_dev_r3,guaranteed/possible/impossible,accuracy,0.32083333333333336 | |
| anli_dev_r3,justified in saying,accuracy,0.4266666666666667 | |
| anli_dev_r3,median,accuracy,0.4266666666666667 | |
| story_cloze_2016,Answer Given options,accuracy,0.9567076429716729 | |
| story_cloze_2016,Choose Story Ending,accuracy,0.9625868519508284 | |
| story_cloze_2016,Generate Ending,accuracy,0.7814003206841261 | |
| story_cloze_2016,Novel Correct Ending,accuracy,0.9577765900587921 | |
| story_cloze_2016,Story Continuation and Options,accuracy,0.951362907536077 | |
| story_cloze_2016,median,accuracy,0.9567076429716729 | |
| super_glue_cb,GPT-3 style,accuracy,0.8214285714285714 | |
| super_glue_cb,MNLI crowdsource,accuracy,0.375 | |
| super_glue_cb,can we infer,accuracy,0.8214285714285714 | |
| super_glue_cb,guaranteed/possible/impossible,accuracy,0.7321428571428571 | |
| super_glue_cb,justified in saying,accuracy,0.7678571428571429 | |
| super_glue_cb,median,accuracy,0.7678571428571429 | |
| super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.62 | |
| super_glue_copa,best_option,accuracy,0.87 | |
| super_glue_copa,cause_effect,accuracy,0.88 | |
| super_glue_copa,i_am_hesitating,accuracy,0.91 | |
| super_glue_copa,plausible_alternatives,accuracy,0.88 | |
| super_glue_copa,median,accuracy,0.88 | |
| super_glue_rte,GPT-3 style,accuracy,0.8303249097472925 | |
| super_glue_rte,MNLI crowdsource,accuracy,0.855595667870036 | |
| super_glue_rte,does it follow that,accuracy,0.7833935018050542 | |
| super_glue_rte,guaranteed true,accuracy,0.8122743682310469 | |
| super_glue_rte,should assume,accuracy,0.8194945848375451 | |
| super_glue_rte,median,accuracy,0.8194945848375451 | |
| winogrande_winogrande_xl,Replace,accuracy,0.584846093133386 | |
| winogrande_winogrande_xl,True or False,accuracy,0.5217048145224941 | |
| winogrande_winogrande_xl,does underscore refer to,accuracy,0.5840568271507498 | |
| winogrande_winogrande_xl,stand for,accuracy,0.5114443567482242 | |
| winogrande_winogrande_xl,underscore refer to,accuracy,0.5927387529597474 | |
| winogrande_winogrande_xl,median,accuracy,0.5840568271507498 | |
| xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55 | |
| xcopa_id,best_option,accuracy,0.78 | |
| xcopa_id,cause_effect,accuracy,0.86 | |
| xcopa_id,i_am_hesitating,accuracy,0.79 | |
| xcopa_id,plausible_alternatives,accuracy,0.84 | |
| xcopa_id,median,accuracy,0.79 | |
| xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.57 | |
| xcopa_sw,best_option,accuracy,0.6 | |
| xcopa_sw,cause_effect,accuracy,0.6 | |
| xcopa_sw,i_am_hesitating,accuracy,0.64 | |
| xcopa_sw,plausible_alternatives,accuracy,0.62 | |
| xcopa_sw,median,accuracy,0.6 | |
| xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.58 | |
| xcopa_ta,best_option,accuracy,0.67 | |
| xcopa_ta,cause_effect,accuracy,0.67 | |
| xcopa_ta,i_am_hesitating,accuracy,0.68 | |
| xcopa_ta,plausible_alternatives,accuracy,0.69 | |
| xcopa_ta,median,accuracy,0.67 | |
| xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55 | |
| xcopa_vi,best_option,accuracy,0.83 | |
| xcopa_vi,cause_effect,accuracy,0.87 | |
| xcopa_vi,i_am_hesitating,accuracy,0.84 | |
| xcopa_vi,plausible_alternatives,accuracy,0.86 | |
| xcopa_vi,median,accuracy,0.84 | |
| xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.55 | |
| xcopa_zh,best_option,accuracy,0.83 | |
| xcopa_zh,cause_effect,accuracy,0.9 | |
| xcopa_zh,i_am_hesitating,accuracy,0.9 | |
| xcopa_zh,plausible_alternatives,accuracy,0.86 | |
| xcopa_zh,median,accuracy,0.86 | |
| xnli_ar,GPT-3 style,accuracy,0.5357429718875502 | |
| xnli_ar,MNLI crowdsource,accuracy,0.41004016064257026 | |
| xnli_ar,can we infer,accuracy,0.5606425702811245 | |
| xnli_ar,guaranteed/possible/impossible,accuracy,0.6068273092369478 | |
| xnli_ar,justified in saying,accuracy,0.5437751004016064 | |
| xnli_ar,median,accuracy,0.5437751004016064 | |
| xnli_en,GPT-3 style,accuracy,0.6168674698795181 | |
| xnli_en,MNLI crowdsource,accuracy,0.45502008032128516 | |
| xnli_en,can we infer,accuracy,0.6092369477911647 | |
| xnli_en,guaranteed/possible/impossible,accuracy,0.6746987951807228 | |
| xnli_en,justified in saying,accuracy,0.5895582329317269 | |
| xnli_en,median,accuracy,0.6092369477911647 | |
| xnli_es,GPT-3 style,accuracy,0.585140562248996 | |
| xnli_es,MNLI crowdsource,accuracy,0.4357429718875502 | |
| xnli_es,can we infer,accuracy,0.5883534136546185 | |
| xnli_es,guaranteed/possible/impossible,accuracy,0.6124497991967871 | |
| xnli_es,justified in saying,accuracy,0.5734939759036145 | |
| xnli_es,median,accuracy,0.585140562248996 | |
| xnli_fr,GPT-3 style,accuracy,0.5771084337349398 | |
| xnli_fr,MNLI crowdsource,accuracy,0.43012048192771085 | |
| xnli_fr,can we infer,accuracy,0.5807228915662651 | |
| xnli_fr,guaranteed/possible/impossible,accuracy,0.6136546184738956 | |
| xnli_fr,justified in saying,accuracy,0.5694779116465863 | |
| xnli_fr,median,accuracy,0.5771084337349398 | |
| xnli_hi,GPT-3 style,accuracy,0.5248995983935743 | |
| xnli_hi,MNLI crowdsource,accuracy,0.3795180722891566 | |
| xnli_hi,can we infer,accuracy,0.5506024096385542 | |
| xnli_hi,guaranteed/possible/impossible,accuracy,0.5682730923694779 | |
| xnli_hi,justified in saying,accuracy,0.5353413654618474 | |
| xnli_hi,median,accuracy,0.5353413654618474 | |
| xnli_sw,GPT-3 style,accuracy,0.4795180722891566 | |
| xnli_sw,MNLI crowdsource,accuracy,0.39196787148594375 | |
| xnli_sw,can we infer,accuracy,0.5208835341365462 | |
| xnli_sw,guaranteed/possible/impossible,accuracy,0.5036144578313253 | |
| xnli_sw,justified in saying,accuracy,0.5184738955823294 | |
| xnli_sw,median,accuracy,0.5036144578313253 | |
| xnli_ur,GPT-3 style,accuracy,0.46586345381526106 | |
| xnli_ur,MNLI crowdsource,accuracy,0.3718875502008032 | |
| xnli_ur,can we infer,accuracy,0.5080321285140562 | |
| xnli_ur,guaranteed/possible/impossible,accuracy,0.4995983935742972 | |
| xnli_ur,justified in saying,accuracy,0.5080321285140562 | |
| xnli_ur,median,accuracy,0.4995983935742972 | |
| xnli_vi,GPT-3 style,accuracy,0.5578313253012048 | |
| xnli_vi,MNLI crowdsource,accuracy,0.42449799196787147 | |
| xnli_vi,can we infer,accuracy,0.5678714859437751 | |
| xnli_vi,guaranteed/possible/impossible,accuracy,0.6100401606425703 | |
| xnli_vi,justified in saying,accuracy,0.5538152610441767 | |
| xnli_vi,median,accuracy,0.5578313253012048 | |
| xnli_zh,GPT-3 style,accuracy,0.5526104417670683 | |
| xnli_zh,MNLI crowdsource,accuracy,0.38473895582329315 | |
| xnli_zh,can we infer,accuracy,0.5690763052208835 | |
| xnli_zh,guaranteed/possible/impossible,accuracy,0.5674698795180723 | |
| xnli_zh,justified in saying,accuracy,0.5622489959839357 | |
| xnli_zh,median,accuracy,0.5622489959839357 | |
| xstory_cloze_ar,Answer Given options,accuracy,0.7968232958305758 | |
| xstory_cloze_ar,Choose Story Ending,accuracy,0.9232296492389146 | |
| xstory_cloze_ar,Generate Ending,accuracy,0.6677696889477167 | |
| xstory_cloze_ar,Novel Correct Ending,accuracy,0.9265387160820648 | |
| xstory_cloze_ar,Story Continuation and Options,accuracy,0.9126406353408338 | |
| xstory_cloze_ar,median,accuracy,0.9126406353408338 | |
| xstory_cloze_es,Answer Given options,accuracy,0.8729318332230311 | |
| xstory_cloze_es,Choose Story Ending,accuracy,0.9417604235605559 | |
| xstory_cloze_es,Generate Ending,accuracy,0.7359364659166115 | |
| xstory_cloze_es,Novel Correct Ending,accuracy,0.9430840502978161 | |
| xstory_cloze_es,Story Continuation and Options,accuracy,0.9318332230311053 | |
| xstory_cloze_es,median,accuracy,0.9318332230311053 | |
| xstory_cloze_eu,Answer Given options,accuracy,0.7054930509596293 | |
| xstory_cloze_eu,Choose Story Ending,accuracy,0.8663136995367307 | |
| xstory_cloze_eu,Generate Ending,accuracy,0.6320317670416943 | |
| xstory_cloze_eu,Novel Correct Ending,accuracy,0.8689609530112509 | |
| xstory_cloze_eu,Story Continuation and Options,accuracy,0.8524156187954997 | |
| xstory_cloze_eu,median,accuracy,0.8524156187954997 | |
| xstory_cloze_hi,Answer Given options,accuracy,0.798808735936466 | |
| xstory_cloze_hi,Choose Story Ending,accuracy,0.8702845797485109 | |
| xstory_cloze_hi,Generate Ending,accuracy,0.6604897418927862 | |
| xstory_cloze_hi,Novel Correct Ending,accuracy,0.8788881535407015 | |
| xstory_cloze_hi,Story Continuation and Options,accuracy,0.870946393117141 | |
| xstory_cloze_hi,median,accuracy,0.8702845797485109 | |
| xstory_cloze_id,Answer Given options,accuracy,0.8557246856386499 | |
| xstory_cloze_id,Choose Story Ending,accuracy,0.9212442091330245 | |
| xstory_cloze_id,Generate Ending,accuracy,0.7041694242223693 | |
| xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945 | |
| xstory_cloze_id,Story Continuation and Options,accuracy,0.9066843150231635 | |
| xstory_cloze_id,median,accuracy,0.9066843150231635 | |
| xstory_cloze_zh,Answer Given options,accuracy,0.900066181336863 | |
| xstory_cloze_zh,Choose Story Ending,accuracy,0.9232296492389146 | |
| xstory_cloze_zh,Generate Ending,accuracy,0.684976836532098 | |
| xstory_cloze_zh,Novel Correct Ending,accuracy,0.9311714096624751 | |
| xstory_cloze_zh,Story Continuation and Options,accuracy,0.9199205823957644 | |
| xstory_cloze_zh,median,accuracy,0.9199205823957644 | |
| xwinograd_en,Replace,accuracy,0.6847311827956989 | |
| xwinograd_en,True or False,accuracy,0.5135483870967742 | |
| xwinograd_en,does underscore refer to,accuracy,0.6787096774193548 | |
| xwinograd_en,stand for,accuracy,0.5053763440860215 | |
| xwinograd_en,underscore refer to,accuracy,0.690752688172043 | |
| xwinograd_en,median,accuracy,0.6787096774193548 | |
| xwinograd_fr,Replace,accuracy,0.6506024096385542 | |
| xwinograd_fr,True or False,accuracy,0.4939759036144578 | |
| xwinograd_fr,does underscore refer to,accuracy,0.6867469879518072 | |
| xwinograd_fr,stand for,accuracy,0.46987951807228917 | |
| xwinograd_fr,underscore refer to,accuracy,0.6626506024096386 | |
| xwinograd_fr,median,accuracy,0.6506024096385542 | |
| xwinograd_pt,Replace,accuracy,0.6349809885931559 | |
| xwinograd_pt,True or False,accuracy,0.4866920152091255 | |
| xwinograd_pt,does underscore refer to,accuracy,0.6387832699619772 | |
| xwinograd_pt,stand for,accuracy,0.49429657794676807 | |
| xwinograd_pt,underscore refer to,accuracy,0.6425855513307985 | |
| xwinograd_pt,median,accuracy,0.6349809885931559 | |
| xwinograd_zh,Replace,accuracy,0.6865079365079365 | |
| xwinograd_zh,True or False,accuracy,0.5277777777777778 | |
| xwinograd_zh,does underscore refer to,accuracy,0.6884920634920635 | |
| xwinograd_zh,stand for,accuracy,0.4861111111111111 | |
| xwinograd_zh,underscore refer to,accuracy,0.6904761904761905 | |
| xwinograd_zh,median,accuracy,0.6865079365079365 | |
| multiple,average,multiple,0.6903830754158429 | |