bloomz / evaluation_bloomz /evaluation_l1 /merged.csv

Organize val files

baaf0c1 about 3 years ago

10.5 kB

	dataset,prompt,metric,value
	anli_dev_r1,GPT-3 style,accuracy,0.486
	anli_dev_r1,MNLI crowdsource,accuracy,0.427
	anli_dev_r1,can we infer,accuracy,0.474
	anli_dev_r1,guaranteed/possible/impossible,accuracy,0.39
	anli_dev_r1,justified in saying,accuracy,0.46
	anli_dev_r1,median,accuracy,0.46
	anli_dev_r2,GPT-3 style,accuracy,0.441
	anli_dev_r2,MNLI crowdsource,accuracy,0.406
	anli_dev_r2,can we infer,accuracy,0.426
	anli_dev_r2,guaranteed/possible/impossible,accuracy,0.36
	anli_dev_r2,justified in saying,accuracy,0.419
	anli_dev_r2,median,accuracy,0.419
	anli_dev_r3,GPT-3 style,accuracy,0.455
	anli_dev_r3,MNLI crowdsource,accuracy,0.42
	anli_dev_r3,can we infer,accuracy,0.445
	anli_dev_r3,guaranteed/possible/impossible,accuracy,0.32083333333333336
	anli_dev_r3,justified in saying,accuracy,0.4266666666666667
	anli_dev_r3,median,accuracy,0.4266666666666667
	story_cloze_2016,Answer Given options,accuracy,0.9567076429716729
	story_cloze_2016,Choose Story Ending,accuracy,0.9625868519508284
	story_cloze_2016,Generate Ending,accuracy,0.7814003206841261
	story_cloze_2016,Novel Correct Ending,accuracy,0.9577765900587921
	story_cloze_2016,Story Continuation and Options,accuracy,0.951362907536077
	story_cloze_2016,median,accuracy,0.9567076429716729
	super_glue_cb,GPT-3 style,accuracy,0.8214285714285714
	super_glue_cb,MNLI crowdsource,accuracy,0.375
	super_glue_cb,can we infer,accuracy,0.8214285714285714
	super_glue_cb,guaranteed/possible/impossible,accuracy,0.7321428571428571
	super_glue_cb,justified in saying,accuracy,0.7678571428571429
	super_glue_cb,median,accuracy,0.7678571428571429
	super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.62
	super_glue_copa,best_option,accuracy,0.87
	super_glue_copa,cause_effect,accuracy,0.88
	super_glue_copa,i_am_hesitating,accuracy,0.91
	super_glue_copa,plausible_alternatives,accuracy,0.88
	super_glue_copa,median,accuracy,0.88
	super_glue_rte,GPT-3 style,accuracy,0.8303249097472925
	super_glue_rte,MNLI crowdsource,accuracy,0.855595667870036
	super_glue_rte,does it follow that,accuracy,0.7833935018050542
	super_glue_rte,guaranteed true,accuracy,0.8122743682310469
	super_glue_rte,should assume,accuracy,0.8194945848375451
	super_glue_rte,median,accuracy,0.8194945848375451
	winogrande_winogrande_xl,Replace,accuracy,0.584846093133386
	winogrande_winogrande_xl,True or False,accuracy,0.5217048145224941
	winogrande_winogrande_xl,does underscore refer to,accuracy,0.5840568271507498
	winogrande_winogrande_xl,stand for,accuracy,0.5114443567482242
	winogrande_winogrande_xl,underscore refer to,accuracy,0.5927387529597474
	winogrande_winogrande_xl,median,accuracy,0.5840568271507498
	xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55
	xcopa_id,best_option,accuracy,0.78
	xcopa_id,cause_effect,accuracy,0.86
	xcopa_id,i_am_hesitating,accuracy,0.79
	xcopa_id,plausible_alternatives,accuracy,0.84
	xcopa_id,median,accuracy,0.79
	xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.57
	xcopa_sw,best_option,accuracy,0.6
	xcopa_sw,cause_effect,accuracy,0.6
	xcopa_sw,i_am_hesitating,accuracy,0.64
	xcopa_sw,plausible_alternatives,accuracy,0.62
	xcopa_sw,median,accuracy,0.6
	xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.58
	xcopa_ta,best_option,accuracy,0.67
	xcopa_ta,cause_effect,accuracy,0.67
	xcopa_ta,i_am_hesitating,accuracy,0.68
	xcopa_ta,plausible_alternatives,accuracy,0.69
	xcopa_ta,median,accuracy,0.67
	xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
	xcopa_vi,best_option,accuracy,0.83
	xcopa_vi,cause_effect,accuracy,0.87
	xcopa_vi,i_am_hesitating,accuracy,0.84
	xcopa_vi,plausible_alternatives,accuracy,0.86
	xcopa_vi,median,accuracy,0.84
	xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.55
	xcopa_zh,best_option,accuracy,0.83
	xcopa_zh,cause_effect,accuracy,0.9
	xcopa_zh,i_am_hesitating,accuracy,0.9
	xcopa_zh,plausible_alternatives,accuracy,0.86
	xcopa_zh,median,accuracy,0.86
	xnli_ar,GPT-3 style,accuracy,0.5357429718875502
	xnli_ar,MNLI crowdsource,accuracy,0.41004016064257026
	xnli_ar,can we infer,accuracy,0.5606425702811245
	xnli_ar,guaranteed/possible/impossible,accuracy,0.6068273092369478
	xnli_ar,justified in saying,accuracy,0.5437751004016064
	xnli_ar,median,accuracy,0.5437751004016064
	xnli_en,GPT-3 style,accuracy,0.6168674698795181
	xnli_en,MNLI crowdsource,accuracy,0.45502008032128516
	xnli_en,can we infer,accuracy,0.6092369477911647
	xnli_en,guaranteed/possible/impossible,accuracy,0.6746987951807228
	xnli_en,justified in saying,accuracy,0.5895582329317269
	xnli_en,median,accuracy,0.6092369477911647
	xnli_es,GPT-3 style,accuracy,0.585140562248996
	xnli_es,MNLI crowdsource,accuracy,0.4357429718875502
	xnli_es,can we infer,accuracy,0.5883534136546185
	xnli_es,guaranteed/possible/impossible,accuracy,0.6124497991967871
	xnli_es,justified in saying,accuracy,0.5734939759036145
	xnli_es,median,accuracy,0.585140562248996
	xnli_fr,GPT-3 style,accuracy,0.5771084337349398
	xnli_fr,MNLI crowdsource,accuracy,0.43012048192771085
	xnli_fr,can we infer,accuracy,0.5807228915662651
	xnli_fr,guaranteed/possible/impossible,accuracy,0.6136546184738956
	xnli_fr,justified in saying,accuracy,0.5694779116465863
	xnli_fr,median,accuracy,0.5771084337349398
	xnli_hi,GPT-3 style,accuracy,0.5248995983935743
	xnli_hi,MNLI crowdsource,accuracy,0.3795180722891566
	xnli_hi,can we infer,accuracy,0.5506024096385542
	xnli_hi,guaranteed/possible/impossible,accuracy,0.5682730923694779
	xnli_hi,justified in saying,accuracy,0.5353413654618474
	xnli_hi,median,accuracy,0.5353413654618474
	xnli_sw,GPT-3 style,accuracy,0.4795180722891566
	xnli_sw,MNLI crowdsource,accuracy,0.39196787148594375
	xnli_sw,can we infer,accuracy,0.5208835341365462
	xnli_sw,guaranteed/possible/impossible,accuracy,0.5036144578313253
	xnli_sw,justified in saying,accuracy,0.5184738955823294
	xnli_sw,median,accuracy,0.5036144578313253
	xnli_ur,GPT-3 style,accuracy,0.46586345381526106
	xnli_ur,MNLI crowdsource,accuracy,0.3718875502008032
	xnli_ur,can we infer,accuracy,0.5080321285140562
	xnli_ur,guaranteed/possible/impossible,accuracy,0.4995983935742972
	xnli_ur,justified in saying,accuracy,0.5080321285140562
	xnli_ur,median,accuracy,0.4995983935742972
	xnli_vi,GPT-3 style,accuracy,0.5578313253012048
	xnli_vi,MNLI crowdsource,accuracy,0.42449799196787147
	xnli_vi,can we infer,accuracy,0.5678714859437751
	xnli_vi,guaranteed/possible/impossible,accuracy,0.6100401606425703
	xnli_vi,justified in saying,accuracy,0.5538152610441767
	xnli_vi,median,accuracy,0.5578313253012048
	xnli_zh,GPT-3 style,accuracy,0.5526104417670683
	xnli_zh,MNLI crowdsource,accuracy,0.38473895582329315
	xnli_zh,can we infer,accuracy,0.5690763052208835
	xnli_zh,guaranteed/possible/impossible,accuracy,0.5674698795180723
	xnli_zh,justified in saying,accuracy,0.5622489959839357
	xnli_zh,median,accuracy,0.5622489959839357
	xstory_cloze_ar,Answer Given options,accuracy,0.7968232958305758
	xstory_cloze_ar,Choose Story Ending,accuracy,0.9232296492389146
	xstory_cloze_ar,Generate Ending,accuracy,0.6677696889477167
	xstory_cloze_ar,Novel Correct Ending,accuracy,0.9265387160820648
	xstory_cloze_ar,Story Continuation and Options,accuracy,0.9126406353408338
	xstory_cloze_ar,median,accuracy,0.9126406353408338
	xstory_cloze_es,Answer Given options,accuracy,0.8729318332230311
	xstory_cloze_es,Choose Story Ending,accuracy,0.9417604235605559
	xstory_cloze_es,Generate Ending,accuracy,0.7359364659166115
	xstory_cloze_es,Novel Correct Ending,accuracy,0.9430840502978161
	xstory_cloze_es,Story Continuation and Options,accuracy,0.9318332230311053
	xstory_cloze_es,median,accuracy,0.9318332230311053
	xstory_cloze_eu,Answer Given options,accuracy,0.7054930509596293
	xstory_cloze_eu,Choose Story Ending,accuracy,0.8663136995367307
	xstory_cloze_eu,Generate Ending,accuracy,0.6320317670416943
	xstory_cloze_eu,Novel Correct Ending,accuracy,0.8689609530112509
	xstory_cloze_eu,Story Continuation and Options,accuracy,0.8524156187954997
	xstory_cloze_eu,median,accuracy,0.8524156187954997
	xstory_cloze_hi,Answer Given options,accuracy,0.798808735936466
	xstory_cloze_hi,Choose Story Ending,accuracy,0.8702845797485109
	xstory_cloze_hi,Generate Ending,accuracy,0.6604897418927862
	xstory_cloze_hi,Novel Correct Ending,accuracy,0.8788881535407015
	xstory_cloze_hi,Story Continuation and Options,accuracy,0.870946393117141
	xstory_cloze_hi,median,accuracy,0.8702845797485109
	xstory_cloze_id,Answer Given options,accuracy,0.8557246856386499
	xstory_cloze_id,Choose Story Ending,accuracy,0.9212442091330245
	xstory_cloze_id,Generate Ending,accuracy,0.7041694242223693
	xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
	xstory_cloze_id,Story Continuation and Options,accuracy,0.9066843150231635
	xstory_cloze_id,median,accuracy,0.9066843150231635
	xstory_cloze_zh,Answer Given options,accuracy,0.900066181336863
	xstory_cloze_zh,Choose Story Ending,accuracy,0.9232296492389146
	xstory_cloze_zh,Generate Ending,accuracy,0.684976836532098
	xstory_cloze_zh,Novel Correct Ending,accuracy,0.9311714096624751
	xstory_cloze_zh,Story Continuation and Options,accuracy,0.9199205823957644
	xstory_cloze_zh,median,accuracy,0.9199205823957644
	xwinograd_en,Replace,accuracy,0.6847311827956989
	xwinograd_en,True or False,accuracy,0.5135483870967742
	xwinograd_en,does underscore refer to,accuracy,0.6787096774193548
	xwinograd_en,stand for,accuracy,0.5053763440860215
	xwinograd_en,underscore refer to,accuracy,0.690752688172043
	xwinograd_en,median,accuracy,0.6787096774193548
	xwinograd_fr,Replace,accuracy,0.6506024096385542
	xwinograd_fr,True or False,accuracy,0.4939759036144578
	xwinograd_fr,does underscore refer to,accuracy,0.6867469879518072
	xwinograd_fr,stand for,accuracy,0.46987951807228917
	xwinograd_fr,underscore refer to,accuracy,0.6626506024096386
	xwinograd_fr,median,accuracy,0.6506024096385542
	xwinograd_pt,Replace,accuracy,0.6349809885931559
	xwinograd_pt,True or False,accuracy,0.4866920152091255
	xwinograd_pt,does underscore refer to,accuracy,0.6387832699619772
	xwinograd_pt,stand for,accuracy,0.49429657794676807
	xwinograd_pt,underscore refer to,accuracy,0.6425855513307985
	xwinograd_pt,median,accuracy,0.6349809885931559
	xwinograd_zh,Replace,accuracy,0.6865079365079365
	xwinograd_zh,True or False,accuracy,0.5277777777777778
	xwinograd_zh,does underscore refer to,accuracy,0.6884920634920635
	xwinograd_zh,stand for,accuracy,0.4861111111111111
	xwinograd_zh,underscore refer to,accuracy,0.6904761904761905
	xwinograd_zh,median,accuracy,0.6865079365079365
	multiple,average,multiple,0.6903830754158429