RRT-Foundation / eval /eval_comparison.csv

Open release: RRT-355M weights, CORE eval artifacts, README

4f983a9 verified 11 days ago

1.78 kB

	task,gpt2_124m_acc,gpt2_medium_355m_acc,pythia_410m_acc,rrt_355m_acc,gpt2_124m_centered,gpt2_medium_355m_centered,pythia_410m_centered,rrt_355m_centered
	hellaswag_zeroshot,0.3092,0.3937,0.3977,0.3690,0.0789,0.1917,0.1970,0.1587
	jeopardy,0.0019,0.0387,0.0392,0.0132,0.0019,0.0387,0.0392,0.0132
	bigbench_qa_wikidata,0.2834,0.4209,0.5093,0.3665,0.2834,0.4209,0.5093,0.3665
	arc_easy,0.4150,0.4853,0.5274,0.5732,0.2200,0.3137,0.3698,0.4310
	arc_challenge,0.2184,0.2696,0.2483,0.2884,-0.0421,0.0262,-0.0023,0.0512
	copa,0.6300,0.6400,0.6400,0.6400,0.2600,0.2800,0.2800,0.2800
	commonsense_qa,0.2326,0.2138,0.2408,0.2015,0.0407,0.0172,0.0510,0.0018
	piqa,0.6289,0.6594,0.6855,0.6551,0.2579,0.3188,0.3711,0.3101
	openbook_qa,0.2640,0.3060,0.3060,0.3600,0.0187,0.0747,0.0747,0.1467
	lambada_openai,0.3208,0.4211,0.4758,0.2633,0.3208,0.4211,0.4758,0.2633
	hellaswag,0.3077,0.3981,0.3977,0.3626,0.0769,0.1975,0.1970,0.1501
	winograd,0.5897,0.6154,0.6996,0.5861,0.1795,0.2308,0.3993,0.1722
	winogrande,0.5114,0.5130,0.5296,0.5099,0.0229,0.0260,0.0592,0.0197
	bigbench_dyck_languages,0.1580,0.1800,0.2720,0.1290,0.1580,0.1800,0.2720,0.1290
	agi_eval_lsat_ar,0.2261,0.2261,0.2609,0.3000,0.0326,0.0326,0.0761,0.1250
	bigbench_cs_algorithms,0.4280,0.4295,0.4644,0.4432,0.4280,0.4295,0.4644,0.4432
	bigbench_operators,0.0952,0.1143,0.1333,0.0714,0.0952,0.1143,0.1333,0.0714
	bigbench_repeat_copy_logic,0.0312,0.0625,0.0000,0.0000,0.0312,0.0625,0.0000,0.0000
	squad,0.0574,0.1550,0.2173,0.0858,0.0574,0.1550,0.2173,0.0858
	coqa,0.1362,0.2233,0.2013,0.0911,0.1362,0.2233,0.2013,0.0911
	boolq,0.5541,0.6024,0.4700,0.5963,-0.1733,-0.0462,-0.3947,-0.0623
	bigbench_language_identification,0.2546,0.2590,0.2537,0.2549,0.1800,0.1848,0.1790,0.1803

	CORE_METRIC,0.1211,0.1770,0.1895,0.1558,,,,