{ "best_global_step": 360, "best_metric": 0.1140967383980751, "best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_copa_1754652160/checkpoint-360", "epoch": 10.0, "eval_steps": 45, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05555555555555555, "grad_norm": 211.93502807617188, "learning_rate": 2.2222222222222225e-06, "loss": 4.5242, "num_input_tokens_seen": 1536, "step": 5 }, { "epoch": 0.1111111111111111, "grad_norm": 55.84653854370117, "learning_rate": 5e-06, "loss": 3.1759, "num_input_tokens_seen": 3168, "step": 10 }, { "epoch": 0.16666666666666666, "grad_norm": 77.70538330078125, "learning_rate": 7.777777777777777e-06, "loss": 2.3772, "num_input_tokens_seen": 4736, "step": 15 }, { "epoch": 0.2222222222222222, "grad_norm": 88.52490997314453, "learning_rate": 1.0555555555555555e-05, "loss": 1.808, "num_input_tokens_seen": 6304, "step": 20 }, { "epoch": 0.2777777777777778, "grad_norm": 171.03692626953125, "learning_rate": 1.3333333333333333e-05, "loss": 1.0693, "num_input_tokens_seen": 7840, "step": 25 }, { "epoch": 0.3333333333333333, "grad_norm": 217.7040557861328, "learning_rate": 1.6111111111111115e-05, "loss": 0.7198, "num_input_tokens_seen": 9408, "step": 30 }, { "epoch": 0.3888888888888889, "grad_norm": 158.4123077392578, "learning_rate": 1.888888888888889e-05, "loss": 1.0175, "num_input_tokens_seen": 10912, "step": 35 }, { "epoch": 0.4444444444444444, "grad_norm": 36.769805908203125, "learning_rate": 2.1666666666666667e-05, "loss": 1.0818, "num_input_tokens_seen": 12448, "step": 40 }, { "epoch": 0.5, "grad_norm": 28.32155418395996, "learning_rate": 2.4444444444444445e-05, "loss": 0.5598, "num_input_tokens_seen": 14016, "step": 45 }, { "epoch": 0.5, "eval_loss": 0.3143084943294525, "eval_runtime": 0.9332, "eval_samples_per_second": 42.863, "eval_steps_per_second": 10.716, "num_input_tokens_seen": 14016, "step": 45 }, { "epoch": 0.5555555555555556, "grad_norm": 243.9434356689453, "learning_rate": 2.7222222222222223e-05, "loss": 0.4034, "num_input_tokens_seen": 15584, "step": 50 }, { "epoch": 0.6111111111111112, "grad_norm": 21.132078170776367, "learning_rate": 3e-05, "loss": 0.4113, "num_input_tokens_seen": 17184, "step": 55 }, { "epoch": 0.6666666666666666, "grad_norm": 11.026226043701172, "learning_rate": 3.277777777777778e-05, "loss": 0.2833, "num_input_tokens_seen": 18752, "step": 60 }, { "epoch": 0.7222222222222222, "grad_norm": 13.724584579467773, "learning_rate": 3.555555555555556e-05, "loss": 0.2309, "num_input_tokens_seen": 20352, "step": 65 }, { "epoch": 0.7777777777777778, "grad_norm": 78.358642578125, "learning_rate": 3.8333333333333334e-05, "loss": 0.741, "num_input_tokens_seen": 21952, "step": 70 }, { "epoch": 0.8333333333333334, "grad_norm": 8.49279499053955, "learning_rate": 4.111111111111111e-05, "loss": 0.2628, "num_input_tokens_seen": 23456, "step": 75 }, { "epoch": 0.8888888888888888, "grad_norm": 5.491214275360107, "learning_rate": 4.388888888888889e-05, "loss": 0.3305, "num_input_tokens_seen": 25056, "step": 80 }, { "epoch": 0.9444444444444444, "grad_norm": 229.71315002441406, "learning_rate": 4.666666666666667e-05, "loss": 4.0693, "num_input_tokens_seen": 26560, "step": 85 }, { "epoch": 1.0, "grad_norm": 65.04704284667969, "learning_rate": 4.9444444444444446e-05, "loss": 1.6129, "num_input_tokens_seen": 28096, "step": 90 }, { "epoch": 1.0, "eval_loss": 0.28230223059654236, "eval_runtime": 0.9341, "eval_samples_per_second": 42.823, "eval_steps_per_second": 10.706, "num_input_tokens_seen": 28096, "step": 90 }, { "epoch": 1.0555555555555556, "grad_norm": 2.8092379570007324, "learning_rate": 4.9996991493233693e-05, "loss": 0.2852, "num_input_tokens_seen": 29696, "step": 95 }, { "epoch": 1.1111111111111112, "grad_norm": 3.550919532775879, "learning_rate": 4.99847706754774e-05, "loss": 0.2552, "num_input_tokens_seen": 31232, "step": 100 }, { "epoch": 1.1666666666666667, "grad_norm": 4.582396507263184, "learning_rate": 4.9963154107272295e-05, "loss": 0.279, "num_input_tokens_seen": 32768, "step": 105 }, { "epoch": 1.2222222222222223, "grad_norm": 3.2737157344818115, "learning_rate": 4.993214991772563e-05, "loss": 0.2366, "num_input_tokens_seen": 34304, "step": 110 }, { "epoch": 1.2777777777777777, "grad_norm": 5.495439529418945, "learning_rate": 4.989176976624511e-05, "loss": 0.6461, "num_input_tokens_seen": 35872, "step": 115 }, { "epoch": 1.3333333333333333, "grad_norm": 14.088470458984375, "learning_rate": 4.9842028838154285e-05, "loss": 0.3324, "num_input_tokens_seen": 37408, "step": 120 }, { "epoch": 1.3888888888888888, "grad_norm": 18.816679000854492, "learning_rate": 4.978294583898196e-05, "loss": 0.2167, "num_input_tokens_seen": 38976, "step": 125 }, { "epoch": 1.4444444444444444, "grad_norm": 10.976043701171875, "learning_rate": 4.971454298742779e-05, "loss": 0.2518, "num_input_tokens_seen": 40576, "step": 130 }, { "epoch": 1.5, "grad_norm": 136.6255340576172, "learning_rate": 4.963684600700679e-05, "loss": 0.8986, "num_input_tokens_seen": 42144, "step": 135 }, { "epoch": 1.5, "eval_loss": 0.3364465534687042, "eval_runtime": 0.9378, "eval_samples_per_second": 42.655, "eval_steps_per_second": 10.664, "num_input_tokens_seen": 42144, "step": 135 }, { "epoch": 1.5555555555555556, "grad_norm": 6.133034706115723, "learning_rate": 4.9549884116375714e-05, "loss": 0.4373, "num_input_tokens_seen": 43680, "step": 140 }, { "epoch": 1.6111111111111112, "grad_norm": 3.9126594066619873, "learning_rate": 4.9453690018345144e-05, "loss": 1.3978, "num_input_tokens_seen": 45248, "step": 145 }, { "epoch": 1.6666666666666665, "grad_norm": 6.755434513092041, "learning_rate": 4.934829988758131e-05, "loss": 0.2847, "num_input_tokens_seen": 46816, "step": 150 }, { "epoch": 1.7222222222222223, "grad_norm": 18.98358154296875, "learning_rate": 4.923375335700223e-05, "loss": 0.2777, "num_input_tokens_seen": 48384, "step": 155 }, { "epoch": 1.7777777777777777, "grad_norm": 2.0623528957366943, "learning_rate": 4.9110093502873476e-05, "loss": 0.1408, "num_input_tokens_seen": 49952, "step": 160 }, { "epoch": 1.8333333333333335, "grad_norm": 17.358171463012695, "learning_rate": 4.897736682860885e-05, "loss": 0.0471, "num_input_tokens_seen": 51520, "step": 165 }, { "epoch": 1.8888888888888888, "grad_norm": 22.74451446533203, "learning_rate": 4.883562324728241e-05, "loss": 0.1595, "num_input_tokens_seen": 53024, "step": 170 }, { "epoch": 1.9444444444444444, "grad_norm": 7.793264865875244, "learning_rate": 4.868491606285823e-05, "loss": 0.2312, "num_input_tokens_seen": 54592, "step": 175 }, { "epoch": 2.0, "grad_norm": 3.0166492462158203, "learning_rate": 4.8525301950144894e-05, "loss": 0.162, "num_input_tokens_seen": 56128, "step": 180 }, { "epoch": 2.0, "eval_loss": 0.12522773444652557, "eval_runtime": 0.9365, "eval_samples_per_second": 42.713, "eval_steps_per_second": 10.678, "num_input_tokens_seen": 56128, "step": 180 }, { "epoch": 2.0555555555555554, "grad_norm": 3.614006519317627, "learning_rate": 4.835684093348244e-05, "loss": 0.0719, "num_input_tokens_seen": 57696, "step": 185 }, { "epoch": 2.111111111111111, "grad_norm": 11.736072540283203, "learning_rate": 4.817959636416969e-05, "loss": 0.3451, "num_input_tokens_seen": 59264, "step": 190 }, { "epoch": 2.1666666666666665, "grad_norm": 0.017602359876036644, "learning_rate": 4.7993634896640394e-05, "loss": 0.0367, "num_input_tokens_seen": 60864, "step": 195 }, { "epoch": 2.2222222222222223, "grad_norm": 128.3651885986328, "learning_rate": 4.779902646339722e-05, "loss": 0.1548, "num_input_tokens_seen": 62464, "step": 200 }, { "epoch": 2.2777777777777777, "grad_norm": 4.051515579223633, "learning_rate": 4.759584424871302e-05, "loss": 0.1765, "num_input_tokens_seen": 64032, "step": 205 }, { "epoch": 2.3333333333333335, "grad_norm": 0.603679895401001, "learning_rate": 4.7384164661109176e-05, "loss": 0.1182, "num_input_tokens_seen": 65568, "step": 210 }, { "epoch": 2.388888888888889, "grad_norm": 0.7103855609893799, "learning_rate": 4.7164067304621536e-05, "loss": 0.0868, "num_input_tokens_seen": 67104, "step": 215 }, { "epoch": 2.4444444444444446, "grad_norm": 3.1308445930480957, "learning_rate": 4.693563494886455e-05, "loss": 0.2294, "num_input_tokens_seen": 68704, "step": 220 }, { "epoch": 2.5, "grad_norm": 3.5392675399780273, "learning_rate": 4.669895349790502e-05, "loss": 0.0545, "num_input_tokens_seen": 70272, "step": 225 }, { "epoch": 2.5, "eval_loss": 0.1658935248851776, "eval_runtime": 0.9481, "eval_samples_per_second": 42.19, "eval_steps_per_second": 10.548, "num_input_tokens_seen": 70272, "step": 225 }, { "epoch": 2.5555555555555554, "grad_norm": 1.4086837768554688, "learning_rate": 4.645411195795709e-05, "loss": 0.0091, "num_input_tokens_seen": 71808, "step": 230 }, { "epoch": 2.611111111111111, "grad_norm": 6.530168056488037, "learning_rate": 4.620120240391065e-05, "loss": 0.1379, "num_input_tokens_seen": 73408, "step": 235 }, { "epoch": 2.6666666666666665, "grad_norm": 4.456235885620117, "learning_rate": 4.5940319944705736e-05, "loss": 0.1022, "num_input_tokens_seen": 74912, "step": 240 }, { "epoch": 2.7222222222222223, "grad_norm": 4.331588268280029, "learning_rate": 4.567156268756594e-05, "loss": 0.1812, "num_input_tokens_seen": 76544, "step": 245 }, { "epoch": 2.7777777777777777, "grad_norm": 0.16590723395347595, "learning_rate": 4.539503170110431e-05, "loss": 0.0448, "num_input_tokens_seen": 78112, "step": 250 }, { "epoch": 2.8333333333333335, "grad_norm": 0.05489581450819969, "learning_rate": 4.5110830977315556e-05, "loss": 0.004, "num_input_tokens_seen": 79712, "step": 255 }, { "epoch": 2.888888888888889, "grad_norm": 0.06641557812690735, "learning_rate": 4.4819067392468944e-05, "loss": 0.076, "num_input_tokens_seen": 81280, "step": 260 }, { "epoch": 2.9444444444444446, "grad_norm": 0.9573344588279724, "learning_rate": 4.4519850666916484e-05, "loss": 0.0639, "num_input_tokens_seen": 82848, "step": 265 }, { "epoch": 3.0, "grad_norm": 0.09293336421251297, "learning_rate": 4.4213293323831585e-05, "loss": 0.0493, "num_input_tokens_seen": 84352, "step": 270 }, { "epoch": 3.0, "eval_loss": 0.11682591587305069, "eval_runtime": 0.9379, "eval_samples_per_second": 42.648, "eval_steps_per_second": 10.662, "num_input_tokens_seen": 84352, "step": 270 }, { "epoch": 3.0555555555555554, "grad_norm": 2.5821824073791504, "learning_rate": 4.38995106468937e-05, "loss": 0.011, "num_input_tokens_seen": 85920, "step": 275 }, { "epoch": 3.111111111111111, "grad_norm": 0.03350764885544777, "learning_rate": 4.357862063693486e-05, "loss": 0.0094, "num_input_tokens_seen": 87520, "step": 280 }, { "epoch": 3.1666666666666665, "grad_norm": 0.02359146997332573, "learning_rate": 4.325074396756437e-05, "loss": 0.0009, "num_input_tokens_seen": 89088, "step": 285 }, { "epoch": 3.2222222222222223, "grad_norm": 0.9292595386505127, "learning_rate": 4.2916003939788403e-05, "loss": 0.0859, "num_input_tokens_seen": 90688, "step": 290 }, { "epoch": 3.2777777777777777, "grad_norm": 0.3055465817451477, "learning_rate": 4.257452643564155e-05, "loss": 0.1095, "num_input_tokens_seen": 92160, "step": 295 }, { "epoch": 3.3333333333333335, "grad_norm": 8.65383243560791, "learning_rate": 4.22264398708477e-05, "loss": 0.1926, "num_input_tokens_seen": 93760, "step": 300 }, { "epoch": 3.388888888888889, "grad_norm": 5.946500301361084, "learning_rate": 4.1871875146528195e-05, "loss": 0.1205, "num_input_tokens_seen": 95360, "step": 305 }, { "epoch": 3.4444444444444446, "grad_norm": 0.05868682637810707, "learning_rate": 4.1510965599975196e-05, "loss": 0.0432, "num_input_tokens_seen": 96928, "step": 310 }, { "epoch": 3.5, "grad_norm": 0.07760920375585556, "learning_rate": 4.114384695450906e-05, "loss": 0.0166, "num_input_tokens_seen": 98464, "step": 315 }, { "epoch": 3.5, "eval_loss": 0.16605496406555176, "eval_runtime": 0.9477, "eval_samples_per_second": 42.207, "eval_steps_per_second": 10.552, "num_input_tokens_seen": 98464, "step": 315 }, { "epoch": 3.5555555555555554, "grad_norm": 4.408195972442627, "learning_rate": 4.077065726843828e-05, "loss": 0.0249, "num_input_tokens_seen": 100064, "step": 320 }, { "epoch": 3.611111111111111, "grad_norm": 0.08003110438585281, "learning_rate": 4.039153688314145e-05, "loss": 0.056, "num_input_tokens_seen": 101600, "step": 325 }, { "epoch": 3.6666666666666665, "grad_norm": 0.014739356003701687, "learning_rate": 4.000662837029062e-05, "loss": 0.01, "num_input_tokens_seen": 103200, "step": 330 }, { "epoch": 3.7222222222222223, "grad_norm": 4.2343668937683105, "learning_rate": 3.961607647823583e-05, "loss": 0.1279, "num_input_tokens_seen": 104768, "step": 335 }, { "epoch": 3.7777777777777777, "grad_norm": 0.9541974663734436, "learning_rate": 3.9220028077571295e-05, "loss": 0.043, "num_input_tokens_seen": 106304, "step": 340 }, { "epoch": 3.8333333333333335, "grad_norm": 0.029086820781230927, "learning_rate": 3.881863210590332e-05, "loss": 0.0006, "num_input_tokens_seen": 107904, "step": 345 }, { "epoch": 3.888888888888889, "grad_norm": 4.72139835357666, "learning_rate": 3.841203951184095e-05, "loss": 0.1241, "num_input_tokens_seen": 109408, "step": 350 }, { "epoch": 3.9444444444444446, "grad_norm": 7.2094197273254395, "learning_rate": 3.8000403198230387e-05, "loss": 0.0363, "num_input_tokens_seen": 111008, "step": 355 }, { "epoch": 4.0, "grad_norm": 0.01099959947168827, "learning_rate": 3.75838779646545e-05, "loss": 0.0146, "num_input_tokens_seen": 112576, "step": 360 }, { "epoch": 4.0, "eval_loss": 0.1140967383980751, "eval_runtime": 0.943, "eval_samples_per_second": 42.417, "eval_steps_per_second": 10.604, "num_input_tokens_seen": 112576, "step": 360 }, { "epoch": 4.055555555555555, "grad_norm": 0.06453730165958405, "learning_rate": 3.7162620449219e-05, "loss": 0.1621, "num_input_tokens_seen": 114144, "step": 365 }, { "epoch": 4.111111111111111, "grad_norm": 1.2406953573226929, "learning_rate": 3.673678906964727e-05, "loss": 0.0137, "num_input_tokens_seen": 115712, "step": 370 }, { "epoch": 4.166666666666667, "grad_norm": 0.040748681873083115, "learning_rate": 3.630654396370594e-05, "loss": 0.0514, "num_input_tokens_seen": 117216, "step": 375 }, { "epoch": 4.222222222222222, "grad_norm": 0.04640405625104904, "learning_rate": 3.5872046928983626e-05, "loss": 0.0012, "num_input_tokens_seen": 118816, "step": 380 }, { "epoch": 4.277777777777778, "grad_norm": 0.4107241928577423, "learning_rate": 3.543346136204545e-05, "loss": 0.0132, "num_input_tokens_seen": 120352, "step": 385 }, { "epoch": 4.333333333333333, "grad_norm": 0.03547825664281845, "learning_rate": 3.499095219698631e-05, "loss": 0.0107, "num_input_tokens_seen": 121920, "step": 390 }, { "epoch": 4.388888888888889, "grad_norm": 4.26585578918457, "learning_rate": 3.454468584340588e-05, "loss": 0.0743, "num_input_tokens_seen": 123456, "step": 395 }, { "epoch": 4.444444444444445, "grad_norm": 0.04872285574674606, "learning_rate": 3.409483012382879e-05, "loss": 0.0116, "num_input_tokens_seen": 125056, "step": 400 }, { "epoch": 4.5, "grad_norm": 4.138394355773926, "learning_rate": 3.364155421059342e-05, "loss": 0.1392, "num_input_tokens_seen": 126624, "step": 405 }, { "epoch": 4.5, "eval_loss": 0.12619957327842712, "eval_runtime": 0.9498, "eval_samples_per_second": 42.114, "eval_steps_per_second": 10.528, "num_input_tokens_seen": 126624, "step": 405 }, { "epoch": 4.555555555555555, "grad_norm": 0.07925529032945633, "learning_rate": 3.318502856223311e-05, "loss": 0.0013, "num_input_tokens_seen": 128224, "step": 410 }, { "epoch": 4.611111111111111, "grad_norm": 0.07759137451648712, "learning_rate": 3.272542485937369e-05, "loss": 0.0181, "num_input_tokens_seen": 129728, "step": 415 }, { "epoch": 4.666666666666667, "grad_norm": 0.020686015486717224, "learning_rate": 3.2262915940171376e-05, "loss": 0.002, "num_input_tokens_seen": 131328, "step": 420 }, { "epoch": 4.722222222222222, "grad_norm": 0.10983490198850632, "learning_rate": 3.1797675735315455e-05, "loss": 0.0026, "num_input_tokens_seen": 132896, "step": 425 }, { "epoch": 4.777777777777778, "grad_norm": 0.06394976377487183, "learning_rate": 3.132987920262005e-05, "loss": 0.0024, "num_input_tokens_seen": 134496, "step": 430 }, { "epoch": 4.833333333333333, "grad_norm": 0.006211922504007816, "learning_rate": 3.085970226122962e-05, "loss": 0.0031, "num_input_tokens_seen": 136064, "step": 435 }, { "epoch": 4.888888888888889, "grad_norm": 0.00576823391020298, "learning_rate": 3.0387321725463e-05, "loss": 0.0282, "num_input_tokens_seen": 137664, "step": 440 }, { "epoch": 4.944444444444445, "grad_norm": 0.004720824770629406, "learning_rate": 2.9912915238320754e-05, "loss": 0.0188, "num_input_tokens_seen": 139232, "step": 445 }, { "epoch": 5.0, "grad_norm": 0.12190665304660797, "learning_rate": 2.9436661204680882e-05, "loss": 0.0007, "num_input_tokens_seen": 140832, "step": 450 }, { "epoch": 5.0, "eval_loss": 0.16102315485477448, "eval_runtime": 0.9518, "eval_samples_per_second": 42.027, "eval_steps_per_second": 10.507, "num_input_tokens_seen": 140832, "step": 450 }, { "epoch": 5.055555555555555, "grad_norm": 0.12717540562152863, "learning_rate": 2.8958738724208072e-05, "loss": 0.0051, "num_input_tokens_seen": 142368, "step": 455 }, { "epoch": 5.111111111111111, "grad_norm": 0.004431543871760368, "learning_rate": 2.8479327524001636e-05, "loss": 0.0012, "num_input_tokens_seen": 144000, "step": 460 }, { "epoch": 5.166666666666667, "grad_norm": 0.4021238684654236, "learning_rate": 2.7998607891007495e-05, "loss": 0.0018, "num_input_tokens_seen": 145632, "step": 465 }, { "epoch": 5.222222222222222, "grad_norm": 0.034988883882761, "learning_rate": 2.7516760604219617e-05, "loss": 0.0024, "num_input_tokens_seen": 147168, "step": 470 }, { "epoch": 5.277777777777778, "grad_norm": 0.008324529975652695, "learning_rate": 2.7033966866696457e-05, "loss": 0.0002, "num_input_tokens_seen": 148736, "step": 475 }, { "epoch": 5.333333333333333, "grad_norm": 0.012731038965284824, "learning_rate": 2.6550408237417885e-05, "loss": 0.0002, "num_input_tokens_seen": 150304, "step": 480 }, { "epoch": 5.388888888888889, "grad_norm": 0.09622369706630707, "learning_rate": 2.6066266563008267e-05, "loss": 0.0004, "num_input_tokens_seen": 151872, "step": 485 }, { "epoch": 5.444444444444445, "grad_norm": 11.24488639831543, "learning_rate": 2.5581723909351406e-05, "loss": 0.0472, "num_input_tokens_seen": 153472, "step": 490 }, { "epoch": 5.5, "grad_norm": 0.008177004754543304, "learning_rate": 2.5096962493123012e-05, "loss": 0.0002, "num_input_tokens_seen": 154976, "step": 495 }, { "epoch": 5.5, "eval_loss": 0.29022911190986633, "eval_runtime": 0.9436, "eval_samples_per_second": 42.392, "eval_steps_per_second": 10.598, "num_input_tokens_seen": 154976, "step": 495 }, { "epoch": 5.555555555555555, "grad_norm": 0.12855511903762817, "learning_rate": 2.461216461326642e-05, "loss": 0.0007, "num_input_tokens_seen": 156544, "step": 500 }, { "epoch": 5.611111111111111, "grad_norm": 0.004401104990392923, "learning_rate": 2.4127512582437485e-05, "loss": 0.0186, "num_input_tokens_seen": 158112, "step": 505 }, { "epoch": 5.666666666666667, "grad_norm": 0.11994586884975433, "learning_rate": 2.364318865844416e-05, "loss": 0.0023, "num_input_tokens_seen": 159680, "step": 510 }, { "epoch": 5.722222222222222, "grad_norm": 0.0036123669706285, "learning_rate": 2.3159374975706884e-05, "loss": 0.0001, "num_input_tokens_seen": 161312, "step": 515 }, { "epoch": 5.777777777777778, "grad_norm": 0.005037173628807068, "learning_rate": 2.2676253476765196e-05, "loss": 0.0001, "num_input_tokens_seen": 162880, "step": 520 }, { "epoch": 5.833333333333333, "grad_norm": 0.004541746340692043, "learning_rate": 2.2194005843856636e-05, "loss": 0.0001, "num_input_tokens_seen": 164448, "step": 525 }, { "epoch": 5.888888888888889, "grad_norm": 0.003595268353819847, "learning_rate": 2.1712813430593436e-05, "loss": 0.0001, "num_input_tokens_seen": 166016, "step": 530 }, { "epoch": 5.944444444444445, "grad_norm": 0.05647879093885422, "learning_rate": 2.1232857193762924e-05, "loss": 0.1753, "num_input_tokens_seen": 167552, "step": 535 }, { "epoch": 6.0, "grad_norm": 0.024071840569376945, "learning_rate": 2.0754317625276983e-05, "loss": 0.0003, "num_input_tokens_seen": 169056, "step": 540 }, { "epoch": 6.0, "eval_loss": 0.18793392181396484, "eval_runtime": 0.9426, "eval_samples_per_second": 42.435, "eval_steps_per_second": 10.609, "num_input_tokens_seen": 169056, "step": 540 }, { "epoch": 6.055555555555555, "grad_norm": 0.003708529518917203, "learning_rate": 2.02773746842965e-05, "loss": 0.0003, "num_input_tokens_seen": 170592, "step": 545 }, { "epoch": 6.111111111111111, "grad_norm": 0.029693368822336197, "learning_rate": 1.980220772955602e-05, "loss": 0.0837, "num_input_tokens_seen": 172192, "step": 550 }, { "epoch": 6.166666666666667, "grad_norm": 1.9385651350021362, "learning_rate": 1.932899545191433e-05, "loss": 0.0069, "num_input_tokens_seen": 173792, "step": 555 }, { "epoch": 6.222222222222222, "grad_norm": 0.008497129194438457, "learning_rate": 1.8857915807156092e-05, "loss": 0.0003, "num_input_tokens_seen": 175360, "step": 560 }, { "epoch": 6.277777777777778, "grad_norm": 0.005275554955005646, "learning_rate": 1.838914594906995e-05, "loss": 0.0001, "num_input_tokens_seen": 176992, "step": 565 }, { "epoch": 6.333333333333333, "grad_norm": 0.007108105346560478, "learning_rate": 1.792286216282824e-05, "loss": 0.0007, "num_input_tokens_seen": 178592, "step": 570 }, { "epoch": 6.388888888888889, "grad_norm": 0.003662517061457038, "learning_rate": 1.7459239798693364e-05, "loss": 0.0003, "num_input_tokens_seen": 180128, "step": 575 }, { "epoch": 6.444444444444445, "grad_norm": 0.03587842732667923, "learning_rate": 1.699845320607571e-05, "loss": 0.0002, "num_input_tokens_seen": 181632, "step": 580 }, { "epoch": 6.5, "grad_norm": 0.5192928314208984, "learning_rate": 1.6540675667967974e-05, "loss": 0.0013, "num_input_tokens_seen": 183200, "step": 585 }, { "epoch": 6.5, "eval_loss": 0.23766329884529114, "eval_runtime": 0.9465, "eval_samples_per_second": 42.262, "eval_steps_per_second": 10.565, "num_input_tokens_seen": 183200, "step": 585 }, { "epoch": 6.555555555555555, "grad_norm": 0.02627461589872837, "learning_rate": 1.60860793357805e-05, "loss": 0.0004, "num_input_tokens_seen": 184800, "step": 590 }, { "epoch": 6.611111111111111, "grad_norm": 0.007544673513621092, "learning_rate": 1.56348351646022e-05, "loss": 0.0285, "num_input_tokens_seen": 186336, "step": 595 }, { "epoch": 6.666666666666667, "grad_norm": 0.0033945958130061626, "learning_rate": 1.5187112848911323e-05, "loss": 0.0001, "num_input_tokens_seen": 187904, "step": 600 }, { "epoch": 6.722222222222222, "grad_norm": 0.004758753813803196, "learning_rate": 1.47430807587603e-05, "loss": 0.0001, "num_input_tokens_seen": 189472, "step": 605 }, { "epoch": 6.777777777777778, "grad_norm": 0.33814820647239685, "learning_rate": 1.430290587645865e-05, "loss": 0.0013, "num_input_tokens_seen": 191072, "step": 610 }, { "epoch": 6.833333333333333, "grad_norm": 0.002982664154842496, "learning_rate": 1.3866753733777765e-05, "loss": 0.0001, "num_input_tokens_seen": 192608, "step": 615 }, { "epoch": 6.888888888888889, "grad_norm": 0.003444693749770522, "learning_rate": 1.343478834970121e-05, "loss": 0.0001, "num_input_tokens_seen": 194208, "step": 620 }, { "epoch": 6.944444444444445, "grad_norm": 0.0027797692455351353, "learning_rate": 1.3007172168743854e-05, "loss": 0.0001, "num_input_tokens_seen": 195776, "step": 625 }, { "epoch": 7.0, "grad_norm": 0.00582587905228138, "learning_rate": 1.2584065999863102e-05, "loss": 0.0001, "num_input_tokens_seen": 197344, "step": 630 }, { "epoch": 7.0, "eval_loss": 0.24828998744487762, "eval_runtime": 0.942, "eval_samples_per_second": 42.462, "eval_steps_per_second": 10.616, "num_input_tokens_seen": 197344, "step": 630 }, { "epoch": 7.055555555555555, "grad_norm": 0.012683599255979061, "learning_rate": 1.2165628955985314e-05, "loss": 0.0001, "num_input_tokens_seen": 198944, "step": 635 }, { "epoch": 7.111111111111111, "grad_norm": 0.0020441152155399323, "learning_rate": 1.175201839416988e-05, "loss": 0.0001, "num_input_tokens_seen": 200512, "step": 640 }, { "epoch": 7.166666666666667, "grad_norm": 0.0023130401968955994, "learning_rate": 1.1343389856433658e-05, "loss": 0.0001, "num_input_tokens_seen": 202016, "step": 645 }, { "epoch": 7.222222222222222, "grad_norm": 0.0022213098127394915, "learning_rate": 1.0939897011258001e-05, "loss": 0.0001, "num_input_tokens_seen": 203648, "step": 650 }, { "epoch": 7.277777777777778, "grad_norm": 0.002087386092171073, "learning_rate": 1.0541691595800337e-05, "loss": 0.0001, "num_input_tokens_seen": 205184, "step": 655 }, { "epoch": 7.333333333333333, "grad_norm": 0.0022648421581834555, "learning_rate": 1.0148923358832022e-05, "loss": 0.0001, "num_input_tokens_seen": 206720, "step": 660 }, { "epoch": 7.388888888888889, "grad_norm": 0.004804587922990322, "learning_rate": 9.761740004423927e-06, "loss": 0.0001, "num_input_tokens_seen": 208320, "step": 665 }, { "epoch": 7.444444444444445, "grad_norm": 0.0030888738110661507, "learning_rate": 9.380287136401e-06, "loss": 0.0001, "num_input_tokens_seen": 209856, "step": 670 }, { "epoch": 7.5, "grad_norm": 0.0017403181409463286, "learning_rate": 9.00470820358663e-06, "loss": 0.0002, "num_input_tokens_seen": 211392, "step": 675 }, { "epoch": 7.5, "eval_loss": 0.25386351346969604, "eval_runtime": 0.9402, "eval_samples_per_second": 42.544, "eval_steps_per_second": 10.636, "num_input_tokens_seen": 211392, "step": 675 }, { "epoch": 7.555555555555555, "grad_norm": 0.0020435000769793987, "learning_rate": 8.635144445857406e-06, "loss": 0.0001, "num_input_tokens_seen": 212960, "step": 680 }, { "epoch": 7.611111111111111, "grad_norm": 0.004450716078281403, "learning_rate": 8.271734841028553e-06, "loss": 0.0002, "num_input_tokens_seen": 214528, "step": 685 }, { "epoch": 7.666666666666667, "grad_norm": 0.002327506896108389, "learning_rate": 7.914616052590071e-06, "loss": 0.0001, "num_input_tokens_seen": 216000, "step": 690 }, { "epoch": 7.722222222222222, "grad_norm": 0.0016173458425328135, "learning_rate": 7.563922378313218e-06, "loss": 0.0097, "num_input_tokens_seen": 217632, "step": 695 }, { "epoch": 7.777777777777778, "grad_norm": 0.0022673553321510553, "learning_rate": 7.219785699746573e-06, "loss": 0.0001, "num_input_tokens_seen": 219232, "step": 700 }, { "epoch": 7.833333333333333, "grad_norm": 0.008347373455762863, "learning_rate": 6.882335432620779e-06, "loss": 0.0001, "num_input_tokens_seen": 220800, "step": 705 }, { "epoch": 7.888888888888889, "grad_norm": 0.002359850564971566, "learning_rate": 6.55169847818059e-06, "loss": 0.0001, "num_input_tokens_seen": 222368, "step": 710 }, { "epoch": 7.944444444444445, "grad_norm": 0.002948438050225377, "learning_rate": 6.22799917546252e-06, "loss": 0.0001, "num_input_tokens_seen": 223968, "step": 715 }, { "epoch": 8.0, "grad_norm": 0.0022273629438132048, "learning_rate": 5.9113592545359945e-06, "loss": 0.0001, "num_input_tokens_seen": 225536, "step": 720 }, { "epoch": 8.0, "eval_loss": 0.2521117627620697, "eval_runtime": 0.9388, "eval_samples_per_second": 42.606, "eval_steps_per_second": 10.651, "num_input_tokens_seen": 225536, "step": 720 }, { "epoch": 8.055555555555555, "grad_norm": 0.0022272937931120396, "learning_rate": 5.601897790725643e-06, "loss": 0.0001, "num_input_tokens_seen": 227168, "step": 725 }, { "epoch": 8.11111111111111, "grad_norm": 0.001649300567805767, "learning_rate": 5.299731159831953e-06, "loss": 0.0001, "num_input_tokens_seen": 228704, "step": 730 }, { "epoch": 8.166666666666666, "grad_norm": 0.0024456402752548456, "learning_rate": 5.004972994367102e-06, "loss": 0.0001, "num_input_tokens_seen": 230336, "step": 735 }, { "epoch": 8.222222222222221, "grad_norm": 0.022852079942822456, "learning_rate": 4.7177341408224e-06, "loss": 0.0001, "num_input_tokens_seen": 231936, "step": 740 }, { "epoch": 8.277777777777779, "grad_norm": 0.0026604924350976944, "learning_rate": 4.438122617983443e-06, "loss": 0.0001, "num_input_tokens_seen": 233472, "step": 745 }, { "epoch": 8.333333333333334, "grad_norm": 0.0019071005517616868, "learning_rate": 4.166243576308712e-06, "loss": 0.0001, "num_input_tokens_seen": 235040, "step": 750 }, { "epoch": 8.38888888888889, "grad_norm": 0.007083934266120195, "learning_rate": 3.9021992583867325e-06, "loss": 0.0001, "num_input_tokens_seen": 236608, "step": 755 }, { "epoch": 8.444444444444445, "grad_norm": 0.00216863676905632, "learning_rate": 3.6460889604868626e-06, "loss": 0.0008, "num_input_tokens_seen": 238144, "step": 760 }, { "epoch": 8.5, "grad_norm": 0.0029811670538038015, "learning_rate": 3.398008995217988e-06, "loss": 0.0001, "num_input_tokens_seen": 239680, "step": 765 }, { "epoch": 8.5, "eval_loss": 0.24617867171764374, "eval_runtime": 0.9394, "eval_samples_per_second": 42.581, "eval_steps_per_second": 10.645, "num_input_tokens_seen": 239680, "step": 765 }, { "epoch": 8.555555555555555, "grad_norm": 0.0019673961214721203, "learning_rate": 3.158052655309332e-06, "loss": 0.0001, "num_input_tokens_seen": 241280, "step": 770 }, { "epoch": 8.61111111111111, "grad_norm": 0.0018027002224698663, "learning_rate": 2.9263101785268254e-06, "loss": 0.0001, "num_input_tokens_seen": 242816, "step": 775 }, { "epoch": 8.666666666666666, "grad_norm": 0.0017442210810258985, "learning_rate": 2.7028687137384267e-06, "loss": 0.0001, "num_input_tokens_seen": 244352, "step": 780 }, { "epoch": 8.722222222222221, "grad_norm": 0.001671099103987217, "learning_rate": 2.487812288140945e-06, "loss": 0.0002, "num_input_tokens_seen": 245856, "step": 785 }, { "epoch": 8.777777777777779, "grad_norm": 0.012044212780892849, "learning_rate": 2.281221775660894e-06, "loss": 0.0001, "num_input_tokens_seen": 247456, "step": 790 }, { "epoch": 8.833333333333334, "grad_norm": 0.001784978318028152, "learning_rate": 2.0831748665410765e-06, "loss": 0.0001, "num_input_tokens_seen": 248992, "step": 795 }, { "epoch": 8.88888888888889, "grad_norm": 0.001969333505257964, "learning_rate": 1.893746038124497e-06, "loss": 0.0001, "num_input_tokens_seen": 250528, "step": 800 }, { "epoch": 8.944444444444445, "grad_norm": 0.002932202536612749, "learning_rate": 1.713006526846439e-06, "loss": 0.0001, "num_input_tokens_seen": 252128, "step": 805 }, { "epoch": 9.0, "grad_norm": 0.001777714816853404, "learning_rate": 1.541024301445404e-06, "loss": 0.0001, "num_input_tokens_seen": 253696, "step": 810 }, { "epoch": 9.0, "eval_loss": 0.2544807493686676, "eval_runtime": 0.9396, "eval_samples_per_second": 42.57, "eval_steps_per_second": 10.643, "num_input_tokens_seen": 253696, "step": 810 }, { "epoch": 9.055555555555555, "grad_norm": 0.0017283963970839977, "learning_rate": 1.3778640374027985e-06, "loss": 0.0001, "num_input_tokens_seen": 255296, "step": 815 }, { "epoch": 9.11111111111111, "grad_norm": 0.0016462679486721754, "learning_rate": 1.2235870926211619e-06, "loss": 0.0001, "num_input_tokens_seen": 256896, "step": 820 }, { "epoch": 9.166666666666666, "grad_norm": 0.0017024942208081484, "learning_rate": 1.0782514843499653e-06, "loss": 0.0001, "num_input_tokens_seen": 258432, "step": 825 }, { "epoch": 9.222222222222221, "grad_norm": 0.0017732703126966953, "learning_rate": 9.419118673676924e-07, "loss": 0.0001, "num_input_tokens_seen": 260000, "step": 830 }, { "epoch": 9.277777777777779, "grad_norm": 0.0017788108671084046, "learning_rate": 8.146195134284052e-07, "loss": 0.0001, "num_input_tokens_seen": 261568, "step": 835 }, { "epoch": 9.333333333333334, "grad_norm": 0.001521819387562573, "learning_rate": 6.964222919805391e-07, "loss": 0.0001, "num_input_tokens_seen": 263200, "step": 840 }, { "epoch": 9.38888888888889, "grad_norm": 0.07402946054935455, "learning_rate": 5.87364652165176e-07, "loss": 0.0002, "num_input_tokens_seen": 264736, "step": 845 }, { "epoch": 9.444444444444445, "grad_norm": 0.005844974424690008, "learning_rate": 4.874876061005173e-07, "loss": 0.0001, "num_input_tokens_seen": 266304, "step": 850 }, { "epoch": 9.5, "grad_norm": 0.0018154801800847054, "learning_rate": 3.9682871345891883e-07, "loss": 0.0001, "num_input_tokens_seen": 267840, "step": 855 }, { "epoch": 9.5, "eval_loss": 0.24860987067222595, "eval_runtime": 0.9418, "eval_samples_per_second": 42.472, "eval_steps_per_second": 10.618, "num_input_tokens_seen": 267840, "step": 855 }, { "epoch": 9.555555555555555, "grad_norm": 0.0017057630466297269, "learning_rate": 3.1542206734221924e-07, "loss": 0.0001, "num_input_tokens_seen": 269376, "step": 860 }, { "epoch": 9.61111111111111, "grad_norm": 0.004398024175316095, "learning_rate": 2.4329828146074095e-07, "loss": 0.0005, "num_input_tokens_seen": 270944, "step": 865 }, { "epoch": 9.666666666666666, "grad_norm": 0.022089840844273567, "learning_rate": 1.8048447862070718e-07, "loss": 0.0001, "num_input_tokens_seen": 272448, "step": 870 }, { "epoch": 9.722222222222221, "grad_norm": 0.0015576216392219067, "learning_rate": 1.2700428052447033e-07, "loss": 0.0001, "num_input_tokens_seen": 273984, "step": 875 }, { "epoch": 9.777777777777779, "grad_norm": 0.021998705342411995, "learning_rate": 8.28777988873486e-08, "loss": 0.0001, "num_input_tokens_seen": 275520, "step": 880 }, { "epoch": 9.833333333333334, "grad_norm": 0.0017123895231634378, "learning_rate": 4.8121627874450625e-08, "loss": 0.0001, "num_input_tokens_seen": 277152, "step": 885 }, { "epoch": 9.88888888888889, "grad_norm": 0.001752789132297039, "learning_rate": 2.2748837860270267e-08, "loss": 0.0001, "num_input_tokens_seen": 278688, "step": 890 }, { "epoch": 9.944444444444445, "grad_norm": 0.0014716936275362968, "learning_rate": 6.768970513457151e-09, "loss": 0.0001, "num_input_tokens_seen": 280256, "step": 895 }, { "epoch": 10.0, "grad_norm": 0.0017592560034245253, "learning_rate": 1.8803520859811406e-10, "loss": 0.0001, "num_input_tokens_seen": 281856, "step": 900 }, { "epoch": 10.0, "eval_loss": 0.24965079128742218, "eval_runtime": 0.9391, "eval_samples_per_second": 42.592, "eval_steps_per_second": 10.648, "num_input_tokens_seen": 281856, "step": 900 }, { "epoch": 10.0, "num_input_tokens_seen": 281856, "step": 900, "total_flos": 1.2691848290107392e+16, "train_loss": 0.195216263138508, "train_runtime": 204.8687, "train_samples_per_second": 17.572, "train_steps_per_second": 4.393 } ], "logging_steps": 5, "max_steps": 900, "num_input_tokens_seen": 281856, "num_train_epochs": 10, "save_steps": 45, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2691848290107392e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }