{
  "best_global_step": 360,
  "best_metric": 0.1140967383980751,
  "best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_copa_1754649795/checkpoint-360",
  "epoch": 10.0,
  "eval_steps": 45,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 211.93502807617188,
      "learning_rate": 2.2222222222222225e-06,
      "loss": 4.5242,
      "num_input_tokens_seen": 1536,
      "step": 5
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 55.84653854370117,
      "learning_rate": 5e-06,
      "loss": 3.1759,
      "num_input_tokens_seen": 3168,
      "step": 10
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 77.70538330078125,
      "learning_rate": 7.777777777777777e-06,
      "loss": 2.3772,
      "num_input_tokens_seen": 4736,
      "step": 15
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 88.52490997314453,
      "learning_rate": 1.0555555555555555e-05,
      "loss": 1.808,
      "num_input_tokens_seen": 6304,
      "step": 20
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 171.03692626953125,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.0693,
      "num_input_tokens_seen": 7840,
      "step": 25
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 217.7040557861328,
      "learning_rate": 1.6111111111111115e-05,
      "loss": 0.7198,
      "num_input_tokens_seen": 9408,
      "step": 30
    },
    {
      "epoch": 0.3888888888888889,
      "grad_norm": 158.4123077392578,
      "learning_rate": 1.888888888888889e-05,
      "loss": 1.0175,
      "num_input_tokens_seen": 10912,
      "step": 35
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 36.769805908203125,
      "learning_rate": 2.1666666666666667e-05,
      "loss": 1.0818,
      "num_input_tokens_seen": 12448,
      "step": 40
    },
    {
      "epoch": 0.5,
      "grad_norm": 28.32155418395996,
      "learning_rate": 2.4444444444444445e-05,
      "loss": 0.5598,
      "num_input_tokens_seen": 14016,
      "step": 45
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.3143084943294525,
      "eval_runtime": 1.1504,
      "eval_samples_per_second": 34.772,
      "eval_steps_per_second": 8.693,
      "num_input_tokens_seen": 14016,
      "step": 45
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 243.9434356689453,
      "learning_rate": 2.7222222222222223e-05,
      "loss": 0.4034,
      "num_input_tokens_seen": 15584,
      "step": 50
    },
    {
      "epoch": 0.6111111111111112,
      "grad_norm": 21.132078170776367,
      "learning_rate": 3e-05,
      "loss": 0.4113,
      "num_input_tokens_seen": 17184,
      "step": 55
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 11.026226043701172,
      "learning_rate": 3.277777777777778e-05,
      "loss": 0.2833,
      "num_input_tokens_seen": 18752,
      "step": 60
    },
    {
      "epoch": 0.7222222222222222,
      "grad_norm": 13.724584579467773,
      "learning_rate": 3.555555555555556e-05,
      "loss": 0.2309,
      "num_input_tokens_seen": 20352,
      "step": 65
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 78.358642578125,
      "learning_rate": 3.8333333333333334e-05,
      "loss": 0.741,
      "num_input_tokens_seen": 21952,
      "step": 70
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 8.49279499053955,
      "learning_rate": 4.111111111111111e-05,
      "loss": 0.2628,
      "num_input_tokens_seen": 23456,
      "step": 75
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 5.491214275360107,
      "learning_rate": 4.388888888888889e-05,
      "loss": 0.3305,
      "num_input_tokens_seen": 25056,
      "step": 80
    },
    {
      "epoch": 0.9444444444444444,
      "grad_norm": 229.71315002441406,
      "learning_rate": 4.666666666666667e-05,
      "loss": 4.0693,
      "num_input_tokens_seen": 26560,
      "step": 85
    },
    {
      "epoch": 1.0,
      "grad_norm": 65.04704284667969,
      "learning_rate": 4.9444444444444446e-05,
      "loss": 1.6129,
      "num_input_tokens_seen": 28096,
      "step": 90
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.28230223059654236,
      "eval_runtime": 1.1699,
      "eval_samples_per_second": 34.191,
      "eval_steps_per_second": 8.548,
      "num_input_tokens_seen": 28096,
      "step": 90
    },
    {
      "epoch": 1.0555555555555556,
      "grad_norm": 2.8092379570007324,
      "learning_rate": 4.9996991493233693e-05,
      "loss": 0.2852,
      "num_input_tokens_seen": 29696,
      "step": 95
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 3.550919532775879,
      "learning_rate": 4.99847706754774e-05,
      "loss": 0.2552,
      "num_input_tokens_seen": 31232,
      "step": 100
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 4.582396507263184,
      "learning_rate": 4.9963154107272295e-05,
      "loss": 0.279,
      "num_input_tokens_seen": 32768,
      "step": 105
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 3.2737157344818115,
      "learning_rate": 4.993214991772563e-05,
      "loss": 0.2366,
      "num_input_tokens_seen": 34304,
      "step": 110
    },
    {
      "epoch": 1.2777777777777777,
      "grad_norm": 5.495439529418945,
      "learning_rate": 4.989176976624511e-05,
      "loss": 0.6461,
      "num_input_tokens_seen": 35872,
      "step": 115
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 14.088470458984375,
      "learning_rate": 4.9842028838154285e-05,
      "loss": 0.3324,
      "num_input_tokens_seen": 37408,
      "step": 120
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 18.816679000854492,
      "learning_rate": 4.978294583898196e-05,
      "loss": 0.2167,
      "num_input_tokens_seen": 38976,
      "step": 125
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 10.976043701171875,
      "learning_rate": 4.971454298742779e-05,
      "loss": 0.2518,
      "num_input_tokens_seen": 40576,
      "step": 130
    },
    {
      "epoch": 1.5,
      "grad_norm": 136.6255340576172,
      "learning_rate": 4.963684600700679e-05,
      "loss": 0.8986,
      "num_input_tokens_seen": 42144,
      "step": 135
    },
    {
      "epoch": 1.5,
      "eval_loss": 0.3364465534687042,
      "eval_runtime": 1.1659,
      "eval_samples_per_second": 34.307,
      "eval_steps_per_second": 8.577,
      "num_input_tokens_seen": 42144,
      "step": 135
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 6.133034706115723,
      "learning_rate": 4.9549884116375714e-05,
      "loss": 0.4373,
      "num_input_tokens_seen": 43680,
      "step": 140
    },
    {
      "epoch": 1.6111111111111112,
      "grad_norm": 3.9126594066619873,
      "learning_rate": 4.9453690018345144e-05,
      "loss": 1.3978,
      "num_input_tokens_seen": 45248,
      "step": 145
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 6.755434513092041,
      "learning_rate": 4.934829988758131e-05,
      "loss": 0.2847,
      "num_input_tokens_seen": 46816,
      "step": 150
    },
    {
      "epoch": 1.7222222222222223,
      "grad_norm": 18.98358154296875,
      "learning_rate": 4.923375335700223e-05,
      "loss": 0.2777,
      "num_input_tokens_seen": 48384,
      "step": 155
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 2.0623528957366943,
      "learning_rate": 4.9110093502873476e-05,
      "loss": 0.1408,
      "num_input_tokens_seen": 49952,
      "step": 160
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 17.358171463012695,
      "learning_rate": 4.897736682860885e-05,
      "loss": 0.0471,
      "num_input_tokens_seen": 51520,
      "step": 165
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 22.74451446533203,
      "learning_rate": 4.883562324728241e-05,
      "loss": 0.1595,
      "num_input_tokens_seen": 53024,
      "step": 170
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 7.793264865875244,
      "learning_rate": 4.868491606285823e-05,
      "loss": 0.2312,
      "num_input_tokens_seen": 54592,
      "step": 175
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.0166492462158203,
      "learning_rate": 4.8525301950144894e-05,
      "loss": 0.162,
      "num_input_tokens_seen": 56128,
      "step": 180
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.12522773444652557,
      "eval_runtime": 1.1795,
      "eval_samples_per_second": 33.912,
      "eval_steps_per_second": 8.478,
      "num_input_tokens_seen": 56128,
      "step": 180
    },
    {
      "epoch": 2.0555555555555554,
      "grad_norm": 3.614006519317627,
      "learning_rate": 4.835684093348244e-05,
      "loss": 0.0719,
      "num_input_tokens_seen": 57696,
      "step": 185
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 11.736072540283203,
      "learning_rate": 4.817959636416969e-05,
      "loss": 0.3451,
      "num_input_tokens_seen": 59264,
      "step": 190
    },
    {
      "epoch": 2.1666666666666665,
      "grad_norm": 0.017602359876036644,
      "learning_rate": 4.7993634896640394e-05,
      "loss": 0.0367,
      "num_input_tokens_seen": 60864,
      "step": 195
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 128.3651885986328,
      "learning_rate": 4.779902646339722e-05,
      "loss": 0.1548,
      "num_input_tokens_seen": 62464,
      "step": 200
    },
    {
      "epoch": 2.2777777777777777,
      "grad_norm": 4.051515579223633,
      "learning_rate": 4.759584424871302e-05,
      "loss": 0.1765,
      "num_input_tokens_seen": 64032,
      "step": 205
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.603679895401001,
      "learning_rate": 4.7384164661109176e-05,
      "loss": 0.1182,
      "num_input_tokens_seen": 65568,
      "step": 210
    },
    {
      "epoch": 2.388888888888889,
      "grad_norm": 0.7103855609893799,
      "learning_rate": 4.7164067304621536e-05,
      "loss": 0.0868,
      "num_input_tokens_seen": 67104,
      "step": 215
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 3.1308445930480957,
      "learning_rate": 4.693563494886455e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 68704,
      "step": 220
    },
    {
      "epoch": 2.5,
      "grad_norm": 3.5392675399780273,
      "learning_rate": 4.669895349790502e-05,
      "loss": 0.0545,
      "num_input_tokens_seen": 70272,
      "step": 225
    },
    {
      "epoch": 2.5,
      "eval_loss": 0.1658935248851776,
      "eval_runtime": 1.1947,
      "eval_samples_per_second": 33.481,
      "eval_steps_per_second": 8.37,
      "num_input_tokens_seen": 70272,
      "step": 225
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 1.4086837768554688,
      "learning_rate": 4.645411195795709e-05,
      "loss": 0.0091,
      "num_input_tokens_seen": 71808,
      "step": 230
    },
    {
      "epoch": 2.611111111111111,
      "grad_norm": 6.530168056488037,
      "learning_rate": 4.620120240391065e-05,
      "loss": 0.1379,
      "num_input_tokens_seen": 73408,
      "step": 235
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 4.456235885620117,
      "learning_rate": 4.5940319944705736e-05,
      "loss": 0.1022,
      "num_input_tokens_seen": 74912,
      "step": 240
    },
    {
      "epoch": 2.7222222222222223,
      "grad_norm": 4.331588268280029,
      "learning_rate": 4.567156268756594e-05,
      "loss": 0.1812,
      "num_input_tokens_seen": 76544,
      "step": 245
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.16590723395347595,
      "learning_rate": 4.539503170110431e-05,
      "loss": 0.0448,
      "num_input_tokens_seen": 78112,
      "step": 250
    },
    {
      "epoch": 2.8333333333333335,
      "grad_norm": 0.05489581450819969,
      "learning_rate": 4.5110830977315556e-05,
      "loss": 0.004,
      "num_input_tokens_seen": 79712,
      "step": 255
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.06641557812690735,
      "learning_rate": 4.4819067392468944e-05,
      "loss": 0.076,
      "num_input_tokens_seen": 81280,
      "step": 260
    },
    {
      "epoch": 2.9444444444444446,
      "grad_norm": 0.9573344588279724,
      "learning_rate": 4.4519850666916484e-05,
      "loss": 0.0639,
      "num_input_tokens_seen": 82848,
      "step": 265
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.09293336421251297,
      "learning_rate": 4.4213293323831585e-05,
      "loss": 0.0493,
      "num_input_tokens_seen": 84352,
      "step": 270
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.11682591587305069,
      "eval_runtime": 1.1636,
      "eval_samples_per_second": 34.375,
      "eval_steps_per_second": 8.594,
      "num_input_tokens_seen": 84352,
      "step": 270
    },
    {
      "epoch": 3.0555555555555554,
      "grad_norm": 2.5821824073791504,
      "learning_rate": 4.38995106468937e-05,
      "loss": 0.011,
      "num_input_tokens_seen": 85920,
      "step": 275
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.03350764885544777,
      "learning_rate": 4.357862063693486e-05,
      "loss": 0.0094,
      "num_input_tokens_seen": 87520,
      "step": 280
    },
    {
      "epoch": 3.1666666666666665,
      "grad_norm": 0.02359146997332573,
      "learning_rate": 4.325074396756437e-05,
      "loss": 0.0009,
      "num_input_tokens_seen": 89088,
      "step": 285
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.9292595386505127,
      "learning_rate": 4.2916003939788403e-05,
      "loss": 0.0859,
      "num_input_tokens_seen": 90688,
      "step": 290
    },
    {
      "epoch": 3.2777777777777777,
      "grad_norm": 0.3055465817451477,
      "learning_rate": 4.257452643564155e-05,
      "loss": 0.1095,
      "num_input_tokens_seen": 92160,
      "step": 295
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 8.65383243560791,
      "learning_rate": 4.22264398708477e-05,
      "loss": 0.1926,
      "num_input_tokens_seen": 93760,
      "step": 300
    },
    {
      "epoch": 3.388888888888889,
      "grad_norm": 5.946500301361084,
      "learning_rate": 4.1871875146528195e-05,
      "loss": 0.1205,
      "num_input_tokens_seen": 95360,
      "step": 305
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.05868682637810707,
      "learning_rate": 4.1510965599975196e-05,
      "loss": 0.0432,
      "num_input_tokens_seen": 96928,
      "step": 310
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.07760920375585556,
      "learning_rate": 4.114384695450906e-05,
      "loss": 0.0166,
      "num_input_tokens_seen": 98464,
      "step": 315
    },
    {
      "epoch": 3.5,
      "eval_loss": 0.16605496406555176,
      "eval_runtime": 1.1874,
      "eval_samples_per_second": 33.688,
      "eval_steps_per_second": 8.422,
      "num_input_tokens_seen": 98464,
      "step": 315
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 4.408195972442627,
      "learning_rate": 4.077065726843828e-05,
      "loss": 0.0249,
      "num_input_tokens_seen": 100064,
      "step": 320
    },
    {
      "epoch": 3.611111111111111,
      "grad_norm": 0.08003110438585281,
      "learning_rate": 4.039153688314145e-05,
      "loss": 0.056,
      "num_input_tokens_seen": 101600,
      "step": 325
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.014739356003701687,
      "learning_rate": 4.000662837029062e-05,
      "loss": 0.01,
      "num_input_tokens_seen": 103200,
      "step": 330
    },
    {
      "epoch": 3.7222222222222223,
      "grad_norm": 4.2343668937683105,
      "learning_rate": 3.961607647823583e-05,
      "loss": 0.1279,
      "num_input_tokens_seen": 104768,
      "step": 335
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.9541974663734436,
      "learning_rate": 3.9220028077571295e-05,
      "loss": 0.043,
      "num_input_tokens_seen": 106304,
      "step": 340
    },
    {
      "epoch": 3.8333333333333335,
      "grad_norm": 0.029086820781230927,
      "learning_rate": 3.881863210590332e-05,
      "loss": 0.0006,
      "num_input_tokens_seen": 107904,
      "step": 345
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 4.72139835357666,
      "learning_rate": 3.841203951184095e-05,
      "loss": 0.1241,
      "num_input_tokens_seen": 109408,
      "step": 350
    },
    {
      "epoch": 3.9444444444444446,
      "grad_norm": 7.2094197273254395,
      "learning_rate": 3.8000403198230387e-05,
      "loss": 0.0363,
      "num_input_tokens_seen": 111008,
      "step": 355
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.01099959947168827,
      "learning_rate": 3.75838779646545e-05,
      "loss": 0.0146,
      "num_input_tokens_seen": 112576,
      "step": 360
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.1140967383980751,
      "eval_runtime": 1.1728,
      "eval_samples_per_second": 34.107,
      "eval_steps_per_second": 8.527,
      "num_input_tokens_seen": 112576,
      "step": 360
    },
    {
      "epoch": 4.055555555555555,
      "grad_norm": 0.06453730165958405,
      "learning_rate": 3.7162620449219e-05,
      "loss": 0.1621,
      "num_input_tokens_seen": 114144,
      "step": 365
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 1.2406953573226929,
      "learning_rate": 3.673678906964727e-05,
      "loss": 0.0137,
      "num_input_tokens_seen": 115712,
      "step": 370
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.040748681873083115,
      "learning_rate": 3.630654396370594e-05,
      "loss": 0.0514,
      "num_input_tokens_seen": 117216,
      "step": 375
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.04640405625104904,
      "learning_rate": 3.5872046928983626e-05,
      "loss": 0.0012,
      "num_input_tokens_seen": 118816,
      "step": 380
    },
    {
      "epoch": 4.277777777777778,
      "grad_norm": 0.4107241928577423,
      "learning_rate": 3.543346136204545e-05,
      "loss": 0.0132,
      "num_input_tokens_seen": 120352,
      "step": 385
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.03547825664281845,
      "learning_rate": 3.499095219698631e-05,
      "loss": 0.0107,
      "num_input_tokens_seen": 121920,
      "step": 390
    },
    {
      "epoch": 4.388888888888889,
      "grad_norm": 4.26585578918457,
      "learning_rate": 3.454468584340588e-05,
      "loss": 0.0743,
      "num_input_tokens_seen": 123456,
      "step": 395
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.04872285574674606,
      "learning_rate": 3.409483012382879e-05,
      "loss": 0.0116,
      "num_input_tokens_seen": 125056,
      "step": 400
    },
    {
      "epoch": 4.5,
      "grad_norm": 4.138394355773926,
      "learning_rate": 3.364155421059342e-05,
      "loss": 0.1392,
      "num_input_tokens_seen": 126624,
      "step": 405
    },
    {
      "epoch": 4.5,
      "eval_loss": 0.12619957327842712,
      "eval_runtime": 1.1955,
      "eval_samples_per_second": 33.457,
      "eval_steps_per_second": 8.364,
      "num_input_tokens_seen": 126624,
      "step": 405
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.07925529032945633,
      "learning_rate": 3.318502856223311e-05,
      "loss": 0.0013,
      "num_input_tokens_seen": 128224,
      "step": 410
    },
    {
      "epoch": 4.611111111111111,
      "grad_norm": 0.07759137451648712,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.0181,
      "num_input_tokens_seen": 129728,
      "step": 415
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.020686015486717224,
      "learning_rate": 3.2262915940171376e-05,
      "loss": 0.002,
      "num_input_tokens_seen": 131328,
      "step": 420
    },
    {
      "epoch": 4.722222222222222,
      "grad_norm": 0.10983490198850632,
      "learning_rate": 3.1797675735315455e-05,
      "loss": 0.0026,
      "num_input_tokens_seen": 132896,
      "step": 425
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.06394976377487183,
      "learning_rate": 3.132987920262005e-05,
      "loss": 0.0024,
      "num_input_tokens_seen": 134496,
      "step": 430
    },
    {
      "epoch": 4.833333333333333,
      "grad_norm": 0.006211922504007816,
      "learning_rate": 3.085970226122962e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 136064,
      "step": 435
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.00576823391020298,
      "learning_rate": 3.0387321725463e-05,
      "loss": 0.0282,
      "num_input_tokens_seen": 137664,
      "step": 440
    },
    {
      "epoch": 4.944444444444445,
      "grad_norm": 0.004720824770629406,
      "learning_rate": 2.9912915238320754e-05,
      "loss": 0.0188,
      "num_input_tokens_seen": 139232,
      "step": 445
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.12190665304660797,
      "learning_rate": 2.9436661204680882e-05,
      "loss": 0.0007,
      "num_input_tokens_seen": 140832,
      "step": 450
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.16102315485477448,
      "eval_runtime": 1.178,
      "eval_samples_per_second": 33.955,
      "eval_steps_per_second": 8.489,
      "num_input_tokens_seen": 140832,
      "step": 450
    },
    {
      "epoch": 5.055555555555555,
      "grad_norm": 0.12717540562152863,
      "learning_rate": 2.8958738724208072e-05,
      "loss": 0.0051,
      "num_input_tokens_seen": 142368,
      "step": 455
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.004431543871760368,
      "learning_rate": 2.8479327524001636e-05,
      "loss": 0.0012,
      "num_input_tokens_seen": 144000,
      "step": 460
    },
    {
      "epoch": 5.166666666666667,
      "grad_norm": 0.4021238684654236,
      "learning_rate": 2.7998607891007495e-05,
      "loss": 0.0018,
      "num_input_tokens_seen": 145632,
      "step": 465
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.034988883882761,
      "learning_rate": 2.7516760604219617e-05,
      "loss": 0.0024,
      "num_input_tokens_seen": 147168,
      "step": 470
    },
    {
      "epoch": 5.277777777777778,
      "grad_norm": 0.008324529975652695,
      "learning_rate": 2.7033966866696457e-05,
      "loss": 0.0002,
      "num_input_tokens_seen": 148736,
      "step": 475
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.012731038965284824,
      "learning_rate": 2.6550408237417885e-05,
      "loss": 0.0002,
      "num_input_tokens_seen": 150304,
      "step": 480
    },
    {
      "epoch": 5.388888888888889,
      "grad_norm": 0.09622369706630707,
      "learning_rate": 2.6066266563008267e-05,
      "loss": 0.0004,
      "num_input_tokens_seen": 151872,
      "step": 485
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 11.24488639831543,
      "learning_rate": 2.5581723909351406e-05,
      "loss": 0.0472,
      "num_input_tokens_seen": 153472,
      "step": 490
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.008177004754543304,
      "learning_rate": 2.5096962493123012e-05,
      "loss": 0.0002,
      "num_input_tokens_seen": 154976,
      "step": 495
    },
    {
      "epoch": 5.5,
      "eval_loss": 0.29022911190986633,
      "eval_runtime": 1.2018,
      "eval_samples_per_second": 33.282,
      "eval_steps_per_second": 8.321,
      "num_input_tokens_seen": 154976,
      "step": 495
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.12855511903762817,
      "learning_rate": 2.461216461326642e-05,
      "loss": 0.0007,
      "num_input_tokens_seen": 156544,
      "step": 500
    },
    {
      "epoch": 5.611111111111111,
      "grad_norm": 0.004401104990392923,
      "learning_rate": 2.4127512582437485e-05,
      "loss": 0.0186,
      "num_input_tokens_seen": 158112,
      "step": 505
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.11994586884975433,
      "learning_rate": 2.364318865844416e-05,
      "loss": 0.0023,
      "num_input_tokens_seen": 159680,
      "step": 510
    },
    {
      "epoch": 5.722222222222222,
      "grad_norm": 0.0036123669706285,
      "learning_rate": 2.3159374975706884e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 161312,
      "step": 515
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.005037173628807068,
      "learning_rate": 2.2676253476765196e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 162880,
      "step": 520
    },
    {
      "epoch": 5.833333333333333,
      "grad_norm": 0.004541746340692043,
      "learning_rate": 2.2194005843856636e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 164448,
      "step": 525
    },
    {
      "epoch": 5.888888888888889,
      "grad_norm": 0.003595268353819847,
      "learning_rate": 2.1712813430593436e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 166016,
      "step": 530
    },
    {
      "epoch": 5.944444444444445,
      "grad_norm": 0.05647879093885422,
      "learning_rate": 2.1232857193762924e-05,
      "loss": 0.1753,
      "num_input_tokens_seen": 167552,
      "step": 535
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.024071840569376945,
      "learning_rate": 2.0754317625276983e-05,
      "loss": 0.0003,
      "num_input_tokens_seen": 169056,
      "step": 540
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.18793392181396484,
      "eval_runtime": 1.1995,
      "eval_samples_per_second": 33.347,
      "eval_steps_per_second": 8.337,
      "num_input_tokens_seen": 169056,
      "step": 540
    },
    {
      "epoch": 6.055555555555555,
      "grad_norm": 0.003708529518917203,
      "learning_rate": 2.02773746842965e-05,
      "loss": 0.0003,
      "num_input_tokens_seen": 170592,
      "step": 545
    },
    {
      "epoch": 6.111111111111111,
      "grad_norm": 0.029693368822336197,
      "learning_rate": 1.980220772955602e-05,
      "loss": 0.0837,
      "num_input_tokens_seen": 172192,
      "step": 550
    },
    {
      "epoch": 6.166666666666667,
      "grad_norm": 1.9385651350021362,
      "learning_rate": 1.932899545191433e-05,
      "loss": 0.0069,
      "num_input_tokens_seen": 173792,
      "step": 555
    },
    {
      "epoch": 6.222222222222222,
      "grad_norm": 0.008497129194438457,
      "learning_rate": 1.8857915807156092e-05,
      "loss": 0.0003,
      "num_input_tokens_seen": 175360,
      "step": 560
    },
    {
      "epoch": 6.277777777777778,
      "grad_norm": 0.005275554955005646,
      "learning_rate": 1.838914594906995e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 176992,
      "step": 565
    },
    {
      "epoch": 6.333333333333333,
      "grad_norm": 0.007108105346560478,
      "learning_rate": 1.792286216282824e-05,
      "loss": 0.0007,
      "num_input_tokens_seen": 178592,
      "step": 570
    },
    {
      "epoch": 6.388888888888889,
      "grad_norm": 0.003662517061457038,
      "learning_rate": 1.7459239798693364e-05,
      "loss": 0.0003,
      "num_input_tokens_seen": 180128,
      "step": 575
    },
    {
      "epoch": 6.444444444444445,
      "grad_norm": 0.03587842732667923,
      "learning_rate": 1.699845320607571e-05,
      "loss": 0.0002,
      "num_input_tokens_seen": 181632,
      "step": 580
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.5192928314208984,
      "learning_rate": 1.6540675667967974e-05,
      "loss": 0.0013,
      "num_input_tokens_seen": 183200,
      "step": 585
    },
    {
      "epoch": 6.5,
      "eval_loss": 0.23766329884529114,
      "eval_runtime": 1.196,
      "eval_samples_per_second": 33.445,
      "eval_steps_per_second": 8.361,
      "num_input_tokens_seen": 183200,
      "step": 585
    },
    {
      "epoch": 6.555555555555555,
      "grad_norm": 0.02627461589872837,
      "learning_rate": 1.60860793357805e-05,
      "loss": 0.0004,
      "num_input_tokens_seen": 184800,
      "step": 590
    },
    {
      "epoch": 6.611111111111111,
      "grad_norm": 0.007544673513621092,
      "learning_rate": 1.56348351646022e-05,
      "loss": 0.0285,
      "num_input_tokens_seen": 186336,
      "step": 595
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.0033945958130061626,
      "learning_rate": 1.5187112848911323e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 187904,
      "step": 600
    },
    {
      "epoch": 6.722222222222222,
      "grad_norm": 0.004758753813803196,
      "learning_rate": 1.47430807587603e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 189472,
      "step": 605
    },
    {
      "epoch": 6.777777777777778,
      "grad_norm": 0.33814820647239685,
      "learning_rate": 1.430290587645865e-05,
      "loss": 0.0013,
      "num_input_tokens_seen": 191072,
      "step": 610
    },
    {
      "epoch": 6.833333333333333,
      "grad_norm": 0.002982664154842496,
      "learning_rate": 1.3866753733777765e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 192608,
      "step": 615
    },
    {
      "epoch": 6.888888888888889,
      "grad_norm": 0.003444693749770522,
      "learning_rate": 1.343478834970121e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 194208,
      "step": 620
    },
    {
      "epoch": 6.944444444444445,
      "grad_norm": 0.0027797692455351353,
      "learning_rate": 1.3007172168743854e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 195776,
      "step": 625
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.00582587905228138,
      "learning_rate": 1.2584065999863102e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 197344,
      "step": 630
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.24828998744487762,
      "eval_runtime": 1.2102,
      "eval_samples_per_second": 33.053,
      "eval_steps_per_second": 8.263,
      "num_input_tokens_seen": 197344,
      "step": 630
    },
    {
      "epoch": 7.055555555555555,
      "grad_norm": 0.012683599255979061,
      "learning_rate": 1.2165628955985314e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 198944,
      "step": 635
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.0020441152155399323,
      "learning_rate": 1.175201839416988e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 200512,
      "step": 640
    },
    {
      "epoch": 7.166666666666667,
      "grad_norm": 0.0023130401968955994,
      "learning_rate": 1.1343389856433658e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 202016,
      "step": 645
    },
    {
      "epoch": 7.222222222222222,
      "grad_norm": 0.0022213098127394915,
      "learning_rate": 1.0939897011258001e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 203648,
      "step": 650
    },
    {
      "epoch": 7.277777777777778,
      "grad_norm": 0.002087386092171073,
      "learning_rate": 1.0541691595800337e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 205184,
      "step": 655
    },
    {
      "epoch": 7.333333333333333,
      "grad_norm": 0.0022648421581834555,
      "learning_rate": 1.0148923358832022e-05,
      "loss": 0.0001,
      "num_input_tokens_seen": 206720,
      "step": 660
    },
    {
      "epoch": 7.388888888888889,
      "grad_norm": 0.004804587922990322,
      "learning_rate": 9.761740004423927e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 208320,
      "step": 665
    },
    {
      "epoch": 7.444444444444445,
      "grad_norm": 0.0030888738110661507,
      "learning_rate": 9.380287136401e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 209856,
      "step": 670
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.0017403181409463286,
      "learning_rate": 9.00470820358663e-06,
      "loss": 0.0002,
      "num_input_tokens_seen": 211392,
      "step": 675
    },
    {
      "epoch": 7.5,
      "eval_loss": 0.25386351346969604,
      "eval_runtime": 1.1996,
      "eval_samples_per_second": 33.346,
      "eval_steps_per_second": 8.336,
      "num_input_tokens_seen": 211392,
      "step": 675
    },
    {
      "epoch": 7.555555555555555,
      "grad_norm": 0.0020435000769793987,
      "learning_rate": 8.635144445857406e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 212960,
      "step": 680
    },
    {
      "epoch": 7.611111111111111,
      "grad_norm": 0.004450716078281403,
      "learning_rate": 8.271734841028553e-06,
      "loss": 0.0002,
      "num_input_tokens_seen": 214528,
      "step": 685
    },
    {
      "epoch": 7.666666666666667,
      "grad_norm": 0.002327506896108389,
      "learning_rate": 7.914616052590071e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 216000,
      "step": 690
    },
    {
      "epoch": 7.722222222222222,
      "grad_norm": 0.0016173458425328135,
      "learning_rate": 7.563922378313218e-06,
      "loss": 0.0097,
      "num_input_tokens_seen": 217632,
      "step": 695
    },
    {
      "epoch": 7.777777777777778,
      "grad_norm": 0.0022673553321510553,
      "learning_rate": 7.219785699746573e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 219232,
      "step": 700
    },
    {
      "epoch": 7.833333333333333,
      "grad_norm": 0.008347373455762863,
      "learning_rate": 6.882335432620779e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 220800,
      "step": 705
    },
    {
      "epoch": 7.888888888888889,
      "grad_norm": 0.002359850564971566,
      "learning_rate": 6.55169847818059e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 222368,
      "step": 710
    },
    {
      "epoch": 7.944444444444445,
      "grad_norm": 0.002948438050225377,
      "learning_rate": 6.22799917546252e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 223968,
      "step": 715
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.0022273629438132048,
      "learning_rate": 5.9113592545359945e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 225536,
      "step": 720
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.2521117627620697,
      "eval_runtime": 1.1746,
      "eval_samples_per_second": 34.053,
      "eval_steps_per_second": 8.513,
      "num_input_tokens_seen": 225536,
      "step": 720
    },
    {
      "epoch": 8.055555555555555,
      "grad_norm": 0.0022272937931120396,
      "learning_rate": 5.601897790725643e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 227168,
      "step": 725
    },
    {
      "epoch": 8.11111111111111,
      "grad_norm": 0.001649300567805767,
      "learning_rate": 5.299731159831953e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 228704,
      "step": 730
    },
    {
      "epoch": 8.166666666666666,
      "grad_norm": 0.0024456402752548456,
      "learning_rate": 5.004972994367102e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 230336,
      "step": 735
    },
    {
      "epoch": 8.222222222222221,
      "grad_norm": 0.022852079942822456,
      "learning_rate": 4.7177341408224e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 231936,
      "step": 740
    },
    {
      "epoch": 8.277777777777779,
      "grad_norm": 0.0026604924350976944,
      "learning_rate": 4.438122617983443e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 233472,
      "step": 745
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.0019071005517616868,
      "learning_rate": 4.166243576308712e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 235040,
      "step": 750
    },
    {
      "epoch": 8.38888888888889,
      "grad_norm": 0.007083934266120195,
      "learning_rate": 3.9021992583867325e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 236608,
      "step": 755
    },
    {
      "epoch": 8.444444444444445,
      "grad_norm": 0.00216863676905632,
      "learning_rate": 3.6460889604868626e-06,
      "loss": 0.0008,
      "num_input_tokens_seen": 238144,
      "step": 760
    },
    {
      "epoch": 8.5,
      "grad_norm": 0.0029811670538038015,
      "learning_rate": 3.398008995217988e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 239680,
      "step": 765
    },
    {
      "epoch": 8.5,
      "eval_loss": 0.24617867171764374,
      "eval_runtime": 1.205,
      "eval_samples_per_second": 33.194,
      "eval_steps_per_second": 8.299,
      "num_input_tokens_seen": 239680,
      "step": 765
    },
    {
      "epoch": 8.555555555555555,
      "grad_norm": 0.0019673961214721203,
      "learning_rate": 3.158052655309332e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 241280,
      "step": 770
    },
    {
      "epoch": 8.61111111111111,
      "grad_norm": 0.0018027002224698663,
      "learning_rate": 2.9263101785268254e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 242816,
      "step": 775
    },
    {
      "epoch": 8.666666666666666,
      "grad_norm": 0.0017442210810258985,
      "learning_rate": 2.7028687137384267e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 244352,
      "step": 780
    },
    {
      "epoch": 8.722222222222221,
      "grad_norm": 0.001671099103987217,
      "learning_rate": 2.487812288140945e-06,
      "loss": 0.0002,
      "num_input_tokens_seen": 245856,
      "step": 785
    },
    {
      "epoch": 8.777777777777779,
      "grad_norm": 0.012044212780892849,
      "learning_rate": 2.281221775660894e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 247456,
      "step": 790
    },
    {
      "epoch": 8.833333333333334,
      "grad_norm": 0.001784978318028152,
      "learning_rate": 2.0831748665410765e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 248992,
      "step": 795
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.001969333505257964,
      "learning_rate": 1.893746038124497e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 250528,
      "step": 800
    },
    {
      "epoch": 8.944444444444445,
      "grad_norm": 0.002932202536612749,
      "learning_rate": 1.713006526846439e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 252128,
      "step": 805
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.001777714816853404,
      "learning_rate": 1.541024301445404e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 253696,
      "step": 810
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.2544807493686676,
      "eval_runtime": 1.1897,
      "eval_samples_per_second": 33.622,
      "eval_steps_per_second": 8.405,
      "num_input_tokens_seen": 253696,
      "step": 810
    },
    {
      "epoch": 9.055555555555555,
      "grad_norm": 0.0017283963970839977,
      "learning_rate": 1.3778640374027985e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 255296,
      "step": 815
    },
    {
      "epoch": 9.11111111111111,
      "grad_norm": 0.0016462679486721754,
      "learning_rate": 1.2235870926211619e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 256896,
      "step": 820
    },
    {
      "epoch": 9.166666666666666,
      "grad_norm": 0.0017024942208081484,
      "learning_rate": 1.0782514843499653e-06,
      "loss": 0.0001,
      "num_input_tokens_seen": 258432,
      "step": 825
    },
    {
      "epoch": 9.222222222222221,
      "grad_norm": 0.0017732703126966953,
      "learning_rate": 9.419118673676924e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 260000,
      "step": 830
    },
    {
      "epoch": 9.277777777777779,
      "grad_norm": 0.0017788108671084046,
      "learning_rate": 8.146195134284052e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 261568,
      "step": 835
    },
    {
      "epoch": 9.333333333333334,
      "grad_norm": 0.001521819387562573,
      "learning_rate": 6.964222919805391e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 263200,
      "step": 840
    },
    {
      "epoch": 9.38888888888889,
      "grad_norm": 0.07402946054935455,
      "learning_rate": 5.87364652165176e-07,
      "loss": 0.0002,
      "num_input_tokens_seen": 264736,
      "step": 845
    },
    {
      "epoch": 9.444444444444445,
      "grad_norm": 0.005844974424690008,
      "learning_rate": 4.874876061005173e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 266304,
      "step": 850
    },
    {
      "epoch": 9.5,
      "grad_norm": 0.0018154801800847054,
      "learning_rate": 3.9682871345891883e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 267840,
      "step": 855
    },
    {
      "epoch": 9.5,
      "eval_loss": 0.24860987067222595,
      "eval_runtime": 1.1754,
      "eval_samples_per_second": 34.031,
      "eval_steps_per_second": 8.508,
      "num_input_tokens_seen": 267840,
      "step": 855
    },
    {
      "epoch": 9.555555555555555,
      "grad_norm": 0.0017057630466297269,
      "learning_rate": 3.1542206734221924e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 269376,
      "step": 860
    },
    {
      "epoch": 9.61111111111111,
      "grad_norm": 0.004398024175316095,
      "learning_rate": 2.4329828146074095e-07,
      "loss": 0.0005,
      "num_input_tokens_seen": 270944,
      "step": 865
    },
    {
      "epoch": 9.666666666666666,
      "grad_norm": 0.022089840844273567,
      "learning_rate": 1.8048447862070718e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 272448,
      "step": 870
    },
    {
      "epoch": 9.722222222222221,
      "grad_norm": 0.0015576216392219067,
      "learning_rate": 1.2700428052447033e-07,
      "loss": 0.0001,
      "num_input_tokens_seen": 273984,
      "step": 875
    },
    {
      "epoch": 9.777777777777779,
      "grad_norm": 0.021998705342411995,
      "learning_rate": 8.28777988873486e-08,
      "loss": 0.0001,
      "num_input_tokens_seen": 275520,
      "step": 880
    },
    {
      "epoch": 9.833333333333334,
      "grad_norm": 0.0017123895231634378,
      "learning_rate": 4.8121627874450625e-08,
      "loss": 0.0001,
      "num_input_tokens_seen": 277152,
      "step": 885
    },
    {
      "epoch": 9.88888888888889,
      "grad_norm": 0.001752789132297039,
      "learning_rate": 2.2748837860270267e-08,
      "loss": 0.0001,
      "num_input_tokens_seen": 278688,
      "step": 890
    },
    {
      "epoch": 9.944444444444445,
      "grad_norm": 0.0014716936275362968,
      "learning_rate": 6.768970513457151e-09,
      "loss": 0.0001,
      "num_input_tokens_seen": 280256,
      "step": 895
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.0017592560034245253,
      "learning_rate": 1.8803520859811406e-10,
      "loss": 0.0001,
      "num_input_tokens_seen": 281856,
      "step": 900
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.24965079128742218,
      "eval_runtime": 1.195,
      "eval_samples_per_second": 33.474,
      "eval_steps_per_second": 8.368,
      "num_input_tokens_seen": 281856,
      "step": 900
    },
    {
      "epoch": 10.0,
      "num_input_tokens_seen": 281856,
      "step": 900,
      "total_flos": 1.2691848290107392e+16,
      "train_loss": 0.195216263138508,
      "train_runtime": 253.6916,
      "train_samples_per_second": 14.19,
      "train_steps_per_second": 3.548
    }
  ],
  "logging_steps": 5,
  "max_steps": 900,
  "num_input_tokens_seen": 281856,
  "num_train_epochs": 10,
  "save_steps": 45,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2691848290107392e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}