| { | |
| "best_global_step": 1800, | |
| "best_metric": 0.45277419686317444, | |
| "best_model_checkpoint": "checkpoints/rft-gemma-2-2b-it-math50k/math50k/gemma-2-2b-it-step-1/checkpoint-1800", | |
| "epoch": 0.9997493106041614, | |
| "eval_steps": 200, | |
| "global_step": 1994, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00250689395838556, | |
| "grad_norm": 26.5, | |
| "learning_rate": 2.5e-08, | |
| "loss": 1.6194, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.00501378791677112, | |
| "grad_norm": 24.0, | |
| "learning_rate": 5e-08, | |
| "loss": 1.5561, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.007520681875156681, | |
| "grad_norm": 26.0, | |
| "learning_rate": 7.5e-08, | |
| "loss": 1.591, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01002757583354224, | |
| "grad_norm": 26.625, | |
| "learning_rate": 1e-07, | |
| "loss": 1.5696, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.012534469791927802, | |
| "grad_norm": 25.125, | |
| "learning_rate": 1.25e-07, | |
| "loss": 1.6343, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.015041363750313362, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.5e-07, | |
| "loss": 1.5915, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.017548257708698924, | |
| "grad_norm": 27.75, | |
| "learning_rate": 1.75e-07, | |
| "loss": 1.6102, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.02005515166708448, | |
| "grad_norm": 28.0, | |
| "learning_rate": 2e-07, | |
| "loss": 1.5935, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.022562045625470043, | |
| "grad_norm": 25.125, | |
| "learning_rate": 2.25e-07, | |
| "loss": 1.5255, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.025068939583855605, | |
| "grad_norm": 25.25, | |
| "learning_rate": 2.5e-07, | |
| "loss": 1.5735, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.027575833542241163, | |
| "grad_norm": 26.5, | |
| "learning_rate": 2.75e-07, | |
| "loss": 1.6086, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.030082727500626724, | |
| "grad_norm": 28.625, | |
| "learning_rate": 3e-07, | |
| "loss": 1.5682, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.032589621459012286, | |
| "grad_norm": 23.375, | |
| "learning_rate": 3.25e-07, | |
| "loss": 1.5131, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03509651541739785, | |
| "grad_norm": 25.875, | |
| "learning_rate": 3.5e-07, | |
| "loss": 1.5251, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0376034093757834, | |
| "grad_norm": 23.875, | |
| "learning_rate": 3.75e-07, | |
| "loss": 1.4799, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04011030333416896, | |
| "grad_norm": 23.0, | |
| "learning_rate": 4e-07, | |
| "loss": 1.5335, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.042617197292554525, | |
| "grad_norm": 22.875, | |
| "learning_rate": 4.2499999999999995e-07, | |
| "loss": 1.491, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.045124091250940086, | |
| "grad_norm": 24.5, | |
| "learning_rate": 4.5e-07, | |
| "loss": 1.4991, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04763098520932565, | |
| "grad_norm": 22.875, | |
| "learning_rate": 4.7499999999999995e-07, | |
| "loss": 1.4735, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.05013787916771121, | |
| "grad_norm": 24.5, | |
| "learning_rate": 5e-07, | |
| "loss": 1.4697, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.052644773126096764, | |
| "grad_norm": 22.875, | |
| "learning_rate": 5.25e-07, | |
| "loss": 1.3817, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.055151667084482325, | |
| "grad_norm": 22.0, | |
| "learning_rate": 5.5e-07, | |
| "loss": 1.4256, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.05765856104286789, | |
| "grad_norm": 20.375, | |
| "learning_rate": 5.749999999999999e-07, | |
| "loss": 1.3805, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.06016545500125345, | |
| "grad_norm": 21.625, | |
| "learning_rate": 6e-07, | |
| "loss": 1.3207, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06267234895963901, | |
| "grad_norm": 20.125, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": 1.2704, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.06517924291802457, | |
| "grad_norm": 19.25, | |
| "learning_rate": 6.5e-07, | |
| "loss": 1.2073, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.06768613687641013, | |
| "grad_norm": 19.75, | |
| "learning_rate": 6.75e-07, | |
| "loss": 1.1496, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.0701930308347957, | |
| "grad_norm": 18.5, | |
| "learning_rate": 7e-07, | |
| "loss": 1.0701, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07269992479318124, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 7.249999999999999e-07, | |
| "loss": 0.9889, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.0752068187515668, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.9031, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.07771371270995237, | |
| "grad_norm": 12.875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.8469, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.08022060666833793, | |
| "grad_norm": 14.0, | |
| "learning_rate": 8e-07, | |
| "loss": 0.8162, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08272750062672349, | |
| "grad_norm": 10.25, | |
| "learning_rate": 8.249999999999999e-07, | |
| "loss": 0.7473, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.08523439458510905, | |
| "grad_norm": 7.9375, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.6628, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.08774128854349461, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.6392, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.09024818250188017, | |
| "grad_norm": 5.0, | |
| "learning_rate": 9e-07, | |
| "loss": 0.5986, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09275507646026573, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 9.25e-07, | |
| "loss": 0.5834, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.0952619704186513, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.5458, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.09776886437703686, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 9.75e-07, | |
| "loss": 0.5308, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.10027575833542242, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.5189, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10027575833542242, | |
| "eval_loss": 0.5631369948387146, | |
| "eval_runtime": 23.3525, | |
| "eval_samples_per_second": 323.948, | |
| "eval_steps_per_second": 20.255, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10278265229380797, | |
| "grad_norm": 3.125, | |
| "learning_rate": 9.972129319955407e-07, | |
| "loss": 0.5126, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.10528954625219353, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 9.944258639910813e-07, | |
| "loss": 0.5109, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.10779644021057909, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 9.91638795986622e-07, | |
| "loss": 0.4615, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.11030333416896465, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 9.888517279821627e-07, | |
| "loss": 0.4818, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.11281022812735021, | |
| "grad_norm": 2.625, | |
| "learning_rate": 9.860646599777034e-07, | |
| "loss": 0.4724, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.11531712208573577, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 9.832775919732441e-07, | |
| "loss": 0.479, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.11782401604412134, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 9.804905239687848e-07, | |
| "loss": 0.4687, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.1203309100025069, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 9.777034559643255e-07, | |
| "loss": 0.46, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.12283780396089246, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.749163879598662e-07, | |
| "loss": 0.4545, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.12534469791927802, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 9.721293199554067e-07, | |
| "loss": 0.452, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.12785159187766357, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.693422519509476e-07, | |
| "loss": 0.4571, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.13035848583604914, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 9.665551839464883e-07, | |
| "loss": 0.4517, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.1328653797944347, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.637681159420288e-07, | |
| "loss": 0.4377, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.13537227375282027, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.609810479375697e-07, | |
| "loss": 0.4492, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1378791677112058, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 9.581939799331104e-07, | |
| "loss": 0.4505, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.1403860616695914, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 9.554069119286509e-07, | |
| "loss": 0.4356, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.14289295562797694, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.526198439241917e-07, | |
| "loss": 0.4276, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.14539984958636248, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.498327759197324e-07, | |
| "loss": 0.445, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.14790674354474806, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.470457079152731e-07, | |
| "loss": 0.4264, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.1504136375031336, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.442586399108138e-07, | |
| "loss": 0.4374, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15292053146151918, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 9.414715719063545e-07, | |
| "loss": 0.4371, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.15542742541990473, | |
| "grad_norm": 2.625, | |
| "learning_rate": 9.386845039018952e-07, | |
| "loss": 0.439, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1579343193782903, | |
| "grad_norm": 2.25, | |
| "learning_rate": 9.358974358974359e-07, | |
| "loss": 0.4426, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.16044121333667585, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.331103678929766e-07, | |
| "loss": 0.4181, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.16294810729506143, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.303232998885172e-07, | |
| "loss": 0.4392, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.16545500125344698, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.27536231884058e-07, | |
| "loss": 0.4238, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.16796189521183255, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 9.247491638795987e-07, | |
| "loss": 0.4257, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.1704687891702181, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.219620958751393e-07, | |
| "loss": 0.4221, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.17297568312860365, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.191750278706801e-07, | |
| "loss": 0.4359, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.17548257708698922, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.163879598662206e-07, | |
| "loss": 0.4218, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.17798947104537477, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 9.136008918617613e-07, | |
| "loss": 0.4398, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.18049636500376035, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.108138238573021e-07, | |
| "loss": 0.4332, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1830032589621459, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.080267558528427e-07, | |
| "loss": 0.4224, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.18551015292053147, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.052396878483834e-07, | |
| "loss": 0.4402, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.18801704687891702, | |
| "grad_norm": 2.5, | |
| "learning_rate": 9.024526198439241e-07, | |
| "loss": 0.4387, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.1905239408373026, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8.996655518394648e-07, | |
| "loss": 0.4397, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.19303083479568814, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.968784838350055e-07, | |
| "loss": 0.4193, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.19553772875407371, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.940914158305462e-07, | |
| "loss": 0.407, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.19804462271245926, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 8.913043478260869e-07, | |
| "loss": 0.4269, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.20055151667084484, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.885172798216276e-07, | |
| "loss": 0.4167, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.20055151667084484, | |
| "eval_loss": 0.4732125401496887, | |
| "eval_runtime": 23.3615, | |
| "eval_samples_per_second": 323.823, | |
| "eval_steps_per_second": 20.247, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.20305841062923038, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.857302118171683e-07, | |
| "loss": 0.4305, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.20556530458761593, | |
| "grad_norm": 2.125, | |
| "learning_rate": 8.829431438127089e-07, | |
| "loss": 0.4296, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2080721985460015, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.801560758082497e-07, | |
| "loss": 0.4354, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.21057909250438706, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.773690078037904e-07, | |
| "loss": 0.4269, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.21308598646277263, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.74581939799331e-07, | |
| "loss": 0.4243, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.21559288042115818, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.717948717948718e-07, | |
| "loss": 0.4183, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.21809977437954375, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.690078037904125e-07, | |
| "loss": 0.4278, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.2206066683379293, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.662207357859531e-07, | |
| "loss": 0.4124, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.22311356229631488, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.634336677814939e-07, | |
| "loss": 0.4099, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.22562045625470042, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.606465997770345e-07, | |
| "loss": 0.4162, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.228127350213086, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.578595317725752e-07, | |
| "loss": 0.4213, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.23063424417147155, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.550724637681159e-07, | |
| "loss": 0.4122, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2331411381298571, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.522853957636566e-07, | |
| "loss": 0.4093, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.23564803208824267, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.494983277591973e-07, | |
| "loss": 0.4171, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.23815492604662822, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.46711259754738e-07, | |
| "loss": 0.3994, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.2406618200050138, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.439241917502787e-07, | |
| "loss": 0.4155, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.24316871396339934, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.411371237458194e-07, | |
| "loss": 0.4201, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.24567560792178492, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.383500557413601e-07, | |
| "loss": 0.4055, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.24818250188017046, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.355629877369008e-07, | |
| "loss": 0.412, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.25068939583855604, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.327759197324414e-07, | |
| "loss": 0.4134, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2531962897969416, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.299888517279821e-07, | |
| "loss": 0.4174, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.25570318375532713, | |
| "grad_norm": 2.375, | |
| "learning_rate": 8.272017837235227e-07, | |
| "loss": 0.4264, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2582100777137127, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.244147157190635e-07, | |
| "loss": 0.4162, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2607169716720983, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.216276477146042e-07, | |
| "loss": 0.4258, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.26322386563048383, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.188405797101448e-07, | |
| "loss": 0.4148, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.2657307595888694, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.160535117056856e-07, | |
| "loss": 0.4126, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.26823765354725493, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.132664437012263e-07, | |
| "loss": 0.4132, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.27074454750564053, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8.104793756967669e-07, | |
| "loss": 0.4082, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.2732514414640261, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 8.076923076923077e-07, | |
| "loss": 0.4247, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.2757583354224116, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.049052396878483e-07, | |
| "loss": 0.406, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2782652293807972, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.02118171683389e-07, | |
| "loss": 0.4051, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.2807721233391828, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 7.993311036789297e-07, | |
| "loss": 0.4161, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.2832790172975683, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 7.965440356744704e-07, | |
| "loss": 0.4215, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.2857859112559539, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.937569676700111e-07, | |
| "loss": 0.4153, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.2882928052143394, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 7.909698996655518e-07, | |
| "loss": 0.4062, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.29079969917272497, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 7.881828316610925e-07, | |
| "loss": 0.4079, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.29330659313111057, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.853957636566332e-07, | |
| "loss": 0.4048, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.2958134870894961, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.826086956521739e-07, | |
| "loss": 0.4128, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.29832038104788167, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.798216276477145e-07, | |
| "loss": 0.4227, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.3008272750062672, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 7.770345596432553e-07, | |
| "loss": 0.4231, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3008272750062672, | |
| "eval_loss": 0.4612683653831482, | |
| "eval_runtime": 23.2585, | |
| "eval_samples_per_second": 325.258, | |
| "eval_steps_per_second": 20.337, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3033341689646528, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.74247491638796e-07, | |
| "loss": 0.3912, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.30584106292303836, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.714604236343366e-07, | |
| "loss": 0.3907, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3083479568814239, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.686733556298774e-07, | |
| "loss": 0.3991, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.31085485083980946, | |
| "grad_norm": 2.25, | |
| "learning_rate": 7.658862876254181e-07, | |
| "loss": 0.4289, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.31336174479819506, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.630992196209587e-07, | |
| "loss": 0.4083, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.3158686387565806, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 7.603121516164995e-07, | |
| "loss": 0.4105, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.31837553271496616, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.575250836120402e-07, | |
| "loss": 0.4055, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.3208824266733517, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 7.547380156075807e-07, | |
| "loss": 0.4026, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.32338932063173725, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 7.519509476031214e-07, | |
| "loss": 0.409, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.32589621459012286, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.491638795986621e-07, | |
| "loss": 0.4251, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3284031085485084, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.463768115942028e-07, | |
| "loss": 0.4094, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.33091000250689395, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.435897435897435e-07, | |
| "loss": 0.398, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3334168964652795, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.408026755852842e-07, | |
| "loss": 0.399, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.3359237904236651, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.380156075808249e-07, | |
| "loss": 0.39, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.33843068438205065, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.352285395763656e-07, | |
| "loss": 0.4105, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.3409375783404362, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.324414715719063e-07, | |
| "loss": 0.4169, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.34344447229882175, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.29654403567447e-07, | |
| "loss": 0.3926, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.3459513662572073, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 7.268673355629877e-07, | |
| "loss": 0.4116, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3484582602155929, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 7.240802675585283e-07, | |
| "loss": 0.4086, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.35096515417397844, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.212931995540691e-07, | |
| "loss": 0.4116, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.353472048132364, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 7.185061315496098e-07, | |
| "loss": 0.4019, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.35597894209074954, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 7.157190635451504e-07, | |
| "loss": 0.4076, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.35848583604913514, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.129319955406912e-07, | |
| "loss": 0.4094, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.3609927300075207, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.101449275362319e-07, | |
| "loss": 0.404, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.36349962396590624, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.073578595317725e-07, | |
| "loss": 0.4122, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.3660065179242918, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.045707915273133e-07, | |
| "loss": 0.4104, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.3685134118826774, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 7.01783723522854e-07, | |
| "loss": 0.4119, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.37102030584106294, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.989966555183946e-07, | |
| "loss": 0.4233, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3735271997994485, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 6.962095875139353e-07, | |
| "loss": 0.4352, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.37603409375783403, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 6.93422519509476e-07, | |
| "loss": 0.405, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3785409877162196, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 6.906354515050167e-07, | |
| "loss": 0.4085, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.3810478816746052, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 6.878483835005574e-07, | |
| "loss": 0.4154, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.38355477563299073, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 6.850613154960981e-07, | |
| "loss": 0.4095, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.3860616695913763, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 6.822742474916388e-07, | |
| "loss": 0.4003, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.3885685635497618, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.794871794871795e-07, | |
| "loss": 0.4157, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.39107545750814743, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 6.767001114827202e-07, | |
| "loss": 0.4043, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.393582351466533, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.739130434782609e-07, | |
| "loss": 0.4015, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.3960892454249185, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.711259754738015e-07, | |
| "loss": 0.4236, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.39859613938330407, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.683389074693421e-07, | |
| "loss": 0.4064, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.4011030333416897, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 6.655518394648829e-07, | |
| "loss": 0.4137, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4011030333416897, | |
| "eval_loss": 0.4568587839603424, | |
| "eval_runtime": 23.2871, | |
| "eval_samples_per_second": 324.857, | |
| "eval_steps_per_second": 20.312, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4036099273000752, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.627647714604236e-07, | |
| "loss": 0.4088, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.40611682125846077, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 6.599777034559642e-07, | |
| "loss": 0.4037, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4086237152168463, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 6.57190635451505e-07, | |
| "loss": 0.4057, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.41113060917523186, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 6.544035674470457e-07, | |
| "loss": 0.4043, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.41363750313361747, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.516164994425863e-07, | |
| "loss": 0.379, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.416144397092003, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 6.488294314381271e-07, | |
| "loss": 0.3965, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.41865129105038856, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.460423634336678e-07, | |
| "loss": 0.4178, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.4211581850087741, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 6.432552954292084e-07, | |
| "loss": 0.3997, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4236650789671597, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 6.404682274247491e-07, | |
| "loss": 0.3989, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.42617197292554526, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 6.376811594202898e-07, | |
| "loss": 0.4233, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4286788668839308, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 6.348940914158305e-07, | |
| "loss": 0.3957, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.43118576084231636, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 6.321070234113712e-07, | |
| "loss": 0.4096, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4336926548007019, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 6.293199554069119e-07, | |
| "loss": 0.3951, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.4361995487590875, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 6.265328874024526e-07, | |
| "loss": 0.406, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.43870644271747306, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 6.237458193979933e-07, | |
| "loss": 0.3963, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.4412133366758586, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 6.209587513935339e-07, | |
| "loss": 0.4097, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.44372023063424415, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 6.181716833890747e-07, | |
| "loss": 0.4227, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.44622712459262975, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 6.153846153846154e-07, | |
| "loss": 0.4054, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4487340185510153, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.12597547380156e-07, | |
| "loss": 0.402, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.45124091250940085, | |
| "grad_norm": 2.25, | |
| "learning_rate": 6.098104793756968e-07, | |
| "loss": 0.4125, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4537478064677864, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 6.070234113712375e-07, | |
| "loss": 0.4058, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.456254700426172, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.042363433667781e-07, | |
| "loss": 0.4089, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.45876159438455755, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 6.014492753623189e-07, | |
| "loss": 0.3937, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.4612684883429431, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.986622073578596e-07, | |
| "loss": 0.3957, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.46377538230132864, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.958751393534002e-07, | |
| "loss": 0.4153, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.4662822762597142, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.930880713489409e-07, | |
| "loss": 0.4294, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.4687891702180998, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 5.903010033444817e-07, | |
| "loss": 0.4122, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.47129606417648534, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.875139353400222e-07, | |
| "loss": 0.4092, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.4738029581348709, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.847268673355629e-07, | |
| "loss": 0.4105, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.47630985209325644, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.819397993311036e-07, | |
| "loss": 0.4177, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.47881674605164204, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 5.791527313266443e-07, | |
| "loss": 0.4053, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.4813236400100276, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.76365663322185e-07, | |
| "loss": 0.4109, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.48383053396841313, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 5.735785953177257e-07, | |
| "loss": 0.4003, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.4863374279267987, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.707915273132664e-07, | |
| "loss": 0.3999, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.48884432188518423, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.680044593088071e-07, | |
| "loss": 0.4036, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.49135121584356983, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.652173913043477e-07, | |
| "loss": 0.3808, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.4938581098019554, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.624303232998885e-07, | |
| "loss": 0.4175, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.49636500376034093, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.596432552954292e-07, | |
| "loss": 0.4167, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4988718977187265, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.568561872909698e-07, | |
| "loss": 0.3967, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.5013787916771121, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.540691192865106e-07, | |
| "loss": 0.4088, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5013787916771121, | |
| "eval_loss": 0.4543125629425049, | |
| "eval_runtime": 23.1965, | |
| "eval_samples_per_second": 326.127, | |
| "eval_steps_per_second": 20.391, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5038856856354976, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.512820512820513e-07, | |
| "loss": 0.4081, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.5063925795938832, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.484949832775919e-07, | |
| "loss": 0.3995, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5088994735522687, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.457079152731327e-07, | |
| "loss": 0.402, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.5114063675106543, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.429208472686734e-07, | |
| "loss": 0.3884, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.5139132614690398, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.40133779264214e-07, | |
| "loss": 0.428, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.5164201554274254, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.373467112597547e-07, | |
| "loss": 0.3845, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.518927049385811, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.345596432552955e-07, | |
| "loss": 0.3961, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.5214339433441966, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.317725752508361e-07, | |
| "loss": 0.3991, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5239408373025821, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.289855072463768e-07, | |
| "loss": 0.3956, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.5264477312609677, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.261984392419175e-07, | |
| "loss": 0.3965, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5289546252193532, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 5.234113712374582e-07, | |
| "loss": 0.4048, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.5314615191777388, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.206243032329989e-07, | |
| "loss": 0.4133, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5339684131361243, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.178372352285395e-07, | |
| "loss": 0.4048, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.5364753070945099, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.150501672240803e-07, | |
| "loss": 0.4076, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5389822010528955, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.12263099219621e-07, | |
| "loss": 0.4051, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.5414890950112811, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.094760312151615e-07, | |
| "loss": 0.3938, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5439959889696666, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.066889632107023e-07, | |
| "loss": 0.3971, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.5465028829280522, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.03901895206243e-07, | |
| "loss": 0.4084, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.5490097768864377, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.011148272017836e-07, | |
| "loss": 0.3932, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.5515166708448233, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.983277591973244e-07, | |
| "loss": 0.4187, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5540235648032088, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 4.95540691192865e-07, | |
| "loss": 0.398, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.5565304587615943, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.927536231884058e-07, | |
| "loss": 0.4302, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.5590373527199799, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.899665551839464e-07, | |
| "loss": 0.401, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.5615442466783656, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 4.871794871794871e-07, | |
| "loss": 0.4064, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.5640511406367511, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.843924191750278e-07, | |
| "loss": 0.3943, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.5665580345951367, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.816053511705685e-07, | |
| "loss": 0.397, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.5690649285535222, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.788182831661092e-07, | |
| "loss": 0.4074, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.5715718225119077, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.760312151616499e-07, | |
| "loss": 0.4075, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.5740787164702933, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.7324414715719066e-07, | |
| "loss": 0.4011, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.5765856104286788, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.704570791527313e-07, | |
| "loss": 0.3953, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5790925043870644, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 4.67670011148272e-07, | |
| "loss": 0.3913, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.5815993983454499, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.648829431438127e-07, | |
| "loss": 0.4043, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.5841062923038356, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.620958751393534e-07, | |
| "loss": 0.3936, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.5866131862622211, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.5930880713489404e-07, | |
| "loss": 0.3932, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.5891200802206067, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.5652173913043473e-07, | |
| "loss": 0.4166, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.5916269741789922, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 4.537346711259755e-07, | |
| "loss": 0.3941, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.5941338681373778, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.509476031215161e-07, | |
| "loss": 0.4054, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.5966407620957633, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.481605351170568e-07, | |
| "loss": 0.4066, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.5991476560541489, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.453734671125975e-07, | |
| "loss": 0.393, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.6016545500125344, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.425863991081382e-07, | |
| "loss": 0.4181, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6016545500125344, | |
| "eval_loss": 0.4533432722091675, | |
| "eval_runtime": 23.1151, | |
| "eval_samples_per_second": 327.276, | |
| "eval_steps_per_second": 20.463, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.60416144397092, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.397993311036789e-07, | |
| "loss": 0.4035, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.6066683379293056, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.370122630992196e-07, | |
| "loss": 0.3927, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6091752318876912, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.342251950947603e-07, | |
| "loss": 0.4155, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.6116821258460767, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.3143812709030095e-07, | |
| "loss": 0.386, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6141890198044623, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.2865105908584165e-07, | |
| "loss": 0.3916, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.6166959137628478, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.258639910813824e-07, | |
| "loss": 0.3888, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6192028077212334, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.2307692307692304e-07, | |
| "loss": 0.4189, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.6217097016796189, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.2028985507246374e-07, | |
| "loss": 0.3975, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6242165956380045, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.1750278706800444e-07, | |
| "loss": 0.3989, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.6267234895963901, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.1471571906354513e-07, | |
| "loss": 0.409, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6292303835547757, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.1192865105908583e-07, | |
| "loss": 0.395, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.6317372775131612, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 4.091415830546265e-07, | |
| "loss": 0.4085, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6342441714715468, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.063545150501672e-07, | |
| "loss": 0.4032, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.6367510654299323, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.0356744704570787e-07, | |
| "loss": 0.4154, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.6392579593883179, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.0078037904124856e-07, | |
| "loss": 0.3958, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.6417648533467034, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.979933110367893e-07, | |
| "loss": 0.4168, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.644271747305089, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.9520624303232996e-07, | |
| "loss": 0.3934, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.6467786412634745, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.9241917502787065e-07, | |
| "loss": 0.415, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.6492855352218602, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.8963210702341135e-07, | |
| "loss": 0.3743, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.6517924291802457, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.8684503901895205e-07, | |
| "loss": 0.4128, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6542993231386313, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.8405797101449274e-07, | |
| "loss": 0.3972, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.6568062170970168, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.8127090301003344e-07, | |
| "loss": 0.4052, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.6593131110554024, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.7848383500557414e-07, | |
| "loss": 0.4195, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.6618200050137879, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.756967670011148e-07, | |
| "loss": 0.3915, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.6643268989721735, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.729096989966555e-07, | |
| "loss": 0.4089, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.666833792930559, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.7012263099219623e-07, | |
| "loss": 0.4086, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.6693406868889445, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.6733556298773687e-07, | |
| "loss": 0.4052, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.6718475808473302, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.6454849498327757e-07, | |
| "loss": 0.4047, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.6743544748057158, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 3.6176142697881827e-07, | |
| "loss": 0.4061, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.6768613687641013, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.5897435897435896e-07, | |
| "loss": 0.3945, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6793682627224868, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.5618729096989966e-07, | |
| "loss": 0.4077, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.6818751566808724, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.5340022296544036e-07, | |
| "loss": 0.4131, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.6843820506392579, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.5061315496098105e-07, | |
| "loss": 0.3994, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.6868889445976435, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.478260869565217e-07, | |
| "loss": 0.3856, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.689395838556029, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 3.450390189520624e-07, | |
| "loss": 0.4135, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.6919027325144146, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.4225195094760314e-07, | |
| "loss": 0.4037, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.6944096264728002, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 3.394648829431438e-07, | |
| "loss": 0.4239, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.6969165204311858, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.366778149386845e-07, | |
| "loss": 0.4, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.6994234143895713, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.338907469342252e-07, | |
| "loss": 0.4144, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.7019303083479569, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.311036789297659e-07, | |
| "loss": 0.4029, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7019303083479569, | |
| "eval_loss": 0.4530901312828064, | |
| "eval_runtime": 23.1866, | |
| "eval_samples_per_second": 326.265, | |
| "eval_steps_per_second": 20.4, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7044372023063424, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.283166109253065e-07, | |
| "loss": 0.3917, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.706944096264728, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.2552954292084727e-07, | |
| "loss": 0.4076, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7094509902231135, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.2274247491638797e-07, | |
| "loss": 0.3925, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.7119578841814991, | |
| "grad_norm": 2.125, | |
| "learning_rate": 3.199554069119286e-07, | |
| "loss": 0.3963, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7144647781398847, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 3.171683389074693e-07, | |
| "loss": 0.3936, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.7169716720982703, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.1438127090301e-07, | |
| "loss": 0.4078, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.7194785660566558, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.115942028985507e-07, | |
| "loss": 0.4174, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.7219854600150414, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.088071348940914e-07, | |
| "loss": 0.3951, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.7244923539734269, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.060200668896321e-07, | |
| "loss": 0.3935, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.7269992479318125, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.032329988851728e-07, | |
| "loss": 0.4108, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.729506141890198, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.0044593088071344e-07, | |
| "loss": 0.4059, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.7320130358485836, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.976588628762542e-07, | |
| "loss": 0.3869, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.7345199298069691, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.948717948717949e-07, | |
| "loss": 0.4133, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.7370268237653548, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.9208472686733553e-07, | |
| "loss": 0.4084, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.7395337177237403, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.892976588628762e-07, | |
| "loss": 0.3936, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.7420406116821259, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.865105908584169e-07, | |
| "loss": 0.4094, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.7445475056405114, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.837235228539576e-07, | |
| "loss": 0.398, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.747054399598897, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.809364548494983e-07, | |
| "loss": 0.3953, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.7495612935572825, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.78149386845039e-07, | |
| "loss": 0.4104, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.7520681875156681, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.753623188405797e-07, | |
| "loss": 0.4121, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7545750814740536, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.7257525083612035e-07, | |
| "loss": 0.3987, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.7570819754324392, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.697881828316611e-07, | |
| "loss": 0.3962, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.7595888693908248, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.670011148272018e-07, | |
| "loss": 0.4036, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.7620957633492104, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.6421404682274245e-07, | |
| "loss": 0.3921, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.7646026573075959, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.6142697881828314e-07, | |
| "loss": 0.3984, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.7671095512659815, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.5863991081382384e-07, | |
| "loss": 0.3938, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.769616445224367, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.5585284280936454e-07, | |
| "loss": 0.4116, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.7721233391827526, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.5306577480490523e-07, | |
| "loss": 0.3972, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.7746302331411381, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.5027870680044593e-07, | |
| "loss": 0.4223, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.7771371270995237, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.474916387959866e-07, | |
| "loss": 0.3911, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.7796440210579092, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.447045707915273e-07, | |
| "loss": 0.4181, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.7821509150162949, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.4191750278706797e-07, | |
| "loss": 0.4211, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.7846578089746804, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.391304347826087e-07, | |
| "loss": 0.3965, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.787164702933066, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.3634336677814936e-07, | |
| "loss": 0.4059, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.7896715968914515, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.3355629877369006e-07, | |
| "loss": 0.4058, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.792178490849837, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.3076923076923078e-07, | |
| "loss": 0.4011, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.7946853848082226, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 2.2798216276477145e-07, | |
| "loss": 0.4062, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.7971922787666081, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.2519509476031215e-07, | |
| "loss": 0.4004, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.7996991727249937, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.2240802675585282e-07, | |
| "loss": 0.3972, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.8022060666833793, | |
| "grad_norm": 2.125, | |
| "learning_rate": 2.1962095875139352e-07, | |
| "loss": 0.3867, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8022060666833793, | |
| "eval_loss": 0.45278844237327576, | |
| "eval_runtime": 23.2205, | |
| "eval_samples_per_second": 325.79, | |
| "eval_steps_per_second": 20.37, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8047129606417649, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.1683389074693424e-07, | |
| "loss": 0.4057, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.8072198546001504, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 2.140468227424749e-07, | |
| "loss": 0.3959, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.809726748558536, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.112597547380156e-07, | |
| "loss": 0.4051, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.8122336425169215, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.0847268673355628e-07, | |
| "loss": 0.3931, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8147405364753071, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.0568561872909697e-07, | |
| "loss": 0.4104, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.8172474304336926, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.028985507246377e-07, | |
| "loss": 0.4078, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.8197543243920782, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.0011148272017837e-07, | |
| "loss": 0.408, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.8222612183504637, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.9732441471571906e-07, | |
| "loss": 0.3933, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.8247681123088494, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.9453734671125973e-07, | |
| "loss": 0.3922, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.8272750062672349, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.9175027870680043e-07, | |
| "loss": 0.3945, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.8297819002256205, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.8896321070234113e-07, | |
| "loss": 0.4046, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.832288794184006, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.8617614269788183e-07, | |
| "loss": 0.408, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.8347956881423916, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.8338907469342252e-07, | |
| "loss": 0.3936, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.8373025821007771, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.806020066889632e-07, | |
| "loss": 0.3954, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.8398094760591627, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.778149386845039e-07, | |
| "loss": 0.4063, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.8423163700175482, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.7502787068004459e-07, | |
| "loss": 0.3887, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.8448232639759338, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.7224080267558528e-07, | |
| "loss": 0.3935, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.8473301579343194, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.6945373467112598e-07, | |
| "loss": 0.3987, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.849837051892705, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 0.4162, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.8523439458510905, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.6387959866220735e-07, | |
| "loss": 0.4054, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.8548508398094761, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.6109253065774804e-07, | |
| "loss": 0.379, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.8573577337678616, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.5830546265328874e-07, | |
| "loss": 0.4159, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.8598646277262472, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.5551839464882944e-07, | |
| "loss": 0.3929, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.8623715216846327, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.527313266443701e-07, | |
| "loss": 0.3978, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.8648784156430183, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.499442586399108e-07, | |
| "loss": 0.4088, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.8673853096014038, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.471571906354515e-07, | |
| "loss": 0.4082, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.8698922035597895, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.443701226309922e-07, | |
| "loss": 0.4031, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.872399097518175, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.415830546265329e-07, | |
| "loss": 0.4096, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.8749059914765606, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.3879598662207357e-07, | |
| "loss": 0.3919, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.8774128854349461, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.3600891861761426e-07, | |
| "loss": 0.4226, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8799197793933317, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.3322185061315496e-07, | |
| "loss": 0.3957, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.8824266733517172, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.3043478260869563e-07, | |
| "loss": 0.3943, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.8849335673101028, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.2764771460423635e-07, | |
| "loss": 0.3949, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.8874404612684883, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.2486064659977702e-07, | |
| "loss": 0.398, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.8899473552268738, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.2207357859531772e-07, | |
| "loss": 0.3996, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.8924542491852595, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.1928651059085842e-07, | |
| "loss": 0.3822, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.894961143143645, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.164994425863991e-07, | |
| "loss": 0.4035, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.8974680371020306, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.1371237458193978e-07, | |
| "loss": 0.4025, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.8999749310604162, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.109253065774805e-07, | |
| "loss": 0.4118, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.9024818250188017, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.0813823857302118e-07, | |
| "loss": 0.3988, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9024818250188017, | |
| "eval_loss": 0.45277419686317444, | |
| "eval_runtime": 23.3095, | |
| "eval_samples_per_second": 324.545, | |
| "eval_steps_per_second": 20.292, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9049887189771872, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.0535117056856186e-07, | |
| "loss": 0.401, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.9074956129355728, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.0256410256410256e-07, | |
| "loss": 0.4118, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9100025068939583, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.977703455964324e-08, | |
| "loss": 0.3953, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.912509400852344, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.698996655518395e-08, | |
| "loss": 0.3904, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.9150162948107295, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 9.420289855072464e-08, | |
| "loss": 0.4013, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.9175231887691151, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 9.141583054626532e-08, | |
| "loss": 0.4001, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.9200300827275006, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.862876254180602e-08, | |
| "loss": 0.4013, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.9225369766858862, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.58416945373467e-08, | |
| "loss": 0.3976, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.9250438706442717, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 8.30546265328874e-08, | |
| "loss": 0.4059, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.9275507646026573, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 8.02675585284281e-08, | |
| "loss": 0.4178, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.9300576585610428, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.748049052396878e-08, | |
| "loss": 0.4038, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.9325645525194284, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 7.469342251950947e-08, | |
| "loss": 0.4011, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.935071446477814, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.190635451505016e-08, | |
| "loss": 0.4116, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.9375783404361996, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.911928651059086e-08, | |
| "loss": 0.3976, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.9400852343945851, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.633221850613155e-08, | |
| "loss": 0.3998, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.9425921283529707, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 6.354515050167224e-08, | |
| "loss": 0.3925, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.9450990223113562, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.075808249721292e-08, | |
| "loss": 0.4036, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.9476059162697418, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.797101449275362e-08, | |
| "loss": 0.395, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.9501128102281273, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.5183946488294307e-08, | |
| "loss": 0.3848, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.9526197041865129, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 5.2396878483835003e-08, | |
| "loss": 0.3964, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.9551265981448984, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.9609810479375694e-08, | |
| "loss": 0.3925, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.9576334921032841, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.6822742474916384e-08, | |
| "loss": 0.4246, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.9601403860616696, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 4.403567447045708e-08, | |
| "loss": 0.4056, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.9626472800200552, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.1248606465997764e-08, | |
| "loss": 0.4123, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.9651541739784407, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.846153846153846e-08, | |
| "loss": 0.4088, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.9676610679368263, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.567447045707915e-08, | |
| "loss": 0.3983, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.9701679618952118, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.288740245261984e-08, | |
| "loss": 0.3914, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.9726748558535974, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.010033444816053e-08, | |
| "loss": 0.426, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.9751817498119829, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.7313266443701226e-08, | |
| "loss": 0.3937, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.9776886437703685, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.4526198439241916e-08, | |
| "loss": 0.4081, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9801955377287541, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.1739130434782606e-08, | |
| "loss": 0.4062, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.9827024316871397, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.89520624303233e-08, | |
| "loss": 0.4016, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.9852093256455252, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.616499442586399e-08, | |
| "loss": 0.4069, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.9877162196039108, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.3377926421404682e-08, | |
| "loss": 0.4096, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.9902231135622963, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.0590858416945374e-08, | |
| "loss": 0.4047, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.9927300075206819, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 7.803790412486064e-09, | |
| "loss": 0.4011, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.9952369014790674, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.016722408026756e-09, | |
| "loss": 0.3955, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.997743795437453, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.229654403567447e-09, | |
| "loss": 0.3961, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.9997493106041614, | |
| "step": 1994, | |
| "total_flos": 5.2612673636715725e+17, | |
| "train_loss": 0.4929642112946199, | |
| "train_runtime": 2612.3746, | |
| "train_samples_per_second": 48.861, | |
| "train_steps_per_second": 0.763 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1994, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.2612673636715725e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |