| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9994666666666667, | |
| "eval_steps": 500, | |
| "global_step": 8436, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0035555555555555557, | |
| "grad_norm": 172.0995169680502, | |
| "learning_rate": 1.1848341232227489e-07, | |
| "loss": 2.2225, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0071111111111111115, | |
| "grad_norm": 104.28584294834684, | |
| "learning_rate": 2.3696682464454978e-07, | |
| "loss": 2.0266, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 23.951684052753993, | |
| "learning_rate": 3.5545023696682467e-07, | |
| "loss": 1.7378, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.014222222222222223, | |
| "grad_norm": 28.63768523364588, | |
| "learning_rate": 4.7393364928909956e-07, | |
| "loss": 1.4898, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.017777777777777778, | |
| "grad_norm": 13.504890360680221, | |
| "learning_rate": 5.924170616113745e-07, | |
| "loss": 1.1851, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 6.1972205960321585, | |
| "learning_rate": 7.109004739336493e-07, | |
| "loss": 0.988, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.024888888888888887, | |
| "grad_norm": 5.065573416801065, | |
| "learning_rate": 8.293838862559242e-07, | |
| "loss": 0.862, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.028444444444444446, | |
| "grad_norm": 4.1523834288926516, | |
| "learning_rate": 9.478672985781991e-07, | |
| "loss": 0.7813, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 4.2769222488911405, | |
| "learning_rate": 1.0663507109004742e-06, | |
| "loss": 0.6638, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.035555555555555556, | |
| "grad_norm": 3.7926520111038613, | |
| "learning_rate": 1.184834123222749e-06, | |
| "loss": 0.6073, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03911111111111111, | |
| "grad_norm": 4.5354775101779605, | |
| "learning_rate": 1.303317535545024e-06, | |
| "loss": 0.5392, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 9.972389478302686, | |
| "learning_rate": 1.4218009478672987e-06, | |
| "loss": 0.5798, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04622222222222222, | |
| "grad_norm": 3.3720288764197903, | |
| "learning_rate": 1.5402843601895737e-06, | |
| "loss": 0.5119, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.049777777777777775, | |
| "grad_norm": 2.9830238004238674, | |
| "learning_rate": 1.6587677725118483e-06, | |
| "loss": 0.4432, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 3.0823358079048395, | |
| "learning_rate": 1.7772511848341234e-06, | |
| "loss": 0.4637, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05688888888888889, | |
| "grad_norm": 2.7399052383817493, | |
| "learning_rate": 1.8957345971563982e-06, | |
| "loss": 0.4623, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.060444444444444446, | |
| "grad_norm": 2.5949470941499886, | |
| "learning_rate": 2.0142180094786733e-06, | |
| "loss": 0.4909, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 2.860434581778304, | |
| "learning_rate": 2.1327014218009483e-06, | |
| "loss": 0.4522, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.06755555555555555, | |
| "grad_norm": 2.4794062920348514, | |
| "learning_rate": 2.251184834123223e-06, | |
| "loss": 0.4683, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.07111111111111111, | |
| "grad_norm": 3.5898381841290385, | |
| "learning_rate": 2.369668246445498e-06, | |
| "loss": 0.4544, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 2.8271249937433893, | |
| "learning_rate": 2.4881516587677726e-06, | |
| "loss": 0.452, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.07822222222222222, | |
| "grad_norm": 2.820485688519842, | |
| "learning_rate": 2.606635071090048e-06, | |
| "loss": 0.4594, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.08177777777777778, | |
| "grad_norm": 2.879680482909577, | |
| "learning_rate": 2.7251184834123223e-06, | |
| "loss": 0.4079, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 2.2760447333960547, | |
| "learning_rate": 2.8436018957345973e-06, | |
| "loss": 0.4586, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 2.831009166917502, | |
| "learning_rate": 2.9620853080568724e-06, | |
| "loss": 0.4143, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.09244444444444444, | |
| "grad_norm": 3.2359232461275895, | |
| "learning_rate": 3.0805687203791474e-06, | |
| "loss": 0.454, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 2.7067735723833932, | |
| "learning_rate": 3.1990521327014216e-06, | |
| "loss": 0.3998, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.09955555555555555, | |
| "grad_norm": 2.550645136169034, | |
| "learning_rate": 3.3175355450236967e-06, | |
| "loss": 0.397, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.10311111111111111, | |
| "grad_norm": 2.6477271129566162, | |
| "learning_rate": 3.4360189573459717e-06, | |
| "loss": 0.416, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 2.744124645461815, | |
| "learning_rate": 3.5545023696682468e-06, | |
| "loss": 0.4521, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11022222222222222, | |
| "grad_norm": 2.499585309198425, | |
| "learning_rate": 3.672985781990522e-06, | |
| "loss": 0.4023, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.11377777777777778, | |
| "grad_norm": 2.6278096303414467, | |
| "learning_rate": 3.7914691943127964e-06, | |
| "loss": 0.4191, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 2.4188835712940326, | |
| "learning_rate": 3.9099526066350715e-06, | |
| "loss": 0.4122, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.12088888888888889, | |
| "grad_norm": 2.553975268194503, | |
| "learning_rate": 4.0284360189573465e-06, | |
| "loss": 0.3498, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.12444444444444444, | |
| "grad_norm": 2.834535859400579, | |
| "learning_rate": 4.146919431279622e-06, | |
| "loss": 0.4094, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 2.533973817990368, | |
| "learning_rate": 4.265402843601897e-06, | |
| "loss": 0.4298, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.13155555555555556, | |
| "grad_norm": 2.813906241826433, | |
| "learning_rate": 4.383886255924171e-06, | |
| "loss": 0.4216, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1351111111111111, | |
| "grad_norm": 2.102931563969342, | |
| "learning_rate": 4.502369668246446e-06, | |
| "loss": 0.3808, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 2.4379289560773896, | |
| "learning_rate": 4.620853080568721e-06, | |
| "loss": 0.3618, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.14222222222222222, | |
| "grad_norm": 2.3557567609798777, | |
| "learning_rate": 4.739336492890996e-06, | |
| "loss": 0.4044, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14577777777777778, | |
| "grad_norm": 2.2820973068522514, | |
| "learning_rate": 4.857819905213271e-06, | |
| "loss": 0.4071, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 2.6709678530509993, | |
| "learning_rate": 4.976303317535545e-06, | |
| "loss": 0.4272, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.15288888888888888, | |
| "grad_norm": 2.332134363712532, | |
| "learning_rate": 5.09478672985782e-06, | |
| "loss": 0.434, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.15644444444444444, | |
| "grad_norm": 2.9162979668749047, | |
| "learning_rate": 5.213270142180096e-06, | |
| "loss": 0.3695, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 2.2427213677361655, | |
| "learning_rate": 5.33175355450237e-06, | |
| "loss": 0.3723, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.16355555555555557, | |
| "grad_norm": 2.5901865124993, | |
| "learning_rate": 5.4502369668246446e-06, | |
| "loss": 0.4147, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1671111111111111, | |
| "grad_norm": 2.56419802107506, | |
| "learning_rate": 5.5687203791469205e-06, | |
| "loss": 0.4083, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 2.145912482611642, | |
| "learning_rate": 5.687203791469195e-06, | |
| "loss": 0.3631, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.17422222222222222, | |
| "grad_norm": 2.1572804538302983, | |
| "learning_rate": 5.8056872037914706e-06, | |
| "loss": 0.3838, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 2.7221208940412955, | |
| "learning_rate": 5.924170616113745e-06, | |
| "loss": 0.3703, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "eval_loss": 0.27164188027381897, | |
| "eval_runtime": 561.686, | |
| "eval_samples_per_second": 17.804, | |
| "eval_steps_per_second": 4.451, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 2.128760220417613, | |
| "learning_rate": 6.042654028436019e-06, | |
| "loss": 0.3936, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.18488888888888888, | |
| "grad_norm": 2.37349559892131, | |
| "learning_rate": 6.161137440758295e-06, | |
| "loss": 0.4097, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.18844444444444444, | |
| "grad_norm": 2.1546814583393954, | |
| "learning_rate": 6.279620853080569e-06, | |
| "loss": 0.3487, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 2.5691866709112174, | |
| "learning_rate": 6.398104265402843e-06, | |
| "loss": 0.3795, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.19555555555555557, | |
| "grad_norm": 2.511088780500042, | |
| "learning_rate": 6.516587677725119e-06, | |
| "loss": 0.3592, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1991111111111111, | |
| "grad_norm": 2.1980105108863306, | |
| "learning_rate": 6.635071090047393e-06, | |
| "loss": 0.3759, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 2.0372925079256508, | |
| "learning_rate": 6.753554502369669e-06, | |
| "loss": 0.3372, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.20622222222222222, | |
| "grad_norm": 2.4474157501007188, | |
| "learning_rate": 6.8720379146919435e-06, | |
| "loss": 0.3821, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.20977777777777779, | |
| "grad_norm": 2.6150488990545813, | |
| "learning_rate": 6.990521327014218e-06, | |
| "loss": 0.4033, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 2.218675478875157, | |
| "learning_rate": 7.1090047393364935e-06, | |
| "loss": 0.3498, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.21688888888888888, | |
| "grad_norm": 2.60194848198847, | |
| "learning_rate": 7.227488151658768e-06, | |
| "loss": 0.3974, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.22044444444444444, | |
| "grad_norm": 2.4008012422084883, | |
| "learning_rate": 7.345971563981044e-06, | |
| "loss": 0.3522, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 2.370019125222766, | |
| "learning_rate": 7.464454976303318e-06, | |
| "loss": 0.3843, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.22755555555555557, | |
| "grad_norm": 2.319127909040294, | |
| "learning_rate": 7.582938388625593e-06, | |
| "loss": 0.3852, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.2311111111111111, | |
| "grad_norm": 2.0344327356388963, | |
| "learning_rate": 7.701421800947868e-06, | |
| "loss": 0.3753, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 2.0974945886124274, | |
| "learning_rate": 7.819905213270143e-06, | |
| "loss": 0.3622, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.23822222222222222, | |
| "grad_norm": 2.3710225236326656, | |
| "learning_rate": 7.938388625592418e-06, | |
| "loss": 0.3776, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.24177777777777779, | |
| "grad_norm": 2.1972590118602353, | |
| "learning_rate": 8.056872037914693e-06, | |
| "loss": 0.4131, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 2.124563531807995, | |
| "learning_rate": 8.175355450236966e-06, | |
| "loss": 0.4041, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.24888888888888888, | |
| "grad_norm": 2.186519973081525, | |
| "learning_rate": 8.293838862559243e-06, | |
| "loss": 0.4342, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.25244444444444447, | |
| "grad_norm": 2.2098045409685785, | |
| "learning_rate": 8.412322274881517e-06, | |
| "loss": 0.3753, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 2.364680759422569, | |
| "learning_rate": 8.530805687203793e-06, | |
| "loss": 0.3499, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.25955555555555554, | |
| "grad_norm": 2.0592638534598975, | |
| "learning_rate": 8.649289099526067e-06, | |
| "loss": 0.3676, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.26311111111111113, | |
| "grad_norm": 2.076874300192435, | |
| "learning_rate": 8.767772511848342e-06, | |
| "loss": 0.3882, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 2.256989717343507, | |
| "learning_rate": 8.886255924170617e-06, | |
| "loss": 0.3906, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2702222222222222, | |
| "grad_norm": 2.2777259263170753, | |
| "learning_rate": 9.004739336492892e-06, | |
| "loss": 0.3881, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2737777777777778, | |
| "grad_norm": 2.0191108991103452, | |
| "learning_rate": 9.123222748815167e-06, | |
| "loss": 0.3598, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 2.1955719220241114, | |
| "learning_rate": 9.241706161137442e-06, | |
| "loss": 0.3411, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2808888888888889, | |
| "grad_norm": 1.8450512554264078, | |
| "learning_rate": 9.360189573459715e-06, | |
| "loss": 0.3989, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.28444444444444444, | |
| "grad_norm": 2.011115441632504, | |
| "learning_rate": 9.478672985781992e-06, | |
| "loss": 0.3982, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 1.8704472001913133, | |
| "learning_rate": 9.597156398104265e-06, | |
| "loss": 0.414, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.29155555555555557, | |
| "grad_norm": 1.9254101904021153, | |
| "learning_rate": 9.715639810426542e-06, | |
| "loss": 0.3767, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2951111111111111, | |
| "grad_norm": 1.9015728855115495, | |
| "learning_rate": 9.834123222748815e-06, | |
| "loss": 0.3775, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 1.928562219171237, | |
| "learning_rate": 9.95260663507109e-06, | |
| "loss": 0.3955, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.3022222222222222, | |
| "grad_norm": 1.5585912642130104, | |
| "learning_rate": 9.999984589042141e-06, | |
| "loss": 0.3897, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.30577777777777776, | |
| "grad_norm": 2.088285655295682, | |
| "learning_rate": 9.999890411310363e-06, | |
| "loss": 0.3657, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 1.7831321620409892, | |
| "learning_rate": 9.999710619100732e-06, | |
| "loss": 0.3699, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.3128888888888889, | |
| "grad_norm": 1.8859386777237288, | |
| "learning_rate": 9.999445215491888e-06, | |
| "loss": 0.3675, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3164444444444444, | |
| "grad_norm": 1.793847189739239, | |
| "learning_rate": 9.999094205028403e-06, | |
| "loss": 0.3804, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.8588345423039347, | |
| "learning_rate": 9.998657593720726e-06, | |
| "loss": 0.3628, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.32355555555555554, | |
| "grad_norm": 1.904522383364726, | |
| "learning_rate": 9.998135389045071e-06, | |
| "loss": 0.3832, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.32711111111111113, | |
| "grad_norm": 1.7658830671737389, | |
| "learning_rate": 9.997527599943288e-06, | |
| "loss": 0.3931, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 1.8645179401650172, | |
| "learning_rate": 9.996834236822718e-06, | |
| "loss": 0.3587, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3342222222222222, | |
| "grad_norm": 1.8432627384605438, | |
| "learning_rate": 9.996055311556002e-06, | |
| "loss": 0.4065, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.3377777777777778, | |
| "grad_norm": 1.8436289309250031, | |
| "learning_rate": 9.99519083748089e-06, | |
| "loss": 0.3861, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 1.9136369003670703, | |
| "learning_rate": 9.994240829400006e-06, | |
| "loss": 0.3794, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3448888888888889, | |
| "grad_norm": 1.559884705708754, | |
| "learning_rate": 9.993205303580596e-06, | |
| "loss": 0.3675, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.34844444444444445, | |
| "grad_norm": 2.0844042937670317, | |
| "learning_rate": 9.992084277754246e-06, | |
| "loss": 0.3725, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 1.4001469351101974, | |
| "learning_rate": 9.990877771116588e-06, | |
| "loss": 0.3526, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 1.7302242639985734, | |
| "learning_rate": 9.989585804326963e-06, | |
| "loss": 0.3451, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "eval_loss": 0.2586575448513031, | |
| "eval_runtime": 561.7755, | |
| "eval_samples_per_second": 17.801, | |
| "eval_steps_per_second": 4.45, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3591111111111111, | |
| "grad_norm": 2.1060593962865832, | |
| "learning_rate": 9.988208399508064e-06, | |
| "loss": 0.3923, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 1.6475744003826194, | |
| "learning_rate": 9.986745580245569e-06, | |
| "loss": 0.3077, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3662222222222222, | |
| "grad_norm": 1.9521091866638012, | |
| "learning_rate": 9.985197371587732e-06, | |
| "loss": 0.389, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.36977777777777776, | |
| "grad_norm": 1.7609515675334448, | |
| "learning_rate": 9.983563800044942e-06, | |
| "loss": 0.3424, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 1.7210920690658038, | |
| "learning_rate": 9.981844893589294e-06, | |
| "loss": 0.3558, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3768888888888889, | |
| "grad_norm": 1.823734659697161, | |
| "learning_rate": 9.980040681654085e-06, | |
| "loss": 0.3693, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3804444444444444, | |
| "grad_norm": 2.102269417162816, | |
| "learning_rate": 9.978151195133326e-06, | |
| "loss": 0.3638, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 1.8033749091845297, | |
| "learning_rate": 9.976176466381205e-06, | |
| "loss": 0.3484, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.38755555555555554, | |
| "grad_norm": 1.8854677696591007, | |
| "learning_rate": 9.974116529211539e-06, | |
| "loss": 0.3967, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.39111111111111113, | |
| "grad_norm": 2.0272157520267218, | |
| "learning_rate": 9.971971418897189e-06, | |
| "loss": 0.3741, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 2.0179018140684555, | |
| "learning_rate": 9.969741172169461e-06, | |
| "loss": 0.3904, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3982222222222222, | |
| "grad_norm": 1.6226992565101939, | |
| "learning_rate": 9.967425827217473e-06, | |
| "loss": 0.3485, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.4017777777777778, | |
| "grad_norm": 1.9028497690136488, | |
| "learning_rate": 9.965025423687505e-06, | |
| "loss": 0.346, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 1.694320712579824, | |
| "learning_rate": 9.962540002682314e-06, | |
| "loss": 0.3635, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.4088888888888889, | |
| "grad_norm": 1.6440393469215313, | |
| "learning_rate": 9.95996960676044e-06, | |
| "loss": 0.3794, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.41244444444444445, | |
| "grad_norm": 1.9859711063744807, | |
| "learning_rate": 9.957314279935467e-06, | |
| "loss": 0.3727, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 1.5764827911729749, | |
| "learning_rate": 9.954574067675276e-06, | |
| "loss": 0.3472, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.41955555555555557, | |
| "grad_norm": 2.0270228575955938, | |
| "learning_rate": 9.951749016901266e-06, | |
| "loss": 0.3651, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.4231111111111111, | |
| "grad_norm": 1.4711992564971241, | |
| "learning_rate": 9.948839175987543e-06, | |
| "loss": 0.4007, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 1.6555299578050973, | |
| "learning_rate": 9.945844594760104e-06, | |
| "loss": 0.3662, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.43022222222222223, | |
| "grad_norm": 1.6087449112246428, | |
| "learning_rate": 9.94276532449597e-06, | |
| "loss": 0.3266, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.43377777777777776, | |
| "grad_norm": 1.7938918985177508, | |
| "learning_rate": 9.939601417922326e-06, | |
| "loss": 0.367, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 1.9419042479062267, | |
| "learning_rate": 9.936352929215598e-06, | |
| "loss": 0.3479, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.4408888888888889, | |
| "grad_norm": 1.7389871732986788, | |
| "learning_rate": 9.933019914000537e-06, | |
| "loss": 0.3991, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 1.954697966163684, | |
| "learning_rate": 9.929602429349267e-06, | |
| "loss": 0.387, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 1.9390505686602657, | |
| "learning_rate": 9.926100533780304e-06, | |
| "loss": 0.3623, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.45155555555555554, | |
| "grad_norm": 1.6639481540933314, | |
| "learning_rate": 9.922514287257553e-06, | |
| "loss": 0.3758, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.45511111111111113, | |
| "grad_norm": 1.722757928957694, | |
| "learning_rate": 9.918843751189285e-06, | |
| "loss": 0.3355, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 1.845850757530145, | |
| "learning_rate": 9.915088988427085e-06, | |
| "loss": 0.3698, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.4622222222222222, | |
| "grad_norm": 1.44128404254532, | |
| "learning_rate": 9.911250063264768e-06, | |
| "loss": 0.4047, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4657777777777778, | |
| "grad_norm": 1.7671518160334596, | |
| "learning_rate": 9.907327041437295e-06, | |
| "loss": 0.3692, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 1.8380352484481248, | |
| "learning_rate": 9.903319990119629e-06, | |
| "loss": 0.36, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4728888888888889, | |
| "grad_norm": 1.76427459962676, | |
| "learning_rate": 9.899228977925594e-06, | |
| "loss": 0.3741, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.47644444444444445, | |
| "grad_norm": 1.4897822709650264, | |
| "learning_rate": 9.895054074906703e-06, | |
| "loss": 0.3407, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.8107592753421746, | |
| "learning_rate": 9.890795352550949e-06, | |
| "loss": 0.3737, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.48355555555555557, | |
| "grad_norm": 1.7814141617442254, | |
| "learning_rate": 9.886452883781588e-06, | |
| "loss": 0.3706, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4871111111111111, | |
| "grad_norm": 1.6423771491979522, | |
| "learning_rate": 9.882026742955892e-06, | |
| "loss": 0.3593, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 1.9926182163486512, | |
| "learning_rate": 9.877517005863865e-06, | |
| "loss": 0.388, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.49422222222222223, | |
| "grad_norm": 1.6527200649892368, | |
| "learning_rate": 9.872923749726959e-06, | |
| "loss": 0.3825, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.49777777777777776, | |
| "grad_norm": 1.800321612826116, | |
| "learning_rate": 9.868247053196744e-06, | |
| "loss": 0.3406, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 1.8998896812539383, | |
| "learning_rate": 9.86348699635356e-06, | |
| "loss": 0.3718, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.5048888888888889, | |
| "grad_norm": 1.8642598101048677, | |
| "learning_rate": 9.85864366070515e-06, | |
| "loss": 0.3728, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.5084444444444445, | |
| "grad_norm": 2.04147924521036, | |
| "learning_rate": 9.853717129185262e-06, | |
| "loss": 0.3371, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 1.765175754873959, | |
| "learning_rate": 9.848707486152231e-06, | |
| "loss": 0.3468, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.5155555555555555, | |
| "grad_norm": 1.7955950262413882, | |
| "learning_rate": 9.843614817387531e-06, | |
| "loss": 0.3456, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.5191111111111111, | |
| "grad_norm": 1.4037783734962412, | |
| "learning_rate": 9.838439210094309e-06, | |
| "loss": 0.3244, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 1.8006249556531597, | |
| "learning_rate": 9.833180752895887e-06, | |
| "loss": 0.3391, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.5262222222222223, | |
| "grad_norm": 1.7020622735675546, | |
| "learning_rate": 9.827839535834258e-06, | |
| "loss": 0.3922, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.5297777777777778, | |
| "grad_norm": 1.6034083398484584, | |
| "learning_rate": 9.822415650368525e-06, | |
| "loss": 0.304, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 1.7309514997235147, | |
| "learning_rate": 9.816909189373347e-06, | |
| "loss": 0.3531, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "eval_loss": 0.24488620460033417, | |
| "eval_runtime": 562.1833, | |
| "eval_samples_per_second": 17.788, | |
| "eval_steps_per_second": 4.447, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5368888888888889, | |
| "grad_norm": 1.4581125274966544, | |
| "learning_rate": 9.81132024713735e-06, | |
| "loss": 0.3771, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.5404444444444444, | |
| "grad_norm": 1.6490332212552936, | |
| "learning_rate": 9.805648919361505e-06, | |
| "loss": 0.3848, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 1.7512970600212527, | |
| "learning_rate": 9.799895303157492e-06, | |
| "loss": 0.3694, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.5475555555555556, | |
| "grad_norm": 1.7421405313188358, | |
| "learning_rate": 9.794059497046043e-06, | |
| "loss": 0.3553, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.5511111111111111, | |
| "grad_norm": 1.7340918047507783, | |
| "learning_rate": 9.788141600955244e-06, | |
| "loss": 0.3357, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 1.657973523226739, | |
| "learning_rate": 9.782141716218832e-06, | |
| "loss": 0.3448, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5582222222222222, | |
| "grad_norm": 1.7266109549753084, | |
| "learning_rate": 9.77605994557446e-06, | |
| "loss": 0.3336, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5617777777777778, | |
| "grad_norm": 1.7634795513841868, | |
| "learning_rate": 9.769896393161937e-06, | |
| "loss": 0.336, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 1.7328448062964845, | |
| "learning_rate": 9.763651164521436e-06, | |
| "loss": 0.3505, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.5688888888888889, | |
| "grad_norm": 1.7601349288429824, | |
| "learning_rate": 9.7573243665917e-06, | |
| "loss": 0.3816, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5724444444444444, | |
| "grad_norm": 1.887857912509665, | |
| "learning_rate": 9.750916107708205e-06, | |
| "loss": 0.358, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 1.8940080571652895, | |
| "learning_rate": 9.744426497601305e-06, | |
| "loss": 0.363, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5795555555555556, | |
| "grad_norm": 1.5744873206102685, | |
| "learning_rate": 9.737855647394346e-06, | |
| "loss": 0.3544, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5831111111111111, | |
| "grad_norm": 1.5744080074196256, | |
| "learning_rate": 9.73120366960178e-06, | |
| "loss": 0.375, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 1.6398095171132219, | |
| "learning_rate": 9.724470678127226e-06, | |
| "loss": 0.3649, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5902222222222222, | |
| "grad_norm": 1.4310246627875627, | |
| "learning_rate": 9.717656788261519e-06, | |
| "loss": 0.3716, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5937777777777777, | |
| "grad_norm": 1.490999227794774, | |
| "learning_rate": 9.71076211668074e-06, | |
| "loss": 0.352, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 1.6484132205325386, | |
| "learning_rate": 9.703786781444218e-06, | |
| "loss": 0.3555, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.6008888888888889, | |
| "grad_norm": 1.3854857319423775, | |
| "learning_rate": 9.69673090199251e-06, | |
| "loss": 0.3348, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.6044444444444445, | |
| "grad_norm": 1.6107410705301848, | |
| "learning_rate": 9.689594599145348e-06, | |
| "loss": 0.3499, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 1.520886748403311, | |
| "learning_rate": 9.682377995099581e-06, | |
| "loss": 0.3389, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.6115555555555555, | |
| "grad_norm": 1.4556730210725268, | |
| "learning_rate": 9.675081213427076e-06, | |
| "loss": 0.3412, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.6151111111111112, | |
| "grad_norm": 1.476388303700134, | |
| "learning_rate": 9.667704379072597e-06, | |
| "loss": 0.3363, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 1.2168509424846436, | |
| "learning_rate": 9.660247618351683e-06, | |
| "loss": 0.3328, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 1.395468629739029, | |
| "learning_rate": 9.652711058948463e-06, | |
| "loss": 0.3509, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.6257777777777778, | |
| "grad_norm": 1.586845461880222, | |
| "learning_rate": 9.645094829913487e-06, | |
| "loss": 0.3471, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 1.5411518795473231, | |
| "learning_rate": 9.637399061661507e-06, | |
| "loss": 0.3246, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.6328888888888888, | |
| "grad_norm": 1.658660033117339, | |
| "learning_rate": 9.62962388596925e-06, | |
| "loss": 0.3399, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.6364444444444445, | |
| "grad_norm": 1.313159566501215, | |
| "learning_rate": 9.621769435973152e-06, | |
| "loss": 0.3478, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.8380402091451324, | |
| "learning_rate": 9.61383584616709e-06, | |
| "loss": 0.3251, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6435555555555555, | |
| "grad_norm": 1.6180991422896933, | |
| "learning_rate": 9.60582325240007e-06, | |
| "loss": 0.3553, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.6471111111111111, | |
| "grad_norm": 1.8283857342608776, | |
| "learning_rate": 9.597731791873907e-06, | |
| "loss": 0.3594, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 1.4175489521300049, | |
| "learning_rate": 9.58956160314087e-06, | |
| "loss": 0.3549, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.6542222222222223, | |
| "grad_norm": 1.6783488504498176, | |
| "learning_rate": 9.581312826101315e-06, | |
| "loss": 0.3813, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.6577777777777778, | |
| "grad_norm": 1.6351873747299641, | |
| "learning_rate": 9.572985602001283e-06, | |
| "loss": 0.3518, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 1.3790848679324303, | |
| "learning_rate": 9.56458007343009e-06, | |
| "loss": 0.3303, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.6648888888888889, | |
| "grad_norm": 1.6322052333334587, | |
| "learning_rate": 9.556096384317878e-06, | |
| "loss": 0.3403, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6684444444444444, | |
| "grad_norm": 1.788030342136729, | |
| "learning_rate": 9.547534679933155e-06, | |
| "loss": 0.3717, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 1.4934586402235337, | |
| "learning_rate": 9.538895106880302e-06, | |
| "loss": 0.3468, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.6755555555555556, | |
| "grad_norm": 1.9556398213487334, | |
| "learning_rate": 9.53017781309707e-06, | |
| "loss": 0.3495, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6791111111111111, | |
| "grad_norm": 1.4201698189636593, | |
| "learning_rate": 9.521382947852042e-06, | |
| "loss": 0.3631, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 1.8176078337580701, | |
| "learning_rate": 9.512510661742078e-06, | |
| "loss": 0.366, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6862222222222222, | |
| "grad_norm": 1.5895629439283847, | |
| "learning_rate": 9.503561106689736e-06, | |
| "loss": 0.3165, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6897777777777778, | |
| "grad_norm": 1.7257922798447645, | |
| "learning_rate": 9.494534435940668e-06, | |
| "loss": 0.3199, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 1.3859470273389864, | |
| "learning_rate": 9.485430804061009e-06, | |
| "loss": 0.3244, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6968888888888889, | |
| "grad_norm": 1.3389192102707597, | |
| "learning_rate": 9.476250366934708e-06, | |
| "loss": 0.3557, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.7004444444444444, | |
| "grad_norm": 1.761133913330945, | |
| "learning_rate": 9.466993281760879e-06, | |
| "loss": 0.3367, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 1.5576575807000288, | |
| "learning_rate": 9.457659707051099e-06, | |
| "loss": 0.335, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.7075555555555556, | |
| "grad_norm": 1.5125566207561287, | |
| "learning_rate": 9.448249802626696e-06, | |
| "loss": 0.3286, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 1.7236714219097393, | |
| "learning_rate": 9.43876372961601e-06, | |
| "loss": 0.3544, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "eval_loss": 0.23682241141796112, | |
| "eval_runtime": 560.8939, | |
| "eval_samples_per_second": 17.829, | |
| "eval_steps_per_second": 4.457, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 1.7803508157706263, | |
| "learning_rate": 9.429201650451642e-06, | |
| "loss": 0.3218, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.7182222222222222, | |
| "grad_norm": 1.6971031315045289, | |
| "learning_rate": 9.419563728867663e-06, | |
| "loss": 0.3417, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.7217777777777777, | |
| "grad_norm": 1.9366329088516083, | |
| "learning_rate": 9.409850129896812e-06, | |
| "loss": 0.3104, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 1.85452483851228, | |
| "learning_rate": 9.40006101986768e-06, | |
| "loss": 0.3371, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.7288888888888889, | |
| "grad_norm": 1.4768370143060883, | |
| "learning_rate": 9.390196566401844e-06, | |
| "loss": 0.3324, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.7324444444444445, | |
| "grad_norm": 1.3195137184227357, | |
| "learning_rate": 9.38025693841102e-06, | |
| "loss": 0.3384, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.7121308917693614, | |
| "learning_rate": 9.370242306094141e-06, | |
| "loss": 0.3339, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.7395555555555555, | |
| "grad_norm": 1.3801023810052373, | |
| "learning_rate": 9.360152840934477e-06, | |
| "loss": 0.3449, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.7431111111111111, | |
| "grad_norm": 1.4391167681264767, | |
| "learning_rate": 9.349988715696671e-06, | |
| "loss": 0.3444, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 1.840759552395967, | |
| "learning_rate": 9.33975010442379e-06, | |
| "loss": 0.3496, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7502222222222222, | |
| "grad_norm": 1.348141880287597, | |
| "learning_rate": 9.329437182434351e-06, | |
| "loss": 0.3202, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.7537777777777778, | |
| "grad_norm": 1.528620379748828, | |
| "learning_rate": 9.31905012631931e-06, | |
| "loss": 0.3545, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 1.502678851982848, | |
| "learning_rate": 9.30858911393904e-06, | |
| "loss": 0.3457, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.7608888888888888, | |
| "grad_norm": 1.591416150002211, | |
| "learning_rate": 9.298054324420294e-06, | |
| "loss": 0.3125, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.7644444444444445, | |
| "grad_norm": 1.5254470204546493, | |
| "learning_rate": 9.287445938153121e-06, | |
| "loss": 0.3596, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 1.230432920766134, | |
| "learning_rate": 9.276764136787798e-06, | |
| "loss": 0.3352, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.7715555555555556, | |
| "grad_norm": 1.8112353212418606, | |
| "learning_rate": 9.266009103231702e-06, | |
| "loss": 0.3504, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.7751111111111111, | |
| "grad_norm": 1.6435932354458154, | |
| "learning_rate": 9.255181021646182e-06, | |
| "loss": 0.3289, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 1.3388409038180085, | |
| "learning_rate": 9.244280077443417e-06, | |
| "loss": 0.3542, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.7822222222222223, | |
| "grad_norm": 1.5875341933538416, | |
| "learning_rate": 9.233306457283223e-06, | |
| "loss": 0.3516, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7857777777777778, | |
| "grad_norm": 1.5094881761609635, | |
| "learning_rate": 9.222260349069874e-06, | |
| "loss": 0.3489, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 1.477094884348464, | |
| "learning_rate": 9.211141941948872e-06, | |
| "loss": 0.3581, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7928888888888889, | |
| "grad_norm": 1.4717030162478277, | |
| "learning_rate": 9.199951426303711e-06, | |
| "loss": 0.3415, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7964444444444444, | |
| "grad_norm": 1.5752422305129774, | |
| "learning_rate": 9.188688993752626e-06, | |
| "loss": 0.3355, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.5354049474859641, | |
| "learning_rate": 9.177354837145298e-06, | |
| "loss": 0.3394, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.8035555555555556, | |
| "grad_norm": 1.8308300488763203, | |
| "learning_rate": 9.165949150559561e-06, | |
| "loss": 0.3545, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.8071111111111111, | |
| "grad_norm": 1.7274391712847685, | |
| "learning_rate": 9.154472129298075e-06, | |
| "loss": 0.363, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 1.663966013940676, | |
| "learning_rate": 9.142923969884984e-06, | |
| "loss": 0.3395, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.8142222222222222, | |
| "grad_norm": 1.631283026660004, | |
| "learning_rate": 9.131304870062554e-06, | |
| "loss": 0.3486, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.8177777777777778, | |
| "grad_norm": 1.6552982308578106, | |
| "learning_rate": 9.119615028787771e-06, | |
| "loss": 0.3509, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 1.7276297897533288, | |
| "learning_rate": 9.107854646228961e-06, | |
| "loss": 0.325, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.8248888888888889, | |
| "grad_norm": 1.445647497408194, | |
| "learning_rate": 9.096023923762333e-06, | |
| "loss": 0.3149, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.8284444444444444, | |
| "grad_norm": 1.531947731156783, | |
| "learning_rate": 9.08412306396856e-06, | |
| "loss": 0.348, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 1.3576987022774867, | |
| "learning_rate": 9.072152270629281e-06, | |
| "loss": 0.3096, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.8355555555555556, | |
| "grad_norm": 1.4298680216684836, | |
| "learning_rate": 9.060111748723639e-06, | |
| "loss": 0.3609, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8391111111111111, | |
| "grad_norm": 1.5782942370819155, | |
| "learning_rate": 9.048001704424747e-06, | |
| "loss": 0.3307, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 1.6461644102732529, | |
| "learning_rate": 9.035822345096177e-06, | |
| "loss": 0.3327, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.8462222222222222, | |
| "grad_norm": 1.5843145785651733, | |
| "learning_rate": 9.023573879288394e-06, | |
| "loss": 0.3312, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.8497777777777777, | |
| "grad_norm": 1.5152546857205669, | |
| "learning_rate": 9.0112565167352e-06, | |
| "loss": 0.3298, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 1.7304070586423994, | |
| "learning_rate": 8.99887046835013e-06, | |
| "loss": 0.3404, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8568888888888889, | |
| "grad_norm": 1.461299493248939, | |
| "learning_rate": 8.986415946222843e-06, | |
| "loss": 0.3351, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.8604444444444445, | |
| "grad_norm": 1.6967152528749099, | |
| "learning_rate": 8.973893163615498e-06, | |
| "loss": 0.3257, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 1.4154067723973784, | |
| "learning_rate": 8.96130233495909e-06, | |
| "loss": 0.3199, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.8675555555555555, | |
| "grad_norm": 1.3361597312618834, | |
| "learning_rate": 8.948643675849793e-06, | |
| "loss": 0.3442, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8711111111111111, | |
| "grad_norm": 1.4032866224408458, | |
| "learning_rate": 8.935917403045251e-06, | |
| "loss": 0.2947, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 1.234939739680067, | |
| "learning_rate": 8.923123734460885e-06, | |
| "loss": 0.3577, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.8782222222222222, | |
| "grad_norm": 1.5765934665163166, | |
| "learning_rate": 8.910262889166144e-06, | |
| "loss": 0.3326, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.8817777777777778, | |
| "grad_norm": 1.5046341548865376, | |
| "learning_rate": 8.897335087380769e-06, | |
| "loss": 0.3212, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 1.3276870900100486, | |
| "learning_rate": 8.884340550471008e-06, | |
| "loss": 0.3143, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 1.719735619655969, | |
| "learning_rate": 8.87127950094584e-06, | |
| "loss": 0.3747, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "eval_loss": 0.23135392367839813, | |
| "eval_runtime": 562.1868, | |
| "eval_samples_per_second": 17.788, | |
| "eval_steps_per_second": 4.447, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8924444444444445, | |
| "grad_norm": 1.584313301872745, | |
| "learning_rate": 8.85815216245315e-06, | |
| "loss": 0.3251, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 1.2854406639721594, | |
| "learning_rate": 8.844958759775917e-06, | |
| "loss": 0.3242, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8995555555555556, | |
| "grad_norm": 1.3421636352208044, | |
| "learning_rate": 8.83169951882834e-06, | |
| "loss": 0.3069, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.9031111111111111, | |
| "grad_norm": 1.6982202912735271, | |
| "learning_rate": 8.818374666652001e-06, | |
| "loss": 0.3303, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 1.3802398833209684, | |
| "learning_rate": 8.804984431411951e-06, | |
| "loss": 0.3558, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.9102222222222223, | |
| "grad_norm": 1.8913239549685246, | |
| "learning_rate": 8.791529042392813e-06, | |
| "loss": 0.3947, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.9137777777777778, | |
| "grad_norm": 1.4494060942613418, | |
| "learning_rate": 8.77800872999486e-06, | |
| "loss": 0.3362, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 1.7204036116920214, | |
| "learning_rate": 8.764423725730062e-06, | |
| "loss": 0.3298, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.9208888888888889, | |
| "grad_norm": 1.6130463149964605, | |
| "learning_rate": 8.750774262218129e-06, | |
| "loss": 0.3218, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.9244444444444444, | |
| "grad_norm": 1.4272505738840544, | |
| "learning_rate": 8.737060573182518e-06, | |
| "loss": 0.3325, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 1.5909460584884059, | |
| "learning_rate": 8.723282893446447e-06, | |
| "loss": 0.3496, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.9315555555555556, | |
| "grad_norm": 2.0360938733984963, | |
| "learning_rate": 8.709441458928853e-06, | |
| "loss": 0.3197, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.9351111111111111, | |
| "grad_norm": 1.6918095124182533, | |
| "learning_rate": 8.695536506640369e-06, | |
| "loss": 0.3349, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 1.561883507817091, | |
| "learning_rate": 8.681568274679264e-06, | |
| "loss": 0.3357, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.9422222222222222, | |
| "grad_norm": 1.635386123467993, | |
| "learning_rate": 8.66753700222735e-06, | |
| "loss": 0.3023, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9457777777777778, | |
| "grad_norm": 1.6460980849436542, | |
| "learning_rate": 8.653442929545914e-06, | |
| "loss": 0.3482, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 1.8476260091970051, | |
| "learning_rate": 8.639286297971575e-06, | |
| "loss": 0.3111, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.9528888888888889, | |
| "grad_norm": 1.5625524365842092, | |
| "learning_rate": 8.625067349912171e-06, | |
| "loss": 0.3333, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.9564444444444444, | |
| "grad_norm": 1.679549783886682, | |
| "learning_rate": 8.610786328842602e-06, | |
| "loss": 0.3012, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.7334271987057313, | |
| "learning_rate": 8.59644347930066e-06, | |
| "loss": 0.3158, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9635555555555556, | |
| "grad_norm": 1.7183702234532738, | |
| "learning_rate": 8.582039046882842e-06, | |
| "loss": 0.3045, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.9671111111111111, | |
| "grad_norm": 1.677327314139312, | |
| "learning_rate": 8.567573278240147e-06, | |
| "loss": 0.3379, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 1.4197759922345252, | |
| "learning_rate": 8.55304642107385e-06, | |
| "loss": 0.3376, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.9742222222222222, | |
| "grad_norm": 1.7365860935410007, | |
| "learning_rate": 8.538458724131258e-06, | |
| "loss": 0.3395, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "grad_norm": 1.5642529718868006, | |
| "learning_rate": 8.523810437201463e-06, | |
| "loss": 0.3105, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 1.6285786801359268, | |
| "learning_rate": 8.509101811111045e-06, | |
| "loss": 0.314, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.9848888888888889, | |
| "grad_norm": 1.7932095997349375, | |
| "learning_rate": 8.494333097719795e-06, | |
| "loss": 0.3183, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.9884444444444445, | |
| "grad_norm": 1.7636055661476138, | |
| "learning_rate": 8.479504549916393e-06, | |
| "loss": 0.3459, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 1.7893218283734698, | |
| "learning_rate": 8.464616421614077e-06, | |
| "loss": 0.3655, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.9955555555555555, | |
| "grad_norm": 1.56040627840869, | |
| "learning_rate": 8.449668967746303e-06, | |
| "loss": 0.3145, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9991111111111111, | |
| "grad_norm": 1.7372692555117912, | |
| "learning_rate": 8.434662444262374e-06, | |
| "loss": 0.3152, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.0026666666666666, | |
| "grad_norm": 1.3178611516659062, | |
| "learning_rate": 8.419597108123054e-06, | |
| "loss": 0.256, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.0062222222222221, | |
| "grad_norm": 1.7641513434209246, | |
| "learning_rate": 8.404473217296174e-06, | |
| "loss": 0.2304, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.0097777777777779, | |
| "grad_norm": 1.702777106397184, | |
| "learning_rate": 8.389291030752215e-06, | |
| "loss": 0.2451, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.0133333333333334, | |
| "grad_norm": 1.516656565976496, | |
| "learning_rate": 8.37405080845987e-06, | |
| "loss": 0.2463, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.016888888888889, | |
| "grad_norm": 1.2615996283177406, | |
| "learning_rate": 8.358752811381592e-06, | |
| "loss": 0.2439, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.0204444444444445, | |
| "grad_norm": 1.2426761993789008, | |
| "learning_rate": 8.343397301469127e-06, | |
| "loss": 0.2301, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 1.7414567869166766, | |
| "learning_rate": 8.327984541659035e-06, | |
| "loss": 0.26, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.0275555555555556, | |
| "grad_norm": 1.778546754169589, | |
| "learning_rate": 8.312514795868177e-06, | |
| "loss": 0.2537, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.031111111111111, | |
| "grad_norm": 1.693194016869835, | |
| "learning_rate": 8.296988328989195e-06, | |
| "loss": 0.2474, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0346666666666666, | |
| "grad_norm": 1.4905129718116352, | |
| "learning_rate": 8.281405406885992e-06, | |
| "loss": 0.2259, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.0382222222222222, | |
| "grad_norm": 1.6844431624217413, | |
| "learning_rate": 8.265766296389164e-06, | |
| "loss": 0.2206, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.0417777777777777, | |
| "grad_norm": 1.4064579919162583, | |
| "learning_rate": 8.250071265291432e-06, | |
| "loss": 0.2498, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.0453333333333332, | |
| "grad_norm": 1.4383166925160618, | |
| "learning_rate": 8.23432058234307e-06, | |
| "loss": 0.2316, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.048888888888889, | |
| "grad_norm": 1.7880359369165812, | |
| "learning_rate": 8.218514517247287e-06, | |
| "loss": 0.2421, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.0524444444444445, | |
| "grad_norm": 1.49095155848045, | |
| "learning_rate": 8.202653340655614e-06, | |
| "loss": 0.2547, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 1.802867297616481, | |
| "learning_rate": 8.18673732416328e-06, | |
| "loss": 0.2609, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.0595555555555556, | |
| "grad_norm": 1.799375023246126, | |
| "learning_rate": 8.170766740304541e-06, | |
| "loss": 0.2369, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.0631111111111111, | |
| "grad_norm": 1.645090115101595, | |
| "learning_rate": 8.154741862548035e-06, | |
| "loss": 0.2519, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "grad_norm": 1.8315765038402207, | |
| "learning_rate": 8.13866296529208e-06, | |
| "loss": 0.2248, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "eval_loss": 0.23144060373306274, | |
| "eval_runtime": 562.045, | |
| "eval_samples_per_second": 17.792, | |
| "eval_steps_per_second": 4.448, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0702222222222222, | |
| "grad_norm": 1.3604786834079945, | |
| "learning_rate": 8.122530323859992e-06, | |
| "loss": 0.2494, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.0737777777777777, | |
| "grad_norm": 1.472974815302568, | |
| "learning_rate": 8.106344214495359e-06, | |
| "loss": 0.2168, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.0773333333333333, | |
| "grad_norm": 1.9232740710019078, | |
| "learning_rate": 8.090104914357316e-06, | |
| "loss": 0.2544, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.0808888888888888, | |
| "grad_norm": 1.6517745707358162, | |
| "learning_rate": 8.073812701515799e-06, | |
| "loss": 0.2362, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.0844444444444445, | |
| "grad_norm": 1.5375717590050721, | |
| "learning_rate": 8.057467854946783e-06, | |
| "loss": 0.238, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 1.736104134714019, | |
| "learning_rate": 8.041070654527498e-06, | |
| "loss": 0.2329, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.0915555555555556, | |
| "grad_norm": 1.578126670290498, | |
| "learning_rate": 8.024621381031654e-06, | |
| "loss": 0.2525, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.0951111111111111, | |
| "grad_norm": 1.2995445031583646, | |
| "learning_rate": 8.008120316124612e-06, | |
| "loss": 0.2378, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.0986666666666667, | |
| "grad_norm": 1.9084352174123695, | |
| "learning_rate": 7.991567742358582e-06, | |
| "loss": 0.2469, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.1022222222222222, | |
| "grad_norm": 1.6004292294784017, | |
| "learning_rate": 7.974963943167761e-06, | |
| "loss": 0.2721, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.1057777777777777, | |
| "grad_norm": 1.4738079995177567, | |
| "learning_rate": 7.958309202863506e-06, | |
| "loss": 0.2457, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.1093333333333333, | |
| "grad_norm": 1.5493675656690653, | |
| "learning_rate": 7.941603806629444e-06, | |
| "loss": 0.2274, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.1128888888888888, | |
| "grad_norm": 1.6554292154622638, | |
| "learning_rate": 7.9248480405166e-06, | |
| "loss": 0.2595, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.1164444444444444, | |
| "grad_norm": 1.6112904935857704, | |
| "learning_rate": 7.908042191438497e-06, | |
| "loss": 0.2374, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.4663251499352947, | |
| "learning_rate": 7.891186547166238e-06, | |
| "loss": 0.2128, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.1235555555555556, | |
| "grad_norm": 1.8636139047215206, | |
| "learning_rate": 7.874281396323589e-06, | |
| "loss": 0.2263, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.1271111111111112, | |
| "grad_norm": 1.6257921444204015, | |
| "learning_rate": 7.857327028382025e-06, | |
| "loss": 0.2392, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.1306666666666667, | |
| "grad_norm": 1.4066061759358834, | |
| "learning_rate": 7.84032373365578e-06, | |
| "loss": 0.2342, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.1342222222222222, | |
| "grad_norm": 1.5852680151393, | |
| "learning_rate": 7.823271803296876e-06, | |
| "loss": 0.2271, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.1377777777777778, | |
| "grad_norm": 1.7721860252109063, | |
| "learning_rate": 7.80617152929014e-06, | |
| "loss": 0.2376, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1413333333333333, | |
| "grad_norm": 1.8867413038702499, | |
| "learning_rate": 7.789023204448189e-06, | |
| "loss": 0.2516, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.1448888888888888, | |
| "grad_norm": 1.4279840133381525, | |
| "learning_rate": 7.771827122406437e-06, | |
| "loss": 0.2265, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.1484444444444444, | |
| "grad_norm": 1.676800279171029, | |
| "learning_rate": 7.754583577618057e-06, | |
| "loss": 0.2554, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 1.6723494127405627, | |
| "learning_rate": 7.737292865348933e-06, | |
| "loss": 0.2408, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.1555555555555554, | |
| "grad_norm": 1.6148606083372026, | |
| "learning_rate": 7.719955281672618e-06, | |
| "loss": 0.2287, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.1591111111111112, | |
| "grad_norm": 1.6092526546730486, | |
| "learning_rate": 7.702571123465252e-06, | |
| "loss": 0.237, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.1626666666666667, | |
| "grad_norm": 1.3380193435685535, | |
| "learning_rate": 7.685140688400484e-06, | |
| "loss": 0.2393, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.1662222222222223, | |
| "grad_norm": 1.3406231671146336, | |
| "learning_rate": 7.66766427494438e-06, | |
| "loss": 0.2158, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.1697777777777778, | |
| "grad_norm": 1.5365708586926026, | |
| "learning_rate": 7.650142182350294e-06, | |
| "loss": 0.201, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.1733333333333333, | |
| "grad_norm": 1.7847958889549216, | |
| "learning_rate": 7.632574710653773e-06, | |
| "loss": 0.2627, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1768888888888889, | |
| "grad_norm": 1.4770511975662048, | |
| "learning_rate": 7.614962160667384e-06, | |
| "loss": 0.221, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.1804444444444444, | |
| "grad_norm": 1.8043230337610534, | |
| "learning_rate": 7.597304833975596e-06, | |
| "loss": 0.2419, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 1.9363141324764201, | |
| "learning_rate": 7.579603032929597e-06, | |
| "loss": 0.2572, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.1875555555555555, | |
| "grad_norm": 1.600071864532325, | |
| "learning_rate": 7.56185706064212e-06, | |
| "loss": 0.2462, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.1911111111111112, | |
| "grad_norm": 1.5785414115422856, | |
| "learning_rate": 7.544067220982254e-06, | |
| "loss": 0.2312, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.1946666666666665, | |
| "grad_norm": 1.5789285671514135, | |
| "learning_rate": 7.526233818570245e-06, | |
| "loss": 0.2067, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.1982222222222223, | |
| "grad_norm": 1.7448328186975814, | |
| "learning_rate": 7.508357158772273e-06, | |
| "loss": 0.2448, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.2017777777777778, | |
| "grad_norm": 1.4619128557517416, | |
| "learning_rate": 7.490437547695224e-06, | |
| "loss": 0.2194, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.2053333333333334, | |
| "grad_norm": 1.6063307731749306, | |
| "learning_rate": 7.472475292181454e-06, | |
| "loss": 0.2501, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.208888888888889, | |
| "grad_norm": 1.9510115721688825, | |
| "learning_rate": 7.45447069980353e-06, | |
| "loss": 0.2515, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.2124444444444444, | |
| "grad_norm": 1.5856572080139135, | |
| "learning_rate": 7.4364240788589625e-06, | |
| "loss": 0.2461, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 1.846941973796494, | |
| "learning_rate": 7.418335738364931e-06, | |
| "loss": 0.2241, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.2195555555555555, | |
| "grad_norm": 1.8886992728965029, | |
| "learning_rate": 7.400205988052991e-06, | |
| "loss": 0.2298, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.223111111111111, | |
| "grad_norm": 1.6140767527032074, | |
| "learning_rate": 7.382035138363764e-06, | |
| "loss": 0.2516, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.2266666666666666, | |
| "grad_norm": 1.637777869962237, | |
| "learning_rate": 7.363823500441636e-06, | |
| "loss": 0.2422, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.2302222222222223, | |
| "grad_norm": 1.3783132940885547, | |
| "learning_rate": 7.345571386129413e-06, | |
| "loss": 0.2368, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.2337777777777779, | |
| "grad_norm": 1.750318456803832, | |
| "learning_rate": 7.327279107962995e-06, | |
| "loss": 0.2488, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.2373333333333334, | |
| "grad_norm": 1.7745176716418858, | |
| "learning_rate": 7.308946979166012e-06, | |
| "loss": 0.2277, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.240888888888889, | |
| "grad_norm": 1.7469697925399752, | |
| "learning_rate": 7.290575313644476e-06, | |
| "loss": 0.2329, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.2444444444444445, | |
| "grad_norm": 1.4439208816879574, | |
| "learning_rate": 7.272164425981387e-06, | |
| "loss": 0.2575, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2444444444444445, | |
| "eval_loss": 0.22694812715053558, | |
| "eval_runtime": 564.2235, | |
| "eval_samples_per_second": 17.723, | |
| "eval_steps_per_second": 4.431, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 1.5767155030054063, | |
| "learning_rate": 7.253714631431366e-06, | |
| "loss": 0.2492, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.2515555555555555, | |
| "grad_norm": 1.5655624730827595, | |
| "learning_rate": 7.235226245915239e-06, | |
| "loss": 0.2259, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.255111111111111, | |
| "grad_norm": 1.8883245133962092, | |
| "learning_rate": 7.216699586014642e-06, | |
| "loss": 0.2487, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.2586666666666666, | |
| "grad_norm": 1.2903228684726653, | |
| "learning_rate": 7.198134968966588e-06, | |
| "loss": 0.2341, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.2622222222222224, | |
| "grad_norm": 1.6585013961180077, | |
| "learning_rate": 7.179532712658047e-06, | |
| "loss": 0.2625, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.2657777777777777, | |
| "grad_norm": 1.4955952405740183, | |
| "learning_rate": 7.160893135620488e-06, | |
| "loss": 0.2602, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.2693333333333334, | |
| "grad_norm": 1.8286387441617464, | |
| "learning_rate": 7.142216557024443e-06, | |
| "loss": 0.2221, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.272888888888889, | |
| "grad_norm": 1.6146123865735058, | |
| "learning_rate": 7.123503296674021e-06, | |
| "loss": 0.247, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.2764444444444445, | |
| "grad_norm": 1.4700165794501387, | |
| "learning_rate": 7.104753675001453e-06, | |
| "loss": 0.2405, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.4475828320209072, | |
| "learning_rate": 7.085968013061585e-06, | |
| "loss": 0.2452, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2835555555555556, | |
| "grad_norm": 1.9854917772925798, | |
| "learning_rate": 7.067146632526398e-06, | |
| "loss": 0.2813, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.287111111111111, | |
| "grad_norm": 1.863775670718366, | |
| "learning_rate": 7.048289855679487e-06, | |
| "loss": 0.2272, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.2906666666666666, | |
| "grad_norm": 2.0238745081645693, | |
| "learning_rate": 7.029398005410551e-06, | |
| "loss": 0.2588, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.2942222222222222, | |
| "grad_norm": 1.8729516419448864, | |
| "learning_rate": 7.01047140520986e-06, | |
| "loss": 0.2403, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.2977777777777777, | |
| "grad_norm": 1.721501900738319, | |
| "learning_rate": 6.9915103791627146e-06, | |
| "loss": 0.2477, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.3013333333333335, | |
| "grad_norm": 1.6626021007269847, | |
| "learning_rate": 6.972515251943901e-06, | |
| "loss": 0.2279, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.3048888888888888, | |
| "grad_norm": 1.6716430135185554, | |
| "learning_rate": 6.953486348812127e-06, | |
| "loss": 0.2414, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.3084444444444445, | |
| "grad_norm": 1.4291636119458788, | |
| "learning_rate": 6.934423995604455e-06, | |
| "loss": 0.248, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 1.4674689793023254, | |
| "learning_rate": 6.915328518730724e-06, | |
| "loss": 0.2459, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.3155555555555556, | |
| "grad_norm": 1.5215618690023482, | |
| "learning_rate": 6.896200245167956e-06, | |
| "loss": 0.2546, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.3191111111111111, | |
| "grad_norm": 1.67624683709797, | |
| "learning_rate": 6.877039502454758e-06, | |
| "loss": 0.2006, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.3226666666666667, | |
| "grad_norm": 1.552246698817707, | |
| "learning_rate": 6.857846618685724e-06, | |
| "loss": 0.2213, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.3262222222222222, | |
| "grad_norm": 2.021180154460745, | |
| "learning_rate": 6.8386219225057945e-06, | |
| "loss": 0.2315, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.3297777777777777, | |
| "grad_norm": 1.8378386656471875, | |
| "learning_rate": 6.819365743104655e-06, | |
| "loss": 0.2235, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 1.8383503621089257, | |
| "learning_rate": 6.8000784102110795e-06, | |
| "loss": 0.2348, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.3368888888888888, | |
| "grad_norm": 1.476660408503267, | |
| "learning_rate": 6.780760254087293e-06, | |
| "loss": 0.2433, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.3404444444444445, | |
| "grad_norm": 1.6056267413924534, | |
| "learning_rate": 6.7614116055233146e-06, | |
| "loss": 0.2511, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 1.5433968607865032, | |
| "learning_rate": 6.742032795831298e-06, | |
| "loss": 0.2218, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.3475555555555556, | |
| "grad_norm": 1.8752695620093498, | |
| "learning_rate": 6.722624156839847e-06, | |
| "loss": 0.2607, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.3511111111111112, | |
| "grad_norm": 1.7018274048947808, | |
| "learning_rate": 6.703186020888347e-06, | |
| "loss": 0.2434, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3546666666666667, | |
| "grad_norm": 1.7419410223233012, | |
| "learning_rate": 6.683718720821264e-06, | |
| "loss": 0.2494, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.3582222222222222, | |
| "grad_norm": 1.5145074056393906, | |
| "learning_rate": 6.664222589982451e-06, | |
| "loss": 0.2215, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.3617777777777778, | |
| "grad_norm": 1.2846516741089247, | |
| "learning_rate": 6.644697962209434e-06, | |
| "loss": 0.2346, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.3653333333333333, | |
| "grad_norm": 1.4951097829345636, | |
| "learning_rate": 6.6251451718277095e-06, | |
| "loss": 0.2122, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.3688888888888888, | |
| "grad_norm": 1.837176746272441, | |
| "learning_rate": 6.605564553644998e-06, | |
| "loss": 0.2289, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3724444444444446, | |
| "grad_norm": 1.7541861945923773, | |
| "learning_rate": 6.585956442945531e-06, | |
| "loss": 0.2304, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 1.456084798251464, | |
| "learning_rate": 6.566321175484298e-06, | |
| "loss": 0.2524, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.3795555555555556, | |
| "grad_norm": 1.4021880078388174, | |
| "learning_rate": 6.546659087481304e-06, | |
| "loss": 0.2344, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.3831111111111112, | |
| "grad_norm": 1.386759603833687, | |
| "learning_rate": 6.526970515615807e-06, | |
| "loss": 0.2278, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.3866666666666667, | |
| "grad_norm": 1.9340717544487618, | |
| "learning_rate": 6.507255797020555e-06, | |
| "loss": 0.2299, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3902222222222222, | |
| "grad_norm": 1.4309730673942778, | |
| "learning_rate": 6.487515269276015e-06, | |
| "loss": 0.2518, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.3937777777777778, | |
| "grad_norm": 1.5432073955843775, | |
| "learning_rate": 6.467749270404593e-06, | |
| "loss": 0.2196, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.3973333333333333, | |
| "grad_norm": 1.5255820019311863, | |
| "learning_rate": 6.4479581388648404e-06, | |
| "loss": 0.2527, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.4008888888888889, | |
| "grad_norm": 1.9387048217346732, | |
| "learning_rate": 6.428142213545662e-06, | |
| "loss": 0.2663, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.4044444444444444, | |
| "grad_norm": 1.4687424654762213, | |
| "learning_rate": 6.408301833760517e-06, | |
| "loss": 0.2141, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 1.6790491256350315, | |
| "learning_rate": 6.388437339241601e-06, | |
| "loss": 0.2419, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.4115555555555557, | |
| "grad_norm": 1.4986463255132796, | |
| "learning_rate": 6.368549070134036e-06, | |
| "loss": 0.2205, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.415111111111111, | |
| "grad_norm": 1.8639041315873657, | |
| "learning_rate": 6.348637366990038e-06, | |
| "loss": 0.2403, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.4186666666666667, | |
| "grad_norm": 1.8313804556837663, | |
| "learning_rate": 6.328702570763098e-06, | |
| "loss": 0.243, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.4222222222222223, | |
| "grad_norm": 1.6288666479905434, | |
| "learning_rate": 6.308745022802128e-06, | |
| "loss": 0.2376, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.4222222222222223, | |
| "eval_loss": 0.22332721948623657, | |
| "eval_runtime": 562.4439, | |
| "eval_samples_per_second": 17.78, | |
| "eval_steps_per_second": 4.445, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.4257777777777778, | |
| "grad_norm": 1.28363469470016, | |
| "learning_rate": 6.288765064845629e-06, | |
| "loss": 0.2119, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.4293333333333333, | |
| "grad_norm": 1.5685400141436767, | |
| "learning_rate": 6.268763039015833e-06, | |
| "loss": 0.2372, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.4328888888888889, | |
| "grad_norm": 1.2419732210599121, | |
| "learning_rate": 6.248739287812846e-06, | |
| "loss": 0.2378, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.4364444444444444, | |
| "grad_norm": 1.450791049105233, | |
| "learning_rate": 6.228694154108783e-06, | |
| "loss": 0.236, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.3478041984965912, | |
| "learning_rate": 6.208627981141902e-06, | |
| "loss": 0.2165, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.4435555555555555, | |
| "grad_norm": 1.6880548918845273, | |
| "learning_rate": 6.188541112510713e-06, | |
| "loss": 0.2405, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.447111111111111, | |
| "grad_norm": 1.489941080547117, | |
| "learning_rate": 6.168433892168113e-06, | |
| "loss": 0.2288, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.4506666666666668, | |
| "grad_norm": 2.036909885440752, | |
| "learning_rate": 6.148306664415476e-06, | |
| "loss": 0.235, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.4542222222222223, | |
| "grad_norm": 1.60733518117776, | |
| "learning_rate": 6.128159773896783e-06, | |
| "loss": 0.2143, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.4577777777777778, | |
| "grad_norm": 1.6002205563066152, | |
| "learning_rate": 6.107993565592693e-06, | |
| "loss": 0.239, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4613333333333334, | |
| "grad_norm": 1.59924513215813, | |
| "learning_rate": 6.087808384814652e-06, | |
| "loss": 0.2185, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.464888888888889, | |
| "grad_norm": 1.6651512334739322, | |
| "learning_rate": 6.067604577198981e-06, | |
| "loss": 0.238, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.4684444444444444, | |
| "grad_norm": 1.6551324049801701, | |
| "learning_rate": 6.04738248870095e-06, | |
| "loss": 0.2238, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 1.5301258421668906, | |
| "learning_rate": 6.027142465588855e-06, | |
| "loss": 0.2453, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.4755555555555555, | |
| "grad_norm": 1.8144546212524773, | |
| "learning_rate": 6.006884854438099e-06, | |
| "loss": 0.2375, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.479111111111111, | |
| "grad_norm": 1.5099593511650293, | |
| "learning_rate": 5.9866100021252415e-06, | |
| "loss": 0.2331, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.4826666666666668, | |
| "grad_norm": 1.502590510458408, | |
| "learning_rate": 5.966318255822072e-06, | |
| "loss": 0.2131, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.4862222222222221, | |
| "grad_norm": 1.7399671557461471, | |
| "learning_rate": 5.946009962989659e-06, | |
| "loss": 0.243, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.4897777777777779, | |
| "grad_norm": 1.959843593418678, | |
| "learning_rate": 5.9256854713724e-06, | |
| "loss": 0.2344, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.4933333333333334, | |
| "grad_norm": 1.5187384802338688, | |
| "learning_rate": 5.905345128992072e-06, | |
| "loss": 0.2372, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.496888888888889, | |
| "grad_norm": 1.713913961820143, | |
| "learning_rate": 5.884989284141866e-06, | |
| "loss": 0.2137, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.5004444444444445, | |
| "grad_norm": 1.5301932679943313, | |
| "learning_rate": 5.86461828538043e-06, | |
| "loss": 0.2264, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 1.6650108469792486, | |
| "learning_rate": 5.84423248152589e-06, | |
| "loss": 0.2167, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.5075555555555555, | |
| "grad_norm": 1.7377610919859674, | |
| "learning_rate": 5.82383222164989e-06, | |
| "loss": 0.2223, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.511111111111111, | |
| "grad_norm": 1.8280200619954592, | |
| "learning_rate": 5.803417855071603e-06, | |
| "loss": 0.2361, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.5146666666666668, | |
| "grad_norm": 1.7315368181217787, | |
| "learning_rate": 5.782989731351762e-06, | |
| "loss": 0.2665, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.5182222222222221, | |
| "grad_norm": 1.6917154736502973, | |
| "learning_rate": 5.762548200286659e-06, | |
| "loss": 0.212, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.521777777777778, | |
| "grad_norm": 1.5262051452408105, | |
| "learning_rate": 5.742093611902168e-06, | |
| "loss": 0.2142, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.5253333333333332, | |
| "grad_norm": 1.4955231464253305, | |
| "learning_rate": 5.721626316447748e-06, | |
| "loss": 0.2302, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.528888888888889, | |
| "grad_norm": 1.729596636954076, | |
| "learning_rate": 5.7011466643904434e-06, | |
| "loss": 0.2209, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.5324444444444445, | |
| "grad_norm": 1.470928828267314, | |
| "learning_rate": 5.680655006408882e-06, | |
| "loss": 0.2398, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 1.4046672488847465, | |
| "learning_rate": 5.660151693387273e-06, | |
| "loss": 0.2335, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.5395555555555556, | |
| "grad_norm": 1.6687999325358385, | |
| "learning_rate": 5.639637076409404e-06, | |
| "loss": 0.2207, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.543111111111111, | |
| "grad_norm": 1.60564618911301, | |
| "learning_rate": 5.6191115067526135e-06, | |
| "loss": 0.2411, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.5466666666666666, | |
| "grad_norm": 1.6047937970455775, | |
| "learning_rate": 5.598575335881792e-06, | |
| "loss": 0.2161, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.5502222222222222, | |
| "grad_norm": 1.3451412373708476, | |
| "learning_rate": 5.578028915443356e-06, | |
| "loss": 0.2104, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.553777777777778, | |
| "grad_norm": 1.827680836587444, | |
| "learning_rate": 5.55747259725923e-06, | |
| "loss": 0.2333, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.5573333333333332, | |
| "grad_norm": 1.8474659285597943, | |
| "learning_rate": 5.536906733320816e-06, | |
| "loss": 0.2447, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.560888888888889, | |
| "grad_norm": 1.5571932949328393, | |
| "learning_rate": 5.516331675782973e-06, | |
| "loss": 0.2445, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.5644444444444443, | |
| "grad_norm": 1.9294806844289611, | |
| "learning_rate": 5.495747776957987e-06, | |
| "loss": 0.2382, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 1.3637347529801744, | |
| "learning_rate": 5.475155389309531e-06, | |
| "loss": 0.2162, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.5715555555555556, | |
| "grad_norm": 1.552594376889073, | |
| "learning_rate": 5.4545548654466366e-06, | |
| "loss": 0.2351, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.5751111111111111, | |
| "grad_norm": 1.563596866564994, | |
| "learning_rate": 5.433946558117654e-06, | |
| "loss": 0.2259, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.5786666666666667, | |
| "grad_norm": 1.9424477147575314, | |
| "learning_rate": 5.413330820204214e-06, | |
| "loss": 0.2269, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.5822222222222222, | |
| "grad_norm": 1.7161442287459214, | |
| "learning_rate": 5.392708004715178e-06, | |
| "loss": 0.233, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.5857777777777777, | |
| "grad_norm": 1.4458518805717744, | |
| "learning_rate": 5.372078464780603e-06, | |
| "loss": 0.2428, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.5893333333333333, | |
| "grad_norm": 1.7197914268509118, | |
| "learning_rate": 5.351442553645691e-06, | |
| "loss": 0.2095, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.592888888888889, | |
| "grad_norm": 1.7871712697682276, | |
| "learning_rate": 5.330800624664736e-06, | |
| "loss": 0.2375, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.5964444444444443, | |
| "grad_norm": 1.6154295338481346, | |
| "learning_rate": 5.310153031295079e-06, | |
| "loss": 0.2365, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.8622833358204558, | |
| "learning_rate": 5.289500127091056e-06, | |
| "loss": 0.2521, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.22019484639167786, | |
| "eval_runtime": 562.6101, | |
| "eval_samples_per_second": 17.774, | |
| "eval_steps_per_second": 4.444, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.6035555555555554, | |
| "grad_norm": 1.4160865462023664, | |
| "learning_rate": 5.26884226569794e-06, | |
| "loss": 0.2445, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.6071111111111112, | |
| "grad_norm": 1.6982387533503471, | |
| "learning_rate": 5.248179800845884e-06, | |
| "loss": 0.2586, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.6106666666666667, | |
| "grad_norm": 1.8063057152671183, | |
| "learning_rate": 5.227513086343875e-06, | |
| "loss": 0.2342, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.6142222222222222, | |
| "grad_norm": 1.8369946808465265, | |
| "learning_rate": 5.20684247607366e-06, | |
| "loss": 0.2149, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.6177777777777778, | |
| "grad_norm": 1.4919743522204885, | |
| "learning_rate": 5.186168323983702e-06, | |
| "loss": 0.2361, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.6213333333333333, | |
| "grad_norm": 1.908909797085476, | |
| "learning_rate": 5.1654909840831e-06, | |
| "loss": 0.2422, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.624888888888889, | |
| "grad_norm": 1.6970594817568836, | |
| "learning_rate": 5.144810810435553e-06, | |
| "loss": 0.2702, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.6284444444444444, | |
| "grad_norm": 1.914631182858778, | |
| "learning_rate": 5.124128157153273e-06, | |
| "loss": 0.211, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 1.8308898752074714, | |
| "learning_rate": 5.103443378390935e-06, | |
| "loss": 0.213, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.6355555555555554, | |
| "grad_norm": 1.4716155031307734, | |
| "learning_rate": 5.08275682833961e-06, | |
| "loss": 0.2348, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.6391111111111112, | |
| "grad_norm": 1.3846959035420932, | |
| "learning_rate": 5.062068861220697e-06, | |
| "loss": 0.2323, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.6426666666666667, | |
| "grad_norm": 1.310528332429156, | |
| "learning_rate": 5.041379831279859e-06, | |
| "loss": 0.2274, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.6462222222222223, | |
| "grad_norm": 1.56294035415104, | |
| "learning_rate": 5.020690092780961e-06, | |
| "loss": 0.2382, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.6497777777777778, | |
| "grad_norm": 1.797053581769004, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2263, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.6533333333333333, | |
| "grad_norm": 1.57684485333151, | |
| "learning_rate": 4.9793099072190406e-06, | |
| "loss": 0.2225, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.6568888888888889, | |
| "grad_norm": 2.0411280702141883, | |
| "learning_rate": 4.958620168720144e-06, | |
| "loss": 0.2225, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.6604444444444444, | |
| "grad_norm": 1.476641016823167, | |
| "learning_rate": 4.937931138779305e-06, | |
| "loss": 0.2438, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 1.4259185034698016, | |
| "learning_rate": 4.917243171660391e-06, | |
| "loss": 0.2127, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.6675555555555555, | |
| "grad_norm": 1.9925037267732388, | |
| "learning_rate": 4.896556621609066e-06, | |
| "loss": 0.223, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.6711111111111112, | |
| "grad_norm": 1.3845653896887404, | |
| "learning_rate": 4.8758718428467275e-06, | |
| "loss": 0.2332, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.6746666666666665, | |
| "grad_norm": 1.5936847174408162, | |
| "learning_rate": 4.8551891895644485e-06, | |
| "loss": 0.2381, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.6782222222222223, | |
| "grad_norm": 1.8741655887113169, | |
| "learning_rate": 4.8345090159169015e-06, | |
| "loss": 0.2182, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.6817777777777778, | |
| "grad_norm": 2.0577120951961057, | |
| "learning_rate": 4.813831676016301e-06, | |
| "loss": 0.2323, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.6853333333333333, | |
| "grad_norm": 1.6887655358314864, | |
| "learning_rate": 4.793157523926343e-06, | |
| "loss": 0.2236, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.6888888888888889, | |
| "grad_norm": 1.669624887759933, | |
| "learning_rate": 4.772486913656126e-06, | |
| "loss": 0.216, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.6924444444444444, | |
| "grad_norm": 1.3957590014036165, | |
| "learning_rate": 4.751820199154116e-06, | |
| "loss": 0.2104, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 1.7601085948001791, | |
| "learning_rate": 4.731157734302063e-06, | |
| "loss": 0.2255, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.6995555555555555, | |
| "grad_norm": 1.4141936030167341, | |
| "learning_rate": 4.7104998729089456e-06, | |
| "loss": 0.2216, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.7031111111111112, | |
| "grad_norm": 1.5375991664201998, | |
| "learning_rate": 4.689846968704921e-06, | |
| "loss": 0.2316, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.7066666666666666, | |
| "grad_norm": 1.835379245628528, | |
| "learning_rate": 4.669199375335267e-06, | |
| "loss": 0.2211, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.7102222222222223, | |
| "grad_norm": 1.8813507703109071, | |
| "learning_rate": 4.64855744635431e-06, | |
| "loss": 0.2279, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.7137777777777776, | |
| "grad_norm": 1.6192801344534893, | |
| "learning_rate": 4.627921535219398e-06, | |
| "loss": 0.2076, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.7173333333333334, | |
| "grad_norm": 1.5047363033780152, | |
| "learning_rate": 4.607291995284824e-06, | |
| "loss": 0.2272, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.720888888888889, | |
| "grad_norm": 1.7489501841705488, | |
| "learning_rate": 4.586669179795789e-06, | |
| "loss": 0.2269, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.7244444444444444, | |
| "grad_norm": 1.5125229649844467, | |
| "learning_rate": 4.566053441882346e-06, | |
| "loss": 0.2187, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 1.456492370626904, | |
| "learning_rate": 4.545445134553365e-06, | |
| "loss": 0.2179, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.7315555555555555, | |
| "grad_norm": 1.620452560710039, | |
| "learning_rate": 4.52484461069047e-06, | |
| "loss": 0.2262, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.7351111111111113, | |
| "grad_norm": 2.0083784630353887, | |
| "learning_rate": 4.504252223042015e-06, | |
| "loss": 0.2363, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.7386666666666666, | |
| "grad_norm": 1.4284347298197593, | |
| "learning_rate": 4.4836683242170274e-06, | |
| "loss": 0.2297, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.7422222222222223, | |
| "grad_norm": 1.4968259463132965, | |
| "learning_rate": 4.463093266679185e-06, | |
| "loss": 0.2223, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7457777777777777, | |
| "grad_norm": 1.625381108991568, | |
| "learning_rate": 4.442527402740773e-06, | |
| "loss": 0.2177, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.7493333333333334, | |
| "grad_norm": 1.7761034776967624, | |
| "learning_rate": 4.4219710845566445e-06, | |
| "loss": 0.2266, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.752888888888889, | |
| "grad_norm": 1.513194923019174, | |
| "learning_rate": 4.401424664118209e-06, | |
| "loss": 0.2385, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.7564444444444445, | |
| "grad_norm": 1.6662188116169265, | |
| "learning_rate": 4.380888493247389e-06, | |
| "loss": 0.2209, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.7192566216460916, | |
| "learning_rate": 4.360362923590599e-06, | |
| "loss": 0.2273, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.7635555555555555, | |
| "grad_norm": 1.6376141309754375, | |
| "learning_rate": 4.339848306612726e-06, | |
| "loss": 0.2263, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.767111111111111, | |
| "grad_norm": 1.5441961811580323, | |
| "learning_rate": 4.319344993591122e-06, | |
| "loss": 0.2317, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.7706666666666666, | |
| "grad_norm": 1.8214320335618939, | |
| "learning_rate": 4.298853335609558e-06, | |
| "loss": 0.2352, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.7742222222222224, | |
| "grad_norm": 1.56553607416482, | |
| "learning_rate": 4.278373683552252e-06, | |
| "loss": 0.2451, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 1.3995626238477137, | |
| "learning_rate": 4.257906388097833e-06, | |
| "loss": 0.2119, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "eval_loss": 0.2164340764284134, | |
| "eval_runtime": 560.6747, | |
| "eval_samples_per_second": 17.836, | |
| "eval_steps_per_second": 4.459, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7813333333333334, | |
| "grad_norm": 2.040538040793932, | |
| "learning_rate": 4.237451799713343e-06, | |
| "loss": 0.2311, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.7848888888888887, | |
| "grad_norm": 1.718359867250397, | |
| "learning_rate": 4.2170102686482386e-06, | |
| "loss": 0.2308, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.7884444444444445, | |
| "grad_norm": 1.647498620915099, | |
| "learning_rate": 4.196582144928398e-06, | |
| "loss": 0.2343, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 1.529219174043635, | |
| "learning_rate": 4.176167778350111e-06, | |
| "loss": 0.2471, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.7955555555555556, | |
| "grad_norm": 1.8299602144032394, | |
| "learning_rate": 4.155767518474112e-06, | |
| "loss": 0.2334, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.799111111111111, | |
| "grad_norm": 1.6343462536475093, | |
| "learning_rate": 4.135381714619572e-06, | |
| "loss": 0.2352, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.8026666666666666, | |
| "grad_norm": 1.9294723624845498, | |
| "learning_rate": 4.115010715858135e-06, | |
| "loss": 0.2295, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.8062222222222222, | |
| "grad_norm": 1.8402038191366281, | |
| "learning_rate": 4.09465487100793e-06, | |
| "loss": 0.2227, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.8097777777777777, | |
| "grad_norm": 1.8931304584295443, | |
| "learning_rate": 4.074314528627602e-06, | |
| "loss": 0.2355, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.8133333333333335, | |
| "grad_norm": 1.8206151546804537, | |
| "learning_rate": 4.053990037010342e-06, | |
| "loss": 0.2323, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.8168888888888888, | |
| "grad_norm": 1.5473952396079231, | |
| "learning_rate": 4.033681744177929e-06, | |
| "loss": 0.2069, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.8204444444444445, | |
| "grad_norm": 1.2199743932660083, | |
| "learning_rate": 4.013389997874759e-06, | |
| "loss": 0.2076, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 1.7825722106285342, | |
| "learning_rate": 3.993115145561902e-06, | |
| "loss": 0.2425, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.8275555555555556, | |
| "grad_norm": 1.8303008392916014, | |
| "learning_rate": 3.9728575344111456e-06, | |
| "loss": 0.234, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.8311111111111111, | |
| "grad_norm": 1.2964915164879398, | |
| "learning_rate": 3.9526175112990515e-06, | |
| "loss": 0.1987, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.8346666666666667, | |
| "grad_norm": 1.5700753166440498, | |
| "learning_rate": 3.93239542280102e-06, | |
| "loss": 0.2137, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.8382222222222222, | |
| "grad_norm": 1.6406760092620998, | |
| "learning_rate": 3.912191615185349e-06, | |
| "loss": 0.2235, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.8417777777777777, | |
| "grad_norm": 1.5447905159493263, | |
| "learning_rate": 3.892006434407309e-06, | |
| "loss": 0.2218, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.8453333333333335, | |
| "grad_norm": 1.7383544264235498, | |
| "learning_rate": 3.871840226103219e-06, | |
| "loss": 0.2287, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.8488888888888888, | |
| "grad_norm": 1.9317016214891507, | |
| "learning_rate": 3.851693335584525e-06, | |
| "loss": 0.2228, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.8524444444444446, | |
| "grad_norm": 1.5692018080933492, | |
| "learning_rate": 3.831566107831889e-06, | |
| "loss": 0.2331, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 2.050378660719503, | |
| "learning_rate": 3.8114588874892893e-06, | |
| "loss": 0.2137, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.8595555555555556, | |
| "grad_norm": 1.5271617708228957, | |
| "learning_rate": 3.791372018858099e-06, | |
| "loss": 0.2135, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.8631111111111112, | |
| "grad_norm": 1.31763541419423, | |
| "learning_rate": 3.7713058458912164e-06, | |
| "loss": 0.2217, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.8666666666666667, | |
| "grad_norm": 1.6488724873659462, | |
| "learning_rate": 3.751260712187156e-06, | |
| "loss": 0.2539, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.8702222222222222, | |
| "grad_norm": 1.392136229173735, | |
| "learning_rate": 3.731236960984169e-06, | |
| "loss": 0.2179, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.8737777777777778, | |
| "grad_norm": 1.6189512718112575, | |
| "learning_rate": 3.711234935154372e-06, | |
| "loss": 0.2183, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.8773333333333333, | |
| "grad_norm": 1.5548818693905742, | |
| "learning_rate": 3.6912549771978747e-06, | |
| "loss": 0.2354, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.8808888888888888, | |
| "grad_norm": 1.4728328055912387, | |
| "learning_rate": 3.6712974292369035e-06, | |
| "loss": 0.2268, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.8844444444444446, | |
| "grad_norm": 1.5435161738551857, | |
| "learning_rate": 3.651362633009962e-06, | |
| "loss": 0.204, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 1.5873129086509827, | |
| "learning_rate": 3.6314509298659663e-06, | |
| "loss": 0.208, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.8915555555555557, | |
| "grad_norm": 1.3391876728975607, | |
| "learning_rate": 3.6115626607584e-06, | |
| "loss": 0.2372, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.895111111111111, | |
| "grad_norm": 1.88178920211116, | |
| "learning_rate": 3.5916981662394856e-06, | |
| "loss": 0.2257, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.8986666666666667, | |
| "grad_norm": 1.764120901512499, | |
| "learning_rate": 3.5718577864543396e-06, | |
| "loss": 0.2103, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.9022222222222223, | |
| "grad_norm": 1.6698875487111986, | |
| "learning_rate": 3.552041861135161e-06, | |
| "loss": 0.211, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.9057777777777778, | |
| "grad_norm": 1.6957349016200651, | |
| "learning_rate": 3.532250729595408e-06, | |
| "loss": 0.2164, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.9093333333333333, | |
| "grad_norm": 1.5603565111247202, | |
| "learning_rate": 3.5124847307239863e-06, | |
| "loss": 0.2265, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.9128888888888889, | |
| "grad_norm": 1.5529468285695374, | |
| "learning_rate": 3.4927442029794467e-06, | |
| "loss": 0.2316, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.9164444444444444, | |
| "grad_norm": 1.7677530671686799, | |
| "learning_rate": 3.473029484384196e-06, | |
| "loss": 0.219, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.9782571884316444, | |
| "learning_rate": 3.4533409125186974e-06, | |
| "loss": 0.2252, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.9235555555555557, | |
| "grad_norm": 1.7371605678560165, | |
| "learning_rate": 3.4336788245157026e-06, | |
| "loss": 0.2222, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.927111111111111, | |
| "grad_norm": 1.7241089696999294, | |
| "learning_rate": 3.4140435570544708e-06, | |
| "loss": 0.2345, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.9306666666666668, | |
| "grad_norm": 1.7019802310043695, | |
| "learning_rate": 3.3944354463550035e-06, | |
| "loss": 0.214, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.934222222222222, | |
| "grad_norm": 1.8394276850187319, | |
| "learning_rate": 3.374854828172292e-06, | |
| "loss": 0.234, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.9377777777777778, | |
| "grad_norm": 1.7264682966489493, | |
| "learning_rate": 3.3553020377905663e-06, | |
| "loss": 0.2242, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.9413333333333334, | |
| "grad_norm": 1.6744044298365783, | |
| "learning_rate": 3.3357774100175513e-06, | |
| "loss": 0.2245, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.944888888888889, | |
| "grad_norm": 1.4991747809315612, | |
| "learning_rate": 3.316281279178737e-06, | |
| "loss": 0.2114, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.9484444444444444, | |
| "grad_norm": 1.5141154002091217, | |
| "learning_rate": 3.296813979111655e-06, | |
| "loss": 0.2182, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 1.7580533484108005, | |
| "learning_rate": 3.2773758431601543e-06, | |
| "loss": 0.2234, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.9555555555555557, | |
| "grad_norm": 1.6014365241780455, | |
| "learning_rate": 3.257967204168705e-06, | |
| "loss": 0.238, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.9555555555555557, | |
| "eval_loss": 0.21176277101039886, | |
| "eval_runtime": 560.9255, | |
| "eval_samples_per_second": 17.828, | |
| "eval_steps_per_second": 4.457, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.959111111111111, | |
| "grad_norm": 1.566927102750067, | |
| "learning_rate": 3.2385883944766867e-06, | |
| "loss": 0.1932, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.9626666666666668, | |
| "grad_norm": 1.7041733469332605, | |
| "learning_rate": 3.2192397459127077e-06, | |
| "loss": 0.2194, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.966222222222222, | |
| "grad_norm": 1.7846179835205314, | |
| "learning_rate": 3.199921589788923e-06, | |
| "loss": 0.2092, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.9697777777777778, | |
| "grad_norm": 1.482707355318634, | |
| "learning_rate": 3.180634256895345e-06, | |
| "loss": 0.2328, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.9733333333333334, | |
| "grad_norm": 1.6559180099205715, | |
| "learning_rate": 3.161378077494205e-06, | |
| "loss": 0.234, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.976888888888889, | |
| "grad_norm": 1.4931797613124567, | |
| "learning_rate": 3.142153381314278e-06, | |
| "loss": 0.2285, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.9804444444444445, | |
| "grad_norm": 1.6899228150340497, | |
| "learning_rate": 3.122960497545242e-06, | |
| "loss": 0.2347, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 1.6112817535514066, | |
| "learning_rate": 3.103799754832045e-06, | |
| "loss": 0.2017, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.9875555555555555, | |
| "grad_norm": 1.4492842053913877, | |
| "learning_rate": 3.0846714812692774e-06, | |
| "loss": 0.2282, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.991111111111111, | |
| "grad_norm": 1.6227303784789882, | |
| "learning_rate": 3.065576004395546e-06, | |
| "loss": 0.2193, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9946666666666668, | |
| "grad_norm": 1.6532339878737676, | |
| "learning_rate": 3.046513651187874e-06, | |
| "loss": 0.205, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.9982222222222221, | |
| "grad_norm": 1.726150455488493, | |
| "learning_rate": 3.027484748056101e-06, | |
| "loss": 0.2052, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 2.001777777777778, | |
| "grad_norm": 1.2491575364238943, | |
| "learning_rate": 3.008489620837287e-06, | |
| "loss": 0.1793, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 2.005333333333333, | |
| "grad_norm": 1.539466703681713, | |
| "learning_rate": 2.989528594790142e-06, | |
| "loss": 0.133, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 2.008888888888889, | |
| "grad_norm": 1.5201921987042595, | |
| "learning_rate": 2.97060199458945e-06, | |
| "loss": 0.1364, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.0124444444444443, | |
| "grad_norm": 1.8387836805686166, | |
| "learning_rate": 2.9517101443205143e-06, | |
| "loss": 0.138, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 1.6624452979538558, | |
| "learning_rate": 2.9328533674736043e-06, | |
| "loss": 0.1372, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 2.0195555555555558, | |
| "grad_norm": 2.0375067274701464, | |
| "learning_rate": 2.914031986938417e-06, | |
| "loss": 0.1376, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 2.023111111111111, | |
| "grad_norm": 1.5020388133691598, | |
| "learning_rate": 2.895246324998549e-06, | |
| "loss": 0.132, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 2.026666666666667, | |
| "grad_norm": 1.5200304354769367, | |
| "learning_rate": 2.8764967033259793e-06, | |
| "loss": 0.1332, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.030222222222222, | |
| "grad_norm": 1.615938242121572, | |
| "learning_rate": 2.8577834429755586e-06, | |
| "loss": 0.137, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 2.033777777777778, | |
| "grad_norm": 1.7244206202588588, | |
| "learning_rate": 2.839106864379512e-06, | |
| "loss": 0.1311, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 2.037333333333333, | |
| "grad_norm": 1.4204204890159835, | |
| "learning_rate": 2.8204672873419565e-06, | |
| "loss": 0.1359, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 2.040888888888889, | |
| "grad_norm": 1.641810724006462, | |
| "learning_rate": 2.8018650310334118e-06, | |
| "loss": 0.1524, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 2.0444444444444443, | |
| "grad_norm": 1.6197231294728873, | |
| "learning_rate": 2.783300413985359e-06, | |
| "loss": 0.1216, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 1.7166152973793496, | |
| "learning_rate": 2.764773754084763e-06, | |
| "loss": 0.1393, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 2.0515555555555554, | |
| "grad_norm": 1.7305108784705923, | |
| "learning_rate": 2.7462853685686362e-06, | |
| "loss": 0.1429, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 2.055111111111111, | |
| "grad_norm": 1.2910967057789844, | |
| "learning_rate": 2.7278355740186123e-06, | |
| "loss": 0.1336, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 2.058666666666667, | |
| "grad_norm": 1.5080611405633613, | |
| "learning_rate": 2.7094246863555262e-06, | |
| "loss": 0.1359, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 2.062222222222222, | |
| "grad_norm": 1.8733744454525603, | |
| "learning_rate": 2.691053020833988e-06, | |
| "loss": 0.1388, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.065777777777778, | |
| "grad_norm": 1.7085324740063759, | |
| "learning_rate": 2.6727208920370063e-06, | |
| "loss": 0.1355, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 2.0693333333333332, | |
| "grad_norm": 1.5576784710780245, | |
| "learning_rate": 2.6544286138705867e-06, | |
| "loss": 0.1328, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 2.072888888888889, | |
| "grad_norm": 1.9703710936721526, | |
| "learning_rate": 2.636176499558364e-06, | |
| "loss": 0.1354, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 2.0764444444444443, | |
| "grad_norm": 1.5952203119705437, | |
| "learning_rate": 2.6179648616362374e-06, | |
| "loss": 0.1493, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.9073156525645674, | |
| "learning_rate": 2.599794011947012e-06, | |
| "loss": 0.1579, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.0835555555555554, | |
| "grad_norm": 1.7695748236621889, | |
| "learning_rate": 2.581664261635069e-06, | |
| "loss": 0.1446, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 2.087111111111111, | |
| "grad_norm": 1.8880183020861152, | |
| "learning_rate": 2.5635759211410396e-06, | |
| "loss": 0.1406, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 2.0906666666666665, | |
| "grad_norm": 1.5198269240530051, | |
| "learning_rate": 2.545529300196472e-06, | |
| "loss": 0.1244, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 2.094222222222222, | |
| "grad_norm": 1.9355343365767825, | |
| "learning_rate": 2.527524707818547e-06, | |
| "loss": 0.1289, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 2.097777777777778, | |
| "grad_norm": 1.546102626213903, | |
| "learning_rate": 2.5095624523047775e-06, | |
| "loss": 0.1151, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.1013333333333333, | |
| "grad_norm": 1.3237810299249595, | |
| "learning_rate": 2.491642841227729e-06, | |
| "loss": 0.1386, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 2.104888888888889, | |
| "grad_norm": 1.6354432410587478, | |
| "learning_rate": 2.4737661814297557e-06, | |
| "loss": 0.1152, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 2.1084444444444443, | |
| "grad_norm": 1.7641939157921844, | |
| "learning_rate": 2.455932779017747e-06, | |
| "loss": 0.1267, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 1.7717956617877848, | |
| "learning_rate": 2.438142939357882e-06, | |
| "loss": 0.1468, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 2.1155555555555554, | |
| "grad_norm": 1.9248857260031529, | |
| "learning_rate": 2.4203969670704065e-06, | |
| "loss": 0.1426, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.119111111111111, | |
| "grad_norm": 1.6693083011986807, | |
| "learning_rate": 2.4026951660244063e-06, | |
| "loss": 0.1519, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.1226666666666665, | |
| "grad_norm": 1.4577868069815147, | |
| "learning_rate": 2.385037839332616e-06, | |
| "loss": 0.1449, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 2.1262222222222222, | |
| "grad_norm": 1.5757247401728414, | |
| "learning_rate": 2.3674252893462304e-06, | |
| "loss": 0.1508, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.129777777777778, | |
| "grad_norm": 1.798414953668795, | |
| "learning_rate": 2.3498578176497055e-06, | |
| "loss": 0.1336, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 1.3502333712237125, | |
| "learning_rate": 2.3323357250556213e-06, | |
| "loss": 0.1289, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "eval_loss": 0.24109843373298645, | |
| "eval_runtime": 561.0318, | |
| "eval_samples_per_second": 17.824, | |
| "eval_steps_per_second": 4.456, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.136888888888889, | |
| "grad_norm": 1.6807098639484461, | |
| "learning_rate": 2.3148593115995155e-06, | |
| "loss": 0.1232, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 2.1404444444444444, | |
| "grad_norm": 1.3750693562838343, | |
| "learning_rate": 2.2974288765347484e-06, | |
| "loss": 0.1406, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 1.7740210796916787, | |
| "learning_rate": 2.280044718327383e-06, | |
| "loss": 0.1366, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 2.1475555555555554, | |
| "grad_norm": 1.3613431283259703, | |
| "learning_rate": 2.262707134651069e-06, | |
| "loss": 0.1347, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.151111111111111, | |
| "grad_norm": 1.5001232721911446, | |
| "learning_rate": 2.2454164223819443e-06, | |
| "loss": 0.1435, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.1546666666666665, | |
| "grad_norm": 1.6096086307058128, | |
| "learning_rate": 2.228172877593563e-06, | |
| "loss": 0.1248, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.1582222222222223, | |
| "grad_norm": 1.4625689431665512, | |
| "learning_rate": 2.2109767955518135e-06, | |
| "loss": 0.129, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 2.1617777777777776, | |
| "grad_norm": 1.7396993983427422, | |
| "learning_rate": 2.193828470709863e-06, | |
| "loss": 0.1259, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.1653333333333333, | |
| "grad_norm": 1.4423513554123952, | |
| "learning_rate": 2.176728196703122e-06, | |
| "loss": 0.1308, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 2.168888888888889, | |
| "grad_norm": 1.9920936118384482, | |
| "learning_rate": 2.159676266344222e-06, | |
| "loss": 0.1496, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.1724444444444444, | |
| "grad_norm": 2.13727569719491, | |
| "learning_rate": 2.142672971617978e-06, | |
| "loss": 0.1359, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 1.5724700258419562, | |
| "learning_rate": 2.125718603676413e-06, | |
| "loss": 0.1412, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.1795555555555555, | |
| "grad_norm": 1.3817720285663424, | |
| "learning_rate": 2.1088134528337635e-06, | |
| "loss": 0.1357, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 2.1831111111111112, | |
| "grad_norm": 1.6852270201894561, | |
| "learning_rate": 2.091957808561505e-06, | |
| "loss": 0.1388, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.1866666666666665, | |
| "grad_norm": 1.5752301082061768, | |
| "learning_rate": 2.0751519594834025e-06, | |
| "loss": 0.1359, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.1902222222222223, | |
| "grad_norm": 1.9588237176858065, | |
| "learning_rate": 2.058396193370556e-06, | |
| "loss": 0.1364, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.1937777777777776, | |
| "grad_norm": 1.5906028620881005, | |
| "learning_rate": 2.0416907971364937e-06, | |
| "loss": 0.1286, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 2.1973333333333334, | |
| "grad_norm": 1.6040127033831966, | |
| "learning_rate": 2.0250360568322395e-06, | |
| "loss": 0.132, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.2008888888888887, | |
| "grad_norm": 1.903945940065679, | |
| "learning_rate": 2.0084322576414205e-06, | |
| "loss": 0.1311, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 2.2044444444444444, | |
| "grad_norm": 1.7327408494603853, | |
| "learning_rate": 1.991879683875386e-06, | |
| "loss": 0.1412, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 1.6938104353348038, | |
| "learning_rate": 1.975378618968348e-06, | |
| "loss": 0.1358, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 2.2115555555555555, | |
| "grad_norm": 1.498102728760879, | |
| "learning_rate": 1.958929345472503e-06, | |
| "loss": 0.1272, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.2151111111111113, | |
| "grad_norm": 1.5061713395545921, | |
| "learning_rate": 1.942532145053219e-06, | |
| "loss": 0.1335, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 2.2186666666666666, | |
| "grad_norm": 1.8881968807558394, | |
| "learning_rate": 1.926187298484201e-06, | |
| "loss": 0.13, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 1.7409457044279315, | |
| "learning_rate": 1.9098950856426845e-06, | |
| "loss": 0.1197, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.2257777777777776, | |
| "grad_norm": 1.7410736866607524, | |
| "learning_rate": 1.893655785504644e-06, | |
| "loss": 0.136, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.2293333333333334, | |
| "grad_norm": 1.4673795329307866, | |
| "learning_rate": 1.8774696761400107e-06, | |
| "loss": 0.1351, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.2328888888888887, | |
| "grad_norm": 1.4286935284704283, | |
| "learning_rate": 1.8613370347079207e-06, | |
| "loss": 0.1316, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.2364444444444445, | |
| "grad_norm": 1.6752679462634348, | |
| "learning_rate": 1.845258137451968e-06, | |
| "loss": 0.1343, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.5334658674891999, | |
| "learning_rate": 1.8292332596954605e-06, | |
| "loss": 0.1252, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.2435555555555555, | |
| "grad_norm": 1.7816021858972186, | |
| "learning_rate": 1.8132626758367217e-06, | |
| "loss": 0.1373, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.2471111111111113, | |
| "grad_norm": 1.4751058571451898, | |
| "learning_rate": 1.7973466593443861e-06, | |
| "loss": 0.1238, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.2506666666666666, | |
| "grad_norm": 1.5737118263350949, | |
| "learning_rate": 1.7814854827527144e-06, | |
| "loss": 0.1331, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.2542222222222223, | |
| "grad_norm": 1.6723085510766795, | |
| "learning_rate": 1.7656794176569302e-06, | |
| "loss": 0.1392, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.2577777777777777, | |
| "grad_norm": 1.6074614963797307, | |
| "learning_rate": 1.749928734708568e-06, | |
| "loss": 0.1482, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.2613333333333334, | |
| "grad_norm": 1.514935517928495, | |
| "learning_rate": 1.734233703610838e-06, | |
| "loss": 0.1318, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.2648888888888887, | |
| "grad_norm": 2.1990045539686767, | |
| "learning_rate": 1.7185945931140086e-06, | |
| "loss": 0.1389, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.2684444444444445, | |
| "grad_norm": 1.7900402567821287, | |
| "learning_rate": 1.7030116710108068e-06, | |
| "loss": 0.1402, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 1.5936415333953513, | |
| "learning_rate": 1.6874852041318246e-06, | |
| "loss": 0.1383, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.2755555555555556, | |
| "grad_norm": 1.6874167667097502, | |
| "learning_rate": 1.6720154583409642e-06, | |
| "loss": 0.1297, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.279111111111111, | |
| "grad_norm": 1.7461565673164665, | |
| "learning_rate": 1.6566026985308737e-06, | |
| "loss": 0.1265, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.2826666666666666, | |
| "grad_norm": 1.9943666083505533, | |
| "learning_rate": 1.6412471886184106e-06, | |
| "loss": 0.1433, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.2862222222222224, | |
| "grad_norm": 1.889269033390485, | |
| "learning_rate": 1.6259491915401322e-06, | |
| "loss": 0.1295, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.2897777777777777, | |
| "grad_norm": 1.9954192603921324, | |
| "learning_rate": 1.6107089692477856e-06, | |
| "loss": 0.1506, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.2933333333333334, | |
| "grad_norm": 1.73943513110269, | |
| "learning_rate": 1.5955267827038267e-06, | |
| "loss": 0.1309, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.2968888888888888, | |
| "grad_norm": 1.5696215992092173, | |
| "learning_rate": 1.5804028918769488e-06, | |
| "loss": 0.1245, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.3004444444444445, | |
| "grad_norm": 1.4480211516999386, | |
| "learning_rate": 1.5653375557376266e-06, | |
| "loss": 0.1419, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 1.7769598112511977, | |
| "learning_rate": 1.5503310322536962e-06, | |
| "loss": 0.1357, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.3075555555555556, | |
| "grad_norm": 1.6914490635403432, | |
| "learning_rate": 1.5353835783859244e-06, | |
| "loss": 0.1344, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "grad_norm": 1.2896364219654397, | |
| "learning_rate": 1.5204954500836095e-06, | |
| "loss": 0.1336, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "eval_loss": 0.2400493621826172, | |
| "eval_runtime": 562.3512, | |
| "eval_samples_per_second": 17.782, | |
| "eval_steps_per_second": 4.446, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.3146666666666667, | |
| "grad_norm": 1.6249516275302234, | |
| "learning_rate": 1.5056669022802051e-06, | |
| "loss": 0.1578, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.3182222222222224, | |
| "grad_norm": 1.5534728727358678, | |
| "learning_rate": 1.4908981888889562e-06, | |
| "loss": 0.1236, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.3217777777777777, | |
| "grad_norm": 2.305594450780404, | |
| "learning_rate": 1.4761895627985384e-06, | |
| "loss": 0.1437, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.3253333333333335, | |
| "grad_norm": 1.7525804358624415, | |
| "learning_rate": 1.461541275868742e-06, | |
| "loss": 0.1244, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.328888888888889, | |
| "grad_norm": 1.5857723879215653, | |
| "learning_rate": 1.4469535789261518e-06, | |
| "loss": 0.138, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.3324444444444445, | |
| "grad_norm": 1.4470785666281207, | |
| "learning_rate": 1.4324267217598543e-06, | |
| "loss": 0.1311, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 1.5783013529079604, | |
| "learning_rate": 1.41796095311716e-06, | |
| "loss": 0.1476, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.3395555555555556, | |
| "grad_norm": 1.792387189040966, | |
| "learning_rate": 1.4035565206993407e-06, | |
| "loss": 0.1313, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.343111111111111, | |
| "grad_norm": 2.0097219507066986, | |
| "learning_rate": 1.3892136711573983e-06, | |
| "loss": 0.1481, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.3466666666666667, | |
| "grad_norm": 1.6038575587094324, | |
| "learning_rate": 1.3749326500878308e-06, | |
| "loss": 0.1329, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.3502222222222224, | |
| "grad_norm": 1.8038941533229218, | |
| "learning_rate": 1.3607137020284267e-06, | |
| "loss": 0.1296, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.3537777777777777, | |
| "grad_norm": 1.5967517903597408, | |
| "learning_rate": 1.3465570704540877e-06, | |
| "loss": 0.1323, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.3573333333333335, | |
| "grad_norm": 1.6630671725280828, | |
| "learning_rate": 1.33246299777265e-06, | |
| "loss": 0.1353, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.360888888888889, | |
| "grad_norm": 1.6910996186336409, | |
| "learning_rate": 1.3184317253207379e-06, | |
| "loss": 0.1198, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.3644444444444446, | |
| "grad_norm": 1.667550829249205, | |
| "learning_rate": 1.3044634933596311e-06, | |
| "loss": 0.1398, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 1.3604264834299673, | |
| "learning_rate": 1.290558541071148e-06, | |
| "loss": 0.123, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.3715555555555556, | |
| "grad_norm": 1.4966865021721736, | |
| "learning_rate": 1.2767171065535538e-06, | |
| "loss": 0.1221, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.375111111111111, | |
| "grad_norm": 1.3751769981745194, | |
| "learning_rate": 1.2629394268174811e-06, | |
| "loss": 0.1398, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.3786666666666667, | |
| "grad_norm": 1.7552964254373993, | |
| "learning_rate": 1.2492257377818734e-06, | |
| "loss": 0.122, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.3822222222222225, | |
| "grad_norm": 1.984424873865648, | |
| "learning_rate": 1.235576274269938e-06, | |
| "loss": 0.1366, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.3857777777777778, | |
| "grad_norm": 1.8024296643627178, | |
| "learning_rate": 1.2219912700051417e-06, | |
| "loss": 0.1304, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.389333333333333, | |
| "grad_norm": 1.6704237658027163, | |
| "learning_rate": 1.2084709576071885e-06, | |
| "loss": 0.1339, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.392888888888889, | |
| "grad_norm": 1.8905223292433262, | |
| "learning_rate": 1.1950155685880504e-06, | |
| "loss": 0.138, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.3964444444444446, | |
| "grad_norm": 1.8585326052998994, | |
| "learning_rate": 1.1816253333479994e-06, | |
| "loss": 0.1402, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.4117751565900303, | |
| "learning_rate": 1.1683004811716597e-06, | |
| "loss": 0.1219, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.4035555555555557, | |
| "grad_norm": 2.177441304004068, | |
| "learning_rate": 1.1550412402240852e-06, | |
| "loss": 0.1472, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.407111111111111, | |
| "grad_norm": 1.7312870442889088, | |
| "learning_rate": 1.1418478375468496e-06, | |
| "loss": 0.14, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.4106666666666667, | |
| "grad_norm": 1.4691171208612808, | |
| "learning_rate": 1.1287204990541612e-06, | |
| "loss": 0.1382, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.414222222222222, | |
| "grad_norm": 1.9102821919207582, | |
| "learning_rate": 1.1156594495289923e-06, | |
| "loss": 0.1508, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.417777777777778, | |
| "grad_norm": 1.5765296328104144, | |
| "learning_rate": 1.1026649126192334e-06, | |
| "loss": 0.1244, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.421333333333333, | |
| "grad_norm": 1.485558878346715, | |
| "learning_rate": 1.0897371108338572e-06, | |
| "loss": 0.1262, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.424888888888889, | |
| "grad_norm": 1.6805947418795415, | |
| "learning_rate": 1.076876265539115e-06, | |
| "loss": 0.1397, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.4284444444444446, | |
| "grad_norm": 1.8439671145791727, | |
| "learning_rate": 1.0640825969547498e-06, | |
| "loss": 0.1298, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 1.8675356289498493, | |
| "learning_rate": 1.051356324150209e-06, | |
| "loss": 0.1334, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.4355555555555557, | |
| "grad_norm": 2.097329265797065, | |
| "learning_rate": 1.0386976650409102e-06, | |
| "loss": 0.1342, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.439111111111111, | |
| "grad_norm": 1.7733262424549074, | |
| "learning_rate": 1.0261068363845034e-06, | |
| "loss": 0.1297, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.4426666666666668, | |
| "grad_norm": 1.7698885455909084, | |
| "learning_rate": 1.0135840537771574e-06, | |
| "loss": 0.1355, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.446222222222222, | |
| "grad_norm": 1.699595680180769, | |
| "learning_rate": 1.001129531649872e-06, | |
| "loss": 0.1255, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.449777777777778, | |
| "grad_norm": 1.8061641909036275, | |
| "learning_rate": 9.887434832647997e-07, | |
| "loss": 0.1355, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.453333333333333, | |
| "grad_norm": 1.8282679409791762, | |
| "learning_rate": 9.764261207116061e-07, | |
| "loss": 0.1437, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.456888888888889, | |
| "grad_norm": 1.8691781223789907, | |
| "learning_rate": 9.641776549038257e-07, | |
| "loss": 0.1274, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.4604444444444447, | |
| "grad_norm": 1.8720204975109627, | |
| "learning_rate": 9.519982955752549e-07, | |
| "loss": 0.1321, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 1.714725769185188, | |
| "learning_rate": 9.398882512763618e-07, | |
| "loss": 0.1299, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.4675555555555557, | |
| "grad_norm": 1.5736356325676821, | |
| "learning_rate": 9.278477293707189e-07, | |
| "loss": 0.1454, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.471111111111111, | |
| "grad_norm": 1.7235279739808778, | |
| "learning_rate": 9.158769360314412e-07, | |
| "loss": 0.1301, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.474666666666667, | |
| "grad_norm": 1.7964601353844663, | |
| "learning_rate": 9.039760762376665e-07, | |
| "loss": 0.1329, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.478222222222222, | |
| "grad_norm": 1.7113961505997257, | |
| "learning_rate": 8.921453537710406e-07, | |
| "loss": 0.1301, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.481777777777778, | |
| "grad_norm": 3.7247151362742708, | |
| "learning_rate": 8.803849712122292e-07, | |
| "loss": 0.1366, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.485333333333333, | |
| "grad_norm": 1.6042128553101094, | |
| "learning_rate": 8.686951299374474e-07, | |
| "loss": 0.1248, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "grad_norm": 1.7566315817690532, | |
| "learning_rate": 8.570760301150166e-07, | |
| "loss": 0.1397, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "eval_loss": 0.239632710814476, | |
| "eval_runtime": 563.0915, | |
| "eval_samples_per_second": 17.759, | |
| "eval_steps_per_second": 4.44, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4924444444444447, | |
| "grad_norm": 1.915869222287072, | |
| "learning_rate": 8.455278707019255e-07, | |
| "loss": 0.133, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 1.4611242467498158, | |
| "learning_rate": 8.340508494404415e-07, | |
| "loss": 0.128, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.4995555555555553, | |
| "grad_norm": 1.8274207116893812, | |
| "learning_rate": 8.226451628547039e-07, | |
| "loss": 0.1304, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 2.503111111111111, | |
| "grad_norm": 1.5195837090357422, | |
| "learning_rate": 8.113110062473756e-07, | |
| "loss": 0.1337, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.506666666666667, | |
| "grad_norm": 1.534284195780538, | |
| "learning_rate": 8.000485736962899e-07, | |
| "loss": 0.1365, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.510222222222222, | |
| "grad_norm": 1.3874360730778557, | |
| "learning_rate": 7.888580580511307e-07, | |
| "loss": 0.1157, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.513777777777778, | |
| "grad_norm": 1.347897014568791, | |
| "learning_rate": 7.777396509301278e-07, | |
| "loss": 0.1258, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.517333333333333, | |
| "grad_norm": 1.5444960857241712, | |
| "learning_rate": 7.666935427167777e-07, | |
| "loss": 0.1261, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.520888888888889, | |
| "grad_norm": 1.5787802499569878, | |
| "learning_rate": 7.557199225565848e-07, | |
| "loss": 0.1353, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.5244444444444447, | |
| "grad_norm": 1.6575537900928325, | |
| "learning_rate": 7.448189783538184e-07, | |
| "loss": 0.1223, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 1.58456318992188, | |
| "learning_rate": 7.339908967683007e-07, | |
| "loss": 0.1227, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.5315555555555553, | |
| "grad_norm": 1.916341417565209, | |
| "learning_rate": 7.232358632122022e-07, | |
| "loss": 0.1365, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.535111111111111, | |
| "grad_norm": 2.009648842498942, | |
| "learning_rate": 7.125540618468784e-07, | |
| "loss": 0.1435, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.538666666666667, | |
| "grad_norm": 1.2589650678388224, | |
| "learning_rate": 7.019456755797083e-07, | |
| "loss": 0.1333, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.542222222222222, | |
| "grad_norm": 1.534526581817288, | |
| "learning_rate": 6.914108860609608e-07, | |
| "loss": 0.1372, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.545777777777778, | |
| "grad_norm": 1.5742622053962463, | |
| "learning_rate": 6.809498736806919e-07, | |
| "loss": 0.135, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.5493333333333332, | |
| "grad_norm": 1.876907152948741, | |
| "learning_rate": 6.705628175656498e-07, | |
| "loss": 0.1304, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.552888888888889, | |
| "grad_norm": 1.7507039554831174, | |
| "learning_rate": 6.602498955762105e-07, | |
| "loss": 0.1361, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.5564444444444443, | |
| "grad_norm": 1.5168112309443524, | |
| "learning_rate": 6.500112843033313e-07, | |
| "loss": 0.1235, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.366857399391539, | |
| "learning_rate": 6.39847159065523e-07, | |
| "loss": 0.1268, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.5635555555555554, | |
| "grad_norm": 1.7472209117726187, | |
| "learning_rate": 6.297576939058586e-07, | |
| "loss": 0.1338, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.567111111111111, | |
| "grad_norm": 1.5771285823832333, | |
| "learning_rate": 6.197430615889838e-07, | |
| "loss": 0.1304, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.570666666666667, | |
| "grad_norm": 1.5122386895026887, | |
| "learning_rate": 6.098034335981573e-07, | |
| "loss": 0.1255, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.574222222222222, | |
| "grad_norm": 1.5101320862852827, | |
| "learning_rate": 5.999389801323219e-07, | |
| "loss": 0.128, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.5777777777777775, | |
| "grad_norm": 1.751375058176443, | |
| "learning_rate": 5.901498701031894e-07, | |
| "loss": 0.131, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.5813333333333333, | |
| "grad_norm": 1.5370110538793642, | |
| "learning_rate": 5.804362711323391e-07, | |
| "loss": 0.1273, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.584888888888889, | |
| "grad_norm": 1.5422190674222276, | |
| "learning_rate": 5.707983495483593e-07, | |
| "loss": 0.122, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.5884444444444443, | |
| "grad_norm": 1.8111593254497258, | |
| "learning_rate": 5.612362703839907e-07, | |
| "loss": 0.1308, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 1.7898287718649462, | |
| "learning_rate": 5.517501973733059e-07, | |
| "loss": 0.1239, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.5955555555555554, | |
| "grad_norm": 1.5741550714022359, | |
| "learning_rate": 5.423402929489019e-07, | |
| "loss": 0.1242, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.599111111111111, | |
| "grad_norm": 1.7431025808198797, | |
| "learning_rate": 5.330067182391219e-07, | |
| "loss": 0.1258, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.602666666666667, | |
| "grad_norm": 1.669472703725672, | |
| "learning_rate": 5.237496330652925e-07, | |
| "loss": 0.1318, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.606222222222222, | |
| "grad_norm": 1.7086096850592123, | |
| "learning_rate": 5.145691959389932e-07, | |
| "loss": 0.1292, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 2.6097777777777775, | |
| "grad_norm": 1.79780883791639, | |
| "learning_rate": 5.054655640593325e-07, | |
| "loss": 0.1446, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.6133333333333333, | |
| "grad_norm": 1.760230682240199, | |
| "learning_rate": 4.964388933102666e-07, | |
| "loss": 0.1418, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.616888888888889, | |
| "grad_norm": 1.540197801989686, | |
| "learning_rate": 4.874893382579232e-07, | |
| "loss": 0.1269, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.6204444444444444, | |
| "grad_norm": 1.7177370855999565, | |
| "learning_rate": 4.786170521479588e-07, | |
| "loss": 0.1223, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 1.881294576905093, | |
| "learning_rate": 4.698221869029307e-07, | |
| "loss": 0.1443, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.6275555555555554, | |
| "grad_norm": 1.74196972034532, | |
| "learning_rate": 4.6110489311969876e-07, | |
| "loss": 0.1429, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 2.631111111111111, | |
| "grad_norm": 1.5651241374342044, | |
| "learning_rate": 4.524653200668461e-07, | |
| "loss": 0.1264, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.634666666666667, | |
| "grad_norm": 1.8251309622054404, | |
| "learning_rate": 4.439036156821225e-07, | |
| "loss": 0.1213, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 2.6382222222222222, | |
| "grad_norm": 1.4351427368380598, | |
| "learning_rate": 4.3541992656991163e-07, | |
| "loss": 0.1182, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.6417777777777776, | |
| "grad_norm": 1.9769377027322241, | |
| "learning_rate": 4.2701439799871847e-07, | |
| "loss": 0.1453, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 2.6453333333333333, | |
| "grad_norm": 1.6755217149463195, | |
| "learning_rate": 4.1868717389868694e-07, | |
| "loss": 0.1284, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.648888888888889, | |
| "grad_norm": 1.4882784431490907, | |
| "learning_rate": 4.1043839685913135e-07, | |
| "loss": 0.1289, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.6524444444444444, | |
| "grad_norm": 1.2678152146637376, | |
| "learning_rate": 4.022682081260942e-07, | |
| "loss": 0.122, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 1.7036091433400906, | |
| "learning_rate": 3.941767475999297e-07, | |
| "loss": 0.1292, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 2.6595555555555555, | |
| "grad_norm": 2.0073020304210485, | |
| "learning_rate": 3.8616415383291083e-07, | |
| "loss": 0.1281, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.663111111111111, | |
| "grad_norm": 1.7003882572239488, | |
| "learning_rate": 3.7823056402684856e-07, | |
| "loss": 0.1205, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 1.8649824143158358, | |
| "learning_rate": 3.70376114030751e-07, | |
| "loss": 0.1405, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "eval_loss": 0.2399507761001587, | |
| "eval_runtime": 561.5965, | |
| "eval_samples_per_second": 17.806, | |
| "eval_steps_per_second": 4.452, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.6702222222222223, | |
| "grad_norm": 1.778861851144716, | |
| "learning_rate": 3.626009383384926e-07, | |
| "loss": 0.1424, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 2.6737777777777776, | |
| "grad_norm": 1.7506343466298935, | |
| "learning_rate": 3.549051700865136e-07, | |
| "loss": 0.1242, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.6773333333333333, | |
| "grad_norm": 1.5579333925843626, | |
| "learning_rate": 3.47288941051539e-07, | |
| "loss": 0.125, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 2.680888888888889, | |
| "grad_norm": 2.030096385748008, | |
| "learning_rate": 3.3975238164831893e-07, | |
| "loss": 0.1253, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.6844444444444444, | |
| "grad_norm": 1.635535994621638, | |
| "learning_rate": 3.322956209274031e-07, | |
| "loss": 0.1322, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 1.7329277515156414, | |
| "learning_rate": 3.2491878657292643e-07, | |
| "loss": 0.1355, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.6915555555555555, | |
| "grad_norm": 1.7444157426686764, | |
| "learning_rate": 3.176220049004197e-07, | |
| "loss": 0.1179, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 2.6951111111111112, | |
| "grad_norm": 1.3483728954452034, | |
| "learning_rate": 3.104054008546525e-07, | |
| "loss": 0.1338, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.6986666666666665, | |
| "grad_norm": 1.3906620863471058, | |
| "learning_rate": 3.032690980074915e-07, | |
| "loss": 0.131, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 2.7022222222222223, | |
| "grad_norm": 1.8327466893042572, | |
| "learning_rate": 2.962132185557826e-07, | |
| "loss": 0.1223, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.7057777777777776, | |
| "grad_norm": 1.5547638545825841, | |
| "learning_rate": 2.892378833192611e-07, | |
| "loss": 0.1282, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 2.7093333333333334, | |
| "grad_norm": 1.804096897597165, | |
| "learning_rate": 2.823432117384822e-07, | |
| "loss": 0.1321, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.712888888888889, | |
| "grad_norm": 1.5920189474841397, | |
| "learning_rate": 2.755293218727739e-07, | |
| "loss": 0.1266, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 2.7164444444444444, | |
| "grad_norm": 1.95119518386987, | |
| "learning_rate": 2.6879633039821994e-07, | |
| "loss": 0.1356, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 1.8385230420520196, | |
| "learning_rate": 2.62144352605655e-07, | |
| "loss": 0.1262, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.7235555555555555, | |
| "grad_norm": 1.7885799872230752, | |
| "learning_rate": 2.555735023986966e-07, | |
| "loss": 0.1315, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.7271111111111113, | |
| "grad_norm": 1.8941729319880476, | |
| "learning_rate": 2.4908389229179484e-07, | |
| "loss": 0.1179, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 2.7306666666666666, | |
| "grad_norm": 1.5725333890356554, | |
| "learning_rate": 2.4267563340830026e-07, | |
| "loss": 0.1122, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.7342222222222223, | |
| "grad_norm": 1.9949059298619423, | |
| "learning_rate": 2.363488354785648e-07, | |
| "loss": 0.1372, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 2.7377777777777776, | |
| "grad_norm": 1.706241835042834, | |
| "learning_rate": 2.301036068380641e-07, | |
| "loss": 0.1303, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.7413333333333334, | |
| "grad_norm": 1.5015166048586166, | |
| "learning_rate": 2.239400544255399e-07, | |
| "loss": 0.121, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 2.744888888888889, | |
| "grad_norm": 1.69358016809196, | |
| "learning_rate": 2.178582837811688e-07, | |
| "loss": 0.1249, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.7484444444444445, | |
| "grad_norm": 1.9732967017351475, | |
| "learning_rate": 2.1185839904475869e-07, | |
| "loss": 0.133, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 1.5594363807881604, | |
| "learning_rate": 2.0594050295395852e-07, | |
| "loss": 0.1304, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.7555555555555555, | |
| "grad_norm": 2.026099043557669, | |
| "learning_rate": 2.0010469684250856e-07, | |
| "loss": 0.1385, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.7591111111111113, | |
| "grad_norm": 1.5917173969753626, | |
| "learning_rate": 1.9435108063849684e-07, | |
| "loss": 0.1365, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.7626666666666666, | |
| "grad_norm": 1.7387563784538043, | |
| "learning_rate": 1.8867975286265106e-07, | |
| "loss": 0.1278, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 2.7662222222222224, | |
| "grad_norm": 1.491992475001642, | |
| "learning_rate": 1.830908106266538e-07, | |
| "loss": 0.1169, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.7697777777777777, | |
| "grad_norm": 1.8209635910179756, | |
| "learning_rate": 1.7758434963147665e-07, | |
| "loss": 0.143, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 2.7733333333333334, | |
| "grad_norm": 1.6054626426110197, | |
| "learning_rate": 1.7216046416574316e-07, | |
| "loss": 0.1335, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.7768888888888887, | |
| "grad_norm": 1.6151516199907796, | |
| "learning_rate": 1.66819247104113e-07, | |
| "loss": 0.1338, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 2.7804444444444445, | |
| "grad_norm": 1.9698941742198866, | |
| "learning_rate": 1.6156078990569313e-07, | |
| "loss": 0.1203, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 1.6305672042666572, | |
| "learning_rate": 1.563851826124696e-07, | |
| "loss": 0.1216, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 2.7875555555555556, | |
| "grad_norm": 1.0194788026355706, | |
| "learning_rate": 1.5129251384776998e-07, | |
| "loss": 0.1181, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.7911111111111113, | |
| "grad_norm": 1.7073067625712353, | |
| "learning_rate": 1.462828708147379e-07, | |
| "loss": 0.139, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.7946666666666666, | |
| "grad_norm": 1.4957713592543374, | |
| "learning_rate": 1.4135633929485026e-07, | |
| "loss": 0.1373, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.7982222222222224, | |
| "grad_norm": 1.6268976958462047, | |
| "learning_rate": 1.3651300364644126e-07, | |
| "loss": 0.1294, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 2.8017777777777777, | |
| "grad_norm": 1.3636030825381604, | |
| "learning_rate": 1.317529468032569e-07, | |
| "loss": 0.1158, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.8053333333333335, | |
| "grad_norm": 1.5147346477252843, | |
| "learning_rate": 1.2707625027304104e-07, | |
| "loss": 0.124, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 2.8088888888888888, | |
| "grad_norm": 1.7193516342629052, | |
| "learning_rate": 1.2248299413613607e-07, | |
| "loss": 0.1332, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.8124444444444445, | |
| "grad_norm": 1.6484553937509365, | |
| "learning_rate": 1.1797325704411e-07, | |
| "loss": 0.1214, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 1.6919284405549642, | |
| "learning_rate": 1.1354711621841208e-07, | |
| "loss": 0.133, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.8195555555555556, | |
| "grad_norm": 1.223501357852658, | |
| "learning_rate": 1.0920464744905157e-07, | |
| "loss": 0.1205, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 2.8231111111111113, | |
| "grad_norm": 1.5481520280664143, | |
| "learning_rate": 1.0494592509329716e-07, | |
| "loss": 0.1469, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.8266666666666667, | |
| "grad_norm": 1.7879544199201751, | |
| "learning_rate": 1.007710220744057e-07, | |
| "loss": 0.1269, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.830222222222222, | |
| "grad_norm": 1.513993378655108, | |
| "learning_rate": 9.668000988037163e-08, | |
| "loss": 0.1322, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.8337777777777777, | |
| "grad_norm": 1.7964467427017516, | |
| "learning_rate": 9.267295856270509e-08, | |
| "loss": 0.1354, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 2.8373333333333335, | |
| "grad_norm": 1.787987364521523, | |
| "learning_rate": 8.874993673523236e-08, | |
| "loss": 0.1319, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.840888888888889, | |
| "grad_norm": 1.6897870372176325, | |
| "learning_rate": 8.491101157291737e-08, | |
| "loss": 0.1274, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 2.8444444444444446, | |
| "grad_norm": 1.6105609971746402, | |
| "learning_rate": 8.115624881071594e-08, | |
| "loss": 0.1318, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.8444444444444446, | |
| "eval_loss": 0.23905394971370697, | |
| "eval_runtime": 559.7682, | |
| "eval_samples_per_second": 17.865, | |
| "eval_steps_per_second": 4.466, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 1.3881391902801445, | |
| "learning_rate": 7.748571274244776e-08, | |
| "loss": 0.1199, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 2.8515555555555556, | |
| "grad_norm": 1.8275543306577795, | |
| "learning_rate": 7.389946621969679e-08, | |
| "loss": 0.1494, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.8551111111111114, | |
| "grad_norm": 1.8960525825598256, | |
| "learning_rate": 7.039757065073316e-08, | |
| "loss": 0.1354, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 2.8586666666666667, | |
| "grad_norm": 1.6485916403071794, | |
| "learning_rate": 6.698008599946404e-08, | |
| "loss": 0.1246, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.862222222222222, | |
| "grad_norm": 1.2435705558011503, | |
| "learning_rate": 6.364707078440335e-08, | |
| "loss": 0.1266, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.8657777777777778, | |
| "grad_norm": 1.5746164801301799, | |
| "learning_rate": 6.039858207767479e-08, | |
| "loss": 0.134, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.8693333333333335, | |
| "grad_norm": 1.5169697571883205, | |
| "learning_rate": 5.723467550403039e-08, | |
| "loss": 0.1326, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 2.872888888888889, | |
| "grad_norm": 1.5881237505008923, | |
| "learning_rate": 5.4155405239897926e-08, | |
| "loss": 0.1488, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.8764444444444446, | |
| "grad_norm": 1.690061086159581, | |
| "learning_rate": 5.1160824012458367e-08, | |
| "loss": 0.1232, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.6253293072576216, | |
| "learning_rate": 4.825098309873544e-08, | |
| "loss": 0.1264, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.8835555555555556, | |
| "grad_norm": 1.8528993602738453, | |
| "learning_rate": 4.542593232472414e-08, | |
| "loss": 0.1328, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 2.887111111111111, | |
| "grad_norm": 1.949296952991108, | |
| "learning_rate": 4.268572006453364e-08, | |
| "loss": 0.1264, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.8906666666666667, | |
| "grad_norm": 1.5505902041733666, | |
| "learning_rate": 4.003039323956126e-08, | |
| "loss": 0.1308, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 2.894222222222222, | |
| "grad_norm": 0.9023008663346067, | |
| "learning_rate": 3.7459997317687014e-08, | |
| "loss": 0.1101, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.897777777777778, | |
| "grad_norm": 1.8468547733058307, | |
| "learning_rate": 3.4974576312497564e-08, | |
| "loss": 0.1249, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.9013333333333335, | |
| "grad_norm": 1.7056102650658924, | |
| "learning_rate": 3.25741727825285e-08, | |
| "loss": 0.1193, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.904888888888889, | |
| "grad_norm": 1.3690587953613977, | |
| "learning_rate": 3.025882783054046e-08, | |
| "loss": 0.1199, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 2.9084444444444446, | |
| "grad_norm": 1.3946208158917515, | |
| "learning_rate": 2.8028581102811924e-08, | |
| "loss": 0.1365, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 1.9644328667604294, | |
| "learning_rate": 2.588347078846254e-08, | |
| "loss": 0.1323, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 2.9155555555555557, | |
| "grad_norm": 1.7619431494028974, | |
| "learning_rate": 2.382353361879586e-08, | |
| "loss": 0.1244, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.919111111111111, | |
| "grad_norm": 1.6739735252712569, | |
| "learning_rate": 2.18488048666754e-08, | |
| "loss": 0.1241, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 2.9226666666666667, | |
| "grad_norm": 1.7618267751958017, | |
| "learning_rate": 1.995931834591569e-08, | |
| "loss": 0.132, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.926222222222222, | |
| "grad_norm": 1.5149144065240054, | |
| "learning_rate": 1.8155106410706613e-08, | |
| "loss": 0.1359, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 2.929777777777778, | |
| "grad_norm": 1.7464428231188038, | |
| "learning_rate": 1.6436199955057742e-08, | |
| "loss": 0.1477, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.9333333333333336, | |
| "grad_norm": 1.7961519057796862, | |
| "learning_rate": 1.480262841226987e-08, | |
| "loss": 0.1482, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.936888888888889, | |
| "grad_norm": 1.668237688338044, | |
| "learning_rate": 1.3254419754430981e-08, | |
| "loss": 0.1369, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.940444444444444, | |
| "grad_norm": 1.5710565780518715, | |
| "learning_rate": 1.1791600491937172e-08, | |
| "loss": 0.1265, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 1.6190463651101816, | |
| "learning_rate": 1.041419567303914e-08, | |
| "loss": 0.1233, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.9475555555555557, | |
| "grad_norm": 1.3359272700606026, | |
| "learning_rate": 9.12222888341252e-09, | |
| "loss": 0.1308, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 2.951111111111111, | |
| "grad_norm": 1.7965214936961842, | |
| "learning_rate": 7.915722245754876e-09, | |
| "loss": 0.141, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.9546666666666668, | |
| "grad_norm": 1.7433994283889143, | |
| "learning_rate": 6.7946964194059994e-09, | |
| "loss": 0.1493, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 2.958222222222222, | |
| "grad_norm": 1.6666804006077884, | |
| "learning_rate": 5.759170599994868e-09, | |
| "loss": 0.1284, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.961777777777778, | |
| "grad_norm": 1.4232443691197452, | |
| "learning_rate": 4.809162519110455e-09, | |
| "loss": 0.1231, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 2.9653333333333336, | |
| "grad_norm": 1.8464380977109713, | |
| "learning_rate": 3.944688443998646e-09, | |
| "loss": 0.1466, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.968888888888889, | |
| "grad_norm": 1.8474020149086245, | |
| "learning_rate": 3.16576317728301e-09, | |
| "loss": 0.126, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.9724444444444442, | |
| "grad_norm": 1.731427281949659, | |
| "learning_rate": 2.4724000567116768e-09, | |
| "loss": 0.1361, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 1.8993388895043506, | |
| "learning_rate": 1.86461095492918e-09, | |
| "loss": 0.1258, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 2.9795555555555557, | |
| "grad_norm": 1.676714063923629, | |
| "learning_rate": 1.3424062792738445e-09, | |
| "loss": 0.1311, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.983111111111111, | |
| "grad_norm": 1.714542756833673, | |
| "learning_rate": 9.057949715968183e-10, | |
| "loss": 0.1236, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 2.986666666666667, | |
| "grad_norm": 1.6829258625832335, | |
| "learning_rate": 5.547845081121939e-10, | |
| "loss": 0.1171, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.990222222222222, | |
| "grad_norm": 1.5917279579386703, | |
| "learning_rate": 2.89380899267111e-10, | |
| "loss": 0.1309, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 2.993777777777778, | |
| "grad_norm": 1.72982950263424, | |
| "learning_rate": 1.0958868963906188e-10, | |
| "loss": 0.1314, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.997333333333333, | |
| "grad_norm": 1.6121134095652765, | |
| "learning_rate": 1.541095785984048e-11, | |
| "loss": 0.1267, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 2.9994666666666667, | |
| "step": 8436, | |
| "total_flos": 621656373067776.0, | |
| "train_loss": 0.25353994178455436, | |
| "train_runtime": 39823.1973, | |
| "train_samples_per_second": 6.78, | |
| "train_steps_per_second": 0.212 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 8436, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 621656373067776.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |