{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 10, "global_step": 1640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12195121951219512, "grad_norm": 35.852298736572266, "learning_rate": 1.219512195121951e-07, "loss": 7.8912, "step": 10 }, { "epoch": 0.12195121951219512, "eval_loss": 7.806792736053467, "eval_runtime": 39.7349, "eval_samples_per_second": 444.47, "eval_steps_per_second": 0.453, "step": 10 }, { "epoch": 0.24390243902439024, "grad_norm": 35.98800277709961, "learning_rate": 2.439024390243902e-07, "loss": 7.6722, "step": 20 }, { "epoch": 0.24390243902439024, "eval_loss": 7.456321716308594, "eval_runtime": 39.2616, "eval_samples_per_second": 449.828, "eval_steps_per_second": 0.458, "step": 20 }, { "epoch": 0.36585365853658536, "grad_norm": 33.745487213134766, "learning_rate": 3.6585365853658536e-07, "loss": 7.2095, "step": 30 }, { "epoch": 0.36585365853658536, "eval_loss": 6.869460105895996, "eval_runtime": 39.2191, "eval_samples_per_second": 450.316, "eval_steps_per_second": 0.459, "step": 30 }, { "epoch": 0.4878048780487805, "grad_norm": 28.555143356323242, "learning_rate": 4.878048780487804e-07, "loss": 6.5213, "step": 40 }, { "epoch": 0.4878048780487805, "eval_loss": 6.08616828918457, "eval_runtime": 39.2368, "eval_samples_per_second": 450.114, "eval_steps_per_second": 0.459, "step": 40 }, { "epoch": 0.6097560975609756, "grad_norm": 17.861717224121094, "learning_rate": 6.097560975609756e-07, "loss": 5.6988, "step": 50 }, { "epoch": 0.6097560975609756, "eval_loss": 5.294862270355225, "eval_runtime": 39.2476, "eval_samples_per_second": 449.989, "eval_steps_per_second": 0.459, "step": 50 }, { "epoch": 0.7317073170731707, "grad_norm": 9.703508377075195, "learning_rate": 7.317073170731707e-07, "loss": 5.0031, "step": 60 }, { "epoch": 0.7317073170731707, "eval_loss": 4.72209358215332, "eval_runtime": 39.2415, "eval_samples_per_second": 450.059, "eval_steps_per_second": 0.459, "step": 60 }, { "epoch": 0.8536585365853658, "grad_norm": 5.830469131469727, "learning_rate": 8.536585365853657e-07, "loss": 4.5171, "step": 70 }, { "epoch": 0.8536585365853658, "eval_loss": 4.342286586761475, "eval_runtime": 39.2288, "eval_samples_per_second": 450.205, "eval_steps_per_second": 0.459, "step": 70 }, { "epoch": 0.975609756097561, "grad_norm": 4.307955265045166, "learning_rate": 9.756097560975609e-07, "loss": 4.1858, "step": 80 }, { "epoch": 0.975609756097561, "eval_loss": 4.0347723960876465, "eval_runtime": 39.2556, "eval_samples_per_second": 449.897, "eval_steps_per_second": 0.459, "step": 80 }, { "epoch": 1.0975609756097562, "grad_norm": 3.8736400604248047, "learning_rate": 1.0975609756097562e-06, "loss": 3.8907, "step": 90 }, { "epoch": 1.0975609756097562, "eval_loss": 3.7496390342712402, "eval_runtime": 39.2415, "eval_samples_per_second": 450.059, "eval_steps_per_second": 0.459, "step": 90 }, { "epoch": 1.2195121951219512, "grad_norm": 3.2329907417297363, "learning_rate": 1.2195121951219512e-06, "loss": 3.6078, "step": 100 }, { "epoch": 1.2195121951219512, "eval_loss": 3.480637788772583, "eval_runtime": 39.2636, "eval_samples_per_second": 449.805, "eval_steps_per_second": 0.458, "step": 100 }, { "epoch": 1.3414634146341464, "grad_norm": 2.6243083477020264, "learning_rate": 1.3414634146341463e-06, "loss": 3.3481, "step": 110 }, { "epoch": 1.3414634146341464, "eval_loss": 3.248450756072998, "eval_runtime": 39.2605, "eval_samples_per_second": 449.841, "eval_steps_per_second": 0.458, "step": 110 }, { "epoch": 1.4634146341463414, "grad_norm": 2.2950096130371094, "learning_rate": 1.4634146341463414e-06, "loss": 3.1262, "step": 120 }, { "epoch": 1.4634146341463414, "eval_loss": 3.042900323867798, "eval_runtime": 39.3067, "eval_samples_per_second": 449.313, "eval_steps_per_second": 0.458, "step": 120 }, { "epoch": 1.5853658536585367, "grad_norm": 2.118208408355713, "learning_rate": 1.5853658536585366e-06, "loss": 2.9296, "step": 130 }, { "epoch": 1.5853658536585367, "eval_loss": 2.842834711074829, "eval_runtime": 39.3323, "eval_samples_per_second": 449.021, "eval_steps_per_second": 0.458, "step": 130 }, { "epoch": 1.7073170731707317, "grad_norm": 1.656348466873169, "learning_rate": 1.7073170731707315e-06, "loss": 2.7478, "step": 140 }, { "epoch": 1.7073170731707317, "eval_loss": 2.671419382095337, "eval_runtime": 39.2963, "eval_samples_per_second": 449.431, "eval_steps_per_second": 0.458, "step": 140 }, { "epoch": 1.8292682926829267, "grad_norm": 1.4909868240356445, "learning_rate": 1.8292682926829268e-06, "loss": 2.5869, "step": 150 }, { "epoch": 1.8292682926829267, "eval_loss": 2.5269365310668945, "eval_runtime": 39.305, "eval_samples_per_second": 449.332, "eval_steps_per_second": 0.458, "step": 150 }, { "epoch": 1.951219512195122, "grad_norm": 1.4536538124084473, "learning_rate": 1.9512195121951218e-06, "loss": 2.4527, "step": 160 }, { "epoch": 1.951219512195122, "eval_loss": 2.4036409854888916, "eval_runtime": 39.3364, "eval_samples_per_second": 448.974, "eval_steps_per_second": 0.458, "step": 160 }, { "epoch": 2.073170731707317, "grad_norm": 1.1609474420547485, "learning_rate": 1.9999184556954774e-06, "loss": 2.3461, "step": 170 }, { "epoch": 2.073170731707317, "eval_loss": 2.299896717071533, "eval_runtime": 39.3179, "eval_samples_per_second": 449.185, "eval_steps_per_second": 0.458, "step": 170 }, { "epoch": 2.1951219512195124, "grad_norm": 0.9852533340454102, "learning_rate": 1.999420177550043e-06, "loss": 2.243, "step": 180 }, { "epoch": 2.1951219512195124, "eval_loss": 2.2197790145874023, "eval_runtime": 39.305, "eval_samples_per_second": 449.332, "eval_steps_per_second": 0.458, "step": 180 }, { "epoch": 2.317073170731707, "grad_norm": 0.7964138388633728, "learning_rate": 1.9984691491033903e-06, "loss": 2.1769, "step": 190 }, { "epoch": 2.317073170731707, "eval_loss": 2.1610889434814453, "eval_runtime": 39.3444, "eval_samples_per_second": 448.883, "eval_steps_per_second": 0.457, "step": 190 }, { "epoch": 2.4390243902439024, "grad_norm": 0.8308465480804443, "learning_rate": 1.9970658011837403e-06, "loss": 2.1223, "step": 200 }, { "epoch": 2.4390243902439024, "eval_loss": 2.120553493499756, "eval_runtime": 39.3228, "eval_samples_per_second": 449.129, "eval_steps_per_second": 0.458, "step": 200 }, { "epoch": 2.5609756097560976, "grad_norm": 0.593834638595581, "learning_rate": 1.995210769525899e-06, "loss": 2.0966, "step": 210 }, { "epoch": 2.5609756097560976, "eval_loss": 2.0926828384399414, "eval_runtime": 39.3549, "eval_samples_per_second": 448.762, "eval_steps_per_second": 0.457, "step": 210 }, { "epoch": 2.682926829268293, "grad_norm": 0.5518779158592224, "learning_rate": 1.9929048944832634e-06, "loss": 2.0771, "step": 220 }, { "epoch": 2.682926829268293, "eval_loss": 2.072599172592163, "eval_runtime": 39.3161, "eval_samples_per_second": 449.205, "eval_steps_per_second": 0.458, "step": 220 }, { "epoch": 2.8048780487804876, "grad_norm": 0.5161460041999817, "learning_rate": 1.9901492206471324e-06, "loss": 2.0494, "step": 230 }, { "epoch": 2.8048780487804876, "eval_loss": 2.056934118270874, "eval_runtime": 39.345, "eval_samples_per_second": 448.876, "eval_steps_per_second": 0.457, "step": 230 }, { "epoch": 2.926829268292683, "grad_norm": 0.49138274788856506, "learning_rate": 1.986944996373489e-06, "loss": 2.0358, "step": 240 }, { "epoch": 2.926829268292683, "eval_loss": 2.0439107418060303, "eval_runtime": 39.3247, "eval_samples_per_second": 449.107, "eval_steps_per_second": 0.458, "step": 240 }, { "epoch": 3.048780487804878, "grad_norm": 0.4589332044124603, "learning_rate": 1.9832936732174833e-06, "loss": 2.0267, "step": 250 }, { "epoch": 3.048780487804878, "eval_loss": 2.0325710773468018, "eval_runtime": 39.3333, "eval_samples_per_second": 449.008, "eval_steps_per_second": 0.458, "step": 250 }, { "epoch": 3.1707317073170733, "grad_norm": 0.5090314745903015, "learning_rate": 1.979196905275856e-06, "loss": 2.0137, "step": 260 }, { "epoch": 3.1707317073170733, "eval_loss": 2.022416830062866, "eval_runtime": 39.3644, "eval_samples_per_second": 448.654, "eval_steps_per_second": 0.457, "step": 260 }, { "epoch": 3.292682926829268, "grad_norm": 0.4382960796356201, "learning_rate": 1.974656548437613e-06, "loss": 2.0011, "step": 270 }, { "epoch": 3.292682926829268, "eval_loss": 2.013012647628784, "eval_runtime": 39.333, "eval_samples_per_second": 449.012, "eval_steps_per_second": 0.458, "step": 270 }, { "epoch": 3.4146341463414633, "grad_norm": 0.4359953999519348, "learning_rate": 1.9696746595432827e-06, "loss": 1.9985, "step": 280 }, { "epoch": 3.4146341463414633, "eval_loss": 2.004154920578003, "eval_runtime": 39.3656, "eval_samples_per_second": 448.641, "eval_steps_per_second": 0.457, "step": 280 }, { "epoch": 3.5365853658536586, "grad_norm": 0.4334374964237213, "learning_rate": 1.964253495453141e-06, "loss": 1.9824, "step": 290 }, { "epoch": 3.5365853658536586, "eval_loss": 1.995658040046692, "eval_runtime": 39.3661, "eval_samples_per_second": 448.635, "eval_steps_per_second": 0.457, "step": 290 }, { "epoch": 3.658536585365854, "grad_norm": 0.45431938767433167, "learning_rate": 1.9583955120248236e-06, "loss": 1.9834, "step": 300 }, { "epoch": 3.658536585365854, "eval_loss": 1.987615704536438, "eval_runtime": 39.3423, "eval_samples_per_second": 448.907, "eval_steps_per_second": 0.458, "step": 300 }, { "epoch": 3.7804878048780486, "grad_norm": 0.4443369507789612, "learning_rate": 1.9521033630007928e-06, "loss": 1.9771, "step": 310 }, { "epoch": 3.7804878048780486, "eval_loss": 1.9800046682357788, "eval_runtime": 39.3384, "eval_samples_per_second": 448.951, "eval_steps_per_second": 0.458, "step": 310 }, { "epoch": 3.902439024390244, "grad_norm": 0.41829970479011536, "learning_rate": 1.945379898806153e-06, "loss": 1.9685, "step": 320 }, { "epoch": 3.902439024390244, "eval_loss": 1.9727787971496582, "eval_runtime": 39.3267, "eval_samples_per_second": 449.084, "eval_steps_per_second": 0.458, "step": 320 }, { "epoch": 4.024390243902439, "grad_norm": 0.43398183584213257, "learning_rate": 1.9382281652573785e-06, "loss": 1.9591, "step": 330 }, { "epoch": 4.024390243902439, "eval_loss": 1.9658193588256836, "eval_runtime": 39.3633, "eval_samples_per_second": 448.666, "eval_steps_per_second": 0.457, "step": 330 }, { "epoch": 4.146341463414634, "grad_norm": 0.39597055315971375, "learning_rate": 1.9306514021825116e-06, "loss": 1.9487, "step": 340 }, { "epoch": 4.146341463414634, "eval_loss": 1.9592342376708984, "eval_runtime": 39.3471, "eval_samples_per_second": 448.852, "eval_steps_per_second": 0.457, "step": 340 }, { "epoch": 4.2682926829268295, "grad_norm": 0.4240054190158844, "learning_rate": 1.922653041953483e-06, "loss": 1.9454, "step": 350 }, { "epoch": 4.2682926829268295, "eval_loss": 1.9528599977493286, "eval_runtime": 39.4079, "eval_samples_per_second": 448.159, "eval_steps_per_second": 0.457, "step": 350 }, { "epoch": 4.390243902439025, "grad_norm": 0.3959615230560303, "learning_rate": 1.914236707931202e-06, "loss": 1.9361, "step": 360 }, { "epoch": 4.390243902439025, "eval_loss": 1.9468114376068115, "eval_runtime": 39.3624, "eval_samples_per_second": 448.676, "eval_steps_per_second": 0.457, "step": 360 }, { "epoch": 4.512195121951219, "grad_norm": 0.3723958432674408, "learning_rate": 1.905406212824126e-06, "loss": 1.9389, "step": 370 }, { "epoch": 4.512195121951219, "eval_loss": 1.940889596939087, "eval_runtime": 39.3409, "eval_samples_per_second": 448.922, "eval_steps_per_second": 0.458, "step": 370 }, { "epoch": 4.634146341463414, "grad_norm": 0.37107619643211365, "learning_rate": 1.8961655569610556e-06, "loss": 1.9279, "step": 380 }, { "epoch": 4.634146341463414, "eval_loss": 1.9352220296859741, "eval_runtime": 39.3794, "eval_samples_per_second": 448.483, "eval_steps_per_second": 0.457, "step": 380 }, { "epoch": 4.7560975609756095, "grad_norm": 0.37607431411743164, "learning_rate": 1.8865189264789318e-06, "loss": 1.9212, "step": 390 }, { "epoch": 4.7560975609756095, "eval_loss": 1.9297622442245483, "eval_runtime": 39.3451, "eval_samples_per_second": 448.875, "eval_steps_per_second": 0.457, "step": 390 }, { "epoch": 4.878048780487805, "grad_norm": 0.371985524892807, "learning_rate": 1.8764706914264633e-06, "loss": 1.9142, "step": 400 }, { "epoch": 4.878048780487805, "eval_loss": 1.9244760274887085, "eval_runtime": 39.3136, "eval_samples_per_second": 449.234, "eval_steps_per_second": 0.458, "step": 400 }, { "epoch": 5.0, "grad_norm": 0.4083458185195923, "learning_rate": 1.8660254037844386e-06, "loss": 1.9087, "step": 410 }, { "epoch": 5.0, "eval_loss": 1.9192752838134766, "eval_runtime": 39.3184, "eval_samples_per_second": 449.18, "eval_steps_per_second": 0.458, "step": 410 }, { "epoch": 5.121951219512195, "grad_norm": 0.36457160115242004, "learning_rate": 1.8551877954036162e-06, "loss": 1.9061, "step": 420 }, { "epoch": 5.121951219512195, "eval_loss": 1.9141229391098022, "eval_runtime": 39.3307, "eval_samples_per_second": 449.038, "eval_steps_per_second": 0.458, "step": 420 }, { "epoch": 5.2439024390243905, "grad_norm": 0.4809107482433319, "learning_rate": 1.8439627758611382e-06, "loss": 1.9013, "step": 430 }, { "epoch": 5.2439024390243905, "eval_loss": 1.908936619758606, "eval_runtime": 39.3459, "eval_samples_per_second": 448.865, "eval_steps_per_second": 0.457, "step": 430 }, { "epoch": 5.365853658536586, "grad_norm": 0.5720298290252686, "learning_rate": 1.832355430236427e-06, "loss": 1.8953, "step": 440 }, { "epoch": 5.365853658536586, "eval_loss": 1.9032728672027588, "eval_runtime": 39.357, "eval_samples_per_second": 448.739, "eval_steps_per_second": 0.457, "step": 440 }, { "epoch": 5.487804878048781, "grad_norm": 0.40899941325187683, "learning_rate": 1.8203710168075784e-06, "loss": 1.8877, "step": 450 }, { "epoch": 5.487804878048781, "eval_loss": 1.8964674472808838, "eval_runtime": 39.3338, "eval_samples_per_second": 449.003, "eval_steps_per_second": 0.458, "step": 450 }, { "epoch": 5.609756097560975, "grad_norm": 0.7770607471466064, "learning_rate": 1.8080149646692928e-06, "loss": 1.8794, "step": 460 }, { "epoch": 5.609756097560975, "eval_loss": 1.8876992464065552, "eval_runtime": 39.345, "eval_samples_per_second": 448.876, "eval_steps_per_second": 0.457, "step": 460 }, { "epoch": 5.7317073170731705, "grad_norm": 1.4756108522415161, "learning_rate": 1.7952928712734265e-06, "loss": 1.8732, "step": 470 }, { "epoch": 5.7317073170731705, "eval_loss": 1.8765217065811157, "eval_runtime": 39.3534, "eval_samples_per_second": 448.78, "eval_steps_per_second": 0.457, "step": 470 }, { "epoch": 5.853658536585366, "grad_norm": 3.089818239212036, "learning_rate": 1.7822104998932711e-06, "loss": 1.8652, "step": 480 }, { "epoch": 5.853658536585366, "eval_loss": 1.8682845830917358, "eval_runtime": 39.3351, "eval_samples_per_second": 448.988, "eval_steps_per_second": 0.458, "step": 480 }, { "epoch": 5.975609756097561, "grad_norm": 1.618462085723877, "learning_rate": 1.7687737770127184e-06, "loss": 1.8513, "step": 490 }, { "epoch": 5.975609756097561, "eval_loss": 1.8599414825439453, "eval_runtime": 39.3081, "eval_samples_per_second": 449.297, "eval_steps_per_second": 0.458, "step": 490 }, { "epoch": 6.097560975609756, "grad_norm": 1.5310617685317993, "learning_rate": 1.754988789641485e-06, "loss": 1.8501, "step": 500 }, { "epoch": 6.097560975609756, "eval_loss": 1.8557490110397339, "eval_runtime": 39.3877, "eval_samples_per_second": 448.389, "eval_steps_per_second": 0.457, "step": 500 }, { "epoch": 6.219512195121951, "grad_norm": 1.2465336322784424, "learning_rate": 1.7408617825576177e-06, "loss": 1.8475, "step": 510 }, { "epoch": 6.219512195121951, "eval_loss": 1.85213303565979, "eval_runtime": 39.3619, "eval_samples_per_second": 448.682, "eval_steps_per_second": 0.457, "step": 510 }, { "epoch": 6.341463414634147, "grad_norm": 0.8563424944877625, "learning_rate": 1.7263991554785288e-06, "loss": 1.8349, "step": 520 }, { "epoch": 6.341463414634147, "eval_loss": 1.8481909036636353, "eval_runtime": 39.3592, "eval_samples_per_second": 448.714, "eval_steps_per_second": 0.457, "step": 520 }, { "epoch": 6.463414634146342, "grad_norm": 0.47906893491744995, "learning_rate": 1.7116074601618415e-06, "loss": 1.8369, "step": 530 }, { "epoch": 6.463414634146342, "eval_loss": 1.8447495698928833, "eval_runtime": 39.3461, "eval_samples_per_second": 448.863, "eval_steps_per_second": 0.457, "step": 530 }, { "epoch": 6.585365853658536, "grad_norm": 0.3527175188064575, "learning_rate": 1.696493397437357e-06, "loss": 1.8288, "step": 540 }, { "epoch": 6.585365853658536, "eval_loss": 1.8413718938827515, "eval_runtime": 39.3552, "eval_samples_per_second": 448.759, "eval_steps_per_second": 0.457, "step": 540 }, { "epoch": 6.7073170731707314, "grad_norm": 0.37900474667549133, "learning_rate": 1.6810638141714932e-06, "loss": 1.8271, "step": 550 }, { "epoch": 6.7073170731707314, "eval_loss": 1.8382277488708496, "eval_runtime": 39.3397, "eval_samples_per_second": 448.936, "eval_steps_per_second": 0.458, "step": 550 }, { "epoch": 6.829268292682927, "grad_norm": 0.3187570869922638, "learning_rate": 1.665325700165565e-06, "loss": 1.8296, "step": 560 }, { "epoch": 6.829268292682927, "eval_loss": 1.8352141380310059, "eval_runtime": 39.3555, "eval_samples_per_second": 448.755, "eval_steps_per_second": 0.457, "step": 560 }, { "epoch": 6.951219512195122, "grad_norm": 0.36600008606910706, "learning_rate": 1.6492861849893147e-06, "loss": 1.8257, "step": 570 }, { "epoch": 6.951219512195122, "eval_loss": 1.832342505455017, "eval_runtime": 39.336, "eval_samples_per_second": 448.978, "eval_steps_per_second": 0.458, "step": 570 }, { "epoch": 7.073170731707317, "grad_norm": 0.288286954164505, "learning_rate": 1.6329525347511218e-06, "loss": 1.8238, "step": 580 }, { "epoch": 7.073170731707317, "eval_loss": 1.8295822143554688, "eval_runtime": 39.3243, "eval_samples_per_second": 449.112, "eval_steps_per_second": 0.458, "step": 580 }, { "epoch": 7.195121951219512, "grad_norm": 0.3237072825431824, "learning_rate": 1.6163321488063635e-06, "loss": 1.8174, "step": 590 }, { "epoch": 7.195121951219512, "eval_loss": 1.8268990516662598, "eval_runtime": 39.3498, "eval_samples_per_second": 448.82, "eval_steps_per_second": 0.457, "step": 590 }, { "epoch": 7.317073170731708, "grad_norm": 0.36146941781044006, "learning_rate": 1.599432556405412e-06, "loss": 1.8141, "step": 600 }, { "epoch": 7.317073170731708, "eval_loss": 1.8243464231491089, "eval_runtime": 39.3676, "eval_samples_per_second": 448.617, "eval_steps_per_second": 0.457, "step": 600 }, { "epoch": 7.439024390243903, "grad_norm": 0.3055365979671478, "learning_rate": 1.5822614132827836e-06, "loss": 1.8141, "step": 610 }, { "epoch": 7.439024390243903, "eval_loss": 1.8218821287155151, "eval_runtime": 39.3324, "eval_samples_per_second": 449.019, "eval_steps_per_second": 0.458, "step": 610 }, { "epoch": 7.560975609756097, "grad_norm": 0.2906692624092102, "learning_rate": 1.5648264981889934e-06, "loss": 1.8096, "step": 620 }, { "epoch": 7.560975609756097, "eval_loss": 1.8194576501846313, "eval_runtime": 39.3325, "eval_samples_per_second": 449.018, "eval_steps_per_second": 0.458, "step": 620 }, { "epoch": 7.682926829268292, "grad_norm": 0.3652225434780121, "learning_rate": 1.5471357093666804e-06, "loss": 1.8119, "step": 630 }, { "epoch": 7.682926829268292, "eval_loss": 1.8171180486679077, "eval_runtime": 39.3377, "eval_samples_per_second": 448.958, "eval_steps_per_second": 0.458, "step": 630 }, { "epoch": 7.804878048780488, "grad_norm": 0.3688996732234955, "learning_rate": 1.5291970609726005e-06, "loss": 1.8042, "step": 640 }, { "epoch": 7.804878048780488, "eval_loss": 1.8148137331008911, "eval_runtime": 39.3463, "eval_samples_per_second": 448.86, "eval_steps_per_second": 0.457, "step": 640 }, { "epoch": 7.926829268292683, "grad_norm": 0.28809812664985657, "learning_rate": 1.5110186794471103e-06, "loss": 1.7979, "step": 650 }, { "epoch": 7.926829268292683, "eval_loss": 1.8126047849655151, "eval_runtime": 39.3619, "eval_samples_per_second": 448.682, "eval_steps_per_second": 0.457, "step": 650 }, { "epoch": 8.048780487804878, "grad_norm": 0.2660142481327057, "learning_rate": 1.4926087998327837e-06, "loss": 1.804, "step": 660 }, { "epoch": 8.048780487804878, "eval_loss": 1.8104569911956787, "eval_runtime": 39.3952, "eval_samples_per_second": 448.303, "eval_steps_per_second": 0.457, "step": 660 }, { "epoch": 8.170731707317072, "grad_norm": 0.281999796628952, "learning_rate": 1.4739757620438307e-06, "loss": 1.7987, "step": 670 }, { "epoch": 8.170731707317072, "eval_loss": 1.8083666563034058, "eval_runtime": 39.3346, "eval_samples_per_second": 448.994, "eval_steps_per_second": 0.458, "step": 670 }, { "epoch": 8.292682926829269, "grad_norm": 0.2869739234447479, "learning_rate": 1.4551280070880087e-06, "loss": 1.7954, "step": 680 }, { "epoch": 8.292682926829269, "eval_loss": 1.8063015937805176, "eval_runtime": 39.3428, "eval_samples_per_second": 448.9, "eval_steps_per_second": 0.458, "step": 680 }, { "epoch": 8.414634146341463, "grad_norm": 0.2752714157104492, "learning_rate": 1.4360740732427365e-06, "loss": 1.797, "step": 690 }, { "epoch": 8.414634146341463, "eval_loss": 1.804310917854309, "eval_runtime": 39.3238, "eval_samples_per_second": 449.118, "eval_steps_per_second": 0.458, "step": 690 }, { "epoch": 8.536585365853659, "grad_norm": 0.4099307358264923, "learning_rate": 1.416822592187143e-06, "loss": 1.791, "step": 700 }, { "epoch": 8.536585365853659, "eval_loss": 1.802320122718811, "eval_runtime": 39.3277, "eval_samples_per_second": 449.073, "eval_steps_per_second": 0.458, "step": 700 }, { "epoch": 8.658536585365853, "grad_norm": 0.3235901892185211, "learning_rate": 1.3973822850918054e-06, "loss": 1.7893, "step": 710 }, { "epoch": 8.658536585365853, "eval_loss": 1.8004404306411743, "eval_runtime": 39.6357, "eval_samples_per_second": 445.583, "eval_steps_per_second": 0.454, "step": 710 }, { "epoch": 8.78048780487805, "grad_norm": 0.3761025071144104, "learning_rate": 1.3777619586679457e-06, "loss": 1.787, "step": 720 }, { "epoch": 8.78048780487805, "eval_loss": 1.7985868453979492, "eval_runtime": 39.2931, "eval_samples_per_second": 449.468, "eval_steps_per_second": 0.458, "step": 720 }, { "epoch": 8.902439024390244, "grad_norm": 0.2766464054584503, "learning_rate": 1.3579705011778765e-06, "loss": 1.7899, "step": 730 }, { "epoch": 8.902439024390244, "eval_loss": 1.7967922687530518, "eval_runtime": 39.3376, "eval_samples_per_second": 448.96, "eval_steps_per_second": 0.458, "step": 730 }, { "epoch": 9.024390243902438, "grad_norm": 0.3136584162712097, "learning_rate": 1.3380168784085026e-06, "loss": 1.7917, "step": 740 }, { "epoch": 9.024390243902438, "eval_loss": 1.7949668169021606, "eval_runtime": 39.3251, "eval_samples_per_second": 449.102, "eval_steps_per_second": 0.458, "step": 740 }, { "epoch": 9.146341463414634, "grad_norm": 0.268803209066391, "learning_rate": 1.3179101296097033e-06, "loss": 1.7838, "step": 750 }, { "epoch": 9.146341463414634, "eval_loss": 1.793213129043579, "eval_runtime": 39.3253, "eval_samples_per_second": 449.101, "eval_steps_per_second": 0.458, "step": 750 }, { "epoch": 9.268292682926829, "grad_norm": 0.2907431125640869, "learning_rate": 1.2976593633994346e-06, "loss": 1.7803, "step": 760 }, { "epoch": 9.268292682926829, "eval_loss": 1.7914844751358032, "eval_runtime": 39.3189, "eval_samples_per_second": 449.173, "eval_steps_per_second": 0.458, "step": 760 }, { "epoch": 9.390243902439025, "grad_norm": 0.3980807960033417, "learning_rate": 1.2772737536374078e-06, "loss": 1.7789, "step": 770 }, { "epoch": 9.390243902439025, "eval_loss": 1.7898335456848145, "eval_runtime": 39.3451, "eval_samples_per_second": 448.875, "eval_steps_per_second": 0.457, "step": 770 }, { "epoch": 9.512195121951219, "grad_norm": 0.30676034092903137, "learning_rate": 1.2567625352692126e-06, "loss": 1.7811, "step": 780 }, { "epoch": 9.512195121951219, "eval_loss": 1.7882270812988281, "eval_runtime": 39.3352, "eval_samples_per_second": 448.987, "eval_steps_per_second": 0.458, "step": 780 }, { "epoch": 9.634146341463415, "grad_norm": 0.24213315546512604, "learning_rate": 1.2361350001427649e-06, "loss": 1.7791, "step": 790 }, { "epoch": 9.634146341463415, "eval_loss": 1.786568522453308, "eval_runtime": 39.3646, "eval_samples_per_second": 448.652, "eval_steps_per_second": 0.457, "step": 790 }, { "epoch": 9.75609756097561, "grad_norm": 0.2722227871417999, "learning_rate": 1.2154004927989813e-06, "loss": 1.7742, "step": 800 }, { "epoch": 9.75609756097561, "eval_loss": 1.784982681274414, "eval_runtime": 39.3395, "eval_samples_per_second": 448.938, "eval_steps_per_second": 0.458, "step": 800 }, { "epoch": 9.878048780487806, "grad_norm": 0.2399929314851761, "learning_rate": 1.19456840623858e-06, "loss": 1.7717, "step": 810 }, { "epoch": 9.878048780487806, "eval_loss": 1.7834330797195435, "eval_runtime": 39.2984, "eval_samples_per_second": 449.408, "eval_steps_per_second": 0.458, "step": 810 }, { "epoch": 10.0, "grad_norm": 0.24806931614875793, "learning_rate": 1.1736481776669305e-06, "loss": 1.7788, "step": 820 }, { "epoch": 10.0, "eval_loss": 1.7818834781646729, "eval_runtime": 39.2977, "eval_samples_per_second": 449.415, "eval_steps_per_second": 0.458, "step": 820 }, { "epoch": 10.121951219512194, "grad_norm": 0.32369279861450195, "learning_rate": 1.1526492842188744e-06, "loss": 1.7719, "step": 830 }, { "epoch": 10.121951219512194, "eval_loss": 1.7803385257720947, "eval_runtime": 39.3284, "eval_samples_per_second": 449.064, "eval_steps_per_second": 0.458, "step": 830 }, { "epoch": 10.24390243902439, "grad_norm": 0.3276310861110687, "learning_rate": 1.1315812386654649e-06, "loss": 1.7675, "step": 840 }, { "epoch": 10.24390243902439, "eval_loss": 1.7788329124450684, "eval_runtime": 39.674, "eval_samples_per_second": 445.153, "eval_steps_per_second": 0.454, "step": 840 }, { "epoch": 10.365853658536585, "grad_norm": 0.2850521504878998, "learning_rate": 1.1104535851045538e-06, "loss": 1.7725, "step": 850 }, { "epoch": 10.365853658536585, "eval_loss": 1.7772928476333618, "eval_runtime": 39.3302, "eval_samples_per_second": 449.045, "eval_steps_per_second": 0.458, "step": 850 }, { "epoch": 10.487804878048781, "grad_norm": 0.27776798605918884, "learning_rate": 1.0892758946371942e-06, "loss": 1.7648, "step": 860 }, { "epoch": 10.487804878048781, "eval_loss": 1.7757339477539062, "eval_runtime": 39.3544, "eval_samples_per_second": 448.768, "eval_steps_per_second": 0.457, "step": 860 }, { "epoch": 10.609756097560975, "grad_norm": 0.5228049755096436, "learning_rate": 1.0680577610318071e-06, "loss": 1.7609, "step": 870 }, { "epoch": 10.609756097560975, "eval_loss": 1.7741671800613403, "eval_runtime": 39.3709, "eval_samples_per_second": 448.581, "eval_steps_per_second": 0.457, "step": 870 }, { "epoch": 10.731707317073171, "grad_norm": 0.3848848044872284, "learning_rate": 1.0468087963780787e-06, "loss": 1.7636, "step": 880 }, { "epoch": 10.731707317073171, "eval_loss": 1.77255380153656, "eval_runtime": 39.3539, "eval_samples_per_second": 448.773, "eval_steps_per_second": 0.457, "step": 880 }, { "epoch": 10.853658536585366, "grad_norm": 0.5747771263122559, "learning_rate": 1.0255386267325602e-06, "loss": 1.7598, "step": 890 }, { "epoch": 10.853658536585366, "eval_loss": 1.770812749862671, "eval_runtime": 39.3689, "eval_samples_per_second": 448.603, "eval_steps_per_second": 0.457, "step": 890 }, { "epoch": 10.975609756097562, "grad_norm": 1.4300990104675293, "learning_rate": 1.0042568877579387e-06, "loss": 1.7651, "step": 900 }, { "epoch": 10.975609756097562, "eval_loss": 1.7690285444259644, "eval_runtime": 39.3685, "eval_samples_per_second": 448.607, "eval_steps_per_second": 0.457, "step": 900 }, { "epoch": 11.097560975609756, "grad_norm": 0.43552011251449585, "learning_rate": 9.829732203579585e-07, "loss": 1.7598, "step": 910 }, { "epoch": 11.097560975609756, "eval_loss": 1.7672632932662964, "eval_runtime": 39.3224, "eval_samples_per_second": 449.133, "eval_steps_per_second": 0.458, "step": 910 }, { "epoch": 11.21951219512195, "grad_norm": 4.467670917510986, "learning_rate": 9.616972663099646e-07, "loss": 1.7585, "step": 920 }, { "epoch": 11.21951219512195, "eval_loss": 1.768608570098877, "eval_runtime": 39.3441, "eval_samples_per_second": 448.886, "eval_steps_per_second": 0.458, "step": 920 }, { "epoch": 11.341463414634147, "grad_norm": 4.179907321929932, "learning_rate": 9.40438663897054e-07, "loss": 1.7572, "step": 930 }, { "epoch": 11.341463414634147, "eval_loss": 1.7685447931289673, "eval_runtime": 39.3059, "eval_samples_per_second": 449.322, "eval_steps_per_second": 0.458, "step": 930 }, { "epoch": 11.463414634146341, "grad_norm": 4.369534492492676, "learning_rate": 9.192070435418078e-07, "loss": 1.7485, "step": 940 }, { "epoch": 11.463414634146341, "eval_loss": 1.7640718221664429, "eval_runtime": 39.3093, "eval_samples_per_second": 449.283, "eval_steps_per_second": 0.458, "step": 940 }, { "epoch": 11.585365853658537, "grad_norm": 1.9561405181884766, "learning_rate": 8.980120234435848e-07, "loss": 1.7528, "step": 950 }, { "epoch": 11.585365853658537, "eval_loss": 1.763830542564392, "eval_runtime": 39.3238, "eval_samples_per_second": 449.117, "eval_steps_per_second": 0.458, "step": 950 }, { "epoch": 11.707317073170731, "grad_norm": 2.5773379802703857, "learning_rate": 8.768632052213531e-07, "loss": 1.7545, "step": 960 }, { "epoch": 11.707317073170731, "eval_loss": 1.7629334926605225, "eval_runtime": 39.3328, "eval_samples_per_second": 449.014, "eval_steps_per_second": 0.458, "step": 960 }, { "epoch": 11.829268292682928, "grad_norm": 3.903297185897827, "learning_rate": 8.557701695640321e-07, "loss": 1.7508, "step": 970 }, { "epoch": 11.829268292682928, "eval_loss": 1.760330319404602, "eval_runtime": 39.3191, "eval_samples_per_second": 449.171, "eval_steps_per_second": 0.458, "step": 970 }, { "epoch": 11.951219512195122, "grad_norm": 2.7617976665496826, "learning_rate": 8.347424718903151e-07, "loss": 1.7542, "step": 980 }, { "epoch": 11.951219512195122, "eval_loss": 1.759660243988037, "eval_runtime": 39.3037, "eval_samples_per_second": 449.347, "eval_steps_per_second": 0.458, "step": 980 }, { "epoch": 12.073170731707316, "grad_norm": 3.2472615242004395, "learning_rate": 8.137896380199421e-07, "loss": 1.752, "step": 990 }, { "epoch": 12.073170731707316, "eval_loss": 1.7598719596862793, "eval_runtime": 39.6916, "eval_samples_per_second": 444.956, "eval_steps_per_second": 0.453, "step": 990 }, { "epoch": 12.195121951219512, "grad_norm": 4.050698757171631, "learning_rate": 7.929211598583793e-07, "loss": 1.7487, "step": 1000 }, { "epoch": 12.195121951219512, "eval_loss": 1.7570974826812744, "eval_runtime": 39.3566, "eval_samples_per_second": 448.743, "eval_steps_per_second": 0.457, "step": 1000 }, { "epoch": 12.317073170731707, "grad_norm": 2.279803991317749, "learning_rate": 7.721464910968626e-07, "loss": 1.7454, "step": 1010 }, { "epoch": 12.317073170731707, "eval_loss": 1.7562564611434937, "eval_runtime": 39.3538, "eval_samples_per_second": 448.775, "eval_steps_per_second": 0.457, "step": 1010 }, { "epoch": 12.439024390243903, "grad_norm": 3.100792407989502, "learning_rate": 7.514750429297527e-07, "loss": 1.7472, "step": 1020 }, { "epoch": 12.439024390243903, "eval_loss": 1.7561583518981934, "eval_runtime": 39.3529, "eval_samples_per_second": 448.786, "eval_steps_per_second": 0.457, "step": 1020 }, { "epoch": 12.560975609756097, "grad_norm": 3.9019737243652344, "learning_rate": 7.30916179791144e-07, "loss": 1.7411, "step": 1030 }, { "epoch": 12.560975609756097, "eval_loss": 1.7533916234970093, "eval_runtime": 39.3587, "eval_samples_per_second": 448.719, "eval_steps_per_second": 0.457, "step": 1030 }, { "epoch": 12.682926829268293, "grad_norm": 2.5269076824188232, "learning_rate": 7.104792151126514e-07, "loss": 1.7441, "step": 1040 }, { "epoch": 12.682926829268293, "eval_loss": 1.7524149417877197, "eval_runtime": 39.3483, "eval_samples_per_second": 448.838, "eval_steps_per_second": 0.457, "step": 1040 }, { "epoch": 12.804878048780488, "grad_norm": 3.1689910888671875, "learning_rate": 6.901734071043071e-07, "loss": 1.7391, "step": 1050 }, { "epoch": 12.804878048780488, "eval_loss": 1.752366542816162, "eval_runtime": 39.3518, "eval_samples_per_second": 448.798, "eval_steps_per_second": 0.457, "step": 1050 }, { "epoch": 12.926829268292684, "grad_norm": 3.6410083770751953, "learning_rate": 6.700079545604707e-07, "loss": 1.7441, "step": 1060 }, { "epoch": 12.926829268292684, "eval_loss": 1.749656319618225, "eval_runtime": 39.3618, "eval_samples_per_second": 448.684, "eval_steps_per_second": 0.457, "step": 1060 }, { "epoch": 13.048780487804878, "grad_norm": 1.9339579343795776, "learning_rate": 6.499919926926565e-07, "loss": 1.7415, "step": 1070 }, { "epoch": 13.048780487804878, "eval_loss": 1.7485558986663818, "eval_runtime": 39.3581, "eval_samples_per_second": 448.726, "eval_steps_per_second": 0.457, "step": 1070 }, { "epoch": 13.170731707317072, "grad_norm": 2.8899059295654297, "learning_rate": 6.301345889911636e-07, "loss": 1.7352, "step": 1080 }, { "epoch": 13.170731707317072, "eval_loss": 1.7480661869049072, "eval_runtime": 39.3461, "eval_samples_per_second": 448.863, "eval_steps_per_second": 0.457, "step": 1080 }, { "epoch": 13.292682926829269, "grad_norm": 3.037234306335449, "learning_rate": 6.104447391173858e-07, "loss": 1.7395, "step": 1090 }, { "epoch": 13.292682926829269, "eval_loss": 1.7456430196762085, "eval_runtime": 39.3861, "eval_samples_per_second": 448.407, "eval_steps_per_second": 0.457, "step": 1090 }, { "epoch": 13.414634146341463, "grad_norm": 1.7524123191833496, "learning_rate": 5.9093136282866e-07, "loss": 1.7317, "step": 1100 }, { "epoch": 13.414634146341463, "eval_loss": 1.7444212436676025, "eval_runtime": 39.3868, "eval_samples_per_second": 448.399, "eval_steps_per_second": 0.457, "step": 1100 }, { "epoch": 13.536585365853659, "grad_norm": 1.582607388496399, "learning_rate": 5.716032999375006e-07, "loss": 1.7356, "step": 1110 }, { "epoch": 13.536585365853659, "eval_loss": 1.743189811706543, "eval_runtime": 39.3597, "eval_samples_per_second": 448.707, "eval_steps_per_second": 0.457, "step": 1110 }, { "epoch": 13.658536585365853, "grad_norm": 0.7700549364089966, "learning_rate": 5.524693063070492e-07, "loss": 1.7347, "step": 1120 }, { "epoch": 13.658536585365853, "eval_loss": 1.7409182786941528, "eval_runtime": 39.3602, "eval_samples_per_second": 448.702, "eval_steps_per_second": 0.457, "step": 1120 }, { "epoch": 13.78048780487805, "grad_norm": 0.8657609820365906, "learning_rate": 5.335380498845559e-07, "loss": 1.7291, "step": 1130 }, { "epoch": 13.78048780487805, "eval_loss": 1.7383273839950562, "eval_runtime": 39.3511, "eval_samples_per_second": 448.806, "eval_steps_per_second": 0.457, "step": 1130 }, { "epoch": 13.902439024390244, "grad_norm": 0.5521230101585388, "learning_rate": 5.148181067746861e-07, "loss": 1.7238, "step": 1140 }, { "epoch": 13.902439024390244, "eval_loss": 1.7357066869735718, "eval_runtime": 39.3644, "eval_samples_per_second": 448.654, "eval_steps_per_second": 0.457, "step": 1140 }, { "epoch": 14.024390243902438, "grad_norm": 0.9353064894676208, "learning_rate": 4.963179573544356e-07, "loss": 1.7238, "step": 1150 }, { "epoch": 14.024390243902438, "eval_loss": 1.7331624031066895, "eval_runtime": 39.355, "eval_samples_per_second": 448.761, "eval_steps_per_second": 0.457, "step": 1150 }, { "epoch": 14.146341463414634, "grad_norm": 0.4633055329322815, "learning_rate": 4.780459824314066e-07, "loss": 1.7234, "step": 1160 }, { "epoch": 14.146341463414634, "eval_loss": 1.7308125495910645, "eval_runtime": 39.4238, "eval_samples_per_second": 447.978, "eval_steps_per_second": 0.457, "step": 1160 }, { "epoch": 14.268292682926829, "grad_norm": 0.5228179693222046, "learning_rate": 4.6001045944719594e-07, "loss": 1.7165, "step": 1170 }, { "epoch": 14.268292682926829, "eval_loss": 1.7286032438278198, "eval_runtime": 39.3556, "eval_samples_per_second": 448.755, "eval_steps_per_second": 0.457, "step": 1170 }, { "epoch": 14.390243902439025, "grad_norm": 0.3939041495323181, "learning_rate": 4.4221955872760573e-07, "loss": 1.7171, "step": 1180 }, { "epoch": 14.390243902439025, "eval_loss": 1.72659432888031, "eval_runtime": 39.3559, "eval_samples_per_second": 448.75, "eval_steps_per_second": 0.457, "step": 1180 }, { "epoch": 14.512195121951219, "grad_norm": 0.3697729706764221, "learning_rate": 4.246813397813794e-07, "loss": 1.7153, "step": 1190 }, { "epoch": 14.512195121951219, "eval_loss": 1.7247569561004639, "eval_runtime": 39.3585, "eval_samples_per_second": 448.722, "eval_steps_per_second": 0.457, "step": 1190 }, { "epoch": 14.634146341463415, "grad_norm": 0.38930952548980713, "learning_rate": 4.074037476491413e-07, "loss": 1.7147, "step": 1200 }, { "epoch": 14.634146341463415, "eval_loss": 1.7230459451675415, "eval_runtime": 39.3707, "eval_samples_per_second": 448.582, "eval_steps_per_second": 0.457, "step": 1200 }, { "epoch": 14.75609756097561, "grad_norm": 0.5216050148010254, "learning_rate": 3.9039460930418767e-07, "loss": 1.7093, "step": 1210 }, { "epoch": 14.75609756097561, "eval_loss": 1.721459984779358, "eval_runtime": 39.4124, "eval_samples_per_second": 448.108, "eval_steps_per_second": 0.457, "step": 1210 }, { "epoch": 14.878048780487806, "grad_norm": 0.6067308187484741, "learning_rate": 3.736616301067693e-07, "loss": 1.7114, "step": 1220 }, { "epoch": 14.878048780487806, "eval_loss": 1.7200278043746948, "eval_runtime": 39.3411, "eval_samples_per_second": 448.92, "eval_steps_per_second": 0.458, "step": 1220 }, { "epoch": 15.0, "grad_norm": 0.528874933719635, "learning_rate": 3.5721239031346063e-07, "loss": 1.7074, "step": 1230 }, { "epoch": 15.0, "eval_loss": 1.7186657190322876, "eval_runtime": 39.4208, "eval_samples_per_second": 448.012, "eval_steps_per_second": 0.457, "step": 1230 }, { "epoch": 15.121951219512194, "grad_norm": 0.4690570831298828, "learning_rate": 3.410543416432069e-07, "loss": 1.7068, "step": 1240 }, { "epoch": 15.121951219512194, "eval_loss": 1.7174080610275269, "eval_runtime": 39.3531, "eval_samples_per_second": 448.783, "eval_steps_per_second": 0.457, "step": 1240 }, { "epoch": 15.24390243902439, "grad_norm": 0.4555855989456177, "learning_rate": 3.2519480390159804e-07, "loss": 1.7067, "step": 1250 }, { "epoch": 15.24390243902439, "eval_loss": 1.7162292003631592, "eval_runtime": 39.3524, "eval_samples_per_second": 448.791, "eval_steps_per_second": 0.457, "step": 1250 }, { "epoch": 15.365853658536585, "grad_norm": 0.8303574919700623, "learning_rate": 3.096409616649023e-07, "loss": 1.7034, "step": 1260 }, { "epoch": 15.365853658536585, "eval_loss": 1.7151583433151245, "eval_runtime": 39.3553, "eval_samples_per_second": 448.758, "eval_steps_per_second": 0.457, "step": 1260 }, { "epoch": 15.487804878048781, "grad_norm": 0.5495628714561462, "learning_rate": 2.943998610253604e-07, "loss": 1.7075, "step": 1270 }, { "epoch": 15.487804878048781, "eval_loss": 1.7141631841659546, "eval_runtime": 39.3687, "eval_samples_per_second": 448.606, "eval_steps_per_second": 0.457, "step": 1270 }, { "epoch": 15.609756097560975, "grad_norm": 0.361331582069397, "learning_rate": 2.7947840639921303e-07, "loss": 1.7002, "step": 1280 }, { "epoch": 15.609756097560975, "eval_loss": 1.7131644487380981, "eval_runtime": 39.4083, "eval_samples_per_second": 448.155, "eval_steps_per_second": 0.457, "step": 1280 }, { "epoch": 15.731707317073171, "grad_norm": 0.4098544418811798, "learning_rate": 2.648833573989118e-07, "loss": 1.7055, "step": 1290 }, { "epoch": 15.731707317073171, "eval_loss": 1.712282657623291, "eval_runtime": 39.3799, "eval_samples_per_second": 448.478, "eval_steps_per_second": 0.457, "step": 1290 }, { "epoch": 15.853658536585366, "grad_norm": 0.5131831765174866, "learning_rate": 2.50621325770927e-07, "loss": 1.6976, "step": 1300 }, { "epoch": 15.853658536585366, "eval_loss": 1.7114192247390747, "eval_runtime": 39.3871, "eval_samples_per_second": 448.395, "eval_steps_per_second": 0.457, "step": 1300 }, { "epoch": 15.975609756097562, "grad_norm": 0.4333685338497162, "learning_rate": 2.3669877240054037e-07, "loss": 1.7002, "step": 1310 }, { "epoch": 15.975609756097562, "eval_loss": 1.710659146308899, "eval_runtime": 39.3698, "eval_samples_per_second": 448.593, "eval_steps_per_second": 0.457, "step": 1310 }, { "epoch": 16.097560975609756, "grad_norm": 0.36955586075782776, "learning_rate": 2.231220043849804e-07, "loss": 1.7015, "step": 1320 }, { "epoch": 16.097560975609756, "eval_loss": 1.7099283933639526, "eval_runtime": 39.3604, "eval_samples_per_second": 448.699, "eval_steps_per_second": 0.457, "step": 1320 }, { "epoch": 16.21951219512195, "grad_norm": 0.37774789333343506, "learning_rate": 2.0989717217622648e-07, "loss": 1.6987, "step": 1330 }, { "epoch": 16.21951219512195, "eval_loss": 1.70924973487854, "eval_runtime": 39.3549, "eval_samples_per_second": 448.763, "eval_steps_per_second": 0.457, "step": 1330 }, { "epoch": 16.341463414634145, "grad_norm": 0.389635294675827, "learning_rate": 1.9703026679477252e-07, "loss": 1.6985, "step": 1340 }, { "epoch": 16.341463414634145, "eval_loss": 1.7086195945739746, "eval_runtime": 39.7499, "eval_samples_per_second": 444.303, "eval_steps_per_second": 0.453, "step": 1340 }, { "epoch": 16.463414634146343, "grad_norm": 0.4067881405353546, "learning_rate": 1.845271171156184e-07, "loss": 1.6986, "step": 1350 }, { "epoch": 16.463414634146343, "eval_loss": 1.7080307006835938, "eval_runtime": 39.3203, "eval_samples_per_second": 449.157, "eval_steps_per_second": 0.458, "step": 1350 }, { "epoch": 16.585365853658537, "grad_norm": 0.33628836274147034, "learning_rate": 1.7239338722771324e-07, "loss": 1.6993, "step": 1360 }, { "epoch": 16.585365853658537, "eval_loss": 1.707476019859314, "eval_runtime": 39.357, "eval_samples_per_second": 448.739, "eval_steps_per_second": 0.457, "step": 1360 }, { "epoch": 16.70731707317073, "grad_norm": 0.31285569071769714, "learning_rate": 1.6063457386805003e-07, "loss": 1.6946, "step": 1370 }, { "epoch": 16.70731707317073, "eval_loss": 1.7069728374481201, "eval_runtime": 39.3881, "eval_samples_per_second": 448.384, "eval_steps_per_second": 0.457, "step": 1370 }, { "epoch": 16.829268292682926, "grad_norm": 0.3229863941669464, "learning_rate": 1.4925600393157322e-07, "loss": 1.6934, "step": 1380 }, { "epoch": 16.829268292682926, "eval_loss": 1.7064942121505737, "eval_runtime": 39.3862, "eval_samples_per_second": 448.406, "eval_steps_per_second": 0.457, "step": 1380 }, { "epoch": 16.951219512195124, "grad_norm": 0.32283350825309753, "learning_rate": 1.3826283205802424e-07, "loss": 1.6936, "step": 1390 }, { "epoch": 16.951219512195124, "eval_loss": 1.706059217453003, "eval_runtime": 39.374, "eval_samples_per_second": 448.544, "eval_steps_per_second": 0.457, "step": 1390 }, { "epoch": 17.073170731707318, "grad_norm": 0.2510131597518921, "learning_rate": 1.2766003829682504e-07, "loss": 1.6972, "step": 1400 }, { "epoch": 17.073170731707318, "eval_loss": 1.705664873123169, "eval_runtime": 39.3448, "eval_samples_per_second": 448.877, "eval_steps_per_second": 0.457, "step": 1400 }, { "epoch": 17.195121951219512, "grad_norm": 0.2857695519924164, "learning_rate": 1.1745242585104953e-07, "loss": 1.6923, "step": 1410 }, { "epoch": 17.195121951219512, "eval_loss": 1.7052934169769287, "eval_runtime": 39.3548, "eval_samples_per_second": 448.763, "eval_steps_per_second": 0.457, "step": 1410 }, { "epoch": 17.317073170731707, "grad_norm": 0.39086970686912537, "learning_rate": 1.0764461890151111e-07, "loss": 1.6943, "step": 1420 }, { "epoch": 17.317073170731707, "eval_loss": 1.704952359199524, "eval_runtime": 39.366, "eval_samples_per_second": 448.635, "eval_steps_per_second": 0.457, "step": 1420 }, { "epoch": 17.4390243902439, "grad_norm": 0.2526913583278656, "learning_rate": 9.824106051194858e-08, "loss": 1.6944, "step": 1430 }, { "epoch": 17.4390243902439, "eval_loss": 1.7046380043029785, "eval_runtime": 39.3509, "eval_samples_per_second": 448.809, "eval_steps_per_second": 0.457, "step": 1430 }, { "epoch": 17.5609756097561, "grad_norm": 0.2990001142024994, "learning_rate": 8.924601061626048e-08, "loss": 1.6929, "step": 1440 }, { "epoch": 17.5609756097561, "eval_loss": 1.7043615579605103, "eval_runtime": 39.3593, "eval_samples_per_second": 448.712, "eval_steps_per_second": 0.457, "step": 1440 }, { "epoch": 17.682926829268293, "grad_norm": 0.24767932295799255, "learning_rate": 8.066354408870047e-08, "loss": 1.6926, "step": 1450 }, { "epoch": 17.682926829268293, "eval_loss": 1.7040989398956299, "eval_runtime": 39.3568, "eval_samples_per_second": 448.741, "eval_steps_per_second": 0.457, "step": 1450 }, { "epoch": 17.804878048780488, "grad_norm": 0.3169814348220825, "learning_rate": 7.249754889790538e-08, "loss": 1.6926, "step": 1460 }, { "epoch": 17.804878048780488, "eval_loss": 1.703873634338379, "eval_runtime": 39.4, "eval_samples_per_second": 448.249, "eval_steps_per_second": 0.457, "step": 1460 }, { "epoch": 17.926829268292682, "grad_norm": 0.3294218182563782, "learning_rate": 6.475172434559573e-08, "loss": 1.6932, "step": 1470 }, { "epoch": 17.926829268292682, "eval_loss": 1.703667163848877, "eval_runtime": 39.4234, "eval_samples_per_second": 447.983, "eval_steps_per_second": 0.457, "step": 1470 }, { "epoch": 18.048780487804876, "grad_norm": 0.2851867079734802, "learning_rate": 5.742957939074411e-08, "loss": 1.6927, "step": 1480 }, { "epoch": 18.048780487804876, "eval_loss": 1.7034906148910522, "eval_runtime": 39.3855, "eval_samples_per_second": 448.414, "eval_steps_per_second": 0.457, "step": 1480 }, { "epoch": 18.170731707317074, "grad_norm": 0.2505706250667572, "learning_rate": 5.053443105997068e-08, "loss": 1.6905, "step": 1490 }, { "epoch": 18.170731707317074, "eval_loss": 1.7033272981643677, "eval_runtime": 39.3764, "eval_samples_per_second": 448.517, "eval_steps_per_second": 0.457, "step": 1490 }, { "epoch": 18.29268292682927, "grad_norm": 0.2556091248989105, "learning_rate": 4.4069402944887704e-08, "loss": 1.6928, "step": 1500 }, { "epoch": 18.29268292682927, "eval_loss": 1.703181505203247, "eval_runtime": 39.3582, "eval_samples_per_second": 448.725, "eval_steps_per_second": 0.457, "step": 1500 }, { "epoch": 18.414634146341463, "grad_norm": 0.2573912739753723, "learning_rate": 3.803742378707198e-08, "loss": 1.6944, "step": 1510 }, { "epoch": 18.414634146341463, "eval_loss": 1.703063726425171, "eval_runtime": 39.3337, "eval_samples_per_second": 449.004, "eval_steps_per_second": 0.458, "step": 1510 }, { "epoch": 18.536585365853657, "grad_norm": 0.24173639714717865, "learning_rate": 3.24412261513064e-08, "loss": 1.6925, "step": 1520 }, { "epoch": 18.536585365853657, "eval_loss": 1.7029577493667603, "eval_runtime": 39.3737, "eval_samples_per_second": 448.549, "eval_steps_per_second": 0.457, "step": 1520 }, { "epoch": 18.658536585365855, "grad_norm": 0.24515186250209808, "learning_rate": 2.7283345187693264e-08, "loss": 1.6944, "step": 1530 }, { "epoch": 18.658536585365855, "eval_loss": 1.7028616666793823, "eval_runtime": 39.3701, "eval_samples_per_second": 448.589, "eval_steps_per_second": 0.457, "step": 1530 }, { "epoch": 18.78048780487805, "grad_norm": 0.25829750299453735, "learning_rate": 2.256611748319792e-08, "loss": 1.6897, "step": 1540 }, { "epoch": 18.78048780487805, "eval_loss": 1.7027884721755981, "eval_runtime": 39.4047, "eval_samples_per_second": 448.195, "eval_steps_per_second": 0.457, "step": 1540 }, { "epoch": 18.902439024390244, "grad_norm": 0.2337442934513092, "learning_rate": 1.8291680003145073e-08, "loss": 1.6915, "step": 1550 }, { "epoch": 18.902439024390244, "eval_loss": 1.702728033065796, "eval_runtime": 39.8565, "eval_samples_per_second": 443.115, "eval_steps_per_second": 0.452, "step": 1550 }, { "epoch": 19.024390243902438, "grad_norm": 0.24271942675113678, "learning_rate": 1.4461969123145457e-08, "loss": 1.6891, "step": 1560 }, { "epoch": 19.024390243902438, "eval_loss": 1.7026790380477905, "eval_runtime": 39.3468, "eval_samples_per_second": 448.854, "eval_steps_per_second": 0.457, "step": 1560 }, { "epoch": 19.146341463414632, "grad_norm": 0.2199811339378357, "learning_rate": 1.107871975189234e-08, "loss": 1.6884, "step": 1570 }, { "epoch": 19.146341463414632, "eval_loss": 1.702639102935791, "eval_runtime": 39.3619, "eval_samples_per_second": 448.683, "eval_steps_per_second": 0.457, "step": 1570 }, { "epoch": 19.26829268292683, "grad_norm": 0.24018193781375885, "learning_rate": 8.143464545226297e-09, "loss": 1.6962, "step": 1580 }, { "epoch": 19.26829268292683, "eval_loss": 1.7026113271713257, "eval_runtime": 39.2823, "eval_samples_per_second": 449.591, "eval_steps_per_second": 0.458, "step": 1580 }, { "epoch": 19.390243902439025, "grad_norm": 0.23089687526226044, "learning_rate": 5.657533211820941e-09, "loss": 1.6918, "step": 1590 }, { "epoch": 19.390243902439025, "eval_loss": 1.7025905847549438, "eval_runtime": 39.2847, "eval_samples_per_second": 449.564, "eval_steps_per_second": 0.458, "step": 1590 }, { "epoch": 19.51219512195122, "grad_norm": 0.219436913728714, "learning_rate": 3.6220519108086654e-09, "loss": 1.6906, "step": 1600 }, { "epoch": 19.51219512195122, "eval_loss": 1.7025744915008545, "eval_runtime": 39.3227, "eval_samples_per_second": 449.13, "eval_steps_per_second": 0.458, "step": 1600 }, { "epoch": 19.634146341463413, "grad_norm": 0.21289722621440887, "learning_rate": 2.037942741615617e-09, "loss": 1.691, "step": 1610 }, { "epoch": 19.634146341463413, "eval_loss": 1.7025699615478516, "eval_runtime": 39.3676, "eval_samples_per_second": 448.617, "eval_steps_per_second": 0.457, "step": 1610 }, { "epoch": 19.75609756097561, "grad_norm": 0.2050682008266449, "learning_rate": 9.059233262386224e-10, "loss": 1.6963, "step": 1620 }, { "epoch": 19.75609756097561, "eval_loss": 1.7025647163391113, "eval_runtime": 39.2816, "eval_samples_per_second": 449.599, "eval_steps_per_second": 0.458, "step": 1620 }, { "epoch": 19.878048780487806, "grad_norm": 0.2104637622833252, "learning_rate": 2.265064841533437e-10, "loss": 1.69, "step": 1630 }, { "epoch": 19.878048780487806, "eval_loss": 1.7025623321533203, "eval_runtime": 39.3024, "eval_samples_per_second": 449.362, "eval_steps_per_second": 0.458, "step": 1630 }, { "epoch": 20.0, "grad_norm": 0.24021713435649872, "learning_rate": 0.0, "loss": 1.6877, "step": 1640 }, { "epoch": 20.0, "eval_loss": 1.7025611400604248, "eval_runtime": 39.2695, "eval_samples_per_second": 449.738, "eval_steps_per_second": 0.458, "step": 1640 } ], "logging_steps": 10, "max_steps": 1640, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5456972789249475e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }