M5-v2 / trainer_state.json
chouss's picture
Uploading folder contents
bb4f000 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 10,
"global_step": 1640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12195121951219512,
"grad_norm": 35.852298736572266,
"learning_rate": 1.219512195121951e-07,
"loss": 7.8912,
"step": 10
},
{
"epoch": 0.12195121951219512,
"eval_loss": 7.806792736053467,
"eval_runtime": 39.7349,
"eval_samples_per_second": 444.47,
"eval_steps_per_second": 0.453,
"step": 10
},
{
"epoch": 0.24390243902439024,
"grad_norm": 35.98800277709961,
"learning_rate": 2.439024390243902e-07,
"loss": 7.6722,
"step": 20
},
{
"epoch": 0.24390243902439024,
"eval_loss": 7.456321716308594,
"eval_runtime": 39.2616,
"eval_samples_per_second": 449.828,
"eval_steps_per_second": 0.458,
"step": 20
},
{
"epoch": 0.36585365853658536,
"grad_norm": 33.745487213134766,
"learning_rate": 3.6585365853658536e-07,
"loss": 7.2095,
"step": 30
},
{
"epoch": 0.36585365853658536,
"eval_loss": 6.869460105895996,
"eval_runtime": 39.2191,
"eval_samples_per_second": 450.316,
"eval_steps_per_second": 0.459,
"step": 30
},
{
"epoch": 0.4878048780487805,
"grad_norm": 28.555143356323242,
"learning_rate": 4.878048780487804e-07,
"loss": 6.5213,
"step": 40
},
{
"epoch": 0.4878048780487805,
"eval_loss": 6.08616828918457,
"eval_runtime": 39.2368,
"eval_samples_per_second": 450.114,
"eval_steps_per_second": 0.459,
"step": 40
},
{
"epoch": 0.6097560975609756,
"grad_norm": 17.861717224121094,
"learning_rate": 6.097560975609756e-07,
"loss": 5.6988,
"step": 50
},
{
"epoch": 0.6097560975609756,
"eval_loss": 5.294862270355225,
"eval_runtime": 39.2476,
"eval_samples_per_second": 449.989,
"eval_steps_per_second": 0.459,
"step": 50
},
{
"epoch": 0.7317073170731707,
"grad_norm": 9.703508377075195,
"learning_rate": 7.317073170731707e-07,
"loss": 5.0031,
"step": 60
},
{
"epoch": 0.7317073170731707,
"eval_loss": 4.72209358215332,
"eval_runtime": 39.2415,
"eval_samples_per_second": 450.059,
"eval_steps_per_second": 0.459,
"step": 60
},
{
"epoch": 0.8536585365853658,
"grad_norm": 5.830469131469727,
"learning_rate": 8.536585365853657e-07,
"loss": 4.5171,
"step": 70
},
{
"epoch": 0.8536585365853658,
"eval_loss": 4.342286586761475,
"eval_runtime": 39.2288,
"eval_samples_per_second": 450.205,
"eval_steps_per_second": 0.459,
"step": 70
},
{
"epoch": 0.975609756097561,
"grad_norm": 4.307955265045166,
"learning_rate": 9.756097560975609e-07,
"loss": 4.1858,
"step": 80
},
{
"epoch": 0.975609756097561,
"eval_loss": 4.0347723960876465,
"eval_runtime": 39.2556,
"eval_samples_per_second": 449.897,
"eval_steps_per_second": 0.459,
"step": 80
},
{
"epoch": 1.0975609756097562,
"grad_norm": 3.8736400604248047,
"learning_rate": 1.0975609756097562e-06,
"loss": 3.8907,
"step": 90
},
{
"epoch": 1.0975609756097562,
"eval_loss": 3.7496390342712402,
"eval_runtime": 39.2415,
"eval_samples_per_second": 450.059,
"eval_steps_per_second": 0.459,
"step": 90
},
{
"epoch": 1.2195121951219512,
"grad_norm": 3.2329907417297363,
"learning_rate": 1.2195121951219512e-06,
"loss": 3.6078,
"step": 100
},
{
"epoch": 1.2195121951219512,
"eval_loss": 3.480637788772583,
"eval_runtime": 39.2636,
"eval_samples_per_second": 449.805,
"eval_steps_per_second": 0.458,
"step": 100
},
{
"epoch": 1.3414634146341464,
"grad_norm": 2.6243083477020264,
"learning_rate": 1.3414634146341463e-06,
"loss": 3.3481,
"step": 110
},
{
"epoch": 1.3414634146341464,
"eval_loss": 3.248450756072998,
"eval_runtime": 39.2605,
"eval_samples_per_second": 449.841,
"eval_steps_per_second": 0.458,
"step": 110
},
{
"epoch": 1.4634146341463414,
"grad_norm": 2.2950096130371094,
"learning_rate": 1.4634146341463414e-06,
"loss": 3.1262,
"step": 120
},
{
"epoch": 1.4634146341463414,
"eval_loss": 3.042900323867798,
"eval_runtime": 39.3067,
"eval_samples_per_second": 449.313,
"eval_steps_per_second": 0.458,
"step": 120
},
{
"epoch": 1.5853658536585367,
"grad_norm": 2.118208408355713,
"learning_rate": 1.5853658536585366e-06,
"loss": 2.9296,
"step": 130
},
{
"epoch": 1.5853658536585367,
"eval_loss": 2.842834711074829,
"eval_runtime": 39.3323,
"eval_samples_per_second": 449.021,
"eval_steps_per_second": 0.458,
"step": 130
},
{
"epoch": 1.7073170731707317,
"grad_norm": 1.656348466873169,
"learning_rate": 1.7073170731707315e-06,
"loss": 2.7478,
"step": 140
},
{
"epoch": 1.7073170731707317,
"eval_loss": 2.671419382095337,
"eval_runtime": 39.2963,
"eval_samples_per_second": 449.431,
"eval_steps_per_second": 0.458,
"step": 140
},
{
"epoch": 1.8292682926829267,
"grad_norm": 1.4909868240356445,
"learning_rate": 1.8292682926829268e-06,
"loss": 2.5869,
"step": 150
},
{
"epoch": 1.8292682926829267,
"eval_loss": 2.5269365310668945,
"eval_runtime": 39.305,
"eval_samples_per_second": 449.332,
"eval_steps_per_second": 0.458,
"step": 150
},
{
"epoch": 1.951219512195122,
"grad_norm": 1.4536538124084473,
"learning_rate": 1.9512195121951218e-06,
"loss": 2.4527,
"step": 160
},
{
"epoch": 1.951219512195122,
"eval_loss": 2.4036409854888916,
"eval_runtime": 39.3364,
"eval_samples_per_second": 448.974,
"eval_steps_per_second": 0.458,
"step": 160
},
{
"epoch": 2.073170731707317,
"grad_norm": 1.1609474420547485,
"learning_rate": 1.9999184556954774e-06,
"loss": 2.3461,
"step": 170
},
{
"epoch": 2.073170731707317,
"eval_loss": 2.299896717071533,
"eval_runtime": 39.3179,
"eval_samples_per_second": 449.185,
"eval_steps_per_second": 0.458,
"step": 170
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.9852533340454102,
"learning_rate": 1.999420177550043e-06,
"loss": 2.243,
"step": 180
},
{
"epoch": 2.1951219512195124,
"eval_loss": 2.2197790145874023,
"eval_runtime": 39.305,
"eval_samples_per_second": 449.332,
"eval_steps_per_second": 0.458,
"step": 180
},
{
"epoch": 2.317073170731707,
"grad_norm": 0.7964138388633728,
"learning_rate": 1.9984691491033903e-06,
"loss": 2.1769,
"step": 190
},
{
"epoch": 2.317073170731707,
"eval_loss": 2.1610889434814453,
"eval_runtime": 39.3444,
"eval_samples_per_second": 448.883,
"eval_steps_per_second": 0.457,
"step": 190
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.8308465480804443,
"learning_rate": 1.9970658011837403e-06,
"loss": 2.1223,
"step": 200
},
{
"epoch": 2.4390243902439024,
"eval_loss": 2.120553493499756,
"eval_runtime": 39.3228,
"eval_samples_per_second": 449.129,
"eval_steps_per_second": 0.458,
"step": 200
},
{
"epoch": 2.5609756097560976,
"grad_norm": 0.593834638595581,
"learning_rate": 1.995210769525899e-06,
"loss": 2.0966,
"step": 210
},
{
"epoch": 2.5609756097560976,
"eval_loss": 2.0926828384399414,
"eval_runtime": 39.3549,
"eval_samples_per_second": 448.762,
"eval_steps_per_second": 0.457,
"step": 210
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.5518779158592224,
"learning_rate": 1.9929048944832634e-06,
"loss": 2.0771,
"step": 220
},
{
"epoch": 2.682926829268293,
"eval_loss": 2.072599172592163,
"eval_runtime": 39.3161,
"eval_samples_per_second": 449.205,
"eval_steps_per_second": 0.458,
"step": 220
},
{
"epoch": 2.8048780487804876,
"grad_norm": 0.5161460041999817,
"learning_rate": 1.9901492206471324e-06,
"loss": 2.0494,
"step": 230
},
{
"epoch": 2.8048780487804876,
"eval_loss": 2.056934118270874,
"eval_runtime": 39.345,
"eval_samples_per_second": 448.876,
"eval_steps_per_second": 0.457,
"step": 230
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.49138274788856506,
"learning_rate": 1.986944996373489e-06,
"loss": 2.0358,
"step": 240
},
{
"epoch": 2.926829268292683,
"eval_loss": 2.0439107418060303,
"eval_runtime": 39.3247,
"eval_samples_per_second": 449.107,
"eval_steps_per_second": 0.458,
"step": 240
},
{
"epoch": 3.048780487804878,
"grad_norm": 0.4589332044124603,
"learning_rate": 1.9832936732174833e-06,
"loss": 2.0267,
"step": 250
},
{
"epoch": 3.048780487804878,
"eval_loss": 2.0325710773468018,
"eval_runtime": 39.3333,
"eval_samples_per_second": 449.008,
"eval_steps_per_second": 0.458,
"step": 250
},
{
"epoch": 3.1707317073170733,
"grad_norm": 0.5090314745903015,
"learning_rate": 1.979196905275856e-06,
"loss": 2.0137,
"step": 260
},
{
"epoch": 3.1707317073170733,
"eval_loss": 2.022416830062866,
"eval_runtime": 39.3644,
"eval_samples_per_second": 448.654,
"eval_steps_per_second": 0.457,
"step": 260
},
{
"epoch": 3.292682926829268,
"grad_norm": 0.4382960796356201,
"learning_rate": 1.974656548437613e-06,
"loss": 2.0011,
"step": 270
},
{
"epoch": 3.292682926829268,
"eval_loss": 2.013012647628784,
"eval_runtime": 39.333,
"eval_samples_per_second": 449.012,
"eval_steps_per_second": 0.458,
"step": 270
},
{
"epoch": 3.4146341463414633,
"grad_norm": 0.4359953999519348,
"learning_rate": 1.9696746595432827e-06,
"loss": 1.9985,
"step": 280
},
{
"epoch": 3.4146341463414633,
"eval_loss": 2.004154920578003,
"eval_runtime": 39.3656,
"eval_samples_per_second": 448.641,
"eval_steps_per_second": 0.457,
"step": 280
},
{
"epoch": 3.5365853658536586,
"grad_norm": 0.4334374964237213,
"learning_rate": 1.964253495453141e-06,
"loss": 1.9824,
"step": 290
},
{
"epoch": 3.5365853658536586,
"eval_loss": 1.995658040046692,
"eval_runtime": 39.3661,
"eval_samples_per_second": 448.635,
"eval_steps_per_second": 0.457,
"step": 290
},
{
"epoch": 3.658536585365854,
"grad_norm": 0.45431938767433167,
"learning_rate": 1.9583955120248236e-06,
"loss": 1.9834,
"step": 300
},
{
"epoch": 3.658536585365854,
"eval_loss": 1.987615704536438,
"eval_runtime": 39.3423,
"eval_samples_per_second": 448.907,
"eval_steps_per_second": 0.458,
"step": 300
},
{
"epoch": 3.7804878048780486,
"grad_norm": 0.4443369507789612,
"learning_rate": 1.9521033630007928e-06,
"loss": 1.9771,
"step": 310
},
{
"epoch": 3.7804878048780486,
"eval_loss": 1.9800046682357788,
"eval_runtime": 39.3384,
"eval_samples_per_second": 448.951,
"eval_steps_per_second": 0.458,
"step": 310
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.41829970479011536,
"learning_rate": 1.945379898806153e-06,
"loss": 1.9685,
"step": 320
},
{
"epoch": 3.902439024390244,
"eval_loss": 1.9727787971496582,
"eval_runtime": 39.3267,
"eval_samples_per_second": 449.084,
"eval_steps_per_second": 0.458,
"step": 320
},
{
"epoch": 4.024390243902439,
"grad_norm": 0.43398183584213257,
"learning_rate": 1.9382281652573785e-06,
"loss": 1.9591,
"step": 330
},
{
"epoch": 4.024390243902439,
"eval_loss": 1.9658193588256836,
"eval_runtime": 39.3633,
"eval_samples_per_second": 448.666,
"eval_steps_per_second": 0.457,
"step": 330
},
{
"epoch": 4.146341463414634,
"grad_norm": 0.39597055315971375,
"learning_rate": 1.9306514021825116e-06,
"loss": 1.9487,
"step": 340
},
{
"epoch": 4.146341463414634,
"eval_loss": 1.9592342376708984,
"eval_runtime": 39.3471,
"eval_samples_per_second": 448.852,
"eval_steps_per_second": 0.457,
"step": 340
},
{
"epoch": 4.2682926829268295,
"grad_norm": 0.4240054190158844,
"learning_rate": 1.922653041953483e-06,
"loss": 1.9454,
"step": 350
},
{
"epoch": 4.2682926829268295,
"eval_loss": 1.9528599977493286,
"eval_runtime": 39.4079,
"eval_samples_per_second": 448.159,
"eval_steps_per_second": 0.457,
"step": 350
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.3959615230560303,
"learning_rate": 1.914236707931202e-06,
"loss": 1.9361,
"step": 360
},
{
"epoch": 4.390243902439025,
"eval_loss": 1.9468114376068115,
"eval_runtime": 39.3624,
"eval_samples_per_second": 448.676,
"eval_steps_per_second": 0.457,
"step": 360
},
{
"epoch": 4.512195121951219,
"grad_norm": 0.3723958432674408,
"learning_rate": 1.905406212824126e-06,
"loss": 1.9389,
"step": 370
},
{
"epoch": 4.512195121951219,
"eval_loss": 1.940889596939087,
"eval_runtime": 39.3409,
"eval_samples_per_second": 448.922,
"eval_steps_per_second": 0.458,
"step": 370
},
{
"epoch": 4.634146341463414,
"grad_norm": 0.37107619643211365,
"learning_rate": 1.8961655569610556e-06,
"loss": 1.9279,
"step": 380
},
{
"epoch": 4.634146341463414,
"eval_loss": 1.9352220296859741,
"eval_runtime": 39.3794,
"eval_samples_per_second": 448.483,
"eval_steps_per_second": 0.457,
"step": 380
},
{
"epoch": 4.7560975609756095,
"grad_norm": 0.37607431411743164,
"learning_rate": 1.8865189264789318e-06,
"loss": 1.9212,
"step": 390
},
{
"epoch": 4.7560975609756095,
"eval_loss": 1.9297622442245483,
"eval_runtime": 39.3451,
"eval_samples_per_second": 448.875,
"eval_steps_per_second": 0.457,
"step": 390
},
{
"epoch": 4.878048780487805,
"grad_norm": 0.371985524892807,
"learning_rate": 1.8764706914264633e-06,
"loss": 1.9142,
"step": 400
},
{
"epoch": 4.878048780487805,
"eval_loss": 1.9244760274887085,
"eval_runtime": 39.3136,
"eval_samples_per_second": 449.234,
"eval_steps_per_second": 0.458,
"step": 400
},
{
"epoch": 5.0,
"grad_norm": 0.4083458185195923,
"learning_rate": 1.8660254037844386e-06,
"loss": 1.9087,
"step": 410
},
{
"epoch": 5.0,
"eval_loss": 1.9192752838134766,
"eval_runtime": 39.3184,
"eval_samples_per_second": 449.18,
"eval_steps_per_second": 0.458,
"step": 410
},
{
"epoch": 5.121951219512195,
"grad_norm": 0.36457160115242004,
"learning_rate": 1.8551877954036162e-06,
"loss": 1.9061,
"step": 420
},
{
"epoch": 5.121951219512195,
"eval_loss": 1.9141229391098022,
"eval_runtime": 39.3307,
"eval_samples_per_second": 449.038,
"eval_steps_per_second": 0.458,
"step": 420
},
{
"epoch": 5.2439024390243905,
"grad_norm": 0.4809107482433319,
"learning_rate": 1.8439627758611382e-06,
"loss": 1.9013,
"step": 430
},
{
"epoch": 5.2439024390243905,
"eval_loss": 1.908936619758606,
"eval_runtime": 39.3459,
"eval_samples_per_second": 448.865,
"eval_steps_per_second": 0.457,
"step": 430
},
{
"epoch": 5.365853658536586,
"grad_norm": 0.5720298290252686,
"learning_rate": 1.832355430236427e-06,
"loss": 1.8953,
"step": 440
},
{
"epoch": 5.365853658536586,
"eval_loss": 1.9032728672027588,
"eval_runtime": 39.357,
"eval_samples_per_second": 448.739,
"eval_steps_per_second": 0.457,
"step": 440
},
{
"epoch": 5.487804878048781,
"grad_norm": 0.40899941325187683,
"learning_rate": 1.8203710168075784e-06,
"loss": 1.8877,
"step": 450
},
{
"epoch": 5.487804878048781,
"eval_loss": 1.8964674472808838,
"eval_runtime": 39.3338,
"eval_samples_per_second": 449.003,
"eval_steps_per_second": 0.458,
"step": 450
},
{
"epoch": 5.609756097560975,
"grad_norm": 0.7770607471466064,
"learning_rate": 1.8080149646692928e-06,
"loss": 1.8794,
"step": 460
},
{
"epoch": 5.609756097560975,
"eval_loss": 1.8876992464065552,
"eval_runtime": 39.345,
"eval_samples_per_second": 448.876,
"eval_steps_per_second": 0.457,
"step": 460
},
{
"epoch": 5.7317073170731705,
"grad_norm": 1.4756108522415161,
"learning_rate": 1.7952928712734265e-06,
"loss": 1.8732,
"step": 470
},
{
"epoch": 5.7317073170731705,
"eval_loss": 1.8765217065811157,
"eval_runtime": 39.3534,
"eval_samples_per_second": 448.78,
"eval_steps_per_second": 0.457,
"step": 470
},
{
"epoch": 5.853658536585366,
"grad_norm": 3.089818239212036,
"learning_rate": 1.7822104998932711e-06,
"loss": 1.8652,
"step": 480
},
{
"epoch": 5.853658536585366,
"eval_loss": 1.8682845830917358,
"eval_runtime": 39.3351,
"eval_samples_per_second": 448.988,
"eval_steps_per_second": 0.458,
"step": 480
},
{
"epoch": 5.975609756097561,
"grad_norm": 1.618462085723877,
"learning_rate": 1.7687737770127184e-06,
"loss": 1.8513,
"step": 490
},
{
"epoch": 5.975609756097561,
"eval_loss": 1.8599414825439453,
"eval_runtime": 39.3081,
"eval_samples_per_second": 449.297,
"eval_steps_per_second": 0.458,
"step": 490
},
{
"epoch": 6.097560975609756,
"grad_norm": 1.5310617685317993,
"learning_rate": 1.754988789641485e-06,
"loss": 1.8501,
"step": 500
},
{
"epoch": 6.097560975609756,
"eval_loss": 1.8557490110397339,
"eval_runtime": 39.3877,
"eval_samples_per_second": 448.389,
"eval_steps_per_second": 0.457,
"step": 500
},
{
"epoch": 6.219512195121951,
"grad_norm": 1.2465336322784424,
"learning_rate": 1.7408617825576177e-06,
"loss": 1.8475,
"step": 510
},
{
"epoch": 6.219512195121951,
"eval_loss": 1.85213303565979,
"eval_runtime": 39.3619,
"eval_samples_per_second": 448.682,
"eval_steps_per_second": 0.457,
"step": 510
},
{
"epoch": 6.341463414634147,
"grad_norm": 0.8563424944877625,
"learning_rate": 1.7263991554785288e-06,
"loss": 1.8349,
"step": 520
},
{
"epoch": 6.341463414634147,
"eval_loss": 1.8481909036636353,
"eval_runtime": 39.3592,
"eval_samples_per_second": 448.714,
"eval_steps_per_second": 0.457,
"step": 520
},
{
"epoch": 6.463414634146342,
"grad_norm": 0.47906893491744995,
"learning_rate": 1.7116074601618415e-06,
"loss": 1.8369,
"step": 530
},
{
"epoch": 6.463414634146342,
"eval_loss": 1.8447495698928833,
"eval_runtime": 39.3461,
"eval_samples_per_second": 448.863,
"eval_steps_per_second": 0.457,
"step": 530
},
{
"epoch": 6.585365853658536,
"grad_norm": 0.3527175188064575,
"learning_rate": 1.696493397437357e-06,
"loss": 1.8288,
"step": 540
},
{
"epoch": 6.585365853658536,
"eval_loss": 1.8413718938827515,
"eval_runtime": 39.3552,
"eval_samples_per_second": 448.759,
"eval_steps_per_second": 0.457,
"step": 540
},
{
"epoch": 6.7073170731707314,
"grad_norm": 0.37900474667549133,
"learning_rate": 1.6810638141714932e-06,
"loss": 1.8271,
"step": 550
},
{
"epoch": 6.7073170731707314,
"eval_loss": 1.8382277488708496,
"eval_runtime": 39.3397,
"eval_samples_per_second": 448.936,
"eval_steps_per_second": 0.458,
"step": 550
},
{
"epoch": 6.829268292682927,
"grad_norm": 0.3187570869922638,
"learning_rate": 1.665325700165565e-06,
"loss": 1.8296,
"step": 560
},
{
"epoch": 6.829268292682927,
"eval_loss": 1.8352141380310059,
"eval_runtime": 39.3555,
"eval_samples_per_second": 448.755,
"eval_steps_per_second": 0.457,
"step": 560
},
{
"epoch": 6.951219512195122,
"grad_norm": 0.36600008606910706,
"learning_rate": 1.6492861849893147e-06,
"loss": 1.8257,
"step": 570
},
{
"epoch": 6.951219512195122,
"eval_loss": 1.832342505455017,
"eval_runtime": 39.336,
"eval_samples_per_second": 448.978,
"eval_steps_per_second": 0.458,
"step": 570
},
{
"epoch": 7.073170731707317,
"grad_norm": 0.288286954164505,
"learning_rate": 1.6329525347511218e-06,
"loss": 1.8238,
"step": 580
},
{
"epoch": 7.073170731707317,
"eval_loss": 1.8295822143554688,
"eval_runtime": 39.3243,
"eval_samples_per_second": 449.112,
"eval_steps_per_second": 0.458,
"step": 580
},
{
"epoch": 7.195121951219512,
"grad_norm": 0.3237072825431824,
"learning_rate": 1.6163321488063635e-06,
"loss": 1.8174,
"step": 590
},
{
"epoch": 7.195121951219512,
"eval_loss": 1.8268990516662598,
"eval_runtime": 39.3498,
"eval_samples_per_second": 448.82,
"eval_steps_per_second": 0.457,
"step": 590
},
{
"epoch": 7.317073170731708,
"grad_norm": 0.36146941781044006,
"learning_rate": 1.599432556405412e-06,
"loss": 1.8141,
"step": 600
},
{
"epoch": 7.317073170731708,
"eval_loss": 1.8243464231491089,
"eval_runtime": 39.3676,
"eval_samples_per_second": 448.617,
"eval_steps_per_second": 0.457,
"step": 600
},
{
"epoch": 7.439024390243903,
"grad_norm": 0.3055365979671478,
"learning_rate": 1.5822614132827836e-06,
"loss": 1.8141,
"step": 610
},
{
"epoch": 7.439024390243903,
"eval_loss": 1.8218821287155151,
"eval_runtime": 39.3324,
"eval_samples_per_second": 449.019,
"eval_steps_per_second": 0.458,
"step": 610
},
{
"epoch": 7.560975609756097,
"grad_norm": 0.2906692624092102,
"learning_rate": 1.5648264981889934e-06,
"loss": 1.8096,
"step": 620
},
{
"epoch": 7.560975609756097,
"eval_loss": 1.8194576501846313,
"eval_runtime": 39.3325,
"eval_samples_per_second": 449.018,
"eval_steps_per_second": 0.458,
"step": 620
},
{
"epoch": 7.682926829268292,
"grad_norm": 0.3652225434780121,
"learning_rate": 1.5471357093666804e-06,
"loss": 1.8119,
"step": 630
},
{
"epoch": 7.682926829268292,
"eval_loss": 1.8171180486679077,
"eval_runtime": 39.3377,
"eval_samples_per_second": 448.958,
"eval_steps_per_second": 0.458,
"step": 630
},
{
"epoch": 7.804878048780488,
"grad_norm": 0.3688996732234955,
"learning_rate": 1.5291970609726005e-06,
"loss": 1.8042,
"step": 640
},
{
"epoch": 7.804878048780488,
"eval_loss": 1.8148137331008911,
"eval_runtime": 39.3463,
"eval_samples_per_second": 448.86,
"eval_steps_per_second": 0.457,
"step": 640
},
{
"epoch": 7.926829268292683,
"grad_norm": 0.28809812664985657,
"learning_rate": 1.5110186794471103e-06,
"loss": 1.7979,
"step": 650
},
{
"epoch": 7.926829268292683,
"eval_loss": 1.8126047849655151,
"eval_runtime": 39.3619,
"eval_samples_per_second": 448.682,
"eval_steps_per_second": 0.457,
"step": 650
},
{
"epoch": 8.048780487804878,
"grad_norm": 0.2660142481327057,
"learning_rate": 1.4926087998327837e-06,
"loss": 1.804,
"step": 660
},
{
"epoch": 8.048780487804878,
"eval_loss": 1.8104569911956787,
"eval_runtime": 39.3952,
"eval_samples_per_second": 448.303,
"eval_steps_per_second": 0.457,
"step": 660
},
{
"epoch": 8.170731707317072,
"grad_norm": 0.281999796628952,
"learning_rate": 1.4739757620438307e-06,
"loss": 1.7987,
"step": 670
},
{
"epoch": 8.170731707317072,
"eval_loss": 1.8083666563034058,
"eval_runtime": 39.3346,
"eval_samples_per_second": 448.994,
"eval_steps_per_second": 0.458,
"step": 670
},
{
"epoch": 8.292682926829269,
"grad_norm": 0.2869739234447479,
"learning_rate": 1.4551280070880087e-06,
"loss": 1.7954,
"step": 680
},
{
"epoch": 8.292682926829269,
"eval_loss": 1.8063015937805176,
"eval_runtime": 39.3428,
"eval_samples_per_second": 448.9,
"eval_steps_per_second": 0.458,
"step": 680
},
{
"epoch": 8.414634146341463,
"grad_norm": 0.2752714157104492,
"learning_rate": 1.4360740732427365e-06,
"loss": 1.797,
"step": 690
},
{
"epoch": 8.414634146341463,
"eval_loss": 1.804310917854309,
"eval_runtime": 39.3238,
"eval_samples_per_second": 449.118,
"eval_steps_per_second": 0.458,
"step": 690
},
{
"epoch": 8.536585365853659,
"grad_norm": 0.4099307358264923,
"learning_rate": 1.416822592187143e-06,
"loss": 1.791,
"step": 700
},
{
"epoch": 8.536585365853659,
"eval_loss": 1.802320122718811,
"eval_runtime": 39.3277,
"eval_samples_per_second": 449.073,
"eval_steps_per_second": 0.458,
"step": 700
},
{
"epoch": 8.658536585365853,
"grad_norm": 0.3235901892185211,
"learning_rate": 1.3973822850918054e-06,
"loss": 1.7893,
"step": 710
},
{
"epoch": 8.658536585365853,
"eval_loss": 1.8004404306411743,
"eval_runtime": 39.6357,
"eval_samples_per_second": 445.583,
"eval_steps_per_second": 0.454,
"step": 710
},
{
"epoch": 8.78048780487805,
"grad_norm": 0.3761025071144104,
"learning_rate": 1.3777619586679457e-06,
"loss": 1.787,
"step": 720
},
{
"epoch": 8.78048780487805,
"eval_loss": 1.7985868453979492,
"eval_runtime": 39.2931,
"eval_samples_per_second": 449.468,
"eval_steps_per_second": 0.458,
"step": 720
},
{
"epoch": 8.902439024390244,
"grad_norm": 0.2766464054584503,
"learning_rate": 1.3579705011778765e-06,
"loss": 1.7899,
"step": 730
},
{
"epoch": 8.902439024390244,
"eval_loss": 1.7967922687530518,
"eval_runtime": 39.3376,
"eval_samples_per_second": 448.96,
"eval_steps_per_second": 0.458,
"step": 730
},
{
"epoch": 9.024390243902438,
"grad_norm": 0.3136584162712097,
"learning_rate": 1.3380168784085026e-06,
"loss": 1.7917,
"step": 740
},
{
"epoch": 9.024390243902438,
"eval_loss": 1.7949668169021606,
"eval_runtime": 39.3251,
"eval_samples_per_second": 449.102,
"eval_steps_per_second": 0.458,
"step": 740
},
{
"epoch": 9.146341463414634,
"grad_norm": 0.268803209066391,
"learning_rate": 1.3179101296097033e-06,
"loss": 1.7838,
"step": 750
},
{
"epoch": 9.146341463414634,
"eval_loss": 1.793213129043579,
"eval_runtime": 39.3253,
"eval_samples_per_second": 449.101,
"eval_steps_per_second": 0.458,
"step": 750
},
{
"epoch": 9.268292682926829,
"grad_norm": 0.2907431125640869,
"learning_rate": 1.2976593633994346e-06,
"loss": 1.7803,
"step": 760
},
{
"epoch": 9.268292682926829,
"eval_loss": 1.7914844751358032,
"eval_runtime": 39.3189,
"eval_samples_per_second": 449.173,
"eval_steps_per_second": 0.458,
"step": 760
},
{
"epoch": 9.390243902439025,
"grad_norm": 0.3980807960033417,
"learning_rate": 1.2772737536374078e-06,
"loss": 1.7789,
"step": 770
},
{
"epoch": 9.390243902439025,
"eval_loss": 1.7898335456848145,
"eval_runtime": 39.3451,
"eval_samples_per_second": 448.875,
"eval_steps_per_second": 0.457,
"step": 770
},
{
"epoch": 9.512195121951219,
"grad_norm": 0.30676034092903137,
"learning_rate": 1.2567625352692126e-06,
"loss": 1.7811,
"step": 780
},
{
"epoch": 9.512195121951219,
"eval_loss": 1.7882270812988281,
"eval_runtime": 39.3352,
"eval_samples_per_second": 448.987,
"eval_steps_per_second": 0.458,
"step": 780
},
{
"epoch": 9.634146341463415,
"grad_norm": 0.24213315546512604,
"learning_rate": 1.2361350001427649e-06,
"loss": 1.7791,
"step": 790
},
{
"epoch": 9.634146341463415,
"eval_loss": 1.786568522453308,
"eval_runtime": 39.3646,
"eval_samples_per_second": 448.652,
"eval_steps_per_second": 0.457,
"step": 790
},
{
"epoch": 9.75609756097561,
"grad_norm": 0.2722227871417999,
"learning_rate": 1.2154004927989813e-06,
"loss": 1.7742,
"step": 800
},
{
"epoch": 9.75609756097561,
"eval_loss": 1.784982681274414,
"eval_runtime": 39.3395,
"eval_samples_per_second": 448.938,
"eval_steps_per_second": 0.458,
"step": 800
},
{
"epoch": 9.878048780487806,
"grad_norm": 0.2399929314851761,
"learning_rate": 1.19456840623858e-06,
"loss": 1.7717,
"step": 810
},
{
"epoch": 9.878048780487806,
"eval_loss": 1.7834330797195435,
"eval_runtime": 39.2984,
"eval_samples_per_second": 449.408,
"eval_steps_per_second": 0.458,
"step": 810
},
{
"epoch": 10.0,
"grad_norm": 0.24806931614875793,
"learning_rate": 1.1736481776669305e-06,
"loss": 1.7788,
"step": 820
},
{
"epoch": 10.0,
"eval_loss": 1.7818834781646729,
"eval_runtime": 39.2977,
"eval_samples_per_second": 449.415,
"eval_steps_per_second": 0.458,
"step": 820
},
{
"epoch": 10.121951219512194,
"grad_norm": 0.32369279861450195,
"learning_rate": 1.1526492842188744e-06,
"loss": 1.7719,
"step": 830
},
{
"epoch": 10.121951219512194,
"eval_loss": 1.7803385257720947,
"eval_runtime": 39.3284,
"eval_samples_per_second": 449.064,
"eval_steps_per_second": 0.458,
"step": 830
},
{
"epoch": 10.24390243902439,
"grad_norm": 0.3276310861110687,
"learning_rate": 1.1315812386654649e-06,
"loss": 1.7675,
"step": 840
},
{
"epoch": 10.24390243902439,
"eval_loss": 1.7788329124450684,
"eval_runtime": 39.674,
"eval_samples_per_second": 445.153,
"eval_steps_per_second": 0.454,
"step": 840
},
{
"epoch": 10.365853658536585,
"grad_norm": 0.2850521504878998,
"learning_rate": 1.1104535851045538e-06,
"loss": 1.7725,
"step": 850
},
{
"epoch": 10.365853658536585,
"eval_loss": 1.7772928476333618,
"eval_runtime": 39.3302,
"eval_samples_per_second": 449.045,
"eval_steps_per_second": 0.458,
"step": 850
},
{
"epoch": 10.487804878048781,
"grad_norm": 0.27776798605918884,
"learning_rate": 1.0892758946371942e-06,
"loss": 1.7648,
"step": 860
},
{
"epoch": 10.487804878048781,
"eval_loss": 1.7757339477539062,
"eval_runtime": 39.3544,
"eval_samples_per_second": 448.768,
"eval_steps_per_second": 0.457,
"step": 860
},
{
"epoch": 10.609756097560975,
"grad_norm": 0.5228049755096436,
"learning_rate": 1.0680577610318071e-06,
"loss": 1.7609,
"step": 870
},
{
"epoch": 10.609756097560975,
"eval_loss": 1.7741671800613403,
"eval_runtime": 39.3709,
"eval_samples_per_second": 448.581,
"eval_steps_per_second": 0.457,
"step": 870
},
{
"epoch": 10.731707317073171,
"grad_norm": 0.3848848044872284,
"learning_rate": 1.0468087963780787e-06,
"loss": 1.7636,
"step": 880
},
{
"epoch": 10.731707317073171,
"eval_loss": 1.77255380153656,
"eval_runtime": 39.3539,
"eval_samples_per_second": 448.773,
"eval_steps_per_second": 0.457,
"step": 880
},
{
"epoch": 10.853658536585366,
"grad_norm": 0.5747771263122559,
"learning_rate": 1.0255386267325602e-06,
"loss": 1.7598,
"step": 890
},
{
"epoch": 10.853658536585366,
"eval_loss": 1.770812749862671,
"eval_runtime": 39.3689,
"eval_samples_per_second": 448.603,
"eval_steps_per_second": 0.457,
"step": 890
},
{
"epoch": 10.975609756097562,
"grad_norm": 1.4300990104675293,
"learning_rate": 1.0042568877579387e-06,
"loss": 1.7651,
"step": 900
},
{
"epoch": 10.975609756097562,
"eval_loss": 1.7690285444259644,
"eval_runtime": 39.3685,
"eval_samples_per_second": 448.607,
"eval_steps_per_second": 0.457,
"step": 900
},
{
"epoch": 11.097560975609756,
"grad_norm": 0.43552011251449585,
"learning_rate": 9.829732203579585e-07,
"loss": 1.7598,
"step": 910
},
{
"epoch": 11.097560975609756,
"eval_loss": 1.7672632932662964,
"eval_runtime": 39.3224,
"eval_samples_per_second": 449.133,
"eval_steps_per_second": 0.458,
"step": 910
},
{
"epoch": 11.21951219512195,
"grad_norm": 4.467670917510986,
"learning_rate": 9.616972663099646e-07,
"loss": 1.7585,
"step": 920
},
{
"epoch": 11.21951219512195,
"eval_loss": 1.768608570098877,
"eval_runtime": 39.3441,
"eval_samples_per_second": 448.886,
"eval_steps_per_second": 0.458,
"step": 920
},
{
"epoch": 11.341463414634147,
"grad_norm": 4.179907321929932,
"learning_rate": 9.40438663897054e-07,
"loss": 1.7572,
"step": 930
},
{
"epoch": 11.341463414634147,
"eval_loss": 1.7685447931289673,
"eval_runtime": 39.3059,
"eval_samples_per_second": 449.322,
"eval_steps_per_second": 0.458,
"step": 930
},
{
"epoch": 11.463414634146341,
"grad_norm": 4.369534492492676,
"learning_rate": 9.192070435418078e-07,
"loss": 1.7485,
"step": 940
},
{
"epoch": 11.463414634146341,
"eval_loss": 1.7640718221664429,
"eval_runtime": 39.3093,
"eval_samples_per_second": 449.283,
"eval_steps_per_second": 0.458,
"step": 940
},
{
"epoch": 11.585365853658537,
"grad_norm": 1.9561405181884766,
"learning_rate": 8.980120234435848e-07,
"loss": 1.7528,
"step": 950
},
{
"epoch": 11.585365853658537,
"eval_loss": 1.763830542564392,
"eval_runtime": 39.3238,
"eval_samples_per_second": 449.117,
"eval_steps_per_second": 0.458,
"step": 950
},
{
"epoch": 11.707317073170731,
"grad_norm": 2.5773379802703857,
"learning_rate": 8.768632052213531e-07,
"loss": 1.7545,
"step": 960
},
{
"epoch": 11.707317073170731,
"eval_loss": 1.7629334926605225,
"eval_runtime": 39.3328,
"eval_samples_per_second": 449.014,
"eval_steps_per_second": 0.458,
"step": 960
},
{
"epoch": 11.829268292682928,
"grad_norm": 3.903297185897827,
"learning_rate": 8.557701695640321e-07,
"loss": 1.7508,
"step": 970
},
{
"epoch": 11.829268292682928,
"eval_loss": 1.760330319404602,
"eval_runtime": 39.3191,
"eval_samples_per_second": 449.171,
"eval_steps_per_second": 0.458,
"step": 970
},
{
"epoch": 11.951219512195122,
"grad_norm": 2.7617976665496826,
"learning_rate": 8.347424718903151e-07,
"loss": 1.7542,
"step": 980
},
{
"epoch": 11.951219512195122,
"eval_loss": 1.759660243988037,
"eval_runtime": 39.3037,
"eval_samples_per_second": 449.347,
"eval_steps_per_second": 0.458,
"step": 980
},
{
"epoch": 12.073170731707316,
"grad_norm": 3.2472615242004395,
"learning_rate": 8.137896380199421e-07,
"loss": 1.752,
"step": 990
},
{
"epoch": 12.073170731707316,
"eval_loss": 1.7598719596862793,
"eval_runtime": 39.6916,
"eval_samples_per_second": 444.956,
"eval_steps_per_second": 0.453,
"step": 990
},
{
"epoch": 12.195121951219512,
"grad_norm": 4.050698757171631,
"learning_rate": 7.929211598583793e-07,
"loss": 1.7487,
"step": 1000
},
{
"epoch": 12.195121951219512,
"eval_loss": 1.7570974826812744,
"eval_runtime": 39.3566,
"eval_samples_per_second": 448.743,
"eval_steps_per_second": 0.457,
"step": 1000
},
{
"epoch": 12.317073170731707,
"grad_norm": 2.279803991317749,
"learning_rate": 7.721464910968626e-07,
"loss": 1.7454,
"step": 1010
},
{
"epoch": 12.317073170731707,
"eval_loss": 1.7562564611434937,
"eval_runtime": 39.3538,
"eval_samples_per_second": 448.775,
"eval_steps_per_second": 0.457,
"step": 1010
},
{
"epoch": 12.439024390243903,
"grad_norm": 3.100792407989502,
"learning_rate": 7.514750429297527e-07,
"loss": 1.7472,
"step": 1020
},
{
"epoch": 12.439024390243903,
"eval_loss": 1.7561583518981934,
"eval_runtime": 39.3529,
"eval_samples_per_second": 448.786,
"eval_steps_per_second": 0.457,
"step": 1020
},
{
"epoch": 12.560975609756097,
"grad_norm": 3.9019737243652344,
"learning_rate": 7.30916179791144e-07,
"loss": 1.7411,
"step": 1030
},
{
"epoch": 12.560975609756097,
"eval_loss": 1.7533916234970093,
"eval_runtime": 39.3587,
"eval_samples_per_second": 448.719,
"eval_steps_per_second": 0.457,
"step": 1030
},
{
"epoch": 12.682926829268293,
"grad_norm": 2.5269076824188232,
"learning_rate": 7.104792151126514e-07,
"loss": 1.7441,
"step": 1040
},
{
"epoch": 12.682926829268293,
"eval_loss": 1.7524149417877197,
"eval_runtime": 39.3483,
"eval_samples_per_second": 448.838,
"eval_steps_per_second": 0.457,
"step": 1040
},
{
"epoch": 12.804878048780488,
"grad_norm": 3.1689910888671875,
"learning_rate": 6.901734071043071e-07,
"loss": 1.7391,
"step": 1050
},
{
"epoch": 12.804878048780488,
"eval_loss": 1.752366542816162,
"eval_runtime": 39.3518,
"eval_samples_per_second": 448.798,
"eval_steps_per_second": 0.457,
"step": 1050
},
{
"epoch": 12.926829268292684,
"grad_norm": 3.6410083770751953,
"learning_rate": 6.700079545604707e-07,
"loss": 1.7441,
"step": 1060
},
{
"epoch": 12.926829268292684,
"eval_loss": 1.749656319618225,
"eval_runtime": 39.3618,
"eval_samples_per_second": 448.684,
"eval_steps_per_second": 0.457,
"step": 1060
},
{
"epoch": 13.048780487804878,
"grad_norm": 1.9339579343795776,
"learning_rate": 6.499919926926565e-07,
"loss": 1.7415,
"step": 1070
},
{
"epoch": 13.048780487804878,
"eval_loss": 1.7485558986663818,
"eval_runtime": 39.3581,
"eval_samples_per_second": 448.726,
"eval_steps_per_second": 0.457,
"step": 1070
},
{
"epoch": 13.170731707317072,
"grad_norm": 2.8899059295654297,
"learning_rate": 6.301345889911636e-07,
"loss": 1.7352,
"step": 1080
},
{
"epoch": 13.170731707317072,
"eval_loss": 1.7480661869049072,
"eval_runtime": 39.3461,
"eval_samples_per_second": 448.863,
"eval_steps_per_second": 0.457,
"step": 1080
},
{
"epoch": 13.292682926829269,
"grad_norm": 3.037234306335449,
"learning_rate": 6.104447391173858e-07,
"loss": 1.7395,
"step": 1090
},
{
"epoch": 13.292682926829269,
"eval_loss": 1.7456430196762085,
"eval_runtime": 39.3861,
"eval_samples_per_second": 448.407,
"eval_steps_per_second": 0.457,
"step": 1090
},
{
"epoch": 13.414634146341463,
"grad_norm": 1.7524123191833496,
"learning_rate": 5.9093136282866e-07,
"loss": 1.7317,
"step": 1100
},
{
"epoch": 13.414634146341463,
"eval_loss": 1.7444212436676025,
"eval_runtime": 39.3868,
"eval_samples_per_second": 448.399,
"eval_steps_per_second": 0.457,
"step": 1100
},
{
"epoch": 13.536585365853659,
"grad_norm": 1.582607388496399,
"learning_rate": 5.716032999375006e-07,
"loss": 1.7356,
"step": 1110
},
{
"epoch": 13.536585365853659,
"eval_loss": 1.743189811706543,
"eval_runtime": 39.3597,
"eval_samples_per_second": 448.707,
"eval_steps_per_second": 0.457,
"step": 1110
},
{
"epoch": 13.658536585365853,
"grad_norm": 0.7700549364089966,
"learning_rate": 5.524693063070492e-07,
"loss": 1.7347,
"step": 1120
},
{
"epoch": 13.658536585365853,
"eval_loss": 1.7409182786941528,
"eval_runtime": 39.3602,
"eval_samples_per_second": 448.702,
"eval_steps_per_second": 0.457,
"step": 1120
},
{
"epoch": 13.78048780487805,
"grad_norm": 0.8657609820365906,
"learning_rate": 5.335380498845559e-07,
"loss": 1.7291,
"step": 1130
},
{
"epoch": 13.78048780487805,
"eval_loss": 1.7383273839950562,
"eval_runtime": 39.3511,
"eval_samples_per_second": 448.806,
"eval_steps_per_second": 0.457,
"step": 1130
},
{
"epoch": 13.902439024390244,
"grad_norm": 0.5521230101585388,
"learning_rate": 5.148181067746861e-07,
"loss": 1.7238,
"step": 1140
},
{
"epoch": 13.902439024390244,
"eval_loss": 1.7357066869735718,
"eval_runtime": 39.3644,
"eval_samples_per_second": 448.654,
"eval_steps_per_second": 0.457,
"step": 1140
},
{
"epoch": 14.024390243902438,
"grad_norm": 0.9353064894676208,
"learning_rate": 4.963179573544356e-07,
"loss": 1.7238,
"step": 1150
},
{
"epoch": 14.024390243902438,
"eval_loss": 1.7331624031066895,
"eval_runtime": 39.355,
"eval_samples_per_second": 448.761,
"eval_steps_per_second": 0.457,
"step": 1150
},
{
"epoch": 14.146341463414634,
"grad_norm": 0.4633055329322815,
"learning_rate": 4.780459824314066e-07,
"loss": 1.7234,
"step": 1160
},
{
"epoch": 14.146341463414634,
"eval_loss": 1.7308125495910645,
"eval_runtime": 39.4238,
"eval_samples_per_second": 447.978,
"eval_steps_per_second": 0.457,
"step": 1160
},
{
"epoch": 14.268292682926829,
"grad_norm": 0.5228179693222046,
"learning_rate": 4.6001045944719594e-07,
"loss": 1.7165,
"step": 1170
},
{
"epoch": 14.268292682926829,
"eval_loss": 1.7286032438278198,
"eval_runtime": 39.3556,
"eval_samples_per_second": 448.755,
"eval_steps_per_second": 0.457,
"step": 1170
},
{
"epoch": 14.390243902439025,
"grad_norm": 0.3939041495323181,
"learning_rate": 4.4221955872760573e-07,
"loss": 1.7171,
"step": 1180
},
{
"epoch": 14.390243902439025,
"eval_loss": 1.72659432888031,
"eval_runtime": 39.3559,
"eval_samples_per_second": 448.75,
"eval_steps_per_second": 0.457,
"step": 1180
},
{
"epoch": 14.512195121951219,
"grad_norm": 0.3697729706764221,
"learning_rate": 4.246813397813794e-07,
"loss": 1.7153,
"step": 1190
},
{
"epoch": 14.512195121951219,
"eval_loss": 1.7247569561004639,
"eval_runtime": 39.3585,
"eval_samples_per_second": 448.722,
"eval_steps_per_second": 0.457,
"step": 1190
},
{
"epoch": 14.634146341463415,
"grad_norm": 0.38930952548980713,
"learning_rate": 4.074037476491413e-07,
"loss": 1.7147,
"step": 1200
},
{
"epoch": 14.634146341463415,
"eval_loss": 1.7230459451675415,
"eval_runtime": 39.3707,
"eval_samples_per_second": 448.582,
"eval_steps_per_second": 0.457,
"step": 1200
},
{
"epoch": 14.75609756097561,
"grad_norm": 0.5216050148010254,
"learning_rate": 3.9039460930418767e-07,
"loss": 1.7093,
"step": 1210
},
{
"epoch": 14.75609756097561,
"eval_loss": 1.721459984779358,
"eval_runtime": 39.4124,
"eval_samples_per_second": 448.108,
"eval_steps_per_second": 0.457,
"step": 1210
},
{
"epoch": 14.878048780487806,
"grad_norm": 0.6067308187484741,
"learning_rate": 3.736616301067693e-07,
"loss": 1.7114,
"step": 1220
},
{
"epoch": 14.878048780487806,
"eval_loss": 1.7200278043746948,
"eval_runtime": 39.3411,
"eval_samples_per_second": 448.92,
"eval_steps_per_second": 0.458,
"step": 1220
},
{
"epoch": 15.0,
"grad_norm": 0.528874933719635,
"learning_rate": 3.5721239031346063e-07,
"loss": 1.7074,
"step": 1230
},
{
"epoch": 15.0,
"eval_loss": 1.7186657190322876,
"eval_runtime": 39.4208,
"eval_samples_per_second": 448.012,
"eval_steps_per_second": 0.457,
"step": 1230
},
{
"epoch": 15.121951219512194,
"grad_norm": 0.4690570831298828,
"learning_rate": 3.410543416432069e-07,
"loss": 1.7068,
"step": 1240
},
{
"epoch": 15.121951219512194,
"eval_loss": 1.7174080610275269,
"eval_runtime": 39.3531,
"eval_samples_per_second": 448.783,
"eval_steps_per_second": 0.457,
"step": 1240
},
{
"epoch": 15.24390243902439,
"grad_norm": 0.4555855989456177,
"learning_rate": 3.2519480390159804e-07,
"loss": 1.7067,
"step": 1250
},
{
"epoch": 15.24390243902439,
"eval_loss": 1.7162292003631592,
"eval_runtime": 39.3524,
"eval_samples_per_second": 448.791,
"eval_steps_per_second": 0.457,
"step": 1250
},
{
"epoch": 15.365853658536585,
"grad_norm": 0.8303574919700623,
"learning_rate": 3.096409616649023e-07,
"loss": 1.7034,
"step": 1260
},
{
"epoch": 15.365853658536585,
"eval_loss": 1.7151583433151245,
"eval_runtime": 39.3553,
"eval_samples_per_second": 448.758,
"eval_steps_per_second": 0.457,
"step": 1260
},
{
"epoch": 15.487804878048781,
"grad_norm": 0.5495628714561462,
"learning_rate": 2.943998610253604e-07,
"loss": 1.7075,
"step": 1270
},
{
"epoch": 15.487804878048781,
"eval_loss": 1.7141631841659546,
"eval_runtime": 39.3687,
"eval_samples_per_second": 448.606,
"eval_steps_per_second": 0.457,
"step": 1270
},
{
"epoch": 15.609756097560975,
"grad_norm": 0.361331582069397,
"learning_rate": 2.7947840639921303e-07,
"loss": 1.7002,
"step": 1280
},
{
"epoch": 15.609756097560975,
"eval_loss": 1.7131644487380981,
"eval_runtime": 39.4083,
"eval_samples_per_second": 448.155,
"eval_steps_per_second": 0.457,
"step": 1280
},
{
"epoch": 15.731707317073171,
"grad_norm": 0.4098544418811798,
"learning_rate": 2.648833573989118e-07,
"loss": 1.7055,
"step": 1290
},
{
"epoch": 15.731707317073171,
"eval_loss": 1.712282657623291,
"eval_runtime": 39.3799,
"eval_samples_per_second": 448.478,
"eval_steps_per_second": 0.457,
"step": 1290
},
{
"epoch": 15.853658536585366,
"grad_norm": 0.5131831765174866,
"learning_rate": 2.50621325770927e-07,
"loss": 1.6976,
"step": 1300
},
{
"epoch": 15.853658536585366,
"eval_loss": 1.7114192247390747,
"eval_runtime": 39.3871,
"eval_samples_per_second": 448.395,
"eval_steps_per_second": 0.457,
"step": 1300
},
{
"epoch": 15.975609756097562,
"grad_norm": 0.4333685338497162,
"learning_rate": 2.3669877240054037e-07,
"loss": 1.7002,
"step": 1310
},
{
"epoch": 15.975609756097562,
"eval_loss": 1.710659146308899,
"eval_runtime": 39.3698,
"eval_samples_per_second": 448.593,
"eval_steps_per_second": 0.457,
"step": 1310
},
{
"epoch": 16.097560975609756,
"grad_norm": 0.36955586075782776,
"learning_rate": 2.231220043849804e-07,
"loss": 1.7015,
"step": 1320
},
{
"epoch": 16.097560975609756,
"eval_loss": 1.7099283933639526,
"eval_runtime": 39.3604,
"eval_samples_per_second": 448.699,
"eval_steps_per_second": 0.457,
"step": 1320
},
{
"epoch": 16.21951219512195,
"grad_norm": 0.37774789333343506,
"learning_rate": 2.0989717217622648e-07,
"loss": 1.6987,
"step": 1330
},
{
"epoch": 16.21951219512195,
"eval_loss": 1.70924973487854,
"eval_runtime": 39.3549,
"eval_samples_per_second": 448.763,
"eval_steps_per_second": 0.457,
"step": 1330
},
{
"epoch": 16.341463414634145,
"grad_norm": 0.389635294675827,
"learning_rate": 1.9703026679477252e-07,
"loss": 1.6985,
"step": 1340
},
{
"epoch": 16.341463414634145,
"eval_loss": 1.7086195945739746,
"eval_runtime": 39.7499,
"eval_samples_per_second": 444.303,
"eval_steps_per_second": 0.453,
"step": 1340
},
{
"epoch": 16.463414634146343,
"grad_norm": 0.4067881405353546,
"learning_rate": 1.845271171156184e-07,
"loss": 1.6986,
"step": 1350
},
{
"epoch": 16.463414634146343,
"eval_loss": 1.7080307006835938,
"eval_runtime": 39.3203,
"eval_samples_per_second": 449.157,
"eval_steps_per_second": 0.458,
"step": 1350
},
{
"epoch": 16.585365853658537,
"grad_norm": 0.33628836274147034,
"learning_rate": 1.7239338722771324e-07,
"loss": 1.6993,
"step": 1360
},
{
"epoch": 16.585365853658537,
"eval_loss": 1.707476019859314,
"eval_runtime": 39.357,
"eval_samples_per_second": 448.739,
"eval_steps_per_second": 0.457,
"step": 1360
},
{
"epoch": 16.70731707317073,
"grad_norm": 0.31285569071769714,
"learning_rate": 1.6063457386805003e-07,
"loss": 1.6946,
"step": 1370
},
{
"epoch": 16.70731707317073,
"eval_loss": 1.7069728374481201,
"eval_runtime": 39.3881,
"eval_samples_per_second": 448.384,
"eval_steps_per_second": 0.457,
"step": 1370
},
{
"epoch": 16.829268292682926,
"grad_norm": 0.3229863941669464,
"learning_rate": 1.4925600393157322e-07,
"loss": 1.6934,
"step": 1380
},
{
"epoch": 16.829268292682926,
"eval_loss": 1.7064942121505737,
"eval_runtime": 39.3862,
"eval_samples_per_second": 448.406,
"eval_steps_per_second": 0.457,
"step": 1380
},
{
"epoch": 16.951219512195124,
"grad_norm": 0.32283350825309753,
"learning_rate": 1.3826283205802424e-07,
"loss": 1.6936,
"step": 1390
},
{
"epoch": 16.951219512195124,
"eval_loss": 1.706059217453003,
"eval_runtime": 39.374,
"eval_samples_per_second": 448.544,
"eval_steps_per_second": 0.457,
"step": 1390
},
{
"epoch": 17.073170731707318,
"grad_norm": 0.2510131597518921,
"learning_rate": 1.2766003829682504e-07,
"loss": 1.6972,
"step": 1400
},
{
"epoch": 17.073170731707318,
"eval_loss": 1.705664873123169,
"eval_runtime": 39.3448,
"eval_samples_per_second": 448.877,
"eval_steps_per_second": 0.457,
"step": 1400
},
{
"epoch": 17.195121951219512,
"grad_norm": 0.2857695519924164,
"learning_rate": 1.1745242585104953e-07,
"loss": 1.6923,
"step": 1410
},
{
"epoch": 17.195121951219512,
"eval_loss": 1.7052934169769287,
"eval_runtime": 39.3548,
"eval_samples_per_second": 448.763,
"eval_steps_per_second": 0.457,
"step": 1410
},
{
"epoch": 17.317073170731707,
"grad_norm": 0.39086970686912537,
"learning_rate": 1.0764461890151111e-07,
"loss": 1.6943,
"step": 1420
},
{
"epoch": 17.317073170731707,
"eval_loss": 1.704952359199524,
"eval_runtime": 39.366,
"eval_samples_per_second": 448.635,
"eval_steps_per_second": 0.457,
"step": 1420
},
{
"epoch": 17.4390243902439,
"grad_norm": 0.2526913583278656,
"learning_rate": 9.824106051194858e-08,
"loss": 1.6944,
"step": 1430
},
{
"epoch": 17.4390243902439,
"eval_loss": 1.7046380043029785,
"eval_runtime": 39.3509,
"eval_samples_per_second": 448.809,
"eval_steps_per_second": 0.457,
"step": 1430
},
{
"epoch": 17.5609756097561,
"grad_norm": 0.2990001142024994,
"learning_rate": 8.924601061626048e-08,
"loss": 1.6929,
"step": 1440
},
{
"epoch": 17.5609756097561,
"eval_loss": 1.7043615579605103,
"eval_runtime": 39.3593,
"eval_samples_per_second": 448.712,
"eval_steps_per_second": 0.457,
"step": 1440
},
{
"epoch": 17.682926829268293,
"grad_norm": 0.24767932295799255,
"learning_rate": 8.066354408870047e-08,
"loss": 1.6926,
"step": 1450
},
{
"epoch": 17.682926829268293,
"eval_loss": 1.7040989398956299,
"eval_runtime": 39.3568,
"eval_samples_per_second": 448.741,
"eval_steps_per_second": 0.457,
"step": 1450
},
{
"epoch": 17.804878048780488,
"grad_norm": 0.3169814348220825,
"learning_rate": 7.249754889790538e-08,
"loss": 1.6926,
"step": 1460
},
{
"epoch": 17.804878048780488,
"eval_loss": 1.703873634338379,
"eval_runtime": 39.4,
"eval_samples_per_second": 448.249,
"eval_steps_per_second": 0.457,
"step": 1460
},
{
"epoch": 17.926829268292682,
"grad_norm": 0.3294218182563782,
"learning_rate": 6.475172434559573e-08,
"loss": 1.6932,
"step": 1470
},
{
"epoch": 17.926829268292682,
"eval_loss": 1.703667163848877,
"eval_runtime": 39.4234,
"eval_samples_per_second": 447.983,
"eval_steps_per_second": 0.457,
"step": 1470
},
{
"epoch": 18.048780487804876,
"grad_norm": 0.2851867079734802,
"learning_rate": 5.742957939074411e-08,
"loss": 1.6927,
"step": 1480
},
{
"epoch": 18.048780487804876,
"eval_loss": 1.7034906148910522,
"eval_runtime": 39.3855,
"eval_samples_per_second": 448.414,
"eval_steps_per_second": 0.457,
"step": 1480
},
{
"epoch": 18.170731707317074,
"grad_norm": 0.2505706250667572,
"learning_rate": 5.053443105997068e-08,
"loss": 1.6905,
"step": 1490
},
{
"epoch": 18.170731707317074,
"eval_loss": 1.7033272981643677,
"eval_runtime": 39.3764,
"eval_samples_per_second": 448.517,
"eval_steps_per_second": 0.457,
"step": 1490
},
{
"epoch": 18.29268292682927,
"grad_norm": 0.2556091248989105,
"learning_rate": 4.4069402944887704e-08,
"loss": 1.6928,
"step": 1500
},
{
"epoch": 18.29268292682927,
"eval_loss": 1.703181505203247,
"eval_runtime": 39.3582,
"eval_samples_per_second": 448.725,
"eval_steps_per_second": 0.457,
"step": 1500
},
{
"epoch": 18.414634146341463,
"grad_norm": 0.2573912739753723,
"learning_rate": 3.803742378707198e-08,
"loss": 1.6944,
"step": 1510
},
{
"epoch": 18.414634146341463,
"eval_loss": 1.703063726425171,
"eval_runtime": 39.3337,
"eval_samples_per_second": 449.004,
"eval_steps_per_second": 0.458,
"step": 1510
},
{
"epoch": 18.536585365853657,
"grad_norm": 0.24173639714717865,
"learning_rate": 3.24412261513064e-08,
"loss": 1.6925,
"step": 1520
},
{
"epoch": 18.536585365853657,
"eval_loss": 1.7029577493667603,
"eval_runtime": 39.3737,
"eval_samples_per_second": 448.549,
"eval_steps_per_second": 0.457,
"step": 1520
},
{
"epoch": 18.658536585365855,
"grad_norm": 0.24515186250209808,
"learning_rate": 2.7283345187693264e-08,
"loss": 1.6944,
"step": 1530
},
{
"epoch": 18.658536585365855,
"eval_loss": 1.7028616666793823,
"eval_runtime": 39.3701,
"eval_samples_per_second": 448.589,
"eval_steps_per_second": 0.457,
"step": 1530
},
{
"epoch": 18.78048780487805,
"grad_norm": 0.25829750299453735,
"learning_rate": 2.256611748319792e-08,
"loss": 1.6897,
"step": 1540
},
{
"epoch": 18.78048780487805,
"eval_loss": 1.7027884721755981,
"eval_runtime": 39.4047,
"eval_samples_per_second": 448.195,
"eval_steps_per_second": 0.457,
"step": 1540
},
{
"epoch": 18.902439024390244,
"grad_norm": 0.2337442934513092,
"learning_rate": 1.8291680003145073e-08,
"loss": 1.6915,
"step": 1550
},
{
"epoch": 18.902439024390244,
"eval_loss": 1.702728033065796,
"eval_runtime": 39.8565,
"eval_samples_per_second": 443.115,
"eval_steps_per_second": 0.452,
"step": 1550
},
{
"epoch": 19.024390243902438,
"grad_norm": 0.24271942675113678,
"learning_rate": 1.4461969123145457e-08,
"loss": 1.6891,
"step": 1560
},
{
"epoch": 19.024390243902438,
"eval_loss": 1.7026790380477905,
"eval_runtime": 39.3468,
"eval_samples_per_second": 448.854,
"eval_steps_per_second": 0.457,
"step": 1560
},
{
"epoch": 19.146341463414632,
"grad_norm": 0.2199811339378357,
"learning_rate": 1.107871975189234e-08,
"loss": 1.6884,
"step": 1570
},
{
"epoch": 19.146341463414632,
"eval_loss": 1.702639102935791,
"eval_runtime": 39.3619,
"eval_samples_per_second": 448.683,
"eval_steps_per_second": 0.457,
"step": 1570
},
{
"epoch": 19.26829268292683,
"grad_norm": 0.24018193781375885,
"learning_rate": 8.143464545226297e-09,
"loss": 1.6962,
"step": 1580
},
{
"epoch": 19.26829268292683,
"eval_loss": 1.7026113271713257,
"eval_runtime": 39.2823,
"eval_samples_per_second": 449.591,
"eval_steps_per_second": 0.458,
"step": 1580
},
{
"epoch": 19.390243902439025,
"grad_norm": 0.23089687526226044,
"learning_rate": 5.657533211820941e-09,
"loss": 1.6918,
"step": 1590
},
{
"epoch": 19.390243902439025,
"eval_loss": 1.7025905847549438,
"eval_runtime": 39.2847,
"eval_samples_per_second": 449.564,
"eval_steps_per_second": 0.458,
"step": 1590
},
{
"epoch": 19.51219512195122,
"grad_norm": 0.219436913728714,
"learning_rate": 3.6220519108086654e-09,
"loss": 1.6906,
"step": 1600
},
{
"epoch": 19.51219512195122,
"eval_loss": 1.7025744915008545,
"eval_runtime": 39.3227,
"eval_samples_per_second": 449.13,
"eval_steps_per_second": 0.458,
"step": 1600
},
{
"epoch": 19.634146341463413,
"grad_norm": 0.21289722621440887,
"learning_rate": 2.037942741615617e-09,
"loss": 1.691,
"step": 1610
},
{
"epoch": 19.634146341463413,
"eval_loss": 1.7025699615478516,
"eval_runtime": 39.3676,
"eval_samples_per_second": 448.617,
"eval_steps_per_second": 0.457,
"step": 1610
},
{
"epoch": 19.75609756097561,
"grad_norm": 0.2050682008266449,
"learning_rate": 9.059233262386224e-10,
"loss": 1.6963,
"step": 1620
},
{
"epoch": 19.75609756097561,
"eval_loss": 1.7025647163391113,
"eval_runtime": 39.2816,
"eval_samples_per_second": 449.599,
"eval_steps_per_second": 0.458,
"step": 1620
},
{
"epoch": 19.878048780487806,
"grad_norm": 0.2104637622833252,
"learning_rate": 2.265064841533437e-10,
"loss": 1.69,
"step": 1630
},
{
"epoch": 19.878048780487806,
"eval_loss": 1.7025623321533203,
"eval_runtime": 39.3024,
"eval_samples_per_second": 449.362,
"eval_steps_per_second": 0.458,
"step": 1630
},
{
"epoch": 20.0,
"grad_norm": 0.24021713435649872,
"learning_rate": 0.0,
"loss": 1.6877,
"step": 1640
},
{
"epoch": 20.0,
"eval_loss": 1.7025611400604248,
"eval_runtime": 39.2695,
"eval_samples_per_second": 449.738,
"eval_steps_per_second": 0.458,
"step": 1640
}
],
"logging_steps": 10,
"max_steps": 1640,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5456972789249475e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}