Qwen2.5-Coder-1.5B-ds-coder-v2 / trainer_state.json
tohuy2710's picture
Upload model
4aa947f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 12699,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007559650366170565,
"grad_norm": 30.625,
"learning_rate": 2.4409448818897637e-07,
"loss": 0.997,
"step": 32
},
{
"epoch": 0.01511930073234113,
"grad_norm": 22.75,
"learning_rate": 4.960629921259843e-07,
"loss": 1.0127,
"step": 64
},
{
"epoch": 0.022678951098511695,
"grad_norm": 15.875,
"learning_rate": 7.480314960629922e-07,
"loss": 0.9785,
"step": 96
},
{
"epoch": 0.03023860146468226,
"grad_norm": 19.125,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.8948,
"step": 128
},
{
"epoch": 0.03779825183085282,
"grad_norm": 13.25,
"learning_rate": 1.251968503937008e-06,
"loss": 0.9001,
"step": 160
},
{
"epoch": 0.04535790219702339,
"grad_norm": 10.0,
"learning_rate": 1.5039370078740158e-06,
"loss": 0.9018,
"step": 192
},
{
"epoch": 0.05291755256319395,
"grad_norm": 9.5,
"learning_rate": 1.7559055118110239e-06,
"loss": 0.8796,
"step": 224
},
{
"epoch": 0.06047720292936452,
"grad_norm": 9.8125,
"learning_rate": 2.007874015748032e-06,
"loss": 0.7863,
"step": 256
},
{
"epoch": 0.06803685329553508,
"grad_norm": 13.125,
"learning_rate": 2.2598425196850397e-06,
"loss": 0.7503,
"step": 288
},
{
"epoch": 0.07559650366170564,
"grad_norm": 7.46875,
"learning_rate": 2.511811023622047e-06,
"loss": 0.7361,
"step": 320
},
{
"epoch": 0.08315615402787621,
"grad_norm": 10.125,
"learning_rate": 2.763779527559055e-06,
"loss": 0.7236,
"step": 352
},
{
"epoch": 0.09071580439404678,
"grad_norm": 8.6875,
"learning_rate": 3.015748031496063e-06,
"loss": 0.7074,
"step": 384
},
{
"epoch": 0.09827545476021735,
"grad_norm": 10.0625,
"learning_rate": 3.267716535433071e-06,
"loss": 0.7418,
"step": 416
},
{
"epoch": 0.1058351051263879,
"grad_norm": 17.25,
"learning_rate": 3.519685039370079e-06,
"loss": 0.6522,
"step": 448
},
{
"epoch": 0.11339475549255847,
"grad_norm": 4.8125,
"learning_rate": 3.7716535433070867e-06,
"loss": 0.6596,
"step": 480
},
{
"epoch": 0.12095440585872903,
"grad_norm": 9.375,
"learning_rate": 4.0236220472440945e-06,
"loss": 0.6762,
"step": 512
},
{
"epoch": 0.1285140562248996,
"grad_norm": 7.6875,
"learning_rate": 4.275590551181103e-06,
"loss": 0.6657,
"step": 544
},
{
"epoch": 0.13607370659107015,
"grad_norm": 11.4375,
"learning_rate": 4.52755905511811e-06,
"loss": 0.5974,
"step": 576
},
{
"epoch": 0.14363335695724072,
"grad_norm": 6.3125,
"learning_rate": 4.779527559055118e-06,
"loss": 0.6177,
"step": 608
},
{
"epoch": 0.1511930073234113,
"grad_norm": 16.75,
"learning_rate": 5.031496062992126e-06,
"loss": 0.5999,
"step": 640
},
{
"epoch": 0.15875265768958186,
"grad_norm": 11.625,
"learning_rate": 5.283464566929134e-06,
"loss": 0.5413,
"step": 672
},
{
"epoch": 0.16631230805575242,
"grad_norm": 6.46875,
"learning_rate": 5.535433070866142e-06,
"loss": 0.5589,
"step": 704
},
{
"epoch": 0.173871958421923,
"grad_norm": 8.6875,
"learning_rate": 5.78740157480315e-06,
"loss": 0.5491,
"step": 736
},
{
"epoch": 0.18143160878809356,
"grad_norm": 15.375,
"learning_rate": 6.039370078740158e-06,
"loss": 0.523,
"step": 768
},
{
"epoch": 0.18899125915426412,
"grad_norm": 6.875,
"learning_rate": 6.291338582677165e-06,
"loss": 0.5444,
"step": 800
},
{
"epoch": 0.1965509095204347,
"grad_norm": 24.375,
"learning_rate": 6.543307086614174e-06,
"loss": 0.5077,
"step": 832
},
{
"epoch": 0.20411055988660523,
"grad_norm": 7.21875,
"learning_rate": 6.795275590551181e-06,
"loss": 0.4579,
"step": 864
},
{
"epoch": 0.2116702102527758,
"grad_norm": 11.5625,
"learning_rate": 7.047244094488189e-06,
"loss": 0.4787,
"step": 896
},
{
"epoch": 0.21922986061894637,
"grad_norm": 7.84375,
"learning_rate": 7.2992125984251975e-06,
"loss": 0.4627,
"step": 928
},
{
"epoch": 0.22678951098511693,
"grad_norm": 7.53125,
"learning_rate": 7.551181102362205e-06,
"loss": 0.4694,
"step": 960
},
{
"epoch": 0.2343491613512875,
"grad_norm": 9.6875,
"learning_rate": 7.803149606299213e-06,
"loss": 0.4959,
"step": 992
},
{
"epoch": 0.24190881171745807,
"grad_norm": 9.4375,
"learning_rate": 8.055118110236221e-06,
"loss": 0.4708,
"step": 1024
},
{
"epoch": 0.24946846208362863,
"grad_norm": 8.875,
"learning_rate": 8.307086614173228e-06,
"loss": 0.4513,
"step": 1056
},
{
"epoch": 0.2570281124497992,
"grad_norm": 6.9375,
"learning_rate": 8.559055118110236e-06,
"loss": 0.4668,
"step": 1088
},
{
"epoch": 0.26458776281596974,
"grad_norm": 10.8125,
"learning_rate": 8.811023622047244e-06,
"loss": 0.4413,
"step": 1120
},
{
"epoch": 0.2721474131821403,
"grad_norm": 6.78125,
"learning_rate": 9.062992125984253e-06,
"loss": 0.4229,
"step": 1152
},
{
"epoch": 0.2797070635483109,
"grad_norm": 8.3125,
"learning_rate": 9.314960629921261e-06,
"loss": 0.4487,
"step": 1184
},
{
"epoch": 0.28726671391448144,
"grad_norm": 7.28125,
"learning_rate": 9.566929133858268e-06,
"loss": 0.4379,
"step": 1216
},
{
"epoch": 0.294826364280652,
"grad_norm": 7.375,
"learning_rate": 9.818897637795276e-06,
"loss": 0.4577,
"step": 1248
},
{
"epoch": 0.3023860146468226,
"grad_norm": 7.71875,
"learning_rate": 9.999984699413235e-06,
"loss": 0.4738,
"step": 1280
},
{
"epoch": 0.30994566501299314,
"grad_norm": 4.875,
"learning_rate": 9.999682468799545e-06,
"loss": 0.4452,
"step": 1312
},
{
"epoch": 0.3175053153791637,
"grad_norm": 8.3125,
"learning_rate": 9.998993405770503e-06,
"loss": 0.4617,
"step": 1344
},
{
"epoch": 0.3250649657453343,
"grad_norm": 7.21875,
"learning_rate": 9.997917563639873e-06,
"loss": 0.4511,
"step": 1376
},
{
"epoch": 0.33262461611150484,
"grad_norm": 5.78125,
"learning_rate": 9.99645502564707e-06,
"loss": 0.4646,
"step": 1408
},
{
"epoch": 0.3401842664776754,
"grad_norm": 7.21875,
"learning_rate": 9.994605904950693e-06,
"loss": 0.4219,
"step": 1440
},
{
"epoch": 0.347743916843846,
"grad_norm": 8.75,
"learning_rate": 9.992370344619799e-06,
"loss": 0.4755,
"step": 1472
},
{
"epoch": 0.35530356721001655,
"grad_norm": 5.59375,
"learning_rate": 9.98974851762281e-06,
"loss": 0.4415,
"step": 1504
},
{
"epoch": 0.3628632175761871,
"grad_norm": 10.8125,
"learning_rate": 9.986740626814144e-06,
"loss": 0.4628,
"step": 1536
},
{
"epoch": 0.3704228679423577,
"grad_norm": 5.71875,
"learning_rate": 9.983346904918514e-06,
"loss": 0.4241,
"step": 1568
},
{
"epoch": 0.37798251830852825,
"grad_norm": 7.40625,
"learning_rate": 9.97956761451292e-06,
"loss": 0.4183,
"step": 1600
},
{
"epoch": 0.3855421686746988,
"grad_norm": 4.71875,
"learning_rate": 9.975403048006342e-06,
"loss": 0.46,
"step": 1632
},
{
"epoch": 0.3931018190408694,
"grad_norm": 6.625,
"learning_rate": 9.970853527617105e-06,
"loss": 0.4349,
"step": 1664
},
{
"epoch": 0.40066146940703995,
"grad_norm": 11.3125,
"learning_rate": 9.965919405347958e-06,
"loss": 0.4359,
"step": 1696
},
{
"epoch": 0.40822111977321046,
"grad_norm": 6.5625,
"learning_rate": 9.960601062958833e-06,
"loss": 0.4228,
"step": 1728
},
{
"epoch": 0.41578077013938103,
"grad_norm": 6.75,
"learning_rate": 9.954898911937302e-06,
"loss": 0.4545,
"step": 1760
},
{
"epoch": 0.4233404205055516,
"grad_norm": 7.375,
"learning_rate": 9.94881339346676e-06,
"loss": 0.4468,
"step": 1792
},
{
"epoch": 0.43090007087172216,
"grad_norm": 7.0625,
"learning_rate": 9.942344978392267e-06,
"loss": 0.4355,
"step": 1824
},
{
"epoch": 0.43845972123789273,
"grad_norm": 6.21875,
"learning_rate": 9.935494167184133e-06,
"loss": 0.4563,
"step": 1856
},
{
"epoch": 0.4460193716040633,
"grad_norm": 7.34375,
"learning_rate": 9.928261489899187e-06,
"loss": 0.4273,
"step": 1888
},
{
"epoch": 0.45357902197023386,
"grad_norm": 6.40625,
"learning_rate": 9.920647506139774e-06,
"loss": 0.4072,
"step": 1920
},
{
"epoch": 0.46113867233640443,
"grad_norm": 5.25,
"learning_rate": 9.912652805010451e-06,
"loss": 0.4216,
"step": 1952
},
{
"epoch": 0.468698322702575,
"grad_norm": 5.9375,
"learning_rate": 9.90427800507241e-06,
"loss": 0.4853,
"step": 1984
},
{
"epoch": 0.47625797306874557,
"grad_norm": 6.625,
"learning_rate": 9.89552375429562e-06,
"loss": 0.4452,
"step": 2016
},
{
"epoch": 0.48381762343491613,
"grad_norm": 4.875,
"learning_rate": 9.886390730008688e-06,
"loss": 0.4505,
"step": 2048
},
{
"epoch": 0.4913772738010867,
"grad_norm": 8.9375,
"learning_rate": 9.87687963884646e-06,
"loss": 0.4279,
"step": 2080
},
{
"epoch": 0.49893692416725727,
"grad_norm": 4.46875,
"learning_rate": 9.86699121669534e-06,
"loss": 0.4225,
"step": 2112
},
{
"epoch": 0.5064965745334278,
"grad_norm": 4.75,
"learning_rate": 9.856726228636364e-06,
"loss": 0.4333,
"step": 2144
},
{
"epoch": 0.5140562248995983,
"grad_norm": 7.78125,
"learning_rate": 9.846085468885994e-06,
"loss": 0.4262,
"step": 2176
},
{
"epoch": 0.5216158752657689,
"grad_norm": 6.78125,
"learning_rate": 9.835069760734673e-06,
"loss": 0.4474,
"step": 2208
},
{
"epoch": 0.5291755256319395,
"grad_norm": 7.25,
"learning_rate": 9.823679956483122e-06,
"loss": 0.436,
"step": 2240
},
{
"epoch": 0.53673517599811,
"grad_norm": 5.71875,
"learning_rate": 9.81191693737641e-06,
"loss": 0.4293,
"step": 2272
},
{
"epoch": 0.5442948263642806,
"grad_norm": 7.84375,
"learning_rate": 9.799781613535747e-06,
"loss": 0.4253,
"step": 2304
},
{
"epoch": 0.5518544767304512,
"grad_norm": 5.875,
"learning_rate": 9.787274923888093e-06,
"loss": 0.4385,
"step": 2336
},
{
"epoch": 0.5594141270966217,
"grad_norm": 7.5625,
"learning_rate": 9.774397836093487e-06,
"loss": 0.4349,
"step": 2368
},
{
"epoch": 0.5669737774627923,
"grad_norm": 5.09375,
"learning_rate": 9.761151346470204e-06,
"loss": 0.3955,
"step": 2400
},
{
"epoch": 0.5745334278289629,
"grad_norm": 8.6875,
"learning_rate": 9.747536479917643e-06,
"loss": 0.4272,
"step": 2432
},
{
"epoch": 0.5820930781951335,
"grad_norm": 5.5625,
"learning_rate": 9.733554289837041e-06,
"loss": 0.4269,
"step": 2464
},
{
"epoch": 0.589652728561304,
"grad_norm": 6.40625,
"learning_rate": 9.719205858049978e-06,
"loss": 0.4676,
"step": 2496
},
{
"epoch": 0.5972123789274746,
"grad_norm": 3.578125,
"learning_rate": 9.704492294714658e-06,
"loss": 0.4336,
"step": 2528
},
{
"epoch": 0.6047720292936452,
"grad_norm": 8.4375,
"learning_rate": 9.689414738240026e-06,
"loss": 0.4546,
"step": 2560
},
{
"epoch": 0.6123316796598157,
"grad_norm": 6.65625,
"learning_rate": 9.673974355197684e-06,
"loss": 0.3893,
"step": 2592
},
{
"epoch": 0.6198913300259863,
"grad_norm": 6.78125,
"learning_rate": 9.658172340231636e-06,
"loss": 0.4525,
"step": 2624
},
{
"epoch": 0.6274509803921569,
"grad_norm": 5.78125,
"learning_rate": 9.642009915965844e-06,
"loss": 0.4303,
"step": 2656
},
{
"epoch": 0.6350106307583274,
"grad_norm": 6.125,
"learning_rate": 9.625488332909649e-06,
"loss": 0.4332,
"step": 2688
},
{
"epoch": 0.642570281124498,
"grad_norm": 20.375,
"learning_rate": 9.608608869361008e-06,
"loss": 0.4497,
"step": 2720
},
{
"epoch": 0.6501299314906686,
"grad_norm": 7.09375,
"learning_rate": 9.59137283130759e-06,
"loss": 0.4172,
"step": 2752
},
{
"epoch": 0.6576895818568391,
"grad_norm": 5.09375,
"learning_rate": 9.573781552325732e-06,
"loss": 0.4705,
"step": 2784
},
{
"epoch": 0.6652492322230097,
"grad_norm": 6.625,
"learning_rate": 9.555836393477254e-06,
"loss": 0.4326,
"step": 2816
},
{
"epoch": 0.6728088825891803,
"grad_norm": 5.0,
"learning_rate": 9.537538743204164e-06,
"loss": 0.4291,
"step": 2848
},
{
"epoch": 0.6803685329553508,
"grad_norm": 6.75,
"learning_rate": 9.518890017221214e-06,
"loss": 0.4252,
"step": 2880
},
{
"epoch": 0.6879281833215214,
"grad_norm": 7.125,
"learning_rate": 9.499891658406381e-06,
"loss": 0.4019,
"step": 2912
},
{
"epoch": 0.695487833687692,
"grad_norm": 6.46875,
"learning_rate": 9.480545136689222e-06,
"loss": 0.4161,
"step": 2944
},
{
"epoch": 0.7030474840538625,
"grad_norm": 4.0,
"learning_rate": 9.46085194893714e-06,
"loss": 0.4163,
"step": 2976
},
{
"epoch": 0.7106071344200331,
"grad_norm": 6.65625,
"learning_rate": 9.440813618839574e-06,
"loss": 0.4621,
"step": 3008
},
{
"epoch": 0.7181667847862037,
"grad_norm": 6.90625,
"learning_rate": 9.420431696790116e-06,
"loss": 0.4271,
"step": 3040
},
{
"epoch": 0.7257264351523742,
"grad_norm": 14.1875,
"learning_rate": 9.399707759766536e-06,
"loss": 0.4592,
"step": 3072
},
{
"epoch": 0.7332860855185448,
"grad_norm": 4.75,
"learning_rate": 9.378643411208785e-06,
"loss": 0.4204,
"step": 3104
},
{
"epoch": 0.7408457358847154,
"grad_norm": 6.40625,
"learning_rate": 9.357240280894935e-06,
"loss": 0.4407,
"step": 3136
},
{
"epoch": 0.7484053862508859,
"grad_norm": 7.84375,
"learning_rate": 9.335500024815066e-06,
"loss": 0.4412,
"step": 3168
},
{
"epoch": 0.7559650366170565,
"grad_norm": 6.15625,
"learning_rate": 9.313424325043156e-06,
"loss": 0.4509,
"step": 3200
},
{
"epoch": 0.7635246869832271,
"grad_norm": 6.71875,
"learning_rate": 9.291014889606927e-06,
"loss": 0.4084,
"step": 3232
},
{
"epoch": 0.7710843373493976,
"grad_norm": 7.8125,
"learning_rate": 9.268273452355698e-06,
"loss": 0.3744,
"step": 3264
},
{
"epoch": 0.7786439877155682,
"grad_norm": 5.5,
"learning_rate": 9.24520177282623e-06,
"loss": 0.4226,
"step": 3296
},
{
"epoch": 0.7862036380817388,
"grad_norm": 7.78125,
"learning_rate": 9.221801636106591e-06,
"loss": 0.4654,
"step": 3328
},
{
"epoch": 0.7937632884479093,
"grad_norm": 5.75,
"learning_rate": 9.198074852698042e-06,
"loss": 0.4746,
"step": 3360
},
{
"epoch": 0.8013229388140799,
"grad_norm": 10.0,
"learning_rate": 9.174023258374951e-06,
"loss": 0.4012,
"step": 3392
},
{
"epoch": 0.8088825891802505,
"grad_norm": 8.375,
"learning_rate": 9.149648714042763e-06,
"loss": 0.4434,
"step": 3424
},
{
"epoch": 0.8164422395464209,
"grad_norm": 5.9375,
"learning_rate": 9.124953105594014e-06,
"loss": 0.4421,
"step": 3456
},
{
"epoch": 0.8240018899125915,
"grad_norm": 4.75,
"learning_rate": 9.099938343762423e-06,
"loss": 0.4318,
"step": 3488
},
{
"epoch": 0.8315615402787621,
"grad_norm": 5.90625,
"learning_rate": 9.074606363975047e-06,
"loss": 0.4141,
"step": 3520
},
{
"epoch": 0.8391211906449326,
"grad_norm": 21.375,
"learning_rate": 9.048959126202543e-06,
"loss": 0.4459,
"step": 3552
},
{
"epoch": 0.8466808410111032,
"grad_norm": 13.0625,
"learning_rate": 9.022998614807519e-06,
"loss": 0.4449,
"step": 3584
},
{
"epoch": 0.8542404913772738,
"grad_norm": 6.59375,
"learning_rate": 8.996726838390995e-06,
"loss": 0.4375,
"step": 3616
},
{
"epoch": 0.8618001417434443,
"grad_norm": 5.625,
"learning_rate": 8.970145829637007e-06,
"loss": 0.4051,
"step": 3648
},
{
"epoch": 0.8693597921096149,
"grad_norm": 4.53125,
"learning_rate": 8.943257645155328e-06,
"loss": 0.4162,
"step": 3680
},
{
"epoch": 0.8769194424757855,
"grad_norm": 7.03125,
"learning_rate": 8.91606436532234e-06,
"loss": 0.4302,
"step": 3712
},
{
"epoch": 0.884479092841956,
"grad_norm": 6.0,
"learning_rate": 8.888568094120088e-06,
"loss": 0.4083,
"step": 3744
},
{
"epoch": 0.8920387432081266,
"grad_norm": 4.96875,
"learning_rate": 8.860770958973472e-06,
"loss": 0.3913,
"step": 3776
},
{
"epoch": 0.8995983935742972,
"grad_norm": 15.875,
"learning_rate": 8.832675110585663e-06,
"loss": 0.4226,
"step": 3808
},
{
"epoch": 0.9071580439404677,
"grad_norm": 6.8125,
"learning_rate": 8.804282722771691e-06,
"loss": 0.4122,
"step": 3840
},
{
"epoch": 0.9147176943066383,
"grad_norm": 5.625,
"learning_rate": 8.775595992290257e-06,
"loss": 0.4548,
"step": 3872
},
{
"epoch": 0.9222773446728089,
"grad_norm": 7.875,
"learning_rate": 8.746617138673761e-06,
"loss": 0.4418,
"step": 3904
},
{
"epoch": 0.9298369950389794,
"grad_norm": 12.375,
"learning_rate": 8.717348404056578e-06,
"loss": 0.4121,
"step": 3936
},
{
"epoch": 0.93739664540515,
"grad_norm": 6.3125,
"learning_rate": 8.687792053001587e-06,
"loss": 0.4491,
"step": 3968
},
{
"epoch": 0.9449562957713206,
"grad_norm": 5.625,
"learning_rate": 8.657950372324947e-06,
"loss": 0.4384,
"step": 4000
},
{
"epoch": 0.9525159461374911,
"grad_norm": 6.8125,
"learning_rate": 8.62782567091917e-06,
"loss": 0.4195,
"step": 4032
},
{
"epoch": 0.9600755965036617,
"grad_norm": 8.5625,
"learning_rate": 8.597420279574478e-06,
"loss": 0.4195,
"step": 4064
},
{
"epoch": 0.9676352468698323,
"grad_norm": 5.0625,
"learning_rate": 8.566736550798462e-06,
"loss": 0.4166,
"step": 4096
},
{
"epoch": 0.9751948972360028,
"grad_norm": 5.9375,
"learning_rate": 8.535776858634076e-06,
"loss": 0.4142,
"step": 4128
},
{
"epoch": 0.9827545476021734,
"grad_norm": 7.71875,
"learning_rate": 8.504543598475939e-06,
"loss": 0.4273,
"step": 4160
},
{
"epoch": 0.990314197968344,
"grad_norm": 5.59375,
"learning_rate": 8.47303918688501e-06,
"loss": 0.3669,
"step": 4192
},
{
"epoch": 0.9978738483345145,
"grad_norm": 6.09375,
"learning_rate": 8.441266061401613e-06,
"loss": 0.4211,
"step": 4224
},
{
"epoch": 1.005433498700685,
"grad_norm": 6.0,
"learning_rate": 8.40922668035685e-06,
"loss": 0.4052,
"step": 4256
},
{
"epoch": 1.0129931490668556,
"grad_norm": 5.5625,
"learning_rate": 8.376923522682372e-06,
"loss": 0.3879,
"step": 4288
},
{
"epoch": 1.0205527994330261,
"grad_norm": 7.46875,
"learning_rate": 8.344359087718607e-06,
"loss": 0.3971,
"step": 4320
},
{
"epoch": 1.0281124497991967,
"grad_norm": 6.375,
"learning_rate": 8.311535895021368e-06,
"loss": 0.3401,
"step": 4352
},
{
"epoch": 1.0356721001653673,
"grad_norm": 6.46875,
"learning_rate": 8.278456484166916e-06,
"loss": 0.3999,
"step": 4384
},
{
"epoch": 1.0432317505315378,
"grad_norm": 6.28125,
"learning_rate": 8.24512341455547e-06,
"loss": 0.349,
"step": 4416
},
{
"epoch": 1.0507914008977084,
"grad_norm": 13.375,
"learning_rate": 8.21153926521318e-06,
"loss": 0.4037,
"step": 4448
},
{
"epoch": 1.058351051263879,
"grad_norm": 7.125,
"learning_rate": 8.177706634592584e-06,
"loss": 0.3845,
"step": 4480
},
{
"epoch": 1.0659107016300495,
"grad_norm": 7.875,
"learning_rate": 8.143628140371565e-06,
"loss": 0.3731,
"step": 4512
},
{
"epoch": 1.07347035199622,
"grad_norm": 8.5625,
"learning_rate": 8.109306419250818e-06,
"loss": 0.3845,
"step": 4544
},
{
"epoch": 1.0810300023623907,
"grad_norm": 9.5,
"learning_rate": 8.074744126749839e-06,
"loss": 0.4437,
"step": 4576
},
{
"epoch": 1.0885896527285612,
"grad_norm": 9.625,
"learning_rate": 8.039943937001472e-06,
"loss": 0.3819,
"step": 4608
},
{
"epoch": 1.0961493030947318,
"grad_norm": 5.46875,
"learning_rate": 8.004908542545e-06,
"loss": 0.394,
"step": 4640
},
{
"epoch": 1.1037089534609024,
"grad_norm": 5.5625,
"learning_rate": 7.969640654117822e-06,
"loss": 0.4315,
"step": 4672
},
{
"epoch": 1.111268603827073,
"grad_norm": 8.5,
"learning_rate": 7.934143000445726e-06,
"loss": 0.4429,
"step": 4704
},
{
"epoch": 1.1188282541932435,
"grad_norm": 6.34375,
"learning_rate": 7.898418328031752e-06,
"loss": 0.4198,
"step": 4736
},
{
"epoch": 1.126387904559414,
"grad_norm": 7.21875,
"learning_rate": 7.8624694009437e-06,
"loss": 0.3899,
"step": 4768
},
{
"epoch": 1.1339475549255846,
"grad_norm": 4.65625,
"learning_rate": 7.826299000600262e-06,
"loss": 0.3963,
"step": 4800
},
{
"epoch": 1.1415072052917552,
"grad_norm": 4.375,
"learning_rate": 7.789909925555835e-06,
"loss": 0.3925,
"step": 4832
},
{
"epoch": 1.1490668556579258,
"grad_norm": 7.96875,
"learning_rate": 7.753304991283975e-06,
"loss": 0.4087,
"step": 4864
},
{
"epoch": 1.1566265060240963,
"grad_norm": 6.625,
"learning_rate": 7.71648702995957e-06,
"loss": 0.4089,
"step": 4896
},
{
"epoch": 1.164186156390267,
"grad_norm": 3.71875,
"learning_rate": 7.67945889023971e-06,
"loss": 0.4062,
"step": 4928
},
{
"epoch": 1.1717458067564375,
"grad_norm": 4.8125,
"learning_rate": 7.642223437043279e-06,
"loss": 0.41,
"step": 4960
},
{
"epoch": 1.179305457122608,
"grad_norm": 8.25,
"learning_rate": 7.604783551329298e-06,
"loss": 0.3862,
"step": 4992
},
{
"epoch": 1.1868651074887786,
"grad_norm": 5.09375,
"learning_rate": 7.567142129874012e-06,
"loss": 0.383,
"step": 5024
},
{
"epoch": 1.1944247578549492,
"grad_norm": 4.84375,
"learning_rate": 7.5293020850467705e-06,
"loss": 0.4054,
"step": 5056
},
{
"epoch": 1.2019844082211197,
"grad_norm": 6.3125,
"learning_rate": 7.491266344584691e-06,
"loss": 0.4077,
"step": 5088
},
{
"epoch": 1.2095440585872903,
"grad_norm": 6.8125,
"learning_rate": 7.453037851366136e-06,
"loss": 0.4097,
"step": 5120
},
{
"epoch": 1.2171037089534609,
"grad_norm": 4.59375,
"learning_rate": 7.414619563183017e-06,
"loss": 0.3759,
"step": 5152
},
{
"epoch": 1.2246633593196314,
"grad_norm": 4.5,
"learning_rate": 7.376014452511947e-06,
"loss": 0.3878,
"step": 5184
},
{
"epoch": 1.232223009685802,
"grad_norm": 6.25,
"learning_rate": 7.337225506284255e-06,
"loss": 0.3696,
"step": 5216
},
{
"epoch": 1.2397826600519726,
"grad_norm": 5.9375,
"learning_rate": 7.298255725654884e-06,
"loss": 0.4199,
"step": 5248
},
{
"epoch": 1.2473423104181431,
"grad_norm": 4.8125,
"learning_rate": 7.25910812577019e-06,
"loss": 0.4098,
"step": 5280
},
{
"epoch": 1.2549019607843137,
"grad_norm": 6.03125,
"learning_rate": 7.219785735534653e-06,
"loss": 0.3707,
"step": 5312
},
{
"epoch": 1.2624616111504843,
"grad_norm": 4.8125,
"learning_rate": 7.1802915973765184e-06,
"loss": 0.4299,
"step": 5344
},
{
"epoch": 1.2700212615166548,
"grad_norm": 4.03125,
"learning_rate": 7.14062876701242e-06,
"loss": 0.3897,
"step": 5376
},
{
"epoch": 1.2775809118828254,
"grad_norm": 5.375,
"learning_rate": 7.1008003132109345e-06,
"loss": 0.3932,
"step": 5408
},
{
"epoch": 1.285140562248996,
"grad_norm": 8.1875,
"learning_rate": 7.0608093175551615e-06,
"loss": 0.3614,
"step": 5440
},
{
"epoch": 1.2927002126151665,
"grad_norm": 5.71875,
"learning_rate": 7.020658874204286e-06,
"loss": 0.3998,
"step": 5472
},
{
"epoch": 1.3002598629813371,
"grad_norm": 4.5,
"learning_rate": 6.980352089654184e-06,
"loss": 0.3811,
"step": 5504
},
{
"epoch": 1.3078195133475077,
"grad_norm": 6.40625,
"learning_rate": 6.939892082497073e-06,
"loss": 0.379,
"step": 5536
},
{
"epoch": 1.3153791637136782,
"grad_norm": 7.21875,
"learning_rate": 6.899281983180213e-06,
"loss": 0.4307,
"step": 5568
},
{
"epoch": 1.3229388140798488,
"grad_norm": 6.21875,
"learning_rate": 6.858524933763706e-06,
"loss": 0.4209,
"step": 5600
},
{
"epoch": 1.3304984644460194,
"grad_norm": 5.5625,
"learning_rate": 6.817624087677388e-06,
"loss": 0.4009,
"step": 5632
},
{
"epoch": 1.33805811481219,
"grad_norm": 6.4375,
"learning_rate": 6.776582609476845e-06,
"loss": 0.3359,
"step": 5664
},
{
"epoch": 1.3456177651783605,
"grad_norm": 5.96875,
"learning_rate": 6.735403674598566e-06,
"loss": 0.414,
"step": 5696
},
{
"epoch": 1.353177415544531,
"grad_norm": 11.5,
"learning_rate": 6.694090469114254e-06,
"loss": 0.3847,
"step": 5728
},
{
"epoch": 1.3607370659107016,
"grad_norm": 3.78125,
"learning_rate": 6.652646189484317e-06,
"loss": 0.4152,
"step": 5760
},
{
"epoch": 1.3682967162768722,
"grad_norm": 5.25,
"learning_rate": 6.61107404231055e-06,
"loss": 0.3921,
"step": 5792
},
{
"epoch": 1.3758563666430428,
"grad_norm": 9.625,
"learning_rate": 6.569377244088044e-06,
"loss": 0.3857,
"step": 5824
},
{
"epoch": 1.3834160170092134,
"grad_norm": 5.375,
"learning_rate": 6.527559020956312e-06,
"loss": 0.3879,
"step": 5856
},
{
"epoch": 1.390975667375384,
"grad_norm": 5.40625,
"learning_rate": 6.485622608449684e-06,
"loss": 0.3835,
"step": 5888
},
{
"epoch": 1.3985353177415545,
"grad_norm": 6.6875,
"learning_rate": 6.443571251246964e-06,
"loss": 0.3753,
"step": 5920
},
{
"epoch": 1.406094968107725,
"grad_norm": 4.625,
"learning_rate": 6.401408202920387e-06,
"loss": 0.4092,
"step": 5952
},
{
"epoch": 1.4136546184738956,
"grad_norm": 6.78125,
"learning_rate": 6.3591367256838935e-06,
"loss": 0.4141,
"step": 5984
},
{
"epoch": 1.4212142688400662,
"grad_norm": 5.5,
"learning_rate": 6.316760090140713e-06,
"loss": 0.401,
"step": 6016
},
{
"epoch": 1.4287739192062368,
"grad_norm": 6.125,
"learning_rate": 6.274281575030321e-06,
"loss": 0.3655,
"step": 6048
},
{
"epoch": 1.4363335695724073,
"grad_norm": 6.9375,
"learning_rate": 6.2317044669747595e-06,
"loss": 0.4077,
"step": 6080
},
{
"epoch": 1.4438932199385779,
"grad_norm": 4.90625,
"learning_rate": 6.189032060224344e-06,
"loss": 0.3987,
"step": 6112
},
{
"epoch": 1.4514528703047485,
"grad_norm": 5.71875,
"learning_rate": 6.1462676564027814e-06,
"loss": 0.3943,
"step": 6144
},
{
"epoch": 1.459012520670919,
"grad_norm": 5.78125,
"learning_rate": 6.1034145642517236e-06,
"loss": 0.3839,
"step": 6176
},
{
"epoch": 1.4665721710370896,
"grad_norm": 19.25,
"learning_rate": 6.06047609937476e-06,
"loss": 0.412,
"step": 6208
},
{
"epoch": 1.4741318214032602,
"grad_norm": 7.375,
"learning_rate": 6.017455583980887e-06,
"loss": 0.3862,
"step": 6240
},
{
"epoch": 1.4816914717694307,
"grad_norm": 12.0625,
"learning_rate": 5.974356346627463e-06,
"loss": 0.4054,
"step": 6272
},
{
"epoch": 1.4892511221356013,
"grad_norm": 7.5,
"learning_rate": 5.931181721962682e-06,
"loss": 0.3772,
"step": 6304
},
{
"epoch": 1.4968107725017719,
"grad_norm": 7.46875,
"learning_rate": 5.887935050467547e-06,
"loss": 0.4123,
"step": 6336
},
{
"epoch": 1.5043704228679422,
"grad_norm": 8.4375,
"learning_rate": 5.844619678197434e-06,
"loss": 0.3815,
"step": 6368
},
{
"epoch": 1.511930073234113,
"grad_norm": 5.875,
"learning_rate": 5.801238956523192e-06,
"loss": 0.4348,
"step": 6400
},
{
"epoch": 1.5194897236002833,
"grad_norm": 6.375,
"learning_rate": 5.757796241871844e-06,
"loss": 0.3863,
"step": 6432
},
{
"epoch": 1.5270493739664541,
"grad_norm": 7.15625,
"learning_rate": 5.714294895466897e-06,
"loss": 0.4001,
"step": 6464
},
{
"epoch": 1.5346090243326245,
"grad_norm": 5.0,
"learning_rate": 5.670738283068273e-06,
"loss": 0.3793,
"step": 6496
},
{
"epoch": 1.5421686746987953,
"grad_norm": 6.65625,
"learning_rate": 5.627129774711912e-06,
"loss": 0.3899,
"step": 6528
},
{
"epoch": 1.5497283250649656,
"grad_norm": 5.75,
"learning_rate": 5.58347274444901e-06,
"loss": 0.3811,
"step": 6560
},
{
"epoch": 1.5572879754311364,
"grad_norm": 5.09375,
"learning_rate": 5.539770570084968e-06,
"loss": 0.3998,
"step": 6592
},
{
"epoch": 1.5648476257973067,
"grad_norm": 6.40625,
"learning_rate": 5.496026632918055e-06,
"loss": 0.3703,
"step": 6624
},
{
"epoch": 1.5724072761634775,
"grad_norm": 7.25,
"learning_rate": 5.452244317477785e-06,
"loss": 0.3892,
"step": 6656
},
{
"epoch": 1.5799669265296479,
"grad_norm": 7.46875,
"learning_rate": 5.40842701126305e-06,
"loss": 0.4057,
"step": 6688
},
{
"epoch": 1.5875265768958187,
"grad_norm": 5.625,
"learning_rate": 5.364578104480029e-06,
"loss": 0.4182,
"step": 6720
},
{
"epoch": 1.595086227261989,
"grad_norm": 5.25,
"learning_rate": 5.3207009897798825e-06,
"loss": 0.3668,
"step": 6752
},
{
"epoch": 1.6026458776281598,
"grad_norm": 7.28125,
"learning_rate": 5.276799061996252e-06,
"loss": 0.3976,
"step": 6784
},
{
"epoch": 1.6102055279943301,
"grad_norm": 5.625,
"learning_rate": 5.232875717882606e-06,
"loss": 0.4012,
"step": 6816
},
{
"epoch": 1.617765178360501,
"grad_norm": 8.5,
"learning_rate": 5.1889343558494264e-06,
"loss": 0.413,
"step": 6848
},
{
"epoch": 1.6253248287266713,
"grad_norm": 7.9375,
"learning_rate": 5.144978375701261e-06,
"loss": 0.435,
"step": 6880
},
{
"epoch": 1.632884479092842,
"grad_norm": 6.34375,
"learning_rate": 5.101011178373687e-06,
"loss": 0.3851,
"step": 6912
},
{
"epoch": 1.6404441294590124,
"grad_norm": 5.53125,
"learning_rate": 5.057036165670165e-06,
"loss": 0.3728,
"step": 6944
},
{
"epoch": 1.6480037798251832,
"grad_norm": 6.0625,
"learning_rate": 5.013056739998845e-06,
"loss": 0.4223,
"step": 6976
},
{
"epoch": 1.6555634301913535,
"grad_norm": 5.34375,
"learning_rate": 4.969076304109316e-06,
"loss": 0.4188,
"step": 7008
},
{
"epoch": 1.6631230805575243,
"grad_norm": 7.28125,
"learning_rate": 4.92509826082933e-06,
"loss": 0.3932,
"step": 7040
},
{
"epoch": 1.6706827309236947,
"grad_norm": 4.625,
"learning_rate": 4.881126012801512e-06,
"loss": 0.3873,
"step": 7072
},
{
"epoch": 1.6782423812898655,
"grad_norm": 5.8125,
"learning_rate": 4.8371629622201084e-06,
"loss": 0.3801,
"step": 7104
},
{
"epoch": 1.6858020316560358,
"grad_norm": 7.5,
"learning_rate": 4.793212510567741e-06,
"loss": 0.381,
"step": 7136
},
{
"epoch": 1.6933616820222066,
"grad_norm": 5.40625,
"learning_rate": 4.749278058352238e-06,
"loss": 0.4149,
"step": 7168
},
{
"epoch": 1.700921332388377,
"grad_norm": 4.0,
"learning_rate": 4.7053630048435295e-06,
"loss": 0.3805,
"step": 7200
},
{
"epoch": 1.7084809827545477,
"grad_norm": 5.6875,
"learning_rate": 4.661470747810635e-06,
"loss": 0.3912,
"step": 7232
},
{
"epoch": 1.716040633120718,
"grad_norm": 6.03125,
"learning_rate": 4.617604683258787e-06,
"loss": 0.4288,
"step": 7264
},
{
"epoch": 1.7236002834868889,
"grad_norm": 6.9375,
"learning_rate": 4.573768205166663e-06,
"loss": 0.3611,
"step": 7296
},
{
"epoch": 1.7311599338530592,
"grad_norm": 7.125,
"learning_rate": 4.529964705223796e-06,
"loss": 0.3765,
"step": 7328
},
{
"epoch": 1.73871958421923,
"grad_norm": 7.0625,
"learning_rate": 4.486197572568154e-06,
"loss": 0.4224,
"step": 7360
},
{
"epoch": 1.7462792345854004,
"grad_norm": 4.78125,
"learning_rate": 4.442470193523919e-06,
"loss": 0.4066,
"step": 7392
},
{
"epoch": 1.7538388849515711,
"grad_norm": 7.28125,
"learning_rate": 4.398785951339476e-06,
"loss": 0.4138,
"step": 7424
},
{
"epoch": 1.7613985353177415,
"grad_norm": 7.0,
"learning_rate": 4.355148225925658e-06,
"loss": 0.4131,
"step": 7456
},
{
"epoch": 1.7689581856839123,
"grad_norm": 6.125,
"learning_rate": 4.3115603935942226e-06,
"loss": 0.3907,
"step": 7488
},
{
"epoch": 1.7765178360500826,
"grad_norm": 6.1875,
"learning_rate": 4.268025826796636e-06,
"loss": 0.4282,
"step": 7520
},
{
"epoch": 1.7840774864162534,
"grad_norm": 6.15625,
"learning_rate": 4.224547893863133e-06,
"loss": 0.3837,
"step": 7552
},
{
"epoch": 1.7916371367824238,
"grad_norm": 7.46875,
"learning_rate": 4.181129958742107e-06,
"loss": 0.3634,
"step": 7584
},
{
"epoch": 1.7991967871485943,
"grad_norm": 6.0625,
"learning_rate": 4.137775380739839e-06,
"loss": 0.3652,
"step": 7616
},
{
"epoch": 1.806756437514765,
"grad_norm": 8.6875,
"learning_rate": 4.094487514260575e-06,
"loss": 0.4116,
"step": 7648
},
{
"epoch": 1.8143160878809355,
"grad_norm": 7.75,
"learning_rate": 4.051269708547008e-06,
"loss": 0.4062,
"step": 7680
},
{
"epoch": 1.821875738247106,
"grad_norm": 15.0,
"learning_rate": 4.0081253074211204e-06,
"loss": 0.3903,
"step": 7712
},
{
"epoch": 1.8294353886132766,
"grad_norm": 6.21875,
"learning_rate": 3.965057649025489e-06,
"loss": 0.3491,
"step": 7744
},
{
"epoch": 1.8369950389794472,
"grad_norm": 6.375,
"learning_rate": 3.9220700655649916e-06,
"loss": 0.3592,
"step": 7776
},
{
"epoch": 1.8445546893456177,
"grad_norm": 6.28125,
"learning_rate": 3.879165883048997e-06,
"loss": 0.3775,
"step": 7808
},
{
"epoch": 1.8521143397117883,
"grad_norm": 5.875,
"learning_rate": 3.8363484210340315e-06,
"loss": 0.4107,
"step": 7840
},
{
"epoch": 1.8596739900779589,
"grad_norm": 5.21875,
"learning_rate": 3.7936209923669286e-06,
"loss": 0.3756,
"step": 7872
},
{
"epoch": 1.8672336404441294,
"grad_norm": 6.25,
"learning_rate": 3.7509869029285216e-06,
"loss": 0.3981,
"step": 7904
},
{
"epoch": 1.8747932908103,
"grad_norm": 6.625,
"learning_rate": 3.708449451377851e-06,
"loss": 0.3983,
"step": 7936
},
{
"epoch": 1.8823529411764706,
"grad_norm": 5.9375,
"learning_rate": 3.6660119288969577e-06,
"loss": 0.4131,
"step": 7968
},
{
"epoch": 1.8899125915426411,
"grad_norm": 7.375,
"learning_rate": 3.623677618936221e-06,
"loss": 0.3686,
"step": 8000
},
{
"epoch": 1.8974722419088117,
"grad_norm": 6.3125,
"learning_rate": 3.5814497969603324e-06,
"loss": 0.3864,
"step": 8032
},
{
"epoch": 1.9050318922749823,
"grad_norm": 4.84375,
"learning_rate": 3.5393317301948517e-06,
"loss": 0.4053,
"step": 8064
},
{
"epoch": 1.9125915426411528,
"grad_norm": 5.125,
"learning_rate": 3.497326677373431e-06,
"loss": 0.3841,
"step": 8096
},
{
"epoch": 1.9201511930073234,
"grad_norm": 5.34375,
"learning_rate": 3.4554378884856694e-06,
"loss": 0.3439,
"step": 8128
},
{
"epoch": 1.927710843373494,
"grad_norm": 5.53125,
"learning_rate": 3.41366860452567e-06,
"loss": 0.4156,
"step": 8160
},
{
"epoch": 1.9352704937396645,
"grad_norm": 4.5,
"learning_rate": 3.372022057241269e-06,
"loss": 0.4258,
"step": 8192
},
{
"epoch": 1.942830144105835,
"grad_norm": 6.3125,
"learning_rate": 3.330501468883992e-06,
"loss": 0.419,
"step": 8224
},
{
"epoch": 1.9503897944720057,
"grad_norm": 9.4375,
"learning_rate": 3.289110051959754e-06,
"loss": 0.4285,
"step": 8256
},
{
"epoch": 1.9579494448381762,
"grad_norm": 6.1875,
"learning_rate": 3.247851008980286e-06,
"loss": 0.4105,
"step": 8288
},
{
"epoch": 1.9655090952043468,
"grad_norm": 8.375,
"learning_rate": 3.206727532215372e-06,
"loss": 0.4125,
"step": 8320
},
{
"epoch": 1.9730687455705174,
"grad_norm": 8.1875,
"learning_rate": 3.165742803445841e-06,
"loss": 0.3931,
"step": 8352
},
{
"epoch": 1.980628395936688,
"grad_norm": 6.03125,
"learning_rate": 3.1248999937174007e-06,
"loss": 0.3599,
"step": 8384
},
{
"epoch": 1.9881880463028585,
"grad_norm": 7.09375,
"learning_rate": 3.0842022630952784e-06,
"loss": 0.3852,
"step": 8416
},
{
"epoch": 1.995747696669029,
"grad_norm": 3.78125,
"learning_rate": 3.0436527604197374e-06,
"loss": 0.3744,
"step": 8448
},
{
"epoch": 2.0033073470351996,
"grad_norm": 6.34375,
"learning_rate": 3.00325462306243e-06,
"loss": 0.3665,
"step": 8480
},
{
"epoch": 2.01086699740137,
"grad_norm": 4.125,
"learning_rate": 2.9630109766836634e-06,
"loss": 0.3574,
"step": 8512
},
{
"epoch": 2.0184266477675408,
"grad_norm": 4.78125,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.3651,
"step": 8544
},
{
"epoch": 2.025986298133711,
"grad_norm": 6.4375,
"learning_rate": 2.8829995994961725e-06,
"loss": 0.3792,
"step": 8576
},
{
"epoch": 2.033545948499882,
"grad_norm": 4.75,
"learning_rate": 2.8432380592794466e-06,
"loss": 0.4099,
"step": 8608
},
{
"epoch": 2.0411055988660523,
"grad_norm": 5.375,
"learning_rate": 2.8036433907462866e-06,
"loss": 0.3701,
"step": 8640
},
{
"epoch": 2.048665249232223,
"grad_norm": 6.46875,
"learning_rate": 2.7642186573915e-06,
"loss": 0.3454,
"step": 8672
},
{
"epoch": 2.0562248995983934,
"grad_norm": 8.1875,
"learning_rate": 2.724966909561765e-06,
"loss": 0.3676,
"step": 8704
},
{
"epoch": 2.063784549964564,
"grad_norm": 4.03125,
"learning_rate": 2.685891184219635e-06,
"loss": 0.3714,
"step": 8736
},
{
"epoch": 2.0713442003307345,
"grad_norm": 9.375,
"learning_rate": 2.646994504708551e-06,
"loss": 0.4112,
"step": 8768
},
{
"epoch": 2.0789038506969053,
"grad_norm": 5.0,
"learning_rate": 2.6082798805189347e-06,
"loss": 0.3587,
"step": 8800
},
{
"epoch": 2.0864635010630757,
"grad_norm": 11.25,
"learning_rate": 2.569750307055329e-06,
"loss": 0.3693,
"step": 8832
},
{
"epoch": 2.0940231514292464,
"grad_norm": 7.4375,
"learning_rate": 2.5314087654046403e-06,
"loss": 0.3768,
"step": 8864
},
{
"epoch": 2.101582801795417,
"grad_norm": 5.65625,
"learning_rate": 2.4932582221055024e-06,
"loss": 0.3525,
"step": 8896
},
{
"epoch": 2.1091424521615876,
"grad_norm": 7.59375,
"learning_rate": 2.455301628918727e-06,
"loss": 0.3835,
"step": 8928
},
{
"epoch": 2.116702102527758,
"grad_norm": 4.96875,
"learning_rate": 2.417541922598945e-06,
"loss": 0.3802,
"step": 8960
},
{
"epoch": 2.1242617528939287,
"grad_norm": 5.90625,
"learning_rate": 2.379982024667367e-06,
"loss": 0.4005,
"step": 8992
},
{
"epoch": 2.131821403260099,
"grad_norm": 10.1875,
"learning_rate": 2.3426248411857573e-06,
"loss": 0.3815,
"step": 9024
},
{
"epoch": 2.13938105362627,
"grad_norm": 6.09375,
"learning_rate": 2.3054732625315725e-06,
"loss": 0.3349,
"step": 9056
},
{
"epoch": 2.14694070399244,
"grad_norm": 7.28125,
"learning_rate": 2.2685301631743437e-06,
"loss": 0.3962,
"step": 9088
},
{
"epoch": 2.154500354358611,
"grad_norm": 4.0625,
"learning_rate": 2.2317984014532634e-06,
"loss": 0.3707,
"step": 9120
},
{
"epoch": 2.1620600047247813,
"grad_norm": 5.0625,
"learning_rate": 2.1952808193560367e-06,
"loss": 0.4251,
"step": 9152
},
{
"epoch": 2.169619655090952,
"grad_norm": 6.75,
"learning_rate": 2.158980242298989e-06,
"loss": 0.3916,
"step": 9184
},
{
"epoch": 2.1771793054571225,
"grad_norm": 9.9375,
"learning_rate": 2.12289947890847e-06,
"loss": 0.3959,
"step": 9216
},
{
"epoch": 2.1847389558232932,
"grad_norm": 5.5,
"learning_rate": 2.087041320803535e-06,
"loss": 0.3586,
"step": 9248
},
{
"epoch": 2.1922986061894636,
"grad_norm": 5.78125,
"learning_rate": 2.051408542379955e-06,
"loss": 0.3692,
"step": 9280
},
{
"epoch": 2.1998582565556344,
"grad_norm": 6.40625,
"learning_rate": 2.016003900595566e-06,
"loss": 0.3709,
"step": 9312
},
{
"epoch": 2.2074179069218047,
"grad_norm": 4.28125,
"learning_rate": 1.980830134756946e-06,
"loss": 0.3729,
"step": 9344
},
{
"epoch": 2.2149775572879755,
"grad_norm": 6.03125,
"learning_rate": 1.945889966307487e-06,
"loss": 0.3655,
"step": 9376
},
{
"epoch": 2.222537207654146,
"grad_norm": 6.28125,
"learning_rate": 1.911186098616819e-06,
"loss": 0.3661,
"step": 9408
},
{
"epoch": 2.2300968580203167,
"grad_norm": 8.875,
"learning_rate": 1.8767212167716536e-06,
"loss": 0.3685,
"step": 9440
},
{
"epoch": 2.237656508386487,
"grad_norm": 5.8125,
"learning_rate": 1.8424979873680332e-06,
"loss": 0.3623,
"step": 9472
},
{
"epoch": 2.245216158752658,
"grad_norm": 3.78125,
"learning_rate": 1.8085190583050166e-06,
"loss": 0.378,
"step": 9504
},
{
"epoch": 2.252775809118828,
"grad_norm": 6.375,
"learning_rate": 1.774787058579799e-06,
"loss": 0.3688,
"step": 9536
},
{
"epoch": 2.260335459484999,
"grad_norm": 7.625,
"learning_rate": 1.7413045980843119e-06,
"loss": 0.3841,
"step": 9568
},
{
"epoch": 2.2678951098511693,
"grad_norm": 5.34375,
"learning_rate": 1.7080742674032886e-06,
"loss": 0.3937,
"step": 9600
},
{
"epoch": 2.27545476021734,
"grad_norm": 6.75,
"learning_rate": 1.6750986376138207e-06,
"loss": 0.3752,
"step": 9632
},
{
"epoch": 2.2830144105835104,
"grad_norm": 6.40625,
"learning_rate": 1.6423802600864436e-06,
"loss": 0.394,
"step": 9664
},
{
"epoch": 2.290574060949681,
"grad_norm": 7.0625,
"learning_rate": 1.6099216662877204e-06,
"loss": 0.3868,
"step": 9696
},
{
"epoch": 2.2981337113158515,
"grad_norm": 6.5,
"learning_rate": 1.5777253675843873e-06,
"loss": 0.4061,
"step": 9728
},
{
"epoch": 2.3056933616820223,
"grad_norm": 12.25,
"learning_rate": 1.5457938550490387e-06,
"loss": 0.364,
"step": 9760
},
{
"epoch": 2.3132530120481927,
"grad_norm": 4.59375,
"learning_rate": 1.5141295992674e-06,
"loss": 0.3626,
"step": 9792
},
{
"epoch": 2.3208126624143635,
"grad_norm": 6.125,
"learning_rate": 1.482735050147161e-06,
"loss": 0.4113,
"step": 9824
},
{
"epoch": 2.328372312780534,
"grad_norm": 6.15625,
"learning_rate": 1.4516126367284355e-06,
"loss": 0.4139,
"step": 9856
},
{
"epoch": 2.3359319631467046,
"grad_norm": 4.875,
"learning_rate": 1.4207647669958186e-06,
"loss": 0.3712,
"step": 9888
},
{
"epoch": 2.343491613512875,
"grad_norm": 6.15625,
"learning_rate": 1.3901938276920712e-06,
"loss": 0.3864,
"step": 9920
},
{
"epoch": 2.3510512638790457,
"grad_norm": 4.6875,
"learning_rate": 1.359902184133467e-06,
"loss": 0.3999,
"step": 9952
},
{
"epoch": 2.358610914245216,
"grad_norm": 6.25,
"learning_rate": 1.3298921800267728e-06,
"loss": 0.3774,
"step": 9984
},
{
"epoch": 2.366170564611387,
"grad_norm": 6.4375,
"learning_rate": 1.3001661372879194e-06,
"loss": 0.4082,
"step": 10016
},
{
"epoch": 2.373730214977557,
"grad_norm": 9.1875,
"learning_rate": 1.2707263558623483e-06,
"loss": 0.3886,
"step": 10048
},
{
"epoch": 2.381289865343728,
"grad_norm": 5.875,
"learning_rate": 1.2415751135470693e-06,
"loss": 0.418,
"step": 10080
},
{
"epoch": 2.3888495157098983,
"grad_norm": 6.5625,
"learning_rate": 1.2127146658144095e-06,
"loss": 0.3886,
"step": 10112
},
{
"epoch": 2.396409166076069,
"grad_norm": 6.15625,
"learning_rate": 1.1841472456375219e-06,
"loss": 0.3816,
"step": 10144
},
{
"epoch": 2.4039688164422395,
"grad_norm": 4.46875,
"learning_rate": 1.1558750633175998e-06,
"loss": 0.3795,
"step": 10176
},
{
"epoch": 2.4115284668084103,
"grad_norm": 4.78125,
"learning_rate": 1.1279003063128773e-06,
"loss": 0.3967,
"step": 10208
},
{
"epoch": 2.4190881171745806,
"grad_norm": 4.25,
"learning_rate": 1.1002251390693763e-06,
"loss": 0.3719,
"step": 10240
},
{
"epoch": 2.4266477675407514,
"grad_norm": 4.09375,
"learning_rate": 1.0728517028534364e-06,
"loss": 0.3794,
"step": 10272
},
{
"epoch": 2.4342074179069217,
"grad_norm": 7.84375,
"learning_rate": 1.0457821155860488e-06,
"loss": 0.3876,
"step": 10304
},
{
"epoch": 2.4417670682730925,
"grad_norm": 4.40625,
"learning_rate": 1.0190184716789853e-06,
"loss": 0.4292,
"step": 10336
},
{
"epoch": 2.449326718639263,
"grad_norm": 6.78125,
"learning_rate": 9.925628418727563e-07,
"loss": 0.3976,
"step": 10368
},
{
"epoch": 2.4568863690054337,
"grad_norm": 4.5,
"learning_rate": 9.664172730763872e-07,
"loss": 0.4061,
"step": 10400
},
{
"epoch": 2.464446019371604,
"grad_norm": 5.90625,
"learning_rate": 9.405837882090534e-07,
"loss": 0.3729,
"step": 10432
},
{
"epoch": 2.472005669737775,
"grad_norm": 5.65625,
"learning_rate": 9.150643860435571e-07,
"loss": 0.3747,
"step": 10464
},
{
"epoch": 2.479565320103945,
"grad_norm": 7.34375,
"learning_rate": 8.898610410516873e-07,
"loss": 0.3779,
"step": 10496
},
{
"epoch": 2.487124970470116,
"grad_norm": 5.875,
"learning_rate": 8.649757032514439e-07,
"loss": 0.3851,
"step": 10528
},
{
"epoch": 2.4946846208362863,
"grad_norm": 11.0,
"learning_rate": 8.404102980561702e-07,
"loss": 0.3837,
"step": 10560
},
{
"epoch": 2.502244271202457,
"grad_norm": 5.3125,
"learning_rate": 8.161667261255746e-07,
"loss": 0.3362,
"step": 10592
},
{
"epoch": 2.5098039215686274,
"grad_norm": 7.34375,
"learning_rate": 7.922468632186748e-07,
"loss": 0.3638,
"step": 10624
},
{
"epoch": 2.5173635719347978,
"grad_norm": 4.34375,
"learning_rate": 7.686525600486743e-07,
"loss": 0.3874,
"step": 10656
},
{
"epoch": 2.5249232223009686,
"grad_norm": 7.53125,
"learning_rate": 7.453856421397598e-07,
"loss": 0.372,
"step": 10688
},
{
"epoch": 2.5324828726671393,
"grad_norm": 6.0625,
"learning_rate": 7.224479096858672e-07,
"loss": 0.4139,
"step": 10720
},
{
"epoch": 2.5400425230333097,
"grad_norm": 5.125,
"learning_rate": 6.9984113741139e-07,
"loss": 0.3876,
"step": 10752
},
{
"epoch": 2.54760217339948,
"grad_norm": 5.40625,
"learning_rate": 6.775670744338747e-07,
"loss": 0.3811,
"step": 10784
},
{
"epoch": 2.555161823765651,
"grad_norm": 5.9375,
"learning_rate": 6.556274441286809e-07,
"loss": 0.3698,
"step": 10816
},
{
"epoch": 2.5627214741318216,
"grad_norm": 4.09375,
"learning_rate": 6.340239439956486e-07,
"loss": 0.3724,
"step": 10848
},
{
"epoch": 2.570281124497992,
"grad_norm": 7.3125,
"learning_rate": 6.127582455277547e-07,
"loss": 0.4228,
"step": 10880
},
{
"epoch": 2.5778407748641623,
"grad_norm": 5.8125,
"learning_rate": 5.918319940817884e-07,
"loss": 0.3607,
"step": 10912
},
{
"epoch": 2.585400425230333,
"grad_norm": 10.625,
"learning_rate": 5.712468087510536e-07,
"loss": 0.3846,
"step": 10944
},
{
"epoch": 2.592960075596504,
"grad_norm": 8.375,
"learning_rate": 5.510042822400868e-07,
"loss": 0.4124,
"step": 10976
},
{
"epoch": 2.6005197259626742,
"grad_norm": 6.0,
"learning_rate": 5.311059807414376e-07,
"loss": 0.3939,
"step": 11008
},
{
"epoch": 2.6080793763288446,
"grad_norm": 7.96875,
"learning_rate": 5.11553443814482e-07,
"loss": 0.4022,
"step": 11040
},
{
"epoch": 2.6156390266950154,
"grad_norm": 9.1875,
"learning_rate": 4.923481842663114e-07,
"loss": 0.3872,
"step": 11072
},
{
"epoch": 2.623198677061186,
"grad_norm": 5.65625,
"learning_rate": 4.734916880346774e-07,
"loss": 0.4234,
"step": 11104
},
{
"epoch": 2.6307583274273565,
"grad_norm": 12.0,
"learning_rate": 4.549854140730325e-07,
"loss": 0.3591,
"step": 11136
},
{
"epoch": 2.638317977793527,
"grad_norm": 4.78125,
"learning_rate": 4.368307942376371e-07,
"loss": 0.4013,
"step": 11168
},
{
"epoch": 2.6458776281596976,
"grad_norm": 8.8125,
"learning_rate": 4.190292331767848e-07,
"loss": 0.3419,
"step": 11200
},
{
"epoch": 2.6534372785258684,
"grad_norm": 5.4375,
"learning_rate": 4.0158210822211496e-07,
"loss": 0.384,
"step": 11232
},
{
"epoch": 2.6609969288920388,
"grad_norm": 7.28125,
"learning_rate": 3.844907692820543e-07,
"loss": 0.3919,
"step": 11264
},
{
"epoch": 2.668556579258209,
"grad_norm": 4.625,
"learning_rate": 3.6775653873736774e-07,
"loss": 0.3707,
"step": 11296
},
{
"epoch": 2.67611622962438,
"grad_norm": 5.96875,
"learning_rate": 3.5138071133884235e-07,
"loss": 0.372,
"step": 11328
},
{
"epoch": 2.6836758799905507,
"grad_norm": 5.96875,
"learning_rate": 3.3536455410711654e-07,
"loss": 0.3835,
"step": 11360
},
{
"epoch": 2.691235530356721,
"grad_norm": 7.28125,
"learning_rate": 3.1970930623464403e-07,
"loss": 0.3677,
"step": 11392
},
{
"epoch": 2.6987951807228914,
"grad_norm": 5.3125,
"learning_rate": 3.044161789898159e-07,
"loss": 0.3694,
"step": 11424
},
{
"epoch": 2.706354831089062,
"grad_norm": 7.46875,
"learning_rate": 2.8948635562324747e-07,
"loss": 0.3782,
"step": 11456
},
{
"epoch": 2.713914481455233,
"grad_norm": 6.0625,
"learning_rate": 2.749209912762241e-07,
"loss": 0.3956,
"step": 11488
},
{
"epoch": 2.7214741318214033,
"grad_norm": 6.09375,
"learning_rate": 2.607212128913267e-07,
"loss": 0.406,
"step": 11520
},
{
"epoch": 2.7290337821875736,
"grad_norm": 8.75,
"learning_rate": 2.4688811912524204e-07,
"loss": 0.3621,
"step": 11552
},
{
"epoch": 2.7365934325537444,
"grad_norm": 4.65625,
"learning_rate": 2.3342278026375397e-07,
"loss": 0.3859,
"step": 11584
},
{
"epoch": 2.744153082919915,
"grad_norm": 5.9375,
"learning_rate": 2.203262381389365e-07,
"loss": 0.3772,
"step": 11616
},
{
"epoch": 2.7517127332860856,
"grad_norm": 5.21875,
"learning_rate": 2.0759950604854583e-07,
"loss": 0.3757,
"step": 11648
},
{
"epoch": 2.759272383652256,
"grad_norm": 6.09375,
"learning_rate": 1.9524356867761772e-07,
"loss": 0.4159,
"step": 11680
},
{
"epoch": 2.7668320340184267,
"grad_norm": 7.15625,
"learning_rate": 1.8325938202228276e-07,
"loss": 0.374,
"step": 11712
},
{
"epoch": 2.7743916843845975,
"grad_norm": 5.5,
"learning_rate": 1.7164787331580058e-07,
"loss": 0.3731,
"step": 11744
},
{
"epoch": 2.781951334750768,
"grad_norm": 6.375,
"learning_rate": 1.6040994095681716e-07,
"loss": 0.3725,
"step": 11776
},
{
"epoch": 2.789510985116938,
"grad_norm": 5.125,
"learning_rate": 1.4954645443985337e-07,
"loss": 0.4074,
"step": 11808
},
{
"epoch": 2.797070635483109,
"grad_norm": 5.3125,
"learning_rate": 1.3905825428803477e-07,
"loss": 0.3863,
"step": 11840
},
{
"epoch": 2.8046302858492793,
"grad_norm": 3.828125,
"learning_rate": 1.2894615198805415e-07,
"loss": 0.3922,
"step": 11872
},
{
"epoch": 2.81218993621545,
"grad_norm": 6.4375,
"learning_rate": 1.1921092992739025e-07,
"loss": 0.3889,
"step": 11904
},
{
"epoch": 2.8197495865816204,
"grad_norm": 7.75,
"learning_rate": 1.0985334133376991e-07,
"loss": 0.3989,
"step": 11936
},
{
"epoch": 2.8273092369477912,
"grad_norm": 8.0,
"learning_rate": 1.0087411021689252e-07,
"loss": 0.3941,
"step": 11968
},
{
"epoch": 2.8348688873139616,
"grad_norm": 5.5,
"learning_rate": 9.227393131240992e-08,
"loss": 0.3675,
"step": 12000
},
{
"epoch": 2.8424285376801324,
"grad_norm": 5.09375,
"learning_rate": 8.405347002817421e-08,
"loss": 0.3858,
"step": 12032
},
{
"epoch": 2.8499881880463027,
"grad_norm": 6.09375,
"learning_rate": 7.6213362392758e-08,
"loss": 0.4184,
"step": 12064
},
{
"epoch": 2.8575478384124735,
"grad_norm": 8.9375,
"learning_rate": 6.875421500623703e-08,
"loss": 0.3729,
"step": 12096
},
{
"epoch": 2.865107488778644,
"grad_norm": 7.1875,
"learning_rate": 6.167660499326322e-08,
"loss": 0.3836,
"step": 12128
},
{
"epoch": 2.8726671391448146,
"grad_norm": 4.59375,
"learning_rate": 5.498107995840774e-08,
"loss": 0.3789,
"step": 12160
},
{
"epoch": 2.880226789510985,
"grad_norm": 10.125,
"learning_rate": 4.86681579437942e-08,
"loss": 0.4012,
"step": 12192
},
{
"epoch": 2.8877864398771558,
"grad_norm": 4.21875,
"learning_rate": 4.273832738901529e-08,
"loss": 0.365,
"step": 12224
},
{
"epoch": 2.895346090243326,
"grad_norm": 6.0625,
"learning_rate": 3.719204709334345e-08,
"loss": 0.3903,
"step": 12256
},
{
"epoch": 2.902905740609497,
"grad_norm": 5.5625,
"learning_rate": 3.2029746180231023e-08,
"loss": 0.4063,
"step": 12288
},
{
"epoch": 2.9104653909756673,
"grad_norm": 6.8125,
"learning_rate": 2.725182406410842e-08,
"loss": 0.3909,
"step": 12320
},
{
"epoch": 2.918025041341838,
"grad_norm": 4.0625,
"learning_rate": 2.2858650419484985e-08,
"loss": 0.3711,
"step": 12352
},
{
"epoch": 2.9255846917080084,
"grad_norm": 5.71875,
"learning_rate": 1.8850565152339072e-08,
"loss": 0.3463,
"step": 12384
},
{
"epoch": 2.933144342074179,
"grad_norm": 5.0,
"learning_rate": 1.5227878373827443e-08,
"loss": 0.443,
"step": 12416
},
{
"epoch": 2.9407039924403495,
"grad_norm": 4.78125,
"learning_rate": 1.1990870376284435e-08,
"loss": 0.405,
"step": 12448
},
{
"epoch": 2.9482636428065203,
"grad_norm": 6.25,
"learning_rate": 9.139791611540438e-09,
"loss": 0.3871,
"step": 12480
},
{
"epoch": 2.9558232931726907,
"grad_norm": 5.28125,
"learning_rate": 6.6748626715407165e-09,
"loss": 0.4238,
"step": 12512
},
{
"epoch": 2.9633829435388614,
"grad_norm": 6.46875,
"learning_rate": 4.596274271280732e-09,
"loss": 0.3852,
"step": 12544
},
{
"epoch": 2.970942593905032,
"grad_norm": 5.8125,
"learning_rate": 2.9041872340479505e-09,
"loss": 0.3845,
"step": 12576
},
{
"epoch": 2.9785022442712026,
"grad_norm": 4.21875,
"learning_rate": 1.59873247897957e-09,
"loss": 0.4169,
"step": 12608
},
{
"epoch": 2.986061894637373,
"grad_norm": 5.96875,
"learning_rate": 6.800110109328462e-10,
"loss": 0.3872,
"step": 12640
},
{
"epoch": 2.9936215450035437,
"grad_norm": 11.125,
"learning_rate": 1.480939126713432e-10,
"loss": 0.3957,
"step": 12672
},
{
"epoch": 3.0,
"step": 12699,
"total_flos": 2.247355191985367e+17,
"train_loss": 0.4236195460108568,
"train_runtime": 3683.6752,
"train_samples_per_second": 13.789,
"train_steps_per_second": 3.447
}
],
"logging_steps": 32,
"max_steps": 12699,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.247355191985367e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}