{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 21165,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007559650366170565,
"grad_norm": 32.75,
"learning_rate": 1.464336324988191e-07,
"loss": 0.6933,
"step": 32
},
{
"epoch": 0.01511930073234113,
"grad_norm": 14.8125,
"learning_rate": 2.975909305621162e-07,
"loss": 0.6983,
"step": 64
},
{
"epoch": 0.022678951098511695,
"grad_norm": 28.25,
"learning_rate": 4.4874822862541336e-07,
"loss": 0.667,
"step": 96
},
{
"epoch": 0.03023860146468226,
"grad_norm": 24.125,
"learning_rate": 5.999055266887105e-07,
"loss": 0.6409,
"step": 128
},
{
"epoch": 0.03779825183085282,
"grad_norm": 26.25,
"learning_rate": 7.510628247520075e-07,
"loss": 0.6695,
"step": 160
},
{
"epoch": 0.04535790219702339,
"grad_norm": 16.75,
"learning_rate": 9.022201228153047e-07,
"loss": 0.6807,
"step": 192
},
{
"epoch": 0.05291755256319395,
"grad_norm": 16.625,
"learning_rate": 1.0533774208786019e-06,
"loss": 0.672,
"step": 224
},
{
"epoch": 0.06047720292936452,
"grad_norm": 11.9375,
"learning_rate": 1.204534718941899e-06,
"loss": 0.6028,
"step": 256
},
{
"epoch": 0.06803685329553508,
"grad_norm": 17.125,
"learning_rate": 1.3556920170051963e-06,
"loss": 0.6022,
"step": 288
},
{
"epoch": 0.07559650366170564,
"grad_norm": 13.625,
"learning_rate": 1.5068493150684932e-06,
"loss": 0.5719,
"step": 320
},
{
"epoch": 0.08315615402787621,
"grad_norm": 13.5625,
"learning_rate": 1.6580066131317905e-06,
"loss": 0.5594,
"step": 352
},
{
"epoch": 0.09071580439404678,
"grad_norm": 9.1875,
"learning_rate": 1.8091639111950876e-06,
"loss": 0.5216,
"step": 384
},
{
"epoch": 0.09827545476021735,
"grad_norm": 13.6875,
"learning_rate": 1.9603212092583847e-06,
"loss": 0.5795,
"step": 416
},
{
"epoch": 0.1058351051263879,
"grad_norm": 20.0,
"learning_rate": 2.1114785073216816e-06,
"loss": 0.4809,
"step": 448
},
{
"epoch": 0.11339475549255847,
"grad_norm": 5.65625,
"learning_rate": 2.262635805384979e-06,
"loss": 0.5054,
"step": 480
},
{
"epoch": 0.12095440585872903,
"grad_norm": 12.125,
"learning_rate": 2.4137931034482762e-06,
"loss": 0.512,
"step": 512
},
{
"epoch": 0.1285140562248996,
"grad_norm": 8.0625,
"learning_rate": 2.5649504015115736e-06,
"loss": 0.5196,
"step": 544
},
{
"epoch": 0.13607370659107015,
"grad_norm": 6.625,
"learning_rate": 2.7161076995748705e-06,
"loss": 0.4724,
"step": 576
},
{
"epoch": 0.14363335695724072,
"grad_norm": 6.75,
"learning_rate": 2.8672649976381674e-06,
"loss": 0.4971,
"step": 608
},
{
"epoch": 0.1511930073234113,
"grad_norm": 15.4375,
"learning_rate": 3.0184222957014647e-06,
"loss": 0.4902,
"step": 640
},
{
"epoch": 0.15875265768958186,
"grad_norm": 9.8125,
"learning_rate": 3.1695795937647616e-06,
"loss": 0.4461,
"step": 672
},
{
"epoch": 0.16631230805575242,
"grad_norm": 5.78125,
"learning_rate": 3.3207368918280585e-06,
"loss": 0.4681,
"step": 704
},
{
"epoch": 0.173871958421923,
"grad_norm": 5.84375,
"learning_rate": 3.4718941898913562e-06,
"loss": 0.4747,
"step": 736
},
{
"epoch": 0.18143160878809356,
"grad_norm": 15.875,
"learning_rate": 3.623051487954653e-06,
"loss": 0.4603,
"step": 768
},
{
"epoch": 0.18899125915426412,
"grad_norm": 6.59375,
"learning_rate": 3.7742087860179504e-06,
"loss": 0.5077,
"step": 800
},
{
"epoch": 0.1965509095204347,
"grad_norm": 14.3125,
"learning_rate": 3.925366084081247e-06,
"loss": 0.4793,
"step": 832
},
{
"epoch": 0.20411055988660523,
"grad_norm": 8.6875,
"learning_rate": 4.076523382144545e-06,
"loss": 0.4397,
"step": 864
},
{
"epoch": 0.2116702102527758,
"grad_norm": 6.96875,
"learning_rate": 4.227680680207842e-06,
"loss": 0.4632,
"step": 896
},
{
"epoch": 0.21922986061894637,
"grad_norm": 7.6875,
"learning_rate": 4.3788379782711384e-06,
"loss": 0.4525,
"step": 928
},
{
"epoch": 0.22678951098511693,
"grad_norm": 8.5,
"learning_rate": 4.529995276334436e-06,
"loss": 0.4575,
"step": 960
},
{
"epoch": 0.2343491613512875,
"grad_norm": 9.75,
"learning_rate": 4.681152574397733e-06,
"loss": 0.4867,
"step": 992
},
{
"epoch": 0.24190881171745807,
"grad_norm": 8.0625,
"learning_rate": 4.83230987246103e-06,
"loss": 0.4624,
"step": 1024
},
{
"epoch": 0.24946846208362863,
"grad_norm": 7.59375,
"learning_rate": 4.983467170524327e-06,
"loss": 0.4408,
"step": 1056
},
{
"epoch": 0.2570281124497992,
"grad_norm": 7.5625,
"learning_rate": 5.134624468587624e-06,
"loss": 0.46,
"step": 1088
},
{
"epoch": 0.26458776281596974,
"grad_norm": 8.1875,
"learning_rate": 5.2857817666509215e-06,
"loss": 0.4347,
"step": 1120
},
{
"epoch": 0.2721474131821403,
"grad_norm": 8.8125,
"learning_rate": 5.436939064714218e-06,
"loss": 0.4165,
"step": 1152
},
{
"epoch": 0.2797070635483109,
"grad_norm": 8.625,
"learning_rate": 5.588096362777515e-06,
"loss": 0.4455,
"step": 1184
},
{
"epoch": 0.28726671391448144,
"grad_norm": 8.4375,
"learning_rate": 5.7392536608408135e-06,
"loss": 0.4309,
"step": 1216
},
{
"epoch": 0.294826364280652,
"grad_norm": 7.84375,
"learning_rate": 5.89041095890411e-06,
"loss": 0.4558,
"step": 1248
},
{
"epoch": 0.3023860146468226,
"grad_norm": 13.0,
"learning_rate": 6.041568256967407e-06,
"loss": 0.4711,
"step": 1280
},
{
"epoch": 0.30994566501299314,
"grad_norm": 5.90625,
"learning_rate": 6.192725555030704e-06,
"loss": 0.4422,
"step": 1312
},
{
"epoch": 0.3175053153791637,
"grad_norm": 8.375,
"learning_rate": 6.343882853094001e-06,
"loss": 0.4616,
"step": 1344
},
{
"epoch": 0.3250649657453343,
"grad_norm": 7.28125,
"learning_rate": 6.495040151157299e-06,
"loss": 0.4529,
"step": 1376
},
{
"epoch": 0.33262461611150484,
"grad_norm": 6.375,
"learning_rate": 6.646197449220596e-06,
"loss": 0.4655,
"step": 1408
},
{
"epoch": 0.3401842664776754,
"grad_norm": 7.5625,
"learning_rate": 6.797354747283893e-06,
"loss": 0.4238,
"step": 1440
},
{
"epoch": 0.347743916843846,
"grad_norm": 10.25,
"learning_rate": 6.94851204534719e-06,
"loss": 0.4786,
"step": 1472
},
{
"epoch": 0.35530356721001655,
"grad_norm": 5.25,
"learning_rate": 7.099669343410487e-06,
"loss": 0.4443,
"step": 1504
},
{
"epoch": 0.3628632175761871,
"grad_norm": 14.0625,
"learning_rate": 7.250826641473784e-06,
"loss": 0.467,
"step": 1536
},
{
"epoch": 0.3704228679423577,
"grad_norm": 5.84375,
"learning_rate": 7.4019839395370815e-06,
"loss": 0.4281,
"step": 1568
},
{
"epoch": 0.37798251830852825,
"grad_norm": 7.4375,
"learning_rate": 7.553141237600379e-06,
"loss": 0.4198,
"step": 1600
},
{
"epoch": 0.3855421686746988,
"grad_norm": 4.78125,
"learning_rate": 7.704298535663676e-06,
"loss": 0.4646,
"step": 1632
},
{
"epoch": 0.3931018190408694,
"grad_norm": 7.09375,
"learning_rate": 7.855455833726973e-06,
"loss": 0.4396,
"step": 1664
},
{
"epoch": 0.40066146940703995,
"grad_norm": 7.65625,
"learning_rate": 8.006613131790269e-06,
"loss": 0.4395,
"step": 1696
},
{
"epoch": 0.40822111977321046,
"grad_norm": 10.1875,
"learning_rate": 8.157770429853567e-06,
"loss": 0.4247,
"step": 1728
},
{
"epoch": 0.41578077013938103,
"grad_norm": 7.53125,
"learning_rate": 8.308927727916864e-06,
"loss": 0.4591,
"step": 1760
},
{
"epoch": 0.4233404205055516,
"grad_norm": 7.25,
"learning_rate": 8.460085025980162e-06,
"loss": 0.451,
"step": 1792
},
{
"epoch": 0.43090007087172216,
"grad_norm": 7.21875,
"learning_rate": 8.611242324043458e-06,
"loss": 0.4388,
"step": 1824
},
{
"epoch": 0.43845972123789273,
"grad_norm": 6.09375,
"learning_rate": 8.762399622106755e-06,
"loss": 0.4619,
"step": 1856
},
{
"epoch": 0.4460193716040633,
"grad_norm": 7.6875,
"learning_rate": 8.913556920170053e-06,
"loss": 0.4335,
"step": 1888
},
{
"epoch": 0.45357902197023386,
"grad_norm": 7.0625,
"learning_rate": 9.064714218233351e-06,
"loss": 0.4103,
"step": 1920
},
{
"epoch": 0.46113867233640443,
"grad_norm": 5.75,
"learning_rate": 9.215871516296648e-06,
"loss": 0.4254,
"step": 1952
},
{
"epoch": 0.468698322702575,
"grad_norm": 6.53125,
"learning_rate": 9.367028814359944e-06,
"loss": 0.4907,
"step": 1984
},
{
"epoch": 0.47625797306874557,
"grad_norm": 7.15625,
"learning_rate": 9.51818611242324e-06,
"loss": 0.449,
"step": 2016
},
{
"epoch": 0.48381762343491613,
"grad_norm": 5.71875,
"learning_rate": 9.669343410486539e-06,
"loss": 0.4554,
"step": 2048
},
{
"epoch": 0.4913772738010867,
"grad_norm": 9.125,
"learning_rate": 9.820500708549835e-06,
"loss": 0.43,
"step": 2080
},
{
"epoch": 0.49893692416725727,
"grad_norm": 4.65625,
"learning_rate": 9.971658006613133e-06,
"loss": 0.4262,
"step": 2112
},
{
"epoch": 0.5064965745334278,
"grad_norm": 9.375,
"learning_rate": 9.999954028675169e-06,
"loss": 0.437,
"step": 2144
},
{
"epoch": 0.5140562248995983,
"grad_norm": 7.71875,
"learning_rate": 9.999771232848482e-06,
"loss": 0.4294,
"step": 2176
},
{
"epoch": 0.5216158752657689,
"grad_norm": 6.84375,
"learning_rate": 9.999449169431064e-06,
"loss": 0.4498,
"step": 2208
},
{
"epoch": 0.5291755256319395,
"grad_norm": 8.0625,
"learning_rate": 9.998987847393924e-06,
"loss": 0.4379,
"step": 2240
},
{
"epoch": 0.53673517599811,
"grad_norm": 5.6875,
"learning_rate": 9.998387279587092e-06,
"loss": 0.4315,
"step": 2272
},
{
"epoch": 0.5442948263642806,
"grad_norm": 8.5,
"learning_rate": 9.99764748273926e-06,
"loss": 0.4309,
"step": 2304
},
{
"epoch": 0.5518544767304512,
"grad_norm": 5.71875,
"learning_rate": 9.996768477457317e-06,
"loss": 0.4405,
"step": 2336
},
{
"epoch": 0.5594141270966217,
"grad_norm": 7.65625,
"learning_rate": 9.99575028822577e-06,
"loss": 0.4381,
"step": 2368
},
{
"epoch": 0.5669737774627923,
"grad_norm": 4.59375,
"learning_rate": 9.994592943406071e-06,
"loss": 0.3963,
"step": 2400
},
{
"epoch": 0.5745334278289629,
"grad_norm": 9.0,
"learning_rate": 9.993296475235821e-06,
"loss": 0.4307,
"step": 2432
},
{
"epoch": 0.5820930781951335,
"grad_norm": 6.15625,
"learning_rate": 9.991860919827869e-06,
"loss": 0.4298,
"step": 2464
},
{
"epoch": 0.589652728561304,
"grad_norm": 7.0625,
"learning_rate": 9.990286317169315e-06,
"loss": 0.4717,
"step": 2496
},
{
"epoch": 0.5972123789274746,
"grad_norm": 3.53125,
"learning_rate": 9.988572711120388e-06,
"loss": 0.4357,
"step": 2528
},
{
"epoch": 0.6047720292936452,
"grad_norm": 8.125,
"learning_rate": 9.986720149413232e-06,
"loss": 0.4583,
"step": 2560
},
{
"epoch": 0.6123316796598157,
"grad_norm": 6.6875,
"learning_rate": 9.984728683650566e-06,
"loss": 0.3911,
"step": 2592
},
{
"epoch": 0.6198913300259863,
"grad_norm": 6.8125,
"learning_rate": 9.982598369304259e-06,
"loss": 0.455,
"step": 2624
},
{
"epoch": 0.6274509803921569,
"grad_norm": 5.375,
"learning_rate": 9.980329265713772e-06,
"loss": 0.4316,
"step": 2656
},
{
"epoch": 0.6350106307583274,
"grad_norm": 6.59375,
"learning_rate": 9.977921436084517e-06,
"loss": 0.4341,
"step": 2688
},
{
"epoch": 0.642570281124498,
"grad_norm": 18.5,
"learning_rate": 9.975374947486086e-06,
"loss": 0.4523,
"step": 2720
},
{
"epoch": 0.6501299314906686,
"grad_norm": 4.59375,
"learning_rate": 9.972689870850396e-06,
"loss": 0.4188,
"step": 2752
},
{
"epoch": 0.6576895818568391,
"grad_norm": 5.75,
"learning_rate": 9.969866280969693e-06,
"loss": 0.4731,
"step": 2784
},
{
"epoch": 0.6652492322230097,
"grad_norm": 6.5,
"learning_rate": 9.966904256494494e-06,
"loss": 0.4347,
"step": 2816
},
{
"epoch": 0.6728088825891803,
"grad_norm": 5.125,
"learning_rate": 9.963803879931372e-06,
"loss": 0.4309,
"step": 2848
},
{
"epoch": 0.6803685329553508,
"grad_norm": 6.25,
"learning_rate": 9.960565237640679e-06,
"loss": 0.4262,
"step": 2880
},
{
"epoch": 0.6879281833215214,
"grad_norm": 7.03125,
"learning_rate": 9.957188419834115e-06,
"loss": 0.4044,
"step": 2912
},
{
"epoch": 0.695487833687692,
"grad_norm": 6.0625,
"learning_rate": 9.953673520572248e-06,
"loss": 0.418,
"step": 2944
},
{
"epoch": 0.7030474840538625,
"grad_norm": 3.953125,
"learning_rate": 9.950020637761863e-06,
"loss": 0.4171,
"step": 2976
},
{
"epoch": 0.7106071344200331,
"grad_norm": 7.3125,
"learning_rate": 9.946229873153257e-06,
"loss": 0.4638,
"step": 3008
},
{
"epoch": 0.7181667847862037,
"grad_norm": 7.375,
"learning_rate": 9.942301332337387e-06,
"loss": 0.4293,
"step": 3040
},
{
"epoch": 0.7257264351523742,
"grad_norm": 14.8125,
"learning_rate": 9.938235124742947e-06,
"loss": 0.4617,
"step": 3072
},
{
"epoch": 0.7332860855185448,
"grad_norm": 4.875,
"learning_rate": 9.934031363633306e-06,
"loss": 0.4218,
"step": 3104
},
{
"epoch": 0.7408457358847154,
"grad_norm": 6.875,
"learning_rate": 9.929690166103354e-06,
"loss": 0.4406,
"step": 3136
},
{
"epoch": 0.7484053862508859,
"grad_norm": 7.8125,
"learning_rate": 9.925211653076251e-06,
"loss": 0.4416,
"step": 3168
},
{
"epoch": 0.7559650366170565,
"grad_norm": 5.84375,
"learning_rate": 9.920595949300049e-06,
"loss": 0.454,
"step": 3200
},
{
"epoch": 0.7635246869832271,
"grad_norm": 7.5,
"learning_rate": 9.915843183344215e-06,
"loss": 0.4099,
"step": 3232
},
{
"epoch": 0.7710843373493976,
"grad_norm": 6.8125,
"learning_rate": 9.910953487596066e-06,
"loss": 0.3762,
"step": 3264
},
{
"epoch": 0.7786439877155682,
"grad_norm": 5.40625,
"learning_rate": 9.905926998257057e-06,
"loss": 0.424,
"step": 3296
},
{
"epoch": 0.7862036380817388,
"grad_norm": 7.78125,
"learning_rate": 9.900763855339009e-06,
"loss": 0.4663,
"step": 3328
},
{
"epoch": 0.7937632884479093,
"grad_norm": 5.5,
"learning_rate": 9.895464202660195e-06,
"loss": 0.4758,
"step": 3360
},
{
"epoch": 0.8013229388140799,
"grad_norm": 10.0625,
"learning_rate": 9.890028187841343e-06,
"loss": 0.4024,
"step": 3392
},
{
"epoch": 0.8088825891802505,
"grad_norm": 8.5625,
"learning_rate": 9.88445596230152e-06,
"loss": 0.4461,
"step": 3424
},
{
"epoch": 0.8164422395464209,
"grad_norm": 6.21875,
"learning_rate": 9.878747681253908e-06,
"loss": 0.4429,
"step": 3456
},
{
"epoch": 0.8240018899125915,
"grad_norm": 4.71875,
"learning_rate": 9.872903503701495e-06,
"loss": 0.4333,
"step": 3488
},
{
"epoch": 0.8315615402787621,
"grad_norm": 6.375,
"learning_rate": 9.866923592432633e-06,
"loss": 0.4168,
"step": 3520
},
{
"epoch": 0.8391211906449326,
"grad_norm": 23.25,
"learning_rate": 9.860808114016512e-06,
"loss": 0.4475,
"step": 3552
},
{
"epoch": 0.8466808410111032,
"grad_norm": 11.9375,
"learning_rate": 9.854557238798515e-06,
"loss": 0.4458,
"step": 3584
},
{
"epoch": 0.8542404913772738,
"grad_norm": 6.84375,
"learning_rate": 9.848171140895471e-06,
"loss": 0.4405,
"step": 3616
},
{
"epoch": 0.8618001417434443,
"grad_norm": 5.625,
"learning_rate": 9.841649998190818e-06,
"loss": 0.4059,
"step": 3648
},
{
"epoch": 0.8693597921096149,
"grad_norm": 4.34375,
"learning_rate": 9.834993992329629e-06,
"loss": 0.4179,
"step": 3680
},
{
"epoch": 0.8769194424757855,
"grad_norm": 7.34375,
"learning_rate": 9.828203308713568e-06,
"loss": 0.4298,
"step": 3712
},
{
"epoch": 0.884479092841956,
"grad_norm": 5.9375,
"learning_rate": 9.821278136495722e-06,
"loss": 0.4098,
"step": 3744
},
{
"epoch": 0.8920387432081266,
"grad_norm": 5.0625,
"learning_rate": 9.814218668575322e-06,
"loss": 0.3931,
"step": 3776
},
{
"epoch": 0.8995983935742972,
"grad_norm": 7.65625,
"learning_rate": 9.807025101592388e-06,
"loss": 0.4239,
"step": 3808
},
{
"epoch": 0.9071580439404677,
"grad_norm": 6.78125,
"learning_rate": 9.79969763592223e-06,
"loss": 0.4136,
"step": 3840
},
{
"epoch": 0.9147176943066383,
"grad_norm": 5.875,
"learning_rate": 9.792236475669889e-06,
"loss": 0.4549,
"step": 3872
},
{
"epoch": 0.9222773446728089,
"grad_norm": 7.75,
"learning_rate": 9.784641828664435e-06,
"loss": 0.4433,
"step": 3904
},
{
"epoch": 0.9298369950389794,
"grad_norm": 8.125,
"learning_rate": 9.776913906453184e-06,
"loss": 0.4139,
"step": 3936
},
{
"epoch": 0.93739664540515,
"grad_norm": 6.75,
"learning_rate": 9.76905292429581e-06,
"loss": 0.4514,
"step": 3968
},
{
"epoch": 0.9449562957713206,
"grad_norm": 5.65625,
"learning_rate": 9.76105910115834e-06,
"loss": 0.4389,
"step": 4000
},
{
"epoch": 0.9525159461374911,
"grad_norm": 6.875,
"learning_rate": 9.752932659707054e-06,
"loss": 0.4219,
"step": 4032
},
{
"epoch": 0.9600755965036617,
"grad_norm": 8.75,
"learning_rate": 9.7446738263023e-06,
"loss": 0.4214,
"step": 4064
},
{
"epoch": 0.9676352468698323,
"grad_norm": 5.1875,
"learning_rate": 9.736282830992165e-06,
"loss": 0.4179,
"step": 4096
},
{
"epoch": 0.9751948972360028,
"grad_norm": 5.90625,
"learning_rate": 9.727759907506085e-06,
"loss": 0.4156,
"step": 4128
},
{
"epoch": 0.9827545476021734,
"grad_norm": 7.3125,
"learning_rate": 9.719105293248327e-06,
"loss": 0.4292,
"step": 4160
},
{
"epoch": 0.990314197968344,
"grad_norm": 5.375,
"learning_rate": 9.710319229291374e-06,
"loss": 0.3681,
"step": 4192
},
{
"epoch": 0.9978738483345145,
"grad_norm": 5.5,
"learning_rate": 9.701401960369218e-06,
"loss": 0.4233,
"step": 4224
},
{
"epoch": 1.005433498700685,
"grad_norm": 6.59375,
"learning_rate": 9.692353734870532e-06,
"loss": 0.41,
"step": 4256
},
{
"epoch": 1.0129931490668556,
"grad_norm": 8.25,
"learning_rate": 9.683174804831763e-06,
"loss": 0.396,
"step": 4288
},
{
"epoch": 1.0205527994330261,
"grad_norm": 7.09375,
"learning_rate": 9.673865425930104e-06,
"loss": 0.4012,
"step": 4320
},
{
"epoch": 1.0281124497991967,
"grad_norm": 6.53125,
"learning_rate": 9.66442585747637e-06,
"loss": 0.348,
"step": 4352
},
{
"epoch": 1.0356721001653673,
"grad_norm": 6.75,
"learning_rate": 9.654856362407787e-06,
"loss": 0.4068,
"step": 4384
},
{
"epoch": 1.0432317505315378,
"grad_norm": 6.21875,
"learning_rate": 9.645157207280652e-06,
"loss": 0.3564,
"step": 4416
},
{
"epoch": 1.0507914008977084,
"grad_norm": 6.65625,
"learning_rate": 9.635328662262922e-06,
"loss": 0.4081,
"step": 4448
},
{
"epoch": 1.058351051263879,
"grad_norm": 6.84375,
"learning_rate": 9.625371001126678e-06,
"loss": 0.388,
"step": 4480
},
{
"epoch": 1.0659107016300495,
"grad_norm": 7.6875,
"learning_rate": 9.615284501240505e-06,
"loss": 0.3767,
"step": 4512
},
{
"epoch": 1.07347035199622,
"grad_norm": 9.4375,
"learning_rate": 9.605069443561768e-06,
"loss": 0.3913,
"step": 4544
},
{
"epoch": 1.0810300023623907,
"grad_norm": 9.25,
"learning_rate": 9.594726112628781e-06,
"loss": 0.4482,
"step": 4576
},
{
"epoch": 1.0885896527285612,
"grad_norm": 7.34375,
"learning_rate": 9.584254796552877e-06,
"loss": 0.3863,
"step": 4608
},
{
"epoch": 1.0961493030947318,
"grad_norm": 5.4375,
"learning_rate": 9.573655787010397e-06,
"loss": 0.3993,
"step": 4640
},
{
"epoch": 1.1037089534609024,
"grad_norm": 5.5,
"learning_rate": 9.562929379234554e-06,
"loss": 0.4405,
"step": 4672
},
{
"epoch": 1.111268603827073,
"grad_norm": 9.5,
"learning_rate": 9.55207587200721e-06,
"loss": 0.4459,
"step": 4704
},
{
"epoch": 1.1188282541932435,
"grad_norm": 6.25,
"learning_rate": 9.541095567650558e-06,
"loss": 0.4266,
"step": 4736
},
{
"epoch": 1.126387904559414,
"grad_norm": 7.3125,
"learning_rate": 9.529988772018699e-06,
"loss": 0.3956,
"step": 4768
},
{
"epoch": 1.1339475549255846,
"grad_norm": 4.46875,
"learning_rate": 9.518755794489123e-06,
"loss": 0.4003,
"step": 4800
},
{
"epoch": 1.1415072052917552,
"grad_norm": 4.5,
"learning_rate": 9.507396947954086e-06,
"loss": 0.3983,
"step": 4832
},
{
"epoch": 1.1490668556579258,
"grad_norm": 7.84375,
"learning_rate": 9.495912548811908e-06,
"loss": 0.4201,
"step": 4864
},
{
"epoch": 1.1566265060240963,
"grad_norm": 6.84375,
"learning_rate": 9.48430291695814e-06,
"loss": 0.4136,
"step": 4896
},
{
"epoch": 1.164186156390267,
"grad_norm": 5.15625,
"learning_rate": 9.472568375776669e-06,
"loss": 0.4123,
"step": 4928
},
{
"epoch": 1.1717458067564375,
"grad_norm": 4.96875,
"learning_rate": 9.46070925213071e-06,
"loss": 0.4157,
"step": 4960
},
{
"epoch": 1.179305457122608,
"grad_norm": 8.5,
"learning_rate": 9.448725876353692e-06,
"loss": 0.3889,
"step": 4992
},
{
"epoch": 1.1868651074887786,
"grad_norm": 5.34375,
"learning_rate": 9.43661858224006e-06,
"loss": 0.3872,
"step": 5024
},
{
"epoch": 1.1944247578549492,
"grad_norm": 4.84375,
"learning_rate": 9.42438770703598e-06,
"loss": 0.4089,
"step": 5056
},
{
"epoch": 1.2019844082211197,
"grad_norm": 5.96875,
"learning_rate": 9.412033591429947e-06,
"loss": 0.4128,
"step": 5088
},
{
"epoch": 1.2095440585872903,
"grad_norm": 6.5,
"learning_rate": 9.399556579543285e-06,
"loss": 0.4154,
"step": 5120
},
{
"epoch": 1.2171037089534609,
"grad_norm": 4.5625,
"learning_rate": 9.386957018920576e-06,
"loss": 0.3826,
"step": 5152
},
{
"epoch": 1.2246633593196314,
"grad_norm": 5.15625,
"learning_rate": 9.374235260519967e-06,
"loss": 0.3957,
"step": 5184
},
{
"epoch": 1.232223009685802,
"grad_norm": 6.1875,
"learning_rate": 9.361391658703396e-06,
"loss": 0.3757,
"step": 5216
},
{
"epoch": 1.2397826600519726,
"grad_norm": 5.75,
"learning_rate": 9.348426571226732e-06,
"loss": 0.4287,
"step": 5248
},
{
"epoch": 1.2473423104181431,
"grad_norm": 4.65625,
"learning_rate": 9.335340359229798e-06,
"loss": 0.4172,
"step": 5280
},
{
"epoch": 1.2549019607843137,
"grad_norm": 6.40625,
"learning_rate": 9.322133387226313e-06,
"loss": 0.3773,
"step": 5312
},
{
"epoch": 1.2624616111504843,
"grad_norm": 4.6875,
"learning_rate": 9.308806023093745e-06,
"loss": 0.4368,
"step": 5344
},
{
"epoch": 1.2700212615166548,
"grad_norm": 4.125,
"learning_rate": 9.295358638063054e-06,
"loss": 0.393,
"step": 5376
},
{
"epoch": 1.2775809118828254,
"grad_norm": 4.875,
"learning_rate": 9.281791606708365e-06,
"loss": 0.3973,
"step": 5408
},
{
"epoch": 1.285140562248996,
"grad_norm": 8.75,
"learning_rate": 9.268105306936521e-06,
"loss": 0.3701,
"step": 5440
},
{
"epoch": 1.2927002126151665,
"grad_norm": 6.0,
"learning_rate": 9.254300119976564e-06,
"loss": 0.4084,
"step": 5472
},
{
"epoch": 1.3002598629813371,
"grad_norm": 4.5625,
"learning_rate": 9.240376430369114e-06,
"loss": 0.3885,
"step": 5504
},
{
"epoch": 1.3078195133475077,
"grad_norm": 6.375,
"learning_rate": 9.226334625955655e-06,
"loss": 0.3864,
"step": 5536
},
{
"epoch": 1.3153791637136782,
"grad_norm": 8.3125,
"learning_rate": 9.212175097867738e-06,
"loss": 0.4363,
"step": 5568
},
{
"epoch": 1.3229388140798488,
"grad_norm": 6.15625,
"learning_rate": 9.197898240516083e-06,
"loss": 0.424,
"step": 5600
},
{
"epoch": 1.3304984644460194,
"grad_norm": 6.03125,
"learning_rate": 9.183504451579587e-06,
"loss": 0.4084,
"step": 5632
},
{
"epoch": 1.33805811481219,
"grad_norm": 6.28125,
"learning_rate": 9.168994131994257e-06,
"loss": 0.3426,
"step": 5664
},
{
"epoch": 1.3456177651783605,
"grad_norm": 6.0625,
"learning_rate": 9.154367685942039e-06,
"loss": 0.422,
"step": 5696
},
{
"epoch": 1.353177415544531,
"grad_norm": 4.3125,
"learning_rate": 9.139625520839548e-06,
"loss": 0.388,
"step": 5728
},
{
"epoch": 1.3607370659107016,
"grad_norm": 3.65625,
"learning_rate": 9.12476804732674e-06,
"loss": 0.4255,
"step": 5760
},
{
"epoch": 1.3682967162768722,
"grad_norm": 5.1875,
"learning_rate": 9.109795679255455e-06,
"loss": 0.3975,
"step": 5792
},
{
"epoch": 1.3758563666430428,
"grad_norm": 9.5,
"learning_rate": 9.094708833677904e-06,
"loss": 0.3914,
"step": 5824
},
{
"epoch": 1.3834160170092134,
"grad_norm": 5.28125,
"learning_rate": 9.079507930835039e-06,
"loss": 0.3943,
"step": 5856
},
{
"epoch": 1.390975667375384,
"grad_norm": 6.09375,
"learning_rate": 9.064193394144857e-06,
"loss": 0.3909,
"step": 5888
},
{
"epoch": 1.3985353177415545,
"grad_norm": 6.84375,
"learning_rate": 9.048765650190601e-06,
"loss": 0.3835,
"step": 5920
},
{
"epoch": 1.406094968107725,
"grad_norm": 4.59375,
"learning_rate": 9.033225128708877e-06,
"loss": 0.4175,
"step": 5952
},
{
"epoch": 1.4136546184738956,
"grad_norm": 6.65625,
"learning_rate": 9.017572262577691e-06,
"loss": 0.4212,
"step": 5984
},
{
"epoch": 1.4212142688400662,
"grad_norm": 5.4375,
"learning_rate": 9.001807487804384e-06,
"loss": 0.4079,
"step": 6016
},
{
"epoch": 1.4287739192062368,
"grad_norm": 6.40625,
"learning_rate": 8.985931243513481e-06,
"loss": 0.373,
"step": 6048
},
{
"epoch": 1.4363335695724073,
"grad_norm": 7.40625,
"learning_rate": 8.96994397193448e-06,
"loss": 0.4122,
"step": 6080
},
{
"epoch": 1.4438932199385779,
"grad_norm": 4.96875,
"learning_rate": 8.953846118389514e-06,
"loss": 0.4034,
"step": 6112
},
{
"epoch": 1.4514528703047485,
"grad_norm": 5.53125,
"learning_rate": 8.937638131280952e-06,
"loss": 0.4034,
"step": 6144
},
{
"epoch": 1.459012520670919,
"grad_norm": 5.625,
"learning_rate": 8.921320462078916e-06,
"loss": 0.3862,
"step": 6176
},
{
"epoch": 1.4665721710370896,
"grad_norm": 14.25,
"learning_rate": 8.904893565308697e-06,
"loss": 0.4192,
"step": 6208
},
{
"epoch": 1.4741318214032602,
"grad_norm": 6.9375,
"learning_rate": 8.888357898538095e-06,
"loss": 0.3923,
"step": 6240
},
{
"epoch": 1.4816914717694307,
"grad_norm": 9.25,
"learning_rate": 8.871713922364684e-06,
"loss": 0.4096,
"step": 6272
},
{
"epoch": 1.4892511221356013,
"grad_norm": 7.65625,
"learning_rate": 8.854962100402962e-06,
"loss": 0.3838,
"step": 6304
},
{
"epoch": 1.4968107725017719,
"grad_norm": 7.9375,
"learning_rate": 8.83810289927146e-06,
"loss": 0.4177,
"step": 6336
},
{
"epoch": 1.5043704228679422,
"grad_norm": 7.9375,
"learning_rate": 8.821136788579725e-06,
"loss": 0.3896,
"step": 6368
},
{
"epoch": 1.511930073234113,
"grad_norm": 7.0625,
"learning_rate": 8.804064240915253e-06,
"loss": 0.4424,
"step": 6400
},
{
"epoch": 1.5194897236002833,
"grad_norm": 6.21875,
"learning_rate": 8.786885731830322e-06,
"loss": 0.3924,
"step": 6432
},
{
"epoch": 1.5270493739664541,
"grad_norm": 8.625,
"learning_rate": 8.769601739828735e-06,
"loss": 0.4056,
"step": 6464
},
{
"epoch": 1.5346090243326245,
"grad_norm": 4.9375,
"learning_rate": 8.752212746352506e-06,
"loss": 0.3844,
"step": 6496
},
{
"epoch": 1.5421686746987953,
"grad_norm": 6.78125,
"learning_rate": 8.734719235768441e-06,
"loss": 0.3959,
"step": 6528
},
{
"epoch": 1.5497283250649656,
"grad_norm": 5.5,
"learning_rate": 8.717121695354651e-06,
"loss": 0.3885,
"step": 6560
},
{
"epoch": 1.5572879754311364,
"grad_norm": 4.8125,
"learning_rate": 8.699420615286974e-06,
"loss": 0.4049,
"step": 6592
},
{
"epoch": 1.5648476257973067,
"grad_norm": 6.21875,
"learning_rate": 8.681616488625323e-06,
"loss": 0.3783,
"step": 6624
},
{
"epoch": 1.5724072761634775,
"grad_norm": 6.90625,
"learning_rate": 8.663709811299954e-06,
"loss": 0.397,
"step": 6656
},
{
"epoch": 1.5799669265296479,
"grad_norm": 7.1875,
"learning_rate": 8.64570108209765e-06,
"loss": 0.415,
"step": 6688
},
{
"epoch": 1.5875265768958187,
"grad_norm": 5.4375,
"learning_rate": 8.627590802647829e-06,
"loss": 0.4219,
"step": 6720
},
{
"epoch": 1.595086227261989,
"grad_norm": 5.1875,
"learning_rate": 8.609379477408569e-06,
"loss": 0.3725,
"step": 6752
},
{
"epoch": 1.6026458776281598,
"grad_norm": 7.25,
"learning_rate": 8.591067613652552e-06,
"loss": 0.4042,
"step": 6784
},
{
"epoch": 1.6102055279943301,
"grad_norm": 5.5,
"learning_rate": 8.572655721452954e-06,
"loss": 0.4092,
"step": 6816
},
{
"epoch": 1.617765178360501,
"grad_norm": 7.4375,
"learning_rate": 8.554144313669208e-06,
"loss": 0.4191,
"step": 6848
},
{
"epoch": 1.6253248287266713,
"grad_norm": 7.0625,
"learning_rate": 8.535533905932739e-06,
"loss": 0.4391,
"step": 6880
},
{
"epoch": 1.632884479092842,
"grad_norm": 6.625,
"learning_rate": 8.516825016632594e-06,
"loss": 0.3896,
"step": 6912
},
{
"epoch": 1.6404441294590124,
"grad_norm": 5.28125,
"learning_rate": 8.498018166901008e-06,
"loss": 0.3774,
"step": 6944
},
{
"epoch": 1.6480037798251832,
"grad_norm": 6.1875,
"learning_rate": 8.479113880598875e-06,
"loss": 0.4245,
"step": 6976
},
{
"epoch": 1.6555634301913535,
"grad_norm": 5.5625,
"learning_rate": 8.460112684301172e-06,
"loss": 0.4239,
"step": 7008
},
{
"epoch": 1.6631230805575243,
"grad_norm": 6.71875,
"learning_rate": 8.441015107282281e-06,
"loss": 0.4013,
"step": 7040
},
{
"epoch": 1.6706827309236947,
"grad_norm": 3.65625,
"learning_rate": 8.421821681501248e-06,
"loss": 0.3926,
"step": 7072
},
{
"epoch": 1.6782423812898655,
"grad_norm": 5.40625,
"learning_rate": 8.402532941586968e-06,
"loss": 0.3848,
"step": 7104
},
{
"epoch": 1.6858020316560358,
"grad_norm": 7.125,
"learning_rate": 8.38314942482329e-06,
"loss": 0.3846,
"step": 7136
},
{
"epoch": 1.6933616820222066,
"grad_norm": 5.6875,
"learning_rate": 8.363671671134053e-06,
"loss": 0.4196,
"step": 7168
},
{
"epoch": 1.700921332388377,
"grad_norm": 4.03125,
"learning_rate": 8.344100223068048e-06,
"loss": 0.3903,
"step": 7200
},
{
"epoch": 1.7084809827545477,
"grad_norm": 5.78125,
"learning_rate": 8.324435625783901e-06,
"loss": 0.3984,
"step": 7232
},
{
"epoch": 1.716040633120718,
"grad_norm": 5.40625,
"learning_rate": 8.304678427034891e-06,
"loss": 0.4324,
"step": 7264
},
{
"epoch": 1.7236002834868889,
"grad_norm": 7.09375,
"learning_rate": 8.28482917715369e-06,
"loss": 0.3662,
"step": 7296
},
{
"epoch": 1.7311599338530592,
"grad_norm": 7.0,
"learning_rate": 8.26488842903704e-06,
"loss": 0.3834,
"step": 7328
},
{
"epoch": 1.73871958421923,
"grad_norm": 7.25,
"learning_rate": 8.244856738130339e-06,
"loss": 0.4279,
"step": 7360
},
{
"epoch": 1.7462792345854004,
"grad_norm": 4.5625,
"learning_rate": 8.224734662412187e-06,
"loss": 0.405,
"step": 7392
},
{
"epoch": 1.7538388849515711,
"grad_norm": 7.1875,
"learning_rate": 8.204522762378829e-06,
"loss": 0.4181,
"step": 7424
},
{
"epoch": 1.7613985353177415,
"grad_norm": 7.15625,
"learning_rate": 8.184221601028546e-06,
"loss": 0.4179,
"step": 7456
},
{
"epoch": 1.7689581856839123,
"grad_norm": 6.3125,
"learning_rate": 8.16383174384598e-06,
"loss": 0.3963,
"step": 7488
},
{
"epoch": 1.7765178360500826,
"grad_norm": 6.28125,
"learning_rate": 8.143353758786372e-06,
"loss": 0.4346,
"step": 7520
},
{
"epoch": 1.7840774864162534,
"grad_norm": 6.0625,
"learning_rate": 8.12278821625975e-06,
"loss": 0.3891,
"step": 7552
},
{
"epoch": 1.7916371367824238,
"grad_norm": 7.21875,
"learning_rate": 8.102135689115036e-06,
"loss": 0.3684,
"step": 7584
},
{
"epoch": 1.7991967871485943,
"grad_norm": 5.84375,
"learning_rate": 8.081396752624087e-06,
"loss": 0.3718,
"step": 7616
},
{
"epoch": 1.806756437514765,
"grad_norm": 7.90625,
"learning_rate": 8.060571984465679e-06,
"loss": 0.4179,
"step": 7648
},
{
"epoch": 1.8143160878809355,
"grad_norm": 11.875,
"learning_rate": 8.039661964709414e-06,
"loss": 0.4095,
"step": 7680
},
{
"epoch": 1.821875738247106,
"grad_norm": 15.0,
"learning_rate": 8.018667275799552e-06,
"loss": 0.3964,
"step": 7712
},
{
"epoch": 1.8294353886132766,
"grad_norm": 6.34375,
"learning_rate": 7.997588502538796e-06,
"loss": 0.3525,
"step": 7744
},
{
"epoch": 1.8369950389794472,
"grad_norm": 6.3125,
"learning_rate": 7.976426232072008e-06,
"loss": 0.3667,
"step": 7776
},
{
"epoch": 1.8445546893456177,
"grad_norm": 6.6875,
"learning_rate": 7.955181053869841e-06,
"loss": 0.3845,
"step": 7808
},
{
"epoch": 1.8521143397117883,
"grad_norm": 6.59375,
"learning_rate": 7.933853559712328e-06,
"loss": 0.416,
"step": 7840
},
{
"epoch": 1.8596739900779589,
"grad_norm": 5.40625,
"learning_rate": 7.912444343672395e-06,
"loss": 0.3795,
"step": 7872
},
{
"epoch": 1.8672336404441294,
"grad_norm": 5.96875,
"learning_rate": 7.890954002099312e-06,
"loss": 0.4025,
"step": 7904
},
{
"epoch": 1.8747932908103,
"grad_norm": 6.5,
"learning_rate": 7.869383133602091e-06,
"loss": 0.4047,
"step": 7936
},
{
"epoch": 1.8823529411764706,
"grad_norm": 5.875,
"learning_rate": 7.847732339032796e-06,
"loss": 0.4168,
"step": 7968
},
{
"epoch": 1.8899125915426411,
"grad_norm": 7.46875,
"learning_rate": 7.826002221469822e-06,
"loss": 0.374,
"step": 8000
},
{
"epoch": 1.8974722419088117,
"grad_norm": 6.0625,
"learning_rate": 7.804193386201086e-06,
"loss": 0.3925,
"step": 8032
},
{
"epoch": 1.9050318922749823,
"grad_norm": 5.21875,
"learning_rate": 7.782306440707173e-06,
"loss": 0.4083,
"step": 8064
},
{
"epoch": 1.9125915426411528,
"grad_norm": 5.0,
"learning_rate": 7.760341994644406e-06,
"loss": 0.3894,
"step": 8096
},
{
"epoch": 1.9201511930073234,
"grad_norm": 5.34375,
"learning_rate": 7.738300659827878e-06,
"loss": 0.3491,
"step": 8128
},
{
"epoch": 1.927710843373494,
"grad_norm": 5.8125,
"learning_rate": 7.7161830502144e-06,
"loss": 0.4238,
"step": 8160
},
{
"epoch": 1.9352704937396645,
"grad_norm": 4.84375,
"learning_rate": 7.693989781885396e-06,
"loss": 0.4313,
"step": 8192
},
{
"epoch": 1.942830144105835,
"grad_norm": 6.25,
"learning_rate": 7.671721473029756e-06,
"loss": 0.4254,
"step": 8224
},
{
"epoch": 1.9503897944720057,
"grad_norm": 9.0,
"learning_rate": 7.649378743926603e-06,
"loss": 0.4327,
"step": 8256
},
{
"epoch": 1.9579494448381762,
"grad_norm": 6.21875,
"learning_rate": 7.626962216928025e-06,
"loss": 0.4143,
"step": 8288
},
{
"epoch": 1.9655090952043468,
"grad_norm": 9.1875,
"learning_rate": 7.60447251644173e-06,
"loss": 0.417,
"step": 8320
},
{
"epoch": 1.9730687455705174,
"grad_norm": 8.1875,
"learning_rate": 7.58191026891366e-06,
"loss": 0.399,
"step": 8352
},
{
"epoch": 1.980628395936688,
"grad_norm": 5.90625,
"learning_rate": 7.559276102810541e-06,
"loss": 0.3637,
"step": 8384
},
{
"epoch": 1.9881880463028585,
"grad_norm": 7.28125,
"learning_rate": 7.536570648602377e-06,
"loss": 0.3907,
"step": 8416
},
{
"epoch": 1.995747696669029,
"grad_norm": 3.78125,
"learning_rate": 7.513794538744885e-06,
"loss": 0.3775,
"step": 8448
},
{
"epoch": 2.0033073470351996,
"grad_norm": 6.0625,
"learning_rate": 7.49094840766188e-06,
"loss": 0.3695,
"step": 8480
},
{
"epoch": 2.01086699740137,
"grad_norm": 4.0625,
"learning_rate": 7.468032891727606e-06,
"loss": 0.3548,
"step": 8512
},
{
"epoch": 2.0184266477675408,
"grad_norm": 4.8125,
"learning_rate": 7.445048629249007e-06,
"loss": 0.3596,
"step": 8544
},
{
"epoch": 2.025986298133711,
"grad_norm": 5.21875,
"learning_rate": 7.421996260447948e-06,
"loss": 0.3741,
"step": 8576
},
{
"epoch": 2.033545948499882,
"grad_norm": 4.625,
"learning_rate": 7.398876427443379e-06,
"loss": 0.4047,
"step": 8608
},
{
"epoch": 2.0411055988660523,
"grad_norm": 5.34375,
"learning_rate": 7.375689774233453e-06,
"loss": 0.3667,
"step": 8640
},
{
"epoch": 2.048665249232223,
"grad_norm": 6.40625,
"learning_rate": 7.352436946677589e-06,
"loss": 0.3425,
"step": 8672
},
{
"epoch": 2.0562248995983934,
"grad_norm": 8.25,
"learning_rate": 7.329118592478473e-06,
"loss": 0.3651,
"step": 8704
},
{
"epoch": 2.063784549964564,
"grad_norm": 4.1875,
"learning_rate": 7.305735361164028e-06,
"loss": 0.3707,
"step": 8736
},
{
"epoch": 2.0713442003307345,
"grad_norm": 9.375,
"learning_rate": 7.282287904069308e-06,
"loss": 0.406,
"step": 8768
},
{
"epoch": 2.0789038506969053,
"grad_norm": 4.71875,
"learning_rate": 7.258776874318371e-06,
"loss": 0.3535,
"step": 8800
},
{
"epoch": 2.0864635010630757,
"grad_norm": 9.3125,
"learning_rate": 7.235202926806075e-06,
"loss": 0.3671,
"step": 8832
},
{
"epoch": 2.0940231514292464,
"grad_norm": 7.53125,
"learning_rate": 7.211566718179837e-06,
"loss": 0.372,
"step": 8864
},
{
"epoch": 2.101582801795417,
"grad_norm": 5.625,
"learning_rate": 7.1878689068213466e-06,
"loss": 0.3503,
"step": 8896
},
{
"epoch": 2.1091424521615876,
"grad_norm": 7.59375,
"learning_rate": 7.164110152828223e-06,
"loss": 0.3796,
"step": 8928
},
{
"epoch": 2.116702102527758,
"grad_norm": 4.9375,
"learning_rate": 7.140291117995632e-06,
"loss": 0.3778,
"step": 8960
},
{
"epoch": 2.1242617528939287,
"grad_norm": 5.78125,
"learning_rate": 7.116412465797849e-06,
"loss": 0.3986,
"step": 8992
},
{
"epoch": 2.131821403260099,
"grad_norm": 10.3125,
"learning_rate": 7.092474861369778e-06,
"loss": 0.3781,
"step": 9024
},
{
"epoch": 2.13938105362627,
"grad_norm": 5.71875,
"learning_rate": 7.068478971488427e-06,
"loss": 0.3316,
"step": 9056
},
{
"epoch": 2.14694070399244,
"grad_norm": 6.96875,
"learning_rate": 7.04442546455433e-06,
"loss": 0.3911,
"step": 9088
},
{
"epoch": 2.154500354358611,
"grad_norm": 4.03125,
"learning_rate": 7.020315010572936e-06,
"loss": 0.3651,
"step": 9120
},
{
"epoch": 2.1620600047247813,
"grad_norm": 5.125,
"learning_rate": 6.996148281135936e-06,
"loss": 0.4247,
"step": 9152
},
{
"epoch": 2.169619655090952,
"grad_norm": 6.71875,
"learning_rate": 6.971925949402571e-06,
"loss": 0.3893,
"step": 9184
},
{
"epoch": 2.1771793054571225,
"grad_norm": 7.5,
"learning_rate": 6.947648690080866e-06,
"loss": 0.3916,
"step": 9216
},
{
"epoch": 2.1847389558232932,
"grad_norm": 5.5,
"learning_rate": 6.923317179408844e-06,
"loss": 0.3539,
"step": 9248
},
{
"epoch": 2.1922986061894636,
"grad_norm": 5.84375,
"learning_rate": 6.898932095135686e-06,
"loss": 0.3643,
"step": 9280
},
{
"epoch": 2.1998582565556344,
"grad_norm": 6.90625,
"learning_rate": 6.8744941165028625e-06,
"loss": 0.3667,
"step": 9312
},
{
"epoch": 2.2074179069218047,
"grad_norm": 4.65625,
"learning_rate": 6.850003924225196e-06,
"loss": 0.3679,
"step": 9344
},
{
"epoch": 2.2149775572879755,
"grad_norm": 6.1875,
"learning_rate": 6.825462200471913e-06,
"loss": 0.3646,
"step": 9376
},
{
"epoch": 2.222537207654146,
"grad_norm": 6.4375,
"learning_rate": 6.800869628847639e-06,
"loss": 0.3672,
"step": 9408
},
{
"epoch": 2.2300968580203167,
"grad_norm": 8.6875,
"learning_rate": 6.776226894373358e-06,
"loss": 0.3661,
"step": 9440
},
{
"epoch": 2.237656508386487,
"grad_norm": 5.59375,
"learning_rate": 6.751534683467326e-06,
"loss": 0.3592,
"step": 9472
},
{
"epoch": 2.245216158752658,
"grad_norm": 3.828125,
"learning_rate": 6.726793683925956e-06,
"loss": 0.3756,
"step": 9504
},
{
"epoch": 2.252775809118828,
"grad_norm": 6.53125,
"learning_rate": 6.70200458490466e-06,
"loss": 0.3646,
"step": 9536
},
{
"epoch": 2.260335459484999,
"grad_norm": 7.96875,
"learning_rate": 6.67716807689865e-06,
"loss": 0.3805,
"step": 9568
},
{
"epoch": 2.2678951098511693,
"grad_norm": 6.0,
"learning_rate": 6.652284851723706e-06,
"loss": 0.3929,
"step": 9600
},
{
"epoch": 2.27545476021734,
"grad_norm": 6.6875,
"learning_rate": 6.627355602496903e-06,
"loss": 0.3732,
"step": 9632
},
{
"epoch": 2.2830144105835104,
"grad_norm": 6.15625,
"learning_rate": 6.602381023617308e-06,
"loss": 0.392,
"step": 9664
},
{
"epoch": 2.290574060949681,
"grad_norm": 6.78125,
"learning_rate": 6.577361810746638e-06,
"loss": 0.3814,
"step": 9696
},
{
"epoch": 2.2981337113158515,
"grad_norm": 6.375,
"learning_rate": 6.552298660789875e-06,
"loss": 0.4029,
"step": 9728
},
{
"epoch": 2.3056933616820223,
"grad_norm": 4.96875,
"learning_rate": 6.5271922718758655e-06,
"loss": 0.3594,
"step": 9760
},
{
"epoch": 2.3132530120481927,
"grad_norm": 4.75,
"learning_rate": 6.502043343337864e-06,
"loss": 0.3599,
"step": 9792
},
{
"epoch": 2.3208126624143635,
"grad_norm": 7.875,
"learning_rate": 6.476852575694061e-06,
"loss": 0.4065,
"step": 9824
},
{
"epoch": 2.328372312780534,
"grad_norm": 6.0625,
"learning_rate": 6.451620670628062e-06,
"loss": 0.4125,
"step": 9856
},
{
"epoch": 2.3359319631467046,
"grad_norm": 4.9375,
"learning_rate": 6.426348330969353e-06,
"loss": 0.3699,
"step": 9888
},
{
"epoch": 2.343491613512875,
"grad_norm": 6.1875,
"learning_rate": 6.40103626067371e-06,
"loss": 0.3839,
"step": 9920
},
{
"epoch": 2.3510512638790457,
"grad_norm": 4.65625,
"learning_rate": 6.375685164803604e-06,
"loss": 0.4004,
"step": 9952
},
{
"epoch": 2.358610914245216,
"grad_norm": 12.0625,
"learning_rate": 6.350295749508551e-06,
"loss": 0.3754,
"step": 9984
},
{
"epoch": 2.366170564611387,
"grad_norm": 5.40625,
"learning_rate": 6.324868722005448e-06,
"loss": 0.4067,
"step": 10016
},
{
"epoch": 2.373730214977557,
"grad_norm": 9.0,
"learning_rate": 6.299404790558874e-06,
"loss": 0.3891,
"step": 10048
},
{
"epoch": 2.381289865343728,
"grad_norm": 6.15625,
"learning_rate": 6.273904664461358e-06,
"loss": 0.4121,
"step": 10080
},
{
"epoch": 2.3888495157098983,
"grad_norm": 6.0625,
"learning_rate": 6.248369054013622e-06,
"loss": 0.385,
"step": 10112
},
{
"epoch": 2.396409166076069,
"grad_norm": 5.96875,
"learning_rate": 6.2227986705048016e-06,
"loss": 0.3822,
"step": 10144
},
{
"epoch": 2.4039688164422395,
"grad_norm": 4.25,
"learning_rate": 6.1971942261926235e-06,
"loss": 0.3776,
"step": 10176
},
{
"epoch": 2.4115284668084103,
"grad_norm": 4.65625,
"learning_rate": 6.171556434283574e-06,
"loss": 0.3936,
"step": 10208
},
{
"epoch": 2.4190881171745806,
"grad_norm": 4.25,
"learning_rate": 6.145886008913031e-06,
"loss": 0.3732,
"step": 10240
},
{
"epoch": 2.4266477675407514,
"grad_norm": 4.0,
"learning_rate": 6.120183665125369e-06,
"loss": 0.3761,
"step": 10272
},
{
"epoch": 2.4342074179069217,
"grad_norm": 7.40625,
"learning_rate": 6.094450118854042e-06,
"loss": 0.3833,
"step": 10304
},
{
"epoch": 2.4417670682730925,
"grad_norm": 4.5625,
"learning_rate": 6.068686086901644e-06,
"loss": 0.4269,
"step": 10336
},
{
"epoch": 2.449326718639263,
"grad_norm": 6.28125,
"learning_rate": 6.042892286919943e-06,
"loss": 0.3946,
"step": 10368
},
{
"epoch": 2.4568863690054337,
"grad_norm": 4.1875,
"learning_rate": 6.017069437389889e-06,
"loss": 0.4035,
"step": 10400
},
{
"epoch": 2.464446019371604,
"grad_norm": 5.875,
"learning_rate": 5.991218257601597e-06,
"loss": 0.372,
"step": 10432
},
{
"epoch": 2.472005669737775,
"grad_norm": 5.75,
"learning_rate": 5.965339467634319e-06,
"loss": 0.3695,
"step": 10464
},
{
"epoch": 2.479565320103945,
"grad_norm": 6.75,
"learning_rate": 5.939433788336384e-06,
"loss": 0.3745,
"step": 10496
},
{
"epoch": 2.487124970470116,
"grad_norm": 5.65625,
"learning_rate": 5.913501941305114e-06,
"loss": 0.3821,
"step": 10528
},
{
"epoch": 2.4946846208362863,
"grad_norm": 6.4375,
"learning_rate": 5.88754464886673e-06,
"loss": 0.3796,
"step": 10560
},
{
"epoch": 2.502244271202457,
"grad_norm": 5.125,
"learning_rate": 5.861562634056232e-06,
"loss": 0.337,
"step": 10592
},
{
"epoch": 2.5098039215686274,
"grad_norm": 14.6875,
"learning_rate": 5.83555662059725e-06,
"loss": 0.361,
"step": 10624
},
{
"epoch": 2.5173635719347978,
"grad_norm": 4.25,
"learning_rate": 5.8095273328818965e-06,
"loss": 0.3816,
"step": 10656
},
{
"epoch": 2.5249232223009686,
"grad_norm": 6.3125,
"learning_rate": 5.783475495950583e-06,
"loss": 0.3682,
"step": 10688
},
{
"epoch": 2.5324828726671393,
"grad_norm": 6.125,
"learning_rate": 5.7574018354718254e-06,
"loss": 0.41,
"step": 10720
},
{
"epoch": 2.5400425230333097,
"grad_norm": 4.5625,
"learning_rate": 5.731307077722026e-06,
"loss": 0.3869,
"step": 10752
},
{
"epoch": 2.54760217339948,
"grad_norm": 5.21875,
"learning_rate": 5.705191949565252e-06,
"loss": 0.3776,
"step": 10784
},
{
"epoch": 2.555161823765651,
"grad_norm": 5.625,
"learning_rate": 5.6790571784329785e-06,
"loss": 0.3676,
"step": 10816
},
{
"epoch": 2.5627214741318216,
"grad_norm": 4.125,
"learning_rate": 5.6529034923038384e-06,
"loss": 0.3715,
"step": 10848
},
{
"epoch": 2.570281124497992,
"grad_norm": 5.71875,
"learning_rate": 5.626731619683333e-06,
"loss": 0.4199,
"step": 10880
},
{
"epoch": 2.5778407748641623,
"grad_norm": 5.65625,
"learning_rate": 5.600542289583547e-06,
"loss": 0.3604,
"step": 10912
},
{
"epoch": 2.585400425230333,
"grad_norm": 11.125,
"learning_rate": 5.574336231502837e-06,
"loss": 0.381,
"step": 10944
},
{
"epoch": 2.592960075596504,
"grad_norm": 7.40625,
"learning_rate": 5.548114175405518e-06,
"loss": 0.4102,
"step": 10976
},
{
"epoch": 2.6005197259626742,
"grad_norm": 5.71875,
"learning_rate": 5.521876851701522e-06,
"loss": 0.3902,
"step": 11008
},
{
"epoch": 2.6080793763288446,
"grad_norm": 7.6875,
"learning_rate": 5.49562499122606e-06,
"loss": 0.3992,
"step": 11040
},
{
"epoch": 2.6156390266950154,
"grad_norm": 8.6875,
"learning_rate": 5.46935932521926e-06,
"loss": 0.3845,
"step": 11072
},
{
"epoch": 2.623198677061186,
"grad_norm": 5.5,
"learning_rate": 5.443080585305802e-06,
"loss": 0.4265,
"step": 11104
},
{
"epoch": 2.6307583274273565,
"grad_norm": 10.875,
"learning_rate": 5.416789503474538e-06,
"loss": 0.3552,
"step": 11136
},
{
"epoch": 2.638317977793527,
"grad_norm": 5.40625,
"learning_rate": 5.390486812058096e-06,
"loss": 0.3974,
"step": 11168
},
{
"epoch": 2.6458776281596976,
"grad_norm": 5.75,
"learning_rate": 5.364173243712492e-06,
"loss": 0.3389,
"step": 11200
},
{
"epoch": 2.6534372785258684,
"grad_norm": 5.75,
"learning_rate": 5.337849531396714e-06,
"loss": 0.3822,
"step": 11232
},
{
"epoch": 2.6609969288920388,
"grad_norm": 7.25,
"learning_rate": 5.31151640835231e-06,
"loss": 0.3903,
"step": 11264
},
{
"epoch": 2.668556579258209,
"grad_norm": 4.375,
"learning_rate": 5.28517460808296e-06,
"loss": 0.3687,
"step": 11296
},
{
"epoch": 2.67611622962438,
"grad_norm": 5.75,
"learning_rate": 5.258824864334047e-06,
"loss": 0.3677,
"step": 11328
},
{
"epoch": 2.6836758799905507,
"grad_norm": 6.0,
"learning_rate": 5.2324679110722185e-06,
"loss": 0.3796,
"step": 11360
},
{
"epoch": 2.691235530356721,
"grad_norm": 7.125,
"learning_rate": 5.206104482464942e-06,
"loss": 0.3653,
"step": 11392
},
{
"epoch": 2.6987951807228914,
"grad_norm": 5.125,
"learning_rate": 5.179735312860053e-06,
"loss": 0.3673,
"step": 11424
},
{
"epoch": 2.706354831089062,
"grad_norm": 7.8125,
"learning_rate": 5.153361136765301e-06,
"loss": 0.3779,
"step": 11456
},
{
"epoch": 2.713914481455233,
"grad_norm": 7.40625,
"learning_rate": 5.126982688827892e-06,
"loss": 0.3924,
"step": 11488
},
{
"epoch": 2.7214741318214033,
"grad_norm": 6.4375,
"learning_rate": 5.100600703814021e-06,
"loss": 0.4042,
"step": 11520
},
{
"epoch": 2.7290337821875736,
"grad_norm": 8.6875,
"learning_rate": 5.074215916588412e-06,
"loss": 0.3592,
"step": 11552
},
{
"epoch": 2.7365934325537444,
"grad_norm": 4.375,
"learning_rate": 5.0478290620938385e-06,
"loss": 0.3844,
"step": 11584
},
{
"epoch": 2.744153082919915,
"grad_norm": 6.5625,
"learning_rate": 5.021440875330659e-06,
"loss": 0.3744,
"step": 11616
},
{
"epoch": 2.7517127332860856,
"grad_norm": 4.96875,
"learning_rate": 4.995052091336344e-06,
"loss": 0.3721,
"step": 11648
},
{
"epoch": 2.759272383652256,
"grad_norm": 5.96875,
"learning_rate": 4.968663445164999e-06,
"loss": 0.4132,
"step": 11680
},
{
"epoch": 2.7668320340184267,
"grad_norm": 7.03125,
"learning_rate": 4.942275671866891e-06,
"loss": 0.3718,
"step": 11712
},
{
"epoch": 2.7743916843845975,
"grad_norm": 5.78125,
"learning_rate": 4.915889506467969e-06,
"loss": 0.3718,
"step": 11744
},
{
"epoch": 2.781951334750768,
"grad_norm": 6.03125,
"learning_rate": 4.889505683949403e-06,
"loss": 0.3687,
"step": 11776
},
{
"epoch": 2.789510985116938,
"grad_norm": 5.25,
"learning_rate": 4.86312493922709e-06,
"loss": 0.4039,
"step": 11808
},
{
"epoch": 2.797070635483109,
"grad_norm": 5.21875,
"learning_rate": 4.836748007131208e-06,
"loss": 0.3829,
"step": 11840
},
{
"epoch": 2.8046302858492793,
"grad_norm": 3.890625,
"learning_rate": 4.81037562238573e-06,
"loss": 0.3893,
"step": 11872
},
{
"epoch": 2.81218993621545,
"grad_norm": 5.46875,
"learning_rate": 4.784008519587961e-06,
"loss": 0.3847,
"step": 11904
},
{
"epoch": 2.8197495865816204,
"grad_norm": 8.0625,
"learning_rate": 4.7576474331880815e-06,
"loss": 0.3949,
"step": 11936
},
{
"epoch": 2.8273092369477912,
"grad_norm": 7.71875,
"learning_rate": 4.731293097468688e-06,
"loss": 0.3903,
"step": 11968
},
{
"epoch": 2.8348688873139616,
"grad_norm": 5.4375,
"learning_rate": 4.704946246524333e-06,
"loss": 0.367,
"step": 12000
},
{
"epoch": 2.8424285376801324,
"grad_norm": 5.03125,
"learning_rate": 4.678607614241086e-06,
"loss": 0.3865,
"step": 12032
},
{
"epoch": 2.8499881880463027,
"grad_norm": 5.3125,
"learning_rate": 4.652277934276088e-06,
"loss": 0.4144,
"step": 12064
},
{
"epoch": 2.8575478384124735,
"grad_norm": 10.125,
"learning_rate": 4.625957940037112e-06,
"loss": 0.3697,
"step": 12096
},
{
"epoch": 2.865107488778644,
"grad_norm": 7.375,
"learning_rate": 4.5996483646621406e-06,
"loss": 0.3794,
"step": 12128
},
{
"epoch": 2.8726671391448146,
"grad_norm": 4.5625,
"learning_rate": 4.573349940998937e-06,
"loss": 0.3742,
"step": 12160
},
{
"epoch": 2.880226789510985,
"grad_norm": 5.5,
"learning_rate": 4.547063401584638e-06,
"loss": 0.3967,
"step": 12192
},
{
"epoch": 2.8877864398771558,
"grad_norm": 4.28125,
"learning_rate": 4.52078947862535e-06,
"loss": 0.363,
"step": 12224
},
{
"epoch": 2.895346090243326,
"grad_norm": 6.09375,
"learning_rate": 4.494528903975744e-06,
"loss": 0.3876,
"step": 12256
},
{
"epoch": 2.902905740609497,
"grad_norm": 5.5625,
"learning_rate": 4.4682824091186855e-06,
"loss": 0.4017,
"step": 12288
},
{
"epoch": 2.9104653909756673,
"grad_norm": 7.71875,
"learning_rate": 4.4420507251448385e-06,
"loss": 0.3884,
"step": 12320
},
{
"epoch": 2.918025041341838,
"grad_norm": 3.953125,
"learning_rate": 4.415834582732324e-06,
"loss": 0.3687,
"step": 12352
},
{
"epoch": 2.9255846917080084,
"grad_norm": 5.4375,
"learning_rate": 4.389634712126353e-06,
"loss": 0.3445,
"step": 12384
},
{
"epoch": 2.933144342074179,
"grad_norm": 4.875,
"learning_rate": 4.3634518431188825e-06,
"loss": 0.4402,
"step": 12416
},
{
"epoch": 2.9407039924403495,
"grad_norm": 4.875,
"learning_rate": 4.3372867050283005e-06,
"loss": 0.4026,
"step": 12448
},
{
"epoch": 2.9482636428065203,
"grad_norm": 6.0,
"learning_rate": 4.311140026679104e-06,
"loss": 0.3846,
"step": 12480
},
{
"epoch": 2.9558232931726907,
"grad_norm": 5.40625,
"learning_rate": 4.285012536381593e-06,
"loss": 0.4215,
"step": 12512
},
{
"epoch": 2.9633829435388614,
"grad_norm": 6.1875,
"learning_rate": 4.258904961911593e-06,
"loss": 0.3835,
"step": 12544
},
{
"epoch": 2.970942593905032,
"grad_norm": 5.78125,
"learning_rate": 4.232818030490172e-06,
"loss": 0.381,
"step": 12576
},
{
"epoch": 2.9785022442712026,
"grad_norm": 4.46875,
"learning_rate": 4.206752468763398e-06,
"loss": 0.4174,
"step": 12608
},
{
"epoch": 2.986061894637373,
"grad_norm": 5.375,
"learning_rate": 4.1807090027820874e-06,
"loss": 0.3842,
"step": 12640
},
{
"epoch": 2.9936215450035437,
"grad_norm": 10.4375,
"learning_rate": 4.15468835798158e-06,
"loss": 0.3946,
"step": 12672
},
{
"epoch": 3.001181195369714,
"grad_norm": 6.625,
"learning_rate": 4.128691259161543e-06,
"loss": 0.377,
"step": 12704
},
{
"epoch": 3.008740845735885,
"grad_norm": 6.90625,
"learning_rate": 4.102718430465772e-06,
"loss": 0.3613,
"step": 12736
},
{
"epoch": 3.016300496102055,
"grad_norm": 4.65625,
"learning_rate": 4.0767705953620226e-06,
"loss": 0.3953,
"step": 12768
},
{
"epoch": 3.023860146468226,
"grad_norm": 4.53125,
"learning_rate": 4.050848476621861e-06,
"loss": 0.3763,
"step": 12800
},
{
"epoch": 3.0314197968343963,
"grad_norm": 7.9375,
"learning_rate": 4.024952796300526e-06,
"loss": 0.3671,
"step": 12832
},
{
"epoch": 3.038979447200567,
"grad_norm": 3.765625,
"learning_rate": 3.999084275716824e-06,
"loss": 0.3639,
"step": 12864
},
{
"epoch": 3.0465390975667375,
"grad_norm": 4.59375,
"learning_rate": 3.973243635433033e-06,
"loss": 0.3926,
"step": 12896
},
{
"epoch": 3.0540987479329083,
"grad_norm": 5.40625,
"learning_rate": 3.947431595234823e-06,
"loss": 0.3694,
"step": 12928
},
{
"epoch": 3.0616583982990786,
"grad_norm": 4.8125,
"learning_rate": 3.921648874111224e-06,
"loss": 0.3668,
"step": 12960
},
{
"epoch": 3.0692180486652494,
"grad_norm": 7.375,
"learning_rate": 3.895896190234587e-06,
"loss": 0.3968,
"step": 12992
},
{
"epoch": 3.0767776990314197,
"grad_norm": 5.5625,
"learning_rate": 3.870174260940576e-06,
"loss": 0.3777,
"step": 13024
},
{
"epoch": 3.0843373493975905,
"grad_norm": 6.28125,
"learning_rate": 3.844483802708201e-06,
"loss": 0.3611,
"step": 13056
},
{
"epoch": 3.091896999763761,
"grad_norm": 7.0,
"learning_rate": 3.818825531139844e-06,
"loss": 0.3831,
"step": 13088
},
{
"epoch": 3.0994566501299317,
"grad_norm": 3.21875,
"learning_rate": 3.7932001609413387e-06,
"loss": 0.3749,
"step": 13120
},
{
"epoch": 3.107016300496102,
"grad_norm": 5.34375,
"learning_rate": 3.7676084059020613e-06,
"loss": 0.3775,
"step": 13152
},
{
"epoch": 3.114575950862273,
"grad_norm": 6.03125,
"learning_rate": 3.742050978875036e-06,
"loss": 0.3637,
"step": 13184
},
{
"epoch": 3.122135601228443,
"grad_norm": 6.625,
"learning_rate": 3.7165285917570924e-06,
"loss": 0.3696,
"step": 13216
},
{
"epoch": 3.129695251594614,
"grad_norm": 5.0,
"learning_rate": 3.6910419554690345e-06,
"loss": 0.3533,
"step": 13248
},
{
"epoch": 3.1372549019607843,
"grad_norm": 6.09375,
"learning_rate": 3.665591779935825e-06,
"loss": 0.3846,
"step": 13280
},
{
"epoch": 3.144814552326955,
"grad_norm": 5.03125,
"learning_rate": 3.6401787740668294e-06,
"loss": 0.3692,
"step": 13312
},
{
"epoch": 3.1523742026931254,
"grad_norm": 6.0625,
"learning_rate": 3.61480364573605e-06,
"loss": 0.4014,
"step": 13344
},
{
"epoch": 3.159933853059296,
"grad_norm": 5.75,
"learning_rate": 3.5894671017624284e-06,
"loss": 0.3938,
"step": 13376
},
{
"epoch": 3.1674935034254665,
"grad_norm": 4.21875,
"learning_rate": 3.5641698478901415e-06,
"loss": 0.3689,
"step": 13408
},
{
"epoch": 3.1750531537916373,
"grad_norm": 6.28125,
"learning_rate": 3.5389125887689467e-06,
"loss": 0.3838,
"step": 13440
},
{
"epoch": 3.1826128041578077,
"grad_norm": 7.59375,
"learning_rate": 3.513696027934561e-06,
"loss": 0.3836,
"step": 13472
},
{
"epoch": 3.1901724545239785,
"grad_norm": 5.90625,
"learning_rate": 3.488520867789056e-06,
"loss": 0.3615,
"step": 13504
},
{
"epoch": 3.197732104890149,
"grad_norm": 9.5625,
"learning_rate": 3.4633878095812945e-06,
"loss": 0.3421,
"step": 13536
},
{
"epoch": 3.2052917552563196,
"grad_norm": 5.875,
"learning_rate": 3.4382975533874025e-06,
"loss": 0.3784,
"step": 13568
},
{
"epoch": 3.21285140562249,
"grad_norm": 6.03125,
"learning_rate": 3.413250798091261e-06,
"loss": 0.3552,
"step": 13600
},
{
"epoch": 3.2204110559886607,
"grad_norm": 5.625,
"learning_rate": 3.3882482413650437e-06,
"loss": 0.3831,
"step": 13632
},
{
"epoch": 3.227970706354831,
"grad_norm": 6.9375,
"learning_rate": 3.363290579649785e-06,
"loss": 0.4101,
"step": 13664
},
{
"epoch": 3.235530356721002,
"grad_norm": 7.1875,
"learning_rate": 3.3383785081359734e-06,
"loss": 0.3955,
"step": 13696
},
{
"epoch": 3.243090007087172,
"grad_norm": 5.3125,
"learning_rate": 3.3135127207441935e-06,
"loss": 0.3705,
"step": 13728
},
{
"epoch": 3.2506496574533426,
"grad_norm": 5.375,
"learning_rate": 3.2886939101058e-06,
"loss": 0.3674,
"step": 13760
},
{
"epoch": 3.2582093078195133,
"grad_norm": 6.625,
"learning_rate": 3.263922767543611e-06,
"loss": 0.3454,
"step": 13792
},
{
"epoch": 3.265768958185684,
"grad_norm": 4.3125,
"learning_rate": 3.239199983052669e-06,
"loss": 0.3621,
"step": 13824
},
{
"epoch": 3.2733286085518545,
"grad_norm": 6.15625,
"learning_rate": 3.2145262452810046e-06,
"loss": 0.3794,
"step": 13856
},
{
"epoch": 3.280888258918025,
"grad_norm": 5.625,
"learning_rate": 3.1899022415104675e-06,
"loss": 0.3956,
"step": 13888
},
{
"epoch": 3.2884479092841956,
"grad_norm": 4.0,
"learning_rate": 3.1653286576375787e-06,
"loss": 0.3491,
"step": 13920
},
{
"epoch": 3.2960075596503664,
"grad_norm": 4.375,
"learning_rate": 3.140806178154415e-06,
"loss": 0.3698,
"step": 13952
},
{
"epoch": 3.3035672100165367,
"grad_norm": 8.0625,
"learning_rate": 3.1163354861295604e-06,
"loss": 0.3903,
"step": 13984
},
{
"epoch": 3.311126860382707,
"grad_norm": 7.46875,
"learning_rate": 3.091917263189066e-06,
"loss": 0.3397,
"step": 14016
},
{
"epoch": 3.318686510748878,
"grad_norm": 4.6875,
"learning_rate": 3.0675521894974647e-06,
"loss": 0.3895,
"step": 14048
},
{
"epoch": 3.3262461611150487,
"grad_norm": 4.96875,
"learning_rate": 3.0432409437388346e-06,
"loss": 0.3847,
"step": 14080
},
{
"epoch": 3.333805811481219,
"grad_norm": 7.96875,
"learning_rate": 3.0189842030978795e-06,
"loss": 0.3942,
"step": 14112
},
{
"epoch": 3.3413654618473894,
"grad_norm": 4.9375,
"learning_rate": 2.9947826432410816e-06,
"loss": 0.3959,
"step": 14144
},
{
"epoch": 3.34892511221356,
"grad_norm": 6.40625,
"learning_rate": 2.9706369382978726e-06,
"loss": 0.392,
"step": 14176
},
{
"epoch": 3.3564847625797305,
"grad_norm": 7.90625,
"learning_rate": 2.946547760841853e-06,
"loss": 0.3895,
"step": 14208
},
{
"epoch": 3.3640444129459013,
"grad_norm": 7.53125,
"learning_rate": 2.9225157818720674e-06,
"loss": 0.4072,
"step": 14240
},
{
"epoch": 3.3716040633120716,
"grad_norm": 6.625,
"learning_rate": 2.898541670794304e-06,
"loss": 0.3749,
"step": 14272
},
{
"epoch": 3.3791637136782424,
"grad_norm": 15.0625,
"learning_rate": 2.8746260954024544e-06,
"loss": 0.4098,
"step": 14304
},
{
"epoch": 3.3867233640444128,
"grad_norm": 5.15625,
"learning_rate": 2.850769721859913e-06,
"loss": 0.3795,
"step": 14336
},
{
"epoch": 3.3942830144105836,
"grad_norm": 7.625,
"learning_rate": 2.8269732146810147e-06,
"loss": 0.3841,
"step": 14368
},
{
"epoch": 3.401842664776754,
"grad_norm": 5.21875,
"learning_rate": 2.8032372367125306e-06,
"loss": 0.3812,
"step": 14400
},
{
"epoch": 3.4094023151429247,
"grad_norm": 4.40625,
"learning_rate": 2.7795624491152097e-06,
"loss": 0.3795,
"step": 14432
},
{
"epoch": 3.416961965509095,
"grad_norm": 5.21875,
"learning_rate": 2.755949511345343e-06,
"loss": 0.3996,
"step": 14464
},
{
"epoch": 3.424521615875266,
"grad_norm": 7.875,
"learning_rate": 2.73239908113642e-06,
"loss": 0.3947,
"step": 14496
},
{
"epoch": 3.432081266241436,
"grad_norm": 5.9375,
"learning_rate": 2.7089118144807885e-06,
"loss": 0.3866,
"step": 14528
},
{
"epoch": 3.439640916607607,
"grad_norm": 5.40625,
"learning_rate": 2.6854883656113896e-06,
"loss": 0.354,
"step": 14560
},
{
"epoch": 3.4472005669737773,
"grad_norm": 5.15625,
"learning_rate": 2.662129386983533e-06,
"loss": 0.3723,
"step": 14592
},
{
"epoch": 3.454760217339948,
"grad_norm": 9.375,
"learning_rate": 2.6388355292567247e-06,
"loss": 0.3359,
"step": 14624
},
{
"epoch": 3.4623198677061184,
"grad_norm": 4.34375,
"learning_rate": 2.61560744127654e-06,
"loss": 0.4005,
"step": 14656
},
{
"epoch": 3.4698795180722892,
"grad_norm": 5.65625,
"learning_rate": 2.592445770056551e-06,
"loss": 0.3485,
"step": 14688
},
{
"epoch": 3.4774391684384596,
"grad_norm": 6.21875,
"learning_rate": 2.569351160760307e-06,
"loss": 0.3756,
"step": 14720
},
{
"epoch": 3.4849988188046304,
"grad_norm": 8.375,
"learning_rate": 2.546324256683359e-06,
"loss": 0.3885,
"step": 14752
},
{
"epoch": 3.4925584691708007,
"grad_norm": 6.71875,
"learning_rate": 2.523365699235346e-06,
"loss": 0.374,
"step": 14784
},
{
"epoch": 3.5001181195369715,
"grad_norm": 5.25,
"learning_rate": 2.5004761279221236e-06,
"loss": 0.3993,
"step": 14816
},
{
"epoch": 3.507677769903142,
"grad_norm": 5.65625,
"learning_rate": 2.4776561803279524e-06,
"loss": 0.3895,
"step": 14848
},
{
"epoch": 3.5152374202693126,
"grad_norm": 7.375,
"learning_rate": 2.4549064920977407e-06,
"loss": 0.3596,
"step": 14880
},
{
"epoch": 3.522797070635483,
"grad_norm": 9.25,
"learning_rate": 2.4322276969193347e-06,
"loss": 0.3495,
"step": 14912
},
{
"epoch": 3.5303567210016538,
"grad_norm": 4.46875,
"learning_rate": 2.409620426505872e-06,
"loss": 0.3972,
"step": 14944
},
{
"epoch": 3.537916371367824,
"grad_norm": 6.625,
"learning_rate": 2.3870853105781803e-06,
"loss": 0.3702,
"step": 14976
},
{
"epoch": 3.545476021733995,
"grad_norm": 7.40625,
"learning_rate": 2.364622976847238e-06,
"loss": 0.3541,
"step": 15008
},
{
"epoch": 3.5530356721001652,
"grad_norm": 8.5,
"learning_rate": 2.3422340509966984e-06,
"loss": 0.4204,
"step": 15040
},
{
"epoch": 3.560595322466336,
"grad_norm": 5.28125,
"learning_rate": 2.3199191566654393e-06,
"loss": 0.3944,
"step": 15072
},
{
"epoch": 3.5681549728325064,
"grad_norm": 6.3125,
"learning_rate": 2.297678915430223e-06,
"loss": 0.3695,
"step": 15104
},
{
"epoch": 3.575714623198677,
"grad_norm": 6.1875,
"learning_rate": 2.275513946788348e-06,
"loss": 0.3747,
"step": 15136
},
{
"epoch": 3.5832742735648475,
"grad_norm": 5.34375,
"learning_rate": 2.253424868140425e-06,
"loss": 0.3704,
"step": 15168
},
{
"epoch": 3.5908339239310183,
"grad_norm": 4.71875,
"learning_rate": 2.2314122947731554e-06,
"loss": 0.3951,
"step": 15200
},
{
"epoch": 3.5983935742971886,
"grad_norm": 7.8125,
"learning_rate": 2.2094768398422063e-06,
"loss": 0.3807,
"step": 15232
},
{
"epoch": 3.6059532246633594,
"grad_norm": 5.625,
"learning_rate": 2.1876191143551225e-06,
"loss": 0.3907,
"step": 15264
},
{
"epoch": 3.61351287502953,
"grad_norm": 7.5,
"learning_rate": 2.1658397271543195e-06,
"loss": 0.3534,
"step": 15296
},
{
"epoch": 3.6210725253957006,
"grad_norm": 5.25,
"learning_rate": 2.1441392849001048e-06,
"loss": 0.3922,
"step": 15328
},
{
"epoch": 3.628632175761871,
"grad_norm": 7.1875,
"learning_rate": 2.122518392053803e-06,
"loss": 0.3487,
"step": 15360
},
{
"epoch": 3.6361918261280417,
"grad_norm": 8.25,
"learning_rate": 2.1009776508608924e-06,
"loss": 0.356,
"step": 15392
},
{
"epoch": 3.643751476494212,
"grad_norm": 7.46875,
"learning_rate": 2.0795176613342576e-06,
"loss": 0.3652,
"step": 15424
},
{
"epoch": 3.651311126860383,
"grad_norm": 6.34375,
"learning_rate": 2.058139021237454e-06,
"loss": 0.3784,
"step": 15456
},
{
"epoch": 3.658870777226553,
"grad_norm": 8.75,
"learning_rate": 2.0368423260680677e-06,
"loss": 0.3724,
"step": 15488
},
{
"epoch": 3.666430427592724,
"grad_norm": 6.6875,
"learning_rate": 2.015628169041125e-06,
"loss": 0.3554,
"step": 15520
},
{
"epoch": 3.6739900779588943,
"grad_norm": 8.0625,
"learning_rate": 1.9944971410725706e-06,
"loss": 0.3807,
"step": 15552
},
{
"epoch": 3.681549728325065,
"grad_norm": 6.3125,
"learning_rate": 1.973449830762806e-06,
"loss": 0.375,
"step": 15584
},
{
"epoch": 3.6891093786912355,
"grad_norm": 5.6875,
"learning_rate": 1.952486824380294e-06,
"loss": 0.3849,
"step": 15616
},
{
"epoch": 3.6966690290574062,
"grad_norm": 4.28125,
"learning_rate": 1.9316087058452304e-06,
"loss": 0.3551,
"step": 15648
},
{
"epoch": 3.7042286794235766,
"grad_norm": 5.125,
"learning_rate": 1.910816056713275e-06,
"loss": 0.35,
"step": 15680
},
{
"epoch": 3.7117883297897474,
"grad_norm": 6.1875,
"learning_rate": 1.890109456159362e-06,
"loss": 0.3814,
"step": 15712
},
{
"epoch": 3.7193479801559177,
"grad_norm": 4.875,
"learning_rate": 1.8694894809615478e-06,
"loss": 0.3751,
"step": 15744
},
{
"epoch": 3.7269076305220885,
"grad_norm": 5.40625,
"learning_rate": 1.848956705484971e-06,
"loss": 0.3796,
"step": 15776
},
{
"epoch": 3.734467280888259,
"grad_norm": 8.0,
"learning_rate": 1.8285117016658316e-06,
"loss": 0.3878,
"step": 15808
},
{
"epoch": 3.7420269312544296,
"grad_norm": 3.40625,
"learning_rate": 1.808155038995471e-06,
"loss": 0.3749,
"step": 15840
},
{
"epoch": 3.7495865816206,
"grad_norm": 4.96875,
"learning_rate": 1.7878872845045058e-06,
"loss": 0.3878,
"step": 15872
},
{
"epoch": 3.757146231986771,
"grad_norm": 4.9375,
"learning_rate": 1.767709002747034e-06,
"loss": 0.3523,
"step": 15904
},
{
"epoch": 3.764705882352941,
"grad_norm": 4.96875,
"learning_rate": 1.7476207557849067e-06,
"loss": 0.3773,
"step": 15936
},
{
"epoch": 3.7722655327191115,
"grad_norm": 6.09375,
"learning_rate": 1.727623103172082e-06,
"loss": 0.3723,
"step": 15968
},
{
"epoch": 3.7798251830852823,
"grad_norm": 7.1875,
"learning_rate": 1.707716601939019e-06,
"loss": 0.3705,
"step": 16000
},
{
"epoch": 3.787384833451453,
"grad_norm": 5.90625,
"learning_rate": 1.6879018065771885e-06,
"loss": 0.4124,
"step": 16032
},
{
"epoch": 3.7949444838176234,
"grad_norm": 5.46875,
"learning_rate": 1.6681792690235975e-06,
"loss": 0.3389,
"step": 16064
},
{
"epoch": 3.8025041341837937,
"grad_norm": 7.21875,
"learning_rate": 1.6485495386454458e-06,
"loss": 0.4017,
"step": 16096
},
{
"epoch": 3.8100637845499645,
"grad_norm": 6.40625,
"learning_rate": 1.629013162224799e-06,
"loss": 0.3876,
"step": 16128
},
{
"epoch": 3.8176234349161353,
"grad_norm": 4.625,
"learning_rate": 1.6095706839433705e-06,
"loss": 0.3619,
"step": 16160
},
{
"epoch": 3.8251830852823057,
"grad_norm": 4.90625,
"learning_rate": 1.5902226453673609e-06,
"loss": 0.3741,
"step": 16192
},
{
"epoch": 3.832742735648476,
"grad_norm": 5.0,
"learning_rate": 1.5709695854323715e-06,
"loss": 0.3624,
"step": 16224
},
{
"epoch": 3.840302386014647,
"grad_norm": 6.03125,
"learning_rate": 1.5518120404283922e-06,
"loss": 0.3819,
"step": 16256
},
{
"epoch": 3.8478620363808176,
"grad_norm": 6.5,
"learning_rate": 1.5327505439848706e-06,
"loss": 0.3872,
"step": 16288
},
{
"epoch": 3.855421686746988,
"grad_norm": 5.78125,
"learning_rate": 1.513785627055831e-06,
"loss": 0.3408,
"step": 16320
},
{
"epoch": 3.8629813371131583,
"grad_norm": 4.59375,
"learning_rate": 1.4949178179051043e-06,
"loss": 0.3919,
"step": 16352
},
{
"epoch": 3.870540987479329,
"grad_norm": 5.6875,
"learning_rate": 1.4761476420916015e-06,
"loss": 0.3582,
"step": 16384
},
{
"epoch": 3.8781006378455,
"grad_norm": 7.28125,
"learning_rate": 1.4574756224546755e-06,
"loss": 0.3809,
"step": 16416
},
{
"epoch": 3.88566028821167,
"grad_norm": 7.875,
"learning_rate": 1.4389022790995611e-06,
"loss": 0.4036,
"step": 16448
},
{
"epoch": 3.8932199385778405,
"grad_norm": 6.0,
"learning_rate": 1.4204281293828858e-06,
"loss": 0.3885,
"step": 16480
},
{
"epoch": 3.9007795889440113,
"grad_norm": 6.40625,
"learning_rate": 1.4020536878982576e-06,
"loss": 0.3713,
"step": 16512
},
{
"epoch": 3.908339239310182,
"grad_norm": 4.90625,
"learning_rate": 1.3837794664619337e-06,
"loss": 0.3758,
"step": 16544
},
{
"epoch": 3.9158988896763525,
"grad_norm": 7.9375,
"learning_rate": 1.3656059740985622e-06,
"loss": 0.3725,
"step": 16576
},
{
"epoch": 3.923458540042523,
"grad_norm": 5.34375,
"learning_rate": 1.3475337170270013e-06,
"loss": 0.3931,
"step": 16608
},
{
"epoch": 3.9310181904086936,
"grad_norm": 7.9375,
"learning_rate": 1.3295631986462292e-06,
"loss": 0.3809,
"step": 16640
},
{
"epoch": 3.9385778407748644,
"grad_norm": 7.3125,
"learning_rate": 1.311694919521302e-06,
"loss": 0.3828,
"step": 16672
},
{
"epoch": 3.9461374911410347,
"grad_norm": 6.90625,
"learning_rate": 1.2939293773694323e-06,
"loss": 0.3723,
"step": 16704
},
{
"epoch": 3.953697141507205,
"grad_norm": 5.65625,
"learning_rate": 1.2762670670461119e-06,
"loss": 0.3594,
"step": 16736
},
{
"epoch": 3.961256791873376,
"grad_norm": 7.09375,
"learning_rate": 1.258708480531331e-06,
"loss": 0.3808,
"step": 16768
},
{
"epoch": 3.9688164422395467,
"grad_norm": 6.375,
"learning_rate": 1.2412541069158752e-06,
"loss": 0.3977,
"step": 16800
},
{
"epoch": 3.976376092605717,
"grad_norm": 8.5625,
"learning_rate": 1.223904432387702e-06,
"loss": 0.3656,
"step": 16832
},
{
"epoch": 3.9839357429718874,
"grad_norm": 7.6875,
"learning_rate": 1.2066599402183953e-06,
"loss": 0.3721,
"step": 16864
},
{
"epoch": 3.991495393338058,
"grad_norm": 5.75,
"learning_rate": 1.1895211107497124e-06,
"loss": 0.392,
"step": 16896
},
{
"epoch": 3.999055043704229,
"grad_norm": 7.0,
"learning_rate": 1.1724884213801874e-06,
"loss": 0.3781,
"step": 16928
},
{
"epoch": 4.006614694070399,
"grad_norm": 5.75,
"learning_rate": 1.155562346551855e-06,
"loss": 0.3662,
"step": 16960
},
{
"epoch": 4.01417434443657,
"grad_norm": 8.6875,
"learning_rate": 1.1387433577370172e-06,
"loss": 0.3938,
"step": 16992
},
{
"epoch": 4.02173399480274,
"grad_norm": 6.5,
"learning_rate": 1.1220319234251191e-06,
"loss": 0.4218,
"step": 17024
},
{
"epoch": 4.029293645168911,
"grad_norm": 6.875,
"learning_rate": 1.1054285091096978e-06,
"loss": 0.3709,
"step": 17056
},
{
"epoch": 4.0368532955350815,
"grad_norm": 5.84375,
"learning_rate": 1.088933577275415e-06,
"loss": 0.354,
"step": 17088
},
{
"epoch": 4.044412945901252,
"grad_norm": 6.21875,
"learning_rate": 1.0725475873851764e-06,
"loss": 0.3689,
"step": 17120
},
{
"epoch": 4.051972596267422,
"grad_norm": 6.5,
"learning_rate": 1.0562709958673318e-06,
"loss": 0.3731,
"step": 17152
},
{
"epoch": 4.0595322466335935,
"grad_norm": 5.75,
"learning_rate": 1.0401042561029617e-06,
"loss": 0.3691,
"step": 17184
},
{
"epoch": 4.067091896999764,
"grad_norm": 4.71875,
"learning_rate": 1.0240478184132486e-06,
"loss": 0.3861,
"step": 17216
},
{
"epoch": 4.074651547365934,
"grad_norm": 5.90625,
"learning_rate": 1.008102130046938e-06,
"loss": 0.3691,
"step": 17248
},
{
"epoch": 4.0822111977321045,
"grad_norm": 5.625,
"learning_rate": 9.92267635167866e-07,
"loss": 0.3865,
"step": 17280
},
{
"epoch": 4.089770848098276,
"grad_norm": 7.1875,
"learning_rate": 9.765447748426098e-07,
"loss": 0.3743,
"step": 17312
},
{
"epoch": 4.097330498464446,
"grad_norm": 10.4375,
"learning_rate": 9.60933987028177e-07,
"loss": 0.3831,
"step": 17344
},
{
"epoch": 4.104890148830616,
"grad_norm": 7.0,
"learning_rate": 9.454357065598285e-07,
"loss": 0.4306,
"step": 17376
},
{
"epoch": 4.112449799196787,
"grad_norm": 9.8125,
"learning_rate": 9.300503651389515e-07,
"loss": 0.3735,
"step": 17408
},
{
"epoch": 4.120009449562958,
"grad_norm": 9.6875,
"learning_rate": 9.147783913210395e-07,
"loss": 0.4346,
"step": 17440
},
{
"epoch": 4.127569099929128,
"grad_norm": 6.3125,
"learning_rate": 8.996202105037549e-07,
"loss": 0.4109,
"step": 17472
},
{
"epoch": 4.135128750295299,
"grad_norm": 7.59375,
"learning_rate": 8.845762449150846e-07,
"loss": 0.3595,
"step": 17504
},
{
"epoch": 4.142688400661469,
"grad_norm": 4.59375,
"learning_rate": 8.696469136015645e-07,
"loss": 0.3627,
"step": 17536
},
{
"epoch": 4.15024805102764,
"grad_norm": 6.4375,
"learning_rate": 8.548326324166268e-07,
"loss": 0.4108,
"step": 17568
},
{
"epoch": 4.157807701393811,
"grad_norm": 6.34375,
"learning_rate": 8.40133814008997e-07,
"loss": 0.3924,
"step": 17600
},
{
"epoch": 4.165367351759981,
"grad_norm": 7.1875,
"learning_rate": 8.255508678112167e-07,
"loss": 0.3692,
"step": 17632
},
{
"epoch": 4.172927002126151,
"grad_norm": 6.21875,
"learning_rate": 8.110842000282271e-07,
"loss": 0.3766,
"step": 17664
},
{
"epoch": 4.1804866524923225,
"grad_norm": 6.28125,
"learning_rate": 7.967342136260576e-07,
"loss": 0.3871,
"step": 17696
},
{
"epoch": 4.188046302858493,
"grad_norm": 4.4375,
"learning_rate": 7.825013083206029e-07,
"loss": 0.3621,
"step": 17728
},
{
"epoch": 4.195605953224663,
"grad_norm": 5.625,
"learning_rate": 7.683858805664923e-07,
"loss": 0.3834,
"step": 17760
},
{
"epoch": 4.203165603590834,
"grad_norm": 6.6875,
"learning_rate": 7.543883235460325e-07,
"loss": 0.3615,
"step": 17792
},
{
"epoch": 4.210725253957005,
"grad_norm": 4.5625,
"learning_rate": 7.405090271582765e-07,
"loss": 0.3525,
"step": 17824
},
{
"epoch": 4.218284904323175,
"grad_norm": 3.9375,
"learning_rate": 7.267483780081419e-07,
"loss": 0.3559,
"step": 17856
},
{
"epoch": 4.2258445546893455,
"grad_norm": 5.25,
"learning_rate": 7.131067593956609e-07,
"loss": 0.3724,
"step": 17888
},
{
"epoch": 4.233404205055516,
"grad_norm": 5.15625,
"learning_rate": 6.995845513052879e-07,
"loss": 0.401,
"step": 17920
},
{
"epoch": 4.240963855421687,
"grad_norm": 4.34375,
"learning_rate": 6.861821303953264e-07,
"loss": 0.389,
"step": 17952
},
{
"epoch": 4.248523505787857,
"grad_norm": 6.25,
"learning_rate": 6.7289986998743e-07,
"loss": 0.4079,
"step": 17984
},
{
"epoch": 4.256083156154028,
"grad_norm": 7.78125,
"learning_rate": 6.597381400562087e-07,
"loss": 0.3613,
"step": 18016
},
{
"epoch": 4.263642806520198,
"grad_norm": 5.5625,
"learning_rate": 6.466973072189187e-07,
"loss": 0.3457,
"step": 18048
},
{
"epoch": 4.271202456886369,
"grad_norm": 6.21875,
"learning_rate": 6.337777347252549e-07,
"loss": 0.3685,
"step": 18080
},
{
"epoch": 4.27876210725254,
"grad_norm": 6.0,
"learning_rate": 6.209797824472292e-07,
"loss": 0.3891,
"step": 18112
},
{
"epoch": 4.28632175761871,
"grad_norm": 5.90625,
"learning_rate": 6.083038068691472e-07,
"loss": 0.3649,
"step": 18144
},
{
"epoch": 4.29388140798488,
"grad_norm": 5.09375,
"learning_rate": 5.957501610776828e-07,
"loss": 0.3546,
"step": 18176
},
{
"epoch": 4.301441058351052,
"grad_norm": 6.5,
"learning_rate": 5.833191947520312e-07,
"loss": 0.3714,
"step": 18208
},
{
"epoch": 4.309000708717222,
"grad_norm": 5.34375,
"learning_rate": 5.710112541541845e-07,
"loss": 0.3742,
"step": 18240
},
{
"epoch": 4.316560359083392,
"grad_norm": 5.875,
"learning_rate": 5.588266821192745e-07,
"loss": 0.3735,
"step": 18272
},
{
"epoch": 4.324120009449563,
"grad_norm": 4.5,
"learning_rate": 5.467658180460284e-07,
"loss": 0.3636,
"step": 18304
},
{
"epoch": 4.331679659815734,
"grad_norm": 6.03125,
"learning_rate": 5.348289978873127e-07,
"loss": 0.3822,
"step": 18336
},
{
"epoch": 4.339239310181904,
"grad_norm": 7.21875,
"learning_rate": 5.230165541407784e-07,
"loss": 0.3793,
"step": 18368
},
{
"epoch": 4.346798960548075,
"grad_norm": 7.53125,
"learning_rate": 5.11328815839594e-07,
"loss": 0.321,
"step": 18400
},
{
"epoch": 4.354358610914245,
"grad_norm": 4.71875,
"learning_rate": 4.997661085432892e-07,
"loss": 0.3217,
"step": 18432
},
{
"epoch": 4.361918261280416,
"grad_norm": 6.125,
"learning_rate": 4.883287543286742e-07,
"loss": 0.385,
"step": 18464
},
{
"epoch": 4.3694779116465865,
"grad_norm": 4.9375,
"learning_rate": 4.770170717808803e-07,
"loss": 0.3809,
"step": 18496
},
{
"epoch": 4.377037562012757,
"grad_norm": 8.25,
"learning_rate": 4.65831375984479e-07,
"loss": 0.3516,
"step": 18528
},
{
"epoch": 4.384597212378927,
"grad_norm": 6.21875,
"learning_rate": 4.5477197851470647e-07,
"loss": 0.4025,
"step": 18560
},
{
"epoch": 4.392156862745098,
"grad_norm": 8.25,
"learning_rate": 4.4383918742878507e-07,
"loss": 0.3838,
"step": 18592
},
{
"epoch": 4.399716513111269,
"grad_norm": 5.375,
"learning_rate": 4.3303330725734284e-07,
"loss": 0.3612,
"step": 18624
},
{
"epoch": 4.407276163477439,
"grad_norm": 9.125,
"learning_rate": 4.223546389959321e-07,
"loss": 0.3869,
"step": 18656
},
{
"epoch": 4.4148358138436095,
"grad_norm": 10.4375,
"learning_rate": 4.1180348009664084e-07,
"loss": 0.3787,
"step": 18688
},
{
"epoch": 4.422395464209781,
"grad_norm": 4.71875,
"learning_rate": 4.013801244598131e-07,
"loss": 0.3816,
"step": 18720
},
{
"epoch": 4.429955114575951,
"grad_norm": 11.3125,
"learning_rate": 3.910848624258573e-07,
"loss": 0.38,
"step": 18752
},
{
"epoch": 4.437514764942121,
"grad_norm": 6.125,
"learning_rate": 3.809179807671637e-07,
"loss": 0.3407,
"step": 18784
},
{
"epoch": 4.445074415308292,
"grad_norm": 6.15625,
"learning_rate": 3.7087976268011026e-07,
"loss": 0.348,
"step": 18816
},
{
"epoch": 4.452634065674463,
"grad_norm": 7.875,
"learning_rate": 3.609704877771825e-07,
"loss": 0.3691,
"step": 18848
},
{
"epoch": 4.460193716040633,
"grad_norm": 5.625,
"learning_rate": 3.511904320791742e-07,
"loss": 0.3718,
"step": 18880
},
{
"epoch": 4.467753366406804,
"grad_norm": 5.5625,
"learning_rate": 3.4153986800751104e-07,
"loss": 0.3875,
"step": 18912
},
{
"epoch": 4.475313016772974,
"grad_norm": 8.9375,
"learning_rate": 3.3201906437665355e-07,
"loss": 0.3819,
"step": 18944
},
{
"epoch": 4.482872667139145,
"grad_norm": 5.875,
"learning_rate": 3.2262828638661093e-07,
"loss": 0.3796,
"step": 18976
},
{
"epoch": 4.490432317505316,
"grad_norm": 6.28125,
"learning_rate": 3.1336779561555674e-07,
"loss": 0.3351,
"step": 19008
},
{
"epoch": 4.497991967871486,
"grad_norm": 8.75,
"learning_rate": 3.0423785001254256e-07,
"loss": 0.3565,
"step": 19040
},
{
"epoch": 4.505551618237656,
"grad_norm": 5.84375,
"learning_rate": 2.9523870389030653e-07,
"loss": 0.357,
"step": 19072
},
{
"epoch": 4.5131112686038275,
"grad_norm": 4.375,
"learning_rate": 2.8637060791820105e-07,
"loss": 0.3654,
"step": 19104
},
{
"epoch": 4.520670918969998,
"grad_norm": 6.59375,
"learning_rate": 2.7763380911519646e-07,
"loss": 0.3916,
"step": 19136
},
{
"epoch": 4.528230569336168,
"grad_norm": 5.5,
"learning_rate": 2.690285508430135e-07,
"loss": 0.3595,
"step": 19168
},
{
"epoch": 4.5357902197023385,
"grad_norm": 6.96875,
"learning_rate": 2.605550727993367e-07,
"loss": 0.3779,
"step": 19200
},
{
"epoch": 4.543349870068509,
"grad_norm": 6.6875,
"learning_rate": 2.522136110111395e-07,
"loss": 0.3547,
"step": 19232
},
{
"epoch": 4.55090952043468,
"grad_norm": 6.125,
"learning_rate": 2.4400439782810814e-07,
"loss": 0.3585,
"step": 19264
},
{
"epoch": 4.5584691708008505,
"grad_norm": 6.0625,
"learning_rate": 2.3592766191617655e-07,
"loss": 0.3724,
"step": 19296
},
{
"epoch": 4.566028821167021,
"grad_norm": 11.3125,
"learning_rate": 2.2798362825114496e-07,
"loss": 0.3584,
"step": 19328
},
{
"epoch": 4.573588471533192,
"grad_norm": 7.96875,
"learning_rate": 2.2017251811242702e-07,
"loss": 0.3591,
"step": 19360
},
{
"epoch": 4.581148121899362,
"grad_norm": 6.46875,
"learning_rate": 2.124945490768715e-07,
"loss": 0.3554,
"step": 19392
},
{
"epoch": 4.588707772265533,
"grad_norm": 4.21875,
"learning_rate": 2.0494993501271708e-07,
"loss": 0.3778,
"step": 19424
},
{
"epoch": 4.596267422631703,
"grad_norm": 4.34375,
"learning_rate": 1.975388860736216e-07,
"loss": 0.3931,
"step": 19456
},
{
"epoch": 4.603827072997873,
"grad_norm": 5.71875,
"learning_rate": 1.9026160869281773e-07,
"loss": 0.3732,
"step": 19488
},
{
"epoch": 4.611386723364045,
"grad_norm": 6.75,
"learning_rate": 1.831183055773561e-07,
"loss": 0.379,
"step": 19520
},
{
"epoch": 4.618946373730215,
"grad_norm": 5.46875,
"learning_rate": 1.7610917570246465e-07,
"loss": 0.3433,
"step": 19552
},
{
"epoch": 4.626506024096385,
"grad_norm": 7.1875,
"learning_rate": 1.6923441430600152e-07,
"loss": 0.4048,
"step": 19584
},
{
"epoch": 4.634065674462557,
"grad_norm": 7.46875,
"learning_rate": 1.624942128830198e-07,
"loss": 0.3748,
"step": 19616
},
{
"epoch": 4.641625324828727,
"grad_norm": 8.0625,
"learning_rate": 1.5588875918043255e-07,
"loss": 0.3976,
"step": 19648
},
{
"epoch": 4.649184975194897,
"grad_norm": 4.3125,
"learning_rate": 1.4941823719178185e-07,
"loss": 0.3547,
"step": 19680
},
{
"epoch": 4.656744625561068,
"grad_norm": 7.3125,
"learning_rate": 1.430828271521173e-07,
"loss": 0.3879,
"step": 19712
},
{
"epoch": 4.664304275927238,
"grad_norm": 5.625,
"learning_rate": 1.3688270553296968e-07,
"loss": 0.3835,
"step": 19744
},
{
"epoch": 4.671863926293409,
"grad_norm": 5.125,
"learning_rate": 1.3081804503744188e-07,
"loss": 0.4104,
"step": 19776
},
{
"epoch": 4.6794235766595795,
"grad_norm": 5.71875,
"learning_rate": 1.2488901459539404e-07,
"loss": 0.3825,
"step": 19808
},
{
"epoch": 4.68698322702575,
"grad_norm": 6.6875,
"learning_rate": 1.1909577935873939e-07,
"loss": 0.3976,
"step": 19840
},
{
"epoch": 4.694542877391921,
"grad_norm": 6.15625,
"learning_rate": 1.1343850069684415e-07,
"loss": 0.3609,
"step": 19872
},
{
"epoch": 4.7021025277580915,
"grad_norm": 4.65625,
"learning_rate": 1.079173361920316e-07,
"loss": 0.4363,
"step": 19904
},
{
"epoch": 4.709662178124262,
"grad_norm": 7.15625,
"learning_rate": 1.0253243963519343e-07,
"loss": 0.3626,
"step": 19936
},
{
"epoch": 4.717221828490432,
"grad_norm": 7.9375,
"learning_rate": 9.728396102150872e-08,
"loss": 0.3629,
"step": 19968
},
{
"epoch": 4.7247814788566025,
"grad_norm": 6.875,
"learning_rate": 9.217204654625778e-08,
"loss": 0.4077,
"step": 20000
},
{
"epoch": 4.732341129222774,
"grad_norm": 7.03125,
"learning_rate": 8.71968386007599e-08,
"loss": 0.3592,
"step": 20032
},
{
"epoch": 4.739900779588944,
"grad_norm": 8.5625,
"learning_rate": 8.235847576839984e-08,
"loss": 0.3695,
"step": 20064
},
{
"epoch": 4.747460429955114,
"grad_norm": 6.9375,
"learning_rate": 7.765709282077149e-08,
"loss": 0.3835,
"step": 20096
},
{
"epoch": 4.755020080321285,
"grad_norm": 6.75,
"learning_rate": 7.309282071392087e-08,
"loss": 0.3384,
"step": 20128
},
{
"epoch": 4.762579730687456,
"grad_norm": 7.25,
"learning_rate": 6.866578658470179e-08,
"loss": 0.3575,
"step": 20160
},
{
"epoch": 4.770139381053626,
"grad_norm": 5.78125,
"learning_rate": 6.437611374723152e-08,
"loss": 0.3799,
"step": 20192
},
{
"epoch": 4.777699031419797,
"grad_norm": 6.9375,
"learning_rate": 6.022392168945623e-08,
"loss": 0.3736,
"step": 20224
},
{
"epoch": 4.785258681785967,
"grad_norm": 3.953125,
"learning_rate": 5.620932606982599e-08,
"loss": 0.3855,
"step": 20256
},
{
"epoch": 4.792818332152138,
"grad_norm": 6.28125,
"learning_rate": 5.233243871406779e-08,
"loss": 0.3909,
"step": 20288
},
{
"epoch": 4.800377982518309,
"grad_norm": 5.3125,
"learning_rate": 4.859336761207645e-08,
"loss": 0.3379,
"step": 20320
},
{
"epoch": 4.807937632884479,
"grad_norm": 8.0,
"learning_rate": 4.499221691490085e-08,
"loss": 0.3821,
"step": 20352
},
{
"epoch": 4.815497283250649,
"grad_norm": 3.84375,
"learning_rate": 4.152908693184743e-08,
"loss": 0.3674,
"step": 20384
},
{
"epoch": 4.8230569336168205,
"grad_norm": 5.875,
"learning_rate": 3.820407412768234e-08,
"loss": 0.406,
"step": 20416
},
{
"epoch": 4.830616583982991,
"grad_norm": 7.96875,
"learning_rate": 3.5017271119949234e-08,
"loss": 0.4036,
"step": 20448
},
{
"epoch": 4.838176234349161,
"grad_norm": 6.21875,
"learning_rate": 3.196876667638404e-08,
"loss": 0.3762,
"step": 20480
},
{
"epoch": 4.845735884715332,
"grad_norm": 6.40625,
"learning_rate": 2.9058645712445876e-08,
"loss": 0.3783,
"step": 20512
},
{
"epoch": 4.853295535081503,
"grad_norm": 3.984375,
"learning_rate": 2.628698928895057e-08,
"loss": 0.3324,
"step": 20544
},
{
"epoch": 4.860855185447673,
"grad_norm": 5.9375,
"learning_rate": 2.365387460981361e-08,
"loss": 0.3826,
"step": 20576
},
{
"epoch": 4.8684148358138435,
"grad_norm": 4.28125,
"learning_rate": 2.1159375019897398e-08,
"loss": 0.3714,
"step": 20608
},
{
"epoch": 4.875974486180014,
"grad_norm": 7.875,
"learning_rate": 1.8803560002971232e-08,
"loss": 0.3905,
"step": 20640
},
{
"epoch": 4.883534136546185,
"grad_norm": 10.75,
"learning_rate": 1.65864951797734e-08,
"loss": 0.3769,
"step": 20672
},
{
"epoch": 4.891093786912355,
"grad_norm": 8.875,
"learning_rate": 1.450824230618486e-08,
"loss": 0.3892,
"step": 20704
},
{
"epoch": 4.898653437278526,
"grad_norm": 6.6875,
"learning_rate": 1.2568859271508971e-08,
"loss": 0.3954,
"step": 20736
},
{
"epoch": 4.906213087644696,
"grad_norm": 7.28125,
"learning_rate": 1.0768400096856645e-08,
"loss": 0.3848,
"step": 20768
},
{
"epoch": 4.913772738010867,
"grad_norm": 5.40625,
"learning_rate": 9.106914933646461e-09,
"loss": 0.3892,
"step": 20800
},
{
"epoch": 4.921332388377038,
"grad_norm": 6.9375,
"learning_rate": 7.58445006220132e-09,
"loss": 0.3719,
"step": 20832
},
{
"epoch": 4.928892038743208,
"grad_norm": 5.625,
"learning_rate": 6.201047890465606e-09,
"loss": 0.389,
"step": 20864
},
{
"epoch": 4.936451689109378,
"grad_norm": 8.875,
"learning_rate": 4.9567469528194465e-09,
"loss": 0.3416,
"step": 20896
},
{
"epoch": 4.94401133947555,
"grad_norm": 6.46875,
"learning_rate": 3.851581909007363e-09,
"loss": 0.396,
"step": 20928
},
{
"epoch": 4.95157098984172,
"grad_norm": 8.875,
"learning_rate": 2.885583543172921e-09,
"loss": 0.3682,
"step": 20960
},
{
"epoch": 4.95913064020789,
"grad_norm": 7.875,
"learning_rate": 2.0587787629994248e-09,
"loss": 0.3538,
"step": 20992
},
{
"epoch": 4.966690290574061,
"grad_norm": 21.5,
"learning_rate": 1.37119059896329e-09,
"loss": 0.398,
"step": 21024
},
{
"epoch": 4.974249940940232,
"grad_norm": 5.71875,
"learning_rate": 8.228382036901128e-10,
"loss": 0.3672,
"step": 21056
},
{
"epoch": 4.981809591306402,
"grad_norm": 4.84375,
"learning_rate": 4.1373685142176433e-10,
"loss": 0.3525,
"step": 21088
},
{
"epoch": 4.989369241672573,
"grad_norm": 6.96875,
"learning_rate": 1.4389793759173043e-10,
"loss": 0.34,
"step": 21120
},
{
"epoch": 4.996928892038743,
"grad_norm": 3.75,
"learning_rate": 1.3328978507032298e-11,
"loss": 0.3755,
"step": 21152
},
{
"epoch": 5.0,
"step": 21165,
"total_flos": 3.745021438134743e+17,
"train_loss": 0.3988672580218839,
"train_runtime": 7931.2769,
"train_samples_per_second": 10.674,
"train_steps_per_second": 2.669
}
],
"logging_steps": 32,
"max_steps": 21165,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.745021438134743e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}