{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9966703662597114,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022197558268590455,
"grad_norm": 8.399400554535633,
"learning_rate": 1.111111111111111e-08,
"loss": 0.6208,
"step": 1
},
{
"epoch": 0.004439511653718091,
"grad_norm": 9.23527463594977,
"learning_rate": 2.222222222222222e-08,
"loss": 0.5876,
"step": 2
},
{
"epoch": 0.006659267480577136,
"grad_norm": 7.892116076103547,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.5594,
"step": 3
},
{
"epoch": 0.008879023307436182,
"grad_norm": 8.263962816983785,
"learning_rate": 4.444444444444444e-08,
"loss": 0.5842,
"step": 4
},
{
"epoch": 0.011098779134295227,
"grad_norm": 8.12452966304493,
"learning_rate": 5.555555555555555e-08,
"loss": 0.5967,
"step": 5
},
{
"epoch": 0.013318534961154272,
"grad_norm": 8.349469406229243,
"learning_rate": 6.666666666666667e-08,
"loss": 0.5914,
"step": 6
},
{
"epoch": 0.01553829078801332,
"grad_norm": 8.367872537902208,
"learning_rate": 7.777777777777778e-08,
"loss": 0.5972,
"step": 7
},
{
"epoch": 0.017758046614872364,
"grad_norm": 8.248807052651621,
"learning_rate": 8.888888888888888e-08,
"loss": 0.5833,
"step": 8
},
{
"epoch": 0.01997780244173141,
"grad_norm": 8.458007397509917,
"learning_rate": 1.0000000000000001e-07,
"loss": 0.6075,
"step": 9
},
{
"epoch": 0.022197558268590455,
"grad_norm": 8.368446611829034,
"learning_rate": 1.111111111111111e-07,
"loss": 0.606,
"step": 10
},
{
"epoch": 0.0244173140954495,
"grad_norm": 8.308128238411795,
"learning_rate": 1.2222222222222222e-07,
"loss": 0.6244,
"step": 11
},
{
"epoch": 0.026637069922308545,
"grad_norm": 8.537780094365171,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.6135,
"step": 12
},
{
"epoch": 0.02885682574916759,
"grad_norm": 8.158296304061713,
"learning_rate": 1.4444444444444442e-07,
"loss": 0.6152,
"step": 13
},
{
"epoch": 0.03107658157602664,
"grad_norm": 8.532386684065743,
"learning_rate": 1.5555555555555556e-07,
"loss": 0.5913,
"step": 14
},
{
"epoch": 0.033296337402885685,
"grad_norm": 8.436502712386789,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.5808,
"step": 15
},
{
"epoch": 0.03551609322974473,
"grad_norm": 8.030910487309407,
"learning_rate": 1.7777777777777776e-07,
"loss": 0.592,
"step": 16
},
{
"epoch": 0.03773584905660377,
"grad_norm": 7.479947998376637,
"learning_rate": 1.8888888888888888e-07,
"loss": 0.6097,
"step": 17
},
{
"epoch": 0.03995560488346282,
"grad_norm": 8.75074608782379,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.5939,
"step": 18
},
{
"epoch": 0.042175360710321866,
"grad_norm": 7.880536846496551,
"learning_rate": 2.1111111111111108e-07,
"loss": 0.5618,
"step": 19
},
{
"epoch": 0.04439511653718091,
"grad_norm": 7.603741618669525,
"learning_rate": 2.222222222222222e-07,
"loss": 0.592,
"step": 20
},
{
"epoch": 0.04661487236403995,
"grad_norm": 7.427486297252882,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.5807,
"step": 21
},
{
"epoch": 0.048834628190899,
"grad_norm": 7.28835072640129,
"learning_rate": 2.4444444444444445e-07,
"loss": 0.6103,
"step": 22
},
{
"epoch": 0.051054384017758046,
"grad_norm": 7.234395660436924,
"learning_rate": 2.5555555555555553e-07,
"loss": 0.603,
"step": 23
},
{
"epoch": 0.05327413984461709,
"grad_norm": 7.508445944227732,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.6008,
"step": 24
},
{
"epoch": 0.05549389567147614,
"grad_norm": 7.099644123190661,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.5769,
"step": 25
},
{
"epoch": 0.05771365149833518,
"grad_norm": 6.932344388923125,
"learning_rate": 2.8888888888888885e-07,
"loss": 0.627,
"step": 26
},
{
"epoch": 0.05993340732519423,
"grad_norm": 6.645121649801645,
"learning_rate": 3e-07,
"loss": 0.632,
"step": 27
},
{
"epoch": 0.06215316315205328,
"grad_norm": 7.192778252181474,
"learning_rate": 3.111111111111111e-07,
"loss": 0.5832,
"step": 28
},
{
"epoch": 0.06437291897891231,
"grad_norm": 6.6876109677067355,
"learning_rate": 3.222222222222222e-07,
"loss": 0.5623,
"step": 29
},
{
"epoch": 0.06659267480577137,
"grad_norm": 6.2825192558471015,
"learning_rate": 3.333333333333333e-07,
"loss": 0.5788,
"step": 30
},
{
"epoch": 0.06881243063263041,
"grad_norm": 7.481092834682601,
"learning_rate": 3.4444444444444444e-07,
"loss": 0.571,
"step": 31
},
{
"epoch": 0.07103218645948946,
"grad_norm": 6.473626658016538,
"learning_rate": 3.5555555555555553e-07,
"loss": 0.5928,
"step": 32
},
{
"epoch": 0.0732519422863485,
"grad_norm": 6.2535869677634635,
"learning_rate": 3.666666666666666e-07,
"loss": 0.5691,
"step": 33
},
{
"epoch": 0.07547169811320754,
"grad_norm": 6.185332269346511,
"learning_rate": 3.7777777777777775e-07,
"loss": 0.5601,
"step": 34
},
{
"epoch": 0.07769145394006659,
"grad_norm": 6.288484433565551,
"learning_rate": 3.8888888888888884e-07,
"loss": 0.564,
"step": 35
},
{
"epoch": 0.07991120976692564,
"grad_norm": 5.985590937835178,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.5568,
"step": 36
},
{
"epoch": 0.08213096559378469,
"grad_norm": 6.103172138710466,
"learning_rate": 4.1111111111111107e-07,
"loss": 0.5643,
"step": 37
},
{
"epoch": 0.08435072142064373,
"grad_norm": 6.55206929232527,
"learning_rate": 4.2222222222222216e-07,
"loss": 0.5953,
"step": 38
},
{
"epoch": 0.08657047724750278,
"grad_norm": 5.715926102273091,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.5619,
"step": 39
},
{
"epoch": 0.08879023307436182,
"grad_norm": 5.450445663024957,
"learning_rate": 4.444444444444444e-07,
"loss": 0.5631,
"step": 40
},
{
"epoch": 0.09100998890122086,
"grad_norm": 5.542981405394689,
"learning_rate": 4.555555555555556e-07,
"loss": 0.5577,
"step": 41
},
{
"epoch": 0.0932297447280799,
"grad_norm": 5.725678282306514,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.5848,
"step": 42
},
{
"epoch": 0.09544950055493896,
"grad_norm": 5.382657479056415,
"learning_rate": 4.777777777777777e-07,
"loss": 0.5737,
"step": 43
},
{
"epoch": 0.097669256381798,
"grad_norm": 5.0680029115322744,
"learning_rate": 4.888888888888889e-07,
"loss": 0.5716,
"step": 44
},
{
"epoch": 0.09988901220865705,
"grad_norm": 5.37896827418845,
"learning_rate": 5e-07,
"loss": 0.5814,
"step": 45
},
{
"epoch": 0.10210876803551609,
"grad_norm": 5.140988943990483,
"learning_rate": 5.111111111111111e-07,
"loss": 0.5563,
"step": 46
},
{
"epoch": 0.10432852386237514,
"grad_norm": 5.167606311709604,
"learning_rate": 5.222222222222222e-07,
"loss": 0.5789,
"step": 47
},
{
"epoch": 0.10654827968923418,
"grad_norm": 4.910299969476651,
"learning_rate": 5.333333333333333e-07,
"loss": 0.5374,
"step": 48
},
{
"epoch": 0.10876803551609324,
"grad_norm": 4.518136573407695,
"learning_rate": 5.444444444444444e-07,
"loss": 0.5456,
"step": 49
},
{
"epoch": 0.11098779134295228,
"grad_norm": 4.654972052593856,
"learning_rate": 5.555555555555555e-07,
"loss": 0.5756,
"step": 50
},
{
"epoch": 0.11320754716981132,
"grad_norm": 4.4424350535018755,
"learning_rate": 5.666666666666667e-07,
"loss": 0.5694,
"step": 51
},
{
"epoch": 0.11542730299667037,
"grad_norm": 4.446642718416529,
"learning_rate": 5.777777777777777e-07,
"loss": 0.5397,
"step": 52
},
{
"epoch": 0.11764705882352941,
"grad_norm": 4.139602412755836,
"learning_rate": 5.888888888888889e-07,
"loss": 0.5493,
"step": 53
},
{
"epoch": 0.11986681465038845,
"grad_norm": 3.8012053176011866,
"learning_rate": 6e-07,
"loss": 0.5818,
"step": 54
},
{
"epoch": 0.1220865704772475,
"grad_norm": 3.674488655760073,
"learning_rate": 6.111111111111111e-07,
"loss": 0.5515,
"step": 55
},
{
"epoch": 0.12430632630410655,
"grad_norm": 3.773334442965557,
"learning_rate": 6.222222222222223e-07,
"loss": 0.5285,
"step": 56
},
{
"epoch": 0.12652608213096558,
"grad_norm": 3.3471615813958353,
"learning_rate": 6.333333333333333e-07,
"loss": 0.5283,
"step": 57
},
{
"epoch": 0.12874583795782463,
"grad_norm": 3.7680138998687176,
"learning_rate": 6.444444444444444e-07,
"loss": 0.5468,
"step": 58
},
{
"epoch": 0.1309655937846837,
"grad_norm": 3.1334055249642336,
"learning_rate": 6.555555555555555e-07,
"loss": 0.5356,
"step": 59
},
{
"epoch": 0.13318534961154274,
"grad_norm": 2.942044115739925,
"learning_rate": 6.666666666666666e-07,
"loss": 0.5272,
"step": 60
},
{
"epoch": 0.13540510543840178,
"grad_norm": 3.182126804198039,
"learning_rate": 6.777777777777778e-07,
"loss": 0.5288,
"step": 61
},
{
"epoch": 0.13762486126526083,
"grad_norm": 2.6901086101723535,
"learning_rate": 6.888888888888889e-07,
"loss": 0.5167,
"step": 62
},
{
"epoch": 0.13984461709211987,
"grad_norm": 2.586223950116098,
"learning_rate": 7e-07,
"loss": 0.5499,
"step": 63
},
{
"epoch": 0.14206437291897892,
"grad_norm": 2.5700536482288086,
"learning_rate": 7.111111111111111e-07,
"loss": 0.5063,
"step": 64
},
{
"epoch": 0.14428412874583796,
"grad_norm": 2.4661921220618512,
"learning_rate": 7.222222222222221e-07,
"loss": 0.5564,
"step": 65
},
{
"epoch": 0.146503884572697,
"grad_norm": 2.330950891833989,
"learning_rate": 7.333333333333332e-07,
"loss": 0.5182,
"step": 66
},
{
"epoch": 0.14872364039955605,
"grad_norm": 2.5180905748914895,
"learning_rate": 7.444444444444444e-07,
"loss": 0.532,
"step": 67
},
{
"epoch": 0.1509433962264151,
"grad_norm": 2.6095616302569384,
"learning_rate": 7.555555555555555e-07,
"loss": 0.5358,
"step": 68
},
{
"epoch": 0.15316315205327413,
"grad_norm": 2.6685044499493844,
"learning_rate": 7.666666666666666e-07,
"loss": 0.5094,
"step": 69
},
{
"epoch": 0.15538290788013318,
"grad_norm": 2.961669120901193,
"learning_rate": 7.777777777777777e-07,
"loss": 0.5259,
"step": 70
},
{
"epoch": 0.15760266370699222,
"grad_norm": 2.4860738166604515,
"learning_rate": 7.888888888888888e-07,
"loss": 0.4868,
"step": 71
},
{
"epoch": 0.1598224195338513,
"grad_norm": 2.2865818119799464,
"learning_rate": 8.000000000000001e-07,
"loss": 0.5076,
"step": 72
},
{
"epoch": 0.16204217536071033,
"grad_norm": 1.8849597000054163,
"learning_rate": 8.11111111111111e-07,
"loss": 0.5298,
"step": 73
},
{
"epoch": 0.16426193118756938,
"grad_norm": 2.10197468284917,
"learning_rate": 8.222222222222221e-07,
"loss": 0.5174,
"step": 74
},
{
"epoch": 0.16648168701442842,
"grad_norm": 2.134411200325002,
"learning_rate": 8.333333333333332e-07,
"loss": 0.5101,
"step": 75
},
{
"epoch": 0.16870144284128746,
"grad_norm": 2.0548396664198694,
"learning_rate": 8.444444444444443e-07,
"loss": 0.512,
"step": 76
},
{
"epoch": 0.1709211986681465,
"grad_norm": 2.190003103991552,
"learning_rate": 8.555555555555556e-07,
"loss": 0.5322,
"step": 77
},
{
"epoch": 0.17314095449500555,
"grad_norm": 2.3012756353361232,
"learning_rate": 8.666666666666667e-07,
"loss": 0.5209,
"step": 78
},
{
"epoch": 0.1753607103218646,
"grad_norm": 2.767046453765713,
"learning_rate": 8.777777777777777e-07,
"loss": 0.5438,
"step": 79
},
{
"epoch": 0.17758046614872364,
"grad_norm": 1.9909116361275931,
"learning_rate": 8.888888888888888e-07,
"loss": 0.4869,
"step": 80
},
{
"epoch": 0.17980022197558268,
"grad_norm": 2.0137646431651195,
"learning_rate": 8.999999999999999e-07,
"loss": 0.4994,
"step": 81
},
{
"epoch": 0.18201997780244172,
"grad_norm": 1.6363334375526886,
"learning_rate": 9.111111111111112e-07,
"loss": 0.4962,
"step": 82
},
{
"epoch": 0.18423973362930077,
"grad_norm": 1.6903094131455099,
"learning_rate": 9.222222222222222e-07,
"loss": 0.5035,
"step": 83
},
{
"epoch": 0.1864594894561598,
"grad_norm": 1.6665441809105392,
"learning_rate": 9.333333333333333e-07,
"loss": 0.5172,
"step": 84
},
{
"epoch": 0.18867924528301888,
"grad_norm": 1.9115942542365676,
"learning_rate": 9.444444444444444e-07,
"loss": 0.5245,
"step": 85
},
{
"epoch": 0.19089900110987792,
"grad_norm": 1.8171555565938122,
"learning_rate": 9.555555555555554e-07,
"loss": 0.5199,
"step": 86
},
{
"epoch": 0.19311875693673697,
"grad_norm": 1.5495672352550462,
"learning_rate": 9.666666666666666e-07,
"loss": 0.5064,
"step": 87
},
{
"epoch": 0.195338512763596,
"grad_norm": 1.6942828602627467,
"learning_rate": 9.777777777777778e-07,
"loss": 0.4985,
"step": 88
},
{
"epoch": 0.19755826859045506,
"grad_norm": 1.4110662925081685,
"learning_rate": 9.888888888888888e-07,
"loss": 0.5058,
"step": 89
},
{
"epoch": 0.1997780244173141,
"grad_norm": 1.3896068677559,
"learning_rate": 1e-06,
"loss": 0.5138,
"step": 90
},
{
"epoch": 0.20199778024417314,
"grad_norm": 1.8634594942686744,
"learning_rate": 9.9987640588308e-07,
"loss": 0.5005,
"step": 91
},
{
"epoch": 0.20421753607103219,
"grad_norm": 1.601950407868211,
"learning_rate": 9.997525365008662e-07,
"loss": 0.5351,
"step": 92
},
{
"epoch": 0.20643729189789123,
"grad_norm": 1.7619605783577625,
"learning_rate": 9.996283909327387e-07,
"loss": 0.4846,
"step": 93
},
{
"epoch": 0.20865704772475027,
"grad_norm": 1.738504362351875,
"learning_rate": 9.995039682539681e-07,
"loss": 0.5152,
"step": 94
},
{
"epoch": 0.21087680355160932,
"grad_norm": 1.9023064418592266,
"learning_rate": 9.99379267535692e-07,
"loss": 0.4943,
"step": 95
},
{
"epoch": 0.21309655937846836,
"grad_norm": 1.5940614863386777,
"learning_rate": 9.992542878448919e-07,
"loss": 0.4928,
"step": 96
},
{
"epoch": 0.2153163152053274,
"grad_norm": 1.388955486975375,
"learning_rate": 9.991290282443698e-07,
"loss": 0.5046,
"step": 97
},
{
"epoch": 0.21753607103218647,
"grad_norm": 1.341456992180103,
"learning_rate": 9.990034877927254e-07,
"loss": 0.53,
"step": 98
},
{
"epoch": 0.21975582685904552,
"grad_norm": 1.522858356142826,
"learning_rate": 9.988776655443322e-07,
"loss": 0.4929,
"step": 99
},
{
"epoch": 0.22197558268590456,
"grad_norm": 1.4541516733946238,
"learning_rate": 9.987515605493133e-07,
"loss": 0.5394,
"step": 100
},
{
"epoch": 0.2241953385127636,
"grad_norm": 1.376705603294956,
"learning_rate": 9.986251718535183e-07,
"loss": 0.5118,
"step": 101
},
{
"epoch": 0.22641509433962265,
"grad_norm": 1.3823606190937259,
"learning_rate": 9.984984984984985e-07,
"loss": 0.5047,
"step": 102
},
{
"epoch": 0.2286348501664817,
"grad_norm": 1.363673309466558,
"learning_rate": 9.98371539521483e-07,
"loss": 0.4903,
"step": 103
},
{
"epoch": 0.23085460599334073,
"grad_norm": 1.416401669190688,
"learning_rate": 9.982442939553548e-07,
"loss": 0.514,
"step": 104
},
{
"epoch": 0.23307436182019978,
"grad_norm": 1.3605729764609202,
"learning_rate": 9.981167608286253e-07,
"loss": 0.4843,
"step": 105
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.2651391854034004,
"learning_rate": 9.979889391654097e-07,
"loss": 0.4934,
"step": 106
},
{
"epoch": 0.23751387347391786,
"grad_norm": 1.3876541204990487,
"learning_rate": 9.978608279854033e-07,
"loss": 0.5184,
"step": 107
},
{
"epoch": 0.2397336293007769,
"grad_norm": 1.3019101475882795,
"learning_rate": 9.977324263038547e-07,
"loss": 0.5168,
"step": 108
},
{
"epoch": 0.24195338512763595,
"grad_norm": 1.234950346880673,
"learning_rate": 9.976037331315424e-07,
"loss": 0.512,
"step": 109
},
{
"epoch": 0.244173140954495,
"grad_norm": 1.2715031826930232,
"learning_rate": 9.974747474747475e-07,
"loss": 0.5051,
"step": 110
},
{
"epoch": 0.24639289678135406,
"grad_norm": 1.4815309620586665,
"learning_rate": 9.973454683352293e-07,
"loss": 0.5585,
"step": 111
},
{
"epoch": 0.2486126526082131,
"grad_norm": 1.4584424491170997,
"learning_rate": 9.972158947101999e-07,
"loss": 0.5183,
"step": 112
},
{
"epoch": 0.25083240843507215,
"grad_norm": 1.4855761461506833,
"learning_rate": 9.970860255922969e-07,
"loss": 0.4973,
"step": 113
},
{
"epoch": 0.25305216426193117,
"grad_norm": 1.3279548120498825,
"learning_rate": 9.969558599695586e-07,
"loss": 0.4962,
"step": 114
},
{
"epoch": 0.25527192008879024,
"grad_norm": 1.3134752725185352,
"learning_rate": 9.968253968253967e-07,
"loss": 0.4947,
"step": 115
},
{
"epoch": 0.25749167591564925,
"grad_norm": 2.592615996953234,
"learning_rate": 9.96694635138571e-07,
"loss": 0.4884,
"step": 116
},
{
"epoch": 0.2597114317425083,
"grad_norm": 1.4087868752655734,
"learning_rate": 9.965635738831615e-07,
"loss": 0.4791,
"step": 117
},
{
"epoch": 0.2619311875693674,
"grad_norm": 1.3013116867327663,
"learning_rate": 9.964322120285423e-07,
"loss": 0.5039,
"step": 118
},
{
"epoch": 0.2641509433962264,
"grad_norm": 1.2841064842046501,
"learning_rate": 9.963005485393543e-07,
"loss": 0.4973,
"step": 119
},
{
"epoch": 0.2663706992230855,
"grad_norm": 1.2439124542464444,
"learning_rate": 9.96168582375479e-07,
"loss": 0.4913,
"step": 120
},
{
"epoch": 0.2685904550499445,
"grad_norm": 1.280079305801681,
"learning_rate": 9.960363124920087e-07,
"loss": 0.5116,
"step": 121
},
{
"epoch": 0.27081021087680357,
"grad_norm": 1.2624652099591023,
"learning_rate": 9.959037378392216e-07,
"loss": 0.5099,
"step": 122
},
{
"epoch": 0.2730299667036626,
"grad_norm": 1.345328014236658,
"learning_rate": 9.957708573625527e-07,
"loss": 0.473,
"step": 123
},
{
"epoch": 0.27524972253052166,
"grad_norm": 1.3398634155494387,
"learning_rate": 9.95637670002566e-07,
"loss": 0.5058,
"step": 124
},
{
"epoch": 0.27746947835738067,
"grad_norm": 1.4620545182624154,
"learning_rate": 9.95504174694926e-07,
"loss": 0.5056,
"step": 125
},
{
"epoch": 0.27968923418423974,
"grad_norm": 1.3816636005003708,
"learning_rate": 9.953703703703704e-07,
"loss": 0.4986,
"step": 126
},
{
"epoch": 0.28190899001109876,
"grad_norm": 1.4382772137460398,
"learning_rate": 9.952362559546802e-07,
"loss": 0.5216,
"step": 127
},
{
"epoch": 0.28412874583795783,
"grad_norm": 1.28382749112946,
"learning_rate": 9.951018303686517e-07,
"loss": 0.5172,
"step": 128
},
{
"epoch": 0.28634850166481685,
"grad_norm": 1.2599739929931204,
"learning_rate": 9.949670925280681e-07,
"loss": 0.5084,
"step": 129
},
{
"epoch": 0.2885682574916759,
"grad_norm": 1.4005233619233985,
"learning_rate": 9.94832041343669e-07,
"loss": 0.4911,
"step": 130
},
{
"epoch": 0.290788013318535,
"grad_norm": 1.2931159398960264,
"learning_rate": 9.946966757211227e-07,
"loss": 0.5065,
"step": 131
},
{
"epoch": 0.293007769145394,
"grad_norm": 1.272184000110546,
"learning_rate": 9.945609945609945e-07,
"loss": 0.4924,
"step": 132
},
{
"epoch": 0.2952275249722531,
"grad_norm": 1.2437113070167456,
"learning_rate": 9.94424996758719e-07,
"loss": 0.4599,
"step": 133
},
{
"epoch": 0.2974472807991121,
"grad_norm": 1.4773247625940102,
"learning_rate": 9.942886812045691e-07,
"loss": 0.4857,
"step": 134
},
{
"epoch": 0.29966703662597116,
"grad_norm": 1.2008656849842716,
"learning_rate": 9.941520467836258e-07,
"loss": 0.4963,
"step": 135
},
{
"epoch": 0.3018867924528302,
"grad_norm": 1.3929619707536258,
"learning_rate": 9.94015092375748e-07,
"loss": 0.4684,
"step": 136
},
{
"epoch": 0.30410654827968925,
"grad_norm": 1.2767970377058495,
"learning_rate": 9.938778168555424e-07,
"loss": 0.4851,
"step": 137
},
{
"epoch": 0.30632630410654826,
"grad_norm": 1.2651068865999724,
"learning_rate": 9.937402190923317e-07,
"loss": 0.4991,
"step": 138
},
{
"epoch": 0.30854605993340734,
"grad_norm": 1.1342128631392274,
"learning_rate": 9.93602297950124e-07,
"loss": 0.5015,
"step": 139
},
{
"epoch": 0.31076581576026635,
"grad_norm": 1.3536175243314477,
"learning_rate": 9.934640522875816e-07,
"loss": 0.4759,
"step": 140
},
{
"epoch": 0.3129855715871254,
"grad_norm": 1.2186849663505919,
"learning_rate": 9.933254809579898e-07,
"loss": 0.4948,
"step": 141
},
{
"epoch": 0.31520532741398444,
"grad_norm": 1.2320342626044847,
"learning_rate": 9.931865828092243e-07,
"loss": 0.4887,
"step": 142
},
{
"epoch": 0.3174250832408435,
"grad_norm": 1.417534749096284,
"learning_rate": 9.930473566837202e-07,
"loss": 0.4714,
"step": 143
},
{
"epoch": 0.3196448390677026,
"grad_norm": 1.2396165037232065,
"learning_rate": 9.929078014184397e-07,
"loss": 0.4834,
"step": 144
},
{
"epoch": 0.3218645948945616,
"grad_norm": 1.494912541496174,
"learning_rate": 9.92767915844839e-07,
"loss": 0.4882,
"step": 145
},
{
"epoch": 0.32408435072142067,
"grad_norm": 4.4612294268387975,
"learning_rate": 9.926276987888362e-07,
"loss": 0.4855,
"step": 146
},
{
"epoch": 0.3263041065482797,
"grad_norm": 1.2043063588616731,
"learning_rate": 9.924871490707788e-07,
"loss": 0.4936,
"step": 147
},
{
"epoch": 0.32852386237513875,
"grad_norm": 1.287740870218377,
"learning_rate": 9.923462655054104e-07,
"loss": 0.4837,
"step": 148
},
{
"epoch": 0.33074361820199777,
"grad_norm": 1.5935225239592739,
"learning_rate": 9.922050469018363e-07,
"loss": 0.4947,
"step": 149
},
{
"epoch": 0.33296337402885684,
"grad_norm": 1.2023766619786924,
"learning_rate": 9.92063492063492e-07,
"loss": 0.4334,
"step": 150
},
{
"epoch": 0.33518312985571586,
"grad_norm": 1.5038127059764161,
"learning_rate": 9.919215997881075e-07,
"loss": 0.4745,
"step": 151
},
{
"epoch": 0.3374028856825749,
"grad_norm": 1.322847323172384,
"learning_rate": 9.917793688676744e-07,
"loss": 0.4842,
"step": 152
},
{
"epoch": 0.33962264150943394,
"grad_norm": 1.3698735083886853,
"learning_rate": 9.91636798088411e-07,
"loss": 0.4627,
"step": 153
},
{
"epoch": 0.341842397336293,
"grad_norm": 1.1751076647142753,
"learning_rate": 9.914938862307282e-07,
"loss": 0.4954,
"step": 154
},
{
"epoch": 0.34406215316315203,
"grad_norm": 1.252282484942035,
"learning_rate": 9.91350632069195e-07,
"loss": 0.515,
"step": 155
},
{
"epoch": 0.3462819089900111,
"grad_norm": 1.1925027486739732,
"learning_rate": 9.91207034372502e-07,
"loss": 0.5076,
"step": 156
},
{
"epoch": 0.34850166481687017,
"grad_norm": 1.156886995816501,
"learning_rate": 9.91063091903428e-07,
"loss": 0.4588,
"step": 157
},
{
"epoch": 0.3507214206437292,
"grad_norm": 1.254079717620731,
"learning_rate": 9.909188034188032e-07,
"loss": 0.4387,
"step": 158
},
{
"epoch": 0.35294117647058826,
"grad_norm": 1.3476069702564326,
"learning_rate": 9.907741676694746e-07,
"loss": 0.5,
"step": 159
},
{
"epoch": 0.3551609322974473,
"grad_norm": 1.1852846148649574,
"learning_rate": 9.906291834002677e-07,
"loss": 0.5027,
"step": 160
},
{
"epoch": 0.35738068812430634,
"grad_norm": 1.1877456827279553,
"learning_rate": 9.90483849349953e-07,
"loss": 0.4803,
"step": 161
},
{
"epoch": 0.35960044395116536,
"grad_norm": 1.2427891755370146,
"learning_rate": 9.903381642512078e-07,
"loss": 0.4789,
"step": 162
},
{
"epoch": 0.36182019977802443,
"grad_norm": 1.098532077739215,
"learning_rate": 9.90192126830579e-07,
"loss": 0.4777,
"step": 163
},
{
"epoch": 0.36403995560488345,
"grad_norm": 1.168399201796612,
"learning_rate": 9.900457358084477e-07,
"loss": 0.5102,
"step": 164
},
{
"epoch": 0.3662597114317425,
"grad_norm": 2.213806937146682,
"learning_rate": 9.898989898989898e-07,
"loss": 0.5044,
"step": 165
},
{
"epoch": 0.36847946725860153,
"grad_norm": 1.2440142016900406,
"learning_rate": 9.8975188781014e-07,
"loss": 0.4744,
"step": 166
},
{
"epoch": 0.3706992230854606,
"grad_norm": 1.3052497044223825,
"learning_rate": 9.896044282435533e-07,
"loss": 0.4971,
"step": 167
},
{
"epoch": 0.3729189789123196,
"grad_norm": 1.2043761038679413,
"learning_rate": 9.89456609894566e-07,
"loss": 0.4406,
"step": 168
},
{
"epoch": 0.3751387347391787,
"grad_norm": 1.2022759347548375,
"learning_rate": 9.893084314521587e-07,
"loss": 0.486,
"step": 169
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.5411131311153146,
"learning_rate": 9.89159891598916e-07,
"loss": 0.4745,
"step": 170
},
{
"epoch": 0.3795782463928968,
"grad_norm": 1.5173133638369167,
"learning_rate": 9.89010989010989e-07,
"loss": 0.4847,
"step": 171
},
{
"epoch": 0.38179800221975585,
"grad_norm": 1.3709113146054337,
"learning_rate": 9.888617223580548e-07,
"loss": 0.4859,
"step": 172
},
{
"epoch": 0.38401775804661487,
"grad_norm": 1.1941263065851728,
"learning_rate": 9.887120903032776e-07,
"loss": 0.499,
"step": 173
},
{
"epoch": 0.38623751387347394,
"grad_norm": 1.2043015575494198,
"learning_rate": 9.88562091503268e-07,
"loss": 0.4902,
"step": 174
},
{
"epoch": 0.38845726970033295,
"grad_norm": 1.5205275361868167,
"learning_rate": 9.884117246080436e-07,
"loss": 0.4861,
"step": 175
},
{
"epoch": 0.390677025527192,
"grad_norm": 1.1068775140013685,
"learning_rate": 9.882609882609883e-07,
"loss": 0.487,
"step": 176
},
{
"epoch": 0.39289678135405104,
"grad_norm": 1.3479560920415647,
"learning_rate": 9.88109881098811e-07,
"loss": 0.549,
"step": 177
},
{
"epoch": 0.3951165371809101,
"grad_norm": 1.1681890050238768,
"learning_rate": 9.87958401751505e-07,
"loss": 0.4529,
"step": 178
},
{
"epoch": 0.3973362930077691,
"grad_norm": 1.1510714354592815,
"learning_rate": 9.87806548842307e-07,
"loss": 0.4776,
"step": 179
},
{
"epoch": 0.3995560488346282,
"grad_norm": 1.183102322925365,
"learning_rate": 9.876543209876544e-07,
"loss": 0.4897,
"step": 180
},
{
"epoch": 0.4017758046614872,
"grad_norm": 3.10446224930389,
"learning_rate": 9.875017167971433e-07,
"loss": 0.4818,
"step": 181
},
{
"epoch": 0.4039955604883463,
"grad_norm": 1.1580041989577758,
"learning_rate": 9.873487348734873e-07,
"loss": 0.456,
"step": 182
},
{
"epoch": 0.40621531631520535,
"grad_norm": 1.182486697276762,
"learning_rate": 9.87195373812474e-07,
"loss": 0.479,
"step": 183
},
{
"epoch": 0.40843507214206437,
"grad_norm": 1.1946276510033909,
"learning_rate": 9.870416322029225e-07,
"loss": 0.4603,
"step": 184
},
{
"epoch": 0.41065482796892344,
"grad_norm": 1.2789687454969172,
"learning_rate": 9.86887508626639e-07,
"loss": 0.4801,
"step": 185
},
{
"epoch": 0.41287458379578246,
"grad_norm": 1.2477786049263804,
"learning_rate": 9.867330016583748e-07,
"loss": 0.4859,
"step": 186
},
{
"epoch": 0.41509433962264153,
"grad_norm": 1.2098018904070043,
"learning_rate": 9.86578109865781e-07,
"loss": 0.4795,
"step": 187
},
{
"epoch": 0.41731409544950054,
"grad_norm": 1.1843208583676361,
"learning_rate": 9.864228318093655e-07,
"loss": 0.4777,
"step": 188
},
{
"epoch": 0.4195338512763596,
"grad_norm": 1.4793512914013776,
"learning_rate": 9.862671660424468e-07,
"loss": 0.4822,
"step": 189
},
{
"epoch": 0.42175360710321863,
"grad_norm": 1.4592257280425385,
"learning_rate": 9.861111111111112e-07,
"loss": 0.4575,
"step": 190
},
{
"epoch": 0.4239733629300777,
"grad_norm": 1.2669454524463584,
"learning_rate": 9.859546655541649e-07,
"loss": 0.4555,
"step": 191
},
{
"epoch": 0.4261931187569367,
"grad_norm": 1.1138120648848997,
"learning_rate": 9.857978279030909e-07,
"loss": 0.4837,
"step": 192
},
{
"epoch": 0.4284128745837958,
"grad_norm": 1.1842746998711116,
"learning_rate": 9.85640596682002e-07,
"loss": 0.4705,
"step": 193
},
{
"epoch": 0.4306326304106548,
"grad_norm": 1.8958140258222622,
"learning_rate": 9.854829704075935e-07,
"loss": 0.4773,
"step": 194
},
{
"epoch": 0.4328523862375139,
"grad_norm": 1.9842874728486837,
"learning_rate": 9.853249475890984e-07,
"loss": 0.514,
"step": 195
},
{
"epoch": 0.43507214206437295,
"grad_norm": 1.2411453520509366,
"learning_rate": 9.851665267282396e-07,
"loss": 0.4689,
"step": 196
},
{
"epoch": 0.43729189789123196,
"grad_norm": 2.6834809304915552,
"learning_rate": 9.850077063191818e-07,
"loss": 0.4913,
"step": 197
},
{
"epoch": 0.43951165371809103,
"grad_norm": 2.617374737158509,
"learning_rate": 9.848484848484847e-07,
"loss": 0.4686,
"step": 198
},
{
"epoch": 0.44173140954495005,
"grad_norm": 1.3023938843230873,
"learning_rate": 9.846888607950555e-07,
"loss": 0.4708,
"step": 199
},
{
"epoch": 0.4439511653718091,
"grad_norm": 1.1559453889413471,
"learning_rate": 9.845288326300983e-07,
"loss": 0.4847,
"step": 200
},
{
"epoch": 0.44617092119866814,
"grad_norm": 1.932913560002246,
"learning_rate": 9.84368398817068e-07,
"loss": 0.505,
"step": 201
},
{
"epoch": 0.4483906770255272,
"grad_norm": 1.2322652967881342,
"learning_rate": 9.842075578116187e-07,
"loss": 0.4844,
"step": 202
},
{
"epoch": 0.4506104328523862,
"grad_norm": 1.1577427278107666,
"learning_rate": 9.840463080615557e-07,
"loss": 0.48,
"step": 203
},
{
"epoch": 0.4528301886792453,
"grad_norm": 1.1732950223965302,
"learning_rate": 9.838846480067854e-07,
"loss": 0.4937,
"step": 204
},
{
"epoch": 0.4550499445061043,
"grad_norm": 1.2128135155508084,
"learning_rate": 9.83722576079264e-07,
"loss": 0.4481,
"step": 205
},
{
"epoch": 0.4572697003329634,
"grad_norm": 1.3789926682637172,
"learning_rate": 9.835600907029478e-07,
"loss": 0.4694,
"step": 206
},
{
"epoch": 0.4594894561598224,
"grad_norm": 1.3013032938491416,
"learning_rate": 9.833971902937419e-07,
"loss": 0.4582,
"step": 207
},
{
"epoch": 0.46170921198668147,
"grad_norm": 1.2874771776462224,
"learning_rate": 9.832338732594486e-07,
"loss": 0.4971,
"step": 208
},
{
"epoch": 0.46392896781354054,
"grad_norm": 1.1720459802557406,
"learning_rate": 9.830701379997154e-07,
"loss": 0.4612,
"step": 209
},
{
"epoch": 0.46614872364039955,
"grad_norm": 1.3336018286890268,
"learning_rate": 9.829059829059829e-07,
"loss": 0.4905,
"step": 210
},
{
"epoch": 0.4683684794672586,
"grad_norm": 1.163695673111041,
"learning_rate": 9.827414063614321e-07,
"loss": 0.4661,
"step": 211
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.1345772287223108,
"learning_rate": 9.825764067409312e-07,
"loss": 0.4809,
"step": 212
},
{
"epoch": 0.4728079911209767,
"grad_norm": 1.1200862265686184,
"learning_rate": 9.824109824109824e-07,
"loss": 0.4442,
"step": 213
},
{
"epoch": 0.4750277469478357,
"grad_norm": 1.0965562558931972,
"learning_rate": 9.822451317296677e-07,
"loss": 0.48,
"step": 214
},
{
"epoch": 0.4772475027746948,
"grad_norm": 1.1933430382619212,
"learning_rate": 9.82078853046595e-07,
"loss": 0.4563,
"step": 215
},
{
"epoch": 0.4794672586015538,
"grad_norm": 1.1998968490880912,
"learning_rate": 9.819121447028424e-07,
"loss": 0.4756,
"step": 216
},
{
"epoch": 0.4816870144284129,
"grad_norm": 1.1502035962314217,
"learning_rate": 9.817450050309042e-07,
"loss": 0.4703,
"step": 217
},
{
"epoch": 0.4839067702552719,
"grad_norm": 1.270076075579517,
"learning_rate": 9.815774323546344e-07,
"loss": 0.5022,
"step": 218
},
{
"epoch": 0.48612652608213097,
"grad_norm": 1.18116171953247,
"learning_rate": 9.814094249891915e-07,
"loss": 0.4889,
"step": 219
},
{
"epoch": 0.48834628190899,
"grad_norm": 1.1529220976426569,
"learning_rate": 9.812409812409812e-07,
"loss": 0.4591,
"step": 220
},
{
"epoch": 0.49056603773584906,
"grad_norm": 1.3482489121076722,
"learning_rate": 9.810720994076e-07,
"loss": 0.5174,
"step": 221
},
{
"epoch": 0.49278579356270813,
"grad_norm": 1.1388000036876147,
"learning_rate": 9.809027777777776e-07,
"loss": 0.488,
"step": 222
},
{
"epoch": 0.49500554938956715,
"grad_norm": 1.3402730462866097,
"learning_rate": 9.807330146313196e-07,
"loss": 0.4707,
"step": 223
},
{
"epoch": 0.4972253052164262,
"grad_norm": 1.1624876644697852,
"learning_rate": 9.805628082390483e-07,
"loss": 0.4868,
"step": 224
},
{
"epoch": 0.49944506104328523,
"grad_norm": 1.2049149784394284,
"learning_rate": 9.80392156862745e-07,
"loss": 0.4875,
"step": 225
},
{
"epoch": 0.5016648168701443,
"grad_norm": 1.1421640775244137,
"learning_rate": 9.802210587550902e-07,
"loss": 0.4601,
"step": 226
},
{
"epoch": 0.5038845726970034,
"grad_norm": 1.184925206570989,
"learning_rate": 9.800495121596038e-07,
"loss": 0.4606,
"step": 227
},
{
"epoch": 0.5061043285238623,
"grad_norm": 1.1520289725248647,
"learning_rate": 9.79877515310586e-07,
"loss": 0.4674,
"step": 228
},
{
"epoch": 0.5083240843507214,
"grad_norm": 1.252272773720146,
"learning_rate": 9.79705066433056e-07,
"loss": 0.4836,
"step": 229
},
{
"epoch": 0.5105438401775805,
"grad_norm": 1.1392993281880255,
"learning_rate": 9.7953216374269e-07,
"loss": 0.4582,
"step": 230
},
{
"epoch": 0.5127635960044395,
"grad_norm": 1.2813095854917667,
"learning_rate": 9.79358805445762e-07,
"loss": 0.4837,
"step": 231
},
{
"epoch": 0.5149833518312985,
"grad_norm": 1.2741773233692082,
"learning_rate": 9.791849897390794e-07,
"loss": 0.4822,
"step": 232
},
{
"epoch": 0.5172031076581576,
"grad_norm": 1.3581663168459062,
"learning_rate": 9.79010714809922e-07,
"loss": 0.4656,
"step": 233
},
{
"epoch": 0.5194228634850167,
"grad_norm": 2.104111037313451,
"learning_rate": 9.788359788359789e-07,
"loss": 0.5158,
"step": 234
},
{
"epoch": 0.5216426193118757,
"grad_norm": 1.2100715298455516,
"learning_rate": 9.78660779985283e-07,
"loss": 0.4593,
"step": 235
},
{
"epoch": 0.5238623751387348,
"grad_norm": 1.513760002478016,
"learning_rate": 9.78485116416151e-07,
"loss": 0.4717,
"step": 236
},
{
"epoch": 0.5260821309655938,
"grad_norm": 1.1016577801281828,
"learning_rate": 9.783089862771138e-07,
"loss": 0.4695,
"step": 237
},
{
"epoch": 0.5283018867924528,
"grad_norm": 1.9018294843725223,
"learning_rate": 9.781323877068556e-07,
"loss": 0.4757,
"step": 238
},
{
"epoch": 0.5305216426193119,
"grad_norm": 1.1210699506672195,
"learning_rate": 9.779553188341472e-07,
"loss": 0.475,
"step": 239
},
{
"epoch": 0.532741398446171,
"grad_norm": 1.6761590116014484,
"learning_rate": 9.777777777777778e-07,
"loss": 0.4586,
"step": 240
},
{
"epoch": 0.5349611542730299,
"grad_norm": 1.2531925677261875,
"learning_rate": 9.775997626464915e-07,
"loss": 0.4936,
"step": 241
},
{
"epoch": 0.537180910099889,
"grad_norm": 1.2820596623624074,
"learning_rate": 9.774212715389185e-07,
"loss": 0.4765,
"step": 242
},
{
"epoch": 0.5394006659267481,
"grad_norm": 1.2797631555506939,
"learning_rate": 9.772423025435074e-07,
"loss": 0.4621,
"step": 243
},
{
"epoch": 0.5416204217536071,
"grad_norm": 1.1547969793343313,
"learning_rate": 9.77062853738457e-07,
"loss": 0.4926,
"step": 244
},
{
"epoch": 0.5438401775804661,
"grad_norm": 1.2176966207623454,
"learning_rate": 9.76882923191648e-07,
"loss": 0.4658,
"step": 245
},
{
"epoch": 0.5460599334073252,
"grad_norm": 1.2205016744946482,
"learning_rate": 9.767025089605736e-07,
"loss": 0.4734,
"step": 246
},
{
"epoch": 0.5482796892341842,
"grad_norm": 1.3715021678576031,
"learning_rate": 9.765216090922686e-07,
"loss": 0.4634,
"step": 247
},
{
"epoch": 0.5504994450610433,
"grad_norm": 1.1265979634435113,
"learning_rate": 9.763402216232405e-07,
"loss": 0.4831,
"step": 248
},
{
"epoch": 0.5527192008879024,
"grad_norm": 1.2027088078575965,
"learning_rate": 9.761583445793972e-07,
"loss": 0.4913,
"step": 249
},
{
"epoch": 0.5549389567147613,
"grad_norm": 1.2098376963577495,
"learning_rate": 9.75975975975976e-07,
"loss": 0.4867,
"step": 250
},
{
"epoch": 0.5571587125416204,
"grad_norm": 1.229754850841517,
"learning_rate": 9.75793113817471e-07,
"loss": 0.4387,
"step": 251
},
{
"epoch": 0.5593784683684795,
"grad_norm": 1.1234804098250084,
"learning_rate": 9.756097560975609e-07,
"loss": 0.5042,
"step": 252
},
{
"epoch": 0.5615982241953386,
"grad_norm": 1.9036530663015045,
"learning_rate": 9.75425900799035e-07,
"loss": 0.4775,
"step": 253
},
{
"epoch": 0.5638179800221975,
"grad_norm": 1.3497568353885407,
"learning_rate": 9.752415458937197e-07,
"loss": 0.4589,
"step": 254
},
{
"epoch": 0.5660377358490566,
"grad_norm": 1.3514286330767507,
"learning_rate": 9.750566893424036e-07,
"loss": 0.4751,
"step": 255
},
{
"epoch": 0.5682574916759157,
"grad_norm": 1.190120995376288,
"learning_rate": 9.748713290947623e-07,
"loss": 0.4762,
"step": 256
},
{
"epoch": 0.5704772475027747,
"grad_norm": 1.2646510711529224,
"learning_rate": 9.74685463089283e-07,
"loss": 0.4544,
"step": 257
},
{
"epoch": 0.5726970033296337,
"grad_norm": 1.3453068295168764,
"learning_rate": 9.744990892531876e-07,
"loss": 0.4677,
"step": 258
},
{
"epoch": 0.5749167591564928,
"grad_norm": 3.4979722875360952,
"learning_rate": 9.74312205502356e-07,
"loss": 0.4896,
"step": 259
},
{
"epoch": 0.5771365149833518,
"grad_norm": 1.2532683261068736,
"learning_rate": 9.74124809741248e-07,
"loss": 0.4452,
"step": 260
},
{
"epoch": 0.5793562708102109,
"grad_norm": 1.4072099932257984,
"learning_rate": 9.739368998628257e-07,
"loss": 0.4677,
"step": 261
},
{
"epoch": 0.58157602663707,
"grad_norm": 1.2844489474254415,
"learning_rate": 9.737484737484737e-07,
"loss": 0.496,
"step": 262
},
{
"epoch": 0.5837957824639289,
"grad_norm": 1.1205502247017423,
"learning_rate": 9.735595292679198e-07,
"loss": 0.4944,
"step": 263
},
{
"epoch": 0.586015538290788,
"grad_norm": 1.1335936128840318,
"learning_rate": 9.733700642791553e-07,
"loss": 0.4588,
"step": 264
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.1909628036709445,
"learning_rate": 9.731800766283525e-07,
"loss": 0.4591,
"step": 265
},
{
"epoch": 0.5904550499445061,
"grad_norm": 1.0798410289093272,
"learning_rate": 9.729895641497852e-07,
"loss": 0.4733,
"step": 266
},
{
"epoch": 0.5926748057713651,
"grad_norm": 1.1518991537178038,
"learning_rate": 9.727985246657446e-07,
"loss": 0.4894,
"step": 267
},
{
"epoch": 0.5948945615982242,
"grad_norm": 1.2167928052629051,
"learning_rate": 9.726069559864573e-07,
"loss": 0.4632,
"step": 268
},
{
"epoch": 0.5971143174250833,
"grad_norm": 1.0891996396945058,
"learning_rate": 9.724148559100016e-07,
"loss": 0.4648,
"step": 269
},
{
"epoch": 0.5993340732519423,
"grad_norm": 1.202740165554673,
"learning_rate": 9.722222222222222e-07,
"loss": 0.4835,
"step": 270
},
{
"epoch": 0.6015538290788013,
"grad_norm": 1.1271713441331268,
"learning_rate": 9.720290526966466e-07,
"loss": 0.4757,
"step": 271
},
{
"epoch": 0.6037735849056604,
"grad_norm": 1.268219289831493,
"learning_rate": 9.71835345094398e-07,
"loss": 0.4518,
"step": 272
},
{
"epoch": 0.6059933407325194,
"grad_norm": 1.3048586285945205,
"learning_rate": 9.716410971641097e-07,
"loss": 0.4686,
"step": 273
},
{
"epoch": 0.6082130965593785,
"grad_norm": 1.2646393518299217,
"learning_rate": 9.714463066418373e-07,
"loss": 0.4544,
"step": 274
},
{
"epoch": 0.6104328523862376,
"grad_norm": 1.246098492059234,
"learning_rate": 9.712509712509713e-07,
"loss": 0.418,
"step": 275
},
{
"epoch": 0.6126526082130965,
"grad_norm": 1.2773926202045507,
"learning_rate": 9.710550887021476e-07,
"loss": 0.4728,
"step": 276
},
{
"epoch": 0.6148723640399556,
"grad_norm": 1.1731853445553053,
"learning_rate": 9.708586566931587e-07,
"loss": 0.4567,
"step": 277
},
{
"epoch": 0.6170921198668147,
"grad_norm": 1.3328475152898882,
"learning_rate": 9.70661672908864e-07,
"loss": 0.4713,
"step": 278
},
{
"epoch": 0.6193118756936737,
"grad_norm": 1.1482955010668634,
"learning_rate": 9.70464135021097e-07,
"loss": 0.4895,
"step": 279
},
{
"epoch": 0.6215316315205327,
"grad_norm": 1.164481369160247,
"learning_rate": 9.702660406885758e-07,
"loss": 0.4527,
"step": 280
},
{
"epoch": 0.6237513873473918,
"grad_norm": 1.1898936972633753,
"learning_rate": 9.700673875568092e-07,
"loss": 0.4636,
"step": 281
},
{
"epoch": 0.6259711431742508,
"grad_norm": 1.2690117073282012,
"learning_rate": 9.698681732580038e-07,
"loss": 0.4924,
"step": 282
},
{
"epoch": 0.6281908990011099,
"grad_norm": 1.1267647194841144,
"learning_rate": 9.696683954109696e-07,
"loss": 0.4444,
"step": 283
},
{
"epoch": 0.6304106548279689,
"grad_norm": 1.1860501989885999,
"learning_rate": 9.694680516210262e-07,
"loss": 0.4841,
"step": 284
},
{
"epoch": 0.632630410654828,
"grad_norm": 1.4878295849057805,
"learning_rate": 9.692671394799055e-07,
"loss": 0.4591,
"step": 285
},
{
"epoch": 0.634850166481687,
"grad_norm": 1.1868310362481995,
"learning_rate": 9.690656565656565e-07,
"loss": 0.4654,
"step": 286
},
{
"epoch": 0.6370699223085461,
"grad_norm": 1.2316539504907182,
"learning_rate": 9.688636004425479e-07,
"loss": 0.4518,
"step": 287
},
{
"epoch": 0.6392896781354052,
"grad_norm": 1.2704968380561188,
"learning_rate": 9.686609686609686e-07,
"loss": 0.4798,
"step": 288
},
{
"epoch": 0.6415094339622641,
"grad_norm": 1.2632953724431832,
"learning_rate": 9.684577587573309e-07,
"loss": 0.4471,
"step": 289
},
{
"epoch": 0.6437291897891232,
"grad_norm": 1.0780974022320988,
"learning_rate": 9.682539682539682e-07,
"loss": 0.4555,
"step": 290
},
{
"epoch": 0.6459489456159823,
"grad_norm": 1.207590913191836,
"learning_rate": 9.680495946590367e-07,
"loss": 0.4549,
"step": 291
},
{
"epoch": 0.6481687014428413,
"grad_norm": 1.3017930886634022,
"learning_rate": 9.67844635466412e-07,
"loss": 0.4721,
"step": 292
},
{
"epoch": 0.6503884572697003,
"grad_norm": 1.2987311419798997,
"learning_rate": 9.676390881555874e-07,
"loss": 0.4751,
"step": 293
},
{
"epoch": 0.6526082130965594,
"grad_norm": 1.1063543285266058,
"learning_rate": 9.674329501915708e-07,
"loss": 0.465,
"step": 294
},
{
"epoch": 0.6548279689234184,
"grad_norm": 1.2064062875188606,
"learning_rate": 9.672262190247801e-07,
"loss": 0.4509,
"step": 295
},
{
"epoch": 0.6570477247502775,
"grad_norm": 2.002682048611068,
"learning_rate": 9.670188920909382e-07,
"loss": 0.4593,
"step": 296
},
{
"epoch": 0.6592674805771365,
"grad_norm": 1.1171063749595782,
"learning_rate": 9.668109668109667e-07,
"loss": 0.4584,
"step": 297
},
{
"epoch": 0.6614872364039955,
"grad_norm": 1.3329915028296324,
"learning_rate": 9.666024405908798e-07,
"loss": 0.4809,
"step": 298
},
{
"epoch": 0.6637069922308546,
"grad_norm": 1.1190182828582709,
"learning_rate": 9.663933108216756e-07,
"loss": 0.4596,
"step": 299
},
{
"epoch": 0.6659267480577137,
"grad_norm": 1.230691071202789,
"learning_rate": 9.66183574879227e-07,
"loss": 0.4773,
"step": 300
},
{
"epoch": 0.6681465038845728,
"grad_norm": 1.0969420350931496,
"learning_rate": 9.659732301241734e-07,
"loss": 0.4709,
"step": 301
},
{
"epoch": 0.6703662597114317,
"grad_norm": 1.1935278681188108,
"learning_rate": 9.657622739018086e-07,
"loss": 0.4713,
"step": 302
},
{
"epoch": 0.6725860155382908,
"grad_norm": 1.1497579674788543,
"learning_rate": 9.6555070354197e-07,
"loss": 0.4495,
"step": 303
},
{
"epoch": 0.6748057713651499,
"grad_norm": 1.5074031109473398,
"learning_rate": 9.653385163589246e-07,
"loss": 0.4696,
"step": 304
},
{
"epoch": 0.6770255271920089,
"grad_norm": 1.310465827597004,
"learning_rate": 9.65125709651257e-07,
"loss": 0.4803,
"step": 305
},
{
"epoch": 0.6792452830188679,
"grad_norm": 1.1816909676159393,
"learning_rate": 9.649122807017545e-07,
"loss": 0.4911,
"step": 306
},
{
"epoch": 0.681465038845727,
"grad_norm": 1.3977695099320595,
"learning_rate": 9.646982267772897e-07,
"loss": 0.4322,
"step": 307
},
{
"epoch": 0.683684794672586,
"grad_norm": 1.0483630262556494,
"learning_rate": 9.644835451287064e-07,
"loss": 0.4604,
"step": 308
},
{
"epoch": 0.6859045504994451,
"grad_norm": 1.143328369080368,
"learning_rate": 9.642682329906999e-07,
"loss": 0.4711,
"step": 309
},
{
"epoch": 0.6881243063263041,
"grad_norm": 1.0876189100922673,
"learning_rate": 9.640522875816993e-07,
"loss": 0.4589,
"step": 310
},
{
"epoch": 0.6903440621531631,
"grad_norm": 1.2496179510548522,
"learning_rate": 9.638357061037473e-07,
"loss": 0.4442,
"step": 311
},
{
"epoch": 0.6925638179800222,
"grad_norm": 1.334399245591051,
"learning_rate": 9.636184857423795e-07,
"loss": 0.4698,
"step": 312
},
{
"epoch": 0.6947835738068813,
"grad_norm": 1.374440791039879,
"learning_rate": 9.634006236665025e-07,
"loss": 0.461,
"step": 313
},
{
"epoch": 0.6970033296337403,
"grad_norm": 1.3126306926020817,
"learning_rate": 9.63182117028271e-07,
"loss": 0.4677,
"step": 314
},
{
"epoch": 0.6992230854605993,
"grad_norm": 1.30568674164611,
"learning_rate": 9.629629629629628e-07,
"loss": 0.472,
"step": 315
},
{
"epoch": 0.7014428412874584,
"grad_norm": 1.2946732132074095,
"learning_rate": 9.627431585888558e-07,
"loss": 0.4471,
"step": 316
},
{
"epoch": 0.7036625971143174,
"grad_norm": 1.1052745863913855,
"learning_rate": 9.625227010070992e-07,
"loss": 0.4593,
"step": 317
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.2464058744248228,
"learning_rate": 9.623015873015874e-07,
"loss": 0.4499,
"step": 318
},
{
"epoch": 0.7081021087680355,
"grad_norm": 1.1991454487689286,
"learning_rate": 9.620798145388308e-07,
"loss": 0.4929,
"step": 319
},
{
"epoch": 0.7103218645948945,
"grad_norm": 1.117849019068085,
"learning_rate": 9.618573797678275e-07,
"loss": 0.4651,
"step": 320
},
{
"epoch": 0.7125416204217536,
"grad_norm": 1.8116588599393262,
"learning_rate": 9.616342800199301e-07,
"loss": 0.4644,
"step": 321
},
{
"epoch": 0.7147613762486127,
"grad_norm": 1.3083582936161693,
"learning_rate": 9.614105123087158e-07,
"loss": 0.4822,
"step": 322
},
{
"epoch": 0.7169811320754716,
"grad_norm": 1.213650512640914,
"learning_rate": 9.611860736298516e-07,
"loss": 0.4695,
"step": 323
},
{
"epoch": 0.7192008879023307,
"grad_norm": 1.1404257325984666,
"learning_rate": 9.60960960960961e-07,
"loss": 0.4642,
"step": 324
},
{
"epoch": 0.7214206437291898,
"grad_norm": 1.2553157610647856,
"learning_rate": 9.60735171261487e-07,
"loss": 0.4627,
"step": 325
},
{
"epoch": 0.7236403995560489,
"grad_norm": 1.0750989884958981,
"learning_rate": 9.605087014725568e-07,
"loss": 0.4664,
"step": 326
},
{
"epoch": 0.7258601553829079,
"grad_norm": 1.146407946651267,
"learning_rate": 9.602815485168427e-07,
"loss": 0.4704,
"step": 327
},
{
"epoch": 0.7280799112097669,
"grad_norm": 1.2052851100604456,
"learning_rate": 9.600537092984222e-07,
"loss": 0.4482,
"step": 328
},
{
"epoch": 0.730299667036626,
"grad_norm": 1.1376530340216333,
"learning_rate": 9.59825180702639e-07,
"loss": 0.4371,
"step": 329
},
{
"epoch": 0.732519422863485,
"grad_norm": 1.2633210965308679,
"learning_rate": 9.595959595959596e-07,
"loss": 0.4647,
"step": 330
},
{
"epoch": 0.7347391786903441,
"grad_norm": 1.130890031131311,
"learning_rate": 9.593660428258304e-07,
"loss": 0.4417,
"step": 331
},
{
"epoch": 0.7369589345172031,
"grad_norm": 1.6554313379519792,
"learning_rate": 9.591354272205336e-07,
"loss": 0.4493,
"step": 332
},
{
"epoch": 0.7391786903440621,
"grad_norm": 1.1068174319261177,
"learning_rate": 9.58904109589041e-07,
"loss": 0.472,
"step": 333
},
{
"epoch": 0.7413984461709212,
"grad_norm": 1.5125525973713272,
"learning_rate": 9.586720867208672e-07,
"loss": 0.4929,
"step": 334
},
{
"epoch": 0.7436182019977803,
"grad_norm": 1.3329378728556205,
"learning_rate": 9.584393553859202e-07,
"loss": 0.4932,
"step": 335
},
{
"epoch": 0.7458379578246392,
"grad_norm": 1.067151392962331,
"learning_rate": 9.582059123343528e-07,
"loss": 0.4669,
"step": 336
},
{
"epoch": 0.7480577136514983,
"grad_norm": 1.221315492579946,
"learning_rate": 9.579717542964097e-07,
"loss": 0.4955,
"step": 337
},
{
"epoch": 0.7502774694783574,
"grad_norm": 1.1576509365414092,
"learning_rate": 9.577368779822768e-07,
"loss": 0.4518,
"step": 338
},
{
"epoch": 0.7524972253052165,
"grad_norm": 1.1450956197088513,
"learning_rate": 9.575012800819252e-07,
"loss": 0.4387,
"step": 339
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.1595835131376107,
"learning_rate": 9.572649572649572e-07,
"loss": 0.448,
"step": 340
},
{
"epoch": 0.7569367369589345,
"grad_norm": 1.1123527161829898,
"learning_rate": 9.570279061804486e-07,
"loss": 0.4603,
"step": 341
},
{
"epoch": 0.7591564927857936,
"grad_norm": 1.1853821634183275,
"learning_rate": 9.567901234567902e-07,
"loss": 0.4705,
"step": 342
},
{
"epoch": 0.7613762486126526,
"grad_norm": 1.1103107481328276,
"learning_rate": 9.565516057015283e-07,
"loss": 0.4596,
"step": 343
},
{
"epoch": 0.7635960044395117,
"grad_norm": 1.4055764520185114,
"learning_rate": 9.563123495012039e-07,
"loss": 0.4938,
"step": 344
},
{
"epoch": 0.7658157602663707,
"grad_norm": 1.1750825777251128,
"learning_rate": 9.560723514211886e-07,
"loss": 0.466,
"step": 345
},
{
"epoch": 0.7680355160932297,
"grad_norm": 1.5998973140767496,
"learning_rate": 9.55831608005521e-07,
"loss": 0.4445,
"step": 346
},
{
"epoch": 0.7702552719200888,
"grad_norm": 1.2332564173521476,
"learning_rate": 9.55590115776741e-07,
"loss": 0.4557,
"step": 347
},
{
"epoch": 0.7724750277469479,
"grad_norm": 1.178215276166538,
"learning_rate": 9.553478712357217e-07,
"loss": 0.4689,
"step": 348
},
{
"epoch": 0.7746947835738068,
"grad_norm": 1.7533561679314174,
"learning_rate": 9.551048708615012e-07,
"loss": 0.4562,
"step": 349
},
{
"epoch": 0.7769145394006659,
"grad_norm": 1.2467282871005076,
"learning_rate": 9.54861111111111e-07,
"loss": 0.4965,
"step": 350
},
{
"epoch": 0.779134295227525,
"grad_norm": 1.501873430859811,
"learning_rate": 9.546165884194052e-07,
"loss": 0.4655,
"step": 351
},
{
"epoch": 0.781354051054384,
"grad_norm": 1.1753902585686589,
"learning_rate": 9.543712991988853e-07,
"loss": 0.4631,
"step": 352
},
{
"epoch": 0.7835738068812431,
"grad_norm": 1.2254133258446995,
"learning_rate": 9.541252398395256e-07,
"loss": 0.4804,
"step": 353
},
{
"epoch": 0.7857935627081021,
"grad_norm": 1.2034719888364724,
"learning_rate": 9.538784067085953e-07,
"loss": 0.5163,
"step": 354
},
{
"epoch": 0.7880133185349611,
"grad_norm": 1.1137953498058453,
"learning_rate": 9.536307961504813e-07,
"loss": 0.465,
"step": 355
},
{
"epoch": 0.7902330743618202,
"grad_norm": 1.2049505304206072,
"learning_rate": 9.533824044865054e-07,
"loss": 0.4426,
"step": 356
},
{
"epoch": 0.7924528301886793,
"grad_norm": 1.146694810534997,
"learning_rate": 9.531332280147445e-07,
"loss": 0.4595,
"step": 357
},
{
"epoch": 0.7946725860155383,
"grad_norm": 1.285324115763439,
"learning_rate": 9.528832630098452e-07,
"loss": 0.4404,
"step": 358
},
{
"epoch": 0.7968923418423973,
"grad_norm": 1.1746063498596073,
"learning_rate": 9.526325057228385e-07,
"loss": 0.4556,
"step": 359
},
{
"epoch": 0.7991120976692564,
"grad_norm": 1.2397369281527393,
"learning_rate": 9.523809523809523e-07,
"loss": 0.4552,
"step": 360
},
{
"epoch": 0.8013318534961155,
"grad_norm": 1.2890444853501573,
"learning_rate": 9.521285991874226e-07,
"loss": 0.4895,
"step": 361
},
{
"epoch": 0.8035516093229744,
"grad_norm": 1.249790528516529,
"learning_rate": 9.518754423213023e-07,
"loss": 0.4611,
"step": 362
},
{
"epoch": 0.8057713651498335,
"grad_norm": 1.1607609355409652,
"learning_rate": 9.516214779372675e-07,
"loss": 0.4634,
"step": 363
},
{
"epoch": 0.8079911209766926,
"grad_norm": 1.1944680113287869,
"learning_rate": 9.513667021654242e-07,
"loss": 0.4658,
"step": 364
},
{
"epoch": 0.8102108768035516,
"grad_norm": 1.212730380100326,
"learning_rate": 9.51111111111111e-07,
"loss": 0.4652,
"step": 365
},
{
"epoch": 0.8124306326304107,
"grad_norm": 1.350952435196083,
"learning_rate": 9.508547008547009e-07,
"loss": 0.4821,
"step": 366
},
{
"epoch": 0.8146503884572697,
"grad_norm": 1.4170872636632865,
"learning_rate": 9.505974674514e-07,
"loss": 0.4584,
"step": 367
},
{
"epoch": 0.8168701442841287,
"grad_norm": 1.1374959977745926,
"learning_rate": 9.503394069310468e-07,
"loss": 0.4529,
"step": 368
},
{
"epoch": 0.8190899001109878,
"grad_norm": 1.1079720865904779,
"learning_rate": 9.500805152979066e-07,
"loss": 0.4569,
"step": 369
},
{
"epoch": 0.8213096559378469,
"grad_norm": 1.3283932417196296,
"learning_rate": 9.498207885304659e-07,
"loss": 0.457,
"step": 370
},
{
"epoch": 0.8235294117647058,
"grad_norm": 2.167492019217474,
"learning_rate": 9.495602225812242e-07,
"loss": 0.4861,
"step": 371
},
{
"epoch": 0.8257491675915649,
"grad_norm": 1.1806082837203582,
"learning_rate": 9.492988133764832e-07,
"loss": 0.4635,
"step": 372
},
{
"epoch": 0.827968923418424,
"grad_norm": 1.2232597162793415,
"learning_rate": 9.490365568161355e-07,
"loss": 0.4739,
"step": 373
},
{
"epoch": 0.8301886792452831,
"grad_norm": 1.2478222292556507,
"learning_rate": 9.487734487734488e-07,
"loss": 0.435,
"step": 374
},
{
"epoch": 0.832408435072142,
"grad_norm": 1.9264518586562565,
"learning_rate": 9.485094850948509e-07,
"loss": 0.4612,
"step": 375
},
{
"epoch": 0.8346281908990011,
"grad_norm": 1.1851589787911743,
"learning_rate": 9.482446615997105e-07,
"loss": 0.4741,
"step": 376
},
{
"epoch": 0.8368479467258602,
"grad_norm": 1.1482240087848128,
"learning_rate": 9.47978974080116e-07,
"loss": 0.4677,
"step": 377
},
{
"epoch": 0.8390677025527192,
"grad_norm": 1.1666675897500949,
"learning_rate": 9.477124183006535e-07,
"loss": 0.439,
"step": 378
},
{
"epoch": 0.8412874583795783,
"grad_norm": 1.2967015189495499,
"learning_rate": 9.474449899981814e-07,
"loss": 0.4389,
"step": 379
},
{
"epoch": 0.8435072142064373,
"grad_norm": 1.2240339325997547,
"learning_rate": 9.471766848816029e-07,
"loss": 0.4967,
"step": 380
},
{
"epoch": 0.8457269700332963,
"grad_norm": 1.165209891992943,
"learning_rate": 9.469074986316366e-07,
"loss": 0.4385,
"step": 381
},
{
"epoch": 0.8479467258601554,
"grad_norm": 1.1944098330708015,
"learning_rate": 9.466374269005847e-07,
"loss": 0.4526,
"step": 382
},
{
"epoch": 0.8501664816870145,
"grad_norm": 1.3220747050818542,
"learning_rate": 9.463664653120996e-07,
"loss": 0.4554,
"step": 383
},
{
"epoch": 0.8523862375138734,
"grad_norm": 1.156857474774302,
"learning_rate": 9.460946094609461e-07,
"loss": 0.5017,
"step": 384
},
{
"epoch": 0.8546059933407325,
"grad_norm": 2.5207247056310527,
"learning_rate": 9.45821854912764e-07,
"loss": 0.4877,
"step": 385
},
{
"epoch": 0.8568257491675916,
"grad_norm": 1.278904776025417,
"learning_rate": 9.455481972038263e-07,
"loss": 0.424,
"step": 386
},
{
"epoch": 0.8590455049944506,
"grad_norm": 1.3104661256143586,
"learning_rate": 9.45273631840796e-07,
"loss": 0.4836,
"step": 387
},
{
"epoch": 0.8612652608213096,
"grad_norm": 1.1577542337096038,
"learning_rate": 9.449981543004798e-07,
"loss": 0.4501,
"step": 388
},
{
"epoch": 0.8634850166481687,
"grad_norm": 1.2698206722641987,
"learning_rate": 9.447217600295803e-07,
"loss": 0.4708,
"step": 389
},
{
"epoch": 0.8657047724750278,
"grad_norm": 1.26815456990031,
"learning_rate": 9.444444444444445e-07,
"loss": 0.4801,
"step": 390
},
{
"epoch": 0.8679245283018868,
"grad_norm": 1.4800730180210795,
"learning_rate": 9.441662029308107e-07,
"loss": 0.4446,
"step": 391
},
{
"epoch": 0.8701442841287459,
"grad_norm": 1.433745516973746,
"learning_rate": 9.438870308435526e-07,
"loss": 0.4395,
"step": 392
},
{
"epoch": 0.8723640399556049,
"grad_norm": 1.2405606709329904,
"learning_rate": 9.436069235064209e-07,
"loss": 0.4719,
"step": 393
},
{
"epoch": 0.8745837957824639,
"grad_norm": 1.3222046837847288,
"learning_rate": 9.433258762117823e-07,
"loss": 0.48,
"step": 394
},
{
"epoch": 0.876803551609323,
"grad_norm": 1.0972747969843053,
"learning_rate": 9.430438842203547e-07,
"loss": 0.4567,
"step": 395
},
{
"epoch": 0.8790233074361821,
"grad_norm": 1.231282503350091,
"learning_rate": 9.427609427609426e-07,
"loss": 0.4253,
"step": 396
},
{
"epoch": 0.881243063263041,
"grad_norm": 1.1148776290337907,
"learning_rate": 9.424770470301668e-07,
"loss": 0.4665,
"step": 397
},
{
"epoch": 0.8834628190899001,
"grad_norm": 1.5167092733459486,
"learning_rate": 9.421921921921921e-07,
"loss": 0.488,
"step": 398
},
{
"epoch": 0.8856825749167592,
"grad_norm": 1.1200324548701535,
"learning_rate": 9.419063733784546e-07,
"loss": 0.4573,
"step": 399
},
{
"epoch": 0.8879023307436182,
"grad_norm": 1.1941442835979987,
"learning_rate": 9.416195856873822e-07,
"loss": 0.474,
"step": 400
},
{
"epoch": 0.8901220865704772,
"grad_norm": 1.2378053597101148,
"learning_rate": 9.413318241841163e-07,
"loss": 0.4275,
"step": 401
},
{
"epoch": 0.8923418423973363,
"grad_norm": 1.1736891938334981,
"learning_rate": 9.410430839002267e-07,
"loss": 0.4813,
"step": 402
},
{
"epoch": 0.8945615982241953,
"grad_norm": 1.2926588136627257,
"learning_rate": 9.407533598334279e-07,
"loss": 0.4101,
"step": 403
},
{
"epoch": 0.8967813540510544,
"grad_norm": 1.335812503096899,
"learning_rate": 9.404626469472886e-07,
"loss": 0.482,
"step": 404
},
{
"epoch": 0.8990011098779135,
"grad_norm": 1.2606281524911447,
"learning_rate": 9.401709401709401e-07,
"loss": 0.4613,
"step": 405
},
{
"epoch": 0.9012208657047724,
"grad_norm": 1.0757260989123811,
"learning_rate": 9.398782343987823e-07,
"loss": 0.442,
"step": 406
},
{
"epoch": 0.9034406215316315,
"grad_norm": 1.2487755708412085,
"learning_rate": 9.395845244901848e-07,
"loss": 0.4849,
"step": 407
},
{
"epoch": 0.9056603773584906,
"grad_norm": 1.125481879252537,
"learning_rate": 9.392898052691868e-07,
"loss": 0.4811,
"step": 408
},
{
"epoch": 0.9078801331853497,
"grad_norm": 1.2379802179470494,
"learning_rate": 9.38994071524192e-07,
"loss": 0.454,
"step": 409
},
{
"epoch": 0.9100998890122086,
"grad_norm": 1.4638820996239457,
"learning_rate": 9.386973180076627e-07,
"loss": 0.4568,
"step": 410
},
{
"epoch": 0.9123196448390677,
"grad_norm": 1.164619393855904,
"learning_rate": 9.383995394358089e-07,
"loss": 0.4572,
"step": 411
},
{
"epoch": 0.9145394006659268,
"grad_norm": 1.1482541560920236,
"learning_rate": 9.381007304882738e-07,
"loss": 0.4612,
"step": 412
},
{
"epoch": 0.9167591564927858,
"grad_norm": 1.2267789304428984,
"learning_rate": 9.378008858078182e-07,
"loss": 0.4509,
"step": 413
},
{
"epoch": 0.9189789123196448,
"grad_norm": 1.163057869832221,
"learning_rate": 9.374999999999999e-07,
"loss": 0.4438,
"step": 414
},
{
"epoch": 0.9211986681465039,
"grad_norm": 1.1962284300244166,
"learning_rate": 9.371980676328503e-07,
"loss": 0.4847,
"step": 415
},
{
"epoch": 0.9234184239733629,
"grad_norm": 1.4078630335966822,
"learning_rate": 9.368950832365467e-07,
"loss": 0.4442,
"step": 416
},
{
"epoch": 0.925638179800222,
"grad_norm": 1.2172971912901611,
"learning_rate": 9.365910413030831e-07,
"loss": 0.4596,
"step": 417
},
{
"epoch": 0.9278579356270811,
"grad_norm": 1.0828193308936622,
"learning_rate": 9.362859362859364e-07,
"loss": 0.516,
"step": 418
},
{
"epoch": 0.93007769145394,
"grad_norm": 1.1924221228533827,
"learning_rate": 9.359797625997276e-07,
"loss": 0.4652,
"step": 419
},
{
"epoch": 0.9322974472807991,
"grad_norm": 1.1409012876229394,
"learning_rate": 9.35672514619883e-07,
"loss": 0.473,
"step": 420
},
{
"epoch": 0.9345172031076582,
"grad_norm": 1.2176749478496565,
"learning_rate": 9.353641866822886e-07,
"loss": 0.4879,
"step": 421
},
{
"epoch": 0.9367369589345172,
"grad_norm": 1.1431631431171365,
"learning_rate": 9.350547730829421e-07,
"loss": 0.4541,
"step": 422
},
{
"epoch": 0.9389567147613762,
"grad_norm": 1.248764858494691,
"learning_rate": 9.347442680776014e-07,
"loss": 0.4484,
"step": 423
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.1323458083986906,
"learning_rate": 9.344326658814291e-07,
"loss": 0.4701,
"step": 424
},
{
"epoch": 0.9433962264150944,
"grad_norm": 1.3325163110508749,
"learning_rate": 9.341199606686333e-07,
"loss": 0.4863,
"step": 425
},
{
"epoch": 0.9456159822419534,
"grad_norm": 1.1125334297727054,
"learning_rate": 9.33806146572104e-07,
"loss": 0.4399,
"step": 426
},
{
"epoch": 0.9478357380688124,
"grad_norm": 1.3670229521071158,
"learning_rate": 9.334912176830472e-07,
"loss": 0.4747,
"step": 427
},
{
"epoch": 0.9500554938956715,
"grad_norm": 1.198715980258311,
"learning_rate": 9.331751680506128e-07,
"loss": 0.4563,
"step": 428
},
{
"epoch": 0.9522752497225305,
"grad_norm": 1.2305992586404284,
"learning_rate": 9.328579916815211e-07,
"loss": 0.4533,
"step": 429
},
{
"epoch": 0.9544950055493896,
"grad_norm": 1.2010004180709277,
"learning_rate": 9.325396825396825e-07,
"loss": 0.4237,
"step": 430
},
{
"epoch": 0.9567147613762487,
"grad_norm": 1.177319039118549,
"learning_rate": 9.322202345458159e-07,
"loss": 0.4447,
"step": 431
},
{
"epoch": 0.9589345172031076,
"grad_norm": 1.0816083992867438,
"learning_rate": 9.318996415770608e-07,
"loss": 0.4583,
"step": 432
},
{
"epoch": 0.9611542730299667,
"grad_norm": 1.3105321787098454,
"learning_rate": 9.315778974665869e-07,
"loss": 0.4602,
"step": 433
},
{
"epoch": 0.9633740288568258,
"grad_norm": 1.2708668269742336,
"learning_rate": 9.312549960031973e-07,
"loss": 0.4674,
"step": 434
},
{
"epoch": 0.9655937846836848,
"grad_norm": 1.25271035550266,
"learning_rate": 9.309309309309308e-07,
"loss": 0.4702,
"step": 435
},
{
"epoch": 0.9678135405105438,
"grad_norm": 1.1383606781004785,
"learning_rate": 9.306056959486562e-07,
"loss": 0.4809,
"step": 436
},
{
"epoch": 0.9700332963374029,
"grad_norm": 1.117206827269427,
"learning_rate": 9.302792847096645e-07,
"loss": 0.4703,
"step": 437
},
{
"epoch": 0.9722530521642619,
"grad_norm": 3.9310744991947653,
"learning_rate": 9.29951690821256e-07,
"loss": 0.453,
"step": 438
},
{
"epoch": 0.974472807991121,
"grad_norm": 1.141380677715348,
"learning_rate": 9.296229078443233e-07,
"loss": 0.4646,
"step": 439
},
{
"epoch": 0.97669256381798,
"grad_norm": 1.2167415105647477,
"learning_rate": 9.292929292929292e-07,
"loss": 0.4739,
"step": 440
},
{
"epoch": 0.978912319644839,
"grad_norm": 1.161523722412979,
"learning_rate": 9.289617486338798e-07,
"loss": 0.4446,
"step": 441
},
{
"epoch": 0.9811320754716981,
"grad_norm": 1.2805336942642866,
"learning_rate": 9.286293592862935e-07,
"loss": 0.4888,
"step": 442
},
{
"epoch": 0.9833518312985572,
"grad_norm": 1.311736822354556,
"learning_rate": 9.282957546211659e-07,
"loss": 0.4404,
"step": 443
},
{
"epoch": 0.9855715871254163,
"grad_norm": 1.102716937173303,
"learning_rate": 9.279609279609279e-07,
"loss": 0.472,
"step": 444
},
{
"epoch": 0.9877913429522752,
"grad_norm": 1.1845457089510798,
"learning_rate": 9.27624872579001e-07,
"loss": 0.4773,
"step": 445
},
{
"epoch": 0.9900110987791343,
"grad_norm": 1.348555373283787,
"learning_rate": 9.272875816993462e-07,
"loss": 0.4512,
"step": 446
},
{
"epoch": 0.9922308546059934,
"grad_norm": 1.2463685196593113,
"learning_rate": 9.269490484960099e-07,
"loss": 0.4632,
"step": 447
},
{
"epoch": 0.9944506104328524,
"grad_norm": 1.304607151889411,
"learning_rate": 9.266092660926608e-07,
"loss": 0.4482,
"step": 448
},
{
"epoch": 0.9966703662597114,
"grad_norm": 1.2795731918669933,
"learning_rate": 9.262682275621277e-07,
"loss": 0.4793,
"step": 449
},
{
"epoch": 0.9988901220865705,
"grad_norm": 1.2183029212205303,
"learning_rate": 9.25925925925926e-07,
"loss": 0.4554,
"step": 450
},
{
"epoch": 1.0,
"grad_norm": 1.5581011378096141,
"learning_rate": 9.255823541537827e-07,
"loss": 0.4665,
"step": 451
},
{
"epoch": 1.002219755826859,
"grad_norm": 1.30008543587019,
"learning_rate": 9.252375051631556e-07,
"loss": 0.4768,
"step": 452
},
{
"epoch": 1.0044395116537181,
"grad_norm": 1.1541545328399352,
"learning_rate": 9.24891371818746e-07,
"loss": 0.4466,
"step": 453
},
{
"epoch": 1.0066592674805772,
"grad_norm": 1.1080167970739996,
"learning_rate": 9.245439469320066e-07,
"loss": 0.4324,
"step": 454
},
{
"epoch": 1.0088790233074363,
"grad_norm": 1.205265215466603,
"learning_rate": 9.241952232606438e-07,
"loss": 0.451,
"step": 455
},
{
"epoch": 1.0110987791342951,
"grad_norm": 1.1951346456937733,
"learning_rate": 9.238451935081147e-07,
"loss": 0.4575,
"step": 456
},
{
"epoch": 1.0133185349611542,
"grad_norm": 1.1381714006582448,
"learning_rate": 9.234938503231185e-07,
"loss": 0.4649,
"step": 457
},
{
"epoch": 1.0155382907880133,
"grad_norm": 1.2265651801615038,
"learning_rate": 9.23141186299081e-07,
"loss": 0.4546,
"step": 458
},
{
"epoch": 1.0177580466148723,
"grad_norm": 1.2250114443086708,
"learning_rate": 9.227871939736346e-07,
"loss": 0.4529,
"step": 459
},
{
"epoch": 1.0199778024417314,
"grad_norm": 1.1144643265198617,
"learning_rate": 9.22431865828092e-07,
"loss": 0.4721,
"step": 460
},
{
"epoch": 1.0221975582685905,
"grad_norm": 1.2857875509373777,
"learning_rate": 9.220751942869145e-07,
"loss": 0.4742,
"step": 461
},
{
"epoch": 1.0244173140954496,
"grad_norm": 1.4558104967510546,
"learning_rate": 9.217171717171717e-07,
"loss": 0.4828,
"step": 462
},
{
"epoch": 1.0266370699223086,
"grad_norm": 1.393341081291035,
"learning_rate": 9.213577904279991e-07,
"loss": 0.475,
"step": 463
},
{
"epoch": 1.0288568257491675,
"grad_norm": 1.2720063428740336,
"learning_rate": 9.209970426700463e-07,
"loss": 0.4764,
"step": 464
},
{
"epoch": 1.0310765815760266,
"grad_norm": 1.1643324541727542,
"learning_rate": 9.206349206349206e-07,
"loss": 0.4483,
"step": 465
},
{
"epoch": 1.0332963374028856,
"grad_norm": 1.5806078292788657,
"learning_rate": 9.202714164546225e-07,
"loss": 0.4405,
"step": 466
},
{
"epoch": 1.0355160932297447,
"grad_norm": 1.0571713177484243,
"learning_rate": 9.199065222009772e-07,
"loss": 0.4582,
"step": 467
},
{
"epoch": 1.0377358490566038,
"grad_norm": 1.254164694817346,
"learning_rate": 9.195402298850575e-07,
"loss": 0.484,
"step": 468
},
{
"epoch": 1.0399556048834628,
"grad_norm": 1.2093916690336683,
"learning_rate": 9.191725314566005e-07,
"loss": 0.4524,
"step": 469
},
{
"epoch": 1.042175360710322,
"grad_norm": 1.2484369194361347,
"learning_rate": 9.188034188034187e-07,
"loss": 0.4281,
"step": 470
},
{
"epoch": 1.044395116537181,
"grad_norm": 1.23432208173896,
"learning_rate": 9.184328837508027e-07,
"loss": 0.4592,
"step": 471
},
{
"epoch": 1.04661487236404,
"grad_norm": 1.1814190867646936,
"learning_rate": 9.18060918060918e-07,
"loss": 0.4501,
"step": 472
},
{
"epoch": 1.048834628190899,
"grad_norm": 1.2396682446950464,
"learning_rate": 9.176875134321942e-07,
"loss": 0.4796,
"step": 473
},
{
"epoch": 1.051054384017758,
"grad_norm": 1.194735634290447,
"learning_rate": 9.173126614987079e-07,
"loss": 0.4689,
"step": 474
},
{
"epoch": 1.053274139844617,
"grad_norm": 1.5032349073655937,
"learning_rate": 9.169363538295577e-07,
"loss": 0.4756,
"step": 475
},
{
"epoch": 1.0554938956714761,
"grad_norm": 1.3604468620288677,
"learning_rate": 9.165585819282317e-07,
"loss": 0.4516,
"step": 476
},
{
"epoch": 1.0577136514983352,
"grad_norm": 1.235119564536718,
"learning_rate": 9.161793372319687e-07,
"loss": 0.4954,
"step": 477
},
{
"epoch": 1.0599334073251943,
"grad_norm": 1.2064481172474952,
"learning_rate": 9.157986111111111e-07,
"loss": 0.4947,
"step": 478
},
{
"epoch": 1.0621531631520533,
"grad_norm": 1.3341949744876858,
"learning_rate": 9.154163948684497e-07,
"loss": 0.4471,
"step": 479
},
{
"epoch": 1.0643729189789124,
"grad_norm": 1.0774430784246407,
"learning_rate": 9.15032679738562e-07,
"loss": 0.4367,
"step": 480
},
{
"epoch": 1.0665926748057715,
"grad_norm": 1.171666586826098,
"learning_rate": 9.146474568871424e-07,
"loss": 0.4458,
"step": 481
},
{
"epoch": 1.0688124306326303,
"grad_norm": 1.143814272450129,
"learning_rate": 9.142607174103237e-07,
"loss": 0.4509,
"step": 482
},
{
"epoch": 1.0710321864594894,
"grad_norm": 1.1649593431642897,
"learning_rate": 9.138724523339908e-07,
"loss": 0.4691,
"step": 483
},
{
"epoch": 1.0732519422863485,
"grad_norm": 1.5096818227684794,
"learning_rate": 9.134826526130873e-07,
"loss": 0.4331,
"step": 484
},
{
"epoch": 1.0754716981132075,
"grad_norm": 1.1214569676579733,
"learning_rate": 9.130913091309129e-07,
"loss": 0.4198,
"step": 485
},
{
"epoch": 1.0776914539400666,
"grad_norm": 1.2255039253369904,
"learning_rate": 9.126984126984127e-07,
"loss": 0.4359,
"step": 486
},
{
"epoch": 1.0799112097669257,
"grad_norm": 1.2922111222788437,
"learning_rate": 9.12303954053457e-07,
"loss": 0.4382,
"step": 487
},
{
"epoch": 1.0821309655937847,
"grad_norm": 1.1584633311684494,
"learning_rate": 9.119079238601149e-07,
"loss": 0.4483,
"step": 488
},
{
"epoch": 1.0843507214206438,
"grad_norm": 1.3615298699495637,
"learning_rate": 9.115103127079175e-07,
"loss": 0.4785,
"step": 489
},
{
"epoch": 1.0865704772475029,
"grad_norm": 1.291303318155004,
"learning_rate": 9.11111111111111e-07,
"loss": 0.4471,
"step": 490
},
{
"epoch": 1.0887902330743617,
"grad_norm": 1.1619939851316075,
"learning_rate": 9.107103095079046e-07,
"loss": 0.4465,
"step": 491
},
{
"epoch": 1.0910099889012208,
"grad_norm": 1.1890455479283029,
"learning_rate": 9.103078982597053e-07,
"loss": 0.4335,
"step": 492
},
{
"epoch": 1.0932297447280799,
"grad_norm": 1.1706446925482716,
"learning_rate": 9.099038676503465e-07,
"loss": 0.4727,
"step": 493
},
{
"epoch": 1.095449500554939,
"grad_norm": 1.5095826242654946,
"learning_rate": 9.094982078853046e-07,
"loss": 0.4665,
"step": 494
},
{
"epoch": 1.097669256381798,
"grad_norm": 1.255552038855652,
"learning_rate": 9.09090909090909e-07,
"loss": 0.4653,
"step": 495
},
{
"epoch": 1.099889012208657,
"grad_norm": 1.358267392724399,
"learning_rate": 9.086819613135402e-07,
"loss": 0.4586,
"step": 496
},
{
"epoch": 1.1021087680355162,
"grad_norm": 1.1441452405187045,
"learning_rate": 9.082713545188189e-07,
"loss": 0.4505,
"step": 497
},
{
"epoch": 1.1043285238623752,
"grad_norm": 1.2397036751106139,
"learning_rate": 9.078590785907859e-07,
"loss": 0.4654,
"step": 498
},
{
"epoch": 1.106548279689234,
"grad_norm": 1.1197301651396006,
"learning_rate": 9.074451233310703e-07,
"loss": 0.4366,
"step": 499
},
{
"epoch": 1.1087680355160932,
"grad_norm": 1.7953863737465532,
"learning_rate": 9.070294784580499e-07,
"loss": 0.444,
"step": 500
},
{
"epoch": 1.1109877913429522,
"grad_norm": 1.2859250505007516,
"learning_rate": 9.066121336059985e-07,
"loss": 0.4709,
"step": 501
},
{
"epoch": 1.1132075471698113,
"grad_norm": 1.2037158956032694,
"learning_rate": 9.061930783242258e-07,
"loss": 0.4703,
"step": 502
},
{
"epoch": 1.1154273029966704,
"grad_norm": 1.172260705011405,
"learning_rate": 9.057723020762036e-07,
"loss": 0.4427,
"step": 503
},
{
"epoch": 1.1176470588235294,
"grad_norm": 1.3027716139586167,
"learning_rate": 9.053497942386829e-07,
"loss": 0.4536,
"step": 504
},
{
"epoch": 1.1198668146503885,
"grad_norm": 1.3774370449571356,
"learning_rate": 9.049255441008018e-07,
"loss": 0.4909,
"step": 505
},
{
"epoch": 1.1220865704772476,
"grad_norm": 1.447300148943117,
"learning_rate": 9.044995408631772e-07,
"loss": 0.4549,
"step": 506
},
{
"epoch": 1.1243063263041067,
"grad_norm": 1.2807259644533868,
"learning_rate": 9.040717736369911e-07,
"loss": 0.4341,
"step": 507
},
{
"epoch": 1.1265260821309655,
"grad_norm": 1.1295120428776313,
"learning_rate": 9.036422314430613e-07,
"loss": 0.444,
"step": 508
},
{
"epoch": 1.1287458379578246,
"grad_norm": 3.7694331207240075,
"learning_rate": 9.032109032109031e-07,
"loss": 0.4533,
"step": 509
},
{
"epoch": 1.1309655937846836,
"grad_norm": 1.2673358154101324,
"learning_rate": 9.027777777777778e-07,
"loss": 0.4547,
"step": 510
},
{
"epoch": 1.1331853496115427,
"grad_norm": 1.0849428645078705,
"learning_rate": 9.02342843887729e-07,
"loss": 0.448,
"step": 511
},
{
"epoch": 1.1354051054384018,
"grad_norm": 1.2751616613231416,
"learning_rate": 9.019060901906091e-07,
"loss": 0.4482,
"step": 512
},
{
"epoch": 1.1376248612652609,
"grad_norm": 1.5515951785162783,
"learning_rate": 9.014675052410901e-07,
"loss": 0.4327,
"step": 513
},
{
"epoch": 1.13984461709212,
"grad_norm": 1.2001179078592417,
"learning_rate": 9.010270774976657e-07,
"loss": 0.4714,
"step": 514
},
{
"epoch": 1.142064372918979,
"grad_norm": 1.2832027894003408,
"learning_rate": 9.005847953216374e-07,
"loss": 0.4276,
"step": 515
},
{
"epoch": 1.1442841287458378,
"grad_norm": 1.390415895978781,
"learning_rate": 9.0014064697609e-07,
"loss": 0.474,
"step": 516
},
{
"epoch": 1.146503884572697,
"grad_norm": 1.1340741814152506,
"learning_rate": 8.996946206248531e-07,
"loss": 0.4533,
"step": 517
},
{
"epoch": 1.148723640399556,
"grad_norm": 1.1940733998984663,
"learning_rate": 8.9924670433145e-07,
"loss": 0.453,
"step": 518
},
{
"epoch": 1.150943396226415,
"grad_norm": 1.2386850801704261,
"learning_rate": 8.987968860580326e-07,
"loss": 0.4656,
"step": 519
},
{
"epoch": 1.1531631520532741,
"grad_norm": 1.2055848872454982,
"learning_rate": 8.983451536643025e-07,
"loss": 0.4472,
"step": 520
},
{
"epoch": 1.1553829078801332,
"grad_norm": 1.238704680658842,
"learning_rate": 8.978914949064202e-07,
"loss": 0.4566,
"step": 521
},
{
"epoch": 1.1576026637069923,
"grad_norm": 1.66335973863715,
"learning_rate": 8.974358974358974e-07,
"loss": 0.4146,
"step": 522
},
{
"epoch": 1.1598224195338513,
"grad_norm": 1.86204877500857,
"learning_rate": 8.969783487984772e-07,
"loss": 0.4367,
"step": 523
},
{
"epoch": 1.1620421753607104,
"grad_norm": 1.3492062703447547,
"learning_rate": 8.965188364329994e-07,
"loss": 0.4767,
"step": 524
},
{
"epoch": 1.1642619311875695,
"grad_norm": 1.4884364595297717,
"learning_rate": 8.960573476702508e-07,
"loss": 0.4576,
"step": 525
},
{
"epoch": 1.1664816870144283,
"grad_norm": 1.0881816727421652,
"learning_rate": 8.955938697318007e-07,
"loss": 0.4477,
"step": 526
},
{
"epoch": 1.1687014428412874,
"grad_norm": 1.1883566141669124,
"learning_rate": 8.951283897288215e-07,
"loss": 0.4428,
"step": 527
},
{
"epoch": 1.1709211986681465,
"grad_norm": 1.2612670034718356,
"learning_rate": 8.946608946608947e-07,
"loss": 0.4731,
"step": 528
},
{
"epoch": 1.1731409544950056,
"grad_norm": 1.3074342572577153,
"learning_rate": 8.941913714147987e-07,
"loss": 0.4576,
"step": 529
},
{
"epoch": 1.1753607103218646,
"grad_norm": 1.251759051462499,
"learning_rate": 8.93719806763285e-07,
"loss": 0.485,
"step": 530
},
{
"epoch": 1.1775804661487237,
"grad_norm": 1.3767626029872162,
"learning_rate": 8.932461873638343e-07,
"loss": 0.4251,
"step": 531
},
{
"epoch": 1.1798002219755828,
"grad_norm": 1.3350789369730232,
"learning_rate": 8.927704997573992e-07,
"loss": 0.4418,
"step": 532
},
{
"epoch": 1.1820199778024416,
"grad_norm": 1.4193673394872068,
"learning_rate": 8.922927303671287e-07,
"loss": 0.441,
"step": 533
},
{
"epoch": 1.1842397336293007,
"grad_norm": 1.1272492205345337,
"learning_rate": 8.918128654970759e-07,
"loss": 0.4512,
"step": 534
},
{
"epoch": 1.1864594894561598,
"grad_norm": 1.3459032882268571,
"learning_rate": 8.913308913308914e-07,
"loss": 0.4599,
"step": 535
},
{
"epoch": 1.1886792452830188,
"grad_norm": 1.5931812342777447,
"learning_rate": 8.908467939304943e-07,
"loss": 0.4704,
"step": 536
},
{
"epoch": 1.190899001109878,
"grad_norm": 1.2904912837118774,
"learning_rate": 8.903605592347314e-07,
"loss": 0.4597,
"step": 537
},
{
"epoch": 1.193118756936737,
"grad_norm": 1.2366314987596871,
"learning_rate": 8.898721730580136e-07,
"loss": 0.4479,
"step": 538
},
{
"epoch": 1.195338512763596,
"grad_norm": 1.2669676274338157,
"learning_rate": 8.89381621088938e-07,
"loss": 0.4406,
"step": 539
},
{
"epoch": 1.197558268590455,
"grad_norm": 1.5122726418580126,
"learning_rate": 8.888888888888888e-07,
"loss": 0.4588,
"step": 540
},
{
"epoch": 1.1997780244173142,
"grad_norm": 1.6734165075786887,
"learning_rate": 8.88393961890621e-07,
"loss": 0.4644,
"step": 541
},
{
"epoch": 1.2019977802441733,
"grad_norm": 1.1915259000682752,
"learning_rate": 8.878968253968254e-07,
"loss": 0.4432,
"step": 542
},
{
"epoch": 1.204217536071032,
"grad_norm": 1.1243580352921179,
"learning_rate": 8.873974645786726e-07,
"loss": 0.4858,
"step": 543
},
{
"epoch": 1.2064372918978912,
"grad_norm": 1.4122350275641105,
"learning_rate": 8.868958644743398e-07,
"loss": 0.4413,
"step": 544
},
{
"epoch": 1.2086570477247502,
"grad_norm": 1.1853078309680358,
"learning_rate": 8.863920099875155e-07,
"loss": 0.4656,
"step": 545
},
{
"epoch": 1.2108768035516093,
"grad_norm": 1.2556536339152258,
"learning_rate": 8.858858858858857e-07,
"loss": 0.4379,
"step": 546
},
{
"epoch": 1.2130965593784684,
"grad_norm": 1.2397897495493284,
"learning_rate": 8.853774767995986e-07,
"loss": 0.4414,
"step": 547
},
{
"epoch": 1.2153163152053275,
"grad_norm": 1.1971508516589355,
"learning_rate": 8.848667672197082e-07,
"loss": 0.4527,
"step": 548
},
{
"epoch": 1.2175360710321865,
"grad_norm": 1.2293665933088456,
"learning_rate": 8.843537414965987e-07,
"loss": 0.4831,
"step": 549
},
{
"epoch": 1.2197558268590456,
"grad_norm": 1.2843242155663375,
"learning_rate": 8.838383838383837e-07,
"loss": 0.4388,
"step": 550
},
{
"epoch": 1.2219755826859044,
"grad_norm": 1.2647261857821326,
"learning_rate": 8.833206783092888e-07,
"loss": 0.4831,
"step": 551
},
{
"epoch": 1.2241953385127635,
"grad_norm": 1.2640883445235913,
"learning_rate": 8.82800608828006e-07,
"loss": 0.4592,
"step": 552
},
{
"epoch": 1.2264150943396226,
"grad_norm": 1.226695277140237,
"learning_rate": 8.822781591660309e-07,
"loss": 0.4496,
"step": 553
},
{
"epoch": 1.2286348501664817,
"grad_norm": 1.3665638844069508,
"learning_rate": 8.817533129459734e-07,
"loss": 0.4377,
"step": 554
},
{
"epoch": 1.2308546059933407,
"grad_norm": 1.1377261082753805,
"learning_rate": 8.812260536398466e-07,
"loss": 0.4619,
"step": 555
},
{
"epoch": 1.2330743618201998,
"grad_norm": 1.3404047561588681,
"learning_rate": 8.806963645673323e-07,
"loss": 0.4296,
"step": 556
},
{
"epoch": 1.2352941176470589,
"grad_norm": 1.239224322237675,
"learning_rate": 8.801642288940208e-07,
"loss": 0.4483,
"step": 557
},
{
"epoch": 1.237513873473918,
"grad_norm": 1.41317135673617,
"learning_rate": 8.796296296296296e-07,
"loss": 0.4726,
"step": 558
},
{
"epoch": 1.239733629300777,
"grad_norm": 1.2246570480421979,
"learning_rate": 8.790925496261922e-07,
"loss": 0.4657,
"step": 559
},
{
"epoch": 1.2419533851276359,
"grad_norm": 1.145084471090032,
"learning_rate": 8.785529715762273e-07,
"loss": 0.4681,
"step": 560
},
{
"epoch": 1.244173140954495,
"grad_norm": 1.1950811185929102,
"learning_rate": 8.780108780108779e-07,
"loss": 0.4571,
"step": 561
},
{
"epoch": 1.246392896781354,
"grad_norm": 1.6802303305526853,
"learning_rate": 8.774662512980269e-07,
"loss": 0.5134,
"step": 562
},
{
"epoch": 1.248612652608213,
"grad_norm": 1.399146976432964,
"learning_rate": 8.76919073640385e-07,
"loss": 0.4767,
"step": 563
},
{
"epoch": 1.2508324084350722,
"grad_norm": 1.3028199076735263,
"learning_rate": 8.763693270735523e-07,
"loss": 0.4544,
"step": 564
},
{
"epoch": 1.2530521642619312,
"grad_norm": 1.219614197871295,
"learning_rate": 8.758169934640523e-07,
"loss": 0.4519,
"step": 565
},
{
"epoch": 1.2552719200887903,
"grad_norm": 1.5682463354398537,
"learning_rate": 8.752620545073375e-07,
"loss": 0.4523,
"step": 566
},
{
"epoch": 1.2574916759156491,
"grad_norm": 1.3495608997045074,
"learning_rate": 8.747044917257683e-07,
"loss": 0.4495,
"step": 567
},
{
"epoch": 1.2597114317425082,
"grad_norm": 1.1434776149739643,
"learning_rate": 8.741442864665613e-07,
"loss": 0.4335,
"step": 568
},
{
"epoch": 1.2619311875693673,
"grad_norm": 1.2041213920739398,
"learning_rate": 8.735814198997095e-07,
"loss": 0.4563,
"step": 569
},
{
"epoch": 1.2641509433962264,
"grad_norm": 1.5799823438116973,
"learning_rate": 8.730158730158729e-07,
"loss": 0.4615,
"step": 570
},
{
"epoch": 1.2663706992230854,
"grad_norm": 1.1717065129212314,
"learning_rate": 8.724476266242374e-07,
"loss": 0.4475,
"step": 571
},
{
"epoch": 1.2685904550499445,
"grad_norm": 1.4247152366950546,
"learning_rate": 8.718766613503455e-07,
"loss": 0.4716,
"step": 572
},
{
"epoch": 1.2708102108768036,
"grad_norm": 1.1615063544388773,
"learning_rate": 8.713029576338928e-07,
"loss": 0.4679,
"step": 573
},
{
"epoch": 1.2730299667036626,
"grad_norm": 1.5053455438727286,
"learning_rate": 8.707264957264957e-07,
"loss": 0.4325,
"step": 574
},
{
"epoch": 1.2752497225305217,
"grad_norm": 1.3431594762650616,
"learning_rate": 8.701472556894243e-07,
"loss": 0.4643,
"step": 575
},
{
"epoch": 1.2774694783573808,
"grad_norm": 1.1121465911199522,
"learning_rate": 8.695652173913042e-07,
"loss": 0.4644,
"step": 576
},
{
"epoch": 1.2796892341842399,
"grad_norm": 1.3638372065146298,
"learning_rate": 8.689803605057842e-07,
"loss": 0.4508,
"step": 577
},
{
"epoch": 1.2819089900110987,
"grad_norm": 1.5304826167436472,
"learning_rate": 8.683926645091693e-07,
"loss": 0.4841,
"step": 578
},
{
"epoch": 1.2841287458379578,
"grad_norm": 1.3770541470358233,
"learning_rate": 8.678021086780211e-07,
"loss": 0.4744,
"step": 579
},
{
"epoch": 1.2863485016648168,
"grad_norm": 1.1966392163007855,
"learning_rate": 8.672086720867207e-07,
"loss": 0.464,
"step": 580
},
{
"epoch": 1.288568257491676,
"grad_norm": 1.575072361063041,
"learning_rate": 8.666123336049986e-07,
"loss": 0.447,
"step": 581
},
{
"epoch": 1.290788013318535,
"grad_norm": 1.4678136512766098,
"learning_rate": 8.660130718954247e-07,
"loss": 0.4647,
"step": 582
},
{
"epoch": 1.293007769145394,
"grad_norm": 1.2838206658949551,
"learning_rate": 8.654108654108654e-07,
"loss": 0.4491,
"step": 583
},
{
"epoch": 1.2952275249722531,
"grad_norm": 1.3185270999095682,
"learning_rate": 8.648056923918991e-07,
"loss": 0.4223,
"step": 584
},
{
"epoch": 1.297447280799112,
"grad_norm": 1.1670301771260112,
"learning_rate": 8.641975308641973e-07,
"loss": 0.4498,
"step": 585
},
{
"epoch": 1.299667036625971,
"grad_norm": 1.1945169622452185,
"learning_rate": 8.635863586358635e-07,
"loss": 0.4578,
"step": 586
},
{
"epoch": 1.3018867924528301,
"grad_norm": 1.3353650702342224,
"learning_rate": 8.629721532947337e-07,
"loss": 0.4327,
"step": 587
},
{
"epoch": 1.3041065482796892,
"grad_norm": 1.2887658691742558,
"learning_rate": 8.623548922056385e-07,
"loss": 0.4506,
"step": 588
},
{
"epoch": 1.3063263041065483,
"grad_norm": 1.3153941558260245,
"learning_rate": 8.617345525076198e-07,
"loss": 0.4619,
"step": 589
},
{
"epoch": 1.3085460599334073,
"grad_norm": 1.084957644832423,
"learning_rate": 8.611111111111111e-07,
"loss": 0.4689,
"step": 590
},
{
"epoch": 1.3107658157602664,
"grad_norm": 1.2958884686931098,
"learning_rate": 8.604845446950709e-07,
"loss": 0.4393,
"step": 591
},
{
"epoch": 1.3129855715871255,
"grad_norm": 1.2704036344255825,
"learning_rate": 8.598548297040758e-07,
"loss": 0.4621,
"step": 592
},
{
"epoch": 1.3152053274139845,
"grad_norm": 1.181804955291271,
"learning_rate": 8.592219423453679e-07,
"loss": 0.4512,
"step": 593
},
{
"epoch": 1.3174250832408436,
"grad_norm": 1.213923802037942,
"learning_rate": 8.585858585858585e-07,
"loss": 0.4351,
"step": 594
},
{
"epoch": 1.3196448390677027,
"grad_norm": 1.4427504658682893,
"learning_rate": 8.579465541490858e-07,
"loss": 0.4479,
"step": 595
},
{
"epoch": 1.3218645948945615,
"grad_norm": 1.2101667184282197,
"learning_rate": 8.573040045121262e-07,
"loss": 0.4531,
"step": 596
},
{
"epoch": 1.3240843507214206,
"grad_norm": 1.3091204432988863,
"learning_rate": 8.566581849024596e-07,
"loss": 0.4551,
"step": 597
},
{
"epoch": 1.3263041065482797,
"grad_norm": 1.148635010119325,
"learning_rate": 8.560090702947845e-07,
"loss": 0.4581,
"step": 598
},
{
"epoch": 1.3285238623751388,
"grad_norm": 1.3555133948699525,
"learning_rate": 8.553566354077861e-07,
"loss": 0.4502,
"step": 599
},
{
"epoch": 1.3307436182019978,
"grad_norm": 1.206500098872861,
"learning_rate": 8.547008547008546e-07,
"loss": 0.4621,
"step": 600
},
{
"epoch": 1.332963374028857,
"grad_norm": 1.4696355238727896,
"learning_rate": 8.540417023707511e-07,
"loss": 0.3981,
"step": 601
},
{
"epoch": 1.3351831298557157,
"grad_norm": 1.2754461892433568,
"learning_rate": 8.533791523482246e-07,
"loss": 0.433,
"step": 602
},
{
"epoch": 1.3374028856825748,
"grad_norm": 1.1181521102343912,
"learning_rate": 8.527131782945737e-07,
"loss": 0.446,
"step": 603
},
{
"epoch": 1.3396226415094339,
"grad_norm": 1.2949604135230177,
"learning_rate": 8.520437535981577e-07,
"loss": 0.4226,
"step": 604
},
{
"epoch": 1.341842397336293,
"grad_norm": 2.189494969676846,
"learning_rate": 8.513708513708513e-07,
"loss": 0.4601,
"step": 605
},
{
"epoch": 1.344062153163152,
"grad_norm": 1.3067648560060685,
"learning_rate": 8.506944444444443e-07,
"loss": 0.484,
"step": 606
},
{
"epoch": 1.346281908990011,
"grad_norm": 1.3900814826076342,
"learning_rate": 8.500145053669857e-07,
"loss": 0.4759,
"step": 607
},
{
"epoch": 1.3485016648168702,
"grad_norm": 1.1602214205697887,
"learning_rate": 8.493310063990691e-07,
"loss": 0.4253,
"step": 608
},
{
"epoch": 1.3507214206437292,
"grad_norm": 1.152221622632636,
"learning_rate": 8.486439195100612e-07,
"loss": 0.4041,
"step": 609
},
{
"epoch": 1.3529411764705883,
"grad_norm": 2.0055544359824835,
"learning_rate": 8.479532163742691e-07,
"loss": 0.4666,
"step": 610
},
{
"epoch": 1.3551609322974474,
"grad_norm": 1.1841127652388874,
"learning_rate": 8.472588683670478e-07,
"loss": 0.4699,
"step": 611
},
{
"epoch": 1.3573806881243065,
"grad_norm": 1.2389603354426622,
"learning_rate": 8.465608465608465e-07,
"loss": 0.443,
"step": 612
},
{
"epoch": 1.3596004439511653,
"grad_norm": 1.1564359890832918,
"learning_rate": 8.458591217211907e-07,
"loss": 0.4429,
"step": 613
},
{
"epoch": 1.3618201997780244,
"grad_norm": 1.41227328222579,
"learning_rate": 8.451536643026004e-07,
"loss": 0.4454,
"step": 614
},
{
"epoch": 1.3640399556048834,
"grad_norm": 1.1079876664577466,
"learning_rate": 8.444444444444443e-07,
"loss": 0.4752,
"step": 615
},
{
"epoch": 1.3662597114317425,
"grad_norm": 1.1899499269279072,
"learning_rate": 8.437314319667262e-07,
"loss": 0.4702,
"step": 616
},
{
"epoch": 1.3684794672586016,
"grad_norm": 1.3226199727516845,
"learning_rate": 8.430145963658029e-07,
"loss": 0.4412,
"step": 617
},
{
"epoch": 1.3706992230854607,
"grad_norm": 1.3142527110965414,
"learning_rate": 8.422939068100359e-07,
"loss": 0.4631,
"step": 618
},
{
"epoch": 1.3729189789123195,
"grad_norm": 1.2703215823532485,
"learning_rate": 8.415693321353698e-07,
"loss": 0.4096,
"step": 619
},
{
"epoch": 1.3751387347391786,
"grad_norm": 1.1283519440796594,
"learning_rate": 8.408408408408408e-07,
"loss": 0.4582,
"step": 620
},
{
"epoch": 1.3773584905660377,
"grad_norm": 1.1637932666394057,
"learning_rate": 8.401084010840107e-07,
"loss": 0.4424,
"step": 621
},
{
"epoch": 1.3795782463928967,
"grad_norm": 1.3654727451004818,
"learning_rate": 8.393719806763283e-07,
"loss": 0.4493,
"step": 622
},
{
"epoch": 1.3817980022197558,
"grad_norm": 1.6030535704205855,
"learning_rate": 8.386315470784134e-07,
"loss": 0.4474,
"step": 623
},
{
"epoch": 1.3840177580466149,
"grad_norm": 11.646261061824008,
"learning_rate": 8.378870673952642e-07,
"loss": 0.4641,
"step": 624
},
{
"epoch": 1.386237513873474,
"grad_norm": 1.6285236345285772,
"learning_rate": 8.371385083713851e-07,
"loss": 0.4587,
"step": 625
},
{
"epoch": 1.388457269700333,
"grad_norm": 1.258299361572604,
"learning_rate": 8.363858363858362e-07,
"loss": 0.4583,
"step": 626
},
{
"epoch": 1.390677025527192,
"grad_norm": 1.382346104349446,
"learning_rate": 8.356290174471993e-07,
"loss": 0.4608,
"step": 627
},
{
"epoch": 1.3928967813540512,
"grad_norm": 1.6092979440429054,
"learning_rate": 8.348680171884591e-07,
"loss": 0.517,
"step": 628
},
{
"epoch": 1.3951165371809102,
"grad_norm": 1.1511935463486243,
"learning_rate": 8.341028008618035e-07,
"loss": 0.4187,
"step": 629
},
{
"epoch": 1.397336293007769,
"grad_norm": 1.4893180915287862,
"learning_rate": 8.333333333333332e-07,
"loss": 0.4465,
"step": 630
},
{
"epoch": 1.3995560488346281,
"grad_norm": 1.1654172290058302,
"learning_rate": 8.325595790776849e-07,
"loss": 0.4592,
"step": 631
},
{
"epoch": 1.4017758046614872,
"grad_norm": 1.2468188833825287,
"learning_rate": 8.317815021725636e-07,
"loss": 0.4525,
"step": 632
},
{
"epoch": 1.4039955604883463,
"grad_norm": 1.1758197640195867,
"learning_rate": 8.309990662931839e-07,
"loss": 0.4227,
"step": 633
},
{
"epoch": 1.4062153163152054,
"grad_norm": 1.302048112682148,
"learning_rate": 8.302122347066167e-07,
"loss": 0.4494,
"step": 634
},
{
"epoch": 1.4084350721420644,
"grad_norm": 1.3410878474631271,
"learning_rate": 8.294209702660407e-07,
"loss": 0.429,
"step": 635
},
{
"epoch": 1.4106548279689235,
"grad_norm": 1.5784172471454891,
"learning_rate": 8.286252354048963e-07,
"loss": 0.4518,
"step": 636
},
{
"epoch": 1.4128745837957823,
"grad_norm": 1.3266078326407231,
"learning_rate": 8.27824992130941e-07,
"loss": 0.4561,
"step": 637
},
{
"epoch": 1.4150943396226414,
"grad_norm": 1.1730794847154962,
"learning_rate": 8.27020202020202e-07,
"loss": 0.4498,
"step": 638
},
{
"epoch": 1.4173140954495005,
"grad_norm": 1.4070270520482746,
"learning_rate": 8.262108262108262e-07,
"loss": 0.4474,
"step": 639
},
{
"epoch": 1.4195338512763596,
"grad_norm": 1.213825849940393,
"learning_rate": 8.253968253968254e-07,
"loss": 0.4538,
"step": 640
},
{
"epoch": 1.4217536071032186,
"grad_norm": 1.1622227713378128,
"learning_rate": 8.245781598217128e-07,
"loss": 0.4265,
"step": 641
},
{
"epoch": 1.4239733629300777,
"grad_norm": 1.3722638535641742,
"learning_rate": 8.237547892720306e-07,
"loss": 0.4232,
"step": 642
},
{
"epoch": 1.4261931187569368,
"grad_norm": 2.166606954999253,
"learning_rate": 8.229266730707652e-07,
"loss": 0.4542,
"step": 643
},
{
"epoch": 1.4284128745837958,
"grad_norm": 1.1966806088779693,
"learning_rate": 8.220937700706485e-07,
"loss": 0.4427,
"step": 644
},
{
"epoch": 1.430632630410655,
"grad_norm": 1.2876269724582017,
"learning_rate": 8.212560386473431e-07,
"loss": 0.451,
"step": 645
},
{
"epoch": 1.432852386237514,
"grad_norm": 1.2565208495830507,
"learning_rate": 8.204134366925064e-07,
"loss": 0.4819,
"step": 646
},
{
"epoch": 1.435072142064373,
"grad_norm": 1.1280465608470205,
"learning_rate": 8.19565921606738e-07,
"loss": 0.4408,
"step": 647
},
{
"epoch": 1.437291897891232,
"grad_norm": 1.1454315806648652,
"learning_rate": 8.187134502923975e-07,
"loss": 0.4647,
"step": 648
},
{
"epoch": 1.439511653718091,
"grad_norm": 1.1995949528833032,
"learning_rate": 8.178559791463017e-07,
"loss": 0.4449,
"step": 649
},
{
"epoch": 1.44173140954495,
"grad_norm": 1.213107151128081,
"learning_rate": 8.169934640522875e-07,
"loss": 0.4423,
"step": 650
},
{
"epoch": 1.4439511653718091,
"grad_norm": 1.1669562141188647,
"learning_rate": 8.161258603736478e-07,
"loss": 0.4564,
"step": 651
},
{
"epoch": 1.4461709211986682,
"grad_norm": 1.2542020168248378,
"learning_rate": 8.152531229454307e-07,
"loss": 0.4807,
"step": 652
},
{
"epoch": 1.4483906770255273,
"grad_norm": 1.2337597702311685,
"learning_rate": 8.143752060666006e-07,
"loss": 0.4578,
"step": 653
},
{
"epoch": 1.4506104328523861,
"grad_norm": 1.1163506747553094,
"learning_rate": 8.134920634920636e-07,
"loss": 0.4507,
"step": 654
},
{
"epoch": 1.4528301886792452,
"grad_norm": 1.2061276153808267,
"learning_rate": 8.126036484245439e-07,
"loss": 0.4692,
"step": 655
},
{
"epoch": 1.4550499445061043,
"grad_norm": 1.1005759181356636,
"learning_rate": 8.117099135063206e-07,
"loss": 0.4205,
"step": 656
},
{
"epoch": 1.4572697003329633,
"grad_norm": 1.1848316729798107,
"learning_rate": 8.108108108108107e-07,
"loss": 0.441,
"step": 657
},
{
"epoch": 1.4594894561598224,
"grad_norm": 1.19691331615668,
"learning_rate": 8.099062918340026e-07,
"loss": 0.4295,
"step": 658
},
{
"epoch": 1.4617092119866815,
"grad_norm": 1.469767362231428,
"learning_rate": 8.089963074857335e-07,
"loss": 0.4715,
"step": 659
},
{
"epoch": 1.4639289678135405,
"grad_norm": 1.3533624301127642,
"learning_rate": 8.080808080808079e-07,
"loss": 0.4351,
"step": 660
},
{
"epoch": 1.4661487236403996,
"grad_norm": 1.2855778063785157,
"learning_rate": 8.071597433299561e-07,
"loss": 0.4668,
"step": 661
},
{
"epoch": 1.4683684794672587,
"grad_norm": 1.2831187882443813,
"learning_rate": 8.062330623306233e-07,
"loss": 0.4395,
"step": 662
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.2080799785694776,
"learning_rate": 8.053007135575944e-07,
"loss": 0.4555,
"step": 663
},
{
"epoch": 1.4728079911209768,
"grad_norm": 1.5842263462761013,
"learning_rate": 8.043626448534423e-07,
"loss": 0.4186,
"step": 664
},
{
"epoch": 1.4750277469478357,
"grad_norm": 1.5084192437587065,
"learning_rate": 8.034188034188033e-07,
"loss": 0.4542,
"step": 665
},
{
"epoch": 1.4772475027746947,
"grad_norm": 1.2075109855494262,
"learning_rate": 8.02469135802469e-07,
"loss": 0.4295,
"step": 666
},
{
"epoch": 1.4794672586015538,
"grad_norm": 1.3944924708355404,
"learning_rate": 8.015135878912968e-07,
"loss": 0.4511,
"step": 667
},
{
"epoch": 1.4816870144284129,
"grad_norm": 1.1322175488542188,
"learning_rate": 8.005521048999311e-07,
"loss": 0.4442,
"step": 668
},
{
"epoch": 1.483906770255272,
"grad_norm": 1.196489191309896,
"learning_rate": 7.995846313603322e-07,
"loss": 0.4798,
"step": 669
},
{
"epoch": 1.486126526082131,
"grad_norm": 1.2141015327602915,
"learning_rate": 7.986111111111112e-07,
"loss": 0.4623,
"step": 670
},
{
"epoch": 1.4883462819089899,
"grad_norm": 1.1570484813036512,
"learning_rate": 7.976314872866596e-07,
"loss": 0.4358,
"step": 671
},
{
"epoch": 1.490566037735849,
"grad_norm": 1.1307507968914452,
"learning_rate": 7.966457023060795e-07,
"loss": 0.4929,
"step": 672
},
{
"epoch": 1.492785793562708,
"grad_norm": 1.3352988179015075,
"learning_rate": 7.956536978618998e-07,
"loss": 0.4622,
"step": 673
},
{
"epoch": 1.495005549389567,
"grad_norm": 1.2820634824407855,
"learning_rate": 7.946554149085794e-07,
"loss": 0.4454,
"step": 674
},
{
"epoch": 1.4972253052164262,
"grad_norm": 1.1826692037429019,
"learning_rate": 7.936507936507937e-07,
"loss": 0.4604,
"step": 675
},
{
"epoch": 1.4994450610432852,
"grad_norm": 1.3318875370925376,
"learning_rate": 7.926397735314932e-07,
"loss": 0.4625,
"step": 676
},
{
"epoch": 1.5016648168701443,
"grad_norm": 1.2519631673745286,
"learning_rate": 7.916222932197372e-07,
"loss": 0.4385,
"step": 677
},
{
"epoch": 1.5038845726970034,
"grad_norm": 1.2914098984220168,
"learning_rate": 7.905982905982905e-07,
"loss": 0.4353,
"step": 678
},
{
"epoch": 1.5061043285238624,
"grad_norm": 1.1899729635623018,
"learning_rate": 7.895677027509823e-07,
"loss": 0.443,
"step": 679
},
{
"epoch": 1.5083240843507215,
"grad_norm": 1.190444535694286,
"learning_rate": 7.885304659498207e-07,
"loss": 0.4614,
"step": 680
},
{
"epoch": 1.5105438401775806,
"grad_norm": 1.1706400844430933,
"learning_rate": 7.874865156418553e-07,
"loss": 0.4331,
"step": 681
},
{
"epoch": 1.5127635960044397,
"grad_norm": 1.4812029130894295,
"learning_rate": 7.864357864357864e-07,
"loss": 0.458,
"step": 682
},
{
"epoch": 1.5149833518312985,
"grad_norm": 1.5221542759514808,
"learning_rate": 7.853782120883096e-07,
"loss": 0.4573,
"step": 683
},
{
"epoch": 1.5172031076581576,
"grad_norm": 1.178963451844953,
"learning_rate": 7.843137254901962e-07,
"loss": 0.4401,
"step": 684
},
{
"epoch": 1.5194228634850167,
"grad_norm": 1.1351996419308124,
"learning_rate": 7.832422586520947e-07,
"loss": 0.4916,
"step": 685
},
{
"epoch": 1.5216426193118757,
"grad_norm": 1.09628101612322,
"learning_rate": 7.821637426900585e-07,
"loss": 0.4332,
"step": 686
},
{
"epoch": 1.5238623751387348,
"grad_norm": 1.213021644918351,
"learning_rate": 7.81078107810781e-07,
"loss": 0.4498,
"step": 687
},
{
"epoch": 1.5260821309655936,
"grad_norm": 1.1545827521479424,
"learning_rate": 7.799852832965415e-07,
"loss": 0.4458,
"step": 688
},
{
"epoch": 1.5283018867924527,
"grad_norm": 1.4669602766610934,
"learning_rate": 7.788851974898486e-07,
"loss": 0.453,
"step": 689
},
{
"epoch": 1.5305216426193118,
"grad_norm": 1.2805205310095251,
"learning_rate": 7.777777777777777e-07,
"loss": 0.4475,
"step": 690
},
{
"epoch": 1.5327413984461709,
"grad_norm": 1.356427768134019,
"learning_rate": 7.76662950575994e-07,
"loss": 0.4363,
"step": 691
},
{
"epoch": 1.53496115427303,
"grad_norm": 2.519051772383195,
"learning_rate": 7.755406413124534e-07,
"loss": 0.4676,
"step": 692
},
{
"epoch": 1.537180910099889,
"grad_norm": 1.4450670338822147,
"learning_rate": 7.744107744107744e-07,
"loss": 0.452,
"step": 693
},
{
"epoch": 1.539400665926748,
"grad_norm": 1.2099248717480078,
"learning_rate": 7.732732732732732e-07,
"loss": 0.4321,
"step": 694
},
{
"epoch": 1.5416204217536071,
"grad_norm": 1.090590842333066,
"learning_rate": 7.721280602636534e-07,
"loss": 0.4705,
"step": 695
},
{
"epoch": 1.5438401775804662,
"grad_norm": 1.3443127837215874,
"learning_rate": 7.709750566893423e-07,
"loss": 0.4407,
"step": 696
},
{
"epoch": 1.5460599334073253,
"grad_norm": 1.1806195813187879,
"learning_rate": 7.698141827834659e-07,
"loss": 0.4495,
"step": 697
},
{
"epoch": 1.5482796892341844,
"grad_norm": 1.3811319584265567,
"learning_rate": 7.686453576864536e-07,
"loss": 0.4396,
"step": 698
},
{
"epoch": 1.5504994450610434,
"grad_norm": 1.6274469192909502,
"learning_rate": 7.674684994272622e-07,
"loss": 0.4607,
"step": 699
},
{
"epoch": 1.5527192008879025,
"grad_norm": 1.1151313287955884,
"learning_rate": 7.662835249042146e-07,
"loss": 0.4654,
"step": 700
},
{
"epoch": 1.5549389567147613,
"grad_norm": 1.107422673857088,
"learning_rate": 7.650903498654364e-07,
"loss": 0.4654,
"step": 701
},
{
"epoch": 1.5571587125416204,
"grad_norm": 3.3935262710809413,
"learning_rate": 7.638888888888888e-07,
"loss": 0.4136,
"step": 702
},
{
"epoch": 1.5593784683684795,
"grad_norm": 1.3044030455695086,
"learning_rate": 7.626790553619821e-07,
"loss": 0.4806,
"step": 703
},
{
"epoch": 1.5615982241953386,
"grad_norm": 1.6105121644457865,
"learning_rate": 7.614607614607614e-07,
"loss": 0.458,
"step": 704
},
{
"epoch": 1.5638179800221974,
"grad_norm": 1.1574126639573554,
"learning_rate": 7.602339181286549e-07,
"loss": 0.4326,
"step": 705
},
{
"epoch": 1.5660377358490565,
"grad_norm": 2.166517215956068,
"learning_rate": 7.589984350547731e-07,
"loss": 0.4513,
"step": 706
},
{
"epoch": 1.5682574916759155,
"grad_norm": 1.264720713924714,
"learning_rate": 7.577542206517472e-07,
"loss": 0.4527,
"step": 707
},
{
"epoch": 1.5704772475027746,
"grad_norm": 1.1937319067279388,
"learning_rate": 7.565011820330969e-07,
"loss": 0.4309,
"step": 708
},
{
"epoch": 1.5726970033296337,
"grad_norm": 1.3157411175887248,
"learning_rate": 7.552392249901146e-07,
"loss": 0.4419,
"step": 709
},
{
"epoch": 1.5749167591564928,
"grad_norm": 1.6021534077017254,
"learning_rate": 7.539682539682539e-07,
"loss": 0.4693,
"step": 710
},
{
"epoch": 1.5771365149833518,
"grad_norm": 1.3815775452863337,
"learning_rate": 7.526881720430106e-07,
"loss": 0.4225,
"step": 711
},
{
"epoch": 1.579356270810211,
"grad_norm": 1.1669990156242764,
"learning_rate": 7.513988808952837e-07,
"loss": 0.4427,
"step": 712
},
{
"epoch": 1.58157602663707,
"grad_norm": 1.336716126017333,
"learning_rate": 7.501002807862012e-07,
"loss": 0.4764,
"step": 713
},
{
"epoch": 1.583795782463929,
"grad_norm": 1.1491190746227358,
"learning_rate": 7.48792270531401e-07,
"loss": 0.4724,
"step": 714
},
{
"epoch": 1.5860155382907881,
"grad_norm": 1.4714988469601522,
"learning_rate": 7.474747474747474e-07,
"loss": 0.4381,
"step": 715
},
{
"epoch": 1.5882352941176472,
"grad_norm": 1.330675570974309,
"learning_rate": 7.46147607461476e-07,
"loss": 0.4319,
"step": 716
},
{
"epoch": 1.5904550499445063,
"grad_norm": 1.1325025765131167,
"learning_rate": 7.448107448107446e-07,
"loss": 0.4531,
"step": 717
},
{
"epoch": 1.592674805771365,
"grad_norm": 1.324548351381287,
"learning_rate": 7.434640522875816e-07,
"loss": 0.4659,
"step": 718
},
{
"epoch": 1.5948945615982242,
"grad_norm": 1.3011201860764343,
"learning_rate": 7.421074210742108e-07,
"loss": 0.4439,
"step": 719
},
{
"epoch": 1.5971143174250833,
"grad_norm": 1.6130819582835183,
"learning_rate": 7.407407407407406e-07,
"loss": 0.4421,
"step": 720
},
{
"epoch": 1.5993340732519423,
"grad_norm": 1.5662724419530687,
"learning_rate": 7.393638992152003e-07,
"loss": 0.4602,
"step": 721
},
{
"epoch": 1.6015538290788012,
"grad_norm": 1.1942408444514543,
"learning_rate": 7.37976782752902e-07,
"loss": 0.4527,
"step": 722
},
{
"epoch": 1.6037735849056602,
"grad_norm": 1.2859091953871118,
"learning_rate": 7.365792759051186e-07,
"loss": 0.4335,
"step": 723
},
{
"epoch": 1.6059933407325193,
"grad_norm": 1.3567371753220179,
"learning_rate": 7.351712614870509e-07,
"loss": 0.4482,
"step": 724
},
{
"epoch": 1.6082130965593784,
"grad_norm": 1.227995279262155,
"learning_rate": 7.337526205450733e-07,
"loss": 0.435,
"step": 725
},
{
"epoch": 1.6104328523862375,
"grad_norm": 1.894681494953077,
"learning_rate": 7.323232323232324e-07,
"loss": 0.4,
"step": 726
},
{
"epoch": 1.6126526082130965,
"grad_norm": 1.7711751997207135,
"learning_rate": 7.308829742289818e-07,
"loss": 0.4532,
"step": 727
},
{
"epoch": 1.6148723640399556,
"grad_norm": 1.2189223180486914,
"learning_rate": 7.294317217981341e-07,
"loss": 0.4361,
"step": 728
},
{
"epoch": 1.6170921198668147,
"grad_norm": 1.221687708753864,
"learning_rate": 7.279693486590037e-07,
"loss": 0.4503,
"step": 729
},
{
"epoch": 1.6193118756936737,
"grad_norm": 1.4298000334341903,
"learning_rate": 7.264957264957265e-07,
"loss": 0.4704,
"step": 730
},
{
"epoch": 1.6215316315205328,
"grad_norm": 1.435956678641849,
"learning_rate": 7.250107250107249e-07,
"loss": 0.4317,
"step": 731
},
{
"epoch": 1.6237513873473919,
"grad_norm": 1.1874120488254374,
"learning_rate": 7.235142118863048e-07,
"loss": 0.4452,
"step": 732
},
{
"epoch": 1.625971143174251,
"grad_norm": 1.2271468200041469,
"learning_rate": 7.220060527453523e-07,
"loss": 0.4732,
"step": 733
},
{
"epoch": 1.62819089900111,
"grad_norm": 1.256323602386584,
"learning_rate": 7.204861111111112e-07,
"loss": 0.4229,
"step": 734
},
{
"epoch": 1.6304106548279689,
"grad_norm": 1.2589209693187742,
"learning_rate": 7.189542483660131e-07,
"loss": 0.4629,
"step": 735
},
{
"epoch": 1.632630410654828,
"grad_norm": 1.1321916007978945,
"learning_rate": 7.174103237095362e-07,
"loss": 0.4413,
"step": 736
},
{
"epoch": 1.634850166481687,
"grad_norm": 1.2308758075551893,
"learning_rate": 7.158541941150637e-07,
"loss": 0.4484,
"step": 737
},
{
"epoch": 1.637069922308546,
"grad_norm": 1.4080143965250038,
"learning_rate": 7.142857142857143e-07,
"loss": 0.4315,
"step": 738
},
{
"epoch": 1.6392896781354052,
"grad_norm": 1.1972733836836713,
"learning_rate": 7.12704736609119e-07,
"loss": 0.4617,
"step": 739
},
{
"epoch": 1.641509433962264,
"grad_norm": 1.444178694974046,
"learning_rate": 7.11111111111111e-07,
"loss": 0.4308,
"step": 740
},
{
"epoch": 1.643729189789123,
"grad_norm": 1.9630648369634558,
"learning_rate": 7.095046854082998e-07,
"loss": 0.4358,
"step": 741
},
{
"epoch": 1.6459489456159822,
"grad_norm": 1.0478295513396192,
"learning_rate": 7.078853046594981e-07,
"loss": 0.4354,
"step": 742
},
{
"epoch": 1.6481687014428412,
"grad_norm": 1.5259200054970437,
"learning_rate": 7.062528115159693e-07,
"loss": 0.4542,
"step": 743
},
{
"epoch": 1.6503884572697003,
"grad_norm": 1.3033791571298337,
"learning_rate": 7.046070460704607e-07,
"loss": 0.4604,
"step": 744
},
{
"epoch": 1.6526082130965594,
"grad_norm": 1.2686542276794928,
"learning_rate": 7.029478458049886e-07,
"loss": 0.4468,
"step": 745
},
{
"epoch": 1.6548279689234184,
"grad_norm": 1.389909299627517,
"learning_rate": 7.012750455373405e-07,
"loss": 0.4311,
"step": 746
},
{
"epoch": 1.6570477247502775,
"grad_norm": 1.2863040122952574,
"learning_rate": 6.99588477366255e-07,
"loss": 0.4397,
"step": 747
},
{
"epoch": 1.6592674805771366,
"grad_norm": 1.5830202579995283,
"learning_rate": 6.978879706152433e-07,
"loss": 0.4405,
"step": 748
},
{
"epoch": 1.6614872364039956,
"grad_norm": 1.5295124154011601,
"learning_rate": 6.961733517750114e-07,
"loss": 0.461,
"step": 749
},
{
"epoch": 1.6637069922308547,
"grad_norm": 1.2908712828825855,
"learning_rate": 6.944444444444444e-07,
"loss": 0.4427,
"step": 750
},
{
"epoch": 1.6659267480577138,
"grad_norm": 1.3248070106181145,
"learning_rate": 6.92701069270107e-07,
"loss": 0.4571,
"step": 751
},
{
"epoch": 1.6681465038845729,
"grad_norm": 1.9717448900035122,
"learning_rate": 6.909430438842202e-07,
"loss": 0.454,
"step": 752
},
{
"epoch": 1.6703662597114317,
"grad_norm": 1.1474725389983733,
"learning_rate": 6.891701828410688e-07,
"loss": 0.4518,
"step": 753
},
{
"epoch": 1.6725860155382908,
"grad_norm": 1.1422547185949583,
"learning_rate": 6.87382297551789e-07,
"loss": 0.4319,
"step": 754
},
{
"epoch": 1.6748057713651499,
"grad_norm": 1.3008377268645959,
"learning_rate": 6.855791962174942e-07,
"loss": 0.4496,
"step": 755
},
{
"epoch": 1.677025527192009,
"grad_norm": 2.5374443382266274,
"learning_rate": 6.837606837606838e-07,
"loss": 0.4627,
"step": 756
},
{
"epoch": 1.6792452830188678,
"grad_norm": 1.3126351528616036,
"learning_rate": 6.819265617548878e-07,
"loss": 0.474,
"step": 757
},
{
"epoch": 1.6814650388457268,
"grad_norm": 1.1532261673764852,
"learning_rate": 6.800766283524904e-07,
"loss": 0.4146,
"step": 758
},
{
"epoch": 1.683684794672586,
"grad_norm": 1.2578942626083018,
"learning_rate": 6.782106782106782e-07,
"loss": 0.4427,
"step": 759
},
{
"epoch": 1.685904550499445,
"grad_norm": 1.171921369101573,
"learning_rate": 6.763285024154589e-07,
"loss": 0.4545,
"step": 760
},
{
"epoch": 1.688124306326304,
"grad_norm": 1.336852590496218,
"learning_rate": 6.744298884036874e-07,
"loss": 0.4391,
"step": 761
},
{
"epoch": 1.6903440621531631,
"grad_norm": 1.567111800099075,
"learning_rate": 6.725146198830411e-07,
"loss": 0.4275,
"step": 762
},
{
"epoch": 1.6925638179800222,
"grad_norm": 1.232121475002925,
"learning_rate": 6.705824767498777e-07,
"loss": 0.451,
"step": 763
},
{
"epoch": 1.6947835738068813,
"grad_norm": 1.588473784999118,
"learning_rate": 6.686332350049164e-07,
"loss": 0.4423,
"step": 764
},
{
"epoch": 1.6970033296337403,
"grad_norm": 1.1992763714793029,
"learning_rate": 6.666666666666666e-07,
"loss": 0.4518,
"step": 765
},
{
"epoch": 1.6992230854605994,
"grad_norm": 1.2848849550190784,
"learning_rate": 6.646825396825396e-07,
"loss": 0.4525,
"step": 766
},
{
"epoch": 1.7014428412874585,
"grad_norm": 1.1951992270325973,
"learning_rate": 6.626806178375684e-07,
"loss": 0.4299,
"step": 767
},
{
"epoch": 1.7036625971143176,
"grad_norm": 1.376140764505536,
"learning_rate": 6.606606606606606e-07,
"loss": 0.4398,
"step": 768
},
{
"epoch": 1.7058823529411766,
"grad_norm": 1.2201459082778296,
"learning_rate": 6.586224233283057e-07,
"loss": 0.4319,
"step": 769
},
{
"epoch": 1.7081021087680355,
"grad_norm": 1.3935864054018974,
"learning_rate": 6.565656565656566e-07,
"loss": 0.4729,
"step": 770
},
{
"epoch": 1.7103218645948945,
"grad_norm": 1.5233429855618519,
"learning_rate": 6.54490106544901e-07,
"loss": 0.4468,
"step": 771
},
{
"epoch": 1.7125416204217536,
"grad_norm": 1.2950847008110937,
"learning_rate": 6.523955147808359e-07,
"loss": 0.4488,
"step": 772
},
{
"epoch": 1.7147613762486127,
"grad_norm": 1.1452116937420966,
"learning_rate": 6.502816180235534e-07,
"loss": 0.4654,
"step": 773
},
{
"epoch": 1.7169811320754715,
"grad_norm": 1.595146736998358,
"learning_rate": 6.481481481481481e-07,
"loss": 0.4497,
"step": 774
},
{
"epoch": 1.7192008879023306,
"grad_norm": 1.3087928114755394,
"learning_rate": 6.459948320413435e-07,
"loss": 0.4484,
"step": 775
},
{
"epoch": 1.7214206437291897,
"grad_norm": 1.8195299066925699,
"learning_rate": 6.438213914849428e-07,
"loss": 0.4467,
"step": 776
},
{
"epoch": 1.7236403995560488,
"grad_norm": 1.0905833294557823,
"learning_rate": 6.416275430359938e-07,
"loss": 0.4479,
"step": 777
},
{
"epoch": 1.7258601553829078,
"grad_norm": 1.3285142739698166,
"learning_rate": 6.394129979035639e-07,
"loss": 0.4532,
"step": 778
},
{
"epoch": 1.728079911209767,
"grad_norm": 1.263445398115666,
"learning_rate": 6.371774618220115e-07,
"loss": 0.4311,
"step": 779
},
{
"epoch": 1.730299667036626,
"grad_norm": 1.2264280529869596,
"learning_rate": 6.349206349206349e-07,
"loss": 0.4214,
"step": 780
},
{
"epoch": 1.732519422863485,
"grad_norm": 1.369934602758291,
"learning_rate": 6.326422115895799e-07,
"loss": 0.4458,
"step": 781
},
{
"epoch": 1.734739178690344,
"grad_norm": 1.5901424052124737,
"learning_rate": 6.303418803418803e-07,
"loss": 0.4239,
"step": 782
},
{
"epoch": 1.7369589345172032,
"grad_norm": 1.580914386725684,
"learning_rate": 6.280193236714975e-07,
"loss": 0.433,
"step": 783
},
{
"epoch": 1.7391786903440623,
"grad_norm": 1.9007391003117418,
"learning_rate": 6.256742179072276e-07,
"loss": 0.4532,
"step": 784
},
{
"epoch": 1.7413984461709213,
"grad_norm": 1.307226977166083,
"learning_rate": 6.233062330623306e-07,
"loss": 0.4782,
"step": 785
},
{
"epoch": 1.7436182019977804,
"grad_norm": 1.2581634341891148,
"learning_rate": 6.209150326797385e-07,
"loss": 0.4772,
"step": 786
},
{
"epoch": 1.7458379578246392,
"grad_norm": 1.3123852294397897,
"learning_rate": 6.185002736726874e-07,
"loss": 0.4502,
"step": 787
},
{
"epoch": 1.7480577136514983,
"grad_norm": 1.315432695618896,
"learning_rate": 6.16061606160616e-07,
"loss": 0.4793,
"step": 788
},
{
"epoch": 1.7502774694783574,
"grad_norm": 1.3331739427739435,
"learning_rate": 6.135986733001658e-07,
"loss": 0.4351,
"step": 789
},
{
"epoch": 1.7524972253052165,
"grad_norm": 1.1750674279086586,
"learning_rate": 6.111111111111112e-07,
"loss": 0.4221,
"step": 790
},
{
"epoch": 1.7547169811320755,
"grad_norm": 1.154791123197388,
"learning_rate": 6.085985482970407e-07,
"loss": 0.4308,
"step": 791
},
{
"epoch": 1.7569367369589344,
"grad_norm": 1.3127678091741015,
"learning_rate": 6.060606060606061e-07,
"loss": 0.4453,
"step": 792
},
{
"epoch": 1.7591564927857934,
"grad_norm": 1.2866260542006003,
"learning_rate": 6.034968979131415e-07,
"loss": 0.4532,
"step": 793
},
{
"epoch": 1.7613762486126525,
"grad_norm": 1.2476359497788672,
"learning_rate": 6.009070294784579e-07,
"loss": 0.4429,
"step": 794
},
{
"epoch": 1.7635960044395116,
"grad_norm": 1.1932146525046583,
"learning_rate": 5.982905982905982e-07,
"loss": 0.4782,
"step": 795
},
{
"epoch": 1.7658157602663707,
"grad_norm": 1.2447276146507429,
"learning_rate": 5.956471935853378e-07,
"loss": 0.4509,
"step": 796
},
{
"epoch": 1.7680355160932297,
"grad_norm": 1.154358093852454,
"learning_rate": 5.929763960852043e-07,
"loss": 0.4278,
"step": 797
},
{
"epoch": 1.7702552719200888,
"grad_norm": 1.1339930293132137,
"learning_rate": 5.902777777777778e-07,
"loss": 0.4396,
"step": 798
},
{
"epoch": 1.7724750277469479,
"grad_norm": 1.353145494940799,
"learning_rate": 5.875509016870273e-07,
"loss": 0.4506,
"step": 799
},
{
"epoch": 1.774694783573807,
"grad_norm": 1.3269883420055035,
"learning_rate": 5.847953216374269e-07,
"loss": 0.4377,
"step": 800
},
{
"epoch": 1.776914539400666,
"grad_norm": 1.7657131677608475,
"learning_rate": 5.820105820105819e-07,
"loss": 0.4807,
"step": 801
},
{
"epoch": 1.779134295227525,
"grad_norm": 1.2762956654212174,
"learning_rate": 5.791962174940897e-07,
"loss": 0.4496,
"step": 802
},
{
"epoch": 1.7813540510543842,
"grad_norm": 1.447030988390941,
"learning_rate": 5.763517528223411e-07,
"loss": 0.4488,
"step": 803
},
{
"epoch": 1.7835738068812432,
"grad_norm": 1.4386048928316586,
"learning_rate": 5.734767025089605e-07,
"loss": 0.4639,
"step": 804
},
{
"epoch": 1.785793562708102,
"grad_norm": 1.2290691126902766,
"learning_rate": 5.705705705705706e-07,
"loss": 0.501,
"step": 805
},
{
"epoch": 1.7880133185349611,
"grad_norm": 1.1130486023477824,
"learning_rate": 5.67632850241546e-07,
"loss": 0.4492,
"step": 806
},
{
"epoch": 1.7902330743618202,
"grad_norm": 1.27395889325988,
"learning_rate": 5.646630236794171e-07,
"loss": 0.4267,
"step": 807
},
{
"epoch": 1.7924528301886793,
"grad_norm": 1.081240362636203,
"learning_rate": 5.616605616605615e-07,
"loss": 0.4457,
"step": 808
},
{
"epoch": 1.7946725860155381,
"grad_norm": 1.2001786727599388,
"learning_rate": 5.586249232658072e-07,
"loss": 0.4244,
"step": 809
},
{
"epoch": 1.7968923418423972,
"grad_norm": 1.2942776238812326,
"learning_rate": 5.555555555555554e-07,
"loss": 0.4396,
"step": 810
},
{
"epoch": 1.7991120976692563,
"grad_norm": 1.3095924522448517,
"learning_rate": 5.524518932340161e-07,
"loss": 0.4394,
"step": 811
},
{
"epoch": 1.8013318534961154,
"grad_norm": 1.1837983669154903,
"learning_rate": 5.493133583021223e-07,
"loss": 0.4739,
"step": 812
},
{
"epoch": 1.8035516093229744,
"grad_norm": 1.3161140920438619,
"learning_rate": 5.461393596986818e-07,
"loss": 0.4466,
"step": 813
},
{
"epoch": 1.8057713651498335,
"grad_norm": 1.4519854795085234,
"learning_rate": 5.42929292929293e-07,
"loss": 0.4476,
"step": 814
},
{
"epoch": 1.8079911209766926,
"grad_norm": 1.1806490538563992,
"learning_rate": 5.396825396825396e-07,
"loss": 0.4523,
"step": 815
},
{
"epoch": 1.8102108768035516,
"grad_norm": 1.2972477037538859,
"learning_rate": 5.363984674329501e-07,
"loss": 0.4516,
"step": 816
},
{
"epoch": 1.8124306326304107,
"grad_norm": 1.2156981658718196,
"learning_rate": 5.330764290301862e-07,
"loss": 0.4682,
"step": 817
},
{
"epoch": 1.8146503884572698,
"grad_norm": 1.3383256730414803,
"learning_rate": 5.297157622739017e-07,
"loss": 0.4441,
"step": 818
},
{
"epoch": 1.8168701442841289,
"grad_norm": 1.1561855543216444,
"learning_rate": 5.263157894736842e-07,
"loss": 0.4385,
"step": 819
},
{
"epoch": 1.819089900110988,
"grad_norm": 1.3166365086952017,
"learning_rate": 5.22875816993464e-07,
"loss": 0.4412,
"step": 820
},
{
"epoch": 1.821309655937847,
"grad_norm": 1.3947882182298343,
"learning_rate": 5.193951347797501e-07,
"loss": 0.4431,
"step": 821
},
{
"epoch": 1.8235294117647058,
"grad_norm": 1.2940515335047955,
"learning_rate": 5.158730158730157e-07,
"loss": 0.4718,
"step": 822
},
{
"epoch": 1.825749167591565,
"grad_norm": 1.6597858983745644,
"learning_rate": 5.123087159015301e-07,
"loss": 0.4494,
"step": 823
},
{
"epoch": 1.827968923418424,
"grad_norm": 1.3713070534311818,
"learning_rate": 5.087014725568942e-07,
"loss": 0.458,
"step": 824
},
{
"epoch": 1.830188679245283,
"grad_norm": 1.3214602635726282,
"learning_rate": 5.050505050505049e-07,
"loss": 0.4208,
"step": 825
},
{
"epoch": 1.832408435072142,
"grad_norm": 1.19733028360872,
"learning_rate": 5.013550135501355e-07,
"loss": 0.4495,
"step": 826
},
{
"epoch": 1.834628190899001,
"grad_norm": 1.1839632614546902,
"learning_rate": 4.976141785957736e-07,
"loss": 0.4593,
"step": 827
},
{
"epoch": 1.83684794672586,
"grad_norm": 1.3407995957608345,
"learning_rate": 4.938271604938272e-07,
"loss": 0.4534,
"step": 828
},
{
"epoch": 1.8390677025527191,
"grad_norm": 1.1447226640936632,
"learning_rate": 4.899930986887508e-07,
"loss": 0.4243,
"step": 829
},
{
"epoch": 1.8412874583795782,
"grad_norm": 1.1716186165798743,
"learning_rate": 4.86111111111111e-07,
"loss": 0.4254,
"step": 830
},
{
"epoch": 1.8435072142064373,
"grad_norm": 1.4155964443562337,
"learning_rate": 4.821802935010482e-07,
"loss": 0.4827,
"step": 831
},
{
"epoch": 1.8457269700332963,
"grad_norm": 1.0602142782272528,
"learning_rate": 4.781997187060478e-07,
"loss": 0.4235,
"step": 832
},
{
"epoch": 1.8479467258601554,
"grad_norm": 2.6338388785846347,
"learning_rate": 4.741684359518754e-07,
"loss": 0.4381,
"step": 833
},
{
"epoch": 1.8501664816870145,
"grad_norm": 1.2197211361897908,
"learning_rate": 4.7008547008547005e-07,
"loss": 0.4388,
"step": 834
},
{
"epoch": 1.8523862375138735,
"grad_norm": 1.7745318375917518,
"learning_rate": 4.659498207885304e-07,
"loss": 0.4871,
"step": 835
},
{
"epoch": 1.8546059933407326,
"grad_norm": 1.721485464100693,
"learning_rate": 4.6176046176046174e-07,
"loss": 0.4741,
"step": 836
},
{
"epoch": 1.8568257491675917,
"grad_norm": 1.1767003327204806,
"learning_rate": 4.57516339869281e-07,
"loss": 0.4103,
"step": 837
},
{
"epoch": 1.8590455049944508,
"grad_norm": 1.6104783471864577,
"learning_rate": 4.532163742690058e-07,
"loss": 0.4706,
"step": 838
},
{
"epoch": 1.8612652608213096,
"grad_norm": 1.237482652598919,
"learning_rate": 4.48859455481972e-07,
"loss": 0.4352,
"step": 839
},
{
"epoch": 1.8634850166481687,
"grad_norm": 1.5155898726016581,
"learning_rate": 4.4444444444444433e-07,
"loss": 0.4588,
"step": 840
},
{
"epoch": 1.8657047724750278,
"grad_norm": 2.617740589380866,
"learning_rate": 4.399701715137956e-07,
"loss": 0.4668,
"step": 841
},
{
"epoch": 1.8679245283018868,
"grad_norm": 1.1020239968511378,
"learning_rate": 4.3543543543543544e-07,
"loss": 0.4349,
"step": 842
},
{
"epoch": 1.870144284128746,
"grad_norm": 1.35587074648194,
"learning_rate": 4.308390022675737e-07,
"loss": 0.4259,
"step": 843
},
{
"epoch": 1.8723640399556047,
"grad_norm": 1.1609067209173856,
"learning_rate": 4.26179604261796e-07,
"loss": 0.4586,
"step": 844
},
{
"epoch": 1.8745837957824638,
"grad_norm": 1.1568537405414596,
"learning_rate": 4.214559386973179e-07,
"loss": 0.4667,
"step": 845
},
{
"epoch": 1.8768035516093229,
"grad_norm": 1.1053999396633238,
"learning_rate": 4.166666666666666e-07,
"loss": 0.4447,
"step": 846
},
{
"epoch": 1.879023307436182,
"grad_norm": 1.1469720416952864,
"learning_rate": 4.118104118104118e-07,
"loss": 0.4105,
"step": 847
},
{
"epoch": 1.881243063263041,
"grad_norm": 1.4950241181155033,
"learning_rate": 4.0688575899843503e-07,
"loss": 0.4523,
"step": 848
},
{
"epoch": 1.8834628190899,
"grad_norm": 1.783385866887097,
"learning_rate": 4.0189125295508264e-07,
"loss": 0.473,
"step": 849
},
{
"epoch": 1.8856825749167592,
"grad_norm": 1.3445672475572865,
"learning_rate": 3.968253968253968e-07,
"loss": 0.4438,
"step": 850
},
{
"epoch": 1.8879023307436182,
"grad_norm": 1.2800024731322417,
"learning_rate": 3.9168665067945643e-07,
"loss": 0.4613,
"step": 851
},
{
"epoch": 1.8901220865704773,
"grad_norm": 1.1359156205032364,
"learning_rate": 3.864734299516908e-07,
"loss": 0.4135,
"step": 852
},
{
"epoch": 1.8923418423973364,
"grad_norm": 1.1663061656581661,
"learning_rate": 3.81184103811841e-07,
"loss": 0.4661,
"step": 853
},
{
"epoch": 1.8945615982241955,
"grad_norm": 1.079911454850862,
"learning_rate": 3.758169934640523e-07,
"loss": 0.3968,
"step": 854
},
{
"epoch": 1.8967813540510545,
"grad_norm": 1.1354316719905342,
"learning_rate": 3.703703703703703e-07,
"loss": 0.4685,
"step": 855
},
{
"epoch": 1.8990011098779136,
"grad_norm": 1.2597607085859803,
"learning_rate": 3.6484245439469314e-07,
"loss": 0.4482,
"step": 856
},
{
"epoch": 1.9012208657047724,
"grad_norm": 1.2607408697644482,
"learning_rate": 3.592314118629908e-07,
"loss": 0.4292,
"step": 857
},
{
"epoch": 1.9034406215316315,
"grad_norm": 1.3646381579934892,
"learning_rate": 3.535353535353535e-07,
"loss": 0.4717,
"step": 858
},
{
"epoch": 1.9056603773584906,
"grad_norm": 1.1311629124430989,
"learning_rate": 3.477523324851569e-07,
"loss": 0.4686,
"step": 859
},
{
"epoch": 1.9078801331853497,
"grad_norm": 1.3891775849805408,
"learning_rate": 3.4188034188034184e-07,
"loss": 0.4417,
"step": 860
},
{
"epoch": 1.9100998890122085,
"grad_norm": 1.2686903993236105,
"learning_rate": 3.359173126614987e-07,
"loss": 0.4435,
"step": 861
},
{
"epoch": 1.9123196448390676,
"grad_norm": 1.23697755590465,
"learning_rate": 3.298611111111111e-07,
"loss": 0.444,
"step": 862
},
{
"epoch": 1.9145394006659266,
"grad_norm": 1.1646889799724214,
"learning_rate": 3.237095363079614e-07,
"loss": 0.4461,
"step": 863
},
{
"epoch": 1.9167591564927857,
"grad_norm": 1.1690789125536099,
"learning_rate": 3.1746031746031743e-07,
"loss": 0.4381,
"step": 864
},
{
"epoch": 1.9189789123196448,
"grad_norm": 1.856131712657273,
"learning_rate": 3.11111111111111e-07,
"loss": 0.4303,
"step": 865
},
{
"epoch": 1.9211986681465039,
"grad_norm": 1.2861751372732355,
"learning_rate": 3.046594982078853e-07,
"loss": 0.4721,
"step": 866
},
{
"epoch": 1.923418423973363,
"grad_norm": 1.1448253628790064,
"learning_rate": 2.9810298102981023e-07,
"loss": 0.4303,
"step": 867
},
{
"epoch": 1.925638179800222,
"grad_norm": 1.5061396832602665,
"learning_rate": 2.914389799635701e-07,
"loss": 0.4459,
"step": 868
},
{
"epoch": 1.927857935627081,
"grad_norm": 1.6049126455575795,
"learning_rate": 2.846648301193756e-07,
"loss": 0.5042,
"step": 869
},
{
"epoch": 1.9300776914539401,
"grad_norm": 1.3466952658104057,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.4514,
"step": 870
},
{
"epoch": 1.9322974472807992,
"grad_norm": 1.2359889482809692,
"learning_rate": 2.707749766573296e-07,
"loss": 0.4615,
"step": 871
},
{
"epoch": 1.9345172031076583,
"grad_norm": 1.3803667899720569,
"learning_rate": 2.63653483992467e-07,
"loss": 0.4753,
"step": 872
},
{
"epoch": 1.9367369589345174,
"grad_norm": 1.3418543813232986,
"learning_rate": 2.5641025641025636e-07,
"loss": 0.4426,
"step": 873
},
{
"epoch": 1.9389567147613762,
"grad_norm": 1.1541425203967928,
"learning_rate": 2.4904214559386974e-07,
"loss": 0.436,
"step": 874
},
{
"epoch": 1.9411764705882353,
"grad_norm": 1.2229762566685807,
"learning_rate": 2.4154589371980677e-07,
"loss": 0.4561,
"step": 875
},
{
"epoch": 1.9433962264150944,
"grad_norm": 1.365624157409681,
"learning_rate": 2.3391812865497075e-07,
"loss": 0.4738,
"step": 876
},
{
"epoch": 1.9456159822419534,
"grad_norm": 1.0957773012671013,
"learning_rate": 2.2615535889872173e-07,
"loss": 0.4272,
"step": 877
},
{
"epoch": 1.9478357380688123,
"grad_norm": 1.2975570955631437,
"learning_rate": 2.1825396825396822e-07,
"loss": 0.4617,
"step": 878
},
{
"epoch": 1.9500554938956713,
"grad_norm": 1.1855387460284055,
"learning_rate": 2.1021021021021017e-07,
"loss": 0.4445,
"step": 879
},
{
"epoch": 1.9522752497225304,
"grad_norm": 1.2084720652589793,
"learning_rate": 2.0202020202020197e-07,
"loss": 0.4423,
"step": 880
},
{
"epoch": 1.9544950055493895,
"grad_norm": 1.1657717383702717,
"learning_rate": 1.9367991845056064e-07,
"loss": 0.4102,
"step": 881
},
{
"epoch": 1.9567147613762486,
"grad_norm": 1.1672852942067808,
"learning_rate": 1.8518518518518516e-07,
"loss": 0.4333,
"step": 882
},
{
"epoch": 1.9589345172031076,
"grad_norm": 1.2025436670810457,
"learning_rate": 1.7653167185877466e-07,
"loss": 0.4474,
"step": 883
},
{
"epoch": 1.9611542730299667,
"grad_norm": 1.1252788533397284,
"learning_rate": 1.6771488469601673e-07,
"loss": 0.4477,
"step": 884
},
{
"epoch": 1.9633740288568258,
"grad_norm": 1.1170346454418365,
"learning_rate": 1.5873015873015872e-07,
"loss": 0.4553,
"step": 885
},
{
"epoch": 1.9655937846836848,
"grad_norm": 1.2907160443119068,
"learning_rate": 1.4957264957264952e-07,
"loss": 0.4592,
"step": 886
},
{
"epoch": 1.967813540510544,
"grad_norm": 1.2266346238748143,
"learning_rate": 1.4023732470334413e-07,
"loss": 0.4694,
"step": 887
},
{
"epoch": 1.970033296337403,
"grad_norm": 1.4351882414602743,
"learning_rate": 1.30718954248366e-07,
"loss": 0.4587,
"step": 888
},
{
"epoch": 1.972253052164262,
"grad_norm": 1.1407373000182206,
"learning_rate": 1.2101210121012102e-07,
"loss": 0.443,
"step": 889
},
{
"epoch": 1.9744728079911211,
"grad_norm": 1.0938698878218327,
"learning_rate": 1.1111111111111108e-07,
"loss": 0.4529,
"step": 890
},
{
"epoch": 1.97669256381798,
"grad_norm": 1.8232355279940384,
"learning_rate": 1.01010101010101e-07,
"loss": 0.463,
"step": 891
},
{
"epoch": 1.978912319644839,
"grad_norm": 1.2544295442701097,
"learning_rate": 9.070294784580498e-08,
"loss": 0.4321,
"step": 892
},
{
"epoch": 1.9811320754716981,
"grad_norm": 1.4748170150022004,
"learning_rate": 8.018327605956471e-08,
"loss": 0.4776,
"step": 893
},
{
"epoch": 1.9833518312985572,
"grad_norm": 1.278573261536441,
"learning_rate": 6.944444444444444e-08,
"loss": 0.4288,
"step": 894
},
{
"epoch": 1.9855715871254163,
"grad_norm": 1.2558621483454577,
"learning_rate": 5.8479532163742687e-08,
"loss": 0.46,
"step": 895
},
{
"epoch": 1.987791342952275,
"grad_norm": 1.3026759598014417,
"learning_rate": 4.7281323877068556e-08,
"loss": 0.4658,
"step": 896
},
{
"epoch": 1.9900110987791342,
"grad_norm": 1.5203732374448793,
"learning_rate": 3.584229390681003e-08,
"loss": 0.4388,
"step": 897
},
{
"epoch": 1.9922308546059933,
"grad_norm": 1.4656249651411055,
"learning_rate": 2.4154589371980675e-08,
"loss": 0.4527,
"step": 898
},
{
"epoch": 1.9944506104328523,
"grad_norm": 4.406801594675205,
"learning_rate": 1.221001221001221e-08,
"loss": 0.4363,
"step": 899
},
{
"epoch": 1.9966703662597114,
"grad_norm": 1.2228266856157815,
"learning_rate": 0,
"loss": 0.4676,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 900,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 225,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.993206625754153e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}