1epoch_OpenThoughts-114k / trainer_state.json
sedrickkeh's picture
End of training
7348916 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997192588433464,
"eval_steps": 500,
"global_step": 1187,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008422234699606962,
"grad_norm": 5.7768874168396,
"learning_rate": 8.403361344537815e-08,
"loss": 0.8113,
"step": 1
},
{
"epoch": 0.0016844469399213925,
"grad_norm": 5.731680393218994,
"learning_rate": 1.680672268907563e-07,
"loss": 0.818,
"step": 2
},
{
"epoch": 0.0025266704098820887,
"grad_norm": 5.816023349761963,
"learning_rate": 2.5210084033613445e-07,
"loss": 0.842,
"step": 3
},
{
"epoch": 0.003368893879842785,
"grad_norm": 5.791347503662109,
"learning_rate": 3.361344537815126e-07,
"loss": 0.842,
"step": 4
},
{
"epoch": 0.004211117349803481,
"grad_norm": 5.896840572357178,
"learning_rate": 4.201680672268908e-07,
"loss": 0.8589,
"step": 5
},
{
"epoch": 0.0050533408197641775,
"grad_norm": 5.944826602935791,
"learning_rate": 5.042016806722689e-07,
"loss": 0.8664,
"step": 6
},
{
"epoch": 0.005895564289724873,
"grad_norm": 5.9079694747924805,
"learning_rate": 5.882352941176471e-07,
"loss": 0.8953,
"step": 7
},
{
"epoch": 0.00673778775968557,
"grad_norm": 5.896533489227295,
"learning_rate": 6.722689075630252e-07,
"loss": 0.8605,
"step": 8
},
{
"epoch": 0.007580011229646266,
"grad_norm": 5.51594352722168,
"learning_rate": 7.563025210084034e-07,
"loss": 0.8391,
"step": 9
},
{
"epoch": 0.008422234699606962,
"grad_norm": 5.275195121765137,
"learning_rate": 8.403361344537816e-07,
"loss": 0.8306,
"step": 10
},
{
"epoch": 0.009264458169567658,
"grad_norm": 5.213021278381348,
"learning_rate": 9.243697478991598e-07,
"loss": 0.8346,
"step": 11
},
{
"epoch": 0.010106681639528355,
"grad_norm": 4.408111572265625,
"learning_rate": 1.0084033613445378e-06,
"loss": 0.8056,
"step": 12
},
{
"epoch": 0.010948905109489052,
"grad_norm": 4.242059230804443,
"learning_rate": 1.092436974789916e-06,
"loss": 0.8157,
"step": 13
},
{
"epoch": 0.011791128579449747,
"grad_norm": 4.183578014373779,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.8271,
"step": 14
},
{
"epoch": 0.012633352049410443,
"grad_norm": 3.4327707290649414,
"learning_rate": 1.2605042016806724e-06,
"loss": 0.7981,
"step": 15
},
{
"epoch": 0.01347557551937114,
"grad_norm": 2.344697952270508,
"learning_rate": 1.3445378151260504e-06,
"loss": 0.7111,
"step": 16
},
{
"epoch": 0.014317798989331837,
"grad_norm": 2.276667833328247,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.7833,
"step": 17
},
{
"epoch": 0.015160022459292532,
"grad_norm": 2.071214437484741,
"learning_rate": 1.5126050420168068e-06,
"loss": 0.7482,
"step": 18
},
{
"epoch": 0.016002245929253228,
"grad_norm": 1.9965254068374634,
"learning_rate": 1.5966386554621848e-06,
"loss": 0.7637,
"step": 19
},
{
"epoch": 0.016844469399213923,
"grad_norm": 1.9486117362976074,
"learning_rate": 1.6806722689075632e-06,
"loss": 0.7394,
"step": 20
},
{
"epoch": 0.01768669286917462,
"grad_norm": 2.220632314682007,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.7298,
"step": 21
},
{
"epoch": 0.018528916339135316,
"grad_norm": 2.720287799835205,
"learning_rate": 1.8487394957983196e-06,
"loss": 0.7119,
"step": 22
},
{
"epoch": 0.019371139809096015,
"grad_norm": 2.9575459957122803,
"learning_rate": 1.932773109243698e-06,
"loss": 0.6989,
"step": 23
},
{
"epoch": 0.02021336327905671,
"grad_norm": 3.2116012573242188,
"learning_rate": 2.0168067226890756e-06,
"loss": 0.7339,
"step": 24
},
{
"epoch": 0.021055586749017405,
"grad_norm": 2.941859722137451,
"learning_rate": 2.100840336134454e-06,
"loss": 0.7393,
"step": 25
},
{
"epoch": 0.021897810218978103,
"grad_norm": 2.7433841228485107,
"learning_rate": 2.184873949579832e-06,
"loss": 0.7494,
"step": 26
},
{
"epoch": 0.022740033688938798,
"grad_norm": 2.205691337585449,
"learning_rate": 2.2689075630252102e-06,
"loss": 0.7106,
"step": 27
},
{
"epoch": 0.023582257158899493,
"grad_norm": 1.7549865245819092,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.6942,
"step": 28
},
{
"epoch": 0.02442448062886019,
"grad_norm": 1.133491039276123,
"learning_rate": 2.4369747899159667e-06,
"loss": 0.6527,
"step": 29
},
{
"epoch": 0.025266704098820886,
"grad_norm": 1.1498465538024902,
"learning_rate": 2.521008403361345e-06,
"loss": 0.7023,
"step": 30
},
{
"epoch": 0.026108927568781585,
"grad_norm": 1.1065810918807983,
"learning_rate": 2.605042016806723e-06,
"loss": 0.6585,
"step": 31
},
{
"epoch": 0.02695115103874228,
"grad_norm": 1.1229432821273804,
"learning_rate": 2.689075630252101e-06,
"loss": 0.6625,
"step": 32
},
{
"epoch": 0.027793374508702975,
"grad_norm": 1.0526050329208374,
"learning_rate": 2.7731092436974795e-06,
"loss": 0.6456,
"step": 33
},
{
"epoch": 0.028635597978663673,
"grad_norm": 1.0395145416259766,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.629,
"step": 34
},
{
"epoch": 0.029477821448624368,
"grad_norm": 1.0640065670013428,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.661,
"step": 35
},
{
"epoch": 0.030320044918585063,
"grad_norm": 0.8369417786598206,
"learning_rate": 3.0252100840336137e-06,
"loss": 0.6184,
"step": 36
},
{
"epoch": 0.03116226838854576,
"grad_norm": 0.7107870578765869,
"learning_rate": 3.109243697478992e-06,
"loss": 0.6024,
"step": 37
},
{
"epoch": 0.032004491858506456,
"grad_norm": 0.6917080283164978,
"learning_rate": 3.1932773109243696e-06,
"loss": 0.6407,
"step": 38
},
{
"epoch": 0.032846715328467155,
"grad_norm": 0.7435740828514099,
"learning_rate": 3.2773109243697483e-06,
"loss": 0.642,
"step": 39
},
{
"epoch": 0.033688938798427846,
"grad_norm": 0.7839305996894836,
"learning_rate": 3.3613445378151265e-06,
"loss": 0.6068,
"step": 40
},
{
"epoch": 0.034531162268388545,
"grad_norm": 0.8303804993629456,
"learning_rate": 3.4453781512605043e-06,
"loss": 0.6189,
"step": 41
},
{
"epoch": 0.03537338573834924,
"grad_norm": 0.7673403024673462,
"learning_rate": 3.529411764705883e-06,
"loss": 0.6036,
"step": 42
},
{
"epoch": 0.03621560920830994,
"grad_norm": 0.6664486527442932,
"learning_rate": 3.6134453781512607e-06,
"loss": 0.614,
"step": 43
},
{
"epoch": 0.03705783267827063,
"grad_norm": 0.5672026872634888,
"learning_rate": 3.6974789915966393e-06,
"loss": 0.5862,
"step": 44
},
{
"epoch": 0.03790005614823133,
"grad_norm": 0.7759891152381897,
"learning_rate": 3.781512605042017e-06,
"loss": 0.6116,
"step": 45
},
{
"epoch": 0.03874227961819203,
"grad_norm": 0.7109480500221252,
"learning_rate": 3.865546218487396e-06,
"loss": 0.5891,
"step": 46
},
{
"epoch": 0.03958450308815272,
"grad_norm": 0.7215145826339722,
"learning_rate": 3.9495798319327735e-06,
"loss": 0.6177,
"step": 47
},
{
"epoch": 0.04042672655811342,
"grad_norm": 0.5773827433586121,
"learning_rate": 4.033613445378151e-06,
"loss": 0.5719,
"step": 48
},
{
"epoch": 0.04126895002807412,
"grad_norm": 0.6216230988502502,
"learning_rate": 4.11764705882353e-06,
"loss": 0.5907,
"step": 49
},
{
"epoch": 0.04211117349803481,
"grad_norm": 0.5774328708648682,
"learning_rate": 4.201680672268908e-06,
"loss": 0.5462,
"step": 50
},
{
"epoch": 0.04295339696799551,
"grad_norm": 0.592668890953064,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.5845,
"step": 51
},
{
"epoch": 0.043795620437956206,
"grad_norm": 0.6327844858169556,
"learning_rate": 4.369747899159664e-06,
"loss": 0.5814,
"step": 52
},
{
"epoch": 0.0446378439079169,
"grad_norm": 0.603449285030365,
"learning_rate": 4.453781512605043e-06,
"loss": 0.6044,
"step": 53
},
{
"epoch": 0.045480067377877596,
"grad_norm": 0.5610288977622986,
"learning_rate": 4.5378151260504205e-06,
"loss": 0.597,
"step": 54
},
{
"epoch": 0.046322290847838295,
"grad_norm": 0.49728789925575256,
"learning_rate": 4.621848739495799e-06,
"loss": 0.6002,
"step": 55
},
{
"epoch": 0.047164514317798986,
"grad_norm": 0.5607268214225769,
"learning_rate": 4.705882352941177e-06,
"loss": 0.5839,
"step": 56
},
{
"epoch": 0.048006737787759685,
"grad_norm": 0.4914703965187073,
"learning_rate": 4.7899159663865555e-06,
"loss": 0.582,
"step": 57
},
{
"epoch": 0.04884896125772038,
"grad_norm": 0.4450504183769226,
"learning_rate": 4.873949579831933e-06,
"loss": 0.5652,
"step": 58
},
{
"epoch": 0.04969118472768108,
"grad_norm": 0.4177859127521515,
"learning_rate": 4.957983193277311e-06,
"loss": 0.5312,
"step": 59
},
{
"epoch": 0.05053340819764177,
"grad_norm": 0.47395533323287964,
"learning_rate": 5.04201680672269e-06,
"loss": 0.6046,
"step": 60
},
{
"epoch": 0.05137563166760247,
"grad_norm": 0.43919098377227783,
"learning_rate": 5.1260504201680675e-06,
"loss": 0.5607,
"step": 61
},
{
"epoch": 0.05221785513756317,
"grad_norm": 0.47760307788848877,
"learning_rate": 5.210084033613446e-06,
"loss": 0.5549,
"step": 62
},
{
"epoch": 0.05306007860752386,
"grad_norm": 0.4238983392715454,
"learning_rate": 5.294117647058824e-06,
"loss": 0.5652,
"step": 63
},
{
"epoch": 0.05390230207748456,
"grad_norm": 0.4531624913215637,
"learning_rate": 5.378151260504202e-06,
"loss": 0.5299,
"step": 64
},
{
"epoch": 0.05474452554744526,
"grad_norm": 0.4546626806259155,
"learning_rate": 5.4621848739495795e-06,
"loss": 0.5402,
"step": 65
},
{
"epoch": 0.05558674901740595,
"grad_norm": 0.4843672811985016,
"learning_rate": 5.546218487394959e-06,
"loss": 0.5637,
"step": 66
},
{
"epoch": 0.05642897248736665,
"grad_norm": 0.4193810522556305,
"learning_rate": 5.630252100840337e-06,
"loss": 0.5422,
"step": 67
},
{
"epoch": 0.057271195957327346,
"grad_norm": 0.40377169847488403,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.5283,
"step": 68
},
{
"epoch": 0.05811341942728804,
"grad_norm": 0.4971763491630554,
"learning_rate": 5.798319327731093e-06,
"loss": 0.5506,
"step": 69
},
{
"epoch": 0.058955642897248736,
"grad_norm": 0.5227189660072327,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5351,
"step": 70
},
{
"epoch": 0.059797866367209435,
"grad_norm": 0.4623558223247528,
"learning_rate": 5.9663865546218495e-06,
"loss": 0.5565,
"step": 71
},
{
"epoch": 0.060640089837170126,
"grad_norm": 0.4405473470687866,
"learning_rate": 6.050420168067227e-06,
"loss": 0.5565,
"step": 72
},
{
"epoch": 0.061482313307130824,
"grad_norm": 0.47143077850341797,
"learning_rate": 6.134453781512606e-06,
"loss": 0.5559,
"step": 73
},
{
"epoch": 0.06232453677709152,
"grad_norm": 0.4616853892803192,
"learning_rate": 6.218487394957984e-06,
"loss": 0.5442,
"step": 74
},
{
"epoch": 0.06316676024705221,
"grad_norm": 0.48797038197517395,
"learning_rate": 6.3025210084033615e-06,
"loss": 0.5266,
"step": 75
},
{
"epoch": 0.06400898371701291,
"grad_norm": 0.47729045152664185,
"learning_rate": 6.386554621848739e-06,
"loss": 0.514,
"step": 76
},
{
"epoch": 0.06485120718697361,
"grad_norm": 0.4070008099079132,
"learning_rate": 6.470588235294119e-06,
"loss": 0.5352,
"step": 77
},
{
"epoch": 0.06569343065693431,
"grad_norm": 0.4604280889034271,
"learning_rate": 6.5546218487394966e-06,
"loss": 0.5761,
"step": 78
},
{
"epoch": 0.06653565412689501,
"grad_norm": 0.41718626022338867,
"learning_rate": 6.638655462184874e-06,
"loss": 0.5269,
"step": 79
},
{
"epoch": 0.06737787759685569,
"grad_norm": 0.44142675399780273,
"learning_rate": 6.722689075630253e-06,
"loss": 0.5572,
"step": 80
},
{
"epoch": 0.06822010106681639,
"grad_norm": 0.4859495759010315,
"learning_rate": 6.806722689075631e-06,
"loss": 0.5411,
"step": 81
},
{
"epoch": 0.06906232453677709,
"grad_norm": 0.4969702959060669,
"learning_rate": 6.8907563025210085e-06,
"loss": 0.5432,
"step": 82
},
{
"epoch": 0.06990454800673779,
"grad_norm": 0.42125970125198364,
"learning_rate": 6.974789915966387e-06,
"loss": 0.5136,
"step": 83
},
{
"epoch": 0.07074677147669849,
"grad_norm": 0.43726083636283875,
"learning_rate": 7.058823529411766e-06,
"loss": 0.5413,
"step": 84
},
{
"epoch": 0.07158899494665918,
"grad_norm": 0.4877779483795166,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.5187,
"step": 85
},
{
"epoch": 0.07243121841661988,
"grad_norm": 0.44341692328453064,
"learning_rate": 7.226890756302521e-06,
"loss": 0.5633,
"step": 86
},
{
"epoch": 0.07327344188658057,
"grad_norm": 0.473922997713089,
"learning_rate": 7.310924369747899e-06,
"loss": 0.539,
"step": 87
},
{
"epoch": 0.07411566535654127,
"grad_norm": 0.49199458956718445,
"learning_rate": 7.394957983193279e-06,
"loss": 0.5605,
"step": 88
},
{
"epoch": 0.07495788882650196,
"grad_norm": 0.44293299317359924,
"learning_rate": 7.478991596638656e-06,
"loss": 0.5469,
"step": 89
},
{
"epoch": 0.07580011229646266,
"grad_norm": 0.4823598861694336,
"learning_rate": 7.563025210084034e-06,
"loss": 0.5655,
"step": 90
},
{
"epoch": 0.07664233576642336,
"grad_norm": 0.49608561396598816,
"learning_rate": 7.647058823529411e-06,
"loss": 0.5285,
"step": 91
},
{
"epoch": 0.07748455923638406,
"grad_norm": 0.40790101885795593,
"learning_rate": 7.731092436974791e-06,
"loss": 0.5254,
"step": 92
},
{
"epoch": 0.07832678270634474,
"grad_norm": 0.5122684836387634,
"learning_rate": 7.815126050420168e-06,
"loss": 0.531,
"step": 93
},
{
"epoch": 0.07916900617630544,
"grad_norm": 0.5086395740509033,
"learning_rate": 7.899159663865547e-06,
"loss": 0.5588,
"step": 94
},
{
"epoch": 0.08001122964626614,
"grad_norm": 0.5407152771949768,
"learning_rate": 7.983193277310926e-06,
"loss": 0.5284,
"step": 95
},
{
"epoch": 0.08085345311622684,
"grad_norm": 0.45712265372276306,
"learning_rate": 8.067226890756303e-06,
"loss": 0.5228,
"step": 96
},
{
"epoch": 0.08169567658618754,
"grad_norm": 0.5024585127830505,
"learning_rate": 8.151260504201681e-06,
"loss": 0.5521,
"step": 97
},
{
"epoch": 0.08253790005614824,
"grad_norm": 0.4624791741371155,
"learning_rate": 8.23529411764706e-06,
"loss": 0.5154,
"step": 98
},
{
"epoch": 0.08338012352610892,
"grad_norm": 0.51868736743927,
"learning_rate": 8.319327731092438e-06,
"loss": 0.5123,
"step": 99
},
{
"epoch": 0.08422234699606962,
"grad_norm": 0.4298367202281952,
"learning_rate": 8.403361344537815e-06,
"loss": 0.5507,
"step": 100
},
{
"epoch": 0.08506457046603032,
"grad_norm": 0.5061134099960327,
"learning_rate": 8.487394957983194e-06,
"loss": 0.5317,
"step": 101
},
{
"epoch": 0.08590679393599102,
"grad_norm": 0.45311102271080017,
"learning_rate": 8.571428571428571e-06,
"loss": 0.558,
"step": 102
},
{
"epoch": 0.08674901740595171,
"grad_norm": 0.5305558443069458,
"learning_rate": 8.655462184873951e-06,
"loss": 0.5218,
"step": 103
},
{
"epoch": 0.08759124087591241,
"grad_norm": 0.5467281341552734,
"learning_rate": 8.739495798319328e-06,
"loss": 0.5526,
"step": 104
},
{
"epoch": 0.08843346434587311,
"grad_norm": 0.515506386756897,
"learning_rate": 8.823529411764707e-06,
"loss": 0.5335,
"step": 105
},
{
"epoch": 0.0892756878158338,
"grad_norm": 0.5897248387336731,
"learning_rate": 8.907563025210085e-06,
"loss": 0.5093,
"step": 106
},
{
"epoch": 0.0901179112857945,
"grad_norm": 0.5122184157371521,
"learning_rate": 8.991596638655462e-06,
"loss": 0.5161,
"step": 107
},
{
"epoch": 0.09096013475575519,
"grad_norm": 0.49633896350860596,
"learning_rate": 9.075630252100841e-06,
"loss": 0.5306,
"step": 108
},
{
"epoch": 0.09180235822571589,
"grad_norm": 0.479739785194397,
"learning_rate": 9.15966386554622e-06,
"loss": 0.5316,
"step": 109
},
{
"epoch": 0.09264458169567659,
"grad_norm": 0.5709593296051025,
"learning_rate": 9.243697478991598e-06,
"loss": 0.5276,
"step": 110
},
{
"epoch": 0.09348680516563729,
"grad_norm": 0.521088719367981,
"learning_rate": 9.327731092436975e-06,
"loss": 0.5465,
"step": 111
},
{
"epoch": 0.09432902863559797,
"grad_norm": 0.5315929651260376,
"learning_rate": 9.411764705882354e-06,
"loss": 0.5103,
"step": 112
},
{
"epoch": 0.09517125210555867,
"grad_norm": 0.4991461932659149,
"learning_rate": 9.49579831932773e-06,
"loss": 0.5311,
"step": 113
},
{
"epoch": 0.09601347557551937,
"grad_norm": 0.5171940326690674,
"learning_rate": 9.579831932773111e-06,
"loss": 0.5332,
"step": 114
},
{
"epoch": 0.09685569904548007,
"grad_norm": 0.5097129344940186,
"learning_rate": 9.663865546218488e-06,
"loss": 0.514,
"step": 115
},
{
"epoch": 0.09769792251544077,
"grad_norm": 0.5047439336776733,
"learning_rate": 9.747899159663867e-06,
"loss": 0.5127,
"step": 116
},
{
"epoch": 0.09854014598540146,
"grad_norm": 0.45830124616622925,
"learning_rate": 9.831932773109244e-06,
"loss": 0.5268,
"step": 117
},
{
"epoch": 0.09938236945536216,
"grad_norm": 0.481951504945755,
"learning_rate": 9.915966386554622e-06,
"loss": 0.5223,
"step": 118
},
{
"epoch": 0.10022459292532285,
"grad_norm": 0.530089795589447,
"learning_rate": 1e-05,
"loss": 0.5265,
"step": 119
},
{
"epoch": 0.10106681639528355,
"grad_norm": 0.5048061013221741,
"learning_rate": 9.999978367986988e-06,
"loss": 0.5125,
"step": 120
},
{
"epoch": 0.10190903986524424,
"grad_norm": 0.5036464333534241,
"learning_rate": 9.999913472135126e-06,
"loss": 0.5475,
"step": 121
},
{
"epoch": 0.10275126333520494,
"grad_norm": 0.4843362867832184,
"learning_rate": 9.999805313005946e-06,
"loss": 0.5124,
"step": 122
},
{
"epoch": 0.10359348680516564,
"grad_norm": 0.4779297411441803,
"learning_rate": 9.99965389153533e-06,
"loss": 0.5044,
"step": 123
},
{
"epoch": 0.10443571027512634,
"grad_norm": 0.4498569071292877,
"learning_rate": 9.999459209033495e-06,
"loss": 0.488,
"step": 124
},
{
"epoch": 0.10527793374508702,
"grad_norm": 0.4998040795326233,
"learning_rate": 9.999221267184993e-06,
"loss": 0.506,
"step": 125
},
{
"epoch": 0.10612015721504772,
"grad_norm": 0.5822563171386719,
"learning_rate": 9.998940068048688e-06,
"loss": 0.5578,
"step": 126
},
{
"epoch": 0.10696238068500842,
"grad_norm": 0.5187222957611084,
"learning_rate": 9.998615614057743e-06,
"loss": 0.519,
"step": 127
},
{
"epoch": 0.10780460415496912,
"grad_norm": 0.44510042667388916,
"learning_rate": 9.998247908019594e-06,
"loss": 0.4965,
"step": 128
},
{
"epoch": 0.10864682762492982,
"grad_norm": 0.5618104934692383,
"learning_rate": 9.997836953115927e-06,
"loss": 0.5108,
"step": 129
},
{
"epoch": 0.10948905109489052,
"grad_norm": 0.4618164896965027,
"learning_rate": 9.997382752902658e-06,
"loss": 0.5406,
"step": 130
},
{
"epoch": 0.1103312745648512,
"grad_norm": 0.4968518018722534,
"learning_rate": 9.996885311309892e-06,
"loss": 0.5188,
"step": 131
},
{
"epoch": 0.1111734980348119,
"grad_norm": 0.49284565448760986,
"learning_rate": 9.996344632641895e-06,
"loss": 0.4996,
"step": 132
},
{
"epoch": 0.1120157215047726,
"grad_norm": 0.5027459263801575,
"learning_rate": 9.995760721577053e-06,
"loss": 0.5256,
"step": 133
},
{
"epoch": 0.1128579449747333,
"grad_norm": 0.5380898714065552,
"learning_rate": 9.995133583167833e-06,
"loss": 0.5252,
"step": 134
},
{
"epoch": 0.113700168444694,
"grad_norm": 0.4836495816707611,
"learning_rate": 9.994463222840748e-06,
"loss": 0.4899,
"step": 135
},
{
"epoch": 0.11454239191465469,
"grad_norm": 0.5266380906105042,
"learning_rate": 9.993749646396286e-06,
"loss": 0.5333,
"step": 136
},
{
"epoch": 0.11538461538461539,
"grad_norm": 0.45768430829048157,
"learning_rate": 9.992992860008893e-06,
"loss": 0.4913,
"step": 137
},
{
"epoch": 0.11622683885457608,
"grad_norm": 0.5433592200279236,
"learning_rate": 9.99219287022689e-06,
"loss": 0.5155,
"step": 138
},
{
"epoch": 0.11706906232453677,
"grad_norm": 0.4485207796096802,
"learning_rate": 9.991349683972435e-06,
"loss": 0.4762,
"step": 139
},
{
"epoch": 0.11791128579449747,
"grad_norm": 0.5021021366119385,
"learning_rate": 9.990463308541452e-06,
"loss": 0.5166,
"step": 140
},
{
"epoch": 0.11875350926445817,
"grad_norm": 0.5295252799987793,
"learning_rate": 9.989533751603578e-06,
"loss": 0.5191,
"step": 141
},
{
"epoch": 0.11959573273441887,
"grad_norm": 0.5277601480484009,
"learning_rate": 9.988561021202083e-06,
"loss": 0.5067,
"step": 142
},
{
"epoch": 0.12043795620437957,
"grad_norm": 0.48189201951026917,
"learning_rate": 9.987545125753818e-06,
"loss": 0.4935,
"step": 143
},
{
"epoch": 0.12128017967434025,
"grad_norm": 0.578239917755127,
"learning_rate": 9.986486074049131e-06,
"loss": 0.5292,
"step": 144
},
{
"epoch": 0.12212240314430095,
"grad_norm": 0.5277495384216309,
"learning_rate": 9.985383875251783e-06,
"loss": 0.5124,
"step": 145
},
{
"epoch": 0.12296462661426165,
"grad_norm": 0.5657446384429932,
"learning_rate": 9.98423853889889e-06,
"loss": 0.5164,
"step": 146
},
{
"epoch": 0.12380685008422235,
"grad_norm": 0.4794774353504181,
"learning_rate": 9.983050074900824e-06,
"loss": 0.5111,
"step": 147
},
{
"epoch": 0.12464907355418305,
"grad_norm": 0.5772200226783752,
"learning_rate": 9.98181849354113e-06,
"loss": 0.5066,
"step": 148
},
{
"epoch": 0.12549129702414374,
"grad_norm": 0.5424597859382629,
"learning_rate": 9.980543805476447e-06,
"loss": 0.5137,
"step": 149
},
{
"epoch": 0.12633352049410443,
"grad_norm": 0.6257909536361694,
"learning_rate": 9.979226021736396e-06,
"loss": 0.5385,
"step": 150
},
{
"epoch": 0.12717574396406514,
"grad_norm": 0.5045788288116455,
"learning_rate": 9.977865153723508e-06,
"loss": 0.4789,
"step": 151
},
{
"epoch": 0.12801796743402583,
"grad_norm": 0.5379694700241089,
"learning_rate": 9.976461213213104e-06,
"loss": 0.5066,
"step": 152
},
{
"epoch": 0.12886019090398654,
"grad_norm": 0.6253272891044617,
"learning_rate": 9.975014212353212e-06,
"loss": 0.5247,
"step": 153
},
{
"epoch": 0.12970241437394722,
"grad_norm": 0.4655895233154297,
"learning_rate": 9.973524163664447e-06,
"loss": 0.5167,
"step": 154
},
{
"epoch": 0.1305446378439079,
"grad_norm": 0.5709823369979858,
"learning_rate": 9.971991080039912e-06,
"loss": 0.5379,
"step": 155
},
{
"epoch": 0.13138686131386862,
"grad_norm": 0.5366809964179993,
"learning_rate": 9.970414974745077e-06,
"loss": 0.5017,
"step": 156
},
{
"epoch": 0.1322290847838293,
"grad_norm": 0.5675758123397827,
"learning_rate": 9.968795861417676e-06,
"loss": 0.5339,
"step": 157
},
{
"epoch": 0.13307130825379002,
"grad_norm": 0.5337751507759094,
"learning_rate": 9.967133754067581e-06,
"loss": 0.5008,
"step": 158
},
{
"epoch": 0.1339135317237507,
"grad_norm": 0.6179817318916321,
"learning_rate": 9.965428667076687e-06,
"loss": 0.5308,
"step": 159
},
{
"epoch": 0.13475575519371139,
"grad_norm": 0.5175896286964417,
"learning_rate": 9.963680615198774e-06,
"loss": 0.51,
"step": 160
},
{
"epoch": 0.1355979786636721,
"grad_norm": 0.5223482847213745,
"learning_rate": 9.961889613559396e-06,
"loss": 0.5324,
"step": 161
},
{
"epoch": 0.13644020213363278,
"grad_norm": 0.6074829697608948,
"learning_rate": 9.960055677655743e-06,
"loss": 0.4971,
"step": 162
},
{
"epoch": 0.1372824256035935,
"grad_norm": 0.4789965748786926,
"learning_rate": 9.958178823356503e-06,
"loss": 0.4979,
"step": 163
},
{
"epoch": 0.13812464907355418,
"grad_norm": 0.521506130695343,
"learning_rate": 9.956259066901733e-06,
"loss": 0.5463,
"step": 164
},
{
"epoch": 0.1389668725435149,
"grad_norm": 0.5657532215118408,
"learning_rate": 9.954296424902709e-06,
"loss": 0.4963,
"step": 165
},
{
"epoch": 0.13980909601347558,
"grad_norm": 0.513095498085022,
"learning_rate": 9.95229091434179e-06,
"loss": 0.538,
"step": 166
},
{
"epoch": 0.14065131948343626,
"grad_norm": 0.5661138892173767,
"learning_rate": 9.950242552572272e-06,
"loss": 0.508,
"step": 167
},
{
"epoch": 0.14149354295339697,
"grad_norm": 0.5491284132003784,
"learning_rate": 9.948151357318228e-06,
"loss": 0.5308,
"step": 168
},
{
"epoch": 0.14233576642335766,
"grad_norm": 0.5399675369262695,
"learning_rate": 9.946017346674362e-06,
"loss": 0.5111,
"step": 169
},
{
"epoch": 0.14317798989331837,
"grad_norm": 0.5326489210128784,
"learning_rate": 9.943840539105853e-06,
"loss": 0.5101,
"step": 170
},
{
"epoch": 0.14402021336327905,
"grad_norm": 0.47655802965164185,
"learning_rate": 9.941620953448195e-06,
"loss": 0.4939,
"step": 171
},
{
"epoch": 0.14486243683323977,
"grad_norm": 0.5954977869987488,
"learning_rate": 9.939358608907026e-06,
"loss": 0.5266,
"step": 172
},
{
"epoch": 0.14570466030320045,
"grad_norm": 0.48619142174720764,
"learning_rate": 9.937053525057977e-06,
"loss": 0.5084,
"step": 173
},
{
"epoch": 0.14654688377316114,
"grad_norm": 0.5034385323524475,
"learning_rate": 9.934705721846487e-06,
"loss": 0.4797,
"step": 174
},
{
"epoch": 0.14738910724312185,
"grad_norm": 0.4761101305484772,
"learning_rate": 9.932315219587641e-06,
"loss": 0.5146,
"step": 175
},
{
"epoch": 0.14823133071308253,
"grad_norm": 0.5504338145256042,
"learning_rate": 9.92988203896599e-06,
"loss": 0.4966,
"step": 176
},
{
"epoch": 0.14907355418304324,
"grad_norm": 0.5048502087593079,
"learning_rate": 9.927406201035368e-06,
"loss": 0.5066,
"step": 177
},
{
"epoch": 0.14991577765300393,
"grad_norm": 0.5819829702377319,
"learning_rate": 9.924887727218724e-06,
"loss": 0.5488,
"step": 178
},
{
"epoch": 0.1507580011229646,
"grad_norm": 0.524554967880249,
"learning_rate": 9.922326639307918e-06,
"loss": 0.4962,
"step": 179
},
{
"epoch": 0.15160022459292533,
"grad_norm": 0.4543086886405945,
"learning_rate": 9.919722959463545e-06,
"loss": 0.5099,
"step": 180
},
{
"epoch": 0.152442448062886,
"grad_norm": 0.5056329965591431,
"learning_rate": 9.917076710214739e-06,
"loss": 0.4723,
"step": 181
},
{
"epoch": 0.15328467153284672,
"grad_norm": 0.5620024800300598,
"learning_rate": 9.914387914458983e-06,
"loss": 0.5111,
"step": 182
},
{
"epoch": 0.1541268950028074,
"grad_norm": 0.475263386964798,
"learning_rate": 9.911656595461899e-06,
"loss": 0.5034,
"step": 183
},
{
"epoch": 0.15496911847276812,
"grad_norm": 0.5066370964050293,
"learning_rate": 9.908882776857057e-06,
"loss": 0.5212,
"step": 184
},
{
"epoch": 0.1558113419427288,
"grad_norm": 0.594321608543396,
"learning_rate": 9.906066482645774e-06,
"loss": 0.5361,
"step": 185
},
{
"epoch": 0.1566535654126895,
"grad_norm": 0.5244519114494324,
"learning_rate": 9.903207737196892e-06,
"loss": 0.4945,
"step": 186
},
{
"epoch": 0.1574957888826502,
"grad_norm": 0.5021108984947205,
"learning_rate": 9.900306565246579e-06,
"loss": 0.4724,
"step": 187
},
{
"epoch": 0.15833801235261089,
"grad_norm": 0.4762863516807556,
"learning_rate": 9.89736299189811e-06,
"loss": 0.4969,
"step": 188
},
{
"epoch": 0.1591802358225716,
"grad_norm": 0.605094313621521,
"learning_rate": 9.894377042621654e-06,
"loss": 0.5432,
"step": 189
},
{
"epoch": 0.16002245929253228,
"grad_norm": 0.5847768187522888,
"learning_rate": 9.891348743254046e-06,
"loss": 0.4992,
"step": 190
},
{
"epoch": 0.160864682762493,
"grad_norm": 0.5650973320007324,
"learning_rate": 9.888278119998573e-06,
"loss": 0.4782,
"step": 191
},
{
"epoch": 0.16170690623245368,
"grad_norm": 0.5127384066581726,
"learning_rate": 9.885165199424738e-06,
"loss": 0.4917,
"step": 192
},
{
"epoch": 0.16254912970241436,
"grad_norm": 0.4501786231994629,
"learning_rate": 9.882010008468038e-06,
"loss": 0.4922,
"step": 193
},
{
"epoch": 0.16339135317237508,
"grad_norm": 0.5227730870246887,
"learning_rate": 9.878812574429722e-06,
"loss": 0.4937,
"step": 194
},
{
"epoch": 0.16423357664233576,
"grad_norm": 0.4444067180156708,
"learning_rate": 9.875572924976568e-06,
"loss": 0.4955,
"step": 195
},
{
"epoch": 0.16507580011229647,
"grad_norm": 0.5843701362609863,
"learning_rate": 9.87229108814063e-06,
"loss": 0.4759,
"step": 196
},
{
"epoch": 0.16591802358225716,
"grad_norm": 0.5321926474571228,
"learning_rate": 9.868967092319003e-06,
"loss": 0.5134,
"step": 197
},
{
"epoch": 0.16676024705221784,
"grad_norm": 0.5984789133071899,
"learning_rate": 9.865600966273576e-06,
"loss": 0.516,
"step": 198
},
{
"epoch": 0.16760247052217855,
"grad_norm": 0.4739929437637329,
"learning_rate": 9.86219273913078e-06,
"loss": 0.4972,
"step": 199
},
{
"epoch": 0.16844469399213924,
"grad_norm": 0.6019454002380371,
"learning_rate": 9.858742440381343e-06,
"loss": 0.5177,
"step": 200
},
{
"epoch": 0.16928691746209995,
"grad_norm": 0.5428659915924072,
"learning_rate": 9.855250099880026e-06,
"loss": 0.5024,
"step": 201
},
{
"epoch": 0.17012914093206064,
"grad_norm": 0.5898825526237488,
"learning_rate": 9.851715747845372e-06,
"loss": 0.4778,
"step": 202
},
{
"epoch": 0.17097136440202135,
"grad_norm": 0.46226394176483154,
"learning_rate": 9.848139414859441e-06,
"loss": 0.4869,
"step": 203
},
{
"epoch": 0.17181358787198203,
"grad_norm": 0.5695976614952087,
"learning_rate": 9.844521131867546e-06,
"loss": 0.5295,
"step": 204
},
{
"epoch": 0.17265581134194272,
"grad_norm": 0.5284704566001892,
"learning_rate": 9.840860930177984e-06,
"loss": 0.4868,
"step": 205
},
{
"epoch": 0.17349803481190343,
"grad_norm": 0.49998217821121216,
"learning_rate": 9.837158841461767e-06,
"loss": 0.4867,
"step": 206
},
{
"epoch": 0.1743402582818641,
"grad_norm": 0.5210819840431213,
"learning_rate": 9.833414897752346e-06,
"loss": 0.4872,
"step": 207
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.5861724019050598,
"learning_rate": 9.829629131445342e-06,
"loss": 0.5168,
"step": 208
},
{
"epoch": 0.1760247052217855,
"grad_norm": 0.5185627937316895,
"learning_rate": 9.825801575298248e-06,
"loss": 0.4957,
"step": 209
},
{
"epoch": 0.17686692869174622,
"grad_norm": 0.5859359502792358,
"learning_rate": 9.821932262430164e-06,
"loss": 0.5051,
"step": 210
},
{
"epoch": 0.1777091521617069,
"grad_norm": 0.5268411040306091,
"learning_rate": 9.818021226321502e-06,
"loss": 0.5064,
"step": 211
},
{
"epoch": 0.1785513756316676,
"grad_norm": 0.48425450921058655,
"learning_rate": 9.814068500813692e-06,
"loss": 0.4873,
"step": 212
},
{
"epoch": 0.1793935991016283,
"grad_norm": 0.5642316937446594,
"learning_rate": 9.8100741201089e-06,
"loss": 0.5047,
"step": 213
},
{
"epoch": 0.180235822571589,
"grad_norm": 0.5157051682472229,
"learning_rate": 9.806038118769724e-06,
"loss": 0.5235,
"step": 214
},
{
"epoch": 0.1810780460415497,
"grad_norm": 0.5594722032546997,
"learning_rate": 9.801960531718898e-06,
"loss": 0.5114,
"step": 215
},
{
"epoch": 0.18192026951151039,
"grad_norm": 0.5461316704750061,
"learning_rate": 9.797841394238987e-06,
"loss": 0.5059,
"step": 216
},
{
"epoch": 0.1827624929814711,
"grad_norm": 0.4522736072540283,
"learning_rate": 9.793680741972084e-06,
"loss": 0.4978,
"step": 217
},
{
"epoch": 0.18360471645143178,
"grad_norm": 0.5327430963516235,
"learning_rate": 9.789478610919508e-06,
"loss": 0.4797,
"step": 218
},
{
"epoch": 0.18444693992139247,
"grad_norm": 0.4836098551750183,
"learning_rate": 9.785235037441473e-06,
"loss": 0.5018,
"step": 219
},
{
"epoch": 0.18528916339135318,
"grad_norm": 0.4598475396633148,
"learning_rate": 9.780950058256802e-06,
"loss": 0.4982,
"step": 220
},
{
"epoch": 0.18613138686131386,
"grad_norm": 0.5103244781494141,
"learning_rate": 9.77662371044258e-06,
"loss": 0.4927,
"step": 221
},
{
"epoch": 0.18697361033127458,
"grad_norm": 0.47890129685401917,
"learning_rate": 9.77225603143385e-06,
"loss": 0.5054,
"step": 222
},
{
"epoch": 0.18781583380123526,
"grad_norm": 0.5484627485275269,
"learning_rate": 9.767847059023292e-06,
"loss": 0.495,
"step": 223
},
{
"epoch": 0.18865805727119594,
"grad_norm": 0.5115911364555359,
"learning_rate": 9.763396831360884e-06,
"loss": 0.4868,
"step": 224
},
{
"epoch": 0.18950028074115666,
"grad_norm": 0.6416915655136108,
"learning_rate": 9.75890538695358e-06,
"loss": 0.4732,
"step": 225
},
{
"epoch": 0.19034250421111734,
"grad_norm": 0.5456217527389526,
"learning_rate": 9.75437276466497e-06,
"loss": 0.5113,
"step": 226
},
{
"epoch": 0.19118472768107805,
"grad_norm": 0.6080687046051025,
"learning_rate": 9.749799003714954e-06,
"loss": 0.5086,
"step": 227
},
{
"epoch": 0.19202695115103874,
"grad_norm": 0.5577587485313416,
"learning_rate": 9.745184143679398e-06,
"loss": 0.4832,
"step": 228
},
{
"epoch": 0.19286917462099945,
"grad_norm": 0.588985025882721,
"learning_rate": 9.74052822448978e-06,
"loss": 0.4893,
"step": 229
},
{
"epoch": 0.19371139809096014,
"grad_norm": 0.6427119970321655,
"learning_rate": 9.735831286432869e-06,
"loss": 0.5022,
"step": 230
},
{
"epoch": 0.19455362156092082,
"grad_norm": 0.6053304672241211,
"learning_rate": 9.731093370150349e-06,
"loss": 0.4925,
"step": 231
},
{
"epoch": 0.19539584503088153,
"grad_norm": 0.6324568390846252,
"learning_rate": 9.72631451663849e-06,
"loss": 0.4693,
"step": 232
},
{
"epoch": 0.19623806850084222,
"grad_norm": 0.6193096041679382,
"learning_rate": 9.721494767247779e-06,
"loss": 0.5334,
"step": 233
},
{
"epoch": 0.19708029197080293,
"grad_norm": 0.6784276366233826,
"learning_rate": 9.71663416368257e-06,
"loss": 0.5018,
"step": 234
},
{
"epoch": 0.1979225154407636,
"grad_norm": 0.5089966058731079,
"learning_rate": 9.71173274800072e-06,
"loss": 0.4836,
"step": 235
},
{
"epoch": 0.19876473891072433,
"grad_norm": 0.6575846672058105,
"learning_rate": 9.70679056261322e-06,
"loss": 0.4707,
"step": 236
},
{
"epoch": 0.199606962380685,
"grad_norm": 0.6496636867523193,
"learning_rate": 9.70180765028384e-06,
"loss": 0.5029,
"step": 237
},
{
"epoch": 0.2004491858506457,
"grad_norm": 0.5280755758285522,
"learning_rate": 9.696784054128749e-06,
"loss": 0.4846,
"step": 238
},
{
"epoch": 0.2012914093206064,
"grad_norm": 0.5946359038352966,
"learning_rate": 9.691719817616148e-06,
"loss": 0.4812,
"step": 239
},
{
"epoch": 0.2021336327905671,
"grad_norm": 0.5537473559379578,
"learning_rate": 9.686614984565888e-06,
"loss": 0.4942,
"step": 240
},
{
"epoch": 0.2029758562605278,
"grad_norm": 0.4930758476257324,
"learning_rate": 9.681469599149093e-06,
"loss": 0.4953,
"step": 241
},
{
"epoch": 0.2038180797304885,
"grad_norm": 0.6003844738006592,
"learning_rate": 9.676283705887783e-06,
"loss": 0.4892,
"step": 242
},
{
"epoch": 0.20466030320044917,
"grad_norm": 0.5657713413238525,
"learning_rate": 9.671057349654481e-06,
"loss": 0.5321,
"step": 243
},
{
"epoch": 0.20550252667040989,
"grad_norm": 0.49356991052627563,
"learning_rate": 9.66579057567183e-06,
"loss": 0.4886,
"step": 244
},
{
"epoch": 0.20634475014037057,
"grad_norm": 0.5777039527893066,
"learning_rate": 9.660483429512198e-06,
"loss": 0.4814,
"step": 245
},
{
"epoch": 0.20718697361033128,
"grad_norm": 0.5081586837768555,
"learning_rate": 9.65513595709729e-06,
"loss": 0.501,
"step": 246
},
{
"epoch": 0.20802919708029197,
"grad_norm": 0.5518150329589844,
"learning_rate": 9.649748204697741e-06,
"loss": 0.5143,
"step": 247
},
{
"epoch": 0.20887142055025268,
"grad_norm": 0.4836235046386719,
"learning_rate": 9.644320218932723e-06,
"loss": 0.4852,
"step": 248
},
{
"epoch": 0.20971364402021336,
"grad_norm": 0.5687448978424072,
"learning_rate": 9.63885204676954e-06,
"loss": 0.4866,
"step": 249
},
{
"epoch": 0.21055586749017405,
"grad_norm": 0.5448442697525024,
"learning_rate": 9.63334373552322e-06,
"loss": 0.4646,
"step": 250
},
{
"epoch": 0.21139809096013476,
"grad_norm": 0.5512805581092834,
"learning_rate": 9.627795332856107e-06,
"loss": 0.4892,
"step": 251
},
{
"epoch": 0.21224031443009544,
"grad_norm": 0.64194256067276,
"learning_rate": 9.622206886777448e-06,
"loss": 0.4869,
"step": 252
},
{
"epoch": 0.21308253790005616,
"grad_norm": 0.5275607109069824,
"learning_rate": 9.616578445642982e-06,
"loss": 0.4919,
"step": 253
},
{
"epoch": 0.21392476137001684,
"grad_norm": 0.5536665320396423,
"learning_rate": 9.61091005815451e-06,
"loss": 0.519,
"step": 254
},
{
"epoch": 0.21476698483997755,
"grad_norm": 0.5774405002593994,
"learning_rate": 9.605201773359485e-06,
"loss": 0.4935,
"step": 255
},
{
"epoch": 0.21560920830993824,
"grad_norm": 0.5153358578681946,
"learning_rate": 9.599453640650585e-06,
"loss": 0.5004,
"step": 256
},
{
"epoch": 0.21645143177989892,
"grad_norm": 0.5064471960067749,
"learning_rate": 9.59366570976528e-06,
"loss": 0.494,
"step": 257
},
{
"epoch": 0.21729365524985964,
"grad_norm": 0.48846712708473206,
"learning_rate": 9.587838030785413e-06,
"loss": 0.4753,
"step": 258
},
{
"epoch": 0.21813587871982032,
"grad_norm": 0.4824623465538025,
"learning_rate": 9.581970654136752e-06,
"loss": 0.4837,
"step": 259
},
{
"epoch": 0.21897810218978103,
"grad_norm": 0.5995156764984131,
"learning_rate": 9.576063630588563e-06,
"loss": 0.48,
"step": 260
},
{
"epoch": 0.21982032565974172,
"grad_norm": 0.4862680733203888,
"learning_rate": 9.570117011253173e-06,
"loss": 0.4709,
"step": 261
},
{
"epoch": 0.2206625491297024,
"grad_norm": 0.65143883228302,
"learning_rate": 9.56413084758552e-06,
"loss": 0.4809,
"step": 262
},
{
"epoch": 0.2215047725996631,
"grad_norm": 0.5107113122940063,
"learning_rate": 9.55810519138271e-06,
"loss": 0.5325,
"step": 263
},
{
"epoch": 0.2223469960696238,
"grad_norm": 0.5462081432342529,
"learning_rate": 9.552040094783575e-06,
"loss": 0.498,
"step": 264
},
{
"epoch": 0.2231892195395845,
"grad_norm": 0.5198892951011658,
"learning_rate": 9.545935610268213e-06,
"loss": 0.4971,
"step": 265
},
{
"epoch": 0.2240314430095452,
"grad_norm": 0.5494446754455566,
"learning_rate": 9.53979179065754e-06,
"loss": 0.4989,
"step": 266
},
{
"epoch": 0.2248736664795059,
"grad_norm": 0.4715658724308014,
"learning_rate": 9.533608689112827e-06,
"loss": 0.5003,
"step": 267
},
{
"epoch": 0.2257158899494666,
"grad_norm": 0.5305702686309814,
"learning_rate": 9.527386359135254e-06,
"loss": 0.477,
"step": 268
},
{
"epoch": 0.22655811341942728,
"grad_norm": 0.5072541236877441,
"learning_rate": 9.521124854565425e-06,
"loss": 0.4649,
"step": 269
},
{
"epoch": 0.227400336889388,
"grad_norm": 0.4624420702457428,
"learning_rate": 9.514824229582922e-06,
"loss": 0.4823,
"step": 270
},
{
"epoch": 0.22824256035934867,
"grad_norm": 0.5773640275001526,
"learning_rate": 9.508484538705823e-06,
"loss": 0.5053,
"step": 271
},
{
"epoch": 0.22908478382930939,
"grad_norm": 0.5840558409690857,
"learning_rate": 9.50210583679024e-06,
"loss": 0.5006,
"step": 272
},
{
"epoch": 0.22992700729927007,
"grad_norm": 0.5893429517745972,
"learning_rate": 9.495688179029838e-06,
"loss": 0.5046,
"step": 273
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.46160465478897095,
"learning_rate": 9.48923162095536e-06,
"loss": 0.4527,
"step": 274
},
{
"epoch": 0.23161145423919147,
"grad_norm": 0.5931537747383118,
"learning_rate": 9.482736218434144e-06,
"loss": 0.4901,
"step": 275
},
{
"epoch": 0.23245367770915215,
"grad_norm": 0.5156863927841187,
"learning_rate": 9.476202027669644e-06,
"loss": 0.5004,
"step": 276
},
{
"epoch": 0.23329590117911286,
"grad_norm": 0.5780513286590576,
"learning_rate": 9.469629105200937e-06,
"loss": 0.4915,
"step": 277
},
{
"epoch": 0.23413812464907355,
"grad_norm": 0.6066457629203796,
"learning_rate": 9.463017507902245e-06,
"loss": 0.5244,
"step": 278
},
{
"epoch": 0.23498034811903426,
"grad_norm": 0.5100727081298828,
"learning_rate": 9.45636729298243e-06,
"loss": 0.4975,
"step": 279
},
{
"epoch": 0.23582257158899494,
"grad_norm": 0.5455880761146545,
"learning_rate": 9.449678517984503e-06,
"loss": 0.4704,
"step": 280
},
{
"epoch": 0.23666479505895563,
"grad_norm": 0.5221508741378784,
"learning_rate": 9.442951240785135e-06,
"loss": 0.5014,
"step": 281
},
{
"epoch": 0.23750701852891634,
"grad_norm": 0.5977054834365845,
"learning_rate": 9.436185519594145e-06,
"loss": 0.4822,
"step": 282
},
{
"epoch": 0.23834924199887703,
"grad_norm": 0.501565158367157,
"learning_rate": 9.429381412954e-06,
"loss": 0.4991,
"step": 283
},
{
"epoch": 0.23919146546883774,
"grad_norm": 0.6670308709144592,
"learning_rate": 9.422538979739307e-06,
"loss": 0.4719,
"step": 284
},
{
"epoch": 0.24003368893879842,
"grad_norm": 0.626640796661377,
"learning_rate": 9.415658279156312e-06,
"loss": 0.5209,
"step": 285
},
{
"epoch": 0.24087591240875914,
"grad_norm": 0.6013544201850891,
"learning_rate": 9.408739370742372e-06,
"loss": 0.4626,
"step": 286
},
{
"epoch": 0.24171813587871982,
"grad_norm": 0.572346568107605,
"learning_rate": 9.401782314365458e-06,
"loss": 0.4987,
"step": 287
},
{
"epoch": 0.2425603593486805,
"grad_norm": 0.5629340410232544,
"learning_rate": 9.39478717022362e-06,
"loss": 0.4988,
"step": 288
},
{
"epoch": 0.24340258281864122,
"grad_norm": 0.5211260914802551,
"learning_rate": 9.387753998844482e-06,
"loss": 0.4614,
"step": 289
},
{
"epoch": 0.2442448062886019,
"grad_norm": 0.5111398100852966,
"learning_rate": 9.380682861084703e-06,
"loss": 0.4653,
"step": 290
},
{
"epoch": 0.2450870297585626,
"grad_norm": 0.4929146468639374,
"learning_rate": 9.37357381812946e-06,
"loss": 0.505,
"step": 291
},
{
"epoch": 0.2459292532285233,
"grad_norm": 0.3973555862903595,
"learning_rate": 9.366426931491917e-06,
"loss": 0.4561,
"step": 292
},
{
"epoch": 0.246771476698484,
"grad_norm": 0.5543331503868103,
"learning_rate": 9.359242263012693e-06,
"loss": 0.4888,
"step": 293
},
{
"epoch": 0.2476137001684447,
"grad_norm": 0.46625789999961853,
"learning_rate": 9.352019874859326e-06,
"loss": 0.4976,
"step": 294
},
{
"epoch": 0.24845592363840538,
"grad_norm": 0.4668426811695099,
"learning_rate": 9.344759829525734e-06,
"loss": 0.4638,
"step": 295
},
{
"epoch": 0.2492981471083661,
"grad_norm": 0.501638650894165,
"learning_rate": 9.33746218983167e-06,
"loss": 0.4863,
"step": 296
},
{
"epoch": 0.2501403705783268,
"grad_norm": 0.445046067237854,
"learning_rate": 9.330127018922195e-06,
"loss": 0.4802,
"step": 297
},
{
"epoch": 0.2509825940482875,
"grad_norm": 0.4827885627746582,
"learning_rate": 9.32275438026711e-06,
"loss": 0.4833,
"step": 298
},
{
"epoch": 0.2518248175182482,
"grad_norm": 0.46409258246421814,
"learning_rate": 9.315344337660422e-06,
"loss": 0.4949,
"step": 299
},
{
"epoch": 0.25266704098820886,
"grad_norm": 0.46458157896995544,
"learning_rate": 9.307896955219787e-06,
"loss": 0.4828,
"step": 300
},
{
"epoch": 0.25350926445816957,
"grad_norm": 0.452510267496109,
"learning_rate": 9.300412297385954e-06,
"loss": 0.5037,
"step": 301
},
{
"epoch": 0.2543514879281303,
"grad_norm": 0.4101113975048065,
"learning_rate": 9.29289042892221e-06,
"loss": 0.4539,
"step": 302
},
{
"epoch": 0.25519371139809094,
"grad_norm": 0.5279428362846375,
"learning_rate": 9.285331414913816e-06,
"loss": 0.5,
"step": 303
},
{
"epoch": 0.25603593486805165,
"grad_norm": 0.42487549781799316,
"learning_rate": 9.277735320767449e-06,
"loss": 0.492,
"step": 304
},
{
"epoch": 0.25687815833801236,
"grad_norm": 0.47415146231651306,
"learning_rate": 9.270102212210632e-06,
"loss": 0.5013,
"step": 305
},
{
"epoch": 0.2577203818079731,
"grad_norm": 0.4520109295845032,
"learning_rate": 9.262432155291167e-06,
"loss": 0.4561,
"step": 306
},
{
"epoch": 0.25856260527793373,
"grad_norm": 0.45563986897468567,
"learning_rate": 9.254725216376562e-06,
"loss": 0.4953,
"step": 307
},
{
"epoch": 0.25940482874789444,
"grad_norm": 0.4406437277793884,
"learning_rate": 9.246981462153456e-06,
"loss": 0.4741,
"step": 308
},
{
"epoch": 0.26024705221785516,
"grad_norm": 0.46866127848625183,
"learning_rate": 9.239200959627048e-06,
"loss": 0.4782,
"step": 309
},
{
"epoch": 0.2610892756878158,
"grad_norm": 0.526932954788208,
"learning_rate": 9.231383776120512e-06,
"loss": 0.4758,
"step": 310
},
{
"epoch": 0.2619314991577765,
"grad_norm": 0.4274905323982239,
"learning_rate": 9.223529979274411e-06,
"loss": 0.4831,
"step": 311
},
{
"epoch": 0.26277372262773724,
"grad_norm": 0.5935413241386414,
"learning_rate": 9.215639637046121e-06,
"loss": 0.5021,
"step": 312
},
{
"epoch": 0.2636159460976979,
"grad_norm": 0.48600393533706665,
"learning_rate": 9.207712817709237e-06,
"loss": 0.497,
"step": 313
},
{
"epoch": 0.2644581695676586,
"grad_norm": 0.4954933524131775,
"learning_rate": 9.19974958985298e-06,
"loss": 0.5099,
"step": 314
},
{
"epoch": 0.2653003930376193,
"grad_norm": 0.6196016073226929,
"learning_rate": 9.191750022381613e-06,
"loss": 0.4668,
"step": 315
},
{
"epoch": 0.26614261650758003,
"grad_norm": 0.5300858020782471,
"learning_rate": 9.183714184513832e-06,
"loss": 0.5054,
"step": 316
},
{
"epoch": 0.2669848399775407,
"grad_norm": 0.5432302951812744,
"learning_rate": 9.175642145782179e-06,
"loss": 0.4651,
"step": 317
},
{
"epoch": 0.2678270634475014,
"grad_norm": 0.5008415579795837,
"learning_rate": 9.16753397603243e-06,
"loss": 0.4882,
"step": 318
},
{
"epoch": 0.2686692869174621,
"grad_norm": 0.6660073399543762,
"learning_rate": 9.159389745423003e-06,
"loss": 0.5076,
"step": 319
},
{
"epoch": 0.26951151038742277,
"grad_norm": 0.4663107097148895,
"learning_rate": 9.151209524424333e-06,
"loss": 0.4694,
"step": 320
},
{
"epoch": 0.2703537338573835,
"grad_norm": 0.5836421251296997,
"learning_rate": 9.142993383818284e-06,
"loss": 0.489,
"step": 321
},
{
"epoch": 0.2711959573273442,
"grad_norm": 0.4592418372631073,
"learning_rate": 9.134741394697517e-06,
"loss": 0.486,
"step": 322
},
{
"epoch": 0.2720381807973049,
"grad_norm": 0.4770454466342926,
"learning_rate": 9.126453628464889e-06,
"loss": 0.4597,
"step": 323
},
{
"epoch": 0.27288040426726556,
"grad_norm": 0.5071132183074951,
"learning_rate": 9.118130156832823e-06,
"loss": 0.4714,
"step": 324
},
{
"epoch": 0.2737226277372263,
"grad_norm": 0.49201327562332153,
"learning_rate": 9.109771051822702e-06,
"loss": 0.4799,
"step": 325
},
{
"epoch": 0.274564851207187,
"grad_norm": 0.49194371700286865,
"learning_rate": 9.10137638576423e-06,
"loss": 0.4754,
"step": 326
},
{
"epoch": 0.27540707467714765,
"grad_norm": 0.46792376041412354,
"learning_rate": 9.09294623129482e-06,
"loss": 0.484,
"step": 327
},
{
"epoch": 0.27624929814710836,
"grad_norm": 0.458034873008728,
"learning_rate": 9.084480661358954e-06,
"loss": 0.4731,
"step": 328
},
{
"epoch": 0.27709152161706907,
"grad_norm": 0.4599028527736664,
"learning_rate": 9.07597974920756e-06,
"loss": 0.4656,
"step": 329
},
{
"epoch": 0.2779337450870298,
"grad_norm": 0.5501565337181091,
"learning_rate": 9.067443568397378e-06,
"loss": 0.4943,
"step": 330
},
{
"epoch": 0.27877596855699044,
"grad_norm": 0.45656952261924744,
"learning_rate": 9.058872192790314e-06,
"loss": 0.496,
"step": 331
},
{
"epoch": 0.27961819202695115,
"grad_norm": 0.5053873062133789,
"learning_rate": 9.05026569655281e-06,
"loss": 0.4582,
"step": 332
},
{
"epoch": 0.28046041549691186,
"grad_norm": 0.48897361755371094,
"learning_rate": 9.041624154155208e-06,
"loss": 0.5159,
"step": 333
},
{
"epoch": 0.2813026389668725,
"grad_norm": 0.4899885058403015,
"learning_rate": 9.032947640371086e-06,
"loss": 0.4979,
"step": 334
},
{
"epoch": 0.28214486243683323,
"grad_norm": 0.5201389789581299,
"learning_rate": 9.02423623027663e-06,
"loss": 0.4887,
"step": 335
},
{
"epoch": 0.28298708590679394,
"grad_norm": 0.6644883751869202,
"learning_rate": 9.01548999924997e-06,
"loss": 0.5219,
"step": 336
},
{
"epoch": 0.28382930937675466,
"grad_norm": 0.4967377781867981,
"learning_rate": 9.006709022970547e-06,
"loss": 0.4924,
"step": 337
},
{
"epoch": 0.2846715328467153,
"grad_norm": 0.5921658873558044,
"learning_rate": 8.997893377418432e-06,
"loss": 0.4556,
"step": 338
},
{
"epoch": 0.285513756316676,
"grad_norm": 0.5479557514190674,
"learning_rate": 8.98904313887369e-06,
"loss": 0.4698,
"step": 339
},
{
"epoch": 0.28635597978663674,
"grad_norm": 0.5691632032394409,
"learning_rate": 8.980158383915714e-06,
"loss": 0.5025,
"step": 340
},
{
"epoch": 0.2871982032565974,
"grad_norm": 0.5059176087379456,
"learning_rate": 8.971239189422555e-06,
"loss": 0.4808,
"step": 341
},
{
"epoch": 0.2880404267265581,
"grad_norm": 0.6245074272155762,
"learning_rate": 8.962285632570266e-06,
"loss": 0.4723,
"step": 342
},
{
"epoch": 0.2888826501965188,
"grad_norm": 0.6249215006828308,
"learning_rate": 8.953297790832231e-06,
"loss": 0.506,
"step": 343
},
{
"epoch": 0.28972487366647953,
"grad_norm": 0.4356972277164459,
"learning_rate": 8.944275741978495e-06,
"loss": 0.4961,
"step": 344
},
{
"epoch": 0.2905670971364402,
"grad_norm": 0.6184642910957336,
"learning_rate": 8.935219564075087e-06,
"loss": 0.4913,
"step": 345
},
{
"epoch": 0.2914093206064009,
"grad_norm": 0.48030075430870056,
"learning_rate": 8.92612933548335e-06,
"loss": 0.4517,
"step": 346
},
{
"epoch": 0.2922515440763616,
"grad_norm": 0.4469691216945648,
"learning_rate": 8.917005134859263e-06,
"loss": 0.4582,
"step": 347
},
{
"epoch": 0.29309376754632227,
"grad_norm": 0.4990270733833313,
"learning_rate": 8.907847041152757e-06,
"loss": 0.4647,
"step": 348
},
{
"epoch": 0.293935991016283,
"grad_norm": 0.46036815643310547,
"learning_rate": 8.89865513360703e-06,
"loss": 0.4978,
"step": 349
},
{
"epoch": 0.2947782144862437,
"grad_norm": 0.5019863843917847,
"learning_rate": 8.889429491757872e-06,
"loss": 0.4791,
"step": 350
},
{
"epoch": 0.2956204379562044,
"grad_norm": 0.5118913054466248,
"learning_rate": 8.88017019543296e-06,
"loss": 0.4541,
"step": 351
},
{
"epoch": 0.29646266142616506,
"grad_norm": 0.4879518151283264,
"learning_rate": 8.870877324751186e-06,
"loss": 0.4567,
"step": 352
},
{
"epoch": 0.2973048848961258,
"grad_norm": 0.5687685012817383,
"learning_rate": 8.861550960121946e-06,
"loss": 0.5065,
"step": 353
},
{
"epoch": 0.2981471083660865,
"grad_norm": 0.42508652806282043,
"learning_rate": 8.852191182244456e-06,
"loss": 0.4966,
"step": 354
},
{
"epoch": 0.29898933183604715,
"grad_norm": 0.49280911684036255,
"learning_rate": 8.842798072107055e-06,
"loss": 0.4654,
"step": 355
},
{
"epoch": 0.29983155530600786,
"grad_norm": 0.4535382091999054,
"learning_rate": 8.833371710986493e-06,
"loss": 0.4796,
"step": 356
},
{
"epoch": 0.30067377877596857,
"grad_norm": 0.4359462261199951,
"learning_rate": 8.823912180447237e-06,
"loss": 0.4937,
"step": 357
},
{
"epoch": 0.3015160022459292,
"grad_norm": 0.44173017144203186,
"learning_rate": 8.81441956234076e-06,
"loss": 0.4827,
"step": 358
},
{
"epoch": 0.30235822571588994,
"grad_norm": 0.5167005658149719,
"learning_rate": 8.804893938804839e-06,
"loss": 0.501,
"step": 359
},
{
"epoch": 0.30320044918585065,
"grad_norm": 0.46836942434310913,
"learning_rate": 8.795335392262841e-06,
"loss": 0.4475,
"step": 360
},
{
"epoch": 0.30404267265581136,
"grad_norm": 0.5072566866874695,
"learning_rate": 8.785744005423003e-06,
"loss": 0.4709,
"step": 361
},
{
"epoch": 0.304884896125772,
"grad_norm": 0.533694326877594,
"learning_rate": 8.77611986127773e-06,
"loss": 0.4942,
"step": 362
},
{
"epoch": 0.30572711959573273,
"grad_norm": 0.4281693398952484,
"learning_rate": 8.766463043102864e-06,
"loss": 0.477,
"step": 363
},
{
"epoch": 0.30656934306569344,
"grad_norm": 0.4821215867996216,
"learning_rate": 8.756773634456975e-06,
"loss": 0.4886,
"step": 364
},
{
"epoch": 0.3074115665356541,
"grad_norm": 0.4269597828388214,
"learning_rate": 8.747051719180626e-06,
"loss": 0.4878,
"step": 365
},
{
"epoch": 0.3082537900056148,
"grad_norm": 0.47855547070503235,
"learning_rate": 8.737297381395657e-06,
"loss": 0.4844,
"step": 366
},
{
"epoch": 0.3090960134755755,
"grad_norm": 0.46225982904434204,
"learning_rate": 8.727510705504453e-06,
"loss": 0.4724,
"step": 367
},
{
"epoch": 0.30993823694553624,
"grad_norm": 0.46247169375419617,
"learning_rate": 8.717691776189214e-06,
"loss": 0.4732,
"step": 368
},
{
"epoch": 0.3107804604154969,
"grad_norm": 0.5116891860961914,
"learning_rate": 8.707840678411223e-06,
"loss": 0.4744,
"step": 369
},
{
"epoch": 0.3116226838854576,
"grad_norm": 0.4474673867225647,
"learning_rate": 8.69795749741011e-06,
"loss": 0.4783,
"step": 370
},
{
"epoch": 0.3124649073554183,
"grad_norm": 0.46301230788230896,
"learning_rate": 8.688042318703111e-06,
"loss": 0.4609,
"step": 371
},
{
"epoch": 0.313307130825379,
"grad_norm": 0.5067939758300781,
"learning_rate": 8.678095228084343e-06,
"loss": 0.4971,
"step": 372
},
{
"epoch": 0.3141493542953397,
"grad_norm": 0.5659363865852356,
"learning_rate": 8.66811631162404e-06,
"loss": 0.4724,
"step": 373
},
{
"epoch": 0.3149915777653004,
"grad_norm": 0.42198798060417175,
"learning_rate": 8.65810565566782e-06,
"loss": 0.4802,
"step": 374
},
{
"epoch": 0.3158338012352611,
"grad_norm": 0.546751081943512,
"learning_rate": 8.648063346835943e-06,
"loss": 0.4355,
"step": 375
},
{
"epoch": 0.31667602470522177,
"grad_norm": 0.5956584215164185,
"learning_rate": 8.637989472022548e-06,
"loss": 0.4957,
"step": 376
},
{
"epoch": 0.3175182481751825,
"grad_norm": 0.47946029901504517,
"learning_rate": 8.627884118394913e-06,
"loss": 0.4776,
"step": 377
},
{
"epoch": 0.3183604716451432,
"grad_norm": 0.5487171411514282,
"learning_rate": 8.617747373392697e-06,
"loss": 0.4857,
"step": 378
},
{
"epoch": 0.31920269511510385,
"grad_norm": 0.6688556671142578,
"learning_rate": 8.607579324727175e-06,
"loss": 0.4831,
"step": 379
},
{
"epoch": 0.32004491858506456,
"grad_norm": 0.4539017081260681,
"learning_rate": 8.597380060380493e-06,
"loss": 0.4841,
"step": 380
},
{
"epoch": 0.3208871420550253,
"grad_norm": 0.600853681564331,
"learning_rate": 8.5871496686049e-06,
"loss": 0.5023,
"step": 381
},
{
"epoch": 0.321729365524986,
"grad_norm": 0.6517627835273743,
"learning_rate": 8.576888237921983e-06,
"loss": 0.4938,
"step": 382
},
{
"epoch": 0.32257158899494665,
"grad_norm": 0.42329713702201843,
"learning_rate": 8.566595857121902e-06,
"loss": 0.4849,
"step": 383
},
{
"epoch": 0.32341381246490736,
"grad_norm": 0.5802134275436401,
"learning_rate": 8.556272615262623e-06,
"loss": 0.5134,
"step": 384
},
{
"epoch": 0.32425603593486807,
"grad_norm": 0.5750747323036194,
"learning_rate": 8.545918601669147e-06,
"loss": 0.4516,
"step": 385
},
{
"epoch": 0.3250982594048287,
"grad_norm": 0.42843982577323914,
"learning_rate": 8.535533905932739e-06,
"loss": 0.486,
"step": 386
},
{
"epoch": 0.32594048287478944,
"grad_norm": 0.4920358955860138,
"learning_rate": 8.525118617910144e-06,
"loss": 0.4864,
"step": 387
},
{
"epoch": 0.32678270634475015,
"grad_norm": 0.5304857492446899,
"learning_rate": 8.514672827722824e-06,
"loss": 0.5064,
"step": 388
},
{
"epoch": 0.32762492981471086,
"grad_norm": 0.5001351237297058,
"learning_rate": 8.504196625756166e-06,
"loss": 0.4711,
"step": 389
},
{
"epoch": 0.3284671532846715,
"grad_norm": 0.46036016941070557,
"learning_rate": 8.493690102658703e-06,
"loss": 0.4413,
"step": 390
},
{
"epoch": 0.32930937675463223,
"grad_norm": 0.5410290360450745,
"learning_rate": 8.483153349341336e-06,
"loss": 0.4736,
"step": 391
},
{
"epoch": 0.33015160022459294,
"grad_norm": 0.4987352788448334,
"learning_rate": 8.472586456976534e-06,
"loss": 0.5136,
"step": 392
},
{
"epoch": 0.3309938236945536,
"grad_norm": 0.5358459949493408,
"learning_rate": 8.461989516997565e-06,
"loss": 0.4932,
"step": 393
},
{
"epoch": 0.3318360471645143,
"grad_norm": 0.5530543923377991,
"learning_rate": 8.45136262109768e-06,
"loss": 0.4795,
"step": 394
},
{
"epoch": 0.332678270634475,
"grad_norm": 0.5860891938209534,
"learning_rate": 8.440705861229344e-06,
"loss": 0.4831,
"step": 395
},
{
"epoch": 0.3335204941044357,
"grad_norm": 0.5428664684295654,
"learning_rate": 8.430019329603423e-06,
"loss": 0.4794,
"step": 396
},
{
"epoch": 0.3343627175743964,
"grad_norm": 0.6753725409507751,
"learning_rate": 8.41930311868839e-06,
"loss": 0.4751,
"step": 397
},
{
"epoch": 0.3352049410443571,
"grad_norm": 0.4751342535018921,
"learning_rate": 8.408557321209534e-06,
"loss": 0.5061,
"step": 398
},
{
"epoch": 0.3360471645143178,
"grad_norm": 0.6972861289978027,
"learning_rate": 8.397782030148147e-06,
"loss": 0.5005,
"step": 399
},
{
"epoch": 0.3368893879842785,
"grad_norm": 0.5487985014915466,
"learning_rate": 8.386977338740724e-06,
"loss": 0.4795,
"step": 400
},
{
"epoch": 0.3377316114542392,
"grad_norm": 0.43178796768188477,
"learning_rate": 8.376143340478153e-06,
"loss": 0.4353,
"step": 401
},
{
"epoch": 0.3385738349241999,
"grad_norm": 0.5557446479797363,
"learning_rate": 8.365280129104912e-06,
"loss": 0.5051,
"step": 402
},
{
"epoch": 0.33941605839416056,
"grad_norm": 0.5225008726119995,
"learning_rate": 8.354387798618254e-06,
"loss": 0.4792,
"step": 403
},
{
"epoch": 0.34025828186412127,
"grad_norm": 0.5050113797187805,
"learning_rate": 8.34346644326739e-06,
"loss": 0.4454,
"step": 404
},
{
"epoch": 0.341100505334082,
"grad_norm": 0.45230233669281006,
"learning_rate": 8.332516157552684e-06,
"loss": 0.4878,
"step": 405
},
{
"epoch": 0.3419427288040427,
"grad_norm": 0.49938130378723145,
"learning_rate": 8.321537036224822e-06,
"loss": 0.4974,
"step": 406
},
{
"epoch": 0.34278495227400335,
"grad_norm": 0.4604662358760834,
"learning_rate": 8.310529174284004e-06,
"loss": 0.4912,
"step": 407
},
{
"epoch": 0.34362717574396406,
"grad_norm": 0.42852020263671875,
"learning_rate": 8.299492666979114e-06,
"loss": 0.4733,
"step": 408
},
{
"epoch": 0.3444693992139248,
"grad_norm": 0.5300406813621521,
"learning_rate": 8.288427609806899e-06,
"loss": 0.4936,
"step": 409
},
{
"epoch": 0.34531162268388543,
"grad_norm": 0.46928277611732483,
"learning_rate": 8.277334098511147e-06,
"loss": 0.4815,
"step": 410
},
{
"epoch": 0.34615384615384615,
"grad_norm": 0.46530747413635254,
"learning_rate": 8.266212229081846e-06,
"loss": 0.4928,
"step": 411
},
{
"epoch": 0.34699606962380686,
"grad_norm": 0.44099971652030945,
"learning_rate": 8.255062097754371e-06,
"loss": 0.4854,
"step": 412
},
{
"epoch": 0.34783829309376757,
"grad_norm": 0.4617399275302887,
"learning_rate": 8.243883801008632e-06,
"loss": 0.4707,
"step": 413
},
{
"epoch": 0.3486805165637282,
"grad_norm": 0.5029078125953674,
"learning_rate": 8.232677435568252e-06,
"loss": 0.4753,
"step": 414
},
{
"epoch": 0.34952274003368894,
"grad_norm": 0.46492263674736023,
"learning_rate": 8.221443098399733e-06,
"loss": 0.4455,
"step": 415
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.4844779968261719,
"learning_rate": 8.210180886711603e-06,
"loss": 0.4819,
"step": 416
},
{
"epoch": 0.3512071869736103,
"grad_norm": 0.4448031783103943,
"learning_rate": 8.198890897953586e-06,
"loss": 0.4637,
"step": 417
},
{
"epoch": 0.352049410443571,
"grad_norm": 0.46674251556396484,
"learning_rate": 8.187573229815757e-06,
"loss": 0.4806,
"step": 418
},
{
"epoch": 0.35289163391353173,
"grad_norm": 0.4910249412059784,
"learning_rate": 8.176227980227693e-06,
"loss": 0.4937,
"step": 419
},
{
"epoch": 0.35373385738349244,
"grad_norm": 0.4387272894382477,
"learning_rate": 8.164855247357628e-06,
"loss": 0.4963,
"step": 420
},
{
"epoch": 0.3545760808534531,
"grad_norm": 0.46658802032470703,
"learning_rate": 8.153455129611605e-06,
"loss": 0.484,
"step": 421
},
{
"epoch": 0.3554183043234138,
"grad_norm": 0.41567468643188477,
"learning_rate": 8.142027725632622e-06,
"loss": 0.4565,
"step": 422
},
{
"epoch": 0.3562605277933745,
"grad_norm": 0.5063756704330444,
"learning_rate": 8.130573134299782e-06,
"loss": 0.4982,
"step": 423
},
{
"epoch": 0.3571027512633352,
"grad_norm": 0.48313266038894653,
"learning_rate": 8.119091454727427e-06,
"loss": 0.5071,
"step": 424
},
{
"epoch": 0.3579449747332959,
"grad_norm": 0.40613314509391785,
"learning_rate": 8.107582786264299e-06,
"loss": 0.4645,
"step": 425
},
{
"epoch": 0.3587871982032566,
"grad_norm": 0.5263804793357849,
"learning_rate": 8.09604722849266e-06,
"loss": 0.483,
"step": 426
},
{
"epoch": 0.3596294216732173,
"grad_norm": 0.41154658794403076,
"learning_rate": 8.084484881227449e-06,
"loss": 0.4742,
"step": 427
},
{
"epoch": 0.360471645143178,
"grad_norm": 0.45373862981796265,
"learning_rate": 8.072895844515398e-06,
"loss": 0.4975,
"step": 428
},
{
"epoch": 0.3613138686131387,
"grad_norm": 0.5073238611221313,
"learning_rate": 8.061280218634192e-06,
"loss": 0.4869,
"step": 429
},
{
"epoch": 0.3621560920830994,
"grad_norm": 0.44339868426322937,
"learning_rate": 8.049638104091575e-06,
"loss": 0.4695,
"step": 430
},
{
"epoch": 0.36299831555306006,
"grad_norm": 0.4490543603897095,
"learning_rate": 8.037969601624495e-06,
"loss": 0.4818,
"step": 431
},
{
"epoch": 0.36384053902302077,
"grad_norm": 0.4600644111633301,
"learning_rate": 8.026274812198235e-06,
"loss": 0.4678,
"step": 432
},
{
"epoch": 0.3646827624929815,
"grad_norm": 0.553426206111908,
"learning_rate": 8.014553837005527e-06,
"loss": 0.4811,
"step": 433
},
{
"epoch": 0.3655249859629422,
"grad_norm": 0.4366907477378845,
"learning_rate": 8.002806777465685e-06,
"loss": 0.4924,
"step": 434
},
{
"epoch": 0.36636720943290285,
"grad_norm": 0.5031819343566895,
"learning_rate": 7.99103373522373e-06,
"loss": 0.4777,
"step": 435
},
{
"epoch": 0.36720943290286356,
"grad_norm": 0.4188793897628784,
"learning_rate": 7.9792348121495e-06,
"loss": 0.4866,
"step": 436
},
{
"epoch": 0.3680516563728243,
"grad_norm": 0.5696221590042114,
"learning_rate": 7.967410110336782e-06,
"loss": 0.4991,
"step": 437
},
{
"epoch": 0.36889387984278493,
"grad_norm": 0.49628570675849915,
"learning_rate": 7.955559732102414e-06,
"loss": 0.4774,
"step": 438
},
{
"epoch": 0.36973610331274565,
"grad_norm": 0.4882410168647766,
"learning_rate": 7.943683779985412e-06,
"loss": 0.449,
"step": 439
},
{
"epoch": 0.37057832678270636,
"grad_norm": 0.48540210723876953,
"learning_rate": 7.931782356746076e-06,
"loss": 0.4842,
"step": 440
},
{
"epoch": 0.371420550252667,
"grad_norm": 0.5185336470603943,
"learning_rate": 7.919855565365102e-06,
"loss": 0.4869,
"step": 441
},
{
"epoch": 0.3722627737226277,
"grad_norm": 0.5434253811836243,
"learning_rate": 7.907903509042696e-06,
"loss": 0.4941,
"step": 442
},
{
"epoch": 0.37310499719258844,
"grad_norm": 0.5163787007331848,
"learning_rate": 7.895926291197667e-06,
"loss": 0.4892,
"step": 443
},
{
"epoch": 0.37394722066254915,
"grad_norm": 0.44851982593536377,
"learning_rate": 7.883924015466554e-06,
"loss": 0.4845,
"step": 444
},
{
"epoch": 0.3747894441325098,
"grad_norm": 0.49744945764541626,
"learning_rate": 7.871896785702707e-06,
"loss": 0.4848,
"step": 445
},
{
"epoch": 0.3756316676024705,
"grad_norm": 0.40333092212677,
"learning_rate": 7.859844705975405e-06,
"loss": 0.4714,
"step": 446
},
{
"epoch": 0.37647389107243123,
"grad_norm": 0.4801202714443207,
"learning_rate": 7.847767880568944e-06,
"loss": 0.4677,
"step": 447
},
{
"epoch": 0.3773161145423919,
"grad_norm": 0.469712495803833,
"learning_rate": 7.835666413981744e-06,
"loss": 0.4804,
"step": 448
},
{
"epoch": 0.3781583380123526,
"grad_norm": 0.5278536677360535,
"learning_rate": 7.823540410925434e-06,
"loss": 0.4955,
"step": 449
},
{
"epoch": 0.3790005614823133,
"grad_norm": 0.5002195239067078,
"learning_rate": 7.811389976323963e-06,
"loss": 0.5015,
"step": 450
},
{
"epoch": 0.379842784952274,
"grad_norm": 0.5122157335281372,
"learning_rate": 7.799215215312667e-06,
"loss": 0.4618,
"step": 451
},
{
"epoch": 0.3806850084222347,
"grad_norm": 0.4218645393848419,
"learning_rate": 7.787016233237387e-06,
"loss": 0.4651,
"step": 452
},
{
"epoch": 0.3815272318921954,
"grad_norm": 0.4596264362335205,
"learning_rate": 7.774793135653537e-06,
"loss": 0.4818,
"step": 453
},
{
"epoch": 0.3823694553621561,
"grad_norm": 0.5322352051734924,
"learning_rate": 7.7625460283252e-06,
"loss": 0.477,
"step": 454
},
{
"epoch": 0.38321167883211676,
"grad_norm": 0.4300509989261627,
"learning_rate": 7.750275017224208e-06,
"loss": 0.4693,
"step": 455
},
{
"epoch": 0.3840539023020775,
"grad_norm": 0.4399045407772064,
"learning_rate": 7.737980208529232e-06,
"loss": 0.4498,
"step": 456
},
{
"epoch": 0.3848961257720382,
"grad_norm": 0.49768343567848206,
"learning_rate": 7.725661708624855e-06,
"loss": 0.4487,
"step": 457
},
{
"epoch": 0.3857383492419989,
"grad_norm": 0.47401291131973267,
"learning_rate": 7.713319624100657e-06,
"loss": 0.4897,
"step": 458
},
{
"epoch": 0.38658057271195956,
"grad_norm": 0.4233331084251404,
"learning_rate": 7.700954061750295e-06,
"loss": 0.4838,
"step": 459
},
{
"epoch": 0.38742279618192027,
"grad_norm": 0.45259544253349304,
"learning_rate": 7.688565128570564e-06,
"loss": 0.4935,
"step": 460
},
{
"epoch": 0.388265019651881,
"grad_norm": 0.5088585615158081,
"learning_rate": 7.676152931760496e-06,
"loss": 0.4579,
"step": 461
},
{
"epoch": 0.38910724312184164,
"grad_norm": 0.3829163908958435,
"learning_rate": 7.663717578720412e-06,
"loss": 0.4354,
"step": 462
},
{
"epoch": 0.38994946659180235,
"grad_norm": 0.46096691489219666,
"learning_rate": 7.651259177050996e-06,
"loss": 0.4781,
"step": 463
},
{
"epoch": 0.39079169006176306,
"grad_norm": 0.4710644781589508,
"learning_rate": 7.638777834552372e-06,
"loss": 0.4683,
"step": 464
},
{
"epoch": 0.3916339135317238,
"grad_norm": 0.4167614281177521,
"learning_rate": 7.626273659223166e-06,
"loss": 0.476,
"step": 465
},
{
"epoch": 0.39247613700168443,
"grad_norm": 0.46090462803840637,
"learning_rate": 7.61374675925957e-06,
"loss": 0.4579,
"step": 466
},
{
"epoch": 0.39331836047164515,
"grad_norm": 0.4632381498813629,
"learning_rate": 7.601197243054411e-06,
"loss": 0.484,
"step": 467
},
{
"epoch": 0.39416058394160586,
"grad_norm": 0.3899933993816376,
"learning_rate": 7.588625219196208e-06,
"loss": 0.4761,
"step": 468
},
{
"epoch": 0.3950028074115665,
"grad_norm": 0.424213171005249,
"learning_rate": 7.576030796468233e-06,
"loss": 0.4765,
"step": 469
},
{
"epoch": 0.3958450308815272,
"grad_norm": 0.4018155336380005,
"learning_rate": 7.563414083847573e-06,
"loss": 0.4628,
"step": 470
},
{
"epoch": 0.39668725435148794,
"grad_norm": 0.41502058506011963,
"learning_rate": 7.5507751905041885e-06,
"loss": 0.4882,
"step": 471
},
{
"epoch": 0.39752947782144865,
"grad_norm": 0.39122501015663147,
"learning_rate": 7.538114225799955e-06,
"loss": 0.476,
"step": 472
},
{
"epoch": 0.3983717012914093,
"grad_norm": 0.4315950274467468,
"learning_rate": 7.525431299287737e-06,
"loss": 0.4919,
"step": 473
},
{
"epoch": 0.39921392476137,
"grad_norm": 0.43616175651550293,
"learning_rate": 7.512726520710429e-06,
"loss": 0.4607,
"step": 474
},
{
"epoch": 0.40005614823133073,
"grad_norm": 0.4297219216823578,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4841,
"step": 475
},
{
"epoch": 0.4008983717012914,
"grad_norm": 0.4343617260456085,
"learning_rate": 7.4872518472765594e-06,
"loss": 0.4981,
"step": 476
},
{
"epoch": 0.4017405951712521,
"grad_norm": 0.4350733757019043,
"learning_rate": 7.474482172847391e-06,
"loss": 0.4766,
"step": 477
},
{
"epoch": 0.4025828186412128,
"grad_norm": 0.42052868008613586,
"learning_rate": 7.461691087205993e-06,
"loss": 0.4421,
"step": 478
},
{
"epoch": 0.40342504211117347,
"grad_norm": 0.43227866291999817,
"learning_rate": 7.4488787010311425e-06,
"loss": 0.4643,
"step": 479
},
{
"epoch": 0.4042672655811342,
"grad_norm": 0.4426368772983551,
"learning_rate": 7.436045125185923e-06,
"loss": 0.4906,
"step": 480
},
{
"epoch": 0.4051094890510949,
"grad_norm": 0.436074823141098,
"learning_rate": 7.423190470716761e-06,
"loss": 0.4437,
"step": 481
},
{
"epoch": 0.4059517125210556,
"grad_norm": 0.40371957421302795,
"learning_rate": 7.4103148488524824e-06,
"loss": 0.4843,
"step": 482
},
{
"epoch": 0.40679393599101626,
"grad_norm": 0.47098270058631897,
"learning_rate": 7.3974183710033334e-06,
"loss": 0.4811,
"step": 483
},
{
"epoch": 0.407636159460977,
"grad_norm": 0.4514760375022888,
"learning_rate": 7.384501148760024e-06,
"loss": 0.4919,
"step": 484
},
{
"epoch": 0.4084783829309377,
"grad_norm": 0.4151240289211273,
"learning_rate": 7.371563293892761e-06,
"loss": 0.493,
"step": 485
},
{
"epoch": 0.40932060640089835,
"grad_norm": 0.4454137980937958,
"learning_rate": 7.3586049183502875e-06,
"loss": 0.4652,
"step": 486
},
{
"epoch": 0.41016282987085906,
"grad_norm": 0.4351678490638733,
"learning_rate": 7.345626134258897e-06,
"loss": 0.4761,
"step": 487
},
{
"epoch": 0.41100505334081977,
"grad_norm": 0.4036249816417694,
"learning_rate": 7.3326270539214826e-06,
"loss": 0.4785,
"step": 488
},
{
"epoch": 0.4118472768107805,
"grad_norm": 0.46877068281173706,
"learning_rate": 7.319607789816555e-06,
"loss": 0.4532,
"step": 489
},
{
"epoch": 0.41268950028074114,
"grad_norm": 0.41278883814811707,
"learning_rate": 7.306568454597269e-06,
"loss": 0.4644,
"step": 490
},
{
"epoch": 0.41353172375070185,
"grad_norm": 0.4356534779071808,
"learning_rate": 7.293509161090453e-06,
"loss": 0.488,
"step": 491
},
{
"epoch": 0.41437394722066256,
"grad_norm": 0.45111823081970215,
"learning_rate": 7.28043002229563e-06,
"loss": 0.4584,
"step": 492
},
{
"epoch": 0.4152161706906232,
"grad_norm": 0.510417103767395,
"learning_rate": 7.2673311513840395e-06,
"loss": 0.4709,
"step": 493
},
{
"epoch": 0.41605839416058393,
"grad_norm": 0.46581393480300903,
"learning_rate": 7.2542126616976596e-06,
"loss": 0.4468,
"step": 494
},
{
"epoch": 0.41690061763054465,
"grad_norm": 0.5376666784286499,
"learning_rate": 7.241074666748228e-06,
"loss": 0.4703,
"step": 495
},
{
"epoch": 0.41774284110050536,
"grad_norm": 0.47817128896713257,
"learning_rate": 7.227917280216254e-06,
"loss": 0.5045,
"step": 496
},
{
"epoch": 0.418585064570466,
"grad_norm": 0.5402735471725464,
"learning_rate": 7.214740615950041e-06,
"loss": 0.4845,
"step": 497
},
{
"epoch": 0.4194272880404267,
"grad_norm": 0.4992601275444031,
"learning_rate": 7.201544787964698e-06,
"loss": 0.4657,
"step": 498
},
{
"epoch": 0.42026951151038744,
"grad_norm": 0.45684367418289185,
"learning_rate": 7.188329910441154e-06,
"loss": 0.4952,
"step": 499
},
{
"epoch": 0.4211117349803481,
"grad_norm": 0.519280731678009,
"learning_rate": 7.175096097725169e-06,
"loss": 0.4763,
"step": 500
},
{
"epoch": 0.4219539584503088,
"grad_norm": 0.4592819809913635,
"learning_rate": 7.161843464326349e-06,
"loss": 0.484,
"step": 501
},
{
"epoch": 0.4227961819202695,
"grad_norm": 0.4927172064781189,
"learning_rate": 7.148572124917148e-06,
"loss": 0.465,
"step": 502
},
{
"epoch": 0.42363840539023023,
"grad_norm": 0.4536415934562683,
"learning_rate": 7.135282194331881e-06,
"loss": 0.4831,
"step": 503
},
{
"epoch": 0.4244806288601909,
"grad_norm": 0.5175096392631531,
"learning_rate": 7.121973787565727e-06,
"loss": 0.4689,
"step": 504
},
{
"epoch": 0.4253228523301516,
"grad_norm": 0.48967283964157104,
"learning_rate": 7.1086470197737405e-06,
"loss": 0.4808,
"step": 505
},
{
"epoch": 0.4261650758001123,
"grad_norm": 0.4530904293060303,
"learning_rate": 7.095302006269842e-06,
"loss": 0.4601,
"step": 506
},
{
"epoch": 0.42700729927007297,
"grad_norm": 0.48659196496009827,
"learning_rate": 7.0819388625258385e-06,
"loss": 0.4942,
"step": 507
},
{
"epoch": 0.4278495227400337,
"grad_norm": 0.5468390583992004,
"learning_rate": 7.06855770417041e-06,
"loss": 0.4671,
"step": 508
},
{
"epoch": 0.4286917462099944,
"grad_norm": 0.465054452419281,
"learning_rate": 7.05515864698811e-06,
"loss": 0.4608,
"step": 509
},
{
"epoch": 0.4295339696799551,
"grad_norm": 0.4302669167518616,
"learning_rate": 7.041741806918372e-06,
"loss": 0.4582,
"step": 510
},
{
"epoch": 0.43037619314991576,
"grad_norm": 0.4395880699157715,
"learning_rate": 7.028307300054499e-06,
"loss": 0.4585,
"step": 511
},
{
"epoch": 0.4312184166198765,
"grad_norm": 0.4656515419483185,
"learning_rate": 7.014855242642662e-06,
"loss": 0.4557,
"step": 512
},
{
"epoch": 0.4320606400898372,
"grad_norm": 0.4596274793148041,
"learning_rate": 7.0013857510808934e-06,
"loss": 0.483,
"step": 513
},
{
"epoch": 0.43290286355979785,
"grad_norm": 0.5049313902854919,
"learning_rate": 6.987898941918082e-06,
"loss": 0.4748,
"step": 514
},
{
"epoch": 0.43374508702975856,
"grad_norm": 0.4165242910385132,
"learning_rate": 6.974394931852957e-06,
"loss": 0.4917,
"step": 515
},
{
"epoch": 0.43458731049971927,
"grad_norm": 0.402126282453537,
"learning_rate": 6.960873837733089e-06,
"loss": 0.4691,
"step": 516
},
{
"epoch": 0.43542953396968,
"grad_norm": 0.444795697927475,
"learning_rate": 6.94733577655387e-06,
"loss": 0.4783,
"step": 517
},
{
"epoch": 0.43627175743964064,
"grad_norm": 0.4488651752471924,
"learning_rate": 6.933780865457508e-06,
"loss": 0.4687,
"step": 518
},
{
"epoch": 0.43711398090960135,
"grad_norm": 0.44755545258522034,
"learning_rate": 6.920209221732007e-06,
"loss": 0.4759,
"step": 519
},
{
"epoch": 0.43795620437956206,
"grad_norm": 0.4631388187408447,
"learning_rate": 6.90662096281016e-06,
"loss": 0.4559,
"step": 520
},
{
"epoch": 0.4387984278495227,
"grad_norm": 0.47367194294929504,
"learning_rate": 6.893016206268518e-06,
"loss": 0.4921,
"step": 521
},
{
"epoch": 0.43964065131948343,
"grad_norm": 0.4509705603122711,
"learning_rate": 6.879395069826394e-06,
"loss": 0.488,
"step": 522
},
{
"epoch": 0.44048287478944415,
"grad_norm": 0.4384632408618927,
"learning_rate": 6.865757671344827e-06,
"loss": 0.4634,
"step": 523
},
{
"epoch": 0.4413250982594048,
"grad_norm": 0.43855151534080505,
"learning_rate": 6.85210412882557e-06,
"loss": 0.4577,
"step": 524
},
{
"epoch": 0.4421673217293655,
"grad_norm": 0.4600560963153839,
"learning_rate": 6.838434560410064e-06,
"loss": 0.4548,
"step": 525
},
{
"epoch": 0.4430095451993262,
"grad_norm": 0.4522815942764282,
"learning_rate": 6.824749084378428e-06,
"loss": 0.4455,
"step": 526
},
{
"epoch": 0.44385176866928694,
"grad_norm": 0.44057661294937134,
"learning_rate": 6.811047819148413e-06,
"loss": 0.462,
"step": 527
},
{
"epoch": 0.4446939921392476,
"grad_norm": 0.40139660239219666,
"learning_rate": 6.7973308832744035e-06,
"loss": 0.4661,
"step": 528
},
{
"epoch": 0.4455362156092083,
"grad_norm": 0.48533400893211365,
"learning_rate": 6.783598395446371e-06,
"loss": 0.4767,
"step": 529
},
{
"epoch": 0.446378439079169,
"grad_norm": 0.39077308773994446,
"learning_rate": 6.769850474488859e-06,
"loss": 0.4465,
"step": 530
},
{
"epoch": 0.4472206625491297,
"grad_norm": 0.4498750865459442,
"learning_rate": 6.756087239359948e-06,
"loss": 0.467,
"step": 531
},
{
"epoch": 0.4480628860190904,
"grad_norm": 0.4155936539173126,
"learning_rate": 6.742308809150232e-06,
"loss": 0.4667,
"step": 532
},
{
"epoch": 0.4489051094890511,
"grad_norm": 0.44090476632118225,
"learning_rate": 6.728515303081782e-06,
"loss": 0.4641,
"step": 533
},
{
"epoch": 0.4497473329590118,
"grad_norm": 0.43852943181991577,
"learning_rate": 6.714706840507122e-06,
"loss": 0.4667,
"step": 534
},
{
"epoch": 0.45058955642897247,
"grad_norm": 0.42327818274497986,
"learning_rate": 6.700883540908185e-06,
"loss": 0.4574,
"step": 535
},
{
"epoch": 0.4514317798989332,
"grad_norm": 0.4370694160461426,
"learning_rate": 6.687045523895292e-06,
"loss": 0.4699,
"step": 536
},
{
"epoch": 0.4522740033688939,
"grad_norm": 0.45204928517341614,
"learning_rate": 6.673192909206109e-06,
"loss": 0.483,
"step": 537
},
{
"epoch": 0.45311622683885455,
"grad_norm": 0.4360177218914032,
"learning_rate": 6.6593258167046115e-06,
"loss": 0.5015,
"step": 538
},
{
"epoch": 0.45395845030881526,
"grad_norm": 0.5273513793945312,
"learning_rate": 6.64544436638005e-06,
"loss": 0.4976,
"step": 539
},
{
"epoch": 0.454800673778776,
"grad_norm": 0.45911315083503723,
"learning_rate": 6.63154867834591e-06,
"loss": 0.4805,
"step": 540
},
{
"epoch": 0.4556428972487367,
"grad_norm": 0.46684151887893677,
"learning_rate": 6.617638872838874e-06,
"loss": 0.4719,
"step": 541
},
{
"epoch": 0.45648512071869735,
"grad_norm": 0.44970595836639404,
"learning_rate": 6.603715070217779e-06,
"loss": 0.467,
"step": 542
},
{
"epoch": 0.45732734418865806,
"grad_norm": 0.45776426792144775,
"learning_rate": 6.589777390962575e-06,
"loss": 0.4645,
"step": 543
},
{
"epoch": 0.45816956765861877,
"grad_norm": 0.4789178669452667,
"learning_rate": 6.5758259556732896e-06,
"loss": 0.4857,
"step": 544
},
{
"epoch": 0.4590117911285794,
"grad_norm": 0.4421025514602661,
"learning_rate": 6.561860885068972e-06,
"loss": 0.4294,
"step": 545
},
{
"epoch": 0.45985401459854014,
"grad_norm": 0.46660929918289185,
"learning_rate": 6.547882299986658e-06,
"loss": 0.4715,
"step": 546
},
{
"epoch": 0.46069623806850085,
"grad_norm": 0.4705526530742645,
"learning_rate": 6.53389032138032e-06,
"loss": 0.4902,
"step": 547
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.44154104590415955,
"learning_rate": 6.519885070319827e-06,
"loss": 0.4955,
"step": 548
},
{
"epoch": 0.4623806850084222,
"grad_norm": 0.4737115502357483,
"learning_rate": 6.505866667989884e-06,
"loss": 0.4666,
"step": 549
},
{
"epoch": 0.46322290847838293,
"grad_norm": 0.4516438841819763,
"learning_rate": 6.491835235688999e-06,
"loss": 0.4667,
"step": 550
},
{
"epoch": 0.46406513194834365,
"grad_norm": 0.39548397064208984,
"learning_rate": 6.477790894828422e-06,
"loss": 0.4495,
"step": 551
},
{
"epoch": 0.4649073554183043,
"grad_norm": 0.4581812620162964,
"learning_rate": 6.463733766931096e-06,
"loss": 0.4766,
"step": 552
},
{
"epoch": 0.465749578888265,
"grad_norm": 0.5608298182487488,
"learning_rate": 6.449663973630613e-06,
"loss": 0.4651,
"step": 553
},
{
"epoch": 0.4665918023582257,
"grad_norm": 0.433051735162735,
"learning_rate": 6.435581636670154e-06,
"loss": 0.4615,
"step": 554
},
{
"epoch": 0.46743402582818644,
"grad_norm": 0.45779332518577576,
"learning_rate": 6.421486877901436e-06,
"loss": 0.4748,
"step": 555
},
{
"epoch": 0.4682762492981471,
"grad_norm": 0.5338222980499268,
"learning_rate": 6.407379819283661e-06,
"loss": 0.4677,
"step": 556
},
{
"epoch": 0.4691184727681078,
"grad_norm": 0.4301375150680542,
"learning_rate": 6.393260582882462e-06,
"loss": 0.4461,
"step": 557
},
{
"epoch": 0.4699606962380685,
"grad_norm": 0.3875766694545746,
"learning_rate": 6.379129290868837e-06,
"loss": 0.4608,
"step": 558
},
{
"epoch": 0.4708029197080292,
"grad_norm": 0.49393564462661743,
"learning_rate": 6.364986065518106e-06,
"loss": 0.454,
"step": 559
},
{
"epoch": 0.4716451431779899,
"grad_norm": 0.433536559343338,
"learning_rate": 6.350831029208844e-06,
"loss": 0.4648,
"step": 560
},
{
"epoch": 0.4724873666479506,
"grad_norm": 0.3517136871814728,
"learning_rate": 6.336664304421818e-06,
"loss": 0.4587,
"step": 561
},
{
"epoch": 0.47332959011791126,
"grad_norm": 0.4940553307533264,
"learning_rate": 6.322486013738942e-06,
"loss": 0.4799,
"step": 562
},
{
"epoch": 0.47417181358787197,
"grad_norm": 0.4068980813026428,
"learning_rate": 6.308296279842204e-06,
"loss": 0.4621,
"step": 563
},
{
"epoch": 0.4750140370578327,
"grad_norm": 0.41587033867836,
"learning_rate": 6.294095225512604e-06,
"loss": 0.4695,
"step": 564
},
{
"epoch": 0.4758562605277934,
"grad_norm": 0.4289080798625946,
"learning_rate": 6.279882973629101e-06,
"loss": 0.4606,
"step": 565
},
{
"epoch": 0.47669848399775405,
"grad_norm": 0.3981984555721283,
"learning_rate": 6.265659647167542e-06,
"loss": 0.4778,
"step": 566
},
{
"epoch": 0.47754070746771476,
"grad_norm": 0.41328713297843933,
"learning_rate": 6.2514253691996e-06,
"loss": 0.4698,
"step": 567
},
{
"epoch": 0.4783829309376755,
"grad_norm": 0.509678304195404,
"learning_rate": 6.237180262891709e-06,
"loss": 0.4817,
"step": 568
},
{
"epoch": 0.47922515440763613,
"grad_norm": 0.4245469272136688,
"learning_rate": 6.222924451504001e-06,
"loss": 0.4453,
"step": 569
},
{
"epoch": 0.48006737787759685,
"grad_norm": 0.44388455152511597,
"learning_rate": 6.208658058389232e-06,
"loss": 0.4514,
"step": 570
},
{
"epoch": 0.48090960134755756,
"grad_norm": 0.4333799481391907,
"learning_rate": 6.194381206991723e-06,
"loss": 0.4746,
"step": 571
},
{
"epoch": 0.48175182481751827,
"grad_norm": 0.46454960107803345,
"learning_rate": 6.180094020846291e-06,
"loss": 0.4685,
"step": 572
},
{
"epoch": 0.4825940482874789,
"grad_norm": 0.41998291015625,
"learning_rate": 6.165796623577171e-06,
"loss": 0.4747,
"step": 573
},
{
"epoch": 0.48343627175743964,
"grad_norm": 0.3703177869319916,
"learning_rate": 6.15148913889696e-06,
"loss": 0.4871,
"step": 574
},
{
"epoch": 0.48427849522740035,
"grad_norm": 0.5082182884216309,
"learning_rate": 6.1371716906055336e-06,
"loss": 0.4732,
"step": 575
},
{
"epoch": 0.485120718697361,
"grad_norm": 0.4215005040168762,
"learning_rate": 6.122844402588982e-06,
"loss": 0.4494,
"step": 576
},
{
"epoch": 0.4859629421673217,
"grad_norm": 0.42425084114074707,
"learning_rate": 6.10850739881854e-06,
"loss": 0.4969,
"step": 577
},
{
"epoch": 0.48680516563728243,
"grad_norm": 0.4311264455318451,
"learning_rate": 6.094160803349508e-06,
"loss": 0.469,
"step": 578
},
{
"epoch": 0.48764738910724315,
"grad_norm": 0.41804108023643494,
"learning_rate": 6.079804740320181e-06,
"loss": 0.4426,
"step": 579
},
{
"epoch": 0.4884896125772038,
"grad_norm": 0.41517356038093567,
"learning_rate": 6.065439333950776e-06,
"loss": 0.4928,
"step": 580
},
{
"epoch": 0.4893318360471645,
"grad_norm": 0.42737406492233276,
"learning_rate": 6.051064708542357e-06,
"loss": 0.4801,
"step": 581
},
{
"epoch": 0.4901740595171252,
"grad_norm": 0.43230849504470825,
"learning_rate": 6.036680988475756e-06,
"loss": 0.4823,
"step": 582
},
{
"epoch": 0.4910162829870859,
"grad_norm": 0.4388849437236786,
"learning_rate": 6.022288298210502e-06,
"loss": 0.4669,
"step": 583
},
{
"epoch": 0.4918585064570466,
"grad_norm": 0.37067726254463196,
"learning_rate": 6.00788676228374e-06,
"loss": 0.4689,
"step": 584
},
{
"epoch": 0.4927007299270073,
"grad_norm": 0.47136881947517395,
"learning_rate": 5.993476505309154e-06,
"loss": 0.4483,
"step": 585
},
{
"epoch": 0.493542953396968,
"grad_norm": 0.42969149351119995,
"learning_rate": 5.979057651975893e-06,
"loss": 0.4769,
"step": 586
},
{
"epoch": 0.4943851768669287,
"grad_norm": 0.40368854999542236,
"learning_rate": 5.964630327047485e-06,
"loss": 0.4696,
"step": 587
},
{
"epoch": 0.4952274003368894,
"grad_norm": 0.41034096479415894,
"learning_rate": 5.9501946553607615e-06,
"loss": 0.4644,
"step": 588
},
{
"epoch": 0.4960696238068501,
"grad_norm": 0.39626145362854004,
"learning_rate": 5.935750761824777e-06,
"loss": 0.4946,
"step": 589
},
{
"epoch": 0.49691184727681076,
"grad_norm": 0.3816596567630768,
"learning_rate": 5.921298771419731e-06,
"loss": 0.4747,
"step": 590
},
{
"epoch": 0.49775407074677147,
"grad_norm": 0.37927156686782837,
"learning_rate": 5.906838809195879e-06,
"loss": 0.4609,
"step": 591
},
{
"epoch": 0.4985962942167322,
"grad_norm": 0.4023749828338623,
"learning_rate": 5.8923710002724595e-06,
"loss": 0.4696,
"step": 592
},
{
"epoch": 0.4994385176866929,
"grad_norm": 0.39053875207901,
"learning_rate": 5.877895469836604e-06,
"loss": 0.4747,
"step": 593
},
{
"epoch": 0.5002807411566536,
"grad_norm": 0.4169563353061676,
"learning_rate": 5.863412343142258e-06,
"loss": 0.4278,
"step": 594
},
{
"epoch": 0.5011229646266142,
"grad_norm": 0.37836650013923645,
"learning_rate": 5.848921745509094e-06,
"loss": 0.4845,
"step": 595
},
{
"epoch": 0.501965188096575,
"grad_norm": 0.3742503523826599,
"learning_rate": 5.8344238023214305e-06,
"loss": 0.4789,
"step": 596
},
{
"epoch": 0.5028074115665356,
"grad_norm": 0.4057691693305969,
"learning_rate": 5.819918639027149e-06,
"loss": 0.46,
"step": 597
},
{
"epoch": 0.5036496350364964,
"grad_norm": 0.41869857907295227,
"learning_rate": 5.805406381136598e-06,
"loss": 0.4445,
"step": 598
},
{
"epoch": 0.5044918585064571,
"grad_norm": 0.4319513142108917,
"learning_rate": 5.790887154221521e-06,
"loss": 0.4922,
"step": 599
},
{
"epoch": 0.5053340819764177,
"grad_norm": 0.4296809434890747,
"learning_rate": 5.776361083913959e-06,
"loss": 0.472,
"step": 600
},
{
"epoch": 0.5061763054463785,
"grad_norm": 0.4201277494430542,
"learning_rate": 5.7618282959051685e-06,
"loss": 0.4929,
"step": 601
},
{
"epoch": 0.5070185289163391,
"grad_norm": 0.4361591637134552,
"learning_rate": 5.747288915944533e-06,
"loss": 0.4883,
"step": 602
},
{
"epoch": 0.5078607523862998,
"grad_norm": 0.44996178150177,
"learning_rate": 5.7327430698384775e-06,
"loss": 0.464,
"step": 603
},
{
"epoch": 0.5087029758562606,
"grad_norm": 0.42448410391807556,
"learning_rate": 5.718190883449373e-06,
"loss": 0.4874,
"step": 604
},
{
"epoch": 0.5095451993262212,
"grad_norm": 0.39895886182785034,
"learning_rate": 5.703632482694453e-06,
"loss": 0.4809,
"step": 605
},
{
"epoch": 0.5103874227961819,
"grad_norm": 0.38959255814552307,
"learning_rate": 5.689067993544726e-06,
"loss": 0.4764,
"step": 606
},
{
"epoch": 0.5112296462661426,
"grad_norm": 0.401683509349823,
"learning_rate": 5.674497542023875e-06,
"loss": 0.462,
"step": 607
},
{
"epoch": 0.5120718697361033,
"grad_norm": 0.3931426405906677,
"learning_rate": 5.659921254207183e-06,
"loss": 0.4601,
"step": 608
},
{
"epoch": 0.512914093206064,
"grad_norm": 0.40031349658966064,
"learning_rate": 5.645339256220427e-06,
"loss": 0.4754,
"step": 609
},
{
"epoch": 0.5137563166760247,
"grad_norm": 0.3735891282558441,
"learning_rate": 5.630751674238796e-06,
"loss": 0.4692,
"step": 610
},
{
"epoch": 0.5145985401459854,
"grad_norm": 0.38055193424224854,
"learning_rate": 5.616158634485793e-06,
"loss": 0.4608,
"step": 611
},
{
"epoch": 0.5154407636159462,
"grad_norm": 0.3828696310520172,
"learning_rate": 5.601560263232153e-06,
"loss": 0.4503,
"step": 612
},
{
"epoch": 0.5162829870859068,
"grad_norm": 0.4191468358039856,
"learning_rate": 5.5869566867947344e-06,
"loss": 0.4595,
"step": 613
},
{
"epoch": 0.5171252105558675,
"grad_norm": 0.4003213346004486,
"learning_rate": 5.572348031535442e-06,
"loss": 0.4844,
"step": 614
},
{
"epoch": 0.5179674340258282,
"grad_norm": 0.4294838309288025,
"learning_rate": 5.557734423860122e-06,
"loss": 0.4766,
"step": 615
},
{
"epoch": 0.5188096574957889,
"grad_norm": 0.40257686376571655,
"learning_rate": 5.543115990217478e-06,
"loss": 0.4449,
"step": 616
},
{
"epoch": 0.5196518809657495,
"grad_norm": 0.4232497811317444,
"learning_rate": 5.528492857097966e-06,
"loss": 0.4923,
"step": 617
},
{
"epoch": 0.5204941044357103,
"grad_norm": 0.40218302607536316,
"learning_rate": 5.513865151032709e-06,
"loss": 0.4734,
"step": 618
},
{
"epoch": 0.521336327905671,
"grad_norm": 0.44613218307495117,
"learning_rate": 5.499232998592399e-06,
"loss": 0.4776,
"step": 619
},
{
"epoch": 0.5221785513756316,
"grad_norm": 0.387199342250824,
"learning_rate": 5.484596526386198e-06,
"loss": 0.4559,
"step": 620
},
{
"epoch": 0.5230207748455924,
"grad_norm": 0.40792933106422424,
"learning_rate": 5.469955861060653e-06,
"loss": 0.4555,
"step": 621
},
{
"epoch": 0.523862998315553,
"grad_norm": 0.43032124638557434,
"learning_rate": 5.455311129298586e-06,
"loss": 0.4751,
"step": 622
},
{
"epoch": 0.5247052217855137,
"grad_norm": 0.4241279661655426,
"learning_rate": 5.44066245781801e-06,
"loss": 0.4617,
"step": 623
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.496736615896225,
"learning_rate": 5.426009973371026e-06,
"loss": 0.4822,
"step": 624
},
{
"epoch": 0.5263896687254351,
"grad_norm": 0.3980671465396881,
"learning_rate": 5.4113538027427245e-06,
"loss": 0.4444,
"step": 625
},
{
"epoch": 0.5272318921953958,
"grad_norm": 0.4685482978820801,
"learning_rate": 5.396694072750099e-06,
"loss": 0.4616,
"step": 626
},
{
"epoch": 0.5280741156653566,
"grad_norm": 0.5459217429161072,
"learning_rate": 5.382030910240936e-06,
"loss": 0.4992,
"step": 627
},
{
"epoch": 0.5289163391353172,
"grad_norm": 0.4384770095348358,
"learning_rate": 5.367364442092724e-06,
"loss": 0.4957,
"step": 628
},
{
"epoch": 0.529758562605278,
"grad_norm": 0.518413782119751,
"learning_rate": 5.352694795211555e-06,
"loss": 0.4672,
"step": 629
},
{
"epoch": 0.5306007860752386,
"grad_norm": 0.4350719749927521,
"learning_rate": 5.338022096531028e-06,
"loss": 0.4752,
"step": 630
},
{
"epoch": 0.5314430095451993,
"grad_norm": 0.39527785778045654,
"learning_rate": 5.3233464730111426e-06,
"loss": 0.4673,
"step": 631
},
{
"epoch": 0.5322852330151601,
"grad_norm": 0.4205104410648346,
"learning_rate": 5.308668051637213e-06,
"loss": 0.472,
"step": 632
},
{
"epoch": 0.5331274564851207,
"grad_norm": 0.4029710590839386,
"learning_rate": 5.29398695941876e-06,
"loss": 0.446,
"step": 633
},
{
"epoch": 0.5339696799550814,
"grad_norm": 0.3991697430610657,
"learning_rate": 5.279303323388413e-06,
"loss": 0.4524,
"step": 634
},
{
"epoch": 0.5348119034250421,
"grad_norm": 0.44701534509658813,
"learning_rate": 5.2646172706008154e-06,
"loss": 0.4895,
"step": 635
},
{
"epoch": 0.5356541268950028,
"grad_norm": 0.38555386662483215,
"learning_rate": 5.249928928131523e-06,
"loss": 0.4863,
"step": 636
},
{
"epoch": 0.5364963503649635,
"grad_norm": 0.41185519099235535,
"learning_rate": 5.235238423075899e-06,
"loss": 0.4537,
"step": 637
},
{
"epoch": 0.5373385738349242,
"grad_norm": 0.42284438014030457,
"learning_rate": 5.220545882548024e-06,
"loss": 0.4314,
"step": 638
},
{
"epoch": 0.5381807973048849,
"grad_norm": 0.4240425229072571,
"learning_rate": 5.20585143367959e-06,
"loss": 0.475,
"step": 639
},
{
"epoch": 0.5390230207748455,
"grad_norm": 0.3929160535335541,
"learning_rate": 5.191155203618796e-06,
"loss": 0.4807,
"step": 640
},
{
"epoch": 0.5398652442448063,
"grad_norm": 0.4253106713294983,
"learning_rate": 5.176457319529264e-06,
"loss": 0.4671,
"step": 641
},
{
"epoch": 0.540707467714767,
"grad_norm": 0.41673335433006287,
"learning_rate": 5.161757908588917e-06,
"loss": 0.4632,
"step": 642
},
{
"epoch": 0.5415496911847277,
"grad_norm": 0.39571234583854675,
"learning_rate": 5.147057097988898e-06,
"loss": 0.4997,
"step": 643
},
{
"epoch": 0.5423919146546884,
"grad_norm": 0.4015507996082306,
"learning_rate": 5.132355014932455e-06,
"loss": 0.4616,
"step": 644
},
{
"epoch": 0.543234138124649,
"grad_norm": 0.38526424765586853,
"learning_rate": 5.1176517866338495e-06,
"loss": 0.4742,
"step": 645
},
{
"epoch": 0.5440763615946098,
"grad_norm": 0.41674524545669556,
"learning_rate": 5.102947540317254e-06,
"loss": 0.469,
"step": 646
},
{
"epoch": 0.5449185850645705,
"grad_norm": 0.4026922881603241,
"learning_rate": 5.088242403215644e-06,
"loss": 0.4537,
"step": 647
},
{
"epoch": 0.5457608085345311,
"grad_norm": 0.4339665472507477,
"learning_rate": 5.073536502569708e-06,
"loss": 0.4755,
"step": 648
},
{
"epoch": 0.5466030320044919,
"grad_norm": 0.46872204542160034,
"learning_rate": 5.058829965626742e-06,
"loss": 0.4777,
"step": 649
},
{
"epoch": 0.5474452554744526,
"grad_norm": 0.42406511306762695,
"learning_rate": 5.0441229196395416e-06,
"loss": 0.4523,
"step": 650
},
{
"epoch": 0.5482874789444132,
"grad_norm": 0.4089900851249695,
"learning_rate": 5.029415491865311e-06,
"loss": 0.4713,
"step": 651
},
{
"epoch": 0.549129702414374,
"grad_norm": 0.4266711175441742,
"learning_rate": 5.014707809564562e-06,
"loss": 0.457,
"step": 652
},
{
"epoch": 0.5499719258843346,
"grad_norm": 0.39317575097084045,
"learning_rate": 5e-06,
"loss": 0.4486,
"step": 653
},
{
"epoch": 0.5508141493542953,
"grad_norm": 0.35429295897483826,
"learning_rate": 4.98529219043544e-06,
"loss": 0.43,
"step": 654
},
{
"epoch": 0.5516563728242561,
"grad_norm": 0.4003024101257324,
"learning_rate": 4.97058450813469e-06,
"loss": 0.4655,
"step": 655
},
{
"epoch": 0.5524985962942167,
"grad_norm": 0.37687185406684875,
"learning_rate": 4.955877080360462e-06,
"loss": 0.4599,
"step": 656
},
{
"epoch": 0.5533408197641775,
"grad_norm": 0.43657246232032776,
"learning_rate": 4.94117003437326e-06,
"loss": 0.4669,
"step": 657
},
{
"epoch": 0.5541830432341381,
"grad_norm": 0.3669135570526123,
"learning_rate": 4.926463497430293e-06,
"loss": 0.4728,
"step": 658
},
{
"epoch": 0.5550252667040988,
"grad_norm": 0.44429507851600647,
"learning_rate": 4.911757596784358e-06,
"loss": 0.4655,
"step": 659
},
{
"epoch": 0.5558674901740596,
"grad_norm": 0.42411744594573975,
"learning_rate": 4.897052459682749e-06,
"loss": 0.4752,
"step": 660
},
{
"epoch": 0.5567097136440202,
"grad_norm": 0.4312066435813904,
"learning_rate": 4.882348213366152e-06,
"loss": 0.498,
"step": 661
},
{
"epoch": 0.5575519371139809,
"grad_norm": 0.3983328342437744,
"learning_rate": 4.867644985067548e-06,
"loss": 0.4738,
"step": 662
},
{
"epoch": 0.5583941605839416,
"grad_norm": 0.39142411947250366,
"learning_rate": 4.8529429020111035e-06,
"loss": 0.4389,
"step": 663
},
{
"epoch": 0.5592363840539023,
"grad_norm": 0.39371412992477417,
"learning_rate": 4.838242091411085e-06,
"loss": 0.4884,
"step": 664
},
{
"epoch": 0.560078607523863,
"grad_norm": 0.45728200674057007,
"learning_rate": 4.823542680470738e-06,
"loss": 0.5223,
"step": 665
},
{
"epoch": 0.5609208309938237,
"grad_norm": 0.4624292254447937,
"learning_rate": 4.808844796381205e-06,
"loss": 0.4471,
"step": 666
},
{
"epoch": 0.5617630544637844,
"grad_norm": 0.3894542157649994,
"learning_rate": 4.794148566320412e-06,
"loss": 0.467,
"step": 667
},
{
"epoch": 0.562605277933745,
"grad_norm": 0.39054685831069946,
"learning_rate": 4.779454117451978e-06,
"loss": 0.4515,
"step": 668
},
{
"epoch": 0.5634475014037058,
"grad_norm": 0.4176219701766968,
"learning_rate": 4.7647615769241e-06,
"loss": 0.4652,
"step": 669
},
{
"epoch": 0.5642897248736665,
"grad_norm": 0.4907318949699402,
"learning_rate": 4.750071071868478e-06,
"loss": 0.4899,
"step": 670
},
{
"epoch": 0.5651319483436271,
"grad_norm": 0.44418320059776306,
"learning_rate": 4.7353827293991845e-06,
"loss": 0.4385,
"step": 671
},
{
"epoch": 0.5659741718135879,
"grad_norm": 0.40623655915260315,
"learning_rate": 4.720696676611589e-06,
"loss": 0.4393,
"step": 672
},
{
"epoch": 0.5668163952835485,
"grad_norm": 0.4422489404678345,
"learning_rate": 4.706013040581242e-06,
"loss": 0.4545,
"step": 673
},
{
"epoch": 0.5676586187535093,
"grad_norm": 0.46714815497398376,
"learning_rate": 4.691331948362789e-06,
"loss": 0.4459,
"step": 674
},
{
"epoch": 0.56850084222347,
"grad_norm": 0.4542917013168335,
"learning_rate": 4.676653526988858e-06,
"loss": 0.4744,
"step": 675
},
{
"epoch": 0.5693430656934306,
"grad_norm": 0.4214479625225067,
"learning_rate": 4.661977903468974e-06,
"loss": 0.4547,
"step": 676
},
{
"epoch": 0.5701852891633914,
"grad_norm": 0.3610055148601532,
"learning_rate": 4.647305204788445e-06,
"loss": 0.4427,
"step": 677
},
{
"epoch": 0.571027512633352,
"grad_norm": 0.4122559130191803,
"learning_rate": 4.632635557907277e-06,
"loss": 0.4881,
"step": 678
},
{
"epoch": 0.5718697361033127,
"grad_norm": 0.3858247399330139,
"learning_rate": 4.617969089759066e-06,
"loss": 0.4591,
"step": 679
},
{
"epoch": 0.5727119595732735,
"grad_norm": 0.3892834186553955,
"learning_rate": 4.603305927249902e-06,
"loss": 0.4604,
"step": 680
},
{
"epoch": 0.5735541830432341,
"grad_norm": 0.415051132440567,
"learning_rate": 4.588646197257278e-06,
"loss": 0.4817,
"step": 681
},
{
"epoch": 0.5743964065131948,
"grad_norm": 0.3991219997406006,
"learning_rate": 4.573990026628976e-06,
"loss": 0.4761,
"step": 682
},
{
"epoch": 0.5752386299831556,
"grad_norm": 0.4070507287979126,
"learning_rate": 4.559337542181993e-06,
"loss": 0.4514,
"step": 683
},
{
"epoch": 0.5760808534531162,
"grad_norm": 0.40679407119750977,
"learning_rate": 4.544688870701416e-06,
"loss": 0.4671,
"step": 684
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.38636258244514465,
"learning_rate": 4.53004413893935e-06,
"loss": 0.4757,
"step": 685
},
{
"epoch": 0.5777653003930376,
"grad_norm": 0.40904495120048523,
"learning_rate": 4.5154034736138035e-06,
"loss": 0.4678,
"step": 686
},
{
"epoch": 0.5786075238629983,
"grad_norm": 0.3879135549068451,
"learning_rate": 4.500767001407604e-06,
"loss": 0.467,
"step": 687
},
{
"epoch": 0.5794497473329591,
"grad_norm": 0.3964616060256958,
"learning_rate": 4.486134848967292e-06,
"loss": 0.4487,
"step": 688
},
{
"epoch": 0.5802919708029197,
"grad_norm": 0.39060208201408386,
"learning_rate": 4.471507142902036e-06,
"loss": 0.4671,
"step": 689
},
{
"epoch": 0.5811341942728804,
"grad_norm": 0.4536742568016052,
"learning_rate": 4.4568840097825225e-06,
"loss": 0.474,
"step": 690
},
{
"epoch": 0.5819764177428411,
"grad_norm": 0.3637699484825134,
"learning_rate": 4.4422655761398785e-06,
"loss": 0.4465,
"step": 691
},
{
"epoch": 0.5828186412128018,
"grad_norm": 0.36765387654304504,
"learning_rate": 4.427651968464559e-06,
"loss": 0.4307,
"step": 692
},
{
"epoch": 0.5836608646827625,
"grad_norm": 0.42411142587661743,
"learning_rate": 4.413043313205266e-06,
"loss": 0.4689,
"step": 693
},
{
"epoch": 0.5845030881527232,
"grad_norm": 0.36600083112716675,
"learning_rate": 4.3984397367678475e-06,
"loss": 0.4659,
"step": 694
},
{
"epoch": 0.5853453116226839,
"grad_norm": 0.42704781889915466,
"learning_rate": 4.383841365514208e-06,
"loss": 0.4869,
"step": 695
},
{
"epoch": 0.5861875350926445,
"grad_norm": 0.3502572774887085,
"learning_rate": 4.369248325761205e-06,
"loss": 0.4559,
"step": 696
},
{
"epoch": 0.5870297585626053,
"grad_norm": 0.412727415561676,
"learning_rate": 4.354660743779575e-06,
"loss": 0.4551,
"step": 697
},
{
"epoch": 0.587871982032566,
"grad_norm": 0.43753746151924133,
"learning_rate": 4.340078745792818e-06,
"loss": 0.4875,
"step": 698
},
{
"epoch": 0.5887142055025266,
"grad_norm": 0.3773738443851471,
"learning_rate": 4.325502457976126e-06,
"loss": 0.4571,
"step": 699
},
{
"epoch": 0.5895564289724874,
"grad_norm": 0.3818628191947937,
"learning_rate": 4.310932006455276e-06,
"loss": 0.4368,
"step": 700
},
{
"epoch": 0.590398652442448,
"grad_norm": 0.4297826588153839,
"learning_rate": 4.296367517305548e-06,
"loss": 0.4613,
"step": 701
},
{
"epoch": 0.5912408759124088,
"grad_norm": 0.4199880361557007,
"learning_rate": 4.281809116550629e-06,
"loss": 0.4512,
"step": 702
},
{
"epoch": 0.5920830993823695,
"grad_norm": 0.3798048496246338,
"learning_rate": 4.267256930161523e-06,
"loss": 0.4399,
"step": 703
},
{
"epoch": 0.5929253228523301,
"grad_norm": 0.38816648721694946,
"learning_rate": 4.252711084055468e-06,
"loss": 0.4415,
"step": 704
},
{
"epoch": 0.5937675463222909,
"grad_norm": 0.38051608204841614,
"learning_rate": 4.238171704094833e-06,
"loss": 0.4622,
"step": 705
},
{
"epoch": 0.5946097697922516,
"grad_norm": 0.39357250928878784,
"learning_rate": 4.223638916086044e-06,
"loss": 0.4664,
"step": 706
},
{
"epoch": 0.5954519932622122,
"grad_norm": 0.4027978777885437,
"learning_rate": 4.209112845778481e-06,
"loss": 0.4643,
"step": 707
},
{
"epoch": 0.596294216732173,
"grad_norm": 0.38121509552001953,
"learning_rate": 4.194593618863404e-06,
"loss": 0.4472,
"step": 708
},
{
"epoch": 0.5971364402021336,
"grad_norm": 0.3984185755252838,
"learning_rate": 4.180081360972852e-06,
"loss": 0.4479,
"step": 709
},
{
"epoch": 0.5979786636720943,
"grad_norm": 0.4842238128185272,
"learning_rate": 4.165576197678571e-06,
"loss": 0.4685,
"step": 710
},
{
"epoch": 0.5988208871420551,
"grad_norm": 0.38819262385368347,
"learning_rate": 4.151078254490908e-06,
"loss": 0.4776,
"step": 711
},
{
"epoch": 0.5996631106120157,
"grad_norm": 0.4201744496822357,
"learning_rate": 4.136587656857744e-06,
"loss": 0.471,
"step": 712
},
{
"epoch": 0.6005053340819764,
"grad_norm": 0.4200340211391449,
"learning_rate": 4.122104530163397e-06,
"loss": 0.4309,
"step": 713
},
{
"epoch": 0.6013475575519371,
"grad_norm": 0.37184223532676697,
"learning_rate": 4.107628999727542e-06,
"loss": 0.462,
"step": 714
},
{
"epoch": 0.6021897810218978,
"grad_norm": 0.43214350938796997,
"learning_rate": 4.09316119080412e-06,
"loss": 0.4824,
"step": 715
},
{
"epoch": 0.6030320044918585,
"grad_norm": 0.3922446370124817,
"learning_rate": 4.0787012285802695e-06,
"loss": 0.4716,
"step": 716
},
{
"epoch": 0.6038742279618192,
"grad_norm": 0.4424769878387451,
"learning_rate": 4.064249238175223e-06,
"loss": 0.4681,
"step": 717
},
{
"epoch": 0.6047164514317799,
"grad_norm": 0.41791683435440063,
"learning_rate": 4.04980534463924e-06,
"loss": 0.4773,
"step": 718
},
{
"epoch": 0.6055586749017406,
"grad_norm": 0.3661481738090515,
"learning_rate": 4.035369672952516e-06,
"loss": 0.4553,
"step": 719
},
{
"epoch": 0.6064008983717013,
"grad_norm": 0.3772629201412201,
"learning_rate": 4.020942348024108e-06,
"loss": 0.432,
"step": 720
},
{
"epoch": 0.607243121841662,
"grad_norm": 0.4189452826976776,
"learning_rate": 4.0065234946908456e-06,
"loss": 0.472,
"step": 721
},
{
"epoch": 0.6080853453116227,
"grad_norm": 0.38295382261276245,
"learning_rate": 3.992113237716261e-06,
"loss": 0.487,
"step": 722
},
{
"epoch": 0.6089275687815834,
"grad_norm": 0.39993205666542053,
"learning_rate": 3.977711701789499e-06,
"loss": 0.4594,
"step": 723
},
{
"epoch": 0.609769792251544,
"grad_norm": 0.40375831723213196,
"learning_rate": 3.963319011524246e-06,
"loss": 0.4476,
"step": 724
},
{
"epoch": 0.6106120157215048,
"grad_norm": 0.39132022857666016,
"learning_rate": 3.948935291457645e-06,
"loss": 0.4686,
"step": 725
},
{
"epoch": 0.6114542391914655,
"grad_norm": 0.3846696615219116,
"learning_rate": 3.934560666049226e-06,
"loss": 0.4318,
"step": 726
},
{
"epoch": 0.6122964626614261,
"grad_norm": 0.3892068564891815,
"learning_rate": 3.920195259679822e-06,
"loss": 0.455,
"step": 727
},
{
"epoch": 0.6131386861313869,
"grad_norm": 0.4346502125263214,
"learning_rate": 3.905839196650494e-06,
"loss": 0.474,
"step": 728
},
{
"epoch": 0.6139809096013475,
"grad_norm": 0.38509148359298706,
"learning_rate": 3.891492601181462e-06,
"loss": 0.4617,
"step": 729
},
{
"epoch": 0.6148231330713082,
"grad_norm": 0.35971835255622864,
"learning_rate": 3.877155597411019e-06,
"loss": 0.4574,
"step": 730
},
{
"epoch": 0.615665356541269,
"grad_norm": 0.4372704029083252,
"learning_rate": 3.862828309394469e-06,
"loss": 0.4864,
"step": 731
},
{
"epoch": 0.6165075800112296,
"grad_norm": 0.41383740305900574,
"learning_rate": 3.8485108611030415e-06,
"loss": 0.4679,
"step": 732
},
{
"epoch": 0.6173498034811904,
"grad_norm": 0.3286128044128418,
"learning_rate": 3.834203376422831e-06,
"loss": 0.4487,
"step": 733
},
{
"epoch": 0.618192026951151,
"grad_norm": 0.4194988012313843,
"learning_rate": 3.8199059791537105e-06,
"loss": 0.4588,
"step": 734
},
{
"epoch": 0.6190342504211117,
"grad_norm": 0.4303875267505646,
"learning_rate": 3.805618793008279e-06,
"loss": 0.4594,
"step": 735
},
{
"epoch": 0.6198764738910725,
"grad_norm": 0.38063475489616394,
"learning_rate": 3.7913419416107692e-06,
"loss": 0.4497,
"step": 736
},
{
"epoch": 0.6207186973610331,
"grad_norm": 0.3899458348751068,
"learning_rate": 3.777075548496001e-06,
"loss": 0.4649,
"step": 737
},
{
"epoch": 0.6215609208309938,
"grad_norm": 0.39380112290382385,
"learning_rate": 3.7628197371082916e-06,
"loss": 0.4539,
"step": 738
},
{
"epoch": 0.6224031443009546,
"grad_norm": 0.4633565843105316,
"learning_rate": 3.7485746308004013e-06,
"loss": 0.4817,
"step": 739
},
{
"epoch": 0.6232453677709152,
"grad_norm": 0.37416234612464905,
"learning_rate": 3.7343403528324574e-06,
"loss": 0.4651,
"step": 740
},
{
"epoch": 0.6240875912408759,
"grad_norm": 0.4205109775066376,
"learning_rate": 3.7201170263709004e-06,
"loss": 0.4576,
"step": 741
},
{
"epoch": 0.6249298147108366,
"grad_norm": 0.3740626871585846,
"learning_rate": 3.705904774487396e-06,
"loss": 0.4523,
"step": 742
},
{
"epoch": 0.6257720381807973,
"grad_norm": 0.39562278985977173,
"learning_rate": 3.6917037201577977e-06,
"loss": 0.4616,
"step": 743
},
{
"epoch": 0.626614261650758,
"grad_norm": 0.4320240318775177,
"learning_rate": 3.6775139862610577e-06,
"loss": 0.478,
"step": 744
},
{
"epoch": 0.6274564851207187,
"grad_norm": 0.356246680021286,
"learning_rate": 3.6633356955781827e-06,
"loss": 0.4365,
"step": 745
},
{
"epoch": 0.6282987085906794,
"grad_norm": 0.4068841338157654,
"learning_rate": 3.649168970791157e-06,
"loss": 0.4614,
"step": 746
},
{
"epoch": 0.62914093206064,
"grad_norm": 0.4344610571861267,
"learning_rate": 3.635013934481895e-06,
"loss": 0.437,
"step": 747
},
{
"epoch": 0.6299831555306008,
"grad_norm": 0.3860117793083191,
"learning_rate": 3.620870709131163e-06,
"loss": 0.4724,
"step": 748
},
{
"epoch": 0.6308253790005615,
"grad_norm": 0.3780432343482971,
"learning_rate": 3.6067394171175397e-06,
"loss": 0.4866,
"step": 749
},
{
"epoch": 0.6316676024705222,
"grad_norm": 0.3598399758338928,
"learning_rate": 3.5926201807163384e-06,
"loss": 0.4655,
"step": 750
},
{
"epoch": 0.6325098259404829,
"grad_norm": 0.3982943594455719,
"learning_rate": 3.578513122098566e-06,
"loss": 0.4721,
"step": 751
},
{
"epoch": 0.6333520494104435,
"grad_norm": 0.4459211826324463,
"learning_rate": 3.564418363329848e-06,
"loss": 0.446,
"step": 752
},
{
"epoch": 0.6341942728804043,
"grad_norm": 0.45306849479675293,
"learning_rate": 3.5503360263693887e-06,
"loss": 0.4618,
"step": 753
},
{
"epoch": 0.635036496350365,
"grad_norm": 0.351254940032959,
"learning_rate": 3.5362662330689067e-06,
"loss": 0.4425,
"step": 754
},
{
"epoch": 0.6358787198203256,
"grad_norm": 0.3903196156024933,
"learning_rate": 3.5222091051715803e-06,
"loss": 0.4659,
"step": 755
},
{
"epoch": 0.6367209432902864,
"grad_norm": 0.3876386284828186,
"learning_rate": 3.5081647643110028e-06,
"loss": 0.4447,
"step": 756
},
{
"epoch": 0.637563166760247,
"grad_norm": 0.4539998173713684,
"learning_rate": 3.4941333320101173e-06,
"loss": 0.4648,
"step": 757
},
{
"epoch": 0.6384053902302077,
"grad_norm": 0.43911775946617126,
"learning_rate": 3.480114929680176e-06,
"loss": 0.4684,
"step": 758
},
{
"epoch": 0.6392476137001685,
"grad_norm": 0.36161547899246216,
"learning_rate": 3.466109678619681e-06,
"loss": 0.445,
"step": 759
},
{
"epoch": 0.6400898371701291,
"grad_norm": 0.4111180007457733,
"learning_rate": 3.4521177000133456e-06,
"loss": 0.4635,
"step": 760
},
{
"epoch": 0.6409320606400898,
"grad_norm": 0.42481526732444763,
"learning_rate": 3.4381391149310294e-06,
"loss": 0.4194,
"step": 761
},
{
"epoch": 0.6417742841100506,
"grad_norm": 0.44296056032180786,
"learning_rate": 3.4241740443267112e-06,
"loss": 0.4611,
"step": 762
},
{
"epoch": 0.6426165075800112,
"grad_norm": 0.3898600935935974,
"learning_rate": 3.4102226090374246e-06,
"loss": 0.4717,
"step": 763
},
{
"epoch": 0.643458731049972,
"grad_norm": 0.41485339403152466,
"learning_rate": 3.3962849297822225e-06,
"loss": 0.4246,
"step": 764
},
{
"epoch": 0.6443009545199326,
"grad_norm": 0.42214149236679077,
"learning_rate": 3.3823611271611266e-06,
"loss": 0.456,
"step": 765
},
{
"epoch": 0.6451431779898933,
"grad_norm": 0.3852960765361786,
"learning_rate": 3.368451321654091e-06,
"loss": 0.4626,
"step": 766
},
{
"epoch": 0.6459854014598541,
"grad_norm": 0.3872814178466797,
"learning_rate": 3.35455563361995e-06,
"loss": 0.4632,
"step": 767
},
{
"epoch": 0.6468276249298147,
"grad_norm": 0.41995540261268616,
"learning_rate": 3.3406741832953893e-06,
"loss": 0.4672,
"step": 768
},
{
"epoch": 0.6476698483997754,
"grad_norm": 0.43301689624786377,
"learning_rate": 3.3268070907938915e-06,
"loss": 0.4625,
"step": 769
},
{
"epoch": 0.6485120718697361,
"grad_norm": 0.3875824809074402,
"learning_rate": 3.3129544761047093e-06,
"loss": 0.4773,
"step": 770
},
{
"epoch": 0.6493542953396968,
"grad_norm": 0.40870511531829834,
"learning_rate": 3.2991164590918162e-06,
"loss": 0.4682,
"step": 771
},
{
"epoch": 0.6501965188096575,
"grad_norm": 0.43525877594947815,
"learning_rate": 3.2852931594928804e-06,
"loss": 0.4594,
"step": 772
},
{
"epoch": 0.6510387422796182,
"grad_norm": 0.40894654393196106,
"learning_rate": 3.271484696918218e-06,
"loss": 0.4695,
"step": 773
},
{
"epoch": 0.6518809657495789,
"grad_norm": 0.42762210965156555,
"learning_rate": 3.2576911908497695e-06,
"loss": 0.4597,
"step": 774
},
{
"epoch": 0.6527231892195395,
"grad_norm": 0.33561986684799194,
"learning_rate": 3.2439127606400546e-06,
"loss": 0.4486,
"step": 775
},
{
"epoch": 0.6535654126895003,
"grad_norm": 0.4465593695640564,
"learning_rate": 3.2301495255111426e-06,
"loss": 0.4937,
"step": 776
},
{
"epoch": 0.654407636159461,
"grad_norm": 0.3989386558532715,
"learning_rate": 3.2164016045536306e-06,
"loss": 0.4541,
"step": 777
},
{
"epoch": 0.6552498596294217,
"grad_norm": 0.38122543692588806,
"learning_rate": 3.202669116725598e-06,
"loss": 0.4426,
"step": 778
},
{
"epoch": 0.6560920830993824,
"grad_norm": 0.3579535484313965,
"learning_rate": 3.1889521808515888e-06,
"loss": 0.464,
"step": 779
},
{
"epoch": 0.656934306569343,
"grad_norm": 0.38302677869796753,
"learning_rate": 3.1752509156215738e-06,
"loss": 0.4418,
"step": 780
},
{
"epoch": 0.6577765300393038,
"grad_norm": 0.3925486207008362,
"learning_rate": 3.1615654395899377e-06,
"loss": 0.4621,
"step": 781
},
{
"epoch": 0.6586187535092645,
"grad_norm": 0.3953411877155304,
"learning_rate": 3.1478958711744324e-06,
"loss": 0.4503,
"step": 782
},
{
"epoch": 0.6594609769792251,
"grad_norm": 0.3958507478237152,
"learning_rate": 3.1342423286551756e-06,
"loss": 0.4718,
"step": 783
},
{
"epoch": 0.6603032004491859,
"grad_norm": 0.39234763383865356,
"learning_rate": 3.120604930173608e-06,
"loss": 0.4806,
"step": 784
},
{
"epoch": 0.6611454239191465,
"grad_norm": 0.3791189193725586,
"learning_rate": 3.1069837937314846e-06,
"loss": 0.4574,
"step": 785
},
{
"epoch": 0.6619876473891072,
"grad_norm": 0.36702293157577515,
"learning_rate": 3.093379037189842e-06,
"loss": 0.4556,
"step": 786
},
{
"epoch": 0.662829870859068,
"grad_norm": 0.3860447406768799,
"learning_rate": 3.0797907782679944e-06,
"loss": 0.439,
"step": 787
},
{
"epoch": 0.6636720943290286,
"grad_norm": 0.37849584221839905,
"learning_rate": 3.0662191345424925e-06,
"loss": 0.4869,
"step": 788
},
{
"epoch": 0.6645143177989893,
"grad_norm": 0.3850601017475128,
"learning_rate": 3.0526642234461313e-06,
"loss": 0.4673,
"step": 789
},
{
"epoch": 0.66535654126895,
"grad_norm": 0.3666015565395355,
"learning_rate": 3.039126162266912e-06,
"loss": 0.4341,
"step": 790
},
{
"epoch": 0.6661987647389107,
"grad_norm": 0.36709144711494446,
"learning_rate": 3.0256050681470446e-06,
"loss": 0.4873,
"step": 791
},
{
"epoch": 0.6670409882088714,
"grad_norm": 0.3814394474029541,
"learning_rate": 3.012101058081919e-06,
"loss": 0.4806,
"step": 792
},
{
"epoch": 0.6678832116788321,
"grad_norm": 0.3725821375846863,
"learning_rate": 2.9986142489191074e-06,
"loss": 0.4441,
"step": 793
},
{
"epoch": 0.6687254351487928,
"grad_norm": 0.3743121325969696,
"learning_rate": 2.9851447573573383e-06,
"loss": 0.4677,
"step": 794
},
{
"epoch": 0.6695676586187536,
"grad_norm": 0.3642611801624298,
"learning_rate": 2.971692699945502e-06,
"loss": 0.4593,
"step": 795
},
{
"epoch": 0.6704098820887142,
"grad_norm": 0.36425134539604187,
"learning_rate": 2.958258193081629e-06,
"loss": 0.4546,
"step": 796
},
{
"epoch": 0.6712521055586749,
"grad_norm": 0.36076676845550537,
"learning_rate": 2.9448413530118912e-06,
"loss": 0.4589,
"step": 797
},
{
"epoch": 0.6720943290286356,
"grad_norm": 0.3566359877586365,
"learning_rate": 2.9314422958295906e-06,
"loss": 0.4733,
"step": 798
},
{
"epoch": 0.6729365524985963,
"grad_norm": 0.3717946410179138,
"learning_rate": 2.9180611374741623e-06,
"loss": 0.4578,
"step": 799
},
{
"epoch": 0.673778775968557,
"grad_norm": 0.34884119033813477,
"learning_rate": 2.904697993730159e-06,
"loss": 0.439,
"step": 800
},
{
"epoch": 0.6746209994385177,
"grad_norm": 0.374682754278183,
"learning_rate": 2.891352980226262e-06,
"loss": 0.4692,
"step": 801
},
{
"epoch": 0.6754632229084784,
"grad_norm": 0.35721421241760254,
"learning_rate": 2.8780262124342755e-06,
"loss": 0.4848,
"step": 802
},
{
"epoch": 0.676305446378439,
"grad_norm": 0.41552138328552246,
"learning_rate": 2.8647178056681197e-06,
"loss": 0.4716,
"step": 803
},
{
"epoch": 0.6771476698483998,
"grad_norm": 0.37886112928390503,
"learning_rate": 2.8514278750828537e-06,
"loss": 0.4754,
"step": 804
},
{
"epoch": 0.6779898933183605,
"grad_norm": 0.3755894601345062,
"learning_rate": 2.838156535673652e-06,
"loss": 0.4457,
"step": 805
},
{
"epoch": 0.6788321167883211,
"grad_norm": 0.35193660855293274,
"learning_rate": 2.8249039022748315e-06,
"loss": 0.4909,
"step": 806
},
{
"epoch": 0.6796743402582819,
"grad_norm": 0.3655393421649933,
"learning_rate": 2.8116700895588473e-06,
"loss": 0.4767,
"step": 807
},
{
"epoch": 0.6805165637282425,
"grad_norm": 0.38788074254989624,
"learning_rate": 2.798455212035305e-06,
"loss": 0.4875,
"step": 808
},
{
"epoch": 0.6813587871982033,
"grad_norm": 0.3898300528526306,
"learning_rate": 2.785259384049959e-06,
"loss": 0.462,
"step": 809
},
{
"epoch": 0.682201010668164,
"grad_norm": 0.34134534001350403,
"learning_rate": 2.7720827197837475e-06,
"loss": 0.4641,
"step": 810
},
{
"epoch": 0.6830432341381246,
"grad_norm": 0.352770060300827,
"learning_rate": 2.7589253332517736e-06,
"loss": 0.4659,
"step": 811
},
{
"epoch": 0.6838854576080854,
"grad_norm": 0.37993523478507996,
"learning_rate": 2.745787338302341e-06,
"loss": 0.4567,
"step": 812
},
{
"epoch": 0.684727681078046,
"grad_norm": 0.35517552495002747,
"learning_rate": 2.7326688486159613e-06,
"loss": 0.4586,
"step": 813
},
{
"epoch": 0.6855699045480067,
"grad_norm": 0.3561233878135681,
"learning_rate": 2.7195699777043723e-06,
"loss": 0.4721,
"step": 814
},
{
"epoch": 0.6864121280179675,
"grad_norm": 0.4417415261268616,
"learning_rate": 2.706490838909547e-06,
"loss": 0.4788,
"step": 815
},
{
"epoch": 0.6872543514879281,
"grad_norm": 0.3459290564060211,
"learning_rate": 2.6934315454027323e-06,
"loss": 0.4818,
"step": 816
},
{
"epoch": 0.6880965749578888,
"grad_norm": 0.36064231395721436,
"learning_rate": 2.680392210183446e-06,
"loss": 0.4524,
"step": 817
},
{
"epoch": 0.6889387984278496,
"grad_norm": 0.3664780259132385,
"learning_rate": 2.6673729460785174e-06,
"loss": 0.4666,
"step": 818
},
{
"epoch": 0.6897810218978102,
"grad_norm": 0.40207329392433167,
"learning_rate": 2.6543738657411033e-06,
"loss": 0.4433,
"step": 819
},
{
"epoch": 0.6906232453677709,
"grad_norm": 0.3451634347438812,
"learning_rate": 2.6413950816497146e-06,
"loss": 0.4731,
"step": 820
},
{
"epoch": 0.6914654688377316,
"grad_norm": 0.3632340729236603,
"learning_rate": 2.628436706107238e-06,
"loss": 0.4773,
"step": 821
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.41608792543411255,
"learning_rate": 2.6154988512399784e-06,
"loss": 0.4705,
"step": 822
},
{
"epoch": 0.6931499157776531,
"grad_norm": 0.38861706852912903,
"learning_rate": 2.6025816289966703e-06,
"loss": 0.4725,
"step": 823
},
{
"epoch": 0.6939921392476137,
"grad_norm": 0.36379146575927734,
"learning_rate": 2.5896851511475184e-06,
"loss": 0.4898,
"step": 824
},
{
"epoch": 0.6948343627175744,
"grad_norm": 0.35487642884254456,
"learning_rate": 2.5768095292832412e-06,
"loss": 0.4552,
"step": 825
},
{
"epoch": 0.6956765861875351,
"grad_norm": 0.3770192861557007,
"learning_rate": 2.5639548748140803e-06,
"loss": 0.4327,
"step": 826
},
{
"epoch": 0.6965188096574958,
"grad_norm": 0.33418017625808716,
"learning_rate": 2.5511212989688587e-06,
"loss": 0.4585,
"step": 827
},
{
"epoch": 0.6973610331274565,
"grad_norm": 0.3845199942588806,
"learning_rate": 2.5383089127940087e-06,
"loss": 0.4692,
"step": 828
},
{
"epoch": 0.6982032565974172,
"grad_norm": 0.38967469334602356,
"learning_rate": 2.525517827152614e-06,
"loss": 0.4701,
"step": 829
},
{
"epoch": 0.6990454800673779,
"grad_norm": 0.3837958872318268,
"learning_rate": 2.5127481527234397e-06,
"loss": 0.4746,
"step": 830
},
{
"epoch": 0.6998877035373385,
"grad_norm": 0.3344170153141022,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.4391,
"step": 831
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.3698020279407501,
"learning_rate": 2.487273479289574e-06,
"loss": 0.4525,
"step": 832
},
{
"epoch": 0.70157215047726,
"grad_norm": 0.3206958472728729,
"learning_rate": 2.4745687007122636e-06,
"loss": 0.4596,
"step": 833
},
{
"epoch": 0.7024143739472206,
"grad_norm": 0.37502309679985046,
"learning_rate": 2.4618857742000463e-06,
"loss": 0.458,
"step": 834
},
{
"epoch": 0.7032565974171814,
"grad_norm": 0.3526128828525543,
"learning_rate": 2.449224809495815e-06,
"loss": 0.4851,
"step": 835
},
{
"epoch": 0.704098820887142,
"grad_norm": 0.35734719038009644,
"learning_rate": 2.436585916152426e-06,
"loss": 0.4281,
"step": 836
},
{
"epoch": 0.7049410443571027,
"grad_norm": 0.41668057441711426,
"learning_rate": 2.423969203531768e-06,
"loss": 0.4487,
"step": 837
},
{
"epoch": 0.7057832678270635,
"grad_norm": 0.3506368100643158,
"learning_rate": 2.411374780803793e-06,
"loss": 0.4606,
"step": 838
},
{
"epoch": 0.7066254912970241,
"grad_norm": 0.3638664484024048,
"learning_rate": 2.3988027569455895e-06,
"loss": 0.4605,
"step": 839
},
{
"epoch": 0.7074677147669849,
"grad_norm": 0.4103443920612335,
"learning_rate": 2.3862532407404306e-06,
"loss": 0.449,
"step": 840
},
{
"epoch": 0.7083099382369455,
"grad_norm": 0.3737192451953888,
"learning_rate": 2.373726340776837e-06,
"loss": 0.4451,
"step": 841
},
{
"epoch": 0.7091521617069062,
"grad_norm": 0.39184120297431946,
"learning_rate": 2.361222165447628e-06,
"loss": 0.4641,
"step": 842
},
{
"epoch": 0.709994385176867,
"grad_norm": 0.4118758738040924,
"learning_rate": 2.348740822949006e-06,
"loss": 0.4908,
"step": 843
},
{
"epoch": 0.7108366086468276,
"grad_norm": 0.35428386926651,
"learning_rate": 2.33628242127959e-06,
"loss": 0.4454,
"step": 844
},
{
"epoch": 0.7116788321167883,
"grad_norm": 0.4048777222633362,
"learning_rate": 2.323847068239504e-06,
"loss": 0.4765,
"step": 845
},
{
"epoch": 0.712521055586749,
"grad_norm": 0.40927958488464355,
"learning_rate": 2.3114348714294355e-06,
"loss": 0.4608,
"step": 846
},
{
"epoch": 0.7133632790567097,
"grad_norm": 0.39046961069107056,
"learning_rate": 2.2990459382497086e-06,
"loss": 0.4751,
"step": 847
},
{
"epoch": 0.7142055025266704,
"grad_norm": 0.37439659237861633,
"learning_rate": 2.2866803758993446e-06,
"loss": 0.4899,
"step": 848
},
{
"epoch": 0.7150477259966311,
"grad_norm": 0.3961264491081238,
"learning_rate": 2.274338291375147e-06,
"loss": 0.4674,
"step": 849
},
{
"epoch": 0.7158899494665918,
"grad_norm": 0.37867552042007446,
"learning_rate": 2.262019791470772e-06,
"loss": 0.4495,
"step": 850
},
{
"epoch": 0.7167321729365524,
"grad_norm": 0.3406490087509155,
"learning_rate": 2.2497249827757933e-06,
"loss": 0.4941,
"step": 851
},
{
"epoch": 0.7175743964065132,
"grad_norm": 0.3773308992385864,
"learning_rate": 2.2374539716748034e-06,
"loss": 0.4789,
"step": 852
},
{
"epoch": 0.7184166198764739,
"grad_norm": 0.3965294063091278,
"learning_rate": 2.225206864346465e-06,
"loss": 0.474,
"step": 853
},
{
"epoch": 0.7192588433464346,
"grad_norm": 0.3620738983154297,
"learning_rate": 2.2129837667626147e-06,
"loss": 0.436,
"step": 854
},
{
"epoch": 0.7201010668163953,
"grad_norm": 0.3788299560546875,
"learning_rate": 2.2007847846873342e-06,
"loss": 0.4686,
"step": 855
},
{
"epoch": 0.720943290286356,
"grad_norm": 0.3688678741455078,
"learning_rate": 2.188610023676041e-06,
"loss": 0.4698,
"step": 856
},
{
"epoch": 0.7217855137563167,
"grad_norm": 0.3527979552745819,
"learning_rate": 2.176459589074566e-06,
"loss": 0.4463,
"step": 857
},
{
"epoch": 0.7226277372262774,
"grad_norm": 0.35224097967147827,
"learning_rate": 2.164333586018259e-06,
"loss": 0.4563,
"step": 858
},
{
"epoch": 0.723469960696238,
"grad_norm": 0.36833032965660095,
"learning_rate": 2.1522321194310577e-06,
"loss": 0.4714,
"step": 859
},
{
"epoch": 0.7243121841661988,
"grad_norm": 0.3530896306037903,
"learning_rate": 2.1401552940245962e-06,
"loss": 0.4585,
"step": 860
},
{
"epoch": 0.7251544076361595,
"grad_norm": 0.37035128474235535,
"learning_rate": 2.1281032142972933e-06,
"loss": 0.4443,
"step": 861
},
{
"epoch": 0.7259966311061201,
"grad_norm": 0.3335658609867096,
"learning_rate": 2.1160759845334483e-06,
"loss": 0.4704,
"step": 862
},
{
"epoch": 0.7268388545760809,
"grad_norm": 0.3341200351715088,
"learning_rate": 2.1040737088023323e-06,
"loss": 0.4591,
"step": 863
},
{
"epoch": 0.7276810780460415,
"grad_norm": 0.33908799290657043,
"learning_rate": 2.0920964909573065e-06,
"loss": 0.4628,
"step": 864
},
{
"epoch": 0.7285233015160022,
"grad_norm": 0.35746559500694275,
"learning_rate": 2.080144434634898e-06,
"loss": 0.4501,
"step": 865
},
{
"epoch": 0.729365524985963,
"grad_norm": 0.3367277979850769,
"learning_rate": 2.068217643253925e-06,
"loss": 0.4557,
"step": 866
},
{
"epoch": 0.7302077484559236,
"grad_norm": 0.35350626707077026,
"learning_rate": 2.056316220014588e-06,
"loss": 0.4379,
"step": 867
},
{
"epoch": 0.7310499719258844,
"grad_norm": 0.36783963441848755,
"learning_rate": 2.0444402678975876e-06,
"loss": 0.4491,
"step": 868
},
{
"epoch": 0.731892195395845,
"grad_norm": 0.3690626621246338,
"learning_rate": 2.0325898896632178e-06,
"loss": 0.4369,
"step": 869
},
{
"epoch": 0.7327344188658057,
"grad_norm": 0.36176997423171997,
"learning_rate": 2.0207651878505e-06,
"loss": 0.4682,
"step": 870
},
{
"epoch": 0.7335766423357665,
"grad_norm": 0.3541899621486664,
"learning_rate": 2.0089662647762716e-06,
"loss": 0.4621,
"step": 871
},
{
"epoch": 0.7344188658057271,
"grad_norm": 0.3505668342113495,
"learning_rate": 1.997193222534316e-06,
"loss": 0.4271,
"step": 872
},
{
"epoch": 0.7352610892756878,
"grad_norm": 0.3986637592315674,
"learning_rate": 1.9854461629944764e-06,
"loss": 0.4557,
"step": 873
},
{
"epoch": 0.7361033127456486,
"grad_norm": 0.3776889443397522,
"learning_rate": 1.9737251878017678e-06,
"loss": 0.4389,
"step": 874
},
{
"epoch": 0.7369455362156092,
"grad_norm": 0.36136969923973083,
"learning_rate": 1.962030398375506e-06,
"loss": 0.4618,
"step": 875
},
{
"epoch": 0.7377877596855699,
"grad_norm": 0.35746991634368896,
"learning_rate": 1.950361895908427e-06,
"loss": 0.4429,
"step": 876
},
{
"epoch": 0.7386299831555306,
"grad_norm": 0.3493303060531616,
"learning_rate": 1.9387197813658092e-06,
"loss": 0.4768,
"step": 877
},
{
"epoch": 0.7394722066254913,
"grad_norm": 0.36661627888679504,
"learning_rate": 1.927104155484602e-06,
"loss": 0.4613,
"step": 878
},
{
"epoch": 0.740314430095452,
"grad_norm": 0.3595631718635559,
"learning_rate": 1.915515118772555e-06,
"loss": 0.4718,
"step": 879
},
{
"epoch": 0.7411566535654127,
"grad_norm": 0.3635704219341278,
"learning_rate": 1.9039527715073424e-06,
"loss": 0.4571,
"step": 880
},
{
"epoch": 0.7419988770353734,
"grad_norm": 0.32493817806243896,
"learning_rate": 1.8924172137357038e-06,
"loss": 0.4787,
"step": 881
},
{
"epoch": 0.742841100505334,
"grad_norm": 0.3330385684967041,
"learning_rate": 1.8809085452725744e-06,
"loss": 0.4564,
"step": 882
},
{
"epoch": 0.7436833239752948,
"grad_norm": 0.3349233567714691,
"learning_rate": 1.8694268657002197e-06,
"loss": 0.4492,
"step": 883
},
{
"epoch": 0.7445255474452555,
"grad_norm": 0.3418170213699341,
"learning_rate": 1.8579722743673773e-06,
"loss": 0.4697,
"step": 884
},
{
"epoch": 0.7453677709152162,
"grad_norm": 0.4015568792819977,
"learning_rate": 1.8465448703883959e-06,
"loss": 0.4776,
"step": 885
},
{
"epoch": 0.7462099943851769,
"grad_norm": 0.34949299693107605,
"learning_rate": 1.8351447526423728e-06,
"loss": 0.4805,
"step": 886
},
{
"epoch": 0.7470522178551375,
"grad_norm": 0.33957937359809875,
"learning_rate": 1.8237720197723075e-06,
"loss": 0.4721,
"step": 887
},
{
"epoch": 0.7478944413250983,
"grad_norm": 0.38134124875068665,
"learning_rate": 1.812426770184243e-06,
"loss": 0.4715,
"step": 888
},
{
"epoch": 0.748736664795059,
"grad_norm": 0.3836059272289276,
"learning_rate": 1.8011091020464138e-06,
"loss": 0.5041,
"step": 889
},
{
"epoch": 0.7495788882650196,
"grad_norm": 0.34067878127098083,
"learning_rate": 1.789819113288397e-06,
"loss": 0.445,
"step": 890
},
{
"epoch": 0.7504211117349804,
"grad_norm": 0.3494390845298767,
"learning_rate": 1.7785569016002686e-06,
"loss": 0.4438,
"step": 891
},
{
"epoch": 0.751263335204941,
"grad_norm": 0.3596593141555786,
"learning_rate": 1.7673225644317487e-06,
"loss": 0.4784,
"step": 892
},
{
"epoch": 0.7521055586749017,
"grad_norm": 0.35766345262527466,
"learning_rate": 1.75611619899137e-06,
"loss": 0.4532,
"step": 893
},
{
"epoch": 0.7529477821448625,
"grad_norm": 0.35504239797592163,
"learning_rate": 1.7449379022456297e-06,
"loss": 0.4469,
"step": 894
},
{
"epoch": 0.7537900056148231,
"grad_norm": 0.3405526280403137,
"learning_rate": 1.7337877709181527e-06,
"loss": 0.4492,
"step": 895
},
{
"epoch": 0.7546322290847838,
"grad_norm": 0.3638141453266144,
"learning_rate": 1.7226659014888548e-06,
"loss": 0.4509,
"step": 896
},
{
"epoch": 0.7554744525547445,
"grad_norm": 0.36837533116340637,
"learning_rate": 1.711572390193102e-06,
"loss": 0.4557,
"step": 897
},
{
"epoch": 0.7563166760247052,
"grad_norm": 0.37345463037490845,
"learning_rate": 1.7005073330208881e-06,
"loss": 0.4515,
"step": 898
},
{
"epoch": 0.757158899494666,
"grad_norm": 0.34140193462371826,
"learning_rate": 1.689470825715998e-06,
"loss": 0.4511,
"step": 899
},
{
"epoch": 0.7580011229646266,
"grad_norm": 0.33038443326950073,
"learning_rate": 1.6784629637751814e-06,
"loss": 0.4395,
"step": 900
},
{
"epoch": 0.7588433464345873,
"grad_norm": 0.35559070110321045,
"learning_rate": 1.6674838424473172e-06,
"loss": 0.4892,
"step": 901
},
{
"epoch": 0.759685569904548,
"grad_norm": 0.34186041355133057,
"learning_rate": 1.6565335567326112e-06,
"loss": 0.4472,
"step": 902
},
{
"epoch": 0.7605277933745087,
"grad_norm": 0.36697492003440857,
"learning_rate": 1.6456122013817477e-06,
"loss": 0.4597,
"step": 903
},
{
"epoch": 0.7613700168444694,
"grad_norm": 0.3907342553138733,
"learning_rate": 1.6347198708950884e-06,
"loss": 0.4845,
"step": 904
},
{
"epoch": 0.7622122403144301,
"grad_norm": 0.3752238154411316,
"learning_rate": 1.6238566595218475e-06,
"loss": 0.4525,
"step": 905
},
{
"epoch": 0.7630544637843908,
"grad_norm": 0.41171255707740784,
"learning_rate": 1.6130226612592787e-06,
"loss": 0.4556,
"step": 906
},
{
"epoch": 0.7638966872543514,
"grad_norm": 0.3502821922302246,
"learning_rate": 1.6022179698518525e-06,
"loss": 0.4688,
"step": 907
},
{
"epoch": 0.7647389107243122,
"grad_norm": 0.3488776683807373,
"learning_rate": 1.591442678790467e-06,
"loss": 0.4798,
"step": 908
},
{
"epoch": 0.7655811341942729,
"grad_norm": 0.36690619587898254,
"learning_rate": 1.580696881311611e-06,
"loss": 0.4869,
"step": 909
},
{
"epoch": 0.7664233576642335,
"grad_norm": 0.3431231677532196,
"learning_rate": 1.5699806703965787e-06,
"loss": 0.4285,
"step": 910
},
{
"epoch": 0.7672655811341943,
"grad_norm": 0.35018178820610046,
"learning_rate": 1.5592941387706562e-06,
"loss": 0.4608,
"step": 911
},
{
"epoch": 0.768107804604155,
"grad_norm": 0.36093589663505554,
"learning_rate": 1.5486373789023206e-06,
"loss": 0.4785,
"step": 912
},
{
"epoch": 0.7689500280741156,
"grad_norm": 0.3167992830276489,
"learning_rate": 1.538010483002435e-06,
"loss": 0.445,
"step": 913
},
{
"epoch": 0.7697922515440764,
"grad_norm": 0.37118929624557495,
"learning_rate": 1.5274135430234654e-06,
"loss": 0.4591,
"step": 914
},
{
"epoch": 0.770634475014037,
"grad_norm": 0.3630264699459076,
"learning_rate": 1.5168466506586654e-06,
"loss": 0.4588,
"step": 915
},
{
"epoch": 0.7714766984839978,
"grad_norm": 0.33517614006996155,
"learning_rate": 1.506309897341297e-06,
"loss": 0.4644,
"step": 916
},
{
"epoch": 0.7723189219539585,
"grad_norm": 0.35454773902893066,
"learning_rate": 1.4958033742438348e-06,
"loss": 0.4808,
"step": 917
},
{
"epoch": 0.7731611454239191,
"grad_norm": 0.3116423487663269,
"learning_rate": 1.4853271722771772e-06,
"loss": 0.4403,
"step": 918
},
{
"epoch": 0.7740033688938799,
"grad_norm": 0.3462409973144531,
"learning_rate": 1.4748813820898554e-06,
"loss": 0.434,
"step": 919
},
{
"epoch": 0.7748455923638405,
"grad_norm": 0.3144718110561371,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.4408,
"step": 920
},
{
"epoch": 0.7756878158338012,
"grad_norm": 0.3506813049316406,
"learning_rate": 1.454081398330855e-06,
"loss": 0.4548,
"step": 921
},
{
"epoch": 0.776530039303762,
"grad_norm": 0.3580954074859619,
"learning_rate": 1.4437273847373778e-06,
"loss": 0.4638,
"step": 922
},
{
"epoch": 0.7773722627737226,
"grad_norm": 0.3401355445384979,
"learning_rate": 1.4334041428781003e-06,
"loss": 0.4827,
"step": 923
},
{
"epoch": 0.7782144862436833,
"grad_norm": 0.34176456928253174,
"learning_rate": 1.4231117620780188e-06,
"loss": 0.5025,
"step": 924
},
{
"epoch": 0.779056709713644,
"grad_norm": 0.3647969663143158,
"learning_rate": 1.4128503313951008e-06,
"loss": 0.4278,
"step": 925
},
{
"epoch": 0.7798989331836047,
"grad_norm": 0.3605908751487732,
"learning_rate": 1.4026199396195078e-06,
"loss": 0.4816,
"step": 926
},
{
"epoch": 0.7807411566535654,
"grad_norm": 0.33657947182655334,
"learning_rate": 1.3924206752728282e-06,
"loss": 0.4488,
"step": 927
},
{
"epoch": 0.7815833801235261,
"grad_norm": 0.3494366407394409,
"learning_rate": 1.3822526266073044e-06,
"loss": 0.446,
"step": 928
},
{
"epoch": 0.7824256035934868,
"grad_norm": 0.4006534516811371,
"learning_rate": 1.3721158816050872e-06,
"loss": 0.4588,
"step": 929
},
{
"epoch": 0.7832678270634476,
"grad_norm": 0.36393067240715027,
"learning_rate": 1.3620105279774532e-06,
"loss": 0.4539,
"step": 930
},
{
"epoch": 0.7841100505334082,
"grad_norm": 0.369555801153183,
"learning_rate": 1.3519366531640589e-06,
"loss": 0.4803,
"step": 931
},
{
"epoch": 0.7849522740033689,
"grad_norm": 0.31321248412132263,
"learning_rate": 1.3418943443321807e-06,
"loss": 0.475,
"step": 932
},
{
"epoch": 0.7857944974733296,
"grad_norm": 0.3947639763355255,
"learning_rate": 1.3318836883759634e-06,
"loss": 0.4564,
"step": 933
},
{
"epoch": 0.7866367209432903,
"grad_norm": 0.3348684012889862,
"learning_rate": 1.3219047719156575e-06,
"loss": 0.4617,
"step": 934
},
{
"epoch": 0.787478944413251,
"grad_norm": 0.35532504320144653,
"learning_rate": 1.3119576812968893e-06,
"loss": 0.477,
"step": 935
},
{
"epoch": 0.7883211678832117,
"grad_norm": 0.35622647404670715,
"learning_rate": 1.3020425025898926e-06,
"loss": 0.4599,
"step": 936
},
{
"epoch": 0.7891633913531724,
"grad_norm": 0.3481491208076477,
"learning_rate": 1.292159321588778e-06,
"loss": 0.4724,
"step": 937
},
{
"epoch": 0.790005614823133,
"grad_norm": 0.377271443605423,
"learning_rate": 1.282308223810786e-06,
"loss": 0.4607,
"step": 938
},
{
"epoch": 0.7908478382930938,
"grad_norm": 0.35245975852012634,
"learning_rate": 1.272489294495548e-06,
"loss": 0.4545,
"step": 939
},
{
"epoch": 0.7916900617630545,
"grad_norm": 0.3466741144657135,
"learning_rate": 1.2627026186043423e-06,
"loss": 0.434,
"step": 940
},
{
"epoch": 0.7925322852330151,
"grad_norm": 0.3157729506492615,
"learning_rate": 1.252948280819375e-06,
"loss": 0.4306,
"step": 941
},
{
"epoch": 0.7933745087029759,
"grad_norm": 0.339692085981369,
"learning_rate": 1.243226365543026e-06,
"loss": 0.4624,
"step": 942
},
{
"epoch": 0.7942167321729365,
"grad_norm": 0.3418128192424774,
"learning_rate": 1.2335369568971362e-06,
"loss": 0.4604,
"step": 943
},
{
"epoch": 0.7950589556428973,
"grad_norm": 0.33322200179100037,
"learning_rate": 1.2238801387222716e-06,
"loss": 0.4623,
"step": 944
},
{
"epoch": 0.795901179112858,
"grad_norm": 0.3198584020137787,
"learning_rate": 1.2142559945769995e-06,
"loss": 0.4588,
"step": 945
},
{
"epoch": 0.7967434025828186,
"grad_norm": 0.3580982983112335,
"learning_rate": 1.2046646077371615e-06,
"loss": 0.4891,
"step": 946
},
{
"epoch": 0.7975856260527794,
"grad_norm": 0.2945074141025543,
"learning_rate": 1.1951060611951615e-06,
"loss": 0.4566,
"step": 947
},
{
"epoch": 0.79842784952274,
"grad_norm": 0.3214232921600342,
"learning_rate": 1.185580437659241e-06,
"loss": 0.4424,
"step": 948
},
{
"epoch": 0.7992700729927007,
"grad_norm": 0.3181054890155792,
"learning_rate": 1.1760878195527642e-06,
"loss": 0.4495,
"step": 949
},
{
"epoch": 0.8001122964626615,
"grad_norm": 0.3124549686908722,
"learning_rate": 1.1666282890135083e-06,
"loss": 0.4596,
"step": 950
},
{
"epoch": 0.8009545199326221,
"grad_norm": 0.3557204008102417,
"learning_rate": 1.1572019278929457e-06,
"loss": 0.4542,
"step": 951
},
{
"epoch": 0.8017967434025828,
"grad_norm": 0.3370462954044342,
"learning_rate": 1.147808817755544e-06,
"loss": 0.4626,
"step": 952
},
{
"epoch": 0.8026389668725435,
"grad_norm": 0.3505118489265442,
"learning_rate": 1.1384490398780563e-06,
"loss": 0.463,
"step": 953
},
{
"epoch": 0.8034811903425042,
"grad_norm": 0.36364004015922546,
"learning_rate": 1.129122675248816e-06,
"loss": 0.4577,
"step": 954
},
{
"epoch": 0.8043234138124649,
"grad_norm": 0.3689129650592804,
"learning_rate": 1.1198298045670402e-06,
"loss": 0.4615,
"step": 955
},
{
"epoch": 0.8051656372824256,
"grad_norm": 0.37028342485427856,
"learning_rate": 1.1105705082421303e-06,
"loss": 0.4859,
"step": 956
},
{
"epoch": 0.8060078607523863,
"grad_norm": 0.3648272752761841,
"learning_rate": 1.1013448663929704e-06,
"loss": 0.4671,
"step": 957
},
{
"epoch": 0.8068500842223469,
"grad_norm": 0.345866322517395,
"learning_rate": 1.0921529588472446e-06,
"loss": 0.4561,
"step": 958
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.3294440507888794,
"learning_rate": 1.0829948651407374e-06,
"loss": 0.4804,
"step": 959
},
{
"epoch": 0.8085345311622684,
"grad_norm": 0.3212808668613434,
"learning_rate": 1.0738706645166508e-06,
"loss": 0.4342,
"step": 960
},
{
"epoch": 0.8093767546322291,
"grad_norm": 0.33868685364723206,
"learning_rate": 1.0647804359249143e-06,
"loss": 0.4843,
"step": 961
},
{
"epoch": 0.8102189781021898,
"grad_norm": 0.36153316497802734,
"learning_rate": 1.0557242580215066e-06,
"loss": 0.4651,
"step": 962
},
{
"epoch": 0.8110612015721504,
"grad_norm": 0.33889979124069214,
"learning_rate": 1.0467022091677692e-06,
"loss": 0.4553,
"step": 963
},
{
"epoch": 0.8119034250421112,
"grad_norm": 0.34202301502227783,
"learning_rate": 1.037714367429734e-06,
"loss": 0.506,
"step": 964
},
{
"epoch": 0.8127456485120719,
"grad_norm": 0.3445115089416504,
"learning_rate": 1.0287608105774456e-06,
"loss": 0.4386,
"step": 965
},
{
"epoch": 0.8135878719820325,
"grad_norm": 0.34236183762550354,
"learning_rate": 1.019841616084286e-06,
"loss": 0.4441,
"step": 966
},
{
"epoch": 0.8144300954519933,
"grad_norm": 0.3376852869987488,
"learning_rate": 1.0109568611263094e-06,
"loss": 0.4643,
"step": 967
},
{
"epoch": 0.815272318921954,
"grad_norm": 0.33317050337791443,
"learning_rate": 1.002106622581569e-06,
"loss": 0.4628,
"step": 968
},
{
"epoch": 0.8161145423919146,
"grad_norm": 0.3268939256668091,
"learning_rate": 9.932909770294542e-07,
"loss": 0.4513,
"step": 969
},
{
"epoch": 0.8169567658618754,
"grad_norm": 0.32083117961883545,
"learning_rate": 9.845100007500292e-07,
"loss": 0.4724,
"step": 970
},
{
"epoch": 0.817798989331836,
"grad_norm": 0.36022859811782837,
"learning_rate": 9.757637697233723e-07,
"loss": 0.4802,
"step": 971
},
{
"epoch": 0.8186412128017967,
"grad_norm": 0.34767282009124756,
"learning_rate": 9.670523596289138e-07,
"loss": 0.4534,
"step": 972
},
{
"epoch": 0.8194834362717575,
"grad_norm": 0.3325275778770447,
"learning_rate": 9.58375845844793e-07,
"loss": 0.4681,
"step": 973
},
{
"epoch": 0.8203256597417181,
"grad_norm": 0.3268345296382904,
"learning_rate": 9.497343034471896e-07,
"loss": 0.4752,
"step": 974
},
{
"epoch": 0.8211678832116789,
"grad_norm": 0.31763315200805664,
"learning_rate": 9.41127807209688e-07,
"loss": 0.4296,
"step": 975
},
{
"epoch": 0.8220101066816395,
"grad_norm": 0.35879212617874146,
"learning_rate": 9.325564316026236e-07,
"loss": 0.4732,
"step": 976
},
{
"epoch": 0.8228523301516002,
"grad_norm": 0.3344736695289612,
"learning_rate": 9.240202507924412e-07,
"loss": 0.456,
"step": 977
},
{
"epoch": 0.823694553621561,
"grad_norm": 0.3530421555042267,
"learning_rate": 9.155193386410466e-07,
"loss": 0.4732,
"step": 978
},
{
"epoch": 0.8245367770915216,
"grad_norm": 0.32757529616355896,
"learning_rate": 9.070537687051817e-07,
"loss": 0.4557,
"step": 979
},
{
"epoch": 0.8253790005614823,
"grad_norm": 0.31608912348747253,
"learning_rate": 8.986236142357707e-07,
"loss": 0.4505,
"step": 980
},
{
"epoch": 0.826221224031443,
"grad_norm": 0.32184287905693054,
"learning_rate": 8.902289481772996e-07,
"loss": 0.4261,
"step": 981
},
{
"epoch": 0.8270634475014037,
"grad_norm": 0.36576586961746216,
"learning_rate": 8.818698431671774e-07,
"loss": 0.4738,
"step": 982
},
{
"epoch": 0.8279056709713644,
"grad_norm": 0.34923386573791504,
"learning_rate": 8.735463715351139e-07,
"loss": 0.4699,
"step": 983
},
{
"epoch": 0.8287478944413251,
"grad_norm": 0.3303389549255371,
"learning_rate": 8.652586053024836e-07,
"loss": 0.447,
"step": 984
},
{
"epoch": 0.8295901179112858,
"grad_norm": 0.3435465097427368,
"learning_rate": 8.570066161817176e-07,
"loss": 0.429,
"step": 985
},
{
"epoch": 0.8304323413812464,
"grad_norm": 0.30803781747817993,
"learning_rate": 8.487904755756676e-07,
"loss": 0.4628,
"step": 986
},
{
"epoch": 0.8312745648512072,
"grad_norm": 0.3358916640281677,
"learning_rate": 8.406102545769989e-07,
"loss": 0.4662,
"step": 987
},
{
"epoch": 0.8321167883211679,
"grad_norm": 0.3121514618396759,
"learning_rate": 8.324660239675697e-07,
"loss": 0.4482,
"step": 988
},
{
"epoch": 0.8329590117911286,
"grad_norm": 0.33737143874168396,
"learning_rate": 8.243578542178227e-07,
"loss": 0.4405,
"step": 989
},
{
"epoch": 0.8338012352610893,
"grad_norm": 0.34829044342041016,
"learning_rate": 8.16285815486168e-07,
"loss": 0.4804,
"step": 990
},
{
"epoch": 0.83464345873105,
"grad_norm": 0.352693110704422,
"learning_rate": 8.082499776183883e-07,
"loss": 0.4785,
"step": 991
},
{
"epoch": 0.8354856822010107,
"grad_norm": 0.31321579217910767,
"learning_rate": 8.002504101470204e-07,
"loss": 0.4538,
"step": 992
},
{
"epoch": 0.8363279056709714,
"grad_norm": 0.2995845079421997,
"learning_rate": 7.922871822907641e-07,
"loss": 0.4467,
"step": 993
},
{
"epoch": 0.837170129140932,
"grad_norm": 0.32853782176971436,
"learning_rate": 7.843603629538804e-07,
"loss": 0.488,
"step": 994
},
{
"epoch": 0.8380123526108928,
"grad_norm": 0.31039342284202576,
"learning_rate": 7.764700207255904e-07,
"loss": 0.4746,
"step": 995
},
{
"epoch": 0.8388545760808535,
"grad_norm": 0.3386421501636505,
"learning_rate": 7.686162238794898e-07,
"loss": 0.4655,
"step": 996
},
{
"epoch": 0.8396967995508141,
"grad_norm": 0.34342360496520996,
"learning_rate": 7.607990403729526e-07,
"loss": 0.4375,
"step": 997
},
{
"epoch": 0.8405390230207749,
"grad_norm": 0.3342914581298828,
"learning_rate": 7.530185378465459e-07,
"loss": 0.4417,
"step": 998
},
{
"epoch": 0.8413812464907355,
"grad_norm": 0.3670804798603058,
"learning_rate": 7.452747836234392e-07,
"loss": 0.472,
"step": 999
},
{
"epoch": 0.8422234699606962,
"grad_norm": 0.32072174549102783,
"learning_rate": 7.375678447088347e-07,
"loss": 0.4538,
"step": 1000
},
{
"epoch": 0.843065693430657,
"grad_norm": 0.3503890037536621,
"learning_rate": 7.298977877893688e-07,
"loss": 0.4648,
"step": 1001
},
{
"epoch": 0.8439079169006176,
"grad_norm": 0.3428994417190552,
"learning_rate": 7.222646792325516e-07,
"loss": 0.4639,
"step": 1002
},
{
"epoch": 0.8447501403705783,
"grad_norm": 0.3595855236053467,
"learning_rate": 7.146685850861851e-07,
"loss": 0.4595,
"step": 1003
},
{
"epoch": 0.845592363840539,
"grad_norm": 0.34215492010116577,
"learning_rate": 7.071095710777925e-07,
"loss": 0.4493,
"step": 1004
},
{
"epoch": 0.8464345873104997,
"grad_norm": 0.3216904401779175,
"learning_rate": 6.995877026140468e-07,
"loss": 0.4516,
"step": 1005
},
{
"epoch": 0.8472768107804605,
"grad_norm": 0.33294597268104553,
"learning_rate": 6.921030447802146e-07,
"loss": 0.4595,
"step": 1006
},
{
"epoch": 0.8481190342504211,
"grad_norm": 0.3170475959777832,
"learning_rate": 6.846556623395795e-07,
"loss": 0.4438,
"step": 1007
},
{
"epoch": 0.8489612577203818,
"grad_norm": 0.3171912133693695,
"learning_rate": 6.772456197328919e-07,
"loss": 0.4724,
"step": 1008
},
{
"epoch": 0.8498034811903425,
"grad_norm": 0.34563779830932617,
"learning_rate": 6.698729810778065e-07,
"loss": 0.4691,
"step": 1009
},
{
"epoch": 0.8506457046603032,
"grad_norm": 0.32657885551452637,
"learning_rate": 6.625378101683317e-07,
"loss": 0.4664,
"step": 1010
},
{
"epoch": 0.8514879281302639,
"grad_norm": 0.35269269347190857,
"learning_rate": 6.552401704742678e-07,
"loss": 0.4788,
"step": 1011
},
{
"epoch": 0.8523301516002246,
"grad_norm": 0.3336371183395386,
"learning_rate": 6.479801251406748e-07,
"loss": 0.4678,
"step": 1012
},
{
"epoch": 0.8531723750701853,
"grad_norm": 0.3277773857116699,
"learning_rate": 6.40757736987307e-07,
"loss": 0.4671,
"step": 1013
},
{
"epoch": 0.8540145985401459,
"grad_norm": 0.3333572447299957,
"learning_rate": 6.335730685080838e-07,
"loss": 0.4708,
"step": 1014
},
{
"epoch": 0.8548568220101067,
"grad_norm": 0.3374318778514862,
"learning_rate": 6.26426181870542e-07,
"loss": 0.4384,
"step": 1015
},
{
"epoch": 0.8556990454800674,
"grad_norm": 0.3308733105659485,
"learning_rate": 6.193171389152996e-07,
"loss": 0.4482,
"step": 1016
},
{
"epoch": 0.856541268950028,
"grad_norm": 0.347648561000824,
"learning_rate": 6.122460011555187e-07,
"loss": 0.4679,
"step": 1017
},
{
"epoch": 0.8573834924199888,
"grad_norm": 0.3401923179626465,
"learning_rate": 6.052128297763804e-07,
"loss": 0.5031,
"step": 1018
},
{
"epoch": 0.8582257158899494,
"grad_norm": 0.329550176858902,
"learning_rate": 5.982176856345445e-07,
"loss": 0.4705,
"step": 1019
},
{
"epoch": 0.8590679393599102,
"grad_norm": 0.3228277862071991,
"learning_rate": 5.912606292576284e-07,
"loss": 0.4551,
"step": 1020
},
{
"epoch": 0.8599101628298709,
"grad_norm": 0.3513161242008209,
"learning_rate": 5.843417208436908e-07,
"loss": 0.4995,
"step": 1021
},
{
"epoch": 0.8607523862998315,
"grad_norm": 0.33879998326301575,
"learning_rate": 5.774610202606939e-07,
"loss": 0.4468,
"step": 1022
},
{
"epoch": 0.8615946097697923,
"grad_norm": 0.3087005019187927,
"learning_rate": 5.706185870460018e-07,
"loss": 0.4404,
"step": 1023
},
{
"epoch": 0.862436833239753,
"grad_norm": 0.334729939699173,
"learning_rate": 5.63814480405856e-07,
"loss": 0.4747,
"step": 1024
},
{
"epoch": 0.8632790567097136,
"grad_norm": 0.3216525614261627,
"learning_rate": 5.570487592148666e-07,
"loss": 0.4475,
"step": 1025
},
{
"epoch": 0.8641212801796744,
"grad_norm": 0.3208960294723511,
"learning_rate": 5.503214820154979e-07,
"loss": 0.462,
"step": 1026
},
{
"epoch": 0.864963503649635,
"grad_norm": 0.35795074701309204,
"learning_rate": 5.436327070175729e-07,
"loss": 0.4573,
"step": 1027
},
{
"epoch": 0.8658057271195957,
"grad_norm": 0.33961227536201477,
"learning_rate": 5.369824920977567e-07,
"loss": 0.4597,
"step": 1028
},
{
"epoch": 0.8666479505895565,
"grad_norm": 0.32470422983169556,
"learning_rate": 5.303708947990638e-07,
"loss": 0.4706,
"step": 1029
},
{
"epoch": 0.8674901740595171,
"grad_norm": 0.31693166494369507,
"learning_rate": 5.237979723303582e-07,
"loss": 0.4491,
"step": 1030
},
{
"epoch": 0.8683323975294778,
"grad_norm": 0.33701324462890625,
"learning_rate": 5.172637815658583e-07,
"loss": 0.471,
"step": 1031
},
{
"epoch": 0.8691746209994385,
"grad_norm": 0.3477896451950073,
"learning_rate": 5.107683790446411e-07,
"loss": 0.465,
"step": 1032
},
{
"epoch": 0.8700168444693992,
"grad_norm": 0.3382064700126648,
"learning_rate": 5.04311820970163e-07,
"loss": 0.5033,
"step": 1033
},
{
"epoch": 0.87085906793936,
"grad_norm": 0.33173805475234985,
"learning_rate": 4.978941632097612e-07,
"loss": 0.4809,
"step": 1034
},
{
"epoch": 0.8717012914093206,
"grad_norm": 0.3257426917552948,
"learning_rate": 4.915154612941781e-07,
"loss": 0.4539,
"step": 1035
},
{
"epoch": 0.8725435148792813,
"grad_norm": 0.3181188404560089,
"learning_rate": 4.851757704170796e-07,
"loss": 0.4636,
"step": 1036
},
{
"epoch": 0.873385738349242,
"grad_norm": 0.3330731689929962,
"learning_rate": 4.788751454345763e-07,
"loss": 0.4539,
"step": 1037
},
{
"epoch": 0.8742279618192027,
"grad_norm": 0.3220534324645996,
"learning_rate": 4.726136408647464e-07,
"loss": 0.4509,
"step": 1038
},
{
"epoch": 0.8750701852891634,
"grad_norm": 0.30984482169151306,
"learning_rate": 4.663913108871726e-07,
"loss": 0.432,
"step": 1039
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.3303835988044739,
"learning_rate": 4.60208209342462e-07,
"loss": 0.45,
"step": 1040
},
{
"epoch": 0.8767546322290848,
"grad_norm": 0.3490801453590393,
"learning_rate": 4.540643897317887e-07,
"loss": 0.4558,
"step": 1041
},
{
"epoch": 0.8775968556990454,
"grad_norm": 0.31086137890815735,
"learning_rate": 4.4795990521642684e-07,
"loss": 0.4454,
"step": 1042
},
{
"epoch": 0.8784390791690062,
"grad_norm": 0.34392303228378296,
"learning_rate": 4.4189480861729137e-07,
"loss": 0.4594,
"step": 1043
},
{
"epoch": 0.8792813026389669,
"grad_norm": 0.3247262239456177,
"learning_rate": 4.35869152414482e-07,
"loss": 0.4592,
"step": 1044
},
{
"epoch": 0.8801235261089275,
"grad_norm": 0.34201639890670776,
"learning_rate": 4.2988298874682754e-07,
"loss": 0.441,
"step": 1045
},
{
"epoch": 0.8809657495788883,
"grad_norm": 0.35518065094947815,
"learning_rate": 4.239363694114368e-07,
"loss": 0.4735,
"step": 1046
},
{
"epoch": 0.881807973048849,
"grad_norm": 0.3133198916912079,
"learning_rate": 4.1802934586324897e-07,
"loss": 0.4543,
"step": 1047
},
{
"epoch": 0.8826501965188096,
"grad_norm": 0.3672572076320648,
"learning_rate": 4.1216196921458786e-07,
"loss": 0.4466,
"step": 1048
},
{
"epoch": 0.8834924199887704,
"grad_norm": 0.3202572464942932,
"learning_rate": 4.0633429023472004e-07,
"loss": 0.4643,
"step": 1049
},
{
"epoch": 0.884334643458731,
"grad_norm": 0.33787569403648376,
"learning_rate": 4.0054635934941633e-07,
"loss": 0.4664,
"step": 1050
},
{
"epoch": 0.8851768669286918,
"grad_norm": 0.3472073972225189,
"learning_rate": 3.947982266405159e-07,
"loss": 0.4589,
"step": 1051
},
{
"epoch": 0.8860190903986525,
"grad_norm": 0.3364206850528717,
"learning_rate": 3.890899418454913e-07,
"loss": 0.451,
"step": 1052
},
{
"epoch": 0.8868613138686131,
"grad_norm": 0.3336820602416992,
"learning_rate": 3.834215543570191e-07,
"loss": 0.4541,
"step": 1053
},
{
"epoch": 0.8877035373385739,
"grad_norm": 0.35087835788726807,
"learning_rate": 3.777931132225526e-07,
"loss": 0.4952,
"step": 1054
},
{
"epoch": 0.8885457608085345,
"grad_norm": 0.36997100710868835,
"learning_rate": 3.72204667143895e-07,
"loss": 0.4624,
"step": 1055
},
{
"epoch": 0.8893879842784952,
"grad_norm": 0.34807565808296204,
"learning_rate": 3.666562644767824e-07,
"loss": 0.48,
"step": 1056
},
{
"epoch": 0.890230207748456,
"grad_norm": 0.300436407327652,
"learning_rate": 3.611479532304618e-07,
"loss": 0.4361,
"step": 1057
},
{
"epoch": 0.8910724312184166,
"grad_norm": 0.3128056824207306,
"learning_rate": 3.556797810672785e-07,
"loss": 0.4511,
"step": 1058
},
{
"epoch": 0.8919146546883773,
"grad_norm": 0.3215906322002411,
"learning_rate": 3.5025179530225995e-07,
"loss": 0.4541,
"step": 1059
},
{
"epoch": 0.892756878158338,
"grad_norm": 0.3172469139099121,
"learning_rate": 3.4486404290271115e-07,
"loss": 0.4803,
"step": 1060
},
{
"epoch": 0.8935991016282987,
"grad_norm": 0.31866055727005005,
"learning_rate": 3.395165704878023e-07,
"loss": 0.4688,
"step": 1061
},
{
"epoch": 0.8944413250982594,
"grad_norm": 0.3102911710739136,
"learning_rate": 3.3420942432817127e-07,
"loss": 0.46,
"step": 1062
},
{
"epoch": 0.8952835485682201,
"grad_norm": 0.2940250039100647,
"learning_rate": 3.289426503455201e-07,
"loss": 0.4482,
"step": 1063
},
{
"epoch": 0.8961257720381808,
"grad_norm": 0.34147220849990845,
"learning_rate": 3.237162941122185e-07,
"loss": 0.4358,
"step": 1064
},
{
"epoch": 0.8969679955081415,
"grad_norm": 0.32565441727638245,
"learning_rate": 3.185304008509077e-07,
"loss": 0.4543,
"step": 1065
},
{
"epoch": 0.8978102189781022,
"grad_norm": 0.34025701880455017,
"learning_rate": 3.133850154341139e-07,
"loss": 0.4523,
"step": 1066
},
{
"epoch": 0.8986524424480629,
"grad_norm": 0.3196007311344147,
"learning_rate": 3.082801823838527e-07,
"loss": 0.43,
"step": 1067
},
{
"epoch": 0.8994946659180236,
"grad_norm": 0.3036392331123352,
"learning_rate": 3.0321594587125083e-07,
"loss": 0.4414,
"step": 1068
},
{
"epoch": 0.9003368893879843,
"grad_norm": 0.32302045822143555,
"learning_rate": 2.9819234971616154e-07,
"loss": 0.4655,
"step": 1069
},
{
"epoch": 0.9011791128579449,
"grad_norm": 0.3299964368343353,
"learning_rate": 2.932094373867811e-07,
"loss": 0.4653,
"step": 1070
},
{
"epoch": 0.9020213363279057,
"grad_norm": 0.32102474570274353,
"learning_rate": 2.882672519992824e-07,
"loss": 0.434,
"step": 1071
},
{
"epoch": 0.9028635597978664,
"grad_norm": 0.3391880393028259,
"learning_rate": 2.833658363174302e-07,
"loss": 0.4677,
"step": 1072
},
{
"epoch": 0.903705783267827,
"grad_norm": 0.318908154964447,
"learning_rate": 2.785052327522214e-07,
"loss": 0.4661,
"step": 1073
},
{
"epoch": 0.9045480067377878,
"grad_norm": 0.37129920721054077,
"learning_rate": 2.73685483361511e-07,
"loss": 0.4942,
"step": 1074
},
{
"epoch": 0.9053902302077484,
"grad_norm": 0.3275892734527588,
"learning_rate": 2.6890662984965234e-07,
"loss": 0.4769,
"step": 1075
},
{
"epoch": 0.9062324536777091,
"grad_norm": 0.32207468152046204,
"learning_rate": 2.6416871356713224e-07,
"loss": 0.459,
"step": 1076
},
{
"epoch": 0.9070746771476699,
"grad_norm": 0.32156863808631897,
"learning_rate": 2.594717755102205e-07,
"loss": 0.4553,
"step": 1077
},
{
"epoch": 0.9079169006176305,
"grad_norm": 0.3304842710494995,
"learning_rate": 2.548158563206038e-07,
"loss": 0.4779,
"step": 1078
},
{
"epoch": 0.9087591240875912,
"grad_norm": 0.3306860029697418,
"learning_rate": 2.5020099628504603e-07,
"loss": 0.4543,
"step": 1079
},
{
"epoch": 0.909601347557552,
"grad_norm": 0.3527275621891022,
"learning_rate": 2.4562723533503084e-07,
"loss": 0.4604,
"step": 1080
},
{
"epoch": 0.9104435710275126,
"grad_norm": 0.33252066373825073,
"learning_rate": 2.4109461304642254e-07,
"loss": 0.4467,
"step": 1081
},
{
"epoch": 0.9112857944974734,
"grad_norm": 0.3353213667869568,
"learning_rate": 2.3660316863911682e-07,
"loss": 0.4306,
"step": 1082
},
{
"epoch": 0.912128017967434,
"grad_norm": 0.33303454518318176,
"learning_rate": 2.3215294097670927e-07,
"loss": 0.456,
"step": 1083
},
{
"epoch": 0.9129702414373947,
"grad_norm": 0.3394293487071991,
"learning_rate": 2.277439685661509e-07,
"loss": 0.4491,
"step": 1084
},
{
"epoch": 0.9138124649073555,
"grad_norm": 0.30794280767440796,
"learning_rate": 2.2337628955742263e-07,
"loss": 0.4618,
"step": 1085
},
{
"epoch": 0.9146546883773161,
"grad_norm": 0.31820741295814514,
"learning_rate": 2.1904994174319903e-07,
"loss": 0.4513,
"step": 1086
},
{
"epoch": 0.9154969118472768,
"grad_norm": 0.3417550325393677,
"learning_rate": 2.1476496255852685e-07,
"loss": 0.4437,
"step": 1087
},
{
"epoch": 0.9163391353172375,
"grad_norm": 0.3287929892539978,
"learning_rate": 2.1052138908049303e-07,
"loss": 0.4683,
"step": 1088
},
{
"epoch": 0.9171813587871982,
"grad_norm": 0.3297256529331207,
"learning_rate": 2.0631925802791608e-07,
"loss": 0.4547,
"step": 1089
},
{
"epoch": 0.9180235822571589,
"grad_norm": 0.33473873138427734,
"learning_rate": 2.0215860576101532e-07,
"loss": 0.4455,
"step": 1090
},
{
"epoch": 0.9188658057271196,
"grad_norm": 0.334439754486084,
"learning_rate": 1.9803946828110376e-07,
"loss": 0.4639,
"step": 1091
},
{
"epoch": 0.9197080291970803,
"grad_norm": 0.3432200849056244,
"learning_rate": 1.9396188123027736e-07,
"loss": 0.4752,
"step": 1092
},
{
"epoch": 0.9205502526670409,
"grad_norm": 0.3343367278575897,
"learning_rate": 1.8992587989110133e-07,
"loss": 0.4781,
"step": 1093
},
{
"epoch": 0.9213924761370017,
"grad_norm": 0.33942049741744995,
"learning_rate": 1.8593149918630927e-07,
"loss": 0.4673,
"step": 1094
},
{
"epoch": 0.9222346996069624,
"grad_norm": 0.32365378737449646,
"learning_rate": 1.8197877367849948e-07,
"loss": 0.482,
"step": 1095
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.34894439578056335,
"learning_rate": 1.7806773756983641e-07,
"loss": 0.4658,
"step": 1096
},
{
"epoch": 0.9239191465468838,
"grad_norm": 0.35942527651786804,
"learning_rate": 1.7419842470175196e-07,
"loss": 0.4569,
"step": 1097
},
{
"epoch": 0.9247613700168444,
"grad_norm": 0.31009694933891296,
"learning_rate": 1.7037086855465902e-07,
"loss": 0.4564,
"step": 1098
},
{
"epoch": 0.9256035934868052,
"grad_norm": 0.30984339118003845,
"learning_rate": 1.6658510224765333e-07,
"loss": 0.4567,
"step": 1099
},
{
"epoch": 0.9264458169567659,
"grad_norm": 0.3273439109325409,
"learning_rate": 1.6284115853823445e-07,
"loss": 0.454,
"step": 1100
},
{
"epoch": 0.9272880404267265,
"grad_norm": 0.3297070562839508,
"learning_rate": 1.5913906982201744e-07,
"loss": 0.4438,
"step": 1101
},
{
"epoch": 0.9281302638966873,
"grad_norm": 0.31525179743766785,
"learning_rate": 1.554788681324554e-07,
"loss": 0.4877,
"step": 1102
},
{
"epoch": 0.928972487366648,
"grad_norm": 0.3105153739452362,
"learning_rate": 1.5186058514055912e-07,
"loss": 0.4317,
"step": 1103
},
{
"epoch": 0.9298147108366086,
"grad_norm": 0.3106979429721832,
"learning_rate": 1.482842521546285e-07,
"loss": 0.4627,
"step": 1104
},
{
"epoch": 0.9306569343065694,
"grad_norm": 0.31040892004966736,
"learning_rate": 1.447499001199748e-07,
"loss": 0.4383,
"step": 1105
},
{
"epoch": 0.93149915777653,
"grad_norm": 0.33874398469924927,
"learning_rate": 1.4125755961865827e-07,
"loss": 0.4794,
"step": 1106
},
{
"epoch": 0.9323413812464907,
"grad_norm": 0.3188782036304474,
"learning_rate": 1.3780726086922103e-07,
"loss": 0.4523,
"step": 1107
},
{
"epoch": 0.9331836047164515,
"grad_norm": 0.326948344707489,
"learning_rate": 1.3439903372642615e-07,
"loss": 0.4595,
"step": 1108
},
{
"epoch": 0.9340258281864121,
"grad_norm": 0.3299093544483185,
"learning_rate": 1.3103290768099796e-07,
"loss": 0.4418,
"step": 1109
},
{
"epoch": 0.9348680516563729,
"grad_norm": 0.3170269727706909,
"learning_rate": 1.2770891185937106e-07,
"loss": 0.4831,
"step": 1110
},
{
"epoch": 0.9357102751263335,
"grad_norm": 0.3100402057170868,
"learning_rate": 1.244270750234333e-07,
"loss": 0.4914,
"step": 1111
},
{
"epoch": 0.9365524985962942,
"grad_norm": 0.31475040316581726,
"learning_rate": 1.2118742557027885e-07,
"loss": 0.4592,
"step": 1112
},
{
"epoch": 0.937394722066255,
"grad_norm": 0.32388702034950256,
"learning_rate": 1.1798999153196433e-07,
"loss": 0.4579,
"step": 1113
},
{
"epoch": 0.9382369455362156,
"grad_norm": 0.31572088599205017,
"learning_rate": 1.1483480057526364e-07,
"loss": 0.4435,
"step": 1114
},
{
"epoch": 0.9390791690061763,
"grad_norm": 0.3283679187297821,
"learning_rate": 1.1172188000142803e-07,
"loss": 0.4483,
"step": 1115
},
{
"epoch": 0.939921392476137,
"grad_norm": 0.32103320956230164,
"learning_rate": 1.0865125674595467e-07,
"loss": 0.4839,
"step": 1116
},
{
"epoch": 0.9407636159460977,
"grad_norm": 0.3113073706626892,
"learning_rate": 1.0562295737834738e-07,
"loss": 0.4756,
"step": 1117
},
{
"epoch": 0.9416058394160584,
"grad_norm": 0.3419308066368103,
"learning_rate": 1.026370081018907e-07,
"loss": 0.4271,
"step": 1118
},
{
"epoch": 0.9424480628860191,
"grad_norm": 0.4538518488407135,
"learning_rate": 9.969343475342285e-08,
"loss": 0.4584,
"step": 1119
},
{
"epoch": 0.9432902863559798,
"grad_norm": 0.35381177067756653,
"learning_rate": 9.679226280310982e-08,
"loss": 0.477,
"step": 1120
},
{
"epoch": 0.9441325098259404,
"grad_norm": 0.3468037545681,
"learning_rate": 9.393351735422773e-08,
"loss": 0.4517,
"step": 1121
},
{
"epoch": 0.9449747332959012,
"grad_norm": 0.3164037764072418,
"learning_rate": 9.111722314294358e-08,
"loss": 0.446,
"step": 1122
},
{
"epoch": 0.9458169567658619,
"grad_norm": 0.33864447474479675,
"learning_rate": 8.834340453810375e-08,
"loss": 0.481,
"step": 1123
},
{
"epoch": 0.9466591802358225,
"grad_norm": 0.31847962737083435,
"learning_rate": 8.561208554101863e-08,
"loss": 0.452,
"step": 1124
},
{
"epoch": 0.9475014037057833,
"grad_norm": 0.3508765995502472,
"learning_rate": 8.29232897852611e-08,
"loss": 0.4696,
"step": 1125
},
{
"epoch": 0.9483436271757439,
"grad_norm": 0.3419753909111023,
"learning_rate": 8.027704053645613e-08,
"loss": 0.4534,
"step": 1126
},
{
"epoch": 0.9491858506457047,
"grad_norm": 0.3147391080856323,
"learning_rate": 7.76733606920832e-08,
"loss": 0.4466,
"step": 1127
},
{
"epoch": 0.9500280741156654,
"grad_norm": 0.3117488920688629,
"learning_rate": 7.511227278127697e-08,
"loss": 0.4598,
"step": 1128
},
{
"epoch": 0.950870297585626,
"grad_norm": 0.30068475008010864,
"learning_rate": 7.259379896463248e-08,
"loss": 0.462,
"step": 1129
},
{
"epoch": 0.9517125210555868,
"grad_norm": 0.32078614830970764,
"learning_rate": 7.011796103401192e-08,
"loss": 0.4382,
"step": 1130
},
{
"epoch": 0.9525547445255474,
"grad_norm": 0.31206777691841125,
"learning_rate": 6.768478041236037e-08,
"loss": 0.4833,
"step": 1131
},
{
"epoch": 0.9533969679955081,
"grad_norm": 0.31248289346694946,
"learning_rate": 6.529427815351374e-08,
"loss": 0.4607,
"step": 1132
},
{
"epoch": 0.9542391914654689,
"grad_norm": 0.3496483862400055,
"learning_rate": 6.294647494202444e-08,
"loss": 0.4231,
"step": 1133
},
{
"epoch": 0.9550814149354295,
"grad_norm": 0.32398563623428345,
"learning_rate": 6.064139109297485e-08,
"loss": 0.4466,
"step": 1134
},
{
"epoch": 0.9559236384053902,
"grad_norm": 0.3160340189933777,
"learning_rate": 5.8379046551807486e-08,
"loss": 0.443,
"step": 1135
},
{
"epoch": 0.956765861875351,
"grad_norm": 0.3245152235031128,
"learning_rate": 5.615946089414737e-08,
"loss": 0.4684,
"step": 1136
},
{
"epoch": 0.9576080853453116,
"grad_norm": 0.3228430151939392,
"learning_rate": 5.398265332563935e-08,
"loss": 0.4714,
"step": 1137
},
{
"epoch": 0.9584503088152723,
"grad_norm": 0.3297078609466553,
"learning_rate": 5.1848642681773254e-08,
"loss": 0.4613,
"step": 1138
},
{
"epoch": 0.959292532285233,
"grad_norm": 0.32669320702552795,
"learning_rate": 4.975744742772848e-08,
"loss": 0.4832,
"step": 1139
},
{
"epoch": 0.9601347557551937,
"grad_norm": 0.3655106723308563,
"learning_rate": 4.770908565820964e-08,
"loss": 0.4831,
"step": 1140
},
{
"epoch": 0.9609769792251545,
"grad_norm": 0.3126697242259979,
"learning_rate": 4.5703575097292286e-08,
"loss": 0.4384,
"step": 1141
},
{
"epoch": 0.9618192026951151,
"grad_norm": 0.33409810066223145,
"learning_rate": 4.37409330982691e-08,
"loss": 0.4351,
"step": 1142
},
{
"epoch": 0.9626614261650758,
"grad_norm": 0.3374941945075989,
"learning_rate": 4.182117664349783e-08,
"loss": 0.424,
"step": 1143
},
{
"epoch": 0.9635036496350365,
"grad_norm": 0.36208537220954895,
"learning_rate": 3.99443223442586e-08,
"loss": 0.4723,
"step": 1144
},
{
"epoch": 0.9643458731049972,
"grad_norm": 0.2885582447052002,
"learning_rate": 3.8110386440605164e-08,
"loss": 0.4464,
"step": 1145
},
{
"epoch": 0.9651880965749579,
"grad_norm": 0.30136173963546753,
"learning_rate": 3.631938480122777e-08,
"loss": 0.4362,
"step": 1146
},
{
"epoch": 0.9660303200449186,
"grad_norm": 0.3135235905647278,
"learning_rate": 3.457133292331494e-08,
"loss": 0.4462,
"step": 1147
},
{
"epoch": 0.9668725435148793,
"grad_norm": 0.3099575936794281,
"learning_rate": 3.2866245932418606e-08,
"loss": 0.4595,
"step": 1148
},
{
"epoch": 0.9677147669848399,
"grad_norm": 0.30907002091407776,
"learning_rate": 3.120413858232474e-08,
"loss": 0.4586,
"step": 1149
},
{
"epoch": 0.9685569904548007,
"grad_norm": 0.3242543935775757,
"learning_rate": 2.9585025254924572e-08,
"loss": 0.4581,
"step": 1150
},
{
"epoch": 0.9693992139247614,
"grad_norm": 0.33935239911079407,
"learning_rate": 2.8008919960090253e-08,
"loss": 0.4586,
"step": 1151
},
{
"epoch": 0.970241437394722,
"grad_norm": 0.30846089124679565,
"learning_rate": 2.6475836335553838e-08,
"loss": 0.4396,
"step": 1152
},
{
"epoch": 0.9710836608646828,
"grad_norm": 0.30442187190055847,
"learning_rate": 2.4985787646788497e-08,
"loss": 0.4384,
"step": 1153
},
{
"epoch": 0.9719258843346434,
"grad_norm": 0.31546372175216675,
"learning_rate": 2.3538786786896918e-08,
"loss": 0.4443,
"step": 1154
},
{
"epoch": 0.9727681078046042,
"grad_norm": 0.32305002212524414,
"learning_rate": 2.2134846276494205e-08,
"loss": 0.4267,
"step": 1155
},
{
"epoch": 0.9736103312745649,
"grad_norm": 0.3233368694782257,
"learning_rate": 2.0773978263605164e-08,
"loss": 0.4707,
"step": 1156
},
{
"epoch": 0.9744525547445255,
"grad_norm": 0.31967225670814514,
"learning_rate": 1.9456194523554404e-08,
"loss": 0.4829,
"step": 1157
},
{
"epoch": 0.9752947782144863,
"grad_norm": 0.31925785541534424,
"learning_rate": 1.8181506458869735e-08,
"loss": 0.4679,
"step": 1158
},
{
"epoch": 0.976137001684447,
"grad_norm": 0.3099507689476013,
"learning_rate": 1.69499250991767e-08,
"loss": 0.4498,
"step": 1159
},
{
"epoch": 0.9769792251544076,
"grad_norm": 0.32314586639404297,
"learning_rate": 1.576146110111032e-08,
"loss": 0.4782,
"step": 1160
},
{
"epoch": 0.9778214486243684,
"grad_norm": 0.3091767728328705,
"learning_rate": 1.4616124748217387e-08,
"loss": 0.4562,
"step": 1161
},
{
"epoch": 0.978663672094329,
"grad_norm": 0.3342806398868561,
"learning_rate": 1.351392595087042e-08,
"loss": 0.4515,
"step": 1162
},
{
"epoch": 0.9795058955642897,
"grad_norm": 0.3417477607727051,
"learning_rate": 1.2454874246181081e-08,
"loss": 0.4769,
"step": 1163
},
{
"epoch": 0.9803481190342505,
"grad_norm": 0.32453441619873047,
"learning_rate": 1.1438978797916888e-08,
"loss": 0.4575,
"step": 1164
},
{
"epoch": 0.9811903425042111,
"grad_norm": 0.3606019914150238,
"learning_rate": 1.0466248396424072e-08,
"loss": 0.4627,
"step": 1165
},
{
"epoch": 0.9820325659741718,
"grad_norm": 0.3059011697769165,
"learning_rate": 9.536691458548741e-09,
"loss": 0.4475,
"step": 1166
},
{
"epoch": 0.9828747894441325,
"grad_norm": 0.3114994466304779,
"learning_rate": 8.650316027566386e-09,
"loss": 0.454,
"step": 1167
},
{
"epoch": 0.9837170129140932,
"grad_norm": 0.3212180733680725,
"learning_rate": 7.807129773110822e-09,
"loss": 0.4512,
"step": 1168
},
{
"epoch": 0.9845592363840538,
"grad_norm": 0.3352208733558655,
"learning_rate": 7.007139991108136e-09,
"loss": 0.4453,
"step": 1169
},
{
"epoch": 0.9854014598540146,
"grad_norm": 0.33941376209259033,
"learning_rate": 6.25035360371451e-09,
"loss": 0.4793,
"step": 1170
},
{
"epoch": 0.9862436833239753,
"grad_norm": 0.3476077914237976,
"learning_rate": 5.536777159254603e-09,
"loss": 0.4851,
"step": 1171
},
{
"epoch": 0.987085906793936,
"grad_norm": 0.33196163177490234,
"learning_rate": 4.866416832167153e-09,
"loss": 0.4625,
"step": 1172
},
{
"epoch": 0.9879281302638967,
"grad_norm": 0.32519593834877014,
"learning_rate": 4.239278422948911e-09,
"loss": 0.4776,
"step": 1173
},
{
"epoch": 0.9887703537338574,
"grad_norm": 0.3022322356700897,
"learning_rate": 3.655367358106343e-09,
"loss": 0.4776,
"step": 1174
},
{
"epoch": 0.9896125772038181,
"grad_norm": 0.3089068830013275,
"learning_rate": 3.1146886901090024e-09,
"loss": 0.448,
"step": 1175
},
{
"epoch": 0.9904548006737788,
"grad_norm": 0.31966155767440796,
"learning_rate": 2.617247097342901e-09,
"loss": 0.4386,
"step": 1176
},
{
"epoch": 0.9912970241437394,
"grad_norm": 0.3162902593612671,
"learning_rate": 2.1630468840738716e-09,
"loss": 0.4569,
"step": 1177
},
{
"epoch": 0.9921392476137002,
"grad_norm": 0.3310737907886505,
"learning_rate": 1.7520919804075997e-09,
"loss": 0.4602,
"step": 1178
},
{
"epoch": 0.9929814710836609,
"grad_norm": 0.33026689291000366,
"learning_rate": 1.3843859422574269e-09,
"loss": 0.4783,
"step": 1179
},
{
"epoch": 0.9938236945536215,
"grad_norm": 0.3080196976661682,
"learning_rate": 1.0599319513115992e-09,
"loss": 0.4715,
"step": 1180
},
{
"epoch": 0.9946659180235823,
"grad_norm": 0.31929126381874084,
"learning_rate": 7.787328150071771e-10,
"loss": 0.4488,
"step": 1181
},
{
"epoch": 0.9955081414935429,
"grad_norm": 0.3194161057472229,
"learning_rate": 5.40790966505611e-10,
"loss": 0.43,
"step": 1182
},
{
"epoch": 0.9963503649635036,
"grad_norm": 0.3364320695400238,
"learning_rate": 3.4610846467109106e-10,
"loss": 0.4737,
"step": 1183
},
{
"epoch": 0.9971925884334644,
"grad_norm": 0.317007452249527,
"learning_rate": 1.9468699405444936e-10,
"loss": 0.4806,
"step": 1184
},
{
"epoch": 0.998034811903425,
"grad_norm": 0.3290872871875763,
"learning_rate": 8.652786487484133e-11,
"loss": 0.4443,
"step": 1185
},
{
"epoch": 0.9988770353733858,
"grad_norm": 0.3284390866756439,
"learning_rate": 2.1632013013084265e-11,
"loss": 0.4694,
"step": 1186
},
{
"epoch": 0.9997192588433464,
"grad_norm": 0.3138267397880554,
"learning_rate": 0.0,
"loss": 0.4357,
"step": 1187
},
{
"epoch": 0.9997192588433464,
"step": 1187,
"total_flos": 1539275766759424.0,
"train_loss": 0.48665387864462345,
"train_runtime": 40217.8604,
"train_samples_per_second": 2.833,
"train_steps_per_second": 0.03
}
],
"logging_steps": 1.0,
"max_steps": 1187,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1539275766759424.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}