{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.990512333965844,
"eval_steps": 500,
"global_step": 789,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003795066413662239,
"grad_norm": 40.676379099115614,
"learning_rate": 0.0,
"loss": 11.9416,
"step": 1
},
{
"epoch": 0.007590132827324478,
"grad_norm": 42.322879433334954,
"learning_rate": 6.329113924050633e-07,
"loss": 11.9608,
"step": 2
},
{
"epoch": 0.011385199240986717,
"grad_norm": 42.07696167602527,
"learning_rate": 1.2658227848101265e-06,
"loss": 11.945,
"step": 3
},
{
"epoch": 0.015180265654648957,
"grad_norm": 38.788099000092245,
"learning_rate": 1.8987341772151901e-06,
"loss": 12.0041,
"step": 4
},
{
"epoch": 0.018975332068311195,
"grad_norm": 42.96987271855079,
"learning_rate": 2.531645569620253e-06,
"loss": 11.8344,
"step": 5
},
{
"epoch": 0.022770398481973434,
"grad_norm": 47.40729251364656,
"learning_rate": 3.1645569620253167e-06,
"loss": 11.539,
"step": 6
},
{
"epoch": 0.026565464895635674,
"grad_norm": 60.56847420882898,
"learning_rate": 3.7974683544303802e-06,
"loss": 10.7422,
"step": 7
},
{
"epoch": 0.030360531309297913,
"grad_norm": 72.9198099998618,
"learning_rate": 4.430379746835443e-06,
"loss": 10.2752,
"step": 8
},
{
"epoch": 0.03415559772296015,
"grad_norm": 53.002409298416595,
"learning_rate": 5.063291139240506e-06,
"loss": 6.558,
"step": 9
},
{
"epoch": 0.03795066413662239,
"grad_norm": 45.04149623657093,
"learning_rate": 5.69620253164557e-06,
"loss": 6.4024,
"step": 10
},
{
"epoch": 0.04174573055028463,
"grad_norm": 38.13238837210333,
"learning_rate": 6.329113924050633e-06,
"loss": 5.6875,
"step": 11
},
{
"epoch": 0.04554079696394687,
"grad_norm": 11.302613549595163,
"learning_rate": 6.9620253164556965e-06,
"loss": 3.4274,
"step": 12
},
{
"epoch": 0.04933586337760911,
"grad_norm": 7.72949331282132,
"learning_rate": 7.5949367088607605e-06,
"loss": 3.1596,
"step": 13
},
{
"epoch": 0.05313092979127135,
"grad_norm": 6.335189446265455,
"learning_rate": 8.227848101265822e-06,
"loss": 3.1567,
"step": 14
},
{
"epoch": 0.056925996204933584,
"grad_norm": 5.204280145656099,
"learning_rate": 8.860759493670886e-06,
"loss": 3.0093,
"step": 15
},
{
"epoch": 0.06072106261859583,
"grad_norm": 5.327369399133293,
"learning_rate": 9.49367088607595e-06,
"loss": 2.8687,
"step": 16
},
{
"epoch": 0.06451612903225806,
"grad_norm": 3.5902058385378806,
"learning_rate": 1.0126582278481012e-05,
"loss": 2.734,
"step": 17
},
{
"epoch": 0.0683111954459203,
"grad_norm": 2.9005082993372295,
"learning_rate": 1.0759493670886076e-05,
"loss": 2.7081,
"step": 18
},
{
"epoch": 0.07210626185958255,
"grad_norm": 1.974688758447809,
"learning_rate": 1.139240506329114e-05,
"loss": 2.4974,
"step": 19
},
{
"epoch": 0.07590132827324478,
"grad_norm": 1.7518384946545134,
"learning_rate": 1.2025316455696203e-05,
"loss": 2.4681,
"step": 20
},
{
"epoch": 0.07969639468690702,
"grad_norm": 1.7541148172357248,
"learning_rate": 1.2658227848101267e-05,
"loss": 2.4543,
"step": 21
},
{
"epoch": 0.08349146110056926,
"grad_norm": 1.1723792012584537,
"learning_rate": 1.3291139240506329e-05,
"loss": 2.4052,
"step": 22
},
{
"epoch": 0.0872865275142315,
"grad_norm": 1.4804034373191297,
"learning_rate": 1.3924050632911393e-05,
"loss": 2.2488,
"step": 23
},
{
"epoch": 0.09108159392789374,
"grad_norm": 1.1889737176936943,
"learning_rate": 1.4556962025316457e-05,
"loss": 2.2785,
"step": 24
},
{
"epoch": 0.09487666034155598,
"grad_norm": 0.9204209529441236,
"learning_rate": 1.5189873417721521e-05,
"loss": 2.1683,
"step": 25
},
{
"epoch": 0.09867172675521822,
"grad_norm": 2.0703118707538612,
"learning_rate": 1.5822784810126583e-05,
"loss": 2.2589,
"step": 26
},
{
"epoch": 0.10246679316888045,
"grad_norm": 1.647865271118571,
"learning_rate": 1.6455696202531644e-05,
"loss": 2.0461,
"step": 27
},
{
"epoch": 0.1062618595825427,
"grad_norm": 1.0500404961114602,
"learning_rate": 1.7088607594936708e-05,
"loss": 2.1666,
"step": 28
},
{
"epoch": 0.11005692599620494,
"grad_norm": 1.0076071725849705,
"learning_rate": 1.7721518987341772e-05,
"loss": 2.0978,
"step": 29
},
{
"epoch": 0.11385199240986717,
"grad_norm": 0.7701867978420391,
"learning_rate": 1.8354430379746836e-05,
"loss": 2.065,
"step": 30
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.6109351104153533,
"learning_rate": 1.89873417721519e-05,
"loss": 2.1119,
"step": 31
},
{
"epoch": 0.12144212523719165,
"grad_norm": 1.0413487669830812,
"learning_rate": 1.962025316455696e-05,
"loss": 2.0572,
"step": 32
},
{
"epoch": 0.1252371916508539,
"grad_norm": 0.6427805199550526,
"learning_rate": 2.0253164556962025e-05,
"loss": 2.0372,
"step": 33
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.9510255187453951,
"learning_rate": 2.088607594936709e-05,
"loss": 1.9751,
"step": 34
},
{
"epoch": 0.13282732447817835,
"grad_norm": 1.284633561009481,
"learning_rate": 2.1518987341772153e-05,
"loss": 1.9798,
"step": 35
},
{
"epoch": 0.1366223908918406,
"grad_norm": 0.9327064040358906,
"learning_rate": 2.2151898734177217e-05,
"loss": 1.8782,
"step": 36
},
{
"epoch": 0.14041745730550284,
"grad_norm": 1.2045615961015605,
"learning_rate": 2.278481012658228e-05,
"loss": 1.9079,
"step": 37
},
{
"epoch": 0.1442125237191651,
"grad_norm": 0.6537247427033752,
"learning_rate": 2.341772151898734e-05,
"loss": 1.9007,
"step": 38
},
{
"epoch": 0.14800759013282733,
"grad_norm": 0.6502571827261109,
"learning_rate": 2.4050632911392405e-05,
"loss": 1.8699,
"step": 39
},
{
"epoch": 0.15180265654648956,
"grad_norm": 0.6323816047475327,
"learning_rate": 2.468354430379747e-05,
"loss": 1.8669,
"step": 40
},
{
"epoch": 0.1555977229601518,
"grad_norm": 0.768465784108011,
"learning_rate": 2.5316455696202533e-05,
"loss": 1.9124,
"step": 41
},
{
"epoch": 0.15939278937381404,
"grad_norm": 2.418396322077454,
"learning_rate": 2.5949367088607597e-05,
"loss": 1.8707,
"step": 42
},
{
"epoch": 0.16318785578747627,
"grad_norm": 0.9037807886900223,
"learning_rate": 2.6582278481012658e-05,
"loss": 1.8516,
"step": 43
},
{
"epoch": 0.16698292220113853,
"grad_norm": 0.7641590282650628,
"learning_rate": 2.7215189873417722e-05,
"loss": 1.7878,
"step": 44
},
{
"epoch": 0.17077798861480076,
"grad_norm": 1.0966608585702264,
"learning_rate": 2.7848101265822786e-05,
"loss": 1.802,
"step": 45
},
{
"epoch": 0.174573055028463,
"grad_norm": 0.5688394185446388,
"learning_rate": 2.848101265822785e-05,
"loss": 1.8322,
"step": 46
},
{
"epoch": 0.17836812144212524,
"grad_norm": 0.7478846619933489,
"learning_rate": 2.9113924050632914e-05,
"loss": 1.8947,
"step": 47
},
{
"epoch": 0.18216318785578747,
"grad_norm": 0.6127680701652983,
"learning_rate": 2.9746835443037974e-05,
"loss": 1.7662,
"step": 48
},
{
"epoch": 0.1859582542694497,
"grad_norm": 0.5373790882769766,
"learning_rate": 3.0379746835443042e-05,
"loss": 1.6683,
"step": 49
},
{
"epoch": 0.18975332068311196,
"grad_norm": 0.6149095748944515,
"learning_rate": 3.10126582278481e-05,
"loss": 1.8495,
"step": 50
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.5426195225589562,
"learning_rate": 3.1645569620253167e-05,
"loss": 1.6796,
"step": 51
},
{
"epoch": 0.19734345351043645,
"grad_norm": 0.4410284346705972,
"learning_rate": 3.227848101265823e-05,
"loss": 1.6897,
"step": 52
},
{
"epoch": 0.20113851992409867,
"grad_norm": 0.546083625231314,
"learning_rate": 3.291139240506329e-05,
"loss": 1.6566,
"step": 53
},
{
"epoch": 0.2049335863377609,
"grad_norm": 0.4569806165138858,
"learning_rate": 3.354430379746836e-05,
"loss": 1.7379,
"step": 54
},
{
"epoch": 0.20872865275142316,
"grad_norm": 0.48748025942610634,
"learning_rate": 3.4177215189873416e-05,
"loss": 1.7216,
"step": 55
},
{
"epoch": 0.2125237191650854,
"grad_norm": 0.467756322924934,
"learning_rate": 3.4810126582278487e-05,
"loss": 1.674,
"step": 56
},
{
"epoch": 0.21631878557874762,
"grad_norm": 0.42683955668547663,
"learning_rate": 3.5443037974683544e-05,
"loss": 1.6544,
"step": 57
},
{
"epoch": 0.22011385199240988,
"grad_norm": 0.41134447964549253,
"learning_rate": 3.607594936708861e-05,
"loss": 1.6935,
"step": 58
},
{
"epoch": 0.2239089184060721,
"grad_norm": 0.42354491596778654,
"learning_rate": 3.670886075949367e-05,
"loss": 1.6797,
"step": 59
},
{
"epoch": 0.22770398481973433,
"grad_norm": 0.4286295643092341,
"learning_rate": 3.7341772151898736e-05,
"loss": 1.6842,
"step": 60
},
{
"epoch": 0.2314990512333966,
"grad_norm": 0.3600064803456228,
"learning_rate": 3.79746835443038e-05,
"loss": 1.6634,
"step": 61
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.44129600262533414,
"learning_rate": 3.8607594936708864e-05,
"loss": 1.5771,
"step": 62
},
{
"epoch": 0.23908918406072105,
"grad_norm": 0.3297533512548015,
"learning_rate": 3.924050632911392e-05,
"loss": 1.6301,
"step": 63
},
{
"epoch": 0.2428842504743833,
"grad_norm": 0.4193088363955817,
"learning_rate": 3.987341772151899e-05,
"loss": 1.612,
"step": 64
},
{
"epoch": 0.24667931688804554,
"grad_norm": 0.3712454081789884,
"learning_rate": 4.050632911392405e-05,
"loss": 1.6879,
"step": 65
},
{
"epoch": 0.2504743833017078,
"grad_norm": 0.5989768610829974,
"learning_rate": 4.113924050632912e-05,
"loss": 1.65,
"step": 66
},
{
"epoch": 0.25426944971537,
"grad_norm": 0.3742664335808708,
"learning_rate": 4.177215189873418e-05,
"loss": 1.6507,
"step": 67
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.4015420287040378,
"learning_rate": 4.240506329113924e-05,
"loss": 1.5732,
"step": 68
},
{
"epoch": 0.2618595825426945,
"grad_norm": 0.38585498440668603,
"learning_rate": 4.3037974683544305e-05,
"loss": 1.6849,
"step": 69
},
{
"epoch": 0.2656546489563567,
"grad_norm": 0.41777561117005696,
"learning_rate": 4.367088607594937e-05,
"loss": 1.6464,
"step": 70
},
{
"epoch": 0.269449715370019,
"grad_norm": 0.4351076634889842,
"learning_rate": 4.430379746835443e-05,
"loss": 1.5629,
"step": 71
},
{
"epoch": 0.2732447817836812,
"grad_norm": 0.4225694194804717,
"learning_rate": 4.49367088607595e-05,
"loss": 1.6265,
"step": 72
},
{
"epoch": 0.27703984819734345,
"grad_norm": 0.4394992721980678,
"learning_rate": 4.556962025316456e-05,
"loss": 1.5624,
"step": 73
},
{
"epoch": 0.2808349146110057,
"grad_norm": 0.4003804835773307,
"learning_rate": 4.6202531645569625e-05,
"loss": 1.5874,
"step": 74
},
{
"epoch": 0.2846299810246679,
"grad_norm": 0.4939872467069243,
"learning_rate": 4.683544303797468e-05,
"loss": 1.5396,
"step": 75
},
{
"epoch": 0.2884250474383302,
"grad_norm": 0.4309171830970703,
"learning_rate": 4.7468354430379746e-05,
"loss": 1.5761,
"step": 76
},
{
"epoch": 0.2922201138519924,
"grad_norm": 0.39520809396209,
"learning_rate": 4.810126582278481e-05,
"loss": 1.5694,
"step": 77
},
{
"epoch": 0.29601518026565465,
"grad_norm": 0.4415799961370828,
"learning_rate": 4.8734177215189874e-05,
"loss": 1.5954,
"step": 78
},
{
"epoch": 0.2998102466793169,
"grad_norm": 0.5067670270148436,
"learning_rate": 4.936708860759494e-05,
"loss": 1.6202,
"step": 79
},
{
"epoch": 0.3036053130929791,
"grad_norm": 0.7404307183816868,
"learning_rate": 5e-05,
"loss": 1.5495,
"step": 80
},
{
"epoch": 0.30740037950664134,
"grad_norm": 0.8756938333123472,
"learning_rate": 4.992957746478874e-05,
"loss": 1.5814,
"step": 81
},
{
"epoch": 0.3111954459203036,
"grad_norm": 0.7408728968865662,
"learning_rate": 4.9859154929577466e-05,
"loss": 1.603,
"step": 82
},
{
"epoch": 0.31499051233396586,
"grad_norm": 0.47334219872168504,
"learning_rate": 4.97887323943662e-05,
"loss": 1.6161,
"step": 83
},
{
"epoch": 0.3187855787476281,
"grad_norm": 0.5540870840573802,
"learning_rate": 4.971830985915493e-05,
"loss": 1.5272,
"step": 84
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.6886269029001646,
"learning_rate": 4.9647887323943665e-05,
"loss": 1.5546,
"step": 85
},
{
"epoch": 0.32637571157495254,
"grad_norm": 0.6086933597931,
"learning_rate": 4.95774647887324e-05,
"loss": 1.5839,
"step": 86
},
{
"epoch": 0.3301707779886148,
"grad_norm": 0.3807159000304318,
"learning_rate": 4.950704225352113e-05,
"loss": 1.4863,
"step": 87
},
{
"epoch": 0.33396584440227706,
"grad_norm": 0.543798541996791,
"learning_rate": 4.9436619718309864e-05,
"loss": 1.5316,
"step": 88
},
{
"epoch": 0.3377609108159393,
"grad_norm": 0.6589993614586068,
"learning_rate": 4.936619718309859e-05,
"loss": 1.602,
"step": 89
},
{
"epoch": 0.3415559772296015,
"grad_norm": 0.410627121431289,
"learning_rate": 4.929577464788733e-05,
"loss": 1.4986,
"step": 90
},
{
"epoch": 0.34535104364326374,
"grad_norm": 0.4306732329266045,
"learning_rate": 4.9225352112676056e-05,
"loss": 1.565,
"step": 91
},
{
"epoch": 0.349146110056926,
"grad_norm": 0.40943202120451866,
"learning_rate": 4.915492957746479e-05,
"loss": 1.6048,
"step": 92
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.46187620640321075,
"learning_rate": 4.908450704225353e-05,
"loss": 1.5409,
"step": 93
},
{
"epoch": 0.3567362428842505,
"grad_norm": 0.43416784435795913,
"learning_rate": 4.9014084507042255e-05,
"loss": 1.5513,
"step": 94
},
{
"epoch": 0.3605313092979127,
"grad_norm": 0.364351822155291,
"learning_rate": 4.894366197183099e-05,
"loss": 1.4514,
"step": 95
},
{
"epoch": 0.36432637571157495,
"grad_norm": 0.40903169908598264,
"learning_rate": 4.887323943661972e-05,
"loss": 1.4418,
"step": 96
},
{
"epoch": 0.3681214421252372,
"grad_norm": 0.41077705665324876,
"learning_rate": 4.8802816901408454e-05,
"loss": 1.4692,
"step": 97
},
{
"epoch": 0.3719165085388994,
"grad_norm": 0.4325231730159397,
"learning_rate": 4.873239436619719e-05,
"loss": 1.5407,
"step": 98
},
{
"epoch": 0.3757115749525617,
"grad_norm": 0.4596146921793953,
"learning_rate": 4.866197183098592e-05,
"loss": 1.5242,
"step": 99
},
{
"epoch": 0.3795066413662239,
"grad_norm": 0.40247629555127784,
"learning_rate": 4.8591549295774653e-05,
"loss": 1.5224,
"step": 100
},
{
"epoch": 0.38330170777988615,
"grad_norm": 0.43395587345438613,
"learning_rate": 4.852112676056338e-05,
"loss": 1.5369,
"step": 101
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.4725234162111987,
"learning_rate": 4.845070422535212e-05,
"loss": 1.4368,
"step": 102
},
{
"epoch": 0.3908918406072106,
"grad_norm": 0.4866189429925447,
"learning_rate": 4.838028169014085e-05,
"loss": 1.5053,
"step": 103
},
{
"epoch": 0.3946869070208729,
"grad_norm": 0.4069556864302445,
"learning_rate": 4.830985915492958e-05,
"loss": 1.5418,
"step": 104
},
{
"epoch": 0.3984819734345351,
"grad_norm": 0.47678521453205674,
"learning_rate": 4.8239436619718316e-05,
"loss": 1.5176,
"step": 105
},
{
"epoch": 0.40227703984819735,
"grad_norm": 0.4033161409910721,
"learning_rate": 4.8169014084507045e-05,
"loss": 1.5062,
"step": 106
},
{
"epoch": 0.4060721062618596,
"grad_norm": 0.35292601314916233,
"learning_rate": 4.809859154929578e-05,
"loss": 1.4527,
"step": 107
},
{
"epoch": 0.4098671726755218,
"grad_norm": 0.427382233984873,
"learning_rate": 4.8028169014084515e-05,
"loss": 1.5015,
"step": 108
},
{
"epoch": 0.41366223908918404,
"grad_norm": 0.484448814811885,
"learning_rate": 4.7957746478873244e-05,
"loss": 1.453,
"step": 109
},
{
"epoch": 0.4174573055028463,
"grad_norm": 0.4000789193244044,
"learning_rate": 4.788732394366197e-05,
"loss": 1.5646,
"step": 110
},
{
"epoch": 0.42125237191650855,
"grad_norm": 0.4721061236219507,
"learning_rate": 4.78169014084507e-05,
"loss": 1.4607,
"step": 111
},
{
"epoch": 0.4250474383301708,
"grad_norm": 0.40813799871710404,
"learning_rate": 4.7746478873239436e-05,
"loss": 1.4116,
"step": 112
},
{
"epoch": 0.428842504743833,
"grad_norm": 0.3809065440876614,
"learning_rate": 4.767605633802817e-05,
"loss": 1.4811,
"step": 113
},
{
"epoch": 0.43263757115749524,
"grad_norm": 0.47019438418202353,
"learning_rate": 4.76056338028169e-05,
"loss": 1.4493,
"step": 114
},
{
"epoch": 0.4364326375711575,
"grad_norm": 0.4016494235316403,
"learning_rate": 4.7535211267605635e-05,
"loss": 1.4059,
"step": 115
},
{
"epoch": 0.44022770398481975,
"grad_norm": 0.3971278151970772,
"learning_rate": 4.7464788732394363e-05,
"loss": 1.4072,
"step": 116
},
{
"epoch": 0.444022770398482,
"grad_norm": 0.47646350981005675,
"learning_rate": 4.73943661971831e-05,
"loss": 1.4458,
"step": 117
},
{
"epoch": 0.4478178368121442,
"grad_norm": 0.5020191715292881,
"learning_rate": 4.7323943661971834e-05,
"loss": 1.4899,
"step": 118
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.32593662880734586,
"learning_rate": 4.725352112676056e-05,
"loss": 1.4262,
"step": 119
},
{
"epoch": 0.45540796963946867,
"grad_norm": 0.4973960513889332,
"learning_rate": 4.71830985915493e-05,
"loss": 1.4303,
"step": 120
},
{
"epoch": 0.45920303605313095,
"grad_norm": 0.39806899280642116,
"learning_rate": 4.7112676056338026e-05,
"loss": 1.3826,
"step": 121
},
{
"epoch": 0.4629981024667932,
"grad_norm": 0.3781859871987095,
"learning_rate": 4.704225352112676e-05,
"loss": 1.4079,
"step": 122
},
{
"epoch": 0.4667931688804554,
"grad_norm": 0.42814978256121805,
"learning_rate": 4.69718309859155e-05,
"loss": 1.4056,
"step": 123
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.3316565174596705,
"learning_rate": 4.6901408450704225e-05,
"loss": 1.411,
"step": 124
},
{
"epoch": 0.47438330170777987,
"grad_norm": 0.36187807400376804,
"learning_rate": 4.683098591549296e-05,
"loss": 1.4228,
"step": 125
},
{
"epoch": 0.4781783681214421,
"grad_norm": 0.3841542596171203,
"learning_rate": 4.676056338028169e-05,
"loss": 1.4345,
"step": 126
},
{
"epoch": 0.4819734345351044,
"grad_norm": 0.33745746660281706,
"learning_rate": 4.6690140845070424e-05,
"loss": 1.4757,
"step": 127
},
{
"epoch": 0.4857685009487666,
"grad_norm": 0.36136826648297415,
"learning_rate": 4.661971830985915e-05,
"loss": 1.4269,
"step": 128
},
{
"epoch": 0.48956356736242884,
"grad_norm": 0.4206874108397252,
"learning_rate": 4.654929577464789e-05,
"loss": 1.3558,
"step": 129
},
{
"epoch": 0.49335863377609107,
"grad_norm": 0.3211522112176049,
"learning_rate": 4.647887323943662e-05,
"loss": 1.4066,
"step": 130
},
{
"epoch": 0.4971537001897533,
"grad_norm": 0.42159243957048326,
"learning_rate": 4.640845070422535e-05,
"loss": 1.3845,
"step": 131
},
{
"epoch": 0.5009487666034156,
"grad_norm": 0.36524560316144566,
"learning_rate": 4.633802816901409e-05,
"loss": 1.4484,
"step": 132
},
{
"epoch": 0.5047438330170778,
"grad_norm": 0.3360317678022846,
"learning_rate": 4.6267605633802816e-05,
"loss": 1.4249,
"step": 133
},
{
"epoch": 0.50853889943074,
"grad_norm": 0.38267639665072495,
"learning_rate": 4.619718309859155e-05,
"loss": 1.4267,
"step": 134
},
{
"epoch": 0.5123339658444023,
"grad_norm": 0.3515968333157541,
"learning_rate": 4.6126760563380286e-05,
"loss": 1.3638,
"step": 135
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.3399745983579114,
"learning_rate": 4.6056338028169015e-05,
"loss": 1.4098,
"step": 136
},
{
"epoch": 0.5199240986717267,
"grad_norm": 0.39077092993266566,
"learning_rate": 4.598591549295775e-05,
"loss": 1.4015,
"step": 137
},
{
"epoch": 0.523719165085389,
"grad_norm": 0.42453420106967205,
"learning_rate": 4.591549295774648e-05,
"loss": 1.5311,
"step": 138
},
{
"epoch": 0.5275142314990512,
"grad_norm": 0.3986911368509283,
"learning_rate": 4.5845070422535214e-05,
"loss": 1.4451,
"step": 139
},
{
"epoch": 0.5313092979127134,
"grad_norm": 0.35723248125580975,
"learning_rate": 4.577464788732395e-05,
"loss": 1.4333,
"step": 140
},
{
"epoch": 0.5351043643263758,
"grad_norm": 0.31332630291444385,
"learning_rate": 4.570422535211268e-05,
"loss": 1.3828,
"step": 141
},
{
"epoch": 0.538899430740038,
"grad_norm": 0.43136704591444386,
"learning_rate": 4.563380281690141e-05,
"loss": 1.4064,
"step": 142
},
{
"epoch": 0.5426944971537002,
"grad_norm": 0.3803746072665268,
"learning_rate": 4.556338028169014e-05,
"loss": 1.4452,
"step": 143
},
{
"epoch": 0.5464895635673624,
"grad_norm": 0.3639453496228484,
"learning_rate": 4.5492957746478876e-05,
"loss": 1.3488,
"step": 144
},
{
"epoch": 0.5502846299810247,
"grad_norm": 0.3840846743562568,
"learning_rate": 4.542253521126761e-05,
"loss": 1.3771,
"step": 145
},
{
"epoch": 0.5540796963946869,
"grad_norm": 0.36449348238261,
"learning_rate": 4.535211267605634e-05,
"loss": 1.4484,
"step": 146
},
{
"epoch": 0.5578747628083491,
"grad_norm": 0.3515807826750923,
"learning_rate": 4.5281690140845075e-05,
"loss": 1.3991,
"step": 147
},
{
"epoch": 0.5616698292220114,
"grad_norm": 0.3873512613341988,
"learning_rate": 4.5211267605633804e-05,
"loss": 1.3949,
"step": 148
},
{
"epoch": 0.5654648956356736,
"grad_norm": 0.31296388966092,
"learning_rate": 4.514084507042254e-05,
"loss": 1.3685,
"step": 149
},
{
"epoch": 0.5692599620493358,
"grad_norm": 0.33918560649969454,
"learning_rate": 4.507042253521127e-05,
"loss": 1.332,
"step": 150
},
{
"epoch": 0.573055028462998,
"grad_norm": 0.3340984719004553,
"learning_rate": 4.5e-05,
"loss": 1.3577,
"step": 151
},
{
"epoch": 0.5768500948766604,
"grad_norm": 0.36267287408052973,
"learning_rate": 4.492957746478874e-05,
"loss": 1.437,
"step": 152
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.35764903406771575,
"learning_rate": 4.4859154929577467e-05,
"loss": 1.3907,
"step": 153
},
{
"epoch": 0.5844402277039848,
"grad_norm": 0.3583401107412542,
"learning_rate": 4.47887323943662e-05,
"loss": 1.3919,
"step": 154
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.4041586732718652,
"learning_rate": 4.471830985915493e-05,
"loss": 1.3997,
"step": 155
},
{
"epoch": 0.5920303605313093,
"grad_norm": 0.35367385034436627,
"learning_rate": 4.4647887323943666e-05,
"loss": 1.4102,
"step": 156
},
{
"epoch": 0.5958254269449715,
"grad_norm": 0.3304368676274889,
"learning_rate": 4.45774647887324e-05,
"loss": 1.3693,
"step": 157
},
{
"epoch": 0.5996204933586338,
"grad_norm": 0.3205742197630557,
"learning_rate": 4.450704225352113e-05,
"loss": 1.3337,
"step": 158
},
{
"epoch": 0.603415559772296,
"grad_norm": 0.35445122655886857,
"learning_rate": 4.4436619718309865e-05,
"loss": 1.44,
"step": 159
},
{
"epoch": 0.6072106261859582,
"grad_norm": 0.3573966131889885,
"learning_rate": 4.436619718309859e-05,
"loss": 1.3803,
"step": 160
},
{
"epoch": 0.6110056925996205,
"grad_norm": 0.3586312772305008,
"learning_rate": 4.429577464788733e-05,
"loss": 1.3438,
"step": 161
},
{
"epoch": 0.6148007590132827,
"grad_norm": 0.3783176659435124,
"learning_rate": 4.4225352112676064e-05,
"loss": 1.3404,
"step": 162
},
{
"epoch": 0.618595825426945,
"grad_norm": 0.3379901326719384,
"learning_rate": 4.415492957746479e-05,
"loss": 1.3409,
"step": 163
},
{
"epoch": 0.6223908918406073,
"grad_norm": 0.3777309847906582,
"learning_rate": 4.408450704225353e-05,
"loss": 1.3955,
"step": 164
},
{
"epoch": 0.6261859582542695,
"grad_norm": 0.3761776753259742,
"learning_rate": 4.4014084507042256e-05,
"loss": 1.3231,
"step": 165
},
{
"epoch": 0.6299810246679317,
"grad_norm": 0.36862056674799515,
"learning_rate": 4.394366197183099e-05,
"loss": 1.3725,
"step": 166
},
{
"epoch": 0.6337760910815939,
"grad_norm": 0.3674498306369189,
"learning_rate": 4.3873239436619726e-05,
"loss": 1.3188,
"step": 167
},
{
"epoch": 0.6375711574952562,
"grad_norm": 0.38270505298891316,
"learning_rate": 4.3802816901408455e-05,
"loss": 1.3907,
"step": 168
},
{
"epoch": 0.6413662239089184,
"grad_norm": 0.36403624669642476,
"learning_rate": 4.373239436619718e-05,
"loss": 1.3062,
"step": 169
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.39703076974571927,
"learning_rate": 4.366197183098591e-05,
"loss": 1.3589,
"step": 170
},
{
"epoch": 0.6489563567362429,
"grad_norm": 0.34114997296211463,
"learning_rate": 4.359154929577465e-05,
"loss": 1.3198,
"step": 171
},
{
"epoch": 0.6527514231499051,
"grad_norm": 0.41175841528886803,
"learning_rate": 4.352112676056338e-05,
"loss": 1.4185,
"step": 172
},
{
"epoch": 0.6565464895635673,
"grad_norm": 0.4174144929484366,
"learning_rate": 4.345070422535211e-05,
"loss": 1.3929,
"step": 173
},
{
"epoch": 0.6603415559772297,
"grad_norm": 0.3428838083958315,
"learning_rate": 4.3380281690140846e-05,
"loss": 1.3627,
"step": 174
},
{
"epoch": 0.6641366223908919,
"grad_norm": 0.3929570557569129,
"learning_rate": 4.3309859154929575e-05,
"loss": 1.3529,
"step": 175
},
{
"epoch": 0.6679316888045541,
"grad_norm": 0.3573057662364649,
"learning_rate": 4.323943661971831e-05,
"loss": 1.3285,
"step": 176
},
{
"epoch": 0.6717267552182163,
"grad_norm": 0.4100140619685759,
"learning_rate": 4.3169014084507045e-05,
"loss": 1.3363,
"step": 177
},
{
"epoch": 0.6755218216318786,
"grad_norm": 0.3445528400302041,
"learning_rate": 4.3098591549295774e-05,
"loss": 1.3098,
"step": 178
},
{
"epoch": 0.6793168880455408,
"grad_norm": 0.37717433556231583,
"learning_rate": 4.302816901408451e-05,
"loss": 1.3108,
"step": 179
},
{
"epoch": 0.683111954459203,
"grad_norm": 0.3715317239019883,
"learning_rate": 4.295774647887324e-05,
"loss": 1.354,
"step": 180
},
{
"epoch": 0.6869070208728653,
"grad_norm": 0.35195067644474287,
"learning_rate": 4.288732394366197e-05,
"loss": 1.2881,
"step": 181
},
{
"epoch": 0.6907020872865275,
"grad_norm": 0.35595665080741584,
"learning_rate": 4.281690140845071e-05,
"loss": 1.3259,
"step": 182
},
{
"epoch": 0.6944971537001897,
"grad_norm": 0.3046365612241057,
"learning_rate": 4.2746478873239436e-05,
"loss": 1.3144,
"step": 183
},
{
"epoch": 0.698292220113852,
"grad_norm": 0.3819917913784112,
"learning_rate": 4.267605633802817e-05,
"loss": 1.322,
"step": 184
},
{
"epoch": 0.7020872865275142,
"grad_norm": 0.3299468340192363,
"learning_rate": 4.26056338028169e-05,
"loss": 1.3381,
"step": 185
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.3740628787564136,
"learning_rate": 4.2535211267605635e-05,
"loss": 1.3556,
"step": 186
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.35933774593302825,
"learning_rate": 4.2464788732394364e-05,
"loss": 1.3261,
"step": 187
},
{
"epoch": 0.713472485768501,
"grad_norm": 0.3670666433581175,
"learning_rate": 4.23943661971831e-05,
"loss": 1.3449,
"step": 188
},
{
"epoch": 0.7172675521821632,
"grad_norm": 0.36936784178468773,
"learning_rate": 4.2323943661971834e-05,
"loss": 1.2828,
"step": 189
},
{
"epoch": 0.7210626185958254,
"grad_norm": 0.3367523658701917,
"learning_rate": 4.225352112676056e-05,
"loss": 1.3748,
"step": 190
},
{
"epoch": 0.7248576850094877,
"grad_norm": 0.3352692091278163,
"learning_rate": 4.21830985915493e-05,
"loss": 1.285,
"step": 191
},
{
"epoch": 0.7286527514231499,
"grad_norm": 0.32844599471562874,
"learning_rate": 4.211267605633803e-05,
"loss": 1.3514,
"step": 192
},
{
"epoch": 0.7324478178368121,
"grad_norm": 0.32445569378348055,
"learning_rate": 4.204225352112676e-05,
"loss": 1.3256,
"step": 193
},
{
"epoch": 0.7362428842504743,
"grad_norm": 0.3071410557778373,
"learning_rate": 4.19718309859155e-05,
"loss": 1.3007,
"step": 194
},
{
"epoch": 0.7400379506641366,
"grad_norm": 0.3080989046350531,
"learning_rate": 4.1901408450704226e-05,
"loss": 1.2903,
"step": 195
},
{
"epoch": 0.7438330170777988,
"grad_norm": 0.31056134555547077,
"learning_rate": 4.183098591549296e-05,
"loss": 1.3123,
"step": 196
},
{
"epoch": 0.7476280834914611,
"grad_norm": 1.8895780326868505,
"learning_rate": 4.176056338028169e-05,
"loss": 1.311,
"step": 197
},
{
"epoch": 0.7514231499051234,
"grad_norm": 0.3526960047508018,
"learning_rate": 4.1690140845070425e-05,
"loss": 1.3291,
"step": 198
},
{
"epoch": 0.7552182163187856,
"grad_norm": 0.34865523371730295,
"learning_rate": 4.161971830985916e-05,
"loss": 1.309,
"step": 199
},
{
"epoch": 0.7590132827324478,
"grad_norm": 0.31651506379715605,
"learning_rate": 4.154929577464789e-05,
"loss": 1.2653,
"step": 200
},
{
"epoch": 0.7628083491461101,
"grad_norm": 0.332113537560613,
"learning_rate": 4.1478873239436624e-05,
"loss": 1.3122,
"step": 201
},
{
"epoch": 0.7666034155597723,
"grad_norm": 0.6094071531831814,
"learning_rate": 4.140845070422535e-05,
"loss": 1.3009,
"step": 202
},
{
"epoch": 0.7703984819734345,
"grad_norm": 0.33649374700866697,
"learning_rate": 4.133802816901409e-05,
"loss": 1.3174,
"step": 203
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.36498451990927117,
"learning_rate": 4.126760563380282e-05,
"loss": 1.2808,
"step": 204
},
{
"epoch": 0.777988614800759,
"grad_norm": 0.3139489823599198,
"learning_rate": 4.119718309859155e-05,
"loss": 1.3246,
"step": 205
},
{
"epoch": 0.7817836812144212,
"grad_norm": 0.3296563025384292,
"learning_rate": 4.1126760563380286e-05,
"loss": 1.2238,
"step": 206
},
{
"epoch": 0.7855787476280834,
"grad_norm": 0.321736102972286,
"learning_rate": 4.1056338028169015e-05,
"loss": 1.2509,
"step": 207
},
{
"epoch": 0.7893738140417458,
"grad_norm": 0.3252820879325987,
"learning_rate": 4.098591549295775e-05,
"loss": 1.2725,
"step": 208
},
{
"epoch": 0.793168880455408,
"grad_norm": 0.3211609450260476,
"learning_rate": 4.091549295774648e-05,
"loss": 1.2437,
"step": 209
},
{
"epoch": 0.7969639468690702,
"grad_norm": 0.3310154286833479,
"learning_rate": 4.0845070422535214e-05,
"loss": 1.2911,
"step": 210
},
{
"epoch": 0.8007590132827325,
"grad_norm": 0.34047936964211944,
"learning_rate": 4.077464788732395e-05,
"loss": 1.241,
"step": 211
},
{
"epoch": 0.8045540796963947,
"grad_norm": 0.3601451931912554,
"learning_rate": 4.070422535211268e-05,
"loss": 1.3091,
"step": 212
},
{
"epoch": 0.8083491461100569,
"grad_norm": 0.341630894880969,
"learning_rate": 4.063380281690141e-05,
"loss": 1.2166,
"step": 213
},
{
"epoch": 0.8121442125237192,
"grad_norm": 0.3231118912446341,
"learning_rate": 4.056338028169014e-05,
"loss": 1.2608,
"step": 214
},
{
"epoch": 0.8159392789373814,
"grad_norm": 0.344358800411142,
"learning_rate": 4.049295774647888e-05,
"loss": 1.313,
"step": 215
},
{
"epoch": 0.8197343453510436,
"grad_norm": 0.34274391380453484,
"learning_rate": 4.042253521126761e-05,
"loss": 1.2415,
"step": 216
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.3365818031342838,
"learning_rate": 4.035211267605634e-05,
"loss": 1.2411,
"step": 217
},
{
"epoch": 0.8273244781783681,
"grad_norm": 0.3528447853747438,
"learning_rate": 4.0281690140845076e-05,
"loss": 1.2693,
"step": 218
},
{
"epoch": 0.8311195445920304,
"grad_norm": 0.38058232199864395,
"learning_rate": 4.0211267605633804e-05,
"loss": 1.262,
"step": 219
},
{
"epoch": 0.8349146110056926,
"grad_norm": 0.36105483077105904,
"learning_rate": 4.014084507042254e-05,
"loss": 1.2696,
"step": 220
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.3103848396032734,
"learning_rate": 4.0070422535211275e-05,
"loss": 1.268,
"step": 221
},
{
"epoch": 0.8425047438330171,
"grad_norm": 0.3848294788065234,
"learning_rate": 4e-05,
"loss": 1.2518,
"step": 222
},
{
"epoch": 0.8462998102466793,
"grad_norm": 0.33080202739238657,
"learning_rate": 3.992957746478874e-05,
"loss": 1.1915,
"step": 223
},
{
"epoch": 0.8500948766603416,
"grad_norm": 0.38000956902567407,
"learning_rate": 3.985915492957747e-05,
"loss": 1.2117,
"step": 224
},
{
"epoch": 0.8538899430740038,
"grad_norm": 0.3715575769758883,
"learning_rate": 3.97887323943662e-05,
"loss": 1.2583,
"step": 225
},
{
"epoch": 0.857685009487666,
"grad_norm": 0.3720392057036161,
"learning_rate": 3.971830985915493e-05,
"loss": 1.2877,
"step": 226
},
{
"epoch": 0.8614800759013282,
"grad_norm": 0.3653290894884262,
"learning_rate": 3.9647887323943666e-05,
"loss": 1.2717,
"step": 227
},
{
"epoch": 0.8652751423149905,
"grad_norm": 0.4265644803365511,
"learning_rate": 3.9577464788732395e-05,
"loss": 1.258,
"step": 228
},
{
"epoch": 0.8690702087286527,
"grad_norm": 0.3386462365150063,
"learning_rate": 3.950704225352112e-05,
"loss": 1.2225,
"step": 229
},
{
"epoch": 0.872865275142315,
"grad_norm": 0.43926314371533426,
"learning_rate": 3.943661971830986e-05,
"loss": 1.2347,
"step": 230
},
{
"epoch": 0.8766603415559773,
"grad_norm": 0.32840599320539543,
"learning_rate": 3.9366197183098594e-05,
"loss": 1.2332,
"step": 231
},
{
"epoch": 0.8804554079696395,
"grad_norm": 0.42367117988848685,
"learning_rate": 3.929577464788732e-05,
"loss": 1.2675,
"step": 232
},
{
"epoch": 0.8842504743833017,
"grad_norm": 0.37102000500926696,
"learning_rate": 3.922535211267606e-05,
"loss": 1.2231,
"step": 233
},
{
"epoch": 0.888045540796964,
"grad_norm": 0.3617890203831545,
"learning_rate": 3.9154929577464786e-05,
"loss": 1.2568,
"step": 234
},
{
"epoch": 0.8918406072106262,
"grad_norm": 0.3535203843496746,
"learning_rate": 3.908450704225352e-05,
"loss": 1.2599,
"step": 235
},
{
"epoch": 0.8956356736242884,
"grad_norm": 0.35336005521716213,
"learning_rate": 3.9014084507042256e-05,
"loss": 1.2644,
"step": 236
},
{
"epoch": 0.8994307400379506,
"grad_norm": 0.3415373219143306,
"learning_rate": 3.8943661971830985e-05,
"loss": 1.2199,
"step": 237
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.36547141577633835,
"learning_rate": 3.887323943661972e-05,
"loss": 1.2606,
"step": 238
},
{
"epoch": 0.9070208728652751,
"grad_norm": 0.3715572237799435,
"learning_rate": 3.880281690140845e-05,
"loss": 1.2158,
"step": 239
},
{
"epoch": 0.9108159392789373,
"grad_norm": 0.34742124410508274,
"learning_rate": 3.8732394366197184e-05,
"loss": 1.2344,
"step": 240
},
{
"epoch": 0.9146110056925996,
"grad_norm": 0.3484664762204998,
"learning_rate": 3.866197183098592e-05,
"loss": 1.2287,
"step": 241
},
{
"epoch": 0.9184060721062619,
"grad_norm": 0.30765739989139884,
"learning_rate": 3.859154929577465e-05,
"loss": 1.2026,
"step": 242
},
{
"epoch": 0.9222011385199241,
"grad_norm": 0.35586172749374156,
"learning_rate": 3.852112676056338e-05,
"loss": 1.1763,
"step": 243
},
{
"epoch": 0.9259962049335864,
"grad_norm": 0.36025629908095097,
"learning_rate": 3.845070422535211e-05,
"loss": 1.2319,
"step": 244
},
{
"epoch": 0.9297912713472486,
"grad_norm": 0.3715154160227618,
"learning_rate": 3.8380281690140847e-05,
"loss": 1.2393,
"step": 245
},
{
"epoch": 0.9335863377609108,
"grad_norm": 0.3635351228373479,
"learning_rate": 3.8309859154929575e-05,
"loss": 1.2302,
"step": 246
},
{
"epoch": 0.937381404174573,
"grad_norm": 0.3731784332441304,
"learning_rate": 3.823943661971831e-05,
"loss": 1.2251,
"step": 247
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.3594415010248249,
"learning_rate": 3.8169014084507046e-05,
"loss": 1.1908,
"step": 248
},
{
"epoch": 0.9449715370018975,
"grad_norm": 0.3998773141506776,
"learning_rate": 3.8098591549295774e-05,
"loss": 1.2357,
"step": 249
},
{
"epoch": 0.9487666034155597,
"grad_norm": 0.349061111387022,
"learning_rate": 3.802816901408451e-05,
"loss": 1.1901,
"step": 250
},
{
"epoch": 0.952561669829222,
"grad_norm": 0.35591269469725034,
"learning_rate": 3.795774647887324e-05,
"loss": 1.1795,
"step": 251
},
{
"epoch": 0.9563567362428842,
"grad_norm": 0.3421541759167281,
"learning_rate": 3.788732394366197e-05,
"loss": 1.1796,
"step": 252
},
{
"epoch": 0.9601518026565465,
"grad_norm": 0.36503325847175494,
"learning_rate": 3.781690140845071e-05,
"loss": 1.2151,
"step": 253
},
{
"epoch": 0.9639468690702088,
"grad_norm": 0.4230734128325036,
"learning_rate": 3.774647887323944e-05,
"loss": 1.2481,
"step": 254
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.36192963764234665,
"learning_rate": 3.767605633802817e-05,
"loss": 1.2155,
"step": 255
},
{
"epoch": 0.9715370018975332,
"grad_norm": 0.4095967215802884,
"learning_rate": 3.76056338028169e-05,
"loss": 1.1792,
"step": 256
},
{
"epoch": 0.9753320683111955,
"grad_norm": 0.3815528851810032,
"learning_rate": 3.7535211267605636e-05,
"loss": 1.1657,
"step": 257
},
{
"epoch": 0.9791271347248577,
"grad_norm": 0.4320746767417787,
"learning_rate": 3.746478873239437e-05,
"loss": 1.1457,
"step": 258
},
{
"epoch": 0.9829222011385199,
"grad_norm": 0.3777616212932332,
"learning_rate": 3.73943661971831e-05,
"loss": 1.2336,
"step": 259
},
{
"epoch": 0.9867172675521821,
"grad_norm": 0.37724218868971127,
"learning_rate": 3.7323943661971835e-05,
"loss": 1.1951,
"step": 260
},
{
"epoch": 0.9905123339658444,
"grad_norm": 0.36945973634236734,
"learning_rate": 3.725352112676056e-05,
"loss": 1.1559,
"step": 261
},
{
"epoch": 0.9943074003795066,
"grad_norm": 0.3904926455458019,
"learning_rate": 3.71830985915493e-05,
"loss": 1.1524,
"step": 262
},
{
"epoch": 0.9981024667931688,
"grad_norm": 0.3724692970695682,
"learning_rate": 3.711267605633803e-05,
"loss": 1.2107,
"step": 263
},
{
"epoch": 1.0,
"grad_norm": 0.3724692970695682,
"learning_rate": 3.704225352112676e-05,
"loss": 1.1305,
"step": 264
},
{
"epoch": 1.0037950664136623,
"grad_norm": 0.5780005022114832,
"learning_rate": 3.69718309859155e-05,
"loss": 1.0284,
"step": 265
},
{
"epoch": 1.0075901328273245,
"grad_norm": 0.49268409957462456,
"learning_rate": 3.6901408450704226e-05,
"loss": 1.0607,
"step": 266
},
{
"epoch": 1.0113851992409868,
"grad_norm": 0.38871662254837497,
"learning_rate": 3.683098591549296e-05,
"loss": 0.9872,
"step": 267
},
{
"epoch": 1.015180265654649,
"grad_norm": 0.37177307169622453,
"learning_rate": 3.676056338028169e-05,
"loss": 0.9977,
"step": 268
},
{
"epoch": 1.0189753320683113,
"grad_norm": 0.4762575267486963,
"learning_rate": 3.6690140845070425e-05,
"loss": 0.9903,
"step": 269
},
{
"epoch": 1.0227703984819734,
"grad_norm": 0.4387230065396426,
"learning_rate": 3.661971830985916e-05,
"loss": 0.9975,
"step": 270
},
{
"epoch": 1.0265654648956357,
"grad_norm": 0.3495097236775341,
"learning_rate": 3.654929577464789e-05,
"loss": 1.0235,
"step": 271
},
{
"epoch": 1.0303605313092978,
"grad_norm": 0.42106629232774034,
"learning_rate": 3.6478873239436624e-05,
"loss": 0.9921,
"step": 272
},
{
"epoch": 1.0341555977229602,
"grad_norm": 0.3768323548261238,
"learning_rate": 3.640845070422535e-05,
"loss": 0.9918,
"step": 273
},
{
"epoch": 1.0379506641366223,
"grad_norm": 0.41704360648035427,
"learning_rate": 3.633802816901409e-05,
"loss": 0.9928,
"step": 274
},
{
"epoch": 1.0417457305502846,
"grad_norm": 0.3737706259042612,
"learning_rate": 3.626760563380282e-05,
"loss": 0.9992,
"step": 275
},
{
"epoch": 1.045540796963947,
"grad_norm": 0.44307625684387103,
"learning_rate": 3.619718309859155e-05,
"loss": 0.9803,
"step": 276
},
{
"epoch": 1.049335863377609,
"grad_norm": 0.4263539560573131,
"learning_rate": 3.612676056338029e-05,
"loss": 1.0075,
"step": 277
},
{
"epoch": 1.0531309297912714,
"grad_norm": 0.42906097981386576,
"learning_rate": 3.6056338028169015e-05,
"loss": 1.0186,
"step": 278
},
{
"epoch": 1.0569259962049335,
"grad_norm": 0.3983384890358378,
"learning_rate": 3.598591549295775e-05,
"loss": 0.9353,
"step": 279
},
{
"epoch": 1.060721062618596,
"grad_norm": 0.4348959321673255,
"learning_rate": 3.5915492957746486e-05,
"loss": 0.9596,
"step": 280
},
{
"epoch": 1.064516129032258,
"grad_norm": 0.41488330626855313,
"learning_rate": 3.5845070422535214e-05,
"loss": 0.9804,
"step": 281
},
{
"epoch": 1.0683111954459203,
"grad_norm": 0.47753850530638825,
"learning_rate": 3.577464788732395e-05,
"loss": 0.9969,
"step": 282
},
{
"epoch": 1.0721062618595825,
"grad_norm": 0.4461058259813206,
"learning_rate": 3.570422535211268e-05,
"loss": 0.9629,
"step": 283
},
{
"epoch": 1.0759013282732448,
"grad_norm": 0.525877446295084,
"learning_rate": 3.5633802816901413e-05,
"loss": 0.9627,
"step": 284
},
{
"epoch": 1.079696394686907,
"grad_norm": 0.3783953013459411,
"learning_rate": 3.556338028169014e-05,
"loss": 0.9611,
"step": 285
},
{
"epoch": 1.0834914611005693,
"grad_norm": 0.44520759638931123,
"learning_rate": 3.549295774647888e-05,
"loss": 0.9889,
"step": 286
},
{
"epoch": 1.0872865275142316,
"grad_norm": 0.4229490670322319,
"learning_rate": 3.542253521126761e-05,
"loss": 0.9915,
"step": 287
},
{
"epoch": 1.0910815939278937,
"grad_norm": 0.3815617957185081,
"learning_rate": 3.5352112676056334e-05,
"loss": 0.966,
"step": 288
},
{
"epoch": 1.094876660341556,
"grad_norm": 0.395533872108558,
"learning_rate": 3.528169014084507e-05,
"loss": 1.0366,
"step": 289
},
{
"epoch": 1.0986717267552182,
"grad_norm": 0.41849123473761213,
"learning_rate": 3.5211267605633805e-05,
"loss": 0.9409,
"step": 290
},
{
"epoch": 1.1024667931688805,
"grad_norm": 0.3867808193523748,
"learning_rate": 3.514084507042253e-05,
"loss": 0.9732,
"step": 291
},
{
"epoch": 1.1062618595825426,
"grad_norm": 0.37883013114906094,
"learning_rate": 3.507042253521127e-05,
"loss": 0.9588,
"step": 292
},
{
"epoch": 1.110056925996205,
"grad_norm": 0.44746697388429957,
"learning_rate": 3.5e-05,
"loss": 0.954,
"step": 293
},
{
"epoch": 1.113851992409867,
"grad_norm": 0.38909313755459546,
"learning_rate": 3.492957746478873e-05,
"loss": 0.9533,
"step": 294
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.3646036272665472,
"learning_rate": 3.485915492957747e-05,
"loss": 1.0043,
"step": 295
},
{
"epoch": 1.1214421252371916,
"grad_norm": 0.36031414031878256,
"learning_rate": 3.4788732394366196e-05,
"loss": 0.9704,
"step": 296
},
{
"epoch": 1.125237191650854,
"grad_norm": 0.4109144576532839,
"learning_rate": 3.471830985915493e-05,
"loss": 0.9358,
"step": 297
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.3467797137689974,
"learning_rate": 3.464788732394366e-05,
"loss": 0.9811,
"step": 298
},
{
"epoch": 1.1328273244781784,
"grad_norm": 0.37963314518161295,
"learning_rate": 3.4577464788732395e-05,
"loss": 0.9404,
"step": 299
},
{
"epoch": 1.1366223908918407,
"grad_norm": 0.3779672716796801,
"learning_rate": 3.450704225352113e-05,
"loss": 0.9932,
"step": 300
},
{
"epoch": 1.1404174573055028,
"grad_norm": 0.37736891533578454,
"learning_rate": 3.443661971830986e-05,
"loss": 0.9731,
"step": 301
},
{
"epoch": 1.1442125237191652,
"grad_norm": 0.3820154184169849,
"learning_rate": 3.4366197183098594e-05,
"loss": 0.916,
"step": 302
},
{
"epoch": 1.1480075901328273,
"grad_norm": 0.39503352041311773,
"learning_rate": 3.429577464788732e-05,
"loss": 0.9577,
"step": 303
},
{
"epoch": 1.1518026565464896,
"grad_norm": 0.43572023161676426,
"learning_rate": 3.422535211267606e-05,
"loss": 0.9414,
"step": 304
},
{
"epoch": 1.1555977229601517,
"grad_norm": 0.3854011025242472,
"learning_rate": 3.4154929577464786e-05,
"loss": 0.9492,
"step": 305
},
{
"epoch": 1.159392789373814,
"grad_norm": 0.5698304299806217,
"learning_rate": 3.408450704225352e-05,
"loss": 0.9524,
"step": 306
},
{
"epoch": 1.1631878557874762,
"grad_norm": 0.5791491138865479,
"learning_rate": 3.401408450704226e-05,
"loss": 0.9901,
"step": 307
},
{
"epoch": 1.1669829222011385,
"grad_norm": 0.34656634207610254,
"learning_rate": 3.3943661971830985e-05,
"loss": 0.9648,
"step": 308
},
{
"epoch": 1.1707779886148009,
"grad_norm": 0.44960058565014205,
"learning_rate": 3.387323943661972e-05,
"loss": 0.9512,
"step": 309
},
{
"epoch": 1.174573055028463,
"grad_norm": 0.4213564138576865,
"learning_rate": 3.380281690140845e-05,
"loss": 0.9467,
"step": 310
},
{
"epoch": 1.1783681214421253,
"grad_norm": 0.3870339615488126,
"learning_rate": 3.3732394366197184e-05,
"loss": 0.9675,
"step": 311
},
{
"epoch": 1.1821631878557874,
"grad_norm": 0.42045038491207265,
"learning_rate": 3.366197183098592e-05,
"loss": 0.9627,
"step": 312
},
{
"epoch": 1.1859582542694498,
"grad_norm": 0.37861966247180495,
"learning_rate": 3.359154929577465e-05,
"loss": 0.9533,
"step": 313
},
{
"epoch": 1.189753320683112,
"grad_norm": 0.37131387100078783,
"learning_rate": 3.352112676056338e-05,
"loss": 0.9542,
"step": 314
},
{
"epoch": 1.1935483870967742,
"grad_norm": 0.3866294420305364,
"learning_rate": 3.345070422535211e-05,
"loss": 0.9379,
"step": 315
},
{
"epoch": 1.1973434535104364,
"grad_norm": 0.3528178303596138,
"learning_rate": 3.338028169014085e-05,
"loss": 0.9583,
"step": 316
},
{
"epoch": 1.2011385199240987,
"grad_norm": 0.33850480915152176,
"learning_rate": 3.330985915492958e-05,
"loss": 0.9866,
"step": 317
},
{
"epoch": 1.2049335863377608,
"grad_norm": 0.37253626756602154,
"learning_rate": 3.323943661971831e-05,
"loss": 0.9613,
"step": 318
},
{
"epoch": 1.2087286527514232,
"grad_norm": 0.3624680561310595,
"learning_rate": 3.3169014084507046e-05,
"loss": 0.9347,
"step": 319
},
{
"epoch": 1.2125237191650853,
"grad_norm": 0.3977319549681773,
"learning_rate": 3.3098591549295775e-05,
"loss": 0.954,
"step": 320
},
{
"epoch": 1.2163187855787476,
"grad_norm": 0.34889400353654876,
"learning_rate": 3.302816901408451e-05,
"loss": 0.9618,
"step": 321
},
{
"epoch": 1.22011385199241,
"grad_norm": 0.4012526817615114,
"learning_rate": 3.295774647887324e-05,
"loss": 0.9672,
"step": 322
},
{
"epoch": 1.223908918406072,
"grad_norm": 0.34618980189739385,
"learning_rate": 3.2887323943661974e-05,
"loss": 0.93,
"step": 323
},
{
"epoch": 1.2277039848197344,
"grad_norm": 0.40142550331017823,
"learning_rate": 3.281690140845071e-05,
"loss": 0.9992,
"step": 324
},
{
"epoch": 1.2314990512333965,
"grad_norm": 0.34430457568887357,
"learning_rate": 3.274647887323944e-05,
"loss": 0.9435,
"step": 325
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.43075788035644597,
"learning_rate": 3.267605633802817e-05,
"loss": 0.9404,
"step": 326
},
{
"epoch": 1.239089184060721,
"grad_norm": 0.3767377401531525,
"learning_rate": 3.26056338028169e-05,
"loss": 0.9663,
"step": 327
},
{
"epoch": 1.2428842504743833,
"grad_norm": 0.3898115314724779,
"learning_rate": 3.2535211267605636e-05,
"loss": 0.9026,
"step": 328
},
{
"epoch": 1.2466793168880455,
"grad_norm": 0.40617176565855967,
"learning_rate": 3.246478873239437e-05,
"loss": 0.8905,
"step": 329
},
{
"epoch": 1.2504743833017078,
"grad_norm": 0.3894155611306649,
"learning_rate": 3.23943661971831e-05,
"loss": 0.8758,
"step": 330
},
{
"epoch": 1.2542694497153701,
"grad_norm": 0.3914100377457853,
"learning_rate": 3.2323943661971835e-05,
"loss": 0.9687,
"step": 331
},
{
"epoch": 1.2580645161290323,
"grad_norm": 0.3552785040525562,
"learning_rate": 3.2253521126760564e-05,
"loss": 0.9237,
"step": 332
},
{
"epoch": 1.2618595825426944,
"grad_norm": 0.3651138990245735,
"learning_rate": 3.21830985915493e-05,
"loss": 0.9243,
"step": 333
},
{
"epoch": 1.2656546489563567,
"grad_norm": 0.4295068393047224,
"learning_rate": 3.2112676056338034e-05,
"loss": 0.8723,
"step": 334
},
{
"epoch": 1.269449715370019,
"grad_norm": 0.3541237256684971,
"learning_rate": 3.204225352112676e-05,
"loss": 0.9428,
"step": 335
},
{
"epoch": 1.2732447817836812,
"grad_norm": 0.37364182808153207,
"learning_rate": 3.19718309859155e-05,
"loss": 0.903,
"step": 336
},
{
"epoch": 1.2770398481973435,
"grad_norm": 0.3818669370399161,
"learning_rate": 3.1901408450704227e-05,
"loss": 0.9753,
"step": 337
},
{
"epoch": 1.2808349146110056,
"grad_norm": 0.41367984259446006,
"learning_rate": 3.183098591549296e-05,
"loss": 0.8779,
"step": 338
},
{
"epoch": 1.284629981024668,
"grad_norm": 0.3407928143150951,
"learning_rate": 3.17605633802817e-05,
"loss": 0.8482,
"step": 339
},
{
"epoch": 1.2884250474383303,
"grad_norm": 0.38362921065411254,
"learning_rate": 3.1690140845070426e-05,
"loss": 0.8964,
"step": 340
},
{
"epoch": 1.2922201138519924,
"grad_norm": 0.35890176064207385,
"learning_rate": 3.161971830985916e-05,
"loss": 0.9427,
"step": 341
},
{
"epoch": 1.2960151802656545,
"grad_norm": 0.4962234868355443,
"learning_rate": 3.154929577464789e-05,
"loss": 0.9165,
"step": 342
},
{
"epoch": 1.2998102466793169,
"grad_norm": 0.36584780868632094,
"learning_rate": 3.1478873239436625e-05,
"loss": 0.899,
"step": 343
},
{
"epoch": 1.3036053130929792,
"grad_norm": 4.751224586247482,
"learning_rate": 3.140845070422535e-05,
"loss": 0.912,
"step": 344
},
{
"epoch": 1.3074003795066413,
"grad_norm": 0.44198483600504035,
"learning_rate": 3.133802816901409e-05,
"loss": 0.9117,
"step": 345
},
{
"epoch": 1.3111954459203037,
"grad_norm": 0.35403629757009547,
"learning_rate": 3.1267605633802824e-05,
"loss": 0.8442,
"step": 346
},
{
"epoch": 1.3149905123339658,
"grad_norm": 0.4611271158658786,
"learning_rate": 3.1197183098591545e-05,
"loss": 0.9209,
"step": 347
},
{
"epoch": 1.3187855787476281,
"grad_norm": 0.39696035823749076,
"learning_rate": 3.112676056338028e-05,
"loss": 0.9007,
"step": 348
},
{
"epoch": 1.3225806451612903,
"grad_norm": 0.4382018188365014,
"learning_rate": 3.1056338028169016e-05,
"loss": 0.8897,
"step": 349
},
{
"epoch": 1.3263757115749526,
"grad_norm": 0.3881963973980307,
"learning_rate": 3.0985915492957744e-05,
"loss": 0.9211,
"step": 350
},
{
"epoch": 1.3301707779886147,
"grad_norm": 0.43443810043419534,
"learning_rate": 3.091549295774648e-05,
"loss": 0.8199,
"step": 351
},
{
"epoch": 1.333965844402277,
"grad_norm": 0.3777996090052327,
"learning_rate": 3.084507042253521e-05,
"loss": 0.9002,
"step": 352
},
{
"epoch": 1.3377609108159394,
"grad_norm": 0.43790501118392844,
"learning_rate": 3.077464788732394e-05,
"loss": 0.8511,
"step": 353
},
{
"epoch": 1.3415559772296015,
"grad_norm": 0.38576989869810147,
"learning_rate": 3.070422535211268e-05,
"loss": 0.8804,
"step": 354
},
{
"epoch": 1.3453510436432636,
"grad_norm": 0.4198404322112285,
"learning_rate": 3.063380281690141e-05,
"loss": 0.8429,
"step": 355
},
{
"epoch": 1.349146110056926,
"grad_norm": 0.4064331117977174,
"learning_rate": 3.056338028169014e-05,
"loss": 0.8801,
"step": 356
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.3888758253187192,
"learning_rate": 3.0492957746478874e-05,
"loss": 0.8412,
"step": 357
},
{
"epoch": 1.3567362428842504,
"grad_norm": 0.4280383216672898,
"learning_rate": 3.0422535211267606e-05,
"loss": 0.8773,
"step": 358
},
{
"epoch": 1.3605313092979128,
"grad_norm": 0.38613140380110794,
"learning_rate": 3.0352112676056338e-05,
"loss": 0.8744,
"step": 359
},
{
"epoch": 1.364326375711575,
"grad_norm": 0.3898063403199866,
"learning_rate": 3.028169014084507e-05,
"loss": 0.915,
"step": 360
},
{
"epoch": 1.3681214421252372,
"grad_norm": 0.3764318877156931,
"learning_rate": 3.0211267605633802e-05,
"loss": 0.8608,
"step": 361
},
{
"epoch": 1.3719165085388993,
"grad_norm": 0.4122586182852323,
"learning_rate": 3.0140845070422537e-05,
"loss": 0.8701,
"step": 362
},
{
"epoch": 1.3757115749525617,
"grad_norm": 0.3620236893184111,
"learning_rate": 3.007042253521127e-05,
"loss": 0.8942,
"step": 363
},
{
"epoch": 1.3795066413662238,
"grad_norm": 0.37926123680056173,
"learning_rate": 3e-05,
"loss": 0.8891,
"step": 364
},
{
"epoch": 1.3833017077798861,
"grad_norm": 0.343923168265348,
"learning_rate": 2.9929577464788733e-05,
"loss": 0.8446,
"step": 365
},
{
"epoch": 1.3870967741935485,
"grad_norm": 0.3679170824951201,
"learning_rate": 2.9859154929577465e-05,
"loss": 0.8802,
"step": 366
},
{
"epoch": 1.3908918406072106,
"grad_norm": 0.38734992394160833,
"learning_rate": 2.9788732394366196e-05,
"loss": 0.8466,
"step": 367
},
{
"epoch": 1.394686907020873,
"grad_norm": 0.34092253238421727,
"learning_rate": 2.971830985915493e-05,
"loss": 0.8519,
"step": 368
},
{
"epoch": 1.398481973434535,
"grad_norm": 0.34958674468818207,
"learning_rate": 2.9647887323943664e-05,
"loss": 0.8737,
"step": 369
},
{
"epoch": 1.4022770398481974,
"grad_norm": 0.35032341032710373,
"learning_rate": 2.9577464788732395e-05,
"loss": 0.8382,
"step": 370
},
{
"epoch": 1.4060721062618595,
"grad_norm": 0.3494922383005997,
"learning_rate": 2.9507042253521127e-05,
"loss": 0.8982,
"step": 371
},
{
"epoch": 1.4098671726755219,
"grad_norm": 0.38739759770519333,
"learning_rate": 2.943661971830986e-05,
"loss": 0.8243,
"step": 372
},
{
"epoch": 1.413662239089184,
"grad_norm": 0.3573923007413701,
"learning_rate": 2.936619718309859e-05,
"loss": 0.8578,
"step": 373
},
{
"epoch": 1.4174573055028463,
"grad_norm": 0.39297966441814697,
"learning_rate": 2.9295774647887326e-05,
"loss": 0.8513,
"step": 374
},
{
"epoch": 1.4212523719165087,
"grad_norm": 0.38232950338663857,
"learning_rate": 2.9225352112676058e-05,
"loss": 0.8286,
"step": 375
},
{
"epoch": 1.4250474383301708,
"grad_norm": 0.3523948806353041,
"learning_rate": 2.915492957746479e-05,
"loss": 0.8635,
"step": 376
},
{
"epoch": 1.428842504743833,
"grad_norm": 0.3591608755558029,
"learning_rate": 2.9084507042253522e-05,
"loss": 0.8888,
"step": 377
},
{
"epoch": 1.4326375711574952,
"grad_norm": 0.35900201904920037,
"learning_rate": 2.9014084507042254e-05,
"loss": 0.8231,
"step": 378
},
{
"epoch": 1.4364326375711576,
"grad_norm": 0.35795912144996667,
"learning_rate": 2.894366197183099e-05,
"loss": 0.8255,
"step": 379
},
{
"epoch": 1.4402277039848197,
"grad_norm": 0.3797358675127462,
"learning_rate": 2.887323943661972e-05,
"loss": 0.8503,
"step": 380
},
{
"epoch": 1.444022770398482,
"grad_norm": 0.3552169788598724,
"learning_rate": 2.8802816901408453e-05,
"loss": 0.8564,
"step": 381
},
{
"epoch": 1.4478178368121442,
"grad_norm": 0.37376581791180713,
"learning_rate": 2.8732394366197185e-05,
"loss": 0.8377,
"step": 382
},
{
"epoch": 1.4516129032258065,
"grad_norm": 0.7502992104110956,
"learning_rate": 2.8661971830985917e-05,
"loss": 0.8279,
"step": 383
},
{
"epoch": 1.4554079696394686,
"grad_norm": 0.3768304128911496,
"learning_rate": 2.859154929577465e-05,
"loss": 0.8897,
"step": 384
},
{
"epoch": 1.459203036053131,
"grad_norm": 0.39205851069847414,
"learning_rate": 2.8521126760563384e-05,
"loss": 0.8674,
"step": 385
},
{
"epoch": 1.462998102466793,
"grad_norm": 0.3714063080148733,
"learning_rate": 2.8450704225352116e-05,
"loss": 0.8697,
"step": 386
},
{
"epoch": 1.4667931688804554,
"grad_norm": 0.3616627354346947,
"learning_rate": 2.8380281690140847e-05,
"loss": 0.893,
"step": 387
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.4642255185584646,
"learning_rate": 2.830985915492958e-05,
"loss": 0.8762,
"step": 388
},
{
"epoch": 1.4743833017077799,
"grad_norm": 0.38985832642032053,
"learning_rate": 2.823943661971831e-05,
"loss": 0.8207,
"step": 389
},
{
"epoch": 1.478178368121442,
"grad_norm": 0.359289150282377,
"learning_rate": 2.8169014084507046e-05,
"loss": 0.8318,
"step": 390
},
{
"epoch": 1.4819734345351043,
"grad_norm": 0.40051356251030884,
"learning_rate": 2.809859154929578e-05,
"loss": 0.8547,
"step": 391
},
{
"epoch": 1.4857685009487667,
"grad_norm": 0.35588585359954095,
"learning_rate": 2.802816901408451e-05,
"loss": 0.8382,
"step": 392
},
{
"epoch": 1.4895635673624288,
"grad_norm": 0.4362970351234456,
"learning_rate": 2.7957746478873242e-05,
"loss": 0.8512,
"step": 393
},
{
"epoch": 1.4933586337760911,
"grad_norm": 0.39780404209661613,
"learning_rate": 2.7887323943661974e-05,
"loss": 0.8566,
"step": 394
},
{
"epoch": 1.4971537001897532,
"grad_norm": 0.3543006643738593,
"learning_rate": 2.7816901408450706e-05,
"loss": 0.8164,
"step": 395
},
{
"epoch": 1.5009487666034156,
"grad_norm": 0.3624209522643872,
"learning_rate": 2.774647887323944e-05,
"loss": 0.8412,
"step": 396
},
{
"epoch": 1.504743833017078,
"grad_norm": 0.38255823213172885,
"learning_rate": 2.7676056338028173e-05,
"loss": 0.8391,
"step": 397
},
{
"epoch": 1.50853889943074,
"grad_norm": 0.37534346845104655,
"learning_rate": 2.7605633802816905e-05,
"loss": 0.8284,
"step": 398
},
{
"epoch": 1.5123339658444022,
"grad_norm": 0.35308102770879485,
"learning_rate": 2.7535211267605637e-05,
"loss": 0.8407,
"step": 399
},
{
"epoch": 1.5161290322580645,
"grad_norm": 0.38424095604350006,
"learning_rate": 2.746478873239437e-05,
"loss": 0.8666,
"step": 400
},
{
"epoch": 1.5199240986717268,
"grad_norm": 0.34207797831391507,
"learning_rate": 2.7394366197183104e-05,
"loss": 0.7864,
"step": 401
},
{
"epoch": 1.523719165085389,
"grad_norm": 0.3363348768898608,
"learning_rate": 2.7323943661971836e-05,
"loss": 0.8201,
"step": 402
},
{
"epoch": 1.527514231499051,
"grad_norm": 0.34968288001827885,
"learning_rate": 2.7253521126760568e-05,
"loss": 0.8231,
"step": 403
},
{
"epoch": 1.5313092979127134,
"grad_norm": 0.3466545025341771,
"learning_rate": 2.71830985915493e-05,
"loss": 0.8041,
"step": 404
},
{
"epoch": 1.5351043643263758,
"grad_norm": 0.33715888607483385,
"learning_rate": 2.711267605633803e-05,
"loss": 0.8213,
"step": 405
},
{
"epoch": 1.538899430740038,
"grad_norm": 7.725075467931952,
"learning_rate": 2.704225352112676e-05,
"loss": 1.1218,
"step": 406
},
{
"epoch": 1.5426944971537002,
"grad_norm": 0.45719932835143645,
"learning_rate": 2.6971830985915492e-05,
"loss": 0.8355,
"step": 407
},
{
"epoch": 1.5464895635673623,
"grad_norm": 0.3922243740769575,
"learning_rate": 2.6901408450704224e-05,
"loss": 0.8396,
"step": 408
},
{
"epoch": 1.5502846299810247,
"grad_norm": 0.4164707458206949,
"learning_rate": 2.6830985915492955e-05,
"loss": 0.8048,
"step": 409
},
{
"epoch": 1.554079696394687,
"grad_norm": 0.4012701636461865,
"learning_rate": 2.676056338028169e-05,
"loss": 0.7864,
"step": 410
},
{
"epoch": 1.5578747628083491,
"grad_norm": 0.38009564915085836,
"learning_rate": 2.6690140845070423e-05,
"loss": 0.847,
"step": 411
},
{
"epoch": 1.5616698292220113,
"grad_norm": 0.37109962238832533,
"learning_rate": 2.6619718309859155e-05,
"loss": 0.8075,
"step": 412
},
{
"epoch": 1.5654648956356736,
"grad_norm": 0.37973799418935394,
"learning_rate": 2.6549295774647886e-05,
"loss": 0.7821,
"step": 413
},
{
"epoch": 1.569259962049336,
"grad_norm": 0.3838422068617137,
"learning_rate": 2.6478873239436618e-05,
"loss": 0.7786,
"step": 414
},
{
"epoch": 1.573055028462998,
"grad_norm": 0.36727060294619013,
"learning_rate": 2.640845070422535e-05,
"loss": 0.8254,
"step": 415
},
{
"epoch": 1.5768500948766604,
"grad_norm": 0.3452313240951604,
"learning_rate": 2.6338028169014085e-05,
"loss": 0.8032,
"step": 416
},
{
"epoch": 1.5806451612903225,
"grad_norm": 0.36311633672078497,
"learning_rate": 2.6267605633802817e-05,
"loss": 0.8468,
"step": 417
},
{
"epoch": 1.5844402277039848,
"grad_norm": 0.3387602472022874,
"learning_rate": 2.619718309859155e-05,
"loss": 0.7776,
"step": 418
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.3595348416992687,
"learning_rate": 2.612676056338028e-05,
"loss": 0.8514,
"step": 419
},
{
"epoch": 1.5920303605313093,
"grad_norm": 0.35596653835598024,
"learning_rate": 2.6056338028169013e-05,
"loss": 0.817,
"step": 420
},
{
"epoch": 1.5958254269449714,
"grad_norm": 0.36506341667069697,
"learning_rate": 2.5985915492957745e-05,
"loss": 0.8395,
"step": 421
},
{
"epoch": 1.5996204933586338,
"grad_norm": 0.36415929569164257,
"learning_rate": 2.591549295774648e-05,
"loss": 0.8515,
"step": 422
},
{
"epoch": 1.603415559772296,
"grad_norm": 0.5892710040323252,
"learning_rate": 2.5845070422535212e-05,
"loss": 0.7888,
"step": 423
},
{
"epoch": 1.6072106261859582,
"grad_norm": 0.35500763234160915,
"learning_rate": 2.5774647887323944e-05,
"loss": 0.7894,
"step": 424
},
{
"epoch": 1.6110056925996203,
"grad_norm": 0.34084121947508983,
"learning_rate": 2.5704225352112676e-05,
"loss": 0.8086,
"step": 425
},
{
"epoch": 1.6148007590132827,
"grad_norm": 0.35106826844667405,
"learning_rate": 2.5633802816901408e-05,
"loss": 0.7703,
"step": 426
},
{
"epoch": 1.618595825426945,
"grad_norm": 0.36175688801736494,
"learning_rate": 2.5563380281690143e-05,
"loss": 0.7622,
"step": 427
},
{
"epoch": 1.6223908918406074,
"grad_norm": 0.35842239970167306,
"learning_rate": 2.5492957746478875e-05,
"loss": 0.792,
"step": 428
},
{
"epoch": 1.6261859582542695,
"grad_norm": 0.35826103263060743,
"learning_rate": 2.5422535211267607e-05,
"loss": 0.8215,
"step": 429
},
{
"epoch": 1.6299810246679316,
"grad_norm": 0.38772107106461334,
"learning_rate": 2.535211267605634e-05,
"loss": 0.8115,
"step": 430
},
{
"epoch": 1.633776091081594,
"grad_norm": 0.3458138140632706,
"learning_rate": 2.528169014084507e-05,
"loss": 0.8029,
"step": 431
},
{
"epoch": 1.6375711574952563,
"grad_norm": 0.4224145146637147,
"learning_rate": 2.5211267605633802e-05,
"loss": 0.8306,
"step": 432
},
{
"epoch": 1.6413662239089184,
"grad_norm": 0.5117068275141828,
"learning_rate": 2.5140845070422537e-05,
"loss": 0.7577,
"step": 433
},
{
"epoch": 1.6451612903225805,
"grad_norm": 0.3473854275418071,
"learning_rate": 2.507042253521127e-05,
"loss": 0.7662,
"step": 434
},
{
"epoch": 1.6489563567362429,
"grad_norm": 0.3555181336404861,
"learning_rate": 2.5e-05,
"loss": 0.804,
"step": 435
},
{
"epoch": 1.6527514231499052,
"grad_norm": 0.3500712376111828,
"learning_rate": 2.4929577464788733e-05,
"loss": 0.7417,
"step": 436
},
{
"epoch": 1.6565464895635673,
"grad_norm": 0.3505056746424919,
"learning_rate": 2.4859154929577465e-05,
"loss": 0.7944,
"step": 437
},
{
"epoch": 1.6603415559772297,
"grad_norm": 0.36784343958896565,
"learning_rate": 2.47887323943662e-05,
"loss": 0.7961,
"step": 438
},
{
"epoch": 1.6641366223908918,
"grad_norm": 0.3669774598271857,
"learning_rate": 2.4718309859154932e-05,
"loss": 0.7567,
"step": 439
},
{
"epoch": 1.6679316888045541,
"grad_norm": 0.34489087586672096,
"learning_rate": 2.4647887323943664e-05,
"loss": 0.766,
"step": 440
},
{
"epoch": 1.6717267552182165,
"grad_norm": 0.36279510050446423,
"learning_rate": 2.4577464788732396e-05,
"loss": 0.7672,
"step": 441
},
{
"epoch": 1.6755218216318786,
"grad_norm": 0.34275090318643775,
"learning_rate": 2.4507042253521128e-05,
"loss": 0.7948,
"step": 442
},
{
"epoch": 1.6793168880455407,
"grad_norm": 0.378065543146473,
"learning_rate": 2.443661971830986e-05,
"loss": 0.8153,
"step": 443
},
{
"epoch": 1.683111954459203,
"grad_norm": 1.791318649963188,
"learning_rate": 2.4366197183098595e-05,
"loss": 0.8117,
"step": 444
},
{
"epoch": 1.6869070208728654,
"grad_norm": 0.36800100945563463,
"learning_rate": 2.4295774647887327e-05,
"loss": 0.7851,
"step": 445
},
{
"epoch": 1.6907020872865275,
"grad_norm": 0.3373377072086416,
"learning_rate": 2.422535211267606e-05,
"loss": 0.7551,
"step": 446
},
{
"epoch": 1.6944971537001896,
"grad_norm": 0.35869489687151435,
"learning_rate": 2.415492957746479e-05,
"loss": 0.788,
"step": 447
},
{
"epoch": 1.698292220113852,
"grad_norm": 0.341668326755974,
"learning_rate": 2.4084507042253522e-05,
"loss": 0.7612,
"step": 448
},
{
"epoch": 1.7020872865275143,
"grad_norm": 0.3271619623041308,
"learning_rate": 2.4014084507042258e-05,
"loss": 0.7992,
"step": 449
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.33883580323107215,
"learning_rate": 2.3943661971830986e-05,
"loss": 0.7728,
"step": 450
},
{
"epoch": 1.7096774193548387,
"grad_norm": 0.34974390118247833,
"learning_rate": 2.3873239436619718e-05,
"loss": 0.7915,
"step": 451
},
{
"epoch": 1.7134724857685009,
"grad_norm": 0.40308370945557404,
"learning_rate": 2.380281690140845e-05,
"loss": 0.7914,
"step": 452
},
{
"epoch": 1.7172675521821632,
"grad_norm": 0.34743389276515596,
"learning_rate": 2.3732394366197182e-05,
"loss": 0.7854,
"step": 453
},
{
"epoch": 1.7210626185958255,
"grad_norm": 0.31840011179428723,
"learning_rate": 2.3661971830985917e-05,
"loss": 0.7484,
"step": 454
},
{
"epoch": 1.7248576850094877,
"grad_norm": 2.425668243341061,
"learning_rate": 2.359154929577465e-05,
"loss": 0.7962,
"step": 455
},
{
"epoch": 1.7286527514231498,
"grad_norm": 0.37036801347675136,
"learning_rate": 2.352112676056338e-05,
"loss": 0.7617,
"step": 456
},
{
"epoch": 1.7324478178368121,
"grad_norm": 0.3150918805937071,
"learning_rate": 2.3450704225352113e-05,
"loss": 0.7516,
"step": 457
},
{
"epoch": 1.7362428842504745,
"grad_norm": 0.33496987185753024,
"learning_rate": 2.3380281690140845e-05,
"loss": 0.7849,
"step": 458
},
{
"epoch": 1.7400379506641366,
"grad_norm": 0.37385461131475584,
"learning_rate": 2.3309859154929576e-05,
"loss": 0.7559,
"step": 459
},
{
"epoch": 1.7438330170777987,
"grad_norm": 0.3356794391190202,
"learning_rate": 2.323943661971831e-05,
"loss": 0.7827,
"step": 460
},
{
"epoch": 1.747628083491461,
"grad_norm": 0.344979221934888,
"learning_rate": 2.3169014084507044e-05,
"loss": 0.7786,
"step": 461
},
{
"epoch": 1.7514231499051234,
"grad_norm": 4.1271301929621655,
"learning_rate": 2.3098591549295775e-05,
"loss": 0.7438,
"step": 462
},
{
"epoch": 1.7552182163187857,
"grad_norm": 0.3659335714148735,
"learning_rate": 2.3028169014084507e-05,
"loss": 0.7623,
"step": 463
},
{
"epoch": 1.7590132827324478,
"grad_norm": 0.34522620431290213,
"learning_rate": 2.295774647887324e-05,
"loss": 0.765,
"step": 464
},
{
"epoch": 1.76280834914611,
"grad_norm": 3.4878321234909455,
"learning_rate": 2.2887323943661974e-05,
"loss": 0.7358,
"step": 465
},
{
"epoch": 1.7666034155597723,
"grad_norm": 0.36185912284794375,
"learning_rate": 2.2816901408450706e-05,
"loss": 0.7888,
"step": 466
},
{
"epoch": 1.7703984819734346,
"grad_norm": 0.3780766434104228,
"learning_rate": 2.2746478873239438e-05,
"loss": 0.7631,
"step": 467
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.37849286620609834,
"learning_rate": 2.267605633802817e-05,
"loss": 0.7979,
"step": 468
},
{
"epoch": 1.7779886148007589,
"grad_norm": 0.6292815862832937,
"learning_rate": 2.2605633802816902e-05,
"loss": 0.7614,
"step": 469
},
{
"epoch": 1.7817836812144212,
"grad_norm": 0.3670208814989711,
"learning_rate": 2.2535211267605634e-05,
"loss": 0.7699,
"step": 470
},
{
"epoch": 1.7855787476280836,
"grad_norm": 0.35241221593205657,
"learning_rate": 2.246478873239437e-05,
"loss": 0.73,
"step": 471
},
{
"epoch": 1.789373814041746,
"grad_norm": 0.371801668580609,
"learning_rate": 2.23943661971831e-05,
"loss": 0.7635,
"step": 472
},
{
"epoch": 1.793168880455408,
"grad_norm": 0.6171173841176137,
"learning_rate": 2.2323943661971833e-05,
"loss": 0.7761,
"step": 473
},
{
"epoch": 1.7969639468690701,
"grad_norm": 0.3753404514450965,
"learning_rate": 2.2253521126760565e-05,
"loss": 0.7317,
"step": 474
},
{
"epoch": 1.8007590132827325,
"grad_norm": 0.45912844350250365,
"learning_rate": 2.2183098591549297e-05,
"loss": 0.7553,
"step": 475
},
{
"epoch": 1.8045540796963948,
"grad_norm": 0.3555592548908027,
"learning_rate": 2.2112676056338032e-05,
"loss": 0.751,
"step": 476
},
{
"epoch": 1.808349146110057,
"grad_norm": 0.46308086167618195,
"learning_rate": 2.2042253521126764e-05,
"loss": 0.7567,
"step": 477
},
{
"epoch": 1.812144212523719,
"grad_norm": 0.403735950921561,
"learning_rate": 2.1971830985915496e-05,
"loss": 0.7224,
"step": 478
},
{
"epoch": 1.8159392789373814,
"grad_norm": 0.3519350136242228,
"learning_rate": 2.1901408450704227e-05,
"loss": 0.7513,
"step": 479
},
{
"epoch": 1.8197343453510437,
"grad_norm": 0.3714218234826955,
"learning_rate": 2.1830985915492956e-05,
"loss": 0.7276,
"step": 480
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.3533159687307933,
"learning_rate": 2.176056338028169e-05,
"loss": 0.7433,
"step": 481
},
{
"epoch": 1.827324478178368,
"grad_norm": 0.3439119981823242,
"learning_rate": 2.1690140845070423e-05,
"loss": 0.7653,
"step": 482
},
{
"epoch": 1.8311195445920303,
"grad_norm": 0.36328274596616844,
"learning_rate": 2.1619718309859155e-05,
"loss": 0.7657,
"step": 483
},
{
"epoch": 1.8349146110056926,
"grad_norm": 0.8421375737113024,
"learning_rate": 2.1549295774647887e-05,
"loss": 0.7213,
"step": 484
},
{
"epoch": 1.838709677419355,
"grad_norm": 0.3397192166233515,
"learning_rate": 2.147887323943662e-05,
"loss": 0.6979,
"step": 485
},
{
"epoch": 1.842504743833017,
"grad_norm": 0.33873829936623934,
"learning_rate": 2.1408450704225354e-05,
"loss": 0.7337,
"step": 486
},
{
"epoch": 1.8462998102466792,
"grad_norm": 0.3507589458777712,
"learning_rate": 2.1338028169014086e-05,
"loss": 0.7638,
"step": 487
},
{
"epoch": 1.8500948766603416,
"grad_norm": 0.34744611910247314,
"learning_rate": 2.1267605633802818e-05,
"loss": 0.7342,
"step": 488
},
{
"epoch": 1.853889943074004,
"grad_norm": 0.3313064090062832,
"learning_rate": 2.119718309859155e-05,
"loss": 0.7273,
"step": 489
},
{
"epoch": 1.857685009487666,
"grad_norm": 0.3443277472073694,
"learning_rate": 2.112676056338028e-05,
"loss": 0.7718,
"step": 490
},
{
"epoch": 1.8614800759013281,
"grad_norm": 0.32526922515793544,
"learning_rate": 2.1056338028169013e-05,
"loss": 0.7305,
"step": 491
},
{
"epoch": 1.8652751423149905,
"grad_norm": 0.3446338285118589,
"learning_rate": 2.098591549295775e-05,
"loss": 0.7147,
"step": 492
},
{
"epoch": 1.8690702087286528,
"grad_norm": 0.32823906688451404,
"learning_rate": 2.091549295774648e-05,
"loss": 0.7638,
"step": 493
},
{
"epoch": 1.8728652751423152,
"grad_norm": 0.3447373493532683,
"learning_rate": 2.0845070422535212e-05,
"loss": 0.7257,
"step": 494
},
{
"epoch": 1.8766603415559773,
"grad_norm": 0.3431152647470359,
"learning_rate": 2.0774647887323944e-05,
"loss": 0.7303,
"step": 495
},
{
"epoch": 1.8804554079696394,
"grad_norm": 0.3367205633690086,
"learning_rate": 2.0704225352112676e-05,
"loss": 0.7327,
"step": 496
},
{
"epoch": 1.8842504743833017,
"grad_norm": 0.3658436595344883,
"learning_rate": 2.063380281690141e-05,
"loss": 0.7413,
"step": 497
},
{
"epoch": 1.888045540796964,
"grad_norm": 0.33456795877937295,
"learning_rate": 2.0563380281690143e-05,
"loss": 0.738,
"step": 498
},
{
"epoch": 1.8918406072106262,
"grad_norm": 0.3398893013580835,
"learning_rate": 2.0492957746478875e-05,
"loss": 0.7213,
"step": 499
},
{
"epoch": 1.8956356736242883,
"grad_norm": 1.553115272832865,
"learning_rate": 2.0422535211267607e-05,
"loss": 0.6888,
"step": 500
},
{
"epoch": 1.8994307400379506,
"grad_norm": 0.5051907115623565,
"learning_rate": 2.035211267605634e-05,
"loss": 0.7275,
"step": 501
},
{
"epoch": 1.903225806451613,
"grad_norm": 0.34293191097531744,
"learning_rate": 2.028169014084507e-05,
"loss": 0.7638,
"step": 502
},
{
"epoch": 1.907020872865275,
"grad_norm": 0.3620972508496107,
"learning_rate": 2.0211267605633806e-05,
"loss": 0.7031,
"step": 503
},
{
"epoch": 1.9108159392789372,
"grad_norm": 0.34574094706911696,
"learning_rate": 2.0140845070422538e-05,
"loss": 0.757,
"step": 504
},
{
"epoch": 1.9146110056925996,
"grad_norm": 0.34430115765970903,
"learning_rate": 2.007042253521127e-05,
"loss": 0.7401,
"step": 505
},
{
"epoch": 1.918406072106262,
"grad_norm": 6.765553941090464,
"learning_rate": 2e-05,
"loss": 0.7114,
"step": 506
},
{
"epoch": 1.9222011385199242,
"grad_norm": 0.7045299827330415,
"learning_rate": 1.9929577464788734e-05,
"loss": 0.7447,
"step": 507
},
{
"epoch": 1.9259962049335864,
"grad_norm": 0.3516203173732409,
"learning_rate": 1.9859154929577465e-05,
"loss": 0.7468,
"step": 508
},
{
"epoch": 1.9297912713472485,
"grad_norm": 0.3824428144119352,
"learning_rate": 1.9788732394366197e-05,
"loss": 0.6964,
"step": 509
},
{
"epoch": 1.9335863377609108,
"grad_norm": 0.36234975042617634,
"learning_rate": 1.971830985915493e-05,
"loss": 0.7264,
"step": 510
},
{
"epoch": 1.9373814041745732,
"grad_norm": 0.3649714669931618,
"learning_rate": 1.964788732394366e-05,
"loss": 0.7247,
"step": 511
},
{
"epoch": 1.9411764705882353,
"grad_norm": 1.961504050122774,
"learning_rate": 1.9577464788732393e-05,
"loss": 0.7452,
"step": 512
},
{
"epoch": 1.9449715370018974,
"grad_norm": 0.3455700092311562,
"learning_rate": 1.9507042253521128e-05,
"loss": 0.7309,
"step": 513
},
{
"epoch": 1.9487666034155597,
"grad_norm": 17.563495820773788,
"learning_rate": 1.943661971830986e-05,
"loss": 0.6987,
"step": 514
},
{
"epoch": 1.952561669829222,
"grad_norm": 0.35090587817601493,
"learning_rate": 1.9366197183098592e-05,
"loss": 0.7054,
"step": 515
},
{
"epoch": 1.9563567362428842,
"grad_norm": 0.33688730696331154,
"learning_rate": 1.9295774647887324e-05,
"loss": 0.7209,
"step": 516
},
{
"epoch": 1.9601518026565465,
"grad_norm": 0.3524936384093124,
"learning_rate": 1.9225352112676056e-05,
"loss": 0.7327,
"step": 517
},
{
"epoch": 1.9639468690702087,
"grad_norm": 0.7086938585390536,
"learning_rate": 1.9154929577464788e-05,
"loss": 0.7309,
"step": 518
},
{
"epoch": 1.967741935483871,
"grad_norm": 0.34506511630966064,
"learning_rate": 1.9084507042253523e-05,
"loss": 0.7345,
"step": 519
},
{
"epoch": 1.9715370018975333,
"grad_norm": 0.35098770892564646,
"learning_rate": 1.9014084507042255e-05,
"loss": 0.7203,
"step": 520
},
{
"epoch": 1.9753320683111955,
"grad_norm": 0.32894312990338453,
"learning_rate": 1.8943661971830987e-05,
"loss": 0.7131,
"step": 521
},
{
"epoch": 1.9791271347248576,
"grad_norm": 0.33275918406714977,
"learning_rate": 1.887323943661972e-05,
"loss": 0.6732,
"step": 522
},
{
"epoch": 1.98292220113852,
"grad_norm": 0.33446659449449373,
"learning_rate": 1.880281690140845e-05,
"loss": 0.6737,
"step": 523
},
{
"epoch": 1.9867172675521823,
"grad_norm": 0.3177113192716395,
"learning_rate": 1.8732394366197186e-05,
"loss": 0.707,
"step": 524
},
{
"epoch": 1.9905123339658444,
"grad_norm": 0.3729299834607769,
"learning_rate": 1.8661971830985917e-05,
"loss": 0.7081,
"step": 525
},
{
"epoch": 1.9943074003795065,
"grad_norm": 0.3609269271396142,
"learning_rate": 1.859154929577465e-05,
"loss": 0.7614,
"step": 526
},
{
"epoch": 1.9981024667931688,
"grad_norm": 0.3299235777305554,
"learning_rate": 1.852112676056338e-05,
"loss": 0.6835,
"step": 527
},
{
"epoch": 2.0,
"grad_norm": 0.4996345129749359,
"learning_rate": 1.8450704225352113e-05,
"loss": 0.6427,
"step": 528
},
{
"epoch": 2.0037950664136623,
"grad_norm": 0.45662831595351755,
"learning_rate": 1.8380281690140845e-05,
"loss": 0.5582,
"step": 529
},
{
"epoch": 2.0075901328273247,
"grad_norm": 0.3373754796983528,
"learning_rate": 1.830985915492958e-05,
"loss": 0.554,
"step": 530
},
{
"epoch": 2.0113851992409866,
"grad_norm": 0.3999235102111141,
"learning_rate": 1.8239436619718312e-05,
"loss": 0.5648,
"step": 531
},
{
"epoch": 2.015180265654649,
"grad_norm": 0.4957060228927706,
"learning_rate": 1.8169014084507044e-05,
"loss": 0.5678,
"step": 532
},
{
"epoch": 2.0189753320683113,
"grad_norm": 0.366026829447788,
"learning_rate": 1.8098591549295776e-05,
"loss": 0.572,
"step": 533
},
{
"epoch": 2.0227703984819736,
"grad_norm": 0.3730867075117646,
"learning_rate": 1.8028169014084508e-05,
"loss": 0.5579,
"step": 534
},
{
"epoch": 2.0265654648956355,
"grad_norm": 0.38737988658749095,
"learning_rate": 1.7957746478873243e-05,
"loss": 0.5444,
"step": 535
},
{
"epoch": 2.030360531309298,
"grad_norm": 0.3873142708390432,
"learning_rate": 1.7887323943661975e-05,
"loss": 0.5602,
"step": 536
},
{
"epoch": 2.03415559772296,
"grad_norm": 0.3577156330375341,
"learning_rate": 1.7816901408450707e-05,
"loss": 0.5472,
"step": 537
},
{
"epoch": 2.0379506641366225,
"grad_norm": 0.37959460923631255,
"learning_rate": 1.774647887323944e-05,
"loss": 0.5628,
"step": 538
},
{
"epoch": 2.041745730550285,
"grad_norm": 0.3701196590623353,
"learning_rate": 1.7676056338028167e-05,
"loss": 0.5541,
"step": 539
},
{
"epoch": 2.0455407969639468,
"grad_norm": 0.3198733292592515,
"learning_rate": 1.7605633802816902e-05,
"loss": 0.5688,
"step": 540
},
{
"epoch": 2.049335863377609,
"grad_norm": 0.34272079350589185,
"learning_rate": 1.7535211267605634e-05,
"loss": 0.5586,
"step": 541
},
{
"epoch": 2.0531309297912714,
"grad_norm": 0.33190146117311037,
"learning_rate": 1.7464788732394366e-05,
"loss": 0.5546,
"step": 542
},
{
"epoch": 2.0569259962049338,
"grad_norm": 0.32168314158039846,
"learning_rate": 1.7394366197183098e-05,
"loss": 0.5422,
"step": 543
},
{
"epoch": 2.0607210626185957,
"grad_norm": 0.30199071229974844,
"learning_rate": 1.732394366197183e-05,
"loss": 0.5426,
"step": 544
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.32431671372423404,
"learning_rate": 1.7253521126760565e-05,
"loss": 0.5471,
"step": 545
},
{
"epoch": 2.0683111954459203,
"grad_norm": 0.3423821310623563,
"learning_rate": 1.7183098591549297e-05,
"loss": 0.5497,
"step": 546
},
{
"epoch": 2.0721062618595827,
"grad_norm": 0.33860248270199556,
"learning_rate": 1.711267605633803e-05,
"loss": 0.5579,
"step": 547
},
{
"epoch": 2.0759013282732446,
"grad_norm": 0.30209368374322493,
"learning_rate": 1.704225352112676e-05,
"loss": 0.5473,
"step": 548
},
{
"epoch": 2.079696394686907,
"grad_norm": 0.3299973931539092,
"learning_rate": 1.6971830985915493e-05,
"loss": 0.5491,
"step": 549
},
{
"epoch": 2.0834914611005693,
"grad_norm": 0.3378384780122214,
"learning_rate": 1.6901408450704224e-05,
"loss": 0.544,
"step": 550
},
{
"epoch": 2.0872865275142316,
"grad_norm": 0.3138639173352364,
"learning_rate": 1.683098591549296e-05,
"loss": 0.5661,
"step": 551
},
{
"epoch": 2.091081593927894,
"grad_norm": 0.3112803456849732,
"learning_rate": 1.676056338028169e-05,
"loss": 0.5468,
"step": 552
},
{
"epoch": 2.094876660341556,
"grad_norm": 0.3101756824425779,
"learning_rate": 1.6690140845070424e-05,
"loss": 0.5458,
"step": 553
},
{
"epoch": 2.098671726755218,
"grad_norm": 0.3041901167865272,
"learning_rate": 1.6619718309859155e-05,
"loss": 0.5415,
"step": 554
},
{
"epoch": 2.1024667931688805,
"grad_norm": 0.33835981422416234,
"learning_rate": 1.6549295774647887e-05,
"loss": 0.5727,
"step": 555
},
{
"epoch": 2.106261859582543,
"grad_norm": 0.29344503296785507,
"learning_rate": 1.647887323943662e-05,
"loss": 0.5766,
"step": 556
},
{
"epoch": 2.1100569259962048,
"grad_norm": 0.30503345553542305,
"learning_rate": 1.6408450704225354e-05,
"loss": 0.5762,
"step": 557
},
{
"epoch": 2.113851992409867,
"grad_norm": 0.3267699844381843,
"learning_rate": 1.6338028169014086e-05,
"loss": 0.555,
"step": 558
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.2877971521091349,
"learning_rate": 1.6267605633802818e-05,
"loss": 0.5342,
"step": 559
},
{
"epoch": 2.121442125237192,
"grad_norm": 0.290664109191175,
"learning_rate": 1.619718309859155e-05,
"loss": 0.5393,
"step": 560
},
{
"epoch": 2.1252371916508537,
"grad_norm": 0.2984274699047115,
"learning_rate": 1.6126760563380282e-05,
"loss": 0.5391,
"step": 561
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.295058161849283,
"learning_rate": 1.6056338028169017e-05,
"loss": 0.5298,
"step": 562
},
{
"epoch": 2.1328273244781784,
"grad_norm": 0.3043696014528808,
"learning_rate": 1.598591549295775e-05,
"loss": 0.5405,
"step": 563
},
{
"epoch": 2.1366223908918407,
"grad_norm": 3.573999992110902,
"learning_rate": 1.591549295774648e-05,
"loss": 0.563,
"step": 564
},
{
"epoch": 2.140417457305503,
"grad_norm": 0.5209513276404492,
"learning_rate": 1.5845070422535213e-05,
"loss": 0.5426,
"step": 565
},
{
"epoch": 2.144212523719165,
"grad_norm": 0.29287662180360463,
"learning_rate": 1.5774647887323945e-05,
"loss": 0.5416,
"step": 566
},
{
"epoch": 2.1480075901328273,
"grad_norm": 0.28979637822603127,
"learning_rate": 1.5704225352112677e-05,
"loss": 0.5377,
"step": 567
},
{
"epoch": 2.1518026565464896,
"grad_norm": 0.28998636243856923,
"learning_rate": 1.5633802816901412e-05,
"loss": 0.5384,
"step": 568
},
{
"epoch": 2.155597722960152,
"grad_norm": 0.3017037417549146,
"learning_rate": 1.556338028169014e-05,
"loss": 0.5507,
"step": 569
},
{
"epoch": 2.159392789373814,
"grad_norm": 0.2986286910490102,
"learning_rate": 1.5492957746478872e-05,
"loss": 0.5451,
"step": 570
},
{
"epoch": 2.163187855787476,
"grad_norm": 0.2856093743247544,
"learning_rate": 1.5422535211267604e-05,
"loss": 0.5483,
"step": 571
},
{
"epoch": 2.1669829222011385,
"grad_norm": 0.2697989551795859,
"learning_rate": 1.535211267605634e-05,
"loss": 0.5236,
"step": 572
},
{
"epoch": 2.170777988614801,
"grad_norm": 0.2989596771393371,
"learning_rate": 1.528169014084507e-05,
"loss": 0.531,
"step": 573
},
{
"epoch": 2.174573055028463,
"grad_norm": 0.2788047145142846,
"learning_rate": 1.5211267605633803e-05,
"loss": 0.5316,
"step": 574
},
{
"epoch": 2.178368121442125,
"grad_norm": 0.31605366948483243,
"learning_rate": 1.5140845070422535e-05,
"loss": 0.5202,
"step": 575
},
{
"epoch": 2.1821631878557874,
"grad_norm": 0.3003376250755627,
"learning_rate": 1.5070422535211269e-05,
"loss": 0.5603,
"step": 576
},
{
"epoch": 2.18595825426945,
"grad_norm": 0.28983664435703244,
"learning_rate": 1.5e-05,
"loss": 0.53,
"step": 577
},
{
"epoch": 2.189753320683112,
"grad_norm": 0.2848013087432656,
"learning_rate": 1.4929577464788732e-05,
"loss": 0.5491,
"step": 578
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.3034108703426625,
"learning_rate": 1.4859154929577466e-05,
"loss": 0.5693,
"step": 579
},
{
"epoch": 2.1973434535104364,
"grad_norm": 0.2939176697262612,
"learning_rate": 1.4788732394366198e-05,
"loss": 0.5616,
"step": 580
},
{
"epoch": 2.2011385199240987,
"grad_norm": 0.28400675092707883,
"learning_rate": 1.471830985915493e-05,
"loss": 0.5231,
"step": 581
},
{
"epoch": 2.204933586337761,
"grad_norm": 0.2908878550699226,
"learning_rate": 1.4647887323943663e-05,
"loss": 0.5557,
"step": 582
},
{
"epoch": 2.2087286527514234,
"grad_norm": 0.2797432784894231,
"learning_rate": 1.4577464788732395e-05,
"loss": 0.5373,
"step": 583
},
{
"epoch": 2.2125237191650853,
"grad_norm": 0.31020903274413864,
"learning_rate": 1.4507042253521127e-05,
"loss": 0.5342,
"step": 584
},
{
"epoch": 2.2163187855787476,
"grad_norm": 0.2753405582724788,
"learning_rate": 1.443661971830986e-05,
"loss": 0.5449,
"step": 585
},
{
"epoch": 2.22011385199241,
"grad_norm": 0.2828761135482831,
"learning_rate": 1.4366197183098592e-05,
"loss": 0.5455,
"step": 586
},
{
"epoch": 2.2239089184060723,
"grad_norm": 0.28938761819021896,
"learning_rate": 1.4295774647887324e-05,
"loss": 0.5456,
"step": 587
},
{
"epoch": 2.227703984819734,
"grad_norm": 0.36424349156623803,
"learning_rate": 1.4225352112676058e-05,
"loss": 0.5334,
"step": 588
},
{
"epoch": 2.2314990512333965,
"grad_norm": 0.2969012632696866,
"learning_rate": 1.415492957746479e-05,
"loss": 0.5226,
"step": 589
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.2882992999735945,
"learning_rate": 1.4084507042253523e-05,
"loss": 0.5398,
"step": 590
},
{
"epoch": 2.239089184060721,
"grad_norm": 0.30032891934779227,
"learning_rate": 1.4014084507042255e-05,
"loss": 0.5487,
"step": 591
},
{
"epoch": 2.242884250474383,
"grad_norm": 0.2916853780931874,
"learning_rate": 1.3943661971830987e-05,
"loss": 0.5495,
"step": 592
},
{
"epoch": 2.2466793168880455,
"grad_norm": 0.29828803220883604,
"learning_rate": 1.387323943661972e-05,
"loss": 0.5556,
"step": 593
},
{
"epoch": 2.250474383301708,
"grad_norm": 0.2977152087829231,
"learning_rate": 1.3802816901408452e-05,
"loss": 0.5489,
"step": 594
},
{
"epoch": 2.25426944971537,
"grad_norm": 0.2825563858779121,
"learning_rate": 1.3732394366197184e-05,
"loss": 0.5309,
"step": 595
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.28760194599894473,
"learning_rate": 1.3661971830985918e-05,
"loss": 0.539,
"step": 596
},
{
"epoch": 2.2618595825426944,
"grad_norm": 0.2888087056865644,
"learning_rate": 1.359154929577465e-05,
"loss": 0.521,
"step": 597
},
{
"epoch": 2.2656546489563567,
"grad_norm": 0.28959446412927714,
"learning_rate": 1.352112676056338e-05,
"loss": 0.5615,
"step": 598
},
{
"epoch": 2.269449715370019,
"grad_norm": 0.2836050248813387,
"learning_rate": 1.3450704225352112e-05,
"loss": 0.5308,
"step": 599
},
{
"epoch": 2.2732447817836814,
"grad_norm": 0.2925293335648262,
"learning_rate": 1.3380281690140845e-05,
"loss": 0.5585,
"step": 600
},
{
"epoch": 2.2770398481973433,
"grad_norm": 0.28125524640102967,
"learning_rate": 1.3309859154929577e-05,
"loss": 0.5399,
"step": 601
},
{
"epoch": 2.2808349146110056,
"grad_norm": 0.28689704924017473,
"learning_rate": 1.3239436619718309e-05,
"loss": 0.5598,
"step": 602
},
{
"epoch": 2.284629981024668,
"grad_norm": 0.2858866230213846,
"learning_rate": 1.3169014084507043e-05,
"loss": 0.5469,
"step": 603
},
{
"epoch": 2.2884250474383303,
"grad_norm": 0.28082448102671365,
"learning_rate": 1.3098591549295775e-05,
"loss": 0.5396,
"step": 604
},
{
"epoch": 2.292220113851992,
"grad_norm": 0.2805388192207067,
"learning_rate": 1.3028169014084506e-05,
"loss": 0.5381,
"step": 605
},
{
"epoch": 2.2960151802656545,
"grad_norm": 0.29534726973175235,
"learning_rate": 1.295774647887324e-05,
"loss": 0.5334,
"step": 606
},
{
"epoch": 2.299810246679317,
"grad_norm": 0.3968695104972142,
"learning_rate": 1.2887323943661972e-05,
"loss": 0.5459,
"step": 607
},
{
"epoch": 2.3036053130929792,
"grad_norm": 0.28531647444468233,
"learning_rate": 1.2816901408450704e-05,
"loss": 0.538,
"step": 608
},
{
"epoch": 2.3074003795066416,
"grad_norm": 0.27875697488772433,
"learning_rate": 1.2746478873239437e-05,
"loss": 0.5256,
"step": 609
},
{
"epoch": 2.3111954459203035,
"grad_norm": 0.29116938628491784,
"learning_rate": 1.267605633802817e-05,
"loss": 0.5486,
"step": 610
},
{
"epoch": 2.314990512333966,
"grad_norm": 0.28266640080403505,
"learning_rate": 1.2605633802816901e-05,
"loss": 0.5516,
"step": 611
},
{
"epoch": 2.318785578747628,
"grad_norm": 0.28788575070645567,
"learning_rate": 1.2535211267605635e-05,
"loss": 0.5498,
"step": 612
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.29080956133376323,
"learning_rate": 1.2464788732394367e-05,
"loss": 0.5483,
"step": 613
},
{
"epoch": 2.3263757115749524,
"grad_norm": 0.28331005746690635,
"learning_rate": 1.23943661971831e-05,
"loss": 0.5531,
"step": 614
},
{
"epoch": 2.3301707779886147,
"grad_norm": 0.7869919707438379,
"learning_rate": 1.2323943661971832e-05,
"loss": 0.5277,
"step": 615
},
{
"epoch": 2.333965844402277,
"grad_norm": 0.2799783010145508,
"learning_rate": 1.2253521126760564e-05,
"loss": 0.56,
"step": 616
},
{
"epoch": 2.3377609108159394,
"grad_norm": 0.2914143820918292,
"learning_rate": 1.2183098591549297e-05,
"loss": 0.531,
"step": 617
},
{
"epoch": 2.3415559772296017,
"grad_norm": 0.27626912897990086,
"learning_rate": 1.211267605633803e-05,
"loss": 0.5522,
"step": 618
},
{
"epoch": 2.3453510436432636,
"grad_norm": 0.2890561667152315,
"learning_rate": 1.2042253521126761e-05,
"loss": 0.536,
"step": 619
},
{
"epoch": 2.349146110056926,
"grad_norm": 0.27188803334648987,
"learning_rate": 1.1971830985915493e-05,
"loss": 0.5375,
"step": 620
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.296537438819136,
"learning_rate": 1.1901408450704225e-05,
"loss": 0.5704,
"step": 621
},
{
"epoch": 2.3567362428842507,
"grad_norm": 0.28321020660015106,
"learning_rate": 1.1830985915492958e-05,
"loss": 0.5227,
"step": 622
},
{
"epoch": 2.3605313092979125,
"grad_norm": 0.2769980841540514,
"learning_rate": 1.176056338028169e-05,
"loss": 0.532,
"step": 623
},
{
"epoch": 2.364326375711575,
"grad_norm": 0.2814547149063904,
"learning_rate": 1.1690140845070422e-05,
"loss": 0.5161,
"step": 624
},
{
"epoch": 2.3681214421252372,
"grad_norm": 0.3093719869067708,
"learning_rate": 1.1619718309859156e-05,
"loss": 0.5544,
"step": 625
},
{
"epoch": 2.3719165085388996,
"grad_norm": 0.2797828206747714,
"learning_rate": 1.1549295774647888e-05,
"loss": 0.5362,
"step": 626
},
{
"epoch": 2.375711574952562,
"grad_norm": 0.283398314007054,
"learning_rate": 1.147887323943662e-05,
"loss": 0.5358,
"step": 627
},
{
"epoch": 2.379506641366224,
"grad_norm": 0.27465848958231376,
"learning_rate": 1.1408450704225353e-05,
"loss": 0.535,
"step": 628
},
{
"epoch": 2.383301707779886,
"grad_norm": 0.2781097636400478,
"learning_rate": 1.1338028169014085e-05,
"loss": 0.5383,
"step": 629
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.2778565483058106,
"learning_rate": 1.1267605633802817e-05,
"loss": 0.5543,
"step": 630
},
{
"epoch": 2.3908918406072104,
"grad_norm": 0.2823052495052644,
"learning_rate": 1.119718309859155e-05,
"loss": 0.5408,
"step": 631
},
{
"epoch": 2.3946869070208727,
"grad_norm": 0.28404378834113125,
"learning_rate": 1.1126760563380282e-05,
"loss": 0.5318,
"step": 632
},
{
"epoch": 2.398481973434535,
"grad_norm": 0.29031639315037605,
"learning_rate": 1.1056338028169016e-05,
"loss": 0.5425,
"step": 633
},
{
"epoch": 2.4022770398481974,
"grad_norm": 0.29120075006692786,
"learning_rate": 1.0985915492957748e-05,
"loss": 0.5521,
"step": 634
},
{
"epoch": 2.4060721062618597,
"grad_norm": 0.2657851910356759,
"learning_rate": 1.0915492957746478e-05,
"loss": 0.5266,
"step": 635
},
{
"epoch": 2.4098671726755216,
"grad_norm": 0.2856890050169505,
"learning_rate": 1.0845070422535212e-05,
"loss": 0.5144,
"step": 636
},
{
"epoch": 2.413662239089184,
"grad_norm": 0.28660534910588764,
"learning_rate": 1.0774647887323943e-05,
"loss": 0.54,
"step": 637
},
{
"epoch": 2.4174573055028463,
"grad_norm": 0.2704237183249149,
"learning_rate": 1.0704225352112677e-05,
"loss": 0.5115,
"step": 638
},
{
"epoch": 2.4212523719165087,
"grad_norm": 0.2805045230822619,
"learning_rate": 1.0633802816901409e-05,
"loss": 0.5318,
"step": 639
},
{
"epoch": 2.4250474383301706,
"grad_norm": 0.27700827174591425,
"learning_rate": 1.056338028169014e-05,
"loss": 0.5428,
"step": 640
},
{
"epoch": 2.428842504743833,
"grad_norm": 0.279299771259773,
"learning_rate": 1.0492957746478874e-05,
"loss": 0.5154,
"step": 641
},
{
"epoch": 2.4326375711574952,
"grad_norm": 0.2847606511747606,
"learning_rate": 1.0422535211267606e-05,
"loss": 0.5323,
"step": 642
},
{
"epoch": 2.4364326375711576,
"grad_norm": 0.27597585292104576,
"learning_rate": 1.0352112676056338e-05,
"loss": 0.5352,
"step": 643
},
{
"epoch": 2.44022770398482,
"grad_norm": 0.27337185224069727,
"learning_rate": 1.0281690140845072e-05,
"loss": 0.5297,
"step": 644
},
{
"epoch": 2.444022770398482,
"grad_norm": 0.27083176810928383,
"learning_rate": 1.0211267605633803e-05,
"loss": 0.5303,
"step": 645
},
{
"epoch": 2.447817836812144,
"grad_norm": 0.2737732695781148,
"learning_rate": 1.0140845070422535e-05,
"loss": 0.5378,
"step": 646
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.2677111111096664,
"learning_rate": 1.0070422535211269e-05,
"loss": 0.5341,
"step": 647
},
{
"epoch": 2.455407969639469,
"grad_norm": 0.2735733120204792,
"learning_rate": 1e-05,
"loss": 0.5301,
"step": 648
},
{
"epoch": 2.4592030360531307,
"grad_norm": 0.28513913524417667,
"learning_rate": 9.929577464788733e-06,
"loss": 0.5547,
"step": 649
},
{
"epoch": 2.462998102466793,
"grad_norm": 0.31094952941529436,
"learning_rate": 9.859154929577465e-06,
"loss": 0.5408,
"step": 650
},
{
"epoch": 2.4667931688804554,
"grad_norm": 0.2811147528534109,
"learning_rate": 9.788732394366196e-06,
"loss": 0.5584,
"step": 651
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.26809202373396435,
"learning_rate": 9.71830985915493e-06,
"loss": 0.5373,
"step": 652
},
{
"epoch": 2.47438330170778,
"grad_norm": 0.2724937239144615,
"learning_rate": 9.647887323943662e-06,
"loss": 0.5346,
"step": 653
},
{
"epoch": 2.478178368121442,
"grad_norm": 0.2771071380026875,
"learning_rate": 9.577464788732394e-06,
"loss": 0.5338,
"step": 654
},
{
"epoch": 2.4819734345351043,
"grad_norm": 0.26805530994942034,
"learning_rate": 9.507042253521127e-06,
"loss": 0.5299,
"step": 655
},
{
"epoch": 2.4857685009487667,
"grad_norm": 0.26287556944605794,
"learning_rate": 9.43661971830986e-06,
"loss": 0.5144,
"step": 656
},
{
"epoch": 2.489563567362429,
"grad_norm": 0.2737136354266476,
"learning_rate": 9.366197183098593e-06,
"loss": 0.5592,
"step": 657
},
{
"epoch": 2.493358633776091,
"grad_norm": 0.27902190529419096,
"learning_rate": 9.295774647887325e-06,
"loss": 0.5314,
"step": 658
},
{
"epoch": 2.4971537001897532,
"grad_norm": 0.3077774048521983,
"learning_rate": 9.225352112676057e-06,
"loss": 0.5158,
"step": 659
},
{
"epoch": 2.5009487666034156,
"grad_norm": 0.2669994106055199,
"learning_rate": 9.15492957746479e-06,
"loss": 0.5241,
"step": 660
},
{
"epoch": 2.504743833017078,
"grad_norm": 0.27136749539382665,
"learning_rate": 9.084507042253522e-06,
"loss": 0.5319,
"step": 661
},
{
"epoch": 2.5085388994307403,
"grad_norm": 0.26965116398921407,
"learning_rate": 9.014084507042254e-06,
"loss": 0.5302,
"step": 662
},
{
"epoch": 2.512333965844402,
"grad_norm": 0.27850031037266687,
"learning_rate": 8.943661971830987e-06,
"loss": 0.5492,
"step": 663
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.26529291226768226,
"learning_rate": 8.87323943661972e-06,
"loss": 0.5162,
"step": 664
},
{
"epoch": 2.519924098671727,
"grad_norm": 0.27543957726578394,
"learning_rate": 8.802816901408451e-06,
"loss": 0.5467,
"step": 665
},
{
"epoch": 2.5237191650853887,
"grad_norm": 0.2665837507293305,
"learning_rate": 8.732394366197183e-06,
"loss": 0.5278,
"step": 666
},
{
"epoch": 2.527514231499051,
"grad_norm": 0.2718546969346271,
"learning_rate": 8.661971830985915e-06,
"loss": 0.5226,
"step": 667
},
{
"epoch": 2.5313092979127134,
"grad_norm": 0.2669806632930308,
"learning_rate": 8.591549295774648e-06,
"loss": 0.5262,
"step": 668
},
{
"epoch": 2.5351043643263758,
"grad_norm": 0.2804334297760963,
"learning_rate": 8.52112676056338e-06,
"loss": 0.5288,
"step": 669
},
{
"epoch": 2.538899430740038,
"grad_norm": 0.27742567299170273,
"learning_rate": 8.450704225352112e-06,
"loss": 0.5194,
"step": 670
},
{
"epoch": 2.5426944971537004,
"grad_norm": 0.2717615775386385,
"learning_rate": 8.380281690140846e-06,
"loss": 0.5181,
"step": 671
},
{
"epoch": 2.5464895635673623,
"grad_norm": 3.695927757003071,
"learning_rate": 8.309859154929578e-06,
"loss": 0.5427,
"step": 672
},
{
"epoch": 2.5502846299810247,
"grad_norm": 0.2682299104808624,
"learning_rate": 8.23943661971831e-06,
"loss": 0.5345,
"step": 673
},
{
"epoch": 2.554079696394687,
"grad_norm": 0.2705362650588033,
"learning_rate": 8.169014084507043e-06,
"loss": 0.5356,
"step": 674
},
{
"epoch": 2.557874762808349,
"grad_norm": 0.2580754036750414,
"learning_rate": 8.098591549295775e-06,
"loss": 0.5236,
"step": 675
},
{
"epoch": 2.5616698292220113,
"grad_norm": 0.26410479235377304,
"learning_rate": 8.028169014084509e-06,
"loss": 0.5201,
"step": 676
},
{
"epoch": 2.5654648956356736,
"grad_norm": 0.279831512810424,
"learning_rate": 7.95774647887324e-06,
"loss": 0.5237,
"step": 677
},
{
"epoch": 2.569259962049336,
"grad_norm": 0.2758036496147709,
"learning_rate": 7.887323943661972e-06,
"loss": 0.5277,
"step": 678
},
{
"epoch": 2.5730550284629983,
"grad_norm": 0.2740332826711167,
"learning_rate": 7.816901408450706e-06,
"loss": 0.522,
"step": 679
},
{
"epoch": 2.5768500948766606,
"grad_norm": 0.2712319043787441,
"learning_rate": 7.746478873239436e-06,
"loss": 0.543,
"step": 680
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.27532261280937215,
"learning_rate": 7.67605633802817e-06,
"loss": 0.5458,
"step": 681
},
{
"epoch": 2.584440227703985,
"grad_norm": 0.26526883990973066,
"learning_rate": 7.6056338028169015e-06,
"loss": 0.512,
"step": 682
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.26879212221727106,
"learning_rate": 7.535211267605634e-06,
"loss": 0.5303,
"step": 683
},
{
"epoch": 2.592030360531309,
"grad_norm": 0.2774385453947983,
"learning_rate": 7.464788732394366e-06,
"loss": 0.5402,
"step": 684
},
{
"epoch": 2.5958254269449714,
"grad_norm": 0.257398269752527,
"learning_rate": 7.394366197183099e-06,
"loss": 0.5247,
"step": 685
},
{
"epoch": 2.5996204933586338,
"grad_norm": 0.2565285601842637,
"learning_rate": 7.3239436619718316e-06,
"loss": 0.5164,
"step": 686
},
{
"epoch": 2.603415559772296,
"grad_norm": 0.27360155552561366,
"learning_rate": 7.2535211267605634e-06,
"loss": 0.5267,
"step": 687
},
{
"epoch": 2.6072106261859584,
"grad_norm": 0.26189044051899535,
"learning_rate": 7.183098591549296e-06,
"loss": 0.5166,
"step": 688
},
{
"epoch": 2.6110056925996203,
"grad_norm": 0.26933090087091954,
"learning_rate": 7.112676056338029e-06,
"loss": 0.5159,
"step": 689
},
{
"epoch": 2.6148007590132827,
"grad_norm": 0.2671569269048854,
"learning_rate": 7.042253521126762e-06,
"loss": 0.5079,
"step": 690
},
{
"epoch": 2.618595825426945,
"grad_norm": 0.27670023731940674,
"learning_rate": 6.9718309859154935e-06,
"loss": 0.5244,
"step": 691
},
{
"epoch": 2.6223908918406074,
"grad_norm": 0.2676597443783351,
"learning_rate": 6.901408450704226e-06,
"loss": 0.5229,
"step": 692
},
{
"epoch": 2.6261859582542693,
"grad_norm": 0.27230790474709754,
"learning_rate": 6.830985915492959e-06,
"loss": 0.5432,
"step": 693
},
{
"epoch": 2.6299810246679316,
"grad_norm": 0.2618090537081882,
"learning_rate": 6.76056338028169e-06,
"loss": 0.5298,
"step": 694
},
{
"epoch": 2.633776091081594,
"grad_norm": 0.2762889540671275,
"learning_rate": 6.690140845070423e-06,
"loss": 0.5235,
"step": 695
},
{
"epoch": 2.6375711574952563,
"grad_norm": 0.26801760946888925,
"learning_rate": 6.6197183098591546e-06,
"loss": 0.5265,
"step": 696
},
{
"epoch": 2.6413662239089186,
"grad_norm": 0.26552233684222143,
"learning_rate": 6.549295774647887e-06,
"loss": 0.512,
"step": 697
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.2718083784887795,
"learning_rate": 6.47887323943662e-06,
"loss": 0.5129,
"step": 698
},
{
"epoch": 2.648956356736243,
"grad_norm": 0.2627152828751525,
"learning_rate": 6.408450704225352e-06,
"loss": 0.5463,
"step": 699
},
{
"epoch": 2.652751423149905,
"grad_norm": 0.2807123544864212,
"learning_rate": 6.338028169014085e-06,
"loss": 0.5212,
"step": 700
},
{
"epoch": 2.656546489563567,
"grad_norm": 0.29805054175627144,
"learning_rate": 6.267605633802817e-06,
"loss": 0.5302,
"step": 701
},
{
"epoch": 2.6603415559772294,
"grad_norm": 0.26983762167101605,
"learning_rate": 6.19718309859155e-06,
"loss": 0.5266,
"step": 702
},
{
"epoch": 2.6641366223908918,
"grad_norm": 0.25765147338316874,
"learning_rate": 6.126760563380282e-06,
"loss": 0.5077,
"step": 703
},
{
"epoch": 2.667931688804554,
"grad_norm": 0.26244389612770763,
"learning_rate": 6.056338028169015e-06,
"loss": 0.5275,
"step": 704
},
{
"epoch": 2.6717267552182165,
"grad_norm": 0.25871700446988755,
"learning_rate": 5.9859154929577465e-06,
"loss": 0.5263,
"step": 705
},
{
"epoch": 2.675521821631879,
"grad_norm": 0.25937072857521587,
"learning_rate": 5.915492957746479e-06,
"loss": 0.5293,
"step": 706
},
{
"epoch": 2.6793168880455407,
"grad_norm": 0.26124300321138905,
"learning_rate": 5.845070422535211e-06,
"loss": 0.5329,
"step": 707
},
{
"epoch": 2.683111954459203,
"grad_norm": 0.27733223248436273,
"learning_rate": 5.774647887323944e-06,
"loss": 0.5388,
"step": 708
},
{
"epoch": 2.6869070208728654,
"grad_norm": 0.26778317763534515,
"learning_rate": 5.7042253521126766e-06,
"loss": 0.5163,
"step": 709
},
{
"epoch": 2.6907020872865273,
"grad_norm": 0.2606831932890902,
"learning_rate": 5.6338028169014084e-06,
"loss": 0.5375,
"step": 710
},
{
"epoch": 2.6944971537001896,
"grad_norm": 0.2676107377258338,
"learning_rate": 5.563380281690141e-06,
"loss": 0.5331,
"step": 711
},
{
"epoch": 2.698292220113852,
"grad_norm": 0.2610782827132618,
"learning_rate": 5.492957746478874e-06,
"loss": 0.5162,
"step": 712
},
{
"epoch": 2.7020872865275143,
"grad_norm": 0.256554297163306,
"learning_rate": 5.422535211267606e-06,
"loss": 0.5108,
"step": 713
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.25195774405958,
"learning_rate": 5.3521126760563385e-06,
"loss": 0.5071,
"step": 714
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.2604563422371166,
"learning_rate": 5.28169014084507e-06,
"loss": 0.5334,
"step": 715
},
{
"epoch": 2.713472485768501,
"grad_norm": 0.25769535239143054,
"learning_rate": 5.211267605633803e-06,
"loss": 0.5147,
"step": 716
},
{
"epoch": 2.717267552182163,
"grad_norm": 0.27049241184577183,
"learning_rate": 5.140845070422536e-06,
"loss": 0.5396,
"step": 717
},
{
"epoch": 2.7210626185958255,
"grad_norm": 0.2542624069634068,
"learning_rate": 5.070422535211268e-06,
"loss": 0.5167,
"step": 718
},
{
"epoch": 2.7248576850094874,
"grad_norm": 0.27647020901115144,
"learning_rate": 5e-06,
"loss": 0.5124,
"step": 719
},
{
"epoch": 2.72865275142315,
"grad_norm": 0.27668074054609093,
"learning_rate": 4.929577464788732e-06,
"loss": 0.5349,
"step": 720
},
{
"epoch": 2.732447817836812,
"grad_norm": 0.266997161461896,
"learning_rate": 4.859154929577465e-06,
"loss": 0.5338,
"step": 721
},
{
"epoch": 2.7362428842504745,
"grad_norm": 0.25499285462315663,
"learning_rate": 4.788732394366197e-06,
"loss": 0.5115,
"step": 722
},
{
"epoch": 2.740037950664137,
"grad_norm": 0.27013448186002204,
"learning_rate": 4.71830985915493e-06,
"loss": 0.5409,
"step": 723
},
{
"epoch": 2.7438330170777987,
"grad_norm": 0.253803129514788,
"learning_rate": 4.647887323943662e-06,
"loss": 0.5054,
"step": 724
},
{
"epoch": 2.747628083491461,
"grad_norm": 0.26734493420989397,
"learning_rate": 4.577464788732395e-06,
"loss": 0.5185,
"step": 725
},
{
"epoch": 2.7514231499051234,
"grad_norm": 0.2518253836546897,
"learning_rate": 4.507042253521127e-06,
"loss": 0.4995,
"step": 726
},
{
"epoch": 2.7552182163187857,
"grad_norm": 0.26729850134714506,
"learning_rate": 4.43661971830986e-06,
"loss": 0.5271,
"step": 727
},
{
"epoch": 2.7590132827324476,
"grad_norm": 0.2557254985730845,
"learning_rate": 4.3661971830985915e-06,
"loss": 0.5145,
"step": 728
},
{
"epoch": 2.76280834914611,
"grad_norm": 0.2600062548594085,
"learning_rate": 4.295774647887324e-06,
"loss": 0.5383,
"step": 729
},
{
"epoch": 2.7666034155597723,
"grad_norm": 0.26190547953904125,
"learning_rate": 4.225352112676056e-06,
"loss": 0.5187,
"step": 730
},
{
"epoch": 2.7703984819734346,
"grad_norm": 0.2551484956199937,
"learning_rate": 4.154929577464789e-06,
"loss": 0.5247,
"step": 731
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.32790367354195843,
"learning_rate": 4.0845070422535216e-06,
"loss": 0.5256,
"step": 732
},
{
"epoch": 2.777988614800759,
"grad_norm": 0.2529288981258316,
"learning_rate": 4.014084507042254e-06,
"loss": 0.5206,
"step": 733
},
{
"epoch": 2.781783681214421,
"grad_norm": 0.25751867278005997,
"learning_rate": 3.943661971830986e-06,
"loss": 0.5161,
"step": 734
},
{
"epoch": 2.7855787476280836,
"grad_norm": 0.2537828228850862,
"learning_rate": 3.873239436619718e-06,
"loss": 0.5243,
"step": 735
},
{
"epoch": 2.789373814041746,
"grad_norm": 0.26059153621633013,
"learning_rate": 3.8028169014084508e-06,
"loss": 0.5282,
"step": 736
},
{
"epoch": 2.793168880455408,
"grad_norm": 0.2652199144684476,
"learning_rate": 3.732394366197183e-06,
"loss": 0.5185,
"step": 737
},
{
"epoch": 2.79696394686907,
"grad_norm": 0.25376408411959034,
"learning_rate": 3.6619718309859158e-06,
"loss": 0.5126,
"step": 738
},
{
"epoch": 2.8007590132827325,
"grad_norm": 0.2566069834499804,
"learning_rate": 3.591549295774648e-06,
"loss": 0.5193,
"step": 739
},
{
"epoch": 2.804554079696395,
"grad_norm": 0.26572339887853663,
"learning_rate": 3.521126760563381e-06,
"loss": 0.5308,
"step": 740
},
{
"epoch": 2.808349146110057,
"grad_norm": 0.27879502012748697,
"learning_rate": 3.450704225352113e-06,
"loss": 0.5275,
"step": 741
},
{
"epoch": 2.812144212523719,
"grad_norm": 0.2622324832980246,
"learning_rate": 3.380281690140845e-06,
"loss": 0.5131,
"step": 742
},
{
"epoch": 2.8159392789373814,
"grad_norm": 0.24537825898029234,
"learning_rate": 3.3098591549295773e-06,
"loss": 0.4856,
"step": 743
},
{
"epoch": 2.8197343453510437,
"grad_norm": 0.2543828318551744,
"learning_rate": 3.23943661971831e-06,
"loss": 0.5246,
"step": 744
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.2536595352782432,
"learning_rate": 3.1690140845070423e-06,
"loss": 0.519,
"step": 745
},
{
"epoch": 2.827324478178368,
"grad_norm": 0.25880101874103556,
"learning_rate": 3.098591549295775e-06,
"loss": 0.5142,
"step": 746
},
{
"epoch": 2.8311195445920303,
"grad_norm": 0.25135812908983174,
"learning_rate": 3.0281690140845073e-06,
"loss": 0.52,
"step": 747
},
{
"epoch": 2.8349146110056926,
"grad_norm": 0.25161083849538896,
"learning_rate": 2.9577464788732396e-06,
"loss": 0.5174,
"step": 748
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.249365627729948,
"learning_rate": 2.887323943661972e-06,
"loss": 0.5049,
"step": 749
},
{
"epoch": 2.8425047438330173,
"grad_norm": 0.27068313935344396,
"learning_rate": 2.8169014084507042e-06,
"loss": 0.5418,
"step": 750
},
{
"epoch": 2.846299810246679,
"grad_norm": 0.25858282367441654,
"learning_rate": 2.746478873239437e-06,
"loss": 0.5264,
"step": 751
},
{
"epoch": 2.8500948766603416,
"grad_norm": 0.2546112010044873,
"learning_rate": 2.6760563380281692e-06,
"loss": 0.5283,
"step": 752
},
{
"epoch": 2.853889943074004,
"grad_norm": 0.2548188993718373,
"learning_rate": 2.6056338028169015e-06,
"loss": 0.4955,
"step": 753
},
{
"epoch": 2.857685009487666,
"grad_norm": 0.2544369797297898,
"learning_rate": 2.535211267605634e-06,
"loss": 0.5169,
"step": 754
},
{
"epoch": 2.861480075901328,
"grad_norm": 0.25562955536269694,
"learning_rate": 2.464788732394366e-06,
"loss": 0.5207,
"step": 755
},
{
"epoch": 2.8652751423149905,
"grad_norm": 0.2550210562564166,
"learning_rate": 2.3943661971830984e-06,
"loss": 0.5014,
"step": 756
},
{
"epoch": 2.869070208728653,
"grad_norm": 0.2629661666159187,
"learning_rate": 2.323943661971831e-06,
"loss": 0.5472,
"step": 757
},
{
"epoch": 2.872865275142315,
"grad_norm": 0.25157037569143936,
"learning_rate": 2.2535211267605635e-06,
"loss": 0.5201,
"step": 758
},
{
"epoch": 2.8766603415559775,
"grad_norm": 0.25262767099521116,
"learning_rate": 2.1830985915492958e-06,
"loss": 0.5121,
"step": 759
},
{
"epoch": 2.8804554079696394,
"grad_norm": 0.2567966604132278,
"learning_rate": 2.112676056338028e-06,
"loss": 0.5106,
"step": 760
},
{
"epoch": 2.8842504743833017,
"grad_norm": 0.24721204656096993,
"learning_rate": 2.0422535211267608e-06,
"loss": 0.522,
"step": 761
},
{
"epoch": 2.888045540796964,
"grad_norm": 0.2608511744958355,
"learning_rate": 1.971830985915493e-06,
"loss": 0.5347,
"step": 762
},
{
"epoch": 2.891840607210626,
"grad_norm": 0.251443140121684,
"learning_rate": 1.9014084507042254e-06,
"loss": 0.5128,
"step": 763
},
{
"epoch": 2.8956356736242883,
"grad_norm": 0.2546872353977325,
"learning_rate": 1.8309859154929579e-06,
"loss": 0.5166,
"step": 764
},
{
"epoch": 2.8994307400379506,
"grad_norm": 0.24785012743166315,
"learning_rate": 1.7605633802816904e-06,
"loss": 0.5242,
"step": 765
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.24898154056510727,
"learning_rate": 1.6901408450704225e-06,
"loss": 0.5194,
"step": 766
},
{
"epoch": 2.9070208728652753,
"grad_norm": 0.24011875250619547,
"learning_rate": 1.619718309859155e-06,
"loss": 0.5068,
"step": 767
},
{
"epoch": 2.9108159392789372,
"grad_norm": 0.24850835533756865,
"learning_rate": 1.5492957746478875e-06,
"loss": 0.5129,
"step": 768
},
{
"epoch": 2.9146110056925996,
"grad_norm": 0.24689534757466478,
"learning_rate": 1.4788732394366198e-06,
"loss": 0.5043,
"step": 769
},
{
"epoch": 2.918406072106262,
"grad_norm": 0.25299167125111766,
"learning_rate": 1.4084507042253521e-06,
"loss": 0.5213,
"step": 770
},
{
"epoch": 2.9222011385199242,
"grad_norm": 0.24718063426634684,
"learning_rate": 1.3380281690140846e-06,
"loss": 0.5092,
"step": 771
},
{
"epoch": 2.925996204933586,
"grad_norm": 0.4496626206530403,
"learning_rate": 1.267605633802817e-06,
"loss": 0.5218,
"step": 772
},
{
"epoch": 2.9297912713472485,
"grad_norm": 0.27131925369394255,
"learning_rate": 1.1971830985915492e-06,
"loss": 0.5243,
"step": 773
},
{
"epoch": 2.933586337760911,
"grad_norm": 0.2531657926909203,
"learning_rate": 1.1267605633802817e-06,
"loss": 0.5198,
"step": 774
},
{
"epoch": 2.937381404174573,
"grad_norm": 0.24333248660649015,
"learning_rate": 1.056338028169014e-06,
"loss": 0.5079,
"step": 775
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.25810615958690464,
"learning_rate": 9.859154929577465e-07,
"loss": 0.5155,
"step": 776
},
{
"epoch": 2.9449715370018974,
"grad_norm": 0.2572122860259345,
"learning_rate": 9.154929577464789e-07,
"loss": 0.5303,
"step": 777
},
{
"epoch": 2.9487666034155597,
"grad_norm": 0.24613803884061292,
"learning_rate": 8.450704225352112e-07,
"loss": 0.5201,
"step": 778
},
{
"epoch": 2.952561669829222,
"grad_norm": 0.2513759548301003,
"learning_rate": 7.746478873239438e-07,
"loss": 0.5109,
"step": 779
},
{
"epoch": 2.956356736242884,
"grad_norm": 0.26337162401626346,
"learning_rate": 7.042253521126761e-07,
"loss": 0.5405,
"step": 780
},
{
"epoch": 2.9601518026565463,
"grad_norm": 0.24729566735616695,
"learning_rate": 6.338028169014085e-07,
"loss": 0.4995,
"step": 781
},
{
"epoch": 2.9639468690702087,
"grad_norm": 0.2688897364872683,
"learning_rate": 5.633802816901409e-07,
"loss": 0.5172,
"step": 782
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.255054655194233,
"learning_rate": 4.929577464788733e-07,
"loss": 0.5083,
"step": 783
},
{
"epoch": 2.9715370018975333,
"grad_norm": 0.24777486782876265,
"learning_rate": 4.225352112676056e-07,
"loss": 0.5159,
"step": 784
},
{
"epoch": 2.9753320683111957,
"grad_norm": 0.255639600519456,
"learning_rate": 3.5211267605633803e-07,
"loss": 0.5155,
"step": 785
},
{
"epoch": 2.9791271347248576,
"grad_norm": 0.2536850585325846,
"learning_rate": 2.8169014084507043e-07,
"loss": 0.5173,
"step": 786
},
{
"epoch": 2.98292220113852,
"grad_norm": 0.2502987412329274,
"learning_rate": 2.112676056338028e-07,
"loss": 0.5189,
"step": 787
},
{
"epoch": 2.9867172675521823,
"grad_norm": 0.2561066491746939,
"learning_rate": 1.4084507042253522e-07,
"loss": 0.5128,
"step": 788
},
{
"epoch": 2.990512333965844,
"grad_norm": 0.2519597130832323,
"learning_rate": 7.042253521126761e-08,
"loss": 0.5178,
"step": 789
},
{
"epoch": 2.990512333965844,
"step": 789,
"total_flos": 8.761222688732611e+18,
"train_loss": 1.0859681145773188,
"train_runtime": 27041.1536,
"train_samples_per_second": 0.468,
"train_steps_per_second": 0.029
}
],
"logging_steps": 1,
"max_steps": 789,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.761222688732611e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}