{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9884526558891453,
"eval_steps": 500,
"global_step": 648,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004618937644341801,
"grad_norm": 45.25296644144981,
"learning_rate": 7.692307692307694e-07,
"loss": 11.8644,
"step": 1
},
{
"epoch": 0.009237875288683603,
"grad_norm": 44.825429016294684,
"learning_rate": 1.5384615384615387e-06,
"loss": 11.8618,
"step": 2
},
{
"epoch": 0.013856812933025405,
"grad_norm": 44.93191688101396,
"learning_rate": 2.307692307692308e-06,
"loss": 11.898,
"step": 3
},
{
"epoch": 0.018475750577367205,
"grad_norm": 46.84259305412865,
"learning_rate": 3.0769230769230774e-06,
"loss": 11.7075,
"step": 4
},
{
"epoch": 0.023094688221709007,
"grad_norm": 46.492693043930196,
"learning_rate": 3.846153846153847e-06,
"loss": 11.7197,
"step": 5
},
{
"epoch": 0.02771362586605081,
"grad_norm": 64.6690544238958,
"learning_rate": 4.615384615384616e-06,
"loss": 10.7887,
"step": 6
},
{
"epoch": 0.03233256351039261,
"grad_norm": 82.32022307686906,
"learning_rate": 5.3846153846153855e-06,
"loss": 10.0161,
"step": 7
},
{
"epoch": 0.03695150115473441,
"grad_norm": 55.35331252001288,
"learning_rate": 6.153846153846155e-06,
"loss": 6.2174,
"step": 8
},
{
"epoch": 0.04157043879907621,
"grad_norm": 46.95163849907996,
"learning_rate": 6.923076923076923e-06,
"loss": 5.5606,
"step": 9
},
{
"epoch": 0.046189376443418015,
"grad_norm": 35.85106927473068,
"learning_rate": 7.692307692307694e-06,
"loss": 4.7142,
"step": 10
},
{
"epoch": 0.050808314087759814,
"grad_norm": 10.138891755014328,
"learning_rate": 8.461538461538462e-06,
"loss": 2.988,
"step": 11
},
{
"epoch": 0.05542725173210162,
"grad_norm": 7.169754957583301,
"learning_rate": 9.230769230769232e-06,
"loss": 2.9407,
"step": 12
},
{
"epoch": 0.06004618937644342,
"grad_norm": 5.418802790980522,
"learning_rate": 1e-05,
"loss": 2.6657,
"step": 13
},
{
"epoch": 0.06466512702078522,
"grad_norm": 3.936099245971307,
"learning_rate": 1.0769230769230771e-05,
"loss": 2.5281,
"step": 14
},
{
"epoch": 0.06928406466512702,
"grad_norm": 5.215604698240768,
"learning_rate": 1.153846153846154e-05,
"loss": 2.3639,
"step": 15
},
{
"epoch": 0.07390300230946882,
"grad_norm": 2.9877601710797608,
"learning_rate": 1.230769230769231e-05,
"loss": 2.2834,
"step": 16
},
{
"epoch": 0.07852193995381063,
"grad_norm": 2.227678705858673,
"learning_rate": 1.3076923076923078e-05,
"loss": 2.1987,
"step": 17
},
{
"epoch": 0.08314087759815242,
"grad_norm": 1.854233305485284,
"learning_rate": 1.3846153846153847e-05,
"loss": 2.1381,
"step": 18
},
{
"epoch": 0.08775981524249422,
"grad_norm": 1.3401909149269855,
"learning_rate": 1.4615384615384617e-05,
"loss": 2.0858,
"step": 19
},
{
"epoch": 0.09237875288683603,
"grad_norm": 1.6368616490215928,
"learning_rate": 1.5384615384615387e-05,
"loss": 2.097,
"step": 20
},
{
"epoch": 0.09699769053117784,
"grad_norm": 1.2443879158341338,
"learning_rate": 1.6153846153846154e-05,
"loss": 1.9641,
"step": 21
},
{
"epoch": 0.10161662817551963,
"grad_norm": 1.0144555760888003,
"learning_rate": 1.6923076923076924e-05,
"loss": 1.9293,
"step": 22
},
{
"epoch": 0.10623556581986143,
"grad_norm": 0.9225678791817972,
"learning_rate": 1.7692307692307694e-05,
"loss": 1.9122,
"step": 23
},
{
"epoch": 0.11085450346420324,
"grad_norm": 0.9641321067391283,
"learning_rate": 1.8461538461538465e-05,
"loss": 1.9113,
"step": 24
},
{
"epoch": 0.11547344110854503,
"grad_norm": 1.192862015459919,
"learning_rate": 1.923076923076923e-05,
"loss": 1.7089,
"step": 25
},
{
"epoch": 0.12009237875288684,
"grad_norm": 0.8465109739378411,
"learning_rate": 2e-05,
"loss": 1.8069,
"step": 26
},
{
"epoch": 0.12471131639722864,
"grad_norm": 0.7695542319857172,
"learning_rate": 2.0769230769230772e-05,
"loss": 1.8287,
"step": 27
},
{
"epoch": 0.12933025404157045,
"grad_norm": 0.9293831189277315,
"learning_rate": 2.1538461538461542e-05,
"loss": 1.8104,
"step": 28
},
{
"epoch": 0.13394919168591224,
"grad_norm": 0.7185960112632453,
"learning_rate": 2.230769230769231e-05,
"loss": 1.7064,
"step": 29
},
{
"epoch": 0.13856812933025403,
"grad_norm": 0.7336288783029987,
"learning_rate": 2.307692307692308e-05,
"loss": 1.6517,
"step": 30
},
{
"epoch": 0.14318706697459585,
"grad_norm": 0.6828766290076126,
"learning_rate": 2.384615384615385e-05,
"loss": 1.7431,
"step": 31
},
{
"epoch": 0.14780600461893764,
"grad_norm": 0.5145712947833815,
"learning_rate": 2.461538461538462e-05,
"loss": 1.5834,
"step": 32
},
{
"epoch": 0.15242494226327943,
"grad_norm": 0.6417478022119582,
"learning_rate": 2.5384615384615383e-05,
"loss": 1.692,
"step": 33
},
{
"epoch": 0.15704387990762125,
"grad_norm": 0.5688173114727769,
"learning_rate": 2.6153846153846157e-05,
"loss": 1.6158,
"step": 34
},
{
"epoch": 0.16166281755196305,
"grad_norm": 0.8263990984938342,
"learning_rate": 2.6923076923076923e-05,
"loss": 1.6364,
"step": 35
},
{
"epoch": 0.16628175519630484,
"grad_norm": 1.0264849469612845,
"learning_rate": 2.7692307692307694e-05,
"loss": 1.5413,
"step": 36
},
{
"epoch": 0.17090069284064666,
"grad_norm": 2.250035537035704,
"learning_rate": 2.846153846153846e-05,
"loss": 1.5871,
"step": 37
},
{
"epoch": 0.17551963048498845,
"grad_norm": 1.002275067056586,
"learning_rate": 2.9230769230769234e-05,
"loss": 1.5433,
"step": 38
},
{
"epoch": 0.18013856812933027,
"grad_norm": 1.9935058812341042,
"learning_rate": 3e-05,
"loss": 1.6011,
"step": 39
},
{
"epoch": 0.18475750577367206,
"grad_norm": 1.206807050823559,
"learning_rate": 3.0769230769230774e-05,
"loss": 1.5819,
"step": 40
},
{
"epoch": 0.18937644341801385,
"grad_norm": 1.1736278924769548,
"learning_rate": 3.153846153846154e-05,
"loss": 1.5122,
"step": 41
},
{
"epoch": 0.19399538106235567,
"grad_norm": 0.8409129028257115,
"learning_rate": 3.230769230769231e-05,
"loss": 1.5071,
"step": 42
},
{
"epoch": 0.19861431870669746,
"grad_norm": 0.6625553698573982,
"learning_rate": 3.307692307692308e-05,
"loss": 1.53,
"step": 43
},
{
"epoch": 0.20323325635103925,
"grad_norm": 0.756310059950587,
"learning_rate": 3.384615384615385e-05,
"loss": 1.5769,
"step": 44
},
{
"epoch": 0.20785219399538107,
"grad_norm": 8.472329056228727,
"learning_rate": 3.461538461538462e-05,
"loss": 1.4221,
"step": 45
},
{
"epoch": 0.21247113163972287,
"grad_norm": 0.7917462272605508,
"learning_rate": 3.538461538461539e-05,
"loss": 1.5526,
"step": 46
},
{
"epoch": 0.21709006928406466,
"grad_norm": 0.7580955226244473,
"learning_rate": 3.615384615384615e-05,
"loss": 1.5343,
"step": 47
},
{
"epoch": 0.22170900692840648,
"grad_norm": 0.6632446475914987,
"learning_rate": 3.692307692307693e-05,
"loss": 1.492,
"step": 48
},
{
"epoch": 0.22632794457274827,
"grad_norm": 0.6129569543834651,
"learning_rate": 3.769230769230769e-05,
"loss": 1.4069,
"step": 49
},
{
"epoch": 0.23094688221709006,
"grad_norm": 0.5090956204644423,
"learning_rate": 3.846153846153846e-05,
"loss": 1.405,
"step": 50
},
{
"epoch": 0.23556581986143188,
"grad_norm": 0.657330045548283,
"learning_rate": 3.923076923076923e-05,
"loss": 1.4713,
"step": 51
},
{
"epoch": 0.24018475750577367,
"grad_norm": 0.47017845203857506,
"learning_rate": 4e-05,
"loss": 1.4125,
"step": 52
},
{
"epoch": 0.24480369515011546,
"grad_norm": 0.5841016361582656,
"learning_rate": 4.0769230769230773e-05,
"loss": 1.4348,
"step": 53
},
{
"epoch": 0.24942263279445728,
"grad_norm": 0.3994410855687934,
"learning_rate": 4.1538461538461544e-05,
"loss": 1.4714,
"step": 54
},
{
"epoch": 0.2540415704387991,
"grad_norm": 0.558514116404996,
"learning_rate": 4.230769230769231e-05,
"loss": 1.394,
"step": 55
},
{
"epoch": 0.2586605080831409,
"grad_norm": 0.48183529201650405,
"learning_rate": 4.3076923076923084e-05,
"loss": 1.4467,
"step": 56
},
{
"epoch": 0.2632794457274827,
"grad_norm": 0.48228333236038273,
"learning_rate": 4.384615384615385e-05,
"loss": 1.4255,
"step": 57
},
{
"epoch": 0.2678983833718245,
"grad_norm": 0.5043914232943452,
"learning_rate": 4.461538461538462e-05,
"loss": 1.4421,
"step": 58
},
{
"epoch": 0.27251732101616627,
"grad_norm": 0.3865694791459168,
"learning_rate": 4.538461538461539e-05,
"loss": 1.3688,
"step": 59
},
{
"epoch": 0.27713625866050806,
"grad_norm": 0.4632516969738209,
"learning_rate": 4.615384615384616e-05,
"loss": 1.404,
"step": 60
},
{
"epoch": 0.2817551963048499,
"grad_norm": 0.4217951236749384,
"learning_rate": 4.692307692307693e-05,
"loss": 1.3532,
"step": 61
},
{
"epoch": 0.2863741339491917,
"grad_norm": 0.43568761143350265,
"learning_rate": 4.76923076923077e-05,
"loss": 1.4655,
"step": 62
},
{
"epoch": 0.2909930715935335,
"grad_norm": 0.41267649903587583,
"learning_rate": 4.846153846153846e-05,
"loss": 1.3825,
"step": 63
},
{
"epoch": 0.2956120092378753,
"grad_norm": 0.38334285686596203,
"learning_rate": 4.923076923076924e-05,
"loss": 1.3914,
"step": 64
},
{
"epoch": 0.3002309468822171,
"grad_norm": 0.4861746271469332,
"learning_rate": 5e-05,
"loss": 1.3829,
"step": 65
},
{
"epoch": 0.30484988452655887,
"grad_norm": 0.4087503555578362,
"learning_rate": 4.991423670668954e-05,
"loss": 1.4243,
"step": 66
},
{
"epoch": 0.3094688221709007,
"grad_norm": 0.392542277496221,
"learning_rate": 4.982847341337908e-05,
"loss": 1.3669,
"step": 67
},
{
"epoch": 0.3140877598152425,
"grad_norm": 0.38201084354416653,
"learning_rate": 4.9742710120068616e-05,
"loss": 1.2792,
"step": 68
},
{
"epoch": 0.3187066974595843,
"grad_norm": 0.4217958358323986,
"learning_rate": 4.965694682675815e-05,
"loss": 1.3613,
"step": 69
},
{
"epoch": 0.3233256351039261,
"grad_norm": 0.357490874226858,
"learning_rate": 4.957118353344769e-05,
"loss": 1.3085,
"step": 70
},
{
"epoch": 0.3279445727482679,
"grad_norm": 0.4292971818427176,
"learning_rate": 4.948542024013723e-05,
"loss": 1.3364,
"step": 71
},
{
"epoch": 0.3325635103926097,
"grad_norm": 0.41481319251416343,
"learning_rate": 4.9399656946826764e-05,
"loss": 1.3797,
"step": 72
},
{
"epoch": 0.3371824480369515,
"grad_norm": 0.41227298551638986,
"learning_rate": 4.931389365351629e-05,
"loss": 1.345,
"step": 73
},
{
"epoch": 0.3418013856812933,
"grad_norm": 0.4248874538669992,
"learning_rate": 4.922813036020583e-05,
"loss": 1.4199,
"step": 74
},
{
"epoch": 0.3464203233256351,
"grad_norm": 0.37707967458743286,
"learning_rate": 4.914236706689537e-05,
"loss": 1.3285,
"step": 75
},
{
"epoch": 0.3510392609699769,
"grad_norm": 0.36340729414019757,
"learning_rate": 4.9056603773584906e-05,
"loss": 1.2781,
"step": 76
},
{
"epoch": 0.3556581986143187,
"grad_norm": 0.35839863339126377,
"learning_rate": 4.897084048027444e-05,
"loss": 1.3159,
"step": 77
},
{
"epoch": 0.36027713625866054,
"grad_norm": 0.3969444246860546,
"learning_rate": 4.8885077186963984e-05,
"loss": 1.3052,
"step": 78
},
{
"epoch": 0.3648960739030023,
"grad_norm": 0.33145904528277687,
"learning_rate": 4.879931389365352e-05,
"loss": 1.2502,
"step": 79
},
{
"epoch": 0.3695150115473441,
"grad_norm": 0.36882954490737097,
"learning_rate": 4.8713550600343055e-05,
"loss": 1.2702,
"step": 80
},
{
"epoch": 0.3741339491916859,
"grad_norm": 0.4296769872556639,
"learning_rate": 4.862778730703259e-05,
"loss": 1.3097,
"step": 81
},
{
"epoch": 0.3787528868360277,
"grad_norm": 0.36740977790850166,
"learning_rate": 4.854202401372213e-05,
"loss": 1.295,
"step": 82
},
{
"epoch": 0.3833718244803695,
"grad_norm": 0.3569789067532997,
"learning_rate": 4.845626072041167e-05,
"loss": 1.3317,
"step": 83
},
{
"epoch": 0.38799076212471134,
"grad_norm": 0.3875185646389721,
"learning_rate": 4.8370497427101204e-05,
"loss": 1.3065,
"step": 84
},
{
"epoch": 0.39260969976905313,
"grad_norm": 0.3467070651984785,
"learning_rate": 4.828473413379074e-05,
"loss": 1.2553,
"step": 85
},
{
"epoch": 0.3972286374133949,
"grad_norm": 0.37921420328698663,
"learning_rate": 4.819897084048028e-05,
"loss": 1.3128,
"step": 86
},
{
"epoch": 0.4018475750577367,
"grad_norm": 0.3557011389885715,
"learning_rate": 4.811320754716982e-05,
"loss": 1.3516,
"step": 87
},
{
"epoch": 0.4064665127020785,
"grad_norm": 0.34907978146071894,
"learning_rate": 4.8027444253859346e-05,
"loss": 1.239,
"step": 88
},
{
"epoch": 0.4110854503464203,
"grad_norm": 0.5265860526142492,
"learning_rate": 4.794168096054888e-05,
"loss": 1.3027,
"step": 89
},
{
"epoch": 0.41570438799076215,
"grad_norm": 0.4542046995416649,
"learning_rate": 4.7855917667238424e-05,
"loss": 1.2369,
"step": 90
},
{
"epoch": 0.42032332563510394,
"grad_norm": 0.4939638405917615,
"learning_rate": 4.777015437392796e-05,
"loss": 1.3235,
"step": 91
},
{
"epoch": 0.42494226327944573,
"grad_norm": 0.3430518477169353,
"learning_rate": 4.7684391080617495e-05,
"loss": 1.2758,
"step": 92
},
{
"epoch": 0.4295612009237875,
"grad_norm": 0.3963825770199695,
"learning_rate": 4.759862778730704e-05,
"loss": 1.2462,
"step": 93
},
{
"epoch": 0.4341801385681293,
"grad_norm": 0.5754776095260544,
"learning_rate": 4.751286449399657e-05,
"loss": 1.2786,
"step": 94
},
{
"epoch": 0.4387990762124711,
"grad_norm": 0.396473177650489,
"learning_rate": 4.742710120068611e-05,
"loss": 1.2397,
"step": 95
},
{
"epoch": 0.44341801385681295,
"grad_norm": 0.4241046241414141,
"learning_rate": 4.7341337907375644e-05,
"loss": 1.2946,
"step": 96
},
{
"epoch": 0.44803695150115475,
"grad_norm": 0.6185434363973696,
"learning_rate": 4.7255574614065186e-05,
"loss": 1.2622,
"step": 97
},
{
"epoch": 0.45265588914549654,
"grad_norm": 0.6064709389902101,
"learning_rate": 4.716981132075472e-05,
"loss": 1.3209,
"step": 98
},
{
"epoch": 0.45727482678983833,
"grad_norm": 0.3841650978625775,
"learning_rate": 4.708404802744426e-05,
"loss": 1.3221,
"step": 99
},
{
"epoch": 0.4618937644341801,
"grad_norm": 0.5306346360303769,
"learning_rate": 4.699828473413379e-05,
"loss": 1.2391,
"step": 100
},
{
"epoch": 0.4665127020785219,
"grad_norm": 0.40624469524856405,
"learning_rate": 4.6912521440823335e-05,
"loss": 1.2917,
"step": 101
},
{
"epoch": 0.47113163972286376,
"grad_norm": 0.3748347930578789,
"learning_rate": 4.682675814751287e-05,
"loss": 1.2727,
"step": 102
},
{
"epoch": 0.47575057736720555,
"grad_norm": 0.39353112215227126,
"learning_rate": 4.6740994854202406e-05,
"loss": 1.2427,
"step": 103
},
{
"epoch": 0.48036951501154734,
"grad_norm": 0.4271086229392175,
"learning_rate": 4.665523156089194e-05,
"loss": 1.2567,
"step": 104
},
{
"epoch": 0.48498845265588914,
"grad_norm": 0.36741077006033196,
"learning_rate": 4.656946826758148e-05,
"loss": 1.2629,
"step": 105
},
{
"epoch": 0.4896073903002309,
"grad_norm": 0.46352758388784876,
"learning_rate": 4.648370497427101e-05,
"loss": 1.2285,
"step": 106
},
{
"epoch": 0.4942263279445728,
"grad_norm": 0.4176994384483685,
"learning_rate": 4.639794168096055e-05,
"loss": 1.2691,
"step": 107
},
{
"epoch": 0.49884526558891457,
"grad_norm": 0.40189683263467985,
"learning_rate": 4.631217838765009e-05,
"loss": 1.2682,
"step": 108
},
{
"epoch": 0.5034642032332564,
"grad_norm": 0.42699205422680747,
"learning_rate": 4.6226415094339625e-05,
"loss": 1.2517,
"step": 109
},
{
"epoch": 0.5080831408775982,
"grad_norm": 0.4698753587746509,
"learning_rate": 4.614065180102916e-05,
"loss": 1.2393,
"step": 110
},
{
"epoch": 0.5127020785219399,
"grad_norm": 0.3504204810683879,
"learning_rate": 4.6054888507718697e-05,
"loss": 1.2566,
"step": 111
},
{
"epoch": 0.5173210161662818,
"grad_norm": 0.4524525116850949,
"learning_rate": 4.596912521440824e-05,
"loss": 1.2733,
"step": 112
},
{
"epoch": 0.5219399538106235,
"grad_norm": 0.4230656926639352,
"learning_rate": 4.5883361921097774e-05,
"loss": 1.2319,
"step": 113
},
{
"epoch": 0.5265588914549654,
"grad_norm": 0.3820384658447351,
"learning_rate": 4.579759862778731e-05,
"loss": 1.2558,
"step": 114
},
{
"epoch": 0.5311778290993071,
"grad_norm": 0.3665450230109494,
"learning_rate": 4.5711835334476845e-05,
"loss": 1.2881,
"step": 115
},
{
"epoch": 0.535796766743649,
"grad_norm": 0.4087483894523286,
"learning_rate": 4.562607204116639e-05,
"loss": 1.26,
"step": 116
},
{
"epoch": 0.5404157043879908,
"grad_norm": 0.41146166918870436,
"learning_rate": 4.554030874785592e-05,
"loss": 1.2322,
"step": 117
},
{
"epoch": 0.5450346420323325,
"grad_norm": 0.4055034728492308,
"learning_rate": 4.545454545454546e-05,
"loss": 1.2624,
"step": 118
},
{
"epoch": 0.5496535796766744,
"grad_norm": 0.4076002197629925,
"learning_rate": 4.5368782161234994e-05,
"loss": 1.2132,
"step": 119
},
{
"epoch": 0.5542725173210161,
"grad_norm": 0.4096656226141803,
"learning_rate": 4.528301886792453e-05,
"loss": 1.212,
"step": 120
},
{
"epoch": 0.558891454965358,
"grad_norm": 0.45292299348582127,
"learning_rate": 4.5197255574614065e-05,
"loss": 1.2521,
"step": 121
},
{
"epoch": 0.5635103926096998,
"grad_norm": 0.47691080402396246,
"learning_rate": 4.51114922813036e-05,
"loss": 1.2124,
"step": 122
},
{
"epoch": 0.5681293302540416,
"grad_norm": 0.3672263347476514,
"learning_rate": 4.502572898799314e-05,
"loss": 1.255,
"step": 123
},
{
"epoch": 0.5727482678983834,
"grad_norm": 0.5135372006027548,
"learning_rate": 4.493996569468268e-05,
"loss": 1.264,
"step": 124
},
{
"epoch": 0.5773672055427251,
"grad_norm": 0.4186420191814305,
"learning_rate": 4.4854202401372214e-05,
"loss": 1.2311,
"step": 125
},
{
"epoch": 0.581986143187067,
"grad_norm": 0.3960702283494651,
"learning_rate": 4.476843910806175e-05,
"loss": 1.2213,
"step": 126
},
{
"epoch": 0.5866050808314087,
"grad_norm": 0.45841552166164706,
"learning_rate": 4.468267581475129e-05,
"loss": 1.2546,
"step": 127
},
{
"epoch": 0.5912240184757506,
"grad_norm": 0.5540593651816136,
"learning_rate": 4.459691252144083e-05,
"loss": 1.2485,
"step": 128
},
{
"epoch": 0.5958429561200924,
"grad_norm": 0.4350841892664989,
"learning_rate": 4.451114922813036e-05,
"loss": 1.1779,
"step": 129
},
{
"epoch": 0.6004618937644342,
"grad_norm": 0.41415465076661256,
"learning_rate": 4.4425385934819905e-05,
"loss": 1.2213,
"step": 130
},
{
"epoch": 0.605080831408776,
"grad_norm": 0.43260680346181374,
"learning_rate": 4.433962264150944e-05,
"loss": 1.1881,
"step": 131
},
{
"epoch": 0.6096997690531177,
"grad_norm": 0.38437793308584145,
"learning_rate": 4.4253859348198976e-05,
"loss": 1.2405,
"step": 132
},
{
"epoch": 0.6143187066974596,
"grad_norm": 0.3864903709709888,
"learning_rate": 4.416809605488851e-05,
"loss": 1.1905,
"step": 133
},
{
"epoch": 0.6189376443418014,
"grad_norm": 0.46335898784247814,
"learning_rate": 4.408233276157805e-05,
"loss": 1.2196,
"step": 134
},
{
"epoch": 0.6235565819861432,
"grad_norm": 0.39019784886633396,
"learning_rate": 4.399656946826758e-05,
"loss": 1.2307,
"step": 135
},
{
"epoch": 0.628175519630485,
"grad_norm": 0.39201754709973075,
"learning_rate": 4.391080617495712e-05,
"loss": 1.2197,
"step": 136
},
{
"epoch": 0.6327944572748267,
"grad_norm": 0.4472712625853919,
"learning_rate": 4.3825042881646653e-05,
"loss": 1.2654,
"step": 137
},
{
"epoch": 0.6374133949191686,
"grad_norm": 0.41394234119083717,
"learning_rate": 4.3739279588336196e-05,
"loss": 1.2084,
"step": 138
},
{
"epoch": 0.6420323325635104,
"grad_norm": 0.4390421584088183,
"learning_rate": 4.365351629502573e-05,
"loss": 1.1906,
"step": 139
},
{
"epoch": 0.6466512702078522,
"grad_norm": 0.3978764453848232,
"learning_rate": 4.356775300171527e-05,
"loss": 1.1921,
"step": 140
},
{
"epoch": 0.651270207852194,
"grad_norm": 0.4111320382670719,
"learning_rate": 4.34819897084048e-05,
"loss": 1.2029,
"step": 141
},
{
"epoch": 0.6558891454965358,
"grad_norm": 0.3936810917423322,
"learning_rate": 4.3396226415094345e-05,
"loss": 1.1991,
"step": 142
},
{
"epoch": 0.6605080831408776,
"grad_norm": 0.4311058139582661,
"learning_rate": 4.331046312178388e-05,
"loss": 1.1959,
"step": 143
},
{
"epoch": 0.6651270207852193,
"grad_norm": 0.43385297721471777,
"learning_rate": 4.3224699828473416e-05,
"loss": 1.1425,
"step": 144
},
{
"epoch": 0.6697459584295612,
"grad_norm": 0.3882223915682857,
"learning_rate": 4.313893653516296e-05,
"loss": 1.1969,
"step": 145
},
{
"epoch": 0.674364896073903,
"grad_norm": 0.4702262507572648,
"learning_rate": 4.305317324185249e-05,
"loss": 1.2106,
"step": 146
},
{
"epoch": 0.6789838337182448,
"grad_norm": 0.4051522632275429,
"learning_rate": 4.296740994854203e-05,
"loss": 1.2008,
"step": 147
},
{
"epoch": 0.6836027713625866,
"grad_norm": 0.5072626343154708,
"learning_rate": 4.2881646655231564e-05,
"loss": 1.2071,
"step": 148
},
{
"epoch": 0.6882217090069284,
"grad_norm": 0.37522436710756185,
"learning_rate": 4.27958833619211e-05,
"loss": 1.1822,
"step": 149
},
{
"epoch": 0.6928406466512702,
"grad_norm": 0.4774192692159993,
"learning_rate": 4.2710120068610635e-05,
"loss": 1.1542,
"step": 150
},
{
"epoch": 0.6974595842956121,
"grad_norm": 0.4054503494439086,
"learning_rate": 4.262435677530017e-05,
"loss": 1.1611,
"step": 151
},
{
"epoch": 0.7020785219399538,
"grad_norm": 0.39781436839318884,
"learning_rate": 4.2538593481989706e-05,
"loss": 1.1589,
"step": 152
},
{
"epoch": 0.7066974595842956,
"grad_norm": 0.43318797460846725,
"learning_rate": 4.245283018867925e-05,
"loss": 1.173,
"step": 153
},
{
"epoch": 0.7113163972286374,
"grad_norm": 0.4515699299945318,
"learning_rate": 4.2367066895368784e-05,
"loss": 1.1521,
"step": 154
},
{
"epoch": 0.7159353348729792,
"grad_norm": 0.4004890149526293,
"learning_rate": 4.228130360205832e-05,
"loss": 1.2057,
"step": 155
},
{
"epoch": 0.7205542725173211,
"grad_norm": 0.4512971327311992,
"learning_rate": 4.219554030874786e-05,
"loss": 1.2066,
"step": 156
},
{
"epoch": 0.7251732101616628,
"grad_norm": 0.49555002643676266,
"learning_rate": 4.21097770154374e-05,
"loss": 1.1706,
"step": 157
},
{
"epoch": 0.7297921478060047,
"grad_norm": 0.3945632904762031,
"learning_rate": 4.202401372212693e-05,
"loss": 1.1529,
"step": 158
},
{
"epoch": 0.7344110854503464,
"grad_norm": 0.4620285910286222,
"learning_rate": 4.193825042881647e-05,
"loss": 1.1383,
"step": 159
},
{
"epoch": 0.7390300230946882,
"grad_norm": 0.4694861109911682,
"learning_rate": 4.185248713550601e-05,
"loss": 1.1483,
"step": 160
},
{
"epoch": 0.74364896073903,
"grad_norm": 0.3946259584079765,
"learning_rate": 4.1766723842195546e-05,
"loss": 1.1912,
"step": 161
},
{
"epoch": 0.7482678983833718,
"grad_norm": 0.5020335583253374,
"learning_rate": 4.168096054888508e-05,
"loss": 1.1776,
"step": 162
},
{
"epoch": 0.7528868360277137,
"grad_norm": 0.37781975397095013,
"learning_rate": 4.159519725557462e-05,
"loss": 1.1142,
"step": 163
},
{
"epoch": 0.7575057736720554,
"grad_norm": 0.4502825316174222,
"learning_rate": 4.150943396226415e-05,
"loss": 1.1478,
"step": 164
},
{
"epoch": 0.7621247113163973,
"grad_norm": 0.42487579936476383,
"learning_rate": 4.142367066895369e-05,
"loss": 1.1822,
"step": 165
},
{
"epoch": 0.766743648960739,
"grad_norm": 0.43026335283586065,
"learning_rate": 4.1337907375643224e-05,
"loss": 1.1914,
"step": 166
},
{
"epoch": 0.7713625866050808,
"grad_norm": 0.5294097238904028,
"learning_rate": 4.125214408233276e-05,
"loss": 1.1951,
"step": 167
},
{
"epoch": 0.7759815242494227,
"grad_norm": 0.44665660025941395,
"learning_rate": 4.11663807890223e-05,
"loss": 1.127,
"step": 168
},
{
"epoch": 0.7806004618937644,
"grad_norm": 0.41481703928115893,
"learning_rate": 4.108061749571184e-05,
"loss": 1.1644,
"step": 169
},
{
"epoch": 0.7852193995381063,
"grad_norm": 0.4634424495916848,
"learning_rate": 4.099485420240137e-05,
"loss": 1.1684,
"step": 170
},
{
"epoch": 0.789838337182448,
"grad_norm": 0.4809687026819544,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.2085,
"step": 171
},
{
"epoch": 0.7944572748267898,
"grad_norm": 0.4345855305204371,
"learning_rate": 4.082332761578045e-05,
"loss": 1.1763,
"step": 172
},
{
"epoch": 0.7990762124711316,
"grad_norm": 0.5335722328326998,
"learning_rate": 4.0737564322469986e-05,
"loss": 1.1455,
"step": 173
},
{
"epoch": 0.8036951501154734,
"grad_norm": 0.4302694431659224,
"learning_rate": 4.065180102915952e-05,
"loss": 1.1956,
"step": 174
},
{
"epoch": 0.8083140877598153,
"grad_norm": 0.4498046936953925,
"learning_rate": 4.0566037735849064e-05,
"loss": 1.1647,
"step": 175
},
{
"epoch": 0.812933025404157,
"grad_norm": 0.4716640523412435,
"learning_rate": 4.04802744425386e-05,
"loss": 1.1407,
"step": 176
},
{
"epoch": 0.8175519630484989,
"grad_norm": 0.3816448249002052,
"learning_rate": 4.0394511149228135e-05,
"loss": 1.1629,
"step": 177
},
{
"epoch": 0.8221709006928406,
"grad_norm": 0.44281834810463583,
"learning_rate": 4.030874785591767e-05,
"loss": 1.1451,
"step": 178
},
{
"epoch": 0.8267898383371824,
"grad_norm": 0.42876636549312874,
"learning_rate": 4.0222984562607206e-05,
"loss": 1.1494,
"step": 179
},
{
"epoch": 0.8314087759815243,
"grad_norm": 0.3657965946096636,
"learning_rate": 4.013722126929674e-05,
"loss": 1.1514,
"step": 180
},
{
"epoch": 0.836027713625866,
"grad_norm": 0.4437557794898911,
"learning_rate": 4.0051457975986277e-05,
"loss": 1.1833,
"step": 181
},
{
"epoch": 0.8406466512702079,
"grad_norm": 0.3798813868447088,
"learning_rate": 3.996569468267582e-05,
"loss": 1.1463,
"step": 182
},
{
"epoch": 0.8452655889145496,
"grad_norm": 0.38652045663162626,
"learning_rate": 3.9879931389365354e-05,
"loss": 1.1719,
"step": 183
},
{
"epoch": 0.8498845265588915,
"grad_norm": 0.4881346497433177,
"learning_rate": 3.979416809605489e-05,
"loss": 1.1761,
"step": 184
},
{
"epoch": 0.8545034642032333,
"grad_norm": 0.3723967537366347,
"learning_rate": 3.9708404802744425e-05,
"loss": 1.1262,
"step": 185
},
{
"epoch": 0.859122401847575,
"grad_norm": 0.4019359410140912,
"learning_rate": 3.962264150943397e-05,
"loss": 1.1722,
"step": 186
},
{
"epoch": 0.8637413394919169,
"grad_norm": 0.37917713470995734,
"learning_rate": 3.95368782161235e-05,
"loss": 1.1083,
"step": 187
},
{
"epoch": 0.8683602771362586,
"grad_norm": 0.3996768943396843,
"learning_rate": 3.945111492281304e-05,
"loss": 1.1393,
"step": 188
},
{
"epoch": 0.8729792147806005,
"grad_norm": 0.33635184740200275,
"learning_rate": 3.9365351629502574e-05,
"loss": 1.1424,
"step": 189
},
{
"epoch": 0.8775981524249422,
"grad_norm": 0.34474204593919566,
"learning_rate": 3.9279588336192116e-05,
"loss": 1.106,
"step": 190
},
{
"epoch": 0.8822170900692841,
"grad_norm": 0.38139854668364576,
"learning_rate": 3.919382504288165e-05,
"loss": 1.1761,
"step": 191
},
{
"epoch": 0.8868360277136259,
"grad_norm": 0.414203964005048,
"learning_rate": 3.910806174957119e-05,
"loss": 1.1347,
"step": 192
},
{
"epoch": 0.8914549653579676,
"grad_norm": 0.3744183946839814,
"learning_rate": 3.902229845626072e-05,
"loss": 1.1527,
"step": 193
},
{
"epoch": 0.8960739030023095,
"grad_norm": 0.4509640126522507,
"learning_rate": 3.893653516295026e-05,
"loss": 1.1006,
"step": 194
},
{
"epoch": 0.9006928406466512,
"grad_norm": 0.3964963457875317,
"learning_rate": 3.8850771869639794e-05,
"loss": 1.1754,
"step": 195
},
{
"epoch": 0.9053117782909931,
"grad_norm": 0.4682966087347042,
"learning_rate": 3.876500857632933e-05,
"loss": 1.1523,
"step": 196
},
{
"epoch": 0.9099307159353349,
"grad_norm": 0.4385360930902435,
"learning_rate": 3.867924528301887e-05,
"loss": 1.1809,
"step": 197
},
{
"epoch": 0.9145496535796767,
"grad_norm": 0.472978339177182,
"learning_rate": 3.859348198970841e-05,
"loss": 1.1044,
"step": 198
},
{
"epoch": 0.9191685912240185,
"grad_norm": 0.39667114476727733,
"learning_rate": 3.850771869639794e-05,
"loss": 1.1276,
"step": 199
},
{
"epoch": 0.9237875288683602,
"grad_norm": 0.40220081756755266,
"learning_rate": 3.842195540308748e-05,
"loss": 1.1306,
"step": 200
},
{
"epoch": 0.9284064665127021,
"grad_norm": 0.36244953234324667,
"learning_rate": 3.833619210977702e-05,
"loss": 1.116,
"step": 201
},
{
"epoch": 0.9330254041570438,
"grad_norm": 0.3582572306082161,
"learning_rate": 3.8250428816466556e-05,
"loss": 1.0726,
"step": 202
},
{
"epoch": 0.9376443418013857,
"grad_norm": 0.3933040185142341,
"learning_rate": 3.816466552315609e-05,
"loss": 1.1178,
"step": 203
},
{
"epoch": 0.9422632794457275,
"grad_norm": 0.4138284478924474,
"learning_rate": 3.807890222984563e-05,
"loss": 1.1191,
"step": 204
},
{
"epoch": 0.9468822170900693,
"grad_norm": 0.40764816032039813,
"learning_rate": 3.799313893653517e-05,
"loss": 1.1507,
"step": 205
},
{
"epoch": 0.9515011547344111,
"grad_norm": 0.37219074275300856,
"learning_rate": 3.7907375643224705e-05,
"loss": 1.1216,
"step": 206
},
{
"epoch": 0.9561200923787528,
"grad_norm": 0.3898259714485093,
"learning_rate": 3.782161234991424e-05,
"loss": 1.1525,
"step": 207
},
{
"epoch": 0.9607390300230947,
"grad_norm": 0.3592868914763391,
"learning_rate": 3.7735849056603776e-05,
"loss": 1.1619,
"step": 208
},
{
"epoch": 0.9653579676674365,
"grad_norm": 0.3586440659199271,
"learning_rate": 3.765008576329331e-05,
"loss": 1.1442,
"step": 209
},
{
"epoch": 0.9699769053117783,
"grad_norm": 0.3866665377205709,
"learning_rate": 3.756432246998285e-05,
"loss": 1.1563,
"step": 210
},
{
"epoch": 0.9745958429561201,
"grad_norm": 0.3717389847088139,
"learning_rate": 3.747855917667238e-05,
"loss": 1.0655,
"step": 211
},
{
"epoch": 0.9792147806004619,
"grad_norm": 0.39340294038730006,
"learning_rate": 3.7392795883361925e-05,
"loss": 1.1545,
"step": 212
},
{
"epoch": 0.9838337182448037,
"grad_norm": 0.5061136403610701,
"learning_rate": 3.730703259005146e-05,
"loss": 1.16,
"step": 213
},
{
"epoch": 0.9884526558891455,
"grad_norm": 0.41044684366987416,
"learning_rate": 3.7221269296740996e-05,
"loss": 1.105,
"step": 214
},
{
"epoch": 0.9930715935334873,
"grad_norm": 0.41261635406819347,
"learning_rate": 3.713550600343053e-05,
"loss": 1.1117,
"step": 215
},
{
"epoch": 0.9976905311778291,
"grad_norm": 0.44235447542070044,
"learning_rate": 3.704974271012007e-05,
"loss": 1.1488,
"step": 216
},
{
"epoch": 1.0,
"grad_norm": 0.44235447542070044,
"learning_rate": 3.696397941680961e-05,
"loss": 1.1596,
"step": 217
},
{
"epoch": 1.0046189376443417,
"grad_norm": 0.6891295031581401,
"learning_rate": 3.6878216123499144e-05,
"loss": 0.9869,
"step": 218
},
{
"epoch": 1.0092378752886837,
"grad_norm": 0.4823167345929953,
"learning_rate": 3.679245283018868e-05,
"loss": 1.0036,
"step": 219
},
{
"epoch": 1.0138568129330254,
"grad_norm": 0.5227870163327095,
"learning_rate": 3.670668953687822e-05,
"loss": 0.9849,
"step": 220
},
{
"epoch": 1.0184757505773672,
"grad_norm": 0.47223058185124184,
"learning_rate": 3.662092624356776e-05,
"loss": 1.0332,
"step": 221
},
{
"epoch": 1.023094688221709,
"grad_norm": 0.5338243169949252,
"learning_rate": 3.653516295025729e-05,
"loss": 1.0014,
"step": 222
},
{
"epoch": 1.0277136258660509,
"grad_norm": 0.4548742976129423,
"learning_rate": 3.644939965694683e-05,
"loss": 0.9646,
"step": 223
},
{
"epoch": 1.0323325635103926,
"grad_norm": 0.49580933432368174,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.9218,
"step": 224
},
{
"epoch": 1.0369515011547343,
"grad_norm": 0.3908063058149935,
"learning_rate": 3.62778730703259e-05,
"loss": 1.0165,
"step": 225
},
{
"epoch": 1.0415704387990763,
"grad_norm": 0.4021977249153246,
"learning_rate": 3.6192109777015435e-05,
"loss": 1.0025,
"step": 226
},
{
"epoch": 1.046189376443418,
"grad_norm": 0.40884622514618685,
"learning_rate": 3.610634648370498e-05,
"loss": 1.0464,
"step": 227
},
{
"epoch": 1.0508083140877598,
"grad_norm": 0.3666010689145339,
"learning_rate": 3.602058319039451e-05,
"loss": 0.9519,
"step": 228
},
{
"epoch": 1.0554272517321017,
"grad_norm": 0.46877386467619137,
"learning_rate": 3.593481989708405e-05,
"loss": 0.9789,
"step": 229
},
{
"epoch": 1.0600461893764435,
"grad_norm": 0.3649178736269907,
"learning_rate": 3.5849056603773584e-05,
"loss": 0.9565,
"step": 230
},
{
"epoch": 1.0646651270207852,
"grad_norm": 0.43217782600814103,
"learning_rate": 3.5763293310463126e-05,
"loss": 1.025,
"step": 231
},
{
"epoch": 1.069284064665127,
"grad_norm": 0.37632921782268824,
"learning_rate": 3.567753001715266e-05,
"loss": 0.9578,
"step": 232
},
{
"epoch": 1.073903002309469,
"grad_norm": 0.39151071917799973,
"learning_rate": 3.55917667238422e-05,
"loss": 0.939,
"step": 233
},
{
"epoch": 1.0785219399538106,
"grad_norm": 0.446725915299358,
"learning_rate": 3.550600343053174e-05,
"loss": 1.0117,
"step": 234
},
{
"epoch": 1.0831408775981524,
"grad_norm": 0.3437917462374703,
"learning_rate": 3.5420240137221275e-05,
"loss": 0.979,
"step": 235
},
{
"epoch": 1.0877598152424943,
"grad_norm": 0.40118919914744827,
"learning_rate": 3.533447684391081e-05,
"loss": 1.0272,
"step": 236
},
{
"epoch": 1.092378752886836,
"grad_norm": 0.38833207440701323,
"learning_rate": 3.5248713550600346e-05,
"loss": 0.9728,
"step": 237
},
{
"epoch": 1.0969976905311778,
"grad_norm": 0.37470626615759534,
"learning_rate": 3.516295025728988e-05,
"loss": 1.0039,
"step": 238
},
{
"epoch": 1.1016166281755195,
"grad_norm": 0.37832376622721,
"learning_rate": 3.507718696397942e-05,
"loss": 0.9709,
"step": 239
},
{
"epoch": 1.1062355658198615,
"grad_norm": 0.37575277543970614,
"learning_rate": 3.499142367066895e-05,
"loss": 0.9598,
"step": 240
},
{
"epoch": 1.1108545034642032,
"grad_norm": 0.3918084045585946,
"learning_rate": 3.490566037735849e-05,
"loss": 1.0202,
"step": 241
},
{
"epoch": 1.115473441108545,
"grad_norm": 0.33667356514343355,
"learning_rate": 3.481989708404803e-05,
"loss": 0.9545,
"step": 242
},
{
"epoch": 1.120092378752887,
"grad_norm": 0.3604987673382122,
"learning_rate": 3.4734133790737566e-05,
"loss": 0.984,
"step": 243
},
{
"epoch": 1.1247113163972287,
"grad_norm": 0.37223726236772037,
"learning_rate": 3.46483704974271e-05,
"loss": 0.9814,
"step": 244
},
{
"epoch": 1.1293302540415704,
"grad_norm": 0.34939465558922544,
"learning_rate": 3.456260720411664e-05,
"loss": 0.9816,
"step": 245
},
{
"epoch": 1.1339491916859123,
"grad_norm": 0.3573912519949966,
"learning_rate": 3.447684391080618e-05,
"loss": 0.9972,
"step": 246
},
{
"epoch": 1.138568129330254,
"grad_norm": 0.4031819480781697,
"learning_rate": 3.4391080617495715e-05,
"loss": 0.9881,
"step": 247
},
{
"epoch": 1.1431870669745958,
"grad_norm": 0.3523551067512535,
"learning_rate": 3.430531732418525e-05,
"loss": 0.9492,
"step": 248
},
{
"epoch": 1.1478060046189376,
"grad_norm": 0.352131805344466,
"learning_rate": 3.421955403087479e-05,
"loss": 0.9465,
"step": 249
},
{
"epoch": 1.1524249422632795,
"grad_norm": 0.3916733420640088,
"learning_rate": 3.413379073756433e-05,
"loss": 0.9806,
"step": 250
},
{
"epoch": 1.1570438799076213,
"grad_norm": 0.42565127210055675,
"learning_rate": 3.404802744425386e-05,
"loss": 0.9895,
"step": 251
},
{
"epoch": 1.161662817551963,
"grad_norm": 0.4359240208343157,
"learning_rate": 3.39622641509434e-05,
"loss": 0.9896,
"step": 252
},
{
"epoch": 1.1662817551963047,
"grad_norm": 0.3733987980120564,
"learning_rate": 3.3876500857632934e-05,
"loss": 1.011,
"step": 253
},
{
"epoch": 1.1709006928406467,
"grad_norm": 0.48528415648027595,
"learning_rate": 3.379073756432247e-05,
"loss": 0.9554,
"step": 254
},
{
"epoch": 1.1755196304849884,
"grad_norm": 0.36122469936856433,
"learning_rate": 3.3704974271012005e-05,
"loss": 1.0107,
"step": 255
},
{
"epoch": 1.1801385681293302,
"grad_norm": 0.4741466726987846,
"learning_rate": 3.361921097770154e-05,
"loss": 0.9542,
"step": 256
},
{
"epoch": 1.1847575057736721,
"grad_norm": 0.33485042589554137,
"learning_rate": 3.353344768439108e-05,
"loss": 0.9601,
"step": 257
},
{
"epoch": 1.1893764434180139,
"grad_norm": 0.3602020831708084,
"learning_rate": 3.344768439108062e-05,
"loss": 0.9727,
"step": 258
},
{
"epoch": 1.1939953810623556,
"grad_norm": 0.39708980657754167,
"learning_rate": 3.3361921097770154e-05,
"loss": 0.9544,
"step": 259
},
{
"epoch": 1.1986143187066975,
"grad_norm": 0.34738638952867634,
"learning_rate": 3.3276157804459696e-05,
"loss": 1.013,
"step": 260
},
{
"epoch": 1.2032332563510393,
"grad_norm": 0.38860008525544176,
"learning_rate": 3.319039451114923e-05,
"loss": 0.9391,
"step": 261
},
{
"epoch": 1.207852193995381,
"grad_norm": 0.3808491207463103,
"learning_rate": 3.310463121783877e-05,
"loss": 0.9267,
"step": 262
},
{
"epoch": 1.212471131639723,
"grad_norm": 0.37033056794856956,
"learning_rate": 3.30188679245283e-05,
"loss": 0.9468,
"step": 263
},
{
"epoch": 1.2170900692840647,
"grad_norm": 0.4029735877629849,
"learning_rate": 3.2933104631217845e-05,
"loss": 0.9671,
"step": 264
},
{
"epoch": 1.2217090069284064,
"grad_norm": 0.3784712748228157,
"learning_rate": 3.284734133790738e-05,
"loss": 0.9889,
"step": 265
},
{
"epoch": 1.2263279445727482,
"grad_norm": 0.3958356147198165,
"learning_rate": 3.2761578044596916e-05,
"loss": 0.9514,
"step": 266
},
{
"epoch": 1.2309468822170901,
"grad_norm": 0.4353883109645583,
"learning_rate": 3.267581475128645e-05,
"loss": 0.9293,
"step": 267
},
{
"epoch": 1.2355658198614319,
"grad_norm": 0.372340134824253,
"learning_rate": 3.259005145797599e-05,
"loss": 0.915,
"step": 268
},
{
"epoch": 1.2401847575057736,
"grad_norm": 0.3967485369979095,
"learning_rate": 3.250428816466552e-05,
"loss": 0.9182,
"step": 269
},
{
"epoch": 1.2448036951501154,
"grad_norm": 0.3768111099406703,
"learning_rate": 3.241852487135506e-05,
"loss": 0.9524,
"step": 270
},
{
"epoch": 1.2494226327944573,
"grad_norm": 0.38199479309443446,
"learning_rate": 3.2332761578044594e-05,
"loss": 1.0,
"step": 271
},
{
"epoch": 1.254041570438799,
"grad_norm": 0.4126184019390743,
"learning_rate": 3.2246998284734136e-05,
"loss": 0.9338,
"step": 272
},
{
"epoch": 1.2586605080831408,
"grad_norm": 0.3740818698914696,
"learning_rate": 3.216123499142367e-05,
"loss": 0.9422,
"step": 273
},
{
"epoch": 1.2632794457274827,
"grad_norm": 0.4037478736079324,
"learning_rate": 3.207547169811321e-05,
"loss": 0.9703,
"step": 274
},
{
"epoch": 1.2678983833718245,
"grad_norm": 0.3643958838531465,
"learning_rate": 3.198970840480275e-05,
"loss": 0.9365,
"step": 275
},
{
"epoch": 1.2725173210161662,
"grad_norm": 0.384951445523061,
"learning_rate": 3.1903945111492285e-05,
"loss": 0.945,
"step": 276
},
{
"epoch": 1.2771362586605082,
"grad_norm": 0.39364858966626226,
"learning_rate": 3.181818181818182e-05,
"loss": 0.9276,
"step": 277
},
{
"epoch": 1.28175519630485,
"grad_norm": 0.44785865893725496,
"learning_rate": 3.1732418524871356e-05,
"loss": 0.9517,
"step": 278
},
{
"epoch": 1.2863741339491916,
"grad_norm": 0.43759086590723906,
"learning_rate": 3.16466552315609e-05,
"loss": 0.932,
"step": 279
},
{
"epoch": 1.2909930715935336,
"grad_norm": 0.3746649335316895,
"learning_rate": 3.1560891938250434e-05,
"loss": 0.9427,
"step": 280
},
{
"epoch": 1.2956120092378753,
"grad_norm": 0.4015363902029942,
"learning_rate": 3.147512864493997e-05,
"loss": 0.9575,
"step": 281
},
{
"epoch": 1.300230946882217,
"grad_norm": 0.3999632210277524,
"learning_rate": 3.1389365351629505e-05,
"loss": 0.9238,
"step": 282
},
{
"epoch": 1.3048498845265588,
"grad_norm": 0.3974520315782316,
"learning_rate": 3.130360205831904e-05,
"loss": 0.9914,
"step": 283
},
{
"epoch": 1.3094688221709008,
"grad_norm": 0.4697264448476519,
"learning_rate": 3.1217838765008576e-05,
"loss": 0.9036,
"step": 284
},
{
"epoch": 1.3140877598152425,
"grad_norm": 0.4025278695786773,
"learning_rate": 3.113207547169811e-05,
"loss": 0.8741,
"step": 285
},
{
"epoch": 1.3187066974595842,
"grad_norm": 0.38046308767631387,
"learning_rate": 3.1046312178387653e-05,
"loss": 0.9746,
"step": 286
},
{
"epoch": 1.323325635103926,
"grad_norm": 0.41165903924900493,
"learning_rate": 3.096054888507719e-05,
"loss": 0.885,
"step": 287
},
{
"epoch": 1.327944572748268,
"grad_norm": 0.3226449054439531,
"learning_rate": 3.0874785591766724e-05,
"loss": 0.9641,
"step": 288
},
{
"epoch": 1.3325635103926097,
"grad_norm": 0.3836633146406067,
"learning_rate": 3.078902229845626e-05,
"loss": 0.9268,
"step": 289
},
{
"epoch": 1.3371824480369514,
"grad_norm": 0.36814960073891967,
"learning_rate": 3.07032590051458e-05,
"loss": 0.8925,
"step": 290
},
{
"epoch": 1.3418013856812934,
"grad_norm": 0.37708063495806354,
"learning_rate": 3.061749571183534e-05,
"loss": 0.9363,
"step": 291
},
{
"epoch": 1.346420323325635,
"grad_norm": 0.342382310644866,
"learning_rate": 3.053173241852487e-05,
"loss": 0.9368,
"step": 292
},
{
"epoch": 1.3510392609699768,
"grad_norm": 0.41245467212929415,
"learning_rate": 3.044596912521441e-05,
"loss": 0.8805,
"step": 293
},
{
"epoch": 1.3556581986143188,
"grad_norm": 0.3955572050429705,
"learning_rate": 3.0360205831903948e-05,
"loss": 0.9836,
"step": 294
},
{
"epoch": 1.3602771362586605,
"grad_norm": 0.38011156892555,
"learning_rate": 3.0274442538593483e-05,
"loss": 0.8983,
"step": 295
},
{
"epoch": 1.3648960739030023,
"grad_norm": 0.3545202413938311,
"learning_rate": 3.018867924528302e-05,
"loss": 0.9088,
"step": 296
},
{
"epoch": 1.3695150115473442,
"grad_norm": 0.3380964662771741,
"learning_rate": 3.0102915951972554e-05,
"loss": 0.9715,
"step": 297
},
{
"epoch": 1.374133949191686,
"grad_norm": 0.3556332600444117,
"learning_rate": 3.0017152658662096e-05,
"loss": 0.9631,
"step": 298
},
{
"epoch": 1.3787528868360277,
"grad_norm": 0.3180445175865554,
"learning_rate": 2.9931389365351632e-05,
"loss": 0.9544,
"step": 299
},
{
"epoch": 1.3833718244803694,
"grad_norm": 0.36900301453307965,
"learning_rate": 2.9845626072041167e-05,
"loss": 0.9662,
"step": 300
},
{
"epoch": 1.3879907621247114,
"grad_norm": 0.31564149940364516,
"learning_rate": 2.9759862778730706e-05,
"loss": 0.9354,
"step": 301
},
{
"epoch": 1.3926096997690531,
"grad_norm": 0.3935622013359831,
"learning_rate": 2.9674099485420242e-05,
"loss": 0.8987,
"step": 302
},
{
"epoch": 1.3972286374133949,
"grad_norm": 0.34557682582990706,
"learning_rate": 2.9588336192109777e-05,
"loss": 0.9599,
"step": 303
},
{
"epoch": 1.4018475750577366,
"grad_norm": 0.41068086387166536,
"learning_rate": 2.9502572898799313e-05,
"loss": 0.9381,
"step": 304
},
{
"epoch": 1.4064665127020786,
"grad_norm": 0.4078171186449019,
"learning_rate": 2.9416809605488855e-05,
"loss": 0.9069,
"step": 305
},
{
"epoch": 1.4110854503464203,
"grad_norm": 0.3609839247569862,
"learning_rate": 2.933104631217839e-05,
"loss": 0.8944,
"step": 306
},
{
"epoch": 1.415704387990762,
"grad_norm": 0.3704153941585653,
"learning_rate": 2.9245283018867926e-05,
"loss": 0.9565,
"step": 307
},
{
"epoch": 1.420323325635104,
"grad_norm": 0.41866891003218715,
"learning_rate": 2.915951972555746e-05,
"loss": 0.9516,
"step": 308
},
{
"epoch": 1.4249422632794457,
"grad_norm": 0.38372292384367024,
"learning_rate": 2.9073756432247e-05,
"loss": 0.9227,
"step": 309
},
{
"epoch": 1.4295612009237875,
"grad_norm": 0.4521918809129475,
"learning_rate": 2.8987993138936536e-05,
"loss": 0.914,
"step": 310
},
{
"epoch": 1.4341801385681294,
"grad_norm": 0.43389302652647216,
"learning_rate": 2.890222984562607e-05,
"loss": 0.9708,
"step": 311
},
{
"epoch": 1.4387990762124712,
"grad_norm": 0.34583731680017876,
"learning_rate": 2.8816466552315614e-05,
"loss": 0.9048,
"step": 312
},
{
"epoch": 1.443418013856813,
"grad_norm": 0.42294216257734296,
"learning_rate": 2.873070325900515e-05,
"loss": 0.9576,
"step": 313
},
{
"epoch": 1.4480369515011549,
"grad_norm": 0.410333052951403,
"learning_rate": 2.8644939965694685e-05,
"loss": 0.946,
"step": 314
},
{
"epoch": 1.4526558891454966,
"grad_norm": 0.46458011329290766,
"learning_rate": 2.855917667238422e-05,
"loss": 0.9488,
"step": 315
},
{
"epoch": 1.4572748267898383,
"grad_norm": 0.3934720699719875,
"learning_rate": 2.847341337907376e-05,
"loss": 0.9193,
"step": 316
},
{
"epoch": 1.46189376443418,
"grad_norm": 0.39775537654004506,
"learning_rate": 2.8387650085763295e-05,
"loss": 0.937,
"step": 317
},
{
"epoch": 1.4665127020785218,
"grad_norm": 0.37407063469756424,
"learning_rate": 2.830188679245283e-05,
"loss": 0.9263,
"step": 318
},
{
"epoch": 1.4711316397228638,
"grad_norm": 0.409865293791187,
"learning_rate": 2.8216123499142366e-05,
"loss": 0.8727,
"step": 319
},
{
"epoch": 1.4757505773672055,
"grad_norm": 0.3290861816675839,
"learning_rate": 2.8130360205831908e-05,
"loss": 0.9069,
"step": 320
},
{
"epoch": 1.4803695150115472,
"grad_norm": 0.35588786632379504,
"learning_rate": 2.8044596912521443e-05,
"loss": 0.9249,
"step": 321
},
{
"epoch": 1.4849884526558892,
"grad_norm": 0.36210399162092216,
"learning_rate": 2.795883361921098e-05,
"loss": 0.931,
"step": 322
},
{
"epoch": 1.489607390300231,
"grad_norm": 0.3628482774174395,
"learning_rate": 2.7873070325900514e-05,
"loss": 0.9102,
"step": 323
},
{
"epoch": 1.4942263279445727,
"grad_norm": 0.36820800470040616,
"learning_rate": 2.7787307032590053e-05,
"loss": 0.8871,
"step": 324
},
{
"epoch": 1.4988452655889146,
"grad_norm": 0.3587615678074862,
"learning_rate": 2.770154373927959e-05,
"loss": 0.9014,
"step": 325
},
{
"epoch": 1.5034642032332564,
"grad_norm": 0.39157974666916345,
"learning_rate": 2.7615780445969124e-05,
"loss": 0.923,
"step": 326
},
{
"epoch": 1.508083140877598,
"grad_norm": 0.35626311294526025,
"learning_rate": 2.7530017152658667e-05,
"loss": 0.8638,
"step": 327
},
{
"epoch": 1.51270207852194,
"grad_norm": 0.44031914960883256,
"learning_rate": 2.7444253859348202e-05,
"loss": 0.8825,
"step": 328
},
{
"epoch": 1.5173210161662818,
"grad_norm": 0.381125215675271,
"learning_rate": 2.7358490566037738e-05,
"loss": 0.9065,
"step": 329
},
{
"epoch": 1.5219399538106235,
"grad_norm": 0.43977635119282493,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.9128,
"step": 330
},
{
"epoch": 1.5265588914549655,
"grad_norm": 0.4208941191882328,
"learning_rate": 2.7186963979416812e-05,
"loss": 0.8756,
"step": 331
},
{
"epoch": 1.531177829099307,
"grad_norm": 0.3860111673520812,
"learning_rate": 2.7101200686106348e-05,
"loss": 0.9002,
"step": 332
},
{
"epoch": 1.535796766743649,
"grad_norm": 0.4342667720275487,
"learning_rate": 2.7015437392795883e-05,
"loss": 0.8825,
"step": 333
},
{
"epoch": 1.540415704387991,
"grad_norm": 0.37734526564351245,
"learning_rate": 2.692967409948542e-05,
"loss": 0.9385,
"step": 334
},
{
"epoch": 1.5450346420323324,
"grad_norm": 0.39206017342416394,
"learning_rate": 2.684391080617496e-05,
"loss": 0.865,
"step": 335
},
{
"epoch": 1.5496535796766744,
"grad_norm": 0.3729387010672457,
"learning_rate": 2.6758147512864496e-05,
"loss": 0.9211,
"step": 336
},
{
"epoch": 1.5542725173210161,
"grad_norm": 0.3541644290930152,
"learning_rate": 2.6672384219554032e-05,
"loss": 0.9078,
"step": 337
},
{
"epoch": 1.5588914549653579,
"grad_norm": 0.34755252926582025,
"learning_rate": 2.658662092624357e-05,
"loss": 0.9103,
"step": 338
},
{
"epoch": 1.5635103926096998,
"grad_norm": 0.32500629115515567,
"learning_rate": 2.6500857632933106e-05,
"loss": 0.8961,
"step": 339
},
{
"epoch": 1.5681293302540416,
"grad_norm": 0.3436952289017308,
"learning_rate": 2.641509433962264e-05,
"loss": 0.8974,
"step": 340
},
{
"epoch": 1.5727482678983833,
"grad_norm": 0.3734993967368118,
"learning_rate": 2.6329331046312177e-05,
"loss": 0.9208,
"step": 341
},
{
"epoch": 1.5773672055427252,
"grad_norm": 0.32351797254895637,
"learning_rate": 2.624356775300172e-05,
"loss": 0.908,
"step": 342
},
{
"epoch": 1.581986143187067,
"grad_norm": 0.38475950268548875,
"learning_rate": 2.6157804459691255e-05,
"loss": 0.8858,
"step": 343
},
{
"epoch": 1.5866050808314087,
"grad_norm": 0.3599269609045405,
"learning_rate": 2.607204116638079e-05,
"loss": 0.9002,
"step": 344
},
{
"epoch": 1.5912240184757507,
"grad_norm": 0.33015227344594983,
"learning_rate": 2.5986277873070326e-05,
"loss": 0.8852,
"step": 345
},
{
"epoch": 1.5958429561200924,
"grad_norm": 0.3840233987858635,
"learning_rate": 2.5900514579759865e-05,
"loss": 0.8845,
"step": 346
},
{
"epoch": 1.6004618937644342,
"grad_norm": 0.3387963826306509,
"learning_rate": 2.58147512864494e-05,
"loss": 0.8531,
"step": 347
},
{
"epoch": 1.605080831408776,
"grad_norm": 0.3791687159061299,
"learning_rate": 2.5728987993138936e-05,
"loss": 0.9232,
"step": 348
},
{
"epoch": 1.6096997690531176,
"grad_norm": 0.5409865724311915,
"learning_rate": 2.564322469982847e-05,
"loss": 0.8184,
"step": 349
},
{
"epoch": 1.6143187066974596,
"grad_norm": 0.3738772595287673,
"learning_rate": 2.5557461406518014e-05,
"loss": 0.8712,
"step": 350
},
{
"epoch": 1.6189376443418015,
"grad_norm": 0.36639340662031855,
"learning_rate": 2.547169811320755e-05,
"loss": 0.8836,
"step": 351
},
{
"epoch": 1.623556581986143,
"grad_norm": 0.3112957956622142,
"learning_rate": 2.5385934819897085e-05,
"loss": 0.8731,
"step": 352
},
{
"epoch": 1.628175519630485,
"grad_norm": 0.3874481637023455,
"learning_rate": 2.5300171526586624e-05,
"loss": 0.8626,
"step": 353
},
{
"epoch": 1.6327944572748267,
"grad_norm": 0.38856688767517716,
"learning_rate": 2.521440823327616e-05,
"loss": 0.895,
"step": 354
},
{
"epoch": 1.6374133949191685,
"grad_norm": 0.34652251353621616,
"learning_rate": 2.5128644939965695e-05,
"loss": 0.8582,
"step": 355
},
{
"epoch": 1.6420323325635104,
"grad_norm": 0.3911492831746351,
"learning_rate": 2.504288164665523e-05,
"loss": 0.8806,
"step": 356
},
{
"epoch": 1.6466512702078522,
"grad_norm": 0.34086893912173744,
"learning_rate": 2.495711835334477e-05,
"loss": 0.8753,
"step": 357
},
{
"epoch": 1.651270207852194,
"grad_norm": 0.34494349485579706,
"learning_rate": 2.4871355060034308e-05,
"loss": 0.8662,
"step": 358
},
{
"epoch": 1.6558891454965359,
"grad_norm": 0.3378824008408299,
"learning_rate": 2.4785591766723843e-05,
"loss": 0.9184,
"step": 359
},
{
"epoch": 1.6605080831408776,
"grad_norm": 0.33853652605072154,
"learning_rate": 2.4699828473413382e-05,
"loss": 0.9263,
"step": 360
},
{
"epoch": 1.6651270207852193,
"grad_norm": 0.3318268519490256,
"learning_rate": 2.4614065180102914e-05,
"loss": 0.8856,
"step": 361
},
{
"epoch": 1.6697459584295613,
"grad_norm": 0.32643514443358607,
"learning_rate": 2.4528301886792453e-05,
"loss": 0.9142,
"step": 362
},
{
"epoch": 1.674364896073903,
"grad_norm": 0.3091357991096254,
"learning_rate": 2.4442538593481992e-05,
"loss": 0.8528,
"step": 363
},
{
"epoch": 1.6789838337182448,
"grad_norm": 0.3200201992367394,
"learning_rate": 2.4356775300171528e-05,
"loss": 0.8665,
"step": 364
},
{
"epoch": 1.6836027713625867,
"grad_norm": 0.3235166023807737,
"learning_rate": 2.4271012006861067e-05,
"loss": 0.8416,
"step": 365
},
{
"epoch": 1.6882217090069283,
"grad_norm": 0.3452409051766759,
"learning_rate": 2.4185248713550602e-05,
"loss": 0.8733,
"step": 366
},
{
"epoch": 1.6928406466512702,
"grad_norm": 0.3528699743692465,
"learning_rate": 2.409948542024014e-05,
"loss": 0.8392,
"step": 367
},
{
"epoch": 1.6974595842956122,
"grad_norm": 0.31607024182040927,
"learning_rate": 2.4013722126929673e-05,
"loss": 0.8836,
"step": 368
},
{
"epoch": 1.7020785219399537,
"grad_norm": 0.3626530120521926,
"learning_rate": 2.3927958833619212e-05,
"loss": 0.9006,
"step": 369
},
{
"epoch": 1.7066974595842956,
"grad_norm": 0.3256927415805486,
"learning_rate": 2.3842195540308747e-05,
"loss": 0.8562,
"step": 370
},
{
"epoch": 1.7113163972286374,
"grad_norm": 0.3403222130919274,
"learning_rate": 2.3756432246998286e-05,
"loss": 0.9014,
"step": 371
},
{
"epoch": 1.7159353348729791,
"grad_norm": 0.36730283480958964,
"learning_rate": 2.3670668953687822e-05,
"loss": 0.8691,
"step": 372
},
{
"epoch": 1.720554272517321,
"grad_norm": 0.3561126742634618,
"learning_rate": 2.358490566037736e-05,
"loss": 0.9024,
"step": 373
},
{
"epoch": 1.7251732101616628,
"grad_norm": 0.3297313643164702,
"learning_rate": 2.3499142367066896e-05,
"loss": 0.8638,
"step": 374
},
{
"epoch": 1.7297921478060045,
"grad_norm": 0.326908599827038,
"learning_rate": 2.3413379073756435e-05,
"loss": 0.8791,
"step": 375
},
{
"epoch": 1.7344110854503465,
"grad_norm": 0.37833346487473063,
"learning_rate": 2.332761578044597e-05,
"loss": 0.8907,
"step": 376
},
{
"epoch": 1.7390300230946882,
"grad_norm": 0.3135862511681741,
"learning_rate": 2.3241852487135506e-05,
"loss": 0.8207,
"step": 377
},
{
"epoch": 1.74364896073903,
"grad_norm": 0.352771369562728,
"learning_rate": 2.3156089193825045e-05,
"loss": 0.8703,
"step": 378
},
{
"epoch": 1.748267898383372,
"grad_norm": 0.34199641492812893,
"learning_rate": 2.307032590051458e-05,
"loss": 0.8907,
"step": 379
},
{
"epoch": 1.7528868360277137,
"grad_norm": 0.318788768465171,
"learning_rate": 2.298456260720412e-05,
"loss": 0.8415,
"step": 380
},
{
"epoch": 1.7575057736720554,
"grad_norm": 0.33466382448926185,
"learning_rate": 2.2898799313893655e-05,
"loss": 0.8573,
"step": 381
},
{
"epoch": 1.7621247113163974,
"grad_norm": 0.3260617497365741,
"learning_rate": 2.2813036020583194e-05,
"loss": 0.8656,
"step": 382
},
{
"epoch": 1.7667436489607389,
"grad_norm": 0.317254014182347,
"learning_rate": 2.272727272727273e-05,
"loss": 0.8237,
"step": 383
},
{
"epoch": 1.7713625866050808,
"grad_norm": 0.3364939466647194,
"learning_rate": 2.2641509433962265e-05,
"loss": 0.8697,
"step": 384
},
{
"epoch": 1.7759815242494228,
"grad_norm": 0.33438809571374783,
"learning_rate": 2.25557461406518e-05,
"loss": 0.9011,
"step": 385
},
{
"epoch": 1.7806004618937643,
"grad_norm": 0.33134388703225337,
"learning_rate": 2.246998284734134e-05,
"loss": 0.8364,
"step": 386
},
{
"epoch": 1.7852193995381063,
"grad_norm": 0.34624983658858305,
"learning_rate": 2.2384219554030875e-05,
"loss": 0.8683,
"step": 387
},
{
"epoch": 1.789838337182448,
"grad_norm": 0.34158478682929055,
"learning_rate": 2.2298456260720414e-05,
"loss": 0.8389,
"step": 388
},
{
"epoch": 1.7944572748267897,
"grad_norm": 0.3616294808136299,
"learning_rate": 2.2212692967409952e-05,
"loss": 0.8655,
"step": 389
},
{
"epoch": 1.7990762124711317,
"grad_norm": 0.33102365916991044,
"learning_rate": 2.2126929674099488e-05,
"loss": 0.8658,
"step": 390
},
{
"epoch": 1.8036951501154734,
"grad_norm": 0.338356513264304,
"learning_rate": 2.2041166380789023e-05,
"loss": 0.8732,
"step": 391
},
{
"epoch": 1.8083140877598152,
"grad_norm": 0.32787987426147397,
"learning_rate": 2.195540308747856e-05,
"loss": 0.8645,
"step": 392
},
{
"epoch": 1.8129330254041571,
"grad_norm": 0.32109788114169946,
"learning_rate": 2.1869639794168098e-05,
"loss": 0.8591,
"step": 393
},
{
"epoch": 1.8175519630484989,
"grad_norm": 0.33819944765862064,
"learning_rate": 2.1783876500857633e-05,
"loss": 0.895,
"step": 394
},
{
"epoch": 1.8221709006928406,
"grad_norm": 0.3668487877045436,
"learning_rate": 2.1698113207547172e-05,
"loss": 0.8819,
"step": 395
},
{
"epoch": 1.8267898383371826,
"grad_norm": 0.3269652547446026,
"learning_rate": 2.1612349914236708e-05,
"loss": 0.8493,
"step": 396
},
{
"epoch": 1.8314087759815243,
"grad_norm": 0.33778992304763433,
"learning_rate": 2.1526586620926247e-05,
"loss": 0.8351,
"step": 397
},
{
"epoch": 1.836027713625866,
"grad_norm": 0.349707216299572,
"learning_rate": 2.1440823327615782e-05,
"loss": 0.8768,
"step": 398
},
{
"epoch": 1.840646651270208,
"grad_norm": 0.32624428898752067,
"learning_rate": 2.1355060034305318e-05,
"loss": 0.8481,
"step": 399
},
{
"epoch": 1.8452655889145495,
"grad_norm": 0.347976357550313,
"learning_rate": 2.1269296740994853e-05,
"loss": 0.8659,
"step": 400
},
{
"epoch": 1.8498845265588915,
"grad_norm": 0.3219327539713086,
"learning_rate": 2.1183533447684392e-05,
"loss": 0.8255,
"step": 401
},
{
"epoch": 1.8545034642032334,
"grad_norm": 0.31248081805551803,
"learning_rate": 2.109777015437393e-05,
"loss": 0.8367,
"step": 402
},
{
"epoch": 1.859122401847575,
"grad_norm": 0.3593213275316607,
"learning_rate": 2.1012006861063466e-05,
"loss": 0.8365,
"step": 403
},
{
"epoch": 1.863741339491917,
"grad_norm": 0.32888922295840495,
"learning_rate": 2.0926243567753005e-05,
"loss": 0.8483,
"step": 404
},
{
"epoch": 1.8683602771362586,
"grad_norm": 0.3458799525817461,
"learning_rate": 2.084048027444254e-05,
"loss": 0.837,
"step": 405
},
{
"epoch": 1.8729792147806004,
"grad_norm": 0.3831719169390039,
"learning_rate": 2.0754716981132076e-05,
"loss": 0.848,
"step": 406
},
{
"epoch": 1.8775981524249423,
"grad_norm": 0.3034409322781977,
"learning_rate": 2.0668953687821612e-05,
"loss": 0.8246,
"step": 407
},
{
"epoch": 1.882217090069284,
"grad_norm": 0.3482544680772682,
"learning_rate": 2.058319039451115e-05,
"loss": 0.8744,
"step": 408
},
{
"epoch": 1.8868360277136258,
"grad_norm": 0.34054478548420897,
"learning_rate": 2.0497427101200686e-05,
"loss": 0.8227,
"step": 409
},
{
"epoch": 1.8914549653579678,
"grad_norm": 0.35009131537188054,
"learning_rate": 2.0411663807890225e-05,
"loss": 0.8271,
"step": 410
},
{
"epoch": 1.8960739030023095,
"grad_norm": 0.3201316898245023,
"learning_rate": 2.032590051457976e-05,
"loss": 0.8069,
"step": 411
},
{
"epoch": 1.9006928406466512,
"grad_norm": 0.32737270013669384,
"learning_rate": 2.02401372212693e-05,
"loss": 0.8173,
"step": 412
},
{
"epoch": 1.9053117782909932,
"grad_norm": 0.3097470930485408,
"learning_rate": 2.0154373927958835e-05,
"loss": 0.8118,
"step": 413
},
{
"epoch": 1.909930715935335,
"grad_norm": 0.3700953368096899,
"learning_rate": 2.006861063464837e-05,
"loss": 0.8307,
"step": 414
},
{
"epoch": 1.9145496535796767,
"grad_norm": 0.3173919230074883,
"learning_rate": 1.998284734133791e-05,
"loss": 0.823,
"step": 415
},
{
"epoch": 1.9191685912240186,
"grad_norm": 0.3448974269511893,
"learning_rate": 1.9897084048027445e-05,
"loss": 0.8571,
"step": 416
},
{
"epoch": 1.9237875288683601,
"grad_norm": 0.33180020532999993,
"learning_rate": 1.9811320754716984e-05,
"loss": 0.8561,
"step": 417
},
{
"epoch": 1.928406466512702,
"grad_norm": 0.32814426294137816,
"learning_rate": 1.972555746140652e-05,
"loss": 0.8212,
"step": 418
},
{
"epoch": 1.9330254041570438,
"grad_norm": 0.364224794656403,
"learning_rate": 1.9639794168096058e-05,
"loss": 0.8548,
"step": 419
},
{
"epoch": 1.9376443418013856,
"grad_norm": 0.3174316662629093,
"learning_rate": 1.9554030874785594e-05,
"loss": 0.8653,
"step": 420
},
{
"epoch": 1.9422632794457275,
"grad_norm": 0.33678819331461846,
"learning_rate": 1.946826758147513e-05,
"loss": 0.8143,
"step": 421
},
{
"epoch": 1.9468822170900693,
"grad_norm": 0.325835386466701,
"learning_rate": 1.9382504288164665e-05,
"loss": 0.8661,
"step": 422
},
{
"epoch": 1.951501154734411,
"grad_norm": 0.34072883871477183,
"learning_rate": 1.9296740994854204e-05,
"loss": 0.8071,
"step": 423
},
{
"epoch": 1.956120092378753,
"grad_norm": 0.31936101968844666,
"learning_rate": 1.921097770154374e-05,
"loss": 0.8138,
"step": 424
},
{
"epoch": 1.9607390300230947,
"grad_norm": 0.3286920929197838,
"learning_rate": 1.9125214408233278e-05,
"loss": 0.8111,
"step": 425
},
{
"epoch": 1.9653579676674364,
"grad_norm": 0.32547558432170376,
"learning_rate": 1.9039451114922813e-05,
"loss": 0.8248,
"step": 426
},
{
"epoch": 1.9699769053117784,
"grad_norm": 0.33732126219340836,
"learning_rate": 1.8953687821612352e-05,
"loss": 0.8362,
"step": 427
},
{
"epoch": 1.9745958429561201,
"grad_norm": 0.3242042848462493,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.7791,
"step": 428
},
{
"epoch": 1.9792147806004619,
"grad_norm": 0.32400217713938734,
"learning_rate": 1.8782161234991423e-05,
"loss": 0.843,
"step": 429
},
{
"epoch": 1.9838337182448038,
"grad_norm": 0.3844432315595193,
"learning_rate": 1.8696397941680962e-05,
"loss": 0.8789,
"step": 430
},
{
"epoch": 1.9884526558891455,
"grad_norm": 0.3309235001990823,
"learning_rate": 1.8610634648370498e-05,
"loss": 0.8169,
"step": 431
},
{
"epoch": 1.9930715935334873,
"grad_norm": 0.4002655052016613,
"learning_rate": 1.8524871355060037e-05,
"loss": 0.8283,
"step": 432
},
{
"epoch": 1.9976905311778292,
"grad_norm": 0.34012816311982813,
"learning_rate": 1.8439108061749572e-05,
"loss": 0.8453,
"step": 433
},
{
"epoch": 2.0,
"grad_norm": 0.4877551807217465,
"learning_rate": 1.835334476843911e-05,
"loss": 0.6982,
"step": 434
},
{
"epoch": 2.004618937644342,
"grad_norm": 0.5055897807411366,
"learning_rate": 1.8267581475128647e-05,
"loss": 0.6803,
"step": 435
},
{
"epoch": 2.0092378752886835,
"grad_norm": 0.43452386710182306,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.6456,
"step": 436
},
{
"epoch": 2.0138568129330254,
"grad_norm": 0.42375332929620585,
"learning_rate": 1.8096054888507718e-05,
"loss": 0.6414,
"step": 437
},
{
"epoch": 2.0184757505773674,
"grad_norm": 0.3884241110283737,
"learning_rate": 1.8010291595197256e-05,
"loss": 0.629,
"step": 438
},
{
"epoch": 2.023094688221709,
"grad_norm": 0.40922251987632363,
"learning_rate": 1.7924528301886792e-05,
"loss": 0.6267,
"step": 439
},
{
"epoch": 2.027713625866051,
"grad_norm": 0.43687163267417695,
"learning_rate": 1.783876500857633e-05,
"loss": 0.6796,
"step": 440
},
{
"epoch": 2.032332563510393,
"grad_norm": 0.3883534416086552,
"learning_rate": 1.775300171526587e-05,
"loss": 0.6554,
"step": 441
},
{
"epoch": 2.0369515011547343,
"grad_norm": 0.39081540922165897,
"learning_rate": 1.7667238421955405e-05,
"loss": 0.6315,
"step": 442
},
{
"epoch": 2.0415704387990763,
"grad_norm": 0.44992939815601635,
"learning_rate": 1.758147512864494e-05,
"loss": 0.6763,
"step": 443
},
{
"epoch": 2.046189376443418,
"grad_norm": 0.38661921193546667,
"learning_rate": 1.7495711835334476e-05,
"loss": 0.6538,
"step": 444
},
{
"epoch": 2.0508083140877598,
"grad_norm": 0.4616626617938836,
"learning_rate": 1.7409948542024015e-05,
"loss": 0.6424,
"step": 445
},
{
"epoch": 2.0554272517321017,
"grad_norm": 0.3392527799217968,
"learning_rate": 1.732418524871355e-05,
"loss": 0.6242,
"step": 446
},
{
"epoch": 2.0600461893764432,
"grad_norm": 0.41864656271116496,
"learning_rate": 1.723842195540309e-05,
"loss": 0.6374,
"step": 447
},
{
"epoch": 2.064665127020785,
"grad_norm": 0.4024534282656605,
"learning_rate": 1.7152658662092625e-05,
"loss": 0.6615,
"step": 448
},
{
"epoch": 2.069284064665127,
"grad_norm": 0.3519571716869604,
"learning_rate": 1.7066895368782164e-05,
"loss": 0.6533,
"step": 449
},
{
"epoch": 2.0739030023094687,
"grad_norm": 0.3806120930697768,
"learning_rate": 1.69811320754717e-05,
"loss": 0.6131,
"step": 450
},
{
"epoch": 2.0785219399538106,
"grad_norm": 0.3757629867659349,
"learning_rate": 1.6895368782161235e-05,
"loss": 0.6091,
"step": 451
},
{
"epoch": 2.0831408775981526,
"grad_norm": 0.3462804994403296,
"learning_rate": 1.680960548885077e-05,
"loss": 0.6022,
"step": 452
},
{
"epoch": 2.087759815242494,
"grad_norm": 0.3761692951200746,
"learning_rate": 1.672384219554031e-05,
"loss": 0.6422,
"step": 453
},
{
"epoch": 2.092378752886836,
"grad_norm": 0.3522284937794803,
"learning_rate": 1.6638078902229848e-05,
"loss": 0.6546,
"step": 454
},
{
"epoch": 2.096997690531178,
"grad_norm": 0.3195566602844831,
"learning_rate": 1.6552315608919384e-05,
"loss": 0.6155,
"step": 455
},
{
"epoch": 2.1016166281755195,
"grad_norm": 0.3388396709522284,
"learning_rate": 1.6466552315608923e-05,
"loss": 0.6327,
"step": 456
},
{
"epoch": 2.1062355658198615,
"grad_norm": 0.32768589384808267,
"learning_rate": 1.6380789022298458e-05,
"loss": 0.6374,
"step": 457
},
{
"epoch": 2.1108545034642034,
"grad_norm": 0.3587829436710132,
"learning_rate": 1.6295025728987994e-05,
"loss": 0.6446,
"step": 458
},
{
"epoch": 2.115473441108545,
"grad_norm": 0.33772996294490176,
"learning_rate": 1.620926243567753e-05,
"loss": 0.6646,
"step": 459
},
{
"epoch": 2.120092378752887,
"grad_norm": 0.31235821835989375,
"learning_rate": 1.6123499142367068e-05,
"loss": 0.6224,
"step": 460
},
{
"epoch": 2.1247113163972284,
"grad_norm": 0.33046997848972876,
"learning_rate": 1.6037735849056604e-05,
"loss": 0.6099,
"step": 461
},
{
"epoch": 2.1293302540415704,
"grad_norm": 0.3335638972255125,
"learning_rate": 1.5951972555746142e-05,
"loss": 0.6555,
"step": 462
},
{
"epoch": 2.1339491916859123,
"grad_norm": 0.31315475171235707,
"learning_rate": 1.5866209262435678e-05,
"loss": 0.6324,
"step": 463
},
{
"epoch": 2.138568129330254,
"grad_norm": 0.3527050089973555,
"learning_rate": 1.5780445969125217e-05,
"loss": 0.6497,
"step": 464
},
{
"epoch": 2.143187066974596,
"grad_norm": 0.3198849618527542,
"learning_rate": 1.5694682675814752e-05,
"loss": 0.6067,
"step": 465
},
{
"epoch": 2.147806004618938,
"grad_norm": 0.3366417318084208,
"learning_rate": 1.5608919382504288e-05,
"loss": 0.6308,
"step": 466
},
{
"epoch": 2.1524249422632793,
"grad_norm": 0.30810936638251046,
"learning_rate": 1.5523156089193827e-05,
"loss": 0.647,
"step": 467
},
{
"epoch": 2.1570438799076213,
"grad_norm": 0.32065096961789075,
"learning_rate": 1.5437392795883362e-05,
"loss": 0.6375,
"step": 468
},
{
"epoch": 2.161662817551963,
"grad_norm": 0.3402079384531757,
"learning_rate": 1.53516295025729e-05,
"loss": 0.6293,
"step": 469
},
{
"epoch": 2.1662817551963047,
"grad_norm": 0.3160970634660786,
"learning_rate": 1.5265866209262437e-05,
"loss": 0.639,
"step": 470
},
{
"epoch": 2.1709006928406467,
"grad_norm": 0.33395470746765665,
"learning_rate": 1.5180102915951974e-05,
"loss": 0.6172,
"step": 471
},
{
"epoch": 2.1755196304849886,
"grad_norm": 0.32068652716045143,
"learning_rate": 1.509433962264151e-05,
"loss": 0.6295,
"step": 472
},
{
"epoch": 2.18013856812933,
"grad_norm": 0.31626954019646325,
"learning_rate": 1.5008576329331048e-05,
"loss": 0.6095,
"step": 473
},
{
"epoch": 2.184757505773672,
"grad_norm": 0.31690976991470143,
"learning_rate": 1.4922813036020584e-05,
"loss": 0.6007,
"step": 474
},
{
"epoch": 2.1893764434180136,
"grad_norm": 0.32879422227035715,
"learning_rate": 1.4837049742710121e-05,
"loss": 0.6146,
"step": 475
},
{
"epoch": 2.1939953810623556,
"grad_norm": 0.3106082682233409,
"learning_rate": 1.4751286449399656e-05,
"loss": 0.6092,
"step": 476
},
{
"epoch": 2.1986143187066975,
"grad_norm": 0.3119422394397202,
"learning_rate": 1.4665523156089195e-05,
"loss": 0.641,
"step": 477
},
{
"epoch": 2.203233256351039,
"grad_norm": 0.3184854313927424,
"learning_rate": 1.457975986277873e-05,
"loss": 0.6492,
"step": 478
},
{
"epoch": 2.207852193995381,
"grad_norm": 0.30992353661978633,
"learning_rate": 1.4493996569468268e-05,
"loss": 0.616,
"step": 479
},
{
"epoch": 2.212471131639723,
"grad_norm": 0.3073065761450925,
"learning_rate": 1.4408233276157807e-05,
"loss": 0.6287,
"step": 480
},
{
"epoch": 2.2170900692840645,
"grad_norm": 0.32170712965268816,
"learning_rate": 1.4322469982847342e-05,
"loss": 0.6397,
"step": 481
},
{
"epoch": 2.2217090069284064,
"grad_norm": 0.3177347718089072,
"learning_rate": 1.423670668953688e-05,
"loss": 0.6134,
"step": 482
},
{
"epoch": 2.2263279445727484,
"grad_norm": 0.3053906969475859,
"learning_rate": 1.4150943396226415e-05,
"loss": 0.6197,
"step": 483
},
{
"epoch": 2.23094688221709,
"grad_norm": 0.31185215285559037,
"learning_rate": 1.4065180102915954e-05,
"loss": 0.6213,
"step": 484
},
{
"epoch": 2.235565819861432,
"grad_norm": 0.3399147531916454,
"learning_rate": 1.397941680960549e-05,
"loss": 0.627,
"step": 485
},
{
"epoch": 2.240184757505774,
"grad_norm": 0.3104730058834459,
"learning_rate": 1.3893653516295027e-05,
"loss": 0.6595,
"step": 486
},
{
"epoch": 2.2448036951501154,
"grad_norm": 0.316970486961682,
"learning_rate": 1.3807890222984562e-05,
"loss": 0.6148,
"step": 487
},
{
"epoch": 2.2494226327944573,
"grad_norm": 0.3245277789723318,
"learning_rate": 1.3722126929674101e-05,
"loss": 0.6003,
"step": 488
},
{
"epoch": 2.2540415704387993,
"grad_norm": 0.3128213790193216,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.6241,
"step": 489
},
{
"epoch": 2.258660508083141,
"grad_norm": 0.3175338228494751,
"learning_rate": 1.3550600343053174e-05,
"loss": 0.5903,
"step": 490
},
{
"epoch": 2.2632794457274827,
"grad_norm": 0.3388649383425264,
"learning_rate": 1.346483704974271e-05,
"loss": 0.635,
"step": 491
},
{
"epoch": 2.2678983833718247,
"grad_norm": 0.31257026434241314,
"learning_rate": 1.3379073756432248e-05,
"loss": 0.6202,
"step": 492
},
{
"epoch": 2.272517321016166,
"grad_norm": 0.31866294271317863,
"learning_rate": 1.3293310463121785e-05,
"loss": 0.63,
"step": 493
},
{
"epoch": 2.277136258660508,
"grad_norm": 0.3263273099571491,
"learning_rate": 1.320754716981132e-05,
"loss": 0.6094,
"step": 494
},
{
"epoch": 2.28175519630485,
"grad_norm": 0.31334483423019416,
"learning_rate": 1.312178387650086e-05,
"loss": 0.6407,
"step": 495
},
{
"epoch": 2.2863741339491916,
"grad_norm": 0.324174726095215,
"learning_rate": 1.3036020583190395e-05,
"loss": 0.6351,
"step": 496
},
{
"epoch": 2.2909930715935336,
"grad_norm": 0.3561775288332575,
"learning_rate": 1.2950257289879932e-05,
"loss": 0.6244,
"step": 497
},
{
"epoch": 2.295612009237875,
"grad_norm": 0.3348475908034659,
"learning_rate": 1.2864493996569468e-05,
"loss": 0.6352,
"step": 498
},
{
"epoch": 2.300230946882217,
"grad_norm": 0.32559926531812206,
"learning_rate": 1.2778730703259007e-05,
"loss": 0.6194,
"step": 499
},
{
"epoch": 2.304849884526559,
"grad_norm": 0.3302332483566113,
"learning_rate": 1.2692967409948542e-05,
"loss": 0.6449,
"step": 500
},
{
"epoch": 2.3094688221709005,
"grad_norm": 0.31300663007139984,
"learning_rate": 1.260720411663808e-05,
"loss": 0.632,
"step": 501
},
{
"epoch": 2.3140877598152425,
"grad_norm": 0.33803570154479295,
"learning_rate": 1.2521440823327615e-05,
"loss": 0.6252,
"step": 502
},
{
"epoch": 2.3187066974595845,
"grad_norm": 0.31045883741100605,
"learning_rate": 1.2435677530017154e-05,
"loss": 0.6023,
"step": 503
},
{
"epoch": 2.323325635103926,
"grad_norm": 0.3128142722243615,
"learning_rate": 1.2349914236706691e-05,
"loss": 0.6256,
"step": 504
},
{
"epoch": 2.327944572748268,
"grad_norm": 0.33642818664166885,
"learning_rate": 1.2264150943396227e-05,
"loss": 0.632,
"step": 505
},
{
"epoch": 2.3325635103926095,
"grad_norm": 0.3180949384555643,
"learning_rate": 1.2178387650085764e-05,
"loss": 0.6302,
"step": 506
},
{
"epoch": 2.3371824480369514,
"grad_norm": 0.3226664203568327,
"learning_rate": 1.2092624356775301e-05,
"loss": 0.6165,
"step": 507
},
{
"epoch": 2.3418013856812934,
"grad_norm": 0.31779146987599094,
"learning_rate": 1.2006861063464837e-05,
"loss": 0.6183,
"step": 508
},
{
"epoch": 2.346420323325635,
"grad_norm": 0.3291695667613088,
"learning_rate": 1.1921097770154374e-05,
"loss": 0.6099,
"step": 509
},
{
"epoch": 2.351039260969977,
"grad_norm": 0.3248375601160582,
"learning_rate": 1.1835334476843911e-05,
"loss": 0.5836,
"step": 510
},
{
"epoch": 2.355658198614319,
"grad_norm": 0.3128908753673337,
"learning_rate": 1.1749571183533448e-05,
"loss": 0.6241,
"step": 511
},
{
"epoch": 2.3602771362586603,
"grad_norm": 0.33927266383154653,
"learning_rate": 1.1663807890222985e-05,
"loss": 0.6001,
"step": 512
},
{
"epoch": 2.3648960739030023,
"grad_norm": 0.33781400666903655,
"learning_rate": 1.1578044596912522e-05,
"loss": 0.6549,
"step": 513
},
{
"epoch": 2.3695150115473442,
"grad_norm": 0.31388536665996153,
"learning_rate": 1.149228130360206e-05,
"loss": 0.6175,
"step": 514
},
{
"epoch": 2.3741339491916857,
"grad_norm": 0.33167923088031215,
"learning_rate": 1.1406518010291597e-05,
"loss": 0.6098,
"step": 515
},
{
"epoch": 2.3787528868360277,
"grad_norm": 0.31245925099568317,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.619,
"step": 516
},
{
"epoch": 2.3833718244803697,
"grad_norm": 0.3151300736837639,
"learning_rate": 1.123499142367067e-05,
"loss": 0.6237,
"step": 517
},
{
"epoch": 2.387990762124711,
"grad_norm": 0.31538013816224536,
"learning_rate": 1.1149228130360207e-05,
"loss": 0.5854,
"step": 518
},
{
"epoch": 2.392609699769053,
"grad_norm": 0.332673356040553,
"learning_rate": 1.1063464837049744e-05,
"loss": 0.6015,
"step": 519
},
{
"epoch": 2.397228637413395,
"grad_norm": 0.3094164295495207,
"learning_rate": 1.097770154373928e-05,
"loss": 0.5822,
"step": 520
},
{
"epoch": 2.4018475750577366,
"grad_norm": 0.3103005922372045,
"learning_rate": 1.0891938250428817e-05,
"loss": 0.6252,
"step": 521
},
{
"epoch": 2.4064665127020786,
"grad_norm": 0.31940642070422026,
"learning_rate": 1.0806174957118354e-05,
"loss": 0.634,
"step": 522
},
{
"epoch": 2.4110854503464205,
"grad_norm": 0.3167594520750731,
"learning_rate": 1.0720411663807891e-05,
"loss": 0.6106,
"step": 523
},
{
"epoch": 2.415704387990762,
"grad_norm": 0.31063860202480925,
"learning_rate": 1.0634648370497427e-05,
"loss": 0.6368,
"step": 524
},
{
"epoch": 2.420323325635104,
"grad_norm": 0.31276431825186013,
"learning_rate": 1.0548885077186965e-05,
"loss": 0.6589,
"step": 525
},
{
"epoch": 2.424942263279446,
"grad_norm": 0.3139985478127158,
"learning_rate": 1.0463121783876503e-05,
"loss": 0.6487,
"step": 526
},
{
"epoch": 2.4295612009237875,
"grad_norm": 0.31086239939835536,
"learning_rate": 1.0377358490566038e-05,
"loss": 0.5991,
"step": 527
},
{
"epoch": 2.4341801385681294,
"grad_norm": 0.30594661236068854,
"learning_rate": 1.0291595197255575e-05,
"loss": 0.5866,
"step": 528
},
{
"epoch": 2.438799076212471,
"grad_norm": 0.3064422197984899,
"learning_rate": 1.0205831903945113e-05,
"loss": 0.5956,
"step": 529
},
{
"epoch": 2.443418013856813,
"grad_norm": 0.32369688265143115,
"learning_rate": 1.012006861063465e-05,
"loss": 0.6075,
"step": 530
},
{
"epoch": 2.448036951501155,
"grad_norm": 0.30546749928441097,
"learning_rate": 1.0034305317324185e-05,
"loss": 0.608,
"step": 531
},
{
"epoch": 2.4526558891454964,
"grad_norm": 0.32048540857524926,
"learning_rate": 9.948542024013722e-06,
"loss": 0.6227,
"step": 532
},
{
"epoch": 2.4572748267898383,
"grad_norm": 0.32608169391627234,
"learning_rate": 9.86277873070326e-06,
"loss": 0.6282,
"step": 533
},
{
"epoch": 2.4618937644341803,
"grad_norm": 0.2996687509401124,
"learning_rate": 9.777015437392797e-06,
"loss": 0.5987,
"step": 534
},
{
"epoch": 2.466512702078522,
"grad_norm": 0.318426077906867,
"learning_rate": 9.691252144082332e-06,
"loss": 0.5884,
"step": 535
},
{
"epoch": 2.4711316397228638,
"grad_norm": 0.3074991022485019,
"learning_rate": 9.60548885077187e-06,
"loss": 0.6123,
"step": 536
},
{
"epoch": 2.4757505773672057,
"grad_norm": 0.28957516026291263,
"learning_rate": 9.519725557461407e-06,
"loss": 0.588,
"step": 537
},
{
"epoch": 2.4803695150115472,
"grad_norm": 0.30241771810453205,
"learning_rate": 9.433962264150944e-06,
"loss": 0.6185,
"step": 538
},
{
"epoch": 2.484988452655889,
"grad_norm": 0.31598111634312004,
"learning_rate": 9.348198970840481e-06,
"loss": 0.6276,
"step": 539
},
{
"epoch": 2.4896073903002307,
"grad_norm": 0.2925742508465086,
"learning_rate": 9.262435677530018e-06,
"loss": 0.5798,
"step": 540
},
{
"epoch": 2.4942263279445727,
"grad_norm": 0.318723447163499,
"learning_rate": 9.176672384219556e-06,
"loss": 0.5911,
"step": 541
},
{
"epoch": 2.4988452655889146,
"grad_norm": 0.3016002866050146,
"learning_rate": 9.090909090909091e-06,
"loss": 0.631,
"step": 542
},
{
"epoch": 2.503464203233256,
"grad_norm": 0.29466207790193094,
"learning_rate": 9.005145797598628e-06,
"loss": 0.6066,
"step": 543
},
{
"epoch": 2.508083140877598,
"grad_norm": 0.30246413124806476,
"learning_rate": 8.919382504288165e-06,
"loss": 0.6195,
"step": 544
},
{
"epoch": 2.51270207852194,
"grad_norm": 0.31204742642947775,
"learning_rate": 8.833619210977703e-06,
"loss": 0.6297,
"step": 545
},
{
"epoch": 2.5173210161662816,
"grad_norm": 0.3094170333008464,
"learning_rate": 8.747855917667238e-06,
"loss": 0.6243,
"step": 546
},
{
"epoch": 2.5219399538106235,
"grad_norm": 0.30492749690332244,
"learning_rate": 8.662092624356775e-06,
"loss": 0.6415,
"step": 547
},
{
"epoch": 2.5265588914549655,
"grad_norm": 0.3059936985616151,
"learning_rate": 8.576329331046313e-06,
"loss": 0.6154,
"step": 548
},
{
"epoch": 2.531177829099307,
"grad_norm": 0.32828974601831745,
"learning_rate": 8.49056603773585e-06,
"loss": 0.6047,
"step": 549
},
{
"epoch": 2.535796766743649,
"grad_norm": 0.32907820459380666,
"learning_rate": 8.404802744425385e-06,
"loss": 0.634,
"step": 550
},
{
"epoch": 2.540415704387991,
"grad_norm": 0.31746295493158116,
"learning_rate": 8.319039451114924e-06,
"loss": 0.5881,
"step": 551
},
{
"epoch": 2.5450346420323324,
"grad_norm": 0.3012391204791618,
"learning_rate": 8.233276157804461e-06,
"loss": 0.6167,
"step": 552
},
{
"epoch": 2.5496535796766744,
"grad_norm": 0.30416389082247347,
"learning_rate": 8.147512864493997e-06,
"loss": 0.6265,
"step": 553
},
{
"epoch": 2.5542725173210163,
"grad_norm": 0.2915634979508051,
"learning_rate": 8.061749571183534e-06,
"loss": 0.6025,
"step": 554
},
{
"epoch": 2.558891454965358,
"grad_norm": 0.2846909314046296,
"learning_rate": 7.975986277873071e-06,
"loss": 0.5788,
"step": 555
},
{
"epoch": 2.5635103926097,
"grad_norm": 0.30256519179774816,
"learning_rate": 7.890222984562608e-06,
"loss": 0.612,
"step": 556
},
{
"epoch": 2.5681293302540418,
"grad_norm": 0.30719634888183034,
"learning_rate": 7.804459691252144e-06,
"loss": 0.6166,
"step": 557
},
{
"epoch": 2.5727482678983833,
"grad_norm": 0.29612233584872105,
"learning_rate": 7.718696397941681e-06,
"loss": 0.594,
"step": 558
},
{
"epoch": 2.5773672055427252,
"grad_norm": 0.29648487120184286,
"learning_rate": 7.632933104631218e-06,
"loss": 0.6207,
"step": 559
},
{
"epoch": 2.581986143187067,
"grad_norm": 0.30369323976740464,
"learning_rate": 7.547169811320755e-06,
"loss": 0.6082,
"step": 560
},
{
"epoch": 2.5866050808314087,
"grad_norm": 0.3209073493299517,
"learning_rate": 7.461406518010292e-06,
"loss": 0.6407,
"step": 561
},
{
"epoch": 2.5912240184757507,
"grad_norm": 0.28800531726279605,
"learning_rate": 7.375643224699828e-06,
"loss": 0.6014,
"step": 562
},
{
"epoch": 2.5958429561200926,
"grad_norm": 0.3040178032418306,
"learning_rate": 7.289879931389365e-06,
"loss": 0.6271,
"step": 563
},
{
"epoch": 2.600461893764434,
"grad_norm": 0.29000325443801117,
"learning_rate": 7.2041166380789034e-06,
"loss": 0.5905,
"step": 564
},
{
"epoch": 2.605080831408776,
"grad_norm": 0.30342183045486265,
"learning_rate": 7.11835334476844e-06,
"loss": 0.6114,
"step": 565
},
{
"epoch": 2.6096997690531176,
"grad_norm": 0.31290901491302864,
"learning_rate": 7.032590051457977e-06,
"loss": 0.6094,
"step": 566
},
{
"epoch": 2.6143187066974596,
"grad_norm": 0.3000750618448554,
"learning_rate": 6.946826758147513e-06,
"loss": 0.5917,
"step": 567
},
{
"epoch": 2.6189376443418015,
"grad_norm": 0.2995296594725856,
"learning_rate": 6.8610634648370505e-06,
"loss": 0.6048,
"step": 568
},
{
"epoch": 2.623556581986143,
"grad_norm": 0.2944335185190586,
"learning_rate": 6.775300171526587e-06,
"loss": 0.5882,
"step": 569
},
{
"epoch": 2.628175519630485,
"grad_norm": 0.2885229105710456,
"learning_rate": 6.689536878216124e-06,
"loss": 0.593,
"step": 570
},
{
"epoch": 2.6327944572748265,
"grad_norm": 0.2942862998246492,
"learning_rate": 6.60377358490566e-06,
"loss": 0.6185,
"step": 571
},
{
"epoch": 2.6374133949191685,
"grad_norm": 0.31553011376845375,
"learning_rate": 6.518010291595198e-06,
"loss": 0.6007,
"step": 572
},
{
"epoch": 2.6420323325635104,
"grad_norm": 0.2966801599080869,
"learning_rate": 6.432246998284734e-06,
"loss": 0.6125,
"step": 573
},
{
"epoch": 2.646651270207852,
"grad_norm": 0.2970143525163274,
"learning_rate": 6.346483704974271e-06,
"loss": 0.5921,
"step": 574
},
{
"epoch": 2.651270207852194,
"grad_norm": 0.3128736490285455,
"learning_rate": 6.2607204116638075e-06,
"loss": 0.6102,
"step": 575
},
{
"epoch": 2.655889145496536,
"grad_norm": 0.30914134090170936,
"learning_rate": 6.1749571183533456e-06,
"loss": 0.6006,
"step": 576
},
{
"epoch": 2.6605080831408774,
"grad_norm": 0.3078371862780313,
"learning_rate": 6.089193825042882e-06,
"loss": 0.593,
"step": 577
},
{
"epoch": 2.6651270207852193,
"grad_norm": 0.30146772951443424,
"learning_rate": 6.003430531732418e-06,
"loss": 0.6125,
"step": 578
},
{
"epoch": 2.6697459584295613,
"grad_norm": 0.30882561935591346,
"learning_rate": 5.9176672384219555e-06,
"loss": 0.6013,
"step": 579
},
{
"epoch": 2.674364896073903,
"grad_norm": 0.28110450783967517,
"learning_rate": 5.831903945111493e-06,
"loss": 0.5743,
"step": 580
},
{
"epoch": 2.678983833718245,
"grad_norm": 0.30536384250829457,
"learning_rate": 5.74614065180103e-06,
"loss": 0.6208,
"step": 581
},
{
"epoch": 2.6836027713625867,
"grad_norm": 0.2930887379252701,
"learning_rate": 5.660377358490566e-06,
"loss": 0.5791,
"step": 582
},
{
"epoch": 2.6882217090069283,
"grad_norm": 0.32896140495963444,
"learning_rate": 5.574614065180103e-06,
"loss": 0.6211,
"step": 583
},
{
"epoch": 2.69284064665127,
"grad_norm": 0.29453059557853367,
"learning_rate": 5.48885077186964e-06,
"loss": 0.5906,
"step": 584
},
{
"epoch": 2.697459584295612,
"grad_norm": 0.30693498355518306,
"learning_rate": 5.403087478559177e-06,
"loss": 0.63,
"step": 585
},
{
"epoch": 2.7020785219399537,
"grad_norm": 0.3017258180366783,
"learning_rate": 5.317324185248713e-06,
"loss": 0.601,
"step": 586
},
{
"epoch": 2.7066974595842956,
"grad_norm": 0.31371400590626525,
"learning_rate": 5.231560891938251e-06,
"loss": 0.6089,
"step": 587
},
{
"epoch": 2.7113163972286376,
"grad_norm": 0.29289284929552173,
"learning_rate": 5.145797598627788e-06,
"loss": 0.5969,
"step": 588
},
{
"epoch": 2.715935334872979,
"grad_norm": 0.2996970953786668,
"learning_rate": 5.060034305317325e-06,
"loss": 0.5646,
"step": 589
},
{
"epoch": 2.720554272517321,
"grad_norm": 0.30593621591250736,
"learning_rate": 4.974271012006861e-06,
"loss": 0.6456,
"step": 590
},
{
"epoch": 2.725173210161663,
"grad_norm": 0.2891838609203197,
"learning_rate": 4.8885077186963984e-06,
"loss": 0.6098,
"step": 591
},
{
"epoch": 2.7297921478060045,
"grad_norm": 0.2864503436952264,
"learning_rate": 4.802744425385935e-06,
"loss": 0.5994,
"step": 592
},
{
"epoch": 2.7344110854503465,
"grad_norm": 0.28999881325471377,
"learning_rate": 4.716981132075472e-06,
"loss": 0.5949,
"step": 593
},
{
"epoch": 2.7390300230946885,
"grad_norm": 0.2883193997686141,
"learning_rate": 4.631217838765009e-06,
"loss": 0.5941,
"step": 594
},
{
"epoch": 2.74364896073903,
"grad_norm": 0.29071294001736114,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.6101,
"step": 595
},
{
"epoch": 2.748267898383372,
"grad_norm": 0.2957883184239064,
"learning_rate": 4.459691252144083e-06,
"loss": 0.6046,
"step": 596
},
{
"epoch": 2.752886836027714,
"grad_norm": 0.29510923276497686,
"learning_rate": 4.373927958833619e-06,
"loss": 0.5997,
"step": 597
},
{
"epoch": 2.7575057736720554,
"grad_norm": 0.30703278190177846,
"learning_rate": 4.288164665523156e-06,
"loss": 0.6029,
"step": 598
},
{
"epoch": 2.7621247113163974,
"grad_norm": 0.30110004215184205,
"learning_rate": 4.202401372212693e-06,
"loss": 0.5732,
"step": 599
},
{
"epoch": 2.766743648960739,
"grad_norm": 0.29132059491277623,
"learning_rate": 4.116638078902231e-06,
"loss": 0.6211,
"step": 600
},
{
"epoch": 2.771362586605081,
"grad_norm": 0.3018546629152024,
"learning_rate": 4.030874785591767e-06,
"loss": 0.598,
"step": 601
},
{
"epoch": 2.775981524249423,
"grad_norm": 0.31576235977377226,
"learning_rate": 3.945111492281304e-06,
"loss": 0.5734,
"step": 602
},
{
"epoch": 2.7806004618937643,
"grad_norm": 0.2939270357095238,
"learning_rate": 3.8593481989708406e-06,
"loss": 0.6236,
"step": 603
},
{
"epoch": 2.7852193995381063,
"grad_norm": 0.2882407709715833,
"learning_rate": 3.7735849056603773e-06,
"loss": 0.5967,
"step": 604
},
{
"epoch": 2.789838337182448,
"grad_norm": 0.30641714688724436,
"learning_rate": 3.687821612349914e-06,
"loss": 0.6005,
"step": 605
},
{
"epoch": 2.7944572748267897,
"grad_norm": 0.2938901343846581,
"learning_rate": 3.6020583190394517e-06,
"loss": 0.6025,
"step": 606
},
{
"epoch": 2.7990762124711317,
"grad_norm": 0.29394293242709874,
"learning_rate": 3.5162950257289885e-06,
"loss": 0.6067,
"step": 607
},
{
"epoch": 2.803695150115473,
"grad_norm": 0.30418272161564036,
"learning_rate": 3.4305317324185253e-06,
"loss": 0.62,
"step": 608
},
{
"epoch": 2.808314087759815,
"grad_norm": 0.30816710967724775,
"learning_rate": 3.344768439108062e-06,
"loss": 0.5882,
"step": 609
},
{
"epoch": 2.812933025404157,
"grad_norm": 0.3084563272646307,
"learning_rate": 3.259005145797599e-06,
"loss": 0.6135,
"step": 610
},
{
"epoch": 2.8175519630484986,
"grad_norm": 0.29033205660854283,
"learning_rate": 3.1732418524871356e-06,
"loss": 0.5892,
"step": 611
},
{
"epoch": 2.8221709006928406,
"grad_norm": 0.28136227548781406,
"learning_rate": 3.0874785591766728e-06,
"loss": 0.5984,
"step": 612
},
{
"epoch": 2.8267898383371826,
"grad_norm": 0.2973686781339448,
"learning_rate": 3.001715265866209e-06,
"loss": 0.5741,
"step": 613
},
{
"epoch": 2.831408775981524,
"grad_norm": 0.3106961907487671,
"learning_rate": 2.9159519725557463e-06,
"loss": 0.5992,
"step": 614
},
{
"epoch": 2.836027713625866,
"grad_norm": 0.27929670816628865,
"learning_rate": 2.830188679245283e-06,
"loss": 0.5877,
"step": 615
},
{
"epoch": 2.840646651270208,
"grad_norm": 0.28400222443706963,
"learning_rate": 2.74442538593482e-06,
"loss": 0.5855,
"step": 616
},
{
"epoch": 2.8452655889145495,
"grad_norm": 0.28923415489826115,
"learning_rate": 2.6586620926243566e-06,
"loss": 0.5977,
"step": 617
},
{
"epoch": 2.8498845265588915,
"grad_norm": 0.2881072453329511,
"learning_rate": 2.572898799313894e-06,
"loss": 0.5788,
"step": 618
},
{
"epoch": 2.8545034642032334,
"grad_norm": 0.2833832263872196,
"learning_rate": 2.4871355060034306e-06,
"loss": 0.5966,
"step": 619
},
{
"epoch": 2.859122401847575,
"grad_norm": 0.2769106623328087,
"learning_rate": 2.4013722126929674e-06,
"loss": 0.6022,
"step": 620
},
{
"epoch": 2.863741339491917,
"grad_norm": 0.28980445777598735,
"learning_rate": 2.3156089193825046e-06,
"loss": 0.6236,
"step": 621
},
{
"epoch": 2.868360277136259,
"grad_norm": 0.2784211619090078,
"learning_rate": 2.2298456260720414e-06,
"loss": 0.5823,
"step": 622
},
{
"epoch": 2.8729792147806004,
"grad_norm": 0.28572355467693006,
"learning_rate": 2.144082332761578e-06,
"loss": 0.5837,
"step": 623
},
{
"epoch": 2.8775981524249423,
"grad_norm": 0.2890309239729817,
"learning_rate": 2.0583190394511153e-06,
"loss": 0.6087,
"step": 624
},
{
"epoch": 2.8822170900692843,
"grad_norm": 0.29305983698286625,
"learning_rate": 1.972555746140652e-06,
"loss": 0.5719,
"step": 625
},
{
"epoch": 2.886836027713626,
"grad_norm": 0.29029113175529553,
"learning_rate": 1.8867924528301887e-06,
"loss": 0.6006,
"step": 626
},
{
"epoch": 2.8914549653579678,
"grad_norm": 0.29558132786512137,
"learning_rate": 1.8010291595197259e-06,
"loss": 0.5884,
"step": 627
},
{
"epoch": 2.8960739030023097,
"grad_norm": 0.27737061591132084,
"learning_rate": 1.7152658662092626e-06,
"loss": 0.5747,
"step": 628
},
{
"epoch": 2.9006928406466512,
"grad_norm": 0.2906865526927262,
"learning_rate": 1.6295025728987994e-06,
"loss": 0.5819,
"step": 629
},
{
"epoch": 2.905311778290993,
"grad_norm": 0.28636004347869753,
"learning_rate": 1.5437392795883364e-06,
"loss": 0.6126,
"step": 630
},
{
"epoch": 2.909930715935335,
"grad_norm": 0.26919586037813015,
"learning_rate": 1.4579759862778732e-06,
"loss": 0.5639,
"step": 631
},
{
"epoch": 2.9145496535796767,
"grad_norm": 0.2895844765233449,
"learning_rate": 1.37221269296741e-06,
"loss": 0.5951,
"step": 632
},
{
"epoch": 2.9191685912240186,
"grad_norm": 0.30405484992769594,
"learning_rate": 1.286449399656947e-06,
"loss": 0.605,
"step": 633
},
{
"epoch": 2.92378752886836,
"grad_norm": 0.2742127886928758,
"learning_rate": 1.2006861063464837e-06,
"loss": 0.5789,
"step": 634
},
{
"epoch": 2.928406466512702,
"grad_norm": 0.27537402783983544,
"learning_rate": 1.1149228130360207e-06,
"loss": 0.5939,
"step": 635
},
{
"epoch": 2.9330254041570436,
"grad_norm": 0.2793205508584995,
"learning_rate": 1.0291595197255577e-06,
"loss": 0.5891,
"step": 636
},
{
"epoch": 2.9376443418013856,
"grad_norm": 0.28197217742689246,
"learning_rate": 9.433962264150943e-07,
"loss": 0.5953,
"step": 637
},
{
"epoch": 2.9422632794457275,
"grad_norm": 0.2860122536798405,
"learning_rate": 8.576329331046313e-07,
"loss": 0.5965,
"step": 638
},
{
"epoch": 2.946882217090069,
"grad_norm": 0.28798434128793676,
"learning_rate": 7.718696397941682e-07,
"loss": 0.6019,
"step": 639
},
{
"epoch": 2.951501154734411,
"grad_norm": 0.28140891231620974,
"learning_rate": 6.86106346483705e-07,
"loss": 0.5995,
"step": 640
},
{
"epoch": 2.956120092378753,
"grad_norm": 0.2721204910616356,
"learning_rate": 6.003430531732418e-07,
"loss": 0.593,
"step": 641
},
{
"epoch": 2.9607390300230945,
"grad_norm": 0.27557988074120476,
"learning_rate": 5.145797598627788e-07,
"loss": 0.597,
"step": 642
},
{
"epoch": 2.9653579676674364,
"grad_norm": 0.28093710438458425,
"learning_rate": 4.2881646655231566e-07,
"loss": 0.5957,
"step": 643
},
{
"epoch": 2.9699769053117784,
"grad_norm": 0.2825367132386875,
"learning_rate": 3.430531732418525e-07,
"loss": 0.597,
"step": 644
},
{
"epoch": 2.97459584295612,
"grad_norm": 0.2817869433020897,
"learning_rate": 2.572898799313894e-07,
"loss": 0.5952,
"step": 645
},
{
"epoch": 2.979214780600462,
"grad_norm": 0.29534758243871173,
"learning_rate": 1.7152658662092624e-07,
"loss": 0.5938,
"step": 646
},
{
"epoch": 2.983833718244804,
"grad_norm": 0.2811639385499561,
"learning_rate": 8.576329331046312e-08,
"loss": 0.5835,
"step": 647
},
{
"epoch": 2.9884526558891453,
"grad_norm": 0.2938373155724318,
"learning_rate": 0.0,
"loss": 0.5883,
"step": 648
},
{
"epoch": 2.9884526558891453,
"step": 648,
"total_flos": 7.103540182070067e+18,
"train_loss": 1.0860004471959892,
"train_runtime": 39219.4735,
"train_samples_per_second": 0.265,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 648,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.103540182070067e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}