vlrethinker-full-ep5 / trainer_state.json
hbXNov's picture
Add files using upload-large-folder tool
50996cc verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 545,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009195402298850575,
"grad_norm": 20.714553450281343,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.2012,
"step": 1
},
{
"epoch": 0.01839080459770115,
"grad_norm": 16.813897390933732,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.1382,
"step": 2
},
{
"epoch": 0.027586206896551724,
"grad_norm": 10.685135212219203,
"learning_rate": 3e-06,
"loss": 2.2476,
"step": 3
},
{
"epoch": 0.0367816091954023,
"grad_norm": 14.421748935553252,
"learning_rate": 4.000000000000001e-06,
"loss": 2.3914,
"step": 4
},
{
"epoch": 0.04597701149425287,
"grad_norm": 8.628671157622785,
"learning_rate": 5e-06,
"loss": 1.6079,
"step": 5
},
{
"epoch": 0.05517241379310345,
"grad_norm": 6.948007057279052,
"learning_rate": 6e-06,
"loss": 1.6653,
"step": 6
},
{
"epoch": 0.06436781609195402,
"grad_norm": 11.209735445974314,
"learning_rate": 7e-06,
"loss": 1.9133,
"step": 7
},
{
"epoch": 0.0735632183908046,
"grad_norm": 7.97219877448209,
"learning_rate": 8.000000000000001e-06,
"loss": 1.5034,
"step": 8
},
{
"epoch": 0.08275862068965517,
"grad_norm": 139.5080565326191,
"learning_rate": 9e-06,
"loss": 1.5352,
"step": 9
},
{
"epoch": 0.09195402298850575,
"grad_norm": 23.862722229248178,
"learning_rate": 1e-05,
"loss": 2.3425,
"step": 10
},
{
"epoch": 0.10114942528735632,
"grad_norm": 22.34844242652855,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.2261,
"step": 11
},
{
"epoch": 0.1103448275862069,
"grad_norm": 15.393131095910384,
"learning_rate": 1.2e-05,
"loss": 1.7781,
"step": 12
},
{
"epoch": 0.11954022988505747,
"grad_norm": 12.868959642332323,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.8264,
"step": 13
},
{
"epoch": 0.12873563218390804,
"grad_norm": 9.227770966015983,
"learning_rate": 1.4e-05,
"loss": 1.7844,
"step": 14
},
{
"epoch": 0.13793103448275862,
"grad_norm": 6.272388047403451,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.7839,
"step": 15
},
{
"epoch": 0.1471264367816092,
"grad_norm": 5.913338642035567,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.5099,
"step": 16
},
{
"epoch": 0.15632183908045977,
"grad_norm": 11.31744159936266,
"learning_rate": 1.7e-05,
"loss": 1.6116,
"step": 17
},
{
"epoch": 0.16551724137931034,
"grad_norm": 4.969853987438896,
"learning_rate": 1.8e-05,
"loss": 1.5317,
"step": 18
},
{
"epoch": 0.17471264367816092,
"grad_norm": 5.355015500818536,
"learning_rate": 1.9e-05,
"loss": 1.6409,
"step": 19
},
{
"epoch": 0.1839080459770115,
"grad_norm": 5.196414742911105,
"learning_rate": 2e-05,
"loss": 1.5442,
"step": 20
},
{
"epoch": 0.19310344827586207,
"grad_norm": 5.614271101884435,
"learning_rate": 1.999988738608264e-05,
"loss": 1.5029,
"step": 21
},
{
"epoch": 0.20229885057471264,
"grad_norm": 5.015394546767328,
"learning_rate": 1.9999549547148767e-05,
"loss": 1.8169,
"step": 22
},
{
"epoch": 0.21149425287356322,
"grad_norm": 4.557156450415814,
"learning_rate": 1.9998986491652896e-05,
"loss": 1.5093,
"step": 23
},
{
"epoch": 0.2206896551724138,
"grad_norm": 5.945840310610162,
"learning_rate": 1.9998198233685676e-05,
"loss": 1.6238,
"step": 24
},
{
"epoch": 0.22988505747126436,
"grad_norm": 7.357312504680639,
"learning_rate": 1.9997184792973504e-05,
"loss": 1.4395,
"step": 25
},
{
"epoch": 0.23908045977011494,
"grad_norm": 9.510181997589852,
"learning_rate": 1.999594619487806e-05,
"loss": 1.3813,
"step": 26
},
{
"epoch": 0.2482758620689655,
"grad_norm": 5.191011707934582,
"learning_rate": 1.999448247039565e-05,
"loss": 1.399,
"step": 27
},
{
"epoch": 0.2574712643678161,
"grad_norm": 6.777591477906461,
"learning_rate": 1.999279365615644e-05,
"loss": 1.4485,
"step": 28
},
{
"epoch": 0.26666666666666666,
"grad_norm": 12.91448741331523,
"learning_rate": 1.9990879794423536e-05,
"loss": 1.5291,
"step": 29
},
{
"epoch": 0.27586206896551724,
"grad_norm": 7.9192564908233525,
"learning_rate": 1.9988740933091932e-05,
"loss": 1.752,
"step": 30
},
{
"epoch": 0.2850574712643678,
"grad_norm": 13.307612828008661,
"learning_rate": 1.9986377125687305e-05,
"loss": 1.5955,
"step": 31
},
{
"epoch": 0.2942528735632184,
"grad_norm": 5.626525978301407,
"learning_rate": 1.998378843136468e-05,
"loss": 1.3663,
"step": 32
},
{
"epoch": 0.30344827586206896,
"grad_norm": 7.3508239388348375,
"learning_rate": 1.998097491490695e-05,
"loss": 1.6621,
"step": 33
},
{
"epoch": 0.31264367816091954,
"grad_norm": 8.066689186099643,
"learning_rate": 1.9977936646723254e-05,
"loss": 1.5935,
"step": 34
},
{
"epoch": 0.3218390804597701,
"grad_norm": 6.180669424711909,
"learning_rate": 1.99746737028472e-05,
"loss": 1.7871,
"step": 35
},
{
"epoch": 0.3310344827586207,
"grad_norm": 9.183166350518048,
"learning_rate": 1.9971186164934995e-05,
"loss": 1.7529,
"step": 36
},
{
"epoch": 0.34022988505747126,
"grad_norm": 7.006667910909564,
"learning_rate": 1.996747412026337e-05,
"loss": 1.6017,
"step": 37
},
{
"epoch": 0.34942528735632183,
"grad_norm": 5.476091693081514,
"learning_rate": 1.9963537661727415e-05,
"loss": 1.5574,
"step": 38
},
{
"epoch": 0.3586206896551724,
"grad_norm": 7.073151215201376,
"learning_rate": 1.995937688783824e-05,
"loss": 1.52,
"step": 39
},
{
"epoch": 0.367816091954023,
"grad_norm": 6.105787470572727,
"learning_rate": 1.995499190272053e-05,
"loss": 1.6445,
"step": 40
},
{
"epoch": 0.37701149425287356,
"grad_norm": 26.79480096943695,
"learning_rate": 1.9950382816109904e-05,
"loss": 1.5081,
"step": 41
},
{
"epoch": 0.38620689655172413,
"grad_norm": 12.678846665605036,
"learning_rate": 1.994554974335022e-05,
"loss": 1.2374,
"step": 42
},
{
"epoch": 0.3954022988505747,
"grad_norm": 8.982247296188962,
"learning_rate": 1.9940492805390644e-05,
"loss": 1.3977,
"step": 43
},
{
"epoch": 0.4045977011494253,
"grad_norm": 9.660805568531522,
"learning_rate": 1.9935212128782637e-05,
"loss": 1.4276,
"step": 44
},
{
"epoch": 0.41379310344827586,
"grad_norm": 5.878685972804514,
"learning_rate": 1.9929707845676796e-05,
"loss": 1.498,
"step": 45
},
{
"epoch": 0.42298850574712643,
"grad_norm": 6.1805386339462425,
"learning_rate": 1.992398009381954e-05,
"loss": 1.5585,
"step": 46
},
{
"epoch": 0.432183908045977,
"grad_norm": 7.004448856725815,
"learning_rate": 1.991802901654966e-05,
"loss": 1.5439,
"step": 47
},
{
"epoch": 0.4413793103448276,
"grad_norm": 21.3461812408264,
"learning_rate": 1.9911854762794747e-05,
"loss": 1.48,
"step": 48
},
{
"epoch": 0.45057471264367815,
"grad_norm": 11.305699015280368,
"learning_rate": 1.9905457487067438e-05,
"loss": 1.5159,
"step": 49
},
{
"epoch": 0.45977011494252873,
"grad_norm": 9.362223347622393,
"learning_rate": 1.9898837349461573e-05,
"loss": 1.3899,
"step": 50
},
{
"epoch": 0.4689655172413793,
"grad_norm": 6.86071978513186,
"learning_rate": 1.989199451564819e-05,
"loss": 1.3236,
"step": 51
},
{
"epoch": 0.4781609195402299,
"grad_norm": 10.919914686023162,
"learning_rate": 1.9884929156871348e-05,
"loss": 1.5464,
"step": 52
},
{
"epoch": 0.48735632183908045,
"grad_norm": 18.62756427908137,
"learning_rate": 1.9877641449943884e-05,
"loss": 1.4592,
"step": 53
},
{
"epoch": 0.496551724137931,
"grad_norm": 14.125885744657854,
"learning_rate": 1.9870131577242958e-05,
"loss": 1.6143,
"step": 54
},
{
"epoch": 0.5057471264367817,
"grad_norm": 21.319792981406064,
"learning_rate": 1.98623997267055e-05,
"loss": 1.6257,
"step": 55
},
{
"epoch": 0.5149425287356322,
"grad_norm": 7.07342180739188,
"learning_rate": 1.98544460918235e-05,
"loss": 1.3489,
"step": 56
},
{
"epoch": 0.5241379310344828,
"grad_norm": 11.602183894060184,
"learning_rate": 1.984627087163918e-05,
"loss": 1.3555,
"step": 57
},
{
"epoch": 0.5333333333333333,
"grad_norm": 5.711430596116647,
"learning_rate": 1.9837874270740005e-05,
"loss": 1.4868,
"step": 58
},
{
"epoch": 0.542528735632184,
"grad_norm": 6.872030995436107,
"learning_rate": 1.9829256499253548e-05,
"loss": 1.4138,
"step": 59
},
{
"epoch": 0.5517241379310345,
"grad_norm": 6.389710028362303,
"learning_rate": 1.982041777284226e-05,
"loss": 1.532,
"step": 60
},
{
"epoch": 0.5609195402298851,
"grad_norm": 8.960725431515376,
"learning_rate": 1.9811358312698052e-05,
"loss": 1.4233,
"step": 61
},
{
"epoch": 0.5701149425287356,
"grad_norm": 7.068530411045381,
"learning_rate": 1.980207834553677e-05,
"loss": 1.4343,
"step": 62
},
{
"epoch": 0.5793103448275863,
"grad_norm": 7.937280281308531,
"learning_rate": 1.9792578103592506e-05,
"loss": 1.4436,
"step": 63
},
{
"epoch": 0.5885057471264368,
"grad_norm": 4.993919261195511,
"learning_rate": 1.978285782461182e-05,
"loss": 1.1707,
"step": 64
},
{
"epoch": 0.5977011494252874,
"grad_norm": 9.299339194434403,
"learning_rate": 1.977291775184775e-05,
"loss": 1.3752,
"step": 65
},
{
"epoch": 0.6068965517241379,
"grad_norm": 8.969891010411576,
"learning_rate": 1.976275813405374e-05,
"loss": 1.7358,
"step": 66
},
{
"epoch": 0.6160919540229886,
"grad_norm": 7.846317322412413,
"learning_rate": 1.9752379225477436e-05,
"loss": 1.6702,
"step": 67
},
{
"epoch": 0.6252873563218391,
"grad_norm": 10.35641201740817,
"learning_rate": 1.974178128585429e-05,
"loss": 1.6179,
"step": 68
},
{
"epoch": 0.6344827586206897,
"grad_norm": 13.053360167992375,
"learning_rate": 1.973096458040108e-05,
"loss": 1.3878,
"step": 69
},
{
"epoch": 0.6436781609195402,
"grad_norm": 8.85650402977275,
"learning_rate": 1.9719929379809262e-05,
"loss": 1.402,
"step": 70
},
{
"epoch": 0.6528735632183909,
"grad_norm": 7.259573301011822,
"learning_rate": 1.9708675960238214e-05,
"loss": 1.325,
"step": 71
},
{
"epoch": 0.6620689655172414,
"grad_norm": 8.2385844490914,
"learning_rate": 1.9697204603308303e-05,
"loss": 1.5098,
"step": 72
},
{
"epoch": 0.671264367816092,
"grad_norm": 6.950518749393352,
"learning_rate": 1.9685515596093844e-05,
"loss": 1.318,
"step": 73
},
{
"epoch": 0.6804597701149425,
"grad_norm": 9.109982816285358,
"learning_rate": 1.967360923111593e-05,
"loss": 1.4189,
"step": 74
},
{
"epoch": 0.6896551724137931,
"grad_norm": 10.452230731667223,
"learning_rate": 1.9661485806335095e-05,
"loss": 1.4102,
"step": 75
},
{
"epoch": 0.6988505747126437,
"grad_norm": 19.344365444774066,
"learning_rate": 1.964914562514386e-05,
"loss": 1.7136,
"step": 76
},
{
"epoch": 0.7080459770114943,
"grad_norm": 7.435243566159918,
"learning_rate": 1.9636588996359145e-05,
"loss": 1.3748,
"step": 77
},
{
"epoch": 0.7172413793103448,
"grad_norm": 9.733411972174634,
"learning_rate": 1.9623816234214538e-05,
"loss": 1.3021,
"step": 78
},
{
"epoch": 0.7264367816091954,
"grad_norm": 9.942892711776333,
"learning_rate": 1.9610827658352448e-05,
"loss": 1.5538,
"step": 79
},
{
"epoch": 0.735632183908046,
"grad_norm": 9.084274644415883,
"learning_rate": 1.959762359381606e-05,
"loss": 1.5767,
"step": 80
},
{
"epoch": 0.7448275862068966,
"grad_norm": 6.513368914673006,
"learning_rate": 1.9584204371041257e-05,
"loss": 1.6025,
"step": 81
},
{
"epoch": 0.7540229885057471,
"grad_norm": 7.173737363149948,
"learning_rate": 1.957057032584832e-05,
"loss": 1.8008,
"step": 82
},
{
"epoch": 0.7632183908045977,
"grad_norm": 7.6666142708069,
"learning_rate": 1.955672179943351e-05,
"loss": 1.1672,
"step": 83
},
{
"epoch": 0.7724137931034483,
"grad_norm": 12.748744044610136,
"learning_rate": 1.9542659138360575e-05,
"loss": 1.6484,
"step": 84
},
{
"epoch": 0.7816091954022989,
"grad_norm": 10.435352031122768,
"learning_rate": 1.9528382694552033e-05,
"loss": 1.7322,
"step": 85
},
{
"epoch": 0.7908045977011494,
"grad_norm": 8.348806624357442,
"learning_rate": 1.9513892825280387e-05,
"loss": 1.6316,
"step": 86
},
{
"epoch": 0.8,
"grad_norm": 7.561464021812533,
"learning_rate": 1.9499189893159178e-05,
"loss": 1.5837,
"step": 87
},
{
"epoch": 0.8091954022988506,
"grad_norm": 9.833304197128921,
"learning_rate": 1.9484274266133918e-05,
"loss": 1.8191,
"step": 88
},
{
"epoch": 0.8183908045977012,
"grad_norm": 36.91977456946538,
"learning_rate": 1.9469146317472867e-05,
"loss": 1.6587,
"step": 89
},
{
"epoch": 0.8275862068965517,
"grad_norm": 9.18997721365779,
"learning_rate": 1.9453806425757706e-05,
"loss": 1.6042,
"step": 90
},
{
"epoch": 0.8367816091954023,
"grad_norm": 5.8375441349876285,
"learning_rate": 1.9438254974874055e-05,
"loss": 1.4569,
"step": 91
},
{
"epoch": 0.8459770114942529,
"grad_norm": 10.33531533117819,
"learning_rate": 1.9422492354001876e-05,
"loss": 1.554,
"step": 92
},
{
"epoch": 0.8551724137931035,
"grad_norm": 7.653431229054158,
"learning_rate": 1.9406518957605716e-05,
"loss": 1.6409,
"step": 93
},
{
"epoch": 0.864367816091954,
"grad_norm": 5.139794337597655,
"learning_rate": 1.9390335185424852e-05,
"loss": 1.4226,
"step": 94
},
{
"epoch": 0.8735632183908046,
"grad_norm": 6.772516024095916,
"learning_rate": 1.9373941442463286e-05,
"loss": 1.6716,
"step": 95
},
{
"epoch": 0.8827586206896552,
"grad_norm": 12.456055439523869,
"learning_rate": 1.9357338138979586e-05,
"loss": 1.3682,
"step": 96
},
{
"epoch": 0.8919540229885058,
"grad_norm": 6.3363194804101886,
"learning_rate": 1.9340525690476665e-05,
"loss": 1.5991,
"step": 97
},
{
"epoch": 0.9011494252873563,
"grad_norm": 6.178188975859817,
"learning_rate": 1.9323504517691335e-05,
"loss": 1.512,
"step": 98
},
{
"epoch": 0.9103448275862069,
"grad_norm": 5.469881912998274,
"learning_rate": 1.9306275046583804e-05,
"loss": 1.3198,
"step": 99
},
{
"epoch": 0.9195402298850575,
"grad_norm": 92.72146423017554,
"learning_rate": 1.9288837708327018e-05,
"loss": 1.325,
"step": 100
},
{
"epoch": 0.9287356321839081,
"grad_norm": 10.225670198371311,
"learning_rate": 1.9271192939295863e-05,
"loss": 1.3693,
"step": 101
},
{
"epoch": 0.9379310344827586,
"grad_norm": 15.221298407043829,
"learning_rate": 1.925334118105623e-05,
"loss": 1.3868,
"step": 102
},
{
"epoch": 0.9471264367816092,
"grad_norm": 7.296423072692595,
"learning_rate": 1.9235282880354e-05,
"loss": 1.4702,
"step": 103
},
{
"epoch": 0.9563218390804598,
"grad_norm": 11.090698404128442,
"learning_rate": 1.9217018489103832e-05,
"loss": 1.493,
"step": 104
},
{
"epoch": 0.9655172413793104,
"grad_norm": 13.113004849625796,
"learning_rate": 1.9198548464377875e-05,
"loss": 1.5315,
"step": 105
},
{
"epoch": 0.9747126436781609,
"grad_norm": 6.5579585036461765,
"learning_rate": 1.917987326839431e-05,
"loss": 1.401,
"step": 106
},
{
"epoch": 0.9839080459770115,
"grad_norm": 23.571017224968177,
"learning_rate": 1.9160993368505803e-05,
"loss": 1.5408,
"step": 107
},
{
"epoch": 0.993103448275862,
"grad_norm": 7.932308650354931,
"learning_rate": 1.914190923718779e-05,
"loss": 1.563,
"step": 108
},
{
"epoch": 1.0,
"grad_norm": 7.932308650354931,
"learning_rate": 1.912262135202667e-05,
"loss": 0.9182,
"step": 109
},
{
"epoch": 1.0091954022988505,
"grad_norm": 6.574864715652341,
"learning_rate": 1.9103130195707846e-05,
"loss": 1.3379,
"step": 110
},
{
"epoch": 1.018390804597701,
"grad_norm": 8.237386794566326,
"learning_rate": 1.9083436256003643e-05,
"loss": 1.4205,
"step": 111
},
{
"epoch": 1.0275862068965518,
"grad_norm": 6.220467444924186,
"learning_rate": 1.906354002576111e-05,
"loss": 1.4788,
"step": 112
},
{
"epoch": 1.0367816091954023,
"grad_norm": 13.393857842936615,
"learning_rate": 1.9043442002889663e-05,
"loss": 1.2128,
"step": 113
},
{
"epoch": 1.0459770114942528,
"grad_norm": 7.240700390139476,
"learning_rate": 1.9023142690348663e-05,
"loss": 1.4041,
"step": 114
},
{
"epoch": 1.0551724137931036,
"grad_norm": 12.403313459805997,
"learning_rate": 1.90026425961348e-05,
"loss": 1.1957,
"step": 115
},
{
"epoch": 1.064367816091954,
"grad_norm": 5.294546200505072,
"learning_rate": 1.898194223326939e-05,
"loss": 1.5244,
"step": 116
},
{
"epoch": 1.0735632183908046,
"grad_norm": 8.062777181362618,
"learning_rate": 1.8961042119785534e-05,
"loss": 1.5571,
"step": 117
},
{
"epoch": 1.0827586206896551,
"grad_norm": 6.630411039426618,
"learning_rate": 1.893994277871515e-05,
"loss": 1.4017,
"step": 118
},
{
"epoch": 1.0919540229885056,
"grad_norm": 7.466049537995627,
"learning_rate": 1.891864473807589e-05,
"loss": 1.6523,
"step": 119
},
{
"epoch": 1.1011494252873564,
"grad_norm": 6.878184185710444,
"learning_rate": 1.8897148530857944e-05,
"loss": 1.7305,
"step": 120
},
{
"epoch": 1.110344827586207,
"grad_norm": 10.183361403701092,
"learning_rate": 1.8875454695010655e-05,
"loss": 1.4861,
"step": 121
},
{
"epoch": 1.1195402298850574,
"grad_norm": 7.846946240488356,
"learning_rate": 1.8853563773429102e-05,
"loss": 1.3378,
"step": 122
},
{
"epoch": 1.1287356321839082,
"grad_norm": 8.380364873375658,
"learning_rate": 1.8831476313940495e-05,
"loss": 1.2773,
"step": 123
},
{
"epoch": 1.1379310344827587,
"grad_norm": 7.13754465392972,
"learning_rate": 1.8809192869290463e-05,
"loss": 1.3115,
"step": 124
},
{
"epoch": 1.1471264367816092,
"grad_norm": 16.170663480840823,
"learning_rate": 1.878671399712923e-05,
"loss": 1.5776,
"step": 125
},
{
"epoch": 1.1563218390804597,
"grad_norm": 6.760966961951662,
"learning_rate": 1.8764040259997642e-05,
"loss": 1.6387,
"step": 126
},
{
"epoch": 1.1655172413793102,
"grad_norm": 6.231697770807728,
"learning_rate": 1.874117222531312e-05,
"loss": 1.4857,
"step": 127
},
{
"epoch": 1.174712643678161,
"grad_norm": 11.512205247824191,
"learning_rate": 1.8718110465355436e-05,
"loss": 1.958,
"step": 128
},
{
"epoch": 1.1839080459770115,
"grad_norm": 10.234002782144223,
"learning_rate": 1.8694855557252395e-05,
"loss": 1.6003,
"step": 129
},
{
"epoch": 1.193103448275862,
"grad_norm": 6.775464121063177,
"learning_rate": 1.8671408082965394e-05,
"loss": 1.3716,
"step": 130
},
{
"epoch": 1.2022988505747128,
"grad_norm": 9.532716070689466,
"learning_rate": 1.8647768629274865e-05,
"loss": 1.2361,
"step": 131
},
{
"epoch": 1.2114942528735633,
"grad_norm": 5.598812395655789,
"learning_rate": 1.8623937787765582e-05,
"loss": 1.2849,
"step": 132
},
{
"epoch": 1.2206896551724138,
"grad_norm": 12.593202228212819,
"learning_rate": 1.8599916154811858e-05,
"loss": 1.3579,
"step": 133
},
{
"epoch": 1.2298850574712643,
"grad_norm": 7.992727204119873,
"learning_rate": 1.8575704331562624e-05,
"loss": 1.293,
"step": 134
},
{
"epoch": 1.2390804597701148,
"grad_norm": 10.180939056019497,
"learning_rate": 1.8551302923926387e-05,
"loss": 1.3632,
"step": 135
},
{
"epoch": 1.2482758620689656,
"grad_norm": 5.835100637584005,
"learning_rate": 1.8526712542556054e-05,
"loss": 1.4304,
"step": 136
},
{
"epoch": 1.257471264367816,
"grad_norm": 7.133504661503169,
"learning_rate": 1.8501933802833664e-05,
"loss": 1.4319,
"step": 137
},
{
"epoch": 1.2666666666666666,
"grad_norm": 6.700994373390855,
"learning_rate": 1.8476967324854987e-05,
"loss": 1.6399,
"step": 138
},
{
"epoch": 1.2758620689655173,
"grad_norm": 7.6168901919768315,
"learning_rate": 1.8451813733413998e-05,
"loss": 1.4226,
"step": 139
},
{
"epoch": 1.2850574712643679,
"grad_norm": 6.869638065615107,
"learning_rate": 1.8426473657987238e-05,
"loss": 1.3926,
"step": 140
},
{
"epoch": 1.2942528735632184,
"grad_norm": 7.9246065537695145,
"learning_rate": 1.8400947732718083e-05,
"loss": 1.3882,
"step": 141
},
{
"epoch": 1.303448275862069,
"grad_norm": 35.57695232792952,
"learning_rate": 1.837523659640085e-05,
"loss": 1.2931,
"step": 142
},
{
"epoch": 1.3126436781609194,
"grad_norm": 6.537575634392787,
"learning_rate": 1.8349340892464827e-05,
"loss": 1.3601,
"step": 143
},
{
"epoch": 1.3218390804597702,
"grad_norm": 5.796967091596528,
"learning_rate": 1.832326126895816e-05,
"loss": 1.2791,
"step": 144
},
{
"epoch": 1.3310344827586207,
"grad_norm": 5.370682917159942,
"learning_rate": 1.8296998378531634e-05,
"loss": 1.6052,
"step": 145
},
{
"epoch": 1.3402298850574712,
"grad_norm": 8.766511156656957,
"learning_rate": 1.827055287842236e-05,
"loss": 1.3518,
"step": 146
},
{
"epoch": 1.349425287356322,
"grad_norm": 6.480740512124651,
"learning_rate": 1.8243925430437314e-05,
"loss": 1.311,
"step": 147
},
{
"epoch": 1.3586206896551725,
"grad_norm": 7.645471465966849,
"learning_rate": 1.821711670093676e-05,
"loss": 1.291,
"step": 148
},
{
"epoch": 1.367816091954023,
"grad_norm": 9.381673919145971,
"learning_rate": 1.81901273608176e-05,
"loss": 1.4457,
"step": 149
},
{
"epoch": 1.3770114942528735,
"grad_norm": 92.46895216336263,
"learning_rate": 1.8162958085496572e-05,
"loss": 1.2527,
"step": 150
},
{
"epoch": 1.386206896551724,
"grad_norm": 5.256281556855925,
"learning_rate": 1.8135609554893345e-05,
"loss": 1.3901,
"step": 151
},
{
"epoch": 1.3954022988505748,
"grad_norm": 6.207996783084738,
"learning_rate": 1.810808245341352e-05,
"loss": 1.3934,
"step": 152
},
{
"epoch": 1.4045977011494253,
"grad_norm": 7.475298218689304,
"learning_rate": 1.8080377469931468e-05,
"loss": 1.5079,
"step": 153
},
{
"epoch": 1.4137931034482758,
"grad_norm": 7.348051374244608,
"learning_rate": 1.8052495297773135e-05,
"loss": 1.3069,
"step": 154
},
{
"epoch": 1.4229885057471265,
"grad_norm": 5.764809442997243,
"learning_rate": 1.802443663469867e-05,
"loss": 1.4919,
"step": 155
},
{
"epoch": 1.432183908045977,
"grad_norm": 6.715860371189423,
"learning_rate": 1.7996202182884938e-05,
"loss": 1.4631,
"step": 156
},
{
"epoch": 1.4413793103448276,
"grad_norm": 6.647142576932514,
"learning_rate": 1.7967792648907993e-05,
"loss": 1.5767,
"step": 157
},
{
"epoch": 1.450574712643678,
"grad_norm": 15.258238976802454,
"learning_rate": 1.7939208743725378e-05,
"loss": 1.4467,
"step": 158
},
{
"epoch": 1.4597701149425286,
"grad_norm": 7.134307398087775,
"learning_rate": 1.7910451182658318e-05,
"loss": 1.3992,
"step": 159
},
{
"epoch": 1.4689655172413794,
"grad_norm": 10.178435844025032,
"learning_rate": 1.7881520685373836e-05,
"loss": 1.3086,
"step": 160
},
{
"epoch": 1.4781609195402299,
"grad_norm": 7.9995750026556065,
"learning_rate": 1.7852417975866735e-05,
"loss": 1.3984,
"step": 161
},
{
"epoch": 1.4873563218390804,
"grad_norm": 6.04856446144021,
"learning_rate": 1.7823143782441498e-05,
"loss": 1.3864,
"step": 162
},
{
"epoch": 1.4965517241379311,
"grad_norm": 7.302148673860431,
"learning_rate": 1.779369883769403e-05,
"loss": 1.4692,
"step": 163
},
{
"epoch": 1.5057471264367817,
"grad_norm": 11.710455921764995,
"learning_rate": 1.7764083878493342e-05,
"loss": 1.3108,
"step": 164
},
{
"epoch": 1.5149425287356322,
"grad_norm": 6.297229352579108,
"learning_rate": 1.7734299645963126e-05,
"loss": 1.6995,
"step": 165
},
{
"epoch": 1.524137931034483,
"grad_norm": 21.21748624748657,
"learning_rate": 1.7704346885463173e-05,
"loss": 1.3864,
"step": 166
},
{
"epoch": 1.5333333333333332,
"grad_norm": 7.694329489180455,
"learning_rate": 1.7674226346570756e-05,
"loss": 1.4465,
"step": 167
},
{
"epoch": 1.542528735632184,
"grad_norm": 6.791665210167091,
"learning_rate": 1.7643938783061844e-05,
"loss": 1.3967,
"step": 168
},
{
"epoch": 1.5517241379310345,
"grad_norm": 18.25267999804304,
"learning_rate": 1.761348495289225e-05,
"loss": 1.7708,
"step": 169
},
{
"epoch": 1.560919540229885,
"grad_norm": 11.606015421810417,
"learning_rate": 1.7582865618178673e-05,
"loss": 1.38,
"step": 170
},
{
"epoch": 1.5701149425287357,
"grad_norm": 6.76568404259339,
"learning_rate": 1.755208154517961e-05,
"loss": 1.7734,
"step": 171
},
{
"epoch": 1.5793103448275863,
"grad_norm": 7.457232551239884,
"learning_rate": 1.752113350427617e-05,
"loss": 1.3568,
"step": 172
},
{
"epoch": 1.5885057471264368,
"grad_norm": 10.071218139243994,
"learning_rate": 1.7490022269952836e-05,
"loss": 1.3582,
"step": 173
},
{
"epoch": 1.5977011494252875,
"grad_norm": 8.467685174322579,
"learning_rate": 1.7458748620778047e-05,
"loss": 1.4399,
"step": 174
},
{
"epoch": 1.6068965517241378,
"grad_norm": 6.051347000729604,
"learning_rate": 1.742731333938472e-05,
"loss": 1.3508,
"step": 175
},
{
"epoch": 1.6160919540229886,
"grad_norm": 6.367343243904751,
"learning_rate": 1.7395717212450673e-05,
"loss": 1.3251,
"step": 176
},
{
"epoch": 1.625287356321839,
"grad_norm": 7.724598036207127,
"learning_rate": 1.736396103067893e-05,
"loss": 1.2026,
"step": 177
},
{
"epoch": 1.6344827586206896,
"grad_norm": 5.76807974896288,
"learning_rate": 1.733204558877795e-05,
"loss": 1.1807,
"step": 178
},
{
"epoch": 1.6436781609195403,
"grad_norm": 8.50190392019292,
"learning_rate": 1.729997168544171e-05,
"loss": 1.2231,
"step": 179
},
{
"epoch": 1.6528735632183909,
"grad_norm": 19.54162117368854,
"learning_rate": 1.7267740123329756e-05,
"loss": 1.5237,
"step": 180
},
{
"epoch": 1.6620689655172414,
"grad_norm": 9.986270310555119,
"learning_rate": 1.7235351709047072e-05,
"loss": 1.2517,
"step": 181
},
{
"epoch": 1.6712643678160921,
"grad_norm": 6.78295518963419,
"learning_rate": 1.720280725312393e-05,
"loss": 1.6053,
"step": 182
},
{
"epoch": 1.6804597701149424,
"grad_norm": 6.601674166563654,
"learning_rate": 1.7170107569995588e-05,
"loss": 1.2712,
"step": 183
},
{
"epoch": 1.6896551724137931,
"grad_norm": 8.184620262857814,
"learning_rate": 1.7137253477981916e-05,
"loss": 1.3293,
"step": 184
},
{
"epoch": 1.6988505747126437,
"grad_norm": 6.739412467459474,
"learning_rate": 1.7104245799266917e-05,
"loss": 1.0026,
"step": 185
},
{
"epoch": 1.7080459770114942,
"grad_norm": 5.556603900105146,
"learning_rate": 1.707108535987815e-05,
"loss": 1.6606,
"step": 186
},
{
"epoch": 1.717241379310345,
"grad_norm": 12.138471189450616,
"learning_rate": 1.7037772989666043e-05,
"loss": 1.3003,
"step": 187
},
{
"epoch": 1.7264367816091954,
"grad_norm": 5.204252391318651,
"learning_rate": 1.7004309522283162e-05,
"loss": 1.4929,
"step": 188
},
{
"epoch": 1.735632183908046,
"grad_norm": 14.919779522258695,
"learning_rate": 1.6970695795163322e-05,
"loss": 1.6902,
"step": 189
},
{
"epoch": 1.7448275862068967,
"grad_norm": 5.328033889559845,
"learning_rate": 1.693693264950062e-05,
"loss": 1.4431,
"step": 190
},
{
"epoch": 1.754022988505747,
"grad_norm": 6.230962658840152,
"learning_rate": 1.6903020930228424e-05,
"loss": 1.4314,
"step": 191
},
{
"epoch": 1.7632183908045977,
"grad_norm": 6.180575508805239,
"learning_rate": 1.6868961485998178e-05,
"loss": 1.5364,
"step": 192
},
{
"epoch": 1.7724137931034483,
"grad_norm": 10.76113257757336,
"learning_rate": 1.683475516915821e-05,
"loss": 1.3914,
"step": 193
},
{
"epoch": 1.7816091954022988,
"grad_norm": 6.856163941107209,
"learning_rate": 1.6800402835732367e-05,
"loss": 1.304,
"step": 194
},
{
"epoch": 1.7908045977011495,
"grad_norm": 6.413125687720114,
"learning_rate": 1.6765905345398618e-05,
"loss": 1.3577,
"step": 195
},
{
"epoch": 1.8,
"grad_norm": 10.350633192944896,
"learning_rate": 1.6731263561467514e-05,
"loss": 1.3384,
"step": 196
},
{
"epoch": 1.8091954022988506,
"grad_norm": 6.680868526375388,
"learning_rate": 1.6696478350860625e-05,
"loss": 1.322,
"step": 197
},
{
"epoch": 1.8183908045977013,
"grad_norm": 9.172318252799384,
"learning_rate": 1.666155058408879e-05,
"loss": 1.6331,
"step": 198
},
{
"epoch": 1.8275862068965516,
"grad_norm": 8.408442568480286,
"learning_rate": 1.6626481135230378e-05,
"loss": 1.6042,
"step": 199
},
{
"epoch": 1.8367816091954023,
"grad_norm": 7.431075981024314,
"learning_rate": 1.6591270881909393e-05,
"loss": 1.5691,
"step": 200
},
{
"epoch": 1.8459770114942529,
"grad_norm": 17.102642928318303,
"learning_rate": 1.6555920705273513e-05,
"loss": 1.7698,
"step": 201
},
{
"epoch": 1.8551724137931034,
"grad_norm": 14.163498166355847,
"learning_rate": 1.6520431489972043e-05,
"loss": 1.4268,
"step": 202
},
{
"epoch": 1.8643678160919541,
"grad_norm": 8.38433733288465,
"learning_rate": 1.6484804124133772e-05,
"loss": 1.4326,
"step": 203
},
{
"epoch": 1.8735632183908046,
"grad_norm": 7.414923080451205,
"learning_rate": 1.6449039499344755e-05,
"loss": 1.4021,
"step": 204
},
{
"epoch": 1.8827586206896552,
"grad_norm": 9.285429331174253,
"learning_rate": 1.6413138510625994e-05,
"loss": 1.537,
"step": 205
},
{
"epoch": 1.891954022988506,
"grad_norm": 8.620259857009387,
"learning_rate": 1.637710205641103e-05,
"loss": 1.5474,
"step": 206
},
{
"epoch": 1.9011494252873562,
"grad_norm": 7.5352577306905175,
"learning_rate": 1.634093103852349e-05,
"loss": 1.276,
"step": 207
},
{
"epoch": 1.910344827586207,
"grad_norm": 8.551871535313907,
"learning_rate": 1.6304626362154484e-05,
"loss": 1.2695,
"step": 208
},
{
"epoch": 1.9195402298850575,
"grad_norm": 11.581334952401058,
"learning_rate": 1.6268188935839976e-05,
"loss": 1.5916,
"step": 209
},
{
"epoch": 1.928735632183908,
"grad_norm": 13.17525028833506,
"learning_rate": 1.623161967143803e-05,
"loss": 1.6626,
"step": 210
},
{
"epoch": 1.9379310344827587,
"grad_norm": 8.444643409343747,
"learning_rate": 1.6194919484106016e-05,
"loss": 1.3036,
"step": 211
},
{
"epoch": 1.9471264367816092,
"grad_norm": 7.6138309875760415,
"learning_rate": 1.6158089292277674e-05,
"loss": 1.6266,
"step": 212
},
{
"epoch": 1.9563218390804598,
"grad_norm": 8.510948546395023,
"learning_rate": 1.612113001764016e-05,
"loss": 1.2229,
"step": 213
},
{
"epoch": 1.9655172413793105,
"grad_norm": 18.34541377646805,
"learning_rate": 1.6084042585110955e-05,
"loss": 1.5161,
"step": 214
},
{
"epoch": 1.9747126436781608,
"grad_norm": 8.232021713485729,
"learning_rate": 1.6046827922814746e-05,
"loss": 1.5459,
"step": 215
},
{
"epoch": 1.9839080459770115,
"grad_norm": 7.80867265713955,
"learning_rate": 1.6009486962060175e-05,
"loss": 1.311,
"step": 216
},
{
"epoch": 1.993103448275862,
"grad_norm": 10.173776002475448,
"learning_rate": 1.597202063731655e-05,
"loss": 1.4924,
"step": 217
},
{
"epoch": 2.0,
"grad_norm": 9.754643284384423,
"learning_rate": 1.5934429886190444e-05,
"loss": 0.9814,
"step": 218
},
{
"epoch": 2.0091954022988507,
"grad_norm": 9.478427097239926,
"learning_rate": 1.5896715649402245e-05,
"loss": 1.6133,
"step": 219
},
{
"epoch": 2.018390804597701,
"grad_norm": 8.166444573768159,
"learning_rate": 1.585887887076261e-05,
"loss": 1.4502,
"step": 220
},
{
"epoch": 2.027586206896552,
"grad_norm": 8.541283789837138,
"learning_rate": 1.582092049714884e-05,
"loss": 1.6396,
"step": 221
},
{
"epoch": 2.036781609195402,
"grad_norm": 11.682225296224088,
"learning_rate": 1.5782841478481187e-05,
"loss": 1.5421,
"step": 222
},
{
"epoch": 2.045977011494253,
"grad_norm": 10.149484070655848,
"learning_rate": 1.5744642767699093e-05,
"loss": 1.314,
"step": 223
},
{
"epoch": 2.0551724137931036,
"grad_norm": 8.549351099175704,
"learning_rate": 1.5706325320737327e-05,
"loss": 1.1816,
"step": 224
},
{
"epoch": 2.064367816091954,
"grad_norm": 6.459017391887839,
"learning_rate": 1.566789009650206e-05,
"loss": 1.2528,
"step": 225
},
{
"epoch": 2.0735632183908046,
"grad_norm": 8.222834591178689,
"learning_rate": 1.562933805684689e-05,
"loss": 1.4919,
"step": 226
},
{
"epoch": 2.0827586206896553,
"grad_norm": 9.249895356593102,
"learning_rate": 1.5590670166548752e-05,
"loss": 1.1503,
"step": 227
},
{
"epoch": 2.0919540229885056,
"grad_norm": 7.8698554294535406,
"learning_rate": 1.5551887393283778e-05,
"loss": 1.4001,
"step": 228
},
{
"epoch": 2.1011494252873564,
"grad_norm": 14.354528964959558,
"learning_rate": 1.551299070760309e-05,
"loss": 1.4355,
"step": 229
},
{
"epoch": 2.110344827586207,
"grad_norm": 12.62190606379736,
"learning_rate": 1.547398108290849e-05,
"loss": 1.3149,
"step": 230
},
{
"epoch": 2.1195402298850574,
"grad_norm": 5.985166640280286,
"learning_rate": 1.5434859495428126e-05,
"loss": 1.4758,
"step": 231
},
{
"epoch": 2.128735632183908,
"grad_norm": 8.93057666695323,
"learning_rate": 1.539562692419205e-05,
"loss": 1.4132,
"step": 232
},
{
"epoch": 2.1379310344827585,
"grad_norm": 169.89901344705734,
"learning_rate": 1.5356284351007713e-05,
"loss": 1.2222,
"step": 233
},
{
"epoch": 2.147126436781609,
"grad_norm": 10.407513886004416,
"learning_rate": 1.5316832760435395e-05,
"loss": 1.403,
"step": 234
},
{
"epoch": 2.15632183908046,
"grad_norm": 7.673828476190051,
"learning_rate": 1.5277273139763584e-05,
"loss": 1.2657,
"step": 235
},
{
"epoch": 2.1655172413793102,
"grad_norm": 8.662439314553673,
"learning_rate": 1.5237606478984244e-05,
"loss": 1.4838,
"step": 236
},
{
"epoch": 2.174712643678161,
"grad_norm": 6.161005447060972,
"learning_rate": 1.5197833770768053e-05,
"loss": 1.2036,
"step": 237
},
{
"epoch": 2.1839080459770113,
"grad_norm": 10.211172062940802,
"learning_rate": 1.515795601043956e-05,
"loss": 1.3413,
"step": 238
},
{
"epoch": 2.193103448275862,
"grad_norm": 31.652507696152473,
"learning_rate": 1.5117974195952286e-05,
"loss": 1.4092,
"step": 239
},
{
"epoch": 2.2022988505747128,
"grad_norm": 8.725107105944577,
"learning_rate": 1.5077889327863725e-05,
"loss": 1.1694,
"step": 240
},
{
"epoch": 2.211494252873563,
"grad_norm": 8.66761735043033,
"learning_rate": 1.5037702409310324e-05,
"loss": 1.387,
"step": 241
},
{
"epoch": 2.220689655172414,
"grad_norm": 9.553327260316669,
"learning_rate": 1.499741444598238e-05,
"loss": 1.2606,
"step": 242
},
{
"epoch": 2.2298850574712645,
"grad_norm": 10.696600046863653,
"learning_rate": 1.4957026446098867e-05,
"loss": 1.4158,
"step": 243
},
{
"epoch": 2.239080459770115,
"grad_norm": 10.39962688994084,
"learning_rate": 1.4916539420382203e-05,
"loss": 1.3589,
"step": 244
},
{
"epoch": 2.2482758620689656,
"grad_norm": 14.53443112403548,
"learning_rate": 1.4875954382032956e-05,
"loss": 1.4326,
"step": 245
},
{
"epoch": 2.2574712643678163,
"grad_norm": 7.410550756457302,
"learning_rate": 1.4835272346704494e-05,
"loss": 1.1635,
"step": 246
},
{
"epoch": 2.2666666666666666,
"grad_norm": 7.6372427743277775,
"learning_rate": 1.4794494332477566e-05,
"loss": 1.4257,
"step": 247
},
{
"epoch": 2.2758620689655173,
"grad_norm": 21.947698234199052,
"learning_rate": 1.4753621359834822e-05,
"loss": 1.4056,
"step": 248
},
{
"epoch": 2.2850574712643676,
"grad_norm": 11.646825257901302,
"learning_rate": 1.4712654451635275e-05,
"loss": 1.5212,
"step": 249
},
{
"epoch": 2.2942528735632184,
"grad_norm": 17.799789150094412,
"learning_rate": 1.4671594633088704e-05,
"loss": 1.163,
"step": 250
},
{
"epoch": 2.303448275862069,
"grad_norm": 11.049404976513573,
"learning_rate": 1.4630442931730007e-05,
"loss": 1.3228,
"step": 251
},
{
"epoch": 2.3126436781609194,
"grad_norm": 10.135916865637768,
"learning_rate": 1.4589200377393467e-05,
"loss": 1.5016,
"step": 252
},
{
"epoch": 2.32183908045977,
"grad_norm": 15.774619581016921,
"learning_rate": 1.4547868002186996e-05,
"loss": 1.5846,
"step": 253
},
{
"epoch": 2.3310344827586205,
"grad_norm": 10.754021244507555,
"learning_rate": 1.4506446840466302e-05,
"loss": 1.2985,
"step": 254
},
{
"epoch": 2.340229885057471,
"grad_norm": 13.937037843771375,
"learning_rate": 1.4464937928809009e-05,
"loss": 1.28,
"step": 255
},
{
"epoch": 2.349425287356322,
"grad_norm": 12.31608417163875,
"learning_rate": 1.4423342305988697e-05,
"loss": 1.4902,
"step": 256
},
{
"epoch": 2.3586206896551722,
"grad_norm": 9.954291617642005,
"learning_rate": 1.4381661012948933e-05,
"loss": 1.2722,
"step": 257
},
{
"epoch": 2.367816091954023,
"grad_norm": 11.04918384734279,
"learning_rate": 1.4339895092777204e-05,
"loss": 1.2628,
"step": 258
},
{
"epoch": 2.3770114942528737,
"grad_norm": 6.763154511930277,
"learning_rate": 1.4298045590678814e-05,
"loss": 1.1636,
"step": 259
},
{
"epoch": 2.386206896551724,
"grad_norm": 13.574514226985405,
"learning_rate": 1.425611355395074e-05,
"loss": 1.428,
"step": 260
},
{
"epoch": 2.3954022988505748,
"grad_norm": 9.65472307533206,
"learning_rate": 1.4214100031955404e-05,
"loss": 1.2303,
"step": 261
},
{
"epoch": 2.4045977011494255,
"grad_norm": 8.266644434941332,
"learning_rate": 1.4172006076094427e-05,
"loss": 1.6992,
"step": 262
},
{
"epoch": 2.413793103448276,
"grad_norm": 11.226367730103076,
"learning_rate": 1.4129832739782314e-05,
"loss": 1.3781,
"step": 263
},
{
"epoch": 2.4229885057471265,
"grad_norm": 10.547590497185766,
"learning_rate": 1.408758107842009e-05,
"loss": 1.4745,
"step": 264
},
{
"epoch": 2.432183908045977,
"grad_norm": 38.63935876164692,
"learning_rate": 1.4045252149368886e-05,
"loss": 1.4921,
"step": 265
},
{
"epoch": 2.4413793103448276,
"grad_norm": 9.852443772549051,
"learning_rate": 1.4002847011923484e-05,
"loss": 1.584,
"step": 266
},
{
"epoch": 2.4505747126436783,
"grad_norm": 12.380130191453606,
"learning_rate": 1.3960366727285809e-05,
"loss": 1.5535,
"step": 267
},
{
"epoch": 2.4597701149425286,
"grad_norm": 8.857515071879746,
"learning_rate": 1.391781235853836e-05,
"loss": 1.3223,
"step": 268
},
{
"epoch": 2.4689655172413794,
"grad_norm": 9.627852836741733,
"learning_rate": 1.3875184970617621e-05,
"loss": 1.5267,
"step": 269
},
{
"epoch": 2.4781609195402297,
"grad_norm": 12.112138710475412,
"learning_rate": 1.3832485630287395e-05,
"loss": 1.5247,
"step": 270
},
{
"epoch": 2.4873563218390804,
"grad_norm": 14.536255701000336,
"learning_rate": 1.3789715406112132e-05,
"loss": 1.5334,
"step": 271
},
{
"epoch": 2.496551724137931,
"grad_norm": 11.226710463125984,
"learning_rate": 1.3746875368430156e-05,
"loss": 1.474,
"step": 272
},
{
"epoch": 2.5057471264367814,
"grad_norm": 9.986652995059503,
"learning_rate": 1.3703966589326905e-05,
"loss": 1.1953,
"step": 273
},
{
"epoch": 2.514942528735632,
"grad_norm": 15.399922495441178,
"learning_rate": 1.3660990142608093e-05,
"loss": 1.3754,
"step": 274
},
{
"epoch": 2.524137931034483,
"grad_norm": 14.096871218357013,
"learning_rate": 1.3617947103772833e-05,
"loss": 1.5314,
"step": 275
},
{
"epoch": 2.533333333333333,
"grad_norm": 20.22748729117087,
"learning_rate": 1.357483854998673e-05,
"loss": 1.2614,
"step": 276
},
{
"epoch": 2.542528735632184,
"grad_norm": 15.107752634691163,
"learning_rate": 1.3531665560054922e-05,
"loss": 1.2576,
"step": 277
},
{
"epoch": 2.5517241379310347,
"grad_norm": 9.065614108838506,
"learning_rate": 1.3488429214395078e-05,
"loss": 1.3296,
"step": 278
},
{
"epoch": 2.560919540229885,
"grad_norm": 10.218458690356865,
"learning_rate": 1.3445130595010366e-05,
"loss": 1.4652,
"step": 279
},
{
"epoch": 2.5701149425287357,
"grad_norm": 52.06028062114195,
"learning_rate": 1.3401770785462375e-05,
"loss": 1.2604,
"step": 280
},
{
"epoch": 2.5793103448275865,
"grad_norm": 11.237376278555484,
"learning_rate": 1.3358350870843994e-05,
"loss": 1.4764,
"step": 281
},
{
"epoch": 2.5885057471264368,
"grad_norm": 19.610789343097895,
"learning_rate": 1.3314871937752266e-05,
"loss": 1.7019,
"step": 282
},
{
"epoch": 2.5977011494252875,
"grad_norm": 12.406919127163583,
"learning_rate": 1.3271335074261183e-05,
"loss": 1.4766,
"step": 283
},
{
"epoch": 2.606896551724138,
"grad_norm": 11.381243573694883,
"learning_rate": 1.3227741369894464e-05,
"loss": 1.3762,
"step": 284
},
{
"epoch": 2.6160919540229886,
"grad_norm": 9.470405274888344,
"learning_rate": 1.3184091915598301e-05,
"loss": 1.3369,
"step": 285
},
{
"epoch": 2.625287356321839,
"grad_norm": 22.889960665827505,
"learning_rate": 1.3140387803714025e-05,
"loss": 1.2954,
"step": 286
},
{
"epoch": 2.6344827586206896,
"grad_norm": 9.18406523843117,
"learning_rate": 1.309663012795081e-05,
"loss": 1.2422,
"step": 287
},
{
"epoch": 2.6436781609195403,
"grad_norm": 11.268913127502152,
"learning_rate": 1.3052819983358269e-05,
"loss": 1.4489,
"step": 288
},
{
"epoch": 2.6528735632183906,
"grad_norm": 13.1896905658553,
"learning_rate": 1.3008958466299068e-05,
"loss": 1.7273,
"step": 289
},
{
"epoch": 2.6620689655172414,
"grad_norm": 10.572311262669949,
"learning_rate": 1.2965046674421491e-05,
"loss": 1.4719,
"step": 290
},
{
"epoch": 2.671264367816092,
"grad_norm": 10.35820251976187,
"learning_rate": 1.2921085706631959e-05,
"loss": 1.4539,
"step": 291
},
{
"epoch": 2.6804597701149424,
"grad_norm": 8.704710016620966,
"learning_rate": 1.2877076663067539e-05,
"loss": 1.3574,
"step": 292
},
{
"epoch": 2.689655172413793,
"grad_norm": 6.599649782918219,
"learning_rate": 1.2833020645068402e-05,
"loss": 1.3322,
"step": 293
},
{
"epoch": 2.698850574712644,
"grad_norm": 11.362295773438365,
"learning_rate": 1.2788918755150279e-05,
"loss": 1.2928,
"step": 294
},
{
"epoch": 2.708045977011494,
"grad_norm": 37.10678640120499,
"learning_rate": 1.2744772096976853e-05,
"loss": 1.3816,
"step": 295
},
{
"epoch": 2.717241379310345,
"grad_norm": 10.62094103048475,
"learning_rate": 1.2700581775332157e-05,
"loss": 1.3672,
"step": 296
},
{
"epoch": 2.7264367816091957,
"grad_norm": 10.75935817308168,
"learning_rate": 1.2656348896092898e-05,
"loss": 1.4492,
"step": 297
},
{
"epoch": 2.735632183908046,
"grad_norm": 8.077547364505877,
"learning_rate": 1.2612074566200823e-05,
"loss": 1.3044,
"step": 298
},
{
"epoch": 2.7448275862068967,
"grad_norm": 12.203291711398258,
"learning_rate": 1.2567759893634972e-05,
"loss": 1.5552,
"step": 299
},
{
"epoch": 2.754022988505747,
"grad_norm": 8.513713623012512,
"learning_rate": 1.2523405987383987e-05,
"loss": 1.2848,
"step": 300
},
{
"epoch": 2.7632183908045977,
"grad_norm": 8.619755790356049,
"learning_rate": 1.2479013957418343e-05,
"loss": 1.4136,
"step": 301
},
{
"epoch": 2.772413793103448,
"grad_norm": 10.308980428163558,
"learning_rate": 1.2434584914662573e-05,
"loss": 1.2261,
"step": 302
},
{
"epoch": 2.781609195402299,
"grad_norm": 10.11745765990052,
"learning_rate": 1.2390119970967465e-05,
"loss": 1.8462,
"step": 303
},
{
"epoch": 2.7908045977011495,
"grad_norm": 13.585789032324755,
"learning_rate": 1.2345620239082236e-05,
"loss": 1.3516,
"step": 304
},
{
"epoch": 2.8,
"grad_norm": 8.3622289803184,
"learning_rate": 1.23010868326267e-05,
"loss": 1.2363,
"step": 305
},
{
"epoch": 2.8091954022988506,
"grad_norm": 11.137605463009926,
"learning_rate": 1.2256520866063375e-05,
"loss": 1.5193,
"step": 306
},
{
"epoch": 2.8183908045977013,
"grad_norm": 7.9674983156349395,
"learning_rate": 1.221192345466961e-05,
"loss": 1.3356,
"step": 307
},
{
"epoch": 2.8275862068965516,
"grad_norm": 9.86095902628443,
"learning_rate": 1.2167295714509675e-05,
"loss": 1.6582,
"step": 308
},
{
"epoch": 2.8367816091954023,
"grad_norm": 8.9590317304695,
"learning_rate": 1.2122638762406824e-05,
"loss": 1.2642,
"step": 309
},
{
"epoch": 2.845977011494253,
"grad_norm": 11.895962433965508,
"learning_rate": 1.2077953715915347e-05,
"loss": 1.2452,
"step": 310
},
{
"epoch": 2.8551724137931034,
"grad_norm": 8.091787768627592,
"learning_rate": 1.2033241693292607e-05,
"loss": 1.6858,
"step": 311
},
{
"epoch": 2.864367816091954,
"grad_norm": 12.390457672512209,
"learning_rate": 1.1988503813471058e-05,
"loss": 1.2549,
"step": 312
},
{
"epoch": 2.873563218390805,
"grad_norm": 9.762226489244592,
"learning_rate": 1.1943741196030223e-05,
"loss": 1.2067,
"step": 313
},
{
"epoch": 2.882758620689655,
"grad_norm": 10.427493990874517,
"learning_rate": 1.1898954961168712e-05,
"loss": 1.2787,
"step": 314
},
{
"epoch": 2.891954022988506,
"grad_norm": 9.795089175759868,
"learning_rate": 1.1854146229676153e-05,
"loss": 1.5051,
"step": 315
},
{
"epoch": 2.901149425287356,
"grad_norm": 10.796252759918572,
"learning_rate": 1.180931612290517e-05,
"loss": 1.4446,
"step": 316
},
{
"epoch": 2.910344827586207,
"grad_norm": 9.367765215773524,
"learning_rate": 1.1764465762743301e-05,
"loss": 1.5287,
"step": 317
},
{
"epoch": 2.9195402298850572,
"grad_norm": 9.973558960509926,
"learning_rate": 1.1719596271584937e-05,
"loss": 1.3678,
"step": 318
},
{
"epoch": 2.928735632183908,
"grad_norm": 8.48030402628292,
"learning_rate": 1.1674708772303227e-05,
"loss": 1.7673,
"step": 319
},
{
"epoch": 2.9379310344827587,
"grad_norm": 9.941583395501333,
"learning_rate": 1.1629804388221977e-05,
"loss": 1.3052,
"step": 320
},
{
"epoch": 2.947126436781609,
"grad_norm": 20.295022543023112,
"learning_rate": 1.1584884243087542e-05,
"loss": 1.4888,
"step": 321
},
{
"epoch": 2.9563218390804598,
"grad_norm": 14.791841790346075,
"learning_rate": 1.1539949461040704e-05,
"loss": 1.4082,
"step": 322
},
{
"epoch": 2.9655172413793105,
"grad_norm": 11.397818542102845,
"learning_rate": 1.1495001166588538e-05,
"loss": 1.2513,
"step": 323
},
{
"epoch": 2.974712643678161,
"grad_norm": 14.537936931794308,
"learning_rate": 1.1450040484576268e-05,
"loss": 1.3915,
"step": 324
},
{
"epoch": 2.9839080459770115,
"grad_norm": 10.380255650087015,
"learning_rate": 1.140506854015912e-05,
"loss": 1.4326,
"step": 325
},
{
"epoch": 2.9931034482758623,
"grad_norm": 10.815042710269552,
"learning_rate": 1.1360086458774173e-05,
"loss": 1.3435,
"step": 326
},
{
"epoch": 3.0,
"grad_norm": 14.352409000062071,
"learning_rate": 1.1315095366112179e-05,
"loss": 0.8037,
"step": 327
},
{
"epoch": 3.0091954022988507,
"grad_norm": 8.564622119135102,
"learning_rate": 1.1270096388089405e-05,
"loss": 1.2927,
"step": 328
},
{
"epoch": 3.018390804597701,
"grad_norm": 9.827927749771158,
"learning_rate": 1.1225090650819443e-05,
"loss": 1.2504,
"step": 329
},
{
"epoch": 3.027586206896552,
"grad_norm": 13.500491825543243,
"learning_rate": 1.118007928058505e-05,
"loss": 1.2751,
"step": 330
},
{
"epoch": 3.036781609195402,
"grad_norm": 13.400074147719494,
"learning_rate": 1.1135063403809942e-05,
"loss": 1.5854,
"step": 331
},
{
"epoch": 3.045977011494253,
"grad_norm": 9.303488220671063,
"learning_rate": 1.1090044147030612e-05,
"loss": 1.4025,
"step": 332
},
{
"epoch": 3.0551724137931036,
"grad_norm": 12.154817360125381,
"learning_rate": 1.104502263686814e-05,
"loss": 1.4901,
"step": 333
},
{
"epoch": 3.064367816091954,
"grad_norm": 11.037010387746522,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.2772,
"step": 334
},
{
"epoch": 3.0735632183908046,
"grad_norm": 8.990590987937784,
"learning_rate": 1.095497736313186e-05,
"loss": 1.5939,
"step": 335
},
{
"epoch": 3.0827586206896553,
"grad_norm": 9.84526893242006,
"learning_rate": 1.0909955852969392e-05,
"loss": 1.4225,
"step": 336
},
{
"epoch": 3.0919540229885056,
"grad_norm": 12.240144797467202,
"learning_rate": 1.0864936596190059e-05,
"loss": 1.6045,
"step": 337
},
{
"epoch": 3.1011494252873564,
"grad_norm": 8.484859364769594,
"learning_rate": 1.0819920719414953e-05,
"loss": 1.3782,
"step": 338
},
{
"epoch": 3.110344827586207,
"grad_norm": 11.97211375025308,
"learning_rate": 1.0774909349180558e-05,
"loss": 1.3038,
"step": 339
},
{
"epoch": 3.1195402298850574,
"grad_norm": 12.573640767997977,
"learning_rate": 1.07299036119106e-05,
"loss": 1.4471,
"step": 340
},
{
"epoch": 3.128735632183908,
"grad_norm": 9.896343612454453,
"learning_rate": 1.0684904633887822e-05,
"loss": 1.4695,
"step": 341
},
{
"epoch": 3.1379310344827585,
"grad_norm": 12.867671267558745,
"learning_rate": 1.063991354122583e-05,
"loss": 1.327,
"step": 342
},
{
"epoch": 3.147126436781609,
"grad_norm": 31.710152688174862,
"learning_rate": 1.0594931459840882e-05,
"loss": 1.1624,
"step": 343
},
{
"epoch": 3.15632183908046,
"grad_norm": 13.711391540751984,
"learning_rate": 1.0549959515423736e-05,
"loss": 1.4283,
"step": 344
},
{
"epoch": 3.1655172413793102,
"grad_norm": 10.048106577520812,
"learning_rate": 1.0504998833411465e-05,
"loss": 1.3794,
"step": 345
},
{
"epoch": 3.174712643678161,
"grad_norm": 15.095877578355061,
"learning_rate": 1.0460050538959299e-05,
"loss": 1.2234,
"step": 346
},
{
"epoch": 3.1839080459770113,
"grad_norm": 10.008382234710478,
"learning_rate": 1.0415115756912462e-05,
"loss": 1.4849,
"step": 347
},
{
"epoch": 3.193103448275862,
"grad_norm": 10.834339570759653,
"learning_rate": 1.0370195611778027e-05,
"loss": 1.4008,
"step": 348
},
{
"epoch": 3.2022988505747128,
"grad_norm": 11.606165690508291,
"learning_rate": 1.0325291227696776e-05,
"loss": 1.2378,
"step": 349
},
{
"epoch": 3.211494252873563,
"grad_norm": 10.000685238606074,
"learning_rate": 1.0280403728415067e-05,
"loss": 1.5133,
"step": 350
},
{
"epoch": 3.220689655172414,
"grad_norm": 12.169373195656735,
"learning_rate": 1.0235534237256702e-05,
"loss": 1.5,
"step": 351
},
{
"epoch": 3.2298850574712645,
"grad_norm": 9.464191323383275,
"learning_rate": 1.0190683877094832e-05,
"loss": 1.3682,
"step": 352
},
{
"epoch": 3.239080459770115,
"grad_norm": 24.696085137335555,
"learning_rate": 1.0145853770323846e-05,
"loss": 1.2056,
"step": 353
},
{
"epoch": 3.2482758620689656,
"grad_norm": 18.76575800304636,
"learning_rate": 1.0101045038831292e-05,
"loss": 1.142,
"step": 354
},
{
"epoch": 3.2574712643678163,
"grad_norm": 10.042567143762055,
"learning_rate": 1.0056258803969778e-05,
"loss": 1.3638,
"step": 355
},
{
"epoch": 3.2666666666666666,
"grad_norm": 11.318980756154916,
"learning_rate": 1.0011496186528947e-05,
"loss": 1.35,
"step": 356
},
{
"epoch": 3.2758620689655173,
"grad_norm": 40.050906379076665,
"learning_rate": 9.966758306707394e-06,
"loss": 1.2106,
"step": 357
},
{
"epoch": 3.2850574712643676,
"grad_norm": 9.353674650560334,
"learning_rate": 9.922046284084657e-06,
"loss": 1.3442,
"step": 358
},
{
"epoch": 3.2942528735632184,
"grad_norm": 9.677203924503825,
"learning_rate": 9.877361237593177e-06,
"loss": 1.3453,
"step": 359
},
{
"epoch": 3.303448275862069,
"grad_norm": 6.960886228003972,
"learning_rate": 9.832704285490326e-06,
"loss": 1.259,
"step": 360
},
{
"epoch": 3.3126436781609194,
"grad_norm": 9.958595921108332,
"learning_rate": 9.788076545330392e-06,
"loss": 1.5625,
"step": 361
},
{
"epoch": 3.32183908045977,
"grad_norm": 51.85252081874326,
"learning_rate": 9.74347913393663e-06,
"loss": 1.5267,
"step": 362
},
{
"epoch": 3.3310344827586205,
"grad_norm": 11.704494609330304,
"learning_rate": 9.698913167373302e-06,
"loss": 1.2225,
"step": 363
},
{
"epoch": 3.340229885057471,
"grad_norm": 14.283897531385442,
"learning_rate": 9.654379760917765e-06,
"loss": 1.4331,
"step": 364
},
{
"epoch": 3.349425287356322,
"grad_norm": 13.26198377276854,
"learning_rate": 9.609880029032537e-06,
"loss": 1.4148,
"step": 365
},
{
"epoch": 3.3586206896551722,
"grad_norm": 38.207752905565194,
"learning_rate": 9.56541508533743e-06,
"loss": 1.3888,
"step": 366
},
{
"epoch": 3.367816091954023,
"grad_norm": 14.71112455943351,
"learning_rate": 9.520986042581657e-06,
"loss": 1.4406,
"step": 367
},
{
"epoch": 3.3770114942528737,
"grad_norm": 10.84175054233535,
"learning_rate": 9.476594012616016e-06,
"loss": 1.4795,
"step": 368
},
{
"epoch": 3.386206896551724,
"grad_norm": 19.73830068691523,
"learning_rate": 9.43224010636503e-06,
"loss": 1.5071,
"step": 369
},
{
"epoch": 3.3954022988505748,
"grad_norm": 21.400680474505883,
"learning_rate": 9.387925433799183e-06,
"loss": 1.6345,
"step": 370
},
{
"epoch": 3.4045977011494255,
"grad_norm": 8.520388056506897,
"learning_rate": 9.343651103907101e-06,
"loss": 1.1921,
"step": 371
},
{
"epoch": 3.413793103448276,
"grad_norm": 15.614373578838704,
"learning_rate": 9.299418224667846e-06,
"loss": 1.3103,
"step": 372
},
{
"epoch": 3.4229885057471265,
"grad_norm": 10.195337396411798,
"learning_rate": 9.255227903023148e-06,
"loss": 1.0011,
"step": 373
},
{
"epoch": 3.432183908045977,
"grad_norm": 9.980736431264198,
"learning_rate": 9.211081244849724e-06,
"loss": 1.4138,
"step": 374
},
{
"epoch": 3.4413793103448276,
"grad_norm": 6.407392093756993,
"learning_rate": 9.166979354931602e-06,
"loss": 1.3992,
"step": 375
},
{
"epoch": 3.4505747126436783,
"grad_norm": 10.33551202023238,
"learning_rate": 9.122923336932466e-06,
"loss": 1.2931,
"step": 376
},
{
"epoch": 3.4597701149425286,
"grad_norm": 7.860952040626876,
"learning_rate": 9.078914293368042e-06,
"loss": 1.3782,
"step": 377
},
{
"epoch": 3.4689655172413794,
"grad_norm": 10.376178758945807,
"learning_rate": 9.034953325578513e-06,
"loss": 1.5396,
"step": 378
},
{
"epoch": 3.4781609195402297,
"grad_norm": 8.92810875038656,
"learning_rate": 8.991041533700935e-06,
"loss": 1.1866,
"step": 379
},
{
"epoch": 3.4873563218390804,
"grad_norm": 32.96472192490382,
"learning_rate": 8.947180016641736e-06,
"loss": 1.4369,
"step": 380
},
{
"epoch": 3.496551724137931,
"grad_norm": 9.034006970425779,
"learning_rate": 8.903369872049192e-06,
"loss": 1.3536,
"step": 381
},
{
"epoch": 3.5057471264367814,
"grad_norm": 8.45428130551336,
"learning_rate": 8.859612196285977e-06,
"loss": 1.248,
"step": 382
},
{
"epoch": 3.514942528735632,
"grad_norm": 10.682107181389576,
"learning_rate": 8.815908084401704e-06,
"loss": 1.4265,
"step": 383
},
{
"epoch": 3.524137931034483,
"grad_norm": 14.488560507246083,
"learning_rate": 8.772258630105537e-06,
"loss": 1.4996,
"step": 384
},
{
"epoch": 3.533333333333333,
"grad_norm": 11.402417158079917,
"learning_rate": 8.728664925738818e-06,
"loss": 1.4463,
"step": 385
},
{
"epoch": 3.542528735632184,
"grad_norm": 11.881058609468937,
"learning_rate": 8.685128062247739e-06,
"loss": 1.8416,
"step": 386
},
{
"epoch": 3.5517241379310347,
"grad_norm": 10.296799405046839,
"learning_rate": 8.641649129156007e-06,
"loss": 1.3956,
"step": 387
},
{
"epoch": 3.560919540229885,
"grad_norm": 9.65931052787777,
"learning_rate": 8.598229214537627e-06,
"loss": 1.3552,
"step": 388
},
{
"epoch": 3.5701149425287357,
"grad_norm": 13.969316212816242,
"learning_rate": 8.554869404989636e-06,
"loss": 1.3024,
"step": 389
},
{
"epoch": 3.5793103448275865,
"grad_norm": 8.281255790239513,
"learning_rate": 8.511570785604928e-06,
"loss": 1.6863,
"step": 390
},
{
"epoch": 3.5885057471264368,
"grad_norm": 22.529643925769257,
"learning_rate": 8.46833443994508e-06,
"loss": 1.4396,
"step": 391
},
{
"epoch": 3.5977011494252875,
"grad_norm": 10.800560357820313,
"learning_rate": 8.42516145001327e-06,
"loss": 1.318,
"step": 392
},
{
"epoch": 3.606896551724138,
"grad_norm": 10.69995569676154,
"learning_rate": 8.382052896227168e-06,
"loss": 1.1625,
"step": 393
},
{
"epoch": 3.6160919540229886,
"grad_norm": 9.934078096312229,
"learning_rate": 8.339009857391912e-06,
"loss": 1.328,
"step": 394
},
{
"epoch": 3.625287356321839,
"grad_norm": 26.37916950240029,
"learning_rate": 8.296033410673096e-06,
"loss": 1.1736,
"step": 395
},
{
"epoch": 3.6344827586206896,
"grad_norm": 14.08324266869287,
"learning_rate": 8.253124631569847e-06,
"loss": 1.5264,
"step": 396
},
{
"epoch": 3.6436781609195403,
"grad_norm": 9.839565730283748,
"learning_rate": 8.210284593887869e-06,
"loss": 1.4744,
"step": 397
},
{
"epoch": 3.6528735632183906,
"grad_norm": 11.700788192703863,
"learning_rate": 8.167514369712608e-06,
"loss": 1.1398,
"step": 398
},
{
"epoch": 3.6620689655172414,
"grad_norm": 11.788317447015977,
"learning_rate": 8.124815029382382e-06,
"loss": 1.3801,
"step": 399
},
{
"epoch": 3.671264367816092,
"grad_norm": 12.788897221803238,
"learning_rate": 8.082187641461642e-06,
"loss": 1.3303,
"step": 400
},
{
"epoch": 3.6804597701149424,
"grad_norm": 13.166615581355577,
"learning_rate": 8.03963327271419e-06,
"loss": 1.375,
"step": 401
},
{
"epoch": 3.689655172413793,
"grad_norm": 7.295624548089385,
"learning_rate": 7.99715298807652e-06,
"loss": 1.1687,
"step": 402
},
{
"epoch": 3.698850574712644,
"grad_norm": 12.478601761113927,
"learning_rate": 7.954747850631117e-06,
"loss": 1.3044,
"step": 403
},
{
"epoch": 3.708045977011494,
"grad_norm": 12.1752499296347,
"learning_rate": 7.912418921579914e-06,
"loss": 1.3738,
"step": 404
},
{
"epoch": 3.717241379310345,
"grad_norm": 9.637242180760817,
"learning_rate": 7.870167260217687e-06,
"loss": 1.5205,
"step": 405
},
{
"epoch": 3.7264367816091957,
"grad_norm": 11.278800274918918,
"learning_rate": 7.827993923905578e-06,
"loss": 1.2157,
"step": 406
},
{
"epoch": 3.735632183908046,
"grad_norm": 8.172216735429602,
"learning_rate": 7.785899968044599e-06,
"loss": 1.1936,
"step": 407
},
{
"epoch": 3.7448275862068967,
"grad_norm": 8.096635818421476,
"learning_rate": 7.743886446049263e-06,
"loss": 1.5856,
"step": 408
},
{
"epoch": 3.754022988505747,
"grad_norm": 11.682569075404121,
"learning_rate": 7.701954409321187e-06,
"loss": 1.4744,
"step": 409
},
{
"epoch": 3.7632183908045977,
"grad_norm": 10.35206309414598,
"learning_rate": 7.660104907222801e-06,
"loss": 1.2172,
"step": 410
},
{
"epoch": 3.772413793103448,
"grad_norm": 11.014122758958818,
"learning_rate": 7.618338987051068e-06,
"loss": 1.0511,
"step": 411
},
{
"epoch": 3.781609195402299,
"grad_norm": 8.85710415272957,
"learning_rate": 7.576657694011309e-06,
"loss": 1.3102,
"step": 412
},
{
"epoch": 3.7908045977011495,
"grad_norm": 12.157205224266306,
"learning_rate": 7.535062071190995e-06,
"loss": 1.1799,
"step": 413
},
{
"epoch": 3.8,
"grad_norm": 8.418851066550117,
"learning_rate": 7.493553159533702e-06,
"loss": 1.2111,
"step": 414
},
{
"epoch": 3.8091954022988506,
"grad_norm": 7.713059082134044,
"learning_rate": 7.452131997813006e-06,
"loss": 1.2234,
"step": 415
},
{
"epoch": 3.8183908045977013,
"grad_norm": 15.425297595516845,
"learning_rate": 7.410799622606539e-06,
"loss": 1.2979,
"step": 416
},
{
"epoch": 3.8275862068965516,
"grad_norm": 8.601420815684877,
"learning_rate": 7.369557068269997e-06,
"loss": 1.1259,
"step": 417
},
{
"epoch": 3.8367816091954023,
"grad_norm": 12.94115881368024,
"learning_rate": 7.3284053669112975e-06,
"loss": 1.3448,
"step": 418
},
{
"epoch": 3.845977011494253,
"grad_norm": 6.589550268233686,
"learning_rate": 7.287345548364728e-06,
"loss": 1.1129,
"step": 419
},
{
"epoch": 3.8551724137931034,
"grad_norm": 6.501312647159815,
"learning_rate": 7.2463786401651835e-06,
"loss": 1.1362,
"step": 420
},
{
"epoch": 3.864367816091954,
"grad_norm": 18.587292777735644,
"learning_rate": 7.205505667522437e-06,
"loss": 1.2959,
"step": 421
},
{
"epoch": 3.873563218390805,
"grad_norm": 15.21488681302892,
"learning_rate": 7.164727653295512e-06,
"loss": 1.3545,
"step": 422
},
{
"epoch": 3.882758620689655,
"grad_norm": 8.577037418300366,
"learning_rate": 7.124045617967048e-06,
"loss": 1.4131,
"step": 423
},
{
"epoch": 3.891954022988506,
"grad_norm": 10.190524659959603,
"learning_rate": 7.0834605796178e-06,
"loss": 1.4512,
"step": 424
},
{
"epoch": 3.901149425287356,
"grad_norm": 7.727343437140264,
"learning_rate": 7.042973553901133e-06,
"loss": 1.6387,
"step": 425
},
{
"epoch": 3.910344827586207,
"grad_norm": 22.911347407744433,
"learning_rate": 7.002585554017622e-06,
"loss": 1.3267,
"step": 426
},
{
"epoch": 3.9195402298850572,
"grad_norm": 10.152149046552406,
"learning_rate": 6.962297590689678e-06,
"loss": 1.2264,
"step": 427
},
{
"epoch": 3.928735632183908,
"grad_norm": 16.580706197370287,
"learning_rate": 6.922110672136282e-06,
"loss": 1.3127,
"step": 428
},
{
"epoch": 3.9379310344827587,
"grad_norm": 13.844974648013892,
"learning_rate": 6.882025804047718e-06,
"loss": 1.4424,
"step": 429
},
{
"epoch": 3.947126436781609,
"grad_norm": 13.915774750616139,
"learning_rate": 6.842043989560443e-06,
"loss": 1.645,
"step": 430
},
{
"epoch": 3.9563218390804598,
"grad_norm": 15.603474543946852,
"learning_rate": 6.802166229231952e-06,
"loss": 1.4729,
"step": 431
},
{
"epoch": 3.9655172413793105,
"grad_norm": 9.41327469077912,
"learning_rate": 6.76239352101576e-06,
"loss": 1.3605,
"step": 432
},
{
"epoch": 3.974712643678161,
"grad_norm": 14.457634374687824,
"learning_rate": 6.722726860236417e-06,
"loss": 1.5076,
"step": 433
},
{
"epoch": 3.9839080459770115,
"grad_norm": 9.965691507607113,
"learning_rate": 6.683167239564608e-06,
"loss": 1.4915,
"step": 434
},
{
"epoch": 3.9931034482758623,
"grad_norm": 51.33613508537111,
"learning_rate": 6.64371564899229e-06,
"loss": 1.1819,
"step": 435
},
{
"epoch": 4.0,
"grad_norm": 9.648485492120264,
"learning_rate": 6.604373075807953e-06,
"loss": 1.0046,
"step": 436
},
{
"epoch": 4.00919540229885,
"grad_norm": 17.169545137709832,
"learning_rate": 6.5651405045718764e-06,
"loss": 1.3074,
"step": 437
},
{
"epoch": 4.0183908045977015,
"grad_norm": 8.532422194018014,
"learning_rate": 6.526018917091517e-06,
"loss": 1.2025,
"step": 438
},
{
"epoch": 4.027586206896552,
"grad_norm": 7.280675134670931,
"learning_rate": 6.4870092923969155e-06,
"loss": 1.2716,
"step": 439
},
{
"epoch": 4.036781609195402,
"grad_norm": 7.781465123090883,
"learning_rate": 6.4481126067162235e-06,
"loss": 1.4485,
"step": 440
},
{
"epoch": 4.045977011494253,
"grad_norm": 9.325027749699055,
"learning_rate": 6.40932983345125e-06,
"loss": 1.4869,
"step": 441
},
{
"epoch": 4.055172413793104,
"grad_norm": 12.498570864158324,
"learning_rate": 6.3706619431531134e-06,
"loss": 1.3256,
"step": 442
},
{
"epoch": 4.064367816091954,
"grad_norm": 6.668369910017107,
"learning_rate": 6.3321099034979435e-06,
"loss": 1.2178,
"step": 443
},
{
"epoch": 4.073563218390804,
"grad_norm": 8.81432890911392,
"learning_rate": 6.29367467926268e-06,
"loss": 1.3246,
"step": 444
},
{
"epoch": 4.082758620689655,
"grad_norm": 9.5021881868705,
"learning_rate": 6.2553572323009094e-06,
"loss": 1.1871,
"step": 445
},
{
"epoch": 4.091954022988506,
"grad_norm": 22.404258092859155,
"learning_rate": 6.217158521518818e-06,
"loss": 1.031,
"step": 446
},
{
"epoch": 4.101149425287356,
"grad_norm": 8.322221675044315,
"learning_rate": 6.179079502851167e-06,
"loss": 1.3306,
"step": 447
},
{
"epoch": 4.110344827586207,
"grad_norm": 6.926175392564334,
"learning_rate": 6.141121129237393e-06,
"loss": 1.4648,
"step": 448
},
{
"epoch": 4.119540229885057,
"grad_norm": 10.795367849841528,
"learning_rate": 6.103284350597757e-06,
"loss": 1.4771,
"step": 449
},
{
"epoch": 4.128735632183908,
"grad_norm": 14.500806487945011,
"learning_rate": 6.0655701138095605e-06,
"loss": 1.2192,
"step": 450
},
{
"epoch": 4.137931034482759,
"grad_norm": 16.827094370255022,
"learning_rate": 6.027979362683454e-06,
"loss": 1.3679,
"step": 451
},
{
"epoch": 4.147126436781609,
"grad_norm": 8.463630645608777,
"learning_rate": 5.990513037939828e-06,
"loss": 1.3866,
"step": 452
},
{
"epoch": 4.1563218390804595,
"grad_norm": 8.548138098505374,
"learning_rate": 5.953172077185257e-06,
"loss": 1.4866,
"step": 453
},
{
"epoch": 4.165517241379311,
"grad_norm": 10.311093163627458,
"learning_rate": 5.915957414889049e-06,
"loss": 1.1892,
"step": 454
},
{
"epoch": 4.174712643678161,
"grad_norm": 11.532873997330638,
"learning_rate": 5.878869982359845e-06,
"loss": 1.3153,
"step": 455
},
{
"epoch": 4.183908045977011,
"grad_norm": 13.043804766692809,
"learning_rate": 5.841910707722327e-06,
"loss": 1.4138,
"step": 456
},
{
"epoch": 4.1931034482758625,
"grad_norm": 16.829895447641974,
"learning_rate": 5.805080515893983e-06,
"loss": 1.478,
"step": 457
},
{
"epoch": 4.202298850574713,
"grad_norm": 9.03056316789035,
"learning_rate": 5.7683803285619686e-06,
"loss": 1.361,
"step": 458
},
{
"epoch": 4.211494252873563,
"grad_norm": 8.787065551733527,
"learning_rate": 5.731811064160027e-06,
"loss": 1.3326,
"step": 459
},
{
"epoch": 4.220689655172414,
"grad_norm": 13.275359960429332,
"learning_rate": 5.695373637845521e-06,
"loss": 1.5723,
"step": 460
},
{
"epoch": 4.2298850574712645,
"grad_norm": 17.444256387950272,
"learning_rate": 5.659068961476514e-06,
"loss": 1.3682,
"step": 461
},
{
"epoch": 4.239080459770115,
"grad_norm": 15.78925693833788,
"learning_rate": 5.622897943588974e-06,
"loss": 1.5834,
"step": 462
},
{
"epoch": 4.248275862068965,
"grad_norm": 11.080860880855491,
"learning_rate": 5.5868614893740135e-06,
"loss": 1.5276,
"step": 463
},
{
"epoch": 4.257471264367816,
"grad_norm": 10.022698346820714,
"learning_rate": 5.550960500655247e-06,
"loss": 1.3053,
"step": 464
},
{
"epoch": 4.266666666666667,
"grad_norm": 7.298963870340777,
"learning_rate": 5.515195875866231e-06,
"loss": 1.2085,
"step": 465
},
{
"epoch": 4.275862068965517,
"grad_norm": 14.37636573814027,
"learning_rate": 5.479568510027963e-06,
"loss": 1.545,
"step": 466
},
{
"epoch": 4.285057471264368,
"grad_norm": 17.666776715292947,
"learning_rate": 5.444079294726491e-06,
"loss": 1.2861,
"step": 467
},
{
"epoch": 4.294252873563218,
"grad_norm": 8.829651952832249,
"learning_rate": 5.408729118090613e-06,
"loss": 1.2104,
"step": 468
},
{
"epoch": 4.303448275862069,
"grad_norm": 9.0505304099472,
"learning_rate": 5.373518864769627e-06,
"loss": 1.1511,
"step": 469
},
{
"epoch": 4.31264367816092,
"grad_norm": 8.787289804009772,
"learning_rate": 5.338449415911216e-06,
"loss": 1.5977,
"step": 470
},
{
"epoch": 4.32183908045977,
"grad_norm": 8.815130387834458,
"learning_rate": 5.30352164913938e-06,
"loss": 1.1611,
"step": 471
},
{
"epoch": 4.3310344827586205,
"grad_norm": 10.238419809690685,
"learning_rate": 5.268736438532487e-06,
"loss": 1.392,
"step": 472
},
{
"epoch": 4.340229885057472,
"grad_norm": 11.519780193080537,
"learning_rate": 5.234094654601386e-06,
"loss": 1.2612,
"step": 473
},
{
"epoch": 4.349425287356322,
"grad_norm": 7.1767835668590205,
"learning_rate": 5.199597164267637e-06,
"loss": 1.2562,
"step": 474
},
{
"epoch": 4.358620689655172,
"grad_norm": 12.876197100579766,
"learning_rate": 5.1652448308417935e-06,
"loss": 1.4492,
"step": 475
},
{
"epoch": 4.3678160919540225,
"grad_norm": 17.282709384518455,
"learning_rate": 5.131038514001825e-06,
"loss": 1.2496,
"step": 476
},
{
"epoch": 4.377011494252874,
"grad_norm": 14.639340531142409,
"learning_rate": 5.096979069771579e-06,
"loss": 1.4873,
"step": 477
},
{
"epoch": 4.386206896551724,
"grad_norm": 14.24447446889582,
"learning_rate": 5.063067350499382e-06,
"loss": 1.2217,
"step": 478
},
{
"epoch": 4.395402298850574,
"grad_norm": 37.86045060057325,
"learning_rate": 5.029304204836682e-06,
"loss": 1.4817,
"step": 479
},
{
"epoch": 4.4045977011494255,
"grad_norm": 10.562532665897013,
"learning_rate": 4.9956904777168384e-06,
"loss": 1.4619,
"step": 480
},
{
"epoch": 4.413793103448276,
"grad_norm": 10.086438372444816,
"learning_rate": 4.96222701033396e-06,
"loss": 1.5967,
"step": 481
},
{
"epoch": 4.422988505747126,
"grad_norm": 10.497122443019308,
"learning_rate": 4.928914640121858e-06,
"loss": 1.1646,
"step": 482
},
{
"epoch": 4.432183908045977,
"grad_norm": 16.64984057120252,
"learning_rate": 4.895754200733085e-06,
"loss": 1.278,
"step": 483
},
{
"epoch": 4.441379310344828,
"grad_norm": 10.713798877735572,
"learning_rate": 4.8627465220180876e-06,
"loss": 1.5983,
"step": 484
},
{
"epoch": 4.450574712643678,
"grad_norm": 134.93369623218058,
"learning_rate": 4.8298924300044156e-06,
"loss": 1.3882,
"step": 485
},
{
"epoch": 4.459770114942529,
"grad_norm": 12.769535026952651,
"learning_rate": 4.797192746876076e-06,
"loss": 1.3936,
"step": 486
},
{
"epoch": 4.468965517241379,
"grad_norm": 11.974390164411723,
"learning_rate": 4.764648290952932e-06,
"loss": 1.3739,
"step": 487
},
{
"epoch": 4.47816091954023,
"grad_norm": 13.16047182415963,
"learning_rate": 4.732259876670246e-06,
"loss": 1.4498,
"step": 488
},
{
"epoch": 4.487356321839081,
"grad_norm": 14.777382947651647,
"learning_rate": 4.7000283145582895e-06,
"loss": 1.1714,
"step": 489
},
{
"epoch": 4.496551724137931,
"grad_norm": 12.02307948203342,
"learning_rate": 4.6679544112220556e-06,
"loss": 1.5671,
"step": 490
},
{
"epoch": 4.505747126436781,
"grad_norm": 13.098215516758408,
"learning_rate": 4.636038969321073e-06,
"loss": 1.5305,
"step": 491
},
{
"epoch": 4.514942528735633,
"grad_norm": 15.243691962740888,
"learning_rate": 4.604282787549332e-06,
"loss": 1.5576,
"step": 492
},
{
"epoch": 4.524137931034483,
"grad_norm": 11.192240538762729,
"learning_rate": 4.572686660615285e-06,
"loss": 1.1947,
"step": 493
},
{
"epoch": 4.533333333333333,
"grad_norm": 10.14659787199047,
"learning_rate": 4.541251379221955e-06,
"loss": 1.4249,
"step": 494
},
{
"epoch": 4.5425287356321835,
"grad_norm": 9.650402852268794,
"learning_rate": 4.509977730047164e-06,
"loss": 1.3046,
"step": 495
},
{
"epoch": 4.551724137931035,
"grad_norm": 14.130488430747782,
"learning_rate": 4.47886649572383e-06,
"loss": 1.6035,
"step": 496
},
{
"epoch": 4.560919540229885,
"grad_norm": 11.192178018552626,
"learning_rate": 4.447918454820396e-06,
"loss": 1.298,
"step": 497
},
{
"epoch": 4.570114942528735,
"grad_norm": 12.405597150681189,
"learning_rate": 4.417134381821326e-06,
"loss": 1.5134,
"step": 498
},
{
"epoch": 4.5793103448275865,
"grad_norm": 10.531396717925594,
"learning_rate": 4.386515047107751e-06,
"loss": 1.4031,
"step": 499
},
{
"epoch": 4.588505747126437,
"grad_norm": 13.099105387499396,
"learning_rate": 4.356061216938159e-06,
"loss": 1.4768,
"step": 500
},
{
"epoch": 4.597701149425287,
"grad_norm": 12.359508812966814,
"learning_rate": 4.325773653429247e-06,
"loss": 1.2485,
"step": 501
},
{
"epoch": 4.606896551724138,
"grad_norm": 31.377171273518833,
"learning_rate": 4.2956531145368285e-06,
"loss": 1.2531,
"step": 502
},
{
"epoch": 4.6160919540229886,
"grad_norm": 13.014730836178563,
"learning_rate": 4.265700354036876e-06,
"loss": 1.5782,
"step": 503
},
{
"epoch": 4.625287356321839,
"grad_norm": 10.599618786371353,
"learning_rate": 4.235916121506657e-06,
"loss": 1.1847,
"step": 504
},
{
"epoch": 4.63448275862069,
"grad_norm": 34.427638598511074,
"learning_rate": 4.206301162305973e-06,
"loss": 1.4019,
"step": 505
},
{
"epoch": 4.64367816091954,
"grad_norm": 19.310795806347553,
"learning_rate": 4.176856217558502e-06,
"loss": 1.5381,
"step": 506
},
{
"epoch": 4.652873563218391,
"grad_norm": 14.82958137975314,
"learning_rate": 4.147582024133265e-06,
"loss": 1.5117,
"step": 507
},
{
"epoch": 4.662068965517241,
"grad_norm": 11.090032542872493,
"learning_rate": 4.118479314626168e-06,
"loss": 1.4451,
"step": 508
},
{
"epoch": 4.671264367816092,
"grad_norm": 11.955549282502883,
"learning_rate": 4.089548817341689e-06,
"loss": 1.1528,
"step": 509
},
{
"epoch": 4.680459770114942,
"grad_norm": 28.85360373708239,
"learning_rate": 4.0607912562746265e-06,
"loss": 1.5181,
"step": 510
},
{
"epoch": 4.689655172413794,
"grad_norm": 10.621370388777802,
"learning_rate": 4.032207351092009e-06,
"loss": 1.213,
"step": 511
},
{
"epoch": 4.698850574712644,
"grad_norm": 18.051763664566412,
"learning_rate": 4.003797817115066e-06,
"loss": 1.4712,
"step": 512
},
{
"epoch": 4.708045977011494,
"grad_norm": 23.332927849711663,
"learning_rate": 3.975563365301336e-06,
"loss": 1.3973,
"step": 513
},
{
"epoch": 4.7172413793103445,
"grad_norm": 11.162486781374527,
"learning_rate": 3.9475047022268644e-06,
"loss": 1.5162,
"step": 514
},
{
"epoch": 4.726436781609196,
"grad_norm": 10.059503420640997,
"learning_rate": 3.919622530068535e-06,
"loss": 1.3472,
"step": 515
},
{
"epoch": 4.735632183908046,
"grad_norm": 11.8247751074863,
"learning_rate": 3.8919175465864855e-06,
"loss": 1.2245,
"step": 516
},
{
"epoch": 4.744827586206896,
"grad_norm": 42.81481813808986,
"learning_rate": 3.864390445106658e-06,
"loss": 1.1561,
"step": 517
},
{
"epoch": 4.7540229885057474,
"grad_norm": 9.370699926092977,
"learning_rate": 3.837041914503432e-06,
"loss": 1.2819,
"step": 518
},
{
"epoch": 4.763218390804598,
"grad_norm": 45.58103480423792,
"learning_rate": 3.8098726391824015e-06,
"loss": 1.2213,
"step": 519
},
{
"epoch": 4.772413793103448,
"grad_norm": 11.95812186592475,
"learning_rate": 3.7828832990632402e-06,
"loss": 1.2812,
"step": 520
},
{
"epoch": 4.781609195402299,
"grad_norm": 10.169736090325921,
"learning_rate": 3.7560745695626877e-06,
"loss": 1.4757,
"step": 521
},
{
"epoch": 4.7908045977011495,
"grad_norm": 116.08773199573162,
"learning_rate": 3.7294471215776383e-06,
"loss": 1.3319,
"step": 522
},
{
"epoch": 4.8,
"grad_norm": 18.16653418521717,
"learning_rate": 3.7030016214683684e-06,
"loss": 1.2273,
"step": 523
},
{
"epoch": 4.809195402298851,
"grad_norm": 13.735726905274648,
"learning_rate": 3.6767387310418446e-06,
"loss": 1.291,
"step": 524
},
{
"epoch": 4.818390804597701,
"grad_norm": 31.68598274978449,
"learning_rate": 3.6506591075351762e-06,
"loss": 1.4346,
"step": 525
},
{
"epoch": 4.827586206896552,
"grad_norm": 12.830965155921534,
"learning_rate": 3.624763403599151e-06,
"loss": 1.3724,
"step": 526
},
{
"epoch": 4.836781609195402,
"grad_norm": 10.039164607525647,
"learning_rate": 3.5990522672819186e-06,
"loss": 1.3728,
"step": 527
},
{
"epoch": 4.845977011494253,
"grad_norm": 8.219551415671267,
"learning_rate": 3.573526342012763e-06,
"loss": 1.1454,
"step": 528
},
{
"epoch": 4.855172413793103,
"grad_norm": 15.30123567798395,
"learning_rate": 3.5481862665860063e-06,
"loss": 1.4489,
"step": 529
},
{
"epoch": 4.864367816091954,
"grad_norm": 11.507622582090901,
"learning_rate": 3.5230326751450138e-06,
"loss": 1.4098,
"step": 530
},
{
"epoch": 4.873563218390805,
"grad_norm": 13.34855796184993,
"learning_rate": 3.4980661971663375e-06,
"loss": 1.5815,
"step": 531
},
{
"epoch": 4.882758620689655,
"grad_norm": 7.43809443209873,
"learning_rate": 3.473287457443949e-06,
"loss": 1.2174,
"step": 532
},
{
"epoch": 4.8919540229885055,
"grad_norm": 20.28836303983412,
"learning_rate": 3.448697076073618e-06,
"loss": 1.3706,
"step": 533
},
{
"epoch": 4.901149425287357,
"grad_norm": 24.204538560430375,
"learning_rate": 3.4242956684373785e-06,
"loss": 1.2004,
"step": 534
},
{
"epoch": 4.910344827586207,
"grad_norm": 9.678071221104739,
"learning_rate": 3.4000838451881447e-06,
"loss": 1.2744,
"step": 535
},
{
"epoch": 4.919540229885057,
"grad_norm": 9.534325720534405,
"learning_rate": 3.376062212234421e-06,
"loss": 1.1697,
"step": 536
},
{
"epoch": 4.928735632183908,
"grad_norm": 14.97433781895165,
"learning_rate": 3.3522313707251385e-06,
"loss": 1.5248,
"step": 537
},
{
"epoch": 4.937931034482759,
"grad_norm": 9.912862051218715,
"learning_rate": 3.328591917034608e-06,
"loss": 1.3452,
"step": 538
},
{
"epoch": 4.947126436781609,
"grad_norm": 12.538408477826952,
"learning_rate": 3.3051444427476095e-06,
"loss": 1.1771,
"step": 539
},
{
"epoch": 4.956321839080459,
"grad_norm": 9.274005905173206,
"learning_rate": 3.2818895346445656e-06,
"loss": 1.3837,
"step": 540
},
{
"epoch": 4.9655172413793105,
"grad_norm": 14.74869885668137,
"learning_rate": 3.2588277746868825e-06,
"loss": 1.2489,
"step": 541
},
{
"epoch": 4.974712643678161,
"grad_norm": 9.239237891584507,
"learning_rate": 3.235959740002361e-06,
"loss": 1.3102,
"step": 542
},
{
"epoch": 4.983908045977012,
"grad_norm": 10.22129387678354,
"learning_rate": 3.2132860028707758e-06,
"loss": 1.213,
"step": 543
},
{
"epoch": 4.993103448275862,
"grad_norm": 11.074155532612387,
"learning_rate": 3.1908071307095377e-06,
"loss": 1.1949,
"step": 544
},
{
"epoch": 5.0,
"grad_norm": 11.074155532612387,
"learning_rate": 3.1685236860595066e-06,
"loss": 0.9934,
"step": 545
}
],
"logging_steps": 1.0,
"max_steps": 648,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 304722427510784.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}