{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9977228936766505,
"eval_steps": 500,
"global_step": 890,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005605184795936241,
"grad_norm": 5.90385013851782,
"learning_rate": 8.98876404494382e-07,
"loss": 0.807,
"step": 1
},
{
"epoch": 0.011210369591872483,
"grad_norm": 6.006366750773737,
"learning_rate": 1.797752808988764e-06,
"loss": 0.8236,
"step": 2
},
{
"epoch": 0.016815554387808723,
"grad_norm": 5.925343792163714,
"learning_rate": 2.696629213483146e-06,
"loss": 0.8109,
"step": 3
},
{
"epoch": 0.022420739183744966,
"grad_norm": 5.527952797665692,
"learning_rate": 3.595505617977528e-06,
"loss": 0.7994,
"step": 4
},
{
"epoch": 0.028025923979681205,
"grad_norm": 4.373958309383871,
"learning_rate": 4.494382022471911e-06,
"loss": 0.7563,
"step": 5
},
{
"epoch": 0.033631108775617445,
"grad_norm": 2.4658304323036786,
"learning_rate": 5.393258426966292e-06,
"loss": 0.713,
"step": 6
},
{
"epoch": 0.039236293571553685,
"grad_norm": 2.047561880721445,
"learning_rate": 6.292134831460674e-06,
"loss": 0.7011,
"step": 7
},
{
"epoch": 0.04484147836748993,
"grad_norm": 4.026532746780989,
"learning_rate": 7.191011235955056e-06,
"loss": 0.7074,
"step": 8
},
{
"epoch": 0.05044666316342617,
"grad_norm": 4.325745201316262,
"learning_rate": 8.08988764044944e-06,
"loss": 0.6906,
"step": 9
},
{
"epoch": 0.05605184795936241,
"grad_norm": 3.8759783016193707,
"learning_rate": 8.988764044943822e-06,
"loss": 0.6496,
"step": 10
},
{
"epoch": 0.06165703275529865,
"grad_norm": 4.428668216367442,
"learning_rate": 9.887640449438202e-06,
"loss": 0.6512,
"step": 11
},
{
"epoch": 0.06726221755123489,
"grad_norm": 3.492900535526824,
"learning_rate": 1.0786516853932584e-05,
"loss": 0.6363,
"step": 12
},
{
"epoch": 0.07286740234717114,
"grad_norm": 2.1273856381054657,
"learning_rate": 1.1685393258426966e-05,
"loss": 0.6132,
"step": 13
},
{
"epoch": 0.07847258714310737,
"grad_norm": 2.1122501992949747,
"learning_rate": 1.2584269662921348e-05,
"loss": 0.5844,
"step": 14
},
{
"epoch": 0.08407777193904362,
"grad_norm": 2.5622529515656667,
"learning_rate": 1.348314606741573e-05,
"loss": 0.5948,
"step": 15
},
{
"epoch": 0.08968295673497986,
"grad_norm": 1.8958213967675108,
"learning_rate": 1.4382022471910113e-05,
"loss": 0.5727,
"step": 16
},
{
"epoch": 0.0952881415309161,
"grad_norm": 0.9617068848658947,
"learning_rate": 1.5280898876404495e-05,
"loss": 0.5603,
"step": 17
},
{
"epoch": 0.10089332632685234,
"grad_norm": 1.2698908465747545,
"learning_rate": 1.617977528089888e-05,
"loss": 0.5606,
"step": 18
},
{
"epoch": 0.10649851112278858,
"grad_norm": 1.0006685793638639,
"learning_rate": 1.707865168539326e-05,
"loss": 0.5489,
"step": 19
},
{
"epoch": 0.11210369591872482,
"grad_norm": 0.7031106693279928,
"learning_rate": 1.7977528089887643e-05,
"loss": 0.5355,
"step": 20
},
{
"epoch": 0.11770888071466107,
"grad_norm": 0.9009903971329004,
"learning_rate": 1.8876404494382024e-05,
"loss": 0.5292,
"step": 21
},
{
"epoch": 0.1233140655105973,
"grad_norm": 0.6766379367642102,
"learning_rate": 1.9775280898876404e-05,
"loss": 0.5199,
"step": 22
},
{
"epoch": 0.12891925030653353,
"grad_norm": 0.6738985046630845,
"learning_rate": 2.067415730337079e-05,
"loss": 0.5161,
"step": 23
},
{
"epoch": 0.13452443510246978,
"grad_norm": 0.6180587250422095,
"learning_rate": 2.1573033707865168e-05,
"loss": 0.5061,
"step": 24
},
{
"epoch": 0.14012961989840603,
"grad_norm": 0.4834626230192744,
"learning_rate": 2.2471910112359556e-05,
"loss": 0.5139,
"step": 25
},
{
"epoch": 0.14573480469434227,
"grad_norm": 0.5878637275212983,
"learning_rate": 2.3370786516853933e-05,
"loss": 0.5027,
"step": 26
},
{
"epoch": 0.15133998949027852,
"grad_norm": 0.5004842186398177,
"learning_rate": 2.426966292134832e-05,
"loss": 0.5011,
"step": 27
},
{
"epoch": 0.15694517428621474,
"grad_norm": 0.5254669056909826,
"learning_rate": 2.5168539325842697e-05,
"loss": 0.4909,
"step": 28
},
{
"epoch": 0.16255035908215099,
"grad_norm": 0.4765392239552605,
"learning_rate": 2.606741573033708e-05,
"loss": 0.4984,
"step": 29
},
{
"epoch": 0.16815554387808723,
"grad_norm": 0.46499584686266865,
"learning_rate": 2.696629213483146e-05,
"loss": 0.4831,
"step": 30
},
{
"epoch": 0.17376072867402348,
"grad_norm": 0.4681038310213469,
"learning_rate": 2.7865168539325845e-05,
"loss": 0.4868,
"step": 31
},
{
"epoch": 0.17936591346995973,
"grad_norm": 0.5306865190068607,
"learning_rate": 2.8764044943820226e-05,
"loss": 0.4799,
"step": 32
},
{
"epoch": 0.18497109826589594,
"grad_norm": 0.5310145854714095,
"learning_rate": 2.966292134831461e-05,
"loss": 0.4773,
"step": 33
},
{
"epoch": 0.1905762830618322,
"grad_norm": 0.761548939341114,
"learning_rate": 3.056179775280899e-05,
"loss": 0.4812,
"step": 34
},
{
"epoch": 0.19618146785776844,
"grad_norm": 1.200005749726904,
"learning_rate": 3.1460674157303374e-05,
"loss": 0.4911,
"step": 35
},
{
"epoch": 0.20178665265370468,
"grad_norm": 0.8174168347705099,
"learning_rate": 3.235955056179776e-05,
"loss": 0.467,
"step": 36
},
{
"epoch": 0.20739183744964093,
"grad_norm": 0.4081677644510946,
"learning_rate": 3.325842696629214e-05,
"loss": 0.4635,
"step": 37
},
{
"epoch": 0.21299702224557715,
"grad_norm": 0.6329076432258968,
"learning_rate": 3.415730337078652e-05,
"loss": 0.4709,
"step": 38
},
{
"epoch": 0.2186022070415134,
"grad_norm": 1.0112197078534875,
"learning_rate": 3.50561797752809e-05,
"loss": 0.4696,
"step": 39
},
{
"epoch": 0.22420739183744964,
"grad_norm": 0.9221867209391528,
"learning_rate": 3.5955056179775286e-05,
"loss": 0.4611,
"step": 40
},
{
"epoch": 0.2298125766333859,
"grad_norm": 0.84837092144476,
"learning_rate": 3.685393258426967e-05,
"loss": 0.474,
"step": 41
},
{
"epoch": 0.23541776142932214,
"grad_norm": 1.1853308215342242,
"learning_rate": 3.775280898876405e-05,
"loss": 0.4678,
"step": 42
},
{
"epoch": 0.24102294622525836,
"grad_norm": 0.5210063275074145,
"learning_rate": 3.865168539325843e-05,
"loss": 0.4562,
"step": 43
},
{
"epoch": 0.2466281310211946,
"grad_norm": 0.908397332328363,
"learning_rate": 3.955056179775281e-05,
"loss": 0.4674,
"step": 44
},
{
"epoch": 0.2522333158171308,
"grad_norm": 1.1080166716443598,
"learning_rate": 4.04494382022472e-05,
"loss": 0.4508,
"step": 45
},
{
"epoch": 0.25783850061306707,
"grad_norm": 0.45360566647654976,
"learning_rate": 4.134831460674158e-05,
"loss": 0.4509,
"step": 46
},
{
"epoch": 0.2634436854090033,
"grad_norm": 0.9861729575846025,
"learning_rate": 4.224719101123595e-05,
"loss": 0.4551,
"step": 47
},
{
"epoch": 0.26904887020493956,
"grad_norm": 0.46277631804738467,
"learning_rate": 4.3146067415730337e-05,
"loss": 0.446,
"step": 48
},
{
"epoch": 0.2746540550008758,
"grad_norm": 0.7678134928965062,
"learning_rate": 4.404494382022472e-05,
"loss": 0.4518,
"step": 49
},
{
"epoch": 0.28025923979681205,
"grad_norm": 0.6470876222514552,
"learning_rate": 4.494382022471911e-05,
"loss": 0.4545,
"step": 50
},
{
"epoch": 0.2858644245927483,
"grad_norm": 0.6677697699594126,
"learning_rate": 4.584269662921348e-05,
"loss": 0.4542,
"step": 51
},
{
"epoch": 0.29146960938868455,
"grad_norm": 0.9005318621831405,
"learning_rate": 4.6741573033707865e-05,
"loss": 0.4472,
"step": 52
},
{
"epoch": 0.2970747941846208,
"grad_norm": 1.0135754268066979,
"learning_rate": 4.764044943820225e-05,
"loss": 0.4449,
"step": 53
},
{
"epoch": 0.30267997898055704,
"grad_norm": 1.3120984053765912,
"learning_rate": 4.853932584269664e-05,
"loss": 0.4514,
"step": 54
},
{
"epoch": 0.30828516377649323,
"grad_norm": 0.9125344012381023,
"learning_rate": 4.943820224719101e-05,
"loss": 0.448,
"step": 55
},
{
"epoch": 0.3138903485724295,
"grad_norm": 1.5065061920138354,
"learning_rate": 5.0337078651685394e-05,
"loss": 0.4513,
"step": 56
},
{
"epoch": 0.3194955333683657,
"grad_norm": 0.7122548779299299,
"learning_rate": 5.123595505617978e-05,
"loss": 0.4447,
"step": 57
},
{
"epoch": 0.32510071816430197,
"grad_norm": 1.3540801647141738,
"learning_rate": 5.213483146067416e-05,
"loss": 0.4556,
"step": 58
},
{
"epoch": 0.3307059029602382,
"grad_norm": 0.8323407754065357,
"learning_rate": 5.303370786516854e-05,
"loss": 0.4345,
"step": 59
},
{
"epoch": 0.33631108775617446,
"grad_norm": 1.116821907282998,
"learning_rate": 5.393258426966292e-05,
"loss": 0.4491,
"step": 60
},
{
"epoch": 0.3419162725521107,
"grad_norm": 1.0824014046607489,
"learning_rate": 5.4831460674157306e-05,
"loss": 0.4472,
"step": 61
},
{
"epoch": 0.34752145734804696,
"grad_norm": 0.9265407958585422,
"learning_rate": 5.573033707865169e-05,
"loss": 0.4362,
"step": 62
},
{
"epoch": 0.3531266421439832,
"grad_norm": 0.8981510798852507,
"learning_rate": 5.662921348314607e-05,
"loss": 0.453,
"step": 63
},
{
"epoch": 0.35873182693991945,
"grad_norm": 0.9858658036066037,
"learning_rate": 5.752808988764045e-05,
"loss": 0.4428,
"step": 64
},
{
"epoch": 0.36433701173585564,
"grad_norm": 1.2513865798978006,
"learning_rate": 5.8426966292134835e-05,
"loss": 0.4428,
"step": 65
},
{
"epoch": 0.3699421965317919,
"grad_norm": 0.9397639573210418,
"learning_rate": 5.932584269662922e-05,
"loss": 0.4324,
"step": 66
},
{
"epoch": 0.37554738132772814,
"grad_norm": 1.4017380301394493,
"learning_rate": 6.0224719101123596e-05,
"loss": 0.45,
"step": 67
},
{
"epoch": 0.3811525661236644,
"grad_norm": 1.0777658678202968,
"learning_rate": 6.112359550561798e-05,
"loss": 0.4401,
"step": 68
},
{
"epoch": 0.38675775091960063,
"grad_norm": 0.912558239818085,
"learning_rate": 6.202247191011237e-05,
"loss": 0.4351,
"step": 69
},
{
"epoch": 0.3923629357155369,
"grad_norm": 1.2273797098955324,
"learning_rate": 6.292134831460675e-05,
"loss": 0.4516,
"step": 70
},
{
"epoch": 0.3979681205114731,
"grad_norm": 1.1806852980385731,
"learning_rate": 6.382022471910112e-05,
"loss": 0.4442,
"step": 71
},
{
"epoch": 0.40357330530740937,
"grad_norm": 1.1616577982500746,
"learning_rate": 6.471910112359552e-05,
"loss": 0.4342,
"step": 72
},
{
"epoch": 0.4091784901033456,
"grad_norm": 0.8390636237723854,
"learning_rate": 6.561797752808989e-05,
"loss": 0.427,
"step": 73
},
{
"epoch": 0.41478367489928186,
"grad_norm": 0.8499496288682458,
"learning_rate": 6.651685393258428e-05,
"loss": 0.4379,
"step": 74
},
{
"epoch": 0.42038885969521805,
"grad_norm": 1.17937338360059,
"learning_rate": 6.741573033707866e-05,
"loss": 0.4291,
"step": 75
},
{
"epoch": 0.4259940444911543,
"grad_norm": 0.7821897701029938,
"learning_rate": 6.831460674157304e-05,
"loss": 0.4351,
"step": 76
},
{
"epoch": 0.43159922928709055,
"grad_norm": 0.8440055305481178,
"learning_rate": 6.921348314606743e-05,
"loss": 0.4304,
"step": 77
},
{
"epoch": 0.4372044140830268,
"grad_norm": 0.7526712919502756,
"learning_rate": 7.01123595505618e-05,
"loss": 0.4366,
"step": 78
},
{
"epoch": 0.44280959887896304,
"grad_norm": 1.1114429881926073,
"learning_rate": 7.101123595505618e-05,
"loss": 0.4326,
"step": 79
},
{
"epoch": 0.4484147836748993,
"grad_norm": 0.7746430179490161,
"learning_rate": 7.191011235955057e-05,
"loss": 0.4336,
"step": 80
},
{
"epoch": 0.45401996847083553,
"grad_norm": 0.8505120829834041,
"learning_rate": 7.280898876404495e-05,
"loss": 0.4352,
"step": 81
},
{
"epoch": 0.4596251532667718,
"grad_norm": 1.4415361142111385,
"learning_rate": 7.370786516853934e-05,
"loss": 0.4385,
"step": 82
},
{
"epoch": 0.465230338062708,
"grad_norm": 0.7475789995240804,
"learning_rate": 7.46067415730337e-05,
"loss": 0.431,
"step": 83
},
{
"epoch": 0.4708355228586443,
"grad_norm": 0.9854738368310488,
"learning_rate": 7.55056179775281e-05,
"loss": 0.4327,
"step": 84
},
{
"epoch": 0.47644070765458046,
"grad_norm": 1.4375947776402878,
"learning_rate": 7.640449438202248e-05,
"loss": 0.4355,
"step": 85
},
{
"epoch": 0.4820458924505167,
"grad_norm": 0.7290469112827799,
"learning_rate": 7.730337078651686e-05,
"loss": 0.4335,
"step": 86
},
{
"epoch": 0.48765107724645296,
"grad_norm": 1.1019633173104773,
"learning_rate": 7.820224719101124e-05,
"loss": 0.4303,
"step": 87
},
{
"epoch": 0.4932562620423892,
"grad_norm": 0.9810209491585931,
"learning_rate": 7.910112359550562e-05,
"loss": 0.4296,
"step": 88
},
{
"epoch": 0.49886144683832545,
"grad_norm": 0.8966074291671375,
"learning_rate": 8e-05,
"loss": 0.4338,
"step": 89
},
{
"epoch": 0.5044666316342616,
"grad_norm": 1.0809659461252454,
"learning_rate": 7.999969234487637e-05,
"loss": 0.4323,
"step": 90
},
{
"epoch": 0.5100718164301979,
"grad_norm": 1.44533907238761,
"learning_rate": 7.999876938423802e-05,
"loss": 0.4436,
"step": 91
},
{
"epoch": 0.5156770012261341,
"grad_norm": 0.7701781299751447,
"learning_rate": 7.999723113228264e-05,
"loss": 0.4406,
"step": 92
},
{
"epoch": 0.5212821860220704,
"grad_norm": 0.9804243571239605,
"learning_rate": 7.999507761267278e-05,
"loss": 0.4245,
"step": 93
},
{
"epoch": 0.5268873708180066,
"grad_norm": 1.218580909562173,
"learning_rate": 7.999230885853554e-05,
"loss": 0.444,
"step": 94
},
{
"epoch": 0.5324925556139429,
"grad_norm": 0.6868435592682877,
"learning_rate": 7.998892491246195e-05,
"loss": 0.4316,
"step": 95
},
{
"epoch": 0.5380977404098791,
"grad_norm": 0.7857929091209908,
"learning_rate": 7.998492582650644e-05,
"loss": 0.4292,
"step": 96
},
{
"epoch": 0.5437029252058154,
"grad_norm": 0.960028166874925,
"learning_rate": 7.998031166218598e-05,
"loss": 0.434,
"step": 97
},
{
"epoch": 0.5493081100017516,
"grad_norm": 0.7041102432235921,
"learning_rate": 7.997508249047913e-05,
"loss": 0.4215,
"step": 98
},
{
"epoch": 0.5549132947976878,
"grad_norm": 0.7044014441304309,
"learning_rate": 7.996923839182498e-05,
"loss": 0.425,
"step": 99
},
{
"epoch": 0.5605184795936241,
"grad_norm": 0.6385459114746951,
"learning_rate": 7.996277945612184e-05,
"loss": 0.42,
"step": 100
},
{
"epoch": 0.5661236643895603,
"grad_norm": 0.7731522061106563,
"learning_rate": 7.995570578272598e-05,
"loss": 0.4253,
"step": 101
},
{
"epoch": 0.5717288491854966,
"grad_norm": 0.7332716664705065,
"learning_rate": 7.994801748044995e-05,
"loss": 0.4313,
"step": 102
},
{
"epoch": 0.5773340339814328,
"grad_norm": 0.6776804568843835,
"learning_rate": 7.993971466756107e-05,
"loss": 0.4188,
"step": 103
},
{
"epoch": 0.5829392187773691,
"grad_norm": 0.5922393199265042,
"learning_rate": 7.993079747177948e-05,
"loss": 0.4184,
"step": 104
},
{
"epoch": 0.5885444035733053,
"grad_norm": 0.398394656224325,
"learning_rate": 7.99212660302762e-05,
"loss": 0.4155,
"step": 105
},
{
"epoch": 0.5941495883692416,
"grad_norm": 0.5350176373169183,
"learning_rate": 7.991112048967111e-05,
"loss": 0.4157,
"step": 106
},
{
"epoch": 0.5997547731651778,
"grad_norm": 0.38641904379334474,
"learning_rate": 7.990036100603055e-05,
"loss": 0.4119,
"step": 107
},
{
"epoch": 0.6053599579611141,
"grad_norm": 0.5283945082650043,
"learning_rate": 7.988898774486507e-05,
"loss": 0.4129,
"step": 108
},
{
"epoch": 0.6109651427570503,
"grad_norm": 0.5488840852639991,
"learning_rate": 7.987700088112675e-05,
"loss": 0.4224,
"step": 109
},
{
"epoch": 0.6165703275529865,
"grad_norm": 0.5785726461047852,
"learning_rate": 7.986440059920659e-05,
"loss": 0.4077,
"step": 110
},
{
"epoch": 0.6221755123489228,
"grad_norm": 0.5777052205377616,
"learning_rate": 7.985118709293167e-05,
"loss": 0.4166,
"step": 111
},
{
"epoch": 0.627780697144859,
"grad_norm": 0.596491662412661,
"learning_rate": 7.983736056556212e-05,
"loss": 0.4132,
"step": 112
},
{
"epoch": 0.6333858819407953,
"grad_norm": 0.6957057882031554,
"learning_rate": 7.982292122978806e-05,
"loss": 0.4178,
"step": 113
},
{
"epoch": 0.6389910667367315,
"grad_norm": 0.6847230422083609,
"learning_rate": 7.980786930772624e-05,
"loss": 0.4118,
"step": 114
},
{
"epoch": 0.6445962515326678,
"grad_norm": 0.6837460304512333,
"learning_rate": 7.979220503091673e-05,
"loss": 0.4147,
"step": 115
},
{
"epoch": 0.6502014363286039,
"grad_norm": 0.6798597739557275,
"learning_rate": 7.977592864031929e-05,
"loss": 0.4171,
"step": 116
},
{
"epoch": 0.6558066211245402,
"grad_norm": 0.548015114928552,
"learning_rate": 7.975904038630963e-05,
"loss": 0.4117,
"step": 117
},
{
"epoch": 0.6614118059204764,
"grad_norm": 0.5642042657981582,
"learning_rate": 7.974154052867569e-05,
"loss": 0.4126,
"step": 118
},
{
"epoch": 0.6670169907164126,
"grad_norm": 0.7655295571497013,
"learning_rate": 7.97234293366135e-05,
"loss": 0.4154,
"step": 119
},
{
"epoch": 0.6726221755123489,
"grad_norm": 0.874608237347998,
"learning_rate": 7.970470708872308e-05,
"loss": 0.4236,
"step": 120
},
{
"epoch": 0.6782273603082851,
"grad_norm": 0.8310771349721764,
"learning_rate": 7.968537407300423e-05,
"loss": 0.421,
"step": 121
},
{
"epoch": 0.6838325451042214,
"grad_norm": 0.6553657946996332,
"learning_rate": 7.966543058685203e-05,
"loss": 0.4035,
"step": 122
},
{
"epoch": 0.6894377299001576,
"grad_norm": 0.5023716536308365,
"learning_rate": 7.964487693705224e-05,
"loss": 0.416,
"step": 123
},
{
"epoch": 0.6950429146960939,
"grad_norm": 0.4876124028877786,
"learning_rate": 7.962371343977664e-05,
"loss": 0.4116,
"step": 124
},
{
"epoch": 0.7006480994920301,
"grad_norm": 0.4811788654442628,
"learning_rate": 7.960194042057817e-05,
"loss": 0.4181,
"step": 125
},
{
"epoch": 0.7062532842879664,
"grad_norm": 0.7564883843961022,
"learning_rate": 7.957955821438588e-05,
"loss": 0.4061,
"step": 126
},
{
"epoch": 0.7118584690839026,
"grad_norm": 0.6430241009529823,
"learning_rate": 7.955656716549977e-05,
"loss": 0.4099,
"step": 127
},
{
"epoch": 0.7174636538798389,
"grad_norm": 0.5578593223115135,
"learning_rate": 7.953296762758556e-05,
"loss": 0.422,
"step": 128
},
{
"epoch": 0.7230688386757751,
"grad_norm": 0.6294882800949052,
"learning_rate": 7.950875996366916e-05,
"loss": 0.4195,
"step": 129
},
{
"epoch": 0.7286740234717113,
"grad_norm": 0.5769470393949646,
"learning_rate": 7.948394454613117e-05,
"loss": 0.4057,
"step": 130
},
{
"epoch": 0.7342792082676476,
"grad_norm": 0.6608310787451351,
"learning_rate": 7.945852175670113e-05,
"loss": 0.4117,
"step": 131
},
{
"epoch": 0.7398843930635838,
"grad_norm": 0.6476166241559258,
"learning_rate": 7.943249198645159e-05,
"loss": 0.4115,
"step": 132
},
{
"epoch": 0.7454895778595201,
"grad_norm": 0.41750277808352504,
"learning_rate": 7.940585563579216e-05,
"loss": 0.4187,
"step": 133
},
{
"epoch": 0.7510947626554563,
"grad_norm": 0.5395404061131682,
"learning_rate": 7.937861311446334e-05,
"loss": 0.4097,
"step": 134
},
{
"epoch": 0.7566999474513926,
"grad_norm": 0.5858318930064079,
"learning_rate": 7.935076484153019e-05,
"loss": 0.4003,
"step": 135
},
{
"epoch": 0.7623051322473288,
"grad_norm": 0.5173078099977891,
"learning_rate": 7.932231124537589e-05,
"loss": 0.4056,
"step": 136
},
{
"epoch": 0.7679103170432651,
"grad_norm": 0.4498691054115375,
"learning_rate": 7.929325276369519e-05,
"loss": 0.4066,
"step": 137
},
{
"epoch": 0.7735155018392013,
"grad_norm": 0.42467263261017896,
"learning_rate": 7.92635898434876e-05,
"loss": 0.4064,
"step": 138
},
{
"epoch": 0.7791206866351374,
"grad_norm": 0.37958937321762776,
"learning_rate": 7.923332294105063e-05,
"loss": 0.4034,
"step": 139
},
{
"epoch": 0.7847258714310738,
"grad_norm": 0.3468484137692954,
"learning_rate": 7.920245252197263e-05,
"loss": 0.4039,
"step": 140
},
{
"epoch": 0.7903310562270099,
"grad_norm": 0.4169469149627637,
"learning_rate": 7.917097906112574e-05,
"loss": 0.4087,
"step": 141
},
{
"epoch": 0.7959362410229462,
"grad_norm": 0.41834364109362276,
"learning_rate": 7.913890304265853e-05,
"loss": 0.405,
"step": 142
},
{
"epoch": 0.8015414258188824,
"grad_norm": 0.3513985390988816,
"learning_rate": 7.910622495998858e-05,
"loss": 0.4006,
"step": 143
},
{
"epoch": 0.8071466106148187,
"grad_norm": 0.30562923751430504,
"learning_rate": 7.907294531579487e-05,
"loss": 0.399,
"step": 144
},
{
"epoch": 0.8127517954107549,
"grad_norm": 0.25016597293101506,
"learning_rate": 7.903906462201004e-05,
"loss": 0.404,
"step": 145
},
{
"epoch": 0.8183569802066912,
"grad_norm": 0.330583944503076,
"learning_rate": 7.900458339981254e-05,
"loss": 0.4001,
"step": 146
},
{
"epoch": 0.8239621650026274,
"grad_norm": 0.4232095940752348,
"learning_rate": 7.896950217961862e-05,
"loss": 0.4058,
"step": 147
},
{
"epoch": 0.8295673497985637,
"grad_norm": 0.4726435967002529,
"learning_rate": 7.893382150107413e-05,
"loss": 0.3979,
"step": 148
},
{
"epoch": 0.8351725345944999,
"grad_norm": 0.5224683089251781,
"learning_rate": 7.889754191304624e-05,
"loss": 0.4016,
"step": 149
},
{
"epoch": 0.8407777193904361,
"grad_norm": 0.6703932689764993,
"learning_rate": 7.886066397361502e-05,
"loss": 0.4019,
"step": 150
},
{
"epoch": 0.8463829041863724,
"grad_norm": 0.7976450116603597,
"learning_rate": 7.882318825006482e-05,
"loss": 0.4042,
"step": 151
},
{
"epoch": 0.8519880889823086,
"grad_norm": 0.8440592541875934,
"learning_rate": 7.878511531887553e-05,
"loss": 0.405,
"step": 152
},
{
"epoch": 0.8575932737782449,
"grad_norm": 0.7574577755866619,
"learning_rate": 7.874644576571382e-05,
"loss": 0.4141,
"step": 153
},
{
"epoch": 0.8631984585741811,
"grad_norm": 0.6868823675156073,
"learning_rate": 7.870718018542394e-05,
"loss": 0.4085,
"step": 154
},
{
"epoch": 0.8688036433701174,
"grad_norm": 0.7949981417283204,
"learning_rate": 7.866731918201877e-05,
"loss": 0.4123,
"step": 155
},
{
"epoch": 0.8744088281660536,
"grad_norm": 0.7652258936220558,
"learning_rate": 7.862686336867042e-05,
"loss": 0.4074,
"step": 156
},
{
"epoch": 0.8800140129619899,
"grad_norm": 0.5492331474373146,
"learning_rate": 7.858581336770078e-05,
"loss": 0.412,
"step": 157
},
{
"epoch": 0.8856191977579261,
"grad_norm": 0.4985373864867304,
"learning_rate": 7.854416981057202e-05,
"loss": 0.4001,
"step": 158
},
{
"epoch": 0.8912243825538623,
"grad_norm": 0.6843980488570995,
"learning_rate": 7.850193333787679e-05,
"loss": 0.3962,
"step": 159
},
{
"epoch": 0.8968295673497986,
"grad_norm": 0.3471378020904331,
"learning_rate": 7.845910459932851e-05,
"loss": 0.3988,
"step": 160
},
{
"epoch": 0.9024347521457348,
"grad_norm": 0.5626872736911909,
"learning_rate": 7.841568425375118e-05,
"loss": 0.3996,
"step": 161
},
{
"epoch": 0.9080399369416711,
"grad_norm": 0.7412359084209332,
"learning_rate": 7.83716729690694e-05,
"loss": 0.3996,
"step": 162
},
{
"epoch": 0.9136451217376073,
"grad_norm": 0.3955248842527265,
"learning_rate": 7.832707142229803e-05,
"loss": 0.4003,
"step": 163
},
{
"epoch": 0.9192503065335436,
"grad_norm": 0.47855528089224836,
"learning_rate": 7.828188029953179e-05,
"loss": 0.4002,
"step": 164
},
{
"epoch": 0.9248554913294798,
"grad_norm": 0.5159440067301492,
"learning_rate": 7.823610029593471e-05,
"loss": 0.3962,
"step": 165
},
{
"epoch": 0.930460676125416,
"grad_norm": 0.3576812605070119,
"learning_rate": 7.818973211572943e-05,
"loss": 0.393,
"step": 166
},
{
"epoch": 0.9360658609213522,
"grad_norm": 0.35348537654761086,
"learning_rate": 7.814277647218634e-05,
"loss": 0.4037,
"step": 167
},
{
"epoch": 0.9416710457172885,
"grad_norm": 0.39413342377875193,
"learning_rate": 7.809523408761266e-05,
"loss": 0.3942,
"step": 168
},
{
"epoch": 0.9472762305132247,
"grad_norm": 0.3951438373093825,
"learning_rate": 7.80471056933413e-05,
"loss": 0.4012,
"step": 169
},
{
"epoch": 0.9528814153091609,
"grad_norm": 0.45433061259053,
"learning_rate": 7.799839202971963e-05,
"loss": 0.3982,
"step": 170
},
{
"epoch": 0.9584866001050972,
"grad_norm": 0.519370105848961,
"learning_rate": 7.794909384609807e-05,
"loss": 0.3994,
"step": 171
},
{
"epoch": 0.9640917849010334,
"grad_norm": 0.5495567414849357,
"learning_rate": 7.789921190081851e-05,
"loss": 0.3979,
"step": 172
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.6394859589748585,
"learning_rate": 7.784874696120279e-05,
"loss": 0.3959,
"step": 173
},
{
"epoch": 0.9753021544929059,
"grad_norm": 0.7704242402793997,
"learning_rate": 7.779769980354077e-05,
"loss": 0.4027,
"step": 174
},
{
"epoch": 0.9809073392888422,
"grad_norm": 0.9985269827974932,
"learning_rate": 7.774607121307841e-05,
"loss": 0.405,
"step": 175
},
{
"epoch": 0.9865125240847784,
"grad_norm": 0.868487642699029,
"learning_rate": 7.769386198400576e-05,
"loss": 0.3957,
"step": 176
},
{
"epoch": 0.9921177088807147,
"grad_norm": 0.40924003925560676,
"learning_rate": 7.764107291944464e-05,
"loss": 0.3905,
"step": 177
},
{
"epoch": 0.9977228936766509,
"grad_norm": 0.4918744419515202,
"learning_rate": 7.758770483143634e-05,
"loss": 0.389,
"step": 178
},
{
"epoch": 1.0052548607461902,
"grad_norm": 1.487265905041594,
"learning_rate": 7.753375854092918e-05,
"loss": 0.7519,
"step": 179
},
{
"epoch": 1.0108600455421264,
"grad_norm": 1.0599518409785367,
"learning_rate": 7.747923487776579e-05,
"loss": 0.3924,
"step": 180
},
{
"epoch": 1.0164652303380628,
"grad_norm": 0.6685215482157915,
"learning_rate": 7.742413468067038e-05,
"loss": 0.3886,
"step": 181
},
{
"epoch": 1.022070415133999,
"grad_norm": 0.6798972273258548,
"learning_rate": 7.736845879723585e-05,
"loss": 0.3925,
"step": 182
},
{
"epoch": 1.0276755999299352,
"grad_norm": 0.5898524978289069,
"learning_rate": 7.731220808391072e-05,
"loss": 0.3799,
"step": 183
},
{
"epoch": 1.0332807847258714,
"grad_norm": 0.5305286635410624,
"learning_rate": 7.725538340598603e-05,
"loss": 0.3858,
"step": 184
},
{
"epoch": 1.0388859695218078,
"grad_norm": 0.5290381001929488,
"learning_rate": 7.719798563758193e-05,
"loss": 0.3792,
"step": 185
},
{
"epoch": 1.044491154317744,
"grad_norm": 0.42976752812289126,
"learning_rate": 7.71400156616343e-05,
"loss": 0.378,
"step": 186
},
{
"epoch": 1.0500963391136802,
"grad_norm": 0.4255688619941376,
"learning_rate": 7.708147436988112e-05,
"loss": 0.3838,
"step": 187
},
{
"epoch": 1.0557015239096164,
"grad_norm": 0.3844923007802862,
"learning_rate": 7.702236266284886e-05,
"loss": 0.3838,
"step": 188
},
{
"epoch": 1.0613067087055525,
"grad_norm": 0.4493904358687705,
"learning_rate": 7.696268144983844e-05,
"loss": 0.3773,
"step": 189
},
{
"epoch": 1.066911893501489,
"grad_norm": 0.43703395699798525,
"learning_rate": 7.690243164891146e-05,
"loss": 0.3789,
"step": 190
},
{
"epoch": 1.0725170782974252,
"grad_norm": 0.3129550094380894,
"learning_rate": 7.684161418687588e-05,
"loss": 0.3841,
"step": 191
},
{
"epoch": 1.0781222630933613,
"grad_norm": 0.3196436601574946,
"learning_rate": 7.678022999927191e-05,
"loss": 0.375,
"step": 192
},
{
"epoch": 1.0837274478892975,
"grad_norm": 0.32439557065583957,
"learning_rate": 7.671828003035754e-05,
"loss": 0.3808,
"step": 193
},
{
"epoch": 1.0893326326852337,
"grad_norm": 0.3551997178098723,
"learning_rate": 7.665576523309402e-05,
"loss": 0.3808,
"step": 194
},
{
"epoch": 1.0949378174811701,
"grad_norm": 0.28222901059062117,
"learning_rate": 7.659268656913125e-05,
"loss": 0.3755,
"step": 195
},
{
"epoch": 1.1005430022771063,
"grad_norm": 0.24413884185384668,
"learning_rate": 7.652904500879294e-05,
"loss": 0.3771,
"step": 196
},
{
"epoch": 1.1061481870730425,
"grad_norm": 0.3258543915444851,
"learning_rate": 7.646484153106168e-05,
"loss": 0.3819,
"step": 197
},
{
"epoch": 1.1117533718689787,
"grad_norm": 0.30494792365590756,
"learning_rate": 7.640007712356394e-05,
"loss": 0.3739,
"step": 198
},
{
"epoch": 1.1173585566649151,
"grad_norm": 0.3416322895952249,
"learning_rate": 7.633475278255477e-05,
"loss": 0.3729,
"step": 199
},
{
"epoch": 1.1229637414608513,
"grad_norm": 0.33058200454972514,
"learning_rate": 7.626886951290262e-05,
"loss": 0.3778,
"step": 200
},
{
"epoch": 1.1285689262567875,
"grad_norm": 0.24553286849423914,
"learning_rate": 7.620242832807375e-05,
"loss": 0.3815,
"step": 201
},
{
"epoch": 1.1341741110527237,
"grad_norm": 0.3094016267852568,
"learning_rate": 7.61354302501167e-05,
"loss": 0.3739,
"step": 202
},
{
"epoch": 1.1397792958486601,
"grad_norm": 0.3110189115438822,
"learning_rate": 7.606787630964658e-05,
"loss": 0.3744,
"step": 203
},
{
"epoch": 1.1453844806445963,
"grad_norm": 0.3105648524438776,
"learning_rate": 7.599976754582917e-05,
"loss": 0.3733,
"step": 204
},
{
"epoch": 1.1509896654405325,
"grad_norm": 0.38377775896842775,
"learning_rate": 7.593110500636499e-05,
"loss": 0.3777,
"step": 205
},
{
"epoch": 1.1565948502364687,
"grad_norm": 0.5289840818037539,
"learning_rate": 7.586188974747315e-05,
"loss": 0.3748,
"step": 206
},
{
"epoch": 1.1622000350324049,
"grad_norm": 0.6002663848970253,
"learning_rate": 7.579212283387508e-05,
"loss": 0.376,
"step": 207
},
{
"epoch": 1.1678052198283413,
"grad_norm": 0.543546869431326,
"learning_rate": 7.57218053387782e-05,
"loss": 0.3818,
"step": 208
},
{
"epoch": 1.1734104046242775,
"grad_norm": 0.38254861936914103,
"learning_rate": 7.565093834385944e-05,
"loss": 0.3733,
"step": 209
},
{
"epoch": 1.1790155894202137,
"grad_norm": 0.25789361293808194,
"learning_rate": 7.557952293924843e-05,
"loss": 0.3741,
"step": 210
},
{
"epoch": 1.1846207742161499,
"grad_norm": 0.3509813225234253,
"learning_rate": 7.550756022351098e-05,
"loss": 0.3766,
"step": 211
},
{
"epoch": 1.1902259590120863,
"grad_norm": 0.41381875355849096,
"learning_rate": 7.5435051303632e-05,
"loss": 0.3771,
"step": 212
},
{
"epoch": 1.1958311438080225,
"grad_norm": 0.31160966900292714,
"learning_rate": 7.53619972949985e-05,
"loss": 0.37,
"step": 213
},
{
"epoch": 1.2014363286039587,
"grad_norm": 0.25377869542538456,
"learning_rate": 7.528839932138248e-05,
"loss": 0.3742,
"step": 214
},
{
"epoch": 1.2070415133998948,
"grad_norm": 0.3670686486051864,
"learning_rate": 7.521425851492366e-05,
"loss": 0.3741,
"step": 215
},
{
"epoch": 1.2126466981958313,
"grad_norm": 0.35278618781458415,
"learning_rate": 7.513957601611196e-05,
"loss": 0.3689,
"step": 216
},
{
"epoch": 1.2182518829917675,
"grad_norm": 0.27734720069224306,
"learning_rate": 7.506435297377006e-05,
"loss": 0.3709,
"step": 217
},
{
"epoch": 1.2238570677877036,
"grad_norm": 0.3278146817806966,
"learning_rate": 7.498859054503568e-05,
"loss": 0.3758,
"step": 218
},
{
"epoch": 1.2294622525836398,
"grad_norm": 0.4148552525962653,
"learning_rate": 7.491228989534378e-05,
"loss": 0.3722,
"step": 219
},
{
"epoch": 1.235067437379576,
"grad_norm": 0.4473202149669945,
"learning_rate": 7.483545219840865e-05,
"loss": 0.3754,
"step": 220
},
{
"epoch": 1.2406726221755124,
"grad_norm": 0.44427273810379947,
"learning_rate": 7.475807863620587e-05,
"loss": 0.3762,
"step": 221
},
{
"epoch": 1.2462778069714486,
"grad_norm": 0.49016712568602555,
"learning_rate": 7.468017039895404e-05,
"loss": 0.3761,
"step": 222
},
{
"epoch": 1.2518829917673848,
"grad_norm": 0.47873415753545473,
"learning_rate": 7.460172868509664e-05,
"loss": 0.3734,
"step": 223
},
{
"epoch": 1.257488176563321,
"grad_norm": 0.43672364622786203,
"learning_rate": 7.452275470128338e-05,
"loss": 0.3721,
"step": 224
},
{
"epoch": 1.2630933613592572,
"grad_norm": 0.43990010868929574,
"learning_rate": 7.444324966235179e-05,
"loss": 0.374,
"step": 225
},
{
"epoch": 1.2686985461551936,
"grad_norm": 0.4713815408143108,
"learning_rate": 7.436321479130855e-05,
"loss": 0.3713,
"step": 226
},
{
"epoch": 1.2743037309511298,
"grad_norm": 0.45464514691112873,
"learning_rate": 7.428265131931053e-05,
"loss": 0.3706,
"step": 227
},
{
"epoch": 1.279908915747066,
"grad_norm": 0.510024464970406,
"learning_rate": 7.420156048564599e-05,
"loss": 0.3741,
"step": 228
},
{
"epoch": 1.2855141005430024,
"grad_norm": 0.5372593621720837,
"learning_rate": 7.411994353771542e-05,
"loss": 0.3696,
"step": 229
},
{
"epoch": 1.2911192853389384,
"grad_norm": 0.4219487710133918,
"learning_rate": 7.40378017310125e-05,
"loss": 0.3711,
"step": 230
},
{
"epoch": 1.2967244701348748,
"grad_norm": 0.2523356442192507,
"learning_rate": 7.395513632910455e-05,
"loss": 0.371,
"step": 231
},
{
"epoch": 1.302329654930811,
"grad_norm": 0.3232824061176527,
"learning_rate": 7.38719486036133e-05,
"loss": 0.3755,
"step": 232
},
{
"epoch": 1.3079348397267472,
"grad_norm": 0.3600749335990331,
"learning_rate": 7.378823983419529e-05,
"loss": 0.373,
"step": 233
},
{
"epoch": 1.3135400245226836,
"grad_norm": 0.31629844859755163,
"learning_rate": 7.370401130852207e-05,
"loss": 0.3734,
"step": 234
},
{
"epoch": 1.3191452093186198,
"grad_norm": 0.30569567935552483,
"learning_rate": 7.361926432226053e-05,
"loss": 0.377,
"step": 235
},
{
"epoch": 1.324750394114556,
"grad_norm": 0.2484814268194845,
"learning_rate": 7.35340001790529e-05,
"loss": 0.3711,
"step": 236
},
{
"epoch": 1.3303555789104922,
"grad_norm": 0.2481897952153626,
"learning_rate": 7.34482201904967e-05,
"loss": 0.3769,
"step": 237
},
{
"epoch": 1.3359607637064284,
"grad_norm": 0.301422303142377,
"learning_rate": 7.336192567612458e-05,
"loss": 0.3746,
"step": 238
},
{
"epoch": 1.3415659485023648,
"grad_norm": 0.23838306194654293,
"learning_rate": 7.327511796338402e-05,
"loss": 0.3776,
"step": 239
},
{
"epoch": 1.347171133298301,
"grad_norm": 0.24847852807759896,
"learning_rate": 7.318779838761688e-05,
"loss": 0.3673,
"step": 240
},
{
"epoch": 1.3527763180942372,
"grad_norm": 0.32043115894332014,
"learning_rate": 7.309996829203894e-05,
"loss": 0.3706,
"step": 241
},
{
"epoch": 1.3583815028901733,
"grad_norm": 0.27995723770731035,
"learning_rate": 7.301162902771911e-05,
"loss": 0.3698,
"step": 242
},
{
"epoch": 1.3639866876861095,
"grad_norm": 0.23369201897150527,
"learning_rate": 7.292278195355875e-05,
"loss": 0.3765,
"step": 243
},
{
"epoch": 1.369591872482046,
"grad_norm": 0.2782336040131324,
"learning_rate": 7.28334284362708e-05,
"loss": 0.3698,
"step": 244
},
{
"epoch": 1.3751970572779821,
"grad_norm": 0.32558286225893,
"learning_rate": 7.274356985035856e-05,
"loss": 0.363,
"step": 245
},
{
"epoch": 1.3808022420739183,
"grad_norm": 0.3651496945923943,
"learning_rate": 7.265320757809478e-05,
"loss": 0.3708,
"step": 246
},
{
"epoch": 1.3864074268698547,
"grad_norm": 0.37775778360725737,
"learning_rate": 7.256234300950025e-05,
"loss": 0.3739,
"step": 247
},
{
"epoch": 1.392012611665791,
"grad_norm": 0.3865569735032884,
"learning_rate": 7.247097754232251e-05,
"loss": 0.3663,
"step": 248
},
{
"epoch": 1.3976177964617271,
"grad_norm": 0.4600474923978821,
"learning_rate": 7.237911258201422e-05,
"loss": 0.3725,
"step": 249
},
{
"epoch": 1.4032229812576633,
"grad_norm": 0.578230623305736,
"learning_rate": 7.228674954171169e-05,
"loss": 0.3717,
"step": 250
},
{
"epoch": 1.4088281660535995,
"grad_norm": 0.5579950344731053,
"learning_rate": 7.219388984221304e-05,
"loss": 0.375,
"step": 251
},
{
"epoch": 1.414433350849536,
"grad_norm": 0.42369290331551335,
"learning_rate": 7.210053491195638e-05,
"loss": 0.3673,
"step": 252
},
{
"epoch": 1.420038535645472,
"grad_norm": 0.3852634055274567,
"learning_rate": 7.200668618699786e-05,
"loss": 0.3669,
"step": 253
},
{
"epoch": 1.4256437204414083,
"grad_norm": 0.3342986128914958,
"learning_rate": 7.191234511098952e-05,
"loss": 0.3675,
"step": 254
},
{
"epoch": 1.4312489052373445,
"grad_norm": 0.28417998997516397,
"learning_rate": 7.181751313515716e-05,
"loss": 0.3736,
"step": 255
},
{
"epoch": 1.4368540900332807,
"grad_norm": 0.316021315356994,
"learning_rate": 7.172219171827788e-05,
"loss": 0.3652,
"step": 256
},
{
"epoch": 1.442459274829217,
"grad_norm": 0.3330261543122812,
"learning_rate": 7.162638232665785e-05,
"loss": 0.3781,
"step": 257
},
{
"epoch": 1.4480644596251533,
"grad_norm": 0.41235945578908095,
"learning_rate": 7.153008643410957e-05,
"loss": 0.3676,
"step": 258
},
{
"epoch": 1.4536696444210895,
"grad_norm": 0.5120467154192809,
"learning_rate": 7.143330552192925e-05,
"loss": 0.3688,
"step": 259
},
{
"epoch": 1.4592748292170257,
"grad_norm": 0.566273828052107,
"learning_rate": 7.13360410788741e-05,
"loss": 0.3728,
"step": 260
},
{
"epoch": 1.4648800140129619,
"grad_norm": 0.48257393278128896,
"learning_rate": 7.123829460113933e-05,
"loss": 0.3698,
"step": 261
},
{
"epoch": 1.4704851988088983,
"grad_norm": 0.3962978665568913,
"learning_rate": 7.114006759233514e-05,
"loss": 0.3708,
"step": 262
},
{
"epoch": 1.4760903836048345,
"grad_norm": 0.43632437470514573,
"learning_rate": 7.104136156346368e-05,
"loss": 0.3776,
"step": 263
},
{
"epoch": 1.4816955684007707,
"grad_norm": 0.5027739581146445,
"learning_rate": 7.094217803289573e-05,
"loss": 0.377,
"step": 264
},
{
"epoch": 1.487300753196707,
"grad_norm": 0.4748778836922837,
"learning_rate": 7.084251852634736e-05,
"loss": 0.374,
"step": 265
},
{
"epoch": 1.4929059379926433,
"grad_norm": 0.31244286402393573,
"learning_rate": 7.074238457685644e-05,
"loss": 0.3656,
"step": 266
},
{
"epoch": 1.4985111227885795,
"grad_norm": 0.2969138202613333,
"learning_rate": 7.064177772475912e-05,
"loss": 0.377,
"step": 267
},
{
"epoch": 1.5041163075845156,
"grad_norm": 0.41051969660231596,
"learning_rate": 7.054069951766608e-05,
"loss": 0.3763,
"step": 268
},
{
"epoch": 1.5097214923804518,
"grad_norm": 0.38457550467039503,
"learning_rate": 7.043915151043871e-05,
"loss": 0.3714,
"step": 269
},
{
"epoch": 1.5153266771763882,
"grad_norm": 0.29650502362650927,
"learning_rate": 7.033713526516528e-05,
"loss": 0.3708,
"step": 270
},
{
"epoch": 1.5209318619723244,
"grad_norm": 0.4069481640243356,
"learning_rate": 7.023465235113678e-05,
"loss": 0.3734,
"step": 271
},
{
"epoch": 1.5265370467682606,
"grad_norm": 0.4592931917273877,
"learning_rate": 7.013170434482291e-05,
"loss": 0.3697,
"step": 272
},
{
"epoch": 1.532142231564197,
"grad_norm": 0.34558335688526987,
"learning_rate": 7.002829282984776e-05,
"loss": 0.3601,
"step": 273
},
{
"epoch": 1.537747416360133,
"grad_norm": 0.32785340771208665,
"learning_rate": 6.992441939696543e-05,
"loss": 0.3708,
"step": 274
},
{
"epoch": 1.5433526011560694,
"grad_norm": 0.42401801041386833,
"learning_rate": 6.982008564403562e-05,
"loss": 0.3709,
"step": 275
},
{
"epoch": 1.5489577859520056,
"grad_norm": 0.3554711523304497,
"learning_rate": 6.971529317599903e-05,
"loss": 0.3625,
"step": 276
},
{
"epoch": 1.5545629707479418,
"grad_norm": 0.33335189910758606,
"learning_rate": 6.961004360485263e-05,
"loss": 0.3723,
"step": 277
},
{
"epoch": 1.5601681555438782,
"grad_norm": 0.3198930394708329,
"learning_rate": 6.950433854962489e-05,
"loss": 0.3601,
"step": 278
},
{
"epoch": 1.5657733403398142,
"grad_norm": 0.27989792618134535,
"learning_rate": 6.939817963635095e-05,
"loss": 0.3703,
"step": 279
},
{
"epoch": 1.5713785251357506,
"grad_norm": 0.29366802755384774,
"learning_rate": 6.929156849804745e-05,
"loss": 0.3714,
"step": 280
},
{
"epoch": 1.5769837099316868,
"grad_norm": 0.2785219347149019,
"learning_rate": 6.918450677468754e-05,
"loss": 0.3763,
"step": 281
},
{
"epoch": 1.582588894727623,
"grad_norm": 0.2578554063108834,
"learning_rate": 6.907699611317563e-05,
"loss": 0.3708,
"step": 282
},
{
"epoch": 1.5881940795235594,
"grad_norm": 0.24508422370288088,
"learning_rate": 6.896903816732199e-05,
"loss": 0.3808,
"step": 283
},
{
"epoch": 1.5937992643194954,
"grad_norm": 0.3084468014981416,
"learning_rate": 6.88606345978174e-05,
"loss": 0.3668,
"step": 284
},
{
"epoch": 1.5994044491154318,
"grad_norm": 0.3064359422764903,
"learning_rate": 6.875178707220752e-05,
"loss": 0.3703,
"step": 285
},
{
"epoch": 1.605009633911368,
"grad_norm": 0.259368905565046,
"learning_rate": 6.86424972648673e-05,
"loss": 0.3682,
"step": 286
},
{
"epoch": 1.6106148187073042,
"grad_norm": 0.3138681158378778,
"learning_rate": 6.853276685697522e-05,
"loss": 0.361,
"step": 287
},
{
"epoch": 1.6162200035032406,
"grad_norm": 0.28723735473752765,
"learning_rate": 6.842259753648736e-05,
"loss": 0.3691,
"step": 288
},
{
"epoch": 1.6218251882991768,
"grad_norm": 0.22917475477913027,
"learning_rate": 6.831199099811154e-05,
"loss": 0.3738,
"step": 289
},
{
"epoch": 1.627430373095113,
"grad_norm": 0.22158438127508895,
"learning_rate": 6.820094894328115e-05,
"loss": 0.3673,
"step": 290
},
{
"epoch": 1.6330355578910494,
"grad_norm": 0.24914259580358247,
"learning_rate": 6.808947308012907e-05,
"loss": 0.3623,
"step": 291
},
{
"epoch": 1.6386407426869853,
"grad_norm": 0.27721435112401327,
"learning_rate": 6.797756512346131e-05,
"loss": 0.371,
"step": 292
},
{
"epoch": 1.6442459274829218,
"grad_norm": 0.28690822410974365,
"learning_rate": 6.786522679473069e-05,
"loss": 0.3704,
"step": 293
},
{
"epoch": 1.649851112278858,
"grad_norm": 0.29175870389773917,
"learning_rate": 6.775245982201031e-05,
"loss": 0.3705,
"step": 294
},
{
"epoch": 1.6554562970747941,
"grad_norm": 0.31996465044367267,
"learning_rate": 6.763926593996704e-05,
"loss": 0.3621,
"step": 295
},
{
"epoch": 1.6610614818707305,
"grad_norm": 0.28536850062056507,
"learning_rate": 6.752564688983475e-05,
"loss": 0.3678,
"step": 296
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.25078819883082826,
"learning_rate": 6.741160441938761e-05,
"loss": 0.3633,
"step": 297
},
{
"epoch": 1.672271851462603,
"grad_norm": 0.18918507286548947,
"learning_rate": 6.729714028291311e-05,
"loss": 0.3641,
"step": 298
},
{
"epoch": 1.6778770362585391,
"grad_norm": 0.1854770882692491,
"learning_rate": 6.718225624118518e-05,
"loss": 0.3736,
"step": 299
},
{
"epoch": 1.6834822210544753,
"grad_norm": 0.2398880873671139,
"learning_rate": 6.7066954061437e-05,
"loss": 0.3627,
"step": 300
},
{
"epoch": 1.6890874058504117,
"grad_norm": 0.2708404661115209,
"learning_rate": 6.695123551733391e-05,
"loss": 0.3615,
"step": 301
},
{
"epoch": 1.694692590646348,
"grad_norm": 0.22472657962922266,
"learning_rate": 6.683510238894603e-05,
"loss": 0.3601,
"step": 302
},
{
"epoch": 1.700297775442284,
"grad_norm": 0.1879478767675107,
"learning_rate": 6.671855646272099e-05,
"loss": 0.3704,
"step": 303
},
{
"epoch": 1.7059029602382203,
"grad_norm": 0.1898981801511567,
"learning_rate": 6.660159953145632e-05,
"loss": 0.3741,
"step": 304
},
{
"epoch": 1.7115081450341565,
"grad_norm": 0.23990986850106974,
"learning_rate": 6.648423339427203e-05,
"loss": 0.365,
"step": 305
},
{
"epoch": 1.717113329830093,
"grad_norm": 0.2961808669969359,
"learning_rate": 6.636645985658274e-05,
"loss": 0.3703,
"step": 306
},
{
"epoch": 1.722718514626029,
"grad_norm": 0.28018628616841723,
"learning_rate": 6.62482807300701e-05,
"loss": 0.3725,
"step": 307
},
{
"epoch": 1.7283236994219653,
"grad_norm": 0.23407750459599536,
"learning_rate": 6.612969783265477e-05,
"loss": 0.3774,
"step": 308
},
{
"epoch": 1.7339288842179017,
"grad_norm": 0.2348613025637216,
"learning_rate": 6.601071298846859e-05,
"loss": 0.3678,
"step": 309
},
{
"epoch": 1.7395340690138377,
"grad_norm": 0.27859996434994966,
"learning_rate": 6.589132802782636e-05,
"loss": 0.3681,
"step": 310
},
{
"epoch": 1.745139253809774,
"grad_norm": 0.30291442596447254,
"learning_rate": 6.577154478719786e-05,
"loss": 0.3626,
"step": 311
},
{
"epoch": 1.7507444386057103,
"grad_norm": 0.34755080893012474,
"learning_rate": 6.565136510917946e-05,
"loss": 0.3618,
"step": 312
},
{
"epoch": 1.7563496234016465,
"grad_norm": 0.4116767959803346,
"learning_rate": 6.553079084246583e-05,
"loss": 0.3681,
"step": 313
},
{
"epoch": 1.7619548081975829,
"grad_norm": 0.46137945732023167,
"learning_rate": 6.540982384182154e-05,
"loss": 0.3724,
"step": 314
},
{
"epoch": 1.7675599929935188,
"grad_norm": 0.5111615976410253,
"learning_rate": 6.528846596805246e-05,
"loss": 0.3656,
"step": 315
},
{
"epoch": 1.7731651777894553,
"grad_norm": 0.5142382717546382,
"learning_rate": 6.516671908797717e-05,
"loss": 0.3652,
"step": 316
},
{
"epoch": 1.7787703625853915,
"grad_norm": 0.45272084053581846,
"learning_rate": 6.504458507439825e-05,
"loss": 0.3708,
"step": 317
},
{
"epoch": 1.7843755473813276,
"grad_norm": 0.3546131780130616,
"learning_rate": 6.492206580607344e-05,
"loss": 0.372,
"step": 318
},
{
"epoch": 1.789980732177264,
"grad_norm": 0.35982588012048017,
"learning_rate": 6.479916316768677e-05,
"loss": 0.368,
"step": 319
},
{
"epoch": 1.7955859169732002,
"grad_norm": 0.42694941448039553,
"learning_rate": 6.467587904981959e-05,
"loss": 0.3724,
"step": 320
},
{
"epoch": 1.8011911017691364,
"grad_norm": 0.38051976086581385,
"learning_rate": 6.455221534892138e-05,
"loss": 0.3714,
"step": 321
},
{
"epoch": 1.8067962865650729,
"grad_norm": 0.23803744685632255,
"learning_rate": 6.442817396728073e-05,
"loss": 0.363,
"step": 322
},
{
"epoch": 1.8124014713610088,
"grad_norm": 0.19779044986555921,
"learning_rate": 6.430375681299596e-05,
"loss": 0.3652,
"step": 323
},
{
"epoch": 1.8180066561569452,
"grad_norm": 0.2848062424619444,
"learning_rate": 6.417896579994583e-05,
"loss": 0.3701,
"step": 324
},
{
"epoch": 1.8236118409528814,
"grad_norm": 0.3241743893145074,
"learning_rate": 6.405380284776007e-05,
"loss": 0.3631,
"step": 325
},
{
"epoch": 1.8292170257488176,
"grad_norm": 0.23348066280971422,
"learning_rate": 6.392826988178984e-05,
"loss": 0.3655,
"step": 326
},
{
"epoch": 1.834822210544754,
"grad_norm": 0.22485128049462547,
"learning_rate": 6.380236883307814e-05,
"loss": 0.3649,
"step": 327
},
{
"epoch": 1.84042739534069,
"grad_norm": 0.26646687828106147,
"learning_rate": 6.367610163833015e-05,
"loss": 0.3704,
"step": 328
},
{
"epoch": 1.8460325801366264,
"grad_norm": 0.22618321758644827,
"learning_rate": 6.35494702398833e-05,
"loss": 0.3622,
"step": 329
},
{
"epoch": 1.8516377649325626,
"grad_norm": 0.22154925411761456,
"learning_rate": 6.342247658567753e-05,
"loss": 0.366,
"step": 330
},
{
"epoch": 1.8572429497284988,
"grad_norm": 0.29666284481675853,
"learning_rate": 6.329512262922525e-05,
"loss": 0.3689,
"step": 331
},
{
"epoch": 1.8628481345244352,
"grad_norm": 0.2939745343835554,
"learning_rate": 6.316741032958133e-05,
"loss": 0.3592,
"step": 332
},
{
"epoch": 1.8684533193203714,
"grad_norm": 0.2738063356122982,
"learning_rate": 6.303934165131296e-05,
"loss": 0.3632,
"step": 333
},
{
"epoch": 1.8740585041163076,
"grad_norm": 0.25344755125569035,
"learning_rate": 6.291091856446935e-05,
"loss": 0.3682,
"step": 334
},
{
"epoch": 1.8796636889122438,
"grad_norm": 0.22436307153393606,
"learning_rate": 6.278214304455156e-05,
"loss": 0.3657,
"step": 335
},
{
"epoch": 1.88526887370818,
"grad_norm": 0.25070997700969644,
"learning_rate": 6.265301707248199e-05,
"loss": 0.3699,
"step": 336
},
{
"epoch": 1.8908740585041164,
"grad_norm": 0.2733139068534298,
"learning_rate": 6.252354263457403e-05,
"loss": 0.3695,
"step": 337
},
{
"epoch": 1.8964792433000526,
"grad_norm": 0.2866080779686849,
"learning_rate": 6.239372172250134e-05,
"loss": 0.3714,
"step": 338
},
{
"epoch": 1.9020844280959888,
"grad_norm": 0.21011808838464177,
"learning_rate": 6.226355633326739e-05,
"loss": 0.3664,
"step": 339
},
{
"epoch": 1.9076896128919252,
"grad_norm": 0.1975979876067148,
"learning_rate": 6.21330484691746e-05,
"loss": 0.3669,
"step": 340
},
{
"epoch": 1.9132947976878611,
"grad_norm": 0.21416216664449675,
"learning_rate": 6.200220013779366e-05,
"loss": 0.3668,
"step": 341
},
{
"epoch": 1.9188999824837976,
"grad_norm": 0.21995877340660605,
"learning_rate": 6.187101335193252e-05,
"loss": 0.3602,
"step": 342
},
{
"epoch": 1.9245051672797338,
"grad_norm": 0.19030470602968677,
"learning_rate": 6.173949012960552e-05,
"loss": 0.3617,
"step": 343
},
{
"epoch": 1.93011035207567,
"grad_norm": 0.2128195145941448,
"learning_rate": 6.160763249400236e-05,
"loss": 0.3624,
"step": 344
},
{
"epoch": 1.9357155368716064,
"grad_norm": 0.245421223589709,
"learning_rate": 6.147544247345684e-05,
"loss": 0.3603,
"step": 345
},
{
"epoch": 1.9413207216675423,
"grad_norm": 0.21883215784819807,
"learning_rate": 6.134292210141585e-05,
"loss": 0.3594,
"step": 346
},
{
"epoch": 1.9469259064634787,
"grad_norm": 0.1967876169444006,
"learning_rate": 6.121007341640797e-05,
"loss": 0.368,
"step": 347
},
{
"epoch": 1.952531091259415,
"grad_norm": 0.20005898323834861,
"learning_rate": 6.10768984620121e-05,
"loss": 0.3735,
"step": 348
},
{
"epoch": 1.9581362760553511,
"grad_norm": 0.23622231826107862,
"learning_rate": 6.0943399286826126e-05,
"loss": 0.3621,
"step": 349
},
{
"epoch": 1.9637414608512875,
"grad_norm": 0.27814473844495363,
"learning_rate": 6.080957794443529e-05,
"loss": 0.3583,
"step": 350
},
{
"epoch": 1.9693466456472237,
"grad_norm": 0.302395060856694,
"learning_rate": 6.067543649338069e-05,
"loss": 0.3626,
"step": 351
},
{
"epoch": 1.97495183044316,
"grad_norm": 0.2330957117550289,
"learning_rate": 6.0540976997127534e-05,
"loss": 0.3626,
"step": 352
},
{
"epoch": 1.9805570152390963,
"grad_norm": 0.17919274741898725,
"learning_rate": 6.040620152403351e-05,
"loss": 0.3699,
"step": 353
},
{
"epoch": 1.9861622000350323,
"grad_norm": 0.19040666115368088,
"learning_rate": 6.0271112147316816e-05,
"loss": 0.362,
"step": 354
},
{
"epoch": 1.9917673848309687,
"grad_norm": 0.19757884670083642,
"learning_rate": 6.013571094502443e-05,
"loss": 0.3609,
"step": 355
},
{
"epoch": 1.997372569626905,
"grad_norm": 0.2205769315300188,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3747,
"step": 356
},
{
"epoch": 2.0056051847959364,
"grad_norm": 0.2741039461067367,
"learning_rate": 5.986398139985195e-05,
"loss": 0.339,
"step": 357
},
{
"epoch": 2.0112103695918724,
"grad_norm": 0.31588543514752565,
"learning_rate": 5.97276572369212e-05,
"loss": 0.3439,
"step": 358
},
{
"epoch": 2.016815554387809,
"grad_norm": 0.46336824661725023,
"learning_rate": 5.959102960824914e-05,
"loss": 0.3396,
"step": 359
},
{
"epoch": 2.0224207391837448,
"grad_norm": 0.6543572364352943,
"learning_rate": 5.945410061554526e-05,
"loss": 0.3462,
"step": 360
},
{
"epoch": 2.028025923979681,
"grad_norm": 0.7224099179065516,
"learning_rate": 5.931687236515485e-05,
"loss": 0.3452,
"step": 361
},
{
"epoch": 2.0336311087756176,
"grad_norm": 0.5577072940081442,
"learning_rate": 5.917934696802667e-05,
"loss": 0.3393,
"step": 362
},
{
"epoch": 2.0392362935715536,
"grad_norm": 0.33124598923207127,
"learning_rate": 5.904152653968032e-05,
"loss": 0.3403,
"step": 363
},
{
"epoch": 2.04484147836749,
"grad_norm": 0.4930639395918798,
"learning_rate": 5.890341320017389e-05,
"loss": 0.3404,
"step": 364
},
{
"epoch": 2.0504466631634264,
"grad_norm": 0.4584696038181033,
"learning_rate": 5.8765009074071176e-05,
"loss": 0.3413,
"step": 365
},
{
"epoch": 2.0560518479593624,
"grad_norm": 0.3093299417205952,
"learning_rate": 5.8626316290409124e-05,
"loss": 0.3414,
"step": 366
},
{
"epoch": 2.0616570327552988,
"grad_norm": 0.39704392594198545,
"learning_rate": 5.8487336982665016e-05,
"loss": 0.337,
"step": 367
},
{
"epoch": 2.0672622175512347,
"grad_norm": 0.30243949060200587,
"learning_rate": 5.8348073288723625e-05,
"loss": 0.342,
"step": 368
},
{
"epoch": 2.072867402347171,
"grad_norm": 0.2815416454530177,
"learning_rate": 5.820852735084443e-05,
"loss": 0.3382,
"step": 369
},
{
"epoch": 2.0784725871431076,
"grad_norm": 0.37397854680211123,
"learning_rate": 5.8068701315628564e-05,
"loss": 0.338,
"step": 370
},
{
"epoch": 2.0840777719390435,
"grad_norm": 0.2659776331075204,
"learning_rate": 5.792859733398582e-05,
"loss": 0.338,
"step": 371
},
{
"epoch": 2.08968295673498,
"grad_norm": 0.263065032898508,
"learning_rate": 5.7788217561101604e-05,
"loss": 0.3399,
"step": 372
},
{
"epoch": 2.095288141530916,
"grad_norm": 0.25395323107827705,
"learning_rate": 5.7647564156403734e-05,
"loss": 0.3436,
"step": 373
},
{
"epoch": 2.1008933263268523,
"grad_norm": 0.26293260303616806,
"learning_rate": 5.750663928352923e-05,
"loss": 0.3335,
"step": 374
},
{
"epoch": 2.1064985111227887,
"grad_norm": 0.3165184344301909,
"learning_rate": 5.7365445110291063e-05,
"loss": 0.3308,
"step": 375
},
{
"epoch": 2.1121036959187247,
"grad_norm": 0.2209241133233396,
"learning_rate": 5.7223983808644757e-05,
"loss": 0.3384,
"step": 376
},
{
"epoch": 2.117708880714661,
"grad_norm": 0.22848826893513474,
"learning_rate": 5.7082257554655046e-05,
"loss": 0.3302,
"step": 377
},
{
"epoch": 2.123314065510597,
"grad_norm": 0.19866524720209594,
"learning_rate": 5.6940268528462324e-05,
"loss": 0.3325,
"step": 378
},
{
"epoch": 2.1289192503065335,
"grad_norm": 0.18924595566511337,
"learning_rate": 5.6798018914249176e-05,
"loss": 0.3409,
"step": 379
},
{
"epoch": 2.13452443510247,
"grad_norm": 0.2094808002199248,
"learning_rate": 5.665551090020671e-05,
"loss": 0.3368,
"step": 380
},
{
"epoch": 2.140129619898406,
"grad_norm": 0.1896298642995286,
"learning_rate": 5.651274667850099e-05,
"loss": 0.3382,
"step": 381
},
{
"epoch": 2.1457348046943423,
"grad_norm": 0.22305481312184167,
"learning_rate": 5.6369728445239216e-05,
"loss": 0.3365,
"step": 382
},
{
"epoch": 2.1513399894902787,
"grad_norm": 0.18402697711670618,
"learning_rate": 5.622645840043599e-05,
"loss": 0.3327,
"step": 383
},
{
"epoch": 2.1569451742862147,
"grad_norm": 0.18539919040240732,
"learning_rate": 5.60829387479795e-05,
"loss": 0.3367,
"step": 384
},
{
"epoch": 2.162550359082151,
"grad_norm": 0.21582225739629068,
"learning_rate": 5.5939171695597546e-05,
"loss": 0.3395,
"step": 385
},
{
"epoch": 2.168155543878087,
"grad_norm": 0.16294915710384228,
"learning_rate": 5.579515945482366e-05,
"loss": 0.3356,
"step": 386
},
{
"epoch": 2.1737607286740235,
"grad_norm": 0.17506193706528747,
"learning_rate": 5.5650904240963015e-05,
"loss": 0.3389,
"step": 387
},
{
"epoch": 2.17936591346996,
"grad_norm": 0.17986589577627823,
"learning_rate": 5.55064082730584e-05,
"loss": 0.3376,
"step": 388
},
{
"epoch": 2.184971098265896,
"grad_norm": 0.20225696584185415,
"learning_rate": 5.536167377385606e-05,
"loss": 0.3352,
"step": 389
},
{
"epoch": 2.1905762830618323,
"grad_norm": 0.20452311461397996,
"learning_rate": 5.521670296977151e-05,
"loss": 0.3427,
"step": 390
},
{
"epoch": 2.1961814678577682,
"grad_norm": 0.20853709114695754,
"learning_rate": 5.507149809085528e-05,
"loss": 0.3414,
"step": 391
},
{
"epoch": 2.2017866526537047,
"grad_norm": 0.19806872698081288,
"learning_rate": 5.4926061370758616e-05,
"loss": 0.3382,
"step": 392
},
{
"epoch": 2.207391837449641,
"grad_norm": 0.18154490506390078,
"learning_rate": 5.4780395046699116e-05,
"loss": 0.3334,
"step": 393
},
{
"epoch": 2.212997022245577,
"grad_norm": 0.20341890410491417,
"learning_rate": 5.4634501359426345e-05,
"loss": 0.3404,
"step": 394
},
{
"epoch": 2.2186022070415135,
"grad_norm": 0.22993218917327035,
"learning_rate": 5.4488382553187307e-05,
"loss": 0.3443,
"step": 395
},
{
"epoch": 2.2242073918374494,
"grad_norm": 0.26354208951183833,
"learning_rate": 5.434204087569199e-05,
"loss": 0.3377,
"step": 396
},
{
"epoch": 2.229812576633386,
"grad_norm": 0.26935013286659065,
"learning_rate": 5.419547857807871e-05,
"loss": 0.3383,
"step": 397
},
{
"epoch": 2.2354177614293222,
"grad_norm": 0.216246753815593,
"learning_rate": 5.404869791487958e-05,
"loss": 0.3354,
"step": 398
},
{
"epoch": 2.241022946225258,
"grad_norm": 0.18048698989035397,
"learning_rate": 5.390170114398575e-05,
"loss": 0.3425,
"step": 399
},
{
"epoch": 2.2466281310211946,
"grad_norm": 0.14602735362790997,
"learning_rate": 5.375449052661271e-05,
"loss": 0.3395,
"step": 400
},
{
"epoch": 2.252233315817131,
"grad_norm": 0.21006582382336506,
"learning_rate": 5.360706832726548e-05,
"loss": 0.3364,
"step": 401
},
{
"epoch": 2.257838500613067,
"grad_norm": 0.23551991763406468,
"learning_rate": 5.345943681370381e-05,
"loss": 0.3411,
"step": 402
},
{
"epoch": 2.2634436854090034,
"grad_norm": 0.23856885482494433,
"learning_rate": 5.33115982569073e-05,
"loss": 0.3418,
"step": 403
},
{
"epoch": 2.2690488702049394,
"grad_norm": 0.2300140164186402,
"learning_rate": 5.31635549310404e-05,
"loss": 0.3378,
"step": 404
},
{
"epoch": 2.274654055000876,
"grad_norm": 0.195987902529524,
"learning_rate": 5.3015309113417513e-05,
"loss": 0.3311,
"step": 405
},
{
"epoch": 2.280259239796812,
"grad_norm": 0.23463070349016596,
"learning_rate": 5.286686308446788e-05,
"loss": 0.3451,
"step": 406
},
{
"epoch": 2.285864424592748,
"grad_norm": 0.2288982821893169,
"learning_rate": 5.27182191277006e-05,
"loss": 0.3377,
"step": 407
},
{
"epoch": 2.2914696093886846,
"grad_norm": 0.2546834665154465,
"learning_rate": 5.256937952966942e-05,
"loss": 0.3377,
"step": 408
},
{
"epoch": 2.2970747941846206,
"grad_norm": 0.2278705546240396,
"learning_rate": 5.242034657993756e-05,
"loss": 0.3327,
"step": 409
},
{
"epoch": 2.302679978980557,
"grad_norm": 0.2120066354222386,
"learning_rate": 5.227112257104256e-05,
"loss": 0.3367,
"step": 410
},
{
"epoch": 2.3082851637764934,
"grad_norm": 0.21472567166361733,
"learning_rate": 5.2121709798460965e-05,
"loss": 0.3313,
"step": 411
},
{
"epoch": 2.3138903485724294,
"grad_norm": 0.1362367689167016,
"learning_rate": 5.197211056057304e-05,
"loss": 0.3351,
"step": 412
},
{
"epoch": 2.319495533368366,
"grad_norm": 0.15862071124581603,
"learning_rate": 5.182232715862738e-05,
"loss": 0.3338,
"step": 413
},
{
"epoch": 2.325100718164302,
"grad_norm": 0.18326499258519496,
"learning_rate": 5.167236189670551e-05,
"loss": 0.3404,
"step": 414
},
{
"epoch": 2.330705902960238,
"grad_norm": 0.1581861572597703,
"learning_rate": 5.152221708168652e-05,
"loss": 0.3375,
"step": 415
},
{
"epoch": 2.3363110877561746,
"grad_norm": 0.14207676687336432,
"learning_rate": 5.137189502321149e-05,
"loss": 0.3433,
"step": 416
},
{
"epoch": 2.3419162725521105,
"grad_norm": 0.1688755529335156,
"learning_rate": 5.122139803364798e-05,
"loss": 0.337,
"step": 417
},
{
"epoch": 2.347521457348047,
"grad_norm": 0.17084826149933108,
"learning_rate": 5.1070728428054506e-05,
"loss": 0.3337,
"step": 418
},
{
"epoch": 2.3531266421439834,
"grad_norm": 0.17086756093968097,
"learning_rate": 5.091988852414485e-05,
"loss": 0.3379,
"step": 419
},
{
"epoch": 2.3587318269399193,
"grad_norm": 0.17512853477255008,
"learning_rate": 5.07688806422525e-05,
"loss": 0.3346,
"step": 420
},
{
"epoch": 2.3643370117358558,
"grad_norm": 0.13968477165183565,
"learning_rate": 5.0617707105294876e-05,
"loss": 0.337,
"step": 421
},
{
"epoch": 2.3699421965317917,
"grad_norm": 0.1551281956472879,
"learning_rate": 5.046637023873763e-05,
"loss": 0.3414,
"step": 422
},
{
"epoch": 2.375547381327728,
"grad_norm": 0.13800979999513777,
"learning_rate": 5.0314872370558895e-05,
"loss": 0.332,
"step": 423
},
{
"epoch": 2.3811525661236645,
"grad_norm": 0.1367384754270518,
"learning_rate": 5.016321583121342e-05,
"loss": 0.3402,
"step": 424
},
{
"epoch": 2.3867577509196005,
"grad_norm": 0.1638479532373916,
"learning_rate": 5.00114029535968e-05,
"loss": 0.3325,
"step": 425
},
{
"epoch": 2.392362935715537,
"grad_norm": 0.16001330446175308,
"learning_rate": 4.985943607300951e-05,
"loss": 0.3378,
"step": 426
},
{
"epoch": 2.3979681205114733,
"grad_norm": 0.18621075203491558,
"learning_rate": 4.9707317527121e-05,
"loss": 0.3395,
"step": 427
},
{
"epoch": 2.4035733053074093,
"grad_norm": 0.18597047109928305,
"learning_rate": 4.9555049655933786e-05,
"loss": 0.3383,
"step": 428
},
{
"epoch": 2.4091784901033457,
"grad_norm": 0.16909692251131547,
"learning_rate": 4.940263480174741e-05,
"loss": 0.3336,
"step": 429
},
{
"epoch": 2.4147836748992817,
"grad_norm": 0.12955190193212013,
"learning_rate": 4.9250075309122414e-05,
"loss": 0.336,
"step": 430
},
{
"epoch": 2.420388859695218,
"grad_norm": 0.15190807156866015,
"learning_rate": 4.909737352484427e-05,
"loss": 0.3399,
"step": 431
},
{
"epoch": 2.425994044491154,
"grad_norm": 0.1573962735046912,
"learning_rate": 4.894453179788728e-05,
"loss": 0.3408,
"step": 432
},
{
"epoch": 2.4315992292870905,
"grad_norm": 0.1370853516111357,
"learning_rate": 4.879155247937849e-05,
"loss": 0.3318,
"step": 433
},
{
"epoch": 2.437204414083027,
"grad_norm": 0.15508060558426304,
"learning_rate": 4.8638437922561445e-05,
"loss": 0.3435,
"step": 434
},
{
"epoch": 2.442809598878963,
"grad_norm": 0.14664188616583732,
"learning_rate": 4.8485190482760046e-05,
"loss": 0.3303,
"step": 435
},
{
"epoch": 2.4484147836748993,
"grad_norm": 0.13239676829377417,
"learning_rate": 4.833181251734228e-05,
"loss": 0.3358,
"step": 436
},
{
"epoch": 2.4540199684708357,
"grad_norm": 0.1689890101652452,
"learning_rate": 4.8178306385684014e-05,
"loss": 0.3379,
"step": 437
},
{
"epoch": 2.4596251532667717,
"grad_norm": 0.19600685835243647,
"learning_rate": 4.802467444913263e-05,
"loss": 0.3375,
"step": 438
},
{
"epoch": 2.465230338062708,
"grad_norm": 0.1614065765277386,
"learning_rate": 4.787091907097075e-05,
"loss": 0.3353,
"step": 439
},
{
"epoch": 2.4708355228586445,
"grad_norm": 0.1508189987286625,
"learning_rate": 4.771704261637988e-05,
"loss": 0.3349,
"step": 440
},
{
"epoch": 2.4764407076545805,
"grad_norm": 0.18101253119293248,
"learning_rate": 4.756304745240398e-05,
"loss": 0.3408,
"step": 441
},
{
"epoch": 2.482045892450517,
"grad_norm": 0.1826261585945622,
"learning_rate": 4.740893594791314e-05,
"loss": 0.3351,
"step": 442
},
{
"epoch": 2.487651077246453,
"grad_norm": 0.14052710829053464,
"learning_rate": 4.7254710473567035e-05,
"loss": 0.3357,
"step": 443
},
{
"epoch": 2.4932562620423893,
"grad_norm": 0.14437812699305436,
"learning_rate": 4.710037340177855e-05,
"loss": 0.3323,
"step": 444
},
{
"epoch": 2.4988614468383252,
"grad_norm": 0.15602474948645395,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3315,
"step": 445
},
{
"epoch": 2.5044666316342616,
"grad_norm": 0.14055328691934155,
"learning_rate": 4.6791373964072755e-05,
"loss": 0.3417,
"step": 446
},
{
"epoch": 2.510071816430198,
"grad_norm": 0.1388879025795148,
"learning_rate": 4.663671635141844e-05,
"loss": 0.3334,
"step": 447
},
{
"epoch": 2.515677001226134,
"grad_norm": 0.14203432091623683,
"learning_rate": 4.648195664777466e-05,
"loss": 0.3293,
"step": 448
},
{
"epoch": 2.5212821860220704,
"grad_norm": 0.1362951045485,
"learning_rate": 4.6327097233772167e-05,
"loss": 0.3398,
"step": 449
},
{
"epoch": 2.526887370818007,
"grad_norm": 0.13873000403640978,
"learning_rate": 4.617214049157559e-05,
"loss": 0.3447,
"step": 450
},
{
"epoch": 2.532492555613943,
"grad_norm": 0.13932412475931288,
"learning_rate": 4.601708880484672e-05,
"loss": 0.3378,
"step": 451
},
{
"epoch": 2.5380977404098792,
"grad_norm": 0.13616797667697145,
"learning_rate": 4.586194455870782e-05,
"loss": 0.3357,
"step": 452
},
{
"epoch": 2.5437029252058156,
"grad_norm": 0.12388008240947325,
"learning_rate": 4.5706710139705035e-05,
"loss": 0.3367,
"step": 453
},
{
"epoch": 2.5493081100017516,
"grad_norm": 0.13428327954219868,
"learning_rate": 4.555138793577156e-05,
"loss": 0.3372,
"step": 454
},
{
"epoch": 2.5549132947976876,
"grad_norm": 0.12462707556187247,
"learning_rate": 4.5395980336191e-05,
"loss": 0.3386,
"step": 455
},
{
"epoch": 2.560518479593624,
"grad_norm": 0.1235771755300071,
"learning_rate": 4.524048973156056e-05,
"loss": 0.3381,
"step": 456
},
{
"epoch": 2.5661236643895604,
"grad_norm": 0.14711243405086183,
"learning_rate": 4.508491851375431e-05,
"loss": 0.3316,
"step": 457
},
{
"epoch": 2.5717288491854964,
"grad_norm": 0.14459114492388414,
"learning_rate": 4.4929269075886345e-05,
"loss": 0.3298,
"step": 458
},
{
"epoch": 2.577334033981433,
"grad_norm": 0.12808061163839768,
"learning_rate": 4.477354381227405e-05,
"loss": 0.3365,
"step": 459
},
{
"epoch": 2.582939218777369,
"grad_norm": 0.1605215774840903,
"learning_rate": 4.4617745118401146e-05,
"loss": 0.3436,
"step": 460
},
{
"epoch": 2.588544403573305,
"grad_norm": 0.1686937530955673,
"learning_rate": 4.446187539088098e-05,
"loss": 0.3401,
"step": 461
},
{
"epoch": 2.5941495883692416,
"grad_norm": 0.1527872446582201,
"learning_rate": 4.4305937027419554e-05,
"loss": 0.336,
"step": 462
},
{
"epoch": 2.599754773165178,
"grad_norm": 0.15128554865510688,
"learning_rate": 4.4149932426778726e-05,
"loss": 0.3344,
"step": 463
},
{
"epoch": 2.605359957961114,
"grad_norm": 0.14920667419147088,
"learning_rate": 4.399386398873919e-05,
"loss": 0.337,
"step": 464
},
{
"epoch": 2.6109651427570504,
"grad_norm": 0.1744457398926105,
"learning_rate": 4.383773411406369e-05,
"loss": 0.3315,
"step": 465
},
{
"epoch": 2.6165703275529864,
"grad_norm": 0.17078707601958157,
"learning_rate": 4.368154520446e-05,
"loss": 0.3381,
"step": 466
},
{
"epoch": 2.6221755123489228,
"grad_norm": 0.13633202612822465,
"learning_rate": 4.352529966254408e-05,
"loss": 0.3356,
"step": 467
},
{
"epoch": 2.6277806971448587,
"grad_norm": 0.15243033406951811,
"learning_rate": 4.336899989180297e-05,
"loss": 0.336,
"step": 468
},
{
"epoch": 2.633385881940795,
"grad_norm": 0.14747243716476988,
"learning_rate": 4.3212648296557956e-05,
"loss": 0.3404,
"step": 469
},
{
"epoch": 2.6389910667367316,
"grad_norm": 0.14313810836029056,
"learning_rate": 4.305624728192749e-05,
"loss": 0.3383,
"step": 470
},
{
"epoch": 2.6445962515326675,
"grad_norm": 0.15292894547047348,
"learning_rate": 4.289979925379025e-05,
"loss": 0.3347,
"step": 471
},
{
"epoch": 2.650201436328604,
"grad_norm": 0.16666900752167832,
"learning_rate": 4.274330661874812e-05,
"loss": 0.3389,
"step": 472
},
{
"epoch": 2.6558066211245404,
"grad_norm": 0.14101399886632246,
"learning_rate": 4.258677178408914e-05,
"loss": 0.3472,
"step": 473
},
{
"epoch": 2.6614118059204763,
"grad_norm": 0.14283842141759293,
"learning_rate": 4.2430197157750506e-05,
"loss": 0.3288,
"step": 474
},
{
"epoch": 2.6670169907164127,
"grad_norm": 0.18182542514094624,
"learning_rate": 4.227358514828151e-05,
"loss": 0.3344,
"step": 475
},
{
"epoch": 2.672622175512349,
"grad_norm": 0.15313956411935128,
"learning_rate": 4.2116938164806523e-05,
"loss": 0.3448,
"step": 476
},
{
"epoch": 2.678227360308285,
"grad_norm": 0.15589616508314744,
"learning_rate": 4.19602586169879e-05,
"loss": 0.3429,
"step": 477
},
{
"epoch": 2.6838325451042215,
"grad_norm": 0.18115048482512122,
"learning_rate": 4.1803548914988915e-05,
"loss": 0.3341,
"step": 478
},
{
"epoch": 2.6894377299001575,
"grad_norm": 0.15383530203792303,
"learning_rate": 4.164681146943672e-05,
"loss": 0.3369,
"step": 479
},
{
"epoch": 2.695042914696094,
"grad_norm": 0.18132657455214243,
"learning_rate": 4.1490048691385184e-05,
"loss": 0.3387,
"step": 480
},
{
"epoch": 2.70064809949203,
"grad_norm": 0.1663579198570477,
"learning_rate": 4.133326299227796e-05,
"loss": 0.3426,
"step": 481
},
{
"epoch": 2.7062532842879663,
"grad_norm": 0.15912216978968627,
"learning_rate": 4.1176456783911186e-05,
"loss": 0.3391,
"step": 482
},
{
"epoch": 2.7118584690839027,
"grad_norm": 0.14961940121164838,
"learning_rate": 4.1019632478396535e-05,
"loss": 0.3346,
"step": 483
},
{
"epoch": 2.7174636538798387,
"grad_norm": 0.13821373229896533,
"learning_rate": 4.0862792488124084e-05,
"loss": 0.3444,
"step": 484
},
{
"epoch": 2.723068838675775,
"grad_norm": 0.1446248191283951,
"learning_rate": 4.070593922572515e-05,
"loss": 0.3397,
"step": 485
},
{
"epoch": 2.7286740234717115,
"grad_norm": 0.14611396725110462,
"learning_rate": 4.0549075104035235e-05,
"loss": 0.3381,
"step": 486
},
{
"epoch": 2.7342792082676475,
"grad_norm": 0.15021839121813585,
"learning_rate": 4.0392202536056864e-05,
"loss": 0.3376,
"step": 487
},
{
"epoch": 2.739884393063584,
"grad_norm": 0.1222365954508722,
"learning_rate": 4.023532393492249e-05,
"loss": 0.3418,
"step": 488
},
{
"epoch": 2.7454895778595203,
"grad_norm": 0.13385222223998502,
"learning_rate": 4.007844171385742e-05,
"loss": 0.3375,
"step": 489
},
{
"epoch": 2.7510947626554563,
"grad_norm": 0.16084769560491935,
"learning_rate": 3.992155828614259e-05,
"loss": 0.3383,
"step": 490
},
{
"epoch": 2.7566999474513927,
"grad_norm": 0.11593149953710691,
"learning_rate": 3.976467606507752e-05,
"loss": 0.334,
"step": 491
},
{
"epoch": 2.7623051322473287,
"grad_norm": 0.12135253379499054,
"learning_rate": 3.960779746394315e-05,
"loss": 0.3369,
"step": 492
},
{
"epoch": 2.767910317043265,
"grad_norm": 0.15157050861544064,
"learning_rate": 3.9450924895964785e-05,
"loss": 0.3378,
"step": 493
},
{
"epoch": 2.773515501839201,
"grad_norm": 0.13097668206944849,
"learning_rate": 3.929406077427486e-05,
"loss": 0.3378,
"step": 494
},
{
"epoch": 2.7791206866351374,
"grad_norm": 0.16603204878535843,
"learning_rate": 3.913720751187593e-05,
"loss": 0.335,
"step": 495
},
{
"epoch": 2.784725871431074,
"grad_norm": 0.14752638359057793,
"learning_rate": 3.898036752160348e-05,
"loss": 0.3333,
"step": 496
},
{
"epoch": 2.79033105622701,
"grad_norm": 0.11483015755025572,
"learning_rate": 3.882354321608883e-05,
"loss": 0.3324,
"step": 497
},
{
"epoch": 2.7959362410229462,
"grad_norm": 0.15227040786502544,
"learning_rate": 3.8666737007722055e-05,
"loss": 0.3334,
"step": 498
},
{
"epoch": 2.8015414258188827,
"grad_norm": 0.15767384723901806,
"learning_rate": 3.8509951308614816e-05,
"loss": 0.3346,
"step": 499
},
{
"epoch": 2.8071466106148186,
"grad_norm": 0.12626963411483594,
"learning_rate": 3.8353188530563296e-05,
"loss": 0.3433,
"step": 500
},
{
"epoch": 2.812751795410755,
"grad_norm": 0.13733431001138763,
"learning_rate": 3.8196451085011085e-05,
"loss": 0.3327,
"step": 501
},
{
"epoch": 2.8183569802066915,
"grad_norm": 0.14680623002425597,
"learning_rate": 3.80397413830121e-05,
"loss": 0.3362,
"step": 502
},
{
"epoch": 2.8239621650026274,
"grad_norm": 0.13113991590813315,
"learning_rate": 3.7883061835193476e-05,
"loss": 0.3316,
"step": 503
},
{
"epoch": 2.829567349798564,
"grad_norm": 0.14765888572997615,
"learning_rate": 3.772641485171849e-05,
"loss": 0.3354,
"step": 504
},
{
"epoch": 2.8351725345945,
"grad_norm": 0.13209441385102483,
"learning_rate": 3.756980284224951e-05,
"loss": 0.3387,
"step": 505
},
{
"epoch": 2.840777719390436,
"grad_norm": 0.14879932604775253,
"learning_rate": 3.7413228215910866e-05,
"loss": 0.3369,
"step": 506
},
{
"epoch": 2.846382904186372,
"grad_norm": 0.13679808581693134,
"learning_rate": 3.725669338125189e-05,
"loss": 0.3316,
"step": 507
},
{
"epoch": 2.8519880889823086,
"grad_norm": 0.14870932780889795,
"learning_rate": 3.710020074620976e-05,
"loss": 0.3411,
"step": 508
},
{
"epoch": 2.857593273778245,
"grad_norm": 0.15465220506717361,
"learning_rate": 3.6943752718072526e-05,
"loss": 0.3431,
"step": 509
},
{
"epoch": 2.863198458574181,
"grad_norm": 0.13086189881896804,
"learning_rate": 3.6787351703442064e-05,
"loss": 0.3361,
"step": 510
},
{
"epoch": 2.8688036433701174,
"grad_norm": 0.15533250172898358,
"learning_rate": 3.663100010819704e-05,
"loss": 0.3409,
"step": 511
},
{
"epoch": 2.874408828166054,
"grad_norm": 0.15368379331778698,
"learning_rate": 3.6474700337455946e-05,
"loss": 0.3366,
"step": 512
},
{
"epoch": 2.8800140129619898,
"grad_norm": 0.13213250938368978,
"learning_rate": 3.631845479554001e-05,
"loss": 0.3404,
"step": 513
},
{
"epoch": 2.885619197757926,
"grad_norm": 0.12630288060204878,
"learning_rate": 3.616226588593634e-05,
"loss": 0.3364,
"step": 514
},
{
"epoch": 2.891224382553862,
"grad_norm": 0.13593521293454738,
"learning_rate": 3.6006136011260835e-05,
"loss": 0.3381,
"step": 515
},
{
"epoch": 2.8968295673497986,
"grad_norm": 0.11554742949705811,
"learning_rate": 3.5850067573221294e-05,
"loss": 0.331,
"step": 516
},
{
"epoch": 2.9024347521457345,
"grad_norm": 0.1198528477058993,
"learning_rate": 3.569406297258045e-05,
"loss": 0.3382,
"step": 517
},
{
"epoch": 2.908039936941671,
"grad_norm": 0.12987613409396606,
"learning_rate": 3.553812460911903e-05,
"loss": 0.332,
"step": 518
},
{
"epoch": 2.9136451217376074,
"grad_norm": 0.10360442006807373,
"learning_rate": 3.538225488159886e-05,
"loss": 0.3345,
"step": 519
},
{
"epoch": 2.9192503065335433,
"grad_norm": 0.11999841966016728,
"learning_rate": 3.5226456187725966e-05,
"loss": 0.3356,
"step": 520
},
{
"epoch": 2.9248554913294798,
"grad_norm": 0.12157002192921504,
"learning_rate": 3.507073092411366e-05,
"loss": 0.331,
"step": 521
},
{
"epoch": 2.930460676125416,
"grad_norm": 0.11962352346017135,
"learning_rate": 3.4915081486245696e-05,
"loss": 0.3221,
"step": 522
},
{
"epoch": 2.936065860921352,
"grad_norm": 0.10590981284086302,
"learning_rate": 3.4759510268439444e-05,
"loss": 0.3271,
"step": 523
},
{
"epoch": 2.9416710457172885,
"grad_norm": 0.12508588575966806,
"learning_rate": 3.460401966380901e-05,
"loss": 0.334,
"step": 524
},
{
"epoch": 2.947276230513225,
"grad_norm": 0.12411992558068433,
"learning_rate": 3.4448612064228455e-05,
"loss": 0.3342,
"step": 525
},
{
"epoch": 2.952881415309161,
"grad_norm": 0.12323545333378516,
"learning_rate": 3.4293289860294985e-05,
"loss": 0.3397,
"step": 526
},
{
"epoch": 2.9584866001050973,
"grad_norm": 0.12211808120842779,
"learning_rate": 3.4138055441292186e-05,
"loss": 0.333,
"step": 527
},
{
"epoch": 2.9640917849010333,
"grad_norm": 0.11492860031416643,
"learning_rate": 3.3982911195153294e-05,
"loss": 0.3329,
"step": 528
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.13508535922750942,
"learning_rate": 3.3827859508424415e-05,
"loss": 0.3398,
"step": 529
},
{
"epoch": 2.9753021544929057,
"grad_norm": 0.11565425526423825,
"learning_rate": 3.367290276622785e-05,
"loss": 0.3365,
"step": 530
},
{
"epoch": 2.980907339288842,
"grad_norm": 0.13139009508968535,
"learning_rate": 3.3518043352225354e-05,
"loss": 0.3312,
"step": 531
},
{
"epoch": 2.9865125240847785,
"grad_norm": 0.1362299448189819,
"learning_rate": 3.3363283648581564e-05,
"loss": 0.3292,
"step": 532
},
{
"epoch": 2.9921177088807145,
"grad_norm": 0.12683117549730533,
"learning_rate": 3.3208626035927265e-05,
"loss": 0.3306,
"step": 533
},
{
"epoch": 2.997722893676651,
"grad_norm": 0.10969694374220536,
"learning_rate": 3.305407289332279e-05,
"loss": 0.3331,
"step": 534
},
{
"epoch": 3.00525486074619,
"grad_norm": 0.3592894801843749,
"learning_rate": 3.289962659822146e-05,
"loss": 0.612,
"step": 535
},
{
"epoch": 3.0108600455421266,
"grad_norm": 0.2839378100457828,
"learning_rate": 3.274528952643296e-05,
"loss": 0.309,
"step": 536
},
{
"epoch": 3.0164652303380626,
"grad_norm": 0.2432146356050859,
"learning_rate": 3.259106405208686e-05,
"loss": 0.3106,
"step": 537
},
{
"epoch": 3.022070415133999,
"grad_norm": 0.2777016538919042,
"learning_rate": 3.2436952547596016e-05,
"loss": 0.3208,
"step": 538
},
{
"epoch": 3.0276755999299354,
"grad_norm": 0.29446096248561565,
"learning_rate": 3.228295738362013e-05,
"loss": 0.3128,
"step": 539
},
{
"epoch": 3.0332807847258714,
"grad_norm": 0.2134321257824934,
"learning_rate": 3.212908092902925e-05,
"loss": 0.3123,
"step": 540
},
{
"epoch": 3.038885969521808,
"grad_norm": 0.2929761811829302,
"learning_rate": 3.1975325550867376e-05,
"loss": 0.3149,
"step": 541
},
{
"epoch": 3.0444911543177438,
"grad_norm": 0.23083582350400958,
"learning_rate": 3.182169361431599e-05,
"loss": 0.3181,
"step": 542
},
{
"epoch": 3.05009633911368,
"grad_norm": 0.22809559973140262,
"learning_rate": 3.1668187482657724e-05,
"loss": 0.3196,
"step": 543
},
{
"epoch": 3.0557015239096166,
"grad_norm": 0.2825820642402612,
"learning_rate": 3.151480951723997e-05,
"loss": 0.3146,
"step": 544
},
{
"epoch": 3.0613067087055525,
"grad_norm": 0.2033350745344508,
"learning_rate": 3.1361562077438575e-05,
"loss": 0.3122,
"step": 545
},
{
"epoch": 3.066911893501489,
"grad_norm": 0.21932897795785233,
"learning_rate": 3.120844752062153e-05,
"loss": 0.3124,
"step": 546
},
{
"epoch": 3.072517078297425,
"grad_norm": 0.21168302798367983,
"learning_rate": 3.1055468202112734e-05,
"loss": 0.3166,
"step": 547
},
{
"epoch": 3.0781222630933613,
"grad_norm": 0.17772176835113396,
"learning_rate": 3.090262647515575e-05,
"loss": 0.3077,
"step": 548
},
{
"epoch": 3.0837274478892978,
"grad_norm": 0.17841507645772034,
"learning_rate": 3.0749924690877606e-05,
"loss": 0.314,
"step": 549
},
{
"epoch": 3.0893326326852337,
"grad_norm": 0.1771081138385722,
"learning_rate": 3.0597365198252605e-05,
"loss": 0.3145,
"step": 550
},
{
"epoch": 3.09493781748117,
"grad_norm": 0.17292167901780775,
"learning_rate": 3.044495034406623e-05,
"loss": 0.3141,
"step": 551
},
{
"epoch": 3.100543002277106,
"grad_norm": 0.15958589623983585,
"learning_rate": 3.0292682472879016e-05,
"loss": 0.309,
"step": 552
},
{
"epoch": 3.1061481870730425,
"grad_norm": 0.17823919421432835,
"learning_rate": 3.014056392699051e-05,
"loss": 0.3107,
"step": 553
},
{
"epoch": 3.111753371868979,
"grad_norm": 0.14347972248545185,
"learning_rate": 2.998859704640321e-05,
"loss": 0.3122,
"step": 554
},
{
"epoch": 3.117358556664915,
"grad_norm": 0.15502920693257438,
"learning_rate": 2.9836784168786587e-05,
"loss": 0.3079,
"step": 555
},
{
"epoch": 3.1229637414608513,
"grad_norm": 0.14905838975522384,
"learning_rate": 2.968512762944112e-05,
"loss": 0.3085,
"step": 556
},
{
"epoch": 3.1285689262567877,
"grad_norm": 0.15067799733981543,
"learning_rate": 2.953362976126238e-05,
"loss": 0.3173,
"step": 557
},
{
"epoch": 3.1341741110527237,
"grad_norm": 0.1352871810582152,
"learning_rate": 2.9382292894705137e-05,
"loss": 0.3168,
"step": 558
},
{
"epoch": 3.13977929584866,
"grad_norm": 0.13712672403520984,
"learning_rate": 2.9231119357747514e-05,
"loss": 0.3096,
"step": 559
},
{
"epoch": 3.145384480644596,
"grad_norm": 0.13040410642011227,
"learning_rate": 2.908011147585516e-05,
"loss": 0.313,
"step": 560
},
{
"epoch": 3.1509896654405325,
"grad_norm": 0.11413659831512099,
"learning_rate": 2.8929271571945504e-05,
"loss": 0.3173,
"step": 561
},
{
"epoch": 3.156594850236469,
"grad_norm": 0.13404522891478707,
"learning_rate": 2.8778601966352028e-05,
"loss": 0.3129,
"step": 562
},
{
"epoch": 3.162200035032405,
"grad_norm": 0.12169309736493167,
"learning_rate": 2.8628104976788527e-05,
"loss": 0.3144,
"step": 563
},
{
"epoch": 3.1678052198283413,
"grad_norm": 0.12136151456869423,
"learning_rate": 2.8477782918313495e-05,
"loss": 0.3101,
"step": 564
},
{
"epoch": 3.1734104046242773,
"grad_norm": 0.12720103445578,
"learning_rate": 2.83276381032945e-05,
"loss": 0.3064,
"step": 565
},
{
"epoch": 3.1790155894202137,
"grad_norm": 0.11658578152683795,
"learning_rate": 2.8177672841372642e-05,
"loss": 0.3104,
"step": 566
},
{
"epoch": 3.18462077421615,
"grad_norm": 0.1297562392547841,
"learning_rate": 2.802788943942697e-05,
"loss": 0.3123,
"step": 567
},
{
"epoch": 3.190225959012086,
"grad_norm": 0.11985456340120923,
"learning_rate": 2.787829020153904e-05,
"loss": 0.3146,
"step": 568
},
{
"epoch": 3.1958311438080225,
"grad_norm": 0.1262651870939076,
"learning_rate": 2.772887742895745e-05,
"loss": 0.3075,
"step": 569
},
{
"epoch": 3.2014363286039584,
"grad_norm": 0.12464097287272473,
"learning_rate": 2.7579653420062444e-05,
"loss": 0.3045,
"step": 570
},
{
"epoch": 3.207041513399895,
"grad_norm": 0.12341347110151742,
"learning_rate": 2.7430620470330588e-05,
"loss": 0.3052,
"step": 571
},
{
"epoch": 3.2126466981958313,
"grad_norm": 0.11689537607575821,
"learning_rate": 2.7281780872299397e-05,
"loss": 0.3092,
"step": 572
},
{
"epoch": 3.2182518829917672,
"grad_norm": 0.12733447393829486,
"learning_rate": 2.7133136915532117e-05,
"loss": 0.3126,
"step": 573
},
{
"epoch": 3.2238570677877036,
"grad_norm": 0.1123962660816928,
"learning_rate": 2.69846908865825e-05,
"loss": 0.3125,
"step": 574
},
{
"epoch": 3.22946225258364,
"grad_norm": 0.13480471605245697,
"learning_rate": 2.68364450689596e-05,
"loss": 0.3186,
"step": 575
},
{
"epoch": 3.235067437379576,
"grad_norm": 0.11275478327505037,
"learning_rate": 2.6688401743092704e-05,
"loss": 0.3172,
"step": 576
},
{
"epoch": 3.2406726221755124,
"grad_norm": 0.12484323278884911,
"learning_rate": 2.6540563186296186e-05,
"loss": 0.3102,
"step": 577
},
{
"epoch": 3.2462778069714484,
"grad_norm": 0.11197370815895867,
"learning_rate": 2.639293167273453e-05,
"loss": 0.3031,
"step": 578
},
{
"epoch": 3.251882991767385,
"grad_norm": 0.11454623610347744,
"learning_rate": 2.6245509473387296e-05,
"loss": 0.3065,
"step": 579
},
{
"epoch": 3.2574881765633212,
"grad_norm": 0.12304422921255855,
"learning_rate": 2.609829885601425e-05,
"loss": 0.3089,
"step": 580
},
{
"epoch": 3.263093361359257,
"grad_norm": 0.11259423111356101,
"learning_rate": 2.5951302085120437e-05,
"loss": 0.3105,
"step": 581
},
{
"epoch": 3.2686985461551936,
"grad_norm": 0.11271478685448443,
"learning_rate": 2.5804521421921305e-05,
"loss": 0.3068,
"step": 582
},
{
"epoch": 3.2743037309511296,
"grad_norm": 0.11647961636111298,
"learning_rate": 2.5657959124308036e-05,
"loss": 0.316,
"step": 583
},
{
"epoch": 3.279908915747066,
"grad_norm": 0.12470089799540612,
"learning_rate": 2.551161744681271e-05,
"loss": 0.3122,
"step": 584
},
{
"epoch": 3.2855141005430024,
"grad_norm": 0.09459288821151796,
"learning_rate": 2.5365498640573675e-05,
"loss": 0.3082,
"step": 585
},
{
"epoch": 3.2911192853389384,
"grad_norm": 0.12027588781638404,
"learning_rate": 2.5219604953300897e-05,
"loss": 0.3104,
"step": 586
},
{
"epoch": 3.296724470134875,
"grad_norm": 0.09693184441523361,
"learning_rate": 2.5073938629241404e-05,
"loss": 0.3113,
"step": 587
},
{
"epoch": 3.302329654930811,
"grad_norm": 0.11431466893797337,
"learning_rate": 2.4928501909144735e-05,
"loss": 0.3122,
"step": 588
},
{
"epoch": 3.307934839726747,
"grad_norm": 0.10947655679887262,
"learning_rate": 2.4783297030228504e-05,
"loss": 0.3059,
"step": 589
},
{
"epoch": 3.3135400245226836,
"grad_norm": 0.10724694744795145,
"learning_rate": 2.4638326226143955e-05,
"loss": 0.3059,
"step": 590
},
{
"epoch": 3.3191452093186196,
"grad_norm": 0.10607719365259763,
"learning_rate": 2.449359172694161e-05,
"loss": 0.3126,
"step": 591
},
{
"epoch": 3.324750394114556,
"grad_norm": 0.10720345042194546,
"learning_rate": 2.4349095759037e-05,
"loss": 0.3089,
"step": 592
},
{
"epoch": 3.3303555789104924,
"grad_norm": 0.10273817164660135,
"learning_rate": 2.4204840545176356e-05,
"loss": 0.3108,
"step": 593
},
{
"epoch": 3.3359607637064284,
"grad_norm": 0.09975537559192163,
"learning_rate": 2.406082830440247e-05,
"loss": 0.3124,
"step": 594
},
{
"epoch": 3.3415659485023648,
"grad_norm": 0.09406955914926102,
"learning_rate": 2.3917061252020513e-05,
"loss": 0.316,
"step": 595
},
{
"epoch": 3.3471711332983007,
"grad_norm": 0.09940842953248977,
"learning_rate": 2.3773541599564016e-05,
"loss": 0.3127,
"step": 596
},
{
"epoch": 3.352776318094237,
"grad_norm": 0.10553838056133437,
"learning_rate": 2.36302715547608e-05,
"loss": 0.3057,
"step": 597
},
{
"epoch": 3.3583815028901736,
"grad_norm": 0.10680543765646766,
"learning_rate": 2.3487253321499025e-05,
"loss": 0.3064,
"step": 598
},
{
"epoch": 3.3639866876861095,
"grad_norm": 0.10212325660144811,
"learning_rate": 2.3344489099793298e-05,
"loss": 0.3054,
"step": 599
},
{
"epoch": 3.369591872482046,
"grad_norm": 0.09345936279910022,
"learning_rate": 2.3201981085750848e-05,
"loss": 0.3091,
"step": 600
},
{
"epoch": 3.3751970572779824,
"grad_norm": 0.10692872809212166,
"learning_rate": 2.3059731471537692e-05,
"loss": 0.3057,
"step": 601
},
{
"epoch": 3.3808022420739183,
"grad_norm": 0.10292193319438646,
"learning_rate": 2.2917742445344957e-05,
"loss": 0.3039,
"step": 602
},
{
"epoch": 3.3864074268698547,
"grad_norm": 0.1044728564683231,
"learning_rate": 2.2776016191355247e-05,
"loss": 0.307,
"step": 603
},
{
"epoch": 3.3920126116657907,
"grad_norm": 0.11121222286242763,
"learning_rate": 2.2634554889708946e-05,
"loss": 0.3146,
"step": 604
},
{
"epoch": 3.397617796461727,
"grad_norm": 0.0953250187025364,
"learning_rate": 2.2493360716470778e-05,
"loss": 0.3088,
"step": 605
},
{
"epoch": 3.403222981257663,
"grad_norm": 0.10907994857009719,
"learning_rate": 2.2352435843596276e-05,
"loss": 0.3122,
"step": 606
},
{
"epoch": 3.4088281660535995,
"grad_norm": 0.09794843878191668,
"learning_rate": 2.2211782438898403e-05,
"loss": 0.3072,
"step": 607
},
{
"epoch": 3.414433350849536,
"grad_norm": 0.1106427273961921,
"learning_rate": 2.207140266601419e-05,
"loss": 0.3173,
"step": 608
},
{
"epoch": 3.420038535645472,
"grad_norm": 0.09676869359844141,
"learning_rate": 2.193129868437145e-05,
"loss": 0.3097,
"step": 609
},
{
"epoch": 3.4256437204414083,
"grad_norm": 0.1127315347743384,
"learning_rate": 2.179147264915558e-05,
"loss": 0.3087,
"step": 610
},
{
"epoch": 3.4312489052373447,
"grad_norm": 0.10860744939106713,
"learning_rate": 2.1651926711276374e-05,
"loss": 0.3064,
"step": 611
},
{
"epoch": 3.4368540900332807,
"grad_norm": 0.10854328769381996,
"learning_rate": 2.1512663017334994e-05,
"loss": 0.3098,
"step": 612
},
{
"epoch": 3.442459274829217,
"grad_norm": 0.11762452038097213,
"learning_rate": 2.1373683709590873e-05,
"loss": 0.3115,
"step": 613
},
{
"epoch": 3.4480644596251535,
"grad_norm": 0.10805927729310495,
"learning_rate": 2.1234990925928827e-05,
"loss": 0.3078,
"step": 614
},
{
"epoch": 3.4536696444210895,
"grad_norm": 0.1210574300692429,
"learning_rate": 2.1096586799826123e-05,
"loss": 0.3131,
"step": 615
},
{
"epoch": 3.459274829217026,
"grad_norm": 0.11322408230361472,
"learning_rate": 2.0958473460319685e-05,
"loss": 0.3045,
"step": 616
},
{
"epoch": 3.464880014012962,
"grad_norm": 0.12288017952430454,
"learning_rate": 2.0820653031973363e-05,
"loss": 0.3004,
"step": 617
},
{
"epoch": 3.4704851988088983,
"grad_norm": 0.11467508943459158,
"learning_rate": 2.0683127634845155e-05,
"loss": 0.3118,
"step": 618
},
{
"epoch": 3.4760903836048342,
"grad_norm": 0.10089539729462121,
"learning_rate": 2.0545899384454753e-05,
"loss": 0.3115,
"step": 619
},
{
"epoch": 3.4816955684007707,
"grad_norm": 0.10903963497738692,
"learning_rate": 2.040897039175087e-05,
"loss": 0.3183,
"step": 620
},
{
"epoch": 3.487300753196707,
"grad_norm": 0.0960280103883697,
"learning_rate": 2.0272342763078806e-05,
"loss": 0.3168,
"step": 621
},
{
"epoch": 3.492905937992643,
"grad_norm": 0.1019907001338773,
"learning_rate": 2.0136018600148065e-05,
"loss": 0.314,
"step": 622
},
{
"epoch": 3.4985111227885795,
"grad_norm": 0.09703216086704151,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.3114,
"step": 623
},
{
"epoch": 3.504116307584516,
"grad_norm": 0.09927974827651981,
"learning_rate": 1.9864289054975595e-05,
"loss": 0.3066,
"step": 624
},
{
"epoch": 3.509721492380452,
"grad_norm": 0.10115048078252437,
"learning_rate": 1.9728887852683204e-05,
"loss": 0.3063,
"step": 625
},
{
"epoch": 3.5153266771763882,
"grad_norm": 0.09617916095602932,
"learning_rate": 1.959379847596652e-05,
"loss": 0.3078,
"step": 626
},
{
"epoch": 3.5209318619723247,
"grad_norm": 0.10018831202713846,
"learning_rate": 1.9459023002872466e-05,
"loss": 0.306,
"step": 627
},
{
"epoch": 3.5265370467682606,
"grad_norm": 0.09046907662038928,
"learning_rate": 1.9324563506619323e-05,
"loss": 0.3093,
"step": 628
},
{
"epoch": 3.532142231564197,
"grad_norm": 0.09707258005221424,
"learning_rate": 1.9190422055564716e-05,
"loss": 0.3068,
"step": 629
},
{
"epoch": 3.537747416360133,
"grad_norm": 0.08790868545557047,
"learning_rate": 1.9056600713173884e-05,
"loss": 0.3063,
"step": 630
},
{
"epoch": 3.5433526011560694,
"grad_norm": 0.09522724255822446,
"learning_rate": 1.8923101537987906e-05,
"loss": 0.3071,
"step": 631
},
{
"epoch": 3.5489577859520054,
"grad_norm": 0.09275069555664794,
"learning_rate": 1.878992658359205e-05,
"loss": 0.3082,
"step": 632
},
{
"epoch": 3.554562970747942,
"grad_norm": 0.09662779046438905,
"learning_rate": 1.865707789858416e-05,
"loss": 0.317,
"step": 633
},
{
"epoch": 3.560168155543878,
"grad_norm": 0.09038439246839879,
"learning_rate": 1.852455752654318e-05,
"loss": 0.3095,
"step": 634
},
{
"epoch": 3.565773340339814,
"grad_norm": 0.0892351312327684,
"learning_rate": 1.839236750599767e-05,
"loss": 0.3099,
"step": 635
},
{
"epoch": 3.5713785251357506,
"grad_norm": 0.0901544987475721,
"learning_rate": 1.8260509870394475e-05,
"loss": 0.3145,
"step": 636
},
{
"epoch": 3.576983709931687,
"grad_norm": 0.08859989195760687,
"learning_rate": 1.8128986648067487e-05,
"loss": 0.3054,
"step": 637
},
{
"epoch": 3.582588894727623,
"grad_norm": 0.09128402171545218,
"learning_rate": 1.7997799862206346e-05,
"loss": 0.3121,
"step": 638
},
{
"epoch": 3.5881940795235594,
"grad_norm": 0.09185126057088099,
"learning_rate": 1.78669515308254e-05,
"loss": 0.3103,
"step": 639
},
{
"epoch": 3.5937992643194954,
"grad_norm": 0.0893997681385203,
"learning_rate": 1.7736443666732626e-05,
"loss": 0.3099,
"step": 640
},
{
"epoch": 3.599404449115432,
"grad_norm": 0.0927092479814224,
"learning_rate": 1.7606278277498674e-05,
"loss": 0.3096,
"step": 641
},
{
"epoch": 3.6050096339113678,
"grad_norm": 0.09322798178482147,
"learning_rate": 1.747645736542599e-05,
"loss": 0.312,
"step": 642
},
{
"epoch": 3.610614818707304,
"grad_norm": 0.09305885545146318,
"learning_rate": 1.7346982927518014e-05,
"loss": 0.3121,
"step": 643
},
{
"epoch": 3.6162200035032406,
"grad_norm": 0.09354862097700213,
"learning_rate": 1.721785695544846e-05,
"loss": 0.3084,
"step": 644
},
{
"epoch": 3.6218251882991765,
"grad_norm": 0.08879212707112467,
"learning_rate": 1.7089081435530667e-05,
"loss": 0.3103,
"step": 645
},
{
"epoch": 3.627430373095113,
"grad_norm": 0.08777213540858403,
"learning_rate": 1.6960658348687046e-05,
"loss": 0.3094,
"step": 646
},
{
"epoch": 3.6330355578910494,
"grad_norm": 0.09446494446048785,
"learning_rate": 1.683258967041866e-05,
"loss": 0.3099,
"step": 647
},
{
"epoch": 3.6386407426869853,
"grad_norm": 0.0856866484739207,
"learning_rate": 1.6704877370774748e-05,
"loss": 0.3046,
"step": 648
},
{
"epoch": 3.6442459274829218,
"grad_norm": 0.08653060673663804,
"learning_rate": 1.6577523414322478e-05,
"loss": 0.3039,
"step": 649
},
{
"epoch": 3.649851112278858,
"grad_norm": 0.0914995782178045,
"learning_rate": 1.6450529760116705e-05,
"loss": 0.3115,
"step": 650
},
{
"epoch": 3.655456297074794,
"grad_norm": 0.08842063602461302,
"learning_rate": 1.6323898361669857e-05,
"loss": 0.3099,
"step": 651
},
{
"epoch": 3.6610614818707305,
"grad_norm": 0.09196505054199886,
"learning_rate": 1.6197631166921856e-05,
"loss": 0.3059,
"step": 652
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.08826582262682107,
"learning_rate": 1.6071730118210173e-05,
"loss": 0.3065,
"step": 653
},
{
"epoch": 3.672271851462603,
"grad_norm": 0.08601459200435725,
"learning_rate": 1.594619715223994e-05,
"loss": 0.3083,
"step": 654
},
{
"epoch": 3.677877036258539,
"grad_norm": 0.09329501275849174,
"learning_rate": 1.5821034200054176e-05,
"loss": 0.3116,
"step": 655
},
{
"epoch": 3.6834822210544753,
"grad_norm": 0.08578241057447392,
"learning_rate": 1.569624318700405e-05,
"loss": 0.3111,
"step": 656
},
{
"epoch": 3.6890874058504117,
"grad_norm": 0.08212635956097417,
"learning_rate": 1.5571826032719287e-05,
"loss": 0.3103,
"step": 657
},
{
"epoch": 3.6946925906463477,
"grad_norm": 0.0920585030532975,
"learning_rate": 1.5447784651078642e-05,
"loss": 0.3119,
"step": 658
},
{
"epoch": 3.700297775442284,
"grad_norm": 0.08381481949938734,
"learning_rate": 1.532412095018044e-05,
"loss": 0.3053,
"step": 659
},
{
"epoch": 3.7059029602382205,
"grad_norm": 0.08430489721177843,
"learning_rate": 1.5200836832313246e-05,
"loss": 0.308,
"step": 660
},
{
"epoch": 3.7115081450341565,
"grad_norm": 0.09318737393858478,
"learning_rate": 1.5077934193926584e-05,
"loss": 0.3125,
"step": 661
},
{
"epoch": 3.717113329830093,
"grad_norm": 0.09089782801509312,
"learning_rate": 1.4955414925601757e-05,
"loss": 0.316,
"step": 662
},
{
"epoch": 3.7227185146260293,
"grad_norm": 0.08583258468324842,
"learning_rate": 1.4833280912022834e-05,
"loss": 0.3064,
"step": 663
},
{
"epoch": 3.7283236994219653,
"grad_norm": 0.08591690878371737,
"learning_rate": 1.4711534031947543e-05,
"loss": 0.3194,
"step": 664
},
{
"epoch": 3.7339288842179017,
"grad_norm": 0.08916277837335898,
"learning_rate": 1.459017615817846e-05,
"loss": 0.3096,
"step": 665
},
{
"epoch": 3.7395340690138377,
"grad_norm": 0.08697008778870499,
"learning_rate": 1.4469209157534172e-05,
"loss": 0.3042,
"step": 666
},
{
"epoch": 3.745139253809774,
"grad_norm": 0.08513292108575704,
"learning_rate": 1.4348634890820554e-05,
"loss": 0.3042,
"step": 667
},
{
"epoch": 3.75074443860571,
"grad_norm": 0.09258652498297838,
"learning_rate": 1.4228455212802149e-05,
"loss": 0.3081,
"step": 668
},
{
"epoch": 3.7563496234016465,
"grad_norm": 0.08525936977928056,
"learning_rate": 1.4108671972173644e-05,
"loss": 0.3109,
"step": 669
},
{
"epoch": 3.761954808197583,
"grad_norm": 0.08470431515893076,
"learning_rate": 1.3989287011531425e-05,
"loss": 0.312,
"step": 670
},
{
"epoch": 3.767559992993519,
"grad_norm": 0.09004964227688511,
"learning_rate": 1.3870302167345222e-05,
"loss": 0.3079,
"step": 671
},
{
"epoch": 3.7731651777894553,
"grad_norm": 0.08825735283568481,
"learning_rate": 1.3751719269929908e-05,
"loss": 0.3049,
"step": 672
},
{
"epoch": 3.7787703625853917,
"grad_norm": 0.08533456832494438,
"learning_rate": 1.3633540143417268e-05,
"loss": 0.3046,
"step": 673
},
{
"epoch": 3.7843755473813276,
"grad_norm": 0.09333040420463685,
"learning_rate": 1.3515766605727984e-05,
"loss": 0.3056,
"step": 674
},
{
"epoch": 3.789980732177264,
"grad_norm": 0.09109816447279726,
"learning_rate": 1.3398400468543682e-05,
"loss": 0.3138,
"step": 675
},
{
"epoch": 3.7955859169732005,
"grad_norm": 0.08953063350846481,
"learning_rate": 1.328144353727903e-05,
"loss": 0.3095,
"step": 676
},
{
"epoch": 3.8011911017691364,
"grad_norm": 0.08970232410085,
"learning_rate": 1.3164897611053981e-05,
"loss": 0.3092,
"step": 677
},
{
"epoch": 3.806796286565073,
"grad_norm": 0.08560717524353414,
"learning_rate": 1.3048764482666112e-05,
"loss": 0.3068,
"step": 678
},
{
"epoch": 3.812401471361009,
"grad_norm": 0.0870128302022409,
"learning_rate": 1.2933045938563012e-05,
"loss": 0.3103,
"step": 679
},
{
"epoch": 3.8180066561569452,
"grad_norm": 0.08488653192608872,
"learning_rate": 1.281774375881482e-05,
"loss": 0.3092,
"step": 680
},
{
"epoch": 3.823611840952881,
"grad_norm": 0.08327605535332809,
"learning_rate": 1.2702859717086886e-05,
"loss": 0.3077,
"step": 681
},
{
"epoch": 3.8292170257488176,
"grad_norm": 0.082880372307135,
"learning_rate": 1.2588395580612392e-05,
"loss": 0.3115,
"step": 682
},
{
"epoch": 3.834822210544754,
"grad_norm": 0.08662923231375559,
"learning_rate": 1.247435311016525e-05,
"loss": 0.3066,
"step": 683
},
{
"epoch": 3.84042739534069,
"grad_norm": 0.08229450378457145,
"learning_rate": 1.2360734060032967e-05,
"loss": 0.3053,
"step": 684
},
{
"epoch": 3.8460325801366264,
"grad_norm": 0.08665772393719716,
"learning_rate": 1.2247540177989695e-05,
"loss": 0.3047,
"step": 685
},
{
"epoch": 3.851637764932563,
"grad_norm": 0.08865321374818363,
"learning_rate": 1.2134773205269323e-05,
"loss": 0.3132,
"step": 686
},
{
"epoch": 3.857242949728499,
"grad_norm": 0.09256990438194068,
"learning_rate": 1.2022434876538696e-05,
"loss": 0.3098,
"step": 687
},
{
"epoch": 3.862848134524435,
"grad_norm": 0.08775878742002668,
"learning_rate": 1.191052691987094e-05,
"loss": 0.308,
"step": 688
},
{
"epoch": 3.8684533193203716,
"grad_norm": 0.09049516342589015,
"learning_rate": 1.1799051056718844e-05,
"loss": 0.308,
"step": 689
},
{
"epoch": 3.8740585041163076,
"grad_norm": 0.0850218500678186,
"learning_rate": 1.1688009001888475e-05,
"loss": 0.3082,
"step": 690
},
{
"epoch": 3.8796636889122436,
"grad_norm": 0.09015284370086712,
"learning_rate": 1.1577402463512652e-05,
"loss": 0.3125,
"step": 691
},
{
"epoch": 3.88526887370818,
"grad_norm": 0.08936579824727202,
"learning_rate": 1.1467233143024803e-05,
"loss": 0.2996,
"step": 692
},
{
"epoch": 3.8908740585041164,
"grad_norm": 0.08641714653740455,
"learning_rate": 1.1357502735132715e-05,
"loss": 0.3085,
"step": 693
},
{
"epoch": 3.8964792433000524,
"grad_norm": 0.08002468185657431,
"learning_rate": 1.1248212927792502e-05,
"loss": 0.3074,
"step": 694
},
{
"epoch": 3.9020844280959888,
"grad_norm": 0.08225416168538756,
"learning_rate": 1.1139365402182625e-05,
"loss": 0.3056,
"step": 695
},
{
"epoch": 3.907689612891925,
"grad_norm": 0.08346850523992327,
"learning_rate": 1.1030961832678014e-05,
"loss": 0.309,
"step": 696
},
{
"epoch": 3.913294797687861,
"grad_norm": 0.08024780002044651,
"learning_rate": 1.0923003886824382e-05,
"loss": 0.3063,
"step": 697
},
{
"epoch": 3.9188999824837976,
"grad_norm": 0.09339560984140326,
"learning_rate": 1.081549322531247e-05,
"loss": 0.3029,
"step": 698
},
{
"epoch": 3.924505167279734,
"grad_norm": 0.08375958744991581,
"learning_rate": 1.0708431501952567e-05,
"loss": 0.3181,
"step": 699
},
{
"epoch": 3.93011035207567,
"grad_norm": 0.07819955008853643,
"learning_rate": 1.060182036364907e-05,
"loss": 0.3072,
"step": 700
},
{
"epoch": 3.9357155368716064,
"grad_norm": 0.07835982764160071,
"learning_rate": 1.0495661450375114e-05,
"loss": 0.302,
"step": 701
},
{
"epoch": 3.9413207216675423,
"grad_norm": 0.08325758169200392,
"learning_rate": 1.0389956395147389e-05,
"loss": 0.3062,
"step": 702
},
{
"epoch": 3.9469259064634787,
"grad_norm": 0.08333976071773525,
"learning_rate": 1.0284706824000983e-05,
"loss": 0.3146,
"step": 703
},
{
"epoch": 3.9525310912594147,
"grad_norm": 0.0786546369775888,
"learning_rate": 1.0179914355964384e-05,
"loss": 0.3099,
"step": 704
},
{
"epoch": 3.958136276055351,
"grad_norm": 0.0826606587187586,
"learning_rate": 1.0075580603034569e-05,
"loss": 0.3078,
"step": 705
},
{
"epoch": 3.9637414608512875,
"grad_norm": 0.08157885632921875,
"learning_rate": 9.971707170152243e-06,
"loss": 0.3025,
"step": 706
},
{
"epoch": 3.9693466456472235,
"grad_norm": 0.07569811929437366,
"learning_rate": 9.86829565517709e-06,
"loss": 0.3052,
"step": 707
},
{
"epoch": 3.97495183044316,
"grad_norm": 0.08433452418842283,
"learning_rate": 9.765347648863228e-06,
"loss": 0.3103,
"step": 708
},
{
"epoch": 3.9805570152390963,
"grad_norm": 0.08116174303332119,
"learning_rate": 9.662864734834736e-06,
"loss": 0.3162,
"step": 709
},
{
"epoch": 3.9861622000350323,
"grad_norm": 0.07742272052051664,
"learning_rate": 9.560848489561292e-06,
"loss": 0.3088,
"step": 710
},
{
"epoch": 3.9917673848309687,
"grad_norm": 0.08299995390154452,
"learning_rate": 9.459300482333931e-06,
"loss": 0.3133,
"step": 711
},
{
"epoch": 3.997372569626905,
"grad_norm": 0.0834553917977672,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3094,
"step": 712
},
{
"epoch": 4.005605184795936,
"grad_norm": 0.13513807332008165,
"learning_rate": 9.257615423143566e-06,
"loss": 0.2945,
"step": 713
},
{
"epoch": 4.011210369591873,
"grad_norm": 0.10621089229052157,
"learning_rate": 9.157481473652643e-06,
"loss": 0.2885,
"step": 714
},
{
"epoch": 4.016815554387809,
"grad_norm": 0.08714199459949697,
"learning_rate": 9.05782196710427e-06,
"loss": 0.2958,
"step": 715
},
{
"epoch": 4.022420739183745,
"grad_norm": 0.10547298624299982,
"learning_rate": 8.958638436536322e-06,
"loss": 0.2904,
"step": 716
},
{
"epoch": 4.028025923979682,
"grad_norm": 0.11824374552084228,
"learning_rate": 8.85993240766487e-06,
"loss": 0.2939,
"step": 717
},
{
"epoch": 4.033631108775618,
"grad_norm": 0.11166981880398426,
"learning_rate": 8.761705398860684e-06,
"loss": 0.2879,
"step": 718
},
{
"epoch": 4.039236293571554,
"grad_norm": 0.10093675260999213,
"learning_rate": 8.6639589211259e-06,
"loss": 0.292,
"step": 719
},
{
"epoch": 4.0448414783674895,
"grad_norm": 0.10073956157160133,
"learning_rate": 8.566694478070748e-06,
"loss": 0.2849,
"step": 720
},
{
"epoch": 4.050446663163426,
"grad_norm": 0.10476897069362466,
"learning_rate": 8.469913565890443e-06,
"loss": 0.2929,
"step": 721
},
{
"epoch": 4.056051847959362,
"grad_norm": 0.10098301015704451,
"learning_rate": 8.373617673342154e-06,
"loss": 0.2915,
"step": 722
},
{
"epoch": 4.061657032755298,
"grad_norm": 0.09703917635173606,
"learning_rate": 8.277808281722116e-06,
"loss": 0.2899,
"step": 723
},
{
"epoch": 4.067262217551235,
"grad_norm": 0.10034460587837235,
"learning_rate": 8.182486864842852e-06,
"loss": 0.2917,
"step": 724
},
{
"epoch": 4.072867402347171,
"grad_norm": 0.09179532694160371,
"learning_rate": 8.087654889010475e-06,
"loss": 0.2892,
"step": 725
},
{
"epoch": 4.078472587143107,
"grad_norm": 0.10753395912690455,
"learning_rate": 7.993313813002137e-06,
"loss": 0.2905,
"step": 726
},
{
"epoch": 4.084077771939044,
"grad_norm": 0.08822865040425992,
"learning_rate": 7.899465088043632e-06,
"loss": 0.2861,
"step": 727
},
{
"epoch": 4.08968295673498,
"grad_norm": 0.08713560322111398,
"learning_rate": 7.806110157786978e-06,
"loss": 0.2903,
"step": 728
},
{
"epoch": 4.095288141530916,
"grad_norm": 0.09170567396266732,
"learning_rate": 7.713250458288333e-06,
"loss": 0.2901,
"step": 729
},
{
"epoch": 4.100893326326853,
"grad_norm": 0.09585484257486802,
"learning_rate": 7.620887417985789e-06,
"loss": 0.2887,
"step": 730
},
{
"epoch": 4.106498511122789,
"grad_norm": 0.08818209427141882,
"learning_rate": 7.529022457677504e-06,
"loss": 0.2924,
"step": 731
},
{
"epoch": 4.112103695918725,
"grad_norm": 0.08443201723287494,
"learning_rate": 7.437656990499746e-06,
"loss": 0.2863,
"step": 732
},
{
"epoch": 4.117708880714661,
"grad_norm": 0.08873488187304306,
"learning_rate": 7.346792421905231e-06,
"loss": 0.2938,
"step": 733
},
{
"epoch": 4.1233140655105975,
"grad_norm": 0.08694222494262985,
"learning_rate": 7.2564301496414535e-06,
"loss": 0.2924,
"step": 734
},
{
"epoch": 4.1289192503065335,
"grad_norm": 0.08256227612309179,
"learning_rate": 7.166571563729223e-06,
"loss": 0.2917,
"step": 735
},
{
"epoch": 4.1345244351024695,
"grad_norm": 0.08001663820565025,
"learning_rate": 7.07721804644125e-06,
"loss": 0.285,
"step": 736
},
{
"epoch": 4.140129619898406,
"grad_norm": 0.08149206652526432,
"learning_rate": 6.988370972280911e-06,
"loss": 0.287,
"step": 737
},
{
"epoch": 4.145734804694342,
"grad_norm": 0.0830237803672675,
"learning_rate": 6.900031707961083e-06,
"loss": 0.2929,
"step": 738
},
{
"epoch": 4.151339989490278,
"grad_norm": 0.07889881500439108,
"learning_rate": 6.812201612383132e-06,
"loss": 0.2908,
"step": 739
},
{
"epoch": 4.156945174286215,
"grad_norm": 0.07965629951868874,
"learning_rate": 6.724882036615991e-06,
"loss": 0.2911,
"step": 740
},
{
"epoch": 4.162550359082151,
"grad_norm": 0.08142470834051524,
"learning_rate": 6.638074323875426e-06,
"loss": 0.2845,
"step": 741
},
{
"epoch": 4.168155543878087,
"grad_norm": 0.0790024403007617,
"learning_rate": 6.551779809503305e-06,
"loss": 0.2912,
"step": 742
},
{
"epoch": 4.173760728674024,
"grad_norm": 0.08162794031253676,
"learning_rate": 6.465999820947107e-06,
"loss": 0.287,
"step": 743
},
{
"epoch": 4.17936591346996,
"grad_norm": 0.08060613968699802,
"learning_rate": 6.380735677739474e-06,
"loss": 0.2917,
"step": 744
},
{
"epoch": 4.184971098265896,
"grad_norm": 0.0763576924622887,
"learning_rate": 6.295988691477939e-06,
"loss": 0.2895,
"step": 745
},
{
"epoch": 4.190576283061832,
"grad_norm": 0.07704795721026231,
"learning_rate": 6.2117601658047234e-06,
"loss": 0.2914,
"step": 746
},
{
"epoch": 4.196181467857769,
"grad_norm": 0.08636531460875918,
"learning_rate": 6.128051396386707e-06,
"loss": 0.2908,
"step": 747
},
{
"epoch": 4.201786652653705,
"grad_norm": 0.08428531970028809,
"learning_rate": 6.044863670895473e-06,
"loss": 0.292,
"step": 748
},
{
"epoch": 4.207391837449641,
"grad_norm": 0.07541521932519012,
"learning_rate": 5.962198268987514e-06,
"loss": 0.2956,
"step": 749
},
{
"epoch": 4.2129970222455775,
"grad_norm": 0.08036165670495596,
"learning_rate": 5.880056462284573e-06,
"loss": 0.2955,
"step": 750
},
{
"epoch": 4.2186022070415135,
"grad_norm": 0.07962458343055896,
"learning_rate": 5.798439514354024e-06,
"loss": 0.2904,
"step": 751
},
{
"epoch": 4.224207391837449,
"grad_norm": 0.07557676462990631,
"learning_rate": 5.7173486806894804e-06,
"loss": 0.2943,
"step": 752
},
{
"epoch": 4.229812576633386,
"grad_norm": 0.07611814912098987,
"learning_rate": 5.6367852086914555e-06,
"loss": 0.2921,
"step": 753
},
{
"epoch": 4.235417761429322,
"grad_norm": 0.07591915780682665,
"learning_rate": 5.556750337648207e-06,
"loss": 0.2966,
"step": 754
},
{
"epoch": 4.241022946225258,
"grad_norm": 0.07451638627135487,
"learning_rate": 5.477245298716636e-06,
"loss": 0.2916,
"step": 755
},
{
"epoch": 4.246628131021194,
"grad_norm": 0.07408998865978028,
"learning_rate": 5.398271314903376e-06,
"loss": 0.2922,
"step": 756
},
{
"epoch": 4.252233315817131,
"grad_norm": 0.07545427591965101,
"learning_rate": 5.3198296010459604e-06,
"loss": 0.2894,
"step": 757
},
{
"epoch": 4.257838500613067,
"grad_norm": 0.07784079160936691,
"learning_rate": 5.241921363794143e-06,
"loss": 0.2899,
"step": 758
},
{
"epoch": 4.263443685409003,
"grad_norm": 0.07434860497601323,
"learning_rate": 5.1645478015913556e-06,
"loss": 0.2938,
"step": 759
},
{
"epoch": 4.26904887020494,
"grad_norm": 0.07524905441026238,
"learning_rate": 5.0877101046562335e-06,
"loss": 0.2925,
"step": 760
},
{
"epoch": 4.274654055000876,
"grad_norm": 0.07534375562369973,
"learning_rate": 5.011409454964336e-06,
"loss": 0.2956,
"step": 761
},
{
"epoch": 4.280259239796812,
"grad_norm": 0.07564823418534802,
"learning_rate": 4.935647026229951e-06,
"loss": 0.2897,
"step": 762
},
{
"epoch": 4.285864424592749,
"grad_norm": 0.07430360134148153,
"learning_rate": 4.860423983888054e-06,
"loss": 0.2909,
"step": 763
},
{
"epoch": 4.291469609388685,
"grad_norm": 0.07690448982555907,
"learning_rate": 4.785741485076356e-06,
"loss": 0.2921,
"step": 764
},
{
"epoch": 4.297074794184621,
"grad_norm": 0.07333540337330373,
"learning_rate": 4.711600678617521e-06,
"loss": 0.2924,
"step": 765
},
{
"epoch": 4.302679978980557,
"grad_norm": 0.0707515459460907,
"learning_rate": 4.6380027050015165e-06,
"loss": 0.2897,
"step": 766
},
{
"epoch": 4.308285163776493,
"grad_norm": 0.07535393854931703,
"learning_rate": 4.564948696368014e-06,
"loss": 0.2941,
"step": 767
},
{
"epoch": 4.313890348572429,
"grad_norm": 0.07963087040482336,
"learning_rate": 4.492439776489024e-06,
"loss": 0.2928,
"step": 768
},
{
"epoch": 4.319495533368365,
"grad_norm": 0.07387271710468704,
"learning_rate": 4.420477060751575e-06,
"loss": 0.292,
"step": 769
},
{
"epoch": 4.325100718164302,
"grad_norm": 0.07736925416730672,
"learning_rate": 4.349061656140583e-06,
"loss": 0.2944,
"step": 770
},
{
"epoch": 4.330705902960238,
"grad_norm": 0.07529231788326105,
"learning_rate": 4.278194661221804e-06,
"loss": 0.2879,
"step": 771
},
{
"epoch": 4.336311087756174,
"grad_norm": 0.08026949246594235,
"learning_rate": 4.207877166124936e-06,
"loss": 0.2917,
"step": 772
},
{
"epoch": 4.341916272552111,
"grad_norm": 0.07620416762621339,
"learning_rate": 4.138110252526866e-06,
"loss": 0.291,
"step": 773
},
{
"epoch": 4.347521457348047,
"grad_norm": 0.07422185414389502,
"learning_rate": 4.068894993635009e-06,
"loss": 0.2907,
"step": 774
},
{
"epoch": 4.353126642143983,
"grad_norm": 0.07226310970169152,
"learning_rate": 4.000232454170827e-06,
"loss": 0.2875,
"step": 775
},
{
"epoch": 4.35873182693992,
"grad_norm": 0.07432275041631858,
"learning_rate": 3.932123690353425e-06,
"loss": 0.2885,
"step": 776
},
{
"epoch": 4.364337011735856,
"grad_norm": 0.0732145587763894,
"learning_rate": 3.8645697498833e-06,
"loss": 0.298,
"step": 777
},
{
"epoch": 4.369942196531792,
"grad_norm": 0.07336339349758086,
"learning_rate": 3.7975716719262522e-06,
"loss": 0.2892,
"step": 778
},
{
"epoch": 4.375547381327728,
"grad_norm": 0.07442021970479001,
"learning_rate": 3.7311304870973807e-06,
"loss": 0.2899,
"step": 779
},
{
"epoch": 4.3811525661236645,
"grad_norm": 0.07353220474247239,
"learning_rate": 3.6652472174452337e-06,
"loss": 0.2877,
"step": 780
},
{
"epoch": 4.3867577509196005,
"grad_norm": 0.0715587497335768,
"learning_rate": 3.599922876436077e-06,
"loss": 0.289,
"step": 781
},
{
"epoch": 4.3923629357155365,
"grad_norm": 0.06874108127193815,
"learning_rate": 3.535158468938331e-06,
"loss": 0.2923,
"step": 782
},
{
"epoch": 4.397968120511473,
"grad_norm": 0.0731981866264841,
"learning_rate": 3.4709549912070693e-06,
"loss": 0.2895,
"step": 783
},
{
"epoch": 4.403573305307409,
"grad_norm": 0.07134091663650302,
"learning_rate": 3.4073134308687574e-06,
"loss": 0.2946,
"step": 784
},
{
"epoch": 4.409178490103345,
"grad_norm": 0.07158473138919103,
"learning_rate": 3.3442347669059917e-06,
"loss": 0.2888,
"step": 785
},
{
"epoch": 4.414783674899282,
"grad_norm": 0.0705765283738734,
"learning_rate": 3.2817199696424785e-06,
"loss": 0.2921,
"step": 786
},
{
"epoch": 4.420388859695218,
"grad_norm": 0.0713673125862577,
"learning_rate": 3.219770000728102e-06,
"loss": 0.2897,
"step": 787
},
{
"epoch": 4.425994044491154,
"grad_norm": 0.07163002908771758,
"learning_rate": 3.1583858131241274e-06,
"loss": 0.2931,
"step": 788
},
{
"epoch": 4.431599229287091,
"grad_norm": 0.07068386839729027,
"learning_rate": 3.0975683510885512e-06,
"loss": 0.2915,
"step": 789
},
{
"epoch": 4.437204414083027,
"grad_norm": 0.06828223728970015,
"learning_rate": 3.0373185501615655e-06,
"loss": 0.2863,
"step": 790
},
{
"epoch": 4.442809598878963,
"grad_norm": 0.06925305783885854,
"learning_rate": 2.97763733715116e-06,
"loss": 0.286,
"step": 791
},
{
"epoch": 4.448414783674899,
"grad_norm": 0.07093825781423818,
"learning_rate": 2.9185256301188782e-06,
"loss": 0.2918,
"step": 792
},
{
"epoch": 4.454019968470836,
"grad_norm": 0.07292024935590449,
"learning_rate": 2.8599843383657178e-06,
"loss": 0.2849,
"step": 793
},
{
"epoch": 4.459625153266772,
"grad_norm": 0.07136882672592504,
"learning_rate": 2.8020143624180796e-06,
"loss": 0.2941,
"step": 794
},
{
"epoch": 4.465230338062708,
"grad_norm": 0.07262486143198338,
"learning_rate": 2.744616594013976e-06,
"loss": 0.29,
"step": 795
},
{
"epoch": 4.4708355228586445,
"grad_norm": 0.07044266557562419,
"learning_rate": 2.6877919160892817e-06,
"loss": 0.286,
"step": 796
},
{
"epoch": 4.4764407076545805,
"grad_norm": 0.07076171900766302,
"learning_rate": 2.631541202764161e-06,
"loss": 0.2913,
"step": 797
},
{
"epoch": 4.482045892450516,
"grad_norm": 0.07266466034845562,
"learning_rate": 2.5758653193296244e-06,
"loss": 0.2966,
"step": 798
},
{
"epoch": 4.487651077246453,
"grad_norm": 0.06891412203515149,
"learning_rate": 2.520765122234212e-06,
"loss": 0.2938,
"step": 799
},
{
"epoch": 4.493256262042389,
"grad_norm": 0.06927383792607966,
"learning_rate": 2.4662414590708216e-06,
"loss": 0.2886,
"step": 800
},
{
"epoch": 4.498861446838325,
"grad_norm": 0.06986747799982956,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.2895,
"step": 801
},
{
"epoch": 4.504466631634262,
"grad_norm": 0.07085917182268424,
"learning_rate": 2.3589270805553842e-06,
"loss": 0.293,
"step": 802
},
{
"epoch": 4.510071816430198,
"grad_norm": 0.07134101177324086,
"learning_rate": 2.3061380159942593e-06,
"loss": 0.2919,
"step": 803
},
{
"epoch": 4.515677001226134,
"grad_norm": 0.07152411549915325,
"learning_rate": 2.2539287869215974e-06,
"loss": 0.2909,
"step": 804
},
{
"epoch": 4.52128218602207,
"grad_norm": 0.0709029110171059,
"learning_rate": 2.2023001964592485e-06,
"loss": 0.2909,
"step": 805
},
{
"epoch": 4.526887370818007,
"grad_norm": 0.07123396088624603,
"learning_rate": 2.1512530387972187e-06,
"loss": 0.2894,
"step": 806
},
{
"epoch": 4.532492555613943,
"grad_norm": 0.07087391238341717,
"learning_rate": 2.100788099181501e-06,
"loss": 0.292,
"step": 807
},
{
"epoch": 4.538097740409879,
"grad_norm": 0.07244705960873961,
"learning_rate": 2.050906153901946e-06,
"loss": 0.2886,
"step": 808
},
{
"epoch": 4.543702925205816,
"grad_norm": 0.0702625208879413,
"learning_rate": 2.0016079702803683e-06,
"loss": 0.2912,
"step": 809
},
{
"epoch": 4.549308110001752,
"grad_norm": 0.07025143510925803,
"learning_rate": 1.9528943066586993e-06,
"loss": 0.2859,
"step": 810
},
{
"epoch": 4.554913294797688,
"grad_norm": 0.06797065457217236,
"learning_rate": 1.9047659123873475e-06,
"loss": 0.2897,
"step": 811
},
{
"epoch": 4.560518479593624,
"grad_norm": 0.06967626315727225,
"learning_rate": 1.8572235278136741e-06,
"loss": 0.2896,
"step": 812
},
{
"epoch": 4.56612366438956,
"grad_norm": 0.06946061969739742,
"learning_rate": 1.81026788427058e-06,
"loss": 0.2919,
"step": 813
},
{
"epoch": 4.571728849185496,
"grad_norm": 0.06956148278872504,
"learning_rate": 1.7638997040652928e-06,
"loss": 0.2865,
"step": 814
},
{
"epoch": 4.577334033981433,
"grad_norm": 0.06903042275149272,
"learning_rate": 1.7181197004682148e-06,
"loss": 0.2987,
"step": 815
},
{
"epoch": 4.582939218777369,
"grad_norm": 0.07027061783424711,
"learning_rate": 1.6729285777019776e-06,
"loss": 0.2943,
"step": 816
},
{
"epoch": 4.588544403573305,
"grad_norm": 0.07497684616923145,
"learning_rate": 1.6283270309306098e-06,
"loss": 0.2954,
"step": 817
},
{
"epoch": 4.594149588369241,
"grad_norm": 0.06928091037284238,
"learning_rate": 1.58431574624883e-06,
"loss": 0.2978,
"step": 818
},
{
"epoch": 4.599754773165178,
"grad_norm": 0.06987516073546424,
"learning_rate": 1.5408954006715004e-06,
"loss": 0.2906,
"step": 819
},
{
"epoch": 4.605359957961114,
"grad_norm": 0.0679280004765429,
"learning_rate": 1.4980666621232076e-06,
"loss": 0.2913,
"step": 820
},
{
"epoch": 4.61096514275705,
"grad_norm": 0.06872535211929137,
"learning_rate": 1.4558301894279958e-06,
"loss": 0.2929,
"step": 821
},
{
"epoch": 4.616570327552987,
"grad_norm": 0.07416106638882912,
"learning_rate": 1.4141866322992237e-06,
"loss": 0.2965,
"step": 822
},
{
"epoch": 4.622175512348923,
"grad_norm": 0.06811896916091317,
"learning_rate": 1.3731366313295858e-06,
"loss": 0.2885,
"step": 823
},
{
"epoch": 4.627780697144859,
"grad_norm": 0.0695579136671645,
"learning_rate": 1.3326808179812266e-06,
"loss": 0.291,
"step": 824
},
{
"epoch": 4.633385881940796,
"grad_norm": 0.06863496179473963,
"learning_rate": 1.292819814576065e-06,
"loss": 0.294,
"step": 825
},
{
"epoch": 4.638991066736732,
"grad_norm": 0.0705807081382788,
"learning_rate": 1.253554234286196e-06,
"loss": 0.2904,
"step": 826
},
{
"epoch": 4.6445962515326675,
"grad_norm": 0.06663794364232753,
"learning_rate": 1.214884681124473e-06,
"loss": 0.2925,
"step": 827
},
{
"epoch": 4.650201436328604,
"grad_norm": 0.06891427767384369,
"learning_rate": 1.1768117499351983e-06,
"loss": 0.2947,
"step": 828
},
{
"epoch": 4.65580662112454,
"grad_norm": 0.06804227143364437,
"learning_rate": 1.1393360263849895e-06,
"loss": 0.296,
"step": 829
},
{
"epoch": 4.661411805920476,
"grad_norm": 0.06729466113720775,
"learning_rate": 1.1024580869537682e-06,
"loss": 0.2895,
"step": 830
},
{
"epoch": 4.667016990716412,
"grad_norm": 0.06981525289632151,
"learning_rate": 1.0661784989258784e-06,
"loss": 0.2952,
"step": 831
},
{
"epoch": 4.672622175512349,
"grad_norm": 0.06887481964028351,
"learning_rate": 1.0304978203813864e-06,
"loss": 0.2942,
"step": 832
},
{
"epoch": 4.678227360308285,
"grad_norm": 0.06799466764149216,
"learning_rate": 9.954166001874665e-07,
"loss": 0.2911,
"step": 833
},
{
"epoch": 4.683832545104221,
"grad_norm": 0.06713373354236589,
"learning_rate": 9.609353779899711e-07,
"loss": 0.2937,
"step": 834
},
{
"epoch": 4.689437729900158,
"grad_norm": 0.06801377589396243,
"learning_rate": 9.270546842051398e-07,
"loss": 0.2917,
"step": 835
},
{
"epoch": 4.695042914696094,
"grad_norm": 0.06877475417334285,
"learning_rate": 8.937750400114243e-07,
"loss": 0.2951,
"step": 836
},
{
"epoch": 4.70064809949203,
"grad_norm": 0.0682963203858977,
"learning_rate": 8.610969573414762e-07,
"loss": 0.2851,
"step": 837
},
{
"epoch": 4.706253284287967,
"grad_norm": 0.06874191377961358,
"learning_rate": 8.290209388742698e-07,
"loss": 0.2923,
"step": 838
},
{
"epoch": 4.711858469083903,
"grad_norm": 0.06818766825685828,
"learning_rate": 7.975474780273828e-07,
"loss": 0.2903,
"step": 839
},
{
"epoch": 4.717463653879839,
"grad_norm": 0.06850750083385908,
"learning_rate": 7.666770589493854e-07,
"loss": 0.2912,
"step": 840
},
{
"epoch": 4.7230688386757755,
"grad_norm": 0.06669019358923452,
"learning_rate": 7.36410156512406e-07,
"loss": 0.2886,
"step": 841
},
{
"epoch": 4.7286740234717115,
"grad_norm": 0.0685257652819839,
"learning_rate": 7.0674723630483e-07,
"loss": 0.2955,
"step": 842
},
{
"epoch": 4.7342792082676475,
"grad_norm": 0.06869894321372189,
"learning_rate": 6.776887546241196e-07,
"loss": 0.2894,
"step": 843
},
{
"epoch": 4.7398843930635834,
"grad_norm": 0.0658977357878887,
"learning_rate": 6.492351584698231e-07,
"loss": 0.29,
"step": 844
},
{
"epoch": 4.74548957785952,
"grad_norm": 0.06766778529150386,
"learning_rate": 6.213868855366656e-07,
"loss": 0.2919,
"step": 845
},
{
"epoch": 4.751094762655456,
"grad_norm": 0.06789970654207798,
"learning_rate": 5.94144364207847e-07,
"loss": 0.2848,
"step": 846
},
{
"epoch": 4.756699947451392,
"grad_norm": 0.06763905592382854,
"learning_rate": 5.675080135484212e-07,
"loss": 0.2919,
"step": 847
},
{
"epoch": 4.762305132247329,
"grad_norm": 0.0673637188944257,
"learning_rate": 5.41478243298883e-07,
"loss": 0.2895,
"step": 848
},
{
"epoch": 4.767910317043265,
"grad_norm": 0.06599718582014574,
"learning_rate": 5.160554538688356e-07,
"loss": 0.2866,
"step": 849
},
{
"epoch": 4.773515501839201,
"grad_norm": 0.06680161975531686,
"learning_rate": 4.912400363308534e-07,
"loss": 0.2905,
"step": 850
},
{
"epoch": 4.779120686635137,
"grad_norm": 0.06774042383141317,
"learning_rate": 4.670323724144599e-07,
"loss": 0.29,
"step": 851
},
{
"epoch": 4.784725871431074,
"grad_norm": 0.06766365842045775,
"learning_rate": 4.434328345002348e-07,
"loss": 0.2893,
"step": 852
},
{
"epoch": 4.79033105622701,
"grad_norm": 0.0669757827435882,
"learning_rate": 4.204417856141252e-07,
"loss": 0.2934,
"step": 853
},
{
"epoch": 4.795936241022947,
"grad_norm": 0.06783350979091353,
"learning_rate": 3.980595794218278e-07,
"loss": 0.2933,
"step": 854
},
{
"epoch": 4.801541425818883,
"grad_norm": 0.06756472881490112,
"learning_rate": 3.762865602233623e-07,
"loss": 0.2938,
"step": 855
},
{
"epoch": 4.807146610614819,
"grad_norm": 0.06573406645601809,
"learning_rate": 3.551230629477731e-07,
"loss": 0.2838,
"step": 856
},
{
"epoch": 4.812751795410755,
"grad_norm": 0.06500971296841233,
"learning_rate": 3.3456941314798264e-07,
"loss": 0.2858,
"step": 857
},
{
"epoch": 4.8183569802066915,
"grad_norm": 0.06666567495190365,
"learning_rate": 3.14625926995773e-07,
"loss": 0.2928,
"step": 858
},
{
"epoch": 4.823962165002627,
"grad_norm": 0.06842156859320352,
"learning_rate": 2.9529291127693204e-07,
"loss": 0.2978,
"step": 859
},
{
"epoch": 4.829567349798563,
"grad_norm": 0.06770694825005188,
"learning_rate": 2.765706633865195e-07,
"loss": 0.2937,
"step": 860
},
{
"epoch": 4.8351725345945,
"grad_norm": 0.06561834235318093,
"learning_rate": 2.584594713243105e-07,
"loss": 0.2885,
"step": 861
},
{
"epoch": 4.840777719390436,
"grad_norm": 0.06847439054641216,
"learning_rate": 2.409596136903636e-07,
"loss": 0.2934,
"step": 862
},
{
"epoch": 4.846382904186372,
"grad_norm": 0.06925202861994563,
"learning_rate": 2.2407135968072203e-07,
"loss": 0.2912,
"step": 863
},
{
"epoch": 4.851988088982308,
"grad_norm": 0.06568658183290636,
"learning_rate": 2.0779496908327034e-07,
"loss": 0.2865,
"step": 864
},
{
"epoch": 4.857593273778245,
"grad_norm": 0.06527850349185818,
"learning_rate": 1.9213069227376423e-07,
"loss": 0.285,
"step": 865
},
{
"epoch": 4.863198458574181,
"grad_norm": 0.06537692688098144,
"learning_rate": 1.7707877021195364e-07,
"loss": 0.2893,
"step": 866
},
{
"epoch": 4.868803643370118,
"grad_norm": 0.0673736030444187,
"learning_rate": 1.6263943443788344e-07,
"loss": 0.2929,
"step": 867
},
{
"epoch": 4.874408828166054,
"grad_norm": 0.06944665125489001,
"learning_rate": 1.488129070683364e-07,
"loss": 0.2891,
"step": 868
},
{
"epoch": 4.88001401296199,
"grad_norm": 0.06656997632932934,
"learning_rate": 1.355994007934136e-07,
"loss": 0.2897,
"step": 869
},
{
"epoch": 4.885619197757926,
"grad_norm": 0.06760741888344543,
"learning_rate": 1.229991188732571e-07,
"loss": 0.2976,
"step": 870
},
{
"epoch": 4.891224382553863,
"grad_norm": 0.06541550926939739,
"learning_rate": 1.1101225513493685e-07,
"loss": 0.2867,
"step": 871
},
{
"epoch": 4.896829567349799,
"grad_norm": 0.06563519047490654,
"learning_rate": 9.963899396944865e-08,
"loss": 0.2908,
"step": 872
},
{
"epoch": 4.9024347521457345,
"grad_norm": 0.0667333344033991,
"learning_rate": 8.887951032889863e-08,
"loss": 0.2927,
"step": 873
},
{
"epoch": 4.908039936941671,
"grad_norm": 0.06534766168074065,
"learning_rate": 7.873396972379876e-08,
"loss": 0.287,
"step": 874
},
{
"epoch": 4.913645121737607,
"grad_norm": 0.06573554633063639,
"learning_rate": 6.920252822053109e-08,
"loss": 0.2935,
"step": 875
},
{
"epoch": 4.919250306533543,
"grad_norm": 0.06692957578454926,
"learning_rate": 6.028533243893186e-08,
"loss": 0.297,
"step": 876
},
{
"epoch": 4.924855491329479,
"grad_norm": 0.06696022392708174,
"learning_rate": 5.19825195500534e-08,
"loss": 0.2933,
"step": 877
},
{
"epoch": 4.930460676125416,
"grad_norm": 0.06551574218774454,
"learning_rate": 4.429421727403682e-08,
"loss": 0.2934,
"step": 878
},
{
"epoch": 4.936065860921352,
"grad_norm": 0.06734924333505671,
"learning_rate": 3.722054387816698e-08,
"loss": 0.2887,
"step": 879
},
{
"epoch": 4.941671045717289,
"grad_norm": 0.06685929779911977,
"learning_rate": 3.076160817503393e-08,
"loss": 0.2919,
"step": 880
},
{
"epoch": 4.947276230513225,
"grad_norm": 0.0657094812782885,
"learning_rate": 2.491750952087202e-08,
"loss": 0.2899,
"step": 881
},
{
"epoch": 4.952881415309161,
"grad_norm": 0.06618212935511,
"learning_rate": 1.968833781402335e-08,
"loss": 0.2934,
"step": 882
},
{
"epoch": 4.958486600105097,
"grad_norm": 0.06522788950148393,
"learning_rate": 1.5074173493565548e-08,
"loss": 0.2889,
"step": 883
},
{
"epoch": 4.964091784901034,
"grad_norm": 0.06617327426574475,
"learning_rate": 1.1075087538059415e-08,
"loss": 0.2869,
"step": 884
},
{
"epoch": 4.96969696969697,
"grad_norm": 0.06616131705312475,
"learning_rate": 7.69114146446981e-09,
"loss": 0.2862,
"step": 885
},
{
"epoch": 4.975302154492906,
"grad_norm": 0.06628519866860673,
"learning_rate": 4.922387327219724e-09,
"loss": 0.2946,
"step": 886
},
{
"epoch": 4.9809073392888426,
"grad_norm": 0.06516030532915469,
"learning_rate": 2.7688677173687285e-09,
"loss": 0.2916,
"step": 887
},
{
"epoch": 4.9865125240847785,
"grad_norm": 0.06490676692933119,
"learning_rate": 1.2306157619956793e-09,
"loss": 0.2929,
"step": 888
},
{
"epoch": 4.9921177088807145,
"grad_norm": 0.06796682705221838,
"learning_rate": 3.0765512364361317e-10,
"loss": 0.2958,
"step": 889
},
{
"epoch": 4.9977228936766505,
"grad_norm": 0.06629486855855717,
"learning_rate": 0.0,
"loss": 0.2924,
"step": 890
},
{
"epoch": 4.9977228936766505,
"step": 890,
"total_flos": 2.367212535366969e+19,
"train_loss": 0.0,
"train_runtime": 1.7461,
"train_samples_per_second": 261530.086,
"train_steps_per_second": 509.716
}
],
"logging_steps": 1,
"max_steps": 890,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.367212535366969e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}