{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.06842285323297982,
"eval_steps": 500,
"global_step": 650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010526592805073818,
"grad_norm": 2.6939258575439453,
"learning_rate": 0.0,
"loss": 0.8515,
"step": 1
},
{
"epoch": 0.00021053185610147635,
"grad_norm": 2.7966604232788086,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.8166,
"step": 2
},
{
"epoch": 0.00031579778415221455,
"grad_norm": 2.257108211517334,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.8018,
"step": 3
},
{
"epoch": 0.0004210637122029527,
"grad_norm": 1.1128956079483032,
"learning_rate": 5e-05,
"loss": 0.5497,
"step": 4
},
{
"epoch": 0.0005263296402536909,
"grad_norm": 1.2874521017074585,
"learning_rate": 4.999473462510531e-05,
"loss": 0.6127,
"step": 5
},
{
"epoch": 0.0006315955683044291,
"grad_norm": 0.9699161648750305,
"learning_rate": 4.998946925021062e-05,
"loss": 0.6407,
"step": 6
},
{
"epoch": 0.0007368614963551673,
"grad_norm": 1.0630613565444946,
"learning_rate": 4.998420387531593e-05,
"loss": 0.4727,
"step": 7
},
{
"epoch": 0.0008421274244059054,
"grad_norm": 0.882173478603363,
"learning_rate": 4.997893850042124e-05,
"loss": 0.5235,
"step": 8
},
{
"epoch": 0.0009473933524566436,
"grad_norm": 0.676689088344574,
"learning_rate": 4.997367312552654e-05,
"loss": 0.4341,
"step": 9
},
{
"epoch": 0.0010526592805073817,
"grad_norm": 0.7519457936286926,
"learning_rate": 4.996840775063184e-05,
"loss": 0.4338,
"step": 10
},
{
"epoch": 0.00115792520855812,
"grad_norm": 0.7073312401771545,
"learning_rate": 4.996314237573715e-05,
"loss": 0.3924,
"step": 11
},
{
"epoch": 0.0012631911366088582,
"grad_norm": 0.7425239086151123,
"learning_rate": 4.995787700084246e-05,
"loss": 0.4859,
"step": 12
},
{
"epoch": 0.0013684570646595963,
"grad_norm": 0.6777500510215759,
"learning_rate": 4.995261162594777e-05,
"loss": 0.4235,
"step": 13
},
{
"epoch": 0.0014737229927103345,
"grad_norm": 0.6901292204856873,
"learning_rate": 4.994734625105308e-05,
"loss": 0.4709,
"step": 14
},
{
"epoch": 0.0015789889207610726,
"grad_norm": 0.8694287538528442,
"learning_rate": 4.994208087615839e-05,
"loss": 0.5086,
"step": 15
},
{
"epoch": 0.0016842548488118108,
"grad_norm": 0.6798275113105774,
"learning_rate": 4.99368155012637e-05,
"loss": 0.4937,
"step": 16
},
{
"epoch": 0.001789520776862549,
"grad_norm": 0.7667484879493713,
"learning_rate": 4.9931550126369e-05,
"loss": 0.4974,
"step": 17
},
{
"epoch": 0.0018947867049132871,
"grad_norm": 0.6613733172416687,
"learning_rate": 4.992628475147431e-05,
"loss": 0.4181,
"step": 18
},
{
"epoch": 0.0020000526329640254,
"grad_norm": 0.7069230079650879,
"learning_rate": 4.992101937657962e-05,
"loss": 0.4834,
"step": 19
},
{
"epoch": 0.0021053185610147634,
"grad_norm": 0.5691242814064026,
"learning_rate": 4.991575400168492e-05,
"loss": 0.4405,
"step": 20
},
{
"epoch": 0.002210584489065502,
"grad_norm": 0.701371431350708,
"learning_rate": 4.991048862679023e-05,
"loss": 0.3933,
"step": 21
},
{
"epoch": 0.00231585041711624,
"grad_norm": 0.5670080780982971,
"learning_rate": 4.990522325189554e-05,
"loss": 0.5061,
"step": 22
},
{
"epoch": 0.002421116345166978,
"grad_norm": 0.6001436114311218,
"learning_rate": 4.9899957877000847e-05,
"loss": 0.4879,
"step": 23
},
{
"epoch": 0.0025263822732177164,
"grad_norm": 0.6185859441757202,
"learning_rate": 4.9894692502106156e-05,
"loss": 0.4006,
"step": 24
},
{
"epoch": 0.0026316482012684545,
"grad_norm": 0.677813708782196,
"learning_rate": 4.988942712721146e-05,
"loss": 0.4347,
"step": 25
},
{
"epoch": 0.0027369141293191925,
"grad_norm": 0.5357967019081116,
"learning_rate": 4.988416175231677e-05,
"loss": 0.4594,
"step": 26
},
{
"epoch": 0.002842180057369931,
"grad_norm": 0.5995861291885376,
"learning_rate": 4.9878896377422076e-05,
"loss": 0.4947,
"step": 27
},
{
"epoch": 0.002947445985420669,
"grad_norm": 0.5909422636032104,
"learning_rate": 4.9873631002527385e-05,
"loss": 0.5316,
"step": 28
},
{
"epoch": 0.003052711913471407,
"grad_norm": 0.6816675662994385,
"learning_rate": 4.9868365627632694e-05,
"loss": 0.4705,
"step": 29
},
{
"epoch": 0.003157977841522145,
"grad_norm": 0.5410743355751038,
"learning_rate": 4.9863100252737996e-05,
"loss": 0.4229,
"step": 30
},
{
"epoch": 0.0032632437695728836,
"grad_norm": 0.7362250089645386,
"learning_rate": 4.9857834877843305e-05,
"loss": 0.4922,
"step": 31
},
{
"epoch": 0.0033685096976236216,
"grad_norm": 0.7518715262413025,
"learning_rate": 4.9852569502948614e-05,
"loss": 0.3942,
"step": 32
},
{
"epoch": 0.0034737756256743597,
"grad_norm": 0.6200836300849915,
"learning_rate": 4.9847304128053916e-05,
"loss": 0.3937,
"step": 33
},
{
"epoch": 0.003579041553725098,
"grad_norm": 0.6816834807395935,
"learning_rate": 4.9842038753159225e-05,
"loss": 0.492,
"step": 34
},
{
"epoch": 0.003684307481775836,
"grad_norm": 0.6341183185577393,
"learning_rate": 4.9836773378264534e-05,
"loss": 0.5873,
"step": 35
},
{
"epoch": 0.0037895734098265742,
"grad_norm": 0.5888874530792236,
"learning_rate": 4.9831508003369843e-05,
"loss": 0.3784,
"step": 36
},
{
"epoch": 0.0038948393378773127,
"grad_norm": 0.503926157951355,
"learning_rate": 4.982624262847515e-05,
"loss": 0.5116,
"step": 37
},
{
"epoch": 0.004000105265928051,
"grad_norm": 0.6205700039863586,
"learning_rate": 4.982097725358046e-05,
"loss": 0.4237,
"step": 38
},
{
"epoch": 0.004105371193978789,
"grad_norm": 0.47863858938217163,
"learning_rate": 4.9815711878685764e-05,
"loss": 0.5397,
"step": 39
},
{
"epoch": 0.004210637122029527,
"grad_norm": 0.5036730766296387,
"learning_rate": 4.981044650379107e-05,
"loss": 0.46,
"step": 40
},
{
"epoch": 0.004315903050080265,
"grad_norm": 0.4822523593902588,
"learning_rate": 4.9805181128896375e-05,
"loss": 0.4988,
"step": 41
},
{
"epoch": 0.004421168978131004,
"grad_norm": 0.5173696875572205,
"learning_rate": 4.9799915754001684e-05,
"loss": 0.4003,
"step": 42
},
{
"epoch": 0.004526434906181742,
"grad_norm": 0.6021311283111572,
"learning_rate": 4.979465037910699e-05,
"loss": 0.4306,
"step": 43
},
{
"epoch": 0.00463170083423248,
"grad_norm": 0.5137932300567627,
"learning_rate": 4.97893850042123e-05,
"loss": 0.4453,
"step": 44
},
{
"epoch": 0.004736966762283218,
"grad_norm": 0.5420482158660889,
"learning_rate": 4.978411962931761e-05,
"loss": 0.5377,
"step": 45
},
{
"epoch": 0.004842232690333956,
"grad_norm": 0.5643067359924316,
"learning_rate": 4.977885425442292e-05,
"loss": 0.4519,
"step": 46
},
{
"epoch": 0.004947498618384694,
"grad_norm": 0.5466287136077881,
"learning_rate": 4.977358887952823e-05,
"loss": 0.4221,
"step": 47
},
{
"epoch": 0.005052764546435433,
"grad_norm": 0.5712279677391052,
"learning_rate": 4.976832350463354e-05,
"loss": 0.4987,
"step": 48
},
{
"epoch": 0.005158030474486171,
"grad_norm": 0.4822379946708679,
"learning_rate": 4.976305812973884e-05,
"loss": 0.4848,
"step": 49
},
{
"epoch": 0.005263296402536909,
"grad_norm": 0.5017122626304626,
"learning_rate": 4.975779275484414e-05,
"loss": 0.4196,
"step": 50
},
{
"epoch": 0.005368562330587647,
"grad_norm": 0.4559021592140198,
"learning_rate": 4.975252737994945e-05,
"loss": 0.4412,
"step": 51
},
{
"epoch": 0.005473828258638385,
"grad_norm": 0.5421490669250488,
"learning_rate": 4.974726200505476e-05,
"loss": 0.3746,
"step": 52
},
{
"epoch": 0.005579094186689123,
"grad_norm": 0.46819037199020386,
"learning_rate": 4.974199663016007e-05,
"loss": 0.4521,
"step": 53
},
{
"epoch": 0.005684360114739862,
"grad_norm": 0.45857539772987366,
"learning_rate": 4.973673125526538e-05,
"loss": 0.3941,
"step": 54
},
{
"epoch": 0.0057896260427906,
"grad_norm": 0.5490565896034241,
"learning_rate": 4.973146588037069e-05,
"loss": 0.4551,
"step": 55
},
{
"epoch": 0.005894891970841338,
"grad_norm": 0.5232876539230347,
"learning_rate": 4.9726200505475997e-05,
"loss": 0.4356,
"step": 56
},
{
"epoch": 0.006000157898892076,
"grad_norm": 0.5434950590133667,
"learning_rate": 4.97209351305813e-05,
"loss": 0.436,
"step": 57
},
{
"epoch": 0.006105423826942814,
"grad_norm": 0.44252631068229675,
"learning_rate": 4.971566975568661e-05,
"loss": 0.4263,
"step": 58
},
{
"epoch": 0.006210689754993552,
"grad_norm": 0.49957412481307983,
"learning_rate": 4.971040438079192e-05,
"loss": 0.4422,
"step": 59
},
{
"epoch": 0.00631595568304429,
"grad_norm": 0.46676474809646606,
"learning_rate": 4.970513900589722e-05,
"loss": 0.434,
"step": 60
},
{
"epoch": 0.006421221611095029,
"grad_norm": 0.5107528567314148,
"learning_rate": 4.969987363100253e-05,
"loss": 0.5225,
"step": 61
},
{
"epoch": 0.006526487539145767,
"grad_norm": 0.4967051148414612,
"learning_rate": 4.969460825610784e-05,
"loss": 0.4199,
"step": 62
},
{
"epoch": 0.006631753467196505,
"grad_norm": 0.4968240559101105,
"learning_rate": 4.9689342881213146e-05,
"loss": 0.4157,
"step": 63
},
{
"epoch": 0.006737019395247243,
"grad_norm": 0.5468823909759521,
"learning_rate": 4.9684077506318455e-05,
"loss": 0.4204,
"step": 64
},
{
"epoch": 0.006842285323297981,
"grad_norm": 0.49830362200737,
"learning_rate": 4.967881213142376e-05,
"loss": 0.4381,
"step": 65
},
{
"epoch": 0.006947551251348719,
"grad_norm": 0.6488986611366272,
"learning_rate": 4.9673546756529066e-05,
"loss": 0.5919,
"step": 66
},
{
"epoch": 0.007052817179399458,
"grad_norm": 0.5668662190437317,
"learning_rate": 4.9668281381634375e-05,
"loss": 0.4089,
"step": 67
},
{
"epoch": 0.007158083107450196,
"grad_norm": 0.5446314811706543,
"learning_rate": 4.9663016006739684e-05,
"loss": 0.4116,
"step": 68
},
{
"epoch": 0.007263349035500934,
"grad_norm": 0.5011276006698608,
"learning_rate": 4.9657750631844993e-05,
"loss": 0.4808,
"step": 69
},
{
"epoch": 0.007368614963551672,
"grad_norm": 0.7226698398590088,
"learning_rate": 4.9652485256950296e-05,
"loss": 0.4645,
"step": 70
},
{
"epoch": 0.00747388089160241,
"grad_norm": 0.47501352429389954,
"learning_rate": 4.9647219882055605e-05,
"loss": 0.5207,
"step": 71
},
{
"epoch": 0.0075791468196531485,
"grad_norm": 0.4856880307197571,
"learning_rate": 4.9641954507160914e-05,
"loss": 0.4878,
"step": 72
},
{
"epoch": 0.007684412747703887,
"grad_norm": 0.5225908756256104,
"learning_rate": 4.9636689132266216e-05,
"loss": 0.514,
"step": 73
},
{
"epoch": 0.007789678675754625,
"grad_norm": 0.526539146900177,
"learning_rate": 4.9631423757371525e-05,
"loss": 0.4572,
"step": 74
},
{
"epoch": 0.007894944603805363,
"grad_norm": 0.49719616770744324,
"learning_rate": 4.9626158382476834e-05,
"loss": 0.4352,
"step": 75
},
{
"epoch": 0.008000210531856101,
"grad_norm": 0.5542761087417603,
"learning_rate": 4.962089300758214e-05,
"loss": 0.5408,
"step": 76
},
{
"epoch": 0.00810547645990684,
"grad_norm": 0.4830870032310486,
"learning_rate": 4.961562763268745e-05,
"loss": 0.3941,
"step": 77
},
{
"epoch": 0.008210742387957578,
"grad_norm": 0.504296600818634,
"learning_rate": 4.961036225779276e-05,
"loss": 0.4609,
"step": 78
},
{
"epoch": 0.008316008316008316,
"grad_norm": 0.5107358694076538,
"learning_rate": 4.960509688289807e-05,
"loss": 0.4313,
"step": 79
},
{
"epoch": 0.008421274244059054,
"grad_norm": 0.5339490175247192,
"learning_rate": 4.959983150800337e-05,
"loss": 0.5046,
"step": 80
},
{
"epoch": 0.008526540172109793,
"grad_norm": 0.4737516939640045,
"learning_rate": 4.9594566133108675e-05,
"loss": 0.5728,
"step": 81
},
{
"epoch": 0.00863180610016053,
"grad_norm": 0.4952607750892639,
"learning_rate": 4.9589300758213984e-05,
"loss": 0.4464,
"step": 82
},
{
"epoch": 0.008737072028211269,
"grad_norm": 0.5083893537521362,
"learning_rate": 4.958403538331929e-05,
"loss": 0.4429,
"step": 83
},
{
"epoch": 0.008842337956262008,
"grad_norm": 0.5977057218551636,
"learning_rate": 4.95787700084246e-05,
"loss": 0.4367,
"step": 84
},
{
"epoch": 0.008947603884312745,
"grad_norm": 0.6330780982971191,
"learning_rate": 4.957350463352991e-05,
"loss": 0.4736,
"step": 85
},
{
"epoch": 0.009052869812363484,
"grad_norm": 0.4460638761520386,
"learning_rate": 4.956823925863522e-05,
"loss": 0.4933,
"step": 86
},
{
"epoch": 0.00915813574041422,
"grad_norm": 0.508703351020813,
"learning_rate": 4.956297388374053e-05,
"loss": 0.5794,
"step": 87
},
{
"epoch": 0.00926340166846496,
"grad_norm": 0.45487043261528015,
"learning_rate": 4.955770850884584e-05,
"loss": 0.5156,
"step": 88
},
{
"epoch": 0.009368667596515699,
"grad_norm": 0.46359360218048096,
"learning_rate": 4.955244313395114e-05,
"loss": 0.4634,
"step": 89
},
{
"epoch": 0.009473933524566436,
"grad_norm": 0.5234309434890747,
"learning_rate": 4.954717775905645e-05,
"loss": 0.383,
"step": 90
},
{
"epoch": 0.009579199452617175,
"grad_norm": 0.5344865322113037,
"learning_rate": 4.954191238416175e-05,
"loss": 0.4619,
"step": 91
},
{
"epoch": 0.009684465380667912,
"grad_norm": 0.6055357456207275,
"learning_rate": 4.953664700926706e-05,
"loss": 0.495,
"step": 92
},
{
"epoch": 0.00978973130871865,
"grad_norm": 0.4749431014060974,
"learning_rate": 4.953138163437237e-05,
"loss": 0.5209,
"step": 93
},
{
"epoch": 0.009894997236769388,
"grad_norm": 0.4775514602661133,
"learning_rate": 4.952611625947768e-05,
"loss": 0.4064,
"step": 94
},
{
"epoch": 0.010000263164820127,
"grad_norm": 0.4580100178718567,
"learning_rate": 4.952085088458299e-05,
"loss": 0.4479,
"step": 95
},
{
"epoch": 0.010105529092870866,
"grad_norm": 0.5589710474014282,
"learning_rate": 4.9515585509688296e-05,
"loss": 0.4072,
"step": 96
},
{
"epoch": 0.010210795020921603,
"grad_norm": 0.45461875200271606,
"learning_rate": 4.95103201347936e-05,
"loss": 0.4933,
"step": 97
},
{
"epoch": 0.010316060948972342,
"grad_norm": 0.4552902579307556,
"learning_rate": 4.950505475989891e-05,
"loss": 0.4038,
"step": 98
},
{
"epoch": 0.010421326877023079,
"grad_norm": 0.5590063333511353,
"learning_rate": 4.9499789385004216e-05,
"loss": 0.4928,
"step": 99
},
{
"epoch": 0.010526592805073818,
"grad_norm": 0.5689685344696045,
"learning_rate": 4.949452401010952e-05,
"loss": 0.3962,
"step": 100
},
{
"epoch": 0.010631858733124555,
"grad_norm": 0.5378232598304749,
"learning_rate": 4.948925863521483e-05,
"loss": 0.4843,
"step": 101
},
{
"epoch": 0.010737124661175294,
"grad_norm": 0.6677789688110352,
"learning_rate": 4.948399326032014e-05,
"loss": 0.5839,
"step": 102
},
{
"epoch": 0.010842390589226033,
"grad_norm": 0.4315250515937805,
"learning_rate": 4.9478727885425446e-05,
"loss": 0.5467,
"step": 103
},
{
"epoch": 0.01094765651727677,
"grad_norm": 0.6344457268714905,
"learning_rate": 4.9473462510530755e-05,
"loss": 0.5048,
"step": 104
},
{
"epoch": 0.011052922445327509,
"grad_norm": 0.41527998447418213,
"learning_rate": 4.946819713563606e-05,
"loss": 0.5559,
"step": 105
},
{
"epoch": 0.011158188373378246,
"grad_norm": 0.46887871623039246,
"learning_rate": 4.9462931760741366e-05,
"loss": 0.4165,
"step": 106
},
{
"epoch": 0.011263454301428985,
"grad_norm": 0.5572345852851868,
"learning_rate": 4.9457666385846675e-05,
"loss": 0.4496,
"step": 107
},
{
"epoch": 0.011368720229479724,
"grad_norm": 0.46033406257629395,
"learning_rate": 4.9452401010951984e-05,
"loss": 0.4699,
"step": 108
},
{
"epoch": 0.011473986157530461,
"grad_norm": 0.5205333232879639,
"learning_rate": 4.944713563605729e-05,
"loss": 0.4291,
"step": 109
},
{
"epoch": 0.0115792520855812,
"grad_norm": 0.5044732689857483,
"learning_rate": 4.9441870261162595e-05,
"loss": 0.4424,
"step": 110
},
{
"epoch": 0.011684518013631937,
"grad_norm": 0.5410451889038086,
"learning_rate": 4.9436604886267904e-05,
"loss": 0.3803,
"step": 111
},
{
"epoch": 0.011789783941682676,
"grad_norm": 0.5163026452064514,
"learning_rate": 4.943133951137321e-05,
"loss": 0.4276,
"step": 112
},
{
"epoch": 0.011895049869733413,
"grad_norm": 0.4330487847328186,
"learning_rate": 4.9426074136478516e-05,
"loss": 0.4644,
"step": 113
},
{
"epoch": 0.012000315797784152,
"grad_norm": 0.41046929359436035,
"learning_rate": 4.9420808761583825e-05,
"loss": 0.4161,
"step": 114
},
{
"epoch": 0.012105581725834891,
"grad_norm": 0.4908786714076996,
"learning_rate": 4.9415543386689134e-05,
"loss": 0.4502,
"step": 115
},
{
"epoch": 0.012210847653885628,
"grad_norm": 0.4866664707660675,
"learning_rate": 4.941027801179444e-05,
"loss": 0.3923,
"step": 116
},
{
"epoch": 0.012316113581936367,
"grad_norm": 0.4272409677505493,
"learning_rate": 4.940501263689975e-05,
"loss": 0.4828,
"step": 117
},
{
"epoch": 0.012421379509987104,
"grad_norm": 0.517900288105011,
"learning_rate": 4.939974726200506e-05,
"loss": 0.4661,
"step": 118
},
{
"epoch": 0.012526645438037843,
"grad_norm": 0.5139513611793518,
"learning_rate": 4.939448188711037e-05,
"loss": 0.536,
"step": 119
},
{
"epoch": 0.01263191136608858,
"grad_norm": 0.5204519629478455,
"learning_rate": 4.938921651221567e-05,
"loss": 0.4156,
"step": 120
},
{
"epoch": 0.01273717729413932,
"grad_norm": 0.566659152507782,
"learning_rate": 4.9383951137320974e-05,
"loss": 0.4675,
"step": 121
},
{
"epoch": 0.012842443222190058,
"grad_norm": 0.5262351632118225,
"learning_rate": 4.937868576242628e-05,
"loss": 0.5002,
"step": 122
},
{
"epoch": 0.012947709150240795,
"grad_norm": 0.5888293981552124,
"learning_rate": 4.937342038753159e-05,
"loss": 0.4058,
"step": 123
},
{
"epoch": 0.013052975078291534,
"grad_norm": 0.5911523103713989,
"learning_rate": 4.93681550126369e-05,
"loss": 0.392,
"step": 124
},
{
"epoch": 0.013158241006342272,
"grad_norm": 0.48798367381095886,
"learning_rate": 4.936288963774221e-05,
"loss": 0.4442,
"step": 125
},
{
"epoch": 0.01326350693439301,
"grad_norm": 0.5228798985481262,
"learning_rate": 4.935762426284752e-05,
"loss": 0.4673,
"step": 126
},
{
"epoch": 0.01336877286244375,
"grad_norm": 0.4832141399383545,
"learning_rate": 4.935235888795283e-05,
"loss": 0.4259,
"step": 127
},
{
"epoch": 0.013474038790494487,
"grad_norm": 0.6188245415687561,
"learning_rate": 4.934709351305814e-05,
"loss": 0.4982,
"step": 128
},
{
"epoch": 0.013579304718545225,
"grad_norm": 0.4905821979045868,
"learning_rate": 4.934182813816344e-05,
"loss": 0.4539,
"step": 129
},
{
"epoch": 0.013684570646595963,
"grad_norm": 0.6506298184394836,
"learning_rate": 4.933656276326875e-05,
"loss": 0.3982,
"step": 130
},
{
"epoch": 0.013789836574646702,
"grad_norm": 0.570380687713623,
"learning_rate": 4.933129738837405e-05,
"loss": 0.3901,
"step": 131
},
{
"epoch": 0.013895102502697439,
"grad_norm": 0.44687098264694214,
"learning_rate": 4.932603201347936e-05,
"loss": 0.4176,
"step": 132
},
{
"epoch": 0.014000368430748178,
"grad_norm": 0.6272158622741699,
"learning_rate": 4.932076663858467e-05,
"loss": 0.4455,
"step": 133
},
{
"epoch": 0.014105634358798917,
"grad_norm": 0.6358391046524048,
"learning_rate": 4.931550126368998e-05,
"loss": 0.384,
"step": 134
},
{
"epoch": 0.014210900286849654,
"grad_norm": 0.6558123826980591,
"learning_rate": 4.931023588879529e-05,
"loss": 0.5024,
"step": 135
},
{
"epoch": 0.014316166214900393,
"grad_norm": 0.4577985107898712,
"learning_rate": 4.9304970513900596e-05,
"loss": 0.3906,
"step": 136
},
{
"epoch": 0.01442143214295113,
"grad_norm": 0.5580503344535828,
"learning_rate": 4.92997051390059e-05,
"loss": 0.4589,
"step": 137
},
{
"epoch": 0.014526698071001869,
"grad_norm": 0.5660861134529114,
"learning_rate": 4.929443976411121e-05,
"loss": 0.3913,
"step": 138
},
{
"epoch": 0.014631963999052606,
"grad_norm": 0.49188342690467834,
"learning_rate": 4.9289174389216516e-05,
"loss": 0.3951,
"step": 139
},
{
"epoch": 0.014737229927103345,
"grad_norm": 0.6210848689079285,
"learning_rate": 4.9283909014321825e-05,
"loss": 0.4282,
"step": 140
},
{
"epoch": 0.014842495855154084,
"grad_norm": 0.48430967330932617,
"learning_rate": 4.927864363942713e-05,
"loss": 0.4667,
"step": 141
},
{
"epoch": 0.01494776178320482,
"grad_norm": 0.5269038677215576,
"learning_rate": 4.9273378264532436e-05,
"loss": 0.3845,
"step": 142
},
{
"epoch": 0.01505302771125556,
"grad_norm": 0.5490912199020386,
"learning_rate": 4.9268112889637745e-05,
"loss": 0.4477,
"step": 143
},
{
"epoch": 0.015158293639306297,
"grad_norm": 0.4111802279949188,
"learning_rate": 4.9262847514743054e-05,
"loss": 0.4351,
"step": 144
},
{
"epoch": 0.015263559567357036,
"grad_norm": 0.48929688334465027,
"learning_rate": 4.9257582139848357e-05,
"loss": 0.4512,
"step": 145
},
{
"epoch": 0.015368825495407775,
"grad_norm": 0.9201393723487854,
"learning_rate": 4.9252316764953666e-05,
"loss": 0.5254,
"step": 146
},
{
"epoch": 0.015474091423458512,
"grad_norm": 0.5191910862922668,
"learning_rate": 4.9247051390058975e-05,
"loss": 0.5455,
"step": 147
},
{
"epoch": 0.01557935735150925,
"grad_norm": 0.3562093675136566,
"learning_rate": 4.9241786015164284e-05,
"loss": 0.5303,
"step": 148
},
{
"epoch": 0.015684623279559988,
"grad_norm": 0.7195460796356201,
"learning_rate": 4.923652064026959e-05,
"loss": 0.4389,
"step": 149
},
{
"epoch": 0.015789889207610725,
"grad_norm": 0.448176771402359,
"learning_rate": 4.9231255265374895e-05,
"loss": 0.3987,
"step": 150
},
{
"epoch": 0.015895155135661466,
"grad_norm": 0.48504385352134705,
"learning_rate": 4.9225989890480204e-05,
"loss": 0.4725,
"step": 151
},
{
"epoch": 0.016000421063712203,
"grad_norm": 0.5456967353820801,
"learning_rate": 4.922072451558551e-05,
"loss": 0.5143,
"step": 152
},
{
"epoch": 0.01610568699176294,
"grad_norm": 0.61397784948349,
"learning_rate": 4.9215459140690815e-05,
"loss": 0.4295,
"step": 153
},
{
"epoch": 0.01621095291981368,
"grad_norm": 0.6359485387802124,
"learning_rate": 4.9210193765796124e-05,
"loss": 0.4498,
"step": 154
},
{
"epoch": 0.016316218847864418,
"grad_norm": 0.5002400279045105,
"learning_rate": 4.920492839090143e-05,
"loss": 0.467,
"step": 155
},
{
"epoch": 0.016421484775915155,
"grad_norm": 0.5669925212860107,
"learning_rate": 4.919966301600674e-05,
"loss": 0.5151,
"step": 156
},
{
"epoch": 0.016526750703965892,
"grad_norm": 0.4109033942222595,
"learning_rate": 4.919439764111205e-05,
"loss": 0.4672,
"step": 157
},
{
"epoch": 0.016632016632016633,
"grad_norm": 0.5119397044181824,
"learning_rate": 4.918913226621736e-05,
"loss": 0.4846,
"step": 158
},
{
"epoch": 0.01673728256006737,
"grad_norm": 0.5187058448791504,
"learning_rate": 4.918386689132267e-05,
"loss": 0.4698,
"step": 159
},
{
"epoch": 0.016842548488118107,
"grad_norm": 0.55632483959198,
"learning_rate": 4.917860151642797e-05,
"loss": 0.3904,
"step": 160
},
{
"epoch": 0.016947814416168848,
"grad_norm": 0.5332942008972168,
"learning_rate": 4.917333614153328e-05,
"loss": 0.4253,
"step": 161
},
{
"epoch": 0.017053080344219585,
"grad_norm": 0.5523495078086853,
"learning_rate": 4.916807076663858e-05,
"loss": 0.415,
"step": 162
},
{
"epoch": 0.017158346272270322,
"grad_norm": 0.5162644386291504,
"learning_rate": 4.916280539174389e-05,
"loss": 0.514,
"step": 163
},
{
"epoch": 0.01726361220032106,
"grad_norm": 0.414809912443161,
"learning_rate": 4.91575400168492e-05,
"loss": 0.4757,
"step": 164
},
{
"epoch": 0.0173688781283718,
"grad_norm": 0.5634474754333496,
"learning_rate": 4.915227464195451e-05,
"loss": 0.3643,
"step": 165
},
{
"epoch": 0.017474144056422537,
"grad_norm": 0.5438713431358337,
"learning_rate": 4.914700926705982e-05,
"loss": 0.4315,
"step": 166
},
{
"epoch": 0.017579409984473274,
"grad_norm": 0.49885427951812744,
"learning_rate": 4.914174389216513e-05,
"loss": 0.4697,
"step": 167
},
{
"epoch": 0.017684675912524015,
"grad_norm": 0.46923205256462097,
"learning_rate": 4.913647851727044e-05,
"loss": 0.4189,
"step": 168
},
{
"epoch": 0.017789941840574752,
"grad_norm": 0.4423271119594574,
"learning_rate": 4.913121314237574e-05,
"loss": 0.4602,
"step": 169
},
{
"epoch": 0.01789520776862549,
"grad_norm": 0.6115851402282715,
"learning_rate": 4.912594776748105e-05,
"loss": 0.4399,
"step": 170
},
{
"epoch": 0.018000473696676227,
"grad_norm": 0.5554397106170654,
"learning_rate": 4.912068239258635e-05,
"loss": 0.4262,
"step": 171
},
{
"epoch": 0.018105739624726967,
"grad_norm": 0.565323531627655,
"learning_rate": 4.911541701769166e-05,
"loss": 0.4424,
"step": 172
},
{
"epoch": 0.018211005552777704,
"grad_norm": 0.44236519932746887,
"learning_rate": 4.911015164279697e-05,
"loss": 0.424,
"step": 173
},
{
"epoch": 0.01831627148082844,
"grad_norm": 0.6567726731300354,
"learning_rate": 4.910488626790228e-05,
"loss": 0.4231,
"step": 174
},
{
"epoch": 0.018421537408879182,
"grad_norm": 0.42518746852874756,
"learning_rate": 4.9099620893007586e-05,
"loss": 0.4878,
"step": 175
},
{
"epoch": 0.01852680333692992,
"grad_norm": 0.5739135146141052,
"learning_rate": 4.9094355518112895e-05,
"loss": 0.4514,
"step": 176
},
{
"epoch": 0.018632069264980657,
"grad_norm": 0.628442645072937,
"learning_rate": 4.90890901432182e-05,
"loss": 0.3625,
"step": 177
},
{
"epoch": 0.018737335193031397,
"grad_norm": 0.445872962474823,
"learning_rate": 4.9083824768323507e-05,
"loss": 0.5256,
"step": 178
},
{
"epoch": 0.018842601121082134,
"grad_norm": 0.5037261247634888,
"learning_rate": 4.9078559393428816e-05,
"loss": 0.4322,
"step": 179
},
{
"epoch": 0.01894786704913287,
"grad_norm": 0.5586241483688354,
"learning_rate": 4.9073294018534125e-05,
"loss": 0.5682,
"step": 180
},
{
"epoch": 0.01905313297718361,
"grad_norm": 0.5735304355621338,
"learning_rate": 4.906802864363943e-05,
"loss": 0.4486,
"step": 181
},
{
"epoch": 0.01915839890523435,
"grad_norm": 0.6629624962806702,
"learning_rate": 4.9062763268744736e-05,
"loss": 0.4748,
"step": 182
},
{
"epoch": 0.019263664833285087,
"grad_norm": 0.5536085963249207,
"learning_rate": 4.9057497893850045e-05,
"loss": 0.3779,
"step": 183
},
{
"epoch": 0.019368930761335824,
"grad_norm": 0.37973251938819885,
"learning_rate": 4.9052232518955354e-05,
"loss": 0.4913,
"step": 184
},
{
"epoch": 0.019474196689386564,
"grad_norm": 0.6046680212020874,
"learning_rate": 4.9046967144060656e-05,
"loss": 0.4644,
"step": 185
},
{
"epoch": 0.0195794626174373,
"grad_norm": 0.5051435828208923,
"learning_rate": 4.9041701769165965e-05,
"loss": 0.5042,
"step": 186
},
{
"epoch": 0.01968472854548804,
"grad_norm": 0.5261257290840149,
"learning_rate": 4.9036436394271274e-05,
"loss": 0.4679,
"step": 187
},
{
"epoch": 0.019789994473538776,
"grad_norm": 0.5349376797676086,
"learning_rate": 4.903117101937658e-05,
"loss": 0.4206,
"step": 188
},
{
"epoch": 0.019895260401589517,
"grad_norm": 0.5617197751998901,
"learning_rate": 4.902590564448189e-05,
"loss": 0.3974,
"step": 189
},
{
"epoch": 0.020000526329640254,
"grad_norm": 0.549514889717102,
"learning_rate": 4.90206402695872e-05,
"loss": 0.5034,
"step": 190
},
{
"epoch": 0.02010579225769099,
"grad_norm": 0.6475022435188293,
"learning_rate": 4.9015374894692503e-05,
"loss": 0.4651,
"step": 191
},
{
"epoch": 0.02021105818574173,
"grad_norm": 0.6060453057289124,
"learning_rate": 4.901010951979781e-05,
"loss": 0.3981,
"step": 192
},
{
"epoch": 0.02031632411379247,
"grad_norm": 0.6936651468276978,
"learning_rate": 4.9004844144903115e-05,
"loss": 0.3804,
"step": 193
},
{
"epoch": 0.020421590041843206,
"grad_norm": 0.44638895988464355,
"learning_rate": 4.8999578770008424e-05,
"loss": 0.4596,
"step": 194
},
{
"epoch": 0.020526855969893943,
"grad_norm": 0.5297572612762451,
"learning_rate": 4.899431339511373e-05,
"loss": 0.4385,
"step": 195
},
{
"epoch": 0.020632121897944684,
"grad_norm": 0.5046480894088745,
"learning_rate": 4.898904802021904e-05,
"loss": 0.4557,
"step": 196
},
{
"epoch": 0.02073738782599542,
"grad_norm": 0.5276935696601868,
"learning_rate": 4.898378264532435e-05,
"loss": 0.39,
"step": 197
},
{
"epoch": 0.020842653754046158,
"grad_norm": 0.4923096001148224,
"learning_rate": 4.897851727042966e-05,
"loss": 0.4585,
"step": 198
},
{
"epoch": 0.0209479196820969,
"grad_norm": 0.4554820954799652,
"learning_rate": 4.897325189553497e-05,
"loss": 0.5175,
"step": 199
},
{
"epoch": 0.021053185610147636,
"grad_norm": 0.47559452056884766,
"learning_rate": 4.896798652064027e-05,
"loss": 0.5275,
"step": 200
},
{
"epoch": 0.021158451538198373,
"grad_norm": 0.5070779323577881,
"learning_rate": 4.896272114574558e-05,
"loss": 0.4958,
"step": 201
},
{
"epoch": 0.02126371746624911,
"grad_norm": 0.5040444135665894,
"learning_rate": 4.895745577085088e-05,
"loss": 0.4616,
"step": 202
},
{
"epoch": 0.02136898339429985,
"grad_norm": 0.5290699601173401,
"learning_rate": 4.895219039595619e-05,
"loss": 0.5178,
"step": 203
},
{
"epoch": 0.021474249322350588,
"grad_norm": 0.5007508993148804,
"learning_rate": 4.89469250210615e-05,
"loss": 0.4489,
"step": 204
},
{
"epoch": 0.021579515250401325,
"grad_norm": 0.6373962759971619,
"learning_rate": 4.894165964616681e-05,
"loss": 0.4124,
"step": 205
},
{
"epoch": 0.021684781178452066,
"grad_norm": 0.5132836699485779,
"learning_rate": 4.893639427127212e-05,
"loss": 0.4534,
"step": 206
},
{
"epoch": 0.021790047106502803,
"grad_norm": 0.6253231167793274,
"learning_rate": 4.893112889637743e-05,
"loss": 0.3986,
"step": 207
},
{
"epoch": 0.02189531303455354,
"grad_norm": 0.5937986373901367,
"learning_rate": 4.8925863521482736e-05,
"loss": 0.3956,
"step": 208
},
{
"epoch": 0.022000578962604277,
"grad_norm": 0.4578053951263428,
"learning_rate": 4.892059814658804e-05,
"loss": 0.4068,
"step": 209
},
{
"epoch": 0.022105844890655018,
"grad_norm": 0.5060281157493591,
"learning_rate": 4.891533277169335e-05,
"loss": 0.5179,
"step": 210
},
{
"epoch": 0.022211110818705755,
"grad_norm": 0.561792254447937,
"learning_rate": 4.8910067396798657e-05,
"loss": 0.4547,
"step": 211
},
{
"epoch": 0.022316376746756492,
"grad_norm": 0.38052886724472046,
"learning_rate": 4.890480202190396e-05,
"loss": 0.4493,
"step": 212
},
{
"epoch": 0.022421642674807233,
"grad_norm": 0.5639155507087708,
"learning_rate": 4.889953664700927e-05,
"loss": 0.4239,
"step": 213
},
{
"epoch": 0.02252690860285797,
"grad_norm": 0.5452573299407959,
"learning_rate": 4.889427127211458e-05,
"loss": 0.4393,
"step": 214
},
{
"epoch": 0.022632174530908707,
"grad_norm": 0.4861447811126709,
"learning_rate": 4.8889005897219886e-05,
"loss": 0.4971,
"step": 215
},
{
"epoch": 0.022737440458959448,
"grad_norm": 0.5619585514068604,
"learning_rate": 4.8883740522325195e-05,
"loss": 0.3992,
"step": 216
},
{
"epoch": 0.022842706387010185,
"grad_norm": 0.5488256812095642,
"learning_rate": 4.88784751474305e-05,
"loss": 0.4155,
"step": 217
},
{
"epoch": 0.022947972315060922,
"grad_norm": 0.517796516418457,
"learning_rate": 4.8873209772535806e-05,
"loss": 0.5018,
"step": 218
},
{
"epoch": 0.02305323824311166,
"grad_norm": 0.6027892827987671,
"learning_rate": 4.8867944397641115e-05,
"loss": 0.4684,
"step": 219
},
{
"epoch": 0.0231585041711624,
"grad_norm": 0.47196510434150696,
"learning_rate": 4.8862679022746424e-05,
"loss": 0.4423,
"step": 220
},
{
"epoch": 0.023263770099213137,
"grad_norm": 0.41390231251716614,
"learning_rate": 4.8857413647851726e-05,
"loss": 0.4031,
"step": 221
},
{
"epoch": 0.023369036027263874,
"grad_norm": 0.5514193773269653,
"learning_rate": 4.8852148272957035e-05,
"loss": 0.6308,
"step": 222
},
{
"epoch": 0.023474301955314615,
"grad_norm": 0.4564357101917267,
"learning_rate": 4.8846882898062344e-05,
"loss": 0.5284,
"step": 223
},
{
"epoch": 0.023579567883365352,
"grad_norm": 0.45888492465019226,
"learning_rate": 4.8841617523167653e-05,
"loss": 0.4536,
"step": 224
},
{
"epoch": 0.02368483381141609,
"grad_norm": 0.4363495409488678,
"learning_rate": 4.8836352148272956e-05,
"loss": 0.4838,
"step": 225
},
{
"epoch": 0.023790099739466827,
"grad_norm": 0.40970975160598755,
"learning_rate": 4.8831086773378265e-05,
"loss": 0.5299,
"step": 226
},
{
"epoch": 0.023895365667517567,
"grad_norm": 0.5274611711502075,
"learning_rate": 4.8825821398483574e-05,
"loss": 0.3967,
"step": 227
},
{
"epoch": 0.024000631595568304,
"grad_norm": 0.5038068890571594,
"learning_rate": 4.882055602358888e-05,
"loss": 0.5067,
"step": 228
},
{
"epoch": 0.02410589752361904,
"grad_norm": 0.5031372904777527,
"learning_rate": 4.881529064869419e-05,
"loss": 0.3756,
"step": 229
},
{
"epoch": 0.024211163451669782,
"grad_norm": 0.49740293622016907,
"learning_rate": 4.88100252737995e-05,
"loss": 0.4809,
"step": 230
},
{
"epoch": 0.02431642937972052,
"grad_norm": 0.4950021207332611,
"learning_rate": 4.88047598989048e-05,
"loss": 0.4149,
"step": 231
},
{
"epoch": 0.024421695307771257,
"grad_norm": 0.46618038415908813,
"learning_rate": 4.879949452401011e-05,
"loss": 0.4737,
"step": 232
},
{
"epoch": 0.024526961235821994,
"grad_norm": 0.4663354158401489,
"learning_rate": 4.8794229149115414e-05,
"loss": 0.3884,
"step": 233
},
{
"epoch": 0.024632227163872734,
"grad_norm": 0.6165478229522705,
"learning_rate": 4.878896377422072e-05,
"loss": 0.3875,
"step": 234
},
{
"epoch": 0.02473749309192347,
"grad_norm": 0.4838646948337555,
"learning_rate": 4.878369839932603e-05,
"loss": 0.4679,
"step": 235
},
{
"epoch": 0.02484275901997421,
"grad_norm": 0.49089592695236206,
"learning_rate": 4.877843302443134e-05,
"loss": 0.5484,
"step": 236
},
{
"epoch": 0.02494802494802495,
"grad_norm": 0.4166033864021301,
"learning_rate": 4.877316764953665e-05,
"loss": 0.4594,
"step": 237
},
{
"epoch": 0.025053290876075687,
"grad_norm": 0.6557610630989075,
"learning_rate": 4.876790227464196e-05,
"loss": 0.422,
"step": 238
},
{
"epoch": 0.025158556804126424,
"grad_norm": 0.4997393786907196,
"learning_rate": 4.876263689974727e-05,
"loss": 0.4165,
"step": 239
},
{
"epoch": 0.02526382273217716,
"grad_norm": 0.3650420606136322,
"learning_rate": 4.875737152485258e-05,
"loss": 0.4758,
"step": 240
},
{
"epoch": 0.0253690886602279,
"grad_norm": 0.5316746830940247,
"learning_rate": 4.875210614995788e-05,
"loss": 0.4703,
"step": 241
},
{
"epoch": 0.02547435458827864,
"grad_norm": 0.3838014602661133,
"learning_rate": 4.874684077506318e-05,
"loss": 0.6512,
"step": 242
},
{
"epoch": 0.025579620516329376,
"grad_norm": 0.5243346095085144,
"learning_rate": 4.874157540016849e-05,
"loss": 0.4515,
"step": 243
},
{
"epoch": 0.025684886444380117,
"grad_norm": 0.46801677346229553,
"learning_rate": 4.87363100252738e-05,
"loss": 0.4605,
"step": 244
},
{
"epoch": 0.025790152372430854,
"grad_norm": 0.4614790081977844,
"learning_rate": 4.873104465037911e-05,
"loss": 0.4101,
"step": 245
},
{
"epoch": 0.02589541830048159,
"grad_norm": 0.4433145821094513,
"learning_rate": 4.872577927548442e-05,
"loss": 0.4578,
"step": 246
},
{
"epoch": 0.026000684228532328,
"grad_norm": 0.43368014693260193,
"learning_rate": 4.872051390058973e-05,
"loss": 0.4077,
"step": 247
},
{
"epoch": 0.02610595015658307,
"grad_norm": 0.4347352385520935,
"learning_rate": 4.8715248525695036e-05,
"loss": 0.4451,
"step": 248
},
{
"epoch": 0.026211216084633806,
"grad_norm": 0.5047518610954285,
"learning_rate": 4.870998315080034e-05,
"loss": 0.4308,
"step": 249
},
{
"epoch": 0.026316482012684543,
"grad_norm": 0.6036553978919983,
"learning_rate": 4.870471777590565e-05,
"loss": 0.5001,
"step": 250
},
{
"epoch": 0.026421747940735284,
"grad_norm": 0.5581931471824646,
"learning_rate": 4.8699452401010956e-05,
"loss": 0.3939,
"step": 251
},
{
"epoch": 0.02652701386878602,
"grad_norm": 0.4085439145565033,
"learning_rate": 4.869418702611626e-05,
"loss": 0.5321,
"step": 252
},
{
"epoch": 0.026632279796836758,
"grad_norm": 0.6976563334465027,
"learning_rate": 4.868892165122157e-05,
"loss": 0.4767,
"step": 253
},
{
"epoch": 0.0267375457248875,
"grad_norm": 0.48653343319892883,
"learning_rate": 4.8683656276326876e-05,
"loss": 0.5387,
"step": 254
},
{
"epoch": 0.026842811652938236,
"grad_norm": 0.5379003286361694,
"learning_rate": 4.8678390901432185e-05,
"loss": 0.4418,
"step": 255
},
{
"epoch": 0.026948077580988973,
"grad_norm": 0.42478466033935547,
"learning_rate": 4.8673125526537494e-05,
"loss": 0.4751,
"step": 256
},
{
"epoch": 0.02705334350903971,
"grad_norm": 0.4857715666294098,
"learning_rate": 4.86678601516428e-05,
"loss": 0.4608,
"step": 257
},
{
"epoch": 0.02715860943709045,
"grad_norm": 0.46174147725105286,
"learning_rate": 4.8662594776748106e-05,
"loss": 0.4611,
"step": 258
},
{
"epoch": 0.027263875365141188,
"grad_norm": 0.5316092371940613,
"learning_rate": 4.8657329401853415e-05,
"loss": 0.4463,
"step": 259
},
{
"epoch": 0.027369141293191925,
"grad_norm": 0.5541107058525085,
"learning_rate": 4.8652064026958724e-05,
"loss": 0.4619,
"step": 260
},
{
"epoch": 0.027474407221242666,
"grad_norm": 0.4637160003185272,
"learning_rate": 4.864679865206403e-05,
"loss": 0.425,
"step": 261
},
{
"epoch": 0.027579673149293403,
"grad_norm": 0.4406774938106537,
"learning_rate": 4.8641533277169335e-05,
"loss": 0.5234,
"step": 262
},
{
"epoch": 0.02768493907734414,
"grad_norm": 0.5540871620178223,
"learning_rate": 4.8636267902274644e-05,
"loss": 0.4565,
"step": 263
},
{
"epoch": 0.027790205005394877,
"grad_norm": 0.5119719505310059,
"learning_rate": 4.863100252737995e-05,
"loss": 0.4224,
"step": 264
},
{
"epoch": 0.027895470933445618,
"grad_norm": 0.6064046025276184,
"learning_rate": 4.8625737152485255e-05,
"loss": 0.453,
"step": 265
},
{
"epoch": 0.028000736861496355,
"grad_norm": 0.5928232669830322,
"learning_rate": 4.8620471777590564e-05,
"loss": 0.4444,
"step": 266
},
{
"epoch": 0.028106002789547092,
"grad_norm": 0.5610330700874329,
"learning_rate": 4.861520640269587e-05,
"loss": 0.4051,
"step": 267
},
{
"epoch": 0.028211268717597833,
"grad_norm": 0.4866770803928375,
"learning_rate": 4.860994102780118e-05,
"loss": 0.4629,
"step": 268
},
{
"epoch": 0.02831653464564857,
"grad_norm": 0.5181504487991333,
"learning_rate": 4.860467565290649e-05,
"loss": 0.4225,
"step": 269
},
{
"epoch": 0.028421800573699307,
"grad_norm": 0.36064937710762024,
"learning_rate": 4.85994102780118e-05,
"loss": 0.4136,
"step": 270
},
{
"epoch": 0.028527066501750045,
"grad_norm": 0.4846802353858948,
"learning_rate": 4.85941449031171e-05,
"loss": 0.4321,
"step": 271
},
{
"epoch": 0.028632332429800785,
"grad_norm": 0.4463631510734558,
"learning_rate": 4.858887952822241e-05,
"loss": 0.5485,
"step": 272
},
{
"epoch": 0.028737598357851522,
"grad_norm": 0.4516132175922394,
"learning_rate": 4.8583614153327714e-05,
"loss": 0.4853,
"step": 273
},
{
"epoch": 0.02884286428590226,
"grad_norm": 0.40815305709838867,
"learning_rate": 4.857834877843302e-05,
"loss": 0.3355,
"step": 274
},
{
"epoch": 0.028948130213953,
"grad_norm": 0.54203200340271,
"learning_rate": 4.857308340353833e-05,
"loss": 0.3969,
"step": 275
},
{
"epoch": 0.029053396142003737,
"grad_norm": 0.5161415338516235,
"learning_rate": 4.856781802864364e-05,
"loss": 0.3776,
"step": 276
},
{
"epoch": 0.029158662070054474,
"grad_norm": 0.4058281183242798,
"learning_rate": 4.856255265374895e-05,
"loss": 0.4268,
"step": 277
},
{
"epoch": 0.02926392799810521,
"grad_norm": 0.43867388367652893,
"learning_rate": 4.855728727885426e-05,
"loss": 0.4458,
"step": 278
},
{
"epoch": 0.029369193926155952,
"grad_norm": 0.441211998462677,
"learning_rate": 4.855202190395957e-05,
"loss": 0.4532,
"step": 279
},
{
"epoch": 0.02947445985420669,
"grad_norm": 0.5454714894294739,
"learning_rate": 4.854675652906488e-05,
"loss": 0.4907,
"step": 280
},
{
"epoch": 0.029579725782257427,
"grad_norm": 0.47156885266304016,
"learning_rate": 4.854149115417018e-05,
"loss": 0.4905,
"step": 281
},
{
"epoch": 0.029684991710308167,
"grad_norm": 0.40513938665390015,
"learning_rate": 4.853622577927549e-05,
"loss": 0.4808,
"step": 282
},
{
"epoch": 0.029790257638358904,
"grad_norm": 0.47520211338996887,
"learning_rate": 4.853096040438079e-05,
"loss": 0.4501,
"step": 283
},
{
"epoch": 0.02989552356640964,
"grad_norm": 0.5248693823814392,
"learning_rate": 4.85256950294861e-05,
"loss": 0.4287,
"step": 284
},
{
"epoch": 0.03000078949446038,
"grad_norm": 0.4880824089050293,
"learning_rate": 4.852042965459141e-05,
"loss": 0.3947,
"step": 285
},
{
"epoch": 0.03010605542251112,
"grad_norm": 0.4884517788887024,
"learning_rate": 4.851516427969672e-05,
"loss": 0.4521,
"step": 286
},
{
"epoch": 0.030211321350561857,
"grad_norm": 0.5394681096076965,
"learning_rate": 4.8509898904802026e-05,
"loss": 0.4033,
"step": 287
},
{
"epoch": 0.030316587278612594,
"grad_norm": 0.46996134519577026,
"learning_rate": 4.8504633529907335e-05,
"loss": 0.4217,
"step": 288
},
{
"epoch": 0.030421853206663334,
"grad_norm": 0.4631175398826599,
"learning_rate": 4.849936815501264e-05,
"loss": 0.4114,
"step": 289
},
{
"epoch": 0.03052711913471407,
"grad_norm": 0.5271033644676208,
"learning_rate": 4.849410278011795e-05,
"loss": 0.4044,
"step": 290
},
{
"epoch": 0.03063238506276481,
"grad_norm": 0.46999993920326233,
"learning_rate": 4.8488837405223256e-05,
"loss": 0.4408,
"step": 291
},
{
"epoch": 0.03073765099081555,
"grad_norm": 0.3656292259693146,
"learning_rate": 4.848357203032856e-05,
"loss": 0.4169,
"step": 292
},
{
"epoch": 0.030842916918866287,
"grad_norm": 0.5758498907089233,
"learning_rate": 4.847830665543387e-05,
"loss": 0.4718,
"step": 293
},
{
"epoch": 0.030948182846917024,
"grad_norm": 0.43184739351272583,
"learning_rate": 4.8473041280539176e-05,
"loss": 0.4081,
"step": 294
},
{
"epoch": 0.03105344877496776,
"grad_norm": 0.44835662841796875,
"learning_rate": 4.8467775905644485e-05,
"loss": 0.4249,
"step": 295
},
{
"epoch": 0.0311587147030185,
"grad_norm": 0.4488978087902069,
"learning_rate": 4.8462510530749794e-05,
"loss": 0.5449,
"step": 296
},
{
"epoch": 0.031263980631069235,
"grad_norm": 0.5275838971138,
"learning_rate": 4.8457245155855096e-05,
"loss": 0.4624,
"step": 297
},
{
"epoch": 0.031369246559119976,
"grad_norm": 0.6487151980400085,
"learning_rate": 4.8451979780960405e-05,
"loss": 0.4815,
"step": 298
},
{
"epoch": 0.03147451248717072,
"grad_norm": 0.5481114983558655,
"learning_rate": 4.8446714406065714e-05,
"loss": 0.3889,
"step": 299
},
{
"epoch": 0.03157977841522145,
"grad_norm": 0.516204833984375,
"learning_rate": 4.844144903117102e-05,
"loss": 0.3923,
"step": 300
},
{
"epoch": 0.03168504434327219,
"grad_norm": 0.5541898012161255,
"learning_rate": 4.843618365627633e-05,
"loss": 0.4513,
"step": 301
},
{
"epoch": 0.03179031027132293,
"grad_norm": 0.5141636729240417,
"learning_rate": 4.8430918281381635e-05,
"loss": 0.4993,
"step": 302
},
{
"epoch": 0.031895576199373665,
"grad_norm": 0.46877187490463257,
"learning_rate": 4.8425652906486944e-05,
"loss": 0.4815,
"step": 303
},
{
"epoch": 0.032000842127424406,
"grad_norm": 0.5002549886703491,
"learning_rate": 4.842038753159225e-05,
"loss": 0.5064,
"step": 304
},
{
"epoch": 0.03210610805547515,
"grad_norm": 0.45424237847328186,
"learning_rate": 4.8415122156697555e-05,
"loss": 0.4549,
"step": 305
},
{
"epoch": 0.03221137398352588,
"grad_norm": 0.4908994138240814,
"learning_rate": 4.8409856781802864e-05,
"loss": 0.5029,
"step": 306
},
{
"epoch": 0.03231663991157662,
"grad_norm": 0.6221848726272583,
"learning_rate": 4.840459140690817e-05,
"loss": 0.4033,
"step": 307
},
{
"epoch": 0.03242190583962736,
"grad_norm": 0.5026724934577942,
"learning_rate": 4.839932603201348e-05,
"loss": 0.3765,
"step": 308
},
{
"epoch": 0.032527171767678095,
"grad_norm": 0.4318561255931854,
"learning_rate": 4.839406065711879e-05,
"loss": 0.4174,
"step": 309
},
{
"epoch": 0.032632437695728836,
"grad_norm": 0.5485970377922058,
"learning_rate": 4.83887952822241e-05,
"loss": 0.4528,
"step": 310
},
{
"epoch": 0.03273770362377958,
"grad_norm": 0.49032801389694214,
"learning_rate": 4.838352990732941e-05,
"loss": 0.4687,
"step": 311
},
{
"epoch": 0.03284296955183031,
"grad_norm": 0.4289769232273102,
"learning_rate": 4.837826453243471e-05,
"loss": 0.5144,
"step": 312
},
{
"epoch": 0.03294823547988105,
"grad_norm": 0.500663697719574,
"learning_rate": 4.8372999157540013e-05,
"loss": 0.3923,
"step": 313
},
{
"epoch": 0.033053501407931785,
"grad_norm": 0.5670647025108337,
"learning_rate": 4.836773378264532e-05,
"loss": 0.4049,
"step": 314
},
{
"epoch": 0.033158767335982525,
"grad_norm": 0.4813581109046936,
"learning_rate": 4.836246840775063e-05,
"loss": 0.443,
"step": 315
},
{
"epoch": 0.033264033264033266,
"grad_norm": 0.5485454797744751,
"learning_rate": 4.835720303285594e-05,
"loss": 0.4008,
"step": 316
},
{
"epoch": 0.033369299192084,
"grad_norm": 0.5390880703926086,
"learning_rate": 4.835193765796125e-05,
"loss": 0.3993,
"step": 317
},
{
"epoch": 0.03347456512013474,
"grad_norm": 0.498060017824173,
"learning_rate": 4.834667228306656e-05,
"loss": 0.3953,
"step": 318
},
{
"epoch": 0.03357983104818548,
"grad_norm": 0.49461764097213745,
"learning_rate": 4.834140690817187e-05,
"loss": 0.3972,
"step": 319
},
{
"epoch": 0.033685096976236215,
"grad_norm": 0.723934531211853,
"learning_rate": 4.8336141533277176e-05,
"loss": 0.4582,
"step": 320
},
{
"epoch": 0.033790362904286955,
"grad_norm": 0.4396905303001404,
"learning_rate": 4.833087615838248e-05,
"loss": 0.404,
"step": 321
},
{
"epoch": 0.033895628832337696,
"grad_norm": 0.4418332576751709,
"learning_rate": 4.832561078348779e-05,
"loss": 0.5145,
"step": 322
},
{
"epoch": 0.03400089476038843,
"grad_norm": 0.5111250281333923,
"learning_rate": 4.832034540859309e-05,
"loss": 0.5276,
"step": 323
},
{
"epoch": 0.03410616068843917,
"grad_norm": 0.5635156035423279,
"learning_rate": 4.83150800336984e-05,
"loss": 0.5484,
"step": 324
},
{
"epoch": 0.03421142661648991,
"grad_norm": 0.5792466402053833,
"learning_rate": 4.830981465880371e-05,
"loss": 0.5747,
"step": 325
},
{
"epoch": 0.034316692544540645,
"grad_norm": 0.4661281406879425,
"learning_rate": 4.830454928390902e-05,
"loss": 0.4601,
"step": 326
},
{
"epoch": 0.034421958472591385,
"grad_norm": 0.6661891937255859,
"learning_rate": 4.8299283909014326e-05,
"loss": 0.4993,
"step": 327
},
{
"epoch": 0.03452722440064212,
"grad_norm": 0.5207692384719849,
"learning_rate": 4.8294018534119635e-05,
"loss": 0.421,
"step": 328
},
{
"epoch": 0.03463249032869286,
"grad_norm": 0.6618428826332092,
"learning_rate": 4.828875315922494e-05,
"loss": 0.4163,
"step": 329
},
{
"epoch": 0.0347377562567436,
"grad_norm": 0.513272225856781,
"learning_rate": 4.8283487784330246e-05,
"loss": 0.3797,
"step": 330
},
{
"epoch": 0.034843022184794334,
"grad_norm": 0.4838692545890808,
"learning_rate": 4.8278222409435555e-05,
"loss": 0.3843,
"step": 331
},
{
"epoch": 0.034948288112845075,
"grad_norm": 0.5403527021408081,
"learning_rate": 4.8272957034540864e-05,
"loss": 0.4821,
"step": 332
},
{
"epoch": 0.035053554040895815,
"grad_norm": 0.48934701085090637,
"learning_rate": 4.8267691659646167e-05,
"loss": 0.4205,
"step": 333
},
{
"epoch": 0.03515881996894655,
"grad_norm": 0.5227293968200684,
"learning_rate": 4.8262426284751476e-05,
"loss": 0.483,
"step": 334
},
{
"epoch": 0.03526408589699729,
"grad_norm": 0.5904392004013062,
"learning_rate": 4.8257160909856785e-05,
"loss": 0.3868,
"step": 335
},
{
"epoch": 0.03536935182504803,
"grad_norm": 0.4555564522743225,
"learning_rate": 4.8251895534962094e-05,
"loss": 0.4235,
"step": 336
},
{
"epoch": 0.035474617753098764,
"grad_norm": 0.8526967763900757,
"learning_rate": 4.8246630160067396e-05,
"loss": 0.4588,
"step": 337
},
{
"epoch": 0.035579883681149505,
"grad_norm": 0.45085299015045166,
"learning_rate": 4.8241364785172705e-05,
"loss": 0.4228,
"step": 338
},
{
"epoch": 0.035685149609200245,
"grad_norm": 0.5043511390686035,
"learning_rate": 4.8236099410278014e-05,
"loss": 0.4632,
"step": 339
},
{
"epoch": 0.03579041553725098,
"grad_norm": 0.5064621567726135,
"learning_rate": 4.823083403538332e-05,
"loss": 0.4844,
"step": 340
},
{
"epoch": 0.03589568146530172,
"grad_norm": 0.48965758085250854,
"learning_rate": 4.822556866048863e-05,
"loss": 0.4481,
"step": 341
},
{
"epoch": 0.03600094739335245,
"grad_norm": 0.4565337300300598,
"learning_rate": 4.8220303285593934e-05,
"loss": 0.4011,
"step": 342
},
{
"epoch": 0.036106213321403194,
"grad_norm": 0.5424944758415222,
"learning_rate": 4.821503791069924e-05,
"loss": 0.5101,
"step": 343
},
{
"epoch": 0.036211479249453934,
"grad_norm": 0.4527457058429718,
"learning_rate": 4.820977253580455e-05,
"loss": 0.4097,
"step": 344
},
{
"epoch": 0.03631674517750467,
"grad_norm": 0.3896700441837311,
"learning_rate": 4.8204507160909854e-05,
"loss": 0.4177,
"step": 345
},
{
"epoch": 0.03642201110555541,
"grad_norm": 0.5583755373954773,
"learning_rate": 4.8199241786015163e-05,
"loss": 0.4437,
"step": 346
},
{
"epoch": 0.03652727703360615,
"grad_norm": 0.41155165433883667,
"learning_rate": 4.819397641112047e-05,
"loss": 0.4382,
"step": 347
},
{
"epoch": 0.03663254296165688,
"grad_norm": 0.36993688344955444,
"learning_rate": 4.818871103622578e-05,
"loss": 0.4839,
"step": 348
},
{
"epoch": 0.036737808889707624,
"grad_norm": 0.449740469455719,
"learning_rate": 4.818344566133109e-05,
"loss": 0.4251,
"step": 349
},
{
"epoch": 0.036843074817758364,
"grad_norm": 0.3957495391368866,
"learning_rate": 4.81781802864364e-05,
"loss": 0.4743,
"step": 350
},
{
"epoch": 0.0369483407458091,
"grad_norm": 0.5629512667655945,
"learning_rate": 4.817291491154171e-05,
"loss": 0.4002,
"step": 351
},
{
"epoch": 0.03705360667385984,
"grad_norm": 0.4598921239376068,
"learning_rate": 4.816764953664701e-05,
"loss": 0.4692,
"step": 352
},
{
"epoch": 0.03715887260191058,
"grad_norm": 0.516234278678894,
"learning_rate": 4.816238416175232e-05,
"loss": 0.4175,
"step": 353
},
{
"epoch": 0.03726413852996131,
"grad_norm": 0.5708214044570923,
"learning_rate": 4.815711878685762e-05,
"loss": 0.4306,
"step": 354
},
{
"epoch": 0.037369404458012054,
"grad_norm": 0.6185720562934875,
"learning_rate": 4.815185341196293e-05,
"loss": 0.4598,
"step": 355
},
{
"epoch": 0.037474670386062794,
"grad_norm": 0.5227758884429932,
"learning_rate": 4.814658803706824e-05,
"loss": 0.3782,
"step": 356
},
{
"epoch": 0.03757993631411353,
"grad_norm": 0.5345552563667297,
"learning_rate": 4.814132266217355e-05,
"loss": 0.418,
"step": 357
},
{
"epoch": 0.03768520224216427,
"grad_norm": 0.5797765254974365,
"learning_rate": 4.813605728727886e-05,
"loss": 0.5089,
"step": 358
},
{
"epoch": 0.037790468170215,
"grad_norm": 0.5567287802696228,
"learning_rate": 4.813079191238417e-05,
"loss": 0.4304,
"step": 359
},
{
"epoch": 0.03789573409826574,
"grad_norm": 0.4520246982574463,
"learning_rate": 4.8125526537489476e-05,
"loss": 0.4626,
"step": 360
},
{
"epoch": 0.038001000026316484,
"grad_norm": 0.44900500774383545,
"learning_rate": 4.812026116259478e-05,
"loss": 0.3843,
"step": 361
},
{
"epoch": 0.03810626595436722,
"grad_norm": 0.48296135663986206,
"learning_rate": 4.811499578770009e-05,
"loss": 0.4855,
"step": 362
},
{
"epoch": 0.03821153188241796,
"grad_norm": 0.4269002377986908,
"learning_rate": 4.810973041280539e-05,
"loss": 0.3795,
"step": 363
},
{
"epoch": 0.0383167978104687,
"grad_norm": 0.9296995401382446,
"learning_rate": 4.81044650379107e-05,
"loss": 0.4861,
"step": 364
},
{
"epoch": 0.03842206373851943,
"grad_norm": 0.5746780633926392,
"learning_rate": 4.809919966301601e-05,
"loss": 0.3991,
"step": 365
},
{
"epoch": 0.03852732966657017,
"grad_norm": 0.47170913219451904,
"learning_rate": 4.8093934288121317e-05,
"loss": 0.4348,
"step": 366
},
{
"epoch": 0.038632595594620914,
"grad_norm": 0.4327333867549896,
"learning_rate": 4.8088668913226626e-05,
"loss": 0.405,
"step": 367
},
{
"epoch": 0.03873786152267165,
"grad_norm": 0.4907747507095337,
"learning_rate": 4.8083403538331935e-05,
"loss": 0.4467,
"step": 368
},
{
"epoch": 0.03884312745072239,
"grad_norm": 0.48626840114593506,
"learning_rate": 4.807813816343724e-05,
"loss": 0.485,
"step": 369
},
{
"epoch": 0.03894839337877313,
"grad_norm": 0.5155723094940186,
"learning_rate": 4.8072872788542546e-05,
"loss": 0.3931,
"step": 370
},
{
"epoch": 0.03905365930682386,
"grad_norm": 0.5703728795051575,
"learning_rate": 4.8067607413647855e-05,
"loss": 0.3728,
"step": 371
},
{
"epoch": 0.0391589252348746,
"grad_norm": 0.5467020273208618,
"learning_rate": 4.8062342038753164e-05,
"loss": 0.477,
"step": 372
},
{
"epoch": 0.03926419116292534,
"grad_norm": 0.4459872543811798,
"learning_rate": 4.8057076663858466e-05,
"loss": 0.4712,
"step": 373
},
{
"epoch": 0.03936945709097608,
"grad_norm": 0.511060357093811,
"learning_rate": 4.8051811288963775e-05,
"loss": 0.5146,
"step": 374
},
{
"epoch": 0.03947472301902682,
"grad_norm": 0.3677018880844116,
"learning_rate": 4.8046545914069084e-05,
"loss": 0.4605,
"step": 375
},
{
"epoch": 0.03957998894707755,
"grad_norm": 0.47560691833496094,
"learning_rate": 4.804128053917439e-05,
"loss": 0.4479,
"step": 376
},
{
"epoch": 0.03968525487512829,
"grad_norm": 0.5171210169792175,
"learning_rate": 4.8036015164279695e-05,
"loss": 0.4413,
"step": 377
},
{
"epoch": 0.03979052080317903,
"grad_norm": 0.448194295167923,
"learning_rate": 4.8030749789385004e-05,
"loss": 0.4637,
"step": 378
},
{
"epoch": 0.03989578673122977,
"grad_norm": 0.5280170440673828,
"learning_rate": 4.8025484414490313e-05,
"loss": 0.4365,
"step": 379
},
{
"epoch": 0.04000105265928051,
"grad_norm": 0.490249902009964,
"learning_rate": 4.802021903959562e-05,
"loss": 0.4618,
"step": 380
},
{
"epoch": 0.04010631858733125,
"grad_norm": 0.5452317595481873,
"learning_rate": 4.801495366470093e-05,
"loss": 0.3972,
"step": 381
},
{
"epoch": 0.04021158451538198,
"grad_norm": 0.5572560429573059,
"learning_rate": 4.800968828980624e-05,
"loss": 0.4756,
"step": 382
},
{
"epoch": 0.04031685044343272,
"grad_norm": 0.45014721155166626,
"learning_rate": 4.800442291491154e-05,
"loss": 0.3915,
"step": 383
},
{
"epoch": 0.04042211637148346,
"grad_norm": 0.6049466729164124,
"learning_rate": 4.799915754001685e-05,
"loss": 0.3675,
"step": 384
},
{
"epoch": 0.0405273822995342,
"grad_norm": 0.6129103302955627,
"learning_rate": 4.7993892165122154e-05,
"loss": 0.378,
"step": 385
},
{
"epoch": 0.04063264822758494,
"grad_norm": 0.5461925864219666,
"learning_rate": 4.798862679022746e-05,
"loss": 0.4091,
"step": 386
},
{
"epoch": 0.04073791415563568,
"grad_norm": 0.41969093680381775,
"learning_rate": 4.798336141533277e-05,
"loss": 0.4843,
"step": 387
},
{
"epoch": 0.04084318008368641,
"grad_norm": 0.510870635509491,
"learning_rate": 4.797809604043808e-05,
"loss": 0.581,
"step": 388
},
{
"epoch": 0.04094844601173715,
"grad_norm": 0.5956604480743408,
"learning_rate": 4.797283066554339e-05,
"loss": 0.3163,
"step": 389
},
{
"epoch": 0.041053711939787886,
"grad_norm": 0.4685046076774597,
"learning_rate": 4.79675652906487e-05,
"loss": 0.4587,
"step": 390
},
{
"epoch": 0.04115897786783863,
"grad_norm": 0.4563463628292084,
"learning_rate": 4.796229991575401e-05,
"loss": 0.468,
"step": 391
},
{
"epoch": 0.04126424379588937,
"grad_norm": 0.5047011971473694,
"learning_rate": 4.795703454085931e-05,
"loss": 0.4117,
"step": 392
},
{
"epoch": 0.0413695097239401,
"grad_norm": 0.6256960034370422,
"learning_rate": 4.795176916596462e-05,
"loss": 0.4522,
"step": 393
},
{
"epoch": 0.04147477565199084,
"grad_norm": 0.479109525680542,
"learning_rate": 4.794650379106992e-05,
"loss": 0.5458,
"step": 394
},
{
"epoch": 0.04158004158004158,
"grad_norm": 0.5637032985687256,
"learning_rate": 4.794123841617523e-05,
"loss": 0.4724,
"step": 395
},
{
"epoch": 0.041685307508092316,
"grad_norm": 0.5758900046348572,
"learning_rate": 4.793597304128054e-05,
"loss": 0.3943,
"step": 396
},
{
"epoch": 0.04179057343614306,
"grad_norm": 0.41813746094703674,
"learning_rate": 4.793070766638585e-05,
"loss": 0.4937,
"step": 397
},
{
"epoch": 0.0418958393641938,
"grad_norm": 0.4549589455127716,
"learning_rate": 4.792544229149116e-05,
"loss": 0.4055,
"step": 398
},
{
"epoch": 0.04200110529224453,
"grad_norm": 0.42384806275367737,
"learning_rate": 4.792017691659647e-05,
"loss": 0.4189,
"step": 399
},
{
"epoch": 0.04210637122029527,
"grad_norm": 0.4235416352748871,
"learning_rate": 4.7914911541701776e-05,
"loss": 0.4304,
"step": 400
},
{
"epoch": 0.04221163714834601,
"grad_norm": 0.44901612401008606,
"learning_rate": 4.7909646166807085e-05,
"loss": 0.4575,
"step": 401
},
{
"epoch": 0.042316903076396746,
"grad_norm": 0.4786452353000641,
"learning_rate": 4.790438079191239e-05,
"loss": 0.4031,
"step": 402
},
{
"epoch": 0.04242216900444749,
"grad_norm": 0.64895099401474,
"learning_rate": 4.7899115417017696e-05,
"loss": 0.4437,
"step": 403
},
{
"epoch": 0.04252743493249822,
"grad_norm": 0.7129364609718323,
"learning_rate": 4.7893850042123e-05,
"loss": 0.426,
"step": 404
},
{
"epoch": 0.04263270086054896,
"grad_norm": 0.5261722207069397,
"learning_rate": 4.788858466722831e-05,
"loss": 0.4704,
"step": 405
},
{
"epoch": 0.0427379667885997,
"grad_norm": 0.5278510451316833,
"learning_rate": 4.7883319292333616e-05,
"loss": 0.43,
"step": 406
},
{
"epoch": 0.042843232716650435,
"grad_norm": 0.47645267844200134,
"learning_rate": 4.7878053917438925e-05,
"loss": 0.4399,
"step": 407
},
{
"epoch": 0.042948498644701176,
"grad_norm": 0.5606099367141724,
"learning_rate": 4.7872788542544234e-05,
"loss": 0.5023,
"step": 408
},
{
"epoch": 0.04305376457275192,
"grad_norm": 0.5183596611022949,
"learning_rate": 4.786752316764954e-05,
"loss": 0.4431,
"step": 409
},
{
"epoch": 0.04315903050080265,
"grad_norm": 0.4570636451244354,
"learning_rate": 4.7862257792754845e-05,
"loss": 0.4435,
"step": 410
},
{
"epoch": 0.04326429642885339,
"grad_norm": 0.5054503679275513,
"learning_rate": 4.7856992417860154e-05,
"loss": 0.4884,
"step": 411
},
{
"epoch": 0.04336956235690413,
"grad_norm": 0.4896951913833618,
"learning_rate": 4.7851727042965463e-05,
"loss": 0.472,
"step": 412
},
{
"epoch": 0.043474828284954865,
"grad_norm": 0.6141940951347351,
"learning_rate": 4.7846461668070766e-05,
"loss": 0.426,
"step": 413
},
{
"epoch": 0.043580094213005606,
"grad_norm": 0.48963436484336853,
"learning_rate": 4.7841196293176075e-05,
"loss": 0.4668,
"step": 414
},
{
"epoch": 0.04368536014105635,
"grad_norm": 0.5451966524124146,
"learning_rate": 4.7835930918281384e-05,
"loss": 0.4728,
"step": 415
},
{
"epoch": 0.04379062606910708,
"grad_norm": 0.434573769569397,
"learning_rate": 4.783066554338669e-05,
"loss": 0.4055,
"step": 416
},
{
"epoch": 0.04389589199715782,
"grad_norm": 0.5499134659767151,
"learning_rate": 4.7825400168492e-05,
"loss": 0.3879,
"step": 417
},
{
"epoch": 0.044001157925208555,
"grad_norm": 0.5180830955505371,
"learning_rate": 4.7820134793597304e-05,
"loss": 0.4445,
"step": 418
},
{
"epoch": 0.044106423853259295,
"grad_norm": 0.4541892409324646,
"learning_rate": 4.781486941870261e-05,
"loss": 0.4059,
"step": 419
},
{
"epoch": 0.044211689781310036,
"grad_norm": 0.3752939999103546,
"learning_rate": 4.780960404380792e-05,
"loss": 0.5885,
"step": 420
},
{
"epoch": 0.04431695570936077,
"grad_norm": 0.4906155467033386,
"learning_rate": 4.780433866891323e-05,
"loss": 0.4839,
"step": 421
},
{
"epoch": 0.04442222163741151,
"grad_norm": 0.4721757769584656,
"learning_rate": 4.779907329401854e-05,
"loss": 0.4177,
"step": 422
},
{
"epoch": 0.04452748756546225,
"grad_norm": 0.42130014300346375,
"learning_rate": 4.779380791912384e-05,
"loss": 0.4295,
"step": 423
},
{
"epoch": 0.044632753493512985,
"grad_norm": 0.5732069611549377,
"learning_rate": 4.778854254422915e-05,
"loss": 0.3721,
"step": 424
},
{
"epoch": 0.044738019421563725,
"grad_norm": 0.48826277256011963,
"learning_rate": 4.778327716933446e-05,
"loss": 0.4228,
"step": 425
},
{
"epoch": 0.044843285349614466,
"grad_norm": 0.5234729051589966,
"learning_rate": 4.777801179443976e-05,
"loss": 0.4014,
"step": 426
},
{
"epoch": 0.0449485512776652,
"grad_norm": 0.46457454562187195,
"learning_rate": 4.777274641954507e-05,
"loss": 0.5259,
"step": 427
},
{
"epoch": 0.04505381720571594,
"grad_norm": 0.5036742091178894,
"learning_rate": 4.776748104465038e-05,
"loss": 0.4361,
"step": 428
},
{
"epoch": 0.04515908313376668,
"grad_norm": 0.5410817265510559,
"learning_rate": 4.776221566975569e-05,
"loss": 0.463,
"step": 429
},
{
"epoch": 0.045264349061817415,
"grad_norm": 0.4173840284347534,
"learning_rate": 4.7756950294861e-05,
"loss": 0.4048,
"step": 430
},
{
"epoch": 0.045369614989868155,
"grad_norm": 0.726842999458313,
"learning_rate": 4.775168491996631e-05,
"loss": 0.5549,
"step": 431
},
{
"epoch": 0.045474880917918896,
"grad_norm": 0.40877723693847656,
"learning_rate": 4.774641954507162e-05,
"loss": 0.4433,
"step": 432
},
{
"epoch": 0.04558014684596963,
"grad_norm": 0.6194121241569519,
"learning_rate": 4.774115417017692e-05,
"loss": 0.4257,
"step": 433
},
{
"epoch": 0.04568541277402037,
"grad_norm": 0.5976036787033081,
"learning_rate": 4.773588879528222e-05,
"loss": 0.4709,
"step": 434
},
{
"epoch": 0.045790678702071104,
"grad_norm": 0.6144199371337891,
"learning_rate": 4.773062342038753e-05,
"loss": 0.3868,
"step": 435
},
{
"epoch": 0.045895944630121845,
"grad_norm": 0.5125494599342346,
"learning_rate": 4.772535804549284e-05,
"loss": 0.4116,
"step": 436
},
{
"epoch": 0.046001210558172585,
"grad_norm": 0.5164209604263306,
"learning_rate": 4.772009267059815e-05,
"loss": 0.3564,
"step": 437
},
{
"epoch": 0.04610647648622332,
"grad_norm": 0.4817107319831848,
"learning_rate": 4.771482729570346e-05,
"loss": 0.4801,
"step": 438
},
{
"epoch": 0.04621174241427406,
"grad_norm": 0.44076791405677795,
"learning_rate": 4.7709561920808766e-05,
"loss": 0.551,
"step": 439
},
{
"epoch": 0.0463170083423248,
"grad_norm": 0.634650707244873,
"learning_rate": 4.7704296545914075e-05,
"loss": 0.533,
"step": 440
},
{
"epoch": 0.046422274270375534,
"grad_norm": 0.4300638437271118,
"learning_rate": 4.7699031171019384e-05,
"loss": 0.4219,
"step": 441
},
{
"epoch": 0.046527540198426275,
"grad_norm": 0.5052940249443054,
"learning_rate": 4.7693765796124686e-05,
"loss": 0.4419,
"step": 442
},
{
"epoch": 0.046632806126477015,
"grad_norm": 0.4833763539791107,
"learning_rate": 4.7688500421229995e-05,
"loss": 0.4074,
"step": 443
},
{
"epoch": 0.04673807205452775,
"grad_norm": 0.4841054677963257,
"learning_rate": 4.76832350463353e-05,
"loss": 0.4357,
"step": 444
},
{
"epoch": 0.04684333798257849,
"grad_norm": 0.5227946639060974,
"learning_rate": 4.767796967144061e-05,
"loss": 0.4471,
"step": 445
},
{
"epoch": 0.04694860391062923,
"grad_norm": 0.5761273503303528,
"learning_rate": 4.7672704296545916e-05,
"loss": 0.4422,
"step": 446
},
{
"epoch": 0.047053869838679964,
"grad_norm": 0.47115081548690796,
"learning_rate": 4.7667438921651225e-05,
"loss": 0.4172,
"step": 447
},
{
"epoch": 0.047159135766730705,
"grad_norm": 0.5475848913192749,
"learning_rate": 4.7662173546756534e-05,
"loss": 0.4435,
"step": 448
},
{
"epoch": 0.04726440169478144,
"grad_norm": 0.4437314569950104,
"learning_rate": 4.765690817186184e-05,
"loss": 0.389,
"step": 449
},
{
"epoch": 0.04736966762283218,
"grad_norm": 0.4307888448238373,
"learning_rate": 4.7651642796967145e-05,
"loss": 0.4354,
"step": 450
},
{
"epoch": 0.04747493355088292,
"grad_norm": 0.3933163285255432,
"learning_rate": 4.7646377422072454e-05,
"loss": 0.561,
"step": 451
},
{
"epoch": 0.04758019947893365,
"grad_norm": 0.37329408526420593,
"learning_rate": 4.764111204717776e-05,
"loss": 0.4767,
"step": 452
},
{
"epoch": 0.047685465406984394,
"grad_norm": 0.554229199886322,
"learning_rate": 4.763584667228307e-05,
"loss": 0.3594,
"step": 453
},
{
"epoch": 0.047790731335035135,
"grad_norm": 0.4243522882461548,
"learning_rate": 4.7630581297388374e-05,
"loss": 0.44,
"step": 454
},
{
"epoch": 0.04789599726308587,
"grad_norm": 0.5723696351051331,
"learning_rate": 4.762531592249368e-05,
"loss": 0.4377,
"step": 455
},
{
"epoch": 0.04800126319113661,
"grad_norm": 0.5366947054862976,
"learning_rate": 4.762005054759899e-05,
"loss": 0.4021,
"step": 456
},
{
"epoch": 0.04810652911918735,
"grad_norm": 0.5559504628181458,
"learning_rate": 4.76147851727043e-05,
"loss": 0.3775,
"step": 457
},
{
"epoch": 0.04821179504723808,
"grad_norm": 0.48702389001846313,
"learning_rate": 4.7609519797809604e-05,
"loss": 0.4751,
"step": 458
},
{
"epoch": 0.048317060975288824,
"grad_norm": 0.36137351393699646,
"learning_rate": 4.760425442291491e-05,
"loss": 0.482,
"step": 459
},
{
"epoch": 0.048422326903339565,
"grad_norm": 0.4528438150882721,
"learning_rate": 4.759898904802022e-05,
"loss": 0.4059,
"step": 460
},
{
"epoch": 0.0485275928313903,
"grad_norm": 0.5218043923377991,
"learning_rate": 4.759372367312553e-05,
"loss": 0.4095,
"step": 461
},
{
"epoch": 0.04863285875944104,
"grad_norm": 0.5252096652984619,
"learning_rate": 4.758845829823084e-05,
"loss": 0.4989,
"step": 462
},
{
"epoch": 0.04873812468749178,
"grad_norm": 0.3626563549041748,
"learning_rate": 4.758319292333614e-05,
"loss": 0.5983,
"step": 463
},
{
"epoch": 0.04884339061554251,
"grad_norm": 0.473537415266037,
"learning_rate": 4.757792754844145e-05,
"loss": 0.5459,
"step": 464
},
{
"epoch": 0.048948656543593254,
"grad_norm": 0.7054407596588135,
"learning_rate": 4.757266217354676e-05,
"loss": 0.3718,
"step": 465
},
{
"epoch": 0.04905392247164399,
"grad_norm": 0.4829826056957245,
"learning_rate": 4.756739679865206e-05,
"loss": 0.4165,
"step": 466
},
{
"epoch": 0.04915918839969473,
"grad_norm": 0.5529534816741943,
"learning_rate": 4.756213142375737e-05,
"loss": 0.5058,
"step": 467
},
{
"epoch": 0.04926445432774547,
"grad_norm": 0.4331270456314087,
"learning_rate": 4.755686604886268e-05,
"loss": 0.4267,
"step": 468
},
{
"epoch": 0.0493697202557962,
"grad_norm": 0.48735421895980835,
"learning_rate": 4.755160067396799e-05,
"loss": 0.443,
"step": 469
},
{
"epoch": 0.04947498618384694,
"grad_norm": 0.6138409972190857,
"learning_rate": 4.75463352990733e-05,
"loss": 0.4449,
"step": 470
},
{
"epoch": 0.049580252111897684,
"grad_norm": 0.4512140154838562,
"learning_rate": 4.754106992417861e-05,
"loss": 0.486,
"step": 471
},
{
"epoch": 0.04968551803994842,
"grad_norm": 0.5221918225288391,
"learning_rate": 4.7535804549283916e-05,
"loss": 0.4122,
"step": 472
},
{
"epoch": 0.04979078396799916,
"grad_norm": 0.5450029969215393,
"learning_rate": 4.753053917438922e-05,
"loss": 0.3362,
"step": 473
},
{
"epoch": 0.0498960498960499,
"grad_norm": 0.5064875483512878,
"learning_rate": 4.752527379949452e-05,
"loss": 0.4868,
"step": 474
},
{
"epoch": 0.05000131582410063,
"grad_norm": 0.5182908177375793,
"learning_rate": 4.752000842459983e-05,
"loss": 0.4034,
"step": 475
},
{
"epoch": 0.05010658175215137,
"grad_norm": 0.5384114384651184,
"learning_rate": 4.751474304970514e-05,
"loss": 0.5353,
"step": 476
},
{
"epoch": 0.050211847680202114,
"grad_norm": 0.5357162952423096,
"learning_rate": 4.750947767481045e-05,
"loss": 0.3976,
"step": 477
},
{
"epoch": 0.05031711360825285,
"grad_norm": 0.45556405186653137,
"learning_rate": 4.750421229991576e-05,
"loss": 0.403,
"step": 478
},
{
"epoch": 0.05042237953630359,
"grad_norm": 0.5855860710144043,
"learning_rate": 4.7498946925021066e-05,
"loss": 0.3754,
"step": 479
},
{
"epoch": 0.05052764546435432,
"grad_norm": 0.5920200943946838,
"learning_rate": 4.7493681550126375e-05,
"loss": 0.3944,
"step": 480
},
{
"epoch": 0.05063291139240506,
"grad_norm": 0.5460993051528931,
"learning_rate": 4.7488416175231684e-05,
"loss": 0.5356,
"step": 481
},
{
"epoch": 0.0507381773204558,
"grad_norm": 0.5433392524719238,
"learning_rate": 4.7483150800336986e-05,
"loss": 0.4043,
"step": 482
},
{
"epoch": 0.05084344324850654,
"grad_norm": 0.6986379027366638,
"learning_rate": 4.7477885425442295e-05,
"loss": 0.4374,
"step": 483
},
{
"epoch": 0.05094870917655728,
"grad_norm": 0.6336686611175537,
"learning_rate": 4.74726200505476e-05,
"loss": 0.4308,
"step": 484
},
{
"epoch": 0.05105397510460802,
"grad_norm": 0.5509925484657288,
"learning_rate": 4.7467354675652906e-05,
"loss": 0.4101,
"step": 485
},
{
"epoch": 0.05115924103265875,
"grad_norm": 0.5978362560272217,
"learning_rate": 4.7462089300758215e-05,
"loss": 0.379,
"step": 486
},
{
"epoch": 0.05126450696070949,
"grad_norm": 0.5480085015296936,
"learning_rate": 4.7456823925863524e-05,
"loss": 0.3327,
"step": 487
},
{
"epoch": 0.05136977288876023,
"grad_norm": 0.5396241545677185,
"learning_rate": 4.745155855096883e-05,
"loss": 0.4283,
"step": 488
},
{
"epoch": 0.05147503881681097,
"grad_norm": 0.43143001198768616,
"learning_rate": 4.744629317607414e-05,
"loss": 0.584,
"step": 489
},
{
"epoch": 0.05158030474486171,
"grad_norm": 0.4590414762496948,
"learning_rate": 4.7441027801179445e-05,
"loss": 0.3794,
"step": 490
},
{
"epoch": 0.05168557067291245,
"grad_norm": 0.4620942175388336,
"learning_rate": 4.7435762426284754e-05,
"loss": 0.4421,
"step": 491
},
{
"epoch": 0.05179083660096318,
"grad_norm": 0.5003826022148132,
"learning_rate": 4.743049705139006e-05,
"loss": 0.4408,
"step": 492
},
{
"epoch": 0.05189610252901392,
"grad_norm": 0.5184903740882874,
"learning_rate": 4.742523167649537e-05,
"loss": 0.4523,
"step": 493
},
{
"epoch": 0.052001368457064656,
"grad_norm": 0.5750355124473572,
"learning_rate": 4.7419966301600674e-05,
"loss": 0.3512,
"step": 494
},
{
"epoch": 0.0521066343851154,
"grad_norm": 0.516768217086792,
"learning_rate": 4.741470092670598e-05,
"loss": 0.3583,
"step": 495
},
{
"epoch": 0.05221190031316614,
"grad_norm": 0.5511295199394226,
"learning_rate": 4.740943555181129e-05,
"loss": 0.4536,
"step": 496
},
{
"epoch": 0.05231716624121687,
"grad_norm": 0.4026057720184326,
"learning_rate": 4.74041701769166e-05,
"loss": 0.4834,
"step": 497
},
{
"epoch": 0.05242243216926761,
"grad_norm": 0.6032986044883728,
"learning_rate": 4.73989048020219e-05,
"loss": 0.3901,
"step": 498
},
{
"epoch": 0.05252769809731835,
"grad_norm": 0.45538461208343506,
"learning_rate": 4.739363942712721e-05,
"loss": 0.4174,
"step": 499
},
{
"epoch": 0.052632964025369086,
"grad_norm": 0.564687967300415,
"learning_rate": 4.738837405223252e-05,
"loss": 0.4543,
"step": 500
},
{
"epoch": 0.05273822995341983,
"grad_norm": 0.5365861058235168,
"learning_rate": 4.738310867733783e-05,
"loss": 0.3998,
"step": 501
},
{
"epoch": 0.05284349588147057,
"grad_norm": 0.5887376666069031,
"learning_rate": 4.737784330244314e-05,
"loss": 0.4881,
"step": 502
},
{
"epoch": 0.0529487618095213,
"grad_norm": 0.5137104392051697,
"learning_rate": 4.737257792754845e-05,
"loss": 0.4158,
"step": 503
},
{
"epoch": 0.05305402773757204,
"grad_norm": 0.7075323462486267,
"learning_rate": 4.736731255265375e-05,
"loss": 0.4249,
"step": 504
},
{
"epoch": 0.05315929366562278,
"grad_norm": 0.5085923075675964,
"learning_rate": 4.736204717775906e-05,
"loss": 0.3974,
"step": 505
},
{
"epoch": 0.053264559593673516,
"grad_norm": 0.4885638654232025,
"learning_rate": 4.735678180286436e-05,
"loss": 0.3569,
"step": 506
},
{
"epoch": 0.05336982552172426,
"grad_norm": 0.5807955265045166,
"learning_rate": 4.735151642796967e-05,
"loss": 0.3868,
"step": 507
},
{
"epoch": 0.053475091449775,
"grad_norm": 0.4715438485145569,
"learning_rate": 4.734625105307498e-05,
"loss": 0.4592,
"step": 508
},
{
"epoch": 0.05358035737782573,
"grad_norm": 0.4971379041671753,
"learning_rate": 4.734098567818029e-05,
"loss": 0.4449,
"step": 509
},
{
"epoch": 0.05368562330587647,
"grad_norm": 0.5600916743278503,
"learning_rate": 4.73357203032856e-05,
"loss": 0.3905,
"step": 510
},
{
"epoch": 0.053790889233927205,
"grad_norm": 0.5462086200714111,
"learning_rate": 4.733045492839091e-05,
"loss": 0.3757,
"step": 511
},
{
"epoch": 0.053896155161977946,
"grad_norm": 0.4880779981613159,
"learning_rate": 4.7325189553496216e-05,
"loss": 0.4084,
"step": 512
},
{
"epoch": 0.05400142109002869,
"grad_norm": 0.5553451180458069,
"learning_rate": 4.731992417860152e-05,
"loss": 0.4088,
"step": 513
},
{
"epoch": 0.05410668701807942,
"grad_norm": 0.4913026690483093,
"learning_rate": 4.731465880370683e-05,
"loss": 0.4286,
"step": 514
},
{
"epoch": 0.05421195294613016,
"grad_norm": 0.43161246180534363,
"learning_rate": 4.730939342881213e-05,
"loss": 0.413,
"step": 515
},
{
"epoch": 0.0543172188741809,
"grad_norm": 0.5062459707260132,
"learning_rate": 4.730412805391744e-05,
"loss": 0.4713,
"step": 516
},
{
"epoch": 0.054422484802231635,
"grad_norm": 0.4592074751853943,
"learning_rate": 4.729886267902275e-05,
"loss": 0.4902,
"step": 517
},
{
"epoch": 0.054527750730282376,
"grad_norm": 0.49476075172424316,
"learning_rate": 4.7293597304128056e-05,
"loss": 0.4016,
"step": 518
},
{
"epoch": 0.05463301665833312,
"grad_norm": 0.4191977381706238,
"learning_rate": 4.7288331929233365e-05,
"loss": 0.4672,
"step": 519
},
{
"epoch": 0.05473828258638385,
"grad_norm": 0.5030830502510071,
"learning_rate": 4.7283066554338674e-05,
"loss": 0.4905,
"step": 520
},
{
"epoch": 0.05484354851443459,
"grad_norm": 0.4686654210090637,
"learning_rate": 4.727780117944398e-05,
"loss": 0.4441,
"step": 521
},
{
"epoch": 0.05494881444248533,
"grad_norm": 0.46608471870422363,
"learning_rate": 4.7272535804549286e-05,
"loss": 0.3742,
"step": 522
},
{
"epoch": 0.055054080370536065,
"grad_norm": 0.5822672247886658,
"learning_rate": 4.7267270429654595e-05,
"loss": 0.4266,
"step": 523
},
{
"epoch": 0.055159346298586806,
"grad_norm": 0.4522544741630554,
"learning_rate": 4.7262005054759904e-05,
"loss": 0.4532,
"step": 524
},
{
"epoch": 0.05526461222663754,
"grad_norm": 0.47990643978118896,
"learning_rate": 4.7256739679865206e-05,
"loss": 0.3853,
"step": 525
},
{
"epoch": 0.05536987815468828,
"grad_norm": 0.5252960920333862,
"learning_rate": 4.7251474304970515e-05,
"loss": 0.4716,
"step": 526
},
{
"epoch": 0.05547514408273902,
"grad_norm": 0.45028603076934814,
"learning_rate": 4.7246208930075824e-05,
"loss": 0.4579,
"step": 527
},
{
"epoch": 0.055580410010789755,
"grad_norm": 0.5253304243087769,
"learning_rate": 4.724094355518113e-05,
"loss": 0.4433,
"step": 528
},
{
"epoch": 0.055685675938840495,
"grad_norm": 0.48800671100616455,
"learning_rate": 4.723567818028644e-05,
"loss": 0.4228,
"step": 529
},
{
"epoch": 0.055790941866891236,
"grad_norm": 0.5435435771942139,
"learning_rate": 4.7230412805391744e-05,
"loss": 0.4181,
"step": 530
},
{
"epoch": 0.05589620779494197,
"grad_norm": 0.5906736254692078,
"learning_rate": 4.722514743049705e-05,
"loss": 0.4003,
"step": 531
},
{
"epoch": 0.05600147372299271,
"grad_norm": 0.49869149923324585,
"learning_rate": 4.721988205560236e-05,
"loss": 0.4781,
"step": 532
},
{
"epoch": 0.05610673965104345,
"grad_norm": 0.4748145341873169,
"learning_rate": 4.721461668070767e-05,
"loss": 0.4291,
"step": 533
},
{
"epoch": 0.056212005579094185,
"grad_norm": 0.471021831035614,
"learning_rate": 4.7209351305812973e-05,
"loss": 0.4683,
"step": 534
},
{
"epoch": 0.056317271507144925,
"grad_norm": 0.6247691512107849,
"learning_rate": 4.720408593091828e-05,
"loss": 0.3932,
"step": 535
},
{
"epoch": 0.056422537435195666,
"grad_norm": 0.6917199492454529,
"learning_rate": 4.719882055602359e-05,
"loss": 0.597,
"step": 536
},
{
"epoch": 0.0565278033632464,
"grad_norm": 0.607105553150177,
"learning_rate": 4.71935551811289e-05,
"loss": 0.5024,
"step": 537
},
{
"epoch": 0.05663306929129714,
"grad_norm": 0.6015260815620422,
"learning_rate": 4.71882898062342e-05,
"loss": 0.4569,
"step": 538
},
{
"epoch": 0.05673833521934788,
"grad_norm": 0.6226845979690552,
"learning_rate": 4.718302443133951e-05,
"loss": 0.4134,
"step": 539
},
{
"epoch": 0.056843601147398615,
"grad_norm": 0.46711722016334534,
"learning_rate": 4.717775905644482e-05,
"loss": 0.4957,
"step": 540
},
{
"epoch": 0.056948867075449355,
"grad_norm": 0.4069374203681946,
"learning_rate": 4.717249368155013e-05,
"loss": 0.4173,
"step": 541
},
{
"epoch": 0.05705413300350009,
"grad_norm": 0.47599026560783386,
"learning_rate": 4.716722830665544e-05,
"loss": 0.4865,
"step": 542
},
{
"epoch": 0.05715939893155083,
"grad_norm": 0.46828117966651917,
"learning_rate": 4.716196293176075e-05,
"loss": 0.4763,
"step": 543
},
{
"epoch": 0.05726466485960157,
"grad_norm": 0.3772525191307068,
"learning_rate": 4.715669755686605e-05,
"loss": 0.4225,
"step": 544
},
{
"epoch": 0.057369930787652304,
"grad_norm": 0.44674021005630493,
"learning_rate": 4.715143218197136e-05,
"loss": 0.5063,
"step": 545
},
{
"epoch": 0.057475196715703045,
"grad_norm": 0.5613642334938049,
"learning_rate": 4.714616680707666e-05,
"loss": 0.5388,
"step": 546
},
{
"epoch": 0.057580462643753785,
"grad_norm": 0.5140121579170227,
"learning_rate": 4.714090143218197e-05,
"loss": 0.4481,
"step": 547
},
{
"epoch": 0.05768572857180452,
"grad_norm": 0.4728577435016632,
"learning_rate": 4.713563605728728e-05,
"loss": 0.3896,
"step": 548
},
{
"epoch": 0.05779099449985526,
"grad_norm": 0.4167439639568329,
"learning_rate": 4.713037068239259e-05,
"loss": 0.3863,
"step": 549
},
{
"epoch": 0.057896260427906,
"grad_norm": 0.5620428919792175,
"learning_rate": 4.71251053074979e-05,
"loss": 0.4342,
"step": 550
},
{
"epoch": 0.058001526355956734,
"grad_norm": 0.424396812915802,
"learning_rate": 4.7119839932603206e-05,
"loss": 0.5043,
"step": 551
},
{
"epoch": 0.058106792284007475,
"grad_norm": 0.4943045675754547,
"learning_rate": 4.7114574557708515e-05,
"loss": 0.3649,
"step": 552
},
{
"epoch": 0.058212058212058215,
"grad_norm": 0.5179657340049744,
"learning_rate": 4.7109309182813824e-05,
"loss": 0.3986,
"step": 553
},
{
"epoch": 0.05831732414010895,
"grad_norm": 0.46122902631759644,
"learning_rate": 4.710404380791913e-05,
"loss": 0.4501,
"step": 554
},
{
"epoch": 0.05842259006815969,
"grad_norm": 0.5129498243331909,
"learning_rate": 4.709877843302443e-05,
"loss": 0.4105,
"step": 555
},
{
"epoch": 0.05852785599621042,
"grad_norm": 0.5061764121055603,
"learning_rate": 4.709351305812974e-05,
"loss": 0.3993,
"step": 556
},
{
"epoch": 0.058633121924261164,
"grad_norm": 0.5676811933517456,
"learning_rate": 4.708824768323505e-05,
"loss": 0.3786,
"step": 557
},
{
"epoch": 0.058738387852311905,
"grad_norm": 0.5383573174476624,
"learning_rate": 4.7082982308340356e-05,
"loss": 0.4541,
"step": 558
},
{
"epoch": 0.05884365378036264,
"grad_norm": 0.6130087375640869,
"learning_rate": 4.7077716933445665e-05,
"loss": 0.4215,
"step": 559
},
{
"epoch": 0.05894891970841338,
"grad_norm": 0.6504372954368591,
"learning_rate": 4.7072451558550974e-05,
"loss": 0.3891,
"step": 560
},
{
"epoch": 0.05905418563646412,
"grad_norm": 0.5079691410064697,
"learning_rate": 4.706718618365628e-05,
"loss": 0.4668,
"step": 561
},
{
"epoch": 0.05915945156451485,
"grad_norm": 0.528856635093689,
"learning_rate": 4.7061920808761585e-05,
"loss": 0.3965,
"step": 562
},
{
"epoch": 0.059264717492565594,
"grad_norm": 0.44504040479660034,
"learning_rate": 4.7056655433866894e-05,
"loss": 0.5032,
"step": 563
},
{
"epoch": 0.059369983420616335,
"grad_norm": 0.5209716558456421,
"learning_rate": 4.70513900589722e-05,
"loss": 0.4837,
"step": 564
},
{
"epoch": 0.05947524934866707,
"grad_norm": 0.48046526312828064,
"learning_rate": 4.7046124684077505e-05,
"loss": 0.3989,
"step": 565
},
{
"epoch": 0.05958051527671781,
"grad_norm": 0.5712192058563232,
"learning_rate": 4.7040859309182814e-05,
"loss": 0.4788,
"step": 566
},
{
"epoch": 0.05968578120476855,
"grad_norm": 0.6029406785964966,
"learning_rate": 4.7035593934288123e-05,
"loss": 0.3974,
"step": 567
},
{
"epoch": 0.05979104713281928,
"grad_norm": 0.5272865295410156,
"learning_rate": 4.703032855939343e-05,
"loss": 0.4562,
"step": 568
},
{
"epoch": 0.059896313060870024,
"grad_norm": 0.5821331143379211,
"learning_rate": 4.702506318449874e-05,
"loss": 0.3848,
"step": 569
},
{
"epoch": 0.06000157898892076,
"grad_norm": 0.45264291763305664,
"learning_rate": 4.7019797809604044e-05,
"loss": 0.491,
"step": 570
},
{
"epoch": 0.0601068449169715,
"grad_norm": 0.5712417364120483,
"learning_rate": 4.701453243470935e-05,
"loss": 0.4128,
"step": 571
},
{
"epoch": 0.06021211084502224,
"grad_norm": 0.5191047787666321,
"learning_rate": 4.700926705981466e-05,
"loss": 0.4552,
"step": 572
},
{
"epoch": 0.06031737677307297,
"grad_norm": 0.4191204607486725,
"learning_rate": 4.700400168491997e-05,
"loss": 0.4669,
"step": 573
},
{
"epoch": 0.06042264270112371,
"grad_norm": 0.508425235748291,
"learning_rate": 4.699873631002528e-05,
"loss": 0.4031,
"step": 574
},
{
"epoch": 0.060527908629174454,
"grad_norm": 0.47075721621513367,
"learning_rate": 4.699347093513058e-05,
"loss": 0.4773,
"step": 575
},
{
"epoch": 0.06063317455722519,
"grad_norm": 0.5133448839187622,
"learning_rate": 4.698820556023589e-05,
"loss": 0.3865,
"step": 576
},
{
"epoch": 0.06073844048527593,
"grad_norm": 0.5425415635108948,
"learning_rate": 4.69829401853412e-05,
"loss": 0.4117,
"step": 577
},
{
"epoch": 0.06084370641332667,
"grad_norm": 0.61476731300354,
"learning_rate": 4.69776748104465e-05,
"loss": 0.4307,
"step": 578
},
{
"epoch": 0.0609489723413774,
"grad_norm": 0.553023099899292,
"learning_rate": 4.697240943555181e-05,
"loss": 0.3579,
"step": 579
},
{
"epoch": 0.06105423826942814,
"grad_norm": 0.4436430037021637,
"learning_rate": 4.696714406065712e-05,
"loss": 0.4099,
"step": 580
},
{
"epoch": 0.061159504197478884,
"grad_norm": 0.5598846673965454,
"learning_rate": 4.696187868576243e-05,
"loss": 0.3615,
"step": 581
},
{
"epoch": 0.06126477012552962,
"grad_norm": 0.6036468744277954,
"learning_rate": 4.695661331086774e-05,
"loss": 0.4438,
"step": 582
},
{
"epoch": 0.06137003605358036,
"grad_norm": 0.6011479496955872,
"learning_rate": 4.695134793597305e-05,
"loss": 0.4288,
"step": 583
},
{
"epoch": 0.0614753019816311,
"grad_norm": 0.5292397141456604,
"learning_rate": 4.694608256107835e-05,
"loss": 0.4086,
"step": 584
},
{
"epoch": 0.06158056790968183,
"grad_norm": 0.5526982545852661,
"learning_rate": 4.694081718618366e-05,
"loss": 0.3941,
"step": 585
},
{
"epoch": 0.06168583383773257,
"grad_norm": 0.5088376402854919,
"learning_rate": 4.693555181128896e-05,
"loss": 0.4356,
"step": 586
},
{
"epoch": 0.06179109976578331,
"grad_norm": 0.5751054286956787,
"learning_rate": 4.693028643639427e-05,
"loss": 0.4629,
"step": 587
},
{
"epoch": 0.06189636569383405,
"grad_norm": 0.47562679648399353,
"learning_rate": 4.692502106149958e-05,
"loss": 0.4875,
"step": 588
},
{
"epoch": 0.06200163162188479,
"grad_norm": 0.406876802444458,
"learning_rate": 4.691975568660489e-05,
"loss": 0.5099,
"step": 589
},
{
"epoch": 0.06210689754993552,
"grad_norm": 0.43212106823921204,
"learning_rate": 4.69144903117102e-05,
"loss": 0.5271,
"step": 590
},
{
"epoch": 0.06221216347798626,
"grad_norm": 0.5265733003616333,
"learning_rate": 4.6909224936815506e-05,
"loss": 0.4456,
"step": 591
},
{
"epoch": 0.062317429406037,
"grad_norm": 0.37871816754341125,
"learning_rate": 4.6903959561920815e-05,
"loss": 0.3964,
"step": 592
},
{
"epoch": 0.06242269533408774,
"grad_norm": 0.443781316280365,
"learning_rate": 4.6898694187026124e-05,
"loss": 0.4575,
"step": 593
},
{
"epoch": 0.06252796126213847,
"grad_norm": 0.5184212923049927,
"learning_rate": 4.6893428812131426e-05,
"loss": 0.4791,
"step": 594
},
{
"epoch": 0.06263322719018921,
"grad_norm": 0.4982917308807373,
"learning_rate": 4.688816343723673e-05,
"loss": 0.4104,
"step": 595
},
{
"epoch": 0.06273849311823995,
"grad_norm": 0.43113309144973755,
"learning_rate": 4.688289806234204e-05,
"loss": 0.4384,
"step": 596
},
{
"epoch": 0.06284375904629069,
"grad_norm": 0.5594951510429382,
"learning_rate": 4.6877632687447346e-05,
"loss": 0.4428,
"step": 597
},
{
"epoch": 0.06294902497434143,
"grad_norm": 0.408655047416687,
"learning_rate": 4.6872367312552655e-05,
"loss": 0.4328,
"step": 598
},
{
"epoch": 0.06305429090239217,
"grad_norm": 0.41858869791030884,
"learning_rate": 4.6867101937657964e-05,
"loss": 0.4822,
"step": 599
},
{
"epoch": 0.0631595568304429,
"grad_norm": 0.5304632186889648,
"learning_rate": 4.6861836562763274e-05,
"loss": 0.4376,
"step": 600
},
{
"epoch": 0.06326482275849364,
"grad_norm": 0.4693495035171509,
"learning_rate": 4.685657118786858e-05,
"loss": 0.3905,
"step": 601
},
{
"epoch": 0.06337008868654438,
"grad_norm": 0.5536295771598816,
"learning_rate": 4.6851305812973885e-05,
"loss": 0.4378,
"step": 602
},
{
"epoch": 0.06347535461459512,
"grad_norm": 0.4618769884109497,
"learning_rate": 4.6846040438079194e-05,
"loss": 0.4642,
"step": 603
},
{
"epoch": 0.06358062054264586,
"grad_norm": 0.463776171207428,
"learning_rate": 4.68407750631845e-05,
"loss": 0.4518,
"step": 604
},
{
"epoch": 0.0636858864706966,
"grad_norm": 0.5297257900238037,
"learning_rate": 4.6835509688289805e-05,
"loss": 0.3222,
"step": 605
},
{
"epoch": 0.06379115239874733,
"grad_norm": 0.47493240237236023,
"learning_rate": 4.6830244313395114e-05,
"loss": 0.4,
"step": 606
},
{
"epoch": 0.06389641832679807,
"grad_norm": 0.6347471475601196,
"learning_rate": 4.682497893850042e-05,
"loss": 0.4315,
"step": 607
},
{
"epoch": 0.06400168425484881,
"grad_norm": 0.5118055939674377,
"learning_rate": 4.681971356360573e-05,
"loss": 0.4136,
"step": 608
},
{
"epoch": 0.06410695018289955,
"grad_norm": 0.5062241554260254,
"learning_rate": 4.681444818871104e-05,
"loss": 0.51,
"step": 609
},
{
"epoch": 0.0642122161109503,
"grad_norm": 0.45359355211257935,
"learning_rate": 4.680918281381634e-05,
"loss": 0.3897,
"step": 610
},
{
"epoch": 0.06431748203900102,
"grad_norm": 0.4978649914264679,
"learning_rate": 4.680391743892165e-05,
"loss": 0.4234,
"step": 611
},
{
"epoch": 0.06442274796705176,
"grad_norm": 0.5025052428245544,
"learning_rate": 4.679865206402696e-05,
"loss": 0.4344,
"step": 612
},
{
"epoch": 0.0645280138951025,
"grad_norm": 0.4677049517631531,
"learning_rate": 4.679338668913227e-05,
"loss": 0.3997,
"step": 613
},
{
"epoch": 0.06463327982315324,
"grad_norm": 0.38490286469459534,
"learning_rate": 4.678812131423758e-05,
"loss": 0.4778,
"step": 614
},
{
"epoch": 0.06473854575120398,
"grad_norm": 0.4486238956451416,
"learning_rate": 4.678285593934288e-05,
"loss": 0.4002,
"step": 615
},
{
"epoch": 0.06484381167925472,
"grad_norm": 0.48641228675842285,
"learning_rate": 4.677759056444819e-05,
"loss": 0.4302,
"step": 616
},
{
"epoch": 0.06494907760730545,
"grad_norm": 0.5490376353263855,
"learning_rate": 4.67723251895535e-05,
"loss": 0.4203,
"step": 617
},
{
"epoch": 0.06505434353535619,
"grad_norm": 0.4899100363254547,
"learning_rate": 4.67670598146588e-05,
"loss": 0.399,
"step": 618
},
{
"epoch": 0.06515960946340693,
"grad_norm": 0.7570556998252869,
"learning_rate": 4.676179443976411e-05,
"loss": 0.4409,
"step": 619
},
{
"epoch": 0.06526487539145767,
"grad_norm": 0.5624217391014099,
"learning_rate": 4.675652906486942e-05,
"loss": 0.3867,
"step": 620
},
{
"epoch": 0.06537014131950841,
"grad_norm": 0.47434237599372864,
"learning_rate": 4.675126368997473e-05,
"loss": 0.3962,
"step": 621
},
{
"epoch": 0.06547540724755915,
"grad_norm": 0.5388314723968506,
"learning_rate": 4.674599831508004e-05,
"loss": 0.3872,
"step": 622
},
{
"epoch": 0.06558067317560988,
"grad_norm": 0.49027901887893677,
"learning_rate": 4.674073294018535e-05,
"loss": 0.4786,
"step": 623
},
{
"epoch": 0.06568593910366062,
"grad_norm": 0.4333001673221588,
"learning_rate": 4.6735467565290656e-05,
"loss": 0.4245,
"step": 624
},
{
"epoch": 0.06579120503171136,
"grad_norm": 0.4188300669193268,
"learning_rate": 4.673020219039596e-05,
"loss": 0.4713,
"step": 625
},
{
"epoch": 0.0658964709597621,
"grad_norm": 0.48492878675460815,
"learning_rate": 4.672493681550126e-05,
"loss": 0.4896,
"step": 626
},
{
"epoch": 0.06600173688781284,
"grad_norm": 0.5120576024055481,
"learning_rate": 4.671967144060657e-05,
"loss": 0.4209,
"step": 627
},
{
"epoch": 0.06610700281586357,
"grad_norm": 0.5438317060470581,
"learning_rate": 4.671440606571188e-05,
"loss": 0.4494,
"step": 628
},
{
"epoch": 0.06621226874391431,
"grad_norm": 0.5266952514648438,
"learning_rate": 4.670914069081719e-05,
"loss": 0.5609,
"step": 629
},
{
"epoch": 0.06631753467196505,
"grad_norm": 0.6691259741783142,
"learning_rate": 4.6703875315922496e-05,
"loss": 0.405,
"step": 630
},
{
"epoch": 0.06642280060001579,
"grad_norm": 0.6721771955490112,
"learning_rate": 4.6698609941027806e-05,
"loss": 0.537,
"step": 631
},
{
"epoch": 0.06652806652806653,
"grad_norm": 0.6021822690963745,
"learning_rate": 4.6693344566133115e-05,
"loss": 0.4862,
"step": 632
},
{
"epoch": 0.06663333245611727,
"grad_norm": 0.42799803614616394,
"learning_rate": 4.6688079191238424e-05,
"loss": 0.4316,
"step": 633
},
{
"epoch": 0.066738598384168,
"grad_norm": 0.3875657320022583,
"learning_rate": 4.6682813816343726e-05,
"loss": 0.4557,
"step": 634
},
{
"epoch": 0.06684386431221874,
"grad_norm": 0.4300662577152252,
"learning_rate": 4.6677548441449035e-05,
"loss": 0.5253,
"step": 635
},
{
"epoch": 0.06694913024026948,
"grad_norm": 0.4926076829433441,
"learning_rate": 4.667228306655434e-05,
"loss": 0.5151,
"step": 636
},
{
"epoch": 0.06705439616832022,
"grad_norm": 0.457466185092926,
"learning_rate": 4.6667017691659646e-05,
"loss": 0.4296,
"step": 637
},
{
"epoch": 0.06715966209637096,
"grad_norm": 0.5367447137832642,
"learning_rate": 4.6661752316764955e-05,
"loss": 0.43,
"step": 638
},
{
"epoch": 0.06726492802442169,
"grad_norm": 0.5215645432472229,
"learning_rate": 4.6656486941870264e-05,
"loss": 0.4355,
"step": 639
},
{
"epoch": 0.06737019395247243,
"grad_norm": 0.5821287035942078,
"learning_rate": 4.665122156697557e-05,
"loss": 0.3576,
"step": 640
},
{
"epoch": 0.06747545988052317,
"grad_norm": 0.5504344701766968,
"learning_rate": 4.664595619208088e-05,
"loss": 0.4843,
"step": 641
},
{
"epoch": 0.06758072580857391,
"grad_norm": 0.4482622742652893,
"learning_rate": 4.6640690817186184e-05,
"loss": 0.4474,
"step": 642
},
{
"epoch": 0.06768599173662465,
"grad_norm": 0.5162287950515747,
"learning_rate": 4.663542544229149e-05,
"loss": 0.5323,
"step": 643
},
{
"epoch": 0.06779125766467539,
"grad_norm": 0.5771566033363342,
"learning_rate": 4.66301600673968e-05,
"loss": 0.3508,
"step": 644
},
{
"epoch": 0.06789652359272612,
"grad_norm": 0.473014235496521,
"learning_rate": 4.6624894692502105e-05,
"loss": 0.3959,
"step": 645
},
{
"epoch": 0.06800178952077686,
"grad_norm": 0.4953562915325165,
"learning_rate": 4.6619629317607414e-05,
"loss": 0.4301,
"step": 646
},
{
"epoch": 0.0681070554488276,
"grad_norm": 0.519964337348938,
"learning_rate": 4.661436394271272e-05,
"loss": 0.4395,
"step": 647
},
{
"epoch": 0.06821232137687834,
"grad_norm": 0.5988878607749939,
"learning_rate": 4.660909856781803e-05,
"loss": 0.4151,
"step": 648
},
{
"epoch": 0.06831758730492908,
"grad_norm": 0.5311563014984131,
"learning_rate": 4.660383319292334e-05,
"loss": 0.431,
"step": 649
},
{
"epoch": 0.06842285323297982,
"grad_norm": 0.48196783661842346,
"learning_rate": 4.659856781802864e-05,
"loss": 0.4645,
"step": 650
}
],
"logging_steps": 1,
"max_steps": 9499,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.570341741428736e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}