{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.987886944818304,
"eval_steps": 500,
"global_step": 555,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005383580080753701,
"grad_norm": 9.846100807189941,
"learning_rate": 5.882352941176471e-07,
"loss": 1.3462,
"step": 1
},
{
"epoch": 0.010767160161507403,
"grad_norm": 10.834526062011719,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.4331,
"step": 2
},
{
"epoch": 0.016150740242261104,
"grad_norm": 8.454448699951172,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.2743,
"step": 3
},
{
"epoch": 0.021534320323014805,
"grad_norm": 9.057403564453125,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.2552,
"step": 4
},
{
"epoch": 0.026917900403768506,
"grad_norm": 7.3954267501831055,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.2539,
"step": 5
},
{
"epoch": 0.03230148048452221,
"grad_norm": 4.456072807312012,
"learning_rate": 3.529411764705883e-06,
"loss": 0.9338,
"step": 6
},
{
"epoch": 0.03768506056527591,
"grad_norm": 1.6015737056732178,
"learning_rate": 4.11764705882353e-06,
"loss": 0.7819,
"step": 7
},
{
"epoch": 0.04306864064602961,
"grad_norm": 1.523898959159851,
"learning_rate": 4.705882352941177e-06,
"loss": 0.6926,
"step": 8
},
{
"epoch": 0.04845222072678331,
"grad_norm": 1.4889625310897827,
"learning_rate": 5.294117647058824e-06,
"loss": 0.6867,
"step": 9
},
{
"epoch": 0.05383580080753701,
"grad_norm": 1.1910319328308105,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5507,
"step": 10
},
{
"epoch": 0.059219380888290714,
"grad_norm": 1.136652946472168,
"learning_rate": 6.470588235294119e-06,
"loss": 0.6116,
"step": 11
},
{
"epoch": 0.06460296096904442,
"grad_norm": 1.6838195323944092,
"learning_rate": 7.058823529411766e-06,
"loss": 0.656,
"step": 12
},
{
"epoch": 0.06998654104979811,
"grad_norm": 12.243623733520508,
"learning_rate": 7.647058823529411e-06,
"loss": 0.6089,
"step": 13
},
{
"epoch": 0.07537012113055182,
"grad_norm": 2.4841673374176025,
"learning_rate": 8.23529411764706e-06,
"loss": 0.5457,
"step": 14
},
{
"epoch": 0.08075370121130551,
"grad_norm": 1.2509280443191528,
"learning_rate": 8.823529411764707e-06,
"loss": 0.6457,
"step": 15
},
{
"epoch": 0.08613728129205922,
"grad_norm": 1.0484827756881714,
"learning_rate": 9.411764705882354e-06,
"loss": 0.5583,
"step": 16
},
{
"epoch": 0.09152086137281291,
"grad_norm": 0.8880680203437805,
"learning_rate": 1e-05,
"loss": 0.5628,
"step": 17
},
{
"epoch": 0.09690444145356662,
"grad_norm": 0.9158921241760254,
"learning_rate": 9.999914754008063e-06,
"loss": 0.5625,
"step": 18
},
{
"epoch": 0.10228802153432032,
"grad_norm": 0.9270734786987305,
"learning_rate": 9.999659018938999e-06,
"loss": 0.5319,
"step": 19
},
{
"epoch": 0.10767160161507403,
"grad_norm": 0.9035325050354004,
"learning_rate": 9.999232803512967e-06,
"loss": 0.518,
"step": 20
},
{
"epoch": 0.11305518169582772,
"grad_norm": 1.2811542749404907,
"learning_rate": 9.998636122263227e-06,
"loss": 0.5504,
"step": 21
},
{
"epoch": 0.11843876177658143,
"grad_norm": 0.8128458261489868,
"learning_rate": 9.997868995535658e-06,
"loss": 0.5344,
"step": 22
},
{
"epoch": 0.12382234185733512,
"grad_norm": 0.8413318991661072,
"learning_rate": 9.996931449488046e-06,
"loss": 0.5376,
"step": 23
},
{
"epoch": 0.12920592193808883,
"grad_norm": 0.8115915656089783,
"learning_rate": 9.99582351608921e-06,
"loss": 0.5844,
"step": 24
},
{
"epoch": 0.13458950201884254,
"grad_norm": 0.8173759579658508,
"learning_rate": 9.994545233117904e-06,
"loss": 0.5126,
"step": 25
},
{
"epoch": 0.13997308209959622,
"grad_norm": 0.7367566823959351,
"learning_rate": 9.993096644161526e-06,
"loss": 0.5311,
"step": 26
},
{
"epoch": 0.14535666218034993,
"grad_norm": 0.7710299491882324,
"learning_rate": 9.991477798614638e-06,
"loss": 0.5286,
"step": 27
},
{
"epoch": 0.15074024226110364,
"grad_norm": 0.7534223794937134,
"learning_rate": 9.989688751677277e-06,
"loss": 0.5462,
"step": 28
},
{
"epoch": 0.15612382234185734,
"grad_norm": 0.7281956672668457,
"learning_rate": 9.987729564353077e-06,
"loss": 0.5298,
"step": 29
},
{
"epoch": 0.16150740242261102,
"grad_norm": 0.6779235601425171,
"learning_rate": 9.985600303447185e-06,
"loss": 0.4758,
"step": 30
},
{
"epoch": 0.16689098250336473,
"grad_norm": 0.7668159008026123,
"learning_rate": 9.98330104156398e-06,
"loss": 0.5493,
"step": 31
},
{
"epoch": 0.17227456258411844,
"grad_norm": 0.7769574522972107,
"learning_rate": 9.980831857104612e-06,
"loss": 0.5033,
"step": 32
},
{
"epoch": 0.17765814266487215,
"grad_norm": 0.7682322263717651,
"learning_rate": 9.978192834264307e-06,
"loss": 0.4927,
"step": 33
},
{
"epoch": 0.18304172274562583,
"grad_norm": 0.7225139737129211,
"learning_rate": 9.975384063029516e-06,
"loss": 0.4922,
"step": 34
},
{
"epoch": 0.18842530282637954,
"grad_norm": 0.7247219681739807,
"learning_rate": 9.972405639174833e-06,
"loss": 0.5248,
"step": 35
},
{
"epoch": 0.19380888290713325,
"grad_norm": 0.7795732617378235,
"learning_rate": 9.96925766425974e-06,
"loss": 0.5207,
"step": 36
},
{
"epoch": 0.19919246298788695,
"grad_norm": 0.6990232467651367,
"learning_rate": 9.965940245625131e-06,
"loss": 0.5078,
"step": 37
},
{
"epoch": 0.20457604306864063,
"grad_norm": 0.7676703929901123,
"learning_rate": 9.962453496389665e-06,
"loss": 0.4908,
"step": 38
},
{
"epoch": 0.20995962314939434,
"grad_norm": 0.7075534462928772,
"learning_rate": 9.958797535445898e-06,
"loss": 0.5156,
"step": 39
},
{
"epoch": 0.21534320323014805,
"grad_norm": 0.7213850021362305,
"learning_rate": 9.95497248745624e-06,
"loss": 0.5212,
"step": 40
},
{
"epoch": 0.22072678331090176,
"grad_norm": 0.703669011592865,
"learning_rate": 9.950978482848694e-06,
"loss": 0.5124,
"step": 41
},
{
"epoch": 0.22611036339165544,
"grad_norm": 0.7526930570602417,
"learning_rate": 9.946815657812416e-06,
"loss": 0.537,
"step": 42
},
{
"epoch": 0.23149394347240915,
"grad_norm": 0.7019714117050171,
"learning_rate": 9.94248415429306e-06,
"loss": 0.5013,
"step": 43
},
{
"epoch": 0.23687752355316286,
"grad_norm": 0.753139078617096,
"learning_rate": 9.937984119987958e-06,
"loss": 0.5205,
"step": 44
},
{
"epoch": 0.24226110363391656,
"grad_norm": 0.7210888862609863,
"learning_rate": 9.93331570834106e-06,
"loss": 0.4658,
"step": 45
},
{
"epoch": 0.24764468371467024,
"grad_norm": 0.6602186560630798,
"learning_rate": 9.928479078537722e-06,
"loss": 0.4819,
"step": 46
},
{
"epoch": 0.253028263795424,
"grad_norm": 0.7594957947731018,
"learning_rate": 9.923474395499266e-06,
"loss": 0.5389,
"step": 47
},
{
"epoch": 0.25841184387617766,
"grad_norm": 0.7201584577560425,
"learning_rate": 9.91830182987736e-06,
"loss": 0.5362,
"step": 48
},
{
"epoch": 0.26379542395693134,
"grad_norm": 0.8874572515487671,
"learning_rate": 9.912961558048196e-06,
"loss": 0.5384,
"step": 49
},
{
"epoch": 0.2691790040376851,
"grad_norm": 0.6909434199333191,
"learning_rate": 9.907453762106484e-06,
"loss": 0.5042,
"step": 50
},
{
"epoch": 0.27456258411843876,
"grad_norm": 0.6489965319633484,
"learning_rate": 9.901778629859236e-06,
"loss": 0.4282,
"step": 51
},
{
"epoch": 0.27994616419919244,
"grad_norm": 0.6962871551513672,
"learning_rate": 9.895936354819362e-06,
"loss": 0.549,
"step": 52
},
{
"epoch": 0.2853297442799462,
"grad_norm": 0.6420189738273621,
"learning_rate": 9.889927136199075e-06,
"loss": 0.5255,
"step": 53
},
{
"epoch": 0.29071332436069985,
"grad_norm": 0.6697545647621155,
"learning_rate": 9.883751178903095e-06,
"loss": 0.5122,
"step": 54
},
{
"epoch": 0.2960969044414536,
"grad_norm": 0.6961387395858765,
"learning_rate": 9.877408693521664e-06,
"loss": 0.5277,
"step": 55
},
{
"epoch": 0.30148048452220727,
"grad_norm": 0.721615195274353,
"learning_rate": 9.870899896323368e-06,
"loss": 0.5309,
"step": 56
},
{
"epoch": 0.30686406460296095,
"grad_norm": 0.7135268449783325,
"learning_rate": 9.864225009247753e-06,
"loss": 0.5451,
"step": 57
},
{
"epoch": 0.3122476446837147,
"grad_norm": 0.6227580904960632,
"learning_rate": 9.857384259897768e-06,
"loss": 0.4653,
"step": 58
},
{
"epoch": 0.31763122476446837,
"grad_norm": 0.6683838963508606,
"learning_rate": 9.850377881532e-06,
"loss": 0.549,
"step": 59
},
{
"epoch": 0.32301480484522205,
"grad_norm": 0.6848832964897156,
"learning_rate": 9.843206113056715e-06,
"loss": 0.4432,
"step": 60
},
{
"epoch": 0.3283983849259758,
"grad_norm": 0.6483569145202637,
"learning_rate": 9.835869199017725e-06,
"loss": 0.467,
"step": 61
},
{
"epoch": 0.33378196500672946,
"grad_norm": 0.7005964517593384,
"learning_rate": 9.828367389592034e-06,
"loss": 0.5185,
"step": 62
},
{
"epoch": 0.3391655450874832,
"grad_norm": 0.6160753965377808,
"learning_rate": 9.820700940579312e-06,
"loss": 0.4116,
"step": 63
},
{
"epoch": 0.3445491251682369,
"grad_norm": 0.6587129235267639,
"learning_rate": 9.812870113393185e-06,
"loss": 0.5197,
"step": 64
},
{
"epoch": 0.34993270524899056,
"grad_norm": 1.4088473320007324,
"learning_rate": 9.804875175052304e-06,
"loss": 0.4992,
"step": 65
},
{
"epoch": 0.3553162853297443,
"grad_norm": 0.7397728562355042,
"learning_rate": 9.796716398171248e-06,
"loss": 0.5006,
"step": 66
},
{
"epoch": 0.360699865410498,
"grad_norm": 0.6741731762886047,
"learning_rate": 9.788394060951228e-06,
"loss": 0.4474,
"step": 67
},
{
"epoch": 0.36608344549125166,
"grad_norm": 0.6397544741630554,
"learning_rate": 9.779908447170602e-06,
"loss": 0.4835,
"step": 68
},
{
"epoch": 0.3714670255720054,
"grad_norm": 0.6985306739807129,
"learning_rate": 9.771259846175195e-06,
"loss": 0.476,
"step": 69
},
{
"epoch": 0.3768506056527591,
"grad_norm": 0.64960116147995,
"learning_rate": 9.762448552868433e-06,
"loss": 0.4628,
"step": 70
},
{
"epoch": 0.3822341857335128,
"grad_norm": 0.6351596713066101,
"learning_rate": 9.753474867701294e-06,
"loss": 0.4925,
"step": 71
},
{
"epoch": 0.3876177658142665,
"grad_norm": 0.6702280640602112,
"learning_rate": 9.744339096662056e-06,
"loss": 0.482,
"step": 72
},
{
"epoch": 0.39300134589502017,
"grad_norm": 0.5831217169761658,
"learning_rate": 9.735041551265862e-06,
"loss": 0.4794,
"step": 73
},
{
"epoch": 0.3983849259757739,
"grad_norm": 0.6301687359809875,
"learning_rate": 9.725582548544106e-06,
"loss": 0.4483,
"step": 74
},
{
"epoch": 0.4037685060565276,
"grad_norm": 0.6406306624412537,
"learning_rate": 9.715962411033614e-06,
"loss": 0.4514,
"step": 75
},
{
"epoch": 0.40915208613728127,
"grad_norm": 0.6490384936332703,
"learning_rate": 9.706181466765654e-06,
"loss": 0.4615,
"step": 76
},
{
"epoch": 0.414535666218035,
"grad_norm": 0.6236180663108826,
"learning_rate": 9.696240049254744e-06,
"loss": 0.4375,
"step": 77
},
{
"epoch": 0.4199192462987887,
"grad_norm": 0.6604640483856201,
"learning_rate": 9.686138497487282e-06,
"loss": 0.3954,
"step": 78
},
{
"epoch": 0.4253028263795424,
"grad_norm": 0.6148284673690796,
"learning_rate": 9.675877155909989e-06,
"loss": 0.4611,
"step": 79
},
{
"epoch": 0.4306864064602961,
"grad_norm": 0.6233279705047607,
"learning_rate": 9.66545637441816e-06,
"loss": 0.4489,
"step": 80
},
{
"epoch": 0.4360699865410498,
"grad_norm": 0.6342514157295227,
"learning_rate": 9.654876508343739e-06,
"loss": 0.4852,
"step": 81
},
{
"epoch": 0.4414535666218035,
"grad_norm": 0.6237147450447083,
"learning_rate": 9.644137918443198e-06,
"loss": 0.4351,
"step": 82
},
{
"epoch": 0.4468371467025572,
"grad_norm": 0.6277084946632385,
"learning_rate": 9.633240970885231e-06,
"loss": 0.4747,
"step": 83
},
{
"epoch": 0.4522207267833109,
"grad_norm": 0.6557124257087708,
"learning_rate": 9.622186037238286e-06,
"loss": 0.475,
"step": 84
},
{
"epoch": 0.4576043068640646,
"grad_norm": 0.6205596923828125,
"learning_rate": 9.610973494457873e-06,
"loss": 0.4732,
"step": 85
},
{
"epoch": 0.4629878869448183,
"grad_norm": 0.6538224816322327,
"learning_rate": 9.599603724873725e-06,
"loss": 0.4817,
"step": 86
},
{
"epoch": 0.46837146702557203,
"grad_norm": 0.6803449392318726,
"learning_rate": 9.588077116176756e-06,
"loss": 0.5178,
"step": 87
},
{
"epoch": 0.4737550471063257,
"grad_norm": 0.6003801226615906,
"learning_rate": 9.576394061405847e-06,
"loss": 0.4771,
"step": 88
},
{
"epoch": 0.4791386271870794,
"grad_norm": 0.6364747881889343,
"learning_rate": 9.564554958934432e-06,
"loss": 0.5041,
"step": 89
},
{
"epoch": 0.4845222072678331,
"grad_norm": 0.6516885757446289,
"learning_rate": 9.55256021245692e-06,
"loss": 0.5322,
"step": 90
},
{
"epoch": 0.4899057873485868,
"grad_norm": 0.6385886073112488,
"learning_rate": 9.540410230974943e-06,
"loss": 0.4747,
"step": 91
},
{
"epoch": 0.4952893674293405,
"grad_norm": 0.6282247304916382,
"learning_rate": 9.52810542878339e-06,
"loss": 0.4859,
"step": 92
},
{
"epoch": 0.5006729475100942,
"grad_norm": 0.6206268668174744,
"learning_rate": 9.515646225456283e-06,
"loss": 0.4458,
"step": 93
},
{
"epoch": 0.506056527590848,
"grad_norm": 0.6404337882995605,
"learning_rate": 9.503033045832484e-06,
"loss": 0.5434,
"step": 94
},
{
"epoch": 0.5114401076716016,
"grad_norm": 0.5463396906852722,
"learning_rate": 9.490266320001195e-06,
"loss": 0.4286,
"step": 95
},
{
"epoch": 0.5168236877523553,
"grad_norm": 0.6801166534423828,
"learning_rate": 9.4773464832873e-06,
"loss": 0.4889,
"step": 96
},
{
"epoch": 0.522207267833109,
"grad_norm": 0.6011826992034912,
"learning_rate": 9.464273976236518e-06,
"loss": 0.5188,
"step": 97
},
{
"epoch": 0.5275908479138627,
"grad_norm": 0.5876375436782837,
"learning_rate": 9.451049244600381e-06,
"loss": 0.4622,
"step": 98
},
{
"epoch": 0.5329744279946165,
"grad_norm": 0.582517147064209,
"learning_rate": 9.437672739321034e-06,
"loss": 0.4523,
"step": 99
},
{
"epoch": 0.5383580080753702,
"grad_norm": 0.5836907029151917,
"learning_rate": 9.424144916515863e-06,
"loss": 0.498,
"step": 100
},
{
"epoch": 0.5437415881561238,
"grad_norm": 0.5619045495986938,
"learning_rate": 9.410466237461937e-06,
"loss": 0.4475,
"step": 101
},
{
"epoch": 0.5491251682368775,
"grad_norm": 0.6337983012199402,
"learning_rate": 9.396637168580282e-06,
"loss": 0.4562,
"step": 102
},
{
"epoch": 0.5545087483176312,
"grad_norm": 0.6391755938529968,
"learning_rate": 9.382658181419977e-06,
"loss": 0.4738,
"step": 103
},
{
"epoch": 0.5598923283983849,
"grad_norm": 0.6735963821411133,
"learning_rate": 9.36852975264207e-06,
"loss": 0.4888,
"step": 104
},
{
"epoch": 0.5652759084791387,
"grad_norm": 0.5379722118377686,
"learning_rate": 9.354252364003334e-06,
"loss": 0.3988,
"step": 105
},
{
"epoch": 0.5706594885598923,
"grad_norm": 0.6036385893821716,
"learning_rate": 9.339826502339828e-06,
"loss": 0.5088,
"step": 106
},
{
"epoch": 0.576043068640646,
"grad_norm": 0.6139518022537231,
"learning_rate": 9.32525265955031e-06,
"loss": 0.4708,
"step": 107
},
{
"epoch": 0.5814266487213997,
"grad_norm": 0.5770635008811951,
"learning_rate": 9.310531332579453e-06,
"loss": 0.4981,
"step": 108
},
{
"epoch": 0.5868102288021534,
"grad_norm": 0.6464108228683472,
"learning_rate": 9.295663023400907e-06,
"loss": 0.5121,
"step": 109
},
{
"epoch": 0.5921938088829072,
"grad_norm": 0.6004317402839661,
"learning_rate": 9.280648239000174e-06,
"loss": 0.4751,
"step": 110
},
{
"epoch": 0.5975773889636609,
"grad_norm": 0.5914390683174133,
"learning_rate": 9.265487491357334e-06,
"loss": 0.4878,
"step": 111
},
{
"epoch": 0.6029609690444145,
"grad_norm": 0.6945117712020874,
"learning_rate": 9.250181297429573e-06,
"loss": 0.4927,
"step": 112
},
{
"epoch": 0.6083445491251682,
"grad_norm": 0.5963965058326721,
"learning_rate": 9.234730179133564e-06,
"loss": 0.4405,
"step": 113
},
{
"epoch": 0.6137281292059219,
"grad_norm": 0.6108909845352173,
"learning_rate": 9.219134663327672e-06,
"loss": 0.5301,
"step": 114
},
{
"epoch": 0.6191117092866757,
"grad_norm": 0.590741753578186,
"learning_rate": 9.203395281793979e-06,
"loss": 0.4701,
"step": 115
},
{
"epoch": 0.6244952893674294,
"grad_norm": 0.5966534614562988,
"learning_rate": 9.187512571220166e-06,
"loss": 0.4829,
"step": 116
},
{
"epoch": 0.629878869448183,
"grad_norm": 0.5713053941726685,
"learning_rate": 9.171487073181198e-06,
"loss": 0.4208,
"step": 117
},
{
"epoch": 0.6352624495289367,
"grad_norm": 0.6419247388839722,
"learning_rate": 9.155319334120864e-06,
"loss": 0.4565,
"step": 118
},
{
"epoch": 0.6406460296096904,
"grad_norm": 0.5234012007713318,
"learning_rate": 9.139009905333147e-06,
"loss": 0.3937,
"step": 119
},
{
"epoch": 0.6460296096904441,
"grad_norm": 0.5776930451393127,
"learning_rate": 9.122559342943423e-06,
"loss": 0.4677,
"step": 120
},
{
"epoch": 0.6514131897711979,
"grad_norm": 0.5588910579681396,
"learning_rate": 9.105968207889493e-06,
"loss": 0.4171,
"step": 121
},
{
"epoch": 0.6567967698519516,
"grad_norm": 0.5887078046798706,
"learning_rate": 9.089237065902464e-06,
"loss": 0.4209,
"step": 122
},
{
"epoch": 0.6621803499327052,
"grad_norm": 0.5707204937934875,
"learning_rate": 9.072366487487451e-06,
"loss": 0.4502,
"step": 123
},
{
"epoch": 0.6675639300134589,
"grad_norm": 0.5806924104690552,
"learning_rate": 9.055357047904133e-06,
"loss": 0.4428,
"step": 124
},
{
"epoch": 0.6729475100942126,
"grad_norm": 0.6028096079826355,
"learning_rate": 9.038209327147134e-06,
"loss": 0.4816,
"step": 125
},
{
"epoch": 0.6783310901749664,
"grad_norm": 0.592367947101593,
"learning_rate": 9.020923909926233e-06,
"loss": 0.49,
"step": 126
},
{
"epoch": 0.6837146702557201,
"grad_norm": 0.6010198593139648,
"learning_rate": 9.00350138564645e-06,
"loss": 0.4971,
"step": 127
},
{
"epoch": 0.6890982503364738,
"grad_norm": 0.5716829299926758,
"learning_rate": 8.985942348387926e-06,
"loss": 0.4828,
"step": 128
},
{
"epoch": 0.6944818304172274,
"grad_norm": 0.527796745300293,
"learning_rate": 8.968247396885685e-06,
"loss": 0.4113,
"step": 129
},
{
"epoch": 0.6998654104979811,
"grad_norm": 0.5992532968521118,
"learning_rate": 8.950417134509201e-06,
"loss": 0.4487,
"step": 130
},
{
"epoch": 0.7052489905787349,
"grad_norm": 0.5818247199058533,
"learning_rate": 8.932452169241838e-06,
"loss": 0.4804,
"step": 131
},
{
"epoch": 0.7106325706594886,
"grad_norm": 0.6332154870033264,
"learning_rate": 8.914353113660107e-06,
"loss": 0.5535,
"step": 132
},
{
"epoch": 0.7160161507402423,
"grad_norm": 0.5611910820007324,
"learning_rate": 8.89612058491279e-06,
"loss": 0.4464,
"step": 133
},
{
"epoch": 0.721399730820996,
"grad_norm": 0.5586318969726562,
"learning_rate": 8.877755204699883e-06,
"loss": 0.4606,
"step": 134
},
{
"epoch": 0.7267833109017496,
"grad_norm": 0.5422524809837341,
"learning_rate": 8.859257599251408e-06,
"loss": 0.4452,
"step": 135
},
{
"epoch": 0.7321668909825033,
"grad_norm": 0.5787152051925659,
"learning_rate": 8.840628399306056e-06,
"loss": 0.4997,
"step": 136
},
{
"epoch": 0.7375504710632571,
"grad_norm": 0.5561872720718384,
"learning_rate": 8.821868240089676e-06,
"loss": 0.4712,
"step": 137
},
{
"epoch": 0.7429340511440108,
"grad_norm": 0.629596471786499,
"learning_rate": 8.802977761293625e-06,
"loss": 0.5005,
"step": 138
},
{
"epoch": 0.7483176312247645,
"grad_norm": 0.5670992136001587,
"learning_rate": 8.783957607052941e-06,
"loss": 0.4594,
"step": 139
},
{
"epoch": 0.7537012113055181,
"grad_norm": 0.6181672811508179,
"learning_rate": 8.764808425924392e-06,
"loss": 0.48,
"step": 140
},
{
"epoch": 0.7590847913862718,
"grad_norm": 0.5901859998703003,
"learning_rate": 8.745530870864351e-06,
"loss": 0.4121,
"step": 141
},
{
"epoch": 0.7644683714670256,
"grad_norm": 0.5341172218322754,
"learning_rate": 8.726125599206543e-06,
"loss": 0.4905,
"step": 142
},
{
"epoch": 0.7698519515477793,
"grad_norm": 0.6587361097335815,
"learning_rate": 8.706593272639616e-06,
"loss": 0.4846,
"step": 143
},
{
"epoch": 0.775235531628533,
"grad_norm": 0.5404164791107178,
"learning_rate": 8.686934557184594e-06,
"loss": 0.4265,
"step": 144
},
{
"epoch": 0.7806191117092867,
"grad_norm": 0.6407716870307922,
"learning_rate": 8.667150123172159e-06,
"loss": 0.5006,
"step": 145
},
{
"epoch": 0.7860026917900403,
"grad_norm": 0.5715042948722839,
"learning_rate": 8.647240645219787e-06,
"loss": 0.4388,
"step": 146
},
{
"epoch": 0.7913862718707941,
"grad_norm": 0.575707197189331,
"learning_rate": 8.62720680220876e-06,
"loss": 0.4626,
"step": 147
},
{
"epoch": 0.7967698519515478,
"grad_norm": 0.5612806677818298,
"learning_rate": 8.607049277261005e-06,
"loss": 0.4644,
"step": 148
},
{
"epoch": 0.8021534320323015,
"grad_norm": 0.5671082735061646,
"learning_rate": 8.586768757715806e-06,
"loss": 0.4442,
"step": 149
},
{
"epoch": 0.8075370121130552,
"grad_norm": 0.598675012588501,
"learning_rate": 8.566365935106367e-06,
"loss": 0.4802,
"step": 150
},
{
"epoch": 0.8129205921938089,
"grad_norm": 0.546492338180542,
"learning_rate": 8.545841505136224e-06,
"loss": 0.4551,
"step": 151
},
{
"epoch": 0.8183041722745625,
"grad_norm": 0.5794171094894409,
"learning_rate": 8.525196167655539e-06,
"loss": 0.4755,
"step": 152
},
{
"epoch": 0.8236877523553163,
"grad_norm": 0.5300067663192749,
"learning_rate": 8.504430626637215e-06,
"loss": 0.4233,
"step": 153
},
{
"epoch": 0.82907133243607,
"grad_norm": 0.5738832950592041,
"learning_rate": 8.483545590152915e-06,
"loss": 0.5016,
"step": 154
},
{
"epoch": 0.8344549125168237,
"grad_norm": 0.5611905455589294,
"learning_rate": 8.462541770348896e-06,
"loss": 0.4444,
"step": 155
},
{
"epoch": 0.8398384925975774,
"grad_norm": 0.554915189743042,
"learning_rate": 8.441419883421742e-06,
"loss": 0.4603,
"step": 156
},
{
"epoch": 0.845222072678331,
"grad_norm": 0.5979538559913635,
"learning_rate": 8.42018064959393e-06,
"loss": 0.5154,
"step": 157
},
{
"epoch": 0.8506056527590848,
"grad_norm": 0.54628986120224,
"learning_rate": 8.398824793089287e-06,
"loss": 0.3947,
"step": 158
},
{
"epoch": 0.8559892328398385,
"grad_norm": 0.5486013889312744,
"learning_rate": 8.377353042108278e-06,
"loss": 0.4317,
"step": 159
},
{
"epoch": 0.8613728129205922,
"grad_norm": 0.5597162246704102,
"learning_rate": 8.355766128803192e-06,
"loss": 0.4471,
"step": 160
},
{
"epoch": 0.8667563930013459,
"grad_norm": 0.5271990895271301,
"learning_rate": 8.334064789253157e-06,
"loss": 0.3983,
"step": 161
},
{
"epoch": 0.8721399730820996,
"grad_norm": 0.5897473692893982,
"learning_rate": 8.312249763439066e-06,
"loss": 0.4504,
"step": 162
},
{
"epoch": 0.8775235531628532,
"grad_norm": 0.6026889085769653,
"learning_rate": 8.29032179521832e-06,
"loss": 0.4785,
"step": 163
},
{
"epoch": 0.882907133243607,
"grad_norm": 0.5334970951080322,
"learning_rate": 8.268281632299483e-06,
"loss": 0.5166,
"step": 164
},
{
"epoch": 0.8882907133243607,
"grad_norm": 0.568034827709198,
"learning_rate": 8.246130026216777e-06,
"loss": 0.4354,
"step": 165
},
{
"epoch": 0.8936742934051144,
"grad_norm": 0.5437761545181274,
"learning_rate": 8.22386773230445e-06,
"loss": 0.4398,
"step": 166
},
{
"epoch": 0.8990578734858681,
"grad_norm": 0.5542709231376648,
"learning_rate": 8.201495509671036e-06,
"loss": 0.4074,
"step": 167
},
{
"epoch": 0.9044414535666218,
"grad_norm": 0.5601239800453186,
"learning_rate": 8.179014121173461e-06,
"loss": 0.4764,
"step": 168
},
{
"epoch": 0.9098250336473755,
"grad_norm": 0.5747672319412231,
"learning_rate": 8.156424333391026e-06,
"loss": 0.4537,
"step": 169
},
{
"epoch": 0.9152086137281292,
"grad_norm": 0.56292325258255,
"learning_rate": 8.13372691659928e-06,
"loss": 0.4641,
"step": 170
},
{
"epoch": 0.9205921938088829,
"grad_norm": 0.5486699938774109,
"learning_rate": 8.110922644743747e-06,
"loss": 0.4489,
"step": 171
},
{
"epoch": 0.9259757738896366,
"grad_norm": 0.5740337371826172,
"learning_rate": 8.088012295413536e-06,
"loss": 0.475,
"step": 172
},
{
"epoch": 0.9313593539703903,
"grad_norm": 0.5686214566230774,
"learning_rate": 8.064996649814826e-06,
"loss": 0.4182,
"step": 173
},
{
"epoch": 0.9367429340511441,
"grad_norm": 0.5474251508712769,
"learning_rate": 8.041876492744239e-06,
"loss": 0.4011,
"step": 174
},
{
"epoch": 0.9421265141318977,
"grad_norm": 0.5313992500305176,
"learning_rate": 8.018652612562061e-06,
"loss": 0.4598,
"step": 175
},
{
"epoch": 0.9475100942126514,
"grad_norm": 0.5516825914382935,
"learning_rate": 7.99532580116537e-06,
"loss": 0.3926,
"step": 176
},
{
"epoch": 0.9528936742934051,
"grad_norm": 0.567688524723053,
"learning_rate": 7.971896853961043e-06,
"loss": 0.442,
"step": 177
},
{
"epoch": 0.9582772543741588,
"grad_norm": 0.5734118819236755,
"learning_rate": 7.948366569838612e-06,
"loss": 0.4221,
"step": 178
},
{
"epoch": 0.9636608344549125,
"grad_norm": 0.5655908584594727,
"learning_rate": 7.924735751143044e-06,
"loss": 0.51,
"step": 179
},
{
"epoch": 0.9690444145356663,
"grad_norm": 0.5655565857887268,
"learning_rate": 7.901005203647373e-06,
"loss": 0.3944,
"step": 180
},
{
"epoch": 0.9744279946164199,
"grad_norm": 0.6050511598587036,
"learning_rate": 7.877175736525217e-06,
"loss": 0.4433,
"step": 181
},
{
"epoch": 0.9798115746971736,
"grad_norm": 0.5776525139808655,
"learning_rate": 7.853248162323208e-06,
"loss": 0.5174,
"step": 182
},
{
"epoch": 0.9851951547779273,
"grad_norm": 0.5618104338645935,
"learning_rate": 7.829223296933259e-06,
"loss": 0.4297,
"step": 183
},
{
"epoch": 0.990578734858681,
"grad_norm": 0.5539780855178833,
"learning_rate": 7.805101959564768e-06,
"loss": 0.4988,
"step": 184
},
{
"epoch": 0.9959623149394348,
"grad_norm": 0.5038336515426636,
"learning_rate": 7.780884972716663e-06,
"loss": 0.3906,
"step": 185
},
{
"epoch": 1.0013458950201883,
"grad_norm": 0.6332990527153015,
"learning_rate": 7.75657316214937e-06,
"loss": 0.4842,
"step": 186
},
{
"epoch": 1.0067294751009421,
"grad_norm": 0.41341373324394226,
"learning_rate": 7.732167356856656e-06,
"loss": 0.2382,
"step": 187
},
{
"epoch": 1.012113055181696,
"grad_norm": 0.5181017518043518,
"learning_rate": 7.70766838903735e-06,
"loss": 0.2906,
"step": 188
},
{
"epoch": 1.0174966352624495,
"grad_norm": 0.4716527760028839,
"learning_rate": 7.683077094066981e-06,
"loss": 0.2688,
"step": 189
},
{
"epoch": 1.0228802153432033,
"grad_norm": 0.48120298981666565,
"learning_rate": 7.65839431046928e-06,
"loss": 0.2854,
"step": 190
},
{
"epoch": 1.0282637954239569,
"grad_norm": 0.4271540343761444,
"learning_rate": 7.63362087988759e-06,
"loss": 0.2093,
"step": 191
},
{
"epoch": 1.0336473755047106,
"grad_norm": 0.5108612775802612,
"learning_rate": 7.608757647056186e-06,
"loss": 0.2317,
"step": 192
},
{
"epoch": 1.0390309555854644,
"grad_norm": 0.4512535333633423,
"learning_rate": 7.583805459771443e-06,
"loss": 0.249,
"step": 193
},
{
"epoch": 1.044414535666218,
"grad_norm": 0.4441206455230713,
"learning_rate": 7.5587651688629405e-06,
"loss": 0.2657,
"step": 194
},
{
"epoch": 1.0497981157469718,
"grad_norm": 0.46206924319267273,
"learning_rate": 7.533637628164456e-06,
"loss": 0.2207,
"step": 195
},
{
"epoch": 1.0551816958277254,
"grad_norm": 0.52704918384552,
"learning_rate": 7.508423694484841e-06,
"loss": 0.2705,
"step": 196
},
{
"epoch": 1.0605652759084792,
"grad_norm": 0.5095883011817932,
"learning_rate": 7.483124227578811e-06,
"loss": 0.2428,
"step": 197
},
{
"epoch": 1.065948855989233,
"grad_norm": 0.5210585594177246,
"learning_rate": 7.457740090117627e-06,
"loss": 0.2344,
"step": 198
},
{
"epoch": 1.0713324360699865,
"grad_norm": 0.46602457761764526,
"learning_rate": 7.432272147659678e-06,
"loss": 0.241,
"step": 199
},
{
"epoch": 1.0767160161507403,
"grad_norm": 0.4984048306941986,
"learning_rate": 7.406721268620975e-06,
"loss": 0.2388,
"step": 200
},
{
"epoch": 1.0820995962314939,
"grad_norm": 0.5057407021522522,
"learning_rate": 7.381088324245526e-06,
"loss": 0.23,
"step": 201
},
{
"epoch": 1.0874831763122477,
"grad_norm": 0.4600376784801483,
"learning_rate": 7.355374188575639e-06,
"loss": 0.2022,
"step": 202
},
{
"epoch": 1.0928667563930015,
"grad_norm": 0.5112857818603516,
"learning_rate": 7.3295797384221156e-06,
"loss": 0.2333,
"step": 203
},
{
"epoch": 1.098250336473755,
"grad_norm": 0.527310848236084,
"learning_rate": 7.303705853334353e-06,
"loss": 0.242,
"step": 204
},
{
"epoch": 1.1036339165545088,
"grad_norm": 0.5270518660545349,
"learning_rate": 7.277753415570349e-06,
"loss": 0.2417,
"step": 205
},
{
"epoch": 1.1090174966352624,
"grad_norm": 0.5107465386390686,
"learning_rate": 7.2517233100666255e-06,
"loss": 0.2162,
"step": 206
},
{
"epoch": 1.1144010767160162,
"grad_norm": 0.5194461345672607,
"learning_rate": 7.225616424408045e-06,
"loss": 0.255,
"step": 207
},
{
"epoch": 1.1197846567967698,
"grad_norm": 0.5149202346801758,
"learning_rate": 7.199433648797558e-06,
"loss": 0.2593,
"step": 208
},
{
"epoch": 1.1251682368775235,
"grad_norm": 0.5071370005607605,
"learning_rate": 7.1731758760258315e-06,
"loss": 0.2427,
"step": 209
},
{
"epoch": 1.1305518169582773,
"grad_norm": 0.4726599454879761,
"learning_rate": 7.146844001440823e-06,
"loss": 0.2344,
"step": 210
},
{
"epoch": 1.135935397039031,
"grad_norm": 0.43700599670410156,
"learning_rate": 7.120438922917237e-06,
"loss": 0.1889,
"step": 211
},
{
"epoch": 1.1413189771197847,
"grad_norm": 0.4685395359992981,
"learning_rate": 7.09396154082592e-06,
"loss": 0.2127,
"step": 212
},
{
"epoch": 1.1467025572005383,
"grad_norm": 0.4829280972480774,
"learning_rate": 7.067412758003154e-06,
"loss": 0.2271,
"step": 213
},
{
"epoch": 1.152086137281292,
"grad_norm": 0.4522843360900879,
"learning_rate": 7.040793479719864e-06,
"loss": 0.217,
"step": 214
},
{
"epoch": 1.1574697173620458,
"grad_norm": 0.42811307311058044,
"learning_rate": 7.014104613650767e-06,
"loss": 0.1944,
"step": 215
},
{
"epoch": 1.1628532974427994,
"grad_norm": 0.465836763381958,
"learning_rate": 6.987347069843406e-06,
"loss": 0.2352,
"step": 216
},
{
"epoch": 1.1682368775235532,
"grad_norm": 0.5526953339576721,
"learning_rate": 6.96052176068713e-06,
"loss": 0.2839,
"step": 217
},
{
"epoch": 1.1736204576043068,
"grad_norm": 0.5280203223228455,
"learning_rate": 6.93362960088197e-06,
"loss": 0.2398,
"step": 218
},
{
"epoch": 1.1790040376850606,
"grad_norm": 0.4957825839519501,
"learning_rate": 6.906671507407463e-06,
"loss": 0.2391,
"step": 219
},
{
"epoch": 1.1843876177658144,
"grad_norm": 0.47294560074806213,
"learning_rate": 6.879648399491376e-06,
"loss": 0.1976,
"step": 220
},
{
"epoch": 1.189771197846568,
"grad_norm": 0.45914170145988464,
"learning_rate": 6.852561198578364e-06,
"loss": 0.1903,
"step": 221
},
{
"epoch": 1.1951547779273217,
"grad_norm": 0.5234487652778625,
"learning_rate": 6.825410828298552e-06,
"loss": 0.2548,
"step": 222
},
{
"epoch": 1.2005383580080753,
"grad_norm": 0.4907478094100952,
"learning_rate": 6.79819821443604e-06,
"loss": 0.2203,
"step": 223
},
{
"epoch": 1.205921938088829,
"grad_norm": 0.488614559173584,
"learning_rate": 6.7709242848973326e-06,
"loss": 0.1889,
"step": 224
},
{
"epoch": 1.2113055181695827,
"grad_norm": 0.42549803853034973,
"learning_rate": 6.743589969679697e-06,
"loss": 0.173,
"step": 225
},
{
"epoch": 1.2166890982503364,
"grad_norm": 0.5077455639839172,
"learning_rate": 6.716196200839465e-06,
"loss": 0.2301,
"step": 226
},
{
"epoch": 1.2220726783310902,
"grad_norm": 0.4867914915084839,
"learning_rate": 6.6887439124602295e-06,
"loss": 0.2455,
"step": 227
},
{
"epoch": 1.2274562584118438,
"grad_norm": 0.4867931306362152,
"learning_rate": 6.661234040621017e-06,
"loss": 0.201,
"step": 228
},
{
"epoch": 1.2328398384925976,
"grad_norm": 0.4922155737876892,
"learning_rate": 6.63366752336435e-06,
"loss": 0.2068,
"step": 229
},
{
"epoch": 1.2382234185733512,
"grad_norm": 0.5053098797798157,
"learning_rate": 6.606045300664272e-06,
"loss": 0.2237,
"step": 230
},
{
"epoch": 1.243606998654105,
"grad_norm": 0.5080535411834717,
"learning_rate": 6.578368314394293e-06,
"loss": 0.2189,
"step": 231
},
{
"epoch": 1.2489905787348587,
"grad_norm": 0.4673517346382141,
"learning_rate": 6.550637508295272e-06,
"loss": 0.202,
"step": 232
},
{
"epoch": 1.2543741588156123,
"grad_norm": 0.5345984697341919,
"learning_rate": 6.52285382794324e-06,
"loss": 0.2197,
"step": 233
},
{
"epoch": 1.259757738896366,
"grad_norm": 0.4533955752849579,
"learning_rate": 6.49501822071715e-06,
"loss": 0.1996,
"step": 234
},
{
"epoch": 1.2651413189771197,
"grad_norm": 0.48141008615493774,
"learning_rate": 6.467131635766585e-06,
"loss": 0.225,
"step": 235
},
{
"epoch": 1.2705248990578735,
"grad_norm": 0.5605146288871765,
"learning_rate": 6.439195023979381e-06,
"loss": 0.2769,
"step": 236
},
{
"epoch": 1.2759084791386273,
"grad_norm": 0.4871980845928192,
"learning_rate": 6.411209337949214e-06,
"loss": 0.2054,
"step": 237
},
{
"epoch": 1.2812920592193808,
"grad_norm": 0.5211129784584045,
"learning_rate": 6.383175531943106e-06,
"loss": 0.2682,
"step": 238
},
{
"epoch": 1.2866756393001346,
"grad_norm": 0.5319603085517883,
"learning_rate": 6.355094561868902e-06,
"loss": 0.2581,
"step": 239
},
{
"epoch": 1.2920592193808882,
"grad_norm": 0.4909502863883972,
"learning_rate": 6.3269673852426575e-06,
"loss": 0.208,
"step": 240
},
{
"epoch": 1.297442799461642,
"grad_norm": 0.5048267245292664,
"learning_rate": 6.298794961156004e-06,
"loss": 0.2213,
"step": 241
},
{
"epoch": 1.3028263795423958,
"grad_norm": 0.45375633239746094,
"learning_rate": 6.270578250243437e-06,
"loss": 0.1804,
"step": 242
},
{
"epoch": 1.3082099596231493,
"grad_norm": 0.4308919608592987,
"learning_rate": 6.242318214649556e-06,
"loss": 0.1866,
"step": 243
},
{
"epoch": 1.3135935397039031,
"grad_norm": 0.6137887835502625,
"learning_rate": 6.214015817996273e-06,
"loss": 0.2951,
"step": 244
},
{
"epoch": 1.3189771197846567,
"grad_norm": 0.5159800052642822,
"learning_rate": 6.185672025349936e-06,
"loss": 0.2405,
"step": 245
},
{
"epoch": 1.3243606998654105,
"grad_norm": 0.5221627354621887,
"learning_rate": 6.157287803188432e-06,
"loss": 0.2361,
"step": 246
},
{
"epoch": 1.3297442799461643,
"grad_norm": 0.5131467580795288,
"learning_rate": 6.128864119368234e-06,
"loss": 0.2467,
"step": 247
},
{
"epoch": 1.3351278600269179,
"grad_norm": 0.5357580780982971,
"learning_rate": 6.100401943091386e-06,
"loss": 0.2142,
"step": 248
},
{
"epoch": 1.3405114401076716,
"grad_norm": 0.5234276056289673,
"learning_rate": 6.0719022448724705e-06,
"loss": 0.2387,
"step": 249
},
{
"epoch": 1.3458950201884252,
"grad_norm": 0.5050548911094666,
"learning_rate": 6.043365996505506e-06,
"loss": 0.2257,
"step": 250
},
{
"epoch": 1.351278600269179,
"grad_norm": 0.5760233998298645,
"learning_rate": 6.014794171030811e-06,
"loss": 0.2929,
"step": 251
},
{
"epoch": 1.3566621803499328,
"grad_norm": 0.5137818455696106,
"learning_rate": 5.986187742701825e-06,
"loss": 0.2604,
"step": 252
},
{
"epoch": 1.3620457604306864,
"grad_norm": 0.4670131504535675,
"learning_rate": 5.9575476869518945e-06,
"loss": 0.2222,
"step": 253
},
{
"epoch": 1.3674293405114402,
"grad_norm": 0.5121346116065979,
"learning_rate": 5.928874980361005e-06,
"loss": 0.254,
"step": 254
},
{
"epoch": 1.3728129205921937,
"grad_norm": 0.47050395607948303,
"learning_rate": 5.900170600622477e-06,
"loss": 0.2295,
"step": 255
},
{
"epoch": 1.3781965006729475,
"grad_norm": 0.5137650966644287,
"learning_rate": 5.871435526509647e-06,
"loss": 0.1969,
"step": 256
},
{
"epoch": 1.3835800807537013,
"grad_norm": 0.5146386623382568,
"learning_rate": 5.8426707378424675e-06,
"loss": 0.2523,
"step": 257
},
{
"epoch": 1.3889636608344549,
"grad_norm": 0.47957491874694824,
"learning_rate": 5.813877215454118e-06,
"loss": 0.2406,
"step": 258
},
{
"epoch": 1.3943472409152087,
"grad_norm": 0.4431574046611786,
"learning_rate": 5.78505594115755e-06,
"loss": 0.2141,
"step": 259
},
{
"epoch": 1.3997308209959622,
"grad_norm": 0.5288009643554688,
"learning_rate": 5.756207897712011e-06,
"loss": 0.2348,
"step": 260
},
{
"epoch": 1.405114401076716,
"grad_norm": 0.47516876459121704,
"learning_rate": 5.727334068789529e-06,
"loss": 0.2324,
"step": 261
},
{
"epoch": 1.4104979811574698,
"grad_norm": 0.4710802137851715,
"learning_rate": 5.698435438941382e-06,
"loss": 0.217,
"step": 262
},
{
"epoch": 1.4158815612382234,
"grad_norm": 0.5013542175292969,
"learning_rate": 5.669512993564517e-06,
"loss": 0.2538,
"step": 263
},
{
"epoch": 1.4212651413189772,
"grad_norm": 0.4954458773136139,
"learning_rate": 5.640567718867951e-06,
"loss": 0.2175,
"step": 264
},
{
"epoch": 1.4266487213997308,
"grad_norm": 0.5086066126823425,
"learning_rate": 5.611600601839144e-06,
"loss": 0.2649,
"step": 265
},
{
"epoch": 1.4320323014804845,
"grad_norm": 0.5038528442382812,
"learning_rate": 5.582612630210349e-06,
"loss": 0.2396,
"step": 266
},
{
"epoch": 1.4374158815612383,
"grad_norm": 0.4795680642127991,
"learning_rate": 5.553604792424923e-06,
"loss": 0.2234,
"step": 267
},
{
"epoch": 1.442799461641992,
"grad_norm": 0.553688645362854,
"learning_rate": 5.524578077603627e-06,
"loss": 0.2435,
"step": 268
},
{
"epoch": 1.4481830417227457,
"grad_norm": 0.5056889057159424,
"learning_rate": 5.495533475510901e-06,
"loss": 0.2224,
"step": 269
},
{
"epoch": 1.4535666218034993,
"grad_norm": 0.44364944100379944,
"learning_rate": 5.4664719765211125e-06,
"loss": 0.185,
"step": 270
},
{
"epoch": 1.458950201884253,
"grad_norm": 0.5148865580558777,
"learning_rate": 5.4373945715847845e-06,
"loss": 0.2416,
"step": 271
},
{
"epoch": 1.4643337819650069,
"grad_norm": 0.5296265482902527,
"learning_rate": 5.408302252194806e-06,
"loss": 0.2179,
"step": 272
},
{
"epoch": 1.4697173620457604,
"grad_norm": 0.5192491412162781,
"learning_rate": 5.379196010352629e-06,
"loss": 0.2338,
"step": 273
},
{
"epoch": 1.4751009421265142,
"grad_norm": 0.45017164945602417,
"learning_rate": 5.3500768385344345e-06,
"loss": 0.203,
"step": 274
},
{
"epoch": 1.4804845222072678,
"grad_norm": 0.47436919808387756,
"learning_rate": 5.320945729657299e-06,
"loss": 0.2495,
"step": 275
},
{
"epoch": 1.4858681022880216,
"grad_norm": 0.47932523488998413,
"learning_rate": 5.2918036770453285e-06,
"loss": 0.2123,
"step": 276
},
{
"epoch": 1.4912516823687754,
"grad_norm": 0.5231288075447083,
"learning_rate": 5.262651674395799e-06,
"loss": 0.2636,
"step": 277
},
{
"epoch": 1.496635262449529,
"grad_norm": 0.46927890181541443,
"learning_rate": 5.2334907157452605e-06,
"loss": 0.2045,
"step": 278
},
{
"epoch": 1.5020188425302825,
"grad_norm": 0.5273484587669373,
"learning_rate": 5.204321795435656e-06,
"loss": 0.2352,
"step": 279
},
{
"epoch": 1.5074024226110363,
"grad_norm": 0.4517362713813782,
"learning_rate": 5.1751459080803986e-06,
"loss": 0.2068,
"step": 280
},
{
"epoch": 1.51278600269179,
"grad_norm": 0.5345643758773804,
"learning_rate": 5.145964048530475e-06,
"loss": 0.2578,
"step": 281
},
{
"epoch": 1.5181695827725439,
"grad_norm": 0.6723287105560303,
"learning_rate": 5.11677721184051e-06,
"loss": 0.2362,
"step": 282
},
{
"epoch": 1.5235531628532974,
"grad_norm": 0.4516390562057495,
"learning_rate": 5.08758639323484e-06,
"loss": 0.1979,
"step": 283
},
{
"epoch": 1.528936742934051,
"grad_norm": 0.4627610445022583,
"learning_rate": 5.058392588073583e-06,
"loss": 0.2235,
"step": 284
},
{
"epoch": 1.5343203230148048,
"grad_norm": 0.4922831356525421,
"learning_rate": 5.029196791818688e-06,
"loss": 0.2141,
"step": 285
},
{
"epoch": 1.5397039030955586,
"grad_norm": 0.4735919237136841,
"learning_rate": 5e-06,
"loss": 0.2235,
"step": 286
},
{
"epoch": 1.5450874831763124,
"grad_norm": 0.5311393737792969,
"learning_rate": 4.970803208181315e-06,
"loss": 0.2127,
"step": 287
},
{
"epoch": 1.550471063257066,
"grad_norm": 0.5476110577583313,
"learning_rate": 4.941607411926419e-06,
"loss": 0.236,
"step": 288
},
{
"epoch": 1.5558546433378195,
"grad_norm": 0.44367510080337524,
"learning_rate": 4.9124136067651615e-06,
"loss": 0.1843,
"step": 289
},
{
"epoch": 1.5612382234185733,
"grad_norm": 0.5168237686157227,
"learning_rate": 4.883222788159491e-06,
"loss": 0.2349,
"step": 290
},
{
"epoch": 1.5666218034993271,
"grad_norm": 0.5239467620849609,
"learning_rate": 4.8540359514695266e-06,
"loss": 0.2424,
"step": 291
},
{
"epoch": 1.572005383580081,
"grad_norm": 0.5578256845474243,
"learning_rate": 4.824854091919601e-06,
"loss": 0.2492,
"step": 292
},
{
"epoch": 1.5773889636608345,
"grad_norm": 0.5159158110618591,
"learning_rate": 4.795678204564346e-06,
"loss": 0.2031,
"step": 293
},
{
"epoch": 1.582772543741588,
"grad_norm": 0.4600106179714203,
"learning_rate": 4.766509284254739e-06,
"loss": 0.2042,
"step": 294
},
{
"epoch": 1.5881561238223418,
"grad_norm": 0.46104931831359863,
"learning_rate": 4.737348325604203e-06,
"loss": 0.1984,
"step": 295
},
{
"epoch": 1.5935397039030956,
"grad_norm": 0.5123720765113831,
"learning_rate": 4.708196322954673e-06,
"loss": 0.2449,
"step": 296
},
{
"epoch": 1.5989232839838494,
"grad_norm": 0.5240789651870728,
"learning_rate": 4.679054270342703e-06,
"loss": 0.1956,
"step": 297
},
{
"epoch": 1.604306864064603,
"grad_norm": 0.5075330138206482,
"learning_rate": 4.649923161465567e-06,
"loss": 0.2318,
"step": 298
},
{
"epoch": 1.6096904441453566,
"grad_norm": 0.5857378840446472,
"learning_rate": 4.620803989647373e-06,
"loss": 0.2623,
"step": 299
},
{
"epoch": 1.6150740242261103,
"grad_norm": 0.5065007209777832,
"learning_rate": 4.591697747805196e-06,
"loss": 0.2171,
"step": 300
},
{
"epoch": 1.6204576043068641,
"grad_norm": 0.47048458456993103,
"learning_rate": 4.562605428415216e-06,
"loss": 0.1985,
"step": 301
},
{
"epoch": 1.6258411843876177,
"grad_norm": 0.4939180314540863,
"learning_rate": 4.533528023478888e-06,
"loss": 0.2162,
"step": 302
},
{
"epoch": 1.6312247644683715,
"grad_norm": 0.5094431638717651,
"learning_rate": 4.5044665244891e-06,
"loss": 0.1996,
"step": 303
},
{
"epoch": 1.636608344549125,
"grad_norm": 0.5184011459350586,
"learning_rate": 4.475421922396375e-06,
"loss": 0.2053,
"step": 304
},
{
"epoch": 1.6419919246298789,
"grad_norm": 0.485853374004364,
"learning_rate": 4.446395207575081e-06,
"loss": 0.2063,
"step": 305
},
{
"epoch": 1.6473755047106327,
"grad_norm": 0.48953792452812195,
"learning_rate": 4.417387369789652e-06,
"loss": 0.2208,
"step": 306
},
{
"epoch": 1.6527590847913862,
"grad_norm": 0.48435530066490173,
"learning_rate": 4.388399398160857e-06,
"loss": 0.1991,
"step": 307
},
{
"epoch": 1.65814266487214,
"grad_norm": 0.4711257219314575,
"learning_rate": 4.359432281132051e-06,
"loss": 0.1985,
"step": 308
},
{
"epoch": 1.6635262449528936,
"grad_norm": 0.49920031428337097,
"learning_rate": 4.330487006435485e-06,
"loss": 0.2281,
"step": 309
},
{
"epoch": 1.6689098250336474,
"grad_norm": 0.4793451428413391,
"learning_rate": 4.301564561058618e-06,
"loss": 0.2052,
"step": 310
},
{
"epoch": 1.6742934051144012,
"grad_norm": 0.49276602268218994,
"learning_rate": 4.272665931210472e-06,
"loss": 0.2163,
"step": 311
},
{
"epoch": 1.6796769851951547,
"grad_norm": 0.48469507694244385,
"learning_rate": 4.243792102287991e-06,
"loss": 0.214,
"step": 312
},
{
"epoch": 1.6850605652759085,
"grad_norm": 0.5068939328193665,
"learning_rate": 4.214944058842452e-06,
"loss": 0.2463,
"step": 313
},
{
"epoch": 1.690444145356662,
"grad_norm": 0.4834253489971161,
"learning_rate": 4.186122784545885e-06,
"loss": 0.2204,
"step": 314
},
{
"epoch": 1.695827725437416,
"grad_norm": 0.7421865463256836,
"learning_rate": 4.157329262157534e-06,
"loss": 0.2297,
"step": 315
},
{
"epoch": 1.7012113055181697,
"grad_norm": 0.5400863289833069,
"learning_rate": 4.128564473490357e-06,
"loss": 0.2784,
"step": 316
},
{
"epoch": 1.7065948855989233,
"grad_norm": 0.46585744619369507,
"learning_rate": 4.099829399377524e-06,
"loss": 0.2039,
"step": 317
},
{
"epoch": 1.7119784656796768,
"grad_norm": 0.45379072427749634,
"learning_rate": 4.071125019638998e-06,
"loss": 0.1987,
"step": 318
},
{
"epoch": 1.7173620457604306,
"grad_norm": 0.5648776292800903,
"learning_rate": 4.0424523130481055e-06,
"loss": 0.2224,
"step": 319
},
{
"epoch": 1.7227456258411844,
"grad_norm": 0.4834424555301666,
"learning_rate": 4.013812257298175e-06,
"loss": 0.2175,
"step": 320
},
{
"epoch": 1.7281292059219382,
"grad_norm": 0.49235790967941284,
"learning_rate": 3.985205828969191e-06,
"loss": 0.1996,
"step": 321
},
{
"epoch": 1.7335127860026918,
"grad_norm": 0.4619491994380951,
"learning_rate": 3.956634003494496e-06,
"loss": 0.2143,
"step": 322
},
{
"epoch": 1.7388963660834453,
"grad_norm": 0.4783826172351837,
"learning_rate": 3.9280977551275294e-06,
"loss": 0.2154,
"step": 323
},
{
"epoch": 1.7442799461641991,
"grad_norm": 0.5519052743911743,
"learning_rate": 3.899598056908615e-06,
"loss": 0.2516,
"step": 324
},
{
"epoch": 1.749663526244953,
"grad_norm": 0.5011211633682251,
"learning_rate": 3.871135880631769e-06,
"loss": 0.2265,
"step": 325
},
{
"epoch": 1.7550471063257067,
"grad_norm": 0.41989102959632874,
"learning_rate": 3.842712196811569e-06,
"loss": 0.1792,
"step": 326
},
{
"epoch": 1.7604306864064603,
"grad_norm": 0.472318172454834,
"learning_rate": 3.8143279746500665e-06,
"loss": 0.2204,
"step": 327
},
{
"epoch": 1.7658142664872138,
"grad_norm": 0.531564474105835,
"learning_rate": 3.785984182003728e-06,
"loss": 0.2012,
"step": 328
},
{
"epoch": 1.7711978465679676,
"grad_norm": 0.5032511353492737,
"learning_rate": 3.757681785350445e-06,
"loss": 0.2242,
"step": 329
},
{
"epoch": 1.7765814266487214,
"grad_norm": 0.48782920837402344,
"learning_rate": 3.729421749756564e-06,
"loss": 0.2187,
"step": 330
},
{
"epoch": 1.7819650067294752,
"grad_norm": 0.4836859405040741,
"learning_rate": 3.701205038843997e-06,
"loss": 0.2194,
"step": 331
},
{
"epoch": 1.7873485868102288,
"grad_norm": 0.49115753173828125,
"learning_rate": 3.6730326147573425e-06,
"loss": 0.1968,
"step": 332
},
{
"epoch": 1.7927321668909824,
"grad_norm": 0.5141318440437317,
"learning_rate": 3.6449054381311e-06,
"loss": 0.2233,
"step": 333
},
{
"epoch": 1.7981157469717362,
"grad_norm": 0.5064616799354553,
"learning_rate": 3.616824468056896e-06,
"loss": 0.2065,
"step": 334
},
{
"epoch": 1.80349932705249,
"grad_norm": 0.47807809710502625,
"learning_rate": 3.5887906620507877e-06,
"loss": 0.2145,
"step": 335
},
{
"epoch": 1.8088829071332437,
"grad_norm": 0.5218194723129272,
"learning_rate": 3.5608049760206203e-06,
"loss": 0.227,
"step": 336
},
{
"epoch": 1.8142664872139973,
"grad_norm": 0.4956798851490021,
"learning_rate": 3.532868364233416e-06,
"loss": 0.2089,
"step": 337
},
{
"epoch": 1.8196500672947509,
"grad_norm": 0.5096341967582703,
"learning_rate": 3.504981779282852e-06,
"loss": 0.2397,
"step": 338
},
{
"epoch": 1.8250336473755047,
"grad_norm": 0.4995509684085846,
"learning_rate": 3.4771461720567613e-06,
"loss": 0.2397,
"step": 339
},
{
"epoch": 1.8304172274562585,
"grad_norm": 0.4688532054424286,
"learning_rate": 3.4493624917047284e-06,
"loss": 0.2161,
"step": 340
},
{
"epoch": 1.8358008075370122,
"grad_norm": 0.5076211094856262,
"learning_rate": 3.4216316856057074e-06,
"loss": 0.24,
"step": 341
},
{
"epoch": 1.8411843876177658,
"grad_norm": 0.4792284667491913,
"learning_rate": 3.3939546993357297e-06,
"loss": 0.1995,
"step": 342
},
{
"epoch": 1.8465679676985194,
"grad_norm": 0.4893110692501068,
"learning_rate": 3.3663324766356524e-06,
"loss": 0.2117,
"step": 343
},
{
"epoch": 1.8519515477792732,
"grad_norm": 0.493745893239975,
"learning_rate": 3.3387659593789845e-06,
"loss": 0.2422,
"step": 344
},
{
"epoch": 1.857335127860027,
"grad_norm": 0.494195818901062,
"learning_rate": 3.3112560875397713e-06,
"loss": 0.2344,
"step": 345
},
{
"epoch": 1.8627187079407808,
"grad_norm": 0.47956109046936035,
"learning_rate": 3.283803799160537e-06,
"loss": 0.2228,
"step": 346
},
{
"epoch": 1.8681022880215343,
"grad_norm": 0.4594026803970337,
"learning_rate": 3.256410030320304e-06,
"loss": 0.2119,
"step": 347
},
{
"epoch": 1.873485868102288,
"grad_norm": 0.512570321559906,
"learning_rate": 3.2290757151026687e-06,
"loss": 0.2414,
"step": 348
},
{
"epoch": 1.8788694481830417,
"grad_norm": 0.5020653605461121,
"learning_rate": 3.2018017855639605e-06,
"loss": 0.2425,
"step": 349
},
{
"epoch": 1.8842530282637955,
"grad_norm": 0.46298474073410034,
"learning_rate": 3.1745891717014477e-06,
"loss": 0.2077,
"step": 350
},
{
"epoch": 1.8896366083445493,
"grad_norm": 0.48863649368286133,
"learning_rate": 3.147438801421638e-06,
"loss": 0.2181,
"step": 351
},
{
"epoch": 1.8950201884253028,
"grad_norm": 0.4544221758842468,
"learning_rate": 3.1203516005086276e-06,
"loss": 0.2052,
"step": 352
},
{
"epoch": 1.9004037685060564,
"grad_norm": 0.4919374883174896,
"learning_rate": 3.093328492592539e-06,
"loss": 0.2266,
"step": 353
},
{
"epoch": 1.9057873485868102,
"grad_norm": 0.5141823291778564,
"learning_rate": 3.0663703991180318e-06,
"loss": 0.2329,
"step": 354
},
{
"epoch": 1.911170928667564,
"grad_norm": 0.46769434213638306,
"learning_rate": 3.0394782393128713e-06,
"loss": 0.2006,
"step": 355
},
{
"epoch": 1.9165545087483178,
"grad_norm": 0.4760676622390747,
"learning_rate": 3.0126529301565945e-06,
"loss": 0.1909,
"step": 356
},
{
"epoch": 1.9219380888290714,
"grad_norm": 0.4960988163948059,
"learning_rate": 2.9858953863492334e-06,
"loss": 0.2177,
"step": 357
},
{
"epoch": 1.927321668909825,
"grad_norm": 0.5212114453315735,
"learning_rate": 2.9592065202801374e-06,
"loss": 0.2096,
"step": 358
},
{
"epoch": 1.9327052489905787,
"grad_norm": 0.5346338152885437,
"learning_rate": 2.9325872419968484e-06,
"loss": 0.2391,
"step": 359
},
{
"epoch": 1.9380888290713325,
"grad_norm": 0.4992043375968933,
"learning_rate": 2.906038459174081e-06,
"loss": 0.2113,
"step": 360
},
{
"epoch": 1.9434724091520863,
"grad_norm": 0.4740796387195587,
"learning_rate": 2.879561077082764e-06,
"loss": 0.2178,
"step": 361
},
{
"epoch": 1.9488559892328399,
"grad_norm": 0.512220025062561,
"learning_rate": 2.853155998559179e-06,
"loss": 0.2325,
"step": 362
},
{
"epoch": 1.9542395693135934,
"grad_norm": 0.5286325216293335,
"learning_rate": 2.826824123974171e-06,
"loss": 0.2405,
"step": 363
},
{
"epoch": 1.9596231493943472,
"grad_norm": 0.4532966911792755,
"learning_rate": 2.800566351202443e-06,
"loss": 0.1983,
"step": 364
},
{
"epoch": 1.965006729475101,
"grad_norm": 0.5386168360710144,
"learning_rate": 2.774383575591956e-06,
"loss": 0.225,
"step": 365
},
{
"epoch": 1.9703903095558546,
"grad_norm": 0.49068483710289,
"learning_rate": 2.748276689933377e-06,
"loss": 0.2142,
"step": 366
},
{
"epoch": 1.9757738896366084,
"grad_norm": 0.5264994502067566,
"learning_rate": 2.722246584429652e-06,
"loss": 0.2197,
"step": 367
},
{
"epoch": 1.981157469717362,
"grad_norm": 0.5036882162094116,
"learning_rate": 2.6962941466656477e-06,
"loss": 0.2153,
"step": 368
},
{
"epoch": 1.9865410497981157,
"grad_norm": 0.46985024213790894,
"learning_rate": 2.6704202615778844e-06,
"loss": 0.216,
"step": 369
},
{
"epoch": 1.9919246298788695,
"grad_norm": 0.5271331667900085,
"learning_rate": 2.6446258114243633e-06,
"loss": 0.2125,
"step": 370
},
{
"epoch": 1.997308209959623,
"grad_norm": 0.5481729507446289,
"learning_rate": 2.6189116757544765e-06,
"loss": 0.2351,
"step": 371
},
{
"epoch": 2.0026917900403767,
"grad_norm": 0.4495651125907898,
"learning_rate": 2.593278731379027e-06,
"loss": 0.1652,
"step": 372
},
{
"epoch": 2.0080753701211305,
"grad_norm": 0.345325231552124,
"learning_rate": 2.567727852340323e-06,
"loss": 0.1108,
"step": 373
},
{
"epoch": 2.0134589502018843,
"grad_norm": 0.29901817440986633,
"learning_rate": 2.542259909882374e-06,
"loss": 0.0865,
"step": 374
},
{
"epoch": 2.018842530282638,
"grad_norm": 0.33557021617889404,
"learning_rate": 2.51687577242119e-06,
"loss": 0.107,
"step": 375
},
{
"epoch": 2.024226110363392,
"grad_norm": 0.2968936264514923,
"learning_rate": 2.4915763055151615e-06,
"loss": 0.0858,
"step": 376
},
{
"epoch": 2.029609690444145,
"grad_norm": 0.3676191568374634,
"learning_rate": 2.4663623718355444e-06,
"loss": 0.1066,
"step": 377
},
{
"epoch": 2.034993270524899,
"grad_norm": 0.30083024501800537,
"learning_rate": 2.4412348311370616e-06,
"loss": 0.0871,
"step": 378
},
{
"epoch": 2.0403768506056528,
"grad_norm": 0.2911483347415924,
"learning_rate": 2.416194540228559e-06,
"loss": 0.0808,
"step": 379
},
{
"epoch": 2.0457604306864066,
"grad_norm": 0.31706151366233826,
"learning_rate": 2.3912423529438145e-06,
"loss": 0.0818,
"step": 380
},
{
"epoch": 2.0511440107671604,
"grad_norm": 0.30930769443511963,
"learning_rate": 2.3663791201124093e-06,
"loss": 0.0812,
"step": 381
},
{
"epoch": 2.0565275908479137,
"grad_norm": 0.35245367884635925,
"learning_rate": 2.341605689530723e-06,
"loss": 0.0856,
"step": 382
},
{
"epoch": 2.0619111709286675,
"grad_norm": 0.3333040177822113,
"learning_rate": 2.316922905933022e-06,
"loss": 0.0745,
"step": 383
},
{
"epoch": 2.0672947510094213,
"grad_norm": 0.3866671025753021,
"learning_rate": 2.292331610962649e-06,
"loss": 0.0844,
"step": 384
},
{
"epoch": 2.072678331090175,
"grad_norm": 0.33665308356285095,
"learning_rate": 2.2678326431433456e-06,
"loss": 0.0773,
"step": 385
},
{
"epoch": 2.078061911170929,
"grad_norm": 0.3511718809604645,
"learning_rate": 2.243426837850631e-06,
"loss": 0.0775,
"step": 386
},
{
"epoch": 2.083445491251682,
"grad_norm": 0.3618534505367279,
"learning_rate": 2.219115027283339e-06,
"loss": 0.0812,
"step": 387
},
{
"epoch": 2.088829071332436,
"grad_norm": 0.39068838953971863,
"learning_rate": 2.194898040435234e-06,
"loss": 0.0829,
"step": 388
},
{
"epoch": 2.09421265141319,
"grad_norm": 0.47448840737342834,
"learning_rate": 2.17077670306674e-06,
"loss": 0.1055,
"step": 389
},
{
"epoch": 2.0995962314939436,
"grad_norm": 0.3499176800251007,
"learning_rate": 2.146751837676794e-06,
"loss": 0.0677,
"step": 390
},
{
"epoch": 2.1049798115746974,
"grad_norm": 0.39072269201278687,
"learning_rate": 2.122824263474784e-06,
"loss": 0.0754,
"step": 391
},
{
"epoch": 2.1103633916554507,
"grad_norm": 0.33510833978652954,
"learning_rate": 2.098994796352629e-06,
"loss": 0.058,
"step": 392
},
{
"epoch": 2.1157469717362045,
"grad_norm": 0.39688751101493835,
"learning_rate": 2.0752642488569557e-06,
"loss": 0.0728,
"step": 393
},
{
"epoch": 2.1211305518169583,
"grad_norm": 0.389644593000412,
"learning_rate": 2.0516334301613876e-06,
"loss": 0.0815,
"step": 394
},
{
"epoch": 2.126514131897712,
"grad_norm": 0.3516867160797119,
"learning_rate": 2.028103146038958e-06,
"loss": 0.0724,
"step": 395
},
{
"epoch": 2.131897711978466,
"grad_norm": 0.3905945420265198,
"learning_rate": 2.004674198834631e-06,
"loss": 0.0792,
"step": 396
},
{
"epoch": 2.1372812920592192,
"grad_norm": 0.46998897194862366,
"learning_rate": 1.98134738743794e-06,
"loss": 0.0793,
"step": 397
},
{
"epoch": 2.142664872139973,
"grad_norm": 0.4259118139743805,
"learning_rate": 1.9581235072557618e-06,
"loss": 0.0916,
"step": 398
},
{
"epoch": 2.148048452220727,
"grad_norm": 0.47033047676086426,
"learning_rate": 1.935003350185174e-06,
"loss": 0.0857,
"step": 399
},
{
"epoch": 2.1534320323014806,
"grad_norm": 0.4288282096385956,
"learning_rate": 1.911987704586466e-06,
"loss": 0.0709,
"step": 400
},
{
"epoch": 2.1588156123822344,
"grad_norm": 0.3920668661594391,
"learning_rate": 1.8890773552562564e-06,
"loss": 0.0722,
"step": 401
},
{
"epoch": 2.1641991924629878,
"grad_norm": 0.35498660802841187,
"learning_rate": 1.8662730834007204e-06,
"loss": 0.0635,
"step": 402
},
{
"epoch": 2.1695827725437415,
"grad_norm": 0.4081229269504547,
"learning_rate": 1.843575666608976e-06,
"loss": 0.0713,
"step": 403
},
{
"epoch": 2.1749663526244953,
"grad_norm": 0.41039130091667175,
"learning_rate": 1.8209858788265411e-06,
"loss": 0.0838,
"step": 404
},
{
"epoch": 2.180349932705249,
"grad_norm": 0.44797372817993164,
"learning_rate": 1.7985044903289645e-06,
"loss": 0.1013,
"step": 405
},
{
"epoch": 2.185733512786003,
"grad_norm": 0.3503686785697937,
"learning_rate": 1.7761322676955505e-06,
"loss": 0.066,
"step": 406
},
{
"epoch": 2.1911170928667563,
"grad_norm": 0.4590007960796356,
"learning_rate": 1.7538699737832237e-06,
"loss": 0.0772,
"step": 407
},
{
"epoch": 2.19650067294751,
"grad_norm": 0.3556067943572998,
"learning_rate": 1.7317183677005173e-06,
"loss": 0.0648,
"step": 408
},
{
"epoch": 2.201884253028264,
"grad_norm": 0.3512371778488159,
"learning_rate": 1.7096782047816806e-06,
"loss": 0.069,
"step": 409
},
{
"epoch": 2.2072678331090176,
"grad_norm": 0.39259177446365356,
"learning_rate": 1.687750236560936e-06,
"loss": 0.0793,
"step": 410
},
{
"epoch": 2.2126514131897714,
"grad_norm": 0.3561786711215973,
"learning_rate": 1.665935210746844e-06,
"loss": 0.0586,
"step": 411
},
{
"epoch": 2.218034993270525,
"grad_norm": 0.35219818353652954,
"learning_rate": 1.6442338711968102e-06,
"loss": 0.0681,
"step": 412
},
{
"epoch": 2.2234185733512786,
"grad_norm": 0.3837469220161438,
"learning_rate": 1.622646957891722e-06,
"loss": 0.0736,
"step": 413
},
{
"epoch": 2.2288021534320324,
"grad_norm": 0.39585286378860474,
"learning_rate": 1.601175206910715e-06,
"loss": 0.0826,
"step": 414
},
{
"epoch": 2.234185733512786,
"grad_norm": 0.33951419591903687,
"learning_rate": 1.5798193504060693e-06,
"loss": 0.0599,
"step": 415
},
{
"epoch": 2.2395693135935395,
"grad_norm": 0.39095380902290344,
"learning_rate": 1.5585801165782606e-06,
"loss": 0.0724,
"step": 416
},
{
"epoch": 2.2449528936742933,
"grad_norm": 0.3765682876110077,
"learning_rate": 1.5374582296511054e-06,
"loss": 0.0747,
"step": 417
},
{
"epoch": 2.250336473755047,
"grad_norm": 0.3725675046443939,
"learning_rate": 1.5164544098470862e-06,
"loss": 0.0717,
"step": 418
},
{
"epoch": 2.255720053835801,
"grad_norm": 0.37952670454978943,
"learning_rate": 1.4955693733627869e-06,
"loss": 0.0776,
"step": 419
},
{
"epoch": 2.2611036339165547,
"grad_norm": 0.39090678095817566,
"learning_rate": 1.474803832344463e-06,
"loss": 0.0766,
"step": 420
},
{
"epoch": 2.2664872139973085,
"grad_norm": 0.3887679874897003,
"learning_rate": 1.4541584948637777e-06,
"loss": 0.0868,
"step": 421
},
{
"epoch": 2.271870794078062,
"grad_norm": 0.3668728768825531,
"learning_rate": 1.4336340648936342e-06,
"loss": 0.0797,
"step": 422
},
{
"epoch": 2.2772543741588156,
"grad_norm": 0.3776654005050659,
"learning_rate": 1.413231242284195e-06,
"loss": 0.0775,
"step": 423
},
{
"epoch": 2.2826379542395694,
"grad_norm": 0.43863725662231445,
"learning_rate": 1.3929507227389954e-06,
"loss": 0.0848,
"step": 424
},
{
"epoch": 2.288021534320323,
"grad_norm": 0.3964315354824066,
"learning_rate": 1.3727931977912406e-06,
"loss": 0.0719,
"step": 425
},
{
"epoch": 2.2934051144010765,
"grad_norm": 0.3711508810520172,
"learning_rate": 1.352759354780215e-06,
"loss": 0.0602,
"step": 426
},
{
"epoch": 2.2987886944818303,
"grad_norm": 0.3771410584449768,
"learning_rate": 1.332849876827842e-06,
"loss": 0.0689,
"step": 427
},
{
"epoch": 2.304172274562584,
"grad_norm": 0.45632028579711914,
"learning_rate": 1.3130654428154066e-06,
"loss": 0.0634,
"step": 428
},
{
"epoch": 2.309555854643338,
"grad_norm": 0.40130868554115295,
"learning_rate": 1.2934067273603855e-06,
"loss": 0.0818,
"step": 429
},
{
"epoch": 2.3149394347240917,
"grad_norm": 0.3942681849002838,
"learning_rate": 1.2738744007934595e-06,
"loss": 0.0843,
"step": 430
},
{
"epoch": 2.320323014804845,
"grad_norm": 0.3565605580806732,
"learning_rate": 1.2544691291356497e-06,
"loss": 0.0584,
"step": 431
},
{
"epoch": 2.325706594885599,
"grad_norm": 0.38263797760009766,
"learning_rate": 1.2351915740756087e-06,
"loss": 0.0652,
"step": 432
},
{
"epoch": 2.3310901749663526,
"grad_norm": 0.4015883207321167,
"learning_rate": 1.2160423929470584e-06,
"loss": 0.0751,
"step": 433
},
{
"epoch": 2.3364737550471064,
"grad_norm": 0.3580048680305481,
"learning_rate": 1.1970222387063756e-06,
"loss": 0.0624,
"step": 434
},
{
"epoch": 2.34185733512786,
"grad_norm": 0.47708114981651306,
"learning_rate": 1.1781317599103238e-06,
"loss": 0.0829,
"step": 435
},
{
"epoch": 2.3472409152086136,
"grad_norm": 0.3463763892650604,
"learning_rate": 1.1593716006939455e-06,
"loss": 0.0693,
"step": 436
},
{
"epoch": 2.3526244952893673,
"grad_norm": 0.3862798810005188,
"learning_rate": 1.140742400748593e-06,
"loss": 0.0716,
"step": 437
},
{
"epoch": 2.358008075370121,
"grad_norm": 0.3969804346561432,
"learning_rate": 1.1222447953001182e-06,
"loss": 0.0708,
"step": 438
},
{
"epoch": 2.363391655450875,
"grad_norm": 0.3394986689090729,
"learning_rate": 1.1038794150872117e-06,
"loss": 0.0595,
"step": 439
},
{
"epoch": 2.3687752355316287,
"grad_norm": 0.39073002338409424,
"learning_rate": 1.0856468863398917e-06,
"loss": 0.0634,
"step": 440
},
{
"epoch": 2.374158815612382,
"grad_norm": 0.3924263119697571,
"learning_rate": 1.0675478307581627e-06,
"loss": 0.0725,
"step": 441
},
{
"epoch": 2.379542395693136,
"grad_norm": 0.3952764868736267,
"learning_rate": 1.0495828654907991e-06,
"loss": 0.0663,
"step": 442
},
{
"epoch": 2.3849259757738897,
"grad_norm": 0.37942010164260864,
"learning_rate": 1.0317526031143161e-06,
"loss": 0.0683,
"step": 443
},
{
"epoch": 2.3903095558546434,
"grad_norm": 0.35665637254714966,
"learning_rate": 1.014057651612076e-06,
"loss": 0.0662,
"step": 444
},
{
"epoch": 2.3956931359353972,
"grad_norm": 0.3667193651199341,
"learning_rate": 9.964986143535515e-07,
"loss": 0.0616,
"step": 445
},
{
"epoch": 2.4010767160161506,
"grad_norm": 0.4359084367752075,
"learning_rate": 9.790760900737683e-07,
"loss": 0.0637,
"step": 446
},
{
"epoch": 2.4064602960969044,
"grad_norm": 0.3700020909309387,
"learning_rate": 9.61790672852868e-07,
"loss": 0.0569,
"step": 447
},
{
"epoch": 2.411843876177658,
"grad_norm": 0.4084100127220154,
"learning_rate": 9.446429520958666e-07,
"loss": 0.0708,
"step": 448
},
{
"epoch": 2.417227456258412,
"grad_norm": 0.40237903594970703,
"learning_rate": 9.276335125125502e-07,
"loss": 0.0755,
"step": 449
},
{
"epoch": 2.4226110363391653,
"grad_norm": 0.36956214904785156,
"learning_rate": 9.107629340975388e-07,
"loss": 0.0618,
"step": 450
},
{
"epoch": 2.427994616419919,
"grad_norm": 0.38042622804641724,
"learning_rate": 8.940317921105085e-07,
"loss": 0.0579,
"step": 451
},
{
"epoch": 2.433378196500673,
"grad_norm": 0.39496564865112305,
"learning_rate": 8.774406570565791e-07,
"loss": 0.0702,
"step": 452
},
{
"epoch": 2.4387617765814267,
"grad_norm": 0.3166196942329407,
"learning_rate": 8.609900946668536e-07,
"loss": 0.0555,
"step": 453
},
{
"epoch": 2.4441453566621805,
"grad_norm": 0.3680025637149811,
"learning_rate": 8.446806658791373e-07,
"loss": 0.0593,
"step": 454
},
{
"epoch": 2.449528936742934,
"grad_norm": 0.39065518975257874,
"learning_rate": 8.285129268188042e-07,
"loss": 0.0708,
"step": 455
},
{
"epoch": 2.4549125168236876,
"grad_norm": 0.40179872512817383,
"learning_rate": 8.124874287798352e-07,
"loss": 0.0773,
"step": 456
},
{
"epoch": 2.4602960969044414,
"grad_norm": 0.33520442247390747,
"learning_rate": 7.966047182060226e-07,
"loss": 0.0573,
"step": 457
},
{
"epoch": 2.465679676985195,
"grad_norm": 0.4467129111289978,
"learning_rate": 7.808653366723296e-07,
"loss": 0.0826,
"step": 458
},
{
"epoch": 2.471063257065949,
"grad_norm": 0.3427630662918091,
"learning_rate": 7.652698208664377e-07,
"loss": 0.0657,
"step": 459
},
{
"epoch": 2.4764468371467023,
"grad_norm": 0.3667747974395752,
"learning_rate": 7.498187025704296e-07,
"loss": 0.0649,
"step": 460
},
{
"epoch": 2.481830417227456,
"grad_norm": 0.36384788155555725,
"learning_rate": 7.345125086426675e-07,
"loss": 0.0532,
"step": 461
},
{
"epoch": 2.48721399730821,
"grad_norm": 0.40607815980911255,
"learning_rate": 7.193517609998263e-07,
"loss": 0.0796,
"step": 462
},
{
"epoch": 2.4925975773889637,
"grad_norm": 0.36063507199287415,
"learning_rate": 7.043369765990943e-07,
"loss": 0.0639,
"step": 463
},
{
"epoch": 2.4979811574697175,
"grad_norm": 0.3970508277416229,
"learning_rate": 6.894686674205481e-07,
"loss": 0.0688,
"step": 464
},
{
"epoch": 2.503364737550471,
"grad_norm": 0.3685045540332794,
"learning_rate": 6.747473404496902e-07,
"loss": 0.0661,
"step": 465
},
{
"epoch": 2.5087483176312246,
"grad_norm": 0.45861902832984924,
"learning_rate": 6.601734976601737e-07,
"loss": 0.0673,
"step": 466
},
{
"epoch": 2.5141318977119784,
"grad_norm": 0.40021732449531555,
"learning_rate": 6.457476359966685e-07,
"loss": 0.0757,
"step": 467
},
{
"epoch": 2.519515477792732,
"grad_norm": 0.3946848511695862,
"learning_rate": 6.314702473579309e-07,
"loss": 0.0654,
"step": 468
},
{
"epoch": 2.524899057873486,
"grad_norm": 0.4420785903930664,
"learning_rate": 6.17341818580024e-07,
"loss": 0.0864,
"step": 469
},
{
"epoch": 2.5302826379542394,
"grad_norm": 0.4311622679233551,
"learning_rate": 6.033628314197176e-07,
"loss": 0.0823,
"step": 470
},
{
"epoch": 2.535666218034993,
"grad_norm": 0.4172739088535309,
"learning_rate": 5.895337625380632e-07,
"loss": 0.0892,
"step": 471
},
{
"epoch": 2.541049798115747,
"grad_norm": 0.46117520332336426,
"learning_rate": 5.758550834841381e-07,
"loss": 0.0762,
"step": 472
},
{
"epoch": 2.5464333781965007,
"grad_norm": 0.38010281324386597,
"learning_rate": 5.62327260678967e-07,
"loss": 0.0576,
"step": 473
},
{
"epoch": 2.5518169582772545,
"grad_norm": 0.32299867272377014,
"learning_rate": 5.489507553996204e-07,
"loss": 0.0593,
"step": 474
},
{
"epoch": 2.557200538358008,
"grad_norm": 0.39713406562805176,
"learning_rate": 5.357260237634826e-07,
"loss": 0.0742,
"step": 475
},
{
"epoch": 2.5625841184387617,
"grad_norm": 0.4520042836666107,
"learning_rate": 5.226535167127e-07,
"loss": 0.0823,
"step": 476
},
{
"epoch": 2.5679676985195155,
"grad_norm": 0.38494300842285156,
"learning_rate": 5.097336799988067e-07,
"loss": 0.0723,
"step": 477
},
{
"epoch": 2.5733512786002692,
"grad_norm": 0.30375781655311584,
"learning_rate": 4.96966954167517e-07,
"loss": 0.0588,
"step": 478
},
{
"epoch": 2.578734858681023,
"grad_norm": 0.35356128215789795,
"learning_rate": 4.843537745437188e-07,
"loss": 0.0653,
"step": 479
},
{
"epoch": 2.5841184387617764,
"grad_norm": 0.3791372776031494,
"learning_rate": 4.718945712166123e-07,
"loss": 0.0715,
"step": 480
},
{
"epoch": 2.58950201884253,
"grad_norm": 0.42902350425720215,
"learning_rate": 4.595897690250567e-07,
"loss": 0.0797,
"step": 481
},
{
"epoch": 2.594885598923284,
"grad_norm": 0.39135926961898804,
"learning_rate": 4.4743978754308027e-07,
"loss": 0.0708,
"step": 482
},
{
"epoch": 2.6002691790040378,
"grad_norm": 0.4254235625267029,
"learning_rate": 4.3544504106557026e-07,
"loss": 0.0802,
"step": 483
},
{
"epoch": 2.6056527590847915,
"grad_norm": 0.3718099296092987,
"learning_rate": 4.2360593859415433e-07,
"loss": 0.0617,
"step": 484
},
{
"epoch": 2.611036339165545,
"grad_norm": 0.4191717505455017,
"learning_rate": 4.1192288382324363e-07,
"loss": 0.0859,
"step": 485
},
{
"epoch": 2.6164199192462987,
"grad_norm": 0.3816201388835907,
"learning_rate": 4.003962751262763e-07,
"loss": 0.0646,
"step": 486
},
{
"epoch": 2.6218034993270525,
"grad_norm": 0.36653172969818115,
"learning_rate": 3.890265055421283e-07,
"loss": 0.0641,
"step": 487
},
{
"epoch": 2.6271870794078063,
"grad_norm": 0.3723650276660919,
"learning_rate": 3.77813962761715e-07,
"loss": 0.075,
"step": 488
},
{
"epoch": 2.63257065948856,
"grad_norm": 0.34589794278144836,
"learning_rate": 3.6675902911476937e-07,
"loss": 0.0595,
"step": 489
},
{
"epoch": 2.6379542395693134,
"grad_norm": 0.4536292552947998,
"learning_rate": 3.558620815568048e-07,
"loss": 0.0766,
"step": 490
},
{
"epoch": 2.643337819650067,
"grad_norm": 0.4030088782310486,
"learning_rate": 3.451234916562618e-07,
"loss": 0.0702,
"step": 491
},
{
"epoch": 2.648721399730821,
"grad_norm": 0.6007040739059448,
"learning_rate": 3.3454362558184075e-07,
"loss": 0.0665,
"step": 492
},
{
"epoch": 2.654104979811575,
"grad_norm": 0.3956087827682495,
"learning_rate": 3.241228440900124e-07,
"loss": 0.0669,
"step": 493
},
{
"epoch": 2.6594885598923286,
"grad_norm": 0.4161822199821472,
"learning_rate": 3.1386150251271897e-07,
"loss": 0.0722,
"step": 494
},
{
"epoch": 2.664872139973082,
"grad_norm": 0.36707159876823425,
"learning_rate": 3.0375995074525764e-07,
"loss": 0.0602,
"step": 495
},
{
"epoch": 2.6702557200538357,
"grad_norm": 0.4103851318359375,
"learning_rate": 2.9381853323434627e-07,
"loss": 0.0898,
"step": 496
},
{
"epoch": 2.6756393001345895,
"grad_norm": 0.3391963541507721,
"learning_rate": 2.840375889663871e-07,
"loss": 0.06,
"step": 497
},
{
"epoch": 2.6810228802153433,
"grad_norm": 0.36111244559288025,
"learning_rate": 2.744174514558956e-07,
"loss": 0.0595,
"step": 498
},
{
"epoch": 2.686406460296097,
"grad_norm": 0.33847886323928833,
"learning_rate": 2.6495844873413944e-07,
"loss": 0.0604,
"step": 499
},
{
"epoch": 2.6917900403768504,
"grad_norm": 0.38463106751441956,
"learning_rate": 2.556609033379459e-07,
"loss": 0.0642,
"step": 500
},
{
"epoch": 2.6971736204576042,
"grad_norm": 0.42590996623039246,
"learning_rate": 2.465251322987061e-07,
"loss": 0.0773,
"step": 501
},
{
"epoch": 2.702557200538358,
"grad_norm": 0.4083699584007263,
"learning_rate": 2.3755144713156819e-07,
"loss": 0.0744,
"step": 502
},
{
"epoch": 2.707940780619112,
"grad_norm": 0.34972718358039856,
"learning_rate": 2.287401538248074e-07,
"loss": 0.0631,
"step": 503
},
{
"epoch": 2.7133243606998656,
"grad_norm": 0.3795744776725769,
"learning_rate": 2.20091552829399e-07,
"loss": 0.0611,
"step": 504
},
{
"epoch": 2.718707940780619,
"grad_norm": 0.4142250120639801,
"learning_rate": 2.1160593904877236e-07,
"loss": 0.0755,
"step": 505
},
{
"epoch": 2.7240915208613727,
"grad_norm": 0.3663713335990906,
"learning_rate": 2.0328360182875262e-07,
"loss": 0.0674,
"step": 506
},
{
"epoch": 2.7294751009421265,
"grad_norm": 0.391294926404953,
"learning_rate": 1.9512482494769613e-07,
"loss": 0.0597,
"step": 507
},
{
"epoch": 2.7348586810228803,
"grad_norm": 0.4010995328426361,
"learning_rate": 1.8712988660681498e-07,
"loss": 0.0702,
"step": 508
},
{
"epoch": 2.740242261103634,
"grad_norm": 0.3831869065761566,
"learning_rate": 1.7929905942068836e-07,
"loss": 0.0618,
"step": 509
},
{
"epoch": 2.7456258411843875,
"grad_norm": 0.34535014629364014,
"learning_rate": 1.7163261040796797e-07,
"loss": 0.0598,
"step": 510
},
{
"epoch": 2.7510094212651413,
"grad_norm": 0.4758029282093048,
"learning_rate": 1.6413080098227562e-07,
"loss": 0.0895,
"step": 511
},
{
"epoch": 2.756393001345895,
"grad_norm": 0.3582858741283417,
"learning_rate": 1.5679388694328446e-07,
"loss": 0.0603,
"step": 512
},
{
"epoch": 2.761776581426649,
"grad_norm": 0.3556898832321167,
"learning_rate": 1.4962211846800078e-07,
"loss": 0.0551,
"step": 513
},
{
"epoch": 2.7671601615074026,
"grad_norm": 0.4652111232280731,
"learning_rate": 1.426157401022321e-07,
"loss": 0.0935,
"step": 514
},
{
"epoch": 2.772543741588156,
"grad_norm": 0.33882859349250793,
"learning_rate": 1.3577499075224821e-07,
"loss": 0.0569,
"step": 515
},
{
"epoch": 2.7779273216689098,
"grad_norm": 0.3924010992050171,
"learning_rate": 1.2910010367663317e-07,
"loss": 0.0646,
"step": 516
},
{
"epoch": 2.7833109017496636,
"grad_norm": 0.38544705510139465,
"learning_rate": 1.2259130647833627e-07,
"loss": 0.0836,
"step": 517
},
{
"epoch": 2.7886944818304173,
"grad_norm": 0.4027419984340668,
"learning_rate": 1.162488210969065e-07,
"loss": 0.0653,
"step": 518
},
{
"epoch": 2.794078061911171,
"grad_norm": 0.3646996021270752,
"learning_rate": 1.100728638009263e-07,
"loss": 0.0617,
"step": 519
},
{
"epoch": 2.7994616419919245,
"grad_norm": 0.2809794247150421,
"learning_rate": 1.0406364518063927e-07,
"loss": 0.0394,
"step": 520
},
{
"epoch": 2.8048452220726783,
"grad_norm": 0.3826785385608673,
"learning_rate": 9.822137014076472e-08,
"loss": 0.0793,
"step": 521
},
{
"epoch": 2.810228802153432,
"grad_norm": 0.324332594871521,
"learning_rate": 9.254623789351714e-08,
"loss": 0.0598,
"step": 522
},
{
"epoch": 2.815612382234186,
"grad_norm": 0.32736143469810486,
"learning_rate": 8.703844195180555e-08,
"loss": 0.056,
"step": 523
},
{
"epoch": 2.8209959623149397,
"grad_norm": 0.3823404908180237,
"learning_rate": 8.169817012264214e-08,
"loss": 0.068,
"step": 524
},
{
"epoch": 2.826379542395693,
"grad_norm": 0.4002109169960022,
"learning_rate": 7.652560450073454e-08,
"loss": 0.0803,
"step": 525
},
{
"epoch": 2.831763122476447,
"grad_norm": 0.44719937443733215,
"learning_rate": 7.152092146227806e-08,
"loss": 0.0853,
"step": 526
},
{
"epoch": 2.8371467025572006,
"grad_norm": 0.40953177213668823,
"learning_rate": 6.668429165893996e-08,
"loss": 0.0587,
"step": 527
},
{
"epoch": 2.8425302826379544,
"grad_norm": 0.34889018535614014,
"learning_rate": 6.20158800120435e-08,
"loss": 0.0653,
"step": 528
},
{
"epoch": 2.847913862718708,
"grad_norm": 0.4065254330635071,
"learning_rate": 5.7515845706940246e-08,
"loss": 0.0847,
"step": 529
},
{
"epoch": 2.8532974427994615,
"grad_norm": 0.4163700342178345,
"learning_rate": 5.31843421875855e-08,
"loss": 0.0616,
"step": 530
},
{
"epoch": 2.8586810228802153,
"grad_norm": 0.40107670426368713,
"learning_rate": 4.9021517151305875e-08,
"loss": 0.0793,
"step": 531
},
{
"epoch": 2.864064602960969,
"grad_norm": 0.34356680512428284,
"learning_rate": 4.502751254375992e-08,
"loss": 0.0571,
"step": 532
},
{
"epoch": 2.869448183041723,
"grad_norm": 0.4364492893218994,
"learning_rate": 4.120246455410204e-08,
"loss": 0.0545,
"step": 533
},
{
"epoch": 2.8748317631224767,
"grad_norm": 0.3949114680290222,
"learning_rate": 3.7546503610336183e-08,
"loss": 0.0672,
"step": 534
},
{
"epoch": 2.88021534320323,
"grad_norm": 0.35691335797309875,
"learning_rate": 3.405975437486997e-08,
"loss": 0.0646,
"step": 535
},
{
"epoch": 2.885598923283984,
"grad_norm": 0.3505745828151703,
"learning_rate": 3.074233574026087e-08,
"loss": 0.0556,
"step": 536
},
{
"epoch": 2.8909825033647376,
"grad_norm": 0.345758318901062,
"learning_rate": 2.7594360825166644e-08,
"loss": 0.0664,
"step": 537
},
{
"epoch": 2.8963660834454914,
"grad_norm": 0.3653146028518677,
"learning_rate": 2.4615936970485144e-08,
"loss": 0.0568,
"step": 538
},
{
"epoch": 2.901749663526245,
"grad_norm": 0.35214874148368835,
"learning_rate": 2.180716573569386e-08,
"loss": 0.0723,
"step": 539
},
{
"epoch": 2.9071332436069985,
"grad_norm": 0.31391990184783936,
"learning_rate": 1.9168142895389376e-08,
"loss": 0.0511,
"step": 540
},
{
"epoch": 2.9125168236877523,
"grad_norm": 0.3372190594673157,
"learning_rate": 1.6698958436019986e-08,
"loss": 0.0559,
"step": 541
},
{
"epoch": 2.917900403768506,
"grad_norm": 0.32231083512306213,
"learning_rate": 1.4399696552816477e-08,
"loss": 0.0585,
"step": 542
},
{
"epoch": 2.92328398384926,
"grad_norm": 0.4236755669116974,
"learning_rate": 1.2270435646922763e-08,
"loss": 0.0818,
"step": 543
},
{
"epoch": 2.9286675639300137,
"grad_norm": 0.3500356078147888,
"learning_rate": 1.031124832272301e-08,
"loss": 0.0716,
"step": 544
},
{
"epoch": 2.934051144010767,
"grad_norm": 0.38234201073646545,
"learning_rate": 8.522201385362528e-09,
"loss": 0.0632,
"step": 545
},
{
"epoch": 2.939434724091521,
"grad_norm": 0.39198631048202515,
"learning_rate": 6.903355838475123e-09,
"loss": 0.0707,
"step": 546
},
{
"epoch": 2.9448183041722746,
"grad_norm": 0.3228546977043152,
"learning_rate": 5.454766882097007e-09,
"loss": 0.058,
"step": 547
},
{
"epoch": 2.9502018842530284,
"grad_norm": 0.35666099190711975,
"learning_rate": 4.1764839107905074e-09,
"loss": 0.061,
"step": 548
},
{
"epoch": 2.955585464333782,
"grad_norm": 0.3645073175430298,
"learning_rate": 3.068550511955426e-09,
"loss": 0.061,
"step": 549
},
{
"epoch": 2.9609690444145356,
"grad_norm": 0.34374579787254333,
"learning_rate": 2.131004464343556e-09,
"loss": 0.0671,
"step": 550
},
{
"epoch": 2.9663526244952894,
"grad_norm": 0.38777437806129456,
"learning_rate": 1.3638777367724898e-09,
"loss": 0.0789,
"step": 551
},
{
"epoch": 2.971736204576043,
"grad_norm": 0.35030388832092285,
"learning_rate": 7.671964870337168e-10,
"loss": 0.0649,
"step": 552
},
{
"epoch": 2.9771197846567965,
"grad_norm": 0.39920809864997864,
"learning_rate": 3.4098106100166616e-10,
"loss": 0.0783,
"step": 553
},
{
"epoch": 2.9825033647375507,
"grad_norm": 0.4224764406681061,
"learning_rate": 8.52459919381543e-11,
"loss": 0.0818,
"step": 554
},
{
"epoch": 2.987886944818304,
"grad_norm": 0.34364205598831177,
"learning_rate": 0.0,
"loss": 0.0664,
"step": 555
},
{
"epoch": 2.987886944818304,
"step": 555,
"total_flos": 1.777533654151463e+18,
"train_loss": 0.2704765519647448,
"train_runtime": 4301.2874,
"train_samples_per_second": 4.142,
"train_steps_per_second": 0.129
}
],
"logging_steps": 1.0,
"max_steps": 555,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.777533654151463e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}