9b-41 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
b2f24e1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 774,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007751937984496124,
"grad_norm": 2.3754148483276367,
"learning_rate": 7.692307692307693e-07,
"loss": 4.068140983581543,
"step": 2
},
{
"epoch": 0.015503875968992248,
"grad_norm": 0.20276732742786407,
"learning_rate": 2.307692307692308e-06,
"loss": 2.0239908695220947,
"step": 4
},
{
"epoch": 0.023255813953488372,
"grad_norm": 0.35163113474845886,
"learning_rate": 3.846153846153846e-06,
"loss": 1.9337211847305298,
"step": 6
},
{
"epoch": 0.031007751937984496,
"grad_norm": 0.19401516020298004,
"learning_rate": 5.384615384615385e-06,
"loss": 1.9213242530822754,
"step": 8
},
{
"epoch": 0.03875968992248062,
"grad_norm": 0.11387048661708832,
"learning_rate": 6.923076923076923e-06,
"loss": 2.1911349296569824,
"step": 10
},
{
"epoch": 0.046511627906976744,
"grad_norm": 2.326526165008545,
"learning_rate": 8.461538461538462e-06,
"loss": 3.268449306488037,
"step": 12
},
{
"epoch": 0.05426356589147287,
"grad_norm": 0.15703660249710083,
"learning_rate": 9.999999999999999e-06,
"loss": 1.7003194093704224,
"step": 14
},
{
"epoch": 0.06201550387596899,
"grad_norm": 0.6381284594535828,
"learning_rate": 1.153846153846154e-05,
"loss": 1.8064090013504028,
"step": 16
},
{
"epoch": 0.06976744186046512,
"grad_norm": 0.5508278608322144,
"learning_rate": 1.3076923076923078e-05,
"loss": 1.3964051008224487,
"step": 18
},
{
"epoch": 0.07751937984496124,
"grad_norm": 0.3715132772922516,
"learning_rate": 1.4615384615384615e-05,
"loss": 1.59793221950531,
"step": 20
},
{
"epoch": 0.08527131782945736,
"grad_norm": 7.088710308074951,
"learning_rate": 1.6153846153846154e-05,
"loss": 1.7358227968215942,
"step": 22
},
{
"epoch": 0.09302325581395349,
"grad_norm": 0.14647279679775238,
"learning_rate": 1.7692307692307694e-05,
"loss": 1.3447600603103638,
"step": 24
},
{
"epoch": 0.10077519379844961,
"grad_norm": 0.22108672559261322,
"learning_rate": 1.923076923076923e-05,
"loss": 1.4682307243347168,
"step": 26
},
{
"epoch": 0.10852713178294573,
"grad_norm": 0.3697395622730255,
"learning_rate": 2.076923076923077e-05,
"loss": 1.2035093307495117,
"step": 28
},
{
"epoch": 0.11627906976744186,
"grad_norm": 0.24884682893753052,
"learning_rate": 2.230769230769231e-05,
"loss": 1.1427452564239502,
"step": 30
},
{
"epoch": 0.12403100775193798,
"grad_norm": 0.18956558406352997,
"learning_rate": 2.3846153846153846e-05,
"loss": 1.3711202144622803,
"step": 32
},
{
"epoch": 0.13178294573643412,
"grad_norm": 0.18877695500850677,
"learning_rate": 2.5384615384615386e-05,
"loss": 1.2189266681671143,
"step": 34
},
{
"epoch": 0.13953488372093023,
"grad_norm": 0.10388179123401642,
"learning_rate": 2.6923076923076923e-05,
"loss": 1.3252586126327515,
"step": 36
},
{
"epoch": 0.14728682170542637,
"grad_norm": 0.2508637309074402,
"learning_rate": 2.846153846153846e-05,
"loss": 1.0033904314041138,
"step": 38
},
{
"epoch": 0.15503875968992248,
"grad_norm": 0.09986624866724014,
"learning_rate": 3e-05,
"loss": 1.4535468816757202,
"step": 40
},
{
"epoch": 0.16279069767441862,
"grad_norm": 0.12885728478431702,
"learning_rate": 2.999580739494117e-05,
"loss": 1.0325186252593994,
"step": 42
},
{
"epoch": 0.17054263565891473,
"grad_norm": 0.10903146117925644,
"learning_rate": 2.998323233708815e-05,
"loss": 1.2467223405838013,
"step": 44
},
{
"epoch": 0.17829457364341086,
"grad_norm": 0.11212802678346634,
"learning_rate": 2.9962283096597995e-05,
"loss": 1.6686618328094482,
"step": 46
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.2613708972930908,
"learning_rate": 2.9932973451022333e-05,
"loss": 0.8405603170394897,
"step": 48
},
{
"epoch": 0.1937984496124031,
"grad_norm": 0.8741084337234497,
"learning_rate": 2.9895322676246387e-05,
"loss": 0.6372175812721252,
"step": 50
},
{
"epoch": 0.20155038759689922,
"grad_norm": 0.20064491033554077,
"learning_rate": 2.9849355533811937e-05,
"loss": 1.0768086910247803,
"step": 52
},
{
"epoch": 0.20930232558139536,
"grad_norm": 0.20809470117092133,
"learning_rate": 2.9795102254632528e-05,
"loss": 0.58198082447052,
"step": 54
},
{
"epoch": 0.21705426356589147,
"grad_norm": 0.10565000772476196,
"learning_rate": 2.9732598519111736e-05,
"loss": 1.3517603874206543,
"step": 56
},
{
"epoch": 0.2248062015503876,
"grad_norm": 0.10898104310035706,
"learning_rate": 2.9661885433677437e-05,
"loss": 1.340335488319397,
"step": 58
},
{
"epoch": 0.23255813953488372,
"grad_norm": 0.3277449309825897,
"learning_rate": 2.9583009503747627e-05,
"loss": 1.1451056003570557,
"step": 60
},
{
"epoch": 0.24031007751937986,
"grad_norm": 0.11206506192684174,
"learning_rate": 2.9496022603145497e-05,
"loss": 1.2255440950393677,
"step": 62
},
{
"epoch": 0.24806201550387597,
"grad_norm": 0.14122240245342255,
"learning_rate": 2.940098193998391e-05,
"loss": 1.2778782844543457,
"step": 64
},
{
"epoch": 0.2558139534883721,
"grad_norm": 0.17153455317020416,
"learning_rate": 2.9297950019041724e-05,
"loss": 1.178369402885437,
"step": 66
},
{
"epoch": 0.26356589147286824,
"grad_norm": 0.2940099239349365,
"learning_rate": 2.918699460065665e-05,
"loss": 1.1788100004196167,
"step": 68
},
{
"epoch": 0.2713178294573643,
"grad_norm": 0.07703827321529388,
"learning_rate": 2.906818865616178e-05,
"loss": 1.306922435760498,
"step": 70
},
{
"epoch": 0.27906976744186046,
"grad_norm": 0.24490903317928314,
"learning_rate": 2.8941610319894977e-05,
"loss": 1.0475130081176758,
"step": 72
},
{
"epoch": 0.2868217054263566,
"grad_norm": 0.13828147947788239,
"learning_rate": 2.8807342837812783e-05,
"loss": 1.1680102348327637,
"step": 74
},
{
"epoch": 0.29457364341085274,
"grad_norm": 0.13997578620910645,
"learning_rate": 2.8665474512742603e-05,
"loss": 1.0921390056610107,
"step": 76
},
{
"epoch": 0.3023255813953488,
"grad_norm": 0.08565083891153336,
"learning_rate": 2.8516098646309108e-05,
"loss": 1.1694703102111816,
"step": 78
},
{
"epoch": 0.31007751937984496,
"grad_norm": 0.054738063365221024,
"learning_rate": 2.8359313477573215e-05,
"loss": 1.1712660789489746,
"step": 80
},
{
"epoch": 0.3178294573643411,
"grad_norm": 0.06600243598222733,
"learning_rate": 2.8195222118423792e-05,
"loss": 1.32106351852417,
"step": 82
},
{
"epoch": 0.32558139534883723,
"grad_norm": 0.1856859177350998,
"learning_rate": 2.8023932485764768e-05,
"loss": 1.002191424369812,
"step": 84
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.1050916314125061,
"learning_rate": 2.7845557230542076e-05,
"loss": 1.1279501914978027,
"step": 86
},
{
"epoch": 0.34108527131782945,
"grad_norm": 0.09759809821844101,
"learning_rate": 2.7660213663657282e-05,
"loss": 1.3432408571243286,
"step": 88
},
{
"epoch": 0.3488372093023256,
"grad_norm": 0.09217233955860138,
"learning_rate": 2.7468023678816447e-05,
"loss": 0.8359699249267578,
"step": 90
},
{
"epoch": 0.35658914728682173,
"grad_norm": 0.11377432197332382,
"learning_rate": 2.726911367236509e-05,
"loss": 1.1406829357147217,
"step": 92
},
{
"epoch": 0.3643410852713178,
"grad_norm": 0.14340269565582275,
"learning_rate": 2.706361446016193e-05,
"loss": 1.142421841621399,
"step": 94
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.39369332790374756,
"learning_rate": 2.6851661191546038e-05,
"loss": 1.2204563617706299,
"step": 96
},
{
"epoch": 0.3798449612403101,
"grad_norm": 0.1904468685388565,
"learning_rate": 2.6633393260454096e-05,
"loss": 0.7862477898597717,
"step": 98
},
{
"epoch": 0.3875968992248062,
"grad_norm": 0.13884544372558594,
"learning_rate": 2.6408954213746028e-05,
"loss": 0.7346755862236023,
"step": 100
},
{
"epoch": 0.3953488372093023,
"grad_norm": 0.07463762909173965,
"learning_rate": 2.61784916567995e-05,
"loss": 1.3203626871109009,
"step": 102
},
{
"epoch": 0.40310077519379844,
"grad_norm": 0.1436045616865158,
"learning_rate": 2.5942157156435248e-05,
"loss": 1.2376055717468262,
"step": 104
},
{
"epoch": 0.4108527131782946,
"grad_norm": 0.14889220893383026,
"learning_rate": 2.570010614123707e-05,
"loss": 1.0368235111236572,
"step": 106
},
{
"epoch": 0.4186046511627907,
"grad_norm": 0.073515884578228,
"learning_rate": 2.545249779933216e-05,
"loss": 1.105363130569458,
"step": 108
},
{
"epoch": 0.4263565891472868,
"grad_norm": 0.0721718966960907,
"learning_rate": 2.5199494973698856e-05,
"loss": 1.0211938619613647,
"step": 110
},
{
"epoch": 0.43410852713178294,
"grad_norm": 0.08736886829137802,
"learning_rate": 2.494126405507074e-05,
"loss": 0.9343675971031189,
"step": 112
},
{
"epoch": 0.4418604651162791,
"grad_norm": 0.6035546064376831,
"learning_rate": 2.4677974872507553e-05,
"loss": 1.0941760540008545,
"step": 114
},
{
"epoch": 0.4496124031007752,
"grad_norm": 0.10358745604753494,
"learning_rate": 2.440980058170478e-05,
"loss": 1.0486119985580444,
"step": 116
},
{
"epoch": 0.4573643410852713,
"grad_norm": 0.41432875394821167,
"learning_rate": 2.4136917551115484e-05,
"loss": 0.9473840594291687,
"step": 118
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.06210324168205261,
"learning_rate": 2.38595052459592e-05,
"loss": 1.2813960313796997,
"step": 120
},
{
"epoch": 0.4728682170542636,
"grad_norm": 0.21510902047157288,
"learning_rate": 2.357774611019419e-05,
"loss": 1.0586227178573608,
"step": 122
},
{
"epoch": 0.4806201550387597,
"grad_norm": 0.10468325763940811,
"learning_rate": 2.3291825446530736e-05,
"loss": 1.2756110429763794,
"step": 124
},
{
"epoch": 0.4883720930232558,
"grad_norm": 1.1782441139221191,
"learning_rate": 2.3001931294564265e-05,
"loss": 1.168853759765625,
"step": 126
},
{
"epoch": 0.49612403100775193,
"grad_norm": 0.11016172915697098,
"learning_rate": 2.27082543071086e-05,
"loss": 1.181935429573059,
"step": 128
},
{
"epoch": 0.5038759689922481,
"grad_norm": 0.6895468831062317,
"learning_rate": 2.2410987624810524e-05,
"loss": 1.1901732683181763,
"step": 130
},
{
"epoch": 0.5116279069767442,
"grad_norm": 0.18288742005825043,
"learning_rate": 2.2110326749128233e-05,
"loss": 0.7289036512374878,
"step": 132
},
{
"epoch": 0.5193798449612403,
"grad_norm": 0.11772674322128296,
"learning_rate": 2.1806469413757164e-05,
"loss": 1.161149024963379,
"step": 134
},
{
"epoch": 0.5271317829457365,
"grad_norm": 0.2412514090538025,
"learning_rate": 2.149961545458773e-05,
"loss": 1.1283718347549438,
"step": 136
},
{
"epoch": 0.5348837209302325,
"grad_norm": 0.08974076807498932,
"learning_rate": 2.118996667828058e-05,
"loss": 1.362121343612671,
"step": 138
},
{
"epoch": 0.5426356589147286,
"grad_norm": 0.12378139048814774,
"learning_rate": 2.0877726729545665e-05,
"loss": 1.2608673572540283,
"step": 140
},
{
"epoch": 0.5503875968992248,
"grad_norm": 0.41343384981155396,
"learning_rate": 2.0563100957212577e-05,
"loss": 0.5950201153755188,
"step": 142
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.09795749187469482,
"learning_rate": 2.0246296279180093e-05,
"loss": 1.3639545440673828,
"step": 144
},
{
"epoch": 0.5658914728682171,
"grad_norm": 0.09332104027271271,
"learning_rate": 1.9927521046333833e-05,
"loss": 1.0145015716552734,
"step": 146
},
{
"epoch": 0.5736434108527132,
"grad_norm": 0.15143655240535736,
"learning_rate": 1.960698490552145e-05,
"loss": 0.9937471151351929,
"step": 148
},
{
"epoch": 0.5813953488372093,
"grad_norm": 0.13751712441444397,
"learning_rate": 1.9284898661675586e-05,
"loss": 1.0032529830932617,
"step": 150
},
{
"epoch": 0.5891472868217055,
"grad_norm": 0.0935468077659607,
"learning_rate": 1.8961474139175106e-05,
"loss": 1.2299753427505493,
"step": 152
},
{
"epoch": 0.5968992248062015,
"grad_norm": 0.06415323913097382,
"learning_rate": 1.863692404253597e-05,
"loss": 1.2138370275497437,
"step": 154
},
{
"epoch": 0.6046511627906976,
"grad_norm": 0.17963920533657074,
"learning_rate": 1.8311461816523192e-05,
"loss": 0.7944934964179993,
"step": 156
},
{
"epoch": 0.6124031007751938,
"grad_norm": 0.13642992079257965,
"learning_rate": 1.7985301505776026e-05,
"loss": 0.8701238036155701,
"step": 158
},
{
"epoch": 0.6201550387596899,
"grad_norm": 0.09022583067417145,
"learning_rate": 1.765865761403861e-05,
"loss": 1.279708981513977,
"step": 160
},
{
"epoch": 0.627906976744186,
"grad_norm": 0.310406357049942,
"learning_rate": 1.733174496308864e-05,
"loss": 1.020676612854004,
"step": 162
},
{
"epoch": 0.6356589147286822,
"grad_norm": 0.0761994794011116,
"learning_rate": 1.700477855145699e-05,
"loss": 1.2313765287399292,
"step": 164
},
{
"epoch": 0.6434108527131783,
"grad_norm": 0.09753235429525375,
"learning_rate": 1.6677973413030936e-05,
"loss": 0.9673617482185364,
"step": 166
},
{
"epoch": 0.6511627906976745,
"grad_norm": 0.16590853035449982,
"learning_rate": 1.6351544475634266e-05,
"loss": 1.194890022277832,
"step": 168
},
{
"epoch": 0.6589147286821705,
"grad_norm": 0.14948436617851257,
"learning_rate": 1.6025706419677057e-05,
"loss": 0.5818596482276917,
"step": 170
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.1858793944120407,
"learning_rate": 1.5700673536968222e-05,
"loss": 1.1378095149993896,
"step": 172
},
{
"epoch": 0.6744186046511628,
"grad_norm": 0.2438780963420868,
"learning_rate": 1.5376659589783572e-05,
"loss": 0.864031970500946,
"step": 174
},
{
"epoch": 0.6821705426356589,
"grad_norm": 0.13808684051036835,
"learning_rate": 1.5053877670282186e-05,
"loss": 0.9113052487373352,
"step": 176
},
{
"epoch": 0.689922480620155,
"grad_norm": 0.211846262216568,
"learning_rate": 1.4732540060363447e-05,
"loss": 0.9309589862823486,
"step": 178
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.13679386675357819,
"learning_rate": 1.4412858092056991e-05,
"loss": 1.002301573753357,
"step": 180
},
{
"epoch": 0.7054263565891473,
"grad_norm": 0.09943073987960815,
"learning_rate": 1.4095042008537343e-05,
"loss": 1.0712729692459106,
"step": 182
},
{
"epoch": 0.7131782945736435,
"grad_norm": 0.18878647685050964,
"learning_rate": 1.3779300825854622e-05,
"loss": 0.9123468995094299,
"step": 184
},
{
"epoch": 0.7209302325581395,
"grad_norm": 0.08719319850206375,
"learning_rate": 1.3465842195472321e-05,
"loss": 1.2733235359191895,
"step": 186
},
{
"epoch": 0.7286821705426356,
"grad_norm": 0.06448693573474884,
"learning_rate": 1.3154872267702522e-05,
"loss": 0.9789453148841858,
"step": 188
},
{
"epoch": 0.7364341085271318,
"grad_norm": 0.12241167575120926,
"learning_rate": 1.2846595556128331e-05,
"loss": 1.0140795707702637,
"step": 190
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.14142774045467377,
"learning_rate": 1.254121480310276e-05,
"loss": 1.1332778930664062,
"step": 192
},
{
"epoch": 0.751937984496124,
"grad_norm": 0.07337574660778046,
"learning_rate": 1.2238930846412475e-05,
"loss": 1.201830506324768,
"step": 194
},
{
"epoch": 0.7596899224806202,
"grad_norm": 0.07899358868598938,
"learning_rate": 1.1939942487194116e-05,
"loss": 1.2011100053787231,
"step": 196
},
{
"epoch": 0.7674418604651163,
"grad_norm": 0.10521137714385986,
"learning_rate": 1.1644446359190004e-05,
"loss": 0.5936653017997742,
"step": 198
},
{
"epoch": 0.7751937984496124,
"grad_norm": 0.16837139427661896,
"learning_rate": 1.1352636799429354e-05,
"loss": 1.3216241598129272,
"step": 200
},
{
"epoch": 0.7829457364341085,
"grad_norm": 0.11404802650213242,
"learning_rate": 1.1064705720419829e-05,
"loss": 1.084835171699524,
"step": 202
},
{
"epoch": 0.7906976744186046,
"grad_norm": 0.24780981242656708,
"learning_rate": 1.0780842483933755e-05,
"loss": 1.2125266790390015,
"step": 204
},
{
"epoch": 0.7984496124031008,
"grad_norm": 0.12619031965732574,
"learning_rate": 1.050123377647171e-05,
"loss": 1.0225963592529297,
"step": 206
},
{
"epoch": 0.8062015503875969,
"grad_norm": 1.412670612335205,
"learning_rate": 1.0226063486485695e-05,
"loss": 0.7963980436325073,
"step": 208
},
{
"epoch": 0.813953488372093,
"grad_norm": 0.18459799885749817,
"learning_rate": 9.955512583442334e-06,
"loss": 1.2788116931915283,
"step": 210
},
{
"epoch": 0.8217054263565892,
"grad_norm": 0.058253731578588486,
"learning_rate": 9.68975899880592e-06,
"loss": 1.1842073202133179,
"step": 212
},
{
"epoch": 0.8294573643410853,
"grad_norm": 0.09324084967374802,
"learning_rate": 9.42897750901933e-06,
"loss": 0.9420091509819031,
"step": 214
},
{
"epoch": 0.8372093023255814,
"grad_norm": 0.14589789509773254,
"learning_rate": 9.173339620559935e-06,
"loss": 1.0436409711837769,
"step": 216
},
{
"epoch": 0.8449612403100775,
"grad_norm": 0.08236993849277496,
"learning_rate": 8.923013457146082e-06,
"loss": 1.2834446430206299,
"step": 218
},
{
"epoch": 0.8527131782945736,
"grad_norm": 0.07797209173440933,
"learning_rate": 8.678163649168214e-06,
"loss": 1.1693506240844727,
"step": 220
},
{
"epoch": 0.8604651162790697,
"grad_norm": 0.21979407966136932,
"learning_rate": 8.438951225417476e-06,
"loss": 0.49415066838264465,
"step": 222
},
{
"epoch": 0.8682170542635659,
"grad_norm": 0.16792796552181244,
"learning_rate": 8.205533507182963e-06,
"loss": 1.1654852628707886,
"step": 224
},
{
"epoch": 0.875968992248062,
"grad_norm": 0.1074092760682106,
"learning_rate": 7.978064004787238e-06,
"loss": 1.2648242712020874,
"step": 226
},
{
"epoch": 0.8837209302325582,
"grad_norm": 0.12686721980571747,
"learning_rate": 7.756692316628162e-06,
"loss": 0.8766679167747498,
"step": 228
},
{
"epoch": 0.8914728682170543,
"grad_norm": 0.10413216799497604,
"learning_rate": 7.541564030793536e-06,
"loss": 0.9922328591346741,
"step": 230
},
{
"epoch": 0.8992248062015504,
"grad_norm": 0.07999309152364731,
"learning_rate": 7.33282062931308e-06,
"loss": 0.837881863117218,
"step": 232
},
{
"epoch": 0.9069767441860465,
"grad_norm": 0.16637900471687317,
"learning_rate": 7.13059939511089e-06,
"loss": 1.272527813911438,
"step": 234
},
{
"epoch": 0.9147286821705426,
"grad_norm": 0.13920988142490387,
"learning_rate": 6.935033321719421e-06,
"loss": 0.6637862920761108,
"step": 236
},
{
"epoch": 0.9224806201550387,
"grad_norm": 0.07921171188354492,
"learning_rate": 6.746251025814548e-06,
"loss": 1.2028839588165283,
"step": 238
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.11715266853570938,
"learning_rate": 6.564376662629032e-06,
"loss": 1.0310890674591064,
"step": 240
},
{
"epoch": 0.937984496124031,
"grad_norm": 0.1706083118915558,
"learning_rate": 6.389529844300147e-06,
"loss": 1.129476547241211,
"step": 242
},
{
"epoch": 0.9457364341085271,
"grad_norm": 0.09015638381242752,
"learning_rate": 6.2218255612051575e-06,
"loss": 0.9788402915000916,
"step": 244
},
{
"epoch": 0.9534883720930233,
"grad_norm": 0.09626635164022446,
"learning_rate": 6.061374106336329e-06,
"loss": 0.7472362518310547,
"step": 246
},
{
"epoch": 0.9612403100775194,
"grad_norm": 0.17239803075790405,
"learning_rate": 5.9082810027652495e-06,
"loss": 0.7408154606819153,
"step": 248
},
{
"epoch": 0.9689922480620154,
"grad_norm": 0.07973187416791916,
"learning_rate": 5.762646934244157e-06,
"loss": 1.1912089586257935,
"step": 250
},
{
"epoch": 0.9767441860465116,
"grad_norm": 0.08109164237976074,
"learning_rate": 5.6245676789899e-06,
"loss": 0.970727264881134,
"step": 252
},
{
"epoch": 0.9844961240310077,
"grad_norm": 0.2656784951686859,
"learning_rate": 5.494134046694101e-06,
"loss": 0.9474197626113892,
"step": 254
},
{
"epoch": 0.9922480620155039,
"grad_norm": 0.09388367086648941,
"learning_rate": 5.371431818800934e-06,
"loss": 0.7675265073776245,
"step": 256
},
{
"epoch": 1.0,
"grad_norm": 0.07208788394927979,
"learning_rate": 5.256541692091799e-06,
"loss": 1.151860237121582,
"step": 258
},
{
"epoch": 1.0077519379844961,
"grad_norm": 0.07177931815385818,
"learning_rate": 5.149539225613974e-06,
"loss": 0.6956380605697632,
"step": 260
},
{
"epoch": 1.0155038759689923,
"grad_norm": 0.06252402067184448,
"learning_rate": 5.050494790988212e-06,
"loss": 0.9135383367538452,
"step": 262
},
{
"epoch": 1.0232558139534884,
"grad_norm": 0.17128507792949677,
"learning_rate": 4.95947352612787e-06,
"loss": 0.721315324306488,
"step": 264
},
{
"epoch": 1.0310077519379846,
"grad_norm": 0.08467314392328262,
"learning_rate": 4.876535292400089e-06,
"loss": 0.4410458207130432,
"step": 266
},
{
"epoch": 1.0387596899224807,
"grad_norm": 0.10766004770994186,
"learning_rate": 4.801734635257148e-06,
"loss": 0.8536827564239502,
"step": 268
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.15451736748218536,
"learning_rate": 4.735120748363916e-06,
"loss": 0.903506875038147,
"step": 270
},
{
"epoch": 1.054263565891473,
"grad_norm": 0.06612464040517807,
"learning_rate": 4.676737441244975e-06,
"loss": 0.48186248540878296,
"step": 272
},
{
"epoch": 1.062015503875969,
"grad_norm": 0.10002221167087555,
"learning_rate": 4.626623110472677e-06,
"loss": 0.8960871696472168,
"step": 274
},
{
"epoch": 1.069767441860465,
"grad_norm": 0.07858562469482422,
"learning_rate": 4.584810714415135e-06,
"loss": 0.8507243990898132,
"step": 276
},
{
"epoch": 1.0775193798449612,
"grad_norm": 0.06665261089801788,
"learning_rate": 4.5513277515607014e-06,
"loss": 0.9197998642921448,
"step": 278
},
{
"epoch": 1.0852713178294573,
"grad_norm": 0.16345328092575073,
"learning_rate": 4.526196242433211e-06,
"loss": 0.778313398361206,
"step": 280
},
{
"epoch": 1.0930232558139534,
"grad_norm": 0.10489022731781006,
"learning_rate": 4.509432715109887e-06,
"loss": 0.5479567050933838,
"step": 282
},
{
"epoch": 1.1007751937984496,
"grad_norm": 0.05080074071884155,
"learning_rate": 4.50104819435143e-06,
"loss": 0.6334800720214844,
"step": 284
},
{
"epoch": 1.1085271317829457,
"grad_norm": 0.12185381352901459,
"learning_rate": 4.50104819435143e-06,
"loss": 0.8215212225914001,
"step": 286
},
{
"epoch": 1.1162790697674418,
"grad_norm": 0.08784171938896179,
"learning_rate": 4.509432715109887e-06,
"loss": 0.5245926976203918,
"step": 288
},
{
"epoch": 1.124031007751938,
"grad_norm": 0.09065528959035873,
"learning_rate": 4.526196242433211e-06,
"loss": 1.0330955982208252,
"step": 290
},
{
"epoch": 1.1317829457364341,
"grad_norm": 0.04104357957839966,
"learning_rate": 4.5513277515607014e-06,
"loss": 0.5526050329208374,
"step": 292
},
{
"epoch": 1.1395348837209303,
"grad_norm": 0.31155890226364136,
"learning_rate": 4.584810714415136e-06,
"loss": 1.046125888824463,
"step": 294
},
{
"epoch": 1.1472868217054264,
"grad_norm": 0.15053009986877441,
"learning_rate": 4.626623110472676e-06,
"loss": 0.3840217590332031,
"step": 296
},
{
"epoch": 1.1550387596899225,
"grad_norm": 0.08694499731063843,
"learning_rate": 4.676737441244973e-06,
"loss": 0.6799867153167725,
"step": 298
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.07247356325387955,
"learning_rate": 4.735120748363917e-06,
"loss": 0.6748986840248108,
"step": 300
},
{
"epoch": 1.1705426356589148,
"grad_norm": 0.06139397993683815,
"learning_rate": 4.801734635257148e-06,
"loss": 0.8421810865402222,
"step": 302
},
{
"epoch": 1.178294573643411,
"grad_norm": 0.08629012107849121,
"learning_rate": 4.876535292400087e-06,
"loss": 0.5402819514274597,
"step": 304
},
{
"epoch": 1.1860465116279069,
"grad_norm": 0.17801746726036072,
"learning_rate": 4.95947352612787e-06,
"loss": 0.9019787311553955,
"step": 306
},
{
"epoch": 1.193798449612403,
"grad_norm": 0.13845831155776978,
"learning_rate": 5.050494790988212e-06,
"loss": 0.8330530524253845,
"step": 308
},
{
"epoch": 1.2015503875968991,
"grad_norm": 0.0652163103222847,
"learning_rate": 5.149539225613974e-06,
"loss": 1.0060863494873047,
"step": 310
},
{
"epoch": 1.2093023255813953,
"grad_norm": 0.0937967598438263,
"learning_rate": 5.256541692091797e-06,
"loss": 0.5403499007225037,
"step": 312
},
{
"epoch": 1.2170542635658914,
"grad_norm": 0.19726139307022095,
"learning_rate": 5.371431818800936e-06,
"loss": 0.37406668066978455,
"step": 314
},
{
"epoch": 1.2248062015503876,
"grad_norm": 0.11253905296325684,
"learning_rate": 5.494134046694099e-06,
"loss": 0.6960604786872864,
"step": 316
},
{
"epoch": 1.2325581395348837,
"grad_norm": 0.08688368648290634,
"learning_rate": 5.624567678989899e-06,
"loss": 0.7832977771759033,
"step": 318
},
{
"epoch": 1.2403100775193798,
"grad_norm": 0.19534340500831604,
"learning_rate": 5.762646934244156e-06,
"loss": 0.9501113295555115,
"step": 320
},
{
"epoch": 1.248062015503876,
"grad_norm": 0.06447340548038483,
"learning_rate": 5.908281002765248e-06,
"loss": 1.0130536556243896,
"step": 322
},
{
"epoch": 1.255813953488372,
"grad_norm": 0.11461887508630753,
"learning_rate": 6.061374106336328e-06,
"loss": 0.631900429725647,
"step": 324
},
{
"epoch": 1.2635658914728682,
"grad_norm": 0.09350797533988953,
"learning_rate": 6.2218255612051575e-06,
"loss": 0.8754401803016663,
"step": 326
},
{
"epoch": 1.2713178294573644,
"grad_norm": 0.11175557225942612,
"learning_rate": 6.389529844300143e-06,
"loss": 0.7127947807312012,
"step": 328
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.09055038541555405,
"learning_rate": 6.564376662629029e-06,
"loss": 0.4656026363372803,
"step": 330
},
{
"epoch": 1.2868217054263567,
"grad_norm": 0.09712733328342438,
"learning_rate": 6.74625102581455e-06,
"loss": 0.8079378008842468,
"step": 332
},
{
"epoch": 1.2945736434108528,
"grad_norm": 0.18206307291984558,
"learning_rate": 6.935033321719419e-06,
"loss": 0.5637804865837097,
"step": 334
},
{
"epoch": 1.302325581395349,
"grad_norm": 0.23368722200393677,
"learning_rate": 7.130599395110884e-06,
"loss": 0.8007771968841553,
"step": 336
},
{
"epoch": 1.310077519379845,
"grad_norm": 0.05224426090717316,
"learning_rate": 7.332820629313082e-06,
"loss": 0.551106333732605,
"step": 338
},
{
"epoch": 1.3178294573643412,
"grad_norm": 0.07984264940023422,
"learning_rate": 7.541564030793533e-06,
"loss": 0.7754759788513184,
"step": 340
},
{
"epoch": 1.3255813953488373,
"grad_norm": 0.22976501286029816,
"learning_rate": 7.75669231662816e-06,
"loss": 0.7786872982978821,
"step": 342
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.17023955285549164,
"learning_rate": 7.978064004787231e-06,
"loss": 0.7895460724830627,
"step": 344
},
{
"epoch": 1.3410852713178294,
"grad_norm": 0.12108391523361206,
"learning_rate": 8.205533507182961e-06,
"loss": 0.20940443873405457,
"step": 346
},
{
"epoch": 1.3488372093023255,
"grad_norm": 0.07635517418384552,
"learning_rate": 8.438951225417474e-06,
"loss": 0.819771409034729,
"step": 348
},
{
"epoch": 1.3565891472868217,
"grad_norm": 0.11260077357292175,
"learning_rate": 8.678163649168212e-06,
"loss": 0.9801982641220093,
"step": 350
},
{
"epoch": 1.3643410852713178,
"grad_norm": 0.09885291010141373,
"learning_rate": 8.923013457146075e-06,
"loss": 0.7718797326087952,
"step": 352
},
{
"epoch": 1.372093023255814,
"grad_norm": 0.09329655021429062,
"learning_rate": 9.173339620559931e-06,
"loss": 0.40787971019744873,
"step": 354
},
{
"epoch": 1.37984496124031,
"grad_norm": 0.11724522709846497,
"learning_rate": 9.428977509019326e-06,
"loss": 0.797160804271698,
"step": 356
},
{
"epoch": 1.3875968992248062,
"grad_norm": 0.11735495924949646,
"learning_rate": 9.689758998805924e-06,
"loss": 0.6483190059661865,
"step": 358
},
{
"epoch": 1.3953488372093024,
"grad_norm": 0.08914632350206375,
"learning_rate": 9.955512583442333e-06,
"loss": 0.7835768461227417,
"step": 360
},
{
"epoch": 1.4031007751937985,
"grad_norm": 0.07666268944740295,
"learning_rate": 1.0226063486485691e-05,
"loss": 0.6386092901229858,
"step": 362
},
{
"epoch": 1.4108527131782946,
"grad_norm": 0.08281254768371582,
"learning_rate": 1.0501233776471714e-05,
"loss": 0.8520874977111816,
"step": 364
},
{
"epoch": 1.4186046511627908,
"grad_norm": 0.14842084050178528,
"learning_rate": 1.0780842483933755e-05,
"loss": 0.37374499440193176,
"step": 366
},
{
"epoch": 1.4263565891472867,
"grad_norm": 0.24841120839118958,
"learning_rate": 1.1064705720419827e-05,
"loss": 0.3320968449115753,
"step": 368
},
{
"epoch": 1.4341085271317828,
"grad_norm": 0.11581484228372574,
"learning_rate": 1.135263679942935e-05,
"loss": 0.7746375799179077,
"step": 370
},
{
"epoch": 1.441860465116279,
"grad_norm": 0.0945417657494545,
"learning_rate": 1.1644446359190006e-05,
"loss": 0.6704602241516113,
"step": 372
},
{
"epoch": 1.449612403100775,
"grad_norm": 0.06997057050466537,
"learning_rate": 1.1939942487194116e-05,
"loss": 0.9213350415229797,
"step": 374
},
{
"epoch": 1.4573643410852712,
"grad_norm": 0.07435750216245651,
"learning_rate": 1.2238930846412471e-05,
"loss": 0.7233853936195374,
"step": 376
},
{
"epoch": 1.4651162790697674,
"grad_norm": 0.18093754351139069,
"learning_rate": 1.2541214803102757e-05,
"loss": 0.5185383558273315,
"step": 378
},
{
"epoch": 1.4728682170542635,
"grad_norm": 0.052637044340372086,
"learning_rate": 1.2846595556128331e-05,
"loss": 0.7751470804214478,
"step": 380
},
{
"epoch": 1.4806201550387597,
"grad_norm": 0.10150747746229172,
"learning_rate": 1.3154872267702518e-05,
"loss": 0.7363438010215759,
"step": 382
},
{
"epoch": 1.4883720930232558,
"grad_norm": 0.08896318078041077,
"learning_rate": 1.3465842195472318e-05,
"loss": 0.697909951210022,
"step": 384
},
{
"epoch": 1.496124031007752,
"grad_norm": 0.09349460154771805,
"learning_rate": 1.3779300825854622e-05,
"loss": 0.5058455467224121,
"step": 386
},
{
"epoch": 1.503875968992248,
"grad_norm": 0.0640454888343811,
"learning_rate": 1.4095042008537336e-05,
"loss": 0.6899944543838501,
"step": 388
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.08342494815587997,
"learning_rate": 1.4412858092056988e-05,
"loss": 0.5844802856445312,
"step": 390
},
{
"epoch": 1.5193798449612403,
"grad_norm": 0.14086060225963593,
"learning_rate": 1.4732540060363447e-05,
"loss": 0.6977730393409729,
"step": 392
},
{
"epoch": 1.5271317829457365,
"grad_norm": 0.135100856423378,
"learning_rate": 1.5053877670282176e-05,
"loss": 0.7261441349983215,
"step": 394
},
{
"epoch": 1.5348837209302326,
"grad_norm": 0.1089802235364914,
"learning_rate": 1.537665958978357e-05,
"loss": 0.7607800960540771,
"step": 396
},
{
"epoch": 1.5426356589147288,
"grad_norm": 0.17686955630779266,
"learning_rate": 1.5700673536968222e-05,
"loss": 0.5964785218238831,
"step": 398
},
{
"epoch": 1.550387596899225,
"grad_norm": 0.06133165583014488,
"learning_rate": 1.6025706419677047e-05,
"loss": 0.7581831812858582,
"step": 400
},
{
"epoch": 1.558139534883721,
"grad_norm": 0.11867906898260117,
"learning_rate": 1.6351544475634256e-05,
"loss": 0.5359363555908203,
"step": 402
},
{
"epoch": 1.5658914728682172,
"grad_norm": 0.08171830326318741,
"learning_rate": 1.6677973413030932e-05,
"loss": 0.9142735004425049,
"step": 404
},
{
"epoch": 1.5736434108527133,
"grad_norm": 0.11325247585773468,
"learning_rate": 1.7004778551456975e-05,
"loss": 0.7637568712234497,
"step": 406
},
{
"epoch": 1.5813953488372094,
"grad_norm": 0.054144054651260376,
"learning_rate": 1.7331744963088644e-05,
"loss": 0.31641456484794617,
"step": 408
},
{
"epoch": 1.5891472868217056,
"grad_norm": 0.051439665257930756,
"learning_rate": 1.7658657614038598e-05,
"loss": 0.780099630355835,
"step": 410
},
{
"epoch": 1.5968992248062015,
"grad_norm": 0.07669004052877426,
"learning_rate": 1.7985301505776015e-05,
"loss": 0.7998414635658264,
"step": 412
},
{
"epoch": 1.6046511627906976,
"grad_norm": 0.08620447665452957,
"learning_rate": 1.8311461816523192e-05,
"loss": 0.5864279866218567,
"step": 414
},
{
"epoch": 1.6124031007751938,
"grad_norm": 0.0925377830862999,
"learning_rate": 1.8636924042535962e-05,
"loss": 0.47105392813682556,
"step": 416
},
{
"epoch": 1.62015503875969,
"grad_norm": 0.08717449009418488,
"learning_rate": 1.8961474139175093e-05,
"loss": 0.8024092316627502,
"step": 418
},
{
"epoch": 1.627906976744186,
"grad_norm": 0.12033627182245255,
"learning_rate": 1.9284898661675586e-05,
"loss": 0.810451090335846,
"step": 420
},
{
"epoch": 1.6356589147286822,
"grad_norm": 0.07522077113389969,
"learning_rate": 1.9606984905521443e-05,
"loss": 0.4688906967639923,
"step": 422
},
{
"epoch": 1.6434108527131783,
"grad_norm": 0.07208564877510071,
"learning_rate": 1.9927521046333837e-05,
"loss": 0.7383279204368591,
"step": 424
},
{
"epoch": 1.6511627906976745,
"grad_norm": 0.09510686248540878,
"learning_rate": 2.0246296279180093e-05,
"loss": 0.8395543694496155,
"step": 426
},
{
"epoch": 1.6589147286821704,
"grad_norm": 0.15472382307052612,
"learning_rate": 2.0563100957212567e-05,
"loss": 0.8986775875091553,
"step": 428
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.09020368754863739,
"learning_rate": 2.0877726729545672e-05,
"loss": 0.8169777393341064,
"step": 430
},
{
"epoch": 1.6744186046511627,
"grad_norm": 0.198333740234375,
"learning_rate": 2.1189966678280578e-05,
"loss": 1.033119559288025,
"step": 432
},
{
"epoch": 1.6821705426356588,
"grad_norm": 0.08684570342302322,
"learning_rate": 2.149961545458772e-05,
"loss": 0.5892492532730103,
"step": 434
},
{
"epoch": 1.689922480620155,
"grad_norm": 0.0764966830611229,
"learning_rate": 2.1806469413757164e-05,
"loss": 0.7995302081108093,
"step": 436
},
{
"epoch": 1.697674418604651,
"grad_norm": 0.13916683197021484,
"learning_rate": 2.211032674912823e-05,
"loss": 0.8415105938911438,
"step": 438
},
{
"epoch": 1.7054263565891472,
"grad_norm": 0.24585378170013428,
"learning_rate": 2.241098762481052e-05,
"loss": 0.6350277066230774,
"step": 440
},
{
"epoch": 1.7131782945736433,
"grad_norm": 0.050845544785261154,
"learning_rate": 2.27082543071086e-05,
"loss": 0.8463593125343323,
"step": 442
},
{
"epoch": 1.7209302325581395,
"grad_norm": 0.07698489725589752,
"learning_rate": 2.3001931294564265e-05,
"loss": 0.5609403252601624,
"step": 444
},
{
"epoch": 1.7286821705426356,
"grad_norm": 0.06638149172067642,
"learning_rate": 2.3291825446530733e-05,
"loss": 0.8690592050552368,
"step": 446
},
{
"epoch": 1.7364341085271318,
"grad_norm": 0.08811336010694504,
"learning_rate": 2.357774611019419e-05,
"loss": 0.8064720630645752,
"step": 448
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.0755743682384491,
"learning_rate": 2.385950524595919e-05,
"loss": 1.0067108869552612,
"step": 450
},
{
"epoch": 1.751937984496124,
"grad_norm": 0.06093823164701462,
"learning_rate": 2.4136917551115478e-05,
"loss": 0.967079222202301,
"step": 452
},
{
"epoch": 1.7596899224806202,
"grad_norm": 0.09034255892038345,
"learning_rate": 2.4409800581704777e-05,
"loss": 0.6444424986839294,
"step": 454
},
{
"epoch": 1.7674418604651163,
"grad_norm": 0.1733829230070114,
"learning_rate": 2.4677974872507553e-05,
"loss": 0.8322298526763916,
"step": 456
},
{
"epoch": 1.7751937984496124,
"grad_norm": 0.23445071280002594,
"learning_rate": 2.4941264055070734e-05,
"loss": 0.4230212867259979,
"step": 458
},
{
"epoch": 1.7829457364341086,
"grad_norm": 0.1249038353562355,
"learning_rate": 2.5199494973698852e-05,
"loss": 0.6065483093261719,
"step": 460
},
{
"epoch": 1.7906976744186047,
"grad_norm": 0.08323405683040619,
"learning_rate": 2.545249779933216e-05,
"loss": 0.8183580040931702,
"step": 462
},
{
"epoch": 1.7984496124031009,
"grad_norm": 0.10287293046712875,
"learning_rate": 2.5700106141237063e-05,
"loss": 0.9282822608947754,
"step": 464
},
{
"epoch": 1.806201550387597,
"grad_norm": 0.053924717009067535,
"learning_rate": 2.594215715643524e-05,
"loss": 0.8734548687934875,
"step": 466
},
{
"epoch": 1.8139534883720931,
"grad_norm": 0.10388979315757751,
"learning_rate": 2.6178491656799497e-05,
"loss": 0.8903089165687561,
"step": 468
},
{
"epoch": 1.8217054263565893,
"grad_norm": 0.06755795329809189,
"learning_rate": 2.640895421374602e-05,
"loss": 0.4710087180137634,
"step": 470
},
{
"epoch": 1.8294573643410854,
"grad_norm": 0.08703745901584625,
"learning_rate": 2.6633393260454096e-05,
"loss": 1.1290743350982666,
"step": 472
},
{
"epoch": 1.8372093023255816,
"grad_norm": 0.10183677822351456,
"learning_rate": 2.6851661191546034e-05,
"loss": 0.6608400344848633,
"step": 474
},
{
"epoch": 1.8449612403100775,
"grad_norm": 0.11454630643129349,
"learning_rate": 2.706361446016192e-05,
"loss": 0.850265383720398,
"step": 476
},
{
"epoch": 1.8527131782945736,
"grad_norm": 0.06782646477222443,
"learning_rate": 2.7269113672365096e-05,
"loss": 0.6361703872680664,
"step": 478
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.08559778332710266,
"learning_rate": 2.7468023678816444e-05,
"loss": 1.0639129877090454,
"step": 480
},
{
"epoch": 1.8682170542635659,
"grad_norm": 0.06762553006410599,
"learning_rate": 2.766021366365728e-05,
"loss": 0.6422796845436096,
"step": 482
},
{
"epoch": 1.875968992248062,
"grad_norm": 0.07438317686319351,
"learning_rate": 2.784555723054208e-05,
"loss": 0.7208263874053955,
"step": 484
},
{
"epoch": 1.8837209302325582,
"grad_norm": 0.07318796217441559,
"learning_rate": 2.8023932485764764e-05,
"loss": 0.8420804738998413,
"step": 486
},
{
"epoch": 1.8914728682170543,
"grad_norm": 0.10379486531019211,
"learning_rate": 2.81952221184238e-05,
"loss": 0.5533670783042908,
"step": 488
},
{
"epoch": 1.8992248062015504,
"grad_norm": 1.0894800424575806,
"learning_rate": 2.8359313477573215e-05,
"loss": 0.688605785369873,
"step": 490
},
{
"epoch": 1.9069767441860463,
"grad_norm": 0.23758739233016968,
"learning_rate": 2.8516098646309108e-05,
"loss": 0.5789573192596436,
"step": 492
},
{
"epoch": 1.9147286821705425,
"grad_norm": 0.06857667863368988,
"learning_rate": 2.8665474512742607e-05,
"loss": 0.6448074579238892,
"step": 494
},
{
"epoch": 1.9224806201550386,
"grad_norm": 0.08650626242160797,
"learning_rate": 2.8807342837812783e-05,
"loss": 0.6479641199111938,
"step": 496
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.07275024801492691,
"learning_rate": 2.894161031989497e-05,
"loss": 0.4521400034427643,
"step": 498
},
{
"epoch": 1.937984496124031,
"grad_norm": 0.05953352525830269,
"learning_rate": 2.906818865616178e-05,
"loss": 0.9132779240608215,
"step": 500
},
{
"epoch": 1.945736434108527,
"grad_norm": 0.12861226499080658,
"learning_rate": 2.9186994600656647e-05,
"loss": 0.6908618807792664,
"step": 502
},
{
"epoch": 1.9534883720930232,
"grad_norm": 0.07091208547353745,
"learning_rate": 2.929795001904172e-05,
"loss": 0.6676538586616516,
"step": 504
},
{
"epoch": 1.9612403100775193,
"grad_norm": 0.11093394458293915,
"learning_rate": 2.9400981939983914e-05,
"loss": 1.0052788257598877,
"step": 506
},
{
"epoch": 1.9689922480620154,
"grad_norm": 0.05772824585437775,
"learning_rate": 2.9496022603145494e-05,
"loss": 0.7913935780525208,
"step": 508
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.0762370154261589,
"learning_rate": 2.9583009503747627e-05,
"loss": 0.9280475974082947,
"step": 510
},
{
"epoch": 1.9844961240310077,
"grad_norm": 0.18662315607070923,
"learning_rate": 2.9661885433677437e-05,
"loss": 0.7493736743927002,
"step": 512
},
{
"epoch": 1.9922480620155039,
"grad_norm": 0.07221183180809021,
"learning_rate": 2.9732598519111736e-05,
"loss": 1.0501880645751953,
"step": 514
},
{
"epoch": 2.0,
"grad_norm": 0.08491652458906174,
"learning_rate": 2.9795102254632528e-05,
"loss": 1.011595368385315,
"step": 516
},
{
"epoch": 2.007751937984496,
"grad_norm": 0.09373293071985245,
"learning_rate": 2.9849355533811937e-05,
"loss": 0.5705936551094055,
"step": 518
},
{
"epoch": 2.0155038759689923,
"grad_norm": 0.06463813781738281,
"learning_rate": 2.9895322676246387e-05,
"loss": 0.7379302978515625,
"step": 520
},
{
"epoch": 2.0232558139534884,
"grad_norm": 0.09566348791122437,
"learning_rate": 2.993297345102233e-05,
"loss": 0.46209296584129333,
"step": 522
},
{
"epoch": 2.0310077519379846,
"grad_norm": 0.05616720765829086,
"learning_rate": 2.9962283096597995e-05,
"loss": 0.773676335811615,
"step": 524
},
{
"epoch": 2.0387596899224807,
"grad_norm": 0.09464262425899506,
"learning_rate": 2.998323233708815e-05,
"loss": 0.6592158675193787,
"step": 526
},
{
"epoch": 2.046511627906977,
"grad_norm": 0.09258489310741425,
"learning_rate": 2.999580739494117e-05,
"loss": 0.7777129411697388,
"step": 528
},
{
"epoch": 2.054263565891473,
"grad_norm": 0.13635995984077454,
"learning_rate": 3e-05,
"loss": 0.385895311832428,
"step": 530
},
{
"epoch": 2.062015503875969,
"grad_norm": 0.1054837629199028,
"learning_rate": 2.999580739494117e-05,
"loss": 0.7748541235923767,
"step": 532
},
{
"epoch": 2.0697674418604652,
"grad_norm": 0.08860507607460022,
"learning_rate": 2.998323233708815e-05,
"loss": 0.407875120639801,
"step": 534
},
{
"epoch": 2.0775193798449614,
"grad_norm": 0.07644882053136826,
"learning_rate": 2.9962283096598e-05,
"loss": 0.41405466198921204,
"step": 536
},
{
"epoch": 2.0852713178294575,
"grad_norm": 0.20681916177272797,
"learning_rate": 2.9932973451022333e-05,
"loss": 0.701027512550354,
"step": 538
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.07310563325881958,
"learning_rate": 2.9895322676246387e-05,
"loss": 0.4735100567340851,
"step": 540
},
{
"epoch": 2.10077519379845,
"grad_norm": 0.0755162462592125,
"learning_rate": 2.9849355533811937e-05,
"loss": 0.27081194519996643,
"step": 542
},
{
"epoch": 2.108527131782946,
"grad_norm": 0.07929737865924835,
"learning_rate": 2.9795102254632528e-05,
"loss": 0.6002092957496643,
"step": 544
},
{
"epoch": 2.116279069767442,
"grad_norm": 0.25740522146224976,
"learning_rate": 2.973259851911174e-05,
"loss": 0.4636404514312744,
"step": 546
},
{
"epoch": 2.124031007751938,
"grad_norm": 0.07688764482736588,
"learning_rate": 2.9661885433677434e-05,
"loss": 0.4923861026763916,
"step": 548
},
{
"epoch": 2.1317829457364343,
"grad_norm": 0.24001885950565338,
"learning_rate": 2.9583009503747627e-05,
"loss": 0.3250856101512909,
"step": 550
},
{
"epoch": 2.13953488372093,
"grad_norm": 0.09132993221282959,
"learning_rate": 2.9496022603145497e-05,
"loss": 0.7897784113883972,
"step": 552
},
{
"epoch": 2.147286821705426,
"grad_norm": 0.09284122288227081,
"learning_rate": 2.940098193998391e-05,
"loss": 0.8441802859306335,
"step": 554
},
{
"epoch": 2.1550387596899223,
"grad_norm": 0.07503140717744827,
"learning_rate": 2.9297950019041724e-05,
"loss": 0.4028940498828888,
"step": 556
},
{
"epoch": 2.1627906976744184,
"grad_norm": 0.11651087552309036,
"learning_rate": 2.9186994600656647e-05,
"loss": 0.6657426953315735,
"step": 558
},
{
"epoch": 2.1705426356589146,
"grad_norm": 0.06494183093309402,
"learning_rate": 2.906818865616178e-05,
"loss": 0.5439774990081787,
"step": 560
},
{
"epoch": 2.1782945736434107,
"grad_norm": 0.05145857110619545,
"learning_rate": 2.8941610319894977e-05,
"loss": 0.7213448882102966,
"step": 562
},
{
"epoch": 2.186046511627907,
"grad_norm": 0.1473415493965149,
"learning_rate": 2.8807342837812783e-05,
"loss": 0.38557326793670654,
"step": 564
},
{
"epoch": 2.193798449612403,
"grad_norm": 0.2709689438343048,
"learning_rate": 2.8665474512742603e-05,
"loss": 0.41664543747901917,
"step": 566
},
{
"epoch": 2.201550387596899,
"grad_norm": 0.06767801940441132,
"learning_rate": 2.851609864630911e-05,
"loss": 0.4579377770423889,
"step": 568
},
{
"epoch": 2.2093023255813953,
"grad_norm": 0.3255118727684021,
"learning_rate": 2.8359313477573215e-05,
"loss": 0.3196179270744324,
"step": 570
},
{
"epoch": 2.2170542635658914,
"grad_norm": 0.1096249520778656,
"learning_rate": 2.8195222118423792e-05,
"loss": 0.5369107127189636,
"step": 572
},
{
"epoch": 2.2248062015503876,
"grad_norm": 0.2894248068332672,
"learning_rate": 2.8023932485764768e-05,
"loss": 0.23676389455795288,
"step": 574
},
{
"epoch": 2.2325581395348837,
"grad_norm": 0.19947735965251923,
"learning_rate": 2.7845557230542076e-05,
"loss": 0.44901129603385925,
"step": 576
},
{
"epoch": 2.24031007751938,
"grad_norm": 0.06506390869617462,
"learning_rate": 2.766021366365729e-05,
"loss": 0.5859266519546509,
"step": 578
},
{
"epoch": 2.248062015503876,
"grad_norm": 0.10611079633235931,
"learning_rate": 2.746802367881645e-05,
"loss": 0.6005488038063049,
"step": 580
},
{
"epoch": 2.255813953488372,
"grad_norm": 0.05949712544679642,
"learning_rate": 2.726911367236509e-05,
"loss": 0.32260704040527344,
"step": 582
},
{
"epoch": 2.2635658914728682,
"grad_norm": 0.09240850806236267,
"learning_rate": 2.706361446016193e-05,
"loss": 0.8233704566955566,
"step": 584
},
{
"epoch": 2.2713178294573644,
"grad_norm": 0.08874181658029556,
"learning_rate": 2.685166119154604e-05,
"loss": 0.4317566156387329,
"step": 586
},
{
"epoch": 2.2790697674418605,
"grad_norm": 0.05373215302824974,
"learning_rate": 2.6633393260454096e-05,
"loss": 0.8105683326721191,
"step": 588
},
{
"epoch": 2.2868217054263567,
"grad_norm": 0.05979755148291588,
"learning_rate": 2.6408954213746025e-05,
"loss": 0.4510256350040436,
"step": 590
},
{
"epoch": 2.294573643410853,
"grad_norm": 0.056298933923244476,
"learning_rate": 2.6178491656799504e-05,
"loss": 0.7199202179908752,
"step": 592
},
{
"epoch": 2.302325581395349,
"grad_norm": 0.06022209674119949,
"learning_rate": 2.5942157156435248e-05,
"loss": 0.47333112359046936,
"step": 594
},
{
"epoch": 2.310077519379845,
"grad_norm": 0.1291632205247879,
"learning_rate": 2.570010614123707e-05,
"loss": 0.4947061836719513,
"step": 596
},
{
"epoch": 2.317829457364341,
"grad_norm": 0.7499107718467712,
"learning_rate": 2.5452497799332167e-05,
"loss": 0.6046218872070312,
"step": 598
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.05242902785539627,
"learning_rate": 2.519949497369886e-05,
"loss": 0.37087422609329224,
"step": 600
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.4367184340953827,
"learning_rate": 2.494126405507074e-05,
"loss": 0.579389214515686,
"step": 602
},
{
"epoch": 2.3410852713178296,
"grad_norm": 0.0486953966319561,
"learning_rate": 2.467797487250756e-05,
"loss": 0.7329738736152649,
"step": 604
},
{
"epoch": 2.3488372093023258,
"grad_norm": 0.09891688823699951,
"learning_rate": 2.4409800581704784e-05,
"loss": 0.5676310658454895,
"step": 606
},
{
"epoch": 2.356589147286822,
"grad_norm": 0.12641683220863342,
"learning_rate": 2.4136917551115484e-05,
"loss": 0.6383396983146667,
"step": 608
},
{
"epoch": 2.3643410852713176,
"grad_norm": 0.07655075937509537,
"learning_rate": 2.3859505245959206e-05,
"loss": 0.6663593053817749,
"step": 610
},
{
"epoch": 2.3720930232558137,
"grad_norm": 0.062206994742155075,
"learning_rate": 2.3577746110194188e-05,
"loss": 0.32523995637893677,
"step": 612
},
{
"epoch": 2.37984496124031,
"grad_norm": 0.13163718581199646,
"learning_rate": 2.329182544653074e-05,
"loss": 0.5087898373603821,
"step": 614
},
{
"epoch": 2.387596899224806,
"grad_norm": 0.04577813297510147,
"learning_rate": 2.3001931294564278e-05,
"loss": 0.5215730667114258,
"step": 616
},
{
"epoch": 2.395348837209302,
"grad_norm": 0.06540275365114212,
"learning_rate": 2.27082543071086e-05,
"loss": 0.7069303393363953,
"step": 618
},
{
"epoch": 2.4031007751937983,
"grad_norm": 0.04587893187999725,
"learning_rate": 2.2410987624810527e-05,
"loss": 0.6097102165222168,
"step": 620
},
{
"epoch": 2.4108527131782944,
"grad_norm": 0.18531644344329834,
"learning_rate": 2.2110326749128246e-05,
"loss": 0.28449299931526184,
"step": 622
},
{
"epoch": 2.4186046511627906,
"grad_norm": 0.06915592402219772,
"learning_rate": 2.180646941375716e-05,
"loss": 0.5394483208656311,
"step": 624
},
{
"epoch": 2.4263565891472867,
"grad_norm": 0.0683450847864151,
"learning_rate": 2.149961545458774e-05,
"loss": 0.351560115814209,
"step": 626
},
{
"epoch": 2.434108527131783,
"grad_norm": 0.0661771222949028,
"learning_rate": 2.1189966678280585e-05,
"loss": 0.6790451407432556,
"step": 628
},
{
"epoch": 2.441860465116279,
"grad_norm": 0.2682180106639862,
"learning_rate": 2.0877726729545665e-05,
"loss": 0.34560778737068176,
"step": 630
},
{
"epoch": 2.449612403100775,
"grad_norm": 0.05607810616493225,
"learning_rate": 2.0563100957212584e-05,
"loss": 0.35299909114837646,
"step": 632
},
{
"epoch": 2.4573643410852712,
"grad_norm": 0.1276787519454956,
"learning_rate": 2.02462962791801e-05,
"loss": 0.45075708627700806,
"step": 634
},
{
"epoch": 2.4651162790697674,
"grad_norm": 0.07231509685516357,
"learning_rate": 1.9927521046333833e-05,
"loss": 0.4892677664756775,
"step": 636
},
{
"epoch": 2.4728682170542635,
"grad_norm": 0.12232723832130432,
"learning_rate": 1.9606984905521463e-05,
"loss": 0.6066938042640686,
"step": 638
},
{
"epoch": 2.4806201550387597,
"grad_norm": 0.054693497717380524,
"learning_rate": 1.928489866167559e-05,
"loss": 0.3974202275276184,
"step": 640
},
{
"epoch": 2.488372093023256,
"grad_norm": 0.07348073273897171,
"learning_rate": 1.896147413917511e-05,
"loss": 0.43941450119018555,
"step": 642
},
{
"epoch": 2.496124031007752,
"grad_norm": 0.05807847902178764,
"learning_rate": 1.863692404253597e-05,
"loss": 0.5508748888969421,
"step": 644
},
{
"epoch": 2.503875968992248,
"grad_norm": 0.08628101646900177,
"learning_rate": 1.83114618165232e-05,
"loss": 0.5954611897468567,
"step": 646
},
{
"epoch": 2.511627906976744,
"grad_norm": 0.08698024600744247,
"learning_rate": 1.798530150577603e-05,
"loss": 0.7873520851135254,
"step": 648
},
{
"epoch": 2.5193798449612403,
"grad_norm": 0.0802086666226387,
"learning_rate": 1.765865761403861e-05,
"loss": 0.27345526218414307,
"step": 650
},
{
"epoch": 2.5271317829457365,
"grad_norm": 0.058408260345458984,
"learning_rate": 1.7331744963088654e-05,
"loss": 0.5833812355995178,
"step": 652
},
{
"epoch": 2.5348837209302326,
"grad_norm": 0.10947899520397186,
"learning_rate": 1.7004778551456995e-05,
"loss": 0.3762988746166229,
"step": 654
},
{
"epoch": 2.5426356589147288,
"grad_norm": 0.08571284264326096,
"learning_rate": 1.667797341303094e-05,
"loss": 0.5067244172096252,
"step": 656
},
{
"epoch": 2.550387596899225,
"grad_norm": 0.06394554674625397,
"learning_rate": 1.6351544475634277e-05,
"loss": 0.42985814809799194,
"step": 658
},
{
"epoch": 2.558139534883721,
"grad_norm": 0.1848040074110031,
"learning_rate": 1.6025706419677054e-05,
"loss": 0.8337141871452332,
"step": 660
},
{
"epoch": 2.565891472868217,
"grad_norm": 0.04375322908163071,
"learning_rate": 1.570067353696823e-05,
"loss": 0.5003541707992554,
"step": 662
},
{
"epoch": 2.5736434108527133,
"grad_norm": 0.04600893706083298,
"learning_rate": 1.5376659589783585e-05,
"loss": 0.3022569715976715,
"step": 664
},
{
"epoch": 2.5813953488372094,
"grad_norm": 0.05343756452202797,
"learning_rate": 1.5053877670282193e-05,
"loss": 0.4718426465988159,
"step": 666
},
{
"epoch": 2.5891472868217056,
"grad_norm": 0.09041419625282288,
"learning_rate": 1.473254006036345e-05,
"loss": 0.4901648163795471,
"step": 668
},
{
"epoch": 2.5968992248062017,
"grad_norm": 0.06343325972557068,
"learning_rate": 1.4412858092056995e-05,
"loss": 0.6914687156677246,
"step": 670
},
{
"epoch": 2.604651162790698,
"grad_norm": 0.06813778728246689,
"learning_rate": 1.4095042008537343e-05,
"loss": 0.4894769787788391,
"step": 672
},
{
"epoch": 2.612403100775194,
"grad_norm": 0.1439589262008667,
"learning_rate": 1.3779300825854615e-05,
"loss": 0.7514118552207947,
"step": 674
},
{
"epoch": 2.62015503875969,
"grad_norm": 0.07022061944007874,
"learning_rate": 1.3465842195472315e-05,
"loss": 0.7393191456794739,
"step": 676
},
{
"epoch": 2.6279069767441863,
"grad_norm": 0.07147393375635147,
"learning_rate": 1.3154872267702535e-05,
"loss": 0.8212107419967651,
"step": 678
},
{
"epoch": 2.6356589147286824,
"grad_norm": 0.06350687146186829,
"learning_rate": 1.2846595556128338e-05,
"loss": 0.7656596302986145,
"step": 680
},
{
"epoch": 2.6434108527131785,
"grad_norm": 0.1861017942428589,
"learning_rate": 1.2541214803102764e-05,
"loss": 0.39778298139572144,
"step": 682
},
{
"epoch": 2.6511627906976747,
"grad_norm": 0.12844400107860565,
"learning_rate": 1.2238930846412478e-05,
"loss": 0.4492897689342499,
"step": 684
},
{
"epoch": 2.6589147286821704,
"grad_norm": 0.09717841446399689,
"learning_rate": 1.1939942487194114e-05,
"loss": 0.5477796792984009,
"step": 686
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.0593947097659111,
"learning_rate": 1.1644446359190002e-05,
"loss": 0.28585392236709595,
"step": 688
},
{
"epoch": 2.6744186046511627,
"grad_norm": 0.045567888766527176,
"learning_rate": 1.1352636799429364e-05,
"loss": 0.5053625106811523,
"step": 690
},
{
"epoch": 2.682170542635659,
"grad_norm": 0.05959075689315796,
"learning_rate": 1.1064705720419824e-05,
"loss": 0.567241370677948,
"step": 692
},
{
"epoch": 2.689922480620155,
"grad_norm": 0.15132786333560944,
"learning_rate": 1.0780842483933762e-05,
"loss": 0.6684018969535828,
"step": 694
},
{
"epoch": 2.697674418604651,
"grad_norm": 0.08239150047302246,
"learning_rate": 1.0501233776471719e-05,
"loss": 0.33106908202171326,
"step": 696
},
{
"epoch": 2.705426356589147,
"grad_norm": 0.06124155595898628,
"learning_rate": 1.0226063486485696e-05,
"loss": 0.566682755947113,
"step": 698
},
{
"epoch": 2.7131782945736433,
"grad_norm": 0.07035624980926514,
"learning_rate": 9.955512583442338e-06,
"loss": 0.4341398775577545,
"step": 700
},
{
"epoch": 2.7209302325581395,
"grad_norm": 0.05901051685214043,
"learning_rate": 9.689758998805937e-06,
"loss": 0.4164765775203705,
"step": 702
},
{
"epoch": 2.7286821705426356,
"grad_norm": 0.04497726634144783,
"learning_rate": 9.428977509019321e-06,
"loss": 0.40749120712280273,
"step": 704
},
{
"epoch": 2.7364341085271318,
"grad_norm": 0.15438151359558105,
"learning_rate": 9.173339620559945e-06,
"loss": 0.28900110721588135,
"step": 706
},
{
"epoch": 2.744186046511628,
"grad_norm": 0.05592001974582672,
"learning_rate": 8.923013457146072e-06,
"loss": 0.41211241483688354,
"step": 708
},
{
"epoch": 2.751937984496124,
"grad_norm": 0.0629056990146637,
"learning_rate": 8.678163649168217e-06,
"loss": 0.5537896156311035,
"step": 710
},
{
"epoch": 2.75968992248062,
"grad_norm": 0.06699282675981522,
"learning_rate": 8.43895122541748e-06,
"loss": 0.5788278579711914,
"step": 712
},
{
"epoch": 2.7674418604651163,
"grad_norm": 0.13577990233898163,
"learning_rate": 8.205533507182964e-06,
"loss": 0.37125617265701294,
"step": 714
},
{
"epoch": 2.7751937984496124,
"grad_norm": 0.16210103034973145,
"learning_rate": 7.978064004787233e-06,
"loss": 0.3962320387363434,
"step": 716
},
{
"epoch": 2.7829457364341086,
"grad_norm": 0.06700747460126877,
"learning_rate": 7.756692316628171e-06,
"loss": 0.6869024634361267,
"step": 718
},
{
"epoch": 2.7906976744186047,
"grad_norm": 0.06708226352930069,
"learning_rate": 7.541564030793529e-06,
"loss": 0.5122371912002563,
"step": 720
},
{
"epoch": 2.798449612403101,
"grad_norm": 0.0619942843914032,
"learning_rate": 7.332820629313089e-06,
"loss": 0.4030957818031311,
"step": 722
},
{
"epoch": 2.806201550387597,
"grad_norm": 0.08853983879089355,
"learning_rate": 7.1305993951108914e-06,
"loss": 0.4579683840274811,
"step": 724
},
{
"epoch": 2.813953488372093,
"grad_norm": 0.1136261448264122,
"learning_rate": 6.935033321719423e-06,
"loss": 0.4582154452800751,
"step": 726
},
{
"epoch": 2.8217054263565893,
"grad_norm": 0.03374806419014931,
"learning_rate": 6.74625102581455e-06,
"loss": 0.46171411871910095,
"step": 728
},
{
"epoch": 2.8294573643410854,
"grad_norm": 0.05085311084985733,
"learning_rate": 6.56437666262903e-06,
"loss": 0.4829785227775574,
"step": 730
},
{
"epoch": 2.8372093023255816,
"grad_norm": 0.051897477358579636,
"learning_rate": 6.389529844300143e-06,
"loss": 0.4446869194507599,
"step": 732
},
{
"epoch": 2.8449612403100772,
"grad_norm": 0.06380399316549301,
"learning_rate": 6.221825561205165e-06,
"loss": 0.5170708298683167,
"step": 734
},
{
"epoch": 2.8527131782945734,
"grad_norm": 0.07282527536153793,
"learning_rate": 6.061374106336333e-06,
"loss": 0.6230844259262085,
"step": 736
},
{
"epoch": 2.8604651162790695,
"grad_norm": 0.09063038229942322,
"learning_rate": 5.908281002765252e-06,
"loss": 0.35932058095932007,
"step": 738
},
{
"epoch": 2.8682170542635657,
"grad_norm": 1.006274938583374,
"learning_rate": 5.762646934244159e-06,
"loss": 0.3806362748146057,
"step": 740
},
{
"epoch": 2.875968992248062,
"grad_norm": 0.3232397139072418,
"learning_rate": 5.624567678989899e-06,
"loss": 0.513190507888794,
"step": 742
},
{
"epoch": 2.883720930232558,
"grad_norm": 0.120949886739254,
"learning_rate": 5.494134046694099e-06,
"loss": 0.6526894569396973,
"step": 744
},
{
"epoch": 2.891472868217054,
"grad_norm": 0.12644588947296143,
"learning_rate": 5.371431818800933e-06,
"loss": 0.4791458249092102,
"step": 746
},
{
"epoch": 2.89922480620155,
"grad_norm": 0.06687454879283905,
"learning_rate": 5.256541692091802e-06,
"loss": 0.5770004987716675,
"step": 748
},
{
"epoch": 2.9069767441860463,
"grad_norm": 0.08843245357275009,
"learning_rate": 5.149539225613978e-06,
"loss": 0.3434167802333832,
"step": 750
},
{
"epoch": 2.9147286821705425,
"grad_norm": 0.07200266420841217,
"learning_rate": 5.050494790988215e-06,
"loss": 0.4575299322605133,
"step": 752
},
{
"epoch": 2.9224806201550386,
"grad_norm": 0.07060275971889496,
"learning_rate": 4.959473526127871e-06,
"loss": 0.3564453721046448,
"step": 754
},
{
"epoch": 2.9302325581395348,
"grad_norm": 0.06662537902593613,
"learning_rate": 4.876535292400089e-06,
"loss": 0.7428521513938904,
"step": 756
},
{
"epoch": 2.937984496124031,
"grad_norm": 0.05963343381881714,
"learning_rate": 4.801734635257146e-06,
"loss": 0.4571719169616699,
"step": 758
},
{
"epoch": 2.945736434108527,
"grad_norm": 0.05507909134030342,
"learning_rate": 4.73512074836392e-06,
"loss": 0.5132399797439575,
"step": 760
},
{
"epoch": 2.953488372093023,
"grad_norm": 0.05289539694786072,
"learning_rate": 4.676737441244973e-06,
"loss": 0.814540445804596,
"step": 762
},
{
"epoch": 2.9612403100775193,
"grad_norm": 0.05587043985724449,
"learning_rate": 4.626623110472678e-06,
"loss": 0.5996021628379822,
"step": 764
},
{
"epoch": 2.9689922480620154,
"grad_norm": 0.0820513367652893,
"learning_rate": 4.584810714415136e-06,
"loss": 0.2337801605463028,
"step": 766
},
{
"epoch": 2.9767441860465116,
"grad_norm": 0.08467745780944824,
"learning_rate": 4.551327751560703e-06,
"loss": 0.43569573760032654,
"step": 768
},
{
"epoch": 2.9844961240310077,
"grad_norm": 0.09394794702529907,
"learning_rate": 4.526196242433211e-06,
"loss": 0.42782190442085266,
"step": 770
},
{
"epoch": 2.992248062015504,
"grad_norm": 0.05439142882823944,
"learning_rate": 4.509432715109889e-06,
"loss": 0.516304612159729,
"step": 772
},
{
"epoch": 3.0,
"grad_norm": 0.03667069226503372,
"learning_rate": 4.50104819435143e-06,
"loss": 0.15898612141609192,
"step": 774
},
{
"epoch": 3.0,
"step": 774,
"total_flos": 3.2487544184132076e+18,
"train_loss": 0.8073726282178277,
"train_runtime": 15483.8831,
"train_samples_per_second": 3.199,
"train_steps_per_second": 0.05
}
],
"logging_steps": 2,
"max_steps": 774,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.2487544184132076e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}