9b-65 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
9b4ff2d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1004,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00796812749003984,
"grad_norm": 0.8701496124267578,
"learning_rate": 2.9411764705882356e-07,
"loss": 2.091559410095215,
"step": 2
},
{
"epoch": 0.01593625498007968,
"grad_norm": 0.9730402231216431,
"learning_rate": 8.823529411764706e-07,
"loss": 2.0084309577941895,
"step": 4
},
{
"epoch": 0.02390438247011952,
"grad_norm": 23.87286949157715,
"learning_rate": 1.4705882352941177e-06,
"loss": 2.1494643688201904,
"step": 6
},
{
"epoch": 0.03187250996015936,
"grad_norm": 0.5717560052871704,
"learning_rate": 2.058823529411765e-06,
"loss": 1.8582701683044434,
"step": 8
},
{
"epoch": 0.0398406374501992,
"grad_norm": 1.2766757011413574,
"learning_rate": 2.647058823529412e-06,
"loss": 1.387536644935608,
"step": 10
},
{
"epoch": 0.04780876494023904,
"grad_norm": 0.6310649514198303,
"learning_rate": 3.235294117647059e-06,
"loss": 1.6384257078170776,
"step": 12
},
{
"epoch": 0.055776892430278883,
"grad_norm": 0.5051367282867432,
"learning_rate": 3.8235294117647055e-06,
"loss": 1.8997098207473755,
"step": 14
},
{
"epoch": 0.06374501992031872,
"grad_norm": 3.0508971214294434,
"learning_rate": 4.411764705882353e-06,
"loss": 1.5508460998535156,
"step": 16
},
{
"epoch": 0.07171314741035857,
"grad_norm": 0.6429860591888428,
"learning_rate": 4.9999999999999996e-06,
"loss": 1.544956922531128,
"step": 18
},
{
"epoch": 0.0796812749003984,
"grad_norm": 0.5023921132087708,
"learning_rate": 5.588235294117647e-06,
"loss": 1.4685248136520386,
"step": 20
},
{
"epoch": 0.08764940239043825,
"grad_norm": 0.4997398853302002,
"learning_rate": 6.176470588235294e-06,
"loss": 1.1129087209701538,
"step": 22
},
{
"epoch": 0.09561752988047809,
"grad_norm": 1.2241058349609375,
"learning_rate": 6.7647058823529414e-06,
"loss": 1.2971528768539429,
"step": 24
},
{
"epoch": 0.10358565737051793,
"grad_norm": 0.583693265914917,
"learning_rate": 7.3529411764705884e-06,
"loss": 1.399789810180664,
"step": 26
},
{
"epoch": 0.11155378486055777,
"grad_norm": 0.41760727763175964,
"learning_rate": 7.941176470588236e-06,
"loss": 1.6126567125320435,
"step": 28
},
{
"epoch": 0.11952191235059761,
"grad_norm": 0.6942929625511169,
"learning_rate": 8.529411764705882e-06,
"loss": 1.3107324838638306,
"step": 30
},
{
"epoch": 0.12749003984063745,
"grad_norm": 2.4918148517608643,
"learning_rate": 9.117647058823529e-06,
"loss": 1.1656028032302856,
"step": 32
},
{
"epoch": 0.13545816733067728,
"grad_norm": 0.6997283101081848,
"learning_rate": 9.705882352941177e-06,
"loss": 1.2270398139953613,
"step": 34
},
{
"epoch": 0.14342629482071714,
"grad_norm": 0.41730615496635437,
"learning_rate": 1.0294117647058824e-05,
"loss": 1.3723477125167847,
"step": 36
},
{
"epoch": 0.15139442231075698,
"grad_norm": 0.5808508992195129,
"learning_rate": 1.0882352941176471e-05,
"loss": 1.166778802871704,
"step": 38
},
{
"epoch": 0.1593625498007968,
"grad_norm": 0.29741156101226807,
"learning_rate": 1.1470588235294117e-05,
"loss": 1.2935056686401367,
"step": 40
},
{
"epoch": 0.16733067729083664,
"grad_norm": 1.2481650114059448,
"learning_rate": 1.2058823529411765e-05,
"loss": 0.765558123588562,
"step": 42
},
{
"epoch": 0.1752988047808765,
"grad_norm": 0.4549512267112732,
"learning_rate": 1.2647058823529412e-05,
"loss": 0.9544646739959717,
"step": 44
},
{
"epoch": 0.18326693227091634,
"grad_norm": 2.7968297004699707,
"learning_rate": 1.323529411764706e-05,
"loss": 0.9361187815666199,
"step": 46
},
{
"epoch": 0.19123505976095617,
"grad_norm": 0.6919461488723755,
"learning_rate": 1.3823529411764705e-05,
"loss": 1.17107093334198,
"step": 48
},
{
"epoch": 0.199203187250996,
"grad_norm": 0.5921279191970825,
"learning_rate": 1.4411764705882353e-05,
"loss": 1.3282028436660767,
"step": 50
},
{
"epoch": 0.20717131474103587,
"grad_norm": 0.5274451971054077,
"learning_rate": 1.5e-05,
"loss": 1.2876099348068237,
"step": 52
},
{
"epoch": 0.2151394422310757,
"grad_norm": 1.5639928579330444,
"learning_rate": 1.4999853294586629e-05,
"loss": 1.109473466873169,
"step": 54
},
{
"epoch": 0.22310756972111553,
"grad_norm": 0.6973602771759033,
"learning_rate": 1.4999413184723549e-05,
"loss": 1.5242366790771484,
"step": 56
},
{
"epoch": 0.23107569721115537,
"grad_norm": 0.5269781351089478,
"learning_rate": 1.4998679689541569e-05,
"loss": 1.3331416845321655,
"step": 58
},
{
"epoch": 0.23904382470119523,
"grad_norm": 0.4338107109069824,
"learning_rate": 1.499765284092446e-05,
"loss": 0.9126222729682922,
"step": 60
},
{
"epoch": 0.24701195219123506,
"grad_norm": 0.3536894917488098,
"learning_rate": 1.4996332683507557e-05,
"loss": 1.3404982089996338,
"step": 62
},
{
"epoch": 0.2549800796812749,
"grad_norm": 0.7808045148849487,
"learning_rate": 1.4994719274675816e-05,
"loss": 1.1124142408370972,
"step": 64
},
{
"epoch": 0.26294820717131473,
"grad_norm": 0.3446694314479828,
"learning_rate": 1.4992812684561331e-05,
"loss": 1.2747009992599487,
"step": 66
},
{
"epoch": 0.27091633466135456,
"grad_norm": 13.088342666625977,
"learning_rate": 1.4990612996040276e-05,
"loss": 1.282449722290039,
"step": 68
},
{
"epoch": 0.2788844621513944,
"grad_norm": 2.078386068344116,
"learning_rate": 1.498812030472931e-05,
"loss": 1.5724037885665894,
"step": 70
},
{
"epoch": 0.2868525896414343,
"grad_norm": 0.6237571239471436,
"learning_rate": 1.498533471898141e-05,
"loss": 0.8898400068283081,
"step": 72
},
{
"epoch": 0.2948207171314741,
"grad_norm": 1.2999846935272217,
"learning_rate": 1.4982256359881172e-05,
"loss": 1.1757071018218994,
"step": 74
},
{
"epoch": 0.30278884462151395,
"grad_norm": 0.5385910868644714,
"learning_rate": 1.4978885361239544e-05,
"loss": 1.4709817171096802,
"step": 76
},
{
"epoch": 0.3107569721115538,
"grad_norm": 1.2187010049819946,
"learning_rate": 1.4975221869588004e-05,
"loss": 0.9453757405281067,
"step": 78
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.3541651964187622,
"learning_rate": 1.4971266044172201e-05,
"loss": 0.8519526720046997,
"step": 80
},
{
"epoch": 0.32669322709163345,
"grad_norm": 0.4355093836784363,
"learning_rate": 1.4967018056945026e-05,
"loss": 1.3587875366210938,
"step": 82
},
{
"epoch": 0.3346613545816733,
"grad_norm": 0.6306200623512268,
"learning_rate": 1.4962478092559135e-05,
"loss": 0.9281608462333679,
"step": 84
},
{
"epoch": 0.3426294820717131,
"grad_norm": 0.4139735698699951,
"learning_rate": 1.495764634835893e-05,
"loss": 1.3322420120239258,
"step": 86
},
{
"epoch": 0.350597609561753,
"grad_norm": 0.9012618660926819,
"learning_rate": 1.4952523034371973e-05,
"loss": 0.9445306658744812,
"step": 88
},
{
"epoch": 0.35856573705179284,
"grad_norm": 0.46748796105384827,
"learning_rate": 1.4947108373299864e-05,
"loss": 1.3331313133239746,
"step": 90
},
{
"epoch": 0.3665338645418327,
"grad_norm": 1.0903550386428833,
"learning_rate": 1.4941402600508558e-05,
"loss": 1.128015398979187,
"step": 92
},
{
"epoch": 0.3745019920318725,
"grad_norm": 0.4805486798286438,
"learning_rate": 1.4935405964018128e-05,
"loss": 1.2455147504806519,
"step": 94
},
{
"epoch": 0.38247011952191234,
"grad_norm": 0.7429084181785583,
"learning_rate": 1.4929118724491996e-05,
"loss": 1.1041914224624634,
"step": 96
},
{
"epoch": 0.3904382470119522,
"grad_norm": 0.27306675910949707,
"learning_rate": 1.4922541155225586e-05,
"loss": 1.2655969858169556,
"step": 98
},
{
"epoch": 0.398406374501992,
"grad_norm": 0.41318008303642273,
"learning_rate": 1.4915673542134462e-05,
"loss": 0.8851726651191711,
"step": 100
},
{
"epoch": 0.4063745019920319,
"grad_norm": 0.4386235773563385,
"learning_rate": 1.4908516183741889e-05,
"loss": 1.265491008758545,
"step": 102
},
{
"epoch": 0.41434262948207173,
"grad_norm": 0.6781812906265259,
"learning_rate": 1.4901069391165857e-05,
"loss": 0.8081492185592651,
"step": 104
},
{
"epoch": 0.42231075697211157,
"grad_norm": 1.4451416730880737,
"learning_rate": 1.4893333488105559e-05,
"loss": 0.7170528173446655,
"step": 106
},
{
"epoch": 0.4302788844621514,
"grad_norm": 0.6063726544380188,
"learning_rate": 1.4885308810827328e-05,
"loss": 0.9935809969902039,
"step": 108
},
{
"epoch": 0.43824701195219123,
"grad_norm": 0.40737852454185486,
"learning_rate": 1.4876995708150003e-05,
"loss": 1.2845995426177979,
"step": 110
},
{
"epoch": 0.44621513944223107,
"grad_norm": 0.4796580374240875,
"learning_rate": 1.4868394541429784e-05,
"loss": 0.8904252052307129,
"step": 112
},
{
"epoch": 0.4541832669322709,
"grad_norm": 3.001218318939209,
"learning_rate": 1.4859505684544512e-05,
"loss": 1.1530516147613525,
"step": 114
},
{
"epoch": 0.46215139442231074,
"grad_norm": 0.4466836452484131,
"learning_rate": 1.4850329523877425e-05,
"loss": 1.2753629684448242,
"step": 116
},
{
"epoch": 0.4701195219123506,
"grad_norm": 0.28066951036453247,
"learning_rate": 1.4840866458300357e-05,
"loss": 1.3401973247528076,
"step": 118
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.2835182249546051,
"learning_rate": 1.4831116899156402e-05,
"loss": 1.2199780941009521,
"step": 120
},
{
"epoch": 0.4860557768924303,
"grad_norm": 0.36116963624954224,
"learning_rate": 1.4821081270242039e-05,
"loss": 0.9814391136169434,
"step": 122
},
{
"epoch": 0.4940239043824701,
"grad_norm": 0.6912099123001099,
"learning_rate": 1.48107600077887e-05,
"loss": 1.0494424104690552,
"step": 124
},
{
"epoch": 0.50199203187251,
"grad_norm": 0.8504573702812195,
"learning_rate": 1.480015356044381e-05,
"loss": 0.9379956126213074,
"step": 126
},
{
"epoch": 0.5099601593625498,
"grad_norm": 0.5862733125686646,
"learning_rate": 1.4789262389251301e-05,
"loss": 1.2821743488311768,
"step": 128
},
{
"epoch": 0.5179282868525896,
"grad_norm": 0.5818023681640625,
"learning_rate": 1.4778086967631548e-05,
"loss": 0.9355220198631287,
"step": 130
},
{
"epoch": 0.5258964143426295,
"grad_norm": 0.31655120849609375,
"learning_rate": 1.4766627781360796e-05,
"loss": 0.826532244682312,
"step": 132
},
{
"epoch": 0.5338645418326693,
"grad_norm": 0.5141142010688782,
"learning_rate": 1.4754885328550062e-05,
"loss": 0.9170287251472473,
"step": 134
},
{
"epoch": 0.5418326693227091,
"grad_norm": 0.47723662853240967,
"learning_rate": 1.4742860119623458e-05,
"loss": 1.3180201053619385,
"step": 136
},
{
"epoch": 0.549800796812749,
"grad_norm": 0.32824379205703735,
"learning_rate": 1.473055267729602e-05,
"loss": 0.9599122405052185,
"step": 138
},
{
"epoch": 0.5577689243027888,
"grad_norm": 1.1303349733352661,
"learning_rate": 1.4717963536550988e-05,
"loss": 1.0953630208969116,
"step": 140
},
{
"epoch": 0.5657370517928287,
"grad_norm": 0.49718862771987915,
"learning_rate": 1.470509324461653e-05,
"loss": 1.0326279401779175,
"step": 142
},
{
"epoch": 0.5737051792828686,
"grad_norm": 0.2485317885875702,
"learning_rate": 1.4691942360941986e-05,
"loss": 1.2258632183074951,
"step": 144
},
{
"epoch": 0.5816733067729084,
"grad_norm": 3.5433390140533447,
"learning_rate": 1.4678511457173523e-05,
"loss": 1.202100396156311,
"step": 146
},
{
"epoch": 0.5896414342629482,
"grad_norm": 0.3908817172050476,
"learning_rate": 1.4664801117129303e-05,
"loss": 0.9758645296096802,
"step": 148
},
{
"epoch": 0.5976095617529881,
"grad_norm": 0.5502234697341919,
"learning_rate": 1.4650811936774093e-05,
"loss": 0.9454991817474365,
"step": 150
},
{
"epoch": 0.6055776892430279,
"grad_norm": 4.790173530578613,
"learning_rate": 1.4636544524193378e-05,
"loss": 0.9398374557495117,
"step": 152
},
{
"epoch": 0.6135458167330677,
"grad_norm": 0.638011634349823,
"learning_rate": 1.46219994995669e-05,
"loss": 1.090728998184204,
"step": 154
},
{
"epoch": 0.6215139442231076,
"grad_norm": 2.4593403339385986,
"learning_rate": 1.4607177495141734e-05,
"loss": 1.1246390342712402,
"step": 156
},
{
"epoch": 0.6294820717131474,
"grad_norm": 0.8616807460784912,
"learning_rate": 1.4592079155204776e-05,
"loss": 1.1782993078231812,
"step": 158
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.2915763854980469,
"learning_rate": 1.457670513605475e-05,
"loss": 1.0174801349639893,
"step": 160
},
{
"epoch": 0.6454183266932271,
"grad_norm": 0.27435067296028137,
"learning_rate": 1.4561056105973688e-05,
"loss": 0.8091227412223816,
"step": 162
},
{
"epoch": 0.6533864541832669,
"grad_norm": 0.2575240731239319,
"learning_rate": 1.4545132745197857e-05,
"loss": 1.1529077291488647,
"step": 164
},
{
"epoch": 0.6613545816733067,
"grad_norm": 0.777723491191864,
"learning_rate": 1.4528935745888218e-05,
"loss": 0.8908942937850952,
"step": 166
},
{
"epoch": 0.6693227091633466,
"grad_norm": 0.2517397105693817,
"learning_rate": 1.4512465812100317e-05,
"loss": 1.2097852230072021,
"step": 168
},
{
"epoch": 0.6772908366533864,
"grad_norm": 3.4033937454223633,
"learning_rate": 1.4495723659753695e-05,
"loss": 1.2028913497924805,
"step": 170
},
{
"epoch": 0.6852589641434262,
"grad_norm": 0.3606719374656677,
"learning_rate": 1.447871001660076e-05,
"loss": 0.8955773115158081,
"step": 172
},
{
"epoch": 0.6932270916334662,
"grad_norm": 0.2552003860473633,
"learning_rate": 1.4461425622195157e-05,
"loss": 1.2185531854629517,
"step": 174
},
{
"epoch": 0.701195219123506,
"grad_norm": 1.0111852884292603,
"learning_rate": 1.4443871227859621e-05,
"loss": 0.7776660919189453,
"step": 176
},
{
"epoch": 0.7091633466135459,
"grad_norm": 0.7659691572189331,
"learning_rate": 1.4426047596653316e-05,
"loss": 0.9216206669807434,
"step": 178
},
{
"epoch": 0.7171314741035857,
"grad_norm": 1.132752776145935,
"learning_rate": 1.4407955503338663e-05,
"loss": 1.0899910926818848,
"step": 180
},
{
"epoch": 0.7250996015936255,
"grad_norm": 0.16658742725849152,
"learning_rate": 1.4389595734347675e-05,
"loss": 0.5195258855819702,
"step": 182
},
{
"epoch": 0.7330677290836654,
"grad_norm": 0.6180145144462585,
"learning_rate": 1.4370969087747755e-05,
"loss": 1.3304177522659302,
"step": 184
},
{
"epoch": 0.7410358565737052,
"grad_norm": 0.35436052083969116,
"learning_rate": 1.4352076373207023e-05,
"loss": 1.2653801441192627,
"step": 186
},
{
"epoch": 0.749003984063745,
"grad_norm": 0.2843472361564636,
"learning_rate": 1.4332918411959106e-05,
"loss": 1.1138914823532104,
"step": 188
},
{
"epoch": 0.7569721115537849,
"grad_norm": 1.0151716470718384,
"learning_rate": 1.4313496036767444e-05,
"loss": 0.8904833197593689,
"step": 190
},
{
"epoch": 0.7649402390438247,
"grad_norm": 0.7267096042633057,
"learning_rate": 1.4293810091889105e-05,
"loss": 1.2340463399887085,
"step": 192
},
{
"epoch": 0.7729083665338645,
"grad_norm": 0.47353217005729675,
"learning_rate": 1.4273861433038063e-05,
"loss": 0.9082501530647278,
"step": 194
},
{
"epoch": 0.7808764940239044,
"grad_norm": 0.9817029237747192,
"learning_rate": 1.425365092734802e-05,
"loss": 0.663750946521759,
"step": 196
},
{
"epoch": 0.7888446215139442,
"grad_norm": 0.7875825762748718,
"learning_rate": 1.423317945333471e-05,
"loss": 0.7919776439666748,
"step": 198
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.5649994015693665,
"learning_rate": 1.4212447900857703e-05,
"loss": 1.0543051958084106,
"step": 200
},
{
"epoch": 0.8047808764940239,
"grad_norm": 0.1523721069097519,
"learning_rate": 1.4191457171081736e-05,
"loss": 1.0212864875793457,
"step": 202
},
{
"epoch": 0.8127490039840638,
"grad_norm": 0.28413787484169006,
"learning_rate": 1.417020817643753e-05,
"loss": 1.5364233255386353,
"step": 204
},
{
"epoch": 0.8207171314741036,
"grad_norm": 0.2831563651561737,
"learning_rate": 1.4148701840582129e-05,
"loss": 1.2227693796157837,
"step": 206
},
{
"epoch": 0.8286852589641435,
"grad_norm": 2.0232136249542236,
"learning_rate": 1.412693909835877e-05,
"loss": 0.7362918853759766,
"step": 208
},
{
"epoch": 0.8366533864541833,
"grad_norm": 0.6372008323669434,
"learning_rate": 1.4104920895756216e-05,
"loss": 1.265373945236206,
"step": 210
},
{
"epoch": 0.8446215139442231,
"grad_norm": 0.22620588541030884,
"learning_rate": 1.4082648189867656e-05,
"loss": 1.2132854461669922,
"step": 212
},
{
"epoch": 0.852589641434263,
"grad_norm": 0.287081241607666,
"learning_rate": 1.4060121948849098e-05,
"loss": 0.9602269530296326,
"step": 214
},
{
"epoch": 0.8605577689243028,
"grad_norm": 0.8160057067871094,
"learning_rate": 1.4037343151877285e-05,
"loss": 1.452444076538086,
"step": 216
},
{
"epoch": 0.8685258964143426,
"grad_norm": 1.8605669736862183,
"learning_rate": 1.4014312789107124e-05,
"loss": 1.3142669200897217,
"step": 218
},
{
"epoch": 0.8764940239043825,
"grad_norm": 0.28666868805885315,
"learning_rate": 1.3991031861628662e-05,
"loss": 1.2287095785140991,
"step": 220
},
{
"epoch": 0.8844621513944223,
"grad_norm": 0.29921239614486694,
"learning_rate": 1.3967501381423552e-05,
"loss": 1.48736572265625,
"step": 222
},
{
"epoch": 0.8924302788844621,
"grad_norm": 1.2563499212265015,
"learning_rate": 1.3943722371321075e-05,
"loss": 0.9397075176239014,
"step": 224
},
{
"epoch": 0.900398406374502,
"grad_norm": 0.39466801285743713,
"learning_rate": 1.3919695864953679e-05,
"loss": 1.0238375663757324,
"step": 226
},
{
"epoch": 0.9083665338645418,
"grad_norm": 2.8415801525115967,
"learning_rate": 1.3895422906712042e-05,
"loss": 1.1098148822784424,
"step": 228
},
{
"epoch": 0.9163346613545816,
"grad_norm": 0.6246854662895203,
"learning_rate": 1.3870904551699686e-05,
"loss": 1.1869398355484009,
"step": 230
},
{
"epoch": 0.9243027888446215,
"grad_norm": 0.308601975440979,
"learning_rate": 1.38461418656871e-05,
"loss": 1.3266777992248535,
"step": 232
},
{
"epoch": 0.9322709163346613,
"grad_norm": 0.3320607841014862,
"learning_rate": 1.3821135925065423e-05,
"loss": 0.8920221924781799,
"step": 234
},
{
"epoch": 0.9402390438247012,
"grad_norm": 0.2533508837223053,
"learning_rate": 1.3795887816799647e-05,
"loss": 0.8552533984184265,
"step": 236
},
{
"epoch": 0.9482071713147411,
"grad_norm": 0.37766775488853455,
"learning_rate": 1.3770398638381374e-05,
"loss": 0.5838753581047058,
"step": 238
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.5343811511993408,
"learning_rate": 1.3744669497781111e-05,
"loss": 0.8912972807884216,
"step": 240
},
{
"epoch": 0.9641434262948207,
"grad_norm": 0.5110613107681274,
"learning_rate": 1.3718701513400104e-05,
"loss": 1.1340361833572388,
"step": 242
},
{
"epoch": 0.9721115537848606,
"grad_norm": 0.34986478090286255,
"learning_rate": 1.369249581402173e-05,
"loss": 1.2093524932861328,
"step": 244
},
{
"epoch": 0.9800796812749004,
"grad_norm": 0.6902351975440979,
"learning_rate": 1.3666053538762414e-05,
"loss": 0.973604142665863,
"step": 246
},
{
"epoch": 0.9880478087649402,
"grad_norm": 0.364798903465271,
"learning_rate": 1.363937583702214e-05,
"loss": 1.004298448562622,
"step": 248
},
{
"epoch": 0.9960159362549801,
"grad_norm": 0.594591498374939,
"learning_rate": 1.3612463868434462e-05,
"loss": 1.005676031112671,
"step": 250
},
{
"epoch": 1.00398406374502,
"grad_norm": 0.9396325349807739,
"learning_rate": 1.3585318802816118e-05,
"loss": 0.9656413197517395,
"step": 252
},
{
"epoch": 1.0119521912350598,
"grad_norm": 0.5345960855484009,
"learning_rate": 1.3557941820116163e-05,
"loss": 0.7036761045455933,
"step": 254
},
{
"epoch": 1.0199203187250996,
"grad_norm": 0.8415208458900452,
"learning_rate": 1.3530334110364691e-05,
"loss": 1.0861495733261108,
"step": 256
},
{
"epoch": 1.0278884462151394,
"grad_norm": 0.4500897228717804,
"learning_rate": 1.35024968736211e-05,
"loss": 1.0180453062057495,
"step": 258
},
{
"epoch": 1.0358565737051793,
"grad_norm": 0.3588436245918274,
"learning_rate": 1.3474431319921936e-05,
"loss": 0.9354724884033203,
"step": 260
},
{
"epoch": 1.043824701195219,
"grad_norm": 0.3891165852546692,
"learning_rate": 1.3446138669228274e-05,
"loss": 0.9144407510757446,
"step": 262
},
{
"epoch": 1.051792828685259,
"grad_norm": 1.5371865034103394,
"learning_rate": 1.3417620151372716e-05,
"loss": 0.9848403930664062,
"step": 264
},
{
"epoch": 1.0597609561752988,
"grad_norm": 0.6903578639030457,
"learning_rate": 1.3388877006005911e-05,
"loss": 0.6154371500015259,
"step": 266
},
{
"epoch": 1.0677290836653386,
"grad_norm": 0.19243323802947998,
"learning_rate": 1.3359910482542686e-05,
"loss": 0.8479989171028137,
"step": 268
},
{
"epoch": 1.0756972111553784,
"grad_norm": 0.5195255279541016,
"learning_rate": 1.3330721840107718e-05,
"loss": 0.5587765574455261,
"step": 270
},
{
"epoch": 1.0836653386454183,
"grad_norm": 0.3604806661605835,
"learning_rate": 1.3301312347480817e-05,
"loss": 1.1884621381759644,
"step": 272
},
{
"epoch": 1.091633466135458,
"grad_norm": 1.2894952297210693,
"learning_rate": 1.3271683283041767e-05,
"loss": 0.625873863697052,
"step": 274
},
{
"epoch": 1.099601593625498,
"grad_norm": 0.27667495608329773,
"learning_rate": 1.3241835934714759e-05,
"loss": 0.7773606181144714,
"step": 276
},
{
"epoch": 1.1075697211155378,
"grad_norm": 0.23738747835159302,
"learning_rate": 1.3211771599912408e-05,
"loss": 0.7299227714538574,
"step": 278
},
{
"epoch": 1.1155378486055776,
"grad_norm": 0.3089618980884552,
"learning_rate": 1.3181491585479354e-05,
"loss": 0.8809335231781006,
"step": 280
},
{
"epoch": 1.1235059760956174,
"grad_norm": 0.25363025069236755,
"learning_rate": 1.3150997207635463e-05,
"loss": 1.0729031562805176,
"step": 282
},
{
"epoch": 1.1314741035856573,
"grad_norm": 0.47339093685150146,
"learning_rate": 1.31202897919186e-05,
"loss": 1.0833154916763306,
"step": 284
},
{
"epoch": 1.139442231075697,
"grad_norm": 0.18187947571277618,
"learning_rate": 1.3089370673127026e-05,
"loss": 0.3476455509662628,
"step": 286
},
{
"epoch": 1.1474103585657371,
"grad_norm": 0.226917564868927,
"learning_rate": 1.3058241195261357e-05,
"loss": 0.6067731976509094,
"step": 288
},
{
"epoch": 1.155378486055777,
"grad_norm": 1.2993286848068237,
"learning_rate": 1.3026902711466169e-05,
"loss": 0.8683360815048218,
"step": 290
},
{
"epoch": 1.1633466135458168,
"grad_norm": 0.4915187954902649,
"learning_rate": 1.2995356583971152e-05,
"loss": 0.6069297790527344,
"step": 292
},
{
"epoch": 1.1713147410358566,
"grad_norm": 0.24846410751342773,
"learning_rate": 1.2963604184031913e-05,
"loss": 1.096907615661621,
"step": 294
},
{
"epoch": 1.1792828685258965,
"grad_norm": 0.40995633602142334,
"learning_rate": 1.2931646891870371e-05,
"loss": 1.1847357749938965,
"step": 296
},
{
"epoch": 1.1872509960159363,
"grad_norm": 0.42281821370124817,
"learning_rate": 1.2899486096614742e-05,
"loss": 1.1937490701675415,
"step": 298
},
{
"epoch": 1.1952191235059761,
"grad_norm": 0.5707376003265381,
"learning_rate": 1.2867123196239186e-05,
"loss": 0.5830255746841431,
"step": 300
},
{
"epoch": 1.203187250996016,
"grad_norm": 0.5589886903762817,
"learning_rate": 1.2834559597503008e-05,
"loss": 0.8486528992652893,
"step": 302
},
{
"epoch": 1.2111553784860558,
"grad_norm": 0.4859887361526489,
"learning_rate": 1.2801796715889535e-05,
"loss": 0.7010272145271301,
"step": 304
},
{
"epoch": 1.2191235059760956,
"grad_norm": 0.3184964060783386,
"learning_rate": 1.2768835975544572e-05,
"loss": 1.087632179260254,
"step": 306
},
{
"epoch": 1.2270916334661355,
"grad_norm": 0.6567210555076599,
"learning_rate": 1.2735678809214497e-05,
"loss": 0.8818908333778381,
"step": 308
},
{
"epoch": 1.2350597609561753,
"grad_norm": 0.8336063027381897,
"learning_rate": 1.270232665818399e-05,
"loss": 1.1307973861694336,
"step": 310
},
{
"epoch": 1.2430278884462151,
"grad_norm": 0.46054601669311523,
"learning_rate": 1.266878097221338e-05,
"loss": 1.0041382312774658,
"step": 312
},
{
"epoch": 1.250996015936255,
"grad_norm": 1.004090666770935,
"learning_rate": 1.263504320947562e-05,
"loss": 0.8790667057037354,
"step": 314
},
{
"epoch": 1.2589641434262948,
"grad_norm": 0.6220927834510803,
"learning_rate": 1.2601114836492917e-05,
"loss": 0.7389086484909058,
"step": 316
},
{
"epoch": 1.2669322709163346,
"grad_norm": 1.0630143880844116,
"learning_rate": 1.2566997328072966e-05,
"loss": 0.6448332667350769,
"step": 318
},
{
"epoch": 1.2749003984063745,
"grad_norm": 0.4616350829601288,
"learning_rate": 1.2532692167244852e-05,
"loss": 0.5268493890762329,
"step": 320
},
{
"epoch": 1.2828685258964143,
"grad_norm": 3.0412392616271973,
"learning_rate": 1.2498200845194596e-05,
"loss": 0.9104723930358887,
"step": 322
},
{
"epoch": 1.2908366533864541,
"grad_norm": 0.4327695369720459,
"learning_rate": 1.2463524861200316e-05,
"loss": 0.8771180510520935,
"step": 324
},
{
"epoch": 1.298804780876494,
"grad_norm": 0.6371755599975586,
"learning_rate": 1.2428665722567073e-05,
"loss": 1.1892993450164795,
"step": 326
},
{
"epoch": 1.3067729083665338,
"grad_norm": 0.503496527671814,
"learning_rate": 1.2393624944561334e-05,
"loss": 0.7128881216049194,
"step": 328
},
{
"epoch": 1.3147410358565736,
"grad_norm": 0.43169552087783813,
"learning_rate": 1.2358404050345122e-05,
"loss": 0.7095832824707031,
"step": 330
},
{
"epoch": 1.3227091633466135,
"grad_norm": 0.5526296496391296,
"learning_rate": 1.2323004570909798e-05,
"loss": 0.8684831261634827,
"step": 332
},
{
"epoch": 1.3306772908366533,
"grad_norm": 1.0183297395706177,
"learning_rate": 1.2287428045009517e-05,
"loss": 0.665216863155365,
"step": 334
},
{
"epoch": 1.3386454183266931,
"grad_norm": 0.7202191352844238,
"learning_rate": 1.2251676019094331e-05,
"loss": 0.8956350684165955,
"step": 336
},
{
"epoch": 1.3466135458167332,
"grad_norm": 0.38658207654953003,
"learning_rate": 1.2215750047242982e-05,
"loss": 1.0827162265777588,
"step": 338
},
{
"epoch": 1.354581673306773,
"grad_norm": 0.2367570847272873,
"learning_rate": 1.2179651691095329e-05,
"loss": 1.0241369009017944,
"step": 340
},
{
"epoch": 1.3625498007968129,
"grad_norm": 0.3254954218864441,
"learning_rate": 1.2143382519784498e-05,
"loss": 1.1053788661956787,
"step": 342
},
{
"epoch": 1.3705179282868527,
"grad_norm": 0.25897926092147827,
"learning_rate": 1.2106944109868636e-05,
"loss": 1.037227988243103,
"step": 344
},
{
"epoch": 1.3784860557768925,
"grad_norm": 0.4815937876701355,
"learning_rate": 1.2070338045262406e-05,
"loss": 0.7165056467056274,
"step": 346
},
{
"epoch": 1.3864541832669324,
"grad_norm": 0.2625236213207245,
"learning_rate": 1.2033565917168133e-05,
"loss": 1.0718673467636108,
"step": 348
},
{
"epoch": 1.3944223107569722,
"grad_norm": 0.4188198447227478,
"learning_rate": 1.1996629324006632e-05,
"loss": 0.6164529323577881,
"step": 350
},
{
"epoch": 1.402390438247012,
"grad_norm": 0.25191178917884827,
"learning_rate": 1.195952987134773e-05,
"loss": 1.0730476379394531,
"step": 352
},
{
"epoch": 1.4103585657370519,
"grad_norm": 0.5282136797904968,
"learning_rate": 1.1922269171840477e-05,
"loss": 1.1133763790130615,
"step": 354
},
{
"epoch": 1.4183266932270917,
"grad_norm": 0.39372941851615906,
"learning_rate": 1.1884848845143039e-05,
"loss": 0.9437786936759949,
"step": 356
},
{
"epoch": 1.4262948207171315,
"grad_norm": 0.27430135011672974,
"learning_rate": 1.1847270517852312e-05,
"loss": 1.101191759109497,
"step": 358
},
{
"epoch": 1.4342629482071714,
"grad_norm": 0.4338213801383972,
"learning_rate": 1.180953582343319e-05,
"loss": 0.5615993738174438,
"step": 360
},
{
"epoch": 1.4422310756972112,
"grad_norm": 0.20297643542289734,
"learning_rate": 1.177164640214758e-05,
"loss": 0.648676335811615,
"step": 362
},
{
"epoch": 1.450199203187251,
"grad_norm": 0.43412458896636963,
"learning_rate": 1.1733603900983107e-05,
"loss": 0.9797654747962952,
"step": 364
},
{
"epoch": 1.4581673306772909,
"grad_norm": 0.27069559693336487,
"learning_rate": 1.1695409973581504e-05,
"loss": 1.0201314687728882,
"step": 366
},
{
"epoch": 1.4661354581673307,
"grad_norm": 0.21320168673992157,
"learning_rate": 1.1657066280166745e-05,
"loss": 0.5693846940994263,
"step": 368
},
{
"epoch": 1.4741035856573705,
"grad_norm": 0.609273374080658,
"learning_rate": 1.1618574487472867e-05,
"loss": 0.6598872542381287,
"step": 370
},
{
"epoch": 1.4820717131474104,
"grad_norm": 1.0151580572128296,
"learning_rate": 1.1579936268671537e-05,
"loss": 1.1873997449874878,
"step": 372
},
{
"epoch": 1.4900398406374502,
"grad_norm": 0.5126774907112122,
"learning_rate": 1.1541153303299305e-05,
"loss": 1.0114318132400513,
"step": 374
},
{
"epoch": 1.49800796812749,
"grad_norm": 0.4790279269218445,
"learning_rate": 1.1502227277184605e-05,
"loss": 1.0180116891860962,
"step": 376
},
{
"epoch": 1.5059760956175299,
"grad_norm": 3.794914722442627,
"learning_rate": 1.1463159882374477e-05,
"loss": 0.8887977004051208,
"step": 378
},
{
"epoch": 1.5139442231075697,
"grad_norm": 0.2821894884109497,
"learning_rate": 1.1423952817061005e-05,
"loss": 1.0826634168624878,
"step": 380
},
{
"epoch": 1.5219123505976095,
"grad_norm": 0.26013344526290894,
"learning_rate": 1.1384607785507527e-05,
"loss": 0.6501424312591553,
"step": 382
},
{
"epoch": 1.5298804780876494,
"grad_norm": 0.21201461553573608,
"learning_rate": 1.1345126497974507e-05,
"loss": 0.6929817795753479,
"step": 384
},
{
"epoch": 1.5378486055776892,
"grad_norm": 0.603386402130127,
"learning_rate": 1.1305510670645247e-05,
"loss": 0.9329879879951477,
"step": 386
},
{
"epoch": 1.545816733067729,
"grad_norm": 0.3552367389202118,
"learning_rate": 1.1265762025551246e-05,
"loss": 1.1002554893493652,
"step": 388
},
{
"epoch": 1.5537848605577689,
"grad_norm": 0.8357146382331848,
"learning_rate": 1.122588229049737e-05,
"loss": 0.5634505152702332,
"step": 390
},
{
"epoch": 1.5617529880478087,
"grad_norm": 0.9403584003448486,
"learning_rate": 1.118587319898673e-05,
"loss": 0.6033604145050049,
"step": 392
},
{
"epoch": 1.5697211155378485,
"grad_norm": 2.087606430053711,
"learning_rate": 1.1145736490145346e-05,
"loss": 1.0487326383590698,
"step": 394
},
{
"epoch": 1.5776892430278884,
"grad_norm": 0.7443987727165222,
"learning_rate": 1.110547390864654e-05,
"loss": 0.9917337894439697,
"step": 396
},
{
"epoch": 1.5856573705179282,
"grad_norm": 0.4282863438129425,
"learning_rate": 1.1065087204635103e-05,
"loss": 1.0512839555740356,
"step": 398
},
{
"epoch": 1.593625498007968,
"grad_norm": 0.6512730121612549,
"learning_rate": 1.1024578133651209e-05,
"loss": 0.6531898975372314,
"step": 400
},
{
"epoch": 1.6015936254980079,
"grad_norm": 0.47180187702178955,
"learning_rate": 1.0983948456554123e-05,
"loss": 1.0244213342666626,
"step": 402
},
{
"epoch": 1.6095617529880477,
"grad_norm": 0.41504454612731934,
"learning_rate": 1.0943199939445644e-05,
"loss": 1.141480803489685,
"step": 404
},
{
"epoch": 1.6175298804780875,
"grad_norm": 0.7667415142059326,
"learning_rate": 1.0902334353593342e-05,
"loss": 0.6996335387229919,
"step": 406
},
{
"epoch": 1.6254980079681274,
"grad_norm": 0.23972085118293762,
"learning_rate": 1.0861353475353559e-05,
"loss": 0.5143875479698181,
"step": 408
},
{
"epoch": 1.6334661354581672,
"grad_norm": 0.1878281980752945,
"learning_rate": 1.08202590860942e-05,
"loss": 0.6930667757987976,
"step": 410
},
{
"epoch": 1.641434262948207,
"grad_norm": 0.3578081429004669,
"learning_rate": 1.0779052972117306e-05,
"loss": 0.4972156882286072,
"step": 412
},
{
"epoch": 1.6494023904382469,
"grad_norm": 0.26842987537384033,
"learning_rate": 1.0737736924581386e-05,
"loss": 0.7380754351615906,
"step": 414
},
{
"epoch": 1.6573705179282867,
"grad_norm": 0.31403297185897827,
"learning_rate": 1.0696312739423573e-05,
"loss": 0.7590941190719604,
"step": 416
},
{
"epoch": 1.6653386454183265,
"grad_norm": 0.5314321517944336,
"learning_rate": 1.0654782217281563e-05,
"loss": 0.8922839760780334,
"step": 418
},
{
"epoch": 1.6733067729083664,
"grad_norm": 0.2761631906032562,
"learning_rate": 1.0613147163415331e-05,
"loss": 1.112337350845337,
"step": 420
},
{
"epoch": 1.6812749003984062,
"grad_norm": 0.3739781081676483,
"learning_rate": 1.0571409387628661e-05,
"loss": 0.9243249893188477,
"step": 422
},
{
"epoch": 1.6892430278884463,
"grad_norm": 0.8964663147926331,
"learning_rate": 1.0529570704190493e-05,
"loss": 0.5647684335708618,
"step": 424
},
{
"epoch": 1.697211155378486,
"grad_norm": 0.333854079246521,
"learning_rate": 1.0487632931756039e-05,
"loss": 1.0856620073318481,
"step": 426
},
{
"epoch": 1.705179282868526,
"grad_norm": 0.26213064789772034,
"learning_rate": 1.0445597893287742e-05,
"loss": 1.0230387449264526,
"step": 428
},
{
"epoch": 1.7131474103585658,
"grad_norm": 0.4736036956310272,
"learning_rate": 1.0403467415976025e-05,
"loss": 0.6771261692047119,
"step": 430
},
{
"epoch": 1.7211155378486056,
"grad_norm": 0.8969900608062744,
"learning_rate": 1.036124333115988e-05,
"loss": 0.8703440427780151,
"step": 432
},
{
"epoch": 1.7290836653386454,
"grad_norm": 0.9138644337654114,
"learning_rate": 1.0318927474247258e-05,
"loss": 0.6527059674263,
"step": 434
},
{
"epoch": 1.7370517928286853,
"grad_norm": 1.2199382781982422,
"learning_rate": 1.0276521684635272e-05,
"loss": 0.42034152150154114,
"step": 436
},
{
"epoch": 1.745019920318725,
"grad_norm": 0.753322422504425,
"learning_rate": 1.0234027805630263e-05,
"loss": 0.8424271941184998,
"step": 438
},
{
"epoch": 1.752988047808765,
"grad_norm": 0.6605293154716492,
"learning_rate": 1.0191447684367665e-05,
"loss": 0.6778283715248108,
"step": 440
},
{
"epoch": 1.7609561752988048,
"grad_norm": 0.8106198310852051,
"learning_rate": 1.0148783171731716e-05,
"loss": 1.4355847835540771,
"step": 442
},
{
"epoch": 1.7689243027888446,
"grad_norm": 0.3683789074420929,
"learning_rate": 1.0106036122274989e-05,
"loss": 0.6579235196113586,
"step": 444
},
{
"epoch": 1.7768924302788844,
"grad_norm": 0.2205553501844406,
"learning_rate": 1.0063208394137804e-05,
"loss": 0.9973717927932739,
"step": 446
},
{
"epoch": 1.7848605577689243,
"grad_norm": 0.8739639520645142,
"learning_rate": 1.0020301848967437e-05,
"loss": 1.029483437538147,
"step": 448
},
{
"epoch": 1.792828685258964,
"grad_norm": 0.2899617552757263,
"learning_rate": 9.977318351837206e-06,
"loss": 0.7871066331863403,
"step": 450
},
{
"epoch": 1.800796812749004,
"grad_norm": 0.42468908429145813,
"learning_rate": 9.934259771165394e-06,
"loss": 0.3967509865760803,
"step": 452
},
{
"epoch": 1.8087649402390438,
"grad_norm": 0.8459072113037109,
"learning_rate": 9.89112797863404e-06,
"loss": 0.9443418383598328,
"step": 454
},
{
"epoch": 1.8167330677290838,
"grad_norm": 0.7007260322570801,
"learning_rate": 9.847924849107578e-06,
"loss": 0.7411941289901733,
"step": 456
},
{
"epoch": 1.8247011952191237,
"grad_norm": 1.2606959342956543,
"learning_rate": 9.804652260551332e-06,
"loss": 0.9570497274398804,
"step": 458
},
{
"epoch": 1.8326693227091635,
"grad_norm": 1.1064777374267578,
"learning_rate": 9.761312093949886e-06,
"loss": 0.7529144883155823,
"step": 460
},
{
"epoch": 1.8406374501992033,
"grad_norm": 0.7540960907936096,
"learning_rate": 9.717906233225339e-06,
"loss": 0.7726236581802368,
"step": 462
},
{
"epoch": 1.8486055776892432,
"grad_norm": 0.4733653962612152,
"learning_rate": 9.674436565155389e-06,
"loss": 0.15728430449962616,
"step": 464
},
{
"epoch": 1.856573705179283,
"grad_norm": 0.2718278169631958,
"learning_rate": 9.63090497929133e-06,
"loss": 1.0682100057601929,
"step": 466
},
{
"epoch": 1.8645418326693228,
"grad_norm": 1.8510208129882812,
"learning_rate": 9.587313367875922e-06,
"loss": 0.4695431590080261,
"step": 468
},
{
"epoch": 1.8725099601593627,
"grad_norm": 0.4119950234889984,
"learning_rate": 9.543663625761121e-06,
"loss": 1.0789568424224854,
"step": 470
},
{
"epoch": 1.8804780876494025,
"grad_norm": 0.7990518808364868,
"learning_rate": 9.499957650325738e-06,
"loss": 1.02091383934021,
"step": 472
},
{
"epoch": 1.8884462151394423,
"grad_norm": 0.7012404799461365,
"learning_rate": 9.456197341392932e-06,
"loss": 0.9402192831039429,
"step": 474
},
{
"epoch": 1.8964143426294822,
"grad_norm": 0.3745291531085968,
"learning_rate": 9.412384601147663e-06,
"loss": 0.9166637063026428,
"step": 476
},
{
"epoch": 1.904382470119522,
"grad_norm": 0.30497679114341736,
"learning_rate": 9.368521334053973e-06,
"loss": 0.812641978263855,
"step": 478
},
{
"epoch": 1.9123505976095618,
"grad_norm": 1.237668514251709,
"learning_rate": 9.324609446772233e-06,
"loss": 0.5746023058891296,
"step": 480
},
{
"epoch": 1.9203187250996017,
"grad_norm": 0.6451582908630371,
"learning_rate": 9.280650848076242e-06,
"loss": 0.760349690914154,
"step": 482
},
{
"epoch": 1.9282868525896415,
"grad_norm": 0.288142591714859,
"learning_rate": 9.23664744877026e-06,
"loss": 1.0170018672943115,
"step": 484
},
{
"epoch": 1.9362549800796813,
"grad_norm": 0.407728374004364,
"learning_rate": 9.19260116160596e-06,
"loss": 0.9356874227523804,
"step": 486
},
{
"epoch": 1.9442231075697212,
"grad_norm": 0.23040206730365753,
"learning_rate": 9.148513901199276e-06,
"loss": 1.0043561458587646,
"step": 488
},
{
"epoch": 1.952191235059761,
"grad_norm": 0.2875385880470276,
"learning_rate": 9.104387583947168e-06,
"loss": 1.023063063621521,
"step": 490
},
{
"epoch": 1.9601593625498008,
"grad_norm": 0.3855358362197876,
"learning_rate": 9.060224127944343e-06,
"loss": 0.6780633330345154,
"step": 492
},
{
"epoch": 1.9681274900398407,
"grad_norm": 2.7685351371765137,
"learning_rate": 9.016025452899853e-06,
"loss": 0.7522924542427063,
"step": 494
},
{
"epoch": 1.9760956175298805,
"grad_norm": 0.37701013684272766,
"learning_rate": 8.971793480053668e-06,
"loss": 0.9699747562408447,
"step": 496
},
{
"epoch": 1.9840637450199203,
"grad_norm": 5.959843635559082,
"learning_rate": 8.927530132093156e-06,
"loss": 0.8083460927009583,
"step": 498
},
{
"epoch": 1.9920318725099602,
"grad_norm": 0.27620622515678406,
"learning_rate": 8.8832373330695e-06,
"loss": 1.1264008283615112,
"step": 500
},
{
"epoch": 2.0,
"grad_norm": 0.4609326422214508,
"learning_rate": 8.83891700831408e-06,
"loss": 0.5836660265922546,
"step": 502
},
{
"epoch": 2.00796812749004,
"grad_norm": 0.4023412764072418,
"learning_rate": 8.794571084354764e-06,
"loss": 0.47467219829559326,
"step": 504
},
{
"epoch": 2.0159362549800797,
"grad_norm": 1.0555591583251953,
"learning_rate": 8.750201488832178e-06,
"loss": 0.44583338499069214,
"step": 506
},
{
"epoch": 2.0239043824701195,
"grad_norm": 0.038634952157735825,
"learning_rate": 8.705810150415905e-06,
"loss": 0.42819151282310486,
"step": 508
},
{
"epoch": 2.0318725099601593,
"grad_norm": 0.418973833322525,
"learning_rate": 8.661398998720662e-06,
"loss": 0.6882845163345337,
"step": 510
},
{
"epoch": 2.039840637450199,
"grad_norm": 0.32119250297546387,
"learning_rate": 8.616969964222403e-06,
"loss": 0.5964008569717407,
"step": 512
},
{
"epoch": 2.047808764940239,
"grad_norm": 0.9128912091255188,
"learning_rate": 8.572524978174426e-06,
"loss": 0.33640968799591064,
"step": 514
},
{
"epoch": 2.055776892430279,
"grad_norm": 0.3310595452785492,
"learning_rate": 8.528065972523414e-06,
"loss": 0.7787442207336426,
"step": 516
},
{
"epoch": 2.0637450199203187,
"grad_norm": 1.0674067735671997,
"learning_rate": 8.483594879825458e-06,
"loss": 0.4966733455657959,
"step": 518
},
{
"epoch": 2.0717131474103585,
"grad_norm": 1.0013618469238281,
"learning_rate": 8.439113633162048e-06,
"loss": 0.6508659720420837,
"step": 520
},
{
"epoch": 2.0796812749003983,
"grad_norm": 0.3296944797039032,
"learning_rate": 8.39462416605605e-06,
"loss": 0.7466489672660828,
"step": 522
},
{
"epoch": 2.087649402390438,
"grad_norm": 0.7697274684906006,
"learning_rate": 8.350128412387663e-06,
"loss": 0.754063606262207,
"step": 524
},
{
"epoch": 2.095617529880478,
"grad_norm": 1.24392831325531,
"learning_rate": 8.305628306310352e-06,
"loss": 0.3448694050312042,
"step": 526
},
{
"epoch": 2.103585657370518,
"grad_norm": 0.42689138650894165,
"learning_rate": 8.261125782166764e-06,
"loss": 0.6862057447433472,
"step": 528
},
{
"epoch": 2.1115537848605577,
"grad_norm": 0.13302293419837952,
"learning_rate": 8.216622774404667e-06,
"loss": 0.42651891708374023,
"step": 530
},
{
"epoch": 2.1195219123505975,
"grad_norm": 1.496959924697876,
"learning_rate": 8.172121217492846e-06,
"loss": 0.2123342901468277,
"step": 532
},
{
"epoch": 2.1274900398406373,
"grad_norm": 0.46577370166778564,
"learning_rate": 8.127623045837018e-06,
"loss": 0.7218248844146729,
"step": 534
},
{
"epoch": 2.135458167330677,
"grad_norm": 1.5315691232681274,
"learning_rate": 8.08313019369575e-06,
"loss": 0.610504686832428,
"step": 536
},
{
"epoch": 2.143426294820717,
"grad_norm": 0.7654959559440613,
"learning_rate": 8.038644595096385e-06,
"loss": 0.6098729372024536,
"step": 538
},
{
"epoch": 2.151394422310757,
"grad_norm": 0.5512191653251648,
"learning_rate": 7.994168183750962e-06,
"loss": 0.7628468871116638,
"step": 540
},
{
"epoch": 2.1593625498007967,
"grad_norm": 0.3205984830856323,
"learning_rate": 7.949702892972157e-06,
"loss": 0.6645801067352295,
"step": 542
},
{
"epoch": 2.1673306772908365,
"grad_norm": 0.1639721542596817,
"learning_rate": 7.905250655589271e-06,
"loss": 0.5173146724700928,
"step": 544
},
{
"epoch": 2.1752988047808763,
"grad_norm": 0.9050138592720032,
"learning_rate": 7.860813403864191e-06,
"loss": 0.6048539876937866,
"step": 546
},
{
"epoch": 2.183266932270916,
"grad_norm": 0.3164230287075043,
"learning_rate": 7.816393069407394e-06,
"loss": 0.7414080500602722,
"step": 548
},
{
"epoch": 2.191235059760956,
"grad_norm": 0.24208378791809082,
"learning_rate": 7.771991583094e-06,
"loss": 0.7846360206604004,
"step": 550
},
{
"epoch": 2.199203187250996,
"grad_norm": 0.35901176929473877,
"learning_rate": 7.727610874979838e-06,
"loss": 0.48403286933898926,
"step": 552
},
{
"epoch": 2.2071713147410357,
"grad_norm": 0.506497323513031,
"learning_rate": 7.683252874217535e-06,
"loss": 0.43215182423591614,
"step": 554
},
{
"epoch": 2.2151394422310755,
"grad_norm": 0.31206437945365906,
"learning_rate": 7.638919508972672e-06,
"loss": 0.5736108422279358,
"step": 556
},
{
"epoch": 2.2231075697211153,
"grad_norm": 1.6536140441894531,
"learning_rate": 7.594612706339969e-06,
"loss": 0.8024041056632996,
"step": 558
},
{
"epoch": 2.231075697211155,
"grad_norm": 0.21574831008911133,
"learning_rate": 7.550334392259514e-06,
"loss": 0.8128300905227661,
"step": 560
},
{
"epoch": 2.239043824701195,
"grad_norm": 0.6206152439117432,
"learning_rate": 7.506086491433047e-06,
"loss": 0.833297610282898,
"step": 562
},
{
"epoch": 2.247011952191235,
"grad_norm": 0.8244820237159729,
"learning_rate": 7.461870927240291e-06,
"loss": 0.7118552327156067,
"step": 564
},
{
"epoch": 2.2549800796812747,
"grad_norm": 0.2986677587032318,
"learning_rate": 7.417689621655362e-06,
"loss": 0.5102535486221313,
"step": 566
},
{
"epoch": 2.2629482071713145,
"grad_norm": 0.2273208200931549,
"learning_rate": 7.373544495163206e-06,
"loss": 0.6329899430274963,
"step": 568
},
{
"epoch": 2.2709163346613543,
"grad_norm": 0.23210270702838898,
"learning_rate": 7.329437466676127e-06,
"loss": 0.7478767037391663,
"step": 570
},
{
"epoch": 2.278884462151394,
"grad_norm": 0.6402852535247803,
"learning_rate": 7.285370453450376e-06,
"loss": 0.6049424409866333,
"step": 572
},
{
"epoch": 2.2868525896414345,
"grad_norm": 0.48132938146591187,
"learning_rate": 7.2413453710028155e-06,
"loss": 0.5839511156082153,
"step": 574
},
{
"epoch": 2.2948207171314743,
"grad_norm": 0.29688745737075806,
"learning_rate": 7.197364133027632e-06,
"loss": 0.25525566935539246,
"step": 576
},
{
"epoch": 2.302788844621514,
"grad_norm": 0.6722139120101929,
"learning_rate": 7.153428651313191e-06,
"loss": 0.5150002241134644,
"step": 578
},
{
"epoch": 2.310756972111554,
"grad_norm": 0.4063420593738556,
"learning_rate": 7.109540835658898e-06,
"loss": 0.5354428887367249,
"step": 580
},
{
"epoch": 2.318725099601594,
"grad_norm": 0.9487866163253784,
"learning_rate": 7.065702593792204e-06,
"loss": 0.5104379653930664,
"step": 582
},
{
"epoch": 2.3266932270916336,
"grad_norm": 0.2526935040950775,
"learning_rate": 7.021915831285661e-06,
"loss": 0.6450150609016418,
"step": 584
},
{
"epoch": 2.3346613545816735,
"grad_norm": 0.3406190276145935,
"learning_rate": 6.978182451474124e-06,
"loss": 0.5338073968887329,
"step": 586
},
{
"epoch": 2.3426294820717133,
"grad_norm": 1.3200128078460693,
"learning_rate": 6.934504355371974e-06,
"loss": 0.7506805062294006,
"step": 588
},
{
"epoch": 2.350597609561753,
"grad_norm": 0.27950209379196167,
"learning_rate": 6.890883441590515e-06,
"loss": 0.7645633220672607,
"step": 590
},
{
"epoch": 2.358565737051793,
"grad_norm": 0.29245108366012573,
"learning_rate": 6.847321606255432e-06,
"loss": 0.7928623557090759,
"step": 592
},
{
"epoch": 2.366533864541833,
"grad_norm": 0.4357150197029114,
"learning_rate": 6.803820742924374e-06,
"loss": 0.5477173924446106,
"step": 594
},
{
"epoch": 2.3745019920318726,
"grad_norm": 0.3675963878631592,
"learning_rate": 6.76038274250464e-06,
"loss": 0.8036378622055054,
"step": 596
},
{
"epoch": 2.3824701195219125,
"grad_norm": 0.5962640047073364,
"learning_rate": 6.717009493170986e-06,
"loss": 0.5513007044792175,
"step": 598
},
{
"epoch": 2.3904382470119523,
"grad_norm": 0.8920307159423828,
"learning_rate": 6.673702880283554e-06,
"loss": 0.8076795935630798,
"step": 600
},
{
"epoch": 2.398406374501992,
"grad_norm": 0.22857658565044403,
"learning_rate": 6.6304647863059155e-06,
"loss": 0.7613834142684937,
"step": 602
},
{
"epoch": 2.406374501992032,
"grad_norm": 0.9126567244529724,
"learning_rate": 6.587297090723235e-06,
"loss": 0.47278252243995667,
"step": 604
},
{
"epoch": 2.414342629482072,
"grad_norm": 0.8739012479782104,
"learning_rate": 6.54420166996059e-06,
"loss": 0.23272567987442017,
"step": 606
},
{
"epoch": 2.4223107569721116,
"grad_norm": 0.09651335328817368,
"learning_rate": 6.501180397301394e-06,
"loss": 0.32919982075691223,
"step": 608
},
{
"epoch": 2.4302788844621515,
"grad_norm": 0.1508469432592392,
"learning_rate": 6.458235142805968e-06,
"loss": 0.6115418672561646,
"step": 610
},
{
"epoch": 2.4382470119521913,
"grad_norm": 0.223999485373497,
"learning_rate": 6.415367773230254e-06,
"loss": 0.656358540058136,
"step": 612
},
{
"epoch": 2.446215139442231,
"grad_norm": 0.3630542755126953,
"learning_rate": 6.372580151944681e-06,
"loss": 0.4408586919307709,
"step": 614
},
{
"epoch": 2.454183266932271,
"grad_norm": 0.5294836163520813,
"learning_rate": 6.329874138853146e-06,
"loss": 0.7569445371627808,
"step": 616
},
{
"epoch": 2.462151394422311,
"grad_norm": 0.7806637287139893,
"learning_rate": 6.287251590312181e-06,
"loss": 0.5635365843772888,
"step": 618
},
{
"epoch": 2.4701195219123506,
"grad_norm": 0.8465815186500549,
"learning_rate": 6.244714359050267e-06,
"loss": 0.6494905352592468,
"step": 620
},
{
"epoch": 2.4780876494023905,
"grad_norm": 0.30154383182525635,
"learning_rate": 6.20226429408728e-06,
"loss": 0.722070038318634,
"step": 622
},
{
"epoch": 2.4860557768924303,
"grad_norm": 0.49222832918167114,
"learning_rate": 6.159903240654132e-06,
"loss": 0.6191802620887756,
"step": 624
},
{
"epoch": 2.49402390438247,
"grad_norm": 0.4883638322353363,
"learning_rate": 6.117633040112559e-06,
"loss": 0.3768939673900604,
"step": 626
},
{
"epoch": 2.50199203187251,
"grad_norm": 0.7983854413032532,
"learning_rate": 6.0754555298750795e-06,
"loss": 0.7864499688148499,
"step": 628
},
{
"epoch": 2.50996015936255,
"grad_norm": 0.3459266126155853,
"learning_rate": 6.033372543325119e-06,
"loss": 0.3463517427444458,
"step": 630
},
{
"epoch": 2.5179282868525896,
"grad_norm": 1.2402698993682861,
"learning_rate": 5.991385909737327e-06,
"loss": 0.3873278796672821,
"step": 632
},
{
"epoch": 2.5258964143426295,
"grad_norm": 0.28206056356430054,
"learning_rate": 5.949497454198058e-06,
"loss": 0.7801554799079895,
"step": 634
},
{
"epoch": 2.5338645418326693,
"grad_norm": 0.5089584589004517,
"learning_rate": 5.907708997526031e-06,
"loss": 0.7173982262611389,
"step": 636
},
{
"epoch": 2.541832669322709,
"grad_norm": 0.7955684065818787,
"learning_rate": 5.86602235619319e-06,
"loss": 0.9195908904075623,
"step": 638
},
{
"epoch": 2.549800796812749,
"grad_norm": 0.17236770689487457,
"learning_rate": 5.824439342245739e-06,
"loss": 0.40686023235321045,
"step": 640
},
{
"epoch": 2.557768924302789,
"grad_norm": 0.4617612063884735,
"learning_rate": 5.782961763225388e-06,
"loss": 0.7664303183555603,
"step": 642
},
{
"epoch": 2.5657370517928286,
"grad_norm": 0.2930012345314026,
"learning_rate": 5.741591422090765e-06,
"loss": 0.7867609858512878,
"step": 644
},
{
"epoch": 2.5737051792828685,
"grad_norm": 0.436357706785202,
"learning_rate": 5.70033011713905e-06,
"loss": 0.5984311699867249,
"step": 646
},
{
"epoch": 2.5816733067729083,
"grad_norm": 0.40557265281677246,
"learning_rate": 5.659179641927816e-06,
"loss": 0.7649792432785034,
"step": 648
},
{
"epoch": 2.589641434262948,
"grad_norm": 0.17836439609527588,
"learning_rate": 5.61814178519706e-06,
"loss": 0.5768654346466064,
"step": 650
},
{
"epoch": 2.597609561752988,
"grad_norm": 0.40341848134994507,
"learning_rate": 5.577218330791436e-06,
"loss": 0.5763181447982788,
"step": 652
},
{
"epoch": 2.605577689243028,
"grad_norm": 0.5692223906517029,
"learning_rate": 5.536411057582744e-06,
"loss": 0.5641070008277893,
"step": 654
},
{
"epoch": 2.6135458167330676,
"grad_norm": 0.29569053649902344,
"learning_rate": 5.4957217393925734e-06,
"loss": 0.2429419606924057,
"step": 656
},
{
"epoch": 2.6215139442231075,
"grad_norm": 0.3552258610725403,
"learning_rate": 5.4551521449152216e-06,
"loss": 0.42948848009109497,
"step": 658
},
{
"epoch": 2.6294820717131473,
"grad_norm": 0.41975289583206177,
"learning_rate": 5.4147040376408e-06,
"loss": 0.7414237260818481,
"step": 660
},
{
"epoch": 2.637450199203187,
"grad_norm": 0.8263479471206665,
"learning_rate": 5.37437917577858e-06,
"loss": 0.6220693588256836,
"step": 662
},
{
"epoch": 2.645418326693227,
"grad_norm": 1.41019868850708,
"learning_rate": 5.334179312180574e-06,
"loss": 0.4769461154937744,
"step": 664
},
{
"epoch": 2.653386454183267,
"grad_norm": 9.828413009643555,
"learning_rate": 5.2941061942653315e-06,
"loss": 0.7357695698738098,
"step": 666
},
{
"epoch": 2.6613545816733066,
"grad_norm": 0.20442984998226166,
"learning_rate": 5.254161563941981e-06,
"loss": 0.545133650302887,
"step": 668
},
{
"epoch": 2.6693227091633465,
"grad_norm": 1.070529818534851,
"learning_rate": 5.2143471575345295e-06,
"loss": 0.5713125467300415,
"step": 670
},
{
"epoch": 2.6772908366533863,
"grad_norm": 0.08597006648778915,
"learning_rate": 5.174664705706371e-06,
"loss": 0.2371898740530014,
"step": 672
},
{
"epoch": 2.685258964143426,
"grad_norm": 0.2467171996831894,
"learning_rate": 5.135115933385058e-06,
"loss": 0.7705000638961792,
"step": 674
},
{
"epoch": 2.6932270916334664,
"grad_norm": 1.5602085590362549,
"learning_rate": 5.0957025596873256e-06,
"loss": 0.5420997142791748,
"step": 676
},
{
"epoch": 2.7011952191235062,
"grad_norm": 0.6235253810882568,
"learning_rate": 5.0564262978443745e-06,
"loss": 0.4899404048919678,
"step": 678
},
{
"epoch": 2.709163346613546,
"grad_norm": 0.2874850332736969,
"learning_rate": 5.017288855127377e-06,
"loss": 0.778532862663269,
"step": 680
},
{
"epoch": 2.717131474103586,
"grad_norm": 0.26746895909309387,
"learning_rate": 4.978291932773289e-06,
"loss": 0.7769652605056763,
"step": 682
},
{
"epoch": 2.7250996015936257,
"grad_norm": 0.25973984599113464,
"learning_rate": 4.9394372259108886e-06,
"loss": 0.5638492107391357,
"step": 684
},
{
"epoch": 2.7330677290836656,
"grad_norm": 0.8309025168418884,
"learning_rate": 4.9007264234870805e-06,
"loss": 0.41929128766059875,
"step": 686
},
{
"epoch": 2.7410358565737054,
"grad_norm": 0.3012772798538208,
"learning_rate": 4.862161208193505e-06,
"loss": 0.7767641544342041,
"step": 688
},
{
"epoch": 2.7490039840637452,
"grad_norm": 0.35578370094299316,
"learning_rate": 4.823743256393377e-06,
"loss": 0.47287169098854065,
"step": 690
},
{
"epoch": 2.756972111553785,
"grad_norm": 0.49327176809310913,
"learning_rate": 4.785474238048626e-06,
"loss": 0.8931385278701782,
"step": 692
},
{
"epoch": 2.764940239043825,
"grad_norm": 1.3697088956832886,
"learning_rate": 4.747355816647293e-06,
"loss": 0.6319751143455505,
"step": 694
},
{
"epoch": 2.7729083665338647,
"grad_norm": 1.342233657836914,
"learning_rate": 4.709389649131235e-06,
"loss": 0.4150761365890503,
"step": 696
},
{
"epoch": 2.7808764940239046,
"grad_norm": 0.27556970715522766,
"learning_rate": 4.6715773858241e-06,
"loss": 0.8045108318328857,
"step": 698
},
{
"epoch": 2.7888446215139444,
"grad_norm": 0.31476858258247375,
"learning_rate": 4.63392067035958e-06,
"loss": 0.8101509213447571,
"step": 700
},
{
"epoch": 2.7968127490039842,
"grad_norm": 0.5621429681777954,
"learning_rate": 4.596421139609977e-06,
"loss": 0.4465515911579132,
"step": 702
},
{
"epoch": 2.804780876494024,
"grad_norm": 0.8817136287689209,
"learning_rate": 4.5590804236150365e-06,
"loss": 0.9612689018249512,
"step": 704
},
{
"epoch": 2.812749003984064,
"grad_norm": 0.19133038818836212,
"learning_rate": 4.521900145511112e-06,
"loss": 0.8152596950531006,
"step": 706
},
{
"epoch": 2.8207171314741037,
"grad_norm": 0.4524690508842468,
"learning_rate": 4.484881921460591e-06,
"loss": 0.8935415744781494,
"step": 708
},
{
"epoch": 2.8286852589641436,
"grad_norm": 0.24354888498783112,
"learning_rate": 4.4480273605816556e-06,
"loss": 0.4386708736419678,
"step": 710
},
{
"epoch": 2.8366533864541834,
"grad_norm": 0.2424662858247757,
"learning_rate": 4.411338064878337e-06,
"loss": 0.8338403701782227,
"step": 712
},
{
"epoch": 2.8446215139442232,
"grad_norm": 0.319381445646286,
"learning_rate": 4.374815629170861e-06,
"loss": 0.5186902284622192,
"step": 714
},
{
"epoch": 2.852589641434263,
"grad_norm": 0.2536839246749878,
"learning_rate": 4.338461641026351e-06,
"loss": 0.769604504108429,
"step": 716
},
{
"epoch": 2.860557768924303,
"grad_norm": 0.8778960108757019,
"learning_rate": 4.302277680689801e-06,
"loss": 0.6171420216560364,
"step": 718
},
{
"epoch": 2.8685258964143427,
"grad_norm": 0.39766034483909607,
"learning_rate": 4.2662653210153965e-06,
"loss": 0.5202685594558716,
"step": 720
},
{
"epoch": 2.8764940239043826,
"grad_norm": 0.8559178113937378,
"learning_rate": 4.23042612739813e-06,
"loss": 0.4717506766319275,
"step": 722
},
{
"epoch": 2.8844621513944224,
"grad_norm": 0.3448426127433777,
"learning_rate": 4.194761657705765e-06,
"loss": 0.5054087042808533,
"step": 724
},
{
"epoch": 2.8924302788844622,
"grad_norm": 0.29262322187423706,
"learning_rate": 4.159273462211129e-06,
"loss": 0.7536461353302002,
"step": 726
},
{
"epoch": 2.900398406374502,
"grad_norm": 0.4943152964115143,
"learning_rate": 4.123963083524702e-06,
"loss": 0.43974122405052185,
"step": 728
},
{
"epoch": 2.908366533864542,
"grad_norm": 0.24242062866687775,
"learning_rate": 4.0888320565275854e-06,
"loss": 0.7488172650337219,
"step": 730
},
{
"epoch": 2.9163346613545817,
"grad_norm": 0.6715952754020691,
"learning_rate": 4.053881908304764e-06,
"loss": 0.2420373111963272,
"step": 732
},
{
"epoch": 2.9243027888446216,
"grad_norm": 0.2856823205947876,
"learning_rate": 4.019114158078742e-06,
"loss": 0.8018136620521545,
"step": 734
},
{
"epoch": 2.9322709163346614,
"grad_norm": 0.7715031504631042,
"learning_rate": 3.984530317143495e-06,
"loss": 0.41188791394233704,
"step": 736
},
{
"epoch": 2.9402390438247012,
"grad_norm": 1.3740425109863281,
"learning_rate": 3.950131888798777e-06,
"loss": 0.6634250283241272,
"step": 738
},
{
"epoch": 2.948207171314741,
"grad_norm": 0.7085353136062622,
"learning_rate": 3.915920368284786e-06,
"loss": 0.8047435283660889,
"step": 740
},
{
"epoch": 2.956175298804781,
"grad_norm": 1.6132349967956543,
"learning_rate": 3.881897242717153e-06,
"loss": 0.2846962511539459,
"step": 742
},
{
"epoch": 2.9641434262948207,
"grad_norm": 0.3325771987438202,
"learning_rate": 3.848063991022304e-06,
"loss": 0.679719865322113,
"step": 744
},
{
"epoch": 2.9721115537848606,
"grad_norm": 0.3333672881126404,
"learning_rate": 3.814422083873181e-06,
"loss": 0.716017484664917,
"step": 746
},
{
"epoch": 2.9800796812749004,
"grad_norm": 0.31956031918525696,
"learning_rate": 3.7809729836253126e-06,
"loss": 0.44896891713142395,
"step": 748
},
{
"epoch": 2.9880478087649402,
"grad_norm": 0.14244171977043152,
"learning_rate": 3.7477181442532373e-06,
"loss": 0.11532896757125854,
"step": 750
},
{
"epoch": 2.99601593625498,
"grad_norm": 0.196710005402565,
"learning_rate": 3.7146590112873117e-06,
"loss": 0.7710368633270264,
"step": 752
},
{
"epoch": 3.00398406374502,
"grad_norm": 0.2597305178642273,
"learning_rate": 3.6817970217508766e-06,
"loss": 0.37589359283447266,
"step": 754
},
{
"epoch": 3.0119521912350598,
"grad_norm": 0.36714112758636475,
"learning_rate": 3.649133604097784e-06,
"loss": 0.34749507904052734,
"step": 756
},
{
"epoch": 3.0199203187250996,
"grad_norm": 2.629531145095825,
"learning_rate": 3.616670178150316e-06,
"loss": 0.18874035775661469,
"step": 758
},
{
"epoch": 3.0278884462151394,
"grad_norm": 0.3082272410392761,
"learning_rate": 3.5844081550374545e-06,
"loss": 0.37505829334259033,
"step": 760
},
{
"epoch": 3.0358565737051793,
"grad_norm": 0.6151975989341736,
"learning_rate": 3.5523489371335502e-06,
"loss": 0.3742624819278717,
"step": 762
},
{
"epoch": 3.043824701195219,
"grad_norm": 0.3428267538547516,
"learning_rate": 3.5204939179973634e-06,
"loss": 0.4816422462463379,
"step": 764
},
{
"epoch": 3.051792828685259,
"grad_norm": 0.6483787894248962,
"learning_rate": 3.488844482311489e-06,
"loss": 0.16634498536586761,
"step": 766
},
{
"epoch": 3.0597609561752988,
"grad_norm": 0.5806704163551331,
"learning_rate": 3.457402005822163e-06,
"loss": 0.31581252813339233,
"step": 768
},
{
"epoch": 3.0677290836653386,
"grad_norm": 0.3666588068008423,
"learning_rate": 3.4261678552794615e-06,
"loss": 0.3485649824142456,
"step": 770
},
{
"epoch": 3.0756972111553784,
"grad_norm": 0.3737334609031677,
"learning_rate": 3.39514338837789e-06,
"loss": 0.5422434210777283,
"step": 772
},
{
"epoch": 3.0836653386454183,
"grad_norm": 1.312560796737671,
"learning_rate": 3.364329953697377e-06,
"loss": 0.5372627973556519,
"step": 774
},
{
"epoch": 3.091633466135458,
"grad_norm": 0.025555025786161423,
"learning_rate": 3.3337288906446356e-06,
"loss": 0.30303874611854553,
"step": 776
},
{
"epoch": 3.099601593625498,
"grad_norm": 0.36579927802085876,
"learning_rate": 3.303341529394961e-06,
"loss": 0.3074573278427124,
"step": 778
},
{
"epoch": 3.1075697211155378,
"grad_norm": 0.36329302191734314,
"learning_rate": 3.2731691908343907e-06,
"loss": 0.4981156885623932,
"step": 780
},
{
"epoch": 3.1155378486055776,
"grad_norm": 0.08173166960477829,
"learning_rate": 3.2432131865023065e-06,
"loss": 0.160829097032547,
"step": 782
},
{
"epoch": 3.1235059760956174,
"grad_norm": 0.3885779082775116,
"learning_rate": 3.2134748185344098e-06,
"loss": 0.5554381608963013,
"step": 784
},
{
"epoch": 3.1314741035856573,
"grad_norm": 0.317030668258667,
"learning_rate": 3.1839553796061266e-06,
"loss": 0.45913565158843994,
"step": 786
},
{
"epoch": 3.139442231075697,
"grad_norm": 0.6573988795280457,
"learning_rate": 3.1546561528764227e-06,
"loss": 0.32907965779304504,
"step": 788
},
{
"epoch": 3.147410358565737,
"grad_norm": 0.35516512393951416,
"learning_rate": 3.1255784119320064e-06,
"loss": 0.25920620560646057,
"step": 790
},
{
"epoch": 3.1553784860557768,
"grad_norm": 0.789368748664856,
"learning_rate": 3.0967234207319946e-06,
"loss": 0.3322998285293579,
"step": 792
},
{
"epoch": 3.1633466135458166,
"grad_norm": 0.08553847670555115,
"learning_rate": 3.0680924335529536e-06,
"loss": 0.20808134973049164,
"step": 794
},
{
"epoch": 3.1713147410358564,
"grad_norm": 1.4314020872116089,
"learning_rate": 3.0396866949343833e-06,
"loss": 0.33690834045410156,
"step": 796
},
{
"epoch": 3.1792828685258963,
"grad_norm": 0.279748797416687,
"learning_rate": 3.0115074396246176e-06,
"loss": 0.36214491724967957,
"step": 798
},
{
"epoch": 3.187250996015936,
"grad_norm": 0.6961238384246826,
"learning_rate": 2.9835558925271495e-06,
"loss": 0.501541018486023,
"step": 800
},
{
"epoch": 3.195219123505976,
"grad_norm": 0.18416091799736023,
"learning_rate": 2.955833268647395e-06,
"loss": 0.3577136993408203,
"step": 802
},
{
"epoch": 3.2031872509960158,
"grad_norm": 0.6885810494422913,
"learning_rate": 2.9283407730398702e-06,
"loss": 0.29195672273635864,
"step": 804
},
{
"epoch": 3.2111553784860556,
"grad_norm": 1.7156380414962769,
"learning_rate": 2.901079600755817e-06,
"loss": 0.5778890252113342,
"step": 806
},
{
"epoch": 3.2191235059760954,
"grad_norm": 0.6983752846717834,
"learning_rate": 2.8740509367912457e-06,
"loss": 0.18633845448493958,
"step": 808
},
{
"epoch": 3.2270916334661353,
"grad_norm": 0.6704440712928772,
"learning_rate": 2.8472559560354404e-06,
"loss": 0.3643829822540283,
"step": 810
},
{
"epoch": 3.235059760956175,
"grad_norm": 1.632941484451294,
"learning_rate": 2.820695823219873e-06,
"loss": 0.2959984838962555,
"step": 812
},
{
"epoch": 3.243027888446215,
"grad_norm": 0.39763104915618896,
"learning_rate": 2.794371692867585e-06,
"loss": 0.44851499795913696,
"step": 814
},
{
"epoch": 3.2509960159362548,
"grad_norm": 1.5863844156265259,
"learning_rate": 2.768284709243002e-06,
"loss": 0.13297411799430847,
"step": 816
},
{
"epoch": 3.2589641434262946,
"grad_norm": 0.5324887633323669,
"learning_rate": 2.7424360063021855e-06,
"loss": 0.5013939142227173,
"step": 818
},
{
"epoch": 3.2669322709163344,
"grad_norm": 2.0388095378875732,
"learning_rate": 2.7168267076435485e-06,
"loss": 0.2653783857822418,
"step": 820
},
{
"epoch": 3.2749003984063743,
"grad_norm": 0.6833744049072266,
"learning_rate": 2.69145792645902e-06,
"loss": 0.41534146666526794,
"step": 822
},
{
"epoch": 3.2828685258964145,
"grad_norm": 0.8634832501411438,
"learning_rate": 2.6663307654856407e-06,
"loss": 0.3562511205673218,
"step": 824
},
{
"epoch": 3.2908366533864544,
"grad_norm": 0.39872676134109497,
"learning_rate": 2.6414463169576492e-06,
"loss": 0.4844256043434143,
"step": 826
},
{
"epoch": 3.298804780876494,
"grad_norm": 0.434477299451828,
"learning_rate": 2.616805662558985e-06,
"loss": 0.6063498854637146,
"step": 828
},
{
"epoch": 3.306772908366534,
"grad_norm": 2.5125367641448975,
"learning_rate": 2.5924098733762835e-06,
"loss": 0.4092828035354614,
"step": 830
},
{
"epoch": 3.314741035856574,
"grad_norm": 0.5066865682601929,
"learning_rate": 2.5682600098523105e-06,
"loss": 0.28628939390182495,
"step": 832
},
{
"epoch": 3.3227091633466137,
"grad_norm": 0.6276751160621643,
"learning_rate": 2.5443571217398705e-06,
"loss": 0.2303668111562729,
"step": 834
},
{
"epoch": 3.3306772908366535,
"grad_norm": 3.6938159465789795,
"learning_rate": 2.5207022480561722e-06,
"loss": 0.3531423807144165,
"step": 836
},
{
"epoch": 3.3386454183266934,
"grad_norm": 0.4289490580558777,
"learning_rate": 2.497296417037664e-06,
"loss": 0.4265778660774231,
"step": 838
},
{
"epoch": 3.346613545816733,
"grad_norm": 0.859740674495697,
"learning_rate": 2.474140646095346e-06,
"loss": 0.11164703965187073,
"step": 840
},
{
"epoch": 3.354581673306773,
"grad_norm": 0.573935866355896,
"learning_rate": 2.451235941770535e-06,
"loss": 0.36163708567619324,
"step": 842
},
{
"epoch": 3.362549800796813,
"grad_norm": 0.38408342003822327,
"learning_rate": 2.428583299691118e-06,
"loss": 0.4686431884765625,
"step": 844
},
{
"epoch": 3.3705179282868527,
"grad_norm": 0.3920894265174866,
"learning_rate": 2.4061837045282717e-06,
"loss": 0.544544517993927,
"step": 846
},
{
"epoch": 3.3784860557768925,
"grad_norm": 0.2612384259700775,
"learning_rate": 2.3840381299536584e-06,
"loss": 0.4954265058040619,
"step": 848
},
{
"epoch": 3.3864541832669324,
"grad_norm": 0.9370325207710266,
"learning_rate": 2.36214753859711e-06,
"loss": 0.45011717081069946,
"step": 850
},
{
"epoch": 3.394422310756972,
"grad_norm": 0.42760828137397766,
"learning_rate": 2.3405128820047716e-06,
"loss": 0.4825401306152344,
"step": 852
},
{
"epoch": 3.402390438247012,
"grad_norm": 0.4402712285518646,
"learning_rate": 2.3191351005977556e-06,
"loss": 0.31368541717529297,
"step": 854
},
{
"epoch": 3.410358565737052,
"grad_norm": 0.4192966818809509,
"learning_rate": 2.298015123631246e-06,
"loss": 0.4709932208061218,
"step": 856
},
{
"epoch": 3.4183266932270917,
"grad_norm": 0.5742263197898865,
"learning_rate": 2.2771538691541196e-06,
"loss": 0.439094603061676,
"step": 858
},
{
"epoch": 3.4262948207171315,
"grad_norm": 0.41355282068252563,
"learning_rate": 2.256552243969029e-06,
"loss": 0.5255416035652161,
"step": 860
},
{
"epoch": 3.4342629482071714,
"grad_norm": 0.2783606946468353,
"learning_rate": 2.2362111435929956e-06,
"loss": 0.3297284245491028,
"step": 862
},
{
"epoch": 3.442231075697211,
"grad_norm": 1.933573842048645,
"learning_rate": 2.2161314522184778e-06,
"loss": 0.4290310740470886,
"step": 864
},
{
"epoch": 3.450199203187251,
"grad_norm": 0.4777624309062958,
"learning_rate": 2.1963140426749277e-06,
"loss": 0.5890864729881287,
"step": 866
},
{
"epoch": 3.458167330677291,
"grad_norm": 0.30510279536247253,
"learning_rate": 2.176759776390871e-06,
"loss": 0.4166991114616394,
"step": 868
},
{
"epoch": 3.4661354581673307,
"grad_norm": 0.32527998089790344,
"learning_rate": 2.1574695033564447e-06,
"loss": 0.343144029378891,
"step": 870
},
{
"epoch": 3.4741035856573705,
"grad_norm": 0.06616739183664322,
"learning_rate": 2.1384440620864597e-06,
"loss": 0.32270875573158264,
"step": 872
},
{
"epoch": 3.4820717131474104,
"grad_norm": 0.08085694909095764,
"learning_rate": 2.1196842795839454e-06,
"loss": 0.28370317816734314,
"step": 874
},
{
"epoch": 3.49003984063745,
"grad_norm": 0.543026328086853,
"learning_rate": 2.101190971304202e-06,
"loss": 0.2624368369579315,
"step": 876
},
{
"epoch": 3.49800796812749,
"grad_norm": 0.3456118702888489,
"learning_rate": 2.0829649411193613e-06,
"loss": 0.3216794431209564,
"step": 878
},
{
"epoch": 3.50597609561753,
"grad_norm": 0.2047196626663208,
"learning_rate": 2.0650069812834345e-06,
"loss": 0.2091296762228012,
"step": 880
},
{
"epoch": 3.5139442231075697,
"grad_norm": 0.286630779504776,
"learning_rate": 2.0473178723978813e-06,
"loss": 0.20823848247528076,
"step": 882
},
{
"epoch": 3.5219123505976095,
"grad_norm": 0.5212514400482178,
"learning_rate": 2.0298983833776717e-06,
"loss": 0.08361003547906876,
"step": 884
},
{
"epoch": 3.5298804780876494,
"grad_norm": 0.5501599311828613,
"learning_rate": 2.01274927141787e-06,
"loss": 0.4509070813655853,
"step": 886
},
{
"epoch": 3.537848605577689,
"grad_norm": 0.39404916763305664,
"learning_rate": 1.995871281960715e-06,
"loss": 0.44182029366493225,
"step": 888
},
{
"epoch": 3.545816733067729,
"grad_norm": 0.39789876341819763,
"learning_rate": 1.9792651486632213e-06,
"loss": 0.27486419677734375,
"step": 890
},
{
"epoch": 3.553784860557769,
"grad_norm": 1.1401015520095825,
"learning_rate": 1.962931593365286e-06,
"loss": 0.3863001763820648,
"step": 892
},
{
"epoch": 3.5617529880478087,
"grad_norm": 0.37662816047668457,
"learning_rate": 1.946871326058308e-06,
"loss": 0.6621991395950317,
"step": 894
},
{
"epoch": 3.5697211155378485,
"grad_norm": 0.10905114561319351,
"learning_rate": 1.9310850448543344e-06,
"loss": 0.10537078976631165,
"step": 896
},
{
"epoch": 3.5776892430278884,
"grad_norm": 0.3512639105319977,
"learning_rate": 1.915573435955711e-06,
"loss": 0.2836357057094574,
"step": 898
},
{
"epoch": 3.585657370517928,
"grad_norm": 0.43090760707855225,
"learning_rate": 1.9003371736252472e-06,
"loss": 0.15316523611545563,
"step": 900
},
{
"epoch": 3.593625498007968,
"grad_norm": 0.6001132726669312,
"learning_rate": 1.8853769201569208e-06,
"loss": 0.09330594539642334,
"step": 902
},
{
"epoch": 3.601593625498008,
"grad_norm": 0.5064031481742859,
"learning_rate": 1.8706933258470757e-06,
"loss": 0.4762483835220337,
"step": 904
},
{
"epoch": 3.6095617529880477,
"grad_norm": 0.3355056643486023,
"learning_rate": 1.8562870289661659e-06,
"loss": 0.27884015440940857,
"step": 906
},
{
"epoch": 3.6175298804780875,
"grad_norm": 0.3930132985115051,
"learning_rate": 1.8421586557309996e-06,
"loss": 0.5141717195510864,
"step": 908
},
{
"epoch": 3.6254980079681274,
"grad_norm": 0.46708425879478455,
"learning_rate": 1.8283088202775314e-06,
"loss": 0.4603351652622223,
"step": 910
},
{
"epoch": 3.633466135458167,
"grad_norm": 0.037517350167036057,
"learning_rate": 1.8147381246341558e-06,
"loss": 0.02768601104617119,
"step": 912
},
{
"epoch": 3.641434262948207,
"grad_norm": 0.3065638542175293,
"learning_rate": 1.8014471586955424e-06,
"loss": 0.3721899390220642,
"step": 914
},
{
"epoch": 3.649402390438247,
"grad_norm": 0.43989261984825134,
"learning_rate": 1.7884365001969967e-06,
"loss": 0.43738237023353577,
"step": 916
},
{
"epoch": 3.6573705179282867,
"grad_norm": 0.6677345633506775,
"learning_rate": 1.7757067146893425e-06,
"loss": 0.0748777762055397,
"step": 918
},
{
"epoch": 3.6653386454183265,
"grad_norm": 0.3021090030670166,
"learning_rate": 1.7632583555143435e-06,
"loss": 0.5561968684196472,
"step": 920
},
{
"epoch": 3.6733067729083664,
"grad_norm": 0.42820993065834045,
"learning_rate": 1.751091963780643e-06,
"loss": 0.07096469402313232,
"step": 922
},
{
"epoch": 3.681274900398406,
"grad_norm": 0.8393615484237671,
"learning_rate": 1.7392080683402496e-06,
"loss": 0.46100661158561707,
"step": 924
},
{
"epoch": 3.6892430278884465,
"grad_norm": 0.6544818878173828,
"learning_rate": 1.7276071857655479e-06,
"loss": 0.1528330296278,
"step": 926
},
{
"epoch": 3.6972111553784863,
"grad_norm": 0.31229308247566223,
"learning_rate": 1.716289820326839e-06,
"loss": 0.29350802302360535,
"step": 928
},
{
"epoch": 3.705179282868526,
"grad_norm": 0.1188875362277031,
"learning_rate": 1.7052564639704286e-06,
"loss": 0.3660446107387543,
"step": 930
},
{
"epoch": 3.713147410358566,
"grad_norm": 0.5841293931007385,
"learning_rate": 1.6945075962972356e-06,
"loss": 0.45137277245521545,
"step": 932
},
{
"epoch": 3.721115537848606,
"grad_norm": 0.7146270275115967,
"learning_rate": 1.6840436845419498e-06,
"loss": 0.4348509907722473,
"step": 934
},
{
"epoch": 3.7290836653386457,
"grad_norm": 0.326523095369339,
"learning_rate": 1.6738651835527184e-06,
"loss": 0.4922831654548645,
"step": 936
},
{
"epoch": 3.7370517928286855,
"grad_norm": 0.8038604855537415,
"learning_rate": 1.6639725357713769e-06,
"loss": 0.21507446467876434,
"step": 938
},
{
"epoch": 3.7450199203187253,
"grad_norm": 0.36527737975120544,
"learning_rate": 1.6543661712142184e-06,
"loss": 0.4618900418281555,
"step": 940
},
{
"epoch": 3.752988047808765,
"grad_norm": 0.6607845425605774,
"learning_rate": 1.645046507453294e-06,
"loss": 0.36659500002861023,
"step": 942
},
{
"epoch": 3.760956175298805,
"grad_norm": 0.421165406703949,
"learning_rate": 1.6360139495982712e-06,
"loss": 0.28992268443107605,
"step": 944
},
{
"epoch": 3.768924302788845,
"grad_norm": 0.5764026641845703,
"learning_rate": 1.6272688902788207e-06,
"loss": 0.5770589709281921,
"step": 946
},
{
"epoch": 3.7768924302788847,
"grad_norm": 0.3415585160255432,
"learning_rate": 1.6188117096275477e-06,
"loss": 0.3260127305984497,
"step": 948
},
{
"epoch": 3.7848605577689245,
"grad_norm": 0.2808169424533844,
"learning_rate": 1.610642775263468e-06,
"loss": 0.5168456435203552,
"step": 950
},
{
"epoch": 3.7928286852589643,
"grad_norm": 0.3828094005584717,
"learning_rate": 1.6027624422760312e-06,
"loss": 0.5155588388442993,
"step": 952
},
{
"epoch": 3.800796812749004,
"grad_norm": 0.616820216178894,
"learning_rate": 1.5951710532096857e-06,
"loss": 0.2644089460372925,
"step": 954
},
{
"epoch": 3.808764940239044,
"grad_norm": 0.531527042388916,
"learning_rate": 1.5878689380489846e-06,
"loss": 0.38867413997650146,
"step": 956
},
{
"epoch": 3.816733067729084,
"grad_norm": 0.9145589470863342,
"learning_rate": 1.580856414204247e-06,
"loss": 0.3010810315608978,
"step": 958
},
{
"epoch": 3.8247011952191237,
"grad_norm": 1.3794469833374023,
"learning_rate": 1.5741337864977558e-06,
"loss": 0.21975839138031006,
"step": 960
},
{
"epoch": 3.8326693227091635,
"grad_norm": 0.11370343714952469,
"learning_rate": 1.567701347150513e-06,
"loss": 0.3248888850212097,
"step": 962
},
{
"epoch": 3.8406374501992033,
"grad_norm": 0.4226270616054535,
"learning_rate": 1.5615593757695319e-06,
"loss": 0.6149446964263916,
"step": 964
},
{
"epoch": 3.848605577689243,
"grad_norm": 0.4638464152812958,
"learning_rate": 1.555708139335687e-06,
"loss": 0.21839484572410583,
"step": 966
},
{
"epoch": 3.856573705179283,
"grad_norm": 0.28881698846817017,
"learning_rate": 1.5501478921921071e-06,
"loss": 0.3512417674064636,
"step": 968
},
{
"epoch": 3.864541832669323,
"grad_norm": 0.8101674318313599,
"learning_rate": 1.54487887603312e-06,
"loss": 0.36906710267066956,
"step": 970
},
{
"epoch": 3.8725099601593627,
"grad_norm": 0.10503221303224564,
"learning_rate": 1.5399013198937452e-06,
"loss": 0.3287951946258545,
"step": 972
},
{
"epoch": 3.8804780876494025,
"grad_norm": 0.1844586879014969,
"learning_rate": 1.5352154401397418e-06,
"loss": 0.32523638010025024,
"step": 974
},
{
"epoch": 3.8884462151394423,
"grad_norm": 0.38847818970680237,
"learning_rate": 1.5308214404581968e-06,
"loss": 0.5000988245010376,
"step": 976
},
{
"epoch": 3.896414342629482,
"grad_norm": 0.4692430794239044,
"learning_rate": 1.5267195118486794e-06,
"loss": 0.2642746567726135,
"step": 978
},
{
"epoch": 3.904382470119522,
"grad_norm": 0.2286023050546646,
"learning_rate": 1.522909832614931e-06,
"loss": 0.2238185554742813,
"step": 980
},
{
"epoch": 3.912350597609562,
"grad_norm": 0.3902443051338196,
"learning_rate": 1.5193925683571211e-06,
"loss": 0.19102515280246735,
"step": 982
},
{
"epoch": 3.9203187250996017,
"grad_norm": 0.4479309320449829,
"learning_rate": 1.516167871964643e-06,
"loss": 0.5202714800834656,
"step": 984
},
{
"epoch": 3.9282868525896415,
"grad_norm": 0.5768634080886841,
"learning_rate": 1.5132358836094728e-06,
"loss": 0.16960352659225464,
"step": 986
},
{
"epoch": 3.9362549800796813,
"grad_norm": 3.171630620956421,
"learning_rate": 1.510596730740074e-06,
"loss": 0.4183100461959839,
"step": 988
},
{
"epoch": 3.944223107569721,
"grad_norm": 0.10318754613399506,
"learning_rate": 1.508250528075857e-06,
"loss": 0.3005601763725281,
"step": 990
},
{
"epoch": 3.952191235059761,
"grad_norm": 0.5118750929832458,
"learning_rate": 1.5061973776021949e-06,
"loss": 0.4696381390094757,
"step": 992
},
{
"epoch": 3.960159362549801,
"grad_norm": 0.3834742307662964,
"learning_rate": 1.504437368565988e-06,
"loss": 0.1685551553964615,
"step": 994
},
{
"epoch": 3.9681274900398407,
"grad_norm": 0.34701693058013916,
"learning_rate": 1.502970577471785e-06,
"loss": 0.4004333019256592,
"step": 996
},
{
"epoch": 3.9760956175298805,
"grad_norm": 0.14854289591312408,
"learning_rate": 1.5017970680784587e-06,
"loss": 0.32395121455192566,
"step": 998
},
{
"epoch": 3.9840637450199203,
"grad_norm": 0.3810655474662781,
"learning_rate": 1.5009168913964322e-06,
"loss": 0.23012831807136536,
"step": 1000
},
{
"epoch": 3.99203187250996,
"grad_norm": 0.3381198048591614,
"learning_rate": 1.5003300856854642e-06,
"loss": 0.48588454723358154,
"step": 1002
},
{
"epoch": 4.0,
"grad_norm": 0.07251780480146408,
"learning_rate": 1.5000366764529846e-06,
"loss": 0.11095666140317917,
"step": 1004
},
{
"epoch": 4.0,
"step": 1004,
"total_flos": 4.038502240003031e+18,
"train_loss": 0.7418717171225059,
"train_runtime": 10936.9117,
"train_samples_per_second": 5.508,
"train_steps_per_second": 0.092
}
],
"logging_steps": 2,
"max_steps": 1004,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.038502240003031e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}