{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00796812749003984, "grad_norm": 0.8701496124267578, "learning_rate": 2.9411764705882356e-07, "loss": 2.091559410095215, "step": 2 }, { "epoch": 0.01593625498007968, "grad_norm": 0.9730402231216431, "learning_rate": 8.823529411764706e-07, "loss": 2.0084309577941895, "step": 4 }, { "epoch": 0.02390438247011952, "grad_norm": 23.87286949157715, "learning_rate": 1.4705882352941177e-06, "loss": 2.1494643688201904, "step": 6 }, { "epoch": 0.03187250996015936, "grad_norm": 0.5717560052871704, "learning_rate": 2.058823529411765e-06, "loss": 1.8582701683044434, "step": 8 }, { "epoch": 0.0398406374501992, "grad_norm": 1.2766757011413574, "learning_rate": 2.647058823529412e-06, "loss": 1.387536644935608, "step": 10 }, { "epoch": 0.04780876494023904, "grad_norm": 0.6310649514198303, "learning_rate": 3.235294117647059e-06, "loss": 1.6384257078170776, "step": 12 }, { "epoch": 0.055776892430278883, "grad_norm": 0.5051367282867432, "learning_rate": 3.8235294117647055e-06, "loss": 1.8997098207473755, "step": 14 }, { "epoch": 0.06374501992031872, "grad_norm": 3.0508971214294434, "learning_rate": 4.411764705882353e-06, "loss": 1.5508460998535156, "step": 16 }, { "epoch": 0.07171314741035857, "grad_norm": 0.6429860591888428, "learning_rate": 4.9999999999999996e-06, "loss": 1.544956922531128, "step": 18 }, { "epoch": 0.0796812749003984, "grad_norm": 0.5023921132087708, "learning_rate": 5.588235294117647e-06, "loss": 1.4685248136520386, "step": 20 }, { "epoch": 0.08764940239043825, "grad_norm": 0.4997398853302002, "learning_rate": 6.176470588235294e-06, "loss": 1.1129087209701538, "step": 22 }, { "epoch": 0.09561752988047809, "grad_norm": 1.2241058349609375, "learning_rate": 6.7647058823529414e-06, "loss": 1.2971528768539429, "step": 24 }, { "epoch": 0.10358565737051793, "grad_norm": 0.583693265914917, "learning_rate": 7.3529411764705884e-06, "loss": 1.399789810180664, "step": 26 }, { "epoch": 0.11155378486055777, "grad_norm": 0.41760727763175964, "learning_rate": 7.941176470588236e-06, "loss": 1.6126567125320435, "step": 28 }, { "epoch": 0.11952191235059761, "grad_norm": 0.6942929625511169, "learning_rate": 8.529411764705882e-06, "loss": 1.3107324838638306, "step": 30 }, { "epoch": 0.12749003984063745, "grad_norm": 2.4918148517608643, "learning_rate": 9.117647058823529e-06, "loss": 1.1656028032302856, "step": 32 }, { "epoch": 0.13545816733067728, "grad_norm": 0.6997283101081848, "learning_rate": 9.705882352941177e-06, "loss": 1.2270398139953613, "step": 34 }, { "epoch": 0.14342629482071714, "grad_norm": 0.41730615496635437, "learning_rate": 1.0294117647058824e-05, "loss": 1.3723477125167847, "step": 36 }, { "epoch": 0.15139442231075698, "grad_norm": 0.5808508992195129, "learning_rate": 1.0882352941176471e-05, "loss": 1.166778802871704, "step": 38 }, { "epoch": 0.1593625498007968, "grad_norm": 0.29741156101226807, "learning_rate": 1.1470588235294117e-05, "loss": 1.2935056686401367, "step": 40 }, { "epoch": 0.16733067729083664, "grad_norm": 1.2481650114059448, "learning_rate": 1.2058823529411765e-05, "loss": 0.765558123588562, "step": 42 }, { "epoch": 0.1752988047808765, "grad_norm": 0.4549512267112732, "learning_rate": 1.2647058823529412e-05, "loss": 0.9544646739959717, "step": 44 }, { "epoch": 0.18326693227091634, "grad_norm": 2.7968297004699707, "learning_rate": 1.323529411764706e-05, "loss": 0.9361187815666199, "step": 46 }, { "epoch": 0.19123505976095617, "grad_norm": 0.6919461488723755, "learning_rate": 1.3823529411764705e-05, "loss": 1.17107093334198, "step": 48 }, { "epoch": 0.199203187250996, "grad_norm": 0.5921279191970825, "learning_rate": 1.4411764705882353e-05, "loss": 1.3282028436660767, "step": 50 }, { "epoch": 0.20717131474103587, "grad_norm": 0.5274451971054077, "learning_rate": 1.5e-05, "loss": 1.2876099348068237, "step": 52 }, { "epoch": 0.2151394422310757, "grad_norm": 1.5639928579330444, "learning_rate": 1.4999853294586629e-05, "loss": 1.109473466873169, "step": 54 }, { "epoch": 0.22310756972111553, "grad_norm": 0.6973602771759033, "learning_rate": 1.4999413184723549e-05, "loss": 1.5242366790771484, "step": 56 }, { "epoch": 0.23107569721115537, "grad_norm": 0.5269781351089478, "learning_rate": 1.4998679689541569e-05, "loss": 1.3331416845321655, "step": 58 }, { "epoch": 0.23904382470119523, "grad_norm": 0.4338107109069824, "learning_rate": 1.499765284092446e-05, "loss": 0.9126222729682922, "step": 60 }, { "epoch": 0.24701195219123506, "grad_norm": 0.3536894917488098, "learning_rate": 1.4996332683507557e-05, "loss": 1.3404982089996338, "step": 62 }, { "epoch": 0.2549800796812749, "grad_norm": 0.7808045148849487, "learning_rate": 1.4994719274675816e-05, "loss": 1.1124142408370972, "step": 64 }, { "epoch": 0.26294820717131473, "grad_norm": 0.3446694314479828, "learning_rate": 1.4992812684561331e-05, "loss": 1.2747009992599487, "step": 66 }, { "epoch": 0.27091633466135456, "grad_norm": 13.088342666625977, "learning_rate": 1.4990612996040276e-05, "loss": 1.282449722290039, "step": 68 }, { "epoch": 0.2788844621513944, "grad_norm": 2.078386068344116, "learning_rate": 1.498812030472931e-05, "loss": 1.5724037885665894, "step": 70 }, { "epoch": 0.2868525896414343, "grad_norm": 0.6237571239471436, "learning_rate": 1.498533471898141e-05, "loss": 0.8898400068283081, "step": 72 }, { "epoch": 0.2948207171314741, "grad_norm": 1.2999846935272217, "learning_rate": 1.4982256359881172e-05, "loss": 1.1757071018218994, "step": 74 }, { "epoch": 0.30278884462151395, "grad_norm": 0.5385910868644714, "learning_rate": 1.4978885361239544e-05, "loss": 1.4709817171096802, "step": 76 }, { "epoch": 0.3107569721115538, "grad_norm": 1.2187010049819946, "learning_rate": 1.4975221869588004e-05, "loss": 0.9453757405281067, "step": 78 }, { "epoch": 0.3187250996015936, "grad_norm": 0.3541651964187622, "learning_rate": 1.4971266044172201e-05, "loss": 0.8519526720046997, "step": 80 }, { "epoch": 0.32669322709163345, "grad_norm": 0.4355093836784363, "learning_rate": 1.4967018056945026e-05, "loss": 1.3587875366210938, "step": 82 }, { "epoch": 0.3346613545816733, "grad_norm": 0.6306200623512268, "learning_rate": 1.4962478092559135e-05, "loss": 0.9281608462333679, "step": 84 }, { "epoch": 0.3426294820717131, "grad_norm": 0.4139735698699951, "learning_rate": 1.495764634835893e-05, "loss": 1.3322420120239258, "step": 86 }, { "epoch": 0.350597609561753, "grad_norm": 0.9012618660926819, "learning_rate": 1.4952523034371973e-05, "loss": 0.9445306658744812, "step": 88 }, { "epoch": 0.35856573705179284, "grad_norm": 0.46748796105384827, "learning_rate": 1.4947108373299864e-05, "loss": 1.3331313133239746, "step": 90 }, { "epoch": 0.3665338645418327, "grad_norm": 1.0903550386428833, "learning_rate": 1.4941402600508558e-05, "loss": 1.128015398979187, "step": 92 }, { "epoch": 0.3745019920318725, "grad_norm": 0.4805486798286438, "learning_rate": 1.4935405964018128e-05, "loss": 1.2455147504806519, "step": 94 }, { "epoch": 0.38247011952191234, "grad_norm": 0.7429084181785583, "learning_rate": 1.4929118724491996e-05, "loss": 1.1041914224624634, "step": 96 }, { "epoch": 0.3904382470119522, "grad_norm": 0.27306675910949707, "learning_rate": 1.4922541155225586e-05, "loss": 1.2655969858169556, "step": 98 }, { "epoch": 0.398406374501992, "grad_norm": 0.41318008303642273, "learning_rate": 1.4915673542134462e-05, "loss": 0.8851726651191711, "step": 100 }, { "epoch": 0.4063745019920319, "grad_norm": 0.4386235773563385, "learning_rate": 1.4908516183741889e-05, "loss": 1.265491008758545, "step": 102 }, { "epoch": 0.41434262948207173, "grad_norm": 0.6781812906265259, "learning_rate": 1.4901069391165857e-05, "loss": 0.8081492185592651, "step": 104 }, { "epoch": 0.42231075697211157, "grad_norm": 1.4451416730880737, "learning_rate": 1.4893333488105559e-05, "loss": 0.7170528173446655, "step": 106 }, { "epoch": 0.4302788844621514, "grad_norm": 0.6063726544380188, "learning_rate": 1.4885308810827328e-05, "loss": 0.9935809969902039, "step": 108 }, { "epoch": 0.43824701195219123, "grad_norm": 0.40737852454185486, "learning_rate": 1.4876995708150003e-05, "loss": 1.2845995426177979, "step": 110 }, { "epoch": 0.44621513944223107, "grad_norm": 0.4796580374240875, "learning_rate": 1.4868394541429784e-05, "loss": 0.8904252052307129, "step": 112 }, { "epoch": 0.4541832669322709, "grad_norm": 3.001218318939209, "learning_rate": 1.4859505684544512e-05, "loss": 1.1530516147613525, "step": 114 }, { "epoch": 0.46215139442231074, "grad_norm": 0.4466836452484131, "learning_rate": 1.4850329523877425e-05, "loss": 1.2753629684448242, "step": 116 }, { "epoch": 0.4701195219123506, "grad_norm": 0.28066951036453247, "learning_rate": 1.4840866458300357e-05, "loss": 1.3401973247528076, "step": 118 }, { "epoch": 0.47808764940239046, "grad_norm": 0.2835182249546051, "learning_rate": 1.4831116899156402e-05, "loss": 1.2199780941009521, "step": 120 }, { "epoch": 0.4860557768924303, "grad_norm": 0.36116963624954224, "learning_rate": 1.4821081270242039e-05, "loss": 0.9814391136169434, "step": 122 }, { "epoch": 0.4940239043824701, "grad_norm": 0.6912099123001099, "learning_rate": 1.48107600077887e-05, "loss": 1.0494424104690552, "step": 124 }, { "epoch": 0.50199203187251, "grad_norm": 0.8504573702812195, "learning_rate": 1.480015356044381e-05, "loss": 0.9379956126213074, "step": 126 }, { "epoch": 0.5099601593625498, "grad_norm": 0.5862733125686646, "learning_rate": 1.4789262389251301e-05, "loss": 1.2821743488311768, "step": 128 }, { "epoch": 0.5179282868525896, "grad_norm": 0.5818023681640625, "learning_rate": 1.4778086967631548e-05, "loss": 0.9355220198631287, "step": 130 }, { "epoch": 0.5258964143426295, "grad_norm": 0.31655120849609375, "learning_rate": 1.4766627781360796e-05, "loss": 0.826532244682312, "step": 132 }, { "epoch": 0.5338645418326693, "grad_norm": 0.5141142010688782, "learning_rate": 1.4754885328550062e-05, "loss": 0.9170287251472473, "step": 134 }, { "epoch": 0.5418326693227091, "grad_norm": 0.47723662853240967, "learning_rate": 1.4742860119623458e-05, "loss": 1.3180201053619385, "step": 136 }, { "epoch": 0.549800796812749, "grad_norm": 0.32824379205703735, "learning_rate": 1.473055267729602e-05, "loss": 0.9599122405052185, "step": 138 }, { "epoch": 0.5577689243027888, "grad_norm": 1.1303349733352661, "learning_rate": 1.4717963536550988e-05, "loss": 1.0953630208969116, "step": 140 }, { "epoch": 0.5657370517928287, "grad_norm": 0.49718862771987915, "learning_rate": 1.470509324461653e-05, "loss": 1.0326279401779175, "step": 142 }, { "epoch": 0.5737051792828686, "grad_norm": 0.2485317885875702, "learning_rate": 1.4691942360941986e-05, "loss": 1.2258632183074951, "step": 144 }, { "epoch": 0.5816733067729084, "grad_norm": 3.5433390140533447, "learning_rate": 1.4678511457173523e-05, "loss": 1.202100396156311, "step": 146 }, { "epoch": 0.5896414342629482, "grad_norm": 0.3908817172050476, "learning_rate": 1.4664801117129303e-05, "loss": 0.9758645296096802, "step": 148 }, { "epoch": 0.5976095617529881, "grad_norm": 0.5502234697341919, "learning_rate": 1.4650811936774093e-05, "loss": 0.9454991817474365, "step": 150 }, { "epoch": 0.6055776892430279, "grad_norm": 4.790173530578613, "learning_rate": 1.4636544524193378e-05, "loss": 0.9398374557495117, "step": 152 }, { "epoch": 0.6135458167330677, "grad_norm": 0.638011634349823, "learning_rate": 1.46219994995669e-05, "loss": 1.090728998184204, "step": 154 }, { "epoch": 0.6215139442231076, "grad_norm": 2.4593403339385986, "learning_rate": 1.4607177495141734e-05, "loss": 1.1246390342712402, "step": 156 }, { "epoch": 0.6294820717131474, "grad_norm": 0.8616807460784912, "learning_rate": 1.4592079155204776e-05, "loss": 1.1782993078231812, "step": 158 }, { "epoch": 0.6374501992031872, "grad_norm": 0.2915763854980469, "learning_rate": 1.457670513605475e-05, "loss": 1.0174801349639893, "step": 160 }, { "epoch": 0.6454183266932271, "grad_norm": 0.27435067296028137, "learning_rate": 1.4561056105973688e-05, "loss": 0.8091227412223816, "step": 162 }, { "epoch": 0.6533864541832669, "grad_norm": 0.2575240731239319, "learning_rate": 1.4545132745197857e-05, "loss": 1.1529077291488647, "step": 164 }, { "epoch": 0.6613545816733067, "grad_norm": 0.777723491191864, "learning_rate": 1.4528935745888218e-05, "loss": 0.8908942937850952, "step": 166 }, { "epoch": 0.6693227091633466, "grad_norm": 0.2517397105693817, "learning_rate": 1.4512465812100317e-05, "loss": 1.2097852230072021, "step": 168 }, { "epoch": 0.6772908366533864, "grad_norm": 3.4033937454223633, "learning_rate": 1.4495723659753695e-05, "loss": 1.2028913497924805, "step": 170 }, { "epoch": 0.6852589641434262, "grad_norm": 0.3606719374656677, "learning_rate": 1.447871001660076e-05, "loss": 0.8955773115158081, "step": 172 }, { "epoch": 0.6932270916334662, "grad_norm": 0.2552003860473633, "learning_rate": 1.4461425622195157e-05, "loss": 1.2185531854629517, "step": 174 }, { "epoch": 0.701195219123506, "grad_norm": 1.0111852884292603, "learning_rate": 1.4443871227859621e-05, "loss": 0.7776660919189453, "step": 176 }, { "epoch": 0.7091633466135459, "grad_norm": 0.7659691572189331, "learning_rate": 1.4426047596653316e-05, "loss": 0.9216206669807434, "step": 178 }, { "epoch": 0.7171314741035857, "grad_norm": 1.132752776145935, "learning_rate": 1.4407955503338663e-05, "loss": 1.0899910926818848, "step": 180 }, { "epoch": 0.7250996015936255, "grad_norm": 0.16658742725849152, "learning_rate": 1.4389595734347675e-05, "loss": 0.5195258855819702, "step": 182 }, { "epoch": 0.7330677290836654, "grad_norm": 0.6180145144462585, "learning_rate": 1.4370969087747755e-05, "loss": 1.3304177522659302, "step": 184 }, { "epoch": 0.7410358565737052, "grad_norm": 0.35436052083969116, "learning_rate": 1.4352076373207023e-05, "loss": 1.2653801441192627, "step": 186 }, { "epoch": 0.749003984063745, "grad_norm": 0.2843472361564636, "learning_rate": 1.4332918411959106e-05, "loss": 1.1138914823532104, "step": 188 }, { "epoch": 0.7569721115537849, "grad_norm": 1.0151716470718384, "learning_rate": 1.4313496036767444e-05, "loss": 0.8904833197593689, "step": 190 }, { "epoch": 0.7649402390438247, "grad_norm": 0.7267096042633057, "learning_rate": 1.4293810091889105e-05, "loss": 1.2340463399887085, "step": 192 }, { "epoch": 0.7729083665338645, "grad_norm": 0.47353217005729675, "learning_rate": 1.4273861433038063e-05, "loss": 0.9082501530647278, "step": 194 }, { "epoch": 0.7808764940239044, "grad_norm": 0.9817029237747192, "learning_rate": 1.425365092734802e-05, "loss": 0.663750946521759, "step": 196 }, { "epoch": 0.7888446215139442, "grad_norm": 0.7875825762748718, "learning_rate": 1.423317945333471e-05, "loss": 0.7919776439666748, "step": 198 }, { "epoch": 0.796812749003984, "grad_norm": 0.5649994015693665, "learning_rate": 1.4212447900857703e-05, "loss": 1.0543051958084106, "step": 200 }, { "epoch": 0.8047808764940239, "grad_norm": 0.1523721069097519, "learning_rate": 1.4191457171081736e-05, "loss": 1.0212864875793457, "step": 202 }, { "epoch": 0.8127490039840638, "grad_norm": 0.28413787484169006, "learning_rate": 1.417020817643753e-05, "loss": 1.5364233255386353, "step": 204 }, { "epoch": 0.8207171314741036, "grad_norm": 0.2831563651561737, "learning_rate": 1.4148701840582129e-05, "loss": 1.2227693796157837, "step": 206 }, { "epoch": 0.8286852589641435, "grad_norm": 2.0232136249542236, "learning_rate": 1.412693909835877e-05, "loss": 0.7362918853759766, "step": 208 }, { "epoch": 0.8366533864541833, "grad_norm": 0.6372008323669434, "learning_rate": 1.4104920895756216e-05, "loss": 1.265373945236206, "step": 210 }, { "epoch": 0.8446215139442231, "grad_norm": 0.22620588541030884, "learning_rate": 1.4082648189867656e-05, "loss": 1.2132854461669922, "step": 212 }, { "epoch": 0.852589641434263, "grad_norm": 0.287081241607666, "learning_rate": 1.4060121948849098e-05, "loss": 0.9602269530296326, "step": 214 }, { "epoch": 0.8605577689243028, "grad_norm": 0.8160057067871094, "learning_rate": 1.4037343151877285e-05, "loss": 1.452444076538086, "step": 216 }, { "epoch": 0.8685258964143426, "grad_norm": 1.8605669736862183, "learning_rate": 1.4014312789107124e-05, "loss": 1.3142669200897217, "step": 218 }, { "epoch": 0.8764940239043825, "grad_norm": 0.28666868805885315, "learning_rate": 1.3991031861628662e-05, "loss": 1.2287095785140991, "step": 220 }, { "epoch": 0.8844621513944223, "grad_norm": 0.29921239614486694, "learning_rate": 1.3967501381423552e-05, "loss": 1.48736572265625, "step": 222 }, { "epoch": 0.8924302788844621, "grad_norm": 1.2563499212265015, "learning_rate": 1.3943722371321075e-05, "loss": 0.9397075176239014, "step": 224 }, { "epoch": 0.900398406374502, "grad_norm": 0.39466801285743713, "learning_rate": 1.3919695864953679e-05, "loss": 1.0238375663757324, "step": 226 }, { "epoch": 0.9083665338645418, "grad_norm": 2.8415801525115967, "learning_rate": 1.3895422906712042e-05, "loss": 1.1098148822784424, "step": 228 }, { "epoch": 0.9163346613545816, "grad_norm": 0.6246854662895203, "learning_rate": 1.3870904551699686e-05, "loss": 1.1869398355484009, "step": 230 }, { "epoch": 0.9243027888446215, "grad_norm": 0.308601975440979, "learning_rate": 1.38461418656871e-05, "loss": 1.3266777992248535, "step": 232 }, { "epoch": 0.9322709163346613, "grad_norm": 0.3320607841014862, "learning_rate": 1.3821135925065423e-05, "loss": 0.8920221924781799, "step": 234 }, { "epoch": 0.9402390438247012, "grad_norm": 0.2533508837223053, "learning_rate": 1.3795887816799647e-05, "loss": 0.8552533984184265, "step": 236 }, { "epoch": 0.9482071713147411, "grad_norm": 0.37766775488853455, "learning_rate": 1.3770398638381374e-05, "loss": 0.5838753581047058, "step": 238 }, { "epoch": 0.9561752988047809, "grad_norm": 0.5343811511993408, "learning_rate": 1.3744669497781111e-05, "loss": 0.8912972807884216, "step": 240 }, { "epoch": 0.9641434262948207, "grad_norm": 0.5110613107681274, "learning_rate": 1.3718701513400104e-05, "loss": 1.1340361833572388, "step": 242 }, { "epoch": 0.9721115537848606, "grad_norm": 0.34986478090286255, "learning_rate": 1.369249581402173e-05, "loss": 1.2093524932861328, "step": 244 }, { "epoch": 0.9800796812749004, "grad_norm": 0.6902351975440979, "learning_rate": 1.3666053538762414e-05, "loss": 0.973604142665863, "step": 246 }, { "epoch": 0.9880478087649402, "grad_norm": 0.364798903465271, "learning_rate": 1.363937583702214e-05, "loss": 1.004298448562622, "step": 248 }, { "epoch": 0.9960159362549801, "grad_norm": 0.594591498374939, "learning_rate": 1.3612463868434462e-05, "loss": 1.005676031112671, "step": 250 }, { "epoch": 1.00398406374502, "grad_norm": 0.9396325349807739, "learning_rate": 1.3585318802816118e-05, "loss": 0.9656413197517395, "step": 252 }, { "epoch": 1.0119521912350598, "grad_norm": 0.5345960855484009, "learning_rate": 1.3557941820116163e-05, "loss": 0.7036761045455933, "step": 254 }, { "epoch": 1.0199203187250996, "grad_norm": 0.8415208458900452, "learning_rate": 1.3530334110364691e-05, "loss": 1.0861495733261108, "step": 256 }, { "epoch": 1.0278884462151394, "grad_norm": 0.4500897228717804, "learning_rate": 1.35024968736211e-05, "loss": 1.0180453062057495, "step": 258 }, { "epoch": 1.0358565737051793, "grad_norm": 0.3588436245918274, "learning_rate": 1.3474431319921936e-05, "loss": 0.9354724884033203, "step": 260 }, { "epoch": 1.043824701195219, "grad_norm": 0.3891165852546692, "learning_rate": 1.3446138669228274e-05, "loss": 0.9144407510757446, "step": 262 }, { "epoch": 1.051792828685259, "grad_norm": 1.5371865034103394, "learning_rate": 1.3417620151372716e-05, "loss": 0.9848403930664062, "step": 264 }, { "epoch": 1.0597609561752988, "grad_norm": 0.6903578639030457, "learning_rate": 1.3388877006005911e-05, "loss": 0.6154371500015259, "step": 266 }, { "epoch": 1.0677290836653386, "grad_norm": 0.19243323802947998, "learning_rate": 1.3359910482542686e-05, "loss": 0.8479989171028137, "step": 268 }, { "epoch": 1.0756972111553784, "grad_norm": 0.5195255279541016, "learning_rate": 1.3330721840107718e-05, "loss": 0.5587765574455261, "step": 270 }, { "epoch": 1.0836653386454183, "grad_norm": 0.3604806661605835, "learning_rate": 1.3301312347480817e-05, "loss": 1.1884621381759644, "step": 272 }, { "epoch": 1.091633466135458, "grad_norm": 1.2894952297210693, "learning_rate": 1.3271683283041767e-05, "loss": 0.625873863697052, "step": 274 }, { "epoch": 1.099601593625498, "grad_norm": 0.27667495608329773, "learning_rate": 1.3241835934714759e-05, "loss": 0.7773606181144714, "step": 276 }, { "epoch": 1.1075697211155378, "grad_norm": 0.23738747835159302, "learning_rate": 1.3211771599912408e-05, "loss": 0.7299227714538574, "step": 278 }, { "epoch": 1.1155378486055776, "grad_norm": 0.3089618980884552, "learning_rate": 1.3181491585479354e-05, "loss": 0.8809335231781006, "step": 280 }, { "epoch": 1.1235059760956174, "grad_norm": 0.25363025069236755, "learning_rate": 1.3150997207635463e-05, "loss": 1.0729031562805176, "step": 282 }, { "epoch": 1.1314741035856573, "grad_norm": 0.47339093685150146, "learning_rate": 1.31202897919186e-05, "loss": 1.0833154916763306, "step": 284 }, { "epoch": 1.139442231075697, "grad_norm": 0.18187947571277618, "learning_rate": 1.3089370673127026e-05, "loss": 0.3476455509662628, "step": 286 }, { "epoch": 1.1474103585657371, "grad_norm": 0.226917564868927, "learning_rate": 1.3058241195261357e-05, "loss": 0.6067731976509094, "step": 288 }, { "epoch": 1.155378486055777, "grad_norm": 1.2993286848068237, "learning_rate": 1.3026902711466169e-05, "loss": 0.8683360815048218, "step": 290 }, { "epoch": 1.1633466135458168, "grad_norm": 0.4915187954902649, "learning_rate": 1.2995356583971152e-05, "loss": 0.6069297790527344, "step": 292 }, { "epoch": 1.1713147410358566, "grad_norm": 0.24846410751342773, "learning_rate": 1.2963604184031913e-05, "loss": 1.096907615661621, "step": 294 }, { "epoch": 1.1792828685258965, "grad_norm": 0.40995633602142334, "learning_rate": 1.2931646891870371e-05, "loss": 1.1847357749938965, "step": 296 }, { "epoch": 1.1872509960159363, "grad_norm": 0.42281821370124817, "learning_rate": 1.2899486096614742e-05, "loss": 1.1937490701675415, "step": 298 }, { "epoch": 1.1952191235059761, "grad_norm": 0.5707376003265381, "learning_rate": 1.2867123196239186e-05, "loss": 0.5830255746841431, "step": 300 }, { "epoch": 1.203187250996016, "grad_norm": 0.5589886903762817, "learning_rate": 1.2834559597503008e-05, "loss": 0.8486528992652893, "step": 302 }, { "epoch": 1.2111553784860558, "grad_norm": 0.4859887361526489, "learning_rate": 1.2801796715889535e-05, "loss": 0.7010272145271301, "step": 304 }, { "epoch": 1.2191235059760956, "grad_norm": 0.3184964060783386, "learning_rate": 1.2768835975544572e-05, "loss": 1.087632179260254, "step": 306 }, { "epoch": 1.2270916334661355, "grad_norm": 0.6567210555076599, "learning_rate": 1.2735678809214497e-05, "loss": 0.8818908333778381, "step": 308 }, { "epoch": 1.2350597609561753, "grad_norm": 0.8336063027381897, "learning_rate": 1.270232665818399e-05, "loss": 1.1307973861694336, "step": 310 }, { "epoch": 1.2430278884462151, "grad_norm": 0.46054601669311523, "learning_rate": 1.266878097221338e-05, "loss": 1.0041382312774658, "step": 312 }, { "epoch": 1.250996015936255, "grad_norm": 1.004090666770935, "learning_rate": 1.263504320947562e-05, "loss": 0.8790667057037354, "step": 314 }, { "epoch": 1.2589641434262948, "grad_norm": 0.6220927834510803, "learning_rate": 1.2601114836492917e-05, "loss": 0.7389086484909058, "step": 316 }, { "epoch": 1.2669322709163346, "grad_norm": 1.0630143880844116, "learning_rate": 1.2566997328072966e-05, "loss": 0.6448332667350769, "step": 318 }, { "epoch": 1.2749003984063745, "grad_norm": 0.4616350829601288, "learning_rate": 1.2532692167244852e-05, "loss": 0.5268493890762329, "step": 320 }, { "epoch": 1.2828685258964143, "grad_norm": 3.0412392616271973, "learning_rate": 1.2498200845194596e-05, "loss": 0.9104723930358887, "step": 322 }, { "epoch": 1.2908366533864541, "grad_norm": 0.4327695369720459, "learning_rate": 1.2463524861200316e-05, "loss": 0.8771180510520935, "step": 324 }, { "epoch": 1.298804780876494, "grad_norm": 0.6371755599975586, "learning_rate": 1.2428665722567073e-05, "loss": 1.1892993450164795, "step": 326 }, { "epoch": 1.3067729083665338, "grad_norm": 0.503496527671814, "learning_rate": 1.2393624944561334e-05, "loss": 0.7128881216049194, "step": 328 }, { "epoch": 1.3147410358565736, "grad_norm": 0.43169552087783813, "learning_rate": 1.2358404050345122e-05, "loss": 0.7095832824707031, "step": 330 }, { "epoch": 1.3227091633466135, "grad_norm": 0.5526296496391296, "learning_rate": 1.2323004570909798e-05, "loss": 0.8684831261634827, "step": 332 }, { "epoch": 1.3306772908366533, "grad_norm": 1.0183297395706177, "learning_rate": 1.2287428045009517e-05, "loss": 0.665216863155365, "step": 334 }, { "epoch": 1.3386454183266931, "grad_norm": 0.7202191352844238, "learning_rate": 1.2251676019094331e-05, "loss": 0.8956350684165955, "step": 336 }, { "epoch": 1.3466135458167332, "grad_norm": 0.38658207654953003, "learning_rate": 1.2215750047242982e-05, "loss": 1.0827162265777588, "step": 338 }, { "epoch": 1.354581673306773, "grad_norm": 0.2367570847272873, "learning_rate": 1.2179651691095329e-05, "loss": 1.0241369009017944, "step": 340 }, { "epoch": 1.3625498007968129, "grad_norm": 0.3254954218864441, "learning_rate": 1.2143382519784498e-05, "loss": 1.1053788661956787, "step": 342 }, { "epoch": 1.3705179282868527, "grad_norm": 0.25897926092147827, "learning_rate": 1.2106944109868636e-05, "loss": 1.037227988243103, "step": 344 }, { "epoch": 1.3784860557768925, "grad_norm": 0.4815937876701355, "learning_rate": 1.2070338045262406e-05, "loss": 0.7165056467056274, "step": 346 }, { "epoch": 1.3864541832669324, "grad_norm": 0.2625236213207245, "learning_rate": 1.2033565917168133e-05, "loss": 1.0718673467636108, "step": 348 }, { "epoch": 1.3944223107569722, "grad_norm": 0.4188198447227478, "learning_rate": 1.1996629324006632e-05, "loss": 0.6164529323577881, "step": 350 }, { "epoch": 1.402390438247012, "grad_norm": 0.25191178917884827, "learning_rate": 1.195952987134773e-05, "loss": 1.0730476379394531, "step": 352 }, { "epoch": 1.4103585657370519, "grad_norm": 0.5282136797904968, "learning_rate": 1.1922269171840477e-05, "loss": 1.1133763790130615, "step": 354 }, { "epoch": 1.4183266932270917, "grad_norm": 0.39372941851615906, "learning_rate": 1.1884848845143039e-05, "loss": 0.9437786936759949, "step": 356 }, { "epoch": 1.4262948207171315, "grad_norm": 0.27430135011672974, "learning_rate": 1.1847270517852312e-05, "loss": 1.101191759109497, "step": 358 }, { "epoch": 1.4342629482071714, "grad_norm": 0.4338213801383972, "learning_rate": 1.180953582343319e-05, "loss": 0.5615993738174438, "step": 360 }, { "epoch": 1.4422310756972112, "grad_norm": 0.20297643542289734, "learning_rate": 1.177164640214758e-05, "loss": 0.648676335811615, "step": 362 }, { "epoch": 1.450199203187251, "grad_norm": 0.43412458896636963, "learning_rate": 1.1733603900983107e-05, "loss": 0.9797654747962952, "step": 364 }, { "epoch": 1.4581673306772909, "grad_norm": 0.27069559693336487, "learning_rate": 1.1695409973581504e-05, "loss": 1.0201314687728882, "step": 366 }, { "epoch": 1.4661354581673307, "grad_norm": 0.21320168673992157, "learning_rate": 1.1657066280166745e-05, "loss": 0.5693846940994263, "step": 368 }, { "epoch": 1.4741035856573705, "grad_norm": 0.609273374080658, "learning_rate": 1.1618574487472867e-05, "loss": 0.6598872542381287, "step": 370 }, { "epoch": 1.4820717131474104, "grad_norm": 1.0151580572128296, "learning_rate": 1.1579936268671537e-05, "loss": 1.1873997449874878, "step": 372 }, { "epoch": 1.4900398406374502, "grad_norm": 0.5126774907112122, "learning_rate": 1.1541153303299305e-05, "loss": 1.0114318132400513, "step": 374 }, { "epoch": 1.49800796812749, "grad_norm": 0.4790279269218445, "learning_rate": 1.1502227277184605e-05, "loss": 1.0180116891860962, "step": 376 }, { "epoch": 1.5059760956175299, "grad_norm": 3.794914722442627, "learning_rate": 1.1463159882374477e-05, "loss": 0.8887977004051208, "step": 378 }, { "epoch": 1.5139442231075697, "grad_norm": 0.2821894884109497, "learning_rate": 1.1423952817061005e-05, "loss": 1.0826634168624878, "step": 380 }, { "epoch": 1.5219123505976095, "grad_norm": 0.26013344526290894, "learning_rate": 1.1384607785507527e-05, "loss": 0.6501424312591553, "step": 382 }, { "epoch": 1.5298804780876494, "grad_norm": 0.21201461553573608, "learning_rate": 1.1345126497974507e-05, "loss": 0.6929817795753479, "step": 384 }, { "epoch": 1.5378486055776892, "grad_norm": 0.603386402130127, "learning_rate": 1.1305510670645247e-05, "loss": 0.9329879879951477, "step": 386 }, { "epoch": 1.545816733067729, "grad_norm": 0.3552367389202118, "learning_rate": 1.1265762025551246e-05, "loss": 1.1002554893493652, "step": 388 }, { "epoch": 1.5537848605577689, "grad_norm": 0.8357146382331848, "learning_rate": 1.122588229049737e-05, "loss": 0.5634505152702332, "step": 390 }, { "epoch": 1.5617529880478087, "grad_norm": 0.9403584003448486, "learning_rate": 1.118587319898673e-05, "loss": 0.6033604145050049, "step": 392 }, { "epoch": 1.5697211155378485, "grad_norm": 2.087606430053711, "learning_rate": 1.1145736490145346e-05, "loss": 1.0487326383590698, "step": 394 }, { "epoch": 1.5776892430278884, "grad_norm": 0.7443987727165222, "learning_rate": 1.110547390864654e-05, "loss": 0.9917337894439697, "step": 396 }, { "epoch": 1.5856573705179282, "grad_norm": 0.4282863438129425, "learning_rate": 1.1065087204635103e-05, "loss": 1.0512839555740356, "step": 398 }, { "epoch": 1.593625498007968, "grad_norm": 0.6512730121612549, "learning_rate": 1.1024578133651209e-05, "loss": 0.6531898975372314, "step": 400 }, { "epoch": 1.6015936254980079, "grad_norm": 0.47180187702178955, "learning_rate": 1.0983948456554123e-05, "loss": 1.0244213342666626, "step": 402 }, { "epoch": 1.6095617529880477, "grad_norm": 0.41504454612731934, "learning_rate": 1.0943199939445644e-05, "loss": 1.141480803489685, "step": 404 }, { "epoch": 1.6175298804780875, "grad_norm": 0.7667415142059326, "learning_rate": 1.0902334353593342e-05, "loss": 0.6996335387229919, "step": 406 }, { "epoch": 1.6254980079681274, "grad_norm": 0.23972085118293762, "learning_rate": 1.0861353475353559e-05, "loss": 0.5143875479698181, "step": 408 }, { "epoch": 1.6334661354581672, "grad_norm": 0.1878281980752945, "learning_rate": 1.08202590860942e-05, "loss": 0.6930667757987976, "step": 410 }, { "epoch": 1.641434262948207, "grad_norm": 0.3578081429004669, "learning_rate": 1.0779052972117306e-05, "loss": 0.4972156882286072, "step": 412 }, { "epoch": 1.6494023904382469, "grad_norm": 0.26842987537384033, "learning_rate": 1.0737736924581386e-05, "loss": 0.7380754351615906, "step": 414 }, { "epoch": 1.6573705179282867, "grad_norm": 0.31403297185897827, "learning_rate": 1.0696312739423573e-05, "loss": 0.7590941190719604, "step": 416 }, { "epoch": 1.6653386454183265, "grad_norm": 0.5314321517944336, "learning_rate": 1.0654782217281563e-05, "loss": 0.8922839760780334, "step": 418 }, { "epoch": 1.6733067729083664, "grad_norm": 0.2761631906032562, "learning_rate": 1.0613147163415331e-05, "loss": 1.112337350845337, "step": 420 }, { "epoch": 1.6812749003984062, "grad_norm": 0.3739781081676483, "learning_rate": 1.0571409387628661e-05, "loss": 0.9243249893188477, "step": 422 }, { "epoch": 1.6892430278884463, "grad_norm": 0.8964663147926331, "learning_rate": 1.0529570704190493e-05, "loss": 0.5647684335708618, "step": 424 }, { "epoch": 1.697211155378486, "grad_norm": 0.333854079246521, "learning_rate": 1.0487632931756039e-05, "loss": 1.0856620073318481, "step": 426 }, { "epoch": 1.705179282868526, "grad_norm": 0.26213064789772034, "learning_rate": 1.0445597893287742e-05, "loss": 1.0230387449264526, "step": 428 }, { "epoch": 1.7131474103585658, "grad_norm": 0.4736036956310272, "learning_rate": 1.0403467415976025e-05, "loss": 0.6771261692047119, "step": 430 }, { "epoch": 1.7211155378486056, "grad_norm": 0.8969900608062744, "learning_rate": 1.036124333115988e-05, "loss": 0.8703440427780151, "step": 432 }, { "epoch": 1.7290836653386454, "grad_norm": 0.9138644337654114, "learning_rate": 1.0318927474247258e-05, "loss": 0.6527059674263, "step": 434 }, { "epoch": 1.7370517928286853, "grad_norm": 1.2199382781982422, "learning_rate": 1.0276521684635272e-05, "loss": 0.42034152150154114, "step": 436 }, { "epoch": 1.745019920318725, "grad_norm": 0.753322422504425, "learning_rate": 1.0234027805630263e-05, "loss": 0.8424271941184998, "step": 438 }, { "epoch": 1.752988047808765, "grad_norm": 0.6605293154716492, "learning_rate": 1.0191447684367665e-05, "loss": 0.6778283715248108, "step": 440 }, { "epoch": 1.7609561752988048, "grad_norm": 0.8106198310852051, "learning_rate": 1.0148783171731716e-05, "loss": 1.4355847835540771, "step": 442 }, { "epoch": 1.7689243027888446, "grad_norm": 0.3683789074420929, "learning_rate": 1.0106036122274989e-05, "loss": 0.6579235196113586, "step": 444 }, { "epoch": 1.7768924302788844, "grad_norm": 0.2205553501844406, "learning_rate": 1.0063208394137804e-05, "loss": 0.9973717927932739, "step": 446 }, { "epoch": 1.7848605577689243, "grad_norm": 0.8739639520645142, "learning_rate": 1.0020301848967437e-05, "loss": 1.029483437538147, "step": 448 }, { "epoch": 1.792828685258964, "grad_norm": 0.2899617552757263, "learning_rate": 9.977318351837206e-06, "loss": 0.7871066331863403, "step": 450 }, { "epoch": 1.800796812749004, "grad_norm": 0.42468908429145813, "learning_rate": 9.934259771165394e-06, "loss": 0.3967509865760803, "step": 452 }, { "epoch": 1.8087649402390438, "grad_norm": 0.8459072113037109, "learning_rate": 9.89112797863404e-06, "loss": 0.9443418383598328, "step": 454 }, { "epoch": 1.8167330677290838, "grad_norm": 0.7007260322570801, "learning_rate": 9.847924849107578e-06, "loss": 0.7411941289901733, "step": 456 }, { "epoch": 1.8247011952191237, "grad_norm": 1.2606959342956543, "learning_rate": 9.804652260551332e-06, "loss": 0.9570497274398804, "step": 458 }, { "epoch": 1.8326693227091635, "grad_norm": 1.1064777374267578, "learning_rate": 9.761312093949886e-06, "loss": 0.7529144883155823, "step": 460 }, { "epoch": 1.8406374501992033, "grad_norm": 0.7540960907936096, "learning_rate": 9.717906233225339e-06, "loss": 0.7726236581802368, "step": 462 }, { "epoch": 1.8486055776892432, "grad_norm": 0.4733653962612152, "learning_rate": 9.674436565155389e-06, "loss": 0.15728430449962616, "step": 464 }, { "epoch": 1.856573705179283, "grad_norm": 0.2718278169631958, "learning_rate": 9.63090497929133e-06, "loss": 1.0682100057601929, "step": 466 }, { "epoch": 1.8645418326693228, "grad_norm": 1.8510208129882812, "learning_rate": 9.587313367875922e-06, "loss": 0.4695431590080261, "step": 468 }, { "epoch": 1.8725099601593627, "grad_norm": 0.4119950234889984, "learning_rate": 9.543663625761121e-06, "loss": 1.0789568424224854, "step": 470 }, { "epoch": 1.8804780876494025, "grad_norm": 0.7990518808364868, "learning_rate": 9.499957650325738e-06, "loss": 1.02091383934021, "step": 472 }, { "epoch": 1.8884462151394423, "grad_norm": 0.7012404799461365, "learning_rate": 9.456197341392932e-06, "loss": 0.9402192831039429, "step": 474 }, { "epoch": 1.8964143426294822, "grad_norm": 0.3745291531085968, "learning_rate": 9.412384601147663e-06, "loss": 0.9166637063026428, "step": 476 }, { "epoch": 1.904382470119522, "grad_norm": 0.30497679114341736, "learning_rate": 9.368521334053973e-06, "loss": 0.812641978263855, "step": 478 }, { "epoch": 1.9123505976095618, "grad_norm": 1.237668514251709, "learning_rate": 9.324609446772233e-06, "loss": 0.5746023058891296, "step": 480 }, { "epoch": 1.9203187250996017, "grad_norm": 0.6451582908630371, "learning_rate": 9.280650848076242e-06, "loss": 0.760349690914154, "step": 482 }, { "epoch": 1.9282868525896415, "grad_norm": 0.288142591714859, "learning_rate": 9.23664744877026e-06, "loss": 1.0170018672943115, "step": 484 }, { "epoch": 1.9362549800796813, "grad_norm": 0.407728374004364, "learning_rate": 9.19260116160596e-06, "loss": 0.9356874227523804, "step": 486 }, { "epoch": 1.9442231075697212, "grad_norm": 0.23040206730365753, "learning_rate": 9.148513901199276e-06, "loss": 1.0043561458587646, "step": 488 }, { "epoch": 1.952191235059761, "grad_norm": 0.2875385880470276, "learning_rate": 9.104387583947168e-06, "loss": 1.023063063621521, "step": 490 }, { "epoch": 1.9601593625498008, "grad_norm": 0.3855358362197876, "learning_rate": 9.060224127944343e-06, "loss": 0.6780633330345154, "step": 492 }, { "epoch": 1.9681274900398407, "grad_norm": 2.7685351371765137, "learning_rate": 9.016025452899853e-06, "loss": 0.7522924542427063, "step": 494 }, { "epoch": 1.9760956175298805, "grad_norm": 0.37701013684272766, "learning_rate": 8.971793480053668e-06, "loss": 0.9699747562408447, "step": 496 }, { "epoch": 1.9840637450199203, "grad_norm": 5.959843635559082, "learning_rate": 8.927530132093156e-06, "loss": 0.8083460927009583, "step": 498 }, { "epoch": 1.9920318725099602, "grad_norm": 0.27620622515678406, "learning_rate": 8.8832373330695e-06, "loss": 1.1264008283615112, "step": 500 }, { "epoch": 2.0, "grad_norm": 0.4609326422214508, "learning_rate": 8.83891700831408e-06, "loss": 0.5836660265922546, "step": 502 }, { "epoch": 2.00796812749004, "grad_norm": 0.4023412764072418, "learning_rate": 8.794571084354764e-06, "loss": 0.47467219829559326, "step": 504 }, { "epoch": 2.0159362549800797, "grad_norm": 1.0555591583251953, "learning_rate": 8.750201488832178e-06, "loss": 0.44583338499069214, "step": 506 }, { "epoch": 2.0239043824701195, "grad_norm": 0.038634952157735825, "learning_rate": 8.705810150415905e-06, "loss": 0.42819151282310486, "step": 508 }, { "epoch": 2.0318725099601593, "grad_norm": 0.418973833322525, "learning_rate": 8.661398998720662e-06, "loss": 0.6882845163345337, "step": 510 }, { "epoch": 2.039840637450199, "grad_norm": 0.32119250297546387, "learning_rate": 8.616969964222403e-06, "loss": 0.5964008569717407, "step": 512 }, { "epoch": 2.047808764940239, "grad_norm": 0.9128912091255188, "learning_rate": 8.572524978174426e-06, "loss": 0.33640968799591064, "step": 514 }, { "epoch": 2.055776892430279, "grad_norm": 0.3310595452785492, "learning_rate": 8.528065972523414e-06, "loss": 0.7787442207336426, "step": 516 }, { "epoch": 2.0637450199203187, "grad_norm": 1.0674067735671997, "learning_rate": 8.483594879825458e-06, "loss": 0.4966733455657959, "step": 518 }, { "epoch": 2.0717131474103585, "grad_norm": 1.0013618469238281, "learning_rate": 8.439113633162048e-06, "loss": 0.6508659720420837, "step": 520 }, { "epoch": 2.0796812749003983, "grad_norm": 0.3296944797039032, "learning_rate": 8.39462416605605e-06, "loss": 0.7466489672660828, "step": 522 }, { "epoch": 2.087649402390438, "grad_norm": 0.7697274684906006, "learning_rate": 8.350128412387663e-06, "loss": 0.754063606262207, "step": 524 }, { "epoch": 2.095617529880478, "grad_norm": 1.24392831325531, "learning_rate": 8.305628306310352e-06, "loss": 0.3448694050312042, "step": 526 }, { "epoch": 2.103585657370518, "grad_norm": 0.42689138650894165, "learning_rate": 8.261125782166764e-06, "loss": 0.6862057447433472, "step": 528 }, { "epoch": 2.1115537848605577, "grad_norm": 0.13302293419837952, "learning_rate": 8.216622774404667e-06, "loss": 0.42651891708374023, "step": 530 }, { "epoch": 2.1195219123505975, "grad_norm": 1.496959924697876, "learning_rate": 8.172121217492846e-06, "loss": 0.2123342901468277, "step": 532 }, { "epoch": 2.1274900398406373, "grad_norm": 0.46577370166778564, "learning_rate": 8.127623045837018e-06, "loss": 0.7218248844146729, "step": 534 }, { "epoch": 2.135458167330677, "grad_norm": 1.5315691232681274, "learning_rate": 8.08313019369575e-06, "loss": 0.610504686832428, "step": 536 }, { "epoch": 2.143426294820717, "grad_norm": 0.7654959559440613, "learning_rate": 8.038644595096385e-06, "loss": 0.6098729372024536, "step": 538 }, { "epoch": 2.151394422310757, "grad_norm": 0.5512191653251648, "learning_rate": 7.994168183750962e-06, "loss": 0.7628468871116638, "step": 540 }, { "epoch": 2.1593625498007967, "grad_norm": 0.3205984830856323, "learning_rate": 7.949702892972157e-06, "loss": 0.6645801067352295, "step": 542 }, { "epoch": 2.1673306772908365, "grad_norm": 0.1639721542596817, "learning_rate": 7.905250655589271e-06, "loss": 0.5173146724700928, "step": 544 }, { "epoch": 2.1752988047808763, "grad_norm": 0.9050138592720032, "learning_rate": 7.860813403864191e-06, "loss": 0.6048539876937866, "step": 546 }, { "epoch": 2.183266932270916, "grad_norm": 0.3164230287075043, "learning_rate": 7.816393069407394e-06, "loss": 0.7414080500602722, "step": 548 }, { "epoch": 2.191235059760956, "grad_norm": 0.24208378791809082, "learning_rate": 7.771991583094e-06, "loss": 0.7846360206604004, "step": 550 }, { "epoch": 2.199203187250996, "grad_norm": 0.35901176929473877, "learning_rate": 7.727610874979838e-06, "loss": 0.48403286933898926, "step": 552 }, { "epoch": 2.2071713147410357, "grad_norm": 0.506497323513031, "learning_rate": 7.683252874217535e-06, "loss": 0.43215182423591614, "step": 554 }, { "epoch": 2.2151394422310755, "grad_norm": 0.31206437945365906, "learning_rate": 7.638919508972672e-06, "loss": 0.5736108422279358, "step": 556 }, { "epoch": 2.2231075697211153, "grad_norm": 1.6536140441894531, "learning_rate": 7.594612706339969e-06, "loss": 0.8024041056632996, "step": 558 }, { "epoch": 2.231075697211155, "grad_norm": 0.21574831008911133, "learning_rate": 7.550334392259514e-06, "loss": 0.8128300905227661, "step": 560 }, { "epoch": 2.239043824701195, "grad_norm": 0.6206152439117432, "learning_rate": 7.506086491433047e-06, "loss": 0.833297610282898, "step": 562 }, { "epoch": 2.247011952191235, "grad_norm": 0.8244820237159729, "learning_rate": 7.461870927240291e-06, "loss": 0.7118552327156067, "step": 564 }, { "epoch": 2.2549800796812747, "grad_norm": 0.2986677587032318, "learning_rate": 7.417689621655362e-06, "loss": 0.5102535486221313, "step": 566 }, { "epoch": 2.2629482071713145, "grad_norm": 0.2273208200931549, "learning_rate": 7.373544495163206e-06, "loss": 0.6329899430274963, "step": 568 }, { "epoch": 2.2709163346613543, "grad_norm": 0.23210270702838898, "learning_rate": 7.329437466676127e-06, "loss": 0.7478767037391663, "step": 570 }, { "epoch": 2.278884462151394, "grad_norm": 0.6402852535247803, "learning_rate": 7.285370453450376e-06, "loss": 0.6049424409866333, "step": 572 }, { "epoch": 2.2868525896414345, "grad_norm": 0.48132938146591187, "learning_rate": 7.2413453710028155e-06, "loss": 0.5839511156082153, "step": 574 }, { "epoch": 2.2948207171314743, "grad_norm": 0.29688745737075806, "learning_rate": 7.197364133027632e-06, "loss": 0.25525566935539246, "step": 576 }, { "epoch": 2.302788844621514, "grad_norm": 0.6722139120101929, "learning_rate": 7.153428651313191e-06, "loss": 0.5150002241134644, "step": 578 }, { "epoch": 2.310756972111554, "grad_norm": 0.4063420593738556, "learning_rate": 7.109540835658898e-06, "loss": 0.5354428887367249, "step": 580 }, { "epoch": 2.318725099601594, "grad_norm": 0.9487866163253784, "learning_rate": 7.065702593792204e-06, "loss": 0.5104379653930664, "step": 582 }, { "epoch": 2.3266932270916336, "grad_norm": 0.2526935040950775, "learning_rate": 7.021915831285661e-06, "loss": 0.6450150609016418, "step": 584 }, { "epoch": 2.3346613545816735, "grad_norm": 0.3406190276145935, "learning_rate": 6.978182451474124e-06, "loss": 0.5338073968887329, "step": 586 }, { "epoch": 2.3426294820717133, "grad_norm": 1.3200128078460693, "learning_rate": 6.934504355371974e-06, "loss": 0.7506805062294006, "step": 588 }, { "epoch": 2.350597609561753, "grad_norm": 0.27950209379196167, "learning_rate": 6.890883441590515e-06, "loss": 0.7645633220672607, "step": 590 }, { "epoch": 2.358565737051793, "grad_norm": 0.29245108366012573, "learning_rate": 6.847321606255432e-06, "loss": 0.7928623557090759, "step": 592 }, { "epoch": 2.366533864541833, "grad_norm": 0.4357150197029114, "learning_rate": 6.803820742924374e-06, "loss": 0.5477173924446106, "step": 594 }, { "epoch": 2.3745019920318726, "grad_norm": 0.3675963878631592, "learning_rate": 6.76038274250464e-06, "loss": 0.8036378622055054, "step": 596 }, { "epoch": 2.3824701195219125, "grad_norm": 0.5962640047073364, "learning_rate": 6.717009493170986e-06, "loss": 0.5513007044792175, "step": 598 }, { "epoch": 2.3904382470119523, "grad_norm": 0.8920307159423828, "learning_rate": 6.673702880283554e-06, "loss": 0.8076795935630798, "step": 600 }, { "epoch": 2.398406374501992, "grad_norm": 0.22857658565044403, "learning_rate": 6.6304647863059155e-06, "loss": 0.7613834142684937, "step": 602 }, { "epoch": 2.406374501992032, "grad_norm": 0.9126567244529724, "learning_rate": 6.587297090723235e-06, "loss": 0.47278252243995667, "step": 604 }, { "epoch": 2.414342629482072, "grad_norm": 0.8739012479782104, "learning_rate": 6.54420166996059e-06, "loss": 0.23272567987442017, "step": 606 }, { "epoch": 2.4223107569721116, "grad_norm": 0.09651335328817368, "learning_rate": 6.501180397301394e-06, "loss": 0.32919982075691223, "step": 608 }, { "epoch": 2.4302788844621515, "grad_norm": 0.1508469432592392, "learning_rate": 6.458235142805968e-06, "loss": 0.6115418672561646, "step": 610 }, { "epoch": 2.4382470119521913, "grad_norm": 0.223999485373497, "learning_rate": 6.415367773230254e-06, "loss": 0.656358540058136, "step": 612 }, { "epoch": 2.446215139442231, "grad_norm": 0.3630542755126953, "learning_rate": 6.372580151944681e-06, "loss": 0.4408586919307709, "step": 614 }, { "epoch": 2.454183266932271, "grad_norm": 0.5294836163520813, "learning_rate": 6.329874138853146e-06, "loss": 0.7569445371627808, "step": 616 }, { "epoch": 2.462151394422311, "grad_norm": 0.7806637287139893, "learning_rate": 6.287251590312181e-06, "loss": 0.5635365843772888, "step": 618 }, { "epoch": 2.4701195219123506, "grad_norm": 0.8465815186500549, "learning_rate": 6.244714359050267e-06, "loss": 0.6494905352592468, "step": 620 }, { "epoch": 2.4780876494023905, "grad_norm": 0.30154383182525635, "learning_rate": 6.20226429408728e-06, "loss": 0.722070038318634, "step": 622 }, { "epoch": 2.4860557768924303, "grad_norm": 0.49222832918167114, "learning_rate": 6.159903240654132e-06, "loss": 0.6191802620887756, "step": 624 }, { "epoch": 2.49402390438247, "grad_norm": 0.4883638322353363, "learning_rate": 6.117633040112559e-06, "loss": 0.3768939673900604, "step": 626 }, { "epoch": 2.50199203187251, "grad_norm": 0.7983854413032532, "learning_rate": 6.0754555298750795e-06, "loss": 0.7864499688148499, "step": 628 }, { "epoch": 2.50996015936255, "grad_norm": 0.3459266126155853, "learning_rate": 6.033372543325119e-06, "loss": 0.3463517427444458, "step": 630 }, { "epoch": 2.5179282868525896, "grad_norm": 1.2402698993682861, "learning_rate": 5.991385909737327e-06, "loss": 0.3873278796672821, "step": 632 }, { "epoch": 2.5258964143426295, "grad_norm": 0.28206056356430054, "learning_rate": 5.949497454198058e-06, "loss": 0.7801554799079895, "step": 634 }, { "epoch": 2.5338645418326693, "grad_norm": 0.5089584589004517, "learning_rate": 5.907708997526031e-06, "loss": 0.7173982262611389, "step": 636 }, { "epoch": 2.541832669322709, "grad_norm": 0.7955684065818787, "learning_rate": 5.86602235619319e-06, "loss": 0.9195908904075623, "step": 638 }, { "epoch": 2.549800796812749, "grad_norm": 0.17236770689487457, "learning_rate": 5.824439342245739e-06, "loss": 0.40686023235321045, "step": 640 }, { "epoch": 2.557768924302789, "grad_norm": 0.4617612063884735, "learning_rate": 5.782961763225388e-06, "loss": 0.7664303183555603, "step": 642 }, { "epoch": 2.5657370517928286, "grad_norm": 0.2930012345314026, "learning_rate": 5.741591422090765e-06, "loss": 0.7867609858512878, "step": 644 }, { "epoch": 2.5737051792828685, "grad_norm": 0.436357706785202, "learning_rate": 5.70033011713905e-06, "loss": 0.5984311699867249, "step": 646 }, { "epoch": 2.5816733067729083, "grad_norm": 0.40557265281677246, "learning_rate": 5.659179641927816e-06, "loss": 0.7649792432785034, "step": 648 }, { "epoch": 2.589641434262948, "grad_norm": 0.17836439609527588, "learning_rate": 5.61814178519706e-06, "loss": 0.5768654346466064, "step": 650 }, { "epoch": 2.597609561752988, "grad_norm": 0.40341848134994507, "learning_rate": 5.577218330791436e-06, "loss": 0.5763181447982788, "step": 652 }, { "epoch": 2.605577689243028, "grad_norm": 0.5692223906517029, "learning_rate": 5.536411057582744e-06, "loss": 0.5641070008277893, "step": 654 }, { "epoch": 2.6135458167330676, "grad_norm": 0.29569053649902344, "learning_rate": 5.4957217393925734e-06, "loss": 0.2429419606924057, "step": 656 }, { "epoch": 2.6215139442231075, "grad_norm": 0.3552258610725403, "learning_rate": 5.4551521449152216e-06, "loss": 0.42948848009109497, "step": 658 }, { "epoch": 2.6294820717131473, "grad_norm": 0.41975289583206177, "learning_rate": 5.4147040376408e-06, "loss": 0.7414237260818481, "step": 660 }, { "epoch": 2.637450199203187, "grad_norm": 0.8263479471206665, "learning_rate": 5.37437917577858e-06, "loss": 0.6220693588256836, "step": 662 }, { "epoch": 2.645418326693227, "grad_norm": 1.41019868850708, "learning_rate": 5.334179312180574e-06, "loss": 0.4769461154937744, "step": 664 }, { "epoch": 2.653386454183267, "grad_norm": 9.828413009643555, "learning_rate": 5.2941061942653315e-06, "loss": 0.7357695698738098, "step": 666 }, { "epoch": 2.6613545816733066, "grad_norm": 0.20442984998226166, "learning_rate": 5.254161563941981e-06, "loss": 0.545133650302887, "step": 668 }, { "epoch": 2.6693227091633465, "grad_norm": 1.070529818534851, "learning_rate": 5.2143471575345295e-06, "loss": 0.5713125467300415, "step": 670 }, { "epoch": 2.6772908366533863, "grad_norm": 0.08597006648778915, "learning_rate": 5.174664705706371e-06, "loss": 0.2371898740530014, "step": 672 }, { "epoch": 2.685258964143426, "grad_norm": 0.2467171996831894, "learning_rate": 5.135115933385058e-06, "loss": 0.7705000638961792, "step": 674 }, { "epoch": 2.6932270916334664, "grad_norm": 1.5602085590362549, "learning_rate": 5.0957025596873256e-06, "loss": 0.5420997142791748, "step": 676 }, { "epoch": 2.7011952191235062, "grad_norm": 0.6235253810882568, "learning_rate": 5.0564262978443745e-06, "loss": 0.4899404048919678, "step": 678 }, { "epoch": 2.709163346613546, "grad_norm": 0.2874850332736969, "learning_rate": 5.017288855127377e-06, "loss": 0.778532862663269, "step": 680 }, { "epoch": 2.717131474103586, "grad_norm": 0.26746895909309387, "learning_rate": 4.978291932773289e-06, "loss": 0.7769652605056763, "step": 682 }, { "epoch": 2.7250996015936257, "grad_norm": 0.25973984599113464, "learning_rate": 4.9394372259108886e-06, "loss": 0.5638492107391357, "step": 684 }, { "epoch": 2.7330677290836656, "grad_norm": 0.8309025168418884, "learning_rate": 4.9007264234870805e-06, "loss": 0.41929128766059875, "step": 686 }, { "epoch": 2.7410358565737054, "grad_norm": 0.3012772798538208, "learning_rate": 4.862161208193505e-06, "loss": 0.7767641544342041, "step": 688 }, { "epoch": 2.7490039840637452, "grad_norm": 0.35578370094299316, "learning_rate": 4.823743256393377e-06, "loss": 0.47287169098854065, "step": 690 }, { "epoch": 2.756972111553785, "grad_norm": 0.49327176809310913, "learning_rate": 4.785474238048626e-06, "loss": 0.8931385278701782, "step": 692 }, { "epoch": 2.764940239043825, "grad_norm": 1.3697088956832886, "learning_rate": 4.747355816647293e-06, "loss": 0.6319751143455505, "step": 694 }, { "epoch": 2.7729083665338647, "grad_norm": 1.342233657836914, "learning_rate": 4.709389649131235e-06, "loss": 0.4150761365890503, "step": 696 }, { "epoch": 2.7808764940239046, "grad_norm": 0.27556970715522766, "learning_rate": 4.6715773858241e-06, "loss": 0.8045108318328857, "step": 698 }, { "epoch": 2.7888446215139444, "grad_norm": 0.31476858258247375, "learning_rate": 4.63392067035958e-06, "loss": 0.8101509213447571, "step": 700 }, { "epoch": 2.7968127490039842, "grad_norm": 0.5621429681777954, "learning_rate": 4.596421139609977e-06, "loss": 0.4465515911579132, "step": 702 }, { "epoch": 2.804780876494024, "grad_norm": 0.8817136287689209, "learning_rate": 4.5590804236150365e-06, "loss": 0.9612689018249512, "step": 704 }, { "epoch": 2.812749003984064, "grad_norm": 0.19133038818836212, "learning_rate": 4.521900145511112e-06, "loss": 0.8152596950531006, "step": 706 }, { "epoch": 2.8207171314741037, "grad_norm": 0.4524690508842468, "learning_rate": 4.484881921460591e-06, "loss": 0.8935415744781494, "step": 708 }, { "epoch": 2.8286852589641436, "grad_norm": 0.24354888498783112, "learning_rate": 4.4480273605816556e-06, "loss": 0.4386708736419678, "step": 710 }, { "epoch": 2.8366533864541834, "grad_norm": 0.2424662858247757, "learning_rate": 4.411338064878337e-06, "loss": 0.8338403701782227, "step": 712 }, { "epoch": 2.8446215139442232, "grad_norm": 0.319381445646286, "learning_rate": 4.374815629170861e-06, "loss": 0.5186902284622192, "step": 714 }, { "epoch": 2.852589641434263, "grad_norm": 0.2536839246749878, "learning_rate": 4.338461641026351e-06, "loss": 0.769604504108429, "step": 716 }, { "epoch": 2.860557768924303, "grad_norm": 0.8778960108757019, "learning_rate": 4.302277680689801e-06, "loss": 0.6171420216560364, "step": 718 }, { "epoch": 2.8685258964143427, "grad_norm": 0.39766034483909607, "learning_rate": 4.2662653210153965e-06, "loss": 0.5202685594558716, "step": 720 }, { "epoch": 2.8764940239043826, "grad_norm": 0.8559178113937378, "learning_rate": 4.23042612739813e-06, "loss": 0.4717506766319275, "step": 722 }, { "epoch": 2.8844621513944224, "grad_norm": 0.3448426127433777, "learning_rate": 4.194761657705765e-06, "loss": 0.5054087042808533, "step": 724 }, { "epoch": 2.8924302788844622, "grad_norm": 0.29262322187423706, "learning_rate": 4.159273462211129e-06, "loss": 0.7536461353302002, "step": 726 }, { "epoch": 2.900398406374502, "grad_norm": 0.4943152964115143, "learning_rate": 4.123963083524702e-06, "loss": 0.43974122405052185, "step": 728 }, { "epoch": 2.908366533864542, "grad_norm": 0.24242062866687775, "learning_rate": 4.0888320565275854e-06, "loss": 0.7488172650337219, "step": 730 }, { "epoch": 2.9163346613545817, "grad_norm": 0.6715952754020691, "learning_rate": 4.053881908304764e-06, "loss": 0.2420373111963272, "step": 732 }, { "epoch": 2.9243027888446216, "grad_norm": 0.2856823205947876, "learning_rate": 4.019114158078742e-06, "loss": 0.8018136620521545, "step": 734 }, { "epoch": 2.9322709163346614, "grad_norm": 0.7715031504631042, "learning_rate": 3.984530317143495e-06, "loss": 0.41188791394233704, "step": 736 }, { "epoch": 2.9402390438247012, "grad_norm": 1.3740425109863281, "learning_rate": 3.950131888798777e-06, "loss": 0.6634250283241272, "step": 738 }, { "epoch": 2.948207171314741, "grad_norm": 0.7085353136062622, "learning_rate": 3.915920368284786e-06, "loss": 0.8047435283660889, "step": 740 }, { "epoch": 2.956175298804781, "grad_norm": 1.6132349967956543, "learning_rate": 3.881897242717153e-06, "loss": 0.2846962511539459, "step": 742 }, { "epoch": 2.9641434262948207, "grad_norm": 0.3325771987438202, "learning_rate": 3.848063991022304e-06, "loss": 0.679719865322113, "step": 744 }, { "epoch": 2.9721115537848606, "grad_norm": 0.3333672881126404, "learning_rate": 3.814422083873181e-06, "loss": 0.716017484664917, "step": 746 }, { "epoch": 2.9800796812749004, "grad_norm": 0.31956031918525696, "learning_rate": 3.7809729836253126e-06, "loss": 0.44896891713142395, "step": 748 }, { "epoch": 2.9880478087649402, "grad_norm": 0.14244171977043152, "learning_rate": 3.7477181442532373e-06, "loss": 0.11532896757125854, "step": 750 }, { "epoch": 2.99601593625498, "grad_norm": 0.196710005402565, "learning_rate": 3.7146590112873117e-06, "loss": 0.7710368633270264, "step": 752 }, { "epoch": 3.00398406374502, "grad_norm": 0.2597305178642273, "learning_rate": 3.6817970217508766e-06, "loss": 0.37589359283447266, "step": 754 }, { "epoch": 3.0119521912350598, "grad_norm": 0.36714112758636475, "learning_rate": 3.649133604097784e-06, "loss": 0.34749507904052734, "step": 756 }, { "epoch": 3.0199203187250996, "grad_norm": 2.629531145095825, "learning_rate": 3.616670178150316e-06, "loss": 0.18874035775661469, "step": 758 }, { "epoch": 3.0278884462151394, "grad_norm": 0.3082272410392761, "learning_rate": 3.5844081550374545e-06, "loss": 0.37505829334259033, "step": 760 }, { "epoch": 3.0358565737051793, "grad_norm": 0.6151975989341736, "learning_rate": 3.5523489371335502e-06, "loss": 0.3742624819278717, "step": 762 }, { "epoch": 3.043824701195219, "grad_norm": 0.3428267538547516, "learning_rate": 3.5204939179973634e-06, "loss": 0.4816422462463379, "step": 764 }, { "epoch": 3.051792828685259, "grad_norm": 0.6483787894248962, "learning_rate": 3.488844482311489e-06, "loss": 0.16634498536586761, "step": 766 }, { "epoch": 3.0597609561752988, "grad_norm": 0.5806704163551331, "learning_rate": 3.457402005822163e-06, "loss": 0.31581252813339233, "step": 768 }, { "epoch": 3.0677290836653386, "grad_norm": 0.3666588068008423, "learning_rate": 3.4261678552794615e-06, "loss": 0.3485649824142456, "step": 770 }, { "epoch": 3.0756972111553784, "grad_norm": 0.3737334609031677, "learning_rate": 3.39514338837789e-06, "loss": 0.5422434210777283, "step": 772 }, { "epoch": 3.0836653386454183, "grad_norm": 1.312560796737671, "learning_rate": 3.364329953697377e-06, "loss": 0.5372627973556519, "step": 774 }, { "epoch": 3.091633466135458, "grad_norm": 0.025555025786161423, "learning_rate": 3.3337288906446356e-06, "loss": 0.30303874611854553, "step": 776 }, { "epoch": 3.099601593625498, "grad_norm": 0.36579927802085876, "learning_rate": 3.303341529394961e-06, "loss": 0.3074573278427124, "step": 778 }, { "epoch": 3.1075697211155378, "grad_norm": 0.36329302191734314, "learning_rate": 3.2731691908343907e-06, "loss": 0.4981156885623932, "step": 780 }, { "epoch": 3.1155378486055776, "grad_norm": 0.08173166960477829, "learning_rate": 3.2432131865023065e-06, "loss": 0.160829097032547, "step": 782 }, { "epoch": 3.1235059760956174, "grad_norm": 0.3885779082775116, "learning_rate": 3.2134748185344098e-06, "loss": 0.5554381608963013, "step": 784 }, { "epoch": 3.1314741035856573, "grad_norm": 0.317030668258667, "learning_rate": 3.1839553796061266e-06, "loss": 0.45913565158843994, "step": 786 }, { "epoch": 3.139442231075697, "grad_norm": 0.6573988795280457, "learning_rate": 3.1546561528764227e-06, "loss": 0.32907965779304504, "step": 788 }, { "epoch": 3.147410358565737, "grad_norm": 0.35516512393951416, "learning_rate": 3.1255784119320064e-06, "loss": 0.25920620560646057, "step": 790 }, { "epoch": 3.1553784860557768, "grad_norm": 0.789368748664856, "learning_rate": 3.0967234207319946e-06, "loss": 0.3322998285293579, "step": 792 }, { "epoch": 3.1633466135458166, "grad_norm": 0.08553847670555115, "learning_rate": 3.0680924335529536e-06, "loss": 0.20808134973049164, "step": 794 }, { "epoch": 3.1713147410358564, "grad_norm": 1.4314020872116089, "learning_rate": 3.0396866949343833e-06, "loss": 0.33690834045410156, "step": 796 }, { "epoch": 3.1792828685258963, "grad_norm": 0.279748797416687, "learning_rate": 3.0115074396246176e-06, "loss": 0.36214491724967957, "step": 798 }, { "epoch": 3.187250996015936, "grad_norm": 0.6961238384246826, "learning_rate": 2.9835558925271495e-06, "loss": 0.501541018486023, "step": 800 }, { "epoch": 3.195219123505976, "grad_norm": 0.18416091799736023, "learning_rate": 2.955833268647395e-06, "loss": 0.3577136993408203, "step": 802 }, { "epoch": 3.2031872509960158, "grad_norm": 0.6885810494422913, "learning_rate": 2.9283407730398702e-06, "loss": 0.29195672273635864, "step": 804 }, { "epoch": 3.2111553784860556, "grad_norm": 1.7156380414962769, "learning_rate": 2.901079600755817e-06, "loss": 0.5778890252113342, "step": 806 }, { "epoch": 3.2191235059760954, "grad_norm": 0.6983752846717834, "learning_rate": 2.8740509367912457e-06, "loss": 0.18633845448493958, "step": 808 }, { "epoch": 3.2270916334661353, "grad_norm": 0.6704440712928772, "learning_rate": 2.8472559560354404e-06, "loss": 0.3643829822540283, "step": 810 }, { "epoch": 3.235059760956175, "grad_norm": 1.632941484451294, "learning_rate": 2.820695823219873e-06, "loss": 0.2959984838962555, "step": 812 }, { "epoch": 3.243027888446215, "grad_norm": 0.39763104915618896, "learning_rate": 2.794371692867585e-06, "loss": 0.44851499795913696, "step": 814 }, { "epoch": 3.2509960159362548, "grad_norm": 1.5863844156265259, "learning_rate": 2.768284709243002e-06, "loss": 0.13297411799430847, "step": 816 }, { "epoch": 3.2589641434262946, "grad_norm": 0.5324887633323669, "learning_rate": 2.7424360063021855e-06, "loss": 0.5013939142227173, "step": 818 }, { "epoch": 3.2669322709163344, "grad_norm": 2.0388095378875732, "learning_rate": 2.7168267076435485e-06, "loss": 0.2653783857822418, "step": 820 }, { "epoch": 3.2749003984063743, "grad_norm": 0.6833744049072266, "learning_rate": 2.69145792645902e-06, "loss": 0.41534146666526794, "step": 822 }, { "epoch": 3.2828685258964145, "grad_norm": 0.8634832501411438, "learning_rate": 2.6663307654856407e-06, "loss": 0.3562511205673218, "step": 824 }, { "epoch": 3.2908366533864544, "grad_norm": 0.39872676134109497, "learning_rate": 2.6414463169576492e-06, "loss": 0.4844256043434143, "step": 826 }, { "epoch": 3.298804780876494, "grad_norm": 0.434477299451828, "learning_rate": 2.616805662558985e-06, "loss": 0.6063498854637146, "step": 828 }, { "epoch": 3.306772908366534, "grad_norm": 2.5125367641448975, "learning_rate": 2.5924098733762835e-06, "loss": 0.4092828035354614, "step": 830 }, { "epoch": 3.314741035856574, "grad_norm": 0.5066865682601929, "learning_rate": 2.5682600098523105e-06, "loss": 0.28628939390182495, "step": 832 }, { "epoch": 3.3227091633466137, "grad_norm": 0.6276751160621643, "learning_rate": 2.5443571217398705e-06, "loss": 0.2303668111562729, "step": 834 }, { "epoch": 3.3306772908366535, "grad_norm": 3.6938159465789795, "learning_rate": 2.5207022480561722e-06, "loss": 0.3531423807144165, "step": 836 }, { "epoch": 3.3386454183266934, "grad_norm": 0.4289490580558777, "learning_rate": 2.497296417037664e-06, "loss": 0.4265778660774231, "step": 838 }, { "epoch": 3.346613545816733, "grad_norm": 0.859740674495697, "learning_rate": 2.474140646095346e-06, "loss": 0.11164703965187073, "step": 840 }, { "epoch": 3.354581673306773, "grad_norm": 0.573935866355896, "learning_rate": 2.451235941770535e-06, "loss": 0.36163708567619324, "step": 842 }, { "epoch": 3.362549800796813, "grad_norm": 0.38408342003822327, "learning_rate": 2.428583299691118e-06, "loss": 0.4686431884765625, "step": 844 }, { "epoch": 3.3705179282868527, "grad_norm": 0.3920894265174866, "learning_rate": 2.4061837045282717e-06, "loss": 0.544544517993927, "step": 846 }, { "epoch": 3.3784860557768925, "grad_norm": 0.2612384259700775, "learning_rate": 2.3840381299536584e-06, "loss": 0.4954265058040619, "step": 848 }, { "epoch": 3.3864541832669324, "grad_norm": 0.9370325207710266, "learning_rate": 2.36214753859711e-06, "loss": 0.45011717081069946, "step": 850 }, { "epoch": 3.394422310756972, "grad_norm": 0.42760828137397766, "learning_rate": 2.3405128820047716e-06, "loss": 0.4825401306152344, "step": 852 }, { "epoch": 3.402390438247012, "grad_norm": 0.4402712285518646, "learning_rate": 2.3191351005977556e-06, "loss": 0.31368541717529297, "step": 854 }, { "epoch": 3.410358565737052, "grad_norm": 0.4192966818809509, "learning_rate": 2.298015123631246e-06, "loss": 0.4709932208061218, "step": 856 }, { "epoch": 3.4183266932270917, "grad_norm": 0.5742263197898865, "learning_rate": 2.2771538691541196e-06, "loss": 0.439094603061676, "step": 858 }, { "epoch": 3.4262948207171315, "grad_norm": 0.41355282068252563, "learning_rate": 2.256552243969029e-06, "loss": 0.5255416035652161, "step": 860 }, { "epoch": 3.4342629482071714, "grad_norm": 0.2783606946468353, "learning_rate": 2.2362111435929956e-06, "loss": 0.3297284245491028, "step": 862 }, { "epoch": 3.442231075697211, "grad_norm": 1.933573842048645, "learning_rate": 2.2161314522184778e-06, "loss": 0.4290310740470886, "step": 864 }, { "epoch": 3.450199203187251, "grad_norm": 0.4777624309062958, "learning_rate": 2.1963140426749277e-06, "loss": 0.5890864729881287, "step": 866 }, { "epoch": 3.458167330677291, "grad_norm": 0.30510279536247253, "learning_rate": 2.176759776390871e-06, "loss": 0.4166991114616394, "step": 868 }, { "epoch": 3.4661354581673307, "grad_norm": 0.32527998089790344, "learning_rate": 2.1574695033564447e-06, "loss": 0.343144029378891, "step": 870 }, { "epoch": 3.4741035856573705, "grad_norm": 0.06616739183664322, "learning_rate": 2.1384440620864597e-06, "loss": 0.32270875573158264, "step": 872 }, { "epoch": 3.4820717131474104, "grad_norm": 0.08085694909095764, "learning_rate": 2.1196842795839454e-06, "loss": 0.28370317816734314, "step": 874 }, { "epoch": 3.49003984063745, "grad_norm": 0.543026328086853, "learning_rate": 2.101190971304202e-06, "loss": 0.2624368369579315, "step": 876 }, { "epoch": 3.49800796812749, "grad_norm": 0.3456118702888489, "learning_rate": 2.0829649411193613e-06, "loss": 0.3216794431209564, "step": 878 }, { "epoch": 3.50597609561753, "grad_norm": 0.2047196626663208, "learning_rate": 2.0650069812834345e-06, "loss": 0.2091296762228012, "step": 880 }, { "epoch": 3.5139442231075697, "grad_norm": 0.286630779504776, "learning_rate": 2.0473178723978813e-06, "loss": 0.20823848247528076, "step": 882 }, { "epoch": 3.5219123505976095, "grad_norm": 0.5212514400482178, "learning_rate": 2.0298983833776717e-06, "loss": 0.08361003547906876, "step": 884 }, { "epoch": 3.5298804780876494, "grad_norm": 0.5501599311828613, "learning_rate": 2.01274927141787e-06, "loss": 0.4509070813655853, "step": 886 }, { "epoch": 3.537848605577689, "grad_norm": 0.39404916763305664, "learning_rate": 1.995871281960715e-06, "loss": 0.44182029366493225, "step": 888 }, { "epoch": 3.545816733067729, "grad_norm": 0.39789876341819763, "learning_rate": 1.9792651486632213e-06, "loss": 0.27486419677734375, "step": 890 }, { "epoch": 3.553784860557769, "grad_norm": 1.1401015520095825, "learning_rate": 1.962931593365286e-06, "loss": 0.3863001763820648, "step": 892 }, { "epoch": 3.5617529880478087, "grad_norm": 0.37662816047668457, "learning_rate": 1.946871326058308e-06, "loss": 0.6621991395950317, "step": 894 }, { "epoch": 3.5697211155378485, "grad_norm": 0.10905114561319351, "learning_rate": 1.9310850448543344e-06, "loss": 0.10537078976631165, "step": 896 }, { "epoch": 3.5776892430278884, "grad_norm": 0.3512639105319977, "learning_rate": 1.915573435955711e-06, "loss": 0.2836357057094574, "step": 898 }, { "epoch": 3.585657370517928, "grad_norm": 0.43090760707855225, "learning_rate": 1.9003371736252472e-06, "loss": 0.15316523611545563, "step": 900 }, { "epoch": 3.593625498007968, "grad_norm": 0.6001132726669312, "learning_rate": 1.8853769201569208e-06, "loss": 0.09330594539642334, "step": 902 }, { "epoch": 3.601593625498008, "grad_norm": 0.5064031481742859, "learning_rate": 1.8706933258470757e-06, "loss": 0.4762483835220337, "step": 904 }, { "epoch": 3.6095617529880477, "grad_norm": 0.3355056643486023, "learning_rate": 1.8562870289661659e-06, "loss": 0.27884015440940857, "step": 906 }, { "epoch": 3.6175298804780875, "grad_norm": 0.3930132985115051, "learning_rate": 1.8421586557309996e-06, "loss": 0.5141717195510864, "step": 908 }, { "epoch": 3.6254980079681274, "grad_norm": 0.46708425879478455, "learning_rate": 1.8283088202775314e-06, "loss": 0.4603351652622223, "step": 910 }, { "epoch": 3.633466135458167, "grad_norm": 0.037517350167036057, "learning_rate": 1.8147381246341558e-06, "loss": 0.02768601104617119, "step": 912 }, { "epoch": 3.641434262948207, "grad_norm": 0.3065638542175293, "learning_rate": 1.8014471586955424e-06, "loss": 0.3721899390220642, "step": 914 }, { "epoch": 3.649402390438247, "grad_norm": 0.43989261984825134, "learning_rate": 1.7884365001969967e-06, "loss": 0.43738237023353577, "step": 916 }, { "epoch": 3.6573705179282867, "grad_norm": 0.6677345633506775, "learning_rate": 1.7757067146893425e-06, "loss": 0.0748777762055397, "step": 918 }, { "epoch": 3.6653386454183265, "grad_norm": 0.3021090030670166, "learning_rate": 1.7632583555143435e-06, "loss": 0.5561968684196472, "step": 920 }, { "epoch": 3.6733067729083664, "grad_norm": 0.42820993065834045, "learning_rate": 1.751091963780643e-06, "loss": 0.07096469402313232, "step": 922 }, { "epoch": 3.681274900398406, "grad_norm": 0.8393615484237671, "learning_rate": 1.7392080683402496e-06, "loss": 0.46100661158561707, "step": 924 }, { "epoch": 3.6892430278884465, "grad_norm": 0.6544818878173828, "learning_rate": 1.7276071857655479e-06, "loss": 0.1528330296278, "step": 926 }, { "epoch": 3.6972111553784863, "grad_norm": 0.31229308247566223, "learning_rate": 1.716289820326839e-06, "loss": 0.29350802302360535, "step": 928 }, { "epoch": 3.705179282868526, "grad_norm": 0.1188875362277031, "learning_rate": 1.7052564639704286e-06, "loss": 0.3660446107387543, "step": 930 }, { "epoch": 3.713147410358566, "grad_norm": 0.5841293931007385, "learning_rate": 1.6945075962972356e-06, "loss": 0.45137277245521545, "step": 932 }, { "epoch": 3.721115537848606, "grad_norm": 0.7146270275115967, "learning_rate": 1.6840436845419498e-06, "loss": 0.4348509907722473, "step": 934 }, { "epoch": 3.7290836653386457, "grad_norm": 0.326523095369339, "learning_rate": 1.6738651835527184e-06, "loss": 0.4922831654548645, "step": 936 }, { "epoch": 3.7370517928286855, "grad_norm": 0.8038604855537415, "learning_rate": 1.6639725357713769e-06, "loss": 0.21507446467876434, "step": 938 }, { "epoch": 3.7450199203187253, "grad_norm": 0.36527737975120544, "learning_rate": 1.6543661712142184e-06, "loss": 0.4618900418281555, "step": 940 }, { "epoch": 3.752988047808765, "grad_norm": 0.6607845425605774, "learning_rate": 1.645046507453294e-06, "loss": 0.36659500002861023, "step": 942 }, { "epoch": 3.760956175298805, "grad_norm": 0.421165406703949, "learning_rate": 1.6360139495982712e-06, "loss": 0.28992268443107605, "step": 944 }, { "epoch": 3.768924302788845, "grad_norm": 0.5764026641845703, "learning_rate": 1.6272688902788207e-06, "loss": 0.5770589709281921, "step": 946 }, { "epoch": 3.7768924302788847, "grad_norm": 0.3415585160255432, "learning_rate": 1.6188117096275477e-06, "loss": 0.3260127305984497, "step": 948 }, { "epoch": 3.7848605577689245, "grad_norm": 0.2808169424533844, "learning_rate": 1.610642775263468e-06, "loss": 0.5168456435203552, "step": 950 }, { "epoch": 3.7928286852589643, "grad_norm": 0.3828094005584717, "learning_rate": 1.6027624422760312e-06, "loss": 0.5155588388442993, "step": 952 }, { "epoch": 3.800796812749004, "grad_norm": 0.616820216178894, "learning_rate": 1.5951710532096857e-06, "loss": 0.2644089460372925, "step": 954 }, { "epoch": 3.808764940239044, "grad_norm": 0.531527042388916, "learning_rate": 1.5878689380489846e-06, "loss": 0.38867413997650146, "step": 956 }, { "epoch": 3.816733067729084, "grad_norm": 0.9145589470863342, "learning_rate": 1.580856414204247e-06, "loss": 0.3010810315608978, "step": 958 }, { "epoch": 3.8247011952191237, "grad_norm": 1.3794469833374023, "learning_rate": 1.5741337864977558e-06, "loss": 0.21975839138031006, "step": 960 }, { "epoch": 3.8326693227091635, "grad_norm": 0.11370343714952469, "learning_rate": 1.567701347150513e-06, "loss": 0.3248888850212097, "step": 962 }, { "epoch": 3.8406374501992033, "grad_norm": 0.4226270616054535, "learning_rate": 1.5615593757695319e-06, "loss": 0.6149446964263916, "step": 964 }, { "epoch": 3.848605577689243, "grad_norm": 0.4638464152812958, "learning_rate": 1.555708139335687e-06, "loss": 0.21839484572410583, "step": 966 }, { "epoch": 3.856573705179283, "grad_norm": 0.28881698846817017, "learning_rate": 1.5501478921921071e-06, "loss": 0.3512417674064636, "step": 968 }, { "epoch": 3.864541832669323, "grad_norm": 0.8101674318313599, "learning_rate": 1.54487887603312e-06, "loss": 0.36906710267066956, "step": 970 }, { "epoch": 3.8725099601593627, "grad_norm": 0.10503221303224564, "learning_rate": 1.5399013198937452e-06, "loss": 0.3287951946258545, "step": 972 }, { "epoch": 3.8804780876494025, "grad_norm": 0.1844586879014969, "learning_rate": 1.5352154401397418e-06, "loss": 0.32523638010025024, "step": 974 }, { "epoch": 3.8884462151394423, "grad_norm": 0.38847818970680237, "learning_rate": 1.5308214404581968e-06, "loss": 0.5000988245010376, "step": 976 }, { "epoch": 3.896414342629482, "grad_norm": 0.4692430794239044, "learning_rate": 1.5267195118486794e-06, "loss": 0.2642746567726135, "step": 978 }, { "epoch": 3.904382470119522, "grad_norm": 0.2286023050546646, "learning_rate": 1.522909832614931e-06, "loss": 0.2238185554742813, "step": 980 }, { "epoch": 3.912350597609562, "grad_norm": 0.3902443051338196, "learning_rate": 1.5193925683571211e-06, "loss": 0.19102515280246735, "step": 982 }, { "epoch": 3.9203187250996017, "grad_norm": 0.4479309320449829, "learning_rate": 1.516167871964643e-06, "loss": 0.5202714800834656, "step": 984 }, { "epoch": 3.9282868525896415, "grad_norm": 0.5768634080886841, "learning_rate": 1.5132358836094728e-06, "loss": 0.16960352659225464, "step": 986 }, { "epoch": 3.9362549800796813, "grad_norm": 3.171630620956421, "learning_rate": 1.510596730740074e-06, "loss": 0.4183100461959839, "step": 988 }, { "epoch": 3.944223107569721, "grad_norm": 0.10318754613399506, "learning_rate": 1.508250528075857e-06, "loss": 0.3005601763725281, "step": 990 }, { "epoch": 3.952191235059761, "grad_norm": 0.5118750929832458, "learning_rate": 1.5061973776021949e-06, "loss": 0.4696381390094757, "step": 992 }, { "epoch": 3.960159362549801, "grad_norm": 0.3834742307662964, "learning_rate": 1.504437368565988e-06, "loss": 0.1685551553964615, "step": 994 }, { "epoch": 3.9681274900398407, "grad_norm": 0.34701693058013916, "learning_rate": 1.502970577471785e-06, "loss": 0.4004333019256592, "step": 996 }, { "epoch": 3.9760956175298805, "grad_norm": 0.14854289591312408, "learning_rate": 1.5017970680784587e-06, "loss": 0.32395121455192566, "step": 998 }, { "epoch": 3.9840637450199203, "grad_norm": 0.3810655474662781, "learning_rate": 1.5009168913964322e-06, "loss": 0.23012831807136536, "step": 1000 }, { "epoch": 3.99203187250996, "grad_norm": 0.3381198048591614, "learning_rate": 1.5003300856854642e-06, "loss": 0.48588454723358154, "step": 1002 }, { "epoch": 4.0, "grad_norm": 0.07251780480146408, "learning_rate": 1.5000366764529846e-06, "loss": 0.11095666140317917, "step": 1004 }, { "epoch": 4.0, "step": 1004, "total_flos": 4.038502240003031e+18, "train_loss": 0.7418717171225059, "train_runtime": 10936.9117, "train_samples_per_second": 5.508, "train_steps_per_second": 0.092 } ], "logging_steps": 2, "max_steps": 1004, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.038502240003031e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }