| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 124205, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020128014170121975, | |
| "grad_norm": 4.729732990264893, | |
| "learning_rate": 2.994e-05, | |
| "loss": 4.6998, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.04025602834024395, | |
| "grad_norm": 5.319427967071533, | |
| "learning_rate": 2.9878986298047775e-05, | |
| "loss": 3.2594, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06038404251036593, | |
| "grad_norm": 6.513505458831787, | |
| "learning_rate": 2.975773008366679e-05, | |
| "loss": 3.0934, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0805120566804879, | |
| "grad_norm": 7.240413665771484, | |
| "learning_rate": 2.9636473869285803e-05, | |
| "loss": 3.0216, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.10064007085060987, | |
| "grad_norm": 7.327127933502197, | |
| "learning_rate": 2.9515217654904812e-05, | |
| "loss": 2.8993, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.12076808502073186, | |
| "grad_norm": 7.550909519195557, | |
| "learning_rate": 2.9393961440523828e-05, | |
| "loss": 2.7834, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.14089609919085383, | |
| "grad_norm": 9.492281913757324, | |
| "learning_rate": 2.927270522614284e-05, | |
| "loss": 2.6516, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.1610241133609758, | |
| "grad_norm": 13.708161354064941, | |
| "learning_rate": 2.9151449011761853e-05, | |
| "loss": 2.5647, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.18115212753109777, | |
| "grad_norm": 8.566939353942871, | |
| "learning_rate": 2.903019279738087e-05, | |
| "loss": 2.4627, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.20128014170121974, | |
| "grad_norm": 13.398711204528809, | |
| "learning_rate": 2.890893658299988e-05, | |
| "loss": 2.427, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.22140815587134174, | |
| "grad_norm": 9.740044593811035, | |
| "learning_rate": 2.878768036861889e-05, | |
| "loss": 2.3102, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.24153617004146372, | |
| "grad_norm": 13.687227249145508, | |
| "learning_rate": 2.8666424154237906e-05, | |
| "loss": 2.2774, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.26166418421158566, | |
| "grad_norm": 10.618597984313965, | |
| "learning_rate": 2.854516793985692e-05, | |
| "loss": 2.2404, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.28179219838170766, | |
| "grad_norm": 9.622278213500977, | |
| "learning_rate": 2.842391172547593e-05, | |
| "loss": 2.1845, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.30192021255182966, | |
| "grad_norm": 12.272369384765625, | |
| "learning_rate": 2.8302655511094947e-05, | |
| "loss": 2.1741, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.3220482267219516, | |
| "grad_norm": 8.921002388000488, | |
| "learning_rate": 2.818139929671396e-05, | |
| "loss": 2.1317, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.3421762408920736, | |
| "grad_norm": 11.55328369140625, | |
| "learning_rate": 2.8060143082332968e-05, | |
| "loss": 2.1168, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.36230425506219555, | |
| "grad_norm": 9.84124755859375, | |
| "learning_rate": 2.7938886867951984e-05, | |
| "loss": 2.1129, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.38243226923231755, | |
| "grad_norm": 9.949904441833496, | |
| "learning_rate": 2.7817630653570996e-05, | |
| "loss": 2.0368, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.4025602834024395, | |
| "grad_norm": 15.386507034301758, | |
| "learning_rate": 2.769637443919001e-05, | |
| "loss": 2.0455, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.4226882975725615, | |
| "grad_norm": 10.55031967163086, | |
| "learning_rate": 2.757511822480902e-05, | |
| "loss": 2.0155, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4428163117426835, | |
| "grad_norm": 13.076041221618652, | |
| "learning_rate": 2.7453862010428037e-05, | |
| "loss": 1.9655, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.46294432591280543, | |
| "grad_norm": 9.13262939453125, | |
| "learning_rate": 2.7332605796047046e-05, | |
| "loss": 1.9698, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.48307234008292743, | |
| "grad_norm": 14.231966972351074, | |
| "learning_rate": 2.721134958166606e-05, | |
| "loss": 1.9137, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5032003542530494, | |
| "grad_norm": 13.586121559143066, | |
| "learning_rate": 2.7090093367285074e-05, | |
| "loss": 1.8677, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.5233283684231713, | |
| "grad_norm": 16.752092361450195, | |
| "learning_rate": 2.6968837152904087e-05, | |
| "loss": 1.8927, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5434563825932933, | |
| "grad_norm": 11.639701843261719, | |
| "learning_rate": 2.68475809385231e-05, | |
| "loss": 1.8398, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.5635843967634153, | |
| "grad_norm": 20.035966873168945, | |
| "learning_rate": 2.6726324724142115e-05, | |
| "loss": 1.8741, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.5837124109335373, | |
| "grad_norm": 12.169594764709473, | |
| "learning_rate": 2.6605068509761127e-05, | |
| "loss": 1.8125, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.6038404251036593, | |
| "grad_norm": 14.39586067199707, | |
| "learning_rate": 2.6483812295380136e-05, | |
| "loss": 1.7955, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.6239684392737812, | |
| "grad_norm": 9.263022422790527, | |
| "learning_rate": 2.6362556080999152e-05, | |
| "loss": 1.8026, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.6440964534439032, | |
| "grad_norm": 10.964536666870117, | |
| "learning_rate": 2.6241299866618165e-05, | |
| "loss": 1.7693, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.6642244676140252, | |
| "grad_norm": 17.373477935791016, | |
| "learning_rate": 2.6120043652237177e-05, | |
| "loss": 1.7218, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.6843524817841472, | |
| "grad_norm": 13.868760108947754, | |
| "learning_rate": 2.5998787437856193e-05, | |
| "loss": 1.7308, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.7044804959542692, | |
| "grad_norm": 16.622940063476562, | |
| "learning_rate": 2.5877531223475205e-05, | |
| "loss": 1.7091, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.7246085101243911, | |
| "grad_norm": 9.113160133361816, | |
| "learning_rate": 2.5756275009094214e-05, | |
| "loss": 1.7317, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.7447365242945131, | |
| "grad_norm": 24.89649200439453, | |
| "learning_rate": 2.563501879471323e-05, | |
| "loss": 1.6873, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.7648645384646351, | |
| "grad_norm": 11.15603256225586, | |
| "learning_rate": 2.5513762580332243e-05, | |
| "loss": 1.6772, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.7849925526347571, | |
| "grad_norm": 19.64437484741211, | |
| "learning_rate": 2.5392506365951255e-05, | |
| "loss": 1.6586, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.805120566804879, | |
| "grad_norm": 14.52999496459961, | |
| "learning_rate": 2.527125015157027e-05, | |
| "loss": 1.6505, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.825248580975001, | |
| "grad_norm": 13.304444313049316, | |
| "learning_rate": 2.5149993937189283e-05, | |
| "loss": 1.615, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.845376595145123, | |
| "grad_norm": 14.634563446044922, | |
| "learning_rate": 2.5028737722808292e-05, | |
| "loss": 1.6244, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.865504609315245, | |
| "grad_norm": 12.946802139282227, | |
| "learning_rate": 2.4907481508427308e-05, | |
| "loss": 1.6158, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.885632623485367, | |
| "grad_norm": 11.765786170959473, | |
| "learning_rate": 2.478622529404632e-05, | |
| "loss": 1.5787, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9057606376554889, | |
| "grad_norm": 11.961956024169922, | |
| "learning_rate": 2.4664969079665333e-05, | |
| "loss": 1.5621, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.9258886518256109, | |
| "grad_norm": 13.635610580444336, | |
| "learning_rate": 2.454371286528435e-05, | |
| "loss": 1.5692, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.9460166659957329, | |
| "grad_norm": 13.10095500946045, | |
| "learning_rate": 2.442245665090336e-05, | |
| "loss": 1.5458, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.9661446801658549, | |
| "grad_norm": 11.790682792663574, | |
| "learning_rate": 2.430120043652237e-05, | |
| "loss": 1.5394, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.9862726943359769, | |
| "grad_norm": 11.249995231628418, | |
| "learning_rate": 2.4179944222141386e-05, | |
| "loss": 1.5485, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.0064007085060989, | |
| "grad_norm": 13.755157470703125, | |
| "learning_rate": 2.40586880077604e-05, | |
| "loss": 1.5134, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.0265287226762208, | |
| "grad_norm": 15.988091468811035, | |
| "learning_rate": 2.393743179337941e-05, | |
| "loss": 1.4833, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.0466567368463426, | |
| "grad_norm": 11.56142807006836, | |
| "learning_rate": 2.3816175578998427e-05, | |
| "loss": 1.4523, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.0667847510164647, | |
| "grad_norm": 10.849580764770508, | |
| "learning_rate": 2.369491936461744e-05, | |
| "loss": 1.4571, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.0869127651865866, | |
| "grad_norm": 17.24896240234375, | |
| "learning_rate": 2.3573663150236448e-05, | |
| "loss": 1.4886, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.1070407793567087, | |
| "grad_norm": 12.933219909667969, | |
| "learning_rate": 2.345240693585546e-05, | |
| "loss": 1.4485, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.1271687935268306, | |
| "grad_norm": 12.675749778747559, | |
| "learning_rate": 2.3331150721474476e-05, | |
| "loss": 1.4161, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.1472968076969525, | |
| "grad_norm": 21.270776748657227, | |
| "learning_rate": 2.320989450709349e-05, | |
| "loss": 1.4371, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.1674248218670746, | |
| "grad_norm": 17.078645706176758, | |
| "learning_rate": 2.30886382927125e-05, | |
| "loss": 1.3918, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.1875528360371965, | |
| "grad_norm": 23.501638412475586, | |
| "learning_rate": 2.2967382078331517e-05, | |
| "loss": 1.4278, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.2076808502073186, | |
| "grad_norm": 12.903084754943848, | |
| "learning_rate": 2.284612586395053e-05, | |
| "loss": 1.3752, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.2278088643774405, | |
| "grad_norm": 15.732855796813965, | |
| "learning_rate": 2.272486964956954e-05, | |
| "loss": 1.3931, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.2479368785475624, | |
| "grad_norm": 11.898987770080566, | |
| "learning_rate": 2.2603613435188554e-05, | |
| "loss": 1.3587, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.2680648927176845, | |
| "grad_norm": 18.970348358154297, | |
| "learning_rate": 2.2482357220807567e-05, | |
| "loss": 1.3706, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.2881929068878064, | |
| "grad_norm": 13.289978981018066, | |
| "learning_rate": 2.236110100642658e-05, | |
| "loss": 1.3378, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.3083209210579283, | |
| "grad_norm": 25.023792266845703, | |
| "learning_rate": 2.2239844792045595e-05, | |
| "loss": 1.3174, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.3284489352280504, | |
| "grad_norm": 12.036040306091309, | |
| "learning_rate": 2.2118588577664607e-05, | |
| "loss": 1.3293, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.3485769493981723, | |
| "grad_norm": 12.723782539367676, | |
| "learning_rate": 2.1997332363283616e-05, | |
| "loss": 1.3287, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.3687049635682944, | |
| "grad_norm": 11.143896102905273, | |
| "learning_rate": 2.1876076148902632e-05, | |
| "loss": 1.3125, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.3888329777384163, | |
| "grad_norm": 12.347333908081055, | |
| "learning_rate": 2.1754819934521645e-05, | |
| "loss": 1.3472, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.4089609919085384, | |
| "grad_norm": 20.10418701171875, | |
| "learning_rate": 2.1633563720140657e-05, | |
| "loss": 1.3052, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.4290890060786603, | |
| "grad_norm": 17.1345157623291, | |
| "learning_rate": 2.1512307505759673e-05, | |
| "loss": 1.2828, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.4492170202487822, | |
| "grad_norm": 17.451622009277344, | |
| "learning_rate": 2.1391051291378685e-05, | |
| "loss": 1.2787, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.4693450344189043, | |
| "grad_norm": 19.05263900756836, | |
| "learning_rate": 2.1269795076997694e-05, | |
| "loss": 1.3195, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.4894730485890262, | |
| "grad_norm": 12.999706268310547, | |
| "learning_rate": 2.114853886261671e-05, | |
| "loss": 1.2759, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.509601062759148, | |
| "grad_norm": 12.11323356628418, | |
| "learning_rate": 2.1027282648235722e-05, | |
| "loss": 1.2738, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.5297290769292702, | |
| "grad_norm": 10.93237018585205, | |
| "learning_rate": 2.0906026433854735e-05, | |
| "loss": 1.2742, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.5498570910993923, | |
| "grad_norm": 26.265893936157227, | |
| "learning_rate": 2.078477021947375e-05, | |
| "loss": 1.2193, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.569985105269514, | |
| "grad_norm": 17.12728500366211, | |
| "learning_rate": 2.0663514005092763e-05, | |
| "loss": 1.2355, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.590113119439636, | |
| "grad_norm": 15.962538719177246, | |
| "learning_rate": 2.0542257790711772e-05, | |
| "loss": 1.219, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.6102411336097582, | |
| "grad_norm": 19.7554931640625, | |
| "learning_rate": 2.0421001576330788e-05, | |
| "loss": 1.2393, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.63036914777988, | |
| "grad_norm": 17.81658363342285, | |
| "learning_rate": 2.02997453619498e-05, | |
| "loss": 1.22, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.650497161950002, | |
| "grad_norm": 16.0762882232666, | |
| "learning_rate": 2.0178489147568813e-05, | |
| "loss": 1.2359, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.670625176120124, | |
| "grad_norm": 13.652649879455566, | |
| "learning_rate": 2.005723293318783e-05, | |
| "loss": 1.244, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.690753190290246, | |
| "grad_norm": 8.598692893981934, | |
| "learning_rate": 1.993597671880684e-05, | |
| "loss": 1.2153, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.7108812044603678, | |
| "grad_norm": 15.637930870056152, | |
| "learning_rate": 1.981472050442585e-05, | |
| "loss": 1.1855, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.73100921863049, | |
| "grad_norm": 16.582963943481445, | |
| "learning_rate": 1.9693464290044866e-05, | |
| "loss": 1.1908, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.7511372328006118, | |
| "grad_norm": 16.173324584960938, | |
| "learning_rate": 1.957220807566388e-05, | |
| "loss": 1.1659, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.7712652469707337, | |
| "grad_norm": 15.524099349975586, | |
| "learning_rate": 1.945095186128289e-05, | |
| "loss": 1.1853, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.7913932611408558, | |
| "grad_norm": 11.66182804107666, | |
| "learning_rate": 1.9329695646901903e-05, | |
| "loss": 1.1679, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.811521275310978, | |
| "grad_norm": 10.504340171813965, | |
| "learning_rate": 1.920843943252092e-05, | |
| "loss": 1.155, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.8316492894810998, | |
| "grad_norm": 16.5634708404541, | |
| "learning_rate": 1.908718321813993e-05, | |
| "loss": 1.1544, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.8517773036512217, | |
| "grad_norm": 13.282904624938965, | |
| "learning_rate": 1.896592700375894e-05, | |
| "loss": 1.1892, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.8719053178213438, | |
| "grad_norm": 13.532590866088867, | |
| "learning_rate": 1.8844670789377956e-05, | |
| "loss": 1.1728, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.8920333319914657, | |
| "grad_norm": 15.26899242401123, | |
| "learning_rate": 1.872341457499697e-05, | |
| "loss": 1.1733, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.9121613461615876, | |
| "grad_norm": 14.551050186157227, | |
| "learning_rate": 1.860215836061598e-05, | |
| "loss": 1.156, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.9322893603317097, | |
| "grad_norm": 11.31080436706543, | |
| "learning_rate": 1.8480902146234997e-05, | |
| "loss": 1.1405, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.9524173745018316, | |
| "grad_norm": 19.817716598510742, | |
| "learning_rate": 1.835964593185401e-05, | |
| "loss": 1.1488, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.9725453886719535, | |
| "grad_norm": 13.350114822387695, | |
| "learning_rate": 1.823838971747302e-05, | |
| "loss": 1.1094, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.9926734028420756, | |
| "grad_norm": 13.383456230163574, | |
| "learning_rate": 1.8117133503092034e-05, | |
| "loss": 1.1162, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.0128014170121977, | |
| "grad_norm": 14.433093070983887, | |
| "learning_rate": 1.7995877288711047e-05, | |
| "loss": 1.0904, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.0329294311823194, | |
| "grad_norm": 5.4649338722229, | |
| "learning_rate": 1.787462107433006e-05, | |
| "loss": 1.0922, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.0530574453524415, | |
| "grad_norm": 14.124307632446289, | |
| "learning_rate": 1.7753364859949075e-05, | |
| "loss": 1.0895, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.0731854595225636, | |
| "grad_norm": 9.346240043640137, | |
| "learning_rate": 1.7632108645568087e-05, | |
| "loss": 1.0611, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.0933134736926853, | |
| "grad_norm": 16.641101837158203, | |
| "learning_rate": 1.7510852431187096e-05, | |
| "loss": 1.0945, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.1134414878628074, | |
| "grad_norm": 12.283844947814941, | |
| "learning_rate": 1.7389596216806112e-05, | |
| "loss": 1.078, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.1335695020329295, | |
| "grad_norm": 8.219395637512207, | |
| "learning_rate": 1.7268340002425125e-05, | |
| "loss": 1.0482, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.1536975162030516, | |
| "grad_norm": 15.403627395629883, | |
| "learning_rate": 1.7147083788044137e-05, | |
| "loss": 1.0433, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.1738255303731733, | |
| "grad_norm": 15.240696907043457, | |
| "learning_rate": 1.7025827573663153e-05, | |
| "loss": 1.0493, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.1939535445432954, | |
| "grad_norm": 19.72351837158203, | |
| "learning_rate": 1.6904571359282165e-05, | |
| "loss": 1.042, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.2140815587134175, | |
| "grad_norm": 21.067684173583984, | |
| "learning_rate": 1.6783315144901174e-05, | |
| "loss": 1.025, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.234209572883539, | |
| "grad_norm": 14.798884391784668, | |
| "learning_rate": 1.666205893052019e-05, | |
| "loss": 1.0143, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.2543375870536613, | |
| "grad_norm": 15.239629745483398, | |
| "learning_rate": 1.6540802716139202e-05, | |
| "loss": 1.0322, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.2744656012237834, | |
| "grad_norm": 14.908315658569336, | |
| "learning_rate": 1.6419546501758215e-05, | |
| "loss": 1.0826, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.294593615393905, | |
| "grad_norm": 13.52440071105957, | |
| "learning_rate": 1.629829028737723e-05, | |
| "loss": 1.0199, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.314721629564027, | |
| "grad_norm": 20.474451065063477, | |
| "learning_rate": 1.6177034072996243e-05, | |
| "loss": 1.0061, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.3348496437341493, | |
| "grad_norm": 15.805046081542969, | |
| "learning_rate": 1.6055777858615252e-05, | |
| "loss": 1.0141, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.3549776579042714, | |
| "grad_norm": 9.82214641571045, | |
| "learning_rate": 1.5934521644234268e-05, | |
| "loss": 1.0099, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.375105672074393, | |
| "grad_norm": 17.32090950012207, | |
| "learning_rate": 1.581326542985328e-05, | |
| "loss": 0.9851, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.395233686244515, | |
| "grad_norm": 27.325069427490234, | |
| "learning_rate": 1.5692009215472293e-05, | |
| "loss": 0.9991, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.4153617004146373, | |
| "grad_norm": 21.118209838867188, | |
| "learning_rate": 1.557075300109131e-05, | |
| "loss": 1.0082, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 2.435489714584759, | |
| "grad_norm": 14.355386734008789, | |
| "learning_rate": 1.544949678671032e-05, | |
| "loss": 0.9549, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 2.455617728754881, | |
| "grad_norm": 16.598129272460938, | |
| "learning_rate": 1.532824057232933e-05, | |
| "loss": 1.0101, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 2.475745742925003, | |
| "grad_norm": 21.729766845703125, | |
| "learning_rate": 1.5206984357948344e-05, | |
| "loss": 0.9704, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 2.495873757095125, | |
| "grad_norm": 16.548641204833984, | |
| "learning_rate": 1.5085728143567358e-05, | |
| "loss": 0.9893, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 2.516001771265247, | |
| "grad_norm": 14.282777786254883, | |
| "learning_rate": 1.496447192918637e-05, | |
| "loss": 0.9721, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 2.536129785435369, | |
| "grad_norm": 9.005020141601562, | |
| "learning_rate": 1.4843215714805385e-05, | |
| "loss": 0.98, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 2.5562577996054907, | |
| "grad_norm": 8.10714340209961, | |
| "learning_rate": 1.4721959500424397e-05, | |
| "loss": 0.9887, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 2.576385813775613, | |
| "grad_norm": 13.707820892333984, | |
| "learning_rate": 1.460070328604341e-05, | |
| "loss": 0.9805, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 2.596513827945735, | |
| "grad_norm": 20.182363510131836, | |
| "learning_rate": 1.4479447071662424e-05, | |
| "loss": 0.9837, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 2.6166418421158566, | |
| "grad_norm": 9.87313175201416, | |
| "learning_rate": 1.4358190857281435e-05, | |
| "loss": 0.9609, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 2.6367698562859787, | |
| "grad_norm": 12.288646697998047, | |
| "learning_rate": 1.4236934642900449e-05, | |
| "loss": 1.0035, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 2.656897870456101, | |
| "grad_norm": 18.152629852294922, | |
| "learning_rate": 1.4115678428519463e-05, | |
| "loss": 0.9494, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 2.677025884626223, | |
| "grad_norm": 16.326662063598633, | |
| "learning_rate": 1.3994422214138473e-05, | |
| "loss": 0.946, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 2.6971538987963446, | |
| "grad_norm": 18.14234733581543, | |
| "learning_rate": 1.3873165999757488e-05, | |
| "loss": 0.9504, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 2.7172819129664667, | |
| "grad_norm": 20.3934326171875, | |
| "learning_rate": 1.3751909785376502e-05, | |
| "loss": 0.9676, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 2.737409927136589, | |
| "grad_norm": 11.495948791503906, | |
| "learning_rate": 1.3630653570995512e-05, | |
| "loss": 0.9283, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 2.757537941306711, | |
| "grad_norm": 20.127979278564453, | |
| "learning_rate": 1.3509397356614527e-05, | |
| "loss": 0.9467, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 2.7776659554768326, | |
| "grad_norm": 13.345834732055664, | |
| "learning_rate": 1.338814114223354e-05, | |
| "loss": 0.9538, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 2.7977939696469547, | |
| "grad_norm": 9.327335357666016, | |
| "learning_rate": 1.3266884927852551e-05, | |
| "loss": 0.9437, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 2.817921983817077, | |
| "grad_norm": 12.741182327270508, | |
| "learning_rate": 1.3145628713471566e-05, | |
| "loss": 0.9291, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 2.8380499979871985, | |
| "grad_norm": 16.994661331176758, | |
| "learning_rate": 1.302437249909058e-05, | |
| "loss": 0.9147, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 2.8581780121573206, | |
| "grad_norm": 15.74470043182373, | |
| "learning_rate": 1.2903116284709592e-05, | |
| "loss": 0.9296, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 2.8783060263274427, | |
| "grad_norm": 13.54488754272461, | |
| "learning_rate": 1.2781860070328604e-05, | |
| "loss": 0.9482, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 2.8984340404975644, | |
| "grad_norm": 10.650059700012207, | |
| "learning_rate": 1.2660603855947619e-05, | |
| "loss": 0.9516, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 2.9185620546676865, | |
| "grad_norm": 12.577211380004883, | |
| "learning_rate": 1.2539347641566631e-05, | |
| "loss": 0.9173, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 2.9386900688378086, | |
| "grad_norm": 14.282366752624512, | |
| "learning_rate": 1.2418091427185643e-05, | |
| "loss": 0.9511, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 2.9588180830079303, | |
| "grad_norm": 14.529337882995605, | |
| "learning_rate": 1.2296835212804656e-05, | |
| "loss": 0.9302, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 2.9789460971780524, | |
| "grad_norm": 11.681228637695312, | |
| "learning_rate": 1.217557899842367e-05, | |
| "loss": 0.9097, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 2.9990741113481745, | |
| "grad_norm": 11.70090389251709, | |
| "learning_rate": 1.2054322784042682e-05, | |
| "loss": 0.9233, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 3.019202125518296, | |
| "grad_norm": 27.22252655029297, | |
| "learning_rate": 1.1933066569661695e-05, | |
| "loss": 0.8651, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 3.0393301396884183, | |
| "grad_norm": 14.896398544311523, | |
| "learning_rate": 1.1811810355280709e-05, | |
| "loss": 0.8639, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 3.0594581538585404, | |
| "grad_norm": 20.037960052490234, | |
| "learning_rate": 1.1690554140899721e-05, | |
| "loss": 0.8606, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 3.0795861680286625, | |
| "grad_norm": 16.03421974182129, | |
| "learning_rate": 1.1569297926518734e-05, | |
| "loss": 0.8639, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 3.099714182198784, | |
| "grad_norm": 14.802894592285156, | |
| "learning_rate": 1.1448041712137748e-05, | |
| "loss": 0.8875, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 3.1198421963689063, | |
| "grad_norm": 9.06533145904541, | |
| "learning_rate": 1.132678549775676e-05, | |
| "loss": 0.8877, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 3.1399702105390284, | |
| "grad_norm": 13.744263648986816, | |
| "learning_rate": 1.1205529283375773e-05, | |
| "loss": 0.8761, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 3.16009822470915, | |
| "grad_norm": 12.16555404663086, | |
| "learning_rate": 1.1084273068994787e-05, | |
| "loss": 0.8782, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 3.180226238879272, | |
| "grad_norm": 29.285688400268555, | |
| "learning_rate": 1.09630168546138e-05, | |
| "loss": 0.8579, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 3.2003542530493942, | |
| "grad_norm": 14.758946418762207, | |
| "learning_rate": 1.0841760640232812e-05, | |
| "loss": 0.878, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 3.220482267219516, | |
| "grad_norm": 12.481344223022461, | |
| "learning_rate": 1.0720504425851826e-05, | |
| "loss": 0.8383, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 3.240610281389638, | |
| "grad_norm": 11.378300666809082, | |
| "learning_rate": 1.0599248211470838e-05, | |
| "loss": 0.866, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 3.26073829555976, | |
| "grad_norm": 18.51228141784668, | |
| "learning_rate": 1.047799199708985e-05, | |
| "loss": 0.8727, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 3.2808663097298822, | |
| "grad_norm": 13.013883590698242, | |
| "learning_rate": 1.0356735782708865e-05, | |
| "loss": 0.8497, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 3.300994323900004, | |
| "grad_norm": 18.66629409790039, | |
| "learning_rate": 1.0235479568327876e-05, | |
| "loss": 0.8817, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 3.321122338070126, | |
| "grad_norm": 22.02678108215332, | |
| "learning_rate": 1.011422335394689e-05, | |
| "loss": 0.8207, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 3.341250352240248, | |
| "grad_norm": 21.1297550201416, | |
| "learning_rate": 9.992967139565904e-06, | |
| "loss": 0.834, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 3.36137836641037, | |
| "grad_norm": 15.060477256774902, | |
| "learning_rate": 9.871710925184914e-06, | |
| "loss": 0.8313, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 3.381506380580492, | |
| "grad_norm": 20.013944625854492, | |
| "learning_rate": 9.750454710803929e-06, | |
| "loss": 0.8628, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 3.401634394750614, | |
| "grad_norm": 11.168913841247559, | |
| "learning_rate": 9.629198496422943e-06, | |
| "loss": 0.8261, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 3.4217624089207357, | |
| "grad_norm": 15.372590065002441, | |
| "learning_rate": 9.507942282041953e-06, | |
| "loss": 0.8618, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 3.441890423090858, | |
| "grad_norm": 11.604378700256348, | |
| "learning_rate": 9.386686067660968e-06, | |
| "loss": 0.8239, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 3.46201843726098, | |
| "grad_norm": 9.609265327453613, | |
| "learning_rate": 9.265429853279982e-06, | |
| "loss": 0.8371, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 3.4821464514311016, | |
| "grad_norm": 15.69279956817627, | |
| "learning_rate": 9.144173638898994e-06, | |
| "loss": 0.8218, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 3.5022744656012237, | |
| "grad_norm": 14.74257755279541, | |
| "learning_rate": 9.022917424518007e-06, | |
| "loss": 0.8055, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 3.522402479771346, | |
| "grad_norm": 10.193700790405273, | |
| "learning_rate": 8.90166121013702e-06, | |
| "loss": 0.8566, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 3.5425304939414675, | |
| "grad_norm": 13.010785102844238, | |
| "learning_rate": 8.780404995756033e-06, | |
| "loss": 0.8443, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 3.5626585081115896, | |
| "grad_norm": 11.916807174682617, | |
| "learning_rate": 8.659148781375045e-06, | |
| "loss": 0.8272, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 3.5827865222817117, | |
| "grad_norm": 11.876017570495605, | |
| "learning_rate": 8.53789256699406e-06, | |
| "loss": 0.8518, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 3.602914536451834, | |
| "grad_norm": 21.5701847076416, | |
| "learning_rate": 8.416636352613072e-06, | |
| "loss": 0.8087, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 3.623042550621956, | |
| "grad_norm": 11.204216957092285, | |
| "learning_rate": 8.295380138232084e-06, | |
| "loss": 0.8279, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 3.6431705647920776, | |
| "grad_norm": 11.78646469116211, | |
| "learning_rate": 8.174123923851097e-06, | |
| "loss": 0.8316, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 3.6632985789621997, | |
| "grad_norm": 12.788416862487793, | |
| "learning_rate": 8.052867709470111e-06, | |
| "loss": 0.8332, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 3.683426593132322, | |
| "grad_norm": 14.306061744689941, | |
| "learning_rate": 7.931611495089123e-06, | |
| "loss": 0.823, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 3.7035546073024435, | |
| "grad_norm": 20.168163299560547, | |
| "learning_rate": 7.810355280708136e-06, | |
| "loss": 0.81, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 3.7236826214725656, | |
| "grad_norm": 20.580291748046875, | |
| "learning_rate": 7.68909906632715e-06, | |
| "loss": 0.822, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 3.7438106356426877, | |
| "grad_norm": 13.826583862304688, | |
| "learning_rate": 7.567842851946163e-06, | |
| "loss": 0.8378, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 3.7639386498128093, | |
| "grad_norm": 30.890518188476562, | |
| "learning_rate": 7.446586637565176e-06, | |
| "loss": 0.8311, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 3.7840666639829315, | |
| "grad_norm": 15.22163200378418, | |
| "learning_rate": 7.325330423184188e-06, | |
| "loss": 0.8138, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 3.8041946781530536, | |
| "grad_norm": 8.326911926269531, | |
| "learning_rate": 7.204074208803201e-06, | |
| "loss": 0.784, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 3.8243226923231752, | |
| "grad_norm": 31.577423095703125, | |
| "learning_rate": 7.082817994422215e-06, | |
| "loss": 0.8006, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 3.8444507064932973, | |
| "grad_norm": 15.388664245605469, | |
| "learning_rate": 6.961561780041227e-06, | |
| "loss": 0.8418, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 3.8645787206634195, | |
| "grad_norm": 21.28485107421875, | |
| "learning_rate": 6.84030556566024e-06, | |
| "loss": 0.7972, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 3.884706734833541, | |
| "grad_norm": 11.151982307434082, | |
| "learning_rate": 6.7190493512792536e-06, | |
| "loss": 0.8133, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 3.9048347490036632, | |
| "grad_norm": 11.545019149780273, | |
| "learning_rate": 6.597793136898266e-06, | |
| "loss": 0.8035, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 3.9249627631737853, | |
| "grad_norm": 11.109121322631836, | |
| "learning_rate": 6.476536922517279e-06, | |
| "loss": 0.7959, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 3.945090777343907, | |
| "grad_norm": 12.6671142578125, | |
| "learning_rate": 6.355280708136292e-06, | |
| "loss": 0.8132, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 3.965218791514029, | |
| "grad_norm": 11.02685260772705, | |
| "learning_rate": 6.234024493755305e-06, | |
| "loss": 0.7959, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 3.9853468056841512, | |
| "grad_norm": 11.704038619995117, | |
| "learning_rate": 6.112768279374318e-06, | |
| "loss": 0.7837, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 4.005474819854273, | |
| "grad_norm": 16.34335708618164, | |
| "learning_rate": 5.991512064993331e-06, | |
| "loss": 0.7851, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 4.0256028340243954, | |
| "grad_norm": 10.739608764648438, | |
| "learning_rate": 5.870255850612345e-06, | |
| "loss": 0.7684, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 4.045730848194517, | |
| "grad_norm": 15.17026424407959, | |
| "learning_rate": 5.748999636231357e-06, | |
| "loss": 0.7679, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 4.065858862364639, | |
| "grad_norm": 16.030241012573242, | |
| "learning_rate": 5.62774342185037e-06, | |
| "loss": 0.764, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 4.085986876534761, | |
| "grad_norm": 15.900766372680664, | |
| "learning_rate": 5.506487207469383e-06, | |
| "loss": 0.7666, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 4.106114890704883, | |
| "grad_norm": 13.20738410949707, | |
| "learning_rate": 5.385230993088396e-06, | |
| "loss": 0.7686, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 4.126242904875005, | |
| "grad_norm": 9.8963623046875, | |
| "learning_rate": 5.2639747787074086e-06, | |
| "loss": 0.7589, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 4.146370919045127, | |
| "grad_norm": 16.053571701049805, | |
| "learning_rate": 5.142718564326422e-06, | |
| "loss": 0.7676, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 4.166498933215249, | |
| "grad_norm": 12.643793106079102, | |
| "learning_rate": 5.021462349945435e-06, | |
| "loss": 0.7462, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 4.186626947385371, | |
| "grad_norm": 27.60247230529785, | |
| "learning_rate": 4.9002061355644475e-06, | |
| "loss": 0.7864, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 4.206754961555493, | |
| "grad_norm": 13.564982414245605, | |
| "learning_rate": 4.778949921183461e-06, | |
| "loss": 0.7693, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 4.226882975725615, | |
| "grad_norm": 20.11015510559082, | |
| "learning_rate": 4.657693706802474e-06, | |
| "loss": 0.7386, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 4.247010989895736, | |
| "grad_norm": 15.393072128295898, | |
| "learning_rate": 4.5364374924214865e-06, | |
| "loss": 0.7793, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 4.267139004065859, | |
| "grad_norm": 19.87403678894043, | |
| "learning_rate": 4.4151812780405e-06, | |
| "loss": 0.7779, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 4.287267018235981, | |
| "grad_norm": 9.388250350952148, | |
| "learning_rate": 4.293925063659512e-06, | |
| "loss": 0.7681, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 4.307395032406103, | |
| "grad_norm": 10.060807228088379, | |
| "learning_rate": 4.1726688492785255e-06, | |
| "loss": 0.7509, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 4.327523046576225, | |
| "grad_norm": 23.562870025634766, | |
| "learning_rate": 4.051412634897539e-06, | |
| "loss": 0.7833, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 4.3476510607463466, | |
| "grad_norm": 14.926592826843262, | |
| "learning_rate": 3.930156420516551e-06, | |
| "loss": 0.7446, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 4.367779074916469, | |
| "grad_norm": 11.940516471862793, | |
| "learning_rate": 3.808900206135565e-06, | |
| "loss": 0.754, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 4.387907089086591, | |
| "grad_norm": 14.217045783996582, | |
| "learning_rate": 3.6876439917545777e-06, | |
| "loss": 0.7731, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 4.408035103256712, | |
| "grad_norm": 9.447354316711426, | |
| "learning_rate": 3.5663877773735905e-06, | |
| "loss": 0.7597, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 4.428163117426835, | |
| "grad_norm": 19.97547149658203, | |
| "learning_rate": 3.4451315629926034e-06, | |
| "loss": 0.7657, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 4.448291131596957, | |
| "grad_norm": 15.066329956054688, | |
| "learning_rate": 3.3238753486116167e-06, | |
| "loss": 0.7619, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 4.468419145767078, | |
| "grad_norm": 12.446183204650879, | |
| "learning_rate": 3.2026191342306295e-06, | |
| "loss": 0.7656, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 4.488547159937201, | |
| "grad_norm": 32.365234375, | |
| "learning_rate": 3.0813629198496423e-06, | |
| "loss": 0.7575, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 4.5086751741073225, | |
| "grad_norm": 12.082524299621582, | |
| "learning_rate": 2.960106705468655e-06, | |
| "loss": 0.7502, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 4.528803188277444, | |
| "grad_norm": 20.70221519470215, | |
| "learning_rate": 2.8388504910876685e-06, | |
| "loss": 0.7638, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 4.548931202447567, | |
| "grad_norm": 22.083984375, | |
| "learning_rate": 2.717594276706681e-06, | |
| "loss": 0.7365, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 4.569059216617688, | |
| "grad_norm": 14.066744804382324, | |
| "learning_rate": 2.596338062325694e-06, | |
| "loss": 0.766, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 4.58918723078781, | |
| "grad_norm": 24.38865089416504, | |
| "learning_rate": 2.4750818479447074e-06, | |
| "loss": 0.7449, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 4.609315244957933, | |
| "grad_norm": 11.597355842590332, | |
| "learning_rate": 2.3538256335637203e-06, | |
| "loss": 0.7556, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 4.629443259128054, | |
| "grad_norm": 10.837632179260254, | |
| "learning_rate": 2.232569419182733e-06, | |
| "loss": 0.7501, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 4.649571273298177, | |
| "grad_norm": 20.56001853942871, | |
| "learning_rate": 2.111313204801746e-06, | |
| "loss": 0.7234, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 4.6696992874682985, | |
| "grad_norm": 14.60595703125, | |
| "learning_rate": 1.9900569904207592e-06, | |
| "loss": 0.7695, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 4.68982730163842, | |
| "grad_norm": 28.349151611328125, | |
| "learning_rate": 1.868800776039772e-06, | |
| "loss": 0.7661, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 4.709955315808543, | |
| "grad_norm": 10.647957801818848, | |
| "learning_rate": 1.747544561658785e-06, | |
| "loss": 0.7308, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 4.730083329978664, | |
| "grad_norm": 11.21895980834961, | |
| "learning_rate": 1.6262883472777982e-06, | |
| "loss": 0.7585, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 4.750211344148786, | |
| "grad_norm": 12.75427532196045, | |
| "learning_rate": 1.505032132896811e-06, | |
| "loss": 0.7553, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 4.770339358318909, | |
| "grad_norm": 9.93217658996582, | |
| "learning_rate": 1.383775918515824e-06, | |
| "loss": 0.7525, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 4.79046737248903, | |
| "grad_norm": 13.394769668579102, | |
| "learning_rate": 1.262519704134837e-06, | |
| "loss": 0.7493, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 4.810595386659152, | |
| "grad_norm": 8.94278335571289, | |
| "learning_rate": 1.1412634897538498e-06, | |
| "loss": 0.7575, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 4.8307234008292745, | |
| "grad_norm": 16.46908950805664, | |
| "learning_rate": 1.0200072753728628e-06, | |
| "loss": 0.7651, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 4.850851414999396, | |
| "grad_norm": 27.788360595703125, | |
| "learning_rate": 8.987510609918758e-07, | |
| "loss": 0.7472, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 4.870979429169518, | |
| "grad_norm": 7.398582458496094, | |
| "learning_rate": 7.774948466108889e-07, | |
| "loss": 0.7696, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 4.89110744333964, | |
| "grad_norm": 17.573110580444336, | |
| "learning_rate": 6.562386322299018e-07, | |
| "loss": 0.7445, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 4.911235457509762, | |
| "grad_norm": 5.554362773895264, | |
| "learning_rate": 5.349824178489148e-07, | |
| "loss": 0.7407, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 4.931363471679884, | |
| "grad_norm": 8.908127784729004, | |
| "learning_rate": 4.137262034679277e-07, | |
| "loss": 0.7352, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 4.951491485850006, | |
| "grad_norm": 17.096956253051758, | |
| "learning_rate": 2.924699890869407e-07, | |
| "loss": 0.7548, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 4.971619500020128, | |
| "grad_norm": 15.15579891204834, | |
| "learning_rate": 1.7121377470595367e-07, | |
| "loss": 0.7635, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 4.99174751419025, | |
| "grad_norm": 14.474600791931152, | |
| "learning_rate": 4.9957560324966654e-08, | |
| "loss": 0.748, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 124205, | |
| "total_flos": 2.789913716232192e+16, | |
| "train_loss": 1.1867824254838901, | |
| "train_runtime": 21629.6238, | |
| "train_samples_per_second": 91.877, | |
| "train_steps_per_second": 5.742 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 124205, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.789913716232192e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |