{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.816798539257456,
"eval_steps": 500,
"global_step": 56000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012172854534388313,
"grad_norm": 21.32210922241211,
"learning_rate": 4.85e-06,
"loss": 3.6457,
"step": 100
},
{
"epoch": 0.024345709068776627,
"grad_norm": 17.6686954498291,
"learning_rate": 9.85e-06,
"loss": 3.3243,
"step": 200
},
{
"epoch": 0.036518563603164945,
"grad_norm": 17.45920181274414,
"learning_rate": 1.4850000000000002e-05,
"loss": 3.2128,
"step": 300
},
{
"epoch": 0.048691418137553254,
"grad_norm": 18.293773651123047,
"learning_rate": 1.985e-05,
"loss": 3.124,
"step": 400
},
{
"epoch": 0.06086427267194157,
"grad_norm": 15.793401718139648,
"learning_rate": 1.995650224215247e-05,
"loss": 3.1209,
"step": 500
},
{
"epoch": 0.06086427267194157,
"eval_loss": 3.249819278717041,
"eval_runtime": 6.941,
"eval_samples_per_second": 144.072,
"eval_steps_per_second": 36.018,
"step": 500
},
{
"epoch": 0.07303712720632989,
"grad_norm": 13.932258605957031,
"learning_rate": 1.9911659192825115e-05,
"loss": 3.094,
"step": 600
},
{
"epoch": 0.0852099817407182,
"grad_norm": 11.793479919433594,
"learning_rate": 1.986681614349776e-05,
"loss": 3.0426,
"step": 700
},
{
"epoch": 0.09738283627510651,
"grad_norm": 11.373984336853027,
"learning_rate": 1.9821973094170406e-05,
"loss": 3.0645,
"step": 800
},
{
"epoch": 0.10955569080949483,
"grad_norm": 10.407483100891113,
"learning_rate": 1.9777130044843052e-05,
"loss": 3.0681,
"step": 900
},
{
"epoch": 0.12172854534388314,
"grad_norm": 9.600470542907715,
"learning_rate": 1.9732286995515698e-05,
"loss": 3.0599,
"step": 1000
},
{
"epoch": 0.12172854534388314,
"eval_loss": 3.157822847366333,
"eval_runtime": 6.8366,
"eval_samples_per_second": 146.272,
"eval_steps_per_second": 36.568,
"step": 1000
},
{
"epoch": 0.13390139987827146,
"grad_norm": 10.010004043579102,
"learning_rate": 1.9687443946188343e-05,
"loss": 3.0379,
"step": 1100
},
{
"epoch": 0.14607425441265978,
"grad_norm": 9.130040168762207,
"learning_rate": 1.964260089686099e-05,
"loss": 2.9859,
"step": 1200
},
{
"epoch": 0.15824710894704808,
"grad_norm": 8.330909729003906,
"learning_rate": 1.9597757847533635e-05,
"loss": 3.0213,
"step": 1300
},
{
"epoch": 0.1704199634814364,
"grad_norm": 7.502275466918945,
"learning_rate": 1.955291479820628e-05,
"loss": 3.0415,
"step": 1400
},
{
"epoch": 0.18259281801582472,
"grad_norm": 7.305887222290039,
"learning_rate": 1.9508071748878926e-05,
"loss": 2.966,
"step": 1500
},
{
"epoch": 0.18259281801582472,
"eval_loss": 3.091937303543091,
"eval_runtime": 6.9209,
"eval_samples_per_second": 144.491,
"eval_steps_per_second": 36.123,
"step": 1500
},
{
"epoch": 0.19476567255021301,
"grad_norm": 8.190788269042969,
"learning_rate": 1.9463228699551572e-05,
"loss": 2.9814,
"step": 1600
},
{
"epoch": 0.20693852708460134,
"grad_norm": 7.867215633392334,
"learning_rate": 1.9418385650224218e-05,
"loss": 2.9614,
"step": 1700
},
{
"epoch": 0.21911138161898966,
"grad_norm": 7.410882472991943,
"learning_rate": 1.9373542600896864e-05,
"loss": 2.9515,
"step": 1800
},
{
"epoch": 0.23128423615337795,
"grad_norm": 6.388878345489502,
"learning_rate": 1.9328699551569506e-05,
"loss": 2.915,
"step": 1900
},
{
"epoch": 0.24345709068776628,
"grad_norm": 6.401773452758789,
"learning_rate": 1.928385650224215e-05,
"loss": 2.942,
"step": 2000
},
{
"epoch": 0.24345709068776628,
"eval_loss": 3.0528335571289062,
"eval_runtime": 6.9438,
"eval_samples_per_second": 144.014,
"eval_steps_per_second": 36.004,
"step": 2000
},
{
"epoch": 0.2556299452221546,
"grad_norm": 6.346031665802002,
"learning_rate": 1.9239013452914797e-05,
"loss": 2.952,
"step": 2100
},
{
"epoch": 0.2678027997565429,
"grad_norm": 7.141861438751221,
"learning_rate": 1.9194170403587446e-05,
"loss": 2.9309,
"step": 2200
},
{
"epoch": 0.27997565429093124,
"grad_norm": 7.175647735595703,
"learning_rate": 1.9149327354260092e-05,
"loss": 2.9315,
"step": 2300
},
{
"epoch": 0.29214850882531956,
"grad_norm": 5.47502326965332,
"learning_rate": 1.9104484304932738e-05,
"loss": 2.944,
"step": 2400
},
{
"epoch": 0.30432136335970783,
"grad_norm": 6.102653980255127,
"learning_rate": 1.9059641255605384e-05,
"loss": 2.8639,
"step": 2500
},
{
"epoch": 0.30432136335970783,
"eval_loss": 3.0088276863098145,
"eval_runtime": 6.9657,
"eval_samples_per_second": 143.56,
"eval_steps_per_second": 35.89,
"step": 2500
},
{
"epoch": 0.31649421789409615,
"grad_norm": 6.21509313583374,
"learning_rate": 1.901479820627803e-05,
"loss": 2.8462,
"step": 2600
},
{
"epoch": 0.3286670724284845,
"grad_norm": 7.218765735626221,
"learning_rate": 1.8969955156950675e-05,
"loss": 2.8849,
"step": 2700
},
{
"epoch": 0.3408399269628728,
"grad_norm": 6.037746429443359,
"learning_rate": 1.892511210762332e-05,
"loss": 2.894,
"step": 2800
},
{
"epoch": 0.3530127814972611,
"grad_norm": 5.483625411987305,
"learning_rate": 1.8880269058295967e-05,
"loss": 2.8988,
"step": 2900
},
{
"epoch": 0.36518563603164944,
"grad_norm": 4.460190296173096,
"learning_rate": 1.8835426008968612e-05,
"loss": 2.8909,
"step": 3000
},
{
"epoch": 0.36518563603164944,
"eval_loss": 2.9809019565582275,
"eval_runtime": 6.9067,
"eval_samples_per_second": 144.787,
"eval_steps_per_second": 36.197,
"step": 3000
},
{
"epoch": 0.37735849056603776,
"grad_norm": 5.2231125831604,
"learning_rate": 1.8790582959641258e-05,
"loss": 2.894,
"step": 3100
},
{
"epoch": 0.38953134510042603,
"grad_norm": 5.9949517250061035,
"learning_rate": 1.8745739910313904e-05,
"loss": 2.8816,
"step": 3200
},
{
"epoch": 0.40170419963481435,
"grad_norm": 5.3864054679870605,
"learning_rate": 1.870089686098655e-05,
"loss": 2.863,
"step": 3300
},
{
"epoch": 0.4138770541692027,
"grad_norm": 6.138455390930176,
"learning_rate": 1.8656053811659195e-05,
"loss": 2.8577,
"step": 3400
},
{
"epoch": 0.426049908703591,
"grad_norm": 5.222280025482178,
"learning_rate": 1.861121076233184e-05,
"loss": 2.901,
"step": 3500
},
{
"epoch": 0.426049908703591,
"eval_loss": 2.944925308227539,
"eval_runtime": 6.9152,
"eval_samples_per_second": 144.608,
"eval_steps_per_second": 36.152,
"step": 3500
},
{
"epoch": 0.4382227632379793,
"grad_norm": 4.749873638153076,
"learning_rate": 1.8566367713004487e-05,
"loss": 2.8628,
"step": 3600
},
{
"epoch": 0.45039561777236764,
"grad_norm": 4.7014336585998535,
"learning_rate": 1.852152466367713e-05,
"loss": 2.8418,
"step": 3700
},
{
"epoch": 0.4625684723067559,
"grad_norm": 5.343926429748535,
"learning_rate": 1.8476681614349775e-05,
"loss": 2.9097,
"step": 3800
},
{
"epoch": 0.47474132684114423,
"grad_norm": 5.276562690734863,
"learning_rate": 1.8431838565022424e-05,
"loss": 2.8659,
"step": 3900
},
{
"epoch": 0.48691418137553255,
"grad_norm": 5.228163242340088,
"learning_rate": 1.838699551569507e-05,
"loss": 2.8497,
"step": 4000
},
{
"epoch": 0.48691418137553255,
"eval_loss": 2.9137816429138184,
"eval_runtime": 6.8289,
"eval_samples_per_second": 146.437,
"eval_steps_per_second": 36.609,
"step": 4000
},
{
"epoch": 0.4990870359099209,
"grad_norm": 5.291093826293945,
"learning_rate": 1.8342600896860988e-05,
"loss": 2.8562,
"step": 4100
},
{
"epoch": 0.5112598904443092,
"grad_norm": 5.388160705566406,
"learning_rate": 1.8297757847533634e-05,
"loss": 2.87,
"step": 4200
},
{
"epoch": 0.5234327449786975,
"grad_norm": 5.260839939117432,
"learning_rate": 1.825291479820628e-05,
"loss": 2.8755,
"step": 4300
},
{
"epoch": 0.5356055995130858,
"grad_norm": 5.170462131500244,
"learning_rate": 1.8208071748878925e-05,
"loss": 2.8342,
"step": 4400
},
{
"epoch": 0.5477784540474742,
"grad_norm": 4.9179582595825195,
"learning_rate": 1.816322869955157e-05,
"loss": 2.8494,
"step": 4500
},
{
"epoch": 0.5477784540474742,
"eval_loss": 2.886016607284546,
"eval_runtime": 6.8492,
"eval_samples_per_second": 146.002,
"eval_steps_per_second": 36.5,
"step": 4500
},
{
"epoch": 0.5599513085818625,
"grad_norm": 5.140480041503906,
"learning_rate": 1.8118385650224217e-05,
"loss": 2.8659,
"step": 4600
},
{
"epoch": 0.5721241631162508,
"grad_norm": 5.088667869567871,
"learning_rate": 1.8073542600896862e-05,
"loss": 2.8228,
"step": 4700
},
{
"epoch": 0.5842970176506391,
"grad_norm": 4.764868259429932,
"learning_rate": 1.8028699551569508e-05,
"loss": 2.8455,
"step": 4800
},
{
"epoch": 0.5964698721850273,
"grad_norm": 4.458358287811279,
"learning_rate": 1.7983856502242154e-05,
"loss": 2.8196,
"step": 4900
},
{
"epoch": 0.6086427267194157,
"grad_norm": 5.425631999969482,
"learning_rate": 1.79390134529148e-05,
"loss": 2.8247,
"step": 5000
},
{
"epoch": 0.6086427267194157,
"eval_loss": 2.85610294342041,
"eval_runtime": 6.9206,
"eval_samples_per_second": 144.495,
"eval_steps_per_second": 36.124,
"step": 5000
},
{
"epoch": 0.620815581253804,
"grad_norm": 4.651830196380615,
"learning_rate": 1.7894170403587445e-05,
"loss": 2.8296,
"step": 5100
},
{
"epoch": 0.6329884357881923,
"grad_norm": 5.064242839813232,
"learning_rate": 1.784932735426009e-05,
"loss": 2.8446,
"step": 5200
},
{
"epoch": 0.6451612903225806,
"grad_norm": 5.3180413246154785,
"learning_rate": 1.7804484304932737e-05,
"loss": 2.7944,
"step": 5300
},
{
"epoch": 0.657334144856969,
"grad_norm": 4.934672832489014,
"learning_rate": 1.7759641255605383e-05,
"loss": 2.7975,
"step": 5400
},
{
"epoch": 0.6695069993913573,
"grad_norm": 5.154861927032471,
"learning_rate": 1.7714798206278028e-05,
"loss": 2.8144,
"step": 5500
},
{
"epoch": 0.6695069993913573,
"eval_loss": 2.831345319747925,
"eval_runtime": 6.9102,
"eval_samples_per_second": 144.714,
"eval_steps_per_second": 36.179,
"step": 5500
},
{
"epoch": 0.6816798539257456,
"grad_norm": 5.322381973266602,
"learning_rate": 1.7669955156950674e-05,
"loss": 2.8196,
"step": 5600
},
{
"epoch": 0.6938527084601339,
"grad_norm": 4.949143886566162,
"learning_rate": 1.762511210762332e-05,
"loss": 2.8154,
"step": 5700
},
{
"epoch": 0.7060255629945222,
"grad_norm": 4.853809356689453,
"learning_rate": 1.7580269058295965e-05,
"loss": 2.8085,
"step": 5800
},
{
"epoch": 0.7181984175289106,
"grad_norm": 4.941267490386963,
"learning_rate": 1.753542600896861e-05,
"loss": 2.7982,
"step": 5900
},
{
"epoch": 0.7303712720632989,
"grad_norm": 4.971885681152344,
"learning_rate": 1.7490582959641257e-05,
"loss": 2.8049,
"step": 6000
},
{
"epoch": 0.7303712720632989,
"eval_loss": 2.8138246536254883,
"eval_runtime": 6.8576,
"eval_samples_per_second": 145.824,
"eval_steps_per_second": 36.456,
"step": 6000
},
{
"epoch": 0.7425441265976872,
"grad_norm": 4.718198776245117,
"learning_rate": 1.7445739910313903e-05,
"loss": 2.7546,
"step": 6100
},
{
"epoch": 0.7547169811320755,
"grad_norm": 5.367305278778076,
"learning_rate": 1.740089686098655e-05,
"loss": 2.7714,
"step": 6200
},
{
"epoch": 0.7668898356664637,
"grad_norm": 4.827259063720703,
"learning_rate": 1.7356053811659194e-05,
"loss": 2.8043,
"step": 6300
},
{
"epoch": 0.7790626902008521,
"grad_norm": 5.011576175689697,
"learning_rate": 1.731121076233184e-05,
"loss": 2.7859,
"step": 6400
},
{
"epoch": 0.7912355447352404,
"grad_norm": 5.363623142242432,
"learning_rate": 1.7266816143497758e-05,
"loss": 2.8161,
"step": 6500
},
{
"epoch": 0.7912355447352404,
"eval_loss": 2.791551113128662,
"eval_runtime": 6.8881,
"eval_samples_per_second": 145.177,
"eval_steps_per_second": 36.294,
"step": 6500
},
{
"epoch": 0.8034083992696287,
"grad_norm": 4.721231937408447,
"learning_rate": 1.7221973094170404e-05,
"loss": 2.7857,
"step": 6600
},
{
"epoch": 0.815581253804017,
"grad_norm": 4.657351016998291,
"learning_rate": 1.717713004484305e-05,
"loss": 2.7734,
"step": 6700
},
{
"epoch": 0.8277541083384053,
"grad_norm": 4.4942145347595215,
"learning_rate": 1.7132286995515695e-05,
"loss": 2.7885,
"step": 6800
},
{
"epoch": 0.8399269628727937,
"grad_norm": 5.061729431152344,
"learning_rate": 1.708744394618834e-05,
"loss": 2.7841,
"step": 6900
},
{
"epoch": 0.852099817407182,
"grad_norm": 4.816007137298584,
"learning_rate": 1.7042600896860987e-05,
"loss": 2.741,
"step": 7000
},
{
"epoch": 0.852099817407182,
"eval_loss": 2.7756857872009277,
"eval_runtime": 6.8679,
"eval_samples_per_second": 145.605,
"eval_steps_per_second": 36.401,
"step": 7000
},
{
"epoch": 0.8642726719415703,
"grad_norm": 5.255375385284424,
"learning_rate": 1.6997757847533633e-05,
"loss": 2.7561,
"step": 7100
},
{
"epoch": 0.8764455264759586,
"grad_norm": 4.844815254211426,
"learning_rate": 1.695291479820628e-05,
"loss": 2.7558,
"step": 7200
},
{
"epoch": 0.888618381010347,
"grad_norm": 4.8912224769592285,
"learning_rate": 1.6908071748878924e-05,
"loss": 2.7512,
"step": 7300
},
{
"epoch": 0.9007912355447353,
"grad_norm": 4.5775017738342285,
"learning_rate": 1.686322869955157e-05,
"loss": 2.745,
"step": 7400
},
{
"epoch": 0.9129640900791236,
"grad_norm": 4.753942012786865,
"learning_rate": 1.6818385650224216e-05,
"loss": 2.7173,
"step": 7500
},
{
"epoch": 0.9129640900791236,
"eval_loss": 2.7591283321380615,
"eval_runtime": 6.877,
"eval_samples_per_second": 145.412,
"eval_steps_per_second": 36.353,
"step": 7500
},
{
"epoch": 0.9251369446135118,
"grad_norm": 5.192244052886963,
"learning_rate": 1.677354260089686e-05,
"loss": 2.7373,
"step": 7600
},
{
"epoch": 0.9373097991479001,
"grad_norm": 4.5390801429748535,
"learning_rate": 1.6728699551569507e-05,
"loss": 2.7654,
"step": 7700
},
{
"epoch": 0.9494826536822885,
"grad_norm": 5.091897487640381,
"learning_rate": 1.6683856502242153e-05,
"loss": 2.7615,
"step": 7800
},
{
"epoch": 0.9616555082166768,
"grad_norm": 4.253417015075684,
"learning_rate": 1.6639013452914802e-05,
"loss": 2.7521,
"step": 7900
},
{
"epoch": 0.9738283627510651,
"grad_norm": 4.891059875488281,
"learning_rate": 1.6594170403587448e-05,
"loss": 2.7665,
"step": 8000
},
{
"epoch": 0.9738283627510651,
"eval_loss": 2.7409751415252686,
"eval_runtime": 6.8856,
"eval_samples_per_second": 145.23,
"eval_steps_per_second": 36.308,
"step": 8000
},
{
"epoch": 0.9860012172854534,
"grad_norm": 4.412657260894775,
"learning_rate": 1.6549327354260093e-05,
"loss": 2.7471,
"step": 8100
},
{
"epoch": 0.9981740718198417,
"grad_norm": 5.708240509033203,
"learning_rate": 1.650448430493274e-05,
"loss": 2.7545,
"step": 8200
},
{
"epoch": 1.01034692635423,
"grad_norm": 4.956757068634033,
"learning_rate": 1.645964125560538e-05,
"loss": 2.6015,
"step": 8300
},
{
"epoch": 1.0225197808886184,
"grad_norm": 5.220682621002197,
"learning_rate": 1.6414798206278027e-05,
"loss": 2.6077,
"step": 8400
},
{
"epoch": 1.0346926354230066,
"grad_norm": 5.160597801208496,
"learning_rate": 1.6369955156950673e-05,
"loss": 2.5857,
"step": 8500
},
{
"epoch": 1.0346926354230066,
"eval_loss": 2.7148427963256836,
"eval_runtime": 6.8912,
"eval_samples_per_second": 145.113,
"eval_steps_per_second": 36.278,
"step": 8500
},
{
"epoch": 1.046865489957395,
"grad_norm": 5.304019927978516,
"learning_rate": 1.6325560538116595e-05,
"loss": 2.5738,
"step": 8600
},
{
"epoch": 1.0590383444917832,
"grad_norm": 5.3433637619018555,
"learning_rate": 1.628071748878924e-05,
"loss": 2.5499,
"step": 8700
},
{
"epoch": 1.0712111990261717,
"grad_norm": 4.527110576629639,
"learning_rate": 1.6235874439461886e-05,
"loss": 2.6156,
"step": 8800
},
{
"epoch": 1.08338405356056,
"grad_norm": 5.513104438781738,
"learning_rate": 1.6191031390134532e-05,
"loss": 2.6217,
"step": 8900
},
{
"epoch": 1.0955569080949483,
"grad_norm": 5.579029083251953,
"learning_rate": 1.6146188340807178e-05,
"loss": 2.5829,
"step": 9000
},
{
"epoch": 1.0955569080949483,
"eval_loss": 2.6865806579589844,
"eval_runtime": 6.8369,
"eval_samples_per_second": 146.265,
"eval_steps_per_second": 36.566,
"step": 9000
},
{
"epoch": 1.1077297626293365,
"grad_norm": 4.849677562713623,
"learning_rate": 1.6101345291479823e-05,
"loss": 2.6253,
"step": 9100
},
{
"epoch": 1.119902617163725,
"grad_norm": 5.025945663452148,
"learning_rate": 1.605650224215247e-05,
"loss": 2.5725,
"step": 9200
},
{
"epoch": 1.1320754716981132,
"grad_norm": 5.991898059844971,
"learning_rate": 1.601165919282511e-05,
"loss": 2.5994,
"step": 9300
},
{
"epoch": 1.1442483262325016,
"grad_norm": 4.980128765106201,
"learning_rate": 1.5966816143497757e-05,
"loss": 2.5645,
"step": 9400
},
{
"epoch": 1.1564211807668898,
"grad_norm": 4.839084625244141,
"learning_rate": 1.5921973094170403e-05,
"loss": 2.5861,
"step": 9500
},
{
"epoch": 1.1564211807668898,
"eval_loss": 2.6708385944366455,
"eval_runtime": 6.8615,
"eval_samples_per_second": 145.74,
"eval_steps_per_second": 36.435,
"step": 9500
},
{
"epoch": 1.168594035301278,
"grad_norm": 5.058382511138916,
"learning_rate": 1.587713004484305e-05,
"loss": 2.5524,
"step": 9600
},
{
"epoch": 1.1807668898356665,
"grad_norm": 4.867978572845459,
"learning_rate": 1.5832286995515694e-05,
"loss": 2.582,
"step": 9700
},
{
"epoch": 1.192939744370055,
"grad_norm": 5.896303653717041,
"learning_rate": 1.578744394618834e-05,
"loss": 2.5899,
"step": 9800
},
{
"epoch": 1.205112598904443,
"grad_norm": 4.735970497131348,
"learning_rate": 1.574260089686099e-05,
"loss": 2.5878,
"step": 9900
},
{
"epoch": 1.2172854534388313,
"grad_norm": 4.8292670249938965,
"learning_rate": 1.5697757847533635e-05,
"loss": 2.6047,
"step": 10000
},
{
"epoch": 1.2172854534388313,
"eval_loss": 2.65461802482605,
"eval_runtime": 6.8819,
"eval_samples_per_second": 145.309,
"eval_steps_per_second": 36.327,
"step": 10000
},
{
"epoch": 1.2294583079732198,
"grad_norm": 5.350712299346924,
"learning_rate": 1.565291479820628e-05,
"loss": 2.5777,
"step": 10100
},
{
"epoch": 1.241631162507608,
"grad_norm": 5.471200466156006,
"learning_rate": 1.5608071748878926e-05,
"loss": 2.5908,
"step": 10200
},
{
"epoch": 1.2538040170419964,
"grad_norm": 5.038080215454102,
"learning_rate": 1.5563228699551572e-05,
"loss": 2.5951,
"step": 10300
},
{
"epoch": 1.2659768715763846,
"grad_norm": 4.982104778289795,
"learning_rate": 1.5518385650224218e-05,
"loss": 2.5461,
"step": 10400
},
{
"epoch": 1.278149726110773,
"grad_norm": 4.736184120178223,
"learning_rate": 1.5473542600896864e-05,
"loss": 2.5874,
"step": 10500
},
{
"epoch": 1.278149726110773,
"eval_loss": 2.6384053230285645,
"eval_runtime": 6.8888,
"eval_samples_per_second": 145.164,
"eval_steps_per_second": 36.291,
"step": 10500
},
{
"epoch": 1.2903225806451613,
"grad_norm": 5.710967540740967,
"learning_rate": 1.5429147982062782e-05,
"loss": 2.5818,
"step": 10600
},
{
"epoch": 1.3024954351795497,
"grad_norm": 5.1653947830200195,
"learning_rate": 1.5384304932735428e-05,
"loss": 2.5916,
"step": 10700
},
{
"epoch": 1.314668289713938,
"grad_norm": 5.706851959228516,
"learning_rate": 1.5339461883408074e-05,
"loss": 2.563,
"step": 10800
},
{
"epoch": 1.326841144248326,
"grad_norm": 5.320187568664551,
"learning_rate": 1.529461883408072e-05,
"loss": 2.5657,
"step": 10900
},
{
"epoch": 1.3390139987827145,
"grad_norm": 5.1567463874816895,
"learning_rate": 1.5249775784753365e-05,
"loss": 2.5362,
"step": 11000
},
{
"epoch": 1.3390139987827145,
"eval_loss": 2.6256721019744873,
"eval_runtime": 6.8781,
"eval_samples_per_second": 145.389,
"eval_steps_per_second": 36.347,
"step": 11000
},
{
"epoch": 1.351186853317103,
"grad_norm": 5.355208396911621,
"learning_rate": 1.520493273542601e-05,
"loss": 2.5748,
"step": 11100
},
{
"epoch": 1.3633597078514912,
"grad_norm": 4.878857612609863,
"learning_rate": 1.5160089686098656e-05,
"loss": 2.5768,
"step": 11200
},
{
"epoch": 1.3755325623858794,
"grad_norm": 5.551296234130859,
"learning_rate": 1.51152466367713e-05,
"loss": 2.5616,
"step": 11300
},
{
"epoch": 1.3877054169202678,
"grad_norm": 4.894459247589111,
"learning_rate": 1.5070403587443946e-05,
"loss": 2.5366,
"step": 11400
},
{
"epoch": 1.399878271454656,
"grad_norm": 5.237545967102051,
"learning_rate": 1.5025560538116592e-05,
"loss": 2.5516,
"step": 11500
},
{
"epoch": 1.399878271454656,
"eval_loss": 2.6034560203552246,
"eval_runtime": 6.9038,
"eval_samples_per_second": 144.848,
"eval_steps_per_second": 36.212,
"step": 11500
},
{
"epoch": 1.4120511259890445,
"grad_norm": 4.714597702026367,
"learning_rate": 1.4980717488789238e-05,
"loss": 2.5384,
"step": 11600
},
{
"epoch": 1.4242239805234327,
"grad_norm": 4.776740550994873,
"learning_rate": 1.4935874439461883e-05,
"loss": 2.5733,
"step": 11700
},
{
"epoch": 1.4363968350578211,
"grad_norm": 5.181590557098389,
"learning_rate": 1.4891031390134529e-05,
"loss": 2.5698,
"step": 11800
},
{
"epoch": 1.4485696895922093,
"grad_norm": 4.948436737060547,
"learning_rate": 1.4846188340807177e-05,
"loss": 2.5288,
"step": 11900
},
{
"epoch": 1.4607425441265978,
"grad_norm": 5.549213409423828,
"learning_rate": 1.4801345291479822e-05,
"loss": 2.5291,
"step": 12000
},
{
"epoch": 1.4607425441265978,
"eval_loss": 2.5940563678741455,
"eval_runtime": 6.8627,
"eval_samples_per_second": 145.716,
"eval_steps_per_second": 36.429,
"step": 12000
},
{
"epoch": 1.472915398660986,
"grad_norm": 6.372870445251465,
"learning_rate": 1.4756502242152468e-05,
"loss": 2.5457,
"step": 12100
},
{
"epoch": 1.4850882531953742,
"grad_norm": 5.433255195617676,
"learning_rate": 1.4711659192825114e-05,
"loss": 2.5521,
"step": 12200
},
{
"epoch": 1.4972611077297626,
"grad_norm": 5.604691028594971,
"learning_rate": 1.466681614349776e-05,
"loss": 2.5585,
"step": 12300
},
{
"epoch": 1.509433962264151,
"grad_norm": 5.348121643066406,
"learning_rate": 1.4621973094170405e-05,
"loss": 2.527,
"step": 12400
},
{
"epoch": 1.5216068167985393,
"grad_norm": 4.68524694442749,
"learning_rate": 1.4577130044843051e-05,
"loss": 2.5351,
"step": 12500
},
{
"epoch": 1.5216068167985393,
"eval_loss": 2.5787315368652344,
"eval_runtime": 6.8521,
"eval_samples_per_second": 145.94,
"eval_steps_per_second": 36.485,
"step": 12500
},
{
"epoch": 1.5337796713329275,
"grad_norm": 5.369399070739746,
"learning_rate": 1.4532286995515697e-05,
"loss": 2.5457,
"step": 12600
},
{
"epoch": 1.545952525867316,
"grad_norm": 5.384763717651367,
"learning_rate": 1.4487892376681615e-05,
"loss": 2.5603,
"step": 12700
},
{
"epoch": 1.5581253804017043,
"grad_norm": 5.1856369972229,
"learning_rate": 1.4443049327354261e-05,
"loss": 2.5531,
"step": 12800
},
{
"epoch": 1.5702982349360926,
"grad_norm": 5.600665092468262,
"learning_rate": 1.4398206278026907e-05,
"loss": 2.5226,
"step": 12900
},
{
"epoch": 1.5824710894704808,
"grad_norm": 5.185864448547363,
"learning_rate": 1.4353363228699552e-05,
"loss": 2.5585,
"step": 13000
},
{
"epoch": 1.5824710894704808,
"eval_loss": 2.5597262382507324,
"eval_runtime": 6.9067,
"eval_samples_per_second": 144.787,
"eval_steps_per_second": 36.197,
"step": 13000
},
{
"epoch": 1.5946439440048692,
"grad_norm": 5.945424556732178,
"learning_rate": 1.4308520179372198e-05,
"loss": 2.5447,
"step": 13100
},
{
"epoch": 1.6068167985392574,
"grad_norm": 4.447841167449951,
"learning_rate": 1.4263677130044844e-05,
"loss": 2.5638,
"step": 13200
},
{
"epoch": 1.6189896530736458,
"grad_norm": 4.947375297546387,
"learning_rate": 1.421883408071749e-05,
"loss": 2.5245,
"step": 13300
},
{
"epoch": 1.631162507608034,
"grad_norm": 5.11275053024292,
"learning_rate": 1.4173991031390135e-05,
"loss": 2.504,
"step": 13400
},
{
"epoch": 1.6433353621424223,
"grad_norm": 5.144463539123535,
"learning_rate": 1.4129147982062781e-05,
"loss": 2.5517,
"step": 13500
},
{
"epoch": 1.6433353621424223,
"eval_loss": 2.5378565788269043,
"eval_runtime": 6.8542,
"eval_samples_per_second": 145.896,
"eval_steps_per_second": 36.474,
"step": 13500
},
{
"epoch": 1.6555082166768107,
"grad_norm": 6.138312816619873,
"learning_rate": 1.4084304932735427e-05,
"loss": 2.5334,
"step": 13600
},
{
"epoch": 1.6676810712111991,
"grad_norm": 4.641015529632568,
"learning_rate": 1.4039461883408072e-05,
"loss": 2.5692,
"step": 13700
},
{
"epoch": 1.6798539257455873,
"grad_norm": 5.140405178070068,
"learning_rate": 1.3994618834080718e-05,
"loss": 2.5462,
"step": 13800
},
{
"epoch": 1.6920267802799756,
"grad_norm": 5.093076705932617,
"learning_rate": 1.3949775784753366e-05,
"loss": 2.5227,
"step": 13900
},
{
"epoch": 1.704199634814364,
"grad_norm": 5.549164772033691,
"learning_rate": 1.3904932735426011e-05,
"loss": 2.5469,
"step": 14000
},
{
"epoch": 1.704199634814364,
"eval_loss": 2.5302209854125977,
"eval_runtime": 6.8833,
"eval_samples_per_second": 145.279,
"eval_steps_per_second": 36.32,
"step": 14000
},
{
"epoch": 1.7163724893487524,
"grad_norm": 5.112196922302246,
"learning_rate": 1.3860089686098657e-05,
"loss": 2.4753,
"step": 14100
},
{
"epoch": 1.7285453438831406,
"grad_norm": 4.9223313331604,
"learning_rate": 1.3815246636771303e-05,
"loss": 2.5477,
"step": 14200
},
{
"epoch": 1.7407181984175288,
"grad_norm": 5.270020484924316,
"learning_rate": 1.3770403587443948e-05,
"loss": 2.5141,
"step": 14300
},
{
"epoch": 1.7528910529519173,
"grad_norm": 5.377967357635498,
"learning_rate": 1.3725560538116594e-05,
"loss": 2.5151,
"step": 14400
},
{
"epoch": 1.7650639074863055,
"grad_norm": 4.732293605804443,
"learning_rate": 1.368071748878924e-05,
"loss": 2.559,
"step": 14500
},
{
"epoch": 1.7650639074863055,
"eval_loss": 2.5161020755767822,
"eval_runtime": 6.8279,
"eval_samples_per_second": 146.457,
"eval_steps_per_second": 36.614,
"step": 14500
},
{
"epoch": 1.777236762020694,
"grad_norm": 5.2639241218566895,
"learning_rate": 1.3635874439461884e-05,
"loss": 2.5199,
"step": 14600
},
{
"epoch": 1.7894096165550821,
"grad_norm": 5.222829818725586,
"learning_rate": 1.3591479820627804e-05,
"loss": 2.5122,
"step": 14700
},
{
"epoch": 1.8015824710894703,
"grad_norm": 5.396998882293701,
"learning_rate": 1.354663677130045e-05,
"loss": 2.5665,
"step": 14800
},
{
"epoch": 1.8137553256238588,
"grad_norm": 5.598328113555908,
"learning_rate": 1.3501793721973096e-05,
"loss": 2.5061,
"step": 14900
},
{
"epoch": 1.8259281801582472,
"grad_norm": 4.519299507141113,
"learning_rate": 1.3456950672645741e-05,
"loss": 2.5173,
"step": 15000
},
{
"epoch": 1.8259281801582472,
"eval_loss": 2.505549430847168,
"eval_runtime": 6.8476,
"eval_samples_per_second": 146.036,
"eval_steps_per_second": 36.509,
"step": 15000
},
{
"epoch": 1.8381010346926354,
"grad_norm": 5.07867431640625,
"learning_rate": 1.3412107623318387e-05,
"loss": 2.5085,
"step": 15100
},
{
"epoch": 1.8502738892270236,
"grad_norm": 4.80793571472168,
"learning_rate": 1.3367264573991033e-05,
"loss": 2.5269,
"step": 15200
},
{
"epoch": 1.862446743761412,
"grad_norm": 5.122992992401123,
"learning_rate": 1.3322421524663679e-05,
"loss": 2.5165,
"step": 15300
},
{
"epoch": 1.8746195982958005,
"grad_norm": 5.070724010467529,
"learning_rate": 1.3277578475336324e-05,
"loss": 2.4733,
"step": 15400
},
{
"epoch": 1.8867924528301887,
"grad_norm": 4.850822448730469,
"learning_rate": 1.3232735426008968e-05,
"loss": 2.5045,
"step": 15500
},
{
"epoch": 1.8867924528301887,
"eval_loss": 2.49042010307312,
"eval_runtime": 6.9202,
"eval_samples_per_second": 144.505,
"eval_steps_per_second": 36.126,
"step": 15500
},
{
"epoch": 1.898965307364577,
"grad_norm": 5.182281494140625,
"learning_rate": 1.3187892376681614e-05,
"loss": 2.4858,
"step": 15600
},
{
"epoch": 1.9111381618989653,
"grad_norm": 4.803709030151367,
"learning_rate": 1.314304932735426e-05,
"loss": 2.5043,
"step": 15700
},
{
"epoch": 1.9233110164333538,
"grad_norm": 5.211897850036621,
"learning_rate": 1.3098206278026905e-05,
"loss": 2.4974,
"step": 15800
},
{
"epoch": 1.935483870967742,
"grad_norm": 4.982048988342285,
"learning_rate": 1.3053363228699553e-05,
"loss": 2.4901,
"step": 15900
},
{
"epoch": 1.9476567255021302,
"grad_norm": 5.34013557434082,
"learning_rate": 1.3008520179372199e-05,
"loss": 2.4938,
"step": 16000
},
{
"epoch": 1.9476567255021302,
"eval_loss": 2.479241371154785,
"eval_runtime": 6.9209,
"eval_samples_per_second": 144.49,
"eval_steps_per_second": 36.122,
"step": 16000
},
{
"epoch": 1.9598295800365184,
"grad_norm": 4.926109790802002,
"learning_rate": 1.2963677130044844e-05,
"loss": 2.522,
"step": 16100
},
{
"epoch": 1.9720024345709068,
"grad_norm": 5.252937316894531,
"learning_rate": 1.291883408071749e-05,
"loss": 2.4979,
"step": 16200
},
{
"epoch": 1.9841752891052953,
"grad_norm": 4.676843166351318,
"learning_rate": 1.2873991031390136e-05,
"loss": 2.5011,
"step": 16300
},
{
"epoch": 1.9963481436396835,
"grad_norm": 4.4982171058654785,
"learning_rate": 1.2829147982062782e-05,
"loss": 2.5232,
"step": 16400
},
{
"epoch": 2.0085209981740717,
"grad_norm": 5.115514278411865,
"learning_rate": 1.2784304932735427e-05,
"loss": 2.4807,
"step": 16500
},
{
"epoch": 2.0085209981740717,
"eval_loss": 2.4553143978118896,
"eval_runtime": 6.8911,
"eval_samples_per_second": 145.114,
"eval_steps_per_second": 36.279,
"step": 16500
},
{
"epoch": 2.02069385270846,
"grad_norm": 5.778520107269287,
"learning_rate": 1.2739461883408073e-05,
"loss": 2.3637,
"step": 16600
},
{
"epoch": 2.0328667072428486,
"grad_norm": 4.936229705810547,
"learning_rate": 1.2694618834080719e-05,
"loss": 2.3936,
"step": 16700
},
{
"epoch": 2.045039561777237,
"grad_norm": 6.013847827911377,
"learning_rate": 1.2649775784753364e-05,
"loss": 2.3953,
"step": 16800
},
{
"epoch": 2.057212416311625,
"grad_norm": 6.078458786010742,
"learning_rate": 1.2605381165919283e-05,
"loss": 2.3312,
"step": 16900
},
{
"epoch": 2.069385270846013,
"grad_norm": 5.697019100189209,
"learning_rate": 1.2560538116591929e-05,
"loss": 2.334,
"step": 17000
},
{
"epoch": 2.069385270846013,
"eval_loss": 2.4449574947357178,
"eval_runtime": 6.9363,
"eval_samples_per_second": 144.169,
"eval_steps_per_second": 36.042,
"step": 17000
},
{
"epoch": 2.081558125380402,
"grad_norm": 5.652517795562744,
"learning_rate": 1.2515695067264574e-05,
"loss": 2.3902,
"step": 17100
},
{
"epoch": 2.09373097991479,
"grad_norm": 6.007380485534668,
"learning_rate": 1.247085201793722e-05,
"loss": 2.3629,
"step": 17200
},
{
"epoch": 2.1059038344491783,
"grad_norm": 5.070584774017334,
"learning_rate": 1.2426008968609866e-05,
"loss": 2.3523,
"step": 17300
},
{
"epoch": 2.1180766889835665,
"grad_norm": 5.079153537750244,
"learning_rate": 1.2381165919282512e-05,
"loss": 2.3429,
"step": 17400
},
{
"epoch": 2.130249543517955,
"grad_norm": 5.278266906738281,
"learning_rate": 1.2336322869955157e-05,
"loss": 2.2969,
"step": 17500
},
{
"epoch": 2.130249543517955,
"eval_loss": 2.4217474460601807,
"eval_runtime": 6.9637,
"eval_samples_per_second": 143.601,
"eval_steps_per_second": 35.9,
"step": 17500
},
{
"epoch": 2.1424223980523434,
"grad_norm": 5.2419633865356445,
"learning_rate": 1.2291479820627803e-05,
"loss": 2.3671,
"step": 17600
},
{
"epoch": 2.1545952525867316,
"grad_norm": 5.445255279541016,
"learning_rate": 1.2246636771300449e-05,
"loss": 2.3834,
"step": 17700
},
{
"epoch": 2.16676810712112,
"grad_norm": 5.891075134277344,
"learning_rate": 1.2201793721973095e-05,
"loss": 2.36,
"step": 17800
},
{
"epoch": 2.178940961655508,
"grad_norm": 5.8141865730285645,
"learning_rate": 1.215695067264574e-05,
"loss": 2.3596,
"step": 17900
},
{
"epoch": 2.1911138161898966,
"grad_norm": 5.558561325073242,
"learning_rate": 1.2112107623318388e-05,
"loss": 2.3926,
"step": 18000
},
{
"epoch": 2.1911138161898966,
"eval_loss": 2.415804624557495,
"eval_runtime": 6.8469,
"eval_samples_per_second": 146.052,
"eval_steps_per_second": 36.513,
"step": 18000
},
{
"epoch": 2.203286670724285,
"grad_norm": 5.968663692474365,
"learning_rate": 1.2067264573991033e-05,
"loss": 2.3609,
"step": 18100
},
{
"epoch": 2.215459525258673,
"grad_norm": 5.241644382476807,
"learning_rate": 1.2022421524663679e-05,
"loss": 2.3634,
"step": 18200
},
{
"epoch": 2.2276323797930613,
"grad_norm": 6.328832149505615,
"learning_rate": 1.1977578475336325e-05,
"loss": 2.3465,
"step": 18300
},
{
"epoch": 2.23980523432745,
"grad_norm": 5.125701904296875,
"learning_rate": 1.193273542600897e-05,
"loss": 2.3171,
"step": 18400
},
{
"epoch": 2.251978088861838,
"grad_norm": 4.962270259857178,
"learning_rate": 1.1887892376681616e-05,
"loss": 2.3739,
"step": 18500
},
{
"epoch": 2.251978088861838,
"eval_loss": 2.4065887928009033,
"eval_runtime": 6.9359,
"eval_samples_per_second": 144.178,
"eval_steps_per_second": 36.045,
"step": 18500
},
{
"epoch": 2.2641509433962264,
"grad_norm": 5.895593643188477,
"learning_rate": 1.1843049327354262e-05,
"loss": 2.3656,
"step": 18600
},
{
"epoch": 2.2763237979306146,
"grad_norm": 6.21762752532959,
"learning_rate": 1.1798206278026906e-05,
"loss": 2.3575,
"step": 18700
},
{
"epoch": 2.2884966524650032,
"grad_norm": 5.935133934020996,
"learning_rate": 1.1753363228699552e-05,
"loss": 2.3687,
"step": 18800
},
{
"epoch": 2.3006695069993914,
"grad_norm": 5.431483268737793,
"learning_rate": 1.1708520179372198e-05,
"loss": 2.3465,
"step": 18900
},
{
"epoch": 2.3128423615337796,
"grad_norm": 6.319828510284424,
"learning_rate": 1.1664125560538118e-05,
"loss": 2.3659,
"step": 19000
},
{
"epoch": 2.3128423615337796,
"eval_loss": 2.390819787979126,
"eval_runtime": 6.9389,
"eval_samples_per_second": 144.115,
"eval_steps_per_second": 36.029,
"step": 19000
},
{
"epoch": 2.325015216068168,
"grad_norm": 5.955752372741699,
"learning_rate": 1.1619282511210763e-05,
"loss": 2.3702,
"step": 19100
},
{
"epoch": 2.337188070602556,
"grad_norm": 5.977270603179932,
"learning_rate": 1.157443946188341e-05,
"loss": 2.3935,
"step": 19200
},
{
"epoch": 2.3493609251369447,
"grad_norm": 5.417830944061279,
"learning_rate": 1.1529596412556055e-05,
"loss": 2.359,
"step": 19300
},
{
"epoch": 2.361533779671333,
"grad_norm": 5.452037334442139,
"learning_rate": 1.14847533632287e-05,
"loss": 2.3496,
"step": 19400
},
{
"epoch": 2.373706634205721,
"grad_norm": 4.931158065795898,
"learning_rate": 1.1439910313901346e-05,
"loss": 2.3483,
"step": 19500
},
{
"epoch": 2.373706634205721,
"eval_loss": 2.3805489540100098,
"eval_runtime": 6.803,
"eval_samples_per_second": 146.994,
"eval_steps_per_second": 36.749,
"step": 19500
},
{
"epoch": 2.38587948874011,
"grad_norm": 5.650387287139893,
"learning_rate": 1.1395067264573992e-05,
"loss": 2.3644,
"step": 19600
},
{
"epoch": 2.398052343274498,
"grad_norm": 5.70589542388916,
"learning_rate": 1.1350224215246636e-05,
"loss": 2.3472,
"step": 19700
},
{
"epoch": 2.410225197808886,
"grad_norm": 5.833774566650391,
"learning_rate": 1.1305381165919282e-05,
"loss": 2.3663,
"step": 19800
},
{
"epoch": 2.4223980523432744,
"grad_norm": 5.079782485961914,
"learning_rate": 1.1260538116591928e-05,
"loss": 2.3726,
"step": 19900
},
{
"epoch": 2.4345709068776626,
"grad_norm": 5.578153610229492,
"learning_rate": 1.1215695067264577e-05,
"loss": 2.3432,
"step": 20000
},
{
"epoch": 2.4345709068776626,
"eval_loss": 2.3689472675323486,
"eval_runtime": 6.993,
"eval_samples_per_second": 143.0,
"eval_steps_per_second": 35.75,
"step": 20000
},
{
"epoch": 2.4467437614120513,
"grad_norm": 5.551452159881592,
"learning_rate": 1.117085201793722e-05,
"loss": 2.3658,
"step": 20100
},
{
"epoch": 2.4589166159464395,
"grad_norm": 5.28959321975708,
"learning_rate": 1.1126008968609866e-05,
"loss": 2.3526,
"step": 20200
},
{
"epoch": 2.4710894704808277,
"grad_norm": 5.358762741088867,
"learning_rate": 1.1081165919282512e-05,
"loss": 2.3161,
"step": 20300
},
{
"epoch": 2.483262325015216,
"grad_norm": 5.633576393127441,
"learning_rate": 1.1036322869955158e-05,
"loss": 2.3778,
"step": 20400
},
{
"epoch": 2.495435179549604,
"grad_norm": 5.258509635925293,
"learning_rate": 1.0991479820627804e-05,
"loss": 2.3538,
"step": 20500
},
{
"epoch": 2.495435179549604,
"eval_loss": 2.358330488204956,
"eval_runtime": 6.9369,
"eval_samples_per_second": 144.156,
"eval_steps_per_second": 36.039,
"step": 20500
},
{
"epoch": 2.507608034083993,
"grad_norm": 5.632132053375244,
"learning_rate": 1.094663677130045e-05,
"loss": 2.3514,
"step": 20600
},
{
"epoch": 2.519780888618381,
"grad_norm": 5.449893951416016,
"learning_rate": 1.0901793721973095e-05,
"loss": 2.3404,
"step": 20700
},
{
"epoch": 2.531953743152769,
"grad_norm": 5.66605281829834,
"learning_rate": 1.085695067264574e-05,
"loss": 2.3335,
"step": 20800
},
{
"epoch": 2.544126597687158,
"grad_norm": 6.729547500610352,
"learning_rate": 1.0812107623318387e-05,
"loss": 2.3784,
"step": 20900
},
{
"epoch": 2.556299452221546,
"grad_norm": 5.5277581214904785,
"learning_rate": 1.0767713004484305e-05,
"loss": 2.3424,
"step": 21000
},
{
"epoch": 2.556299452221546,
"eval_loss": 2.3434271812438965,
"eval_runtime": 6.8629,
"eval_samples_per_second": 145.712,
"eval_steps_per_second": 36.428,
"step": 21000
},
{
"epoch": 2.5684723067559343,
"grad_norm": 5.892464637756348,
"learning_rate": 1.072286995515695e-05,
"loss": 2.3577,
"step": 21100
},
{
"epoch": 2.5806451612903225,
"grad_norm": 5.313469409942627,
"learning_rate": 1.0678026905829597e-05,
"loss": 2.3489,
"step": 21200
},
{
"epoch": 2.5928180158247107,
"grad_norm": 5.569064140319824,
"learning_rate": 1.0633183856502242e-05,
"loss": 2.3828,
"step": 21300
},
{
"epoch": 2.6049908703590994,
"grad_norm": 6.133281707763672,
"learning_rate": 1.0588340807174888e-05,
"loss": 2.3203,
"step": 21400
},
{
"epoch": 2.6171637248934876,
"grad_norm": 5.569573402404785,
"learning_rate": 1.0543497757847534e-05,
"loss": 2.3508,
"step": 21500
},
{
"epoch": 2.6171637248934876,
"eval_loss": 2.3320422172546387,
"eval_runtime": 6.8431,
"eval_samples_per_second": 146.132,
"eval_steps_per_second": 36.533,
"step": 21500
},
{
"epoch": 2.629336579427876,
"grad_norm": 5.560952186584473,
"learning_rate": 1.049865470852018e-05,
"loss": 2.3532,
"step": 21600
},
{
"epoch": 2.641509433962264,
"grad_norm": 5.652987957000732,
"learning_rate": 1.0453811659192825e-05,
"loss": 2.3233,
"step": 21700
},
{
"epoch": 2.653682288496652,
"grad_norm": 5.666792869567871,
"learning_rate": 1.0408968609865471e-05,
"loss": 2.353,
"step": 21800
},
{
"epoch": 2.665855143031041,
"grad_norm": 5.652164936065674,
"learning_rate": 1.0364125560538117e-05,
"loss": 2.3483,
"step": 21900
},
{
"epoch": 2.678027997565429,
"grad_norm": 5.158956527709961,
"learning_rate": 1.0319282511210764e-05,
"loss": 2.3344,
"step": 22000
},
{
"epoch": 2.678027997565429,
"eval_loss": 2.3204360008239746,
"eval_runtime": 6.8964,
"eval_samples_per_second": 145.003,
"eval_steps_per_second": 36.251,
"step": 22000
},
{
"epoch": 2.6902008520998173,
"grad_norm": 4.993370056152344,
"learning_rate": 1.027443946188341e-05,
"loss": 2.3185,
"step": 22100
},
{
"epoch": 2.702373706634206,
"grad_norm": 5.251499652862549,
"learning_rate": 1.0229596412556056e-05,
"loss": 2.3463,
"step": 22200
},
{
"epoch": 2.714546561168594,
"grad_norm": 5.155273914337158,
"learning_rate": 1.0184753363228701e-05,
"loss": 2.3299,
"step": 22300
},
{
"epoch": 2.7267194157029824,
"grad_norm": 4.445164680480957,
"learning_rate": 1.0139910313901347e-05,
"loss": 2.3368,
"step": 22400
},
{
"epoch": 2.7388922702373706,
"grad_norm": 5.968411445617676,
"learning_rate": 1.0095067264573993e-05,
"loss": 2.321,
"step": 22500
},
{
"epoch": 2.7388922702373706,
"eval_loss": 2.3084633350372314,
"eval_runtime": 6.9774,
"eval_samples_per_second": 143.32,
"eval_steps_per_second": 35.83,
"step": 22500
},
{
"epoch": 2.751065124771759,
"grad_norm": 5.2266364097595215,
"learning_rate": 1.0050224215246638e-05,
"loss": 2.3387,
"step": 22600
},
{
"epoch": 2.7632379793061475,
"grad_norm": 5.649938583374023,
"learning_rate": 1.0005381165919284e-05,
"loss": 2.3388,
"step": 22700
},
{
"epoch": 2.7754108338405357,
"grad_norm": 5.603872299194336,
"learning_rate": 9.96053811659193e-06,
"loss": 2.3331,
"step": 22800
},
{
"epoch": 2.787583688374924,
"grad_norm": 5.831801891326904,
"learning_rate": 9.915695067264574e-06,
"loss": 2.3509,
"step": 22900
},
{
"epoch": 2.799756542909312,
"grad_norm": 5.071148872375488,
"learning_rate": 9.871300448430494e-06,
"loss": 2.3296,
"step": 23000
},
{
"epoch": 2.799756542909312,
"eval_loss": 2.298048257827759,
"eval_runtime": 6.8909,
"eval_samples_per_second": 145.119,
"eval_steps_per_second": 36.28,
"step": 23000
},
{
"epoch": 2.8119293974437003,
"grad_norm": 5.613708972930908,
"learning_rate": 9.82645739910314e-06,
"loss": 2.3458,
"step": 23100
},
{
"epoch": 2.824102251978089,
"grad_norm": 6.964206218719482,
"learning_rate": 9.781614349775786e-06,
"loss": 2.3523,
"step": 23200
},
{
"epoch": 2.836275106512477,
"grad_norm": 6.069615364074707,
"learning_rate": 9.737219730941706e-06,
"loss": 2.3364,
"step": 23300
},
{
"epoch": 2.8484479610468654,
"grad_norm": 4.563328266143799,
"learning_rate": 9.69237668161435e-06,
"loss": 2.3164,
"step": 23400
},
{
"epoch": 2.860620815581254,
"grad_norm": 5.069984436035156,
"learning_rate": 9.647533632286995e-06,
"loss": 2.3347,
"step": 23500
},
{
"epoch": 2.860620815581254,
"eval_loss": 2.2902982234954834,
"eval_runtime": 6.9027,
"eval_samples_per_second": 144.87,
"eval_steps_per_second": 36.218,
"step": 23500
},
{
"epoch": 2.8727936701156422,
"grad_norm": 5.443928241729736,
"learning_rate": 9.602690582959641e-06,
"loss": 2.3211,
"step": 23600
},
{
"epoch": 2.8849665246500305,
"grad_norm": 5.5851664543151855,
"learning_rate": 9.557847533632287e-06,
"loss": 2.3469,
"step": 23700
},
{
"epoch": 2.8971393791844187,
"grad_norm": 5.386264324188232,
"learning_rate": 9.513004484304934e-06,
"loss": 2.3303,
"step": 23800
},
{
"epoch": 2.909312233718807,
"grad_norm": 5.505928993225098,
"learning_rate": 9.46816143497758e-06,
"loss": 2.3396,
"step": 23900
},
{
"epoch": 2.9214850882531955,
"grad_norm": 5.181743621826172,
"learning_rate": 9.423318385650226e-06,
"loss": 2.3214,
"step": 24000
},
{
"epoch": 2.9214850882531955,
"eval_loss": 2.28114652633667,
"eval_runtime": 6.8437,
"eval_samples_per_second": 146.121,
"eval_steps_per_second": 36.53,
"step": 24000
},
{
"epoch": 2.9336579427875837,
"grad_norm": 6.292041301727295,
"learning_rate": 9.378475336322872e-06,
"loss": 2.3341,
"step": 24100
},
{
"epoch": 2.945830797321972,
"grad_norm": 5.232330322265625,
"learning_rate": 9.333632286995517e-06,
"loss": 2.2984,
"step": 24200
},
{
"epoch": 2.95800365185636,
"grad_norm": 5.351822376251221,
"learning_rate": 9.288789237668161e-06,
"loss": 2.326,
"step": 24300
},
{
"epoch": 2.9701765063907484,
"grad_norm": 5.880828380584717,
"learning_rate": 9.243946188340807e-06,
"loss": 2.3399,
"step": 24400
},
{
"epoch": 2.982349360925137,
"grad_norm": 5.407314777374268,
"learning_rate": 9.199103139013453e-06,
"loss": 2.3007,
"step": 24500
},
{
"epoch": 2.982349360925137,
"eval_loss": 2.273526906967163,
"eval_runtime": 6.9358,
"eval_samples_per_second": 144.179,
"eval_steps_per_second": 36.045,
"step": 24500
},
{
"epoch": 2.9945222154595252,
"grad_norm": 5.49412727355957,
"learning_rate": 9.154260089686099e-06,
"loss": 2.3325,
"step": 24600
},
{
"epoch": 3.0066950699939134,
"grad_norm": 5.382359981536865,
"learning_rate": 9.109417040358746e-06,
"loss": 2.2779,
"step": 24700
},
{
"epoch": 3.018867924528302,
"grad_norm": 6.563231945037842,
"learning_rate": 9.064573991031392e-06,
"loss": 2.2142,
"step": 24800
},
{
"epoch": 3.0310407790626903,
"grad_norm": 6.05570650100708,
"learning_rate": 9.019730941704037e-06,
"loss": 2.1952,
"step": 24900
},
{
"epoch": 3.0432136335970785,
"grad_norm": 5.2819366455078125,
"learning_rate": 8.974887892376683e-06,
"loss": 2.2244,
"step": 25000
},
{
"epoch": 3.0432136335970785,
"eval_loss": 2.253713846206665,
"eval_runtime": 6.8869,
"eval_samples_per_second": 145.202,
"eval_steps_per_second": 36.301,
"step": 25000
},
{
"epoch": 3.0553864881314667,
"grad_norm": 5.801946640014648,
"learning_rate": 8.930044843049329e-06,
"loss": 2.2423,
"step": 25100
},
{
"epoch": 3.067559342665855,
"grad_norm": 5.829814910888672,
"learning_rate": 8.885201793721973e-06,
"loss": 2.2372,
"step": 25200
},
{
"epoch": 3.0797321972002436,
"grad_norm": 5.983118534088135,
"learning_rate": 8.840358744394619e-06,
"loss": 2.2363,
"step": 25300
},
{
"epoch": 3.091905051734632,
"grad_norm": 5.694368839263916,
"learning_rate": 8.795515695067264e-06,
"loss": 2.1785,
"step": 25400
},
{
"epoch": 3.10407790626902,
"grad_norm": 5.976083755493164,
"learning_rate": 8.75067264573991e-06,
"loss": 2.2061,
"step": 25500
},
{
"epoch": 3.10407790626902,
"eval_loss": 2.2468533515930176,
"eval_runtime": 6.9894,
"eval_samples_per_second": 143.075,
"eval_steps_per_second": 35.769,
"step": 25500
},
{
"epoch": 3.1162507608034082,
"grad_norm": 5.972872734069824,
"learning_rate": 8.705829596412557e-06,
"loss": 2.2269,
"step": 25600
},
{
"epoch": 3.128423615337797,
"grad_norm": 5.245333671569824,
"learning_rate": 8.660986547085203e-06,
"loss": 2.2519,
"step": 25700
},
{
"epoch": 3.140596469872185,
"grad_norm": 6.581233501434326,
"learning_rate": 8.616143497757849e-06,
"loss": 2.2181,
"step": 25800
},
{
"epoch": 3.1527693244065733,
"grad_norm": 6.18913459777832,
"learning_rate": 8.571300448430495e-06,
"loss": 2.1892,
"step": 25900
},
{
"epoch": 3.1649421789409615,
"grad_norm": 5.771265983581543,
"learning_rate": 8.52645739910314e-06,
"loss": 2.1789,
"step": 26000
},
{
"epoch": 3.1649421789409615,
"eval_loss": 2.2344589233398438,
"eval_runtime": 6.8267,
"eval_samples_per_second": 146.484,
"eval_steps_per_second": 36.621,
"step": 26000
},
{
"epoch": 3.17711503347535,
"grad_norm": 6.273107528686523,
"learning_rate": 8.481614349775784e-06,
"loss": 2.2029,
"step": 26100
},
{
"epoch": 3.1892878880097384,
"grad_norm": 6.768197059631348,
"learning_rate": 8.43677130044843e-06,
"loss": 2.2042,
"step": 26200
},
{
"epoch": 3.2014607425441266,
"grad_norm": 7.103708267211914,
"learning_rate": 8.391928251121076e-06,
"loss": 2.2142,
"step": 26300
},
{
"epoch": 3.213633597078515,
"grad_norm": 6.05976676940918,
"learning_rate": 8.347085201793723e-06,
"loss": 2.1747,
"step": 26400
},
{
"epoch": 3.225806451612903,
"grad_norm": 5.711021900177002,
"learning_rate": 8.302242152466369e-06,
"loss": 2.2039,
"step": 26500
},
{
"epoch": 3.225806451612903,
"eval_loss": 2.2293026447296143,
"eval_runtime": 6.8771,
"eval_samples_per_second": 145.41,
"eval_steps_per_second": 36.352,
"step": 26500
},
{
"epoch": 3.2379793061472917,
"grad_norm": 5.777741432189941,
"learning_rate": 8.257399103139015e-06,
"loss": 2.2259,
"step": 26600
},
{
"epoch": 3.25015216068168,
"grad_norm": 5.676499843597412,
"learning_rate": 8.21255605381166e-06,
"loss": 2.1918,
"step": 26700
},
{
"epoch": 3.262325015216068,
"grad_norm": 6.470264911651611,
"learning_rate": 8.167713004484306e-06,
"loss": 2.212,
"step": 26800
},
{
"epoch": 3.2744978697504563,
"grad_norm": 6.308848857879639,
"learning_rate": 8.122869955156952e-06,
"loss": 2.2138,
"step": 26900
},
{
"epoch": 3.286670724284845,
"grad_norm": 5.39501428604126,
"learning_rate": 8.078026905829596e-06,
"loss": 2.248,
"step": 27000
},
{
"epoch": 3.286670724284845,
"eval_loss": 2.2192747592926025,
"eval_runtime": 6.9636,
"eval_samples_per_second": 143.603,
"eval_steps_per_second": 35.901,
"step": 27000
},
{
"epoch": 3.298843578819233,
"grad_norm": 5.875838756561279,
"learning_rate": 8.033183856502242e-06,
"loss": 2.2131,
"step": 27100
},
{
"epoch": 3.3110164333536214,
"grad_norm": 5.159265518188477,
"learning_rate": 7.988340807174887e-06,
"loss": 2.2037,
"step": 27200
},
{
"epoch": 3.3231892878880096,
"grad_norm": 5.619683265686035,
"learning_rate": 7.943946188340808e-06,
"loss": 2.1818,
"step": 27300
},
{
"epoch": 3.3353621424223983,
"grad_norm": 7.503751277923584,
"learning_rate": 7.899103139013453e-06,
"loss": 2.2087,
"step": 27400
},
{
"epoch": 3.3475349969567865,
"grad_norm": 5.3004937171936035,
"learning_rate": 7.854260089686099e-06,
"loss": 2.2151,
"step": 27500
},
{
"epoch": 3.3475349969567865,
"eval_loss": 2.209369659423828,
"eval_runtime": 6.9186,
"eval_samples_per_second": 144.537,
"eval_steps_per_second": 36.134,
"step": 27500
},
{
"epoch": 3.3597078514911747,
"grad_norm": 6.6273193359375,
"learning_rate": 7.809417040358745e-06,
"loss": 2.2208,
"step": 27600
},
{
"epoch": 3.371880706025563,
"grad_norm": 6.1234588623046875,
"learning_rate": 7.76457399103139e-06,
"loss": 2.1957,
"step": 27700
},
{
"epoch": 3.384053560559951,
"grad_norm": 6.082681655883789,
"learning_rate": 7.719730941704036e-06,
"loss": 2.2202,
"step": 27800
},
{
"epoch": 3.3962264150943398,
"grad_norm": 6.618956089019775,
"learning_rate": 7.674887892376682e-06,
"loss": 2.2045,
"step": 27900
},
{
"epoch": 3.408399269628728,
"grad_norm": 5.74383544921875,
"learning_rate": 7.630044843049328e-06,
"loss": 2.2308,
"step": 28000
},
{
"epoch": 3.408399269628728,
"eval_loss": 2.206360340118408,
"eval_runtime": 6.9078,
"eval_samples_per_second": 144.763,
"eval_steps_per_second": 36.191,
"step": 28000
},
{
"epoch": 3.420572124163116,
"grad_norm": 6.5505690574646,
"learning_rate": 7.5852017937219735e-06,
"loss": 2.2036,
"step": 28100
},
{
"epoch": 3.4327449786975044,
"grad_norm": 5.887704372406006,
"learning_rate": 7.540358744394619e-06,
"loss": 2.1714,
"step": 28200
},
{
"epoch": 3.444917833231893,
"grad_norm": 6.853738784790039,
"learning_rate": 7.495515695067265e-06,
"loss": 2.2269,
"step": 28300
},
{
"epoch": 3.4570906877662813,
"grad_norm": 5.702883243560791,
"learning_rate": 7.4506726457399115e-06,
"loss": 2.192,
"step": 28400
},
{
"epoch": 3.4692635423006695,
"grad_norm": 6.062043190002441,
"learning_rate": 7.405829596412557e-06,
"loss": 2.2095,
"step": 28500
},
{
"epoch": 3.4692635423006695,
"eval_loss": 2.1974008083343506,
"eval_runtime": 6.9256,
"eval_samples_per_second": 144.392,
"eval_steps_per_second": 36.098,
"step": 28500
},
{
"epoch": 3.4814363968350577,
"grad_norm": 5.228243827819824,
"learning_rate": 7.360986547085203e-06,
"loss": 2.221,
"step": 28600
},
{
"epoch": 3.4936092513694463,
"grad_norm": 5.8091607093811035,
"learning_rate": 7.316143497757848e-06,
"loss": 2.2062,
"step": 28700
},
{
"epoch": 3.5057821059038345,
"grad_norm": 4.786416053771973,
"learning_rate": 7.2713004484304936e-06,
"loss": 2.1716,
"step": 28800
},
{
"epoch": 3.5179549604382228,
"grad_norm": 6.92462158203125,
"learning_rate": 7.226457399103139e-06,
"loss": 2.211,
"step": 28900
},
{
"epoch": 3.530127814972611,
"grad_norm": 7.192811489105225,
"learning_rate": 7.181614349775785e-06,
"loss": 2.2085,
"step": 29000
},
{
"epoch": 3.530127814972611,
"eval_loss": 2.185516595840454,
"eval_runtime": 6.8385,
"eval_samples_per_second": 146.231,
"eval_steps_per_second": 36.558,
"step": 29000
},
{
"epoch": 3.542300669506999,
"grad_norm": 5.579026222229004,
"learning_rate": 7.136771300448431e-06,
"loss": 2.1974,
"step": 29100
},
{
"epoch": 3.554473524041388,
"grad_norm": 6.277022838592529,
"learning_rate": 7.0919282511210765e-06,
"loss": 2.1982,
"step": 29200
},
{
"epoch": 3.566646378575776,
"grad_norm": 5.486943244934082,
"learning_rate": 7.047533632286996e-06,
"loss": 2.1877,
"step": 29300
},
{
"epoch": 3.5788192331101643,
"grad_norm": 6.431853771209717,
"learning_rate": 7.0026905829596416e-06,
"loss": 2.2109,
"step": 29400
},
{
"epoch": 3.590992087644553,
"grad_norm": 6.601170539855957,
"learning_rate": 6.957847533632287e-06,
"loss": 2.2122,
"step": 29500
},
{
"epoch": 3.590992087644553,
"eval_loss": 2.1791625022888184,
"eval_runtime": 6.9136,
"eval_samples_per_second": 144.642,
"eval_steps_per_second": 36.161,
"step": 29500
},
{
"epoch": 3.603164942178941,
"grad_norm": 5.159702301025391,
"learning_rate": 6.913004484304934e-06,
"loss": 2.2246,
"step": 29600
},
{
"epoch": 3.6153377967133293,
"grad_norm": 6.260033130645752,
"learning_rate": 6.86816143497758e-06,
"loss": 2.2044,
"step": 29700
},
{
"epoch": 3.6275106512477175,
"grad_norm": 5.428004741668701,
"learning_rate": 6.823318385650225e-06,
"loss": 2.1735,
"step": 29800
},
{
"epoch": 3.6396835057821058,
"grad_norm": 5.895395278930664,
"learning_rate": 6.77847533632287e-06,
"loss": 2.2027,
"step": 29900
},
{
"epoch": 3.6518563603164944,
"grad_norm": 5.690395355224609,
"learning_rate": 6.733632286995516e-06,
"loss": 2.2145,
"step": 30000
},
{
"epoch": 3.6518563603164944,
"eval_loss": 2.177266836166382,
"eval_runtime": 7.0906,
"eval_samples_per_second": 141.032,
"eval_steps_per_second": 35.258,
"step": 30000
},
{
"epoch": 3.6640292148508826,
"grad_norm": 5.669330596923828,
"learning_rate": 6.688789237668162e-06,
"loss": 2.1954,
"step": 30100
},
{
"epoch": 3.676202069385271,
"grad_norm": 6.493986129760742,
"learning_rate": 6.643946188340807e-06,
"loss": 2.2011,
"step": 30200
},
{
"epoch": 3.688374923919659,
"grad_norm": 7.738183975219727,
"learning_rate": 6.599103139013453e-06,
"loss": 2.2347,
"step": 30300
},
{
"epoch": 3.7005477784540473,
"grad_norm": 6.565354347229004,
"learning_rate": 6.5542600896861e-06,
"loss": 2.1945,
"step": 30400
},
{
"epoch": 3.712720632988436,
"grad_norm": 6.189778804779053,
"learning_rate": 6.509417040358745e-06,
"loss": 2.2141,
"step": 30500
},
{
"epoch": 3.712720632988436,
"eval_loss": 2.168225049972534,
"eval_runtime": 6.9549,
"eval_samples_per_second": 143.785,
"eval_steps_per_second": 35.946,
"step": 30500
},
{
"epoch": 3.724893487522824,
"grad_norm": 5.11403226852417,
"learning_rate": 6.464573991031391e-06,
"loss": 2.2121,
"step": 30600
},
{
"epoch": 3.7370663420572123,
"grad_norm": 6.672878742218018,
"learning_rate": 6.419730941704037e-06,
"loss": 2.181,
"step": 30700
},
{
"epoch": 3.749239196591601,
"grad_norm": 5.224799156188965,
"learning_rate": 6.374887892376682e-06,
"loss": 2.1807,
"step": 30800
},
{
"epoch": 3.761412051125989,
"grad_norm": 6.442698955535889,
"learning_rate": 6.330493273542602e-06,
"loss": 2.2021,
"step": 30900
},
{
"epoch": 3.7735849056603774,
"grad_norm": 6.708118438720703,
"learning_rate": 6.285650224215248e-06,
"loss": 2.182,
"step": 31000
},
{
"epoch": 3.7735849056603774,
"eval_loss": 2.1596100330352783,
"eval_runtime": 6.872,
"eval_samples_per_second": 145.518,
"eval_steps_per_second": 36.38,
"step": 31000
},
{
"epoch": 3.7857577601947656,
"grad_norm": 6.288793087005615,
"learning_rate": 6.2408071748878926e-06,
"loss": 2.1886,
"step": 31100
},
{
"epoch": 3.797930614729154,
"grad_norm": 6.112220287322998,
"learning_rate": 6.195964125560538e-06,
"loss": 2.2107,
"step": 31200
},
{
"epoch": 3.8101034692635425,
"grad_norm": 6.044913291931152,
"learning_rate": 6.151121076233184e-06,
"loss": 2.2246,
"step": 31300
},
{
"epoch": 3.8222763237979307,
"grad_norm": 6.079142093658447,
"learning_rate": 6.10627802690583e-06,
"loss": 2.2187,
"step": 31400
},
{
"epoch": 3.834449178332319,
"grad_norm": 5.865757942199707,
"learning_rate": 6.0614349775784755e-06,
"loss": 2.2069,
"step": 31500
},
{
"epoch": 3.834449178332319,
"eval_loss": 2.156599760055542,
"eval_runtime": 6.8965,
"eval_samples_per_second": 145.001,
"eval_steps_per_second": 36.25,
"step": 31500
},
{
"epoch": 3.846622032866707,
"grad_norm": 6.289271354675293,
"learning_rate": 6.016591928251122e-06,
"loss": 2.2349,
"step": 31600
},
{
"epoch": 3.8587948874010953,
"grad_norm": 6.607455730438232,
"learning_rate": 5.971748878923768e-06,
"loss": 2.1849,
"step": 31700
},
{
"epoch": 3.870967741935484,
"grad_norm": 6.193937301635742,
"learning_rate": 5.9269058295964135e-06,
"loss": 2.1901,
"step": 31800
},
{
"epoch": 3.883140596469872,
"grad_norm": 5.6171650886535645,
"learning_rate": 5.882062780269059e-06,
"loss": 2.1968,
"step": 31900
},
{
"epoch": 3.8953134510042604,
"grad_norm": 7.239607334136963,
"learning_rate": 5.837219730941704e-06,
"loss": 2.1984,
"step": 32000
},
{
"epoch": 3.8953134510042604,
"eval_loss": 2.1437973976135254,
"eval_runtime": 6.9069,
"eval_samples_per_second": 144.782,
"eval_steps_per_second": 36.195,
"step": 32000
},
{
"epoch": 3.907486305538649,
"grad_norm": 6.314813613891602,
"learning_rate": 5.79237668161435e-06,
"loss": 2.1706,
"step": 32100
},
{
"epoch": 3.9196591600730373,
"grad_norm": 5.416664123535156,
"learning_rate": 5.7475336322869956e-06,
"loss": 2.1852,
"step": 32200
},
{
"epoch": 3.9318320146074255,
"grad_norm": 6.1277594566345215,
"learning_rate": 5.702690582959641e-06,
"loss": 2.2202,
"step": 32300
},
{
"epoch": 3.9440048691418137,
"grad_norm": 6.0932440757751465,
"learning_rate": 5.657847533632288e-06,
"loss": 2.1591,
"step": 32400
},
{
"epoch": 3.956177723676202,
"grad_norm": 6.029341697692871,
"learning_rate": 5.613004484304934e-06,
"loss": 2.1805,
"step": 32500
},
{
"epoch": 3.956177723676202,
"eval_loss": 2.138620138168335,
"eval_runtime": 6.9091,
"eval_samples_per_second": 144.737,
"eval_steps_per_second": 36.184,
"step": 32500
},
{
"epoch": 3.9683505782105906,
"grad_norm": 6.374738693237305,
"learning_rate": 5.568161434977579e-06,
"loss": 2.168,
"step": 32600
},
{
"epoch": 3.9805234327449788,
"grad_norm": 6.206404209136963,
"learning_rate": 5.523318385650225e-06,
"loss": 2.2168,
"step": 32700
},
{
"epoch": 3.992696287279367,
"grad_norm": 6.701908588409424,
"learning_rate": 5.478475336322871e-06,
"loss": 2.1796,
"step": 32800
},
{
"epoch": 4.004869141813756,
"grad_norm": 6.456433296203613,
"learning_rate": 5.433632286995516e-06,
"loss": 2.1461,
"step": 32900
},
{
"epoch": 4.017041996348143,
"grad_norm": 6.578303337097168,
"learning_rate": 5.388789237668161e-06,
"loss": 2.1061,
"step": 33000
},
{
"epoch": 4.017041996348143,
"eval_loss": 2.123652458190918,
"eval_runtime": 6.8771,
"eval_samples_per_second": 145.409,
"eval_steps_per_second": 36.352,
"step": 33000
},
{
"epoch": 4.029214850882532,
"grad_norm": 5.736875057220459,
"learning_rate": 5.343946188340807e-06,
"loss": 2.098,
"step": 33100
},
{
"epoch": 4.04138770541692,
"grad_norm": 6.322964191436768,
"learning_rate": 5.299103139013453e-06,
"loss": 2.1334,
"step": 33200
},
{
"epoch": 4.0535605599513085,
"grad_norm": 7.002594470977783,
"learning_rate": 5.2542600896860994e-06,
"loss": 2.1186,
"step": 33300
},
{
"epoch": 4.065733414485697,
"grad_norm": 6.592886924743652,
"learning_rate": 5.209417040358745e-06,
"loss": 2.1218,
"step": 33400
},
{
"epoch": 4.077906269020085,
"grad_norm": 6.610073566436768,
"learning_rate": 5.164573991031391e-06,
"loss": 2.1112,
"step": 33500
},
{
"epoch": 4.077906269020085,
"eval_loss": 2.115506172180176,
"eval_runtime": 6.8933,
"eval_samples_per_second": 145.068,
"eval_steps_per_second": 36.267,
"step": 33500
},
{
"epoch": 4.090079123554474,
"grad_norm": 6.308100700378418,
"learning_rate": 5.119730941704037e-06,
"loss": 2.0984,
"step": 33600
},
{
"epoch": 4.102251978088862,
"grad_norm": 5.7667083740234375,
"learning_rate": 5.074887892376682e-06,
"loss": 2.1189,
"step": 33700
},
{
"epoch": 4.11442483262325,
"grad_norm": 6.554234504699707,
"learning_rate": 5.030044843049327e-06,
"loss": 2.1006,
"step": 33800
},
{
"epoch": 4.126597687157639,
"grad_norm": 6.494872570037842,
"learning_rate": 4.985201793721974e-06,
"loss": 2.1258,
"step": 33900
},
{
"epoch": 4.138770541692026,
"grad_norm": 6.796899318695068,
"learning_rate": 4.940807174887893e-06,
"loss": 2.1335,
"step": 34000
},
{
"epoch": 4.138770541692026,
"eval_loss": 2.1111514568328857,
"eval_runtime": 6.8774,
"eval_samples_per_second": 145.404,
"eval_steps_per_second": 36.351,
"step": 34000
},
{
"epoch": 4.150943396226415,
"grad_norm": 5.832895755767822,
"learning_rate": 4.895964125560538e-06,
"loss": 2.0956,
"step": 34100
},
{
"epoch": 4.163116250760804,
"grad_norm": 5.17689847946167,
"learning_rate": 4.851121076233185e-06,
"loss": 2.1503,
"step": 34200
},
{
"epoch": 4.1752891052951915,
"grad_norm": 6.65399694442749,
"learning_rate": 4.80627802690583e-06,
"loss": 2.1244,
"step": 34300
},
{
"epoch": 4.18746195982958,
"grad_norm": 6.744587421417236,
"learning_rate": 4.761434977578476e-06,
"loss": 2.1237,
"step": 34400
},
{
"epoch": 4.199634814363968,
"grad_norm": 6.663182258605957,
"learning_rate": 4.716591928251121e-06,
"loss": 2.1198,
"step": 34500
},
{
"epoch": 4.199634814363968,
"eval_loss": 2.1056010723114014,
"eval_runtime": 6.9075,
"eval_samples_per_second": 144.771,
"eval_steps_per_second": 36.193,
"step": 34500
},
{
"epoch": 4.211807668898357,
"grad_norm": 6.046566009521484,
"learning_rate": 4.671748878923767e-06,
"loss": 2.0746,
"step": 34600
},
{
"epoch": 4.223980523432745,
"grad_norm": 6.08657169342041,
"learning_rate": 4.626905829596413e-06,
"loss": 2.1154,
"step": 34700
},
{
"epoch": 4.236153377967133,
"grad_norm": 6.235377788543701,
"learning_rate": 4.582062780269059e-06,
"loss": 2.1013,
"step": 34800
},
{
"epoch": 4.248326232501522,
"grad_norm": 5.864556312561035,
"learning_rate": 4.537219730941705e-06,
"loss": 2.1293,
"step": 34900
},
{
"epoch": 4.26049908703591,
"grad_norm": 6.5032124519348145,
"learning_rate": 4.49237668161435e-06,
"loss": 2.0909,
"step": 35000
},
{
"epoch": 4.26049908703591,
"eval_loss": 2.1002509593963623,
"eval_runtime": 6.9552,
"eval_samples_per_second": 143.777,
"eval_steps_per_second": 35.944,
"step": 35000
},
{
"epoch": 4.272671941570298,
"grad_norm": 5.491804599761963,
"learning_rate": 4.447533632286996e-06,
"loss": 2.1075,
"step": 35100
},
{
"epoch": 4.284844796104687,
"grad_norm": 6.165935516357422,
"learning_rate": 4.402690582959642e-06,
"loss": 2.1172,
"step": 35200
},
{
"epoch": 4.2970176506390745,
"grad_norm": 6.2660369873046875,
"learning_rate": 4.357847533632288e-06,
"loss": 2.1234,
"step": 35300
},
{
"epoch": 4.309190505173463,
"grad_norm": 6.266602516174316,
"learning_rate": 4.3130044843049325e-06,
"loss": 2.1023,
"step": 35400
},
{
"epoch": 4.321363359707852,
"grad_norm": 6.377227306365967,
"learning_rate": 4.268161434977579e-06,
"loss": 2.095,
"step": 35500
},
{
"epoch": 4.321363359707852,
"eval_loss": 2.096508264541626,
"eval_runtime": 6.896,
"eval_samples_per_second": 145.011,
"eval_steps_per_second": 36.253,
"step": 35500
},
{
"epoch": 4.33353621424224,
"grad_norm": 5.756918907165527,
"learning_rate": 4.223318385650225e-06,
"loss": 2.1258,
"step": 35600
},
{
"epoch": 4.345709068776628,
"grad_norm": 6.3634934425354,
"learning_rate": 4.1784753363228705e-06,
"loss": 2.1326,
"step": 35700
},
{
"epoch": 4.357881923311016,
"grad_norm": 6.081814765930176,
"learning_rate": 4.133632286995516e-06,
"loss": 2.0923,
"step": 35800
},
{
"epoch": 4.370054777845405,
"grad_norm": 5.829545021057129,
"learning_rate": 4.088789237668161e-06,
"loss": 2.1001,
"step": 35900
},
{
"epoch": 4.382227632379793,
"grad_norm": 7.019509315490723,
"learning_rate": 4.043946188340808e-06,
"loss": 2.0947,
"step": 36000
},
{
"epoch": 4.382227632379793,
"eval_loss": 2.0914690494537354,
"eval_runtime": 6.9162,
"eval_samples_per_second": 144.589,
"eval_steps_per_second": 36.147,
"step": 36000
},
{
"epoch": 4.394400486914181,
"grad_norm": 7.135252952575684,
"learning_rate": 3.9991031390134534e-06,
"loss": 2.111,
"step": 36100
},
{
"epoch": 4.40657334144857,
"grad_norm": 5.3956522941589355,
"learning_rate": 3.954260089686099e-06,
"loss": 2.1072,
"step": 36200
},
{
"epoch": 4.418746195982958,
"grad_norm": 5.853066921234131,
"learning_rate": 3.909417040358744e-06,
"loss": 2.1327,
"step": 36300
},
{
"epoch": 4.430919050517346,
"grad_norm": 6.294539928436279,
"learning_rate": 3.864573991031391e-06,
"loss": 2.0886,
"step": 36400
},
{
"epoch": 4.443091905051735,
"grad_norm": 7.183646202087402,
"learning_rate": 3.819730941704036e-06,
"loss": 2.139,
"step": 36500
},
{
"epoch": 4.443091905051735,
"eval_loss": 2.0876991748809814,
"eval_runtime": 6.8527,
"eval_samples_per_second": 145.928,
"eval_steps_per_second": 36.482,
"step": 36500
},
{
"epoch": 4.455264759586123,
"grad_norm": 6.069007396697998,
"learning_rate": 3.7748878923766817e-06,
"loss": 2.1076,
"step": 36600
},
{
"epoch": 4.467437614120511,
"grad_norm": 6.092281341552734,
"learning_rate": 3.7300448430493274e-06,
"loss": 2.1274,
"step": 36700
},
{
"epoch": 4.4796104686549,
"grad_norm": 6.095892429351807,
"learning_rate": 3.6852017937219735e-06,
"loss": 2.1202,
"step": 36800
},
{
"epoch": 4.491783323189288,
"grad_norm": 6.349238395690918,
"learning_rate": 3.6403587443946193e-06,
"loss": 2.1192,
"step": 36900
},
{
"epoch": 4.503956177723676,
"grad_norm": 6.508525848388672,
"learning_rate": 3.595515695067265e-06,
"loss": 2.106,
"step": 37000
},
{
"epoch": 4.503956177723676,
"eval_loss": 2.0852510929107666,
"eval_runtime": 6.9159,
"eval_samples_per_second": 144.595,
"eval_steps_per_second": 36.149,
"step": 37000
},
{
"epoch": 4.516129032258064,
"grad_norm": 6.2998046875,
"learning_rate": 3.5506726457399103e-06,
"loss": 2.1264,
"step": 37100
},
{
"epoch": 4.528301886792453,
"grad_norm": 6.988924503326416,
"learning_rate": 3.5058295964125565e-06,
"loss": 2.0855,
"step": 37200
},
{
"epoch": 4.540474741326841,
"grad_norm": 5.999715328216553,
"learning_rate": 3.460986547085202e-06,
"loss": 2.1288,
"step": 37300
},
{
"epoch": 4.552647595861229,
"grad_norm": 5.390603542327881,
"learning_rate": 3.416143497757848e-06,
"loss": 2.1119,
"step": 37400
},
{
"epoch": 4.564820450395618,
"grad_norm": 5.443009853363037,
"learning_rate": 3.3713004484304932e-06,
"loss": 2.1137,
"step": 37500
},
{
"epoch": 4.564820450395618,
"eval_loss": 2.0807323455810547,
"eval_runtime": 6.868,
"eval_samples_per_second": 145.603,
"eval_steps_per_second": 36.401,
"step": 37500
},
{
"epoch": 4.5769933049300064,
"grad_norm": 6.028597831726074,
"learning_rate": 3.326457399103139e-06,
"loss": 2.1233,
"step": 37600
},
{
"epoch": 4.589166159464394,
"grad_norm": 6.3508992195129395,
"learning_rate": 3.281614349775785e-06,
"loss": 2.0785,
"step": 37700
},
{
"epoch": 4.601339013998783,
"grad_norm": 6.304683685302734,
"learning_rate": 3.237219730941704e-06,
"loss": 2.1058,
"step": 37800
},
{
"epoch": 4.6135118685331715,
"grad_norm": 5.774105548858643,
"learning_rate": 3.1923766816143497e-06,
"loss": 2.1363,
"step": 37900
},
{
"epoch": 4.625684723067559,
"grad_norm": 6.000542163848877,
"learning_rate": 3.147533632286996e-06,
"loss": 2.1097,
"step": 38000
},
{
"epoch": 4.625684723067559,
"eval_loss": 2.0734775066375732,
"eval_runtime": 6.9601,
"eval_samples_per_second": 143.677,
"eval_steps_per_second": 35.919,
"step": 38000
},
{
"epoch": 4.637857577601948,
"grad_norm": 6.037074565887451,
"learning_rate": 3.1026905829596416e-06,
"loss": 2.0839,
"step": 38100
},
{
"epoch": 4.650030432136336,
"grad_norm": 6.941400051116943,
"learning_rate": 3.0578475336322874e-06,
"loss": 2.0961,
"step": 38200
},
{
"epoch": 4.662203286670724,
"grad_norm": 6.625183582305908,
"learning_rate": 3.0130044843049327e-06,
"loss": 2.1532,
"step": 38300
},
{
"epoch": 4.674376141205112,
"grad_norm": 5.852269649505615,
"learning_rate": 2.968161434977579e-06,
"loss": 2.1085,
"step": 38400
},
{
"epoch": 4.686548995739501,
"grad_norm": 5.130061626434326,
"learning_rate": 2.9233183856502245e-06,
"loss": 2.1028,
"step": 38500
},
{
"epoch": 4.686548995739501,
"eval_loss": 2.070453643798828,
"eval_runtime": 6.9577,
"eval_samples_per_second": 143.725,
"eval_steps_per_second": 35.931,
"step": 38500
},
{
"epoch": 4.6987218502738894,
"grad_norm": 6.478227615356445,
"learning_rate": 2.8784753363228703e-06,
"loss": 2.0895,
"step": 38600
},
{
"epoch": 4.710894704808277,
"grad_norm": 6.043088436126709,
"learning_rate": 2.8336322869955156e-06,
"loss": 2.1032,
"step": 38700
},
{
"epoch": 4.723067559342666,
"grad_norm": 6.732186317443848,
"learning_rate": 2.7887892376681617e-06,
"loss": 2.0838,
"step": 38800
},
{
"epoch": 4.7352404138770545,
"grad_norm": 6.393290996551514,
"learning_rate": 2.7439461883408075e-06,
"loss": 2.1058,
"step": 38900
},
{
"epoch": 4.747413268411442,
"grad_norm": 6.3943705558776855,
"learning_rate": 2.699103139013453e-06,
"loss": 2.1142,
"step": 39000
},
{
"epoch": 4.747413268411442,
"eval_loss": 2.0703060626983643,
"eval_runtime": 7.0835,
"eval_samples_per_second": 141.172,
"eval_steps_per_second": 35.293,
"step": 39000
},
{
"epoch": 4.759586122945831,
"grad_norm": 5.651825428009033,
"learning_rate": 2.654260089686099e-06,
"loss": 2.1099,
"step": 39100
},
{
"epoch": 4.77175897748022,
"grad_norm": 5.763203144073486,
"learning_rate": 2.609417040358745e-06,
"loss": 2.1199,
"step": 39200
},
{
"epoch": 4.783931832014607,
"grad_norm": 6.404742240905762,
"learning_rate": 2.5645739910313904e-06,
"loss": 2.1065,
"step": 39300
},
{
"epoch": 4.796104686548996,
"grad_norm": 6.63946533203125,
"learning_rate": 2.519730941704036e-06,
"loss": 2.0982,
"step": 39400
},
{
"epoch": 4.808277541083384,
"grad_norm": 6.3910675048828125,
"learning_rate": 2.474887892376682e-06,
"loss": 2.0945,
"step": 39500
},
{
"epoch": 4.808277541083384,
"eval_loss": 2.066244602203369,
"eval_runtime": 7.0266,
"eval_samples_per_second": 142.317,
"eval_steps_per_second": 35.579,
"step": 39500
},
{
"epoch": 4.820450395617772,
"grad_norm": 6.50945520401001,
"learning_rate": 2.4300448430493276e-06,
"loss": 2.0619,
"step": 39600
},
{
"epoch": 4.83262325015216,
"grad_norm": 6.681357383728027,
"learning_rate": 2.3852017937219733e-06,
"loss": 2.097,
"step": 39700
},
{
"epoch": 4.844796104686549,
"grad_norm": 5.813176155090332,
"learning_rate": 2.340358744394619e-06,
"loss": 2.1173,
"step": 39800
},
{
"epoch": 4.8569689592209375,
"grad_norm": 6.834031581878662,
"learning_rate": 2.2955156950672647e-06,
"loss": 2.0721,
"step": 39900
},
{
"epoch": 4.869141813755325,
"grad_norm": 5.929574966430664,
"learning_rate": 2.2506726457399105e-06,
"loss": 2.1325,
"step": 40000
},
{
"epoch": 4.869141813755325,
"eval_loss": 2.064162254333496,
"eval_runtime": 7.021,
"eval_samples_per_second": 142.43,
"eval_steps_per_second": 35.608,
"step": 40000
},
{
"epoch": 4.881314668289714,
"grad_norm": 6.4569830894470215,
"learning_rate": 2.205829596412556e-06,
"loss": 2.1224,
"step": 40100
},
{
"epoch": 4.893487522824103,
"grad_norm": 6.773449897766113,
"learning_rate": 2.160986547085202e-06,
"loss": 2.1037,
"step": 40200
},
{
"epoch": 4.90566037735849,
"grad_norm": 6.341082572937012,
"learning_rate": 2.1165919282511213e-06,
"loss": 2.0936,
"step": 40300
},
{
"epoch": 4.917833231892879,
"grad_norm": 6.29095983505249,
"learning_rate": 2.071748878923767e-06,
"loss": 2.141,
"step": 40400
},
{
"epoch": 4.930006086427268,
"grad_norm": 7.924270153045654,
"learning_rate": 2.0269058295964127e-06,
"loss": 2.0937,
"step": 40500
},
{
"epoch": 4.930006086427268,
"eval_loss": 2.0617458820343018,
"eval_runtime": 6.9841,
"eval_samples_per_second": 143.183,
"eval_steps_per_second": 35.796,
"step": 40500
},
{
"epoch": 4.942178940961655,
"grad_norm": 7.386099338531494,
"learning_rate": 1.9820627802690585e-06,
"loss": 2.1187,
"step": 40600
},
{
"epoch": 4.954351795496044,
"grad_norm": 6.6330413818359375,
"learning_rate": 1.937219730941704e-06,
"loss": 2.0891,
"step": 40700
},
{
"epoch": 4.966524650030432,
"grad_norm": 5.590965747833252,
"learning_rate": 1.89237668161435e-06,
"loss": 2.0809,
"step": 40800
},
{
"epoch": 4.9786975045648205,
"grad_norm": 6.468350410461426,
"learning_rate": 1.8475336322869959e-06,
"loss": 2.0654,
"step": 40900
},
{
"epoch": 4.990870359099208,
"grad_norm": 6.724806308746338,
"learning_rate": 1.8026905829596414e-06,
"loss": 2.0938,
"step": 41000
},
{
"epoch": 4.990870359099208,
"eval_loss": 2.057617664337158,
"eval_runtime": 7.1367,
"eval_samples_per_second": 140.121,
"eval_steps_per_second": 35.03,
"step": 41000
},
{
"epoch": 5.003043213633597,
"grad_norm": 5.549363136291504,
"learning_rate": 1.7578475336322873e-06,
"loss": 2.0672,
"step": 41100
},
{
"epoch": 5.015216068167986,
"grad_norm": 6.4161152839660645,
"learning_rate": 1.7130044843049328e-06,
"loss": 2.0589,
"step": 41200
},
{
"epoch": 5.027388922702373,
"grad_norm": 6.318953514099121,
"learning_rate": 1.6681614349775786e-06,
"loss": 2.0643,
"step": 41300
},
{
"epoch": 5.039561777236762,
"grad_norm": 7.292160987854004,
"learning_rate": 1.6233183856502243e-06,
"loss": 2.0718,
"step": 41400
},
{
"epoch": 5.051734631771151,
"grad_norm": 6.140988349914551,
"learning_rate": 1.57847533632287e-06,
"loss": 2.0437,
"step": 41500
},
{
"epoch": 5.051734631771151,
"eval_loss": 2.051799774169922,
"eval_runtime": 7.0596,
"eval_samples_per_second": 141.651,
"eval_steps_per_second": 35.413,
"step": 41500
},
{
"epoch": 5.063907486305538,
"grad_norm": 6.15008020401001,
"learning_rate": 1.533632286995516e-06,
"loss": 2.0561,
"step": 41600
},
{
"epoch": 5.076080340839927,
"grad_norm": 6.889511585235596,
"learning_rate": 1.4887892376681615e-06,
"loss": 2.0729,
"step": 41700
},
{
"epoch": 5.088253195374315,
"grad_norm": 5.815738201141357,
"learning_rate": 1.4439461883408074e-06,
"loss": 2.0413,
"step": 41800
},
{
"epoch": 5.1004260499087035,
"grad_norm": 5.965245723724365,
"learning_rate": 1.399103139013453e-06,
"loss": 2.0407,
"step": 41900
},
{
"epoch": 5.112598904443092,
"grad_norm": 7.188913345336914,
"learning_rate": 1.3542600896860989e-06,
"loss": 2.0781,
"step": 42000
},
{
"epoch": 5.112598904443092,
"eval_loss": 2.0501816272735596,
"eval_runtime": 6.9017,
"eval_samples_per_second": 144.892,
"eval_steps_per_second": 36.223,
"step": 42000
},
{
"epoch": 5.12477175897748,
"grad_norm": 7.101166725158691,
"learning_rate": 1.3094170403587444e-06,
"loss": 2.0348,
"step": 42100
},
{
"epoch": 5.136944613511869,
"grad_norm": 5.820453643798828,
"learning_rate": 1.2645739910313903e-06,
"loss": 2.0497,
"step": 42200
},
{
"epoch": 5.149117468046257,
"grad_norm": 5.811570167541504,
"learning_rate": 1.2197309417040358e-06,
"loss": 2.058,
"step": 42300
},
{
"epoch": 5.161290322580645,
"grad_norm": 6.54494047164917,
"learning_rate": 1.1748878923766818e-06,
"loss": 2.0833,
"step": 42400
},
{
"epoch": 5.173463177115034,
"grad_norm": 6.547015190124512,
"learning_rate": 1.1300448430493275e-06,
"loss": 2.0671,
"step": 42500
},
{
"epoch": 5.173463177115034,
"eval_loss": 2.049518346786499,
"eval_runtime": 7.0155,
"eval_samples_per_second": 142.542,
"eval_steps_per_second": 35.636,
"step": 42500
},
{
"epoch": 5.185636031649421,
"grad_norm": 6.079003810882568,
"learning_rate": 1.0852017937219732e-06,
"loss": 2.0732,
"step": 42600
},
{
"epoch": 5.19780888618381,
"grad_norm": 6.835382461547852,
"learning_rate": 1.040358744394619e-06,
"loss": 2.0516,
"step": 42700
},
{
"epoch": 5.209981740718199,
"grad_norm": 6.055761814117432,
"learning_rate": 9.955156950672647e-07,
"loss": 2.0639,
"step": 42800
},
{
"epoch": 5.2221545952525865,
"grad_norm": 6.516651630401611,
"learning_rate": 9.511210762331839e-07,
"loss": 2.0597,
"step": 42900
},
{
"epoch": 5.234327449786975,
"grad_norm": 5.874512195587158,
"learning_rate": 9.062780269058297e-07,
"loss": 2.073,
"step": 43000
},
{
"epoch": 5.234327449786975,
"eval_loss": 2.0482187271118164,
"eval_runtime": 6.8896,
"eval_samples_per_second": 145.145,
"eval_steps_per_second": 36.286,
"step": 43000
},
{
"epoch": 5.246500304321363,
"grad_norm": 6.2515459060668945,
"learning_rate": 8.614349775784754e-07,
"loss": 2.0594,
"step": 43100
},
{
"epoch": 5.258673158855752,
"grad_norm": 6.7219438552856445,
"learning_rate": 8.165919282511211e-07,
"loss": 2.0138,
"step": 43200
},
{
"epoch": 5.27084601339014,
"grad_norm": 6.588565349578857,
"learning_rate": 7.71748878923767e-07,
"loss": 2.089,
"step": 43300
},
{
"epoch": 5.283018867924528,
"grad_norm": 6.52641487121582,
"learning_rate": 7.269058295964127e-07,
"loss": 2.0274,
"step": 43400
},
{
"epoch": 5.295191722458917,
"grad_norm": 7.77009391784668,
"learning_rate": 6.820627802690584e-07,
"loss": 2.0412,
"step": 43500
},
{
"epoch": 5.295191722458917,
"eval_loss": 2.0471861362457275,
"eval_runtime": 6.8473,
"eval_samples_per_second": 146.043,
"eval_steps_per_second": 36.511,
"step": 43500
},
{
"epoch": 5.307364576993305,
"grad_norm": 6.563704490661621,
"learning_rate": 6.372197309417041e-07,
"loss": 2.0538,
"step": 43600
},
{
"epoch": 5.319537431527693,
"grad_norm": 5.842877388000488,
"learning_rate": 5.923766816143499e-07,
"loss": 2.0378,
"step": 43700
},
{
"epoch": 5.331710286062082,
"grad_norm": 5.96117639541626,
"learning_rate": 5.475336322869956e-07,
"loss": 2.0702,
"step": 43800
},
{
"epoch": 5.3438831405964695,
"grad_norm": 6.195252895355225,
"learning_rate": 5.026905829596413e-07,
"loss": 2.0519,
"step": 43900
},
{
"epoch": 5.356055995130858,
"grad_norm": 6.967134475708008,
"learning_rate": 4.5784753363228705e-07,
"loss": 2.0747,
"step": 44000
},
{
"epoch": 5.356055995130858,
"eval_loss": 2.046496629714966,
"eval_runtime": 6.9289,
"eval_samples_per_second": 144.322,
"eval_steps_per_second": 36.081,
"step": 44000
},
{
"epoch": 5.368228849665247,
"grad_norm": 5.7902984619140625,
"learning_rate": 4.130044843049328e-07,
"loss": 2.0551,
"step": 44100
},
{
"epoch": 5.380401704199635,
"grad_norm": 6.01054048538208,
"learning_rate": 3.6816143497757846e-07,
"loss": 2.0569,
"step": 44200
},
{
"epoch": 5.392574558734023,
"grad_norm": 6.690357685089111,
"learning_rate": 3.2331838565022424e-07,
"loss": 2.08,
"step": 44300
},
{
"epoch": 5.404747413268412,
"grad_norm": 5.836359024047852,
"learning_rate": 2.7847533632286997e-07,
"loss": 2.0405,
"step": 44400
},
{
"epoch": 5.4169202678028,
"grad_norm": 6.3250298500061035,
"learning_rate": 2.3363228699551572e-07,
"loss": 2.0717,
"step": 44500
},
{
"epoch": 5.4169202678028,
"eval_loss": 2.04555606842041,
"eval_runtime": 6.9014,
"eval_samples_per_second": 144.899,
"eval_steps_per_second": 36.225,
"step": 44500
},
{
"epoch": 5.429093122337188,
"grad_norm": 6.5666890144348145,
"learning_rate": 1.8878923766816145e-07,
"loss": 2.06,
"step": 44600
},
{
"epoch": 5.441265976871576,
"grad_norm": 7.2658843994140625,
"learning_rate": 1.4394618834080718e-07,
"loss": 2.0556,
"step": 44700
},
{
"epoch": 5.453438831405965,
"grad_norm": 6.671789646148682,
"learning_rate": 9.910313901345293e-08,
"loss": 2.0642,
"step": 44800
},
{
"epoch": 5.465611685940353,
"grad_norm": 5.944987773895264,
"learning_rate": 5.426008968609866e-08,
"loss": 2.0414,
"step": 44900
},
{
"epoch": 5.477784540474741,
"grad_norm": 6.427646636962891,
"learning_rate": 9.417040358744396e-09,
"loss": 2.0667,
"step": 45000
},
{
"epoch": 5.477784540474741,
"eval_loss": 2.0452468395233154,
"eval_runtime": 7.0229,
"eval_samples_per_second": 142.391,
"eval_steps_per_second": 35.598,
"step": 45000
},
{
"epoch": 5.48995739500913,
"grad_norm": 6.559889793395996,
"learning_rate": 5.007718120805369e-06,
"loss": 2.0447,
"step": 45100
},
{
"epoch": 5.502130249543518,
"grad_norm": 6.235354423522949,
"learning_rate": 4.974161073825503e-06,
"loss": 2.0806,
"step": 45200
},
{
"epoch": 5.514303104077906,
"grad_norm": 7.230030536651611,
"learning_rate": 4.940604026845638e-06,
"loss": 2.0696,
"step": 45300
},
{
"epoch": 5.526475958612295,
"grad_norm": 5.613503456115723,
"learning_rate": 4.907046979865772e-06,
"loss": 2.0662,
"step": 45400
},
{
"epoch": 5.538648813146683,
"grad_norm": 5.988820552825928,
"learning_rate": 4.873489932885906e-06,
"loss": 2.0551,
"step": 45500
},
{
"epoch": 5.538648813146683,
"eval_loss": 2.0472412109375,
"eval_runtime": 6.7805,
"eval_samples_per_second": 147.481,
"eval_steps_per_second": 36.87,
"step": 45500
},
{
"epoch": 5.550821667681071,
"grad_norm": 6.566047191619873,
"learning_rate": 4.8399328859060404e-06,
"loss": 2.0652,
"step": 45600
},
{
"epoch": 5.56299452221546,
"grad_norm": 6.979294300079346,
"learning_rate": 4.806375838926175e-06,
"loss": 2.0409,
"step": 45700
},
{
"epoch": 5.575167376749848,
"grad_norm": 6.474365234375,
"learning_rate": 4.772818791946309e-06,
"loss": 2.0562,
"step": 45800
},
{
"epoch": 5.587340231284236,
"grad_norm": 6.100124835968018,
"learning_rate": 4.739261744966443e-06,
"loss": 2.0448,
"step": 45900
},
{
"epoch": 5.599513085818624,
"grad_norm": 6.383643627166748,
"learning_rate": 4.706040268456376e-06,
"loss": 2.0798,
"step": 46000
},
{
"epoch": 5.599513085818624,
"eval_loss": 2.042715311050415,
"eval_runtime": 6.7981,
"eval_samples_per_second": 147.1,
"eval_steps_per_second": 36.775,
"step": 46000
},
{
"epoch": 5.611685940353013,
"grad_norm": 6.848605632781982,
"learning_rate": 4.67248322147651e-06,
"loss": 2.0615,
"step": 46100
},
{
"epoch": 5.6238587948874015,
"grad_norm": 6.921677589416504,
"learning_rate": 4.638926174496644e-06,
"loss": 2.0888,
"step": 46200
},
{
"epoch": 5.636031649421789,
"grad_norm": 6.901805400848389,
"learning_rate": 4.6053691275167785e-06,
"loss": 2.0552,
"step": 46300
},
{
"epoch": 5.648204503956178,
"grad_norm": 6.497274398803711,
"learning_rate": 4.571812080536913e-06,
"loss": 2.0596,
"step": 46400
},
{
"epoch": 5.660377358490566,
"grad_norm": 6.1705803871154785,
"learning_rate": 4.538255033557047e-06,
"loss": 2.0352,
"step": 46500
},
{
"epoch": 5.660377358490566,
"eval_loss": 2.0392038822174072,
"eval_runtime": 6.8175,
"eval_samples_per_second": 146.681,
"eval_steps_per_second": 36.67,
"step": 46500
},
{
"epoch": 5.672550213024954,
"grad_norm": 6.3149847984313965,
"learning_rate": 4.504697986577181e-06,
"loss": 2.0782,
"step": 46600
},
{
"epoch": 5.684723067559343,
"grad_norm": 5.7811760902404785,
"learning_rate": 4.471140939597316e-06,
"loss": 2.0745,
"step": 46700
},
{
"epoch": 5.696895922093731,
"grad_norm": 6.381850719451904,
"learning_rate": 4.43758389261745e-06,
"loss": 2.0967,
"step": 46800
},
{
"epoch": 5.709068776628119,
"grad_norm": 6.7904534339904785,
"learning_rate": 4.404026845637584e-06,
"loss": 2.048,
"step": 46900
},
{
"epoch": 5.721241631162508,
"grad_norm": 6.390072822570801,
"learning_rate": 4.370469798657718e-06,
"loss": 2.0458,
"step": 47000
},
{
"epoch": 5.721241631162508,
"eval_loss": 2.0331013202667236,
"eval_runtime": 6.7556,
"eval_samples_per_second": 148.026,
"eval_steps_per_second": 37.006,
"step": 47000
},
{
"epoch": 5.733414485696896,
"grad_norm": 6.4294514656066895,
"learning_rate": 4.336912751677853e-06,
"loss": 2.0555,
"step": 47100
},
{
"epoch": 5.7455873402312845,
"grad_norm": 7.039945602416992,
"learning_rate": 4.303355704697987e-06,
"loss": 2.0973,
"step": 47200
},
{
"epoch": 5.757760194765672,
"grad_norm": 6.919515132904053,
"learning_rate": 4.269798657718121e-06,
"loss": 2.0572,
"step": 47300
},
{
"epoch": 5.769933049300061,
"grad_norm": 6.846578598022461,
"learning_rate": 4.2362416107382554e-06,
"loss": 2.0703,
"step": 47400
},
{
"epoch": 5.78210590383445,
"grad_norm": 6.899037837982178,
"learning_rate": 4.20268456375839e-06,
"loss": 2.0382,
"step": 47500
},
{
"epoch": 5.78210590383445,
"eval_loss": 2.0307412147521973,
"eval_runtime": 6.8182,
"eval_samples_per_second": 146.666,
"eval_steps_per_second": 36.667,
"step": 47500
},
{
"epoch": 5.794278758368837,
"grad_norm": 5.726818084716797,
"learning_rate": 4.169127516778524e-06,
"loss": 2.0595,
"step": 47600
},
{
"epoch": 5.806451612903226,
"grad_norm": 7.426904201507568,
"learning_rate": 4.135570469798658e-06,
"loss": 2.0605,
"step": 47700
},
{
"epoch": 5.818624467437614,
"grad_norm": 6.416141986846924,
"learning_rate": 4.1020134228187925e-06,
"loss": 2.071,
"step": 47800
},
{
"epoch": 5.830797321972002,
"grad_norm": 6.170881748199463,
"learning_rate": 4.068456375838927e-06,
"loss": 2.0601,
"step": 47900
},
{
"epoch": 5.842970176506391,
"grad_norm": 5.913904666900635,
"learning_rate": 4.034899328859061e-06,
"loss": 2.0663,
"step": 48000
},
{
"epoch": 5.842970176506391,
"eval_loss": 2.0260586738586426,
"eval_runtime": 6.7968,
"eval_samples_per_second": 147.127,
"eval_steps_per_second": 36.782,
"step": 48000
},
{
"epoch": 5.855143031040779,
"grad_norm": 6.9575090408325195,
"learning_rate": 4.0013422818791944e-06,
"loss": 2.0487,
"step": 48100
},
{
"epoch": 5.8673158855751675,
"grad_norm": 7.018653392791748,
"learning_rate": 3.967785234899329e-06,
"loss": 2.0836,
"step": 48200
},
{
"epoch": 5.879488740109556,
"grad_norm": 6.9810285568237305,
"learning_rate": 3.934228187919463e-06,
"loss": 2.0645,
"step": 48300
},
{
"epoch": 5.891661594643944,
"grad_norm": 5.732436656951904,
"learning_rate": 3.900671140939597e-06,
"loss": 2.0682,
"step": 48400
},
{
"epoch": 5.9038344491783326,
"grad_norm": 6.543402671813965,
"learning_rate": 3.8671140939597315e-06,
"loss": 2.0797,
"step": 48500
},
{
"epoch": 5.9038344491783326,
"eval_loss": 2.0245697498321533,
"eval_runtime": 6.7746,
"eval_samples_per_second": 147.609,
"eval_steps_per_second": 36.902,
"step": 48500
},
{
"epoch": 5.91600730371272,
"grad_norm": 6.355215072631836,
"learning_rate": 3.833557046979866e-06,
"loss": 2.0386,
"step": 48600
},
{
"epoch": 5.928180158247109,
"grad_norm": 5.7379889488220215,
"learning_rate": 3.8000000000000005e-06,
"loss": 2.0498,
"step": 48700
},
{
"epoch": 5.940353012781498,
"grad_norm": 5.857077121734619,
"learning_rate": 3.7664429530201347e-06,
"loss": 2.061,
"step": 48800
},
{
"epoch": 5.952525867315885,
"grad_norm": 7.078189373016357,
"learning_rate": 3.732885906040269e-06,
"loss": 2.0569,
"step": 48900
},
{
"epoch": 5.964698721850274,
"grad_norm": 6.31903600692749,
"learning_rate": 3.6993288590604033e-06,
"loss": 2.0755,
"step": 49000
},
{
"epoch": 5.964698721850274,
"eval_loss": 2.0195415019989014,
"eval_runtime": 6.7847,
"eval_samples_per_second": 147.392,
"eval_steps_per_second": 36.848,
"step": 49000
},
{
"epoch": 5.976871576384662,
"grad_norm": 6.295201778411865,
"learning_rate": 3.6657718120805375e-06,
"loss": 2.0393,
"step": 49100
},
{
"epoch": 5.9890444309190505,
"grad_norm": 5.829520225524902,
"learning_rate": 3.6322147651006714e-06,
"loss": 2.0839,
"step": 49200
},
{
"epoch": 6.001217285453439,
"grad_norm": 6.653756141662598,
"learning_rate": 3.5986577181208056e-06,
"loss": 2.0581,
"step": 49300
},
{
"epoch": 6.013390139987827,
"grad_norm": 6.303423881530762,
"learning_rate": 3.56510067114094e-06,
"loss": 2.0524,
"step": 49400
},
{
"epoch": 6.0255629945222156,
"grad_norm": 6.783233642578125,
"learning_rate": 3.531543624161074e-06,
"loss": 2.0284,
"step": 49500
},
{
"epoch": 6.0255629945222156,
"eval_loss": 2.013944387435913,
"eval_runtime": 6.7878,
"eval_samples_per_second": 147.323,
"eval_steps_per_second": 36.831,
"step": 49500
},
{
"epoch": 6.037735849056604,
"grad_norm": 5.857462406158447,
"learning_rate": 3.4979865771812084e-06,
"loss": 2.0229,
"step": 49600
},
{
"epoch": 6.049908703590992,
"grad_norm": 6.777635097503662,
"learning_rate": 3.4644295302013427e-06,
"loss": 1.9982,
"step": 49700
},
{
"epoch": 6.062081558125381,
"grad_norm": 7.1341328620910645,
"learning_rate": 3.430872483221477e-06,
"loss": 2.0211,
"step": 49800
},
{
"epoch": 6.074254412659768,
"grad_norm": 6.320338249206543,
"learning_rate": 3.3973154362416112e-06,
"loss": 2.0137,
"step": 49900
},
{
"epoch": 6.086427267194157,
"grad_norm": 6.523722171783447,
"learning_rate": 3.3640939597315437e-06,
"loss": 2.0073,
"step": 50000
},
{
"epoch": 6.086427267194157,
"eval_loss": 2.0087661743164062,
"eval_runtime": 6.731,
"eval_samples_per_second": 148.565,
"eval_steps_per_second": 37.141,
"step": 50000
},
{
"epoch": 6.098600121728546,
"grad_norm": 6.2543559074401855,
"learning_rate": 3.330536912751678e-06,
"loss": 2.05,
"step": 50100
},
{
"epoch": 6.1107729762629335,
"grad_norm": 6.838403701782227,
"learning_rate": 3.2969798657718123e-06,
"loss": 2.0041,
"step": 50200
},
{
"epoch": 6.122945830797322,
"grad_norm": 6.734765529632568,
"learning_rate": 3.2634228187919465e-06,
"loss": 2.0144,
"step": 50300
},
{
"epoch": 6.13511868533171,
"grad_norm": 7.506516933441162,
"learning_rate": 3.2298657718120808e-06,
"loss": 2.0238,
"step": 50400
},
{
"epoch": 6.1472915398660986,
"grad_norm": 7.153513431549072,
"learning_rate": 3.196308724832215e-06,
"loss": 2.0032,
"step": 50500
},
{
"epoch": 6.1472915398660986,
"eval_loss": 2.0054242610931396,
"eval_runtime": 6.8452,
"eval_samples_per_second": 146.089,
"eval_steps_per_second": 36.522,
"step": 50500
},
{
"epoch": 6.159464394400487,
"grad_norm": 5.951141834259033,
"learning_rate": 3.1627516778523493e-06,
"loss": 2.0768,
"step": 50600
},
{
"epoch": 6.171637248934875,
"grad_norm": 6.877615928649902,
"learning_rate": 3.1291946308724836e-06,
"loss": 2.0123,
"step": 50700
},
{
"epoch": 6.183810103469264,
"grad_norm": 6.209372520446777,
"learning_rate": 3.095637583892618e-06,
"loss": 2.0153,
"step": 50800
},
{
"epoch": 6.195982958003652,
"grad_norm": 6.799842834472656,
"learning_rate": 3.062080536912752e-06,
"loss": 1.9955,
"step": 50900
},
{
"epoch": 6.20815581253804,
"grad_norm": 6.479254722595215,
"learning_rate": 3.0285234899328864e-06,
"loss": 2.0315,
"step": 51000
},
{
"epoch": 6.20815581253804,
"eval_loss": 2.0038652420043945,
"eval_runtime": 6.8012,
"eval_samples_per_second": 147.033,
"eval_steps_per_second": 36.758,
"step": 51000
},
{
"epoch": 6.220328667072429,
"grad_norm": 6.269389629364014,
"learning_rate": 2.9949664429530206e-06,
"loss": 1.9839,
"step": 51100
},
{
"epoch": 6.2325015216068165,
"grad_norm": 7.240963935852051,
"learning_rate": 2.9614093959731545e-06,
"loss": 2.0155,
"step": 51200
},
{
"epoch": 6.244674376141205,
"grad_norm": 5.774966716766357,
"learning_rate": 2.9278523489932887e-06,
"loss": 2.0198,
"step": 51300
},
{
"epoch": 6.256847230675594,
"grad_norm": 6.272314071655273,
"learning_rate": 2.894295302013423e-06,
"loss": 2.0554,
"step": 51400
},
{
"epoch": 6.2690200852099816,
"grad_norm": 9.089746475219727,
"learning_rate": 2.8607382550335573e-06,
"loss": 2.0408,
"step": 51500
},
{
"epoch": 6.2690200852099816,
"eval_loss": 2.000591278076172,
"eval_runtime": 6.7821,
"eval_samples_per_second": 147.447,
"eval_steps_per_second": 36.862,
"step": 51500
},
{
"epoch": 6.28119293974437,
"grad_norm": 6.007697105407715,
"learning_rate": 2.8271812080536915e-06,
"loss": 2.0251,
"step": 51600
},
{
"epoch": 6.293365794278758,
"grad_norm": 7.7493791580200195,
"learning_rate": 2.793624161073826e-06,
"loss": 2.0447,
"step": 51700
},
{
"epoch": 6.305538648813147,
"grad_norm": 7.068716526031494,
"learning_rate": 2.76006711409396e-06,
"loss": 2.0377,
"step": 51800
},
{
"epoch": 6.317711503347535,
"grad_norm": 6.732091903686523,
"learning_rate": 2.7265100671140943e-06,
"loss": 2.0131,
"step": 51900
},
{
"epoch": 6.329884357881923,
"grad_norm": 6.7231125831604,
"learning_rate": 2.693288590604027e-06,
"loss": 2.0385,
"step": 52000
},
{
"epoch": 6.329884357881923,
"eval_loss": 1.9973669052124023,
"eval_runtime": 6.7984,
"eval_samples_per_second": 147.093,
"eval_steps_per_second": 36.773,
"step": 52000
},
{
"epoch": 6.342057212416312,
"grad_norm": 6.017531394958496,
"learning_rate": 2.659731543624161e-06,
"loss": 2.0407,
"step": 52100
},
{
"epoch": 6.3542300669507,
"grad_norm": 5.93875789642334,
"learning_rate": 2.6261744966442954e-06,
"loss": 2.0368,
"step": 52200
},
{
"epoch": 6.366402921485088,
"grad_norm": 6.382920265197754,
"learning_rate": 2.5926174496644296e-06,
"loss": 2.036,
"step": 52300
},
{
"epoch": 6.378575776019477,
"grad_norm": 6.723759651184082,
"learning_rate": 2.559060402684564e-06,
"loss": 1.9914,
"step": 52400
},
{
"epoch": 6.3907486305538646,
"grad_norm": 8.295475959777832,
"learning_rate": 2.525503355704698e-06,
"loss": 2.0401,
"step": 52500
},
{
"epoch": 6.3907486305538646,
"eval_loss": 1.9946650266647339,
"eval_runtime": 6.8495,
"eval_samples_per_second": 145.995,
"eval_steps_per_second": 36.499,
"step": 52500
},
{
"epoch": 6.402921485088253,
"grad_norm": 6.045047283172607,
"learning_rate": 2.4919463087248324e-06,
"loss": 2.0287,
"step": 52600
},
{
"epoch": 6.415094339622642,
"grad_norm": 7.3694000244140625,
"learning_rate": 2.4583892617449667e-06,
"loss": 2.0318,
"step": 52700
},
{
"epoch": 6.42726719415703,
"grad_norm": 6.970037460327148,
"learning_rate": 2.424832214765101e-06,
"loss": 2.0352,
"step": 52800
},
{
"epoch": 6.439440048691418,
"grad_norm": 7.87092399597168,
"learning_rate": 2.391275167785235e-06,
"loss": 2.0522,
"step": 52900
},
{
"epoch": 6.451612903225806,
"grad_norm": 6.341009616851807,
"learning_rate": 2.357718120805369e-06,
"loss": 2.0717,
"step": 53000
},
{
"epoch": 6.451612903225806,
"eval_loss": 1.9915155172348022,
"eval_runtime": 6.8145,
"eval_samples_per_second": 146.746,
"eval_steps_per_second": 36.686,
"step": 53000
},
{
"epoch": 6.463785757760195,
"grad_norm": 7.210479736328125,
"learning_rate": 2.3241610738255038e-06,
"loss": 2.0154,
"step": 53100
},
{
"epoch": 6.475958612294583,
"grad_norm": 8.30247688293457,
"learning_rate": 2.290604026845638e-06,
"loss": 2.0242,
"step": 53200
},
{
"epoch": 6.488131466828971,
"grad_norm": 5.9992570877075195,
"learning_rate": 2.2573825503355705e-06,
"loss": 2.0372,
"step": 53300
},
{
"epoch": 6.50030432136336,
"grad_norm": 6.450936317443848,
"learning_rate": 2.2238255033557048e-06,
"loss": 2.0267,
"step": 53400
},
{
"epoch": 6.512477175897748,
"grad_norm": 6.037837982177734,
"learning_rate": 2.190268456375839e-06,
"loss": 2.0178,
"step": 53500
},
{
"epoch": 6.512477175897748,
"eval_loss": 1.9894185066223145,
"eval_runtime": 6.8572,
"eval_samples_per_second": 145.831,
"eval_steps_per_second": 36.458,
"step": 53500
},
{
"epoch": 6.524650030432136,
"grad_norm": 6.875925064086914,
"learning_rate": 2.1567114093959733e-06,
"loss": 2.0354,
"step": 53600
},
{
"epoch": 6.536822884966525,
"grad_norm": 6.961463451385498,
"learning_rate": 2.1231543624161076e-06,
"loss": 2.06,
"step": 53700
},
{
"epoch": 6.548995739500913,
"grad_norm": 5.773210525512695,
"learning_rate": 2.089597315436242e-06,
"loss": 2.0167,
"step": 53800
},
{
"epoch": 6.561168594035301,
"grad_norm": 6.747873783111572,
"learning_rate": 2.056040268456376e-06,
"loss": 1.9882,
"step": 53900
},
{
"epoch": 6.57334144856969,
"grad_norm": 6.432974338531494,
"learning_rate": 2.0224832214765104e-06,
"loss": 2.0029,
"step": 54000
},
{
"epoch": 6.57334144856969,
"eval_loss": 1.9841845035552979,
"eval_runtime": 6.8372,
"eval_samples_per_second": 146.258,
"eval_steps_per_second": 36.564,
"step": 54000
},
{
"epoch": 6.585514303104078,
"grad_norm": 6.159907341003418,
"learning_rate": 1.9889261744966446e-06,
"loss": 2.0454,
"step": 54100
},
{
"epoch": 6.597687157638466,
"grad_norm": 7.004731178283691,
"learning_rate": 1.955369127516779e-06,
"loss": 2.0011,
"step": 54200
},
{
"epoch": 6.609860012172854,
"grad_norm": 7.388941764831543,
"learning_rate": 1.9218120805369127e-06,
"loss": 2.0446,
"step": 54300
},
{
"epoch": 6.622032866707243,
"grad_norm": 7.399050235748291,
"learning_rate": 1.888255033557047e-06,
"loss": 2.0265,
"step": 54400
},
{
"epoch": 6.634205721241631,
"grad_norm": 6.445584297180176,
"learning_rate": 1.8546979865771813e-06,
"loss": 2.0124,
"step": 54500
},
{
"epoch": 6.634205721241631,
"eval_loss": 1.9837737083435059,
"eval_runtime": 6.903,
"eval_samples_per_second": 144.864,
"eval_steps_per_second": 36.216,
"step": 54500
},
{
"epoch": 6.646378575776019,
"grad_norm": 6.1334967613220215,
"learning_rate": 1.8211409395973155e-06,
"loss": 2.0495,
"step": 54600
},
{
"epoch": 6.658551430310408,
"grad_norm": 6.132894992828369,
"learning_rate": 1.7875838926174498e-06,
"loss": 2.0308,
"step": 54700
},
{
"epoch": 6.6707242848447965,
"grad_norm": 7.038134574890137,
"learning_rate": 1.7540268456375839e-06,
"loss": 2.0532,
"step": 54800
},
{
"epoch": 6.682897139379184,
"grad_norm": 6.755254745483398,
"learning_rate": 1.7204697986577181e-06,
"loss": 2.0178,
"step": 54900
},
{
"epoch": 6.695069993913573,
"grad_norm": 6.841146945953369,
"learning_rate": 1.6869127516778524e-06,
"loss": 2.0442,
"step": 55000
},
{
"epoch": 6.695069993913573,
"eval_loss": 1.9824799299240112,
"eval_runtime": 6.8019,
"eval_samples_per_second": 147.018,
"eval_steps_per_second": 36.755,
"step": 55000
},
{
"epoch": 6.707242848447961,
"grad_norm": 6.4666547775268555,
"learning_rate": 1.6533557046979867e-06,
"loss": 2.0205,
"step": 55100
},
{
"epoch": 6.719415702982349,
"grad_norm": 7.502538204193115,
"learning_rate": 1.619798657718121e-06,
"loss": 2.0261,
"step": 55200
},
{
"epoch": 6.731588557516738,
"grad_norm": 7.378790378570557,
"learning_rate": 1.5862416107382552e-06,
"loss": 2.0288,
"step": 55300
},
{
"epoch": 6.743761412051126,
"grad_norm": 7.264867305755615,
"learning_rate": 1.5526845637583892e-06,
"loss": 2.0187,
"step": 55400
},
{
"epoch": 6.755934266585514,
"grad_norm": 7.020994663238525,
"learning_rate": 1.5191275167785235e-06,
"loss": 2.038,
"step": 55500
},
{
"epoch": 6.755934266585514,
"eval_loss": 1.9808002710342407,
"eval_runtime": 6.8401,
"eval_samples_per_second": 146.197,
"eval_steps_per_second": 36.549,
"step": 55500
},
{
"epoch": 6.768107121119902,
"grad_norm": 6.773026943206787,
"learning_rate": 1.4855704697986578e-06,
"loss": 2.0144,
"step": 55600
},
{
"epoch": 6.780279975654291,
"grad_norm": 5.357457160949707,
"learning_rate": 1.452013422818792e-06,
"loss": 2.0353,
"step": 55700
},
{
"epoch": 6.7924528301886795,
"grad_norm": 6.2290873527526855,
"learning_rate": 1.4184563758389263e-06,
"loss": 2.0328,
"step": 55800
},
{
"epoch": 6.804625684723067,
"grad_norm": 6.145375728607178,
"learning_rate": 1.3848993288590606e-06,
"loss": 2.0438,
"step": 55900
},
{
"epoch": 6.816798539257456,
"grad_norm": 6.537805080413818,
"learning_rate": 1.3513422818791946e-06,
"loss": 2.0634,
"step": 56000
},
{
"epoch": 6.816798539257456,
"eval_loss": 1.9801044464111328,
"eval_runtime": 6.8399,
"eval_samples_per_second": 146.202,
"eval_steps_per_second": 36.55,
"step": 56000
}
],
"logging_steps": 100,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.080261694721884e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}