| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.816798539257456, |
| "eval_steps": 500, |
| "global_step": 56000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012172854534388313, |
| "grad_norm": 21.32210922241211, |
| "learning_rate": 4.85e-06, |
| "loss": 3.6457, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.024345709068776627, |
| "grad_norm": 17.6686954498291, |
| "learning_rate": 9.85e-06, |
| "loss": 3.3243, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.036518563603164945, |
| "grad_norm": 17.45920181274414, |
| "learning_rate": 1.4850000000000002e-05, |
| "loss": 3.2128, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.048691418137553254, |
| "grad_norm": 18.293773651123047, |
| "learning_rate": 1.985e-05, |
| "loss": 3.124, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.06086427267194157, |
| "grad_norm": 15.793401718139648, |
| "learning_rate": 1.995650224215247e-05, |
| "loss": 3.1209, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06086427267194157, |
| "eval_loss": 3.249819278717041, |
| "eval_runtime": 6.941, |
| "eval_samples_per_second": 144.072, |
| "eval_steps_per_second": 36.018, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07303712720632989, |
| "grad_norm": 13.932258605957031, |
| "learning_rate": 1.9911659192825115e-05, |
| "loss": 3.094, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0852099817407182, |
| "grad_norm": 11.793479919433594, |
| "learning_rate": 1.986681614349776e-05, |
| "loss": 3.0426, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.09738283627510651, |
| "grad_norm": 11.373984336853027, |
| "learning_rate": 1.9821973094170406e-05, |
| "loss": 3.0645, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10955569080949483, |
| "grad_norm": 10.407483100891113, |
| "learning_rate": 1.9777130044843052e-05, |
| "loss": 3.0681, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.12172854534388314, |
| "grad_norm": 9.600470542907715, |
| "learning_rate": 1.9732286995515698e-05, |
| "loss": 3.0599, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12172854534388314, |
| "eval_loss": 3.157822847366333, |
| "eval_runtime": 6.8366, |
| "eval_samples_per_second": 146.272, |
| "eval_steps_per_second": 36.568, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.13390139987827146, |
| "grad_norm": 10.010004043579102, |
| "learning_rate": 1.9687443946188343e-05, |
| "loss": 3.0379, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.14607425441265978, |
| "grad_norm": 9.130040168762207, |
| "learning_rate": 1.964260089686099e-05, |
| "loss": 2.9859, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.15824710894704808, |
| "grad_norm": 8.330909729003906, |
| "learning_rate": 1.9597757847533635e-05, |
| "loss": 3.0213, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.1704199634814364, |
| "grad_norm": 7.502275466918945, |
| "learning_rate": 1.955291479820628e-05, |
| "loss": 3.0415, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.18259281801582472, |
| "grad_norm": 7.305887222290039, |
| "learning_rate": 1.9508071748878926e-05, |
| "loss": 2.966, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.18259281801582472, |
| "eval_loss": 3.091937303543091, |
| "eval_runtime": 6.9209, |
| "eval_samples_per_second": 144.491, |
| "eval_steps_per_second": 36.123, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.19476567255021301, |
| "grad_norm": 8.190788269042969, |
| "learning_rate": 1.9463228699551572e-05, |
| "loss": 2.9814, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.20693852708460134, |
| "grad_norm": 7.867215633392334, |
| "learning_rate": 1.9418385650224218e-05, |
| "loss": 2.9614, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.21911138161898966, |
| "grad_norm": 7.410882472991943, |
| "learning_rate": 1.9373542600896864e-05, |
| "loss": 2.9515, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.23128423615337795, |
| "grad_norm": 6.388878345489502, |
| "learning_rate": 1.9328699551569506e-05, |
| "loss": 2.915, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.24345709068776628, |
| "grad_norm": 6.401773452758789, |
| "learning_rate": 1.928385650224215e-05, |
| "loss": 2.942, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.24345709068776628, |
| "eval_loss": 3.0528335571289062, |
| "eval_runtime": 6.9438, |
| "eval_samples_per_second": 144.014, |
| "eval_steps_per_second": 36.004, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2556299452221546, |
| "grad_norm": 6.346031665802002, |
| "learning_rate": 1.9239013452914797e-05, |
| "loss": 2.952, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2678027997565429, |
| "grad_norm": 7.141861438751221, |
| "learning_rate": 1.9194170403587446e-05, |
| "loss": 2.9309, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.27997565429093124, |
| "grad_norm": 7.175647735595703, |
| "learning_rate": 1.9149327354260092e-05, |
| "loss": 2.9315, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.29214850882531956, |
| "grad_norm": 5.47502326965332, |
| "learning_rate": 1.9104484304932738e-05, |
| "loss": 2.944, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.30432136335970783, |
| "grad_norm": 6.102653980255127, |
| "learning_rate": 1.9059641255605384e-05, |
| "loss": 2.8639, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.30432136335970783, |
| "eval_loss": 3.0088276863098145, |
| "eval_runtime": 6.9657, |
| "eval_samples_per_second": 143.56, |
| "eval_steps_per_second": 35.89, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.31649421789409615, |
| "grad_norm": 6.21509313583374, |
| "learning_rate": 1.901479820627803e-05, |
| "loss": 2.8462, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3286670724284845, |
| "grad_norm": 7.218765735626221, |
| "learning_rate": 1.8969955156950675e-05, |
| "loss": 2.8849, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3408399269628728, |
| "grad_norm": 6.037746429443359, |
| "learning_rate": 1.892511210762332e-05, |
| "loss": 2.894, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3530127814972611, |
| "grad_norm": 5.483625411987305, |
| "learning_rate": 1.8880269058295967e-05, |
| "loss": 2.8988, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.36518563603164944, |
| "grad_norm": 4.460190296173096, |
| "learning_rate": 1.8835426008968612e-05, |
| "loss": 2.8909, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.36518563603164944, |
| "eval_loss": 2.9809019565582275, |
| "eval_runtime": 6.9067, |
| "eval_samples_per_second": 144.787, |
| "eval_steps_per_second": 36.197, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 5.2231125831604, |
| "learning_rate": 1.8790582959641258e-05, |
| "loss": 2.894, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.38953134510042603, |
| "grad_norm": 5.9949517250061035, |
| "learning_rate": 1.8745739910313904e-05, |
| "loss": 2.8816, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.40170419963481435, |
| "grad_norm": 5.3864054679870605, |
| "learning_rate": 1.870089686098655e-05, |
| "loss": 2.863, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.4138770541692027, |
| "grad_norm": 6.138455390930176, |
| "learning_rate": 1.8656053811659195e-05, |
| "loss": 2.8577, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.426049908703591, |
| "grad_norm": 5.222280025482178, |
| "learning_rate": 1.861121076233184e-05, |
| "loss": 2.901, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.426049908703591, |
| "eval_loss": 2.944925308227539, |
| "eval_runtime": 6.9152, |
| "eval_samples_per_second": 144.608, |
| "eval_steps_per_second": 36.152, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.4382227632379793, |
| "grad_norm": 4.749873638153076, |
| "learning_rate": 1.8566367713004487e-05, |
| "loss": 2.8628, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.45039561777236764, |
| "grad_norm": 4.7014336585998535, |
| "learning_rate": 1.852152466367713e-05, |
| "loss": 2.8418, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.4625684723067559, |
| "grad_norm": 5.343926429748535, |
| "learning_rate": 1.8476681614349775e-05, |
| "loss": 2.9097, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.47474132684114423, |
| "grad_norm": 5.276562690734863, |
| "learning_rate": 1.8431838565022424e-05, |
| "loss": 2.8659, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.48691418137553255, |
| "grad_norm": 5.228163242340088, |
| "learning_rate": 1.838699551569507e-05, |
| "loss": 2.8497, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.48691418137553255, |
| "eval_loss": 2.9137816429138184, |
| "eval_runtime": 6.8289, |
| "eval_samples_per_second": 146.437, |
| "eval_steps_per_second": 36.609, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4990870359099209, |
| "grad_norm": 5.291093826293945, |
| "learning_rate": 1.8342600896860988e-05, |
| "loss": 2.8562, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.5112598904443092, |
| "grad_norm": 5.388160705566406, |
| "learning_rate": 1.8297757847533634e-05, |
| "loss": 2.87, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.5234327449786975, |
| "grad_norm": 5.260839939117432, |
| "learning_rate": 1.825291479820628e-05, |
| "loss": 2.8755, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.5356055995130858, |
| "grad_norm": 5.170462131500244, |
| "learning_rate": 1.8208071748878925e-05, |
| "loss": 2.8342, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5477784540474742, |
| "grad_norm": 4.9179582595825195, |
| "learning_rate": 1.816322869955157e-05, |
| "loss": 2.8494, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5477784540474742, |
| "eval_loss": 2.886016607284546, |
| "eval_runtime": 6.8492, |
| "eval_samples_per_second": 146.002, |
| "eval_steps_per_second": 36.5, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5599513085818625, |
| "grad_norm": 5.140480041503906, |
| "learning_rate": 1.8118385650224217e-05, |
| "loss": 2.8659, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5721241631162508, |
| "grad_norm": 5.088667869567871, |
| "learning_rate": 1.8073542600896862e-05, |
| "loss": 2.8228, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5842970176506391, |
| "grad_norm": 4.764868259429932, |
| "learning_rate": 1.8028699551569508e-05, |
| "loss": 2.8455, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5964698721850273, |
| "grad_norm": 4.458358287811279, |
| "learning_rate": 1.7983856502242154e-05, |
| "loss": 2.8196, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.6086427267194157, |
| "grad_norm": 5.425631999969482, |
| "learning_rate": 1.79390134529148e-05, |
| "loss": 2.8247, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.6086427267194157, |
| "eval_loss": 2.85610294342041, |
| "eval_runtime": 6.9206, |
| "eval_samples_per_second": 144.495, |
| "eval_steps_per_second": 36.124, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.620815581253804, |
| "grad_norm": 4.651830196380615, |
| "learning_rate": 1.7894170403587445e-05, |
| "loss": 2.8296, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6329884357881923, |
| "grad_norm": 5.064242839813232, |
| "learning_rate": 1.784932735426009e-05, |
| "loss": 2.8446, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 5.3180413246154785, |
| "learning_rate": 1.7804484304932737e-05, |
| "loss": 2.7944, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.657334144856969, |
| "grad_norm": 4.934672832489014, |
| "learning_rate": 1.7759641255605383e-05, |
| "loss": 2.7975, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6695069993913573, |
| "grad_norm": 5.154861927032471, |
| "learning_rate": 1.7714798206278028e-05, |
| "loss": 2.8144, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6695069993913573, |
| "eval_loss": 2.831345319747925, |
| "eval_runtime": 6.9102, |
| "eval_samples_per_second": 144.714, |
| "eval_steps_per_second": 36.179, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6816798539257456, |
| "grad_norm": 5.322381973266602, |
| "learning_rate": 1.7669955156950674e-05, |
| "loss": 2.8196, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6938527084601339, |
| "grad_norm": 4.949143886566162, |
| "learning_rate": 1.762511210762332e-05, |
| "loss": 2.8154, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.7060255629945222, |
| "grad_norm": 4.853809356689453, |
| "learning_rate": 1.7580269058295965e-05, |
| "loss": 2.8085, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.7181984175289106, |
| "grad_norm": 4.941267490386963, |
| "learning_rate": 1.753542600896861e-05, |
| "loss": 2.7982, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7303712720632989, |
| "grad_norm": 4.971885681152344, |
| "learning_rate": 1.7490582959641257e-05, |
| "loss": 2.8049, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7303712720632989, |
| "eval_loss": 2.8138246536254883, |
| "eval_runtime": 6.8576, |
| "eval_samples_per_second": 145.824, |
| "eval_steps_per_second": 36.456, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7425441265976872, |
| "grad_norm": 4.718198776245117, |
| "learning_rate": 1.7445739910313903e-05, |
| "loss": 2.7546, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 5.367305278778076, |
| "learning_rate": 1.740089686098655e-05, |
| "loss": 2.7714, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.7668898356664637, |
| "grad_norm": 4.827259063720703, |
| "learning_rate": 1.7356053811659194e-05, |
| "loss": 2.8043, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.7790626902008521, |
| "grad_norm": 5.011576175689697, |
| "learning_rate": 1.731121076233184e-05, |
| "loss": 2.7859, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.7912355447352404, |
| "grad_norm": 5.363623142242432, |
| "learning_rate": 1.7266816143497758e-05, |
| "loss": 2.8161, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7912355447352404, |
| "eval_loss": 2.791551113128662, |
| "eval_runtime": 6.8881, |
| "eval_samples_per_second": 145.177, |
| "eval_steps_per_second": 36.294, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.8034083992696287, |
| "grad_norm": 4.721231937408447, |
| "learning_rate": 1.7221973094170404e-05, |
| "loss": 2.7857, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.815581253804017, |
| "grad_norm": 4.657351016998291, |
| "learning_rate": 1.717713004484305e-05, |
| "loss": 2.7734, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.8277541083384053, |
| "grad_norm": 4.4942145347595215, |
| "learning_rate": 1.7132286995515695e-05, |
| "loss": 2.7885, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.8399269628727937, |
| "grad_norm": 5.061729431152344, |
| "learning_rate": 1.708744394618834e-05, |
| "loss": 2.7841, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.852099817407182, |
| "grad_norm": 4.816007137298584, |
| "learning_rate": 1.7042600896860987e-05, |
| "loss": 2.741, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.852099817407182, |
| "eval_loss": 2.7756857872009277, |
| "eval_runtime": 6.8679, |
| "eval_samples_per_second": 145.605, |
| "eval_steps_per_second": 36.401, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8642726719415703, |
| "grad_norm": 5.255375385284424, |
| "learning_rate": 1.6997757847533633e-05, |
| "loss": 2.7561, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.8764455264759586, |
| "grad_norm": 4.844815254211426, |
| "learning_rate": 1.695291479820628e-05, |
| "loss": 2.7558, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.888618381010347, |
| "grad_norm": 4.8912224769592285, |
| "learning_rate": 1.6908071748878924e-05, |
| "loss": 2.7512, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.9007912355447353, |
| "grad_norm": 4.5775017738342285, |
| "learning_rate": 1.686322869955157e-05, |
| "loss": 2.745, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.9129640900791236, |
| "grad_norm": 4.753942012786865, |
| "learning_rate": 1.6818385650224216e-05, |
| "loss": 2.7173, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.9129640900791236, |
| "eval_loss": 2.7591283321380615, |
| "eval_runtime": 6.877, |
| "eval_samples_per_second": 145.412, |
| "eval_steps_per_second": 36.353, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.9251369446135118, |
| "grad_norm": 5.192244052886963, |
| "learning_rate": 1.677354260089686e-05, |
| "loss": 2.7373, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.9373097991479001, |
| "grad_norm": 4.5390801429748535, |
| "learning_rate": 1.6728699551569507e-05, |
| "loss": 2.7654, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.9494826536822885, |
| "grad_norm": 5.091897487640381, |
| "learning_rate": 1.6683856502242153e-05, |
| "loss": 2.7615, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.9616555082166768, |
| "grad_norm": 4.253417015075684, |
| "learning_rate": 1.6639013452914802e-05, |
| "loss": 2.7521, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.9738283627510651, |
| "grad_norm": 4.891059875488281, |
| "learning_rate": 1.6594170403587448e-05, |
| "loss": 2.7665, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9738283627510651, |
| "eval_loss": 2.7409751415252686, |
| "eval_runtime": 6.8856, |
| "eval_samples_per_second": 145.23, |
| "eval_steps_per_second": 36.308, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9860012172854534, |
| "grad_norm": 4.412657260894775, |
| "learning_rate": 1.6549327354260093e-05, |
| "loss": 2.7471, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.9981740718198417, |
| "grad_norm": 5.708240509033203, |
| "learning_rate": 1.650448430493274e-05, |
| "loss": 2.7545, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.01034692635423, |
| "grad_norm": 4.956757068634033, |
| "learning_rate": 1.645964125560538e-05, |
| "loss": 2.6015, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.0225197808886184, |
| "grad_norm": 5.220682621002197, |
| "learning_rate": 1.6414798206278027e-05, |
| "loss": 2.6077, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.0346926354230066, |
| "grad_norm": 5.160597801208496, |
| "learning_rate": 1.6369955156950673e-05, |
| "loss": 2.5857, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.0346926354230066, |
| "eval_loss": 2.7148427963256836, |
| "eval_runtime": 6.8912, |
| "eval_samples_per_second": 145.113, |
| "eval_steps_per_second": 36.278, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.046865489957395, |
| "grad_norm": 5.304019927978516, |
| "learning_rate": 1.6325560538116595e-05, |
| "loss": 2.5738, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.0590383444917832, |
| "grad_norm": 5.3433637619018555, |
| "learning_rate": 1.628071748878924e-05, |
| "loss": 2.5499, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.0712111990261717, |
| "grad_norm": 4.527110576629639, |
| "learning_rate": 1.6235874439461886e-05, |
| "loss": 2.6156, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.08338405356056, |
| "grad_norm": 5.513104438781738, |
| "learning_rate": 1.6191031390134532e-05, |
| "loss": 2.6217, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.0955569080949483, |
| "grad_norm": 5.579029083251953, |
| "learning_rate": 1.6146188340807178e-05, |
| "loss": 2.5829, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.0955569080949483, |
| "eval_loss": 2.6865806579589844, |
| "eval_runtime": 6.8369, |
| "eval_samples_per_second": 146.265, |
| "eval_steps_per_second": 36.566, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.1077297626293365, |
| "grad_norm": 4.849677562713623, |
| "learning_rate": 1.6101345291479823e-05, |
| "loss": 2.6253, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.119902617163725, |
| "grad_norm": 5.025945663452148, |
| "learning_rate": 1.605650224215247e-05, |
| "loss": 2.5725, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.1320754716981132, |
| "grad_norm": 5.991898059844971, |
| "learning_rate": 1.601165919282511e-05, |
| "loss": 2.5994, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.1442483262325016, |
| "grad_norm": 4.980128765106201, |
| "learning_rate": 1.5966816143497757e-05, |
| "loss": 2.5645, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.1564211807668898, |
| "grad_norm": 4.839084625244141, |
| "learning_rate": 1.5921973094170403e-05, |
| "loss": 2.5861, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.1564211807668898, |
| "eval_loss": 2.6708385944366455, |
| "eval_runtime": 6.8615, |
| "eval_samples_per_second": 145.74, |
| "eval_steps_per_second": 36.435, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.168594035301278, |
| "grad_norm": 5.058382511138916, |
| "learning_rate": 1.587713004484305e-05, |
| "loss": 2.5524, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.1807668898356665, |
| "grad_norm": 4.867978572845459, |
| "learning_rate": 1.5832286995515694e-05, |
| "loss": 2.582, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.192939744370055, |
| "grad_norm": 5.896303653717041, |
| "learning_rate": 1.578744394618834e-05, |
| "loss": 2.5899, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.205112598904443, |
| "grad_norm": 4.735970497131348, |
| "learning_rate": 1.574260089686099e-05, |
| "loss": 2.5878, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.2172854534388313, |
| "grad_norm": 4.8292670249938965, |
| "learning_rate": 1.5697757847533635e-05, |
| "loss": 2.6047, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2172854534388313, |
| "eval_loss": 2.65461802482605, |
| "eval_runtime": 6.8819, |
| "eval_samples_per_second": 145.309, |
| "eval_steps_per_second": 36.327, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.2294583079732198, |
| "grad_norm": 5.350712299346924, |
| "learning_rate": 1.565291479820628e-05, |
| "loss": 2.5777, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.241631162507608, |
| "grad_norm": 5.471200466156006, |
| "learning_rate": 1.5608071748878926e-05, |
| "loss": 2.5908, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.2538040170419964, |
| "grad_norm": 5.038080215454102, |
| "learning_rate": 1.5563228699551572e-05, |
| "loss": 2.5951, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.2659768715763846, |
| "grad_norm": 4.982104778289795, |
| "learning_rate": 1.5518385650224218e-05, |
| "loss": 2.5461, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.278149726110773, |
| "grad_norm": 4.736184120178223, |
| "learning_rate": 1.5473542600896864e-05, |
| "loss": 2.5874, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.278149726110773, |
| "eval_loss": 2.6384053230285645, |
| "eval_runtime": 6.8888, |
| "eval_samples_per_second": 145.164, |
| "eval_steps_per_second": 36.291, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 5.710967540740967, |
| "learning_rate": 1.5429147982062782e-05, |
| "loss": 2.5818, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.3024954351795497, |
| "grad_norm": 5.1653947830200195, |
| "learning_rate": 1.5384304932735428e-05, |
| "loss": 2.5916, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.314668289713938, |
| "grad_norm": 5.706851959228516, |
| "learning_rate": 1.5339461883408074e-05, |
| "loss": 2.563, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.326841144248326, |
| "grad_norm": 5.320187568664551, |
| "learning_rate": 1.529461883408072e-05, |
| "loss": 2.5657, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.3390139987827145, |
| "grad_norm": 5.1567463874816895, |
| "learning_rate": 1.5249775784753365e-05, |
| "loss": 2.5362, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.3390139987827145, |
| "eval_loss": 2.6256721019744873, |
| "eval_runtime": 6.8781, |
| "eval_samples_per_second": 145.389, |
| "eval_steps_per_second": 36.347, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.351186853317103, |
| "grad_norm": 5.355208396911621, |
| "learning_rate": 1.520493273542601e-05, |
| "loss": 2.5748, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.3633597078514912, |
| "grad_norm": 4.878857612609863, |
| "learning_rate": 1.5160089686098656e-05, |
| "loss": 2.5768, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.3755325623858794, |
| "grad_norm": 5.551296234130859, |
| "learning_rate": 1.51152466367713e-05, |
| "loss": 2.5616, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.3877054169202678, |
| "grad_norm": 4.894459247589111, |
| "learning_rate": 1.5070403587443946e-05, |
| "loss": 2.5366, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.399878271454656, |
| "grad_norm": 5.237545967102051, |
| "learning_rate": 1.5025560538116592e-05, |
| "loss": 2.5516, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.399878271454656, |
| "eval_loss": 2.6034560203552246, |
| "eval_runtime": 6.9038, |
| "eval_samples_per_second": 144.848, |
| "eval_steps_per_second": 36.212, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.4120511259890445, |
| "grad_norm": 4.714597702026367, |
| "learning_rate": 1.4980717488789238e-05, |
| "loss": 2.5384, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.4242239805234327, |
| "grad_norm": 4.776740550994873, |
| "learning_rate": 1.4935874439461883e-05, |
| "loss": 2.5733, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.4363968350578211, |
| "grad_norm": 5.181590557098389, |
| "learning_rate": 1.4891031390134529e-05, |
| "loss": 2.5698, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.4485696895922093, |
| "grad_norm": 4.948436737060547, |
| "learning_rate": 1.4846188340807177e-05, |
| "loss": 2.5288, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.4607425441265978, |
| "grad_norm": 5.549213409423828, |
| "learning_rate": 1.4801345291479822e-05, |
| "loss": 2.5291, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.4607425441265978, |
| "eval_loss": 2.5940563678741455, |
| "eval_runtime": 6.8627, |
| "eval_samples_per_second": 145.716, |
| "eval_steps_per_second": 36.429, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.472915398660986, |
| "grad_norm": 6.372870445251465, |
| "learning_rate": 1.4756502242152468e-05, |
| "loss": 2.5457, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.4850882531953742, |
| "grad_norm": 5.433255195617676, |
| "learning_rate": 1.4711659192825114e-05, |
| "loss": 2.5521, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.4972611077297626, |
| "grad_norm": 5.604691028594971, |
| "learning_rate": 1.466681614349776e-05, |
| "loss": 2.5585, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "grad_norm": 5.348121643066406, |
| "learning_rate": 1.4621973094170405e-05, |
| "loss": 2.527, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.5216068167985393, |
| "grad_norm": 4.68524694442749, |
| "learning_rate": 1.4577130044843051e-05, |
| "loss": 2.5351, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.5216068167985393, |
| "eval_loss": 2.5787315368652344, |
| "eval_runtime": 6.8521, |
| "eval_samples_per_second": 145.94, |
| "eval_steps_per_second": 36.485, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.5337796713329275, |
| "grad_norm": 5.369399070739746, |
| "learning_rate": 1.4532286995515697e-05, |
| "loss": 2.5457, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.545952525867316, |
| "grad_norm": 5.384763717651367, |
| "learning_rate": 1.4487892376681615e-05, |
| "loss": 2.5603, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.5581253804017043, |
| "grad_norm": 5.1856369972229, |
| "learning_rate": 1.4443049327354261e-05, |
| "loss": 2.5531, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.5702982349360926, |
| "grad_norm": 5.600665092468262, |
| "learning_rate": 1.4398206278026907e-05, |
| "loss": 2.5226, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.5824710894704808, |
| "grad_norm": 5.185864448547363, |
| "learning_rate": 1.4353363228699552e-05, |
| "loss": 2.5585, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.5824710894704808, |
| "eval_loss": 2.5597262382507324, |
| "eval_runtime": 6.9067, |
| "eval_samples_per_second": 144.787, |
| "eval_steps_per_second": 36.197, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.5946439440048692, |
| "grad_norm": 5.945424556732178, |
| "learning_rate": 1.4308520179372198e-05, |
| "loss": 2.5447, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.6068167985392574, |
| "grad_norm": 4.447841167449951, |
| "learning_rate": 1.4263677130044844e-05, |
| "loss": 2.5638, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.6189896530736458, |
| "grad_norm": 4.947375297546387, |
| "learning_rate": 1.421883408071749e-05, |
| "loss": 2.5245, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.631162507608034, |
| "grad_norm": 5.11275053024292, |
| "learning_rate": 1.4173991031390135e-05, |
| "loss": 2.504, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.6433353621424223, |
| "grad_norm": 5.144463539123535, |
| "learning_rate": 1.4129147982062781e-05, |
| "loss": 2.5517, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.6433353621424223, |
| "eval_loss": 2.5378565788269043, |
| "eval_runtime": 6.8542, |
| "eval_samples_per_second": 145.896, |
| "eval_steps_per_second": 36.474, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.6555082166768107, |
| "grad_norm": 6.138312816619873, |
| "learning_rate": 1.4084304932735427e-05, |
| "loss": 2.5334, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.6676810712111991, |
| "grad_norm": 4.641015529632568, |
| "learning_rate": 1.4039461883408072e-05, |
| "loss": 2.5692, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.6798539257455873, |
| "grad_norm": 5.140405178070068, |
| "learning_rate": 1.3994618834080718e-05, |
| "loss": 2.5462, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.6920267802799756, |
| "grad_norm": 5.093076705932617, |
| "learning_rate": 1.3949775784753366e-05, |
| "loss": 2.5227, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.704199634814364, |
| "grad_norm": 5.549164772033691, |
| "learning_rate": 1.3904932735426011e-05, |
| "loss": 2.5469, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.704199634814364, |
| "eval_loss": 2.5302209854125977, |
| "eval_runtime": 6.8833, |
| "eval_samples_per_second": 145.279, |
| "eval_steps_per_second": 36.32, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.7163724893487524, |
| "grad_norm": 5.112196922302246, |
| "learning_rate": 1.3860089686098657e-05, |
| "loss": 2.4753, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.7285453438831406, |
| "grad_norm": 4.9223313331604, |
| "learning_rate": 1.3815246636771303e-05, |
| "loss": 2.5477, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.7407181984175288, |
| "grad_norm": 5.270020484924316, |
| "learning_rate": 1.3770403587443948e-05, |
| "loss": 2.5141, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.7528910529519173, |
| "grad_norm": 5.377967357635498, |
| "learning_rate": 1.3725560538116594e-05, |
| "loss": 2.5151, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.7650639074863055, |
| "grad_norm": 4.732293605804443, |
| "learning_rate": 1.368071748878924e-05, |
| "loss": 2.559, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.7650639074863055, |
| "eval_loss": 2.5161020755767822, |
| "eval_runtime": 6.8279, |
| "eval_samples_per_second": 146.457, |
| "eval_steps_per_second": 36.614, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.777236762020694, |
| "grad_norm": 5.2639241218566895, |
| "learning_rate": 1.3635874439461884e-05, |
| "loss": 2.5199, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.7894096165550821, |
| "grad_norm": 5.222829818725586, |
| "learning_rate": 1.3591479820627804e-05, |
| "loss": 2.5122, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.8015824710894703, |
| "grad_norm": 5.396998882293701, |
| "learning_rate": 1.354663677130045e-05, |
| "loss": 2.5665, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.8137553256238588, |
| "grad_norm": 5.598328113555908, |
| "learning_rate": 1.3501793721973096e-05, |
| "loss": 2.5061, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.8259281801582472, |
| "grad_norm": 4.519299507141113, |
| "learning_rate": 1.3456950672645741e-05, |
| "loss": 2.5173, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.8259281801582472, |
| "eval_loss": 2.505549430847168, |
| "eval_runtime": 6.8476, |
| "eval_samples_per_second": 146.036, |
| "eval_steps_per_second": 36.509, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.8381010346926354, |
| "grad_norm": 5.07867431640625, |
| "learning_rate": 1.3412107623318387e-05, |
| "loss": 2.5085, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.8502738892270236, |
| "grad_norm": 4.80793571472168, |
| "learning_rate": 1.3367264573991033e-05, |
| "loss": 2.5269, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.862446743761412, |
| "grad_norm": 5.122992992401123, |
| "learning_rate": 1.3322421524663679e-05, |
| "loss": 2.5165, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.8746195982958005, |
| "grad_norm": 5.070724010467529, |
| "learning_rate": 1.3277578475336324e-05, |
| "loss": 2.4733, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "grad_norm": 4.850822448730469, |
| "learning_rate": 1.3232735426008968e-05, |
| "loss": 2.5045, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "eval_loss": 2.49042010307312, |
| "eval_runtime": 6.9202, |
| "eval_samples_per_second": 144.505, |
| "eval_steps_per_second": 36.126, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.898965307364577, |
| "grad_norm": 5.182281494140625, |
| "learning_rate": 1.3187892376681614e-05, |
| "loss": 2.4858, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.9111381618989653, |
| "grad_norm": 4.803709030151367, |
| "learning_rate": 1.314304932735426e-05, |
| "loss": 2.5043, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.9233110164333538, |
| "grad_norm": 5.211897850036621, |
| "learning_rate": 1.3098206278026905e-05, |
| "loss": 2.4974, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 4.982048988342285, |
| "learning_rate": 1.3053363228699553e-05, |
| "loss": 2.4901, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.9476567255021302, |
| "grad_norm": 5.34013557434082, |
| "learning_rate": 1.3008520179372199e-05, |
| "loss": 2.4938, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.9476567255021302, |
| "eval_loss": 2.479241371154785, |
| "eval_runtime": 6.9209, |
| "eval_samples_per_second": 144.49, |
| "eval_steps_per_second": 36.122, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.9598295800365184, |
| "grad_norm": 4.926109790802002, |
| "learning_rate": 1.2963677130044844e-05, |
| "loss": 2.522, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.9720024345709068, |
| "grad_norm": 5.252937316894531, |
| "learning_rate": 1.291883408071749e-05, |
| "loss": 2.4979, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.9841752891052953, |
| "grad_norm": 4.676843166351318, |
| "learning_rate": 1.2873991031390136e-05, |
| "loss": 2.5011, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.9963481436396835, |
| "grad_norm": 4.4982171058654785, |
| "learning_rate": 1.2829147982062782e-05, |
| "loss": 2.5232, |
| "step": 16400 |
| }, |
| { |
| "epoch": 2.0085209981740717, |
| "grad_norm": 5.115514278411865, |
| "learning_rate": 1.2784304932735427e-05, |
| "loss": 2.4807, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.0085209981740717, |
| "eval_loss": 2.4553143978118896, |
| "eval_runtime": 6.8911, |
| "eval_samples_per_second": 145.114, |
| "eval_steps_per_second": 36.279, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.02069385270846, |
| "grad_norm": 5.778520107269287, |
| "learning_rate": 1.2739461883408073e-05, |
| "loss": 2.3637, |
| "step": 16600 |
| }, |
| { |
| "epoch": 2.0328667072428486, |
| "grad_norm": 4.936229705810547, |
| "learning_rate": 1.2694618834080719e-05, |
| "loss": 2.3936, |
| "step": 16700 |
| }, |
| { |
| "epoch": 2.045039561777237, |
| "grad_norm": 6.013847827911377, |
| "learning_rate": 1.2649775784753364e-05, |
| "loss": 2.3953, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.057212416311625, |
| "grad_norm": 6.078458786010742, |
| "learning_rate": 1.2605381165919283e-05, |
| "loss": 2.3312, |
| "step": 16900 |
| }, |
| { |
| "epoch": 2.069385270846013, |
| "grad_norm": 5.697019100189209, |
| "learning_rate": 1.2560538116591929e-05, |
| "loss": 2.334, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.069385270846013, |
| "eval_loss": 2.4449574947357178, |
| "eval_runtime": 6.9363, |
| "eval_samples_per_second": 144.169, |
| "eval_steps_per_second": 36.042, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.081558125380402, |
| "grad_norm": 5.652517795562744, |
| "learning_rate": 1.2515695067264574e-05, |
| "loss": 2.3902, |
| "step": 17100 |
| }, |
| { |
| "epoch": 2.09373097991479, |
| "grad_norm": 6.007380485534668, |
| "learning_rate": 1.247085201793722e-05, |
| "loss": 2.3629, |
| "step": 17200 |
| }, |
| { |
| "epoch": 2.1059038344491783, |
| "grad_norm": 5.070584774017334, |
| "learning_rate": 1.2426008968609866e-05, |
| "loss": 2.3523, |
| "step": 17300 |
| }, |
| { |
| "epoch": 2.1180766889835665, |
| "grad_norm": 5.079153537750244, |
| "learning_rate": 1.2381165919282512e-05, |
| "loss": 2.3429, |
| "step": 17400 |
| }, |
| { |
| "epoch": 2.130249543517955, |
| "grad_norm": 5.278266906738281, |
| "learning_rate": 1.2336322869955157e-05, |
| "loss": 2.2969, |
| "step": 17500 |
| }, |
| { |
| "epoch": 2.130249543517955, |
| "eval_loss": 2.4217474460601807, |
| "eval_runtime": 6.9637, |
| "eval_samples_per_second": 143.601, |
| "eval_steps_per_second": 35.9, |
| "step": 17500 |
| }, |
| { |
| "epoch": 2.1424223980523434, |
| "grad_norm": 5.2419633865356445, |
| "learning_rate": 1.2291479820627803e-05, |
| "loss": 2.3671, |
| "step": 17600 |
| }, |
| { |
| "epoch": 2.1545952525867316, |
| "grad_norm": 5.445255279541016, |
| "learning_rate": 1.2246636771300449e-05, |
| "loss": 2.3834, |
| "step": 17700 |
| }, |
| { |
| "epoch": 2.16676810712112, |
| "grad_norm": 5.891075134277344, |
| "learning_rate": 1.2201793721973095e-05, |
| "loss": 2.36, |
| "step": 17800 |
| }, |
| { |
| "epoch": 2.178940961655508, |
| "grad_norm": 5.8141865730285645, |
| "learning_rate": 1.215695067264574e-05, |
| "loss": 2.3596, |
| "step": 17900 |
| }, |
| { |
| "epoch": 2.1911138161898966, |
| "grad_norm": 5.558561325073242, |
| "learning_rate": 1.2112107623318388e-05, |
| "loss": 2.3926, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.1911138161898966, |
| "eval_loss": 2.415804624557495, |
| "eval_runtime": 6.8469, |
| "eval_samples_per_second": 146.052, |
| "eval_steps_per_second": 36.513, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.203286670724285, |
| "grad_norm": 5.968663692474365, |
| "learning_rate": 1.2067264573991033e-05, |
| "loss": 2.3609, |
| "step": 18100 |
| }, |
| { |
| "epoch": 2.215459525258673, |
| "grad_norm": 5.241644382476807, |
| "learning_rate": 1.2022421524663679e-05, |
| "loss": 2.3634, |
| "step": 18200 |
| }, |
| { |
| "epoch": 2.2276323797930613, |
| "grad_norm": 6.328832149505615, |
| "learning_rate": 1.1977578475336325e-05, |
| "loss": 2.3465, |
| "step": 18300 |
| }, |
| { |
| "epoch": 2.23980523432745, |
| "grad_norm": 5.125701904296875, |
| "learning_rate": 1.193273542600897e-05, |
| "loss": 2.3171, |
| "step": 18400 |
| }, |
| { |
| "epoch": 2.251978088861838, |
| "grad_norm": 4.962270259857178, |
| "learning_rate": 1.1887892376681616e-05, |
| "loss": 2.3739, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.251978088861838, |
| "eval_loss": 2.4065887928009033, |
| "eval_runtime": 6.9359, |
| "eval_samples_per_second": 144.178, |
| "eval_steps_per_second": 36.045, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.2641509433962264, |
| "grad_norm": 5.895593643188477, |
| "learning_rate": 1.1843049327354262e-05, |
| "loss": 2.3656, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.2763237979306146, |
| "grad_norm": 6.21762752532959, |
| "learning_rate": 1.1798206278026906e-05, |
| "loss": 2.3575, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.2884966524650032, |
| "grad_norm": 5.935133934020996, |
| "learning_rate": 1.1753363228699552e-05, |
| "loss": 2.3687, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.3006695069993914, |
| "grad_norm": 5.431483268737793, |
| "learning_rate": 1.1708520179372198e-05, |
| "loss": 2.3465, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.3128423615337796, |
| "grad_norm": 6.319828510284424, |
| "learning_rate": 1.1664125560538118e-05, |
| "loss": 2.3659, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.3128423615337796, |
| "eval_loss": 2.390819787979126, |
| "eval_runtime": 6.9389, |
| "eval_samples_per_second": 144.115, |
| "eval_steps_per_second": 36.029, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.325015216068168, |
| "grad_norm": 5.955752372741699, |
| "learning_rate": 1.1619282511210763e-05, |
| "loss": 2.3702, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.337188070602556, |
| "grad_norm": 5.977270603179932, |
| "learning_rate": 1.157443946188341e-05, |
| "loss": 2.3935, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.3493609251369447, |
| "grad_norm": 5.417830944061279, |
| "learning_rate": 1.1529596412556055e-05, |
| "loss": 2.359, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.361533779671333, |
| "grad_norm": 5.452037334442139, |
| "learning_rate": 1.14847533632287e-05, |
| "loss": 2.3496, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.373706634205721, |
| "grad_norm": 4.931158065795898, |
| "learning_rate": 1.1439910313901346e-05, |
| "loss": 2.3483, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.373706634205721, |
| "eval_loss": 2.3805489540100098, |
| "eval_runtime": 6.803, |
| "eval_samples_per_second": 146.994, |
| "eval_steps_per_second": 36.749, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.38587948874011, |
| "grad_norm": 5.650387287139893, |
| "learning_rate": 1.1395067264573992e-05, |
| "loss": 2.3644, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.398052343274498, |
| "grad_norm": 5.70589542388916, |
| "learning_rate": 1.1350224215246636e-05, |
| "loss": 2.3472, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.410225197808886, |
| "grad_norm": 5.833774566650391, |
| "learning_rate": 1.1305381165919282e-05, |
| "loss": 2.3663, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.4223980523432744, |
| "grad_norm": 5.079782485961914, |
| "learning_rate": 1.1260538116591928e-05, |
| "loss": 2.3726, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.4345709068776626, |
| "grad_norm": 5.578153610229492, |
| "learning_rate": 1.1215695067264577e-05, |
| "loss": 2.3432, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.4345709068776626, |
| "eval_loss": 2.3689472675323486, |
| "eval_runtime": 6.993, |
| "eval_samples_per_second": 143.0, |
| "eval_steps_per_second": 35.75, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.4467437614120513, |
| "grad_norm": 5.551452159881592, |
| "learning_rate": 1.117085201793722e-05, |
| "loss": 2.3658, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.4589166159464395, |
| "grad_norm": 5.28959321975708, |
| "learning_rate": 1.1126008968609866e-05, |
| "loss": 2.3526, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.4710894704808277, |
| "grad_norm": 5.358762741088867, |
| "learning_rate": 1.1081165919282512e-05, |
| "loss": 2.3161, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.483262325015216, |
| "grad_norm": 5.633576393127441, |
| "learning_rate": 1.1036322869955158e-05, |
| "loss": 2.3778, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.495435179549604, |
| "grad_norm": 5.258509635925293, |
| "learning_rate": 1.0991479820627804e-05, |
| "loss": 2.3538, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.495435179549604, |
| "eval_loss": 2.358330488204956, |
| "eval_runtime": 6.9369, |
| "eval_samples_per_second": 144.156, |
| "eval_steps_per_second": 36.039, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.507608034083993, |
| "grad_norm": 5.632132053375244, |
| "learning_rate": 1.094663677130045e-05, |
| "loss": 2.3514, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.519780888618381, |
| "grad_norm": 5.449893951416016, |
| "learning_rate": 1.0901793721973095e-05, |
| "loss": 2.3404, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.531953743152769, |
| "grad_norm": 5.66605281829834, |
| "learning_rate": 1.085695067264574e-05, |
| "loss": 2.3335, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.544126597687158, |
| "grad_norm": 6.729547500610352, |
| "learning_rate": 1.0812107623318387e-05, |
| "loss": 2.3784, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.556299452221546, |
| "grad_norm": 5.5277581214904785, |
| "learning_rate": 1.0767713004484305e-05, |
| "loss": 2.3424, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.556299452221546, |
| "eval_loss": 2.3434271812438965, |
| "eval_runtime": 6.8629, |
| "eval_samples_per_second": 145.712, |
| "eval_steps_per_second": 36.428, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.5684723067559343, |
| "grad_norm": 5.892464637756348, |
| "learning_rate": 1.072286995515695e-05, |
| "loss": 2.3577, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 5.313469409942627, |
| "learning_rate": 1.0678026905829597e-05, |
| "loss": 2.3489, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.5928180158247107, |
| "grad_norm": 5.569064140319824, |
| "learning_rate": 1.0633183856502242e-05, |
| "loss": 2.3828, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.6049908703590994, |
| "grad_norm": 6.133281707763672, |
| "learning_rate": 1.0588340807174888e-05, |
| "loss": 2.3203, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.6171637248934876, |
| "grad_norm": 5.569573402404785, |
| "learning_rate": 1.0543497757847534e-05, |
| "loss": 2.3508, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.6171637248934876, |
| "eval_loss": 2.3320422172546387, |
| "eval_runtime": 6.8431, |
| "eval_samples_per_second": 146.132, |
| "eval_steps_per_second": 36.533, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.629336579427876, |
| "grad_norm": 5.560952186584473, |
| "learning_rate": 1.049865470852018e-05, |
| "loss": 2.3532, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.641509433962264, |
| "grad_norm": 5.652987957000732, |
| "learning_rate": 1.0453811659192825e-05, |
| "loss": 2.3233, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.653682288496652, |
| "grad_norm": 5.666792869567871, |
| "learning_rate": 1.0408968609865471e-05, |
| "loss": 2.353, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.665855143031041, |
| "grad_norm": 5.652164936065674, |
| "learning_rate": 1.0364125560538117e-05, |
| "loss": 2.3483, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.678027997565429, |
| "grad_norm": 5.158956527709961, |
| "learning_rate": 1.0319282511210764e-05, |
| "loss": 2.3344, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.678027997565429, |
| "eval_loss": 2.3204360008239746, |
| "eval_runtime": 6.8964, |
| "eval_samples_per_second": 145.003, |
| "eval_steps_per_second": 36.251, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.6902008520998173, |
| "grad_norm": 4.993370056152344, |
| "learning_rate": 1.027443946188341e-05, |
| "loss": 2.3185, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.702373706634206, |
| "grad_norm": 5.251499652862549, |
| "learning_rate": 1.0229596412556056e-05, |
| "loss": 2.3463, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.714546561168594, |
| "grad_norm": 5.155273914337158, |
| "learning_rate": 1.0184753363228701e-05, |
| "loss": 2.3299, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.7267194157029824, |
| "grad_norm": 4.445164680480957, |
| "learning_rate": 1.0139910313901347e-05, |
| "loss": 2.3368, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.7388922702373706, |
| "grad_norm": 5.968411445617676, |
| "learning_rate": 1.0095067264573993e-05, |
| "loss": 2.321, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.7388922702373706, |
| "eval_loss": 2.3084633350372314, |
| "eval_runtime": 6.9774, |
| "eval_samples_per_second": 143.32, |
| "eval_steps_per_second": 35.83, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.751065124771759, |
| "grad_norm": 5.2266364097595215, |
| "learning_rate": 1.0050224215246638e-05, |
| "loss": 2.3387, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.7632379793061475, |
| "grad_norm": 5.649938583374023, |
| "learning_rate": 1.0005381165919284e-05, |
| "loss": 2.3388, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.7754108338405357, |
| "grad_norm": 5.603872299194336, |
| "learning_rate": 9.96053811659193e-06, |
| "loss": 2.3331, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.787583688374924, |
| "grad_norm": 5.831801891326904, |
| "learning_rate": 9.915695067264574e-06, |
| "loss": 2.3509, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.799756542909312, |
| "grad_norm": 5.071148872375488, |
| "learning_rate": 9.871300448430494e-06, |
| "loss": 2.3296, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.799756542909312, |
| "eval_loss": 2.298048257827759, |
| "eval_runtime": 6.8909, |
| "eval_samples_per_second": 145.119, |
| "eval_steps_per_second": 36.28, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.8119293974437003, |
| "grad_norm": 5.613708972930908, |
| "learning_rate": 9.82645739910314e-06, |
| "loss": 2.3458, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.824102251978089, |
| "grad_norm": 6.964206218719482, |
| "learning_rate": 9.781614349775786e-06, |
| "loss": 2.3523, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.836275106512477, |
| "grad_norm": 6.069615364074707, |
| "learning_rate": 9.737219730941706e-06, |
| "loss": 2.3364, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.8484479610468654, |
| "grad_norm": 4.563328266143799, |
| "learning_rate": 9.69237668161435e-06, |
| "loss": 2.3164, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.860620815581254, |
| "grad_norm": 5.069984436035156, |
| "learning_rate": 9.647533632286995e-06, |
| "loss": 2.3347, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.860620815581254, |
| "eval_loss": 2.2902982234954834, |
| "eval_runtime": 6.9027, |
| "eval_samples_per_second": 144.87, |
| "eval_steps_per_second": 36.218, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.8727936701156422, |
| "grad_norm": 5.443928241729736, |
| "learning_rate": 9.602690582959641e-06, |
| "loss": 2.3211, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.8849665246500305, |
| "grad_norm": 5.5851664543151855, |
| "learning_rate": 9.557847533632287e-06, |
| "loss": 2.3469, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.8971393791844187, |
| "grad_norm": 5.386264324188232, |
| "learning_rate": 9.513004484304934e-06, |
| "loss": 2.3303, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.909312233718807, |
| "grad_norm": 5.505928993225098, |
| "learning_rate": 9.46816143497758e-06, |
| "loss": 2.3396, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.9214850882531955, |
| "grad_norm": 5.181743621826172, |
| "learning_rate": 9.423318385650226e-06, |
| "loss": 2.3214, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.9214850882531955, |
| "eval_loss": 2.28114652633667, |
| "eval_runtime": 6.8437, |
| "eval_samples_per_second": 146.121, |
| "eval_steps_per_second": 36.53, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.9336579427875837, |
| "grad_norm": 6.292041301727295, |
| "learning_rate": 9.378475336322872e-06, |
| "loss": 2.3341, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.945830797321972, |
| "grad_norm": 5.232330322265625, |
| "learning_rate": 9.333632286995517e-06, |
| "loss": 2.2984, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.95800365185636, |
| "grad_norm": 5.351822376251221, |
| "learning_rate": 9.288789237668161e-06, |
| "loss": 2.326, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.9701765063907484, |
| "grad_norm": 5.880828380584717, |
| "learning_rate": 9.243946188340807e-06, |
| "loss": 2.3399, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.982349360925137, |
| "grad_norm": 5.407314777374268, |
| "learning_rate": 9.199103139013453e-06, |
| "loss": 2.3007, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.982349360925137, |
| "eval_loss": 2.273526906967163, |
| "eval_runtime": 6.9358, |
| "eval_samples_per_second": 144.179, |
| "eval_steps_per_second": 36.045, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.9945222154595252, |
| "grad_norm": 5.49412727355957, |
| "learning_rate": 9.154260089686099e-06, |
| "loss": 2.3325, |
| "step": 24600 |
| }, |
| { |
| "epoch": 3.0066950699939134, |
| "grad_norm": 5.382359981536865, |
| "learning_rate": 9.109417040358746e-06, |
| "loss": 2.2779, |
| "step": 24700 |
| }, |
| { |
| "epoch": 3.018867924528302, |
| "grad_norm": 6.563231945037842, |
| "learning_rate": 9.064573991031392e-06, |
| "loss": 2.2142, |
| "step": 24800 |
| }, |
| { |
| "epoch": 3.0310407790626903, |
| "grad_norm": 6.05570650100708, |
| "learning_rate": 9.019730941704037e-06, |
| "loss": 2.1952, |
| "step": 24900 |
| }, |
| { |
| "epoch": 3.0432136335970785, |
| "grad_norm": 5.2819366455078125, |
| "learning_rate": 8.974887892376683e-06, |
| "loss": 2.2244, |
| "step": 25000 |
| }, |
| { |
| "epoch": 3.0432136335970785, |
| "eval_loss": 2.253713846206665, |
| "eval_runtime": 6.8869, |
| "eval_samples_per_second": 145.202, |
| "eval_steps_per_second": 36.301, |
| "step": 25000 |
| }, |
| { |
| "epoch": 3.0553864881314667, |
| "grad_norm": 5.801946640014648, |
| "learning_rate": 8.930044843049329e-06, |
| "loss": 2.2423, |
| "step": 25100 |
| }, |
| { |
| "epoch": 3.067559342665855, |
| "grad_norm": 5.829814910888672, |
| "learning_rate": 8.885201793721973e-06, |
| "loss": 2.2372, |
| "step": 25200 |
| }, |
| { |
| "epoch": 3.0797321972002436, |
| "grad_norm": 5.983118534088135, |
| "learning_rate": 8.840358744394619e-06, |
| "loss": 2.2363, |
| "step": 25300 |
| }, |
| { |
| "epoch": 3.091905051734632, |
| "grad_norm": 5.694368839263916, |
| "learning_rate": 8.795515695067264e-06, |
| "loss": 2.1785, |
| "step": 25400 |
| }, |
| { |
| "epoch": 3.10407790626902, |
| "grad_norm": 5.976083755493164, |
| "learning_rate": 8.75067264573991e-06, |
| "loss": 2.2061, |
| "step": 25500 |
| }, |
| { |
| "epoch": 3.10407790626902, |
| "eval_loss": 2.2468533515930176, |
| "eval_runtime": 6.9894, |
| "eval_samples_per_second": 143.075, |
| "eval_steps_per_second": 35.769, |
| "step": 25500 |
| }, |
| { |
| "epoch": 3.1162507608034082, |
| "grad_norm": 5.972872734069824, |
| "learning_rate": 8.705829596412557e-06, |
| "loss": 2.2269, |
| "step": 25600 |
| }, |
| { |
| "epoch": 3.128423615337797, |
| "grad_norm": 5.245333671569824, |
| "learning_rate": 8.660986547085203e-06, |
| "loss": 2.2519, |
| "step": 25700 |
| }, |
| { |
| "epoch": 3.140596469872185, |
| "grad_norm": 6.581233501434326, |
| "learning_rate": 8.616143497757849e-06, |
| "loss": 2.2181, |
| "step": 25800 |
| }, |
| { |
| "epoch": 3.1527693244065733, |
| "grad_norm": 6.18913459777832, |
| "learning_rate": 8.571300448430495e-06, |
| "loss": 2.1892, |
| "step": 25900 |
| }, |
| { |
| "epoch": 3.1649421789409615, |
| "grad_norm": 5.771265983581543, |
| "learning_rate": 8.52645739910314e-06, |
| "loss": 2.1789, |
| "step": 26000 |
| }, |
| { |
| "epoch": 3.1649421789409615, |
| "eval_loss": 2.2344589233398438, |
| "eval_runtime": 6.8267, |
| "eval_samples_per_second": 146.484, |
| "eval_steps_per_second": 36.621, |
| "step": 26000 |
| }, |
| { |
| "epoch": 3.17711503347535, |
| "grad_norm": 6.273107528686523, |
| "learning_rate": 8.481614349775784e-06, |
| "loss": 2.2029, |
| "step": 26100 |
| }, |
| { |
| "epoch": 3.1892878880097384, |
| "grad_norm": 6.768197059631348, |
| "learning_rate": 8.43677130044843e-06, |
| "loss": 2.2042, |
| "step": 26200 |
| }, |
| { |
| "epoch": 3.2014607425441266, |
| "grad_norm": 7.103708267211914, |
| "learning_rate": 8.391928251121076e-06, |
| "loss": 2.2142, |
| "step": 26300 |
| }, |
| { |
| "epoch": 3.213633597078515, |
| "grad_norm": 6.05976676940918, |
| "learning_rate": 8.347085201793723e-06, |
| "loss": 2.1747, |
| "step": 26400 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 5.711021900177002, |
| "learning_rate": 8.302242152466369e-06, |
| "loss": 2.2039, |
| "step": 26500 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "eval_loss": 2.2293026447296143, |
| "eval_runtime": 6.8771, |
| "eval_samples_per_second": 145.41, |
| "eval_steps_per_second": 36.352, |
| "step": 26500 |
| }, |
| { |
| "epoch": 3.2379793061472917, |
| "grad_norm": 5.777741432189941, |
| "learning_rate": 8.257399103139015e-06, |
| "loss": 2.2259, |
| "step": 26600 |
| }, |
| { |
| "epoch": 3.25015216068168, |
| "grad_norm": 5.676499843597412, |
| "learning_rate": 8.21255605381166e-06, |
| "loss": 2.1918, |
| "step": 26700 |
| }, |
| { |
| "epoch": 3.262325015216068, |
| "grad_norm": 6.470264911651611, |
| "learning_rate": 8.167713004484306e-06, |
| "loss": 2.212, |
| "step": 26800 |
| }, |
| { |
| "epoch": 3.2744978697504563, |
| "grad_norm": 6.308848857879639, |
| "learning_rate": 8.122869955156952e-06, |
| "loss": 2.2138, |
| "step": 26900 |
| }, |
| { |
| "epoch": 3.286670724284845, |
| "grad_norm": 5.39501428604126, |
| "learning_rate": 8.078026905829596e-06, |
| "loss": 2.248, |
| "step": 27000 |
| }, |
| { |
| "epoch": 3.286670724284845, |
| "eval_loss": 2.2192747592926025, |
| "eval_runtime": 6.9636, |
| "eval_samples_per_second": 143.603, |
| "eval_steps_per_second": 35.901, |
| "step": 27000 |
| }, |
| { |
| "epoch": 3.298843578819233, |
| "grad_norm": 5.875838756561279, |
| "learning_rate": 8.033183856502242e-06, |
| "loss": 2.2131, |
| "step": 27100 |
| }, |
| { |
| "epoch": 3.3110164333536214, |
| "grad_norm": 5.159265518188477, |
| "learning_rate": 7.988340807174887e-06, |
| "loss": 2.2037, |
| "step": 27200 |
| }, |
| { |
| "epoch": 3.3231892878880096, |
| "grad_norm": 5.619683265686035, |
| "learning_rate": 7.943946188340808e-06, |
| "loss": 2.1818, |
| "step": 27300 |
| }, |
| { |
| "epoch": 3.3353621424223983, |
| "grad_norm": 7.503751277923584, |
| "learning_rate": 7.899103139013453e-06, |
| "loss": 2.2087, |
| "step": 27400 |
| }, |
| { |
| "epoch": 3.3475349969567865, |
| "grad_norm": 5.3004937171936035, |
| "learning_rate": 7.854260089686099e-06, |
| "loss": 2.2151, |
| "step": 27500 |
| }, |
| { |
| "epoch": 3.3475349969567865, |
| "eval_loss": 2.209369659423828, |
| "eval_runtime": 6.9186, |
| "eval_samples_per_second": 144.537, |
| "eval_steps_per_second": 36.134, |
| "step": 27500 |
| }, |
| { |
| "epoch": 3.3597078514911747, |
| "grad_norm": 6.6273193359375, |
| "learning_rate": 7.809417040358745e-06, |
| "loss": 2.2208, |
| "step": 27600 |
| }, |
| { |
| "epoch": 3.371880706025563, |
| "grad_norm": 6.1234588623046875, |
| "learning_rate": 7.76457399103139e-06, |
| "loss": 2.1957, |
| "step": 27700 |
| }, |
| { |
| "epoch": 3.384053560559951, |
| "grad_norm": 6.082681655883789, |
| "learning_rate": 7.719730941704036e-06, |
| "loss": 2.2202, |
| "step": 27800 |
| }, |
| { |
| "epoch": 3.3962264150943398, |
| "grad_norm": 6.618956089019775, |
| "learning_rate": 7.674887892376682e-06, |
| "loss": 2.2045, |
| "step": 27900 |
| }, |
| { |
| "epoch": 3.408399269628728, |
| "grad_norm": 5.74383544921875, |
| "learning_rate": 7.630044843049328e-06, |
| "loss": 2.2308, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.408399269628728, |
| "eval_loss": 2.206360340118408, |
| "eval_runtime": 6.9078, |
| "eval_samples_per_second": 144.763, |
| "eval_steps_per_second": 36.191, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.420572124163116, |
| "grad_norm": 6.5505690574646, |
| "learning_rate": 7.5852017937219735e-06, |
| "loss": 2.2036, |
| "step": 28100 |
| }, |
| { |
| "epoch": 3.4327449786975044, |
| "grad_norm": 5.887704372406006, |
| "learning_rate": 7.540358744394619e-06, |
| "loss": 2.1714, |
| "step": 28200 |
| }, |
| { |
| "epoch": 3.444917833231893, |
| "grad_norm": 6.853738784790039, |
| "learning_rate": 7.495515695067265e-06, |
| "loss": 2.2269, |
| "step": 28300 |
| }, |
| { |
| "epoch": 3.4570906877662813, |
| "grad_norm": 5.702883243560791, |
| "learning_rate": 7.4506726457399115e-06, |
| "loss": 2.192, |
| "step": 28400 |
| }, |
| { |
| "epoch": 3.4692635423006695, |
| "grad_norm": 6.062043190002441, |
| "learning_rate": 7.405829596412557e-06, |
| "loss": 2.2095, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.4692635423006695, |
| "eval_loss": 2.1974008083343506, |
| "eval_runtime": 6.9256, |
| "eval_samples_per_second": 144.392, |
| "eval_steps_per_second": 36.098, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.4814363968350577, |
| "grad_norm": 5.228243827819824, |
| "learning_rate": 7.360986547085203e-06, |
| "loss": 2.221, |
| "step": 28600 |
| }, |
| { |
| "epoch": 3.4936092513694463, |
| "grad_norm": 5.8091607093811035, |
| "learning_rate": 7.316143497757848e-06, |
| "loss": 2.2062, |
| "step": 28700 |
| }, |
| { |
| "epoch": 3.5057821059038345, |
| "grad_norm": 4.786416053771973, |
| "learning_rate": 7.2713004484304936e-06, |
| "loss": 2.1716, |
| "step": 28800 |
| }, |
| { |
| "epoch": 3.5179549604382228, |
| "grad_norm": 6.92462158203125, |
| "learning_rate": 7.226457399103139e-06, |
| "loss": 2.211, |
| "step": 28900 |
| }, |
| { |
| "epoch": 3.530127814972611, |
| "grad_norm": 7.192811489105225, |
| "learning_rate": 7.181614349775785e-06, |
| "loss": 2.2085, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.530127814972611, |
| "eval_loss": 2.185516595840454, |
| "eval_runtime": 6.8385, |
| "eval_samples_per_second": 146.231, |
| "eval_steps_per_second": 36.558, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.542300669506999, |
| "grad_norm": 5.579026222229004, |
| "learning_rate": 7.136771300448431e-06, |
| "loss": 2.1974, |
| "step": 29100 |
| }, |
| { |
| "epoch": 3.554473524041388, |
| "grad_norm": 6.277022838592529, |
| "learning_rate": 7.0919282511210765e-06, |
| "loss": 2.1982, |
| "step": 29200 |
| }, |
| { |
| "epoch": 3.566646378575776, |
| "grad_norm": 5.486943244934082, |
| "learning_rate": 7.047533632286996e-06, |
| "loss": 2.1877, |
| "step": 29300 |
| }, |
| { |
| "epoch": 3.5788192331101643, |
| "grad_norm": 6.431853771209717, |
| "learning_rate": 7.0026905829596416e-06, |
| "loss": 2.2109, |
| "step": 29400 |
| }, |
| { |
| "epoch": 3.590992087644553, |
| "grad_norm": 6.601170539855957, |
| "learning_rate": 6.957847533632287e-06, |
| "loss": 2.2122, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.590992087644553, |
| "eval_loss": 2.1791625022888184, |
| "eval_runtime": 6.9136, |
| "eval_samples_per_second": 144.642, |
| "eval_steps_per_second": 36.161, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.603164942178941, |
| "grad_norm": 5.159702301025391, |
| "learning_rate": 6.913004484304934e-06, |
| "loss": 2.2246, |
| "step": 29600 |
| }, |
| { |
| "epoch": 3.6153377967133293, |
| "grad_norm": 6.260033130645752, |
| "learning_rate": 6.86816143497758e-06, |
| "loss": 2.2044, |
| "step": 29700 |
| }, |
| { |
| "epoch": 3.6275106512477175, |
| "grad_norm": 5.428004741668701, |
| "learning_rate": 6.823318385650225e-06, |
| "loss": 2.1735, |
| "step": 29800 |
| }, |
| { |
| "epoch": 3.6396835057821058, |
| "grad_norm": 5.895395278930664, |
| "learning_rate": 6.77847533632287e-06, |
| "loss": 2.2027, |
| "step": 29900 |
| }, |
| { |
| "epoch": 3.6518563603164944, |
| "grad_norm": 5.690395355224609, |
| "learning_rate": 6.733632286995516e-06, |
| "loss": 2.2145, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.6518563603164944, |
| "eval_loss": 2.177266836166382, |
| "eval_runtime": 7.0906, |
| "eval_samples_per_second": 141.032, |
| "eval_steps_per_second": 35.258, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.6640292148508826, |
| "grad_norm": 5.669330596923828, |
| "learning_rate": 6.688789237668162e-06, |
| "loss": 2.1954, |
| "step": 30100 |
| }, |
| { |
| "epoch": 3.676202069385271, |
| "grad_norm": 6.493986129760742, |
| "learning_rate": 6.643946188340807e-06, |
| "loss": 2.2011, |
| "step": 30200 |
| }, |
| { |
| "epoch": 3.688374923919659, |
| "grad_norm": 7.738183975219727, |
| "learning_rate": 6.599103139013453e-06, |
| "loss": 2.2347, |
| "step": 30300 |
| }, |
| { |
| "epoch": 3.7005477784540473, |
| "grad_norm": 6.565354347229004, |
| "learning_rate": 6.5542600896861e-06, |
| "loss": 2.1945, |
| "step": 30400 |
| }, |
| { |
| "epoch": 3.712720632988436, |
| "grad_norm": 6.189778804779053, |
| "learning_rate": 6.509417040358745e-06, |
| "loss": 2.2141, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.712720632988436, |
| "eval_loss": 2.168225049972534, |
| "eval_runtime": 6.9549, |
| "eval_samples_per_second": 143.785, |
| "eval_steps_per_second": 35.946, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.724893487522824, |
| "grad_norm": 5.11403226852417, |
| "learning_rate": 6.464573991031391e-06, |
| "loss": 2.2121, |
| "step": 30600 |
| }, |
| { |
| "epoch": 3.7370663420572123, |
| "grad_norm": 6.672878742218018, |
| "learning_rate": 6.419730941704037e-06, |
| "loss": 2.181, |
| "step": 30700 |
| }, |
| { |
| "epoch": 3.749239196591601, |
| "grad_norm": 5.224799156188965, |
| "learning_rate": 6.374887892376682e-06, |
| "loss": 2.1807, |
| "step": 30800 |
| }, |
| { |
| "epoch": 3.761412051125989, |
| "grad_norm": 6.442698955535889, |
| "learning_rate": 6.330493273542602e-06, |
| "loss": 2.2021, |
| "step": 30900 |
| }, |
| { |
| "epoch": 3.7735849056603774, |
| "grad_norm": 6.708118438720703, |
| "learning_rate": 6.285650224215248e-06, |
| "loss": 2.182, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.7735849056603774, |
| "eval_loss": 2.1596100330352783, |
| "eval_runtime": 6.872, |
| "eval_samples_per_second": 145.518, |
| "eval_steps_per_second": 36.38, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.7857577601947656, |
| "grad_norm": 6.288793087005615, |
| "learning_rate": 6.2408071748878926e-06, |
| "loss": 2.1886, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.797930614729154, |
| "grad_norm": 6.112220287322998, |
| "learning_rate": 6.195964125560538e-06, |
| "loss": 2.2107, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.8101034692635425, |
| "grad_norm": 6.044913291931152, |
| "learning_rate": 6.151121076233184e-06, |
| "loss": 2.2246, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.8222763237979307, |
| "grad_norm": 6.079142093658447, |
| "learning_rate": 6.10627802690583e-06, |
| "loss": 2.2187, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.834449178332319, |
| "grad_norm": 5.865757942199707, |
| "learning_rate": 6.0614349775784755e-06, |
| "loss": 2.2069, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.834449178332319, |
| "eval_loss": 2.156599760055542, |
| "eval_runtime": 6.8965, |
| "eval_samples_per_second": 145.001, |
| "eval_steps_per_second": 36.25, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.846622032866707, |
| "grad_norm": 6.289271354675293, |
| "learning_rate": 6.016591928251122e-06, |
| "loss": 2.2349, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.8587948874010953, |
| "grad_norm": 6.607455730438232, |
| "learning_rate": 5.971748878923768e-06, |
| "loss": 2.1849, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "grad_norm": 6.193937301635742, |
| "learning_rate": 5.9269058295964135e-06, |
| "loss": 2.1901, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.883140596469872, |
| "grad_norm": 5.6171650886535645, |
| "learning_rate": 5.882062780269059e-06, |
| "loss": 2.1968, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.8953134510042604, |
| "grad_norm": 7.239607334136963, |
| "learning_rate": 5.837219730941704e-06, |
| "loss": 2.1984, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.8953134510042604, |
| "eval_loss": 2.1437973976135254, |
| "eval_runtime": 6.9069, |
| "eval_samples_per_second": 144.782, |
| "eval_steps_per_second": 36.195, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.907486305538649, |
| "grad_norm": 6.314813613891602, |
| "learning_rate": 5.79237668161435e-06, |
| "loss": 2.1706, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.9196591600730373, |
| "grad_norm": 5.416664123535156, |
| "learning_rate": 5.7475336322869956e-06, |
| "loss": 2.1852, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.9318320146074255, |
| "grad_norm": 6.1277594566345215, |
| "learning_rate": 5.702690582959641e-06, |
| "loss": 2.2202, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.9440048691418137, |
| "grad_norm": 6.0932440757751465, |
| "learning_rate": 5.657847533632288e-06, |
| "loss": 2.1591, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.956177723676202, |
| "grad_norm": 6.029341697692871, |
| "learning_rate": 5.613004484304934e-06, |
| "loss": 2.1805, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.956177723676202, |
| "eval_loss": 2.138620138168335, |
| "eval_runtime": 6.9091, |
| "eval_samples_per_second": 144.737, |
| "eval_steps_per_second": 36.184, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.9683505782105906, |
| "grad_norm": 6.374738693237305, |
| "learning_rate": 5.568161434977579e-06, |
| "loss": 2.168, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.9805234327449788, |
| "grad_norm": 6.206404209136963, |
| "learning_rate": 5.523318385650225e-06, |
| "loss": 2.2168, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.992696287279367, |
| "grad_norm": 6.701908588409424, |
| "learning_rate": 5.478475336322871e-06, |
| "loss": 2.1796, |
| "step": 32800 |
| }, |
| { |
| "epoch": 4.004869141813756, |
| "grad_norm": 6.456433296203613, |
| "learning_rate": 5.433632286995516e-06, |
| "loss": 2.1461, |
| "step": 32900 |
| }, |
| { |
| "epoch": 4.017041996348143, |
| "grad_norm": 6.578303337097168, |
| "learning_rate": 5.388789237668161e-06, |
| "loss": 2.1061, |
| "step": 33000 |
| }, |
| { |
| "epoch": 4.017041996348143, |
| "eval_loss": 2.123652458190918, |
| "eval_runtime": 6.8771, |
| "eval_samples_per_second": 145.409, |
| "eval_steps_per_second": 36.352, |
| "step": 33000 |
| }, |
| { |
| "epoch": 4.029214850882532, |
| "grad_norm": 5.736875057220459, |
| "learning_rate": 5.343946188340807e-06, |
| "loss": 2.098, |
| "step": 33100 |
| }, |
| { |
| "epoch": 4.04138770541692, |
| "grad_norm": 6.322964191436768, |
| "learning_rate": 5.299103139013453e-06, |
| "loss": 2.1334, |
| "step": 33200 |
| }, |
| { |
| "epoch": 4.0535605599513085, |
| "grad_norm": 7.002594470977783, |
| "learning_rate": 5.2542600896860994e-06, |
| "loss": 2.1186, |
| "step": 33300 |
| }, |
| { |
| "epoch": 4.065733414485697, |
| "grad_norm": 6.592886924743652, |
| "learning_rate": 5.209417040358745e-06, |
| "loss": 2.1218, |
| "step": 33400 |
| }, |
| { |
| "epoch": 4.077906269020085, |
| "grad_norm": 6.610073566436768, |
| "learning_rate": 5.164573991031391e-06, |
| "loss": 2.1112, |
| "step": 33500 |
| }, |
| { |
| "epoch": 4.077906269020085, |
| "eval_loss": 2.115506172180176, |
| "eval_runtime": 6.8933, |
| "eval_samples_per_second": 145.068, |
| "eval_steps_per_second": 36.267, |
| "step": 33500 |
| }, |
| { |
| "epoch": 4.090079123554474, |
| "grad_norm": 6.308100700378418, |
| "learning_rate": 5.119730941704037e-06, |
| "loss": 2.0984, |
| "step": 33600 |
| }, |
| { |
| "epoch": 4.102251978088862, |
| "grad_norm": 5.7667083740234375, |
| "learning_rate": 5.074887892376682e-06, |
| "loss": 2.1189, |
| "step": 33700 |
| }, |
| { |
| "epoch": 4.11442483262325, |
| "grad_norm": 6.554234504699707, |
| "learning_rate": 5.030044843049327e-06, |
| "loss": 2.1006, |
| "step": 33800 |
| }, |
| { |
| "epoch": 4.126597687157639, |
| "grad_norm": 6.494872570037842, |
| "learning_rate": 4.985201793721974e-06, |
| "loss": 2.1258, |
| "step": 33900 |
| }, |
| { |
| "epoch": 4.138770541692026, |
| "grad_norm": 6.796899318695068, |
| "learning_rate": 4.940807174887893e-06, |
| "loss": 2.1335, |
| "step": 34000 |
| }, |
| { |
| "epoch": 4.138770541692026, |
| "eval_loss": 2.1111514568328857, |
| "eval_runtime": 6.8774, |
| "eval_samples_per_second": 145.404, |
| "eval_steps_per_second": 36.351, |
| "step": 34000 |
| }, |
| { |
| "epoch": 4.150943396226415, |
| "grad_norm": 5.832895755767822, |
| "learning_rate": 4.895964125560538e-06, |
| "loss": 2.0956, |
| "step": 34100 |
| }, |
| { |
| "epoch": 4.163116250760804, |
| "grad_norm": 5.17689847946167, |
| "learning_rate": 4.851121076233185e-06, |
| "loss": 2.1503, |
| "step": 34200 |
| }, |
| { |
| "epoch": 4.1752891052951915, |
| "grad_norm": 6.65399694442749, |
| "learning_rate": 4.80627802690583e-06, |
| "loss": 2.1244, |
| "step": 34300 |
| }, |
| { |
| "epoch": 4.18746195982958, |
| "grad_norm": 6.744587421417236, |
| "learning_rate": 4.761434977578476e-06, |
| "loss": 2.1237, |
| "step": 34400 |
| }, |
| { |
| "epoch": 4.199634814363968, |
| "grad_norm": 6.663182258605957, |
| "learning_rate": 4.716591928251121e-06, |
| "loss": 2.1198, |
| "step": 34500 |
| }, |
| { |
| "epoch": 4.199634814363968, |
| "eval_loss": 2.1056010723114014, |
| "eval_runtime": 6.9075, |
| "eval_samples_per_second": 144.771, |
| "eval_steps_per_second": 36.193, |
| "step": 34500 |
| }, |
| { |
| "epoch": 4.211807668898357, |
| "grad_norm": 6.046566009521484, |
| "learning_rate": 4.671748878923767e-06, |
| "loss": 2.0746, |
| "step": 34600 |
| }, |
| { |
| "epoch": 4.223980523432745, |
| "grad_norm": 6.08657169342041, |
| "learning_rate": 4.626905829596413e-06, |
| "loss": 2.1154, |
| "step": 34700 |
| }, |
| { |
| "epoch": 4.236153377967133, |
| "grad_norm": 6.235377788543701, |
| "learning_rate": 4.582062780269059e-06, |
| "loss": 2.1013, |
| "step": 34800 |
| }, |
| { |
| "epoch": 4.248326232501522, |
| "grad_norm": 5.864556312561035, |
| "learning_rate": 4.537219730941705e-06, |
| "loss": 2.1293, |
| "step": 34900 |
| }, |
| { |
| "epoch": 4.26049908703591, |
| "grad_norm": 6.5032124519348145, |
| "learning_rate": 4.49237668161435e-06, |
| "loss": 2.0909, |
| "step": 35000 |
| }, |
| { |
| "epoch": 4.26049908703591, |
| "eval_loss": 2.1002509593963623, |
| "eval_runtime": 6.9552, |
| "eval_samples_per_second": 143.777, |
| "eval_steps_per_second": 35.944, |
| "step": 35000 |
| }, |
| { |
| "epoch": 4.272671941570298, |
| "grad_norm": 5.491804599761963, |
| "learning_rate": 4.447533632286996e-06, |
| "loss": 2.1075, |
| "step": 35100 |
| }, |
| { |
| "epoch": 4.284844796104687, |
| "grad_norm": 6.165935516357422, |
| "learning_rate": 4.402690582959642e-06, |
| "loss": 2.1172, |
| "step": 35200 |
| }, |
| { |
| "epoch": 4.2970176506390745, |
| "grad_norm": 6.2660369873046875, |
| "learning_rate": 4.357847533632288e-06, |
| "loss": 2.1234, |
| "step": 35300 |
| }, |
| { |
| "epoch": 4.309190505173463, |
| "grad_norm": 6.266602516174316, |
| "learning_rate": 4.3130044843049325e-06, |
| "loss": 2.1023, |
| "step": 35400 |
| }, |
| { |
| "epoch": 4.321363359707852, |
| "grad_norm": 6.377227306365967, |
| "learning_rate": 4.268161434977579e-06, |
| "loss": 2.095, |
| "step": 35500 |
| }, |
| { |
| "epoch": 4.321363359707852, |
| "eval_loss": 2.096508264541626, |
| "eval_runtime": 6.896, |
| "eval_samples_per_second": 145.011, |
| "eval_steps_per_second": 36.253, |
| "step": 35500 |
| }, |
| { |
| "epoch": 4.33353621424224, |
| "grad_norm": 5.756918907165527, |
| "learning_rate": 4.223318385650225e-06, |
| "loss": 2.1258, |
| "step": 35600 |
| }, |
| { |
| "epoch": 4.345709068776628, |
| "grad_norm": 6.3634934425354, |
| "learning_rate": 4.1784753363228705e-06, |
| "loss": 2.1326, |
| "step": 35700 |
| }, |
| { |
| "epoch": 4.357881923311016, |
| "grad_norm": 6.081814765930176, |
| "learning_rate": 4.133632286995516e-06, |
| "loss": 2.0923, |
| "step": 35800 |
| }, |
| { |
| "epoch": 4.370054777845405, |
| "grad_norm": 5.829545021057129, |
| "learning_rate": 4.088789237668161e-06, |
| "loss": 2.1001, |
| "step": 35900 |
| }, |
| { |
| "epoch": 4.382227632379793, |
| "grad_norm": 7.019509315490723, |
| "learning_rate": 4.043946188340808e-06, |
| "loss": 2.0947, |
| "step": 36000 |
| }, |
| { |
| "epoch": 4.382227632379793, |
| "eval_loss": 2.0914690494537354, |
| "eval_runtime": 6.9162, |
| "eval_samples_per_second": 144.589, |
| "eval_steps_per_second": 36.147, |
| "step": 36000 |
| }, |
| { |
| "epoch": 4.394400486914181, |
| "grad_norm": 7.135252952575684, |
| "learning_rate": 3.9991031390134534e-06, |
| "loss": 2.111, |
| "step": 36100 |
| }, |
| { |
| "epoch": 4.40657334144857, |
| "grad_norm": 5.3956522941589355, |
| "learning_rate": 3.954260089686099e-06, |
| "loss": 2.1072, |
| "step": 36200 |
| }, |
| { |
| "epoch": 4.418746195982958, |
| "grad_norm": 5.853066921234131, |
| "learning_rate": 3.909417040358744e-06, |
| "loss": 2.1327, |
| "step": 36300 |
| }, |
| { |
| "epoch": 4.430919050517346, |
| "grad_norm": 6.294539928436279, |
| "learning_rate": 3.864573991031391e-06, |
| "loss": 2.0886, |
| "step": 36400 |
| }, |
| { |
| "epoch": 4.443091905051735, |
| "grad_norm": 7.183646202087402, |
| "learning_rate": 3.819730941704036e-06, |
| "loss": 2.139, |
| "step": 36500 |
| }, |
| { |
| "epoch": 4.443091905051735, |
| "eval_loss": 2.0876991748809814, |
| "eval_runtime": 6.8527, |
| "eval_samples_per_second": 145.928, |
| "eval_steps_per_second": 36.482, |
| "step": 36500 |
| }, |
| { |
| "epoch": 4.455264759586123, |
| "grad_norm": 6.069007396697998, |
| "learning_rate": 3.7748878923766817e-06, |
| "loss": 2.1076, |
| "step": 36600 |
| }, |
| { |
| "epoch": 4.467437614120511, |
| "grad_norm": 6.092281341552734, |
| "learning_rate": 3.7300448430493274e-06, |
| "loss": 2.1274, |
| "step": 36700 |
| }, |
| { |
| "epoch": 4.4796104686549, |
| "grad_norm": 6.095892429351807, |
| "learning_rate": 3.6852017937219735e-06, |
| "loss": 2.1202, |
| "step": 36800 |
| }, |
| { |
| "epoch": 4.491783323189288, |
| "grad_norm": 6.349238395690918, |
| "learning_rate": 3.6403587443946193e-06, |
| "loss": 2.1192, |
| "step": 36900 |
| }, |
| { |
| "epoch": 4.503956177723676, |
| "grad_norm": 6.508525848388672, |
| "learning_rate": 3.595515695067265e-06, |
| "loss": 2.106, |
| "step": 37000 |
| }, |
| { |
| "epoch": 4.503956177723676, |
| "eval_loss": 2.0852510929107666, |
| "eval_runtime": 6.9159, |
| "eval_samples_per_second": 144.595, |
| "eval_steps_per_second": 36.149, |
| "step": 37000 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "grad_norm": 6.2998046875, |
| "learning_rate": 3.5506726457399103e-06, |
| "loss": 2.1264, |
| "step": 37100 |
| }, |
| { |
| "epoch": 4.528301886792453, |
| "grad_norm": 6.988924503326416, |
| "learning_rate": 3.5058295964125565e-06, |
| "loss": 2.0855, |
| "step": 37200 |
| }, |
| { |
| "epoch": 4.540474741326841, |
| "grad_norm": 5.999715328216553, |
| "learning_rate": 3.460986547085202e-06, |
| "loss": 2.1288, |
| "step": 37300 |
| }, |
| { |
| "epoch": 4.552647595861229, |
| "grad_norm": 5.390603542327881, |
| "learning_rate": 3.416143497757848e-06, |
| "loss": 2.1119, |
| "step": 37400 |
| }, |
| { |
| "epoch": 4.564820450395618, |
| "grad_norm": 5.443009853363037, |
| "learning_rate": 3.3713004484304932e-06, |
| "loss": 2.1137, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.564820450395618, |
| "eval_loss": 2.0807323455810547, |
| "eval_runtime": 6.868, |
| "eval_samples_per_second": 145.603, |
| "eval_steps_per_second": 36.401, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.5769933049300064, |
| "grad_norm": 6.028597831726074, |
| "learning_rate": 3.326457399103139e-06, |
| "loss": 2.1233, |
| "step": 37600 |
| }, |
| { |
| "epoch": 4.589166159464394, |
| "grad_norm": 6.3508992195129395, |
| "learning_rate": 3.281614349775785e-06, |
| "loss": 2.0785, |
| "step": 37700 |
| }, |
| { |
| "epoch": 4.601339013998783, |
| "grad_norm": 6.304683685302734, |
| "learning_rate": 3.237219730941704e-06, |
| "loss": 2.1058, |
| "step": 37800 |
| }, |
| { |
| "epoch": 4.6135118685331715, |
| "grad_norm": 5.774105548858643, |
| "learning_rate": 3.1923766816143497e-06, |
| "loss": 2.1363, |
| "step": 37900 |
| }, |
| { |
| "epoch": 4.625684723067559, |
| "grad_norm": 6.000542163848877, |
| "learning_rate": 3.147533632286996e-06, |
| "loss": 2.1097, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.625684723067559, |
| "eval_loss": 2.0734775066375732, |
| "eval_runtime": 6.9601, |
| "eval_samples_per_second": 143.677, |
| "eval_steps_per_second": 35.919, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.637857577601948, |
| "grad_norm": 6.037074565887451, |
| "learning_rate": 3.1026905829596416e-06, |
| "loss": 2.0839, |
| "step": 38100 |
| }, |
| { |
| "epoch": 4.650030432136336, |
| "grad_norm": 6.941400051116943, |
| "learning_rate": 3.0578475336322874e-06, |
| "loss": 2.0961, |
| "step": 38200 |
| }, |
| { |
| "epoch": 4.662203286670724, |
| "grad_norm": 6.625183582305908, |
| "learning_rate": 3.0130044843049327e-06, |
| "loss": 2.1532, |
| "step": 38300 |
| }, |
| { |
| "epoch": 4.674376141205112, |
| "grad_norm": 5.852269649505615, |
| "learning_rate": 2.968161434977579e-06, |
| "loss": 2.1085, |
| "step": 38400 |
| }, |
| { |
| "epoch": 4.686548995739501, |
| "grad_norm": 5.130061626434326, |
| "learning_rate": 2.9233183856502245e-06, |
| "loss": 2.1028, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.686548995739501, |
| "eval_loss": 2.070453643798828, |
| "eval_runtime": 6.9577, |
| "eval_samples_per_second": 143.725, |
| "eval_steps_per_second": 35.931, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.6987218502738894, |
| "grad_norm": 6.478227615356445, |
| "learning_rate": 2.8784753363228703e-06, |
| "loss": 2.0895, |
| "step": 38600 |
| }, |
| { |
| "epoch": 4.710894704808277, |
| "grad_norm": 6.043088436126709, |
| "learning_rate": 2.8336322869955156e-06, |
| "loss": 2.1032, |
| "step": 38700 |
| }, |
| { |
| "epoch": 4.723067559342666, |
| "grad_norm": 6.732186317443848, |
| "learning_rate": 2.7887892376681617e-06, |
| "loss": 2.0838, |
| "step": 38800 |
| }, |
| { |
| "epoch": 4.7352404138770545, |
| "grad_norm": 6.393290996551514, |
| "learning_rate": 2.7439461883408075e-06, |
| "loss": 2.1058, |
| "step": 38900 |
| }, |
| { |
| "epoch": 4.747413268411442, |
| "grad_norm": 6.3943705558776855, |
| "learning_rate": 2.699103139013453e-06, |
| "loss": 2.1142, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.747413268411442, |
| "eval_loss": 2.0703060626983643, |
| "eval_runtime": 7.0835, |
| "eval_samples_per_second": 141.172, |
| "eval_steps_per_second": 35.293, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.759586122945831, |
| "grad_norm": 5.651825428009033, |
| "learning_rate": 2.654260089686099e-06, |
| "loss": 2.1099, |
| "step": 39100 |
| }, |
| { |
| "epoch": 4.77175897748022, |
| "grad_norm": 5.763203144073486, |
| "learning_rate": 2.609417040358745e-06, |
| "loss": 2.1199, |
| "step": 39200 |
| }, |
| { |
| "epoch": 4.783931832014607, |
| "grad_norm": 6.404742240905762, |
| "learning_rate": 2.5645739910313904e-06, |
| "loss": 2.1065, |
| "step": 39300 |
| }, |
| { |
| "epoch": 4.796104686548996, |
| "grad_norm": 6.63946533203125, |
| "learning_rate": 2.519730941704036e-06, |
| "loss": 2.0982, |
| "step": 39400 |
| }, |
| { |
| "epoch": 4.808277541083384, |
| "grad_norm": 6.3910675048828125, |
| "learning_rate": 2.474887892376682e-06, |
| "loss": 2.0945, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.808277541083384, |
| "eval_loss": 2.066244602203369, |
| "eval_runtime": 7.0266, |
| "eval_samples_per_second": 142.317, |
| "eval_steps_per_second": 35.579, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.820450395617772, |
| "grad_norm": 6.50945520401001, |
| "learning_rate": 2.4300448430493276e-06, |
| "loss": 2.0619, |
| "step": 39600 |
| }, |
| { |
| "epoch": 4.83262325015216, |
| "grad_norm": 6.681357383728027, |
| "learning_rate": 2.3852017937219733e-06, |
| "loss": 2.097, |
| "step": 39700 |
| }, |
| { |
| "epoch": 4.844796104686549, |
| "grad_norm": 5.813176155090332, |
| "learning_rate": 2.340358744394619e-06, |
| "loss": 2.1173, |
| "step": 39800 |
| }, |
| { |
| "epoch": 4.8569689592209375, |
| "grad_norm": 6.834031581878662, |
| "learning_rate": 2.2955156950672647e-06, |
| "loss": 2.0721, |
| "step": 39900 |
| }, |
| { |
| "epoch": 4.869141813755325, |
| "grad_norm": 5.929574966430664, |
| "learning_rate": 2.2506726457399105e-06, |
| "loss": 2.1325, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.869141813755325, |
| "eval_loss": 2.064162254333496, |
| "eval_runtime": 7.021, |
| "eval_samples_per_second": 142.43, |
| "eval_steps_per_second": 35.608, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.881314668289714, |
| "grad_norm": 6.4569830894470215, |
| "learning_rate": 2.205829596412556e-06, |
| "loss": 2.1224, |
| "step": 40100 |
| }, |
| { |
| "epoch": 4.893487522824103, |
| "grad_norm": 6.773449897766113, |
| "learning_rate": 2.160986547085202e-06, |
| "loss": 2.1037, |
| "step": 40200 |
| }, |
| { |
| "epoch": 4.90566037735849, |
| "grad_norm": 6.341082572937012, |
| "learning_rate": 2.1165919282511213e-06, |
| "loss": 2.0936, |
| "step": 40300 |
| }, |
| { |
| "epoch": 4.917833231892879, |
| "grad_norm": 6.29095983505249, |
| "learning_rate": 2.071748878923767e-06, |
| "loss": 2.141, |
| "step": 40400 |
| }, |
| { |
| "epoch": 4.930006086427268, |
| "grad_norm": 7.924270153045654, |
| "learning_rate": 2.0269058295964127e-06, |
| "loss": 2.0937, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.930006086427268, |
| "eval_loss": 2.0617458820343018, |
| "eval_runtime": 6.9841, |
| "eval_samples_per_second": 143.183, |
| "eval_steps_per_second": 35.796, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.942178940961655, |
| "grad_norm": 7.386099338531494, |
| "learning_rate": 1.9820627802690585e-06, |
| "loss": 2.1187, |
| "step": 40600 |
| }, |
| { |
| "epoch": 4.954351795496044, |
| "grad_norm": 6.6330413818359375, |
| "learning_rate": 1.937219730941704e-06, |
| "loss": 2.0891, |
| "step": 40700 |
| }, |
| { |
| "epoch": 4.966524650030432, |
| "grad_norm": 5.590965747833252, |
| "learning_rate": 1.89237668161435e-06, |
| "loss": 2.0809, |
| "step": 40800 |
| }, |
| { |
| "epoch": 4.9786975045648205, |
| "grad_norm": 6.468350410461426, |
| "learning_rate": 1.8475336322869959e-06, |
| "loss": 2.0654, |
| "step": 40900 |
| }, |
| { |
| "epoch": 4.990870359099208, |
| "grad_norm": 6.724806308746338, |
| "learning_rate": 1.8026905829596414e-06, |
| "loss": 2.0938, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.990870359099208, |
| "eval_loss": 2.057617664337158, |
| "eval_runtime": 7.1367, |
| "eval_samples_per_second": 140.121, |
| "eval_steps_per_second": 35.03, |
| "step": 41000 |
| }, |
| { |
| "epoch": 5.003043213633597, |
| "grad_norm": 5.549363136291504, |
| "learning_rate": 1.7578475336322873e-06, |
| "loss": 2.0672, |
| "step": 41100 |
| }, |
| { |
| "epoch": 5.015216068167986, |
| "grad_norm": 6.4161152839660645, |
| "learning_rate": 1.7130044843049328e-06, |
| "loss": 2.0589, |
| "step": 41200 |
| }, |
| { |
| "epoch": 5.027388922702373, |
| "grad_norm": 6.318953514099121, |
| "learning_rate": 1.6681614349775786e-06, |
| "loss": 2.0643, |
| "step": 41300 |
| }, |
| { |
| "epoch": 5.039561777236762, |
| "grad_norm": 7.292160987854004, |
| "learning_rate": 1.6233183856502243e-06, |
| "loss": 2.0718, |
| "step": 41400 |
| }, |
| { |
| "epoch": 5.051734631771151, |
| "grad_norm": 6.140988349914551, |
| "learning_rate": 1.57847533632287e-06, |
| "loss": 2.0437, |
| "step": 41500 |
| }, |
| { |
| "epoch": 5.051734631771151, |
| "eval_loss": 2.051799774169922, |
| "eval_runtime": 7.0596, |
| "eval_samples_per_second": 141.651, |
| "eval_steps_per_second": 35.413, |
| "step": 41500 |
| }, |
| { |
| "epoch": 5.063907486305538, |
| "grad_norm": 6.15008020401001, |
| "learning_rate": 1.533632286995516e-06, |
| "loss": 2.0561, |
| "step": 41600 |
| }, |
| { |
| "epoch": 5.076080340839927, |
| "grad_norm": 6.889511585235596, |
| "learning_rate": 1.4887892376681615e-06, |
| "loss": 2.0729, |
| "step": 41700 |
| }, |
| { |
| "epoch": 5.088253195374315, |
| "grad_norm": 5.815738201141357, |
| "learning_rate": 1.4439461883408074e-06, |
| "loss": 2.0413, |
| "step": 41800 |
| }, |
| { |
| "epoch": 5.1004260499087035, |
| "grad_norm": 5.965245723724365, |
| "learning_rate": 1.399103139013453e-06, |
| "loss": 2.0407, |
| "step": 41900 |
| }, |
| { |
| "epoch": 5.112598904443092, |
| "grad_norm": 7.188913345336914, |
| "learning_rate": 1.3542600896860989e-06, |
| "loss": 2.0781, |
| "step": 42000 |
| }, |
| { |
| "epoch": 5.112598904443092, |
| "eval_loss": 2.0501816272735596, |
| "eval_runtime": 6.9017, |
| "eval_samples_per_second": 144.892, |
| "eval_steps_per_second": 36.223, |
| "step": 42000 |
| }, |
| { |
| "epoch": 5.12477175897748, |
| "grad_norm": 7.101166725158691, |
| "learning_rate": 1.3094170403587444e-06, |
| "loss": 2.0348, |
| "step": 42100 |
| }, |
| { |
| "epoch": 5.136944613511869, |
| "grad_norm": 5.820453643798828, |
| "learning_rate": 1.2645739910313903e-06, |
| "loss": 2.0497, |
| "step": 42200 |
| }, |
| { |
| "epoch": 5.149117468046257, |
| "grad_norm": 5.811570167541504, |
| "learning_rate": 1.2197309417040358e-06, |
| "loss": 2.058, |
| "step": 42300 |
| }, |
| { |
| "epoch": 5.161290322580645, |
| "grad_norm": 6.54494047164917, |
| "learning_rate": 1.1748878923766818e-06, |
| "loss": 2.0833, |
| "step": 42400 |
| }, |
| { |
| "epoch": 5.173463177115034, |
| "grad_norm": 6.547015190124512, |
| "learning_rate": 1.1300448430493275e-06, |
| "loss": 2.0671, |
| "step": 42500 |
| }, |
| { |
| "epoch": 5.173463177115034, |
| "eval_loss": 2.049518346786499, |
| "eval_runtime": 7.0155, |
| "eval_samples_per_second": 142.542, |
| "eval_steps_per_second": 35.636, |
| "step": 42500 |
| }, |
| { |
| "epoch": 5.185636031649421, |
| "grad_norm": 6.079003810882568, |
| "learning_rate": 1.0852017937219732e-06, |
| "loss": 2.0732, |
| "step": 42600 |
| }, |
| { |
| "epoch": 5.19780888618381, |
| "grad_norm": 6.835382461547852, |
| "learning_rate": 1.040358744394619e-06, |
| "loss": 2.0516, |
| "step": 42700 |
| }, |
| { |
| "epoch": 5.209981740718199, |
| "grad_norm": 6.055761814117432, |
| "learning_rate": 9.955156950672647e-07, |
| "loss": 2.0639, |
| "step": 42800 |
| }, |
| { |
| "epoch": 5.2221545952525865, |
| "grad_norm": 6.516651630401611, |
| "learning_rate": 9.511210762331839e-07, |
| "loss": 2.0597, |
| "step": 42900 |
| }, |
| { |
| "epoch": 5.234327449786975, |
| "grad_norm": 5.874512195587158, |
| "learning_rate": 9.062780269058297e-07, |
| "loss": 2.073, |
| "step": 43000 |
| }, |
| { |
| "epoch": 5.234327449786975, |
| "eval_loss": 2.0482187271118164, |
| "eval_runtime": 6.8896, |
| "eval_samples_per_second": 145.145, |
| "eval_steps_per_second": 36.286, |
| "step": 43000 |
| }, |
| { |
| "epoch": 5.246500304321363, |
| "grad_norm": 6.2515459060668945, |
| "learning_rate": 8.614349775784754e-07, |
| "loss": 2.0594, |
| "step": 43100 |
| }, |
| { |
| "epoch": 5.258673158855752, |
| "grad_norm": 6.7219438552856445, |
| "learning_rate": 8.165919282511211e-07, |
| "loss": 2.0138, |
| "step": 43200 |
| }, |
| { |
| "epoch": 5.27084601339014, |
| "grad_norm": 6.588565349578857, |
| "learning_rate": 7.71748878923767e-07, |
| "loss": 2.089, |
| "step": 43300 |
| }, |
| { |
| "epoch": 5.283018867924528, |
| "grad_norm": 6.52641487121582, |
| "learning_rate": 7.269058295964127e-07, |
| "loss": 2.0274, |
| "step": 43400 |
| }, |
| { |
| "epoch": 5.295191722458917, |
| "grad_norm": 7.77009391784668, |
| "learning_rate": 6.820627802690584e-07, |
| "loss": 2.0412, |
| "step": 43500 |
| }, |
| { |
| "epoch": 5.295191722458917, |
| "eval_loss": 2.0471861362457275, |
| "eval_runtime": 6.8473, |
| "eval_samples_per_second": 146.043, |
| "eval_steps_per_second": 36.511, |
| "step": 43500 |
| }, |
| { |
| "epoch": 5.307364576993305, |
| "grad_norm": 6.563704490661621, |
| "learning_rate": 6.372197309417041e-07, |
| "loss": 2.0538, |
| "step": 43600 |
| }, |
| { |
| "epoch": 5.319537431527693, |
| "grad_norm": 5.842877388000488, |
| "learning_rate": 5.923766816143499e-07, |
| "loss": 2.0378, |
| "step": 43700 |
| }, |
| { |
| "epoch": 5.331710286062082, |
| "grad_norm": 5.96117639541626, |
| "learning_rate": 5.475336322869956e-07, |
| "loss": 2.0702, |
| "step": 43800 |
| }, |
| { |
| "epoch": 5.3438831405964695, |
| "grad_norm": 6.195252895355225, |
| "learning_rate": 5.026905829596413e-07, |
| "loss": 2.0519, |
| "step": 43900 |
| }, |
| { |
| "epoch": 5.356055995130858, |
| "grad_norm": 6.967134475708008, |
| "learning_rate": 4.5784753363228705e-07, |
| "loss": 2.0747, |
| "step": 44000 |
| }, |
| { |
| "epoch": 5.356055995130858, |
| "eval_loss": 2.046496629714966, |
| "eval_runtime": 6.9289, |
| "eval_samples_per_second": 144.322, |
| "eval_steps_per_second": 36.081, |
| "step": 44000 |
| }, |
| { |
| "epoch": 5.368228849665247, |
| "grad_norm": 5.7902984619140625, |
| "learning_rate": 4.130044843049328e-07, |
| "loss": 2.0551, |
| "step": 44100 |
| }, |
| { |
| "epoch": 5.380401704199635, |
| "grad_norm": 6.01054048538208, |
| "learning_rate": 3.6816143497757846e-07, |
| "loss": 2.0569, |
| "step": 44200 |
| }, |
| { |
| "epoch": 5.392574558734023, |
| "grad_norm": 6.690357685089111, |
| "learning_rate": 3.2331838565022424e-07, |
| "loss": 2.08, |
| "step": 44300 |
| }, |
| { |
| "epoch": 5.404747413268412, |
| "grad_norm": 5.836359024047852, |
| "learning_rate": 2.7847533632286997e-07, |
| "loss": 2.0405, |
| "step": 44400 |
| }, |
| { |
| "epoch": 5.4169202678028, |
| "grad_norm": 6.3250298500061035, |
| "learning_rate": 2.3363228699551572e-07, |
| "loss": 2.0717, |
| "step": 44500 |
| }, |
| { |
| "epoch": 5.4169202678028, |
| "eval_loss": 2.04555606842041, |
| "eval_runtime": 6.9014, |
| "eval_samples_per_second": 144.899, |
| "eval_steps_per_second": 36.225, |
| "step": 44500 |
| }, |
| { |
| "epoch": 5.429093122337188, |
| "grad_norm": 6.5666890144348145, |
| "learning_rate": 1.8878923766816145e-07, |
| "loss": 2.06, |
| "step": 44600 |
| }, |
| { |
| "epoch": 5.441265976871576, |
| "grad_norm": 7.2658843994140625, |
| "learning_rate": 1.4394618834080718e-07, |
| "loss": 2.0556, |
| "step": 44700 |
| }, |
| { |
| "epoch": 5.453438831405965, |
| "grad_norm": 6.671789646148682, |
| "learning_rate": 9.910313901345293e-08, |
| "loss": 2.0642, |
| "step": 44800 |
| }, |
| { |
| "epoch": 5.465611685940353, |
| "grad_norm": 5.944987773895264, |
| "learning_rate": 5.426008968609866e-08, |
| "loss": 2.0414, |
| "step": 44900 |
| }, |
| { |
| "epoch": 5.477784540474741, |
| "grad_norm": 6.427646636962891, |
| "learning_rate": 9.417040358744396e-09, |
| "loss": 2.0667, |
| "step": 45000 |
| }, |
| { |
| "epoch": 5.477784540474741, |
| "eval_loss": 2.0452468395233154, |
| "eval_runtime": 7.0229, |
| "eval_samples_per_second": 142.391, |
| "eval_steps_per_second": 35.598, |
| "step": 45000 |
| }, |
| { |
| "epoch": 5.48995739500913, |
| "grad_norm": 6.559889793395996, |
| "learning_rate": 5.007718120805369e-06, |
| "loss": 2.0447, |
| "step": 45100 |
| }, |
| { |
| "epoch": 5.502130249543518, |
| "grad_norm": 6.235354423522949, |
| "learning_rate": 4.974161073825503e-06, |
| "loss": 2.0806, |
| "step": 45200 |
| }, |
| { |
| "epoch": 5.514303104077906, |
| "grad_norm": 7.230030536651611, |
| "learning_rate": 4.940604026845638e-06, |
| "loss": 2.0696, |
| "step": 45300 |
| }, |
| { |
| "epoch": 5.526475958612295, |
| "grad_norm": 5.613503456115723, |
| "learning_rate": 4.907046979865772e-06, |
| "loss": 2.0662, |
| "step": 45400 |
| }, |
| { |
| "epoch": 5.538648813146683, |
| "grad_norm": 5.988820552825928, |
| "learning_rate": 4.873489932885906e-06, |
| "loss": 2.0551, |
| "step": 45500 |
| }, |
| { |
| "epoch": 5.538648813146683, |
| "eval_loss": 2.0472412109375, |
| "eval_runtime": 6.7805, |
| "eval_samples_per_second": 147.481, |
| "eval_steps_per_second": 36.87, |
| "step": 45500 |
| }, |
| { |
| "epoch": 5.550821667681071, |
| "grad_norm": 6.566047191619873, |
| "learning_rate": 4.8399328859060404e-06, |
| "loss": 2.0652, |
| "step": 45600 |
| }, |
| { |
| "epoch": 5.56299452221546, |
| "grad_norm": 6.979294300079346, |
| "learning_rate": 4.806375838926175e-06, |
| "loss": 2.0409, |
| "step": 45700 |
| }, |
| { |
| "epoch": 5.575167376749848, |
| "grad_norm": 6.474365234375, |
| "learning_rate": 4.772818791946309e-06, |
| "loss": 2.0562, |
| "step": 45800 |
| }, |
| { |
| "epoch": 5.587340231284236, |
| "grad_norm": 6.100124835968018, |
| "learning_rate": 4.739261744966443e-06, |
| "loss": 2.0448, |
| "step": 45900 |
| }, |
| { |
| "epoch": 5.599513085818624, |
| "grad_norm": 6.383643627166748, |
| "learning_rate": 4.706040268456376e-06, |
| "loss": 2.0798, |
| "step": 46000 |
| }, |
| { |
| "epoch": 5.599513085818624, |
| "eval_loss": 2.042715311050415, |
| "eval_runtime": 6.7981, |
| "eval_samples_per_second": 147.1, |
| "eval_steps_per_second": 36.775, |
| "step": 46000 |
| }, |
| { |
| "epoch": 5.611685940353013, |
| "grad_norm": 6.848605632781982, |
| "learning_rate": 4.67248322147651e-06, |
| "loss": 2.0615, |
| "step": 46100 |
| }, |
| { |
| "epoch": 5.6238587948874015, |
| "grad_norm": 6.921677589416504, |
| "learning_rate": 4.638926174496644e-06, |
| "loss": 2.0888, |
| "step": 46200 |
| }, |
| { |
| "epoch": 5.636031649421789, |
| "grad_norm": 6.901805400848389, |
| "learning_rate": 4.6053691275167785e-06, |
| "loss": 2.0552, |
| "step": 46300 |
| }, |
| { |
| "epoch": 5.648204503956178, |
| "grad_norm": 6.497274398803711, |
| "learning_rate": 4.571812080536913e-06, |
| "loss": 2.0596, |
| "step": 46400 |
| }, |
| { |
| "epoch": 5.660377358490566, |
| "grad_norm": 6.1705803871154785, |
| "learning_rate": 4.538255033557047e-06, |
| "loss": 2.0352, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.660377358490566, |
| "eval_loss": 2.0392038822174072, |
| "eval_runtime": 6.8175, |
| "eval_samples_per_second": 146.681, |
| "eval_steps_per_second": 36.67, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.672550213024954, |
| "grad_norm": 6.3149847984313965, |
| "learning_rate": 4.504697986577181e-06, |
| "loss": 2.0782, |
| "step": 46600 |
| }, |
| { |
| "epoch": 5.684723067559343, |
| "grad_norm": 5.7811760902404785, |
| "learning_rate": 4.471140939597316e-06, |
| "loss": 2.0745, |
| "step": 46700 |
| }, |
| { |
| "epoch": 5.696895922093731, |
| "grad_norm": 6.381850719451904, |
| "learning_rate": 4.43758389261745e-06, |
| "loss": 2.0967, |
| "step": 46800 |
| }, |
| { |
| "epoch": 5.709068776628119, |
| "grad_norm": 6.7904534339904785, |
| "learning_rate": 4.404026845637584e-06, |
| "loss": 2.048, |
| "step": 46900 |
| }, |
| { |
| "epoch": 5.721241631162508, |
| "grad_norm": 6.390072822570801, |
| "learning_rate": 4.370469798657718e-06, |
| "loss": 2.0458, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.721241631162508, |
| "eval_loss": 2.0331013202667236, |
| "eval_runtime": 6.7556, |
| "eval_samples_per_second": 148.026, |
| "eval_steps_per_second": 37.006, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.733414485696896, |
| "grad_norm": 6.4294514656066895, |
| "learning_rate": 4.336912751677853e-06, |
| "loss": 2.0555, |
| "step": 47100 |
| }, |
| { |
| "epoch": 5.7455873402312845, |
| "grad_norm": 7.039945602416992, |
| "learning_rate": 4.303355704697987e-06, |
| "loss": 2.0973, |
| "step": 47200 |
| }, |
| { |
| "epoch": 5.757760194765672, |
| "grad_norm": 6.919515132904053, |
| "learning_rate": 4.269798657718121e-06, |
| "loss": 2.0572, |
| "step": 47300 |
| }, |
| { |
| "epoch": 5.769933049300061, |
| "grad_norm": 6.846578598022461, |
| "learning_rate": 4.2362416107382554e-06, |
| "loss": 2.0703, |
| "step": 47400 |
| }, |
| { |
| "epoch": 5.78210590383445, |
| "grad_norm": 6.899037837982178, |
| "learning_rate": 4.20268456375839e-06, |
| "loss": 2.0382, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.78210590383445, |
| "eval_loss": 2.0307412147521973, |
| "eval_runtime": 6.8182, |
| "eval_samples_per_second": 146.666, |
| "eval_steps_per_second": 36.667, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.794278758368837, |
| "grad_norm": 5.726818084716797, |
| "learning_rate": 4.169127516778524e-06, |
| "loss": 2.0595, |
| "step": 47600 |
| }, |
| { |
| "epoch": 5.806451612903226, |
| "grad_norm": 7.426904201507568, |
| "learning_rate": 4.135570469798658e-06, |
| "loss": 2.0605, |
| "step": 47700 |
| }, |
| { |
| "epoch": 5.818624467437614, |
| "grad_norm": 6.416141986846924, |
| "learning_rate": 4.1020134228187925e-06, |
| "loss": 2.071, |
| "step": 47800 |
| }, |
| { |
| "epoch": 5.830797321972002, |
| "grad_norm": 6.170881748199463, |
| "learning_rate": 4.068456375838927e-06, |
| "loss": 2.0601, |
| "step": 47900 |
| }, |
| { |
| "epoch": 5.842970176506391, |
| "grad_norm": 5.913904666900635, |
| "learning_rate": 4.034899328859061e-06, |
| "loss": 2.0663, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.842970176506391, |
| "eval_loss": 2.0260586738586426, |
| "eval_runtime": 6.7968, |
| "eval_samples_per_second": 147.127, |
| "eval_steps_per_second": 36.782, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.855143031040779, |
| "grad_norm": 6.9575090408325195, |
| "learning_rate": 4.0013422818791944e-06, |
| "loss": 2.0487, |
| "step": 48100 |
| }, |
| { |
| "epoch": 5.8673158855751675, |
| "grad_norm": 7.018653392791748, |
| "learning_rate": 3.967785234899329e-06, |
| "loss": 2.0836, |
| "step": 48200 |
| }, |
| { |
| "epoch": 5.879488740109556, |
| "grad_norm": 6.9810285568237305, |
| "learning_rate": 3.934228187919463e-06, |
| "loss": 2.0645, |
| "step": 48300 |
| }, |
| { |
| "epoch": 5.891661594643944, |
| "grad_norm": 5.732436656951904, |
| "learning_rate": 3.900671140939597e-06, |
| "loss": 2.0682, |
| "step": 48400 |
| }, |
| { |
| "epoch": 5.9038344491783326, |
| "grad_norm": 6.543402671813965, |
| "learning_rate": 3.8671140939597315e-06, |
| "loss": 2.0797, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.9038344491783326, |
| "eval_loss": 2.0245697498321533, |
| "eval_runtime": 6.7746, |
| "eval_samples_per_second": 147.609, |
| "eval_steps_per_second": 36.902, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.91600730371272, |
| "grad_norm": 6.355215072631836, |
| "learning_rate": 3.833557046979866e-06, |
| "loss": 2.0386, |
| "step": 48600 |
| }, |
| { |
| "epoch": 5.928180158247109, |
| "grad_norm": 5.7379889488220215, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 2.0498, |
| "step": 48700 |
| }, |
| { |
| "epoch": 5.940353012781498, |
| "grad_norm": 5.857077121734619, |
| "learning_rate": 3.7664429530201347e-06, |
| "loss": 2.061, |
| "step": 48800 |
| }, |
| { |
| "epoch": 5.952525867315885, |
| "grad_norm": 7.078189373016357, |
| "learning_rate": 3.732885906040269e-06, |
| "loss": 2.0569, |
| "step": 48900 |
| }, |
| { |
| "epoch": 5.964698721850274, |
| "grad_norm": 6.31903600692749, |
| "learning_rate": 3.6993288590604033e-06, |
| "loss": 2.0755, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.964698721850274, |
| "eval_loss": 2.0195415019989014, |
| "eval_runtime": 6.7847, |
| "eval_samples_per_second": 147.392, |
| "eval_steps_per_second": 36.848, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.976871576384662, |
| "grad_norm": 6.295201778411865, |
| "learning_rate": 3.6657718120805375e-06, |
| "loss": 2.0393, |
| "step": 49100 |
| }, |
| { |
| "epoch": 5.9890444309190505, |
| "grad_norm": 5.829520225524902, |
| "learning_rate": 3.6322147651006714e-06, |
| "loss": 2.0839, |
| "step": 49200 |
| }, |
| { |
| "epoch": 6.001217285453439, |
| "grad_norm": 6.653756141662598, |
| "learning_rate": 3.5986577181208056e-06, |
| "loss": 2.0581, |
| "step": 49300 |
| }, |
| { |
| "epoch": 6.013390139987827, |
| "grad_norm": 6.303423881530762, |
| "learning_rate": 3.56510067114094e-06, |
| "loss": 2.0524, |
| "step": 49400 |
| }, |
| { |
| "epoch": 6.0255629945222156, |
| "grad_norm": 6.783233642578125, |
| "learning_rate": 3.531543624161074e-06, |
| "loss": 2.0284, |
| "step": 49500 |
| }, |
| { |
| "epoch": 6.0255629945222156, |
| "eval_loss": 2.013944387435913, |
| "eval_runtime": 6.7878, |
| "eval_samples_per_second": 147.323, |
| "eval_steps_per_second": 36.831, |
| "step": 49500 |
| }, |
| { |
| "epoch": 6.037735849056604, |
| "grad_norm": 5.857462406158447, |
| "learning_rate": 3.4979865771812084e-06, |
| "loss": 2.0229, |
| "step": 49600 |
| }, |
| { |
| "epoch": 6.049908703590992, |
| "grad_norm": 6.777635097503662, |
| "learning_rate": 3.4644295302013427e-06, |
| "loss": 1.9982, |
| "step": 49700 |
| }, |
| { |
| "epoch": 6.062081558125381, |
| "grad_norm": 7.1341328620910645, |
| "learning_rate": 3.430872483221477e-06, |
| "loss": 2.0211, |
| "step": 49800 |
| }, |
| { |
| "epoch": 6.074254412659768, |
| "grad_norm": 6.320338249206543, |
| "learning_rate": 3.3973154362416112e-06, |
| "loss": 2.0137, |
| "step": 49900 |
| }, |
| { |
| "epoch": 6.086427267194157, |
| "grad_norm": 6.523722171783447, |
| "learning_rate": 3.3640939597315437e-06, |
| "loss": 2.0073, |
| "step": 50000 |
| }, |
| { |
| "epoch": 6.086427267194157, |
| "eval_loss": 2.0087661743164062, |
| "eval_runtime": 6.731, |
| "eval_samples_per_second": 148.565, |
| "eval_steps_per_second": 37.141, |
| "step": 50000 |
| }, |
| { |
| "epoch": 6.098600121728546, |
| "grad_norm": 6.2543559074401855, |
| "learning_rate": 3.330536912751678e-06, |
| "loss": 2.05, |
| "step": 50100 |
| }, |
| { |
| "epoch": 6.1107729762629335, |
| "grad_norm": 6.838403701782227, |
| "learning_rate": 3.2969798657718123e-06, |
| "loss": 2.0041, |
| "step": 50200 |
| }, |
| { |
| "epoch": 6.122945830797322, |
| "grad_norm": 6.734765529632568, |
| "learning_rate": 3.2634228187919465e-06, |
| "loss": 2.0144, |
| "step": 50300 |
| }, |
| { |
| "epoch": 6.13511868533171, |
| "grad_norm": 7.506516933441162, |
| "learning_rate": 3.2298657718120808e-06, |
| "loss": 2.0238, |
| "step": 50400 |
| }, |
| { |
| "epoch": 6.1472915398660986, |
| "grad_norm": 7.153513431549072, |
| "learning_rate": 3.196308724832215e-06, |
| "loss": 2.0032, |
| "step": 50500 |
| }, |
| { |
| "epoch": 6.1472915398660986, |
| "eval_loss": 2.0054242610931396, |
| "eval_runtime": 6.8452, |
| "eval_samples_per_second": 146.089, |
| "eval_steps_per_second": 36.522, |
| "step": 50500 |
| }, |
| { |
| "epoch": 6.159464394400487, |
| "grad_norm": 5.951141834259033, |
| "learning_rate": 3.1627516778523493e-06, |
| "loss": 2.0768, |
| "step": 50600 |
| }, |
| { |
| "epoch": 6.171637248934875, |
| "grad_norm": 6.877615928649902, |
| "learning_rate": 3.1291946308724836e-06, |
| "loss": 2.0123, |
| "step": 50700 |
| }, |
| { |
| "epoch": 6.183810103469264, |
| "grad_norm": 6.209372520446777, |
| "learning_rate": 3.095637583892618e-06, |
| "loss": 2.0153, |
| "step": 50800 |
| }, |
| { |
| "epoch": 6.195982958003652, |
| "grad_norm": 6.799842834472656, |
| "learning_rate": 3.062080536912752e-06, |
| "loss": 1.9955, |
| "step": 50900 |
| }, |
| { |
| "epoch": 6.20815581253804, |
| "grad_norm": 6.479254722595215, |
| "learning_rate": 3.0285234899328864e-06, |
| "loss": 2.0315, |
| "step": 51000 |
| }, |
| { |
| "epoch": 6.20815581253804, |
| "eval_loss": 2.0038652420043945, |
| "eval_runtime": 6.8012, |
| "eval_samples_per_second": 147.033, |
| "eval_steps_per_second": 36.758, |
| "step": 51000 |
| }, |
| { |
| "epoch": 6.220328667072429, |
| "grad_norm": 6.269389629364014, |
| "learning_rate": 2.9949664429530206e-06, |
| "loss": 1.9839, |
| "step": 51100 |
| }, |
| { |
| "epoch": 6.2325015216068165, |
| "grad_norm": 7.240963935852051, |
| "learning_rate": 2.9614093959731545e-06, |
| "loss": 2.0155, |
| "step": 51200 |
| }, |
| { |
| "epoch": 6.244674376141205, |
| "grad_norm": 5.774966716766357, |
| "learning_rate": 2.9278523489932887e-06, |
| "loss": 2.0198, |
| "step": 51300 |
| }, |
| { |
| "epoch": 6.256847230675594, |
| "grad_norm": 6.272314071655273, |
| "learning_rate": 2.894295302013423e-06, |
| "loss": 2.0554, |
| "step": 51400 |
| }, |
| { |
| "epoch": 6.2690200852099816, |
| "grad_norm": 9.089746475219727, |
| "learning_rate": 2.8607382550335573e-06, |
| "loss": 2.0408, |
| "step": 51500 |
| }, |
| { |
| "epoch": 6.2690200852099816, |
| "eval_loss": 2.000591278076172, |
| "eval_runtime": 6.7821, |
| "eval_samples_per_second": 147.447, |
| "eval_steps_per_second": 36.862, |
| "step": 51500 |
| }, |
| { |
| "epoch": 6.28119293974437, |
| "grad_norm": 6.007697105407715, |
| "learning_rate": 2.8271812080536915e-06, |
| "loss": 2.0251, |
| "step": 51600 |
| }, |
| { |
| "epoch": 6.293365794278758, |
| "grad_norm": 7.7493791580200195, |
| "learning_rate": 2.793624161073826e-06, |
| "loss": 2.0447, |
| "step": 51700 |
| }, |
| { |
| "epoch": 6.305538648813147, |
| "grad_norm": 7.068716526031494, |
| "learning_rate": 2.76006711409396e-06, |
| "loss": 2.0377, |
| "step": 51800 |
| }, |
| { |
| "epoch": 6.317711503347535, |
| "grad_norm": 6.732091903686523, |
| "learning_rate": 2.7265100671140943e-06, |
| "loss": 2.0131, |
| "step": 51900 |
| }, |
| { |
| "epoch": 6.329884357881923, |
| "grad_norm": 6.7231125831604, |
| "learning_rate": 2.693288590604027e-06, |
| "loss": 2.0385, |
| "step": 52000 |
| }, |
| { |
| "epoch": 6.329884357881923, |
| "eval_loss": 1.9973669052124023, |
| "eval_runtime": 6.7984, |
| "eval_samples_per_second": 147.093, |
| "eval_steps_per_second": 36.773, |
| "step": 52000 |
| }, |
| { |
| "epoch": 6.342057212416312, |
| "grad_norm": 6.017531394958496, |
| "learning_rate": 2.659731543624161e-06, |
| "loss": 2.0407, |
| "step": 52100 |
| }, |
| { |
| "epoch": 6.3542300669507, |
| "grad_norm": 5.93875789642334, |
| "learning_rate": 2.6261744966442954e-06, |
| "loss": 2.0368, |
| "step": 52200 |
| }, |
| { |
| "epoch": 6.366402921485088, |
| "grad_norm": 6.382920265197754, |
| "learning_rate": 2.5926174496644296e-06, |
| "loss": 2.036, |
| "step": 52300 |
| }, |
| { |
| "epoch": 6.378575776019477, |
| "grad_norm": 6.723759651184082, |
| "learning_rate": 2.559060402684564e-06, |
| "loss": 1.9914, |
| "step": 52400 |
| }, |
| { |
| "epoch": 6.3907486305538646, |
| "grad_norm": 8.295475959777832, |
| "learning_rate": 2.525503355704698e-06, |
| "loss": 2.0401, |
| "step": 52500 |
| }, |
| { |
| "epoch": 6.3907486305538646, |
| "eval_loss": 1.9946650266647339, |
| "eval_runtime": 6.8495, |
| "eval_samples_per_second": 145.995, |
| "eval_steps_per_second": 36.499, |
| "step": 52500 |
| }, |
| { |
| "epoch": 6.402921485088253, |
| "grad_norm": 6.045047283172607, |
| "learning_rate": 2.4919463087248324e-06, |
| "loss": 2.0287, |
| "step": 52600 |
| }, |
| { |
| "epoch": 6.415094339622642, |
| "grad_norm": 7.3694000244140625, |
| "learning_rate": 2.4583892617449667e-06, |
| "loss": 2.0318, |
| "step": 52700 |
| }, |
| { |
| "epoch": 6.42726719415703, |
| "grad_norm": 6.970037460327148, |
| "learning_rate": 2.424832214765101e-06, |
| "loss": 2.0352, |
| "step": 52800 |
| }, |
| { |
| "epoch": 6.439440048691418, |
| "grad_norm": 7.87092399597168, |
| "learning_rate": 2.391275167785235e-06, |
| "loss": 2.0522, |
| "step": 52900 |
| }, |
| { |
| "epoch": 6.451612903225806, |
| "grad_norm": 6.341009616851807, |
| "learning_rate": 2.357718120805369e-06, |
| "loss": 2.0717, |
| "step": 53000 |
| }, |
| { |
| "epoch": 6.451612903225806, |
| "eval_loss": 1.9915155172348022, |
| "eval_runtime": 6.8145, |
| "eval_samples_per_second": 146.746, |
| "eval_steps_per_second": 36.686, |
| "step": 53000 |
| }, |
| { |
| "epoch": 6.463785757760195, |
| "grad_norm": 7.210479736328125, |
| "learning_rate": 2.3241610738255038e-06, |
| "loss": 2.0154, |
| "step": 53100 |
| }, |
| { |
| "epoch": 6.475958612294583, |
| "grad_norm": 8.30247688293457, |
| "learning_rate": 2.290604026845638e-06, |
| "loss": 2.0242, |
| "step": 53200 |
| }, |
| { |
| "epoch": 6.488131466828971, |
| "grad_norm": 5.9992570877075195, |
| "learning_rate": 2.2573825503355705e-06, |
| "loss": 2.0372, |
| "step": 53300 |
| }, |
| { |
| "epoch": 6.50030432136336, |
| "grad_norm": 6.450936317443848, |
| "learning_rate": 2.2238255033557048e-06, |
| "loss": 2.0267, |
| "step": 53400 |
| }, |
| { |
| "epoch": 6.512477175897748, |
| "grad_norm": 6.037837982177734, |
| "learning_rate": 2.190268456375839e-06, |
| "loss": 2.0178, |
| "step": 53500 |
| }, |
| { |
| "epoch": 6.512477175897748, |
| "eval_loss": 1.9894185066223145, |
| "eval_runtime": 6.8572, |
| "eval_samples_per_second": 145.831, |
| "eval_steps_per_second": 36.458, |
| "step": 53500 |
| }, |
| { |
| "epoch": 6.524650030432136, |
| "grad_norm": 6.875925064086914, |
| "learning_rate": 2.1567114093959733e-06, |
| "loss": 2.0354, |
| "step": 53600 |
| }, |
| { |
| "epoch": 6.536822884966525, |
| "grad_norm": 6.961463451385498, |
| "learning_rate": 2.1231543624161076e-06, |
| "loss": 2.06, |
| "step": 53700 |
| }, |
| { |
| "epoch": 6.548995739500913, |
| "grad_norm": 5.773210525512695, |
| "learning_rate": 2.089597315436242e-06, |
| "loss": 2.0167, |
| "step": 53800 |
| }, |
| { |
| "epoch": 6.561168594035301, |
| "grad_norm": 6.747873783111572, |
| "learning_rate": 2.056040268456376e-06, |
| "loss": 1.9882, |
| "step": 53900 |
| }, |
| { |
| "epoch": 6.57334144856969, |
| "grad_norm": 6.432974338531494, |
| "learning_rate": 2.0224832214765104e-06, |
| "loss": 2.0029, |
| "step": 54000 |
| }, |
| { |
| "epoch": 6.57334144856969, |
| "eval_loss": 1.9841845035552979, |
| "eval_runtime": 6.8372, |
| "eval_samples_per_second": 146.258, |
| "eval_steps_per_second": 36.564, |
| "step": 54000 |
| }, |
| { |
| "epoch": 6.585514303104078, |
| "grad_norm": 6.159907341003418, |
| "learning_rate": 1.9889261744966446e-06, |
| "loss": 2.0454, |
| "step": 54100 |
| }, |
| { |
| "epoch": 6.597687157638466, |
| "grad_norm": 7.004731178283691, |
| "learning_rate": 1.955369127516779e-06, |
| "loss": 2.0011, |
| "step": 54200 |
| }, |
| { |
| "epoch": 6.609860012172854, |
| "grad_norm": 7.388941764831543, |
| "learning_rate": 1.9218120805369127e-06, |
| "loss": 2.0446, |
| "step": 54300 |
| }, |
| { |
| "epoch": 6.622032866707243, |
| "grad_norm": 7.399050235748291, |
| "learning_rate": 1.888255033557047e-06, |
| "loss": 2.0265, |
| "step": 54400 |
| }, |
| { |
| "epoch": 6.634205721241631, |
| "grad_norm": 6.445584297180176, |
| "learning_rate": 1.8546979865771813e-06, |
| "loss": 2.0124, |
| "step": 54500 |
| }, |
| { |
| "epoch": 6.634205721241631, |
| "eval_loss": 1.9837737083435059, |
| "eval_runtime": 6.903, |
| "eval_samples_per_second": 144.864, |
| "eval_steps_per_second": 36.216, |
| "step": 54500 |
| }, |
| { |
| "epoch": 6.646378575776019, |
| "grad_norm": 6.1334967613220215, |
| "learning_rate": 1.8211409395973155e-06, |
| "loss": 2.0495, |
| "step": 54600 |
| }, |
| { |
| "epoch": 6.658551430310408, |
| "grad_norm": 6.132894992828369, |
| "learning_rate": 1.7875838926174498e-06, |
| "loss": 2.0308, |
| "step": 54700 |
| }, |
| { |
| "epoch": 6.6707242848447965, |
| "grad_norm": 7.038134574890137, |
| "learning_rate": 1.7540268456375839e-06, |
| "loss": 2.0532, |
| "step": 54800 |
| }, |
| { |
| "epoch": 6.682897139379184, |
| "grad_norm": 6.755254745483398, |
| "learning_rate": 1.7204697986577181e-06, |
| "loss": 2.0178, |
| "step": 54900 |
| }, |
| { |
| "epoch": 6.695069993913573, |
| "grad_norm": 6.841146945953369, |
| "learning_rate": 1.6869127516778524e-06, |
| "loss": 2.0442, |
| "step": 55000 |
| }, |
| { |
| "epoch": 6.695069993913573, |
| "eval_loss": 1.9824799299240112, |
| "eval_runtime": 6.8019, |
| "eval_samples_per_second": 147.018, |
| "eval_steps_per_second": 36.755, |
| "step": 55000 |
| }, |
| { |
| "epoch": 6.707242848447961, |
| "grad_norm": 6.4666547775268555, |
| "learning_rate": 1.6533557046979867e-06, |
| "loss": 2.0205, |
| "step": 55100 |
| }, |
| { |
| "epoch": 6.719415702982349, |
| "grad_norm": 7.502538204193115, |
| "learning_rate": 1.619798657718121e-06, |
| "loss": 2.0261, |
| "step": 55200 |
| }, |
| { |
| "epoch": 6.731588557516738, |
| "grad_norm": 7.378790378570557, |
| "learning_rate": 1.5862416107382552e-06, |
| "loss": 2.0288, |
| "step": 55300 |
| }, |
| { |
| "epoch": 6.743761412051126, |
| "grad_norm": 7.264867305755615, |
| "learning_rate": 1.5526845637583892e-06, |
| "loss": 2.0187, |
| "step": 55400 |
| }, |
| { |
| "epoch": 6.755934266585514, |
| "grad_norm": 7.020994663238525, |
| "learning_rate": 1.5191275167785235e-06, |
| "loss": 2.038, |
| "step": 55500 |
| }, |
| { |
| "epoch": 6.755934266585514, |
| "eval_loss": 1.9808002710342407, |
| "eval_runtime": 6.8401, |
| "eval_samples_per_second": 146.197, |
| "eval_steps_per_second": 36.549, |
| "step": 55500 |
| }, |
| { |
| "epoch": 6.768107121119902, |
| "grad_norm": 6.773026943206787, |
| "learning_rate": 1.4855704697986578e-06, |
| "loss": 2.0144, |
| "step": 55600 |
| }, |
| { |
| "epoch": 6.780279975654291, |
| "grad_norm": 5.357457160949707, |
| "learning_rate": 1.452013422818792e-06, |
| "loss": 2.0353, |
| "step": 55700 |
| }, |
| { |
| "epoch": 6.7924528301886795, |
| "grad_norm": 6.2290873527526855, |
| "learning_rate": 1.4184563758389263e-06, |
| "loss": 2.0328, |
| "step": 55800 |
| }, |
| { |
| "epoch": 6.804625684723067, |
| "grad_norm": 6.145375728607178, |
| "learning_rate": 1.3848993288590606e-06, |
| "loss": 2.0438, |
| "step": 55900 |
| }, |
| { |
| "epoch": 6.816798539257456, |
| "grad_norm": 6.537805080413818, |
| "learning_rate": 1.3513422818791946e-06, |
| "loss": 2.0634, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.816798539257456, |
| "eval_loss": 1.9801044464111328, |
| "eval_runtime": 6.8399, |
| "eval_samples_per_second": 146.202, |
| "eval_steps_per_second": 36.55, |
| "step": 56000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.080261694721884e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|