{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.999856278819091, "global_step": 83492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.9826330666411153e-06, "loss": 1.4668, "step": 500 }, { "epoch": 0.05, "learning_rate": 2.9652661332822306e-06, "loss": 1.4671, "step": 1000 }, { "epoch": 0.07, "learning_rate": 2.9478991999233463e-06, "loss": 1.467, "step": 1500 }, { "epoch": 0.1, "learning_rate": 2.930532266564461e-06, "loss": 1.4669, "step": 2000 }, { "epoch": 0.12, "learning_rate": 2.913165333205577e-06, "loss": 1.4671, "step": 2500 }, { "epoch": 0.14, "learning_rate": 2.8957983998466917e-06, "loss": 1.4675, "step": 3000 }, { "epoch": 0.17, "learning_rate": 2.8784314664878074e-06, "loss": 1.467, "step": 3500 }, { "epoch": 0.19, "learning_rate": 2.8610645331289227e-06, "loss": 1.4663, "step": 4000 }, { "epoch": 0.22, "learning_rate": 2.843697599770038e-06, "loss": 1.4664, "step": 4500 }, { "epoch": 0.24, "learning_rate": 2.826330666411153e-06, "loss": 1.4667, "step": 5000 }, { "epoch": 0.26, "learning_rate": 2.8089637330522685e-06, "loss": 1.467, "step": 5500 }, { "epoch": 0.29, "learning_rate": 2.791596799693384e-06, "loss": 1.4669, "step": 6000 }, { "epoch": 0.31, "learning_rate": 2.774229866334499e-06, "loss": 1.4668, "step": 6500 }, { "epoch": 0.34, "learning_rate": 2.7568629329756147e-06, "loss": 1.4663, "step": 7000 }, { "epoch": 0.36, "learning_rate": 2.7394959996167295e-06, "loss": 1.4668, "step": 7500 }, { "epoch": 0.38, "learning_rate": 2.7221290662578452e-06, "loss": 1.4665, "step": 8000 }, { "epoch": 0.41, "learning_rate": 2.7047621328989605e-06, "loss": 1.4665, "step": 8500 }, { "epoch": 0.43, "learning_rate": 2.6873951995400758e-06, "loss": 1.4663, "step": 9000 }, { "epoch": 0.46, "learning_rate": 2.670028266181191e-06, "loss": 1.4663, "step": 9500 }, { "epoch": 0.48, "learning_rate": 2.6526613328223063e-06, "loss": 1.4663, "step": 10000 }, { "epoch": 0.5, "learning_rate": 2.635294399463422e-06, "loss": 1.4658, "step": 10500 }, { "epoch": 0.53, "learning_rate": 2.617927466104537e-06, "loss": 1.4657, "step": 11000 }, { "epoch": 0.55, "learning_rate": 2.6005605327456525e-06, "loss": 1.4657, "step": 11500 }, { "epoch": 0.57, "learning_rate": 2.5831935993867674e-06, "loss": 1.4659, "step": 12000 }, { "epoch": 0.6, "learning_rate": 2.565826666027883e-06, "loss": 1.4661, "step": 12500 }, { "epoch": 0.62, "learning_rate": 2.5484597326689983e-06, "loss": 1.4657, "step": 13000 }, { "epoch": 0.65, "learning_rate": 2.5310927993101136e-06, "loss": 1.4661, "step": 13500 }, { "epoch": 0.67, "learning_rate": 2.513725865951229e-06, "loss": 1.4654, "step": 14000 }, { "epoch": 0.69, "learning_rate": 2.496358932592344e-06, "loss": 1.466, "step": 14500 }, { "epoch": 0.72, "learning_rate": 2.4789919992334594e-06, "loss": 1.4661, "step": 15000 }, { "epoch": 0.74, "learning_rate": 2.4616250658745747e-06, "loss": 1.4655, "step": 15500 }, { "epoch": 0.77, "learning_rate": 2.4442581325156904e-06, "loss": 1.4654, "step": 16000 }, { "epoch": 0.79, "learning_rate": 2.4268911991568052e-06, "loss": 1.4654, "step": 16500 }, { "epoch": 0.81, "learning_rate": 2.409524265797921e-06, "loss": 1.4647, "step": 17000 }, { "epoch": 0.84, "learning_rate": 2.392157332439036e-06, "loss": 1.4651, "step": 17500 }, { "epoch": 0.86, "learning_rate": 2.3747903990801515e-06, "loss": 1.4656, "step": 18000 }, { "epoch": 0.89, "learning_rate": 2.3574234657212667e-06, "loss": 1.4652, "step": 18500 }, { "epoch": 0.91, "learning_rate": 2.340056532362382e-06, "loss": 1.4649, "step": 19000 }, { "epoch": 0.93, "learning_rate": 2.3226895990034977e-06, "loss": 1.4654, "step": 19500 }, { "epoch": 0.96, "learning_rate": 2.3053226656446125e-06, "loss": 1.4649, "step": 20000 }, { "epoch": 0.98, "learning_rate": 2.2879557322857282e-06, "loss": 1.4648, "step": 20500 }, { "epoch": 1.0, "eval_loss": 1.4683704376220703, "eval_runtime": 319.8553, "eval_samples_per_second": 93.792, "eval_steps_per_second": 1.466, "step": 20873 }, { "epoch": 1.01, "learning_rate": 2.270588798926843e-06, "loss": 1.4643, "step": 21000 }, { "epoch": 1.03, "learning_rate": 2.2532218655679588e-06, "loss": 1.4633, "step": 21500 }, { "epoch": 1.05, "learning_rate": 2.235854932209074e-06, "loss": 1.4635, "step": 22000 }, { "epoch": 1.08, "learning_rate": 2.2184879988501893e-06, "loss": 1.4633, "step": 22500 }, { "epoch": 1.1, "learning_rate": 2.2011210654913046e-06, "loss": 1.4634, "step": 23000 }, { "epoch": 1.13, "learning_rate": 2.18375413213242e-06, "loss": 1.4632, "step": 23500 }, { "epoch": 1.15, "learning_rate": 2.166387198773535e-06, "loss": 1.4632, "step": 24000 }, { "epoch": 1.17, "learning_rate": 2.1490202654146504e-06, "loss": 1.4632, "step": 24500 }, { "epoch": 1.2, "learning_rate": 2.131653332055766e-06, "loss": 1.4634, "step": 25000 }, { "epoch": 1.22, "learning_rate": 2.114286398696881e-06, "loss": 1.463, "step": 25500 }, { "epoch": 1.25, "learning_rate": 2.0969194653379966e-06, "loss": 1.4635, "step": 26000 }, { "epoch": 1.27, "learning_rate": 2.079552531979112e-06, "loss": 1.4636, "step": 26500 }, { "epoch": 1.29, "learning_rate": 2.062185598620227e-06, "loss": 1.4629, "step": 27000 }, { "epoch": 1.32, "learning_rate": 2.0448186652613424e-06, "loss": 1.463, "step": 27500 }, { "epoch": 1.34, "learning_rate": 2.0274517319024577e-06, "loss": 1.4627, "step": 28000 }, { "epoch": 1.37, "learning_rate": 2.0100847985435734e-06, "loss": 1.4635, "step": 28500 }, { "epoch": 1.39, "learning_rate": 1.9927178651846882e-06, "loss": 1.463, "step": 29000 }, { "epoch": 1.41, "learning_rate": 1.9753509318258035e-06, "loss": 1.4631, "step": 29500 }, { "epoch": 1.44, "learning_rate": 1.957983998466919e-06, "loss": 1.463, "step": 30000 }, { "epoch": 1.46, "learning_rate": 1.9406170651080345e-06, "loss": 1.4632, "step": 30500 }, { "epoch": 1.49, "learning_rate": 1.9232501317491497e-06, "loss": 1.4626, "step": 31000 }, { "epoch": 1.51, "learning_rate": 1.9058831983902652e-06, "loss": 1.463, "step": 31500 }, { "epoch": 1.53, "learning_rate": 1.8885162650313807e-06, "loss": 1.4632, "step": 32000 }, { "epoch": 1.56, "learning_rate": 1.8711493316724955e-06, "loss": 1.4629, "step": 32500 }, { "epoch": 1.58, "learning_rate": 1.853782398313611e-06, "loss": 1.4628, "step": 33000 }, { "epoch": 1.6, "learning_rate": 1.836415464954726e-06, "loss": 1.463, "step": 33500 }, { "epoch": 1.63, "learning_rate": 1.8190485315958416e-06, "loss": 1.4627, "step": 34000 }, { "epoch": 1.65, "learning_rate": 1.801681598236957e-06, "loss": 1.4626, "step": 34500 }, { "epoch": 1.68, "learning_rate": 1.7843146648780719e-06, "loss": 1.4631, "step": 35000 }, { "epoch": 1.7, "learning_rate": 1.766947731519188e-06, "loss": 1.4628, "step": 35500 }, { "epoch": 1.72, "learning_rate": 1.7495807981603028e-06, "loss": 1.4625, "step": 36000 }, { "epoch": 1.75, "learning_rate": 1.7322138648014181e-06, "loss": 1.4627, "step": 36500 }, { "epoch": 1.77, "learning_rate": 1.7148469314425332e-06, "loss": 1.4628, "step": 37000 }, { "epoch": 1.8, "learning_rate": 1.697479998083649e-06, "loss": 1.4623, "step": 37500 }, { "epoch": 1.82, "learning_rate": 1.680113064724764e-06, "loss": 1.4626, "step": 38000 }, { "epoch": 1.84, "learning_rate": 1.662746131365879e-06, "loss": 1.4621, "step": 38500 }, { "epoch": 1.87, "learning_rate": 1.6453791980069949e-06, "loss": 1.4626, "step": 39000 }, { "epoch": 1.89, "learning_rate": 1.62801226464811e-06, "loss": 1.4623, "step": 39500 }, { "epoch": 1.92, "learning_rate": 1.6106453312892254e-06, "loss": 1.4622, "step": 40000 }, { "epoch": 1.94, "learning_rate": 1.5932783979303407e-06, "loss": 1.4624, "step": 40500 }, { "epoch": 1.96, "learning_rate": 1.5759114645714566e-06, "loss": 1.4626, "step": 41000 }, { "epoch": 1.99, "learning_rate": 1.5585445312125714e-06, "loss": 1.4619, "step": 41500 }, { "epoch": 2.0, "eval_loss": 1.467372179031372, "eval_runtime": 318.5344, "eval_samples_per_second": 94.181, "eval_steps_per_second": 1.472, "step": 41747 }, { "epoch": 2.01, "learning_rate": 1.5411775978536865e-06, "loss": 1.4618, "step": 42000 }, { "epoch": 2.04, "learning_rate": 1.5238106644948016e-06, "loss": 1.4609, "step": 42500 }, { "epoch": 2.06, "learning_rate": 1.5064437311359175e-06, "loss": 1.461, "step": 43000 }, { "epoch": 2.08, "learning_rate": 1.4890767977770327e-06, "loss": 1.4613, "step": 43500 }, { "epoch": 2.11, "learning_rate": 1.471709864418148e-06, "loss": 1.4607, "step": 44000 }, { "epoch": 2.13, "learning_rate": 1.4543429310592633e-06, "loss": 1.4611, "step": 44500 }, { "epoch": 2.16, "learning_rate": 1.4369759977003785e-06, "loss": 1.4614, "step": 45000 }, { "epoch": 2.18, "learning_rate": 1.4196090643414938e-06, "loss": 1.4608, "step": 45500 }, { "epoch": 2.2, "learning_rate": 1.402242130982609e-06, "loss": 1.4609, "step": 46000 }, { "epoch": 2.23, "learning_rate": 1.3848751976237243e-06, "loss": 1.4618, "step": 46500 }, { "epoch": 2.25, "learning_rate": 1.3675082642648396e-06, "loss": 1.4609, "step": 47000 }, { "epoch": 2.28, "learning_rate": 1.3501413309059553e-06, "loss": 1.4606, "step": 47500 }, { "epoch": 2.3, "learning_rate": 1.3327743975470706e-06, "loss": 1.461, "step": 48000 }, { "epoch": 2.32, "learning_rate": 1.3154074641881858e-06, "loss": 1.4609, "step": 48500 }, { "epoch": 2.35, "learning_rate": 1.2980405308293011e-06, "loss": 1.4611, "step": 49000 }, { "epoch": 2.37, "learning_rate": 1.2806735974704164e-06, "loss": 1.4608, "step": 49500 }, { "epoch": 2.4, "learning_rate": 1.2633066641115317e-06, "loss": 1.4604, "step": 50000 }, { "epoch": 2.42, "learning_rate": 1.245939730752647e-06, "loss": 1.4608, "step": 50500 }, { "epoch": 2.44, "learning_rate": 1.2285727973937622e-06, "loss": 1.4607, "step": 51000 }, { "epoch": 2.47, "learning_rate": 1.2112058640348775e-06, "loss": 1.4606, "step": 51500 }, { "epoch": 2.49, "learning_rate": 1.1938389306759932e-06, "loss": 1.4606, "step": 52000 }, { "epoch": 2.52, "learning_rate": 1.1764719973171084e-06, "loss": 1.4607, "step": 52500 }, { "epoch": 2.54, "learning_rate": 1.1591050639582237e-06, "loss": 1.4606, "step": 53000 }, { "epoch": 2.56, "learning_rate": 1.141738130599339e-06, "loss": 1.4609, "step": 53500 }, { "epoch": 2.59, "learning_rate": 1.1243711972404542e-06, "loss": 1.4609, "step": 54000 }, { "epoch": 2.61, "learning_rate": 1.1070042638815695e-06, "loss": 1.4604, "step": 54500 }, { "epoch": 2.63, "learning_rate": 1.0896373305226848e-06, "loss": 1.4608, "step": 55000 }, { "epoch": 2.66, "learning_rate": 1.0722703971638e-06, "loss": 1.4604, "step": 55500 }, { "epoch": 2.68, "learning_rate": 1.0549034638049153e-06, "loss": 1.4607, "step": 56000 }, { "epoch": 2.71, "learning_rate": 1.037536530446031e-06, "loss": 1.4607, "step": 56500 }, { "epoch": 2.73, "learning_rate": 1.020169597087146e-06, "loss": 1.4609, "step": 57000 }, { "epoch": 2.75, "learning_rate": 1.0028026637282615e-06, "loss": 1.461, "step": 57500 }, { "epoch": 2.78, "learning_rate": 9.854357303693768e-07, "loss": 1.4609, "step": 58000 }, { "epoch": 2.8, "learning_rate": 9.68068797010492e-07, "loss": 1.4608, "step": 58500 }, { "epoch": 2.83, "learning_rate": 9.507018636516072e-07, "loss": 1.4603, "step": 59000 }, { "epoch": 2.85, "learning_rate": 9.333349302927227e-07, "loss": 1.4606, "step": 59500 }, { "epoch": 2.87, "learning_rate": 9.159679969338379e-07, "loss": 1.4605, "step": 60000 }, { "epoch": 2.9, "learning_rate": 8.986010635749534e-07, "loss": 1.4603, "step": 60500 }, { "epoch": 2.92, "learning_rate": 8.81234130216069e-07, "loss": 1.4606, "step": 61000 }, { "epoch": 2.95, "learning_rate": 8.638671968571839e-07, "loss": 1.4611, "step": 61500 }, { "epoch": 2.97, "learning_rate": 8.465002634982994e-07, "loss": 1.4603, "step": 62000 }, { "epoch": 2.99, "learning_rate": 8.291333301394146e-07, "loss": 1.4606, "step": 62500 }, { "epoch": 3.0, "eval_loss": 1.4666800498962402, "eval_runtime": 320.1083, "eval_samples_per_second": 93.718, "eval_steps_per_second": 1.465, "step": 62621 }, { "epoch": 3.02, "learning_rate": 8.1176639678053e-07, "loss": 1.4591, "step": 63000 }, { "epoch": 3.04, "learning_rate": 7.943994634216451e-07, "loss": 1.46, "step": 63500 }, { "epoch": 3.07, "learning_rate": 7.770325300627606e-07, "loss": 1.4594, "step": 64000 }, { "epoch": 3.09, "learning_rate": 7.596655967038757e-07, "loss": 1.4603, "step": 64500 }, { "epoch": 3.11, "learning_rate": 7.422986633449912e-07, "loss": 1.4599, "step": 65000 }, { "epoch": 3.14, "learning_rate": 7.249317299861067e-07, "loss": 1.4596, "step": 65500 }, { "epoch": 3.16, "learning_rate": 7.075647966272218e-07, "loss": 1.4593, "step": 66000 }, { "epoch": 3.19, "learning_rate": 6.901978632683372e-07, "loss": 1.4594, "step": 66500 }, { "epoch": 3.21, "learning_rate": 6.728309299094524e-07, "loss": 1.4595, "step": 67000 }, { "epoch": 3.23, "learning_rate": 6.554639965505679e-07, "loss": 1.4602, "step": 67500 }, { "epoch": 3.26, "learning_rate": 6.380970631916829e-07, "loss": 1.4592, "step": 68000 }, { "epoch": 3.28, "learning_rate": 6.207301298327984e-07, "loss": 1.4592, "step": 68500 }, { "epoch": 3.31, "learning_rate": 6.033631964739136e-07, "loss": 1.46, "step": 69000 }, { "epoch": 3.33, "learning_rate": 5.859962631150291e-07, "loss": 1.4596, "step": 69500 }, { "epoch": 3.35, "learning_rate": 5.686293297561441e-07, "loss": 1.4597, "step": 70000 }, { "epoch": 3.38, "learning_rate": 5.512623963972596e-07, "loss": 1.4594, "step": 70500 }, { "epoch": 3.4, "learning_rate": 5.338954630383751e-07, "loss": 1.4598, "step": 71000 }, { "epoch": 3.43, "learning_rate": 5.165285296794902e-07, "loss": 1.4596, "step": 71500 }, { "epoch": 3.45, "learning_rate": 4.991615963206056e-07, "loss": 1.4596, "step": 72000 }, { "epoch": 3.47, "learning_rate": 4.817946629617209e-07, "loss": 1.4594, "step": 72500 }, { "epoch": 3.5, "learning_rate": 4.6442772960283626e-07, "loss": 1.4593, "step": 73000 }, { "epoch": 3.52, "learning_rate": 4.4706079624395143e-07, "loss": 1.4597, "step": 73500 }, { "epoch": 3.55, "learning_rate": 4.296938628850668e-07, "loss": 1.4601, "step": 74000 }, { "epoch": 3.57, "learning_rate": 4.1232692952618197e-07, "loss": 1.4592, "step": 74500 }, { "epoch": 3.59, "learning_rate": 3.9495999616729745e-07, "loss": 1.4599, "step": 75000 }, { "epoch": 3.62, "learning_rate": 3.77593062808413e-07, "loss": 1.4591, "step": 75500 }, { "epoch": 3.64, "learning_rate": 3.602261294495281e-07, "loss": 1.4598, "step": 76000 }, { "epoch": 3.66, "learning_rate": 3.428591960906435e-07, "loss": 1.4592, "step": 76500 }, { "epoch": 3.69, "learning_rate": 3.2549226273175863e-07, "loss": 1.4597, "step": 77000 }, { "epoch": 3.71, "learning_rate": 3.081253293728741e-07, "loss": 1.4598, "step": 77500 }, { "epoch": 3.74, "learning_rate": 2.907583960139893e-07, "loss": 1.4594, "step": 78000 }, { "epoch": 3.76, "learning_rate": 2.7339146265510476e-07, "loss": 1.4596, "step": 78500 }, { "epoch": 3.78, "learning_rate": 2.5602452929621987e-07, "loss": 1.4592, "step": 79000 }, { "epoch": 3.81, "learning_rate": 2.386575959373353e-07, "loss": 1.4597, "step": 79500 }, { "epoch": 3.83, "learning_rate": 2.2129066257845077e-07, "loss": 1.4593, "step": 80000 }, { "epoch": 3.86, "learning_rate": 2.0392372921956589e-07, "loss": 1.4598, "step": 80500 }, { "epoch": 3.88, "learning_rate": 1.8655679586068137e-07, "loss": 1.4593, "step": 81000 }, { "epoch": 3.9, "learning_rate": 1.691898625017965e-07, "loss": 1.4592, "step": 81500 }, { "epoch": 3.93, "learning_rate": 1.5182292914291196e-07, "loss": 1.4593, "step": 82000 }, { "epoch": 3.95, "learning_rate": 1.344559957840271e-07, "loss": 1.4594, "step": 82500 }, { "epoch": 3.98, "learning_rate": 1.1708906242514258e-07, "loss": 1.4596, "step": 83000 }, { "epoch": 4.0, "eval_loss": 1.4663872718811035, "eval_runtime": 321.2818, "eval_samples_per_second": 93.376, "eval_steps_per_second": 1.46, "step": 83492 }, { "epoch": 4.0, "step": 83492, "total_flos": 2.080820519714685e+18, "train_loss": 1.4623237360947567, "train_runtime": 299973.1617, "train_samples_per_second": 71.255, "train_steps_per_second": 0.278 } ], "max_steps": 83492, "num_train_epochs": 4, "total_flos": 2.080820519714685e+18, "trial_name": null, "trial_params": null }