{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9880794701986755, "eval_steps": 500, "global_step": 564, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.052980132450331126, "grad_norm": 2.3983146952234486, "learning_rate": 5e-06, "loss": 0.9851, "step": 10 }, { "epoch": 0.10596026490066225, "grad_norm": 1.7246649174554667, "learning_rate": 5e-06, "loss": 0.8319, "step": 20 }, { "epoch": 0.15894039735099338, "grad_norm": 3.601038796091522, "learning_rate": 5e-06, "loss": 0.8075, "step": 30 }, { "epoch": 0.2119205298013245, "grad_norm": 1.2059052889786372, "learning_rate": 5e-06, "loss": 0.7915, "step": 40 }, { "epoch": 0.26490066225165565, "grad_norm": 1.3608839677655582, "learning_rate": 5e-06, "loss": 0.7709, "step": 50 }, { "epoch": 0.31788079470198677, "grad_norm": 1.2434441624370192, "learning_rate": 5e-06, "loss": 0.7621, "step": 60 }, { "epoch": 0.3708609271523179, "grad_norm": 0.9441777855534653, "learning_rate": 5e-06, "loss": 0.7454, "step": 70 }, { "epoch": 0.423841059602649, "grad_norm": 1.5125978662591961, "learning_rate": 5e-06, "loss": 0.7248, "step": 80 }, { "epoch": 0.4768211920529801, "grad_norm": 0.7471461569802452, "learning_rate": 5e-06, "loss": 0.7275, "step": 90 }, { "epoch": 0.5298013245033113, "grad_norm": 0.645516397675585, "learning_rate": 5e-06, "loss": 0.7123, "step": 100 }, { "epoch": 0.5827814569536424, "grad_norm": 0.7251445711578004, "learning_rate": 5e-06, "loss": 0.7117, "step": 110 }, { "epoch": 0.6357615894039735, "grad_norm": 0.651327566584479, "learning_rate": 5e-06, "loss": 0.7119, "step": 120 }, { "epoch": 0.6887417218543046, "grad_norm": 0.6140018870668793, "learning_rate": 5e-06, "loss": 0.7053, "step": 130 }, { "epoch": 0.7417218543046358, "grad_norm": 0.5388085038750972, "learning_rate": 5e-06, "loss": 0.7022, "step": 140 }, { "epoch": 0.7947019867549668, "grad_norm": 1.010650981679106, "learning_rate": 5e-06, "loss": 0.6987, "step": 150 }, { "epoch": 0.847682119205298, "grad_norm": 1.1037782052291758, "learning_rate": 5e-06, "loss": 0.6976, "step": 160 }, { "epoch": 0.9006622516556292, "grad_norm": 0.666699690620748, "learning_rate": 5e-06, "loss": 0.705, "step": 170 }, { "epoch": 0.9536423841059603, "grad_norm": 0.5794869194974834, "learning_rate": 5e-06, "loss": 0.7042, "step": 180 }, { "epoch": 0.9960264900662251, "eval_loss": 0.6996881365776062, "eval_runtime": 101.9424, "eval_samples_per_second": 49.901, "eval_steps_per_second": 0.392, "step": 188 }, { "epoch": 1.0066225165562914, "grad_norm": 0.8762129194412334, "learning_rate": 5e-06, "loss": 0.6861, "step": 190 }, { "epoch": 1.0596026490066226, "grad_norm": 0.7641648019922738, "learning_rate": 5e-06, "loss": 0.6399, "step": 200 }, { "epoch": 1.1125827814569536, "grad_norm": 0.6026445432992825, "learning_rate": 5e-06, "loss": 0.6342, "step": 210 }, { "epoch": 1.1655629139072847, "grad_norm": 0.5607709134599749, "learning_rate": 5e-06, "loss": 0.6285, "step": 220 }, { "epoch": 1.218543046357616, "grad_norm": 0.723598523167553, "learning_rate": 5e-06, "loss": 0.654, "step": 230 }, { "epoch": 1.271523178807947, "grad_norm": 0.6634015008522252, "learning_rate": 5e-06, "loss": 0.6474, "step": 240 }, { "epoch": 1.3245033112582782, "grad_norm": 0.5676178378824602, "learning_rate": 5e-06, "loss": 0.6356, "step": 250 }, { "epoch": 1.3774834437086092, "grad_norm": 0.6219906931731467, "learning_rate": 5e-06, "loss": 0.6396, "step": 260 }, { "epoch": 1.4304635761589404, "grad_norm": 0.5539002206307158, "learning_rate": 5e-06, "loss": 0.6395, "step": 270 }, { "epoch": 1.4834437086092715, "grad_norm": 0.6706880554061717, "learning_rate": 5e-06, "loss": 0.6364, "step": 280 }, { "epoch": 1.5364238410596025, "grad_norm": 0.6250744115575335, "learning_rate": 5e-06, "loss": 0.6455, "step": 290 }, { "epoch": 1.589403973509934, "grad_norm": 0.5666575820633527, "learning_rate": 5e-06, "loss": 0.6352, "step": 300 }, { "epoch": 1.6423841059602649, "grad_norm": 0.8049891928557037, "learning_rate": 5e-06, "loss": 0.634, "step": 310 }, { "epoch": 1.695364238410596, "grad_norm": 0.8098028256502842, "learning_rate": 5e-06, "loss": 0.6379, "step": 320 }, { "epoch": 1.7483443708609272, "grad_norm": 0.6314929024368203, "learning_rate": 5e-06, "loss": 0.6394, "step": 330 }, { "epoch": 1.8013245033112582, "grad_norm": 0.824620474103318, "learning_rate": 5e-06, "loss": 0.6414, "step": 340 }, { "epoch": 1.8543046357615895, "grad_norm": 0.5854556799760776, "learning_rate": 5e-06, "loss": 0.6393, "step": 350 }, { "epoch": 1.9072847682119205, "grad_norm": 0.6825161397864904, "learning_rate": 5e-06, "loss": 0.6408, "step": 360 }, { "epoch": 1.9602649006622517, "grad_norm": 0.5897191051228083, "learning_rate": 5e-06, "loss": 0.6362, "step": 370 }, { "epoch": 1.9973509933774833, "eval_loss": 0.6881988644599915, "eval_runtime": 101.7242, "eval_samples_per_second": 50.008, "eval_steps_per_second": 0.393, "step": 377 }, { "epoch": 2.013245033112583, "grad_norm": 0.8098259238713678, "learning_rate": 5e-06, "loss": 0.6249, "step": 380 }, { "epoch": 2.066225165562914, "grad_norm": 0.8384706128552907, "learning_rate": 5e-06, "loss": 0.5796, "step": 390 }, { "epoch": 2.119205298013245, "grad_norm": 0.7877590869928718, "learning_rate": 5e-06, "loss": 0.5721, "step": 400 }, { "epoch": 2.172185430463576, "grad_norm": 0.8214423131053483, "learning_rate": 5e-06, "loss": 0.5881, "step": 410 }, { "epoch": 2.225165562913907, "grad_norm": 0.708950143379715, "learning_rate": 5e-06, "loss": 0.5788, "step": 420 }, { "epoch": 2.2781456953642385, "grad_norm": 0.6491889315422662, "learning_rate": 5e-06, "loss": 0.5778, "step": 430 }, { "epoch": 2.3311258278145695, "grad_norm": 0.6858462236619034, "learning_rate": 5e-06, "loss": 0.5821, "step": 440 }, { "epoch": 2.384105960264901, "grad_norm": 0.780640823583864, "learning_rate": 5e-06, "loss": 0.5768, "step": 450 }, { "epoch": 2.437086092715232, "grad_norm": 0.7173053514977337, "learning_rate": 5e-06, "loss": 0.5759, "step": 460 }, { "epoch": 2.4900662251655628, "grad_norm": 0.7004632879605499, "learning_rate": 5e-06, "loss": 0.5787, "step": 470 }, { "epoch": 2.543046357615894, "grad_norm": 0.7028579263335615, "learning_rate": 5e-06, "loss": 0.579, "step": 480 }, { "epoch": 2.596026490066225, "grad_norm": 0.9012109929919548, "learning_rate": 5e-06, "loss": 0.5848, "step": 490 }, { "epoch": 2.6490066225165565, "grad_norm": 0.6237112161014274, "learning_rate": 5e-06, "loss": 0.584, "step": 500 }, { "epoch": 2.7019867549668874, "grad_norm": 0.6803732464125802, "learning_rate": 5e-06, "loss": 0.5918, "step": 510 }, { "epoch": 2.7549668874172184, "grad_norm": 0.7496234836165662, "learning_rate": 5e-06, "loss": 0.5797, "step": 520 }, { "epoch": 2.80794701986755, "grad_norm": 0.6761315878843943, "learning_rate": 5e-06, "loss": 0.5921, "step": 530 }, { "epoch": 2.8609271523178808, "grad_norm": 0.6002390896713952, "learning_rate": 5e-06, "loss": 0.5887, "step": 540 }, { "epoch": 2.9139072847682117, "grad_norm": 0.6064169883870584, "learning_rate": 5e-06, "loss": 0.5919, "step": 550 }, { "epoch": 2.966887417218543, "grad_norm": 0.5890652422466117, "learning_rate": 5e-06, "loss": 0.5826, "step": 560 }, { "epoch": 2.9880794701986755, "eval_loss": 0.6938396692276001, "eval_runtime": 102.1596, "eval_samples_per_second": 49.795, "eval_steps_per_second": 0.392, "step": 564 }, { "epoch": 2.9880794701986755, "step": 564, "total_flos": 944302247116800.0, "train_loss": 0.6565315867146702, "train_runtime": 17042.4484, "train_samples_per_second": 17.011, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 564, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 944302247116800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }