{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9880794701986755,
  "eval_steps": 500,
  "global_step": 564,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.052980132450331126,
      "grad_norm": 2.3983146952234486,
      "learning_rate": 5e-06,
      "loss": 0.9851,
      "step": 10
    },
    {
      "epoch": 0.10596026490066225,
      "grad_norm": 1.7246649174554667,
      "learning_rate": 5e-06,
      "loss": 0.8319,
      "step": 20
    },
    {
      "epoch": 0.15894039735099338,
      "grad_norm": 3.601038796091522,
      "learning_rate": 5e-06,
      "loss": 0.8075,
      "step": 30
    },
    {
      "epoch": 0.2119205298013245,
      "grad_norm": 1.2059052889786372,
      "learning_rate": 5e-06,
      "loss": 0.7915,
      "step": 40
    },
    {
      "epoch": 0.26490066225165565,
      "grad_norm": 1.3608839677655582,
      "learning_rate": 5e-06,
      "loss": 0.7709,
      "step": 50
    },
    {
      "epoch": 0.31788079470198677,
      "grad_norm": 1.2434441624370192,
      "learning_rate": 5e-06,
      "loss": 0.7621,
      "step": 60
    },
    {
      "epoch": 0.3708609271523179,
      "grad_norm": 0.9441777855534653,
      "learning_rate": 5e-06,
      "loss": 0.7454,
      "step": 70
    },
    {
      "epoch": 0.423841059602649,
      "grad_norm": 1.5125978662591961,
      "learning_rate": 5e-06,
      "loss": 0.7248,
      "step": 80
    },
    {
      "epoch": 0.4768211920529801,
      "grad_norm": 0.7471461569802452,
      "learning_rate": 5e-06,
      "loss": 0.7275,
      "step": 90
    },
    {
      "epoch": 0.5298013245033113,
      "grad_norm": 0.645516397675585,
      "learning_rate": 5e-06,
      "loss": 0.7123,
      "step": 100
    },
    {
      "epoch": 0.5827814569536424,
      "grad_norm": 0.7251445711578004,
      "learning_rate": 5e-06,
      "loss": 0.7117,
      "step": 110
    },
    {
      "epoch": 0.6357615894039735,
      "grad_norm": 0.651327566584479,
      "learning_rate": 5e-06,
      "loss": 0.7119,
      "step": 120
    },
    {
      "epoch": 0.6887417218543046,
      "grad_norm": 0.6140018870668793,
      "learning_rate": 5e-06,
      "loss": 0.7053,
      "step": 130
    },
    {
      "epoch": 0.7417218543046358,
      "grad_norm": 0.5388085038750972,
      "learning_rate": 5e-06,
      "loss": 0.7022,
      "step": 140
    },
    {
      "epoch": 0.7947019867549668,
      "grad_norm": 1.010650981679106,
      "learning_rate": 5e-06,
      "loss": 0.6987,
      "step": 150
    },
    {
      "epoch": 0.847682119205298,
      "grad_norm": 1.1037782052291758,
      "learning_rate": 5e-06,
      "loss": 0.6976,
      "step": 160
    },
    {
      "epoch": 0.9006622516556292,
      "grad_norm": 0.666699690620748,
      "learning_rate": 5e-06,
      "loss": 0.705,
      "step": 170
    },
    {
      "epoch": 0.9536423841059603,
      "grad_norm": 0.5794869194974834,
      "learning_rate": 5e-06,
      "loss": 0.7042,
      "step": 180
    },
    {
      "epoch": 0.9960264900662251,
      "eval_loss": 0.6996881365776062,
      "eval_runtime": 101.9424,
      "eval_samples_per_second": 49.901,
      "eval_steps_per_second": 0.392,
      "step": 188
    },
    {
      "epoch": 1.0066225165562914,
      "grad_norm": 0.8762129194412334,
      "learning_rate": 5e-06,
      "loss": 0.6861,
      "step": 190
    },
    {
      "epoch": 1.0596026490066226,
      "grad_norm": 0.7641648019922738,
      "learning_rate": 5e-06,
      "loss": 0.6399,
      "step": 200
    },
    {
      "epoch": 1.1125827814569536,
      "grad_norm": 0.6026445432992825,
      "learning_rate": 5e-06,
      "loss": 0.6342,
      "step": 210
    },
    {
      "epoch": 1.1655629139072847,
      "grad_norm": 0.5607709134599749,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 220
    },
    {
      "epoch": 1.218543046357616,
      "grad_norm": 0.723598523167553,
      "learning_rate": 5e-06,
      "loss": 0.654,
      "step": 230
    },
    {
      "epoch": 1.271523178807947,
      "grad_norm": 0.6634015008522252,
      "learning_rate": 5e-06,
      "loss": 0.6474,
      "step": 240
    },
    {
      "epoch": 1.3245033112582782,
      "grad_norm": 0.5676178378824602,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 250
    },
    {
      "epoch": 1.3774834437086092,
      "grad_norm": 0.6219906931731467,
      "learning_rate": 5e-06,
      "loss": 0.6396,
      "step": 260
    },
    {
      "epoch": 1.4304635761589404,
      "grad_norm": 0.5539002206307158,
      "learning_rate": 5e-06,
      "loss": 0.6395,
      "step": 270
    },
    {
      "epoch": 1.4834437086092715,
      "grad_norm": 0.6706880554061717,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 280
    },
    {
      "epoch": 1.5364238410596025,
      "grad_norm": 0.6250744115575335,
      "learning_rate": 5e-06,
      "loss": 0.6455,
      "step": 290
    },
    {
      "epoch": 1.589403973509934,
      "grad_norm": 0.5666575820633527,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 300
    },
    {
      "epoch": 1.6423841059602649,
      "grad_norm": 0.8049891928557037,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 310
    },
    {
      "epoch": 1.695364238410596,
      "grad_norm": 0.8098028256502842,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 320
    },
    {
      "epoch": 1.7483443708609272,
      "grad_norm": 0.6314929024368203,
      "learning_rate": 5e-06,
      "loss": 0.6394,
      "step": 330
    },
    {
      "epoch": 1.8013245033112582,
      "grad_norm": 0.824620474103318,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 340
    },
    {
      "epoch": 1.8543046357615895,
      "grad_norm": 0.5854556799760776,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 350
    },
    {
      "epoch": 1.9072847682119205,
      "grad_norm": 0.6825161397864904,
      "learning_rate": 5e-06,
      "loss": 0.6408,
      "step": 360
    },
    {
      "epoch": 1.9602649006622517,
      "grad_norm": 0.5897191051228083,
      "learning_rate": 5e-06,
      "loss": 0.6362,
      "step": 370
    },
    {
      "epoch": 1.9973509933774833,
      "eval_loss": 0.6881988644599915,
      "eval_runtime": 101.7242,
      "eval_samples_per_second": 50.008,
      "eval_steps_per_second": 0.393,
      "step": 377
    },
    {
      "epoch": 2.013245033112583,
      "grad_norm": 0.8098259238713678,
      "learning_rate": 5e-06,
      "loss": 0.6249,
      "step": 380
    },
    {
      "epoch": 2.066225165562914,
      "grad_norm": 0.8384706128552907,
      "learning_rate": 5e-06,
      "loss": 0.5796,
      "step": 390
    },
    {
      "epoch": 2.119205298013245,
      "grad_norm": 0.7877590869928718,
      "learning_rate": 5e-06,
      "loss": 0.5721,
      "step": 400
    },
    {
      "epoch": 2.172185430463576,
      "grad_norm": 0.8214423131053483,
      "learning_rate": 5e-06,
      "loss": 0.5881,
      "step": 410
    },
    {
      "epoch": 2.225165562913907,
      "grad_norm": 0.708950143379715,
      "learning_rate": 5e-06,
      "loss": 0.5788,
      "step": 420
    },
    {
      "epoch": 2.2781456953642385,
      "grad_norm": 0.6491889315422662,
      "learning_rate": 5e-06,
      "loss": 0.5778,
      "step": 430
    },
    {
      "epoch": 2.3311258278145695,
      "grad_norm": 0.6858462236619034,
      "learning_rate": 5e-06,
      "loss": 0.5821,
      "step": 440
    },
    {
      "epoch": 2.384105960264901,
      "grad_norm": 0.780640823583864,
      "learning_rate": 5e-06,
      "loss": 0.5768,
      "step": 450
    },
    {
      "epoch": 2.437086092715232,
      "grad_norm": 0.7173053514977337,
      "learning_rate": 5e-06,
      "loss": 0.5759,
      "step": 460
    },
    {
      "epoch": 2.4900662251655628,
      "grad_norm": 0.7004632879605499,
      "learning_rate": 5e-06,
      "loss": 0.5787,
      "step": 470
    },
    {
      "epoch": 2.543046357615894,
      "grad_norm": 0.7028579263335615,
      "learning_rate": 5e-06,
      "loss": 0.579,
      "step": 480
    },
    {
      "epoch": 2.596026490066225,
      "grad_norm": 0.9012109929919548,
      "learning_rate": 5e-06,
      "loss": 0.5848,
      "step": 490
    },
    {
      "epoch": 2.6490066225165565,
      "grad_norm": 0.6237112161014274,
      "learning_rate": 5e-06,
      "loss": 0.584,
      "step": 500
    },
    {
      "epoch": 2.7019867549668874,
      "grad_norm": 0.6803732464125802,
      "learning_rate": 5e-06,
      "loss": 0.5918,
      "step": 510
    },
    {
      "epoch": 2.7549668874172184,
      "grad_norm": 0.7496234836165662,
      "learning_rate": 5e-06,
      "loss": 0.5797,
      "step": 520
    },
    {
      "epoch": 2.80794701986755,
      "grad_norm": 0.6761315878843943,
      "learning_rate": 5e-06,
      "loss": 0.5921,
      "step": 530
    },
    {
      "epoch": 2.8609271523178808,
      "grad_norm": 0.6002390896713952,
      "learning_rate": 5e-06,
      "loss": 0.5887,
      "step": 540
    },
    {
      "epoch": 2.9139072847682117,
      "grad_norm": 0.6064169883870584,
      "learning_rate": 5e-06,
      "loss": 0.5919,
      "step": 550
    },
    {
      "epoch": 2.966887417218543,
      "grad_norm": 0.5890652422466117,
      "learning_rate": 5e-06,
      "loss": 0.5826,
      "step": 560
    },
    {
      "epoch": 2.9880794701986755,
      "eval_loss": 0.6938396692276001,
      "eval_runtime": 102.1596,
      "eval_samples_per_second": 49.795,
      "eval_steps_per_second": 0.392,
      "step": 564
    },
    {
      "epoch": 2.9880794701986755,
      "step": 564,
      "total_flos": 944302247116800.0,
      "train_loss": 0.6565315867146702,
      "train_runtime": 17042.4484,
      "train_samples_per_second": 17.011,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 564,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 944302247116800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}