{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996261682242991, "eval_steps": 250, "global_step": 1337, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05009345794392523, "grad_norm": 3.513535261154175, "learning_rate": 4.925373134328357e-07, "loss": 4.4864, "step": 67 }, { "epoch": 0.10018691588785046, "grad_norm": 4.232038974761963, "learning_rate": 9.925373134328357e-07, "loss": 4.4815, "step": 134 }, { "epoch": 0.1502803738317757, "grad_norm": 3.0024209022521973, "learning_rate": 9.451371571072319e-07, "loss": 4.3614, "step": 201 }, { "epoch": 0.18691588785046728, "eval_loss": 4.165477752685547, "eval_runtime": 18.4458, "eval_samples_per_second": 27.107, "eval_steps_per_second": 13.553, "step": 250 }, { "epoch": 0.20037383177570092, "grad_norm": 4.35124397277832, "learning_rate": 8.894430590191188e-07, "loss": 4.1954, "step": 268 }, { "epoch": 0.2504672897196262, "grad_norm": 4.761136531829834, "learning_rate": 8.337489609310058e-07, "loss": 3.9922, "step": 335 }, { "epoch": 0.3005607476635514, "grad_norm": 5.12410831451416, "learning_rate": 7.780548628428927e-07, "loss": 3.6946, "step": 402 }, { "epoch": 0.3506542056074766, "grad_norm": 4.370376110076904, "learning_rate": 7.223607647547797e-07, "loss": 3.3914, "step": 469 }, { "epoch": 0.37383177570093457, "eval_loss": 3.1697182655334473, "eval_runtime": 18.4011, "eval_samples_per_second": 27.172, "eval_steps_per_second": 13.586, "step": 500 }, { "epoch": 0.40074766355140184, "grad_norm": 3.3430747985839844, "learning_rate": 6.666666666666666e-07, "loss": 3.154, "step": 536 }, { "epoch": 0.4508411214953271, "grad_norm": 4.1145734786987305, "learning_rate": 6.109725685785536e-07, "loss": 2.9973, "step": 603 }, { "epoch": 0.5009345794392523, "grad_norm": 3.4338088035583496, "learning_rate": 5.552784704904405e-07, "loss": 2.815, "step": 670 }, { "epoch": 0.5510280373831775, "grad_norm": 3.3768129348754883, "learning_rate": 4.995843724023275e-07, "loss": 2.684, "step": 737 }, { "epoch": 0.5607476635514018, "eval_loss": 2.6262145042419434, "eval_runtime": 18.4215, "eval_samples_per_second": 27.142, "eval_steps_per_second": 13.571, "step": 750 }, { "epoch": 0.6011214953271028, "grad_norm": 2.7695705890655518, "learning_rate": 4.438902743142144e-07, "loss": 2.6152, "step": 804 }, { "epoch": 0.6512149532710281, "grad_norm": 2.307760715484619, "learning_rate": 3.881961762261014e-07, "loss": 2.5592, "step": 871 }, { "epoch": 0.7013084112149532, "grad_norm": 2.533203125, "learning_rate": 3.3250207813798835e-07, "loss": 2.5099, "step": 938 }, { "epoch": 0.7476635514018691, "eval_loss": 2.444204807281494, "eval_runtime": 18.3848, "eval_samples_per_second": 27.196, "eval_steps_per_second": 13.598, "step": 1000 }, { "epoch": 0.7514018691588785, "grad_norm": 1.7434587478637695, "learning_rate": 2.7680798004987534e-07, "loss": 2.4359, "step": 1005 }, { "epoch": 0.8014953271028037, "grad_norm": 2.9051830768585205, "learning_rate": 2.2111388196176226e-07, "loss": 2.4193, "step": 1072 }, { "epoch": 0.851588785046729, "grad_norm": 2.5767340660095215, "learning_rate": 1.6541978387364923e-07, "loss": 2.4261, "step": 1139 }, { "epoch": 0.9016822429906542, "grad_norm": 1.5444096326828003, "learning_rate": 1.0972568578553615e-07, "loss": 2.3706, "step": 1206 }, { "epoch": 0.9345794392523364, "eval_loss": 2.3879618644714355, "eval_runtime": 18.3821, "eval_samples_per_second": 27.2, "eval_steps_per_second": 13.6, "step": 1250 }, { "epoch": 0.9517757009345794, "grad_norm": 2.0951781272888184, "learning_rate": 5.403158769742311e-08, "loss": 2.3796, "step": 1273 }, { "epoch": 0.9996261682242991, "step": 1337, "total_flos": 4.450466167504896e+16, "train_loss": 3.11931433781101, "train_runtime": 1360.775, "train_samples_per_second": 7.863, "train_steps_per_second": 0.983 } ], "logging_steps": 67, "max_steps": 1337, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.450466167504896e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }