| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9982905982905983, | |
| "eval_steps": 500, | |
| "global_step": 438, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.022792022792022793, | |
| "grad_norm": 2.218597567152377, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7544, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.045584045584045586, | |
| "grad_norm": 0.8186028537973438, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6997, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06837606837606838, | |
| "grad_norm": 0.9212941016288362, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6798, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09116809116809117, | |
| "grad_norm": 0.8748927154749253, | |
| "learning_rate": 5e-06, | |
| "loss": 0.68, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.11396011396011396, | |
| "grad_norm": 0.8185828012496023, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6762, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13675213675213677, | |
| "grad_norm": 0.7047820637148428, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6559, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15954415954415954, | |
| "grad_norm": 0.4630577367491141, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6526, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.18233618233618235, | |
| "grad_norm": 0.37711272448868094, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6554, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.20512820512820512, | |
| "grad_norm": 0.3202737721386268, | |
| "learning_rate": 5e-06, | |
| "loss": 0.64, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22792022792022792, | |
| "grad_norm": 0.29895290557822196, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6413, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.25071225071225073, | |
| "grad_norm": 0.3339564719104408, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6326, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.27350427350427353, | |
| "grad_norm": 0.3089711327253267, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6408, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2962962962962963, | |
| "grad_norm": 0.2880064692869082, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6417, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3190883190883191, | |
| "grad_norm": 0.3066866041749207, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6439, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3418803418803419, | |
| "grad_norm": 0.3183377069071228, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6364, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3646723646723647, | |
| "grad_norm": 0.30389279648516754, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6415, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.38746438746438744, | |
| "grad_norm": 0.34515965846333546, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6333, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.41025641025641024, | |
| "grad_norm": 0.3010238903973123, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6389, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.43304843304843305, | |
| "grad_norm": 0.305044869326132, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6314, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.45584045584045585, | |
| "grad_norm": 0.30840519078259393, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6395, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.47863247863247865, | |
| "grad_norm": 0.30681357495275924, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6358, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5014245014245015, | |
| "grad_norm": 0.30336186343842153, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6395, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5242165242165242, | |
| "grad_norm": 0.3283645936629147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6351, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5470085470085471, | |
| "grad_norm": 0.3041964089929852, | |
| "learning_rate": 5e-06, | |
| "loss": 0.631, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5698005698005698, | |
| "grad_norm": 0.33949867440584647, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6359, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5925925925925926, | |
| "grad_norm": 0.30499195934646295, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6341, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.31613496109824796, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6313, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6381766381766382, | |
| "grad_norm": 0.31969719335542396, | |
| "learning_rate": 5e-06, | |
| "loss": 0.642, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6609686609686609, | |
| "grad_norm": 0.3186872465072314, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6307, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6837606837606838, | |
| "grad_norm": 0.2888007951280724, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6287, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7065527065527065, | |
| "grad_norm": 0.2960253626480404, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6286, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7293447293447294, | |
| "grad_norm": 0.33915618291310873, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6292, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7521367521367521, | |
| "grad_norm": 0.30116887815816673, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6258, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7749287749287749, | |
| "grad_norm": 0.3333518580221403, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6317, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7977207977207977, | |
| "grad_norm": 0.33224367385448017, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6387, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8205128205128205, | |
| "grad_norm": 0.3181916905648656, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6305, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8433048433048433, | |
| "grad_norm": 0.33030362566649507, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6242, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8660968660968661, | |
| "grad_norm": 0.3162880358649072, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6365, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.3263181921886909, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6351, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9116809116809117, | |
| "grad_norm": 0.2982532466684275, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6383, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9344729344729344, | |
| "grad_norm": 0.2890804672214108, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6346, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9572649572649573, | |
| "grad_norm": 0.300069760789381, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6214, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.98005698005698, | |
| "grad_norm": 0.34145032165166, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6345, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9982905982905983, | |
| "eval_loss": 0.6251269578933716, | |
| "eval_runtime": 442.6816, | |
| "eval_samples_per_second": 26.708, | |
| "eval_steps_per_second": 0.418, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.9982905982905983, | |
| "step": 438, | |
| "total_flos": 918231661412352.0, | |
| "train_loss": 0.6429911312991625, | |
| "train_runtime": 23746.3651, | |
| "train_samples_per_second": 9.459, | |
| "train_steps_per_second": 0.018 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 438, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 918231661412352.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |