| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.2, | |
| "eval_steps": 50, | |
| "global_step": 750, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.05632242560386658, | |
| "learning_rate": 0.00013297872340425532, | |
| "loss": 1.8852, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 1.707169532775879, | |
| "eval_runtime": 27.6118, | |
| "eval_samples_per_second": 3.622, | |
| "eval_steps_per_second": 0.471, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.02606302499771118, | |
| "learning_rate": 0.0002632978723404255, | |
| "loss": 1.6036, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 1.4855879545211792, | |
| "eval_runtime": 27.6396, | |
| "eval_samples_per_second": 3.618, | |
| "eval_steps_per_second": 0.47, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.034023039042949677, | |
| "learning_rate": 0.00039627659574468084, | |
| "loss": 1.5186, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 1.466215968132019, | |
| "eval_runtime": 27.5933, | |
| "eval_samples_per_second": 3.624, | |
| "eval_steps_per_second": 0.471, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.04070857912302017, | |
| "learning_rate": 0.0004967397747480735, | |
| "loss": 1.4822, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 1.4602761268615723, | |
| "eval_runtime": 27.615, | |
| "eval_samples_per_second": 3.621, | |
| "eval_steps_per_second": 0.471, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.03537657856941223, | |
| "learning_rate": 0.00048192056905749855, | |
| "loss": 1.5035, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.4578460454940796, | |
| "eval_runtime": 27.6799, | |
| "eval_samples_per_second": 3.613, | |
| "eval_steps_per_second": 0.47, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.040425803512334824, | |
| "learning_rate": 0.00046710136336692356, | |
| "loss": 1.4813, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 1.4557801485061646, | |
| "eval_runtime": 27.6578, | |
| "eval_samples_per_second": 3.616, | |
| "eval_steps_per_second": 0.47, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.03970955312252045, | |
| "learning_rate": 0.00045228215767634857, | |
| "loss": 1.4878, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 1.4533815383911133, | |
| "eval_runtime": 27.6598, | |
| "eval_samples_per_second": 3.615, | |
| "eval_steps_per_second": 0.47, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.03238854929804802, | |
| "learning_rate": 0.0004374629519857736, | |
| "loss": 1.4765, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 1.4522851705551147, | |
| "eval_runtime": 27.5987, | |
| "eval_samples_per_second": 3.623, | |
| "eval_steps_per_second": 0.471, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.04000236839056015, | |
| "learning_rate": 0.0004226437462951986, | |
| "loss": 1.4803, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 1.4484608173370361, | |
| "eval_runtime": 27.6359, | |
| "eval_samples_per_second": 3.618, | |
| "eval_steps_per_second": 0.47, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.035757772624492645, | |
| "learning_rate": 0.0004078245406046236, | |
| "loss": 1.4925, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.4477622509002686, | |
| "eval_runtime": 27.6496, | |
| "eval_samples_per_second": 3.617, | |
| "eval_steps_per_second": 0.47, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.039886392652988434, | |
| "learning_rate": 0.0003930053349140486, | |
| "loss": 1.49, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 1.4466557502746582, | |
| "eval_runtime": 27.6503, | |
| "eval_samples_per_second": 3.617, | |
| "eval_steps_per_second": 0.47, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.035769980400800705, | |
| "learning_rate": 0.00037818612922347364, | |
| "loss": 1.4888, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 1.4460580348968506, | |
| "eval_runtime": 27.6213, | |
| "eval_samples_per_second": 3.62, | |
| "eval_steps_per_second": 0.471, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.039503153413534164, | |
| "learning_rate": 0.00036336692353289865, | |
| "loss": 1.4732, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_loss": 1.4470006227493286, | |
| "eval_runtime": 27.6084, | |
| "eval_samples_per_second": 3.622, | |
| "eval_steps_per_second": 0.471, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.04102110490202904, | |
| "learning_rate": 0.00034854771784232366, | |
| "loss": 1.4677, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_loss": 1.4475681781768799, | |
| "eval_runtime": 27.6032, | |
| "eval_samples_per_second": 3.623, | |
| "eval_steps_per_second": 0.471, | |
| "step": 700 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 1875, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.641442504933376e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |