{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.0,
  "eval_steps": 200,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_runtime": 17.4243,
      "eval_samples_per_second": 0.287,
      "eval_steps_per_second": 0.057,
      "step": 0
    },
    {
      "epoch": 0.4,
      "grad_norm": 10.4270658493042,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 6.4004,
      "step": 100
    },
    {
      "epoch": 0.8,
      "grad_norm": 9.59022331237793,
      "learning_rate": 1.8968421052631582e-05,
      "loss": 5.1737,
      "step": 200
    },
    {
      "epoch": 0.8,
      "eval_runtime": 5.1058,
      "eval_samples_per_second": 0.979,
      "eval_steps_per_second": 0.196,
      "step": 200
    },
    {
      "epoch": 1.2,
      "grad_norm": 16.000308990478516,
      "learning_rate": 1.7915789473684214e-05,
      "loss": 4.7286,
      "step": 300
    },
    {
      "epoch": 1.6,
      "grad_norm": 12.360644340515137,
      "learning_rate": 1.6863157894736844e-05,
      "loss": 4.442,
      "step": 400
    },
    {
      "epoch": 1.6,
      "eval_runtime": 5.1817,
      "eval_samples_per_second": 0.965,
      "eval_steps_per_second": 0.193,
      "step": 400
    },
    {
      "epoch": 2.0,
      "grad_norm": 16.141651153564453,
      "learning_rate": 1.5810526315789473e-05,
      "loss": 4.3198,
      "step": 500
    },
    {
      "epoch": 2.4,
      "grad_norm": 18.464941024780273,
      "learning_rate": 1.4757894736842106e-05,
      "loss": 3.5096,
      "step": 600
    },
    {
      "epoch": 2.4,
      "eval_runtime": 5.173,
      "eval_samples_per_second": 0.967,
      "eval_steps_per_second": 0.193,
      "step": 600
    },
    {
      "epoch": 2.8,
      "grad_norm": 22.829710006713867,
      "learning_rate": 1.371578947368421e-05,
      "loss": 3.485,
      "step": 700
    },
    {
      "epoch": 3.2,
      "grad_norm": 29.9942569732666,
      "learning_rate": 1.2663157894736843e-05,
      "loss": 3.032,
      "step": 800
    },
    {
      "epoch": 3.2,
      "eval_runtime": 5.362,
      "eval_samples_per_second": 0.932,
      "eval_steps_per_second": 0.186,
      "step": 800
    },
    {
      "epoch": 3.6,
      "grad_norm": 36.433963775634766,
      "learning_rate": 1.1621052631578948e-05,
      "loss": 2.5797,
      "step": 900
    },
    {
      "epoch": 4.0,
      "grad_norm": 24.55065155029297,
      "learning_rate": 1.0568421052631579e-05,
      "loss": 2.5185,
      "step": 1000
    },
    {
      "epoch": 4.0,
      "eval_runtime": 4.999,
      "eval_samples_per_second": 1.0,
      "eval_steps_per_second": 0.2,
      "step": 1000
    },
    {
      "epoch": 4.4,
      "grad_norm": 25.797325134277344,
      "learning_rate": 9.515789473684212e-06,
      "loss": 1.6965,
      "step": 1100
    },
    {
      "epoch": 4.8,
      "grad_norm": 32.363624572753906,
      "learning_rate": 8.463157894736843e-06,
      "loss": 1.7272,
      "step": 1200
    },
    {
      "epoch": 4.8,
      "eval_runtime": 5.5586,
      "eval_samples_per_second": 0.9,
      "eval_steps_per_second": 0.18,
      "step": 1200
    },
    {
      "epoch": 5.2,
      "grad_norm": 36.48984146118164,
      "learning_rate": 7.410526315789475e-06,
      "loss": 1.342,
      "step": 1300
    },
    {
      "epoch": 5.6,
      "grad_norm": 33.30397415161133,
      "learning_rate": 6.357894736842106e-06,
      "loss": 1.0472,
      "step": 1400
    },
    {
      "epoch": 5.6,
      "eval_runtime": 5.6569,
      "eval_samples_per_second": 0.884,
      "eval_steps_per_second": 0.177,
      "step": 1400
    },
    {
      "epoch": 6.0,
      "grad_norm": 29.339319229125977,
      "learning_rate": 5.305263157894738e-06,
      "loss": 1.0126,
      "step": 1500
    },
    {
      "epoch": 6.4,
      "grad_norm": 48.872501373291016,
      "learning_rate": 4.252631578947369e-06,
      "loss": 0.5935,
      "step": 1600
    },
    {
      "epoch": 6.4,
      "eval_runtime": 5.4682,
      "eval_samples_per_second": 0.914,
      "eval_steps_per_second": 0.183,
      "step": 1600
    },
    {
      "epoch": 6.8,
      "grad_norm": 24.40644073486328,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.5881,
      "step": 1700
    },
    {
      "epoch": 7.2,
      "grad_norm": 13.23859691619873,
      "learning_rate": 2.1473684210526317e-06,
      "loss": 0.4895,
      "step": 1800
    },
    {
      "epoch": 7.2,
      "eval_runtime": 5.2891,
      "eval_samples_per_second": 0.945,
      "eval_steps_per_second": 0.189,
      "step": 1800
    },
    {
      "epoch": 7.6,
      "grad_norm": 13.824739456176758,
      "learning_rate": 1.0947368421052632e-06,
      "loss": 0.3585,
      "step": 1900
    },
    {
      "epoch": 8.0,
      "grad_norm": 13.559722900390625,
      "learning_rate": 4.2105263157894737e-08,
      "loss": 0.3556,
      "step": 2000
    },
    {
      "epoch": 8.0,
      "eval_runtime": 5.0911,
      "eval_samples_per_second": 0.982,
      "eval_steps_per_second": 0.196,
      "step": 2000
    },
    {
      "epoch": 8.0,
      "step": 2000,
      "total_flos": 0.0,
      "train_loss": 2.4700030918121336,
      "train_runtime": 33124.2258,
      "train_samples_per_second": 2.414,
      "train_steps_per_second": 0.06
    }
  ],
  "logging_steps": 100,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 20,
  "trial_name": null,
  "trial_params": null
}