{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.88,
  "eval_steps": 500,
  "global_step": 30,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.096,
      "grad_norm": 5.814898920253824,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.8387,
      "step": 1
    },
    {
      "epoch": 0.192,
      "grad_norm": 6.057292675599317,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.8858,
      "step": 2
    },
    {
      "epoch": 0.288,
      "grad_norm": 5.644663837222156,
      "learning_rate": 5e-06,
      "loss": 0.8602,
      "step": 3
    },
    {
      "epoch": 0.384,
      "grad_norm": 4.386875194981092,
      "learning_rate": 4.983095894354858e-06,
      "loss": 0.8439,
      "step": 4
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.4847785324465694,
      "learning_rate": 4.93261217644956e-06,
      "loss": 0.7638,
      "step": 5
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.0150513534004237,
      "learning_rate": 4.849231551964771e-06,
      "loss": 0.7787,
      "step": 6
    },
    {
      "epoch": 0.672,
      "grad_norm": 4.310739824230349,
      "learning_rate": 4.734081600808531e-06,
      "loss": 0.7736,
      "step": 7
    },
    {
      "epoch": 0.768,
      "grad_norm": 4.593835357339227,
      "learning_rate": 4.588719528532342e-06,
      "loss": 0.794,
      "step": 8
    },
    {
      "epoch": 0.864,
      "grad_norm": 4.297514689606173,
      "learning_rate": 4.415111107797445e-06,
      "loss": 0.7967,
      "step": 9
    },
    {
      "epoch": 0.96,
      "grad_norm": 3.5868480239959792,
      "learning_rate": 4.215604094671835e-06,
      "loss": 0.7584,
      "step": 10
    },
    {
      "epoch": 1.056,
      "grad_norm": 4.7658908684279835,
      "learning_rate": 3.992896479256966e-06,
      "loss": 1.0823,
      "step": 11
    },
    {
      "epoch": 1.152,
      "grad_norm": 2.4559603231588087,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.6939,
      "step": 12
    },
    {
      "epoch": 1.248,
      "grad_norm": 1.972442848963337,
      "learning_rate": 3.4901994150978926e-06,
      "loss": 0.7301,
      "step": 13
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 1.3858986042175938,
      "learning_rate": 3.217008081777726e-06,
      "loss": 0.7026,
      "step": 14
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.1730441100334783,
      "learning_rate": 2.9341204441673267e-06,
      "loss": 0.6786,
      "step": 15
    },
    {
      "epoch": 1.536,
      "grad_norm": 1.119242637007583,
      "learning_rate": 2.6453620722761897e-06,
      "loss": 0.6457,
      "step": 16
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 1.242070894051599,
      "learning_rate": 2.3546379277238107e-06,
      "loss": 0.6852,
      "step": 17
    },
    {
      "epoch": 1.728,
      "grad_norm": 1.0986457895290331,
      "learning_rate": 2.0658795558326745e-06,
      "loss": 0.6334,
      "step": 18
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 0.9075839495380478,
      "learning_rate": 1.7829919182222752e-06,
      "loss": 0.679,
      "step": 19
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.7221290672453798,
      "learning_rate": 1.509800584902108e-06,
      "loss": 0.6066,
      "step": 20
    },
    {
      "epoch": 2.016,
      "grad_norm": 1.091018170812889,
      "learning_rate": 1.2500000000000007e-06,
      "loss": 1.0308,
      "step": 21
    },
    {
      "epoch": 2.112,
      "grad_norm": 0.7565398447709,
      "learning_rate": 1.0071035207430352e-06,
      "loss": 0.6308,
      "step": 22
    },
    {
      "epoch": 2.208,
      "grad_norm": 0.6252631187849726,
      "learning_rate": 7.843959053281663e-07,
      "loss": 0.6039,
      "step": 23
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.6592522345972539,
      "learning_rate": 5.848888922025553e-07,
      "loss": 0.6632,
      "step": 24
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.6078000962674466,
      "learning_rate": 4.1128047146765936e-07,
      "loss": 0.5926,
      "step": 25
    },
    {
      "epoch": 2.496,
      "grad_norm": 0.643894665051632,
      "learning_rate": 2.6591839919146963e-07,
      "loss": 0.6525,
      "step": 26
    },
    {
      "epoch": 2.592,
      "grad_norm": 0.5811315948755351,
      "learning_rate": 1.507684480352292e-07,
      "loss": 0.6727,
      "step": 27
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 0.574044192546292,
      "learning_rate": 6.738782355044048e-08,
      "loss": 0.6145,
      "step": 28
    },
    {
      "epoch": 2.784,
      "grad_norm": 0.538050743523217,
      "learning_rate": 1.6904105645142443e-08,
      "loss": 0.5905,
      "step": 29
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.5778834721689959,
      "learning_rate": 0.0,
      "loss": 0.6567,
      "step": 30
    },
    {
      "epoch": 2.88,
      "step": 30,
      "total_flos": 5.991907970357658e+16,
      "train_loss": 0.7313097993532817,
      "train_runtime": 1870.0735,
      "train_samples_per_second": 1.598,
      "train_steps_per_second": 0.016
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 30,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.991907970357658e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}