{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08403361344537816,
      "grad_norm": 0.5671328902244568,
      "learning_rate": 8e-06,
      "loss": 1.9146,
      "step": 5
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 0.6045827269554138,
      "learning_rate": 1.8e-05,
      "loss": 1.8831,
      "step": 10
    },
    {
      "epoch": 0.25210084033613445,
      "grad_norm": 0.4676426351070404,
      "learning_rate": 2.8e-05,
      "loss": 1.8656,
      "step": 15
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 0.6086534857749939,
      "learning_rate": 2.998542122917149e-05,
      "loss": 1.7756,
      "step": 20
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 0.5000975131988525,
      "learning_rate": 2.9926243538175172e-05,
      "loss": 1.7644,
      "step": 25
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 0.4531285762786865,
      "learning_rate": 2.9821735336128774e-05,
      "loss": 1.7086,
      "step": 30
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.503158688545227,
      "learning_rate": 2.9672214011007087e-05,
      "loss": 1.6228,
      "step": 35
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 0.5503730177879333,
      "learning_rate": 2.947813365416023e-05,
      "loss": 1.541,
      "step": 40
    },
    {
      "epoch": 0.7563025210084033,
      "grad_norm": 0.5910710692405701,
      "learning_rate": 2.9240083681253192e-05,
      "loss": 1.5533,
      "step": 45
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 0.5865580439567566,
      "learning_rate": 2.895878704222978e-05,
      "loss": 1.5369,
      "step": 50
    },
    {
      "epoch": 0.9243697478991597,
      "grad_norm": 0.694772481918335,
      "learning_rate": 2.863509802573744e-05,
      "loss": 1.4103,
      "step": 55
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.0787880420684814,
      "learning_rate": 2.826999966468069e-05,
      "loss": 1.4139,
      "step": 60
    },
    {
      "epoch": 1.084033613445378,
      "grad_norm": 0.8692999482154846,
      "learning_rate": 2.7864600750782507e-05,
      "loss": 1.2387,
      "step": 65
    },
    {
      "epoch": 1.1680672268907564,
      "grad_norm": 0.8634143471717834,
      "learning_rate": 2.74201324672203e-05,
      "loss": 1.23,
      "step": 70
    },
    {
      "epoch": 1.2521008403361344,
      "grad_norm": 1.0803030729293823,
      "learning_rate": 2.6937944649563078e-05,
      "loss": 1.2203,
      "step": 75
    },
    {
      "epoch": 1.3361344537815127,
      "grad_norm": 1.4019559621810913,
      "learning_rate": 2.641950168636517e-05,
      "loss": 1.1067,
      "step": 80
    },
    {
      "epoch": 1.4201680672268908,
      "grad_norm": 1.0336651802062988,
      "learning_rate": 2.5866378071866338e-05,
      "loss": 1.0889,
      "step": 85
    },
    {
      "epoch": 1.504201680672269,
      "grad_norm": 1.1765518188476562,
      "learning_rate": 2.52802536243045e-05,
      "loss": 1.05,
      "step": 90
    },
    {
      "epoch": 1.5882352941176472,
      "grad_norm": 1.146044135093689,
      "learning_rate": 2.4662908384362964e-05,
      "loss": 0.9963,
      "step": 95
    },
    {
      "epoch": 1.6722689075630253,
      "grad_norm": 1.2770051956176758,
      "learning_rate": 2.4016217209245377e-05,
      "loss": 0.9403,
      "step": 100
    },
    {
      "epoch": 1.7563025210084033,
      "grad_norm": 1.9435651302337646,
      "learning_rate": 2.3342144078796007e-05,
      "loss": 0.9242,
      "step": 105
    },
    {
      "epoch": 1.8403361344537816,
      "grad_norm": 1.3785291910171509,
      "learning_rate": 2.2642736130957522e-05,
      "loss": 0.8651,
      "step": 110
    },
    {
      "epoch": 1.9243697478991597,
      "grad_norm": 1.5004098415374756,
      "learning_rate": 2.1920117444680317e-05,
      "loss": 0.8922,
      "step": 115
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.874428153038025,
      "learning_rate": 2.1176482589164575e-05,
      "loss": 0.8208,
      "step": 120
    },
    {
      "epoch": 2.0840336134453783,
      "grad_norm": 1.3628363609313965,
      "learning_rate": 2.0414089959025724e-05,
      "loss": 0.7341,
      "step": 125
    },
    {
      "epoch": 2.168067226890756,
      "grad_norm": 2.062274694442749,
      "learning_rate": 1.963525491562421e-05,
      "loss": 0.7337,
      "step": 130
    },
    {
      "epoch": 2.2521008403361344,
      "grad_norm": 1.80815589427948,
      "learning_rate": 1.8842342755389172e-05,
      "loss": 0.6989,
      "step": 135
    },
    {
      "epoch": 2.3361344537815127,
      "grad_norm": 1.5047976970672607,
      "learning_rate": 1.803776152649088e-05,
      "loss": 0.6134,
      "step": 140
    },
    {
      "epoch": 2.4201680672268906,
      "grad_norm": 1.5067673921585083,
      "learning_rate": 1.722395471567763e-05,
      "loss": 0.6403,
      "step": 145
    },
    {
      "epoch": 2.504201680672269,
      "grad_norm": 1.7410961389541626,
      "learning_rate": 1.6403393827486768e-05,
      "loss": 0.659,
      "step": 150
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 1.5989471673965454,
      "learning_rate": 1.5578570878366656e-05,
      "loss": 0.6186,
      "step": 155
    },
    {
      "epoch": 2.6722689075630255,
      "grad_norm": 1.7921215295791626,
      "learning_rate": 1.4751990828504623e-05,
      "loss": 0.6041,
      "step": 160
    },
    {
      "epoch": 2.7563025210084033,
      "grad_norm": 1.5869600772857666,
      "learning_rate": 1.3926163974345199e-05,
      "loss": 0.5408,
      "step": 165
    },
    {
      "epoch": 2.8403361344537816,
      "grad_norm": 1.6724202632904053,
      "learning_rate": 1.3103598324902307e-05,
      "loss": 0.5608,
      "step": 170
    },
    {
      "epoch": 2.92436974789916,
      "grad_norm": 1.8233758211135864,
      "learning_rate": 1.2286791985018356e-05,
      "loss": 0.5338,
      "step": 175
    },
    {
      "epoch": 3.0,
      "grad_norm": 2.5403754711151123,
      "learning_rate": 1.1478225568701888e-05,
      "loss": 0.5532,
      "step": 180
    }
  ],
  "logging_steps": 5,
  "max_steps": 300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.8358614913581056e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}