{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 292,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017123287671232876,
      "grad_norm": 1.0587148666381836,
      "learning_rate": 1.6438356164383561e-06,
      "loss": 1.2908,
      "step": 5
    },
    {
      "epoch": 0.03424657534246575,
      "grad_norm": 0.957391619682312,
      "learning_rate": 3.6986301369863014e-06,
      "loss": 1.408,
      "step": 10
    },
    {
      "epoch": 0.05136986301369863,
      "grad_norm": 0.8232783675193787,
      "learning_rate": 5.753424657534246e-06,
      "loss": 1.2972,
      "step": 15
    },
    {
      "epoch": 0.0684931506849315,
      "grad_norm": 0.707105278968811,
      "learning_rate": 7.808219178082192e-06,
      "loss": 1.2907,
      "step": 20
    },
    {
      "epoch": 0.08561643835616438,
      "grad_norm": 0.5033673048019409,
      "learning_rate": 9.863013698630136e-06,
      "loss": 1.28,
      "step": 25
    },
    {
      "epoch": 0.10273972602739725,
      "grad_norm": 0.51893150806427,
      "learning_rate": 1.1917808219178083e-05,
      "loss": 1.2497,
      "step": 30
    },
    {
      "epoch": 0.11986301369863013,
      "grad_norm": 0.5541372299194336,
      "learning_rate": 1.3972602739726027e-05,
      "loss": 1.2018,
      "step": 35
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 0.4447920620441437,
      "learning_rate": 1.6027397260273974e-05,
      "loss": 1.218,
      "step": 40
    },
    {
      "epoch": 0.1541095890410959,
      "grad_norm": 0.5761701464653015,
      "learning_rate": 1.8082191780821916e-05,
      "loss": 1.2215,
      "step": 45
    },
    {
      "epoch": 0.17123287671232876,
      "grad_norm": 0.46446430683135986,
      "learning_rate": 2.0136986301369863e-05,
      "loss": 1.1981,
      "step": 50
    },
    {
      "epoch": 0.18835616438356165,
      "grad_norm": 0.4923893213272095,
      "learning_rate": 2.219178082191781e-05,
      "loss": 1.2212,
      "step": 55
    },
    {
      "epoch": 0.2054794520547945,
      "grad_norm": 0.4145517945289612,
      "learning_rate": 2.4246575342465755e-05,
      "loss": 1.1524,
      "step": 60
    },
    {
      "epoch": 0.2226027397260274,
      "grad_norm": 0.5622988939285278,
      "learning_rate": 2.6301369863013698e-05,
      "loss": 1.15,
      "step": 65
    },
    {
      "epoch": 0.23972602739726026,
      "grad_norm": 0.45440879464149475,
      "learning_rate": 2.8356164383561644e-05,
      "loss": 1.1336,
      "step": 70
    },
    {
      "epoch": 0.2568493150684932,
      "grad_norm": 0.5431708693504333,
      "learning_rate": 2.999996152240661e-05,
      "loss": 1.1332,
      "step": 75
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.5097510814666748,
      "learning_rate": 2.9998614827365136e-05,
      "loss": 1.0534,
      "step": 80
    },
    {
      "epoch": 0.2910958904109589,
      "grad_norm": 0.523253858089447,
      "learning_rate": 2.999534445005289e-05,
      "loss": 1.0262,
      "step": 85
    },
    {
      "epoch": 0.3082191780821918,
      "grad_norm": 0.5461484789848328,
      "learning_rate": 2.9990150809919714e-05,
      "loss": 1.0322,
      "step": 90
    },
    {
      "epoch": 0.3253424657534247,
      "grad_norm": 0.5159561634063721,
      "learning_rate": 2.998303457308803e-05,
      "loss": 1.0268,
      "step": 95
    },
    {
      "epoch": 0.3424657534246575,
      "grad_norm": 0.5679484009742737,
      "learning_rate": 2.997399665226736e-05,
      "loss": 1.0459,
      "step": 100
    },
    {
      "epoch": 0.3595890410958904,
      "grad_norm": 0.659304678440094,
      "learning_rate": 2.9963038206637277e-05,
      "loss": 1.0858,
      "step": 105
    },
    {
      "epoch": 0.3767123287671233,
      "grad_norm": 0.5805976390838623,
      "learning_rate": 2.9950160641698755e-05,
      "loss": 1.0286,
      "step": 110
    },
    {
      "epoch": 0.3938356164383562,
      "grad_norm": 0.6749956011772156,
      "learning_rate": 2.993536560909387e-05,
      "loss": 0.9836,
      "step": 115
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 0.6401397585868835,
      "learning_rate": 2.991865500639398e-05,
      "loss": 0.996,
      "step": 120
    },
    {
      "epoch": 0.4280821917808219,
      "grad_norm": 0.6153554916381836,
      "learning_rate": 2.990003097685634e-05,
      "loss": 0.9677,
      "step": 125
    },
    {
      "epoch": 0.4452054794520548,
      "grad_norm": 0.6153773069381714,
      "learning_rate": 2.987949590914923e-05,
      "loss": 0.9196,
      "step": 130
    },
    {
      "epoch": 0.4623287671232877,
      "grad_norm": 0.6787427067756653,
      "learning_rate": 2.985705243704559e-05,
      "loss": 0.9613,
      "step": 135
    },
    {
      "epoch": 0.4794520547945205,
      "grad_norm": 0.6827302575111389,
      "learning_rate": 2.9832703439085174e-05,
      "loss": 0.9559,
      "step": 140
    },
    {
      "epoch": 0.4965753424657534,
      "grad_norm": 0.7343211770057678,
      "learning_rate": 2.9806452038205437e-05,
      "loss": 0.9245,
      "step": 145
    },
    {
      "epoch": 0.5136986301369864,
      "grad_norm": 0.7253044247627258,
      "learning_rate": 2.977830160134091e-05,
      "loss": 0.9052,
      "step": 150
    },
    {
      "epoch": 0.5308219178082192,
      "grad_norm": 0.7713239789009094,
      "learning_rate": 2.974825573899144e-05,
      "loss": 0.9172,
      "step": 155
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.7671046257019043,
      "learning_rate": 2.9716318304759057e-05,
      "loss": 0.9238,
      "step": 160
    },
    {
      "epoch": 0.565068493150685,
      "grad_norm": 0.8231728076934814,
      "learning_rate": 2.9682493394853763e-05,
      "loss": 0.9248,
      "step": 165
    },
    {
      "epoch": 0.5821917808219178,
      "grad_norm": 0.8311623930931091,
      "learning_rate": 2.9646785347568143e-05,
      "loss": 0.8765,
      "step": 170
    },
    {
      "epoch": 0.5993150684931506,
      "grad_norm": 0.7531796097755432,
      "learning_rate": 2.9609198742720957e-05,
      "loss": 0.8939,
      "step": 175
    },
    {
      "epoch": 0.6164383561643836,
      "grad_norm": 1.0523626804351807,
      "learning_rate": 2.9569738401069728e-05,
      "loss": 0.8488,
      "step": 180
    },
    {
      "epoch": 0.6335616438356164,
      "grad_norm": 0.9050348401069641,
      "learning_rate": 2.9528409383692465e-05,
      "loss": 0.8467,
      "step": 185
    },
    {
      "epoch": 0.6506849315068494,
      "grad_norm": 0.8227254748344421,
      "learning_rate": 2.948521699133853e-05,
      "loss": 0.83,
      "step": 190
    },
    {
      "epoch": 0.6678082191780822,
      "grad_norm": 0.8400980830192566,
      "learning_rate": 2.9440166763748782e-05,
      "loss": 0.7697,
      "step": 195
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 0.854492723941803,
      "learning_rate": 2.9393264478945073e-05,
      "loss": 0.8413,
      "step": 200
    },
    {
      "epoch": 0.702054794520548,
      "grad_norm": 0.9226430654525757,
      "learning_rate": 2.934451615248915e-05,
      "loss": 0.8085,
      "step": 205
    },
    {
      "epoch": 0.7191780821917808,
      "grad_norm": 0.8370131850242615,
      "learning_rate": 2.929392803671114e-05,
      "loss": 0.8028,
      "step": 210
    },
    {
      "epoch": 0.7363013698630136,
      "grad_norm": 0.927441418170929,
      "learning_rate": 2.9241506619907636e-05,
      "loss": 0.8509,
      "step": 215
    },
    {
      "epoch": 0.7534246575342466,
      "grad_norm": 0.9028705358505249,
      "learning_rate": 2.9187258625509518e-05,
      "loss": 0.7967,
      "step": 220
    },
    {
      "epoch": 0.7705479452054794,
      "grad_norm": 0.8795896768569946,
      "learning_rate": 2.9131191011219634e-05,
      "loss": 0.7865,
      "step": 225
    },
    {
      "epoch": 0.7876712328767124,
      "grad_norm": 0.8585197329521179,
      "learning_rate": 2.907331096812041e-05,
      "loss": 0.764,
      "step": 230
    },
    {
      "epoch": 0.8047945205479452,
      "grad_norm": 0.8900719881057739,
      "learning_rate": 2.9013625919751557e-05,
      "loss": 0.8205,
      "step": 235
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.9867483377456665,
      "learning_rate": 2.8952143521157933e-05,
      "loss": 0.7868,
      "step": 240
    },
    {
      "epoch": 0.839041095890411,
      "grad_norm": 1.04231595993042,
      "learning_rate": 2.888887165790775e-05,
      "loss": 0.7418,
      "step": 245
    },
    {
      "epoch": 0.8561643835616438,
      "grad_norm": 0.9595353007316589,
      "learning_rate": 2.8823818445081152e-05,
      "loss": 0.7532,
      "step": 250
    },
    {
      "epoch": 0.8732876712328768,
      "grad_norm": 1.2640131711959839,
      "learning_rate": 2.8756992226229443e-05,
      "loss": 0.6791,
      "step": 255
    },
    {
      "epoch": 0.8904109589041096,
      "grad_norm": 1.0312385559082031,
      "learning_rate": 2.8688401572304927e-05,
      "loss": 0.7609,
      "step": 260
    },
    {
      "epoch": 0.9075342465753424,
      "grad_norm": 0.9013313055038452,
      "learning_rate": 2.8618055280561656e-05,
      "loss": 0.7723,
      "step": 265
    },
    {
      "epoch": 0.9246575342465754,
      "grad_norm": 1.1069859266281128,
      "learning_rate": 2.854596237342708e-05,
      "loss": 0.6889,
      "step": 270
    },
    {
      "epoch": 0.9417808219178082,
      "grad_norm": 1.0332902669906616,
      "learning_rate": 2.8472132097344877e-05,
      "loss": 0.7521,
      "step": 275
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 1.0214784145355225,
      "learning_rate": 2.839657392158904e-05,
      "loss": 0.6667,
      "step": 280
    },
    {
      "epoch": 0.976027397260274,
      "grad_norm": 1.0733942985534668,
      "learning_rate": 2.8319297537049338e-05,
      "loss": 0.6784,
      "step": 285
    },
    {
      "epoch": 0.9931506849315068,
      "grad_norm": 0.9576444029808044,
      "learning_rate": 2.8240312854988424e-05,
      "loss": 0.7012,
      "step": 290
    }
  ],
  "logging_steps": 5,
  "max_steps": 1460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.2721250868881e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}