{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016025641025641024,
      "grad_norm": 1.0291906595230103,
      "learning_rate": 1.5384615384615385e-06,
      "loss": 1.3399,
      "step": 5
    },
    {
      "epoch": 0.03205128205128205,
      "grad_norm": 1.1106044054031372,
      "learning_rate": 3.4615384615384617e-06,
      "loss": 1.3951,
      "step": 10
    },
    {
      "epoch": 0.04807692307692308,
      "grad_norm": 0.6151779890060425,
      "learning_rate": 5.384615384615385e-06,
      "loss": 1.3654,
      "step": 15
    },
    {
      "epoch": 0.0641025641025641,
      "grad_norm": 0.6062663793563843,
      "learning_rate": 7.307692307692308e-06,
      "loss": 1.3207,
      "step": 20
    },
    {
      "epoch": 0.08012820512820513,
      "grad_norm": 0.5593442320823669,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.3146,
      "step": 25
    },
    {
      "epoch": 0.09615384615384616,
      "grad_norm": 0.5808208584785461,
      "learning_rate": 1.1153846153846154e-05,
      "loss": 1.3007,
      "step": 30
    },
    {
      "epoch": 0.11217948717948718,
      "grad_norm": 0.600754976272583,
      "learning_rate": 1.3076923076923078e-05,
      "loss": 1.2647,
      "step": 35
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 0.4767054617404938,
      "learning_rate": 1.5e-05,
      "loss": 1.276,
      "step": 40
    },
    {
      "epoch": 0.14423076923076922,
      "grad_norm": 0.4327481687068939,
      "learning_rate": 1.6923076923076924e-05,
      "loss": 1.2434,
      "step": 45
    },
    {
      "epoch": 0.16025641025641027,
      "grad_norm": 0.4272952079772949,
      "learning_rate": 1.8846153846153846e-05,
      "loss": 1.2425,
      "step": 50
    },
    {
      "epoch": 0.1762820512820513,
      "grad_norm": 0.4927898943424225,
      "learning_rate": 2.076923076923077e-05,
      "loss": 1.233,
      "step": 55
    },
    {
      "epoch": 0.19230769230769232,
      "grad_norm": 0.39667901396751404,
      "learning_rate": 2.269230769230769e-05,
      "loss": 1.2061,
      "step": 60
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 0.5480871200561523,
      "learning_rate": 2.4615384615384616e-05,
      "loss": 1.2113,
      "step": 65
    },
    {
      "epoch": 0.22435897435897437,
      "grad_norm": 0.4961429536342621,
      "learning_rate": 2.6538461538461538e-05,
      "loss": 1.1851,
      "step": 70
    },
    {
      "epoch": 0.2403846153846154,
      "grad_norm": 0.47594135999679565,
      "learning_rate": 2.846153846153846e-05,
      "loss": 1.2022,
      "step": 75
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.5065383911132812,
      "learning_rate": 2.9999966297319848e-05,
      "loss": 1.1301,
      "step": 80
    },
    {
      "epoch": 0.2724358974358974,
      "grad_norm": 0.594122052192688,
      "learning_rate": 2.9998786719416534e-05,
      "loss": 1.1598,
      "step": 85
    },
    {
      "epoch": 0.28846153846153844,
      "grad_norm": 0.6093367338180542,
      "learning_rate": 2.9995922158951827e-05,
      "loss": 1.1405,
      "step": 90
    },
    {
      "epoch": 0.30448717948717946,
      "grad_norm": 0.5758622288703918,
      "learning_rate": 2.9991372937734057e-05,
      "loss": 1.1319,
      "step": 95
    },
    {
      "epoch": 0.32051282051282054,
      "grad_norm": 0.5893705487251282,
      "learning_rate": 2.9985139566828457e-05,
      "loss": 1.1455,
      "step": 100
    },
    {
      "epoch": 0.33653846153846156,
      "grad_norm": 0.541392982006073,
      "learning_rate": 2.997722274649974e-05,
      "loss": 1.1353,
      "step": 105
    },
    {
      "epoch": 0.3525641025641026,
      "grad_norm": 0.5347727537155151,
      "learning_rate": 2.9967623366133475e-05,
      "loss": 1.0905,
      "step": 110
    },
    {
      "epoch": 0.3685897435897436,
      "grad_norm": 0.6084627509117126,
      "learning_rate": 2.995634250413612e-05,
      "loss": 1.1164,
      "step": 115
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.6210500597953796,
      "learning_rate": 2.9943381427813882e-05,
      "loss": 1.0631,
      "step": 120
    },
    {
      "epoch": 0.40064102564102566,
      "grad_norm": 0.618479311466217,
      "learning_rate": 2.9928741593230393e-05,
      "loss": 1.0156,
      "step": 125
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.6391302943229675,
      "learning_rate": 2.9912424645043064e-05,
      "loss": 1.036,
      "step": 130
    },
    {
      "epoch": 0.4326923076923077,
      "grad_norm": 0.7931795120239258,
      "learning_rate": 2.989443241631839e-05,
      "loss": 1.0432,
      "step": 135
    },
    {
      "epoch": 0.44871794871794873,
      "grad_norm": 0.7722377181053162,
      "learning_rate": 2.987476692832596e-05,
      "loss": 1.0204,
      "step": 140
    },
    {
      "epoch": 0.46474358974358976,
      "grad_norm": 0.6615847945213318,
      "learning_rate": 2.9853430390311434e-05,
      "loss": 1.0592,
      "step": 145
    },
    {
      "epoch": 0.4807692307692308,
      "grad_norm": 0.7195934057235718,
      "learning_rate": 2.983042519924831e-05,
      "loss": 1.0199,
      "step": 150
    },
    {
      "epoch": 0.4967948717948718,
      "grad_norm": 0.731874942779541,
      "learning_rate": 2.980575393956869e-05,
      "loss": 1.003,
      "step": 155
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.7320418953895569,
      "learning_rate": 2.977941938287292e-05,
      "loss": 0.9789,
      "step": 160
    },
    {
      "epoch": 0.5288461538461539,
      "grad_norm": 0.6661425828933716,
      "learning_rate": 2.9751424487618196e-05,
      "loss": 0.9349,
      "step": 165
    },
    {
      "epoch": 0.5448717948717948,
      "grad_norm": 0.8029599189758301,
      "learning_rate": 2.972177239878627e-05,
      "loss": 0.9327,
      "step": 170
    },
    {
      "epoch": 0.5608974358974359,
      "grad_norm": 0.8484524488449097,
      "learning_rate": 2.969046644753008e-05,
      "loss": 0.9744,
      "step": 175
    },
    {
      "epoch": 0.5769230769230769,
      "grad_norm": 0.7641316652297974,
      "learning_rate": 2.965751015079957e-05,
      "loss": 0.9159,
      "step": 180
    },
    {
      "epoch": 0.592948717948718,
      "grad_norm": 0.7425558567047119,
      "learning_rate": 2.962290721094655e-05,
      "loss": 0.9187,
      "step": 185
    },
    {
      "epoch": 0.6089743589743589,
      "grad_norm": 0.8162761926651001,
      "learning_rate": 2.9586661515308793e-05,
      "loss": 0.9121,
      "step": 190
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.9251567125320435,
      "learning_rate": 2.9548777135773338e-05,
      "loss": 0.8566,
      "step": 195
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 0.8238964676856995,
      "learning_rate": 2.950925832831901e-05,
      "loss": 0.8903,
      "step": 200
    },
    {
      "epoch": 0.657051282051282,
      "grad_norm": 0.8461718559265137,
      "learning_rate": 2.9468109532538346e-05,
      "loss": 0.8331,
      "step": 205
    },
    {
      "epoch": 0.6730769230769231,
      "grad_norm": 1.0172253847122192,
      "learning_rate": 2.9425335371138802e-05,
      "loss": 0.8476,
      "step": 210
    },
    {
      "epoch": 0.6891025641025641,
      "grad_norm": 0.8224555253982544,
      "learning_rate": 2.9380940649423462e-05,
      "loss": 0.8956,
      "step": 215
    },
    {
      "epoch": 0.7051282051282052,
      "grad_norm": 0.9176145195960999,
      "learning_rate": 2.933493035475119e-05,
      "loss": 0.834,
      "step": 220
    },
    {
      "epoch": 0.7211538461538461,
      "grad_norm": 0.8288151025772095,
      "learning_rate": 2.928730965597635e-05,
      "loss": 0.8746,
      "step": 225
    },
    {
      "epoch": 0.7371794871794872,
      "grad_norm": 1.015427589416504,
      "learning_rate": 2.9238083902868123e-05,
      "loss": 0.8229,
      "step": 230
    },
    {
      "epoch": 0.7532051282051282,
      "grad_norm": 0.8535773158073425,
      "learning_rate": 2.9187258625509518e-05,
      "loss": 0.8432,
      "step": 235
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.8888025283813477,
      "learning_rate": 2.9134839533676115e-05,
      "loss": 0.7881,
      "step": 240
    },
    {
      "epoch": 0.7852564102564102,
      "grad_norm": 0.9770839810371399,
      "learning_rate": 2.9080832516194596e-05,
      "loss": 0.7904,
      "step": 245
    },
    {
      "epoch": 0.8012820512820513,
      "grad_norm": 1.187432050704956,
      "learning_rate": 2.9025243640281226e-05,
      "loss": 0.7799,
      "step": 250
    },
    {
      "epoch": 0.8173076923076923,
      "grad_norm": 0.9135069847106934,
      "learning_rate": 2.8968079150860228e-05,
      "loss": 0.7853,
      "step": 255
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 1.066171407699585,
      "learning_rate": 2.8909345469862228e-05,
      "loss": 0.7577,
      "step": 260
    },
    {
      "epoch": 0.8493589743589743,
      "grad_norm": 0.9215983152389526,
      "learning_rate": 2.8849049195502812e-05,
      "loss": 0.8291,
      "step": 265
    },
    {
      "epoch": 0.8653846153846154,
      "grad_norm": 0.940805196762085,
      "learning_rate": 2.8787197101541266e-05,
      "loss": 0.7549,
      "step": 270
    },
    {
      "epoch": 0.8814102564102564,
      "grad_norm": 0.9688215851783752,
      "learning_rate": 2.8723796136519604e-05,
      "loss": 0.7478,
      "step": 275
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 0.9502094984054565,
      "learning_rate": 2.8658853422981964e-05,
      "loss": 0.7425,
      "step": 280
    },
    {
      "epoch": 0.9134615384615384,
      "grad_norm": 1.0309516191482544,
      "learning_rate": 2.8592376256674455e-05,
      "loss": 0.7053,
      "step": 285
    },
    {
      "epoch": 0.9294871794871795,
      "grad_norm": 1.0961329936981201,
      "learning_rate": 2.852437210572553e-05,
      "loss": 0.721,
      "step": 290
    },
    {
      "epoch": 0.9455128205128205,
      "grad_norm": 1.02060067653656,
      "learning_rate": 2.845484860980703e-05,
      "loss": 0.6888,
      "step": 295
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.9542692303657532,
      "learning_rate": 2.8383813579275912e-05,
      "loss": 0.6602,
      "step": 300
    },
    {
      "epoch": 0.9775641025641025,
      "grad_norm": 1.0014078617095947,
      "learning_rate": 2.8311274994296835e-05,
      "loss": 0.7161,
      "step": 305
    },
    {
      "epoch": 0.9935897435897436,
      "grad_norm": 0.9263032078742981,
      "learning_rate": 2.823724100394565e-05,
      "loss": 0.7149,
      "step": 310
    }
  ],
  "logging_steps": 5,
  "max_steps": 1560,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.397092298219848e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}