| { | |
| "best_metric": 1.9192386865615845, | |
| "best_model_checkpoint": "experiments/checkpoint-500", | |
| "epoch": 100.0, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "learning_rate": 1.8e-06, | |
| "loss": 1.7688, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "learning_rate": 4.8e-06, | |
| "loss": 1.7684, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "learning_rate": 7.799999999999998e-06, | |
| "loss": 1.7564, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "learning_rate": 1.0799999999999998e-05, | |
| "loss": 1.7424, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "learning_rate": 1.3799999999999998e-05, | |
| "loss": 1.727, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "learning_rate": 1.68e-05, | |
| "loss": 1.7135, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "learning_rate": 1.98e-05, | |
| "loss": 1.701, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "learning_rate": 2.28e-05, | |
| "loss": 1.6797, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "learning_rate": 2.5799999999999997e-05, | |
| "loss": 1.6547, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "learning_rate": 2.88e-05, | |
| "loss": 1.6245, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.5684312582015991, | |
| "eval_runtime": 1.4166, | |
| "eval_samples_per_second": 7.059, | |
| "eval_steps_per_second": 1.412, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "learning_rate": 3.1799999999999994e-05, | |
| "loss": 1.5841, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "learning_rate": 3.48e-05, | |
| "loss": 1.5316, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "learning_rate": 3.78e-05, | |
| "loss": 1.4644, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "learning_rate": 4.08e-05, | |
| "loss": 1.3728, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "learning_rate": 4.3799999999999994e-05, | |
| "loss": 1.2692, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "learning_rate": 4.56e-05, | |
| "loss": 1.1998, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "learning_rate": 4.8599999999999995e-05, | |
| "loss": 1.1159, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "learning_rate": 5.1599999999999994e-05, | |
| "loss": 1.0442, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "learning_rate": 5.459999999999999e-05, | |
| "loss": 0.9944, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "learning_rate": 5.76e-05, | |
| "loss": 0.9518, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.9818406105041504, | |
| "eval_runtime": 1.3835, | |
| "eval_samples_per_second": 7.228, | |
| "eval_steps_per_second": 1.446, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "learning_rate": 6.0599999999999996e-05, | |
| "loss": 0.908, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "learning_rate": 6.359999999999999e-05, | |
| "loss": 0.8678, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "learning_rate": 6.659999999999999e-05, | |
| "loss": 0.8303, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "learning_rate": 6.96e-05, | |
| "loss": 0.7928, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "learning_rate": 7.259999999999999e-05, | |
| "loss": 0.7594, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "learning_rate": 7.56e-05, | |
| "loss": 0.73, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "learning_rate": 7.86e-05, | |
| "loss": 0.7034, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "learning_rate": 8.16e-05, | |
| "loss": 0.6777, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "learning_rate": 8.459999999999998e-05, | |
| "loss": 0.6493, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "learning_rate": 8.759999999999999e-05, | |
| "loss": 0.6249, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.8599645495414734, | |
| "eval_runtime": 1.4099, | |
| "eval_samples_per_second": 7.093, | |
| "eval_steps_per_second": 1.419, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "learning_rate": 9.059999999999999e-05, | |
| "loss": 0.6007, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "learning_rate": 9.36e-05, | |
| "loss": 0.5716, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "learning_rate": 9.659999999999999e-05, | |
| "loss": 0.5465, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "learning_rate": 9.96e-05, | |
| "loss": 0.5191, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "learning_rate": 0.0001026, | |
| "loss": 0.4947, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "learning_rate": 0.00010559999999999998, | |
| "loss": 0.4681, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "learning_rate": 0.00010859999999999998, | |
| "loss": 0.4417, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "learning_rate": 0.00011159999999999999, | |
| "loss": 0.4116, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "learning_rate": 0.0001146, | |
| "loss": 0.3804, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "learning_rate": 0.0001176, | |
| "loss": 0.3544, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 1.1081293821334839, | |
| "eval_runtime": 1.3764, | |
| "eval_samples_per_second": 7.265, | |
| "eval_steps_per_second": 1.453, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "learning_rate": 0.00012059999999999999, | |
| "loss": 0.3248, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "learning_rate": 0.0001236, | |
| "loss": 0.2931, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "learning_rate": 0.0001266, | |
| "loss": 0.2677, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "learning_rate": 0.00012959999999999998, | |
| "loss": 0.2386, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "learning_rate": 0.0001326, | |
| "loss": 0.2142, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "learning_rate": 0.0001356, | |
| "loss": 0.1932, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "learning_rate": 0.0001386, | |
| "loss": 0.1709, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "learning_rate": 0.00014159999999999997, | |
| "loss": 0.1571, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "learning_rate": 0.0001446, | |
| "loss": 0.1417, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "learning_rate": 0.00014759999999999998, | |
| "loss": 0.1184, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 1.5212451219558716, | |
| "eval_runtime": 1.3827, | |
| "eval_samples_per_second": 7.232, | |
| "eval_steps_per_second": 1.446, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "learning_rate": 0.00015059999999999997, | |
| "loss": 0.1096, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "learning_rate": 0.0001536, | |
| "loss": 0.1037, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "learning_rate": 0.00015659999999999998, | |
| "loss": 0.095, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "learning_rate": 0.0001596, | |
| "loss": 0.0865, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "learning_rate": 0.0001626, | |
| "loss": 0.0808, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "learning_rate": 0.0001656, | |
| "loss": 0.0794, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "learning_rate": 0.0001686, | |
| "loss": 0.075, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "learning_rate": 0.00017159999999999997, | |
| "loss": 0.0726, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "learning_rate": 0.00017459999999999996, | |
| "loss": 0.0696, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "learning_rate": 0.00017759999999999998, | |
| "loss": 0.0665, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 1.7048699855804443, | |
| "eval_runtime": 1.3753, | |
| "eval_samples_per_second": 7.271, | |
| "eval_steps_per_second": 1.454, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "learning_rate": 0.00018059999999999997, | |
| "loss": 0.065, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "learning_rate": 0.0001836, | |
| "loss": 0.0623, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "learning_rate": 0.00018659999999999998, | |
| "loss": 0.0574, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "learning_rate": 0.00018959999999999997, | |
| "loss": 0.0577, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "learning_rate": 0.0001926, | |
| "loss": 0.0597, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "learning_rate": 0.00019559999999999998, | |
| "loss": 0.0546, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "learning_rate": 0.0001986, | |
| "loss": 0.0603, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "learning_rate": 0.0002016, | |
| "loss": 0.0555, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "learning_rate": 0.00020459999999999999, | |
| "loss": 0.0551, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "learning_rate": 0.00020759999999999998, | |
| "loss": 0.0529, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_loss": 1.901442527770996, | |
| "eval_runtime": 1.3785, | |
| "eval_samples_per_second": 7.254, | |
| "eval_steps_per_second": 1.451, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "learning_rate": 0.00021059999999999997, | |
| "loss": 0.051, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "learning_rate": 0.00021359999999999996, | |
| "loss": 0.05, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "learning_rate": 0.00021659999999999998, | |
| "loss": 0.0493, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "learning_rate": 0.00021959999999999997, | |
| "loss": 0.0465, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "learning_rate": 0.0002226, | |
| "loss": 0.0504, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "learning_rate": 0.00022559999999999998, | |
| "loss": 0.0491, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "learning_rate": 0.00022859999999999997, | |
| "loss": 0.0485, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "learning_rate": 0.0002316, | |
| "loss": 0.0451, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "learning_rate": 0.00023459999999999998, | |
| "loss": 0.0478, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "learning_rate": 0.0002376, | |
| "loss": 0.0435, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "eval_loss": 1.9840269088745117, | |
| "eval_runtime": 1.3787, | |
| "eval_samples_per_second": 7.253, | |
| "eval_steps_per_second": 1.451, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "learning_rate": 0.0002406, | |
| "loss": 0.0429, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "learning_rate": 0.00024359999999999999, | |
| "loss": 0.0464, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "learning_rate": 0.0002466, | |
| "loss": 0.0458, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 84.0, | |
| "learning_rate": 0.00024959999999999994, | |
| "loss": 0.0441, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "learning_rate": 0.00025259999999999996, | |
| "loss": 0.0421, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 86.0, | |
| "learning_rate": 0.0002556, | |
| "loss": 0.0433, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 87.0, | |
| "learning_rate": 0.0002586, | |
| "loss": 0.0444, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 88.0, | |
| "learning_rate": 0.00026159999999999996, | |
| "loss": 0.0472, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 89.0, | |
| "learning_rate": 0.0002646, | |
| "loss": 0.0442, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "learning_rate": 0.0002676, | |
| "loss": 0.0431, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "eval_loss": 1.839582085609436, | |
| "eval_runtime": 1.3848, | |
| "eval_samples_per_second": 7.221, | |
| "eval_steps_per_second": 1.444, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 91.0, | |
| "learning_rate": 0.00027059999999999996, | |
| "loss": 0.0431, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 92.0, | |
| "learning_rate": 0.0002736, | |
| "loss": 0.044, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 93.0, | |
| "learning_rate": 0.0002766, | |
| "loss": 0.0429, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 94.0, | |
| "learning_rate": 0.00027959999999999997, | |
| "loss": 0.042, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "learning_rate": 0.0002826, | |
| "loss": 0.0415, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 96.0, | |
| "learning_rate": 0.00028559999999999995, | |
| "loss": 0.0433, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 97.0, | |
| "learning_rate": 0.00028859999999999997, | |
| "loss": 0.0405, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 98.0, | |
| "learning_rate": 0.0002916, | |
| "loss": 0.0416, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 99.0, | |
| "learning_rate": 0.00029459999999999995, | |
| "loss": 0.0378, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "learning_rate": 0.00029759999999999997, | |
| "loss": 0.0379, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "eval_loss": 1.9192386865615845, | |
| "eval_runtime": 1.3756, | |
| "eval_samples_per_second": 7.269, | |
| "eval_steps_per_second": 1.454, | |
| "step": 500 | |
| } | |
| ], | |
| "max_steps": 10000, | |
| "num_train_epochs": 2000, | |
| "total_flos": 8.0956959522816e+16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |