{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 200, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_runtime": 13.9532, "eval_samples_per_second": 0.072, "eval_steps_per_second": 0.072, "step": 0 }, { "epoch": 0.15873015873015872, "grad_norm": 11.557395935058594, "learning_rate": 1.6000000000000003e-05, "loss": 8.1827, "step": 5 }, { "epoch": 0.31746031746031744, "grad_norm": 11.404980659484863, "learning_rate": 1.9831578947368423e-05, "loss": 6.9238, "step": 10 }, { "epoch": 0.47619047619047616, "grad_norm": 7.607983589172363, "learning_rate": 1.962105263157895e-05, "loss": 5.6257, "step": 15 }, { "epoch": 0.6349206349206349, "grad_norm": 9.123591423034668, "learning_rate": 1.9410526315789476e-05, "loss": 5.757, "step": 20 }, { "epoch": 0.7936507936507936, "grad_norm": 8.275577545166016, "learning_rate": 1.9200000000000003e-05, "loss": 5.8446, "step": 25 }, { "epoch": 0.9523809523809523, "grad_norm": 7.894503593444824, "learning_rate": 1.898947368421053e-05, "loss": 5.9334, "step": 30 }, { "epoch": 1.0952380952380953, "grad_norm": 8.614194869995117, "learning_rate": 1.8778947368421056e-05, "loss": 4.9961, "step": 35 }, { "epoch": 1.253968253968254, "grad_norm": 7.895678520202637, "learning_rate": 1.856842105263158e-05, "loss": 4.9552, "step": 40 }, { "epoch": 1.4126984126984126, "grad_norm": 8.537894248962402, "learning_rate": 1.8357894736842105e-05, "loss": 4.4348, "step": 45 }, { "epoch": 1.5714285714285714, "grad_norm": 14.754180908203125, "learning_rate": 1.8147368421052632e-05, "loss": 5.5887, "step": 50 }, { "epoch": 1.7301587301587302, "grad_norm": 12.931550979614258, "learning_rate": 1.793684210526316e-05, "loss": 4.3393, "step": 55 }, { "epoch": 1.8888888888888888, "grad_norm": 12.019343376159668, "learning_rate": 1.7726315789473685e-05, "loss": 4.4514, "step": 60 }, { "epoch": 2.0317460317460316, "grad_norm": 9.587310791015625, "learning_rate": 1.751578947368421e-05, "loss": 4.4192, "step": 65 }, { "epoch": 2.1904761904761907, "grad_norm": 15.39806842803955, "learning_rate": 1.7305263157894738e-05, "loss": 3.321, "step": 70 }, { "epoch": 2.3492063492063493, "grad_norm": 33.02901077270508, "learning_rate": 1.7094736842105265e-05, "loss": 3.5273, "step": 75 }, { "epoch": 2.507936507936508, "grad_norm": 15.809943199157715, "learning_rate": 1.688421052631579e-05, "loss": 3.627, "step": 80 }, { "epoch": 2.6666666666666665, "grad_norm": 17.844499588012695, "learning_rate": 1.6673684210526318e-05, "loss": 3.4103, "step": 85 }, { "epoch": 2.825396825396825, "grad_norm": 16.915332794189453, "learning_rate": 1.6463157894736844e-05, "loss": 3.2948, "step": 90 }, { "epoch": 2.984126984126984, "grad_norm": 10.49189567565918, "learning_rate": 1.6252631578947367e-05, "loss": 3.841, "step": 95 }, { "epoch": 3.126984126984127, "grad_norm": 10.832942962646484, "learning_rate": 1.6042105263157897e-05, "loss": 2.8493, "step": 100 }, { "epoch": 3.2857142857142856, "grad_norm": 21.16595458984375, "learning_rate": 1.5831578947368424e-05, "loss": 2.5387, "step": 105 }, { "epoch": 3.4444444444444446, "grad_norm": 14.9147310256958, "learning_rate": 1.5621052631578947e-05, "loss": 2.615, "step": 110 }, { "epoch": 3.6031746031746033, "grad_norm": 18.327787399291992, "learning_rate": 1.5410526315789477e-05, "loss": 2.1285, "step": 115 }, { "epoch": 3.761904761904762, "grad_norm": 9.476289749145508, "learning_rate": 1.5200000000000002e-05, "loss": 2.2728, "step": 120 }, { "epoch": 3.9206349206349205, "grad_norm": 17.919769287109375, "learning_rate": 1.4989473684210527e-05, "loss": 2.5896, "step": 125 }, { "epoch": 4.063492063492063, "grad_norm": 14.85531997680664, "learning_rate": 1.4778947368421055e-05, "loss": 2.1744, "step": 130 }, { "epoch": 4.222222222222222, "grad_norm": 14.478163719177246, "learning_rate": 1.456842105263158e-05, "loss": 1.7097, "step": 135 }, { "epoch": 4.380952380952381, "grad_norm": 11.578816413879395, "learning_rate": 1.4357894736842106e-05, "loss": 1.5367, "step": 140 }, { "epoch": 4.5396825396825395, "grad_norm": 17.49262809753418, "learning_rate": 1.4147368421052631e-05, "loss": 1.5296, "step": 145 }, { "epoch": 4.698412698412699, "grad_norm": 14.988396644592285, "learning_rate": 1.393684210526316e-05, "loss": 1.7586, "step": 150 }, { "epoch": 4.857142857142857, "grad_norm": 14.392678260803223, "learning_rate": 1.3726315789473686e-05, "loss": 1.6262, "step": 155 }, { "epoch": 5.0, "grad_norm": 20.406312942504883, "learning_rate": 1.3515789473684211e-05, "loss": 1.6957, "step": 160 }, { "epoch": 5.158730158730159, "grad_norm": 11.343132972717285, "learning_rate": 1.3305263157894739e-05, "loss": 1.1688, "step": 165 }, { "epoch": 5.317460317460317, "grad_norm": 19.622802734375, "learning_rate": 1.3094736842105264e-05, "loss": 1.2816, "step": 170 }, { "epoch": 5.476190476190476, "grad_norm": 8.312881469726562, "learning_rate": 1.288421052631579e-05, "loss": 1.1862, "step": 175 }, { "epoch": 5.634920634920634, "grad_norm": 9.719124794006348, "learning_rate": 1.2673684210526315e-05, "loss": 1.1252, "step": 180 }, { "epoch": 5.7936507936507935, "grad_norm": 13.568297386169434, "learning_rate": 1.2463157894736844e-05, "loss": 1.4237, "step": 185 }, { "epoch": 5.9523809523809526, "grad_norm": 14.834338188171387, "learning_rate": 1.225263157894737e-05, "loss": 1.1826, "step": 190 }, { "epoch": 6.095238095238095, "grad_norm": 9.984978675842285, "learning_rate": 1.2042105263157895e-05, "loss": 0.7894, "step": 195 }, { "epoch": 6.253968253968254, "grad_norm": 9.38455581665039, "learning_rate": 1.1831578947368423e-05, "loss": 0.631, "step": 200 }, { "epoch": 6.253968253968254, "eval_runtime": 1.8467, "eval_samples_per_second": 0.542, "eval_steps_per_second": 0.542, "step": 200 }, { "epoch": 6.412698412698413, "grad_norm": 10.59284496307373, "learning_rate": 1.1621052631578948e-05, "loss": 0.5435, "step": 205 }, { "epoch": 6.571428571428571, "grad_norm": 11.207695007324219, "learning_rate": 1.1410526315789475e-05, "loss": 0.7223, "step": 210 }, { "epoch": 6.73015873015873, "grad_norm": 8.157690048217773, "learning_rate": 1.1200000000000001e-05, "loss": 0.6299, "step": 215 }, { "epoch": 6.888888888888889, "grad_norm": 11.521504402160645, "learning_rate": 1.0989473684210528e-05, "loss": 0.68, "step": 220 }, { "epoch": 7.031746031746032, "grad_norm": 15.177350997924805, "learning_rate": 1.0778947368421053e-05, "loss": 0.5935, "step": 225 }, { "epoch": 7.190476190476191, "grad_norm": 12.173933029174805, "learning_rate": 1.0568421052631579e-05, "loss": 0.3321, "step": 230 }, { "epoch": 7.349206349206349, "grad_norm": 8.02884292602539, "learning_rate": 1.0357894736842107e-05, "loss": 0.3183, "step": 235 }, { "epoch": 7.507936507936508, "grad_norm": 7.30162239074707, "learning_rate": 1.0147368421052632e-05, "loss": 0.2462, "step": 240 }, { "epoch": 7.666666666666667, "grad_norm": 8.202823638916016, "learning_rate": 9.936842105263159e-06, "loss": 0.4269, "step": 245 }, { "epoch": 7.825396825396825, "grad_norm": 12.977221488952637, "learning_rate": 9.726315789473685e-06, "loss": 0.4175, "step": 250 }, { "epoch": 7.984126984126984, "grad_norm": 13.560741424560547, "learning_rate": 9.515789473684212e-06, "loss": 0.4621, "step": 255 }, { "epoch": 8.126984126984127, "grad_norm": 4.011098384857178, "learning_rate": 9.305263157894737e-06, "loss": 0.2328, "step": 260 }, { "epoch": 8.285714285714286, "grad_norm": 6.631120681762695, "learning_rate": 9.094736842105263e-06, "loss": 0.1861, "step": 265 }, { "epoch": 8.444444444444445, "grad_norm": 5.168328285217285, "learning_rate": 8.884210526315792e-06, "loss": 0.1793, "step": 270 }, { "epoch": 8.603174603174603, "grad_norm": 10.629039764404297, "learning_rate": 8.673684210526316e-06, "loss": 0.2486, "step": 275 }, { "epoch": 8.761904761904763, "grad_norm": 13.89322566986084, "learning_rate": 8.463157894736843e-06, "loss": 0.167, "step": 280 }, { "epoch": 8.920634920634921, "grad_norm": 6.970239639282227, "learning_rate": 8.25263157894737e-06, "loss": 0.1983, "step": 285 }, { "epoch": 9.063492063492063, "grad_norm": 3.4170796871185303, "learning_rate": 8.042105263157896e-06, "loss": 0.1928, "step": 290 }, { "epoch": 9.222222222222221, "grad_norm": 11.420437812805176, "learning_rate": 7.831578947368421e-06, "loss": 0.1943, "step": 295 }, { "epoch": 9.380952380952381, "grad_norm": 5.474252223968506, "learning_rate": 7.621052631578948e-06, "loss": 0.1197, "step": 300 }, { "epoch": 9.53968253968254, "grad_norm": 47.27220916748047, "learning_rate": 7.410526315789475e-06, "loss": 0.3147, "step": 305 }, { "epoch": 9.698412698412698, "grad_norm": 9.262266159057617, "learning_rate": 7.2000000000000005e-06, "loss": 0.0838, "step": 310 }, { "epoch": 9.857142857142858, "grad_norm": 2.843402862548828, "learning_rate": 6.989473684210527e-06, "loss": 0.1181, "step": 315 }, { "epoch": 10.0, "grad_norm": 10.928123474121094, "learning_rate": 6.778947368421053e-06, "loss": 0.1583, "step": 320 }, { "epoch": 10.158730158730158, "grad_norm": 1.7071508169174194, "learning_rate": 6.568421052631579e-06, "loss": 0.0808, "step": 325 }, { "epoch": 10.317460317460318, "grad_norm": 6.591403007507324, "learning_rate": 6.357894736842106e-06, "loss": 0.1076, "step": 330 }, { "epoch": 10.476190476190476, "grad_norm": 4.758854389190674, "learning_rate": 6.1473684210526316e-06, "loss": 0.085, "step": 335 }, { "epoch": 10.634920634920634, "grad_norm": 8.381784439086914, "learning_rate": 5.936842105263159e-06, "loss": 0.1412, "step": 340 }, { "epoch": 10.793650793650794, "grad_norm": 6.775882244110107, "learning_rate": 5.726315789473685e-06, "loss": 0.1122, "step": 345 }, { "epoch": 10.952380952380953, "grad_norm": 3.3244922161102295, "learning_rate": 5.515789473684211e-06, "loss": 0.0922, "step": 350 }, { "epoch": 11.095238095238095, "grad_norm": 2.986769437789917, "learning_rate": 5.305263157894738e-06, "loss": 0.051, "step": 355 }, { "epoch": 11.253968253968253, "grad_norm": 2.7891147136688232, "learning_rate": 5.0947368421052635e-06, "loss": 0.0607, "step": 360 }, { "epoch": 11.412698412698413, "grad_norm": 1.6444604396820068, "learning_rate": 4.88421052631579e-06, "loss": 0.0738, "step": 365 }, { "epoch": 11.571428571428571, "grad_norm": 3.8520383834838867, "learning_rate": 4.6736842105263166e-06, "loss": 0.0699, "step": 370 }, { "epoch": 11.73015873015873, "grad_norm": 2.9614264965057373, "learning_rate": 4.463157894736842e-06, "loss": 0.0662, "step": 375 }, { "epoch": 11.88888888888889, "grad_norm": 0.9366450309753418, "learning_rate": 4.252631578947369e-06, "loss": 0.038, "step": 380 }, { "epoch": 12.031746031746032, "grad_norm": 1.4501631259918213, "learning_rate": 4.042105263157895e-06, "loss": 0.0415, "step": 385 }, { "epoch": 12.19047619047619, "grad_norm": 1.08451509475708, "learning_rate": 3.831578947368421e-06, "loss": 0.0382, "step": 390 }, { "epoch": 12.34920634920635, "grad_norm": 3.214855670928955, "learning_rate": 3.621052631578948e-06, "loss": 0.0337, "step": 395 }, { "epoch": 12.507936507936508, "grad_norm": 1.9067870378494263, "learning_rate": 3.410526315789474e-06, "loss": 0.0524, "step": 400 }, { "epoch": 12.507936507936508, "eval_runtime": 1.7856, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.56, "step": 400 }, { "epoch": 12.666666666666666, "grad_norm": 1.6618458032608032, "learning_rate": 3.2000000000000003e-06, "loss": 0.0591, "step": 405 }, { "epoch": 12.825396825396826, "grad_norm": 1.5796409845352173, "learning_rate": 2.9894736842105264e-06, "loss": 0.0835, "step": 410 }, { "epoch": 12.984126984126984, "grad_norm": 0.4131016433238983, "learning_rate": 2.7789473684210525e-06, "loss": 0.0579, "step": 415 }, { "epoch": 13.126984126984127, "grad_norm": 0.8320081830024719, "learning_rate": 2.568421052631579e-06, "loss": 0.0559, "step": 420 }, { "epoch": 13.285714285714286, "grad_norm": 0.18294575810432434, "learning_rate": 2.357894736842105e-06, "loss": 0.0245, "step": 425 }, { "epoch": 13.444444444444445, "grad_norm": 1.5417789220809937, "learning_rate": 2.1473684210526317e-06, "loss": 0.0367, "step": 430 }, { "epoch": 13.603174603174603, "grad_norm": 0.5289078950881958, "learning_rate": 1.936842105263158e-06, "loss": 0.0327, "step": 435 }, { "epoch": 13.761904761904763, "grad_norm": 0.751720666885376, "learning_rate": 1.7263157894736842e-06, "loss": 0.0243, "step": 440 }, { "epoch": 13.920634920634921, "grad_norm": 0.7442598938941956, "learning_rate": 1.5157894736842108e-06, "loss": 0.033, "step": 445 }, { "epoch": 14.063492063492063, "grad_norm": 0.3151313066482544, "learning_rate": 1.3052631578947369e-06, "loss": 0.0253, "step": 450 }, { "epoch": 14.222222222222221, "grad_norm": 0.2672920525074005, "learning_rate": 1.0947368421052632e-06, "loss": 0.0198, "step": 455 }, { "epoch": 14.380952380952381, "grad_norm": 0.6730213165283203, "learning_rate": 8.842105263157895e-07, "loss": 0.0212, "step": 460 }, { "epoch": 14.53968253968254, "grad_norm": 0.5566405653953552, "learning_rate": 6.736842105263158e-07, "loss": 0.0176, "step": 465 }, { "epoch": 14.698412698412698, "grad_norm": 0.37914007902145386, "learning_rate": 4.631578947368422e-07, "loss": 0.0341, "step": 470 }, { "epoch": 14.857142857142858, "grad_norm": 0.2741248905658722, "learning_rate": 2.5263157894736846e-07, "loss": 0.0247, "step": 475 }, { "epoch": 15.0, "grad_norm": 0.30271536111831665, "learning_rate": 4.2105263157894737e-08, "loss": 0.0212, "step": 480 }, { "epoch": 15.0, "step": 480, "total_flos": 0.0, "train_loss": 1.4350806780159473, "train_runtime": 3024.1986, "train_samples_per_second": 2.475, "train_steps_per_second": 0.159 } ], "logging_steps": 5, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }