{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.0,
  "eval_steps": 200,
  "global_step": 480,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_runtime": 13.9532,
      "eval_samples_per_second": 0.072,
      "eval_steps_per_second": 0.072,
      "step": 0
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 11.557395935058594,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 8.1827,
      "step": 5
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 11.404980659484863,
      "learning_rate": 1.9831578947368423e-05,
      "loss": 6.9238,
      "step": 10
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 7.607983589172363,
      "learning_rate": 1.962105263157895e-05,
      "loss": 5.6257,
      "step": 15
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 9.123591423034668,
      "learning_rate": 1.9410526315789476e-05,
      "loss": 5.757,
      "step": 20
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 8.275577545166016,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 5.8446,
      "step": 25
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 7.894503593444824,
      "learning_rate": 1.898947368421053e-05,
      "loss": 5.9334,
      "step": 30
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 8.614194869995117,
      "learning_rate": 1.8778947368421056e-05,
      "loss": 4.9961,
      "step": 35
    },
    {
      "epoch": 1.253968253968254,
      "grad_norm": 7.895678520202637,
      "learning_rate": 1.856842105263158e-05,
      "loss": 4.9552,
      "step": 40
    },
    {
      "epoch": 1.4126984126984126,
      "grad_norm": 8.537894248962402,
      "learning_rate": 1.8357894736842105e-05,
      "loss": 4.4348,
      "step": 45
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 14.754180908203125,
      "learning_rate": 1.8147368421052632e-05,
      "loss": 5.5887,
      "step": 50
    },
    {
      "epoch": 1.7301587301587302,
      "grad_norm": 12.931550979614258,
      "learning_rate": 1.793684210526316e-05,
      "loss": 4.3393,
      "step": 55
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 12.019343376159668,
      "learning_rate": 1.7726315789473685e-05,
      "loss": 4.4514,
      "step": 60
    },
    {
      "epoch": 2.0317460317460316,
      "grad_norm": 9.587310791015625,
      "learning_rate": 1.751578947368421e-05,
      "loss": 4.4192,
      "step": 65
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 15.39806842803955,
      "learning_rate": 1.7305263157894738e-05,
      "loss": 3.321,
      "step": 70
    },
    {
      "epoch": 2.3492063492063493,
      "grad_norm": 33.02901077270508,
      "learning_rate": 1.7094736842105265e-05,
      "loss": 3.5273,
      "step": 75
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 15.809943199157715,
      "learning_rate": 1.688421052631579e-05,
      "loss": 3.627,
      "step": 80
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 17.844499588012695,
      "learning_rate": 1.6673684210526318e-05,
      "loss": 3.4103,
      "step": 85
    },
    {
      "epoch": 2.825396825396825,
      "grad_norm": 16.915332794189453,
      "learning_rate": 1.6463157894736844e-05,
      "loss": 3.2948,
      "step": 90
    },
    {
      "epoch": 2.984126984126984,
      "grad_norm": 10.49189567565918,
      "learning_rate": 1.6252631578947367e-05,
      "loss": 3.841,
      "step": 95
    },
    {
      "epoch": 3.126984126984127,
      "grad_norm": 10.832942962646484,
      "learning_rate": 1.6042105263157897e-05,
      "loss": 2.8493,
      "step": 100
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 21.16595458984375,
      "learning_rate": 1.5831578947368424e-05,
      "loss": 2.5387,
      "step": 105
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 14.9147310256958,
      "learning_rate": 1.5621052631578947e-05,
      "loss": 2.615,
      "step": 110
    },
    {
      "epoch": 3.6031746031746033,
      "grad_norm": 18.327787399291992,
      "learning_rate": 1.5410526315789477e-05,
      "loss": 2.1285,
      "step": 115
    },
    {
      "epoch": 3.761904761904762,
      "grad_norm": 9.476289749145508,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 2.2728,
      "step": 120
    },
    {
      "epoch": 3.9206349206349205,
      "grad_norm": 17.919769287109375,
      "learning_rate": 1.4989473684210527e-05,
      "loss": 2.5896,
      "step": 125
    },
    {
      "epoch": 4.063492063492063,
      "grad_norm": 14.85531997680664,
      "learning_rate": 1.4778947368421055e-05,
      "loss": 2.1744,
      "step": 130
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 14.478163719177246,
      "learning_rate": 1.456842105263158e-05,
      "loss": 1.7097,
      "step": 135
    },
    {
      "epoch": 4.380952380952381,
      "grad_norm": 11.578816413879395,
      "learning_rate": 1.4357894736842106e-05,
      "loss": 1.5367,
      "step": 140
    },
    {
      "epoch": 4.5396825396825395,
      "grad_norm": 17.49262809753418,
      "learning_rate": 1.4147368421052631e-05,
      "loss": 1.5296,
      "step": 145
    },
    {
      "epoch": 4.698412698412699,
      "grad_norm": 14.988396644592285,
      "learning_rate": 1.393684210526316e-05,
      "loss": 1.7586,
      "step": 150
    },
    {
      "epoch": 4.857142857142857,
      "grad_norm": 14.392678260803223,
      "learning_rate": 1.3726315789473686e-05,
      "loss": 1.6262,
      "step": 155
    },
    {
      "epoch": 5.0,
      "grad_norm": 20.406312942504883,
      "learning_rate": 1.3515789473684211e-05,
      "loss": 1.6957,
      "step": 160
    },
    {
      "epoch": 5.158730158730159,
      "grad_norm": 11.343132972717285,
      "learning_rate": 1.3305263157894739e-05,
      "loss": 1.1688,
      "step": 165
    },
    {
      "epoch": 5.317460317460317,
      "grad_norm": 19.622802734375,
      "learning_rate": 1.3094736842105264e-05,
      "loss": 1.2816,
      "step": 170
    },
    {
      "epoch": 5.476190476190476,
      "grad_norm": 8.312881469726562,
      "learning_rate": 1.288421052631579e-05,
      "loss": 1.1862,
      "step": 175
    },
    {
      "epoch": 5.634920634920634,
      "grad_norm": 9.719124794006348,
      "learning_rate": 1.2673684210526315e-05,
      "loss": 1.1252,
      "step": 180
    },
    {
      "epoch": 5.7936507936507935,
      "grad_norm": 13.568297386169434,
      "learning_rate": 1.2463157894736844e-05,
      "loss": 1.4237,
      "step": 185
    },
    {
      "epoch": 5.9523809523809526,
      "grad_norm": 14.834338188171387,
      "learning_rate": 1.225263157894737e-05,
      "loss": 1.1826,
      "step": 190
    },
    {
      "epoch": 6.095238095238095,
      "grad_norm": 9.984978675842285,
      "learning_rate": 1.2042105263157895e-05,
      "loss": 0.7894,
      "step": 195
    },
    {
      "epoch": 6.253968253968254,
      "grad_norm": 9.38455581665039,
      "learning_rate": 1.1831578947368423e-05,
      "loss": 0.631,
      "step": 200
    },
    {
      "epoch": 6.253968253968254,
      "eval_runtime": 1.8467,
      "eval_samples_per_second": 0.542,
      "eval_steps_per_second": 0.542,
      "step": 200
    },
    {
      "epoch": 6.412698412698413,
      "grad_norm": 10.59284496307373,
      "learning_rate": 1.1621052631578948e-05,
      "loss": 0.5435,
      "step": 205
    },
    {
      "epoch": 6.571428571428571,
      "grad_norm": 11.207695007324219,
      "learning_rate": 1.1410526315789475e-05,
      "loss": 0.7223,
      "step": 210
    },
    {
      "epoch": 6.73015873015873,
      "grad_norm": 8.157690048217773,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.6299,
      "step": 215
    },
    {
      "epoch": 6.888888888888889,
      "grad_norm": 11.521504402160645,
      "learning_rate": 1.0989473684210528e-05,
      "loss": 0.68,
      "step": 220
    },
    {
      "epoch": 7.031746031746032,
      "grad_norm": 15.177350997924805,
      "learning_rate": 1.0778947368421053e-05,
      "loss": 0.5935,
      "step": 225
    },
    {
      "epoch": 7.190476190476191,
      "grad_norm": 12.173933029174805,
      "learning_rate": 1.0568421052631579e-05,
      "loss": 0.3321,
      "step": 230
    },
    {
      "epoch": 7.349206349206349,
      "grad_norm": 8.02884292602539,
      "learning_rate": 1.0357894736842107e-05,
      "loss": 0.3183,
      "step": 235
    },
    {
      "epoch": 7.507936507936508,
      "grad_norm": 7.30162239074707,
      "learning_rate": 1.0147368421052632e-05,
      "loss": 0.2462,
      "step": 240
    },
    {
      "epoch": 7.666666666666667,
      "grad_norm": 8.202823638916016,
      "learning_rate": 9.936842105263159e-06,
      "loss": 0.4269,
      "step": 245
    },
    {
      "epoch": 7.825396825396825,
      "grad_norm": 12.977221488952637,
      "learning_rate": 9.726315789473685e-06,
      "loss": 0.4175,
      "step": 250
    },
    {
      "epoch": 7.984126984126984,
      "grad_norm": 13.560741424560547,
      "learning_rate": 9.515789473684212e-06,
      "loss": 0.4621,
      "step": 255
    },
    {
      "epoch": 8.126984126984127,
      "grad_norm": 4.011098384857178,
      "learning_rate": 9.305263157894737e-06,
      "loss": 0.2328,
      "step": 260
    },
    {
      "epoch": 8.285714285714286,
      "grad_norm": 6.631120681762695,
      "learning_rate": 9.094736842105263e-06,
      "loss": 0.1861,
      "step": 265
    },
    {
      "epoch": 8.444444444444445,
      "grad_norm": 5.168328285217285,
      "learning_rate": 8.884210526315792e-06,
      "loss": 0.1793,
      "step": 270
    },
    {
      "epoch": 8.603174603174603,
      "grad_norm": 10.629039764404297,
      "learning_rate": 8.673684210526316e-06,
      "loss": 0.2486,
      "step": 275
    },
    {
      "epoch": 8.761904761904763,
      "grad_norm": 13.89322566986084,
      "learning_rate": 8.463157894736843e-06,
      "loss": 0.167,
      "step": 280
    },
    {
      "epoch": 8.920634920634921,
      "grad_norm": 6.970239639282227,
      "learning_rate": 8.25263157894737e-06,
      "loss": 0.1983,
      "step": 285
    },
    {
      "epoch": 9.063492063492063,
      "grad_norm": 3.4170796871185303,
      "learning_rate": 8.042105263157896e-06,
      "loss": 0.1928,
      "step": 290
    },
    {
      "epoch": 9.222222222222221,
      "grad_norm": 11.420437812805176,
      "learning_rate": 7.831578947368421e-06,
      "loss": 0.1943,
      "step": 295
    },
    {
      "epoch": 9.380952380952381,
      "grad_norm": 5.474252223968506,
      "learning_rate": 7.621052631578948e-06,
      "loss": 0.1197,
      "step": 300
    },
    {
      "epoch": 9.53968253968254,
      "grad_norm": 47.27220916748047,
      "learning_rate": 7.410526315789475e-06,
      "loss": 0.3147,
      "step": 305
    },
    {
      "epoch": 9.698412698412698,
      "grad_norm": 9.262266159057617,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.0838,
      "step": 310
    },
    {
      "epoch": 9.857142857142858,
      "grad_norm": 2.843402862548828,
      "learning_rate": 6.989473684210527e-06,
      "loss": 0.1181,
      "step": 315
    },
    {
      "epoch": 10.0,
      "grad_norm": 10.928123474121094,
      "learning_rate": 6.778947368421053e-06,
      "loss": 0.1583,
      "step": 320
    },
    {
      "epoch": 10.158730158730158,
      "grad_norm": 1.7071508169174194,
      "learning_rate": 6.568421052631579e-06,
      "loss": 0.0808,
      "step": 325
    },
    {
      "epoch": 10.317460317460318,
      "grad_norm": 6.591403007507324,
      "learning_rate": 6.357894736842106e-06,
      "loss": 0.1076,
      "step": 330
    },
    {
      "epoch": 10.476190476190476,
      "grad_norm": 4.758854389190674,
      "learning_rate": 6.1473684210526316e-06,
      "loss": 0.085,
      "step": 335
    },
    {
      "epoch": 10.634920634920634,
      "grad_norm": 8.381784439086914,
      "learning_rate": 5.936842105263159e-06,
      "loss": 0.1412,
      "step": 340
    },
    {
      "epoch": 10.793650793650794,
      "grad_norm": 6.775882244110107,
      "learning_rate": 5.726315789473685e-06,
      "loss": 0.1122,
      "step": 345
    },
    {
      "epoch": 10.952380952380953,
      "grad_norm": 3.3244922161102295,
      "learning_rate": 5.515789473684211e-06,
      "loss": 0.0922,
      "step": 350
    },
    {
      "epoch": 11.095238095238095,
      "grad_norm": 2.986769437789917,
      "learning_rate": 5.305263157894738e-06,
      "loss": 0.051,
      "step": 355
    },
    {
      "epoch": 11.253968253968253,
      "grad_norm": 2.7891147136688232,
      "learning_rate": 5.0947368421052635e-06,
      "loss": 0.0607,
      "step": 360
    },
    {
      "epoch": 11.412698412698413,
      "grad_norm": 1.6444604396820068,
      "learning_rate": 4.88421052631579e-06,
      "loss": 0.0738,
      "step": 365
    },
    {
      "epoch": 11.571428571428571,
      "grad_norm": 3.8520383834838867,
      "learning_rate": 4.6736842105263166e-06,
      "loss": 0.0699,
      "step": 370
    },
    {
      "epoch": 11.73015873015873,
      "grad_norm": 2.9614264965057373,
      "learning_rate": 4.463157894736842e-06,
      "loss": 0.0662,
      "step": 375
    },
    {
      "epoch": 11.88888888888889,
      "grad_norm": 0.9366450309753418,
      "learning_rate": 4.252631578947369e-06,
      "loss": 0.038,
      "step": 380
    },
    {
      "epoch": 12.031746031746032,
      "grad_norm": 1.4501631259918213,
      "learning_rate": 4.042105263157895e-06,
      "loss": 0.0415,
      "step": 385
    },
    {
      "epoch": 12.19047619047619,
      "grad_norm": 1.08451509475708,
      "learning_rate": 3.831578947368421e-06,
      "loss": 0.0382,
      "step": 390
    },
    {
      "epoch": 12.34920634920635,
      "grad_norm": 3.214855670928955,
      "learning_rate": 3.621052631578948e-06,
      "loss": 0.0337,
      "step": 395
    },
    {
      "epoch": 12.507936507936508,
      "grad_norm": 1.9067870378494263,
      "learning_rate": 3.410526315789474e-06,
      "loss": 0.0524,
      "step": 400
    },
    {
      "epoch": 12.507936507936508,
      "eval_runtime": 1.7856,
      "eval_samples_per_second": 0.56,
      "eval_steps_per_second": 0.56,
      "step": 400
    },
    {
      "epoch": 12.666666666666666,
      "grad_norm": 1.6618458032608032,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.0591,
      "step": 405
    },
    {
      "epoch": 12.825396825396826,
      "grad_norm": 1.5796409845352173,
      "learning_rate": 2.9894736842105264e-06,
      "loss": 0.0835,
      "step": 410
    },
    {
      "epoch": 12.984126984126984,
      "grad_norm": 0.4131016433238983,
      "learning_rate": 2.7789473684210525e-06,
      "loss": 0.0579,
      "step": 415
    },
    {
      "epoch": 13.126984126984127,
      "grad_norm": 0.8320081830024719,
      "learning_rate": 2.568421052631579e-06,
      "loss": 0.0559,
      "step": 420
    },
    {
      "epoch": 13.285714285714286,
      "grad_norm": 0.18294575810432434,
      "learning_rate": 2.357894736842105e-06,
      "loss": 0.0245,
      "step": 425
    },
    {
      "epoch": 13.444444444444445,
      "grad_norm": 1.5417789220809937,
      "learning_rate": 2.1473684210526317e-06,
      "loss": 0.0367,
      "step": 430
    },
    {
      "epoch": 13.603174603174603,
      "grad_norm": 0.5289078950881958,
      "learning_rate": 1.936842105263158e-06,
      "loss": 0.0327,
      "step": 435
    },
    {
      "epoch": 13.761904761904763,
      "grad_norm": 0.751720666885376,
      "learning_rate": 1.7263157894736842e-06,
      "loss": 0.0243,
      "step": 440
    },
    {
      "epoch": 13.920634920634921,
      "grad_norm": 0.7442598938941956,
      "learning_rate": 1.5157894736842108e-06,
      "loss": 0.033,
      "step": 445
    },
    {
      "epoch": 14.063492063492063,
      "grad_norm": 0.3151313066482544,
      "learning_rate": 1.3052631578947369e-06,
      "loss": 0.0253,
      "step": 450
    },
    {
      "epoch": 14.222222222222221,
      "grad_norm": 0.2672920525074005,
      "learning_rate": 1.0947368421052632e-06,
      "loss": 0.0198,
      "step": 455
    },
    {
      "epoch": 14.380952380952381,
      "grad_norm": 0.6730213165283203,
      "learning_rate": 8.842105263157895e-07,
      "loss": 0.0212,
      "step": 460
    },
    {
      "epoch": 14.53968253968254,
      "grad_norm": 0.5566405653953552,
      "learning_rate": 6.736842105263158e-07,
      "loss": 0.0176,
      "step": 465
    },
    {
      "epoch": 14.698412698412698,
      "grad_norm": 0.37914007902145386,
      "learning_rate": 4.631578947368422e-07,
      "loss": 0.0341,
      "step": 470
    },
    {
      "epoch": 14.857142857142858,
      "grad_norm": 0.2741248905658722,
      "learning_rate": 2.5263157894736846e-07,
      "loss": 0.0247,
      "step": 475
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.30271536111831665,
      "learning_rate": 4.2105263157894737e-08,
      "loss": 0.0212,
      "step": 480
    },
    {
      "epoch": 15.0,
      "step": 480,
      "total_flos": 0.0,
      "train_loss": 1.4350806780159473,
      "train_runtime": 3024.1986,
      "train_samples_per_second": 2.475,
      "train_steps_per_second": 0.159
    }
  ],
  "logging_steps": 5,
  "max_steps": 480,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}