{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008,
      "grad_norm": 11.4375,
      "learning_rate": 0.0,
      "loss": 3.8345,
      "step": 1
    },
    {
      "epoch": 0.008,
      "grad_norm": 11.125,
      "learning_rate": 2.25e-06,
      "loss": 3.3934,
      "step": 10
    },
    {
      "epoch": 0.016,
      "grad_norm": 9.5,
      "learning_rate": 4.75e-06,
      "loss": 3.4002,
      "step": 20
    },
    {
      "epoch": 0.024,
      "grad_norm": 9.4375,
      "learning_rate": 7.25e-06,
      "loss": 3.4003,
      "step": 30
    },
    {
      "epoch": 0.032,
      "grad_norm": 11.1875,
      "learning_rate": 9.750000000000002e-06,
      "loss": 3.374,
      "step": 40
    },
    {
      "epoch": 0.04,
      "grad_norm": 10.625,
      "learning_rate": 1.225e-05,
      "loss": 3.4678,
      "step": 50
    },
    {
      "epoch": 0.048,
      "grad_norm": 11.0,
      "learning_rate": 1.475e-05,
      "loss": 3.4331,
      "step": 60
    },
    {
      "epoch": 0.056,
      "grad_norm": 10.6875,
      "learning_rate": 1.725e-05,
      "loss": 3.4722,
      "step": 70
    },
    {
      "epoch": 0.064,
      "grad_norm": 10.25,
      "learning_rate": 1.9750000000000002e-05,
      "loss": 3.2413,
      "step": 80
    },
    {
      "epoch": 0.072,
      "grad_norm": 10.8125,
      "learning_rate": 2.2250000000000002e-05,
      "loss": 3.1804,
      "step": 90
    },
    {
      "epoch": 0.08,
      "grad_norm": 10.1875,
      "learning_rate": 2.4750000000000002e-05,
      "loss": 3.1778,
      "step": 100
    },
    {
      "epoch": 0.088,
      "grad_norm": 8.4375,
      "learning_rate": 2.725e-05,
      "loss": 3.2479,
      "step": 110
    },
    {
      "epoch": 0.096,
      "grad_norm": 11.0,
      "learning_rate": 2.975e-05,
      "loss": 3.2259,
      "step": 120
    },
    {
      "epoch": 0.104,
      "grad_norm": 9.5625,
      "learning_rate": 3.2250000000000005e-05,
      "loss": 3.0845,
      "step": 130
    },
    {
      "epoch": 0.112,
      "grad_norm": 8.9375,
      "learning_rate": 3.475e-05,
      "loss": 3.1694,
      "step": 140
    },
    {
      "epoch": 0.12,
      "grad_norm": 8.4375,
      "learning_rate": 3.7250000000000004e-05,
      "loss": 3.1736,
      "step": 150
    },
    {
      "epoch": 0.128,
      "grad_norm": 9.5,
      "learning_rate": 3.9750000000000004e-05,
      "loss": 3.035,
      "step": 160
    },
    {
      "epoch": 0.136,
      "grad_norm": 9.8125,
      "learning_rate": 4.2250000000000004e-05,
      "loss": 3.1456,
      "step": 170
    },
    {
      "epoch": 0.144,
      "grad_norm": 9.25,
      "learning_rate": 4.4750000000000004e-05,
      "loss": 3.0917,
      "step": 180
    },
    {
      "epoch": 0.152,
      "grad_norm": 9.1875,
      "learning_rate": 4.7249999999999997e-05,
      "loss": 2.9948,
      "step": 190
    },
    {
      "epoch": 0.16,
      "grad_norm": 8.5625,
      "learning_rate": 4.975e-05,
      "loss": 3.0667,
      "step": 200
    },
    {
      "epoch": 0.168,
      "grad_norm": 8.125,
      "learning_rate": 4.957142857142857e-05,
      "loss": 2.9987,
      "step": 210
    },
    {
      "epoch": 0.176,
      "grad_norm": 7.875,
      "learning_rate": 4.90952380952381e-05,
      "loss": 2.9686,
      "step": 220
    },
    {
      "epoch": 0.184,
      "grad_norm": 9.1875,
      "learning_rate": 4.861904761904762e-05,
      "loss": 3.0708,
      "step": 230
    },
    {
      "epoch": 0.192,
      "grad_norm": 7.90625,
      "learning_rate": 4.8142857142857147e-05,
      "loss": 3.0676,
      "step": 240
    },
    {
      "epoch": 0.2,
      "grad_norm": 9.375,
      "learning_rate": 4.766666666666667e-05,
      "loss": 2.8833,
      "step": 250
    },
    {
      "epoch": 0.208,
      "grad_norm": 8.75,
      "learning_rate": 4.719047619047619e-05,
      "loss": 3.0558,
      "step": 260
    },
    {
      "epoch": 0.216,
      "grad_norm": 9.375,
      "learning_rate": 4.671428571428571e-05,
      "loss": 3.0003,
      "step": 270
    },
    {
      "epoch": 0.224,
      "grad_norm": 8.25,
      "learning_rate": 4.623809523809524e-05,
      "loss": 2.9514,
      "step": 280
    },
    {
      "epoch": 0.232,
      "grad_norm": 7.15625,
      "learning_rate": 4.5761904761904765e-05,
      "loss": 2.984,
      "step": 290
    },
    {
      "epoch": 0.24,
      "grad_norm": 8.625,
      "learning_rate": 4.528571428571429e-05,
      "loss": 3.0273,
      "step": 300
    },
    {
      "epoch": 0.248,
      "grad_norm": 7.75,
      "learning_rate": 4.480952380952381e-05,
      "loss": 2.9437,
      "step": 310
    },
    {
      "epoch": 0.256,
      "grad_norm": 9.1875,
      "learning_rate": 4.433333333333334e-05,
      "loss": 3.0913,
      "step": 320
    },
    {
      "epoch": 0.264,
      "grad_norm": 8.5625,
      "learning_rate": 4.385714285714286e-05,
      "loss": 2.9659,
      "step": 330
    },
    {
      "epoch": 0.272,
      "grad_norm": 8.0625,
      "learning_rate": 4.338095238095238e-05,
      "loss": 2.854,
      "step": 340
    },
    {
      "epoch": 0.28,
      "grad_norm": 7.0,
      "learning_rate": 4.290476190476191e-05,
      "loss": 2.8936,
      "step": 350
    },
    {
      "epoch": 0.288,
      "grad_norm": 8.125,
      "learning_rate": 4.242857142857143e-05,
      "loss": 2.9805,
      "step": 360
    },
    {
      "epoch": 0.296,
      "grad_norm": 8.5,
      "learning_rate": 4.1952380952380956e-05,
      "loss": 2.9814,
      "step": 370
    },
    {
      "epoch": 0.304,
      "grad_norm": 7.25,
      "learning_rate": 4.147619047619048e-05,
      "loss": 2.8731,
      "step": 380
    },
    {
      "epoch": 0.312,
      "grad_norm": 7.4375,
      "learning_rate": 4.1e-05,
      "loss": 2.8186,
      "step": 390
    },
    {
      "epoch": 0.32,
      "grad_norm": 8.0,
      "learning_rate": 4.052380952380952e-05,
      "loss": 3.048,
      "step": 400
    },
    {
      "epoch": 0.328,
      "grad_norm": 8.5,
      "learning_rate": 4.004761904761905e-05,
      "loss": 2.8923,
      "step": 410
    },
    {
      "epoch": 0.336,
      "grad_norm": 7.96875,
      "learning_rate": 3.9571428571428574e-05,
      "loss": 2.8443,
      "step": 420
    },
    {
      "epoch": 0.344,
      "grad_norm": 7.75,
      "learning_rate": 3.9095238095238096e-05,
      "loss": 3.0182,
      "step": 430
    },
    {
      "epoch": 0.352,
      "grad_norm": 7.625,
      "learning_rate": 3.861904761904762e-05,
      "loss": 2.849,
      "step": 440
    },
    {
      "epoch": 0.36,
      "grad_norm": 8.125,
      "learning_rate": 3.814285714285715e-05,
      "loss": 2.8413,
      "step": 450
    },
    {
      "epoch": 0.368,
      "grad_norm": 7.6875,
      "learning_rate": 3.766666666666667e-05,
      "loss": 2.9321,
      "step": 460
    },
    {
      "epoch": 0.376,
      "grad_norm": 7.15625,
      "learning_rate": 3.719047619047619e-05,
      "loss": 2.9628,
      "step": 470
    },
    {
      "epoch": 0.384,
      "grad_norm": 8.0625,
      "learning_rate": 3.671428571428572e-05,
      "loss": 3.0175,
      "step": 480
    },
    {
      "epoch": 0.392,
      "grad_norm": 7.375,
      "learning_rate": 3.6238095238095236e-05,
      "loss": 2.9446,
      "step": 490
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.625,
      "learning_rate": 3.5761904761904765e-05,
      "loss": 2.8095,
      "step": 500
    },
    {
      "epoch": 0.408,
      "grad_norm": 9.25,
      "learning_rate": 3.528571428571429e-05,
      "loss": 2.9347,
      "step": 510
    },
    {
      "epoch": 0.416,
      "grad_norm": 7.875,
      "learning_rate": 3.480952380952381e-05,
      "loss": 2.7751,
      "step": 520
    },
    {
      "epoch": 0.424,
      "grad_norm": 8.5625,
      "learning_rate": 3.433333333333333e-05,
      "loss": 2.8434,
      "step": 530
    },
    {
      "epoch": 0.432,
      "grad_norm": 7.71875,
      "learning_rate": 3.385714285714286e-05,
      "loss": 2.869,
      "step": 540
    },
    {
      "epoch": 0.44,
      "grad_norm": 6.75,
      "learning_rate": 3.338095238095238e-05,
      "loss": 2.8466,
      "step": 550
    },
    {
      "epoch": 0.448,
      "grad_norm": 7.4375,
      "learning_rate": 3.2904761904761906e-05,
      "loss": 2.7286,
      "step": 560
    },
    {
      "epoch": 0.456,
      "grad_norm": 8.4375,
      "learning_rate": 3.242857142857143e-05,
      "loss": 2.9655,
      "step": 570
    },
    {
      "epoch": 0.464,
      "grad_norm": 7.3125,
      "learning_rate": 3.195238095238096e-05,
      "loss": 2.7955,
      "step": 580
    },
    {
      "epoch": 0.472,
      "grad_norm": 9.25,
      "learning_rate": 3.147619047619048e-05,
      "loss": 2.8469,
      "step": 590
    },
    {
      "epoch": 0.48,
      "grad_norm": 8.25,
      "learning_rate": 3.1e-05,
      "loss": 2.9423,
      "step": 600
    },
    {
      "epoch": 0.488,
      "grad_norm": 8.0625,
      "learning_rate": 3.0523809523809524e-05,
      "loss": 2.9848,
      "step": 610
    },
    {
      "epoch": 0.496,
      "grad_norm": 7.25,
      "learning_rate": 3.0047619047619046e-05,
      "loss": 2.8185,
      "step": 620
    },
    {
      "epoch": 0.504,
      "grad_norm": 8.3125,
      "learning_rate": 2.957142857142857e-05,
      "loss": 2.7036,
      "step": 630
    },
    {
      "epoch": 0.512,
      "grad_norm": 7.5625,
      "learning_rate": 2.9095238095238097e-05,
      "loss": 2.6809,
      "step": 640
    },
    {
      "epoch": 0.52,
      "grad_norm": 8.3125,
      "learning_rate": 2.8619047619047623e-05,
      "loss": 2.8011,
      "step": 650
    },
    {
      "epoch": 0.528,
      "grad_norm": 7.40625,
      "learning_rate": 2.814285714285714e-05,
      "loss": 2.7213,
      "step": 660
    },
    {
      "epoch": 0.536,
      "grad_norm": 7.15625,
      "learning_rate": 2.7666666666666667e-05,
      "loss": 2.8342,
      "step": 670
    },
    {
      "epoch": 0.544,
      "grad_norm": 6.625,
      "learning_rate": 2.7190476190476193e-05,
      "loss": 2.7829,
      "step": 680
    },
    {
      "epoch": 0.552,
      "grad_norm": 7.5625,
      "learning_rate": 2.6714285714285715e-05,
      "loss": 2.7949,
      "step": 690
    },
    {
      "epoch": 0.56,
      "grad_norm": 7.5625,
      "learning_rate": 2.623809523809524e-05,
      "loss": 2.7692,
      "step": 700
    },
    {
      "epoch": 0.568,
      "grad_norm": 8.25,
      "learning_rate": 2.5761904761904766e-05,
      "loss": 2.8251,
      "step": 710
    },
    {
      "epoch": 0.576,
      "grad_norm": 7.15625,
      "learning_rate": 2.5285714285714285e-05,
      "loss": 2.7056,
      "step": 720
    },
    {
      "epoch": 0.584,
      "grad_norm": 8.8125,
      "learning_rate": 2.480952380952381e-05,
      "loss": 2.8116,
      "step": 730
    },
    {
      "epoch": 0.592,
      "grad_norm": 8.125,
      "learning_rate": 2.4333333333333336e-05,
      "loss": 2.6844,
      "step": 740
    },
    {
      "epoch": 0.6,
      "grad_norm": 8.5625,
      "learning_rate": 2.385714285714286e-05,
      "loss": 2.7657,
      "step": 750
    },
    {
      "epoch": 0.608,
      "grad_norm": 8.1875,
      "learning_rate": 2.338095238095238e-05,
      "loss": 2.8522,
      "step": 760
    },
    {
      "epoch": 0.616,
      "grad_norm": 7.71875,
      "learning_rate": 2.2904761904761906e-05,
      "loss": 2.8368,
      "step": 770
    },
    {
      "epoch": 0.624,
      "grad_norm": 7.71875,
      "learning_rate": 2.242857142857143e-05,
      "loss": 2.6978,
      "step": 780
    },
    {
      "epoch": 0.632,
      "grad_norm": 7.25,
      "learning_rate": 2.195238095238095e-05,
      "loss": 2.8267,
      "step": 790
    },
    {
      "epoch": 0.64,
      "grad_norm": 8.1875,
      "learning_rate": 2.1476190476190477e-05,
      "loss": 2.7188,
      "step": 800
    },
    {
      "epoch": 0.648,
      "grad_norm": 7.78125,
      "learning_rate": 2.1e-05,
      "loss": 2.8553,
      "step": 810
    },
    {
      "epoch": 0.656,
      "grad_norm": 7.25,
      "learning_rate": 2.0523809523809524e-05,
      "loss": 2.675,
      "step": 820
    },
    {
      "epoch": 0.664,
      "grad_norm": 6.9375,
      "learning_rate": 2.004761904761905e-05,
      "loss": 2.7901,
      "step": 830
    },
    {
      "epoch": 0.672,
      "grad_norm": 8.4375,
      "learning_rate": 1.9571428571428572e-05,
      "loss": 2.6719,
      "step": 840
    },
    {
      "epoch": 0.68,
      "grad_norm": 7.8125,
      "learning_rate": 1.9095238095238098e-05,
      "loss": 2.6436,
      "step": 850
    },
    {
      "epoch": 0.688,
      "grad_norm": 7.71875,
      "learning_rate": 1.861904761904762e-05,
      "loss": 2.8045,
      "step": 860
    },
    {
      "epoch": 0.696,
      "grad_norm": 8.75,
      "learning_rate": 1.8142857142857146e-05,
      "loss": 2.802,
      "step": 870
    },
    {
      "epoch": 0.704,
      "grad_norm": 8.0,
      "learning_rate": 1.7666666666666668e-05,
      "loss": 2.807,
      "step": 880
    },
    {
      "epoch": 0.712,
      "grad_norm": 8.75,
      "learning_rate": 1.719047619047619e-05,
      "loss": 2.7425,
      "step": 890
    },
    {
      "epoch": 0.72,
      "grad_norm": 7.40625,
      "learning_rate": 1.6714285714285716e-05,
      "loss": 2.8515,
      "step": 900
    },
    {
      "epoch": 0.728,
      "grad_norm": 7.0625,
      "learning_rate": 1.6238095238095238e-05,
      "loss": 2.7694,
      "step": 910
    },
    {
      "epoch": 0.736,
      "grad_norm": 8.0,
      "learning_rate": 1.5761904761904764e-05,
      "loss": 2.6311,
      "step": 920
    },
    {
      "epoch": 0.744,
      "grad_norm": 6.6875,
      "learning_rate": 1.5285714285714286e-05,
      "loss": 2.6884,
      "step": 930
    },
    {
      "epoch": 0.752,
      "grad_norm": 6.9375,
      "learning_rate": 1.480952380952381e-05,
      "loss": 2.8331,
      "step": 940
    },
    {
      "epoch": 0.76,
      "grad_norm": 7.75,
      "learning_rate": 1.4333333333333334e-05,
      "loss": 2.7617,
      "step": 950
    },
    {
      "epoch": 0.768,
      "grad_norm": 7.1875,
      "learning_rate": 1.3857142857142858e-05,
      "loss": 2.6287,
      "step": 960
    },
    {
      "epoch": 0.776,
      "grad_norm": 7.71875,
      "learning_rate": 1.3380952380952383e-05,
      "loss": 2.8381,
      "step": 970
    },
    {
      "epoch": 0.784,
      "grad_norm": 7.625,
      "learning_rate": 1.2904761904761906e-05,
      "loss": 2.6136,
      "step": 980
    },
    {
      "epoch": 0.792,
      "grad_norm": 7.40625,
      "learning_rate": 1.242857142857143e-05,
      "loss": 2.7682,
      "step": 990
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.71875,
      "learning_rate": 1.1952380952380952e-05,
      "loss": 2.6878,
      "step": 1000
    },
    {
      "epoch": 0.808,
      "grad_norm": 8.0,
      "learning_rate": 1.1476190476190476e-05,
      "loss": 2.5709,
      "step": 1010
    },
    {
      "epoch": 0.816,
      "grad_norm": 8.0625,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 2.7941,
      "step": 1020
    },
    {
      "epoch": 0.824,
      "grad_norm": 8.1875,
      "learning_rate": 1.0523809523809525e-05,
      "loss": 2.6739,
      "step": 1030
    },
    {
      "epoch": 0.832,
      "grad_norm": 7.59375,
      "learning_rate": 1.004761904761905e-05,
      "loss": 2.6953,
      "step": 1040
    },
    {
      "epoch": 0.84,
      "grad_norm": 7.3125,
      "learning_rate": 9.571428571428572e-06,
      "loss": 2.6107,
      "step": 1050
    },
    {
      "epoch": 0.848,
      "grad_norm": 7.59375,
      "learning_rate": 9.095238095238095e-06,
      "loss": 2.7805,
      "step": 1060
    },
    {
      "epoch": 0.856,
      "grad_norm": 9.0625,
      "learning_rate": 8.61904761904762e-06,
      "loss": 2.7856,
      "step": 1070
    },
    {
      "epoch": 0.864,
      "grad_norm": 7.75,
      "learning_rate": 8.142857142857143e-06,
      "loss": 2.6762,
      "step": 1080
    },
    {
      "epoch": 0.872,
      "grad_norm": 7.625,
      "learning_rate": 7.666666666666667e-06,
      "loss": 2.7825,
      "step": 1090
    },
    {
      "epoch": 0.88,
      "grad_norm": 7.0,
      "learning_rate": 7.19047619047619e-06,
      "loss": 2.7893,
      "step": 1100
    },
    {
      "epoch": 0.888,
      "grad_norm": 6.9375,
      "learning_rate": 6.714285714285714e-06,
      "loss": 2.7058,
      "step": 1110
    },
    {
      "epoch": 0.896,
      "grad_norm": 7.03125,
      "learning_rate": 6.238095238095239e-06,
      "loss": 2.7899,
      "step": 1120
    },
    {
      "epoch": 0.904,
      "grad_norm": 6.9375,
      "learning_rate": 5.761904761904762e-06,
      "loss": 2.6251,
      "step": 1130
    },
    {
      "epoch": 0.912,
      "grad_norm": 6.84375,
      "learning_rate": 5.285714285714286e-06,
      "loss": 2.7011,
      "step": 1140
    },
    {
      "epoch": 0.92,
      "grad_norm": 6.78125,
      "learning_rate": 4.80952380952381e-06,
      "loss": 2.6375,
      "step": 1150
    },
    {
      "epoch": 0.928,
      "grad_norm": 8.5625,
      "learning_rate": 4.333333333333334e-06,
      "loss": 2.7994,
      "step": 1160
    },
    {
      "epoch": 0.936,
      "grad_norm": 6.59375,
      "learning_rate": 3.857142857142857e-06,
      "loss": 2.7145,
      "step": 1170
    },
    {
      "epoch": 0.944,
      "grad_norm": 7.0625,
      "learning_rate": 3.3809523809523814e-06,
      "loss": 2.6465,
      "step": 1180
    },
    {
      "epoch": 0.952,
      "grad_norm": 8.3125,
      "learning_rate": 2.904761904761905e-06,
      "loss": 2.7466,
      "step": 1190
    },
    {
      "epoch": 0.96,
      "grad_norm": 7.84375,
      "learning_rate": 2.428571428571429e-06,
      "loss": 2.76,
      "step": 1200
    },
    {
      "epoch": 0.968,
      "grad_norm": 7.78125,
      "learning_rate": 1.9523809523809523e-06,
      "loss": 2.7507,
      "step": 1210
    },
    {
      "epoch": 0.976,
      "grad_norm": 7.4375,
      "learning_rate": 1.4761904761904762e-06,
      "loss": 2.7197,
      "step": 1220
    },
    {
      "epoch": 0.984,
      "grad_norm": 7.96875,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 2.7261,
      "step": 1230
    },
    {
      "epoch": 0.992,
      "grad_norm": 5.90625,
      "learning_rate": 5.238095238095238e-07,
      "loss": 2.6812,
      "step": 1240
    },
    {
      "epoch": 1.0,
      "grad_norm": 8.125,
      "learning_rate": 4.761904761904762e-08,
      "loss": 2.6935,
      "step": 1250
    }
  ],
  "logging_steps": 10,
  "max_steps": 1250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.443927710242921e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}