{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 11.4375, "learning_rate": 0.0, "loss": 3.8345, "step": 1 }, { "epoch": 0.008, "grad_norm": 11.125, "learning_rate": 2.25e-06, "loss": 3.3934, "step": 10 }, { "epoch": 0.016, "grad_norm": 9.5, "learning_rate": 4.75e-06, "loss": 3.4002, "step": 20 }, { "epoch": 0.024, "grad_norm": 9.4375, "learning_rate": 7.25e-06, "loss": 3.4003, "step": 30 }, { "epoch": 0.032, "grad_norm": 11.1875, "learning_rate": 9.750000000000002e-06, "loss": 3.374, "step": 40 }, { "epoch": 0.04, "grad_norm": 10.625, "learning_rate": 1.225e-05, "loss": 3.4678, "step": 50 }, { "epoch": 0.048, "grad_norm": 11.0, "learning_rate": 1.475e-05, "loss": 3.4331, "step": 60 }, { "epoch": 0.056, "grad_norm": 10.6875, "learning_rate": 1.725e-05, "loss": 3.4722, "step": 70 }, { "epoch": 0.064, "grad_norm": 10.25, "learning_rate": 1.9750000000000002e-05, "loss": 3.2413, "step": 80 }, { "epoch": 0.072, "grad_norm": 10.8125, "learning_rate": 2.2250000000000002e-05, "loss": 3.1804, "step": 90 }, { "epoch": 0.08, "grad_norm": 10.1875, "learning_rate": 2.4750000000000002e-05, "loss": 3.1778, "step": 100 }, { "epoch": 0.088, "grad_norm": 8.4375, "learning_rate": 2.725e-05, "loss": 3.2479, "step": 110 }, { "epoch": 0.096, "grad_norm": 11.0, "learning_rate": 2.975e-05, "loss": 3.2259, "step": 120 }, { "epoch": 0.104, "grad_norm": 9.5625, "learning_rate": 3.2250000000000005e-05, "loss": 3.0845, "step": 130 }, { "epoch": 0.112, "grad_norm": 8.9375, "learning_rate": 3.475e-05, "loss": 3.1694, "step": 140 }, { "epoch": 0.12, "grad_norm": 8.4375, "learning_rate": 3.7250000000000004e-05, "loss": 3.1736, "step": 150 }, { "epoch": 0.128, "grad_norm": 9.5, "learning_rate": 3.9750000000000004e-05, "loss": 3.035, "step": 160 }, { "epoch": 0.136, "grad_norm": 9.8125, "learning_rate": 4.2250000000000004e-05, "loss": 3.1456, "step": 170 }, { "epoch": 0.144, "grad_norm": 9.25, "learning_rate": 4.4750000000000004e-05, "loss": 3.0917, "step": 180 }, { "epoch": 0.152, "grad_norm": 9.1875, "learning_rate": 4.7249999999999997e-05, "loss": 2.9948, "step": 190 }, { "epoch": 0.16, "grad_norm": 8.5625, "learning_rate": 4.975e-05, "loss": 3.0667, "step": 200 }, { "epoch": 0.168, "grad_norm": 8.125, "learning_rate": 4.957142857142857e-05, "loss": 2.9987, "step": 210 }, { "epoch": 0.176, "grad_norm": 7.875, "learning_rate": 4.90952380952381e-05, "loss": 2.9686, "step": 220 }, { "epoch": 0.184, "grad_norm": 9.1875, "learning_rate": 4.861904761904762e-05, "loss": 3.0708, "step": 230 }, { "epoch": 0.192, "grad_norm": 7.90625, "learning_rate": 4.8142857142857147e-05, "loss": 3.0676, "step": 240 }, { "epoch": 0.2, "grad_norm": 9.375, "learning_rate": 4.766666666666667e-05, "loss": 2.8833, "step": 250 }, { "epoch": 0.208, "grad_norm": 8.75, "learning_rate": 4.719047619047619e-05, "loss": 3.0558, "step": 260 }, { "epoch": 0.216, "grad_norm": 9.375, "learning_rate": 4.671428571428571e-05, "loss": 3.0003, "step": 270 }, { "epoch": 0.224, "grad_norm": 8.25, "learning_rate": 4.623809523809524e-05, "loss": 2.9514, "step": 280 }, { "epoch": 0.232, "grad_norm": 7.15625, "learning_rate": 4.5761904761904765e-05, "loss": 2.984, "step": 290 }, { "epoch": 0.24, "grad_norm": 8.625, "learning_rate": 4.528571428571429e-05, "loss": 3.0273, "step": 300 }, { "epoch": 0.248, "grad_norm": 7.75, "learning_rate": 4.480952380952381e-05, "loss": 2.9437, "step": 310 }, { "epoch": 0.256, "grad_norm": 9.1875, "learning_rate": 4.433333333333334e-05, "loss": 3.0913, "step": 320 }, { "epoch": 0.264, "grad_norm": 8.5625, "learning_rate": 4.385714285714286e-05, "loss": 2.9659, "step": 330 }, { "epoch": 0.272, "grad_norm": 8.0625, "learning_rate": 4.338095238095238e-05, "loss": 2.854, "step": 340 }, { "epoch": 0.28, "grad_norm": 7.0, "learning_rate": 4.290476190476191e-05, "loss": 2.8936, "step": 350 }, { "epoch": 0.288, "grad_norm": 8.125, "learning_rate": 4.242857142857143e-05, "loss": 2.9805, "step": 360 }, { "epoch": 0.296, "grad_norm": 8.5, "learning_rate": 4.1952380952380956e-05, "loss": 2.9814, "step": 370 }, { "epoch": 0.304, "grad_norm": 7.25, "learning_rate": 4.147619047619048e-05, "loss": 2.8731, "step": 380 }, { "epoch": 0.312, "grad_norm": 7.4375, "learning_rate": 4.1e-05, "loss": 2.8186, "step": 390 }, { "epoch": 0.32, "grad_norm": 8.0, "learning_rate": 4.052380952380952e-05, "loss": 3.048, "step": 400 }, { "epoch": 0.328, "grad_norm": 8.5, "learning_rate": 4.004761904761905e-05, "loss": 2.8923, "step": 410 }, { "epoch": 0.336, "grad_norm": 7.96875, "learning_rate": 3.9571428571428574e-05, "loss": 2.8443, "step": 420 }, { "epoch": 0.344, "grad_norm": 7.75, "learning_rate": 3.9095238095238096e-05, "loss": 3.0182, "step": 430 }, { "epoch": 0.352, "grad_norm": 7.625, "learning_rate": 3.861904761904762e-05, "loss": 2.849, "step": 440 }, { "epoch": 0.36, "grad_norm": 8.125, "learning_rate": 3.814285714285715e-05, "loss": 2.8413, "step": 450 }, { "epoch": 0.368, "grad_norm": 7.6875, "learning_rate": 3.766666666666667e-05, "loss": 2.9321, "step": 460 }, { "epoch": 0.376, "grad_norm": 7.15625, "learning_rate": 3.719047619047619e-05, "loss": 2.9628, "step": 470 }, { "epoch": 0.384, "grad_norm": 8.0625, "learning_rate": 3.671428571428572e-05, "loss": 3.0175, "step": 480 }, { "epoch": 0.392, "grad_norm": 7.375, "learning_rate": 3.6238095238095236e-05, "loss": 2.9446, "step": 490 }, { "epoch": 0.4, "grad_norm": 7.625, "learning_rate": 3.5761904761904765e-05, "loss": 2.8095, "step": 500 }, { "epoch": 0.408, "grad_norm": 9.25, "learning_rate": 3.528571428571429e-05, "loss": 2.9347, "step": 510 }, { "epoch": 0.416, "grad_norm": 7.875, "learning_rate": 3.480952380952381e-05, "loss": 2.7751, "step": 520 }, { "epoch": 0.424, "grad_norm": 8.5625, "learning_rate": 3.433333333333333e-05, "loss": 2.8434, "step": 530 }, { "epoch": 0.432, "grad_norm": 7.71875, "learning_rate": 3.385714285714286e-05, "loss": 2.869, "step": 540 }, { "epoch": 0.44, "grad_norm": 6.75, "learning_rate": 3.338095238095238e-05, "loss": 2.8466, "step": 550 }, { "epoch": 0.448, "grad_norm": 7.4375, "learning_rate": 3.2904761904761906e-05, "loss": 2.7286, "step": 560 }, { "epoch": 0.456, "grad_norm": 8.4375, "learning_rate": 3.242857142857143e-05, "loss": 2.9655, "step": 570 }, { "epoch": 0.464, "grad_norm": 7.3125, "learning_rate": 3.195238095238096e-05, "loss": 2.7955, "step": 580 }, { "epoch": 0.472, "grad_norm": 9.25, "learning_rate": 3.147619047619048e-05, "loss": 2.8469, "step": 590 }, { "epoch": 0.48, "grad_norm": 8.25, "learning_rate": 3.1e-05, "loss": 2.9423, "step": 600 }, { "epoch": 0.488, "grad_norm": 8.0625, "learning_rate": 3.0523809523809524e-05, "loss": 2.9848, "step": 610 }, { "epoch": 0.496, "grad_norm": 7.25, "learning_rate": 3.0047619047619046e-05, "loss": 2.8185, "step": 620 }, { "epoch": 0.504, "grad_norm": 8.3125, "learning_rate": 2.957142857142857e-05, "loss": 2.7036, "step": 630 }, { "epoch": 0.512, "grad_norm": 7.5625, "learning_rate": 2.9095238095238097e-05, "loss": 2.6809, "step": 640 }, { "epoch": 0.52, "grad_norm": 8.3125, "learning_rate": 2.8619047619047623e-05, "loss": 2.8011, "step": 650 }, { "epoch": 0.528, "grad_norm": 7.40625, "learning_rate": 2.814285714285714e-05, "loss": 2.7213, "step": 660 }, { "epoch": 0.536, "grad_norm": 7.15625, "learning_rate": 2.7666666666666667e-05, "loss": 2.8342, "step": 670 }, { "epoch": 0.544, "grad_norm": 6.625, "learning_rate": 2.7190476190476193e-05, "loss": 2.7829, "step": 680 }, { "epoch": 0.552, "grad_norm": 7.5625, "learning_rate": 2.6714285714285715e-05, "loss": 2.7949, "step": 690 }, { "epoch": 0.56, "grad_norm": 7.5625, "learning_rate": 2.623809523809524e-05, "loss": 2.7692, "step": 700 }, { "epoch": 0.568, "grad_norm": 8.25, "learning_rate": 2.5761904761904766e-05, "loss": 2.8251, "step": 710 }, { "epoch": 0.576, "grad_norm": 7.15625, "learning_rate": 2.5285714285714285e-05, "loss": 2.7056, "step": 720 }, { "epoch": 0.584, "grad_norm": 8.8125, "learning_rate": 2.480952380952381e-05, "loss": 2.8116, "step": 730 }, { "epoch": 0.592, "grad_norm": 8.125, "learning_rate": 2.4333333333333336e-05, "loss": 2.6844, "step": 740 }, { "epoch": 0.6, "grad_norm": 8.5625, "learning_rate": 2.385714285714286e-05, "loss": 2.7657, "step": 750 }, { "epoch": 0.608, "grad_norm": 8.1875, "learning_rate": 2.338095238095238e-05, "loss": 2.8522, "step": 760 }, { "epoch": 0.616, "grad_norm": 7.71875, "learning_rate": 2.2904761904761906e-05, "loss": 2.8368, "step": 770 }, { "epoch": 0.624, "grad_norm": 7.71875, "learning_rate": 2.242857142857143e-05, "loss": 2.6978, "step": 780 }, { "epoch": 0.632, "grad_norm": 7.25, "learning_rate": 2.195238095238095e-05, "loss": 2.8267, "step": 790 }, { "epoch": 0.64, "grad_norm": 8.1875, "learning_rate": 2.1476190476190477e-05, "loss": 2.7188, "step": 800 }, { "epoch": 0.648, "grad_norm": 7.78125, "learning_rate": 2.1e-05, "loss": 2.8553, "step": 810 }, { "epoch": 0.656, "grad_norm": 7.25, "learning_rate": 2.0523809523809524e-05, "loss": 2.675, "step": 820 }, { "epoch": 0.664, "grad_norm": 6.9375, "learning_rate": 2.004761904761905e-05, "loss": 2.7901, "step": 830 }, { "epoch": 0.672, "grad_norm": 8.4375, "learning_rate": 1.9571428571428572e-05, "loss": 2.6719, "step": 840 }, { "epoch": 0.68, "grad_norm": 7.8125, "learning_rate": 1.9095238095238098e-05, "loss": 2.6436, "step": 850 }, { "epoch": 0.688, "grad_norm": 7.71875, "learning_rate": 1.861904761904762e-05, "loss": 2.8045, "step": 860 }, { "epoch": 0.696, "grad_norm": 8.75, "learning_rate": 1.8142857142857146e-05, "loss": 2.802, "step": 870 }, { "epoch": 0.704, "grad_norm": 8.0, "learning_rate": 1.7666666666666668e-05, "loss": 2.807, "step": 880 }, { "epoch": 0.712, "grad_norm": 8.75, "learning_rate": 1.719047619047619e-05, "loss": 2.7425, "step": 890 }, { "epoch": 0.72, "grad_norm": 7.40625, "learning_rate": 1.6714285714285716e-05, "loss": 2.8515, "step": 900 }, { "epoch": 0.728, "grad_norm": 7.0625, "learning_rate": 1.6238095238095238e-05, "loss": 2.7694, "step": 910 }, { "epoch": 0.736, "grad_norm": 8.0, "learning_rate": 1.5761904761904764e-05, "loss": 2.6311, "step": 920 }, { "epoch": 0.744, "grad_norm": 6.6875, "learning_rate": 1.5285714285714286e-05, "loss": 2.6884, "step": 930 }, { "epoch": 0.752, "grad_norm": 6.9375, "learning_rate": 1.480952380952381e-05, "loss": 2.8331, "step": 940 }, { "epoch": 0.76, "grad_norm": 7.75, "learning_rate": 1.4333333333333334e-05, "loss": 2.7617, "step": 950 }, { "epoch": 0.768, "grad_norm": 7.1875, "learning_rate": 1.3857142857142858e-05, "loss": 2.6287, "step": 960 }, { "epoch": 0.776, "grad_norm": 7.71875, "learning_rate": 1.3380952380952383e-05, "loss": 2.8381, "step": 970 }, { "epoch": 0.784, "grad_norm": 7.625, "learning_rate": 1.2904761904761906e-05, "loss": 2.6136, "step": 980 }, { "epoch": 0.792, "grad_norm": 7.40625, "learning_rate": 1.242857142857143e-05, "loss": 2.7682, "step": 990 }, { "epoch": 0.8, "grad_norm": 7.71875, "learning_rate": 1.1952380952380952e-05, "loss": 2.6878, "step": 1000 }, { "epoch": 0.808, "grad_norm": 8.0, "learning_rate": 1.1476190476190476e-05, "loss": 2.5709, "step": 1010 }, { "epoch": 0.816, "grad_norm": 8.0625, "learning_rate": 1.1000000000000001e-05, "loss": 2.7941, "step": 1020 }, { "epoch": 0.824, "grad_norm": 8.1875, "learning_rate": 1.0523809523809525e-05, "loss": 2.6739, "step": 1030 }, { "epoch": 0.832, "grad_norm": 7.59375, "learning_rate": 1.004761904761905e-05, "loss": 2.6953, "step": 1040 }, { "epoch": 0.84, "grad_norm": 7.3125, "learning_rate": 9.571428571428572e-06, "loss": 2.6107, "step": 1050 }, { "epoch": 0.848, "grad_norm": 7.59375, "learning_rate": 9.095238095238095e-06, "loss": 2.7805, "step": 1060 }, { "epoch": 0.856, "grad_norm": 9.0625, "learning_rate": 8.61904761904762e-06, "loss": 2.7856, "step": 1070 }, { "epoch": 0.864, "grad_norm": 7.75, "learning_rate": 8.142857142857143e-06, "loss": 2.6762, "step": 1080 }, { "epoch": 0.872, "grad_norm": 7.625, "learning_rate": 7.666666666666667e-06, "loss": 2.7825, "step": 1090 }, { "epoch": 0.88, "grad_norm": 7.0, "learning_rate": 7.19047619047619e-06, "loss": 2.7893, "step": 1100 }, { "epoch": 0.888, "grad_norm": 6.9375, "learning_rate": 6.714285714285714e-06, "loss": 2.7058, "step": 1110 }, { "epoch": 0.896, "grad_norm": 7.03125, "learning_rate": 6.238095238095239e-06, "loss": 2.7899, "step": 1120 }, { "epoch": 0.904, "grad_norm": 6.9375, "learning_rate": 5.761904761904762e-06, "loss": 2.6251, "step": 1130 }, { "epoch": 0.912, "grad_norm": 6.84375, "learning_rate": 5.285714285714286e-06, "loss": 2.7011, "step": 1140 }, { "epoch": 0.92, "grad_norm": 6.78125, "learning_rate": 4.80952380952381e-06, "loss": 2.6375, "step": 1150 }, { "epoch": 0.928, "grad_norm": 8.5625, "learning_rate": 4.333333333333334e-06, "loss": 2.7994, "step": 1160 }, { "epoch": 0.936, "grad_norm": 6.59375, "learning_rate": 3.857142857142857e-06, "loss": 2.7145, "step": 1170 }, { "epoch": 0.944, "grad_norm": 7.0625, "learning_rate": 3.3809523809523814e-06, "loss": 2.6465, "step": 1180 }, { "epoch": 0.952, "grad_norm": 8.3125, "learning_rate": 2.904761904761905e-06, "loss": 2.7466, "step": 1190 }, { "epoch": 0.96, "grad_norm": 7.84375, "learning_rate": 2.428571428571429e-06, "loss": 2.76, "step": 1200 }, { "epoch": 0.968, "grad_norm": 7.78125, "learning_rate": 1.9523809523809523e-06, "loss": 2.7507, "step": 1210 }, { "epoch": 0.976, "grad_norm": 7.4375, "learning_rate": 1.4761904761904762e-06, "loss": 2.7197, "step": 1220 }, { "epoch": 0.984, "grad_norm": 7.96875, "learning_rate": 1.0000000000000002e-06, "loss": 2.7261, "step": 1230 }, { "epoch": 0.992, "grad_norm": 5.90625, "learning_rate": 5.238095238095238e-07, "loss": 2.6812, "step": 1240 }, { "epoch": 1.0, "grad_norm": 8.125, "learning_rate": 4.761904761904762e-08, "loss": 2.6935, "step": 1250 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.443927710242921e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }