{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.9840000000000004e-05, "loss": 4.7991, "step": 10 }, { "epoch": 0.01, "learning_rate": 4.9640000000000006e-05, "loss": 4.4063, "step": 20 }, { "epoch": 0.01, "learning_rate": 4.944e-05, "loss": 3.8331, "step": 30 }, { "epoch": 0.02, "learning_rate": 4.924e-05, "loss": 2.8485, "step": 40 }, { "epoch": 0.02, "learning_rate": 4.9040000000000005e-05, "loss": 2.5842, "step": 50 }, { "epoch": 0.02, "learning_rate": 4.884e-05, "loss": 2.3857, "step": 60 }, { "epoch": 0.03, "learning_rate": 4.864e-05, "loss": 2.2079, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.8440000000000004e-05, "loss": 2.2818, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.824e-05, "loss": 2.1391, "step": 90 }, { "epoch": 0.04, "learning_rate": 4.804e-05, "loss": 2.1858, "step": 100 }, { "epoch": 0.04, "learning_rate": 4.784e-05, "loss": 2.1695, "step": 110 }, { "epoch": 0.05, "learning_rate": 4.7640000000000005e-05, "loss": 2.0576, "step": 120 }, { "epoch": 0.05, "learning_rate": 4.744e-05, "loss": 2.0989, "step": 130 }, { "epoch": 0.06, "learning_rate": 4.724e-05, "loss": 2.0584, "step": 140 }, { "epoch": 0.06, "learning_rate": 4.7040000000000004e-05, "loss": 2.0516, "step": 150 }, { "epoch": 0.06, "learning_rate": 4.684e-05, "loss": 2.0031, "step": 160 }, { "epoch": 0.07, "learning_rate": 4.664e-05, "loss": 2.0133, "step": 170 }, { "epoch": 0.07, "learning_rate": 4.644e-05, "loss": 2.027, "step": 180 }, { "epoch": 0.08, "learning_rate": 4.624e-05, "loss": 2.0238, "step": 190 }, { "epoch": 0.08, "learning_rate": 4.606e-05, "loss": 2.0607, "step": 200 }, { "epoch": 0.08, "learning_rate": 4.5860000000000005e-05, "loss": 2.1364, "step": 210 }, { "epoch": 0.09, "learning_rate": 4.566e-05, "loss": 2.0829, "step": 220 }, { "epoch": 0.09, "learning_rate": 4.546e-05, "loss": 2.2079, "step": 230 }, { "epoch": 0.1, "learning_rate": 4.5260000000000004e-05, "loss": 1.9552, "step": 240 }, { "epoch": 0.1, "learning_rate": 4.506e-05, "loss": 2.0141, "step": 250 }, { "epoch": 0.1, "learning_rate": 4.486e-05, "loss": 1.9904, "step": 260 }, { "epoch": 0.11, "learning_rate": 4.466e-05, "loss": 1.9627, "step": 270 }, { "epoch": 0.11, "learning_rate": 4.4460000000000005e-05, "loss": 2.0108, "step": 280 }, { "epoch": 0.12, "learning_rate": 4.426e-05, "loss": 1.9851, "step": 290 }, { "epoch": 0.12, "learning_rate": 4.406e-05, "loss": 2.1354, "step": 300 }, { "epoch": 0.12, "learning_rate": 4.3860000000000004e-05, "loss": 2.0532, "step": 310 }, { "epoch": 0.13, "learning_rate": 4.366e-05, "loss": 2.0449, "step": 320 }, { "epoch": 0.13, "learning_rate": 4.346e-05, "loss": 1.9925, "step": 330 }, { "epoch": 0.14, "learning_rate": 4.326e-05, "loss": 2.0868, "step": 340 }, { "epoch": 0.14, "learning_rate": 4.306e-05, "loss": 2.0281, "step": 350 }, { "epoch": 0.14, "learning_rate": 4.286e-05, "loss": 1.9509, "step": 360 }, { "epoch": 0.15, "learning_rate": 4.266e-05, "loss": 2.0431, "step": 370 }, { "epoch": 0.15, "learning_rate": 4.246e-05, "loss": 1.917, "step": 380 }, { "epoch": 0.16, "learning_rate": 4.226e-05, "loss": 2.0301, "step": 390 }, { "epoch": 0.16, "learning_rate": 4.206e-05, "loss": 1.8611, "step": 400 }, { "epoch": 0.16, "learning_rate": 4.186e-05, "loss": 1.8227, "step": 410 }, { "epoch": 0.17, "learning_rate": 4.1660000000000004e-05, "loss": 1.9631, "step": 420 }, { "epoch": 0.17, "learning_rate": 4.1460000000000006e-05, "loss": 1.954, "step": 430 }, { "epoch": 0.18, "learning_rate": 4.126e-05, "loss": 1.9286, "step": 440 }, { "epoch": 0.18, "learning_rate": 4.106e-05, "loss": 2.0062, "step": 450 }, { "epoch": 0.18, "learning_rate": 4.0860000000000005e-05, "loss": 2.0218, "step": 460 }, { "epoch": 0.19, "learning_rate": 4.066e-05, "loss": 2.0331, "step": 470 }, { "epoch": 0.19, "learning_rate": 4.046e-05, "loss": 1.8572, "step": 480 }, { "epoch": 0.2, "learning_rate": 4.0260000000000004e-05, "loss": 2.0023, "step": 490 }, { "epoch": 0.2, "learning_rate": 4.0060000000000006e-05, "loss": 2.1018, "step": 500 }, { "epoch": 0.2, "learning_rate": 3.986e-05, "loss": 1.8793, "step": 510 }, { "epoch": 0.21, "learning_rate": 3.966e-05, "loss": 1.9712, "step": 520 }, { "epoch": 0.21, "learning_rate": 3.9460000000000005e-05, "loss": 1.9055, "step": 530 }, { "epoch": 0.22, "learning_rate": 3.926e-05, "loss": 1.9317, "step": 540 }, { "epoch": 0.22, "learning_rate": 3.906e-05, "loss": 2.0175, "step": 550 }, { "epoch": 0.22, "learning_rate": 3.8860000000000004e-05, "loss": 1.9598, "step": 560 }, { "epoch": 0.23, "learning_rate": 3.866e-05, "loss": 1.9828, "step": 570 }, { "epoch": 0.23, "learning_rate": 3.846e-05, "loss": 1.9153, "step": 580 }, { "epoch": 0.24, "learning_rate": 3.826e-05, "loss": 1.8578, "step": 590 }, { "epoch": 0.24, "learning_rate": 3.806e-05, "loss": 1.9993, "step": 600 }, { "epoch": 0.24, "learning_rate": 3.786e-05, "loss": 1.8577, "step": 610 }, { "epoch": 0.25, "learning_rate": 3.766e-05, "loss": 1.9251, "step": 620 }, { "epoch": 0.25, "learning_rate": 3.7460000000000004e-05, "loss": 1.8683, "step": 630 }, { "epoch": 0.26, "learning_rate": 3.726e-05, "loss": 1.9517, "step": 640 }, { "epoch": 0.26, "learning_rate": 3.706e-05, "loss": 1.8613, "step": 650 }, { "epoch": 0.26, "learning_rate": 3.686e-05, "loss": 1.8133, "step": 660 }, { "epoch": 0.27, "learning_rate": 3.666e-05, "loss": 1.8165, "step": 670 }, { "epoch": 0.27, "learning_rate": 3.646e-05, "loss": 1.7916, "step": 680 }, { "epoch": 0.28, "learning_rate": 3.626e-05, "loss": 1.7016, "step": 690 }, { "epoch": 0.28, "learning_rate": 3.606e-05, "loss": 1.7513, "step": 700 }, { "epoch": 0.28, "learning_rate": 3.586e-05, "loss": 1.7839, "step": 710 }, { "epoch": 0.29, "learning_rate": 3.566e-05, "loss": 1.8013, "step": 720 }, { "epoch": 0.29, "learning_rate": 3.546e-05, "loss": 1.7298, "step": 730 }, { "epoch": 0.3, "learning_rate": 3.5260000000000005e-05, "loss": 1.7424, "step": 740 }, { "epoch": 0.3, "learning_rate": 3.5060000000000007e-05, "loss": 1.6861, "step": 750 }, { "epoch": 0.3, "learning_rate": 3.486e-05, "loss": 1.7022, "step": 760 }, { "epoch": 0.31, "learning_rate": 3.4660000000000004e-05, "loss": 1.7965, "step": 770 }, { "epoch": 0.31, "learning_rate": 3.4460000000000005e-05, "loss": 1.7315, "step": 780 }, { "epoch": 0.32, "learning_rate": 3.426e-05, "loss": 1.7057, "step": 790 }, { "epoch": 0.32, "learning_rate": 3.406e-05, "loss": 1.8009, "step": 800 }, { "epoch": 0.32, "learning_rate": 3.3860000000000004e-05, "loss": 1.7931, "step": 810 }, { "epoch": 0.33, "learning_rate": 3.366e-05, "loss": 1.664, "step": 820 }, { "epoch": 0.33, "learning_rate": 3.346e-05, "loss": 1.7098, "step": 830 }, { "epoch": 0.34, "learning_rate": 3.3260000000000003e-05, "loss": 1.7413, "step": 840 }, { "epoch": 0.34, "learning_rate": 3.3060000000000005e-05, "loss": 1.7604, "step": 850 }, { "epoch": 0.34, "learning_rate": 3.286e-05, "loss": 1.7232, "step": 860 }, { "epoch": 0.35, "learning_rate": 3.266e-05, "loss": 1.7531, "step": 870 }, { "epoch": 0.35, "learning_rate": 3.2460000000000004e-05, "loss": 1.7535, "step": 880 }, { "epoch": 0.36, "learning_rate": 3.226e-05, "loss": 1.7284, "step": 890 }, { "epoch": 0.36, "learning_rate": 3.206e-05, "loss": 1.7321, "step": 900 }, { "epoch": 0.36, "learning_rate": 3.186e-05, "loss": 1.7222, "step": 910 }, { "epoch": 0.37, "learning_rate": 3.166e-05, "loss": 1.7226, "step": 920 }, { "epoch": 0.37, "learning_rate": 3.146e-05, "loss": 1.7156, "step": 930 }, { "epoch": 0.38, "learning_rate": 3.126e-05, "loss": 1.7291, "step": 940 }, { "epoch": 0.38, "learning_rate": 3.106e-05, "loss": 1.7604, "step": 950 }, { "epoch": 0.38, "learning_rate": 3.086e-05, "loss": 1.6752, "step": 960 }, { "epoch": 0.39, "learning_rate": 3.066e-05, "loss": 1.6983, "step": 970 }, { "epoch": 0.39, "learning_rate": 3.046e-05, "loss": 1.6599, "step": 980 }, { "epoch": 0.4, "learning_rate": 3.0259999999999998e-05, "loss": 1.7848, "step": 990 }, { "epoch": 0.4, "learning_rate": 3.006e-05, "loss": 1.7583, "step": 1000 }, { "epoch": 0.4, "learning_rate": 2.986e-05, "loss": 1.6887, "step": 1010 }, { "epoch": 0.41, "learning_rate": 2.9659999999999997e-05, "loss": 1.6829, "step": 1020 }, { "epoch": 0.41, "learning_rate": 2.946e-05, "loss": 1.5917, "step": 1030 }, { "epoch": 0.42, "learning_rate": 2.9260000000000004e-05, "loss": 1.6951, "step": 1040 }, { "epoch": 0.42, "learning_rate": 2.9060000000000003e-05, "loss": 1.7579, "step": 1050 }, { "epoch": 0.42, "learning_rate": 2.8860000000000005e-05, "loss": 1.7073, "step": 1060 }, { "epoch": 0.43, "learning_rate": 2.8660000000000003e-05, "loss": 1.6211, "step": 1070 }, { "epoch": 0.43, "learning_rate": 2.8460000000000002e-05, "loss": 1.6521, "step": 1080 }, { "epoch": 0.44, "learning_rate": 2.8260000000000004e-05, "loss": 1.758, "step": 1090 }, { "epoch": 0.44, "learning_rate": 2.8060000000000002e-05, "loss": 1.7321, "step": 1100 }, { "epoch": 0.44, "learning_rate": 2.7860000000000004e-05, "loss": 1.7187, "step": 1110 }, { "epoch": 0.45, "learning_rate": 2.7660000000000003e-05, "loss": 1.7101, "step": 1120 }, { "epoch": 0.45, "learning_rate": 2.746e-05, "loss": 1.6314, "step": 1130 }, { "epoch": 0.46, "learning_rate": 2.7260000000000003e-05, "loss": 1.794, "step": 1140 }, { "epoch": 0.46, "learning_rate": 2.7060000000000002e-05, "loss": 1.7144, "step": 1150 }, { "epoch": 0.46, "learning_rate": 2.686e-05, "loss": 1.7711, "step": 1160 }, { "epoch": 0.47, "learning_rate": 2.6660000000000002e-05, "loss": 1.7436, "step": 1170 }, { "epoch": 0.47, "learning_rate": 2.646e-05, "loss": 1.6836, "step": 1180 }, { "epoch": 0.48, "learning_rate": 2.6260000000000003e-05, "loss": 1.7399, "step": 1190 }, { "epoch": 0.48, "learning_rate": 2.606e-05, "loss": 1.7628, "step": 1200 }, { "epoch": 0.48, "learning_rate": 2.586e-05, "loss": 1.7708, "step": 1210 }, { "epoch": 0.49, "learning_rate": 2.566e-05, "loss": 1.7114, "step": 1220 }, { "epoch": 0.49, "learning_rate": 2.546e-05, "loss": 1.5984, "step": 1230 }, { "epoch": 0.5, "learning_rate": 2.526e-05, "loss": 1.7057, "step": 1240 }, { "epoch": 0.5, "learning_rate": 2.506e-05, "loss": 1.7779, "step": 1250 }, { "epoch": 0.5, "learning_rate": 2.486e-05, "loss": 1.6984, "step": 1260 }, { "epoch": 0.51, "learning_rate": 2.466e-05, "loss": 1.7017, "step": 1270 }, { "epoch": 0.51, "learning_rate": 2.4460000000000003e-05, "loss": 1.7408, "step": 1280 }, { "epoch": 0.52, "learning_rate": 2.426e-05, "loss": 1.7441, "step": 1290 }, { "epoch": 0.52, "learning_rate": 2.4060000000000003e-05, "loss": 1.6177, "step": 1300 }, { "epoch": 0.52, "learning_rate": 2.3860000000000002e-05, "loss": 1.7621, "step": 1310 }, { "epoch": 0.53, "learning_rate": 2.366e-05, "loss": 1.6717, "step": 1320 }, { "epoch": 0.53, "learning_rate": 2.3460000000000002e-05, "loss": 1.6861, "step": 1330 }, { "epoch": 0.54, "learning_rate": 2.326e-05, "loss": 1.707, "step": 1340 }, { "epoch": 0.54, "learning_rate": 2.306e-05, "loss": 1.7853, "step": 1350 }, { "epoch": 0.54, "learning_rate": 2.286e-05, "loss": 1.744, "step": 1360 }, { "epoch": 0.55, "learning_rate": 2.266e-05, "loss": 1.7309, "step": 1370 }, { "epoch": 0.55, "learning_rate": 2.2460000000000002e-05, "loss": 1.6741, "step": 1380 }, { "epoch": 0.56, "learning_rate": 2.226e-05, "loss": 1.6597, "step": 1390 }, { "epoch": 0.56, "learning_rate": 2.206e-05, "loss": 1.7398, "step": 1400 }, { "epoch": 0.56, "learning_rate": 2.186e-05, "loss": 1.7266, "step": 1410 }, { "epoch": 0.57, "learning_rate": 2.166e-05, "loss": 1.667, "step": 1420 }, { "epoch": 0.57, "learning_rate": 2.146e-05, "loss": 1.7571, "step": 1430 }, { "epoch": 0.58, "learning_rate": 2.1260000000000003e-05, "loss": 1.7637, "step": 1440 }, { "epoch": 0.58, "learning_rate": 2.106e-05, "loss": 1.6878, "step": 1450 }, { "epoch": 0.58, "learning_rate": 2.086e-05, "loss": 1.694, "step": 1460 }, { "epoch": 0.59, "learning_rate": 2.0660000000000002e-05, "loss": 1.6909, "step": 1470 }, { "epoch": 0.59, "learning_rate": 2.046e-05, "loss": 1.7218, "step": 1480 }, { "epoch": 0.6, "learning_rate": 2.0260000000000003e-05, "loss": 1.6519, "step": 1490 }, { "epoch": 0.6, "learning_rate": 2.006e-05, "loss": 1.5711, "step": 1500 }, { "epoch": 0.6, "learning_rate": 1.986e-05, "loss": 1.7072, "step": 1510 }, { "epoch": 0.61, "learning_rate": 1.966e-05, "loss": 1.6743, "step": 1520 }, { "epoch": 0.61, "learning_rate": 1.946e-05, "loss": 1.8284, "step": 1530 }, { "epoch": 0.62, "learning_rate": 1.9260000000000002e-05, "loss": 1.7596, "step": 1540 }, { "epoch": 0.62, "learning_rate": 1.906e-05, "loss": 1.6365, "step": 1550 }, { "epoch": 0.62, "learning_rate": 1.886e-05, "loss": 1.6961, "step": 1560 }, { "epoch": 0.63, "learning_rate": 1.866e-05, "loss": 1.7091, "step": 1570 }, { "epoch": 0.63, "learning_rate": 1.846e-05, "loss": 1.6835, "step": 1580 }, { "epoch": 0.64, "learning_rate": 1.826e-05, "loss": 1.7597, "step": 1590 }, { "epoch": 0.64, "learning_rate": 1.8060000000000003e-05, "loss": 1.7119, "step": 1600 }, { "epoch": 0.64, "learning_rate": 1.7860000000000002e-05, "loss": 1.6979, "step": 1610 }, { "epoch": 0.65, "learning_rate": 1.766e-05, "loss": 1.6685, "step": 1620 }, { "epoch": 0.65, "learning_rate": 1.7460000000000002e-05, "loss": 1.6231, "step": 1630 }, { "epoch": 0.66, "learning_rate": 1.726e-05, "loss": 1.6583, "step": 1640 }, { "epoch": 0.66, "learning_rate": 1.706e-05, "loss": 1.7673, "step": 1650 }, { "epoch": 0.66, "learning_rate": 1.686e-05, "loss": 1.7094, "step": 1660 }, { "epoch": 0.67, "learning_rate": 1.666e-05, "loss": 1.7663, "step": 1670 }, { "epoch": 0.67, "learning_rate": 1.646e-05, "loss": 1.6688, "step": 1680 }, { "epoch": 0.68, "learning_rate": 1.626e-05, "loss": 1.7058, "step": 1690 }, { "epoch": 0.68, "learning_rate": 1.606e-05, "loss": 1.705, "step": 1700 }, { "epoch": 0.68, "learning_rate": 1.586e-05, "loss": 1.7562, "step": 1710 }, { "epoch": 0.69, "learning_rate": 1.566e-05, "loss": 1.7273, "step": 1720 }, { "epoch": 0.69, "learning_rate": 1.546e-05, "loss": 1.7069, "step": 1730 }, { "epoch": 0.7, "learning_rate": 1.5260000000000003e-05, "loss": 1.6405, "step": 1740 }, { "epoch": 0.7, "learning_rate": 1.5060000000000001e-05, "loss": 1.6771, "step": 1750 }, { "epoch": 0.7, "learning_rate": 1.4860000000000002e-05, "loss": 1.7157, "step": 1760 }, { "epoch": 0.71, "learning_rate": 1.4660000000000002e-05, "loss": 1.6899, "step": 1770 }, { "epoch": 0.71, "learning_rate": 1.4460000000000002e-05, "loss": 1.7271, "step": 1780 }, { "epoch": 0.72, "learning_rate": 1.426e-05, "loss": 1.7838, "step": 1790 }, { "epoch": 0.72, "learning_rate": 1.4060000000000001e-05, "loss": 1.802, "step": 1800 }, { "epoch": 0.72, "learning_rate": 1.3860000000000001e-05, "loss": 1.7144, "step": 1810 }, { "epoch": 0.73, "learning_rate": 1.3660000000000001e-05, "loss": 1.7215, "step": 1820 }, { "epoch": 0.73, "learning_rate": 1.346e-05, "loss": 1.6961, "step": 1830 }, { "epoch": 0.74, "learning_rate": 1.326e-05, "loss": 1.6575, "step": 1840 }, { "epoch": 0.74, "learning_rate": 1.306e-05, "loss": 1.697, "step": 1850 }, { "epoch": 0.74, "learning_rate": 1.286e-05, "loss": 1.6385, "step": 1860 }, { "epoch": 0.75, "learning_rate": 1.2659999999999999e-05, "loss": 1.7265, "step": 1870 }, { "epoch": 0.75, "learning_rate": 1.2460000000000001e-05, "loss": 1.7385, "step": 1880 }, { "epoch": 0.76, "learning_rate": 1.2260000000000001e-05, "loss": 1.6288, "step": 1890 }, { "epoch": 0.76, "learning_rate": 1.206e-05, "loss": 1.7478, "step": 1900 }, { "epoch": 0.76, "learning_rate": 1.186e-05, "loss": 1.6661, "step": 1910 }, { "epoch": 0.77, "learning_rate": 1.166e-05, "loss": 1.6749, "step": 1920 }, { "epoch": 0.77, "learning_rate": 1.146e-05, "loss": 1.6919, "step": 1930 }, { "epoch": 0.78, "learning_rate": 1.126e-05, "loss": 1.7477, "step": 1940 }, { "epoch": 0.78, "learning_rate": 1.106e-05, "loss": 1.7278, "step": 1950 }, { "epoch": 0.78, "learning_rate": 1.0860000000000001e-05, "loss": 1.7192, "step": 1960 }, { "epoch": 0.79, "learning_rate": 1.0660000000000001e-05, "loss": 1.6794, "step": 1970 }, { "epoch": 0.79, "learning_rate": 1.046e-05, "loss": 1.7545, "step": 1980 }, { "epoch": 0.8, "learning_rate": 1.026e-05, "loss": 1.7377, "step": 1990 }, { "epoch": 0.8, "learning_rate": 1.006e-05, "loss": 1.663, "step": 2000 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 3.632950229815296e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }