{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.695652173913043, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 5.000000000000001e-07, "loss": 0.7601, "step": 5 }, { "epoch": 0.09, "learning_rate": 1.0000000000000002e-06, "loss": 0.7662, "step": 10 }, { "epoch": 0.13, "learning_rate": 1.5e-06, "loss": 0.7385, "step": 15 }, { "epoch": 0.17, "learning_rate": 1.9000000000000002e-06, "loss": 0.707, "step": 20 }, { "epoch": 0.22, "learning_rate": 2.4000000000000003e-06, "loss": 0.6551, "step": 25 }, { "epoch": 0.26, "learning_rate": 2.8000000000000003e-06, "loss": 0.5934, "step": 30 }, { "epoch": 0.3, "learning_rate": 3.3000000000000006e-06, "loss": 0.5183, "step": 35 }, { "epoch": 0.35, "learning_rate": 3.8000000000000005e-06, "loss": 0.487, "step": 40 }, { "epoch": 0.39, "learning_rate": 4.3e-06, "loss": 0.4695, "step": 45 }, { "epoch": 0.43, "learning_rate": 4.800000000000001e-06, "loss": 0.4183, "step": 50 }, { "epoch": 0.48, "learning_rate": 5.300000000000001e-06, "loss": 0.4144, "step": 55 }, { "epoch": 0.52, "learning_rate": 5.8e-06, "loss": 0.3759, "step": 60 }, { "epoch": 0.57, "learning_rate": 6.300000000000001e-06, "loss": 0.3933, "step": 65 }, { "epoch": 0.61, "learning_rate": 6.800000000000001e-06, "loss": 0.406, "step": 70 }, { "epoch": 0.65, "learning_rate": 7.3e-06, "loss": 0.3771, "step": 75 }, { "epoch": 0.7, "learning_rate": 7.800000000000002e-06, "loss": 0.3719, "step": 80 }, { "epoch": 0.74, "learning_rate": 8.3e-06, "loss": 0.366, "step": 85 }, { "epoch": 0.78, "learning_rate": 8.8e-06, "loss": 0.3465, "step": 90 }, { "epoch": 0.83, "learning_rate": 9.3e-06, "loss": 0.3979, "step": 95 }, { "epoch": 0.87, "learning_rate": 9.800000000000001e-06, "loss": 0.3745, "step": 100 }, { "epoch": 0.91, "learning_rate": 9.966666666666667e-06, "loss": 0.388, "step": 105 }, { "epoch": 0.96, "learning_rate": 9.911111111111113e-06, "loss": 0.4033, "step": 110 }, { "epoch": 1.0, "learning_rate": 9.855555555555555e-06, "loss": 0.3652, "step": 115 }, { "epoch": 1.04, "learning_rate": 9.800000000000001e-06, "loss": 0.3252, "step": 120 }, { "epoch": 1.09, "learning_rate": 9.744444444444445e-06, "loss": 0.357, "step": 125 }, { "epoch": 1.13, "learning_rate": 9.688888888888889e-06, "loss": 0.3843, "step": 130 }, { "epoch": 1.17, "learning_rate": 9.633333333333335e-06, "loss": 0.3602, "step": 135 }, { "epoch": 1.22, "learning_rate": 9.577777777777779e-06, "loss": 0.2971, "step": 140 }, { "epoch": 1.26, "learning_rate": 9.522222222222223e-06, "loss": 0.3397, "step": 145 }, { "epoch": 1.3, "learning_rate": 9.466666666666667e-06, "loss": 0.3062, "step": 150 }, { "epoch": 1.35, "learning_rate": 9.411111111111113e-06, "loss": 0.327, "step": 155 }, { "epoch": 1.39, "learning_rate": 9.355555555555557e-06, "loss": 0.3006, "step": 160 }, { "epoch": 1.43, "learning_rate": 9.3e-06, "loss": 0.304, "step": 165 }, { "epoch": 1.48, "learning_rate": 9.244444444444445e-06, "loss": 0.2943, "step": 170 }, { "epoch": 1.52, "learning_rate": 9.188888888888889e-06, "loss": 0.252, "step": 175 }, { "epoch": 1.57, "learning_rate": 9.133333333333335e-06, "loss": 0.3139, "step": 180 }, { "epoch": 1.61, "learning_rate": 9.077777777777779e-06, "loss": 0.2665, "step": 185 }, { "epoch": 1.65, "learning_rate": 9.022222222222223e-06, "loss": 0.302, "step": 190 }, { "epoch": 1.7, "learning_rate": 8.966666666666667e-06, "loss": 0.2248, "step": 195 }, { "epoch": 1.74, "learning_rate": 8.91111111111111e-06, "loss": 0.2442, "step": 200 }, { "epoch": 1.78, "learning_rate": 8.855555555555556e-06, "loss": 0.2233, "step": 205 }, { "epoch": 1.83, "learning_rate": 8.8e-06, "loss": 0.1981, "step": 210 }, { "epoch": 1.87, "learning_rate": 8.744444444444446e-06, "loss": 0.2188, "step": 215 }, { "epoch": 1.91, "learning_rate": 8.68888888888889e-06, "loss": 0.2156, "step": 220 }, { "epoch": 1.96, "learning_rate": 8.633333333333334e-06, "loss": 0.1932, "step": 225 }, { "epoch": 2.0, "learning_rate": 8.577777777777778e-06, "loss": 0.2024, "step": 230 }, { "epoch": 2.04, "learning_rate": 8.522222222222222e-06, "loss": 0.2008, "step": 235 }, { "epoch": 2.09, "learning_rate": 8.466666666666668e-06, "loss": 0.2096, "step": 240 }, { "epoch": 2.13, "learning_rate": 8.411111111111112e-06, "loss": 0.1882, "step": 245 }, { "epoch": 2.17, "learning_rate": 8.355555555555556e-06, "loss": 0.2387, "step": 250 }, { "epoch": 2.22, "learning_rate": 8.3e-06, "loss": 0.1566, "step": 255 }, { "epoch": 2.26, "learning_rate": 8.244444444444444e-06, "loss": 0.2002, "step": 260 }, { "epoch": 2.3, "learning_rate": 8.18888888888889e-06, "loss": 0.1623, "step": 265 }, { "epoch": 2.35, "learning_rate": 8.133333333333334e-06, "loss": 0.2073, "step": 270 }, { "epoch": 2.39, "learning_rate": 8.077777777777778e-06, "loss": 0.2038, "step": 275 }, { "epoch": 2.43, "learning_rate": 8.022222222222222e-06, "loss": 0.1519, "step": 280 }, { "epoch": 2.48, "learning_rate": 7.966666666666668e-06, "loss": 0.1626, "step": 285 }, { "epoch": 2.52, "learning_rate": 7.911111111111112e-06, "loss": 0.1737, "step": 290 }, { "epoch": 2.57, "learning_rate": 7.855555555555556e-06, "loss": 0.1992, "step": 295 }, { "epoch": 2.61, "learning_rate": 7.800000000000002e-06, "loss": 0.1539, "step": 300 }, { "epoch": 2.65, "learning_rate": 7.744444444444446e-06, "loss": 0.1764, "step": 305 }, { "epoch": 2.7, "learning_rate": 7.68888888888889e-06, "loss": 0.1601, "step": 310 }, { "epoch": 2.74, "learning_rate": 7.633333333333334e-06, "loss": 0.1793, "step": 315 }, { "epoch": 2.78, "learning_rate": 7.5777777777777785e-06, "loss": 0.1314, "step": 320 }, { "epoch": 2.83, "learning_rate": 7.5222222222222226e-06, "loss": 0.1543, "step": 325 }, { "epoch": 2.87, "learning_rate": 7.4666666666666675e-06, "loss": 0.1393, "step": 330 }, { "epoch": 2.91, "learning_rate": 7.411111111111112e-06, "loss": 0.1847, "step": 335 }, { "epoch": 2.96, "learning_rate": 7.3555555555555555e-06, "loss": 0.2043, "step": 340 }, { "epoch": 3.0, "learning_rate": 7.3e-06, "loss": 0.1308, "step": 345 }, { "epoch": 3.04, "learning_rate": 7.244444444444445e-06, "loss": 0.1352, "step": 350 }, { "epoch": 3.09, "learning_rate": 7.188888888888889e-06, "loss": 0.1378, "step": 355 }, { "epoch": 3.13, "learning_rate": 7.133333333333334e-06, "loss": 0.1581, "step": 360 }, { "epoch": 3.17, "learning_rate": 7.077777777777778e-06, "loss": 0.1399, "step": 365 }, { "epoch": 3.22, "learning_rate": 7.022222222222222e-06, "loss": 0.149, "step": 370 }, { "epoch": 3.26, "learning_rate": 6.966666666666667e-06, "loss": 0.0899, "step": 375 }, { "epoch": 3.3, "learning_rate": 6.911111111111112e-06, "loss": 0.1157, "step": 380 }, { "epoch": 3.35, "learning_rate": 6.855555555555556e-06, "loss": 0.137, "step": 385 }, { "epoch": 3.39, "learning_rate": 6.800000000000001e-06, "loss": 0.109, "step": 390 }, { "epoch": 3.43, "learning_rate": 6.744444444444444e-06, "loss": 0.0956, "step": 395 }, { "epoch": 3.48, "learning_rate": 6.688888888888889e-06, "loss": 0.1121, "step": 400 }, { "epoch": 3.52, "learning_rate": 6.633333333333334e-06, "loss": 0.125, "step": 405 }, { "epoch": 3.57, "learning_rate": 6.577777777777779e-06, "loss": 0.1222, "step": 410 }, { "epoch": 3.61, "learning_rate": 6.522222222222223e-06, "loss": 0.0806, "step": 415 }, { "epoch": 3.65, "learning_rate": 6.466666666666667e-06, "loss": 0.1144, "step": 420 }, { "epoch": 3.7, "learning_rate": 6.411111111111111e-06, "loss": 0.1139, "step": 425 }, { "epoch": 3.74, "learning_rate": 6.355555555555556e-06, "loss": 0.0836, "step": 430 }, { "epoch": 3.78, "learning_rate": 6.300000000000001e-06, "loss": 0.1699, "step": 435 }, { "epoch": 3.83, "learning_rate": 6.2444444444444456e-06, "loss": 0.1187, "step": 440 }, { "epoch": 3.87, "learning_rate": 6.18888888888889e-06, "loss": 0.0888, "step": 445 }, { "epoch": 3.91, "learning_rate": 6.133333333333334e-06, "loss": 0.1015, "step": 450 }, { "epoch": 3.96, "learning_rate": 6.077777777777778e-06, "loss": 0.0988, "step": 455 }, { "epoch": 4.0, "learning_rate": 6.0222222222222225e-06, "loss": 0.1092, "step": 460 }, { "epoch": 4.04, "learning_rate": 5.966666666666667e-06, "loss": 0.0985, "step": 465 }, { "epoch": 4.09, "learning_rate": 5.911111111111112e-06, "loss": 0.0934, "step": 470 }, { "epoch": 4.13, "learning_rate": 5.855555555555556e-06, "loss": 0.0739, "step": 475 }, { "epoch": 4.17, "learning_rate": 5.8e-06, "loss": 0.0905, "step": 480 }, { "epoch": 4.22, "learning_rate": 5.744444444444444e-06, "loss": 0.0695, "step": 485 }, { "epoch": 4.26, "learning_rate": 5.688888888888889e-06, "loss": 0.0875, "step": 490 }, { "epoch": 4.3, "learning_rate": 5.633333333333334e-06, "loss": 0.0896, "step": 495 }, { "epoch": 4.35, "learning_rate": 5.577777777777778e-06, "loss": 0.0751, "step": 500 }, { "epoch": 4.39, "learning_rate": 5.522222222222222e-06, "loss": 0.0923, "step": 505 }, { "epoch": 4.43, "learning_rate": 5.466666666666667e-06, "loss": 0.0841, "step": 510 }, { "epoch": 4.48, "learning_rate": 5.411111111111111e-06, "loss": 0.0978, "step": 515 }, { "epoch": 4.52, "learning_rate": 5.355555555555556e-06, "loss": 0.0831, "step": 520 }, { "epoch": 4.57, "learning_rate": 5.300000000000001e-06, "loss": 0.0817, "step": 525 }, { "epoch": 4.61, "learning_rate": 5.244444444444445e-06, "loss": 0.0612, "step": 530 }, { "epoch": 4.65, "learning_rate": 5.188888888888889e-06, "loss": 0.0601, "step": 535 }, { "epoch": 4.7, "learning_rate": 5.133333333333334e-06, "loss": 0.0664, "step": 540 }, { "epoch": 4.74, "learning_rate": 5.077777777777778e-06, "loss": 0.0893, "step": 545 }, { "epoch": 4.78, "learning_rate": 5.022222222222223e-06, "loss": 0.0725, "step": 550 }, { "epoch": 4.83, "learning_rate": 4.966666666666667e-06, "loss": 0.0761, "step": 555 }, { "epoch": 4.87, "learning_rate": 4.911111111111112e-06, "loss": 0.0509, "step": 560 }, { "epoch": 4.91, "learning_rate": 4.855555555555556e-06, "loss": 0.0841, "step": 565 }, { "epoch": 4.96, "learning_rate": 4.800000000000001e-06, "loss": 0.1143, "step": 570 }, { "epoch": 5.0, "learning_rate": 4.744444444444445e-06, "loss": 0.0867, "step": 575 }, { "epoch": 5.04, "learning_rate": 4.6888888888888895e-06, "loss": 0.0829, "step": 580 }, { "epoch": 5.09, "learning_rate": 4.633333333333334e-06, "loss": 0.0609, "step": 585 }, { "epoch": 5.13, "learning_rate": 4.5777777777777785e-06, "loss": 0.0551, "step": 590 }, { "epoch": 5.17, "learning_rate": 4.5222222222222225e-06, "loss": 0.0653, "step": 595 }, { "epoch": 5.22, "learning_rate": 4.4666666666666665e-06, "loss": 0.067, "step": 600 }, { "epoch": 5.26, "learning_rate": 4.411111111111111e-06, "loss": 0.0561, "step": 605 }, { "epoch": 5.3, "learning_rate": 4.3555555555555555e-06, "loss": 0.0465, "step": 610 }, { "epoch": 5.35, "learning_rate": 4.3e-06, "loss": 0.0484, "step": 615 }, { "epoch": 5.39, "learning_rate": 4.244444444444445e-06, "loss": 0.0667, "step": 620 }, { "epoch": 5.43, "learning_rate": 4.188888888888889e-06, "loss": 0.0692, "step": 625 }, { "epoch": 5.48, "learning_rate": 4.133333333333333e-06, "loss": 0.0602, "step": 630 }, { "epoch": 5.52, "learning_rate": 4.077777777777778e-06, "loss": 0.0665, "step": 635 }, { "epoch": 5.57, "learning_rate": 4.022222222222222e-06, "loss": 0.0974, "step": 640 }, { "epoch": 5.61, "learning_rate": 3.966666666666667e-06, "loss": 0.0573, "step": 645 }, { "epoch": 5.65, "learning_rate": 3.911111111111112e-06, "loss": 0.064, "step": 650 }, { "epoch": 5.7, "learning_rate": 3.855555555555556e-06, "loss": 0.0556, "step": 655 }, { "epoch": 5.74, "learning_rate": 3.8000000000000005e-06, "loss": 0.0401, "step": 660 }, { "epoch": 5.78, "learning_rate": 3.744444444444445e-06, "loss": 0.0626, "step": 665 }, { "epoch": 5.83, "learning_rate": 3.688888888888889e-06, "loss": 0.0483, "step": 670 }, { "epoch": 5.87, "learning_rate": 3.633333333333334e-06, "loss": 0.0847, "step": 675 }, { "epoch": 5.91, "learning_rate": 3.577777777777778e-06, "loss": 0.0492, "step": 680 }, { "epoch": 5.96, "learning_rate": 3.5222222222222223e-06, "loss": 0.0515, "step": 685 }, { "epoch": 6.0, "learning_rate": 3.4666666666666672e-06, "loss": 0.0372, "step": 690 }, { "epoch": 6.04, "learning_rate": 3.4111111111111113e-06, "loss": 0.0418, "step": 695 }, { "epoch": 6.09, "learning_rate": 3.3555555555555557e-06, "loss": 0.0368, "step": 700 }, { "epoch": 6.13, "learning_rate": 3.3000000000000006e-06, "loss": 0.0476, "step": 705 }, { "epoch": 6.17, "learning_rate": 3.2444444444444446e-06, "loss": 0.044, "step": 710 }, { "epoch": 6.22, "learning_rate": 3.188888888888889e-06, "loss": 0.0591, "step": 715 }, { "epoch": 6.26, "learning_rate": 3.133333333333334e-06, "loss": 0.0493, "step": 720 }, { "epoch": 6.3, "learning_rate": 3.077777777777778e-06, "loss": 0.0306, "step": 725 }, { "epoch": 6.35, "learning_rate": 3.0222222222222225e-06, "loss": 0.0481, "step": 730 }, { "epoch": 6.39, "learning_rate": 2.9666666666666673e-06, "loss": 0.0469, "step": 735 }, { "epoch": 6.43, "learning_rate": 2.9111111111111114e-06, "loss": 0.0412, "step": 740 }, { "epoch": 6.48, "learning_rate": 2.855555555555556e-06, "loss": 0.0325, "step": 745 }, { "epoch": 6.52, "learning_rate": 2.8000000000000003e-06, "loss": 0.0563, "step": 750 }, { "epoch": 6.57, "learning_rate": 2.7444444444444448e-06, "loss": 0.0402, "step": 755 }, { "epoch": 6.61, "learning_rate": 2.6888888888888892e-06, "loss": 0.0781, "step": 760 }, { "epoch": 6.65, "learning_rate": 2.6333333333333332e-06, "loss": 0.0459, "step": 765 }, { "epoch": 6.7, "learning_rate": 2.577777777777778e-06, "loss": 0.0339, "step": 770 }, { "epoch": 6.74, "learning_rate": 2.5222222222222226e-06, "loss": 0.0388, "step": 775 }, { "epoch": 6.78, "learning_rate": 2.466666666666667e-06, "loss": 0.0359, "step": 780 }, { "epoch": 6.83, "learning_rate": 2.411111111111111e-06, "loss": 0.088, "step": 785 }, { "epoch": 6.87, "learning_rate": 2.3555555555555555e-06, "loss": 0.0376, "step": 790 }, { "epoch": 6.91, "learning_rate": 2.3000000000000004e-06, "loss": 0.0361, "step": 795 }, { "epoch": 6.96, "learning_rate": 2.2444444444444445e-06, "loss": 0.0489, "step": 800 }, { "epoch": 7.0, "learning_rate": 2.188888888888889e-06, "loss": 0.0337, "step": 805 }, { "epoch": 7.04, "learning_rate": 2.133333333333334e-06, "loss": 0.0488, "step": 810 }, { "epoch": 7.09, "learning_rate": 2.077777777777778e-06, "loss": 0.0367, "step": 815 }, { "epoch": 7.13, "learning_rate": 2.0222222222222223e-06, "loss": 0.0514, "step": 820 }, { "epoch": 7.17, "learning_rate": 1.9666666666666668e-06, "loss": 0.0274, "step": 825 }, { "epoch": 7.22, "learning_rate": 1.9111111111111112e-06, "loss": 0.0356, "step": 830 }, { "epoch": 7.26, "learning_rate": 1.8555555555555557e-06, "loss": 0.03, "step": 835 }, { "epoch": 7.3, "learning_rate": 1.8000000000000001e-06, "loss": 0.0339, "step": 840 }, { "epoch": 7.35, "learning_rate": 1.7444444444444448e-06, "loss": 0.0269, "step": 845 }, { "epoch": 7.39, "learning_rate": 1.688888888888889e-06, "loss": 0.0379, "step": 850 }, { "epoch": 7.43, "learning_rate": 1.6333333333333335e-06, "loss": 0.0584, "step": 855 }, { "epoch": 7.48, "learning_rate": 1.5777777777777778e-06, "loss": 0.0282, "step": 860 }, { "epoch": 7.52, "learning_rate": 1.5222222222222224e-06, "loss": 0.0304, "step": 865 }, { "epoch": 7.57, "learning_rate": 1.4666666666666669e-06, "loss": 0.0289, "step": 870 }, { "epoch": 7.61, "learning_rate": 1.4111111111111111e-06, "loss": 0.0393, "step": 875 }, { "epoch": 7.65, "learning_rate": 1.3555555555555558e-06, "loss": 0.0318, "step": 880 }, { "epoch": 7.7, "learning_rate": 1.3e-06, "loss": 0.0704, "step": 885 }, { "epoch": 7.74, "learning_rate": 1.2444444444444445e-06, "loss": 0.0261, "step": 890 }, { "epoch": 7.78, "learning_rate": 1.188888888888889e-06, "loss": 0.0389, "step": 895 }, { "epoch": 7.83, "learning_rate": 1.1333333333333334e-06, "loss": 0.031, "step": 900 }, { "epoch": 7.87, "learning_rate": 1.0777777777777779e-06, "loss": 0.0261, "step": 905 }, { "epoch": 7.91, "learning_rate": 1.0222222222222223e-06, "loss": 0.0349, "step": 910 }, { "epoch": 7.96, "learning_rate": 9.666666666666668e-07, "loss": 0.0326, "step": 915 }, { "epoch": 8.0, "learning_rate": 9.111111111111113e-07, "loss": 0.0742, "step": 920 }, { "epoch": 8.04, "learning_rate": 8.555555555555556e-07, "loss": 0.0343, "step": 925 }, { "epoch": 8.09, "learning_rate": 8.000000000000001e-07, "loss": 0.0348, "step": 930 }, { "epoch": 8.13, "learning_rate": 7.444444444444444e-07, "loss": 0.0293, "step": 935 }, { "epoch": 8.17, "learning_rate": 6.88888888888889e-07, "loss": 0.0357, "step": 940 }, { "epoch": 8.22, "learning_rate": 6.333333333333334e-07, "loss": 0.024, "step": 945 }, { "epoch": 8.26, "learning_rate": 5.777777777777778e-07, "loss": 0.0336, "step": 950 }, { "epoch": 8.3, "learning_rate": 5.222222222222223e-07, "loss": 0.0202, "step": 955 }, { "epoch": 8.35, "learning_rate": 4.666666666666667e-07, "loss": 0.0379, "step": 960 }, { "epoch": 8.39, "learning_rate": 4.111111111111112e-07, "loss": 0.0388, "step": 965 }, { "epoch": 8.43, "learning_rate": 3.555555555555556e-07, "loss": 0.0287, "step": 970 }, { "epoch": 8.48, "learning_rate": 3.0000000000000004e-07, "loss": 0.021, "step": 975 }, { "epoch": 8.52, "learning_rate": 2.444444444444445e-07, "loss": 0.0342, "step": 980 }, { "epoch": 8.57, "learning_rate": 1.888888888888889e-07, "loss": 0.0329, "step": 985 }, { "epoch": 8.61, "learning_rate": 1.3333333333333336e-07, "loss": 0.0517, "step": 990 }, { "epoch": 8.65, "learning_rate": 7.777777777777778e-08, "loss": 0.024, "step": 995 }, { "epoch": 8.7, "learning_rate": 2.2222222222222224e-08, "loss": 0.0259, "step": 1000 }, { "epoch": 8.7, "step": 1000, "total_flos": 1850846955153408.0, "train_loss": 0.15014069636166097, "train_runtime": 142.3624, "train_samples_per_second": 56.195, "train_steps_per_second": 7.024 } ], "max_steps": 1000, "num_train_epochs": 9, "total_flos": 1850846955153408.0, "trial_name": null, "trial_params": null }