{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5000, "global_step": 2250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0044444444444444444, "grad_norm": 152.64707946777344, "learning_rate": 5.000000000000001e-07, "loss": 10.2908, "step": 10 }, { "epoch": 0.008888888888888889, "grad_norm": 211.05294799804688, "learning_rate": 1.4000000000000001e-06, "loss": 10.0664, "step": 20 }, { "epoch": 0.013333333333333334, "grad_norm": 2954.0419921875, "learning_rate": 2.4000000000000003e-06, "loss": 10.0624, "step": 30 }, { "epoch": 0.017777777777777778, "grad_norm": 1075.460205078125, "learning_rate": 3.4000000000000005e-06, "loss": 8.5938, "step": 40 }, { "epoch": 0.022222222222222223, "grad_norm": 7704.75048828125, "learning_rate": 4.4e-06, "loss": 7.8263, "step": 50 }, { "epoch": 0.02666666666666667, "grad_norm": 6002.2890625, "learning_rate": 5.4e-06, "loss": 6.2896, "step": 60 }, { "epoch": 0.03111111111111111, "grad_norm": 14311.9560546875, "learning_rate": 6.4000000000000006e-06, "loss": 5.4298, "step": 70 }, { "epoch": 0.035555555555555556, "grad_norm": 5250.40869140625, "learning_rate": 7.4e-06, "loss": 5.2735, "step": 80 }, { "epoch": 0.04, "grad_norm": 470.0221862792969, "learning_rate": 8.400000000000001e-06, "loss": 4.9127, "step": 90 }, { "epoch": 0.044444444444444446, "grad_norm": 19534.701171875, "learning_rate": 9.4e-06, "loss": 4.6077, "step": 100 }, { "epoch": 0.04888888888888889, "grad_norm": 492.7581481933594, "learning_rate": 1.04e-05, "loss": 4.5512, "step": 110 }, { "epoch": 0.05333333333333334, "grad_norm": 9192.6767578125, "learning_rate": 1.13e-05, "loss": 5.1424, "step": 120 }, { "epoch": 0.057777777777777775, "grad_norm": 108.0229263305664, "learning_rate": 1.23e-05, "loss": 3.7802, "step": 130 }, { "epoch": 0.06222222222222222, "grad_norm": 1061.415771484375, "learning_rate": 1.3300000000000001e-05, "loss": 3.3019, "step": 140 }, { "epoch": 0.06666666666666667, "grad_norm": 128.2921600341797, "learning_rate": 1.43e-05, "loss": 2.9124, "step": 150 }, { "epoch": 0.07111111111111111, "grad_norm": 29.611740112304688, "learning_rate": 1.53e-05, "loss": 2.6086, "step": 160 }, { "epoch": 0.07555555555555556, "grad_norm": 43.61715316772461, "learning_rate": 1.63e-05, "loss": 1.8848, "step": 170 }, { "epoch": 0.08, "grad_norm": 11403.16015625, "learning_rate": 1.73e-05, "loss": 1.9176, "step": 180 }, { "epoch": 0.08444444444444445, "grad_norm": 381.3476867675781, "learning_rate": 1.83e-05, "loss": 1.734, "step": 190 }, { "epoch": 0.08888888888888889, "grad_norm": 4.966193199157715, "learning_rate": 1.93e-05, "loss": 1.3579, "step": 200 }, { "epoch": 0.09333333333333334, "grad_norm": 4.789448261260986, "learning_rate": 2.0300000000000002e-05, "loss": 1.0905, "step": 210 }, { "epoch": 0.09777777777777778, "grad_norm": 2.323220729827881, "learning_rate": 2.13e-05, "loss": 1.0379, "step": 220 }, { "epoch": 0.10222222222222223, "grad_norm": 1.8111121654510498, "learning_rate": 2.23e-05, "loss": 0.9473, "step": 230 }, { "epoch": 0.10666666666666667, "grad_norm": 4.103352069854736, "learning_rate": 2.3300000000000004e-05, "loss": 1.0941, "step": 240 }, { "epoch": 0.1111111111111111, "grad_norm": 2.7268946170806885, "learning_rate": 2.43e-05, "loss": 1.1331, "step": 250 }, { "epoch": 0.11555555555555555, "grad_norm": 36.0265998840332, "learning_rate": 2.5300000000000002e-05, "loss": 1.3104, "step": 260 }, { "epoch": 0.12, "grad_norm": 1.8667157888412476, "learning_rate": 2.6300000000000002e-05, "loss": 1.0259, "step": 270 }, { "epoch": 0.12444444444444444, "grad_norm": 5.474687099456787, "learning_rate": 2.7300000000000003e-05, "loss": 1.0916, "step": 280 }, { "epoch": 0.1288888888888889, "grad_norm": 2.022836208343506, "learning_rate": 2.83e-05, "loss": 0.9872, "step": 290 }, { "epoch": 0.13333333333333333, "grad_norm": 1.9781649112701416, "learning_rate": 2.93e-05, "loss": 0.8844, "step": 300 }, { "epoch": 0.13777777777777778, "grad_norm": 1.9537264108657837, "learning_rate": 3.03e-05, "loss": 1.0466, "step": 310 }, { "epoch": 0.14222222222222222, "grad_norm": 2.3547990322113037, "learning_rate": 3.13e-05, "loss": 1.0016, "step": 320 }, { "epoch": 0.14666666666666667, "grad_norm": 2.5301690101623535, "learning_rate": 3.2300000000000006e-05, "loss": 1.015, "step": 330 }, { "epoch": 0.1511111111111111, "grad_norm": 1.698588252067566, "learning_rate": 3.33e-05, "loss": 0.994, "step": 340 }, { "epoch": 0.15555555555555556, "grad_norm": 2.17368221282959, "learning_rate": 3.430000000000001e-05, "loss": 1.0743, "step": 350 }, { "epoch": 0.16, "grad_norm": 2.106858968734741, "learning_rate": 3.53e-05, "loss": 1.161, "step": 360 }, { "epoch": 0.16444444444444445, "grad_norm": 2.4343013763427734, "learning_rate": 3.63e-05, "loss": 1.0599, "step": 370 }, { "epoch": 0.1688888888888889, "grad_norm": 2.284984588623047, "learning_rate": 3.73e-05, "loss": 0.9959, "step": 380 }, { "epoch": 0.17333333333333334, "grad_norm": 1.8927963972091675, "learning_rate": 3.83e-05, "loss": 1.0063, "step": 390 }, { "epoch": 0.17777777777777778, "grad_norm": 2.731109142303467, "learning_rate": 3.9300000000000007e-05, "loss": 1.0467, "step": 400 }, { "epoch": 0.18222222222222223, "grad_norm": 9.546753883361816, "learning_rate": 4.0300000000000004e-05, "loss": 1.0892, "step": 410 }, { "epoch": 0.18666666666666668, "grad_norm": 1.986333966255188, "learning_rate": 4.13e-05, "loss": 1.0233, "step": 420 }, { "epoch": 0.19111111111111112, "grad_norm": 2.203075408935547, "learning_rate": 4.23e-05, "loss": 1.268, "step": 430 }, { "epoch": 0.19555555555555557, "grad_norm": 2.235809564590454, "learning_rate": 4.33e-05, "loss": 0.9937, "step": 440 }, { "epoch": 0.2, "grad_norm": 3.598283290863037, "learning_rate": 4.43e-05, "loss": 0.8837, "step": 450 }, { "epoch": 0.20444444444444446, "grad_norm": 1.872710108757019, "learning_rate": 4.53e-05, "loss": 1.1982, "step": 460 }, { "epoch": 0.2088888888888889, "grad_norm": 3.0466055870056152, "learning_rate": 4.630000000000001e-05, "loss": 1.1995, "step": 470 }, { "epoch": 0.21333333333333335, "grad_norm": 2.7732715606689453, "learning_rate": 4.73e-05, "loss": 1.0711, "step": 480 }, { "epoch": 0.21777777777777776, "grad_norm": 2.3050129413604736, "learning_rate": 4.83e-05, "loss": 1.2792, "step": 490 }, { "epoch": 0.2222222222222222, "grad_norm": 2.128685235977173, "learning_rate": 4.93e-05, "loss": 1.1137, "step": 500 }, { "epoch": 0.22666666666666666, "grad_norm": 2.2330660820007324, "learning_rate": 4.9914285714285717e-05, "loss": 1.0332, "step": 510 }, { "epoch": 0.2311111111111111, "grad_norm": 2.049591541290283, "learning_rate": 4.962857142857143e-05, "loss": 1.1467, "step": 520 }, { "epoch": 0.23555555555555555, "grad_norm": 2.388408660888672, "learning_rate": 4.934285714285715e-05, "loss": 1.1017, "step": 530 }, { "epoch": 0.24, "grad_norm": 1.9390537738800049, "learning_rate": 4.905714285714286e-05, "loss": 1.0874, "step": 540 }, { "epoch": 0.24444444444444444, "grad_norm": 1.530515432357788, "learning_rate": 4.8771428571428574e-05, "loss": 1.0236, "step": 550 }, { "epoch": 0.24888888888888888, "grad_norm": 2.425351142883301, "learning_rate": 4.848571428571429e-05, "loss": 1.1392, "step": 560 }, { "epoch": 0.25333333333333335, "grad_norm": 2.0615339279174805, "learning_rate": 4.82e-05, "loss": 1.035, "step": 570 }, { "epoch": 0.2577777777777778, "grad_norm": 1.8026305437088013, "learning_rate": 4.7914285714285715e-05, "loss": 0.9894, "step": 580 }, { "epoch": 0.26222222222222225, "grad_norm": 1.7569513320922852, "learning_rate": 4.762857142857143e-05, "loss": 1.1085, "step": 590 }, { "epoch": 0.26666666666666666, "grad_norm": 2.374699354171753, "learning_rate": 4.734285714285715e-05, "loss": 1.0259, "step": 600 }, { "epoch": 0.27111111111111114, "grad_norm": 2.0250742435455322, "learning_rate": 4.7057142857142864e-05, "loss": 1.0345, "step": 610 }, { "epoch": 0.27555555555555555, "grad_norm": 2.330720901489258, "learning_rate": 4.677142857142857e-05, "loss": 1.1081, "step": 620 }, { "epoch": 0.28, "grad_norm": 1.5041579008102417, "learning_rate": 4.648571428571429e-05, "loss": 0.8348, "step": 630 }, { "epoch": 0.28444444444444444, "grad_norm": 2.397007703781128, "learning_rate": 4.6200000000000005e-05, "loss": 1.2422, "step": 640 }, { "epoch": 0.28888888888888886, "grad_norm": 1.4800223112106323, "learning_rate": 4.5914285714285714e-05, "loss": 0.995, "step": 650 }, { "epoch": 0.29333333333333333, "grad_norm": 1.6215323209762573, "learning_rate": 4.562857142857143e-05, "loss": 1.0207, "step": 660 }, { "epoch": 0.29777777777777775, "grad_norm": 4.243315696716309, "learning_rate": 4.534285714285714e-05, "loss": 1.0753, "step": 670 }, { "epoch": 0.3022222222222222, "grad_norm": 1.8909763097763062, "learning_rate": 4.5057142857142856e-05, "loss": 0.9546, "step": 680 }, { "epoch": 0.30666666666666664, "grad_norm": 1.5152394771575928, "learning_rate": 4.477142857142858e-05, "loss": 1.0375, "step": 690 }, { "epoch": 0.3111111111111111, "grad_norm": 1.9466438293457031, "learning_rate": 4.448571428571429e-05, "loss": 0.942, "step": 700 }, { "epoch": 0.31555555555555553, "grad_norm": 2.293703317642212, "learning_rate": 4.4200000000000004e-05, "loss": 1.1798, "step": 710 }, { "epoch": 0.32, "grad_norm": 1.8559818267822266, "learning_rate": 4.391428571428572e-05, "loss": 0.9743, "step": 720 }, { "epoch": 0.3244444444444444, "grad_norm": 2.2498509883880615, "learning_rate": 4.362857142857143e-05, "loss": 1.0288, "step": 730 }, { "epoch": 0.3288888888888889, "grad_norm": 2.1050989627838135, "learning_rate": 4.3342857142857145e-05, "loss": 1.0123, "step": 740 }, { "epoch": 0.3333333333333333, "grad_norm": 1.5077266693115234, "learning_rate": 4.3057142857142854e-05, "loss": 0.9108, "step": 750 }, { "epoch": 0.3377777777777778, "grad_norm": 1.7779529094696045, "learning_rate": 4.277142857142857e-05, "loss": 1.1093, "step": 760 }, { "epoch": 0.3422222222222222, "grad_norm": 1.9781780242919922, "learning_rate": 4.2485714285714286e-05, "loss": 1.0721, "step": 770 }, { "epoch": 0.3466666666666667, "grad_norm": 1.94735848903656, "learning_rate": 4.22e-05, "loss": 1.1273, "step": 780 }, { "epoch": 0.3511111111111111, "grad_norm": 1.4685845375061035, "learning_rate": 4.191428571428572e-05, "loss": 0.9747, "step": 790 }, { "epoch": 0.35555555555555557, "grad_norm": 1.094089150428772, "learning_rate": 4.162857142857143e-05, "loss": 1.0063, "step": 800 }, { "epoch": 0.36, "grad_norm": 2.0618069171905518, "learning_rate": 4.1342857142857144e-05, "loss": 0.9391, "step": 810 }, { "epoch": 0.36444444444444446, "grad_norm": 1.430254578590393, "learning_rate": 4.105714285714286e-05, "loss": 0.842, "step": 820 }, { "epoch": 0.3688888888888889, "grad_norm": 2.3631067276000977, "learning_rate": 4.077142857142857e-05, "loss": 0.9372, "step": 830 }, { "epoch": 0.37333333333333335, "grad_norm": 2.09013032913208, "learning_rate": 4.0485714285714285e-05, "loss": 0.9428, "step": 840 }, { "epoch": 0.37777777777777777, "grad_norm": 2.336822748184204, "learning_rate": 4.02e-05, "loss": 1.0985, "step": 850 }, { "epoch": 0.38222222222222224, "grad_norm": 2.335042953491211, "learning_rate": 3.991428571428572e-05, "loss": 1.2887, "step": 860 }, { "epoch": 0.38666666666666666, "grad_norm": 2.4629454612731934, "learning_rate": 3.9628571428571433e-05, "loss": 1.1101, "step": 870 }, { "epoch": 0.39111111111111113, "grad_norm": 2.509438991546631, "learning_rate": 3.934285714285714e-05, "loss": 1.0036, "step": 880 }, { "epoch": 0.39555555555555555, "grad_norm": 16.282512664794922, "learning_rate": 3.905714285714286e-05, "loss": 1.0398, "step": 890 }, { "epoch": 0.4, "grad_norm": 2.0597307682037354, "learning_rate": 3.8771428571428575e-05, "loss": 0.9606, "step": 900 }, { "epoch": 0.40444444444444444, "grad_norm": 1.9231626987457275, "learning_rate": 3.8485714285714284e-05, "loss": 0.9271, "step": 910 }, { "epoch": 0.4088888888888889, "grad_norm": 2.7093663215637207, "learning_rate": 3.82e-05, "loss": 1.0634, "step": 920 }, { "epoch": 0.41333333333333333, "grad_norm": 1.8224252462387085, "learning_rate": 3.7914285714285716e-05, "loss": 0.9241, "step": 930 }, { "epoch": 0.4177777777777778, "grad_norm": 1.1034265756607056, "learning_rate": 3.762857142857143e-05, "loss": 0.9692, "step": 940 }, { "epoch": 0.4222222222222222, "grad_norm": 1.8517080545425415, "learning_rate": 3.734285714285715e-05, "loss": 0.93, "step": 950 }, { "epoch": 0.4266666666666667, "grad_norm": 2.2137563228607178, "learning_rate": 3.705714285714286e-05, "loss": 0.9953, "step": 960 }, { "epoch": 0.4311111111111111, "grad_norm": 1.9600673913955688, "learning_rate": 3.6771428571428574e-05, "loss": 1.2618, "step": 970 }, { "epoch": 0.43555555555555553, "grad_norm": 2.1263670921325684, "learning_rate": 3.648571428571429e-05, "loss": 1.139, "step": 980 }, { "epoch": 0.44, "grad_norm": 15.621545791625977, "learning_rate": 3.62e-05, "loss": 1.1132, "step": 990 }, { "epoch": 0.4444444444444444, "grad_norm": 1.1507309675216675, "learning_rate": 3.5914285714285715e-05, "loss": 0.8175, "step": 1000 }, { "epoch": 0.4488888888888889, "grad_norm": 1.6997977495193481, "learning_rate": 3.562857142857143e-05, "loss": 0.9233, "step": 1010 }, { "epoch": 0.4533333333333333, "grad_norm": 4.499351501464844, "learning_rate": 3.534285714285715e-05, "loss": 1.2277, "step": 1020 }, { "epoch": 0.4577777777777778, "grad_norm": 2.199875593185425, "learning_rate": 3.505714285714286e-05, "loss": 1.122, "step": 1030 }, { "epoch": 0.4622222222222222, "grad_norm": 1.630294919013977, "learning_rate": 3.477142857142857e-05, "loss": 1.0278, "step": 1040 }, { "epoch": 0.4666666666666667, "grad_norm": 1.3660622835159302, "learning_rate": 3.448571428571429e-05, "loss": 0.8593, "step": 1050 }, { "epoch": 0.4711111111111111, "grad_norm": 1.1493386030197144, "learning_rate": 3.4200000000000005e-05, "loss": 1.1425, "step": 1060 }, { "epoch": 0.47555555555555556, "grad_norm": 1.2984066009521484, "learning_rate": 3.3914285714285714e-05, "loss": 1.0536, "step": 1070 }, { "epoch": 0.48, "grad_norm": 1.9474918842315674, "learning_rate": 3.362857142857143e-05, "loss": 1.0463, "step": 1080 }, { "epoch": 0.48444444444444446, "grad_norm": 1.580550193786621, "learning_rate": 3.334285714285714e-05, "loss": 1.0603, "step": 1090 }, { "epoch": 0.4888888888888889, "grad_norm": 2.1200408935546875, "learning_rate": 3.305714285714286e-05, "loss": 1.0465, "step": 1100 }, { "epoch": 0.49333333333333335, "grad_norm": 1.9244203567504883, "learning_rate": 3.277142857142858e-05, "loss": 1.1546, "step": 1110 }, { "epoch": 0.49777777777777776, "grad_norm": 2.738420248031616, "learning_rate": 3.248571428571429e-05, "loss": 1.0352, "step": 1120 }, { "epoch": 0.5022222222222222, "grad_norm": 1.3462022542953491, "learning_rate": 3.2200000000000003e-05, "loss": 1.0471, "step": 1130 }, { "epoch": 0.5066666666666667, "grad_norm": 2.3860256671905518, "learning_rate": 3.191428571428571e-05, "loss": 1.2251, "step": 1140 }, { "epoch": 0.5111111111111111, "grad_norm": 2.3215584754943848, "learning_rate": 3.162857142857143e-05, "loss": 1.0449, "step": 1150 }, { "epoch": 0.5155555555555555, "grad_norm": 2.4864187240600586, "learning_rate": 3.1342857142857145e-05, "loss": 1.0363, "step": 1160 }, { "epoch": 0.52, "grad_norm": 2.4286997318267822, "learning_rate": 3.1057142857142854e-05, "loss": 1.0117, "step": 1170 }, { "epoch": 0.5244444444444445, "grad_norm": 1.4346647262573242, "learning_rate": 3.077142857142857e-05, "loss": 1.0378, "step": 1180 }, { "epoch": 0.5288888888888889, "grad_norm": 1.2675151824951172, "learning_rate": 3.048571428571429e-05, "loss": 1.2185, "step": 1190 }, { "epoch": 0.5333333333333333, "grad_norm": 1.820166826248169, "learning_rate": 3.02e-05, "loss": 0.9562, "step": 1200 }, { "epoch": 0.5377777777777778, "grad_norm": 2.047520875930786, "learning_rate": 2.9914285714285718e-05, "loss": 1.1071, "step": 1210 }, { "epoch": 0.5422222222222223, "grad_norm": 1.5641695261001587, "learning_rate": 2.9628571428571428e-05, "loss": 1.0808, "step": 1220 }, { "epoch": 0.5466666666666666, "grad_norm": 1.394386887550354, "learning_rate": 2.9342857142857144e-05, "loss": 1.1786, "step": 1230 }, { "epoch": 0.5511111111111111, "grad_norm": 1.5711551904678345, "learning_rate": 2.905714285714286e-05, "loss": 1.0592, "step": 1240 }, { "epoch": 0.5555555555555556, "grad_norm": 4.389777660369873, "learning_rate": 2.8771428571428572e-05, "loss": 0.9317, "step": 1250 }, { "epoch": 0.56, "grad_norm": 2.3850152492523193, "learning_rate": 2.848571428571429e-05, "loss": 1.2062, "step": 1260 }, { "epoch": 0.5644444444444444, "grad_norm": 1.9837779998779297, "learning_rate": 2.8199999999999998e-05, "loss": 1.0094, "step": 1270 }, { "epoch": 0.5688888888888889, "grad_norm": 1.7174725532531738, "learning_rate": 2.7914285714285714e-05, "loss": 0.8973, "step": 1280 }, { "epoch": 0.5733333333333334, "grad_norm": 3.0591206550598145, "learning_rate": 2.762857142857143e-05, "loss": 0.9418, "step": 1290 }, { "epoch": 0.5777777777777777, "grad_norm": 2.102701187133789, "learning_rate": 2.7342857142857142e-05, "loss": 0.8937, "step": 1300 }, { "epoch": 0.5822222222222222, "grad_norm": 1.4529622793197632, "learning_rate": 2.705714285714286e-05, "loss": 0.8687, "step": 1310 }, { "epoch": 0.5866666666666667, "grad_norm": 2.070000410079956, "learning_rate": 2.6771428571428575e-05, "loss": 0.9817, "step": 1320 }, { "epoch": 0.5911111111111111, "grad_norm": 1.9277245998382568, "learning_rate": 2.6485714285714287e-05, "loss": 1.0032, "step": 1330 }, { "epoch": 0.5955555555555555, "grad_norm": 1.843050241470337, "learning_rate": 2.6200000000000003e-05, "loss": 1.1426, "step": 1340 }, { "epoch": 0.6, "grad_norm": 1.0302870273590088, "learning_rate": 2.5914285714285713e-05, "loss": 0.9906, "step": 1350 }, { "epoch": 0.6044444444444445, "grad_norm": 2.2761287689208984, "learning_rate": 2.562857142857143e-05, "loss": 1.1229, "step": 1360 }, { "epoch": 0.6088888888888889, "grad_norm": 3.804614782333374, "learning_rate": 2.5342857142857145e-05, "loss": 0.9621, "step": 1370 }, { "epoch": 0.6133333333333333, "grad_norm": 2.139857053756714, "learning_rate": 2.5057142857142857e-05, "loss": 0.9236, "step": 1380 }, { "epoch": 0.6177777777777778, "grad_norm": 2.6473701000213623, "learning_rate": 2.4771428571428573e-05, "loss": 0.9378, "step": 1390 }, { "epoch": 0.6222222222222222, "grad_norm": 2.084102153778076, "learning_rate": 2.4485714285714286e-05, "loss": 1.1248, "step": 1400 }, { "epoch": 0.6266666666666667, "grad_norm": 1.9559253454208374, "learning_rate": 2.4200000000000002e-05, "loss": 0.9016, "step": 1410 }, { "epoch": 0.6311111111111111, "grad_norm": 2.2711124420166016, "learning_rate": 2.3914285714285715e-05, "loss": 0.9147, "step": 1420 }, { "epoch": 0.6355555555555555, "grad_norm": 1.9618175029754639, "learning_rate": 2.362857142857143e-05, "loss": 1.0164, "step": 1430 }, { "epoch": 0.64, "grad_norm": 1.52959144115448, "learning_rate": 2.3342857142857143e-05, "loss": 0.9068, "step": 1440 }, { "epoch": 0.6444444444444445, "grad_norm": 1.666641354560852, "learning_rate": 2.3057142857142856e-05, "loss": 0.9525, "step": 1450 }, { "epoch": 0.6488888888888888, "grad_norm": 2.2008984088897705, "learning_rate": 2.2771428571428572e-05, "loss": 1.2026, "step": 1460 }, { "epoch": 0.6533333333333333, "grad_norm": 1.7555994987487793, "learning_rate": 2.2485714285714288e-05, "loss": 1.0945, "step": 1470 }, { "epoch": 0.6577777777777778, "grad_norm": 2.01448392868042, "learning_rate": 2.22e-05, "loss": 1.0728, "step": 1480 }, { "epoch": 0.6622222222222223, "grad_norm": 3.834198474884033, "learning_rate": 2.1914285714285714e-05, "loss": 1.1607, "step": 1490 }, { "epoch": 0.6666666666666666, "grad_norm": 1.5447688102722168, "learning_rate": 2.162857142857143e-05, "loss": 0.9704, "step": 1500 }, { "epoch": 0.6711111111111111, "grad_norm": 1.6624338626861572, "learning_rate": 2.1342857142857146e-05, "loss": 0.8271, "step": 1510 }, { "epoch": 0.6755555555555556, "grad_norm": 1.5587396621704102, "learning_rate": 2.105714285714286e-05, "loss": 1.0568, "step": 1520 }, { "epoch": 0.68, "grad_norm": 1.042724370956421, "learning_rate": 2.077142857142857e-05, "loss": 1.1686, "step": 1530 }, { "epoch": 0.6844444444444444, "grad_norm": 1.855294942855835, "learning_rate": 2.0485714285714287e-05, "loss": 0.9368, "step": 1540 }, { "epoch": 0.6888888888888889, "grad_norm": 1.5107423067092896, "learning_rate": 2.0200000000000003e-05, "loss": 1.0277, "step": 1550 }, { "epoch": 0.6933333333333334, "grad_norm": 1.0451265573501587, "learning_rate": 1.9914285714285716e-05, "loss": 1.0994, "step": 1560 }, { "epoch": 0.6977777777777778, "grad_norm": 2.220353126525879, "learning_rate": 1.962857142857143e-05, "loss": 1.1676, "step": 1570 }, { "epoch": 0.7022222222222222, "grad_norm": 4.320748805999756, "learning_rate": 1.9342857142857144e-05, "loss": 1.0167, "step": 1580 }, { "epoch": 0.7066666666666667, "grad_norm": 1.5187314748764038, "learning_rate": 1.9057142857142857e-05, "loss": 1.0061, "step": 1590 }, { "epoch": 0.7111111111111111, "grad_norm": 2.62479305267334, "learning_rate": 1.8771428571428573e-05, "loss": 1.0231, "step": 1600 }, { "epoch": 0.7155555555555555, "grad_norm": 37.25562286376953, "learning_rate": 1.8485714285714286e-05, "loss": 1.0258, "step": 1610 }, { "epoch": 0.72, "grad_norm": 7.708355903625488, "learning_rate": 1.8200000000000002e-05, "loss": 0.8673, "step": 1620 }, { "epoch": 0.7244444444444444, "grad_norm": 7.80335807800293, "learning_rate": 1.7914285714285715e-05, "loss": 1.1722, "step": 1630 }, { "epoch": 0.7288888888888889, "grad_norm": 4.959846496582031, "learning_rate": 1.762857142857143e-05, "loss": 1.0197, "step": 1640 }, { "epoch": 0.7333333333333333, "grad_norm": 1.8894150257110596, "learning_rate": 1.7342857142857143e-05, "loss": 0.9571, "step": 1650 }, { "epoch": 0.7377777777777778, "grad_norm": 5.9880828857421875, "learning_rate": 1.7057142857142856e-05, "loss": 1.1879, "step": 1660 }, { "epoch": 0.7422222222222222, "grad_norm": 1.9994230270385742, "learning_rate": 1.6771428571428572e-05, "loss": 1.01, "step": 1670 }, { "epoch": 0.7466666666666667, "grad_norm": 1.1010164022445679, "learning_rate": 1.6485714285714288e-05, "loss": 0.8837, "step": 1680 }, { "epoch": 0.7511111111111111, "grad_norm": 4.9511399269104, "learning_rate": 1.62e-05, "loss": 0.9419, "step": 1690 }, { "epoch": 0.7555555555555555, "grad_norm": 1.8997151851654053, "learning_rate": 1.5914285714285713e-05, "loss": 0.9052, "step": 1700 }, { "epoch": 0.76, "grad_norm": 1.6055902242660522, "learning_rate": 1.562857142857143e-05, "loss": 0.9279, "step": 1710 }, { "epoch": 0.7644444444444445, "grad_norm": 1.4079903364181519, "learning_rate": 1.5342857142857146e-05, "loss": 1.0294, "step": 1720 }, { "epoch": 0.7688888888888888, "grad_norm": 1.1559503078460693, "learning_rate": 1.5057142857142858e-05, "loss": 0.9437, "step": 1730 }, { "epoch": 0.7733333333333333, "grad_norm": 2.20170259475708, "learning_rate": 1.4771428571428573e-05, "loss": 1.1114, "step": 1740 }, { "epoch": 0.7777777777777778, "grad_norm": 1.4884487390518188, "learning_rate": 1.4485714285714285e-05, "loss": 1.0683, "step": 1750 }, { "epoch": 0.7822222222222223, "grad_norm": 1.7694205045700073, "learning_rate": 1.42e-05, "loss": 0.683, "step": 1760 }, { "epoch": 0.7866666666666666, "grad_norm": 7.573609352111816, "learning_rate": 1.3914285714285716e-05, "loss": 0.9765, "step": 1770 }, { "epoch": 0.7911111111111111, "grad_norm": 1.1830403804779053, "learning_rate": 1.362857142857143e-05, "loss": 1.0293, "step": 1780 }, { "epoch": 0.7955555555555556, "grad_norm": 2.407702922821045, "learning_rate": 1.3342857142857143e-05, "loss": 1.092, "step": 1790 }, { "epoch": 0.8, "grad_norm": 1.5534087419509888, "learning_rate": 1.3057142857142857e-05, "loss": 0.9226, "step": 1800 }, { "epoch": 0.8044444444444444, "grad_norm": 1.5992074012756348, "learning_rate": 1.2771428571428573e-05, "loss": 1.024, "step": 1810 }, { "epoch": 0.8088888888888889, "grad_norm": 4.057394027709961, "learning_rate": 1.2485714285714287e-05, "loss": 1.0755, "step": 1820 }, { "epoch": 0.8133333333333334, "grad_norm": 1.3395154476165771, "learning_rate": 1.22e-05, "loss": 0.979, "step": 1830 }, { "epoch": 0.8177777777777778, "grad_norm": 8.083459854125977, "learning_rate": 1.1914285714285716e-05, "loss": 0.9691, "step": 1840 }, { "epoch": 0.8222222222222222, "grad_norm": 1.3631497621536255, "learning_rate": 1.1628571428571429e-05, "loss": 0.9148, "step": 1850 }, { "epoch": 0.8266666666666667, "grad_norm": 1.3387725353240967, "learning_rate": 1.1342857142857143e-05, "loss": 1.058, "step": 1860 }, { "epoch": 0.8311111111111111, "grad_norm": 5.362998008728027, "learning_rate": 1.1057142857142858e-05, "loss": 1.1844, "step": 1870 }, { "epoch": 0.8355555555555556, "grad_norm": 1.5291681289672852, "learning_rate": 1.0771428571428572e-05, "loss": 1.0351, "step": 1880 }, { "epoch": 0.84, "grad_norm": 2.036616563796997, "learning_rate": 1.0485714285714286e-05, "loss": 1.1685, "step": 1890 }, { "epoch": 0.8444444444444444, "grad_norm": 10.672110557556152, "learning_rate": 1.02e-05, "loss": 1.186, "step": 1900 }, { "epoch": 0.8488888888888889, "grad_norm": 7.35708475112915, "learning_rate": 9.914285714285715e-06, "loss": 1.0015, "step": 1910 }, { "epoch": 0.8533333333333334, "grad_norm": 3.0570969581604004, "learning_rate": 9.628571428571428e-06, "loss": 1.0048, "step": 1920 }, { "epoch": 0.8577777777777778, "grad_norm": 2.548383951187134, "learning_rate": 9.342857142857144e-06, "loss": 0.9308, "step": 1930 }, { "epoch": 0.8622222222222222, "grad_norm": 2.9634547233581543, "learning_rate": 9.057142857142856e-06, "loss": 1.108, "step": 1940 }, { "epoch": 0.8666666666666667, "grad_norm": 1.768025279045105, "learning_rate": 8.771428571428572e-06, "loss": 1.1125, "step": 1950 }, { "epoch": 0.8711111111111111, "grad_norm": 1.4923690557479858, "learning_rate": 8.485714285714285e-06, "loss": 1.0878, "step": 1960 }, { "epoch": 0.8755555555555555, "grad_norm": 1.8677984476089478, "learning_rate": 8.200000000000001e-06, "loss": 0.8959, "step": 1970 }, { "epoch": 0.88, "grad_norm": 4.373391151428223, "learning_rate": 7.914285714285714e-06, "loss": 0.9732, "step": 1980 }, { "epoch": 0.8844444444444445, "grad_norm": 1.9039726257324219, "learning_rate": 7.628571428571429e-06, "loss": 0.9692, "step": 1990 }, { "epoch": 0.8888888888888888, "grad_norm": 4.483780384063721, "learning_rate": 7.342857142857143e-06, "loss": 0.9734, "step": 2000 }, { "epoch": 0.8933333333333333, "grad_norm": 2.355618476867676, "learning_rate": 7.057142857142858e-06, "loss": 0.7923, "step": 2010 }, { "epoch": 0.8977777777777778, "grad_norm": 1.768234133720398, "learning_rate": 6.771428571428571e-06, "loss": 1.0397, "step": 2020 }, { "epoch": 0.9022222222222223, "grad_norm": 1.9736918210983276, "learning_rate": 6.485714285714286e-06, "loss": 0.9889, "step": 2030 }, { "epoch": 0.9066666666666666, "grad_norm": 2.7325940132141113, "learning_rate": 6.2e-06, "loss": 0.9604, "step": 2040 }, { "epoch": 0.9111111111111111, "grad_norm": 37.42255401611328, "learning_rate": 5.914285714285714e-06, "loss": 1.06, "step": 2050 }, { "epoch": 0.9155555555555556, "grad_norm": 3.5316126346588135, "learning_rate": 5.628571428571429e-06, "loss": 1.0478, "step": 2060 }, { "epoch": 0.92, "grad_norm": 4.869263648986816, "learning_rate": 5.342857142857143e-06, "loss": 0.8253, "step": 2070 }, { "epoch": 0.9244444444444444, "grad_norm": 71.07227325439453, "learning_rate": 5.057142857142857e-06, "loss": 1.1752, "step": 2080 }, { "epoch": 0.9288888888888889, "grad_norm": 5.730470657348633, "learning_rate": 4.771428571428572e-06, "loss": 0.9383, "step": 2090 }, { "epoch": 0.9333333333333333, "grad_norm": 24.144546508789062, "learning_rate": 4.485714285714286e-06, "loss": 0.9968, "step": 2100 }, { "epoch": 0.9377777777777778, "grad_norm": 3.403139352798462, "learning_rate": 4.2000000000000004e-06, "loss": 0.8893, "step": 2110 }, { "epoch": 0.9422222222222222, "grad_norm": 3.723447322845459, "learning_rate": 3.914285714285715e-06, "loss": 1.0404, "step": 2120 }, { "epoch": 0.9466666666666667, "grad_norm": 9.551548957824707, "learning_rate": 3.6285714285714283e-06, "loss": 0.9837, "step": 2130 }, { "epoch": 0.9511111111111111, "grad_norm": 6.35592794418335, "learning_rate": 3.3428571428571427e-06, "loss": 0.8563, "step": 2140 }, { "epoch": 0.9555555555555556, "grad_norm": 3.69284987449646, "learning_rate": 3.0571428571428575e-06, "loss": 1.0082, "step": 2150 }, { "epoch": 0.96, "grad_norm": 2.6728179454803467, "learning_rate": 2.771428571428572e-06, "loss": 0.9613, "step": 2160 }, { "epoch": 0.9644444444444444, "grad_norm": 6.068182945251465, "learning_rate": 2.4857142857142858e-06, "loss": 0.9627, "step": 2170 }, { "epoch": 0.9688888888888889, "grad_norm": 28.534027099609375, "learning_rate": 2.2e-06, "loss": 0.9503, "step": 2180 }, { "epoch": 0.9733333333333334, "grad_norm": 7.36533260345459, "learning_rate": 1.9142857142857145e-06, "loss": 1.0315, "step": 2190 }, { "epoch": 0.9777777777777777, "grad_norm": 126.33111572265625, "learning_rate": 1.6285714285714286e-06, "loss": 1.015, "step": 2200 }, { "epoch": 0.9822222222222222, "grad_norm": 3.259016990661621, "learning_rate": 1.342857142857143e-06, "loss": 0.8513, "step": 2210 }, { "epoch": 0.9866666666666667, "grad_norm": 3.191985607147217, "learning_rate": 1.0571428571428573e-06, "loss": 1.0846, "step": 2220 }, { "epoch": 0.9911111111111112, "grad_norm": 3.515030860900879, "learning_rate": 7.714285714285715e-07, "loss": 0.9536, "step": 2230 }, { "epoch": 0.9955555555555555, "grad_norm": 3.0338504314422607, "learning_rate": 4.857142857142857e-07, "loss": 1.0406, "step": 2240 }, { "epoch": 1.0, "grad_norm": 6.6893110275268555, "learning_rate": 2.285714285714286e-07, "loss": 1.0769, "step": 2250 } ], "logging_steps": 10, "max_steps": 2250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.0067730341888e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }