{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 0.18818965554237366, "learning_rate": 0.00019942400000000002, "loss": 1.4398, "step": 10 }, { "epoch": 0.0064, "grad_norm": 0.20508623123168945, "learning_rate": 0.00019878400000000003, "loss": 1.3682, "step": 20 }, { "epoch": 0.0096, "grad_norm": 0.18839821219444275, "learning_rate": 0.000198144, "loss": 1.3981, "step": 30 }, { "epoch": 0.0128, "grad_norm": 0.20183835923671722, "learning_rate": 0.000197504, "loss": 1.3109, "step": 40 }, { "epoch": 0.016, "grad_norm": 0.22559459507465363, "learning_rate": 0.000196864, "loss": 1.343, "step": 50 }, { "epoch": 0.0192, "grad_norm": 0.21955451369285583, "learning_rate": 0.000196224, "loss": 1.3645, "step": 60 }, { "epoch": 0.0224, "grad_norm": 0.2031077742576599, "learning_rate": 0.000195584, "loss": 1.2116, "step": 70 }, { "epoch": 0.0256, "grad_norm": 0.24542862176895142, "learning_rate": 0.000194944, "loss": 1.3466, "step": 80 }, { "epoch": 0.0288, "grad_norm": 0.24425864219665527, "learning_rate": 0.00019430400000000002, "loss": 1.2202, "step": 90 }, { "epoch": 0.032, "grad_norm": 0.23464636504650116, "learning_rate": 0.000193664, "loss": 1.2862, "step": 100 }, { "epoch": 0.0352, "grad_norm": 0.22896084189414978, "learning_rate": 0.000193024, "loss": 1.2424, "step": 110 }, { "epoch": 0.0384, "grad_norm": 0.30132901668548584, "learning_rate": 0.000192384, "loss": 1.2674, "step": 120 }, { "epoch": 0.0416, "grad_norm": 0.2417932003736496, "learning_rate": 0.000191744, "loss": 1.2433, "step": 130 }, { "epoch": 0.0448, "grad_norm": 0.25476428866386414, "learning_rate": 0.00019110400000000002, "loss": 1.2969, "step": 140 }, { "epoch": 0.048, "grad_norm": 0.28159213066101074, "learning_rate": 0.00019046400000000002, "loss": 1.2396, "step": 150 }, { "epoch": 0.0512, "grad_norm": 0.2914562225341797, "learning_rate": 0.000189824, "loss": 1.3067, "step": 160 }, { "epoch": 0.0544, "grad_norm": 0.2946033477783203, "learning_rate": 0.000189184, "loss": 1.1911, "step": 170 }, { "epoch": 0.0576, "grad_norm": 0.2544347941875458, "learning_rate": 0.00018854400000000002, "loss": 1.2062, "step": 180 }, { "epoch": 0.0608, "grad_norm": 0.2680202126502991, "learning_rate": 0.00018790400000000002, "loss": 1.1604, "step": 190 }, { "epoch": 0.064, "grad_norm": 0.25478115677833557, "learning_rate": 0.00018726400000000003, "loss": 1.2515, "step": 200 }, { "epoch": 0.0672, "grad_norm": 0.26804226636886597, "learning_rate": 0.000186624, "loss": 1.2765, "step": 210 }, { "epoch": 0.0704, "grad_norm": 0.24587133526802063, "learning_rate": 0.00018598400000000001, "loss": 1.1768, "step": 220 }, { "epoch": 0.0736, "grad_norm": 0.27216073870658875, "learning_rate": 0.00018534400000000002, "loss": 1.1848, "step": 230 }, { "epoch": 0.0768, "grad_norm": 0.3114471733570099, "learning_rate": 0.000184704, "loss": 1.2086, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.3045384883880615, "learning_rate": 0.000184064, "loss": 1.1874, "step": 250 }, { "epoch": 0.0832, "grad_norm": 0.2793915271759033, "learning_rate": 0.000183424, "loss": 1.1977, "step": 260 }, { "epoch": 0.0864, "grad_norm": 0.34699785709381104, "learning_rate": 0.000182784, "loss": 1.2089, "step": 270 }, { "epoch": 0.0896, "grad_norm": 0.25291335582733154, "learning_rate": 0.000182144, "loss": 1.2459, "step": 280 }, { "epoch": 0.0928, "grad_norm": 0.2616818845272064, "learning_rate": 0.000181504, "loss": 1.2655, "step": 290 }, { "epoch": 0.096, "grad_norm": 0.2569883167743683, "learning_rate": 0.000180864, "loss": 1.2479, "step": 300 }, { "epoch": 0.0992, "grad_norm": 0.33097052574157715, "learning_rate": 0.00018022400000000001, "loss": 1.1975, "step": 310 }, { "epoch": 0.1024, "grad_norm": 0.2557203471660614, "learning_rate": 0.00017958400000000002, "loss": 1.1423, "step": 320 }, { "epoch": 0.1056, "grad_norm": 0.24836640059947968, "learning_rate": 0.000178944, "loss": 1.197, "step": 330 }, { "epoch": 0.1088, "grad_norm": 0.25998368859291077, "learning_rate": 0.000178304, "loss": 1.195, "step": 340 }, { "epoch": 0.112, "grad_norm": 0.29318585991859436, "learning_rate": 0.000177664, "loss": 1.1771, "step": 350 }, { "epoch": 0.1152, "grad_norm": 0.2819266617298126, "learning_rate": 0.00017702400000000002, "loss": 1.1113, "step": 360 }, { "epoch": 0.1184, "grad_norm": 0.24419356882572174, "learning_rate": 0.00017638400000000002, "loss": 1.2027, "step": 370 }, { "epoch": 0.1216, "grad_norm": 0.30148938298225403, "learning_rate": 0.000175744, "loss": 1.1583, "step": 380 }, { "epoch": 0.1248, "grad_norm": 0.2704978287220001, "learning_rate": 0.000175104, "loss": 1.2246, "step": 390 }, { "epoch": 0.128, "grad_norm": 0.2642189562320709, "learning_rate": 0.00017446400000000002, "loss": 1.1543, "step": 400 }, { "epoch": 0.1312, "grad_norm": 0.31052809953689575, "learning_rate": 0.00017382400000000002, "loss": 1.2039, "step": 410 }, { "epoch": 0.1344, "grad_norm": 0.2596384286880493, "learning_rate": 0.000173184, "loss": 1.1693, "step": 420 }, { "epoch": 0.1376, "grad_norm": 0.2813952565193176, "learning_rate": 0.000172544, "loss": 1.2213, "step": 430 }, { "epoch": 0.1408, "grad_norm": 0.26021143794059753, "learning_rate": 0.00017190399999999999, "loss": 1.1611, "step": 440 }, { "epoch": 0.144, "grad_norm": 0.3072742521762848, "learning_rate": 0.000171264, "loss": 1.1464, "step": 450 }, { "epoch": 0.1472, "grad_norm": 0.3125726580619812, "learning_rate": 0.000170624, "loss": 1.1901, "step": 460 }, { "epoch": 0.1504, "grad_norm": 0.25702011585235596, "learning_rate": 0.000169984, "loss": 1.1439, "step": 470 }, { "epoch": 0.1536, "grad_norm": 0.26658275723457336, "learning_rate": 0.000169344, "loss": 1.1256, "step": 480 }, { "epoch": 0.1568, "grad_norm": 0.2687830626964569, "learning_rate": 0.00016870400000000002, "loss": 1.233, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.30417656898498535, "learning_rate": 0.000168064, "loss": 1.1951, "step": 500 }, { "epoch": 0.1632, "grad_norm": 0.24493242800235748, "learning_rate": 0.000167424, "loss": 1.2125, "step": 510 }, { "epoch": 0.1664, "grad_norm": 0.2442469447851181, "learning_rate": 0.000166784, "loss": 1.2005, "step": 520 }, { "epoch": 0.1696, "grad_norm": 0.2598196566104889, "learning_rate": 0.00016614400000000001, "loss": 1.1361, "step": 530 }, { "epoch": 0.1728, "grad_norm": 0.2823512852191925, "learning_rate": 0.00016550400000000002, "loss": 1.2671, "step": 540 }, { "epoch": 0.176, "grad_norm": 0.27284982800483704, "learning_rate": 0.00016486400000000003, "loss": 1.2019, "step": 550 }, { "epoch": 0.1792, "grad_norm": 0.4265678822994232, "learning_rate": 0.000164224, "loss": 1.1557, "step": 560 }, { "epoch": 0.1824, "grad_norm": 0.32852381467819214, "learning_rate": 0.000163584, "loss": 1.1792, "step": 570 }, { "epoch": 0.1856, "grad_norm": 0.25044888257980347, "learning_rate": 0.00016294400000000002, "loss": 1.1814, "step": 580 }, { "epoch": 0.1888, "grad_norm": 0.30278605222702026, "learning_rate": 0.00016230400000000002, "loss": 1.1865, "step": 590 }, { "epoch": 0.192, "grad_norm": 0.259230375289917, "learning_rate": 0.000161664, "loss": 1.2012, "step": 600 }, { "epoch": 0.1952, "grad_norm": 0.3037353456020355, "learning_rate": 0.000161024, "loss": 1.1508, "step": 610 }, { "epoch": 0.1984, "grad_norm": 0.32500702142715454, "learning_rate": 0.000160384, "loss": 1.2302, "step": 620 }, { "epoch": 0.2016, "grad_norm": 0.3390319049358368, "learning_rate": 0.000159744, "loss": 1.1949, "step": 630 }, { "epoch": 0.2048, "grad_norm": 0.30461153388023376, "learning_rate": 0.000159104, "loss": 1.1659, "step": 640 }, { "epoch": 0.208, "grad_norm": 0.2725638449192047, "learning_rate": 0.000158464, "loss": 1.1905, "step": 650 }, { "epoch": 0.2112, "grad_norm": 0.24364377558231354, "learning_rate": 0.000157824, "loss": 1.1194, "step": 660 }, { "epoch": 0.2144, "grad_norm": 0.2993292510509491, "learning_rate": 0.000157184, "loss": 1.165, "step": 670 }, { "epoch": 0.2176, "grad_norm": 0.22024789452552795, "learning_rate": 0.000156544, "loss": 1.1187, "step": 680 }, { "epoch": 0.2208, "grad_norm": 0.2543094754219055, "learning_rate": 0.000155904, "loss": 1.0779, "step": 690 }, { "epoch": 0.224, "grad_norm": 0.31767502427101135, "learning_rate": 0.000155264, "loss": 1.1076, "step": 700 }, { "epoch": 0.2272, "grad_norm": 0.27651944756507874, "learning_rate": 0.00015462400000000002, "loss": 1.1748, "step": 710 }, { "epoch": 0.2304, "grad_norm": 0.2532593905925751, "learning_rate": 0.00015398400000000002, "loss": 1.1302, "step": 720 }, { "epoch": 0.2336, "grad_norm": 0.26502105593681335, "learning_rate": 0.000153344, "loss": 1.1652, "step": 730 }, { "epoch": 0.2368, "grad_norm": 0.22709833085536957, "learning_rate": 0.000152704, "loss": 1.1045, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.273548424243927, "learning_rate": 0.000152064, "loss": 1.1216, "step": 750 }, { "epoch": 0.2432, "grad_norm": 0.26936766505241394, "learning_rate": 0.00015142400000000002, "loss": 1.0981, "step": 760 }, { "epoch": 0.2464, "grad_norm": 0.2642468810081482, "learning_rate": 0.00015078400000000003, "loss": 1.1044, "step": 770 }, { "epoch": 0.2496, "grad_norm": 0.25724905729293823, "learning_rate": 0.000150144, "loss": 1.1474, "step": 780 }, { "epoch": 0.2528, "grad_norm": 0.27715614438056946, "learning_rate": 0.000149504, "loss": 1.102, "step": 790 }, { "epoch": 0.256, "grad_norm": 0.25462499260902405, "learning_rate": 0.00014886400000000002, "loss": 1.1834, "step": 800 }, { "epoch": 0.2592, "grad_norm": 0.25248298048973083, "learning_rate": 0.000148224, "loss": 1.2023, "step": 810 }, { "epoch": 0.2624, "grad_norm": 0.3147791922092438, "learning_rate": 0.000147584, "loss": 1.1564, "step": 820 }, { "epoch": 0.2656, "grad_norm": 0.30874860286712646, "learning_rate": 0.000146944, "loss": 1.1722, "step": 830 }, { "epoch": 0.2688, "grad_norm": 0.2486383467912674, "learning_rate": 0.000146304, "loss": 1.1161, "step": 840 }, { "epoch": 0.272, "grad_norm": 0.24819347262382507, "learning_rate": 0.000145664, "loss": 1.1813, "step": 850 }, { "epoch": 0.2752, "grad_norm": 0.2713952660560608, "learning_rate": 0.000145024, "loss": 1.0917, "step": 860 }, { "epoch": 0.2784, "grad_norm": 0.2632099986076355, "learning_rate": 0.000144384, "loss": 1.1806, "step": 870 }, { "epoch": 0.2816, "grad_norm": 0.24610574543476105, "learning_rate": 0.000143744, "loss": 1.1397, "step": 880 }, { "epoch": 0.2848, "grad_norm": 0.3086780607700348, "learning_rate": 0.00014310400000000002, "loss": 1.1338, "step": 890 }, { "epoch": 0.288, "grad_norm": 0.25754594802856445, "learning_rate": 0.000142464, "loss": 1.148, "step": 900 }, { "epoch": 0.2912, "grad_norm": 0.2540118098258972, "learning_rate": 0.000141824, "loss": 1.0787, "step": 910 }, { "epoch": 0.2944, "grad_norm": 0.3876936733722687, "learning_rate": 0.000141184, "loss": 1.0841, "step": 920 }, { "epoch": 0.2976, "grad_norm": 0.27151429653167725, "learning_rate": 0.00014054400000000002, "loss": 1.0859, "step": 930 }, { "epoch": 0.3008, "grad_norm": 0.2533249855041504, "learning_rate": 0.00013990400000000002, "loss": 1.1347, "step": 940 }, { "epoch": 0.304, "grad_norm": 0.2707951068878174, "learning_rate": 0.00013926400000000003, "loss": 1.1787, "step": 950 }, { "epoch": 0.3072, "grad_norm": 0.27499666810035706, "learning_rate": 0.000138624, "loss": 1.1991, "step": 960 }, { "epoch": 0.3104, "grad_norm": 0.31902188062667847, "learning_rate": 0.000137984, "loss": 1.2036, "step": 970 }, { "epoch": 0.3136, "grad_norm": 0.2782028913497925, "learning_rate": 0.00013734400000000002, "loss": 1.1654, "step": 980 }, { "epoch": 0.3168, "grad_norm": 0.3343225419521332, "learning_rate": 0.000136704, "loss": 1.1677, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.2642679810523987, "learning_rate": 0.000136064, "loss": 1.1241, "step": 1000 }, { "epoch": 0.3232, "grad_norm": 0.37036266922950745, "learning_rate": 0.000135424, "loss": 1.1454, "step": 1010 }, { "epoch": 0.3264, "grad_norm": 0.2647375464439392, "learning_rate": 0.000134784, "loss": 1.1288, "step": 1020 }, { "epoch": 0.3296, "grad_norm": 0.3140479028224945, "learning_rate": 0.000134144, "loss": 1.2205, "step": 1030 }, { "epoch": 0.3328, "grad_norm": 0.24704284965991974, "learning_rate": 0.000133504, "loss": 1.1341, "step": 1040 }, { "epoch": 0.336, "grad_norm": 0.3063775300979614, "learning_rate": 0.000132864, "loss": 1.2261, "step": 1050 }, { "epoch": 0.3392, "grad_norm": 0.2879609167575836, "learning_rate": 0.000132224, "loss": 1.1676, "step": 1060 }, { "epoch": 0.3424, "grad_norm": 0.27018502354621887, "learning_rate": 0.000131584, "loss": 1.1237, "step": 1070 }, { "epoch": 0.3456, "grad_norm": 0.25358906388282776, "learning_rate": 0.000130944, "loss": 1.1034, "step": 1080 }, { "epoch": 0.3488, "grad_norm": 0.26398876309394836, "learning_rate": 0.000130304, "loss": 1.2041, "step": 1090 }, { "epoch": 0.352, "grad_norm": 0.4267687499523163, "learning_rate": 0.000129664, "loss": 1.15, "step": 1100 }, { "epoch": 0.3552, "grad_norm": 0.26307064294815063, "learning_rate": 0.00012902400000000002, "loss": 1.0921, "step": 1110 }, { "epoch": 0.3584, "grad_norm": 0.38714703917503357, "learning_rate": 0.00012838400000000002, "loss": 1.225, "step": 1120 }, { "epoch": 0.3616, "grad_norm": 0.275495320558548, "learning_rate": 0.000127744, "loss": 1.1513, "step": 1130 }, { "epoch": 0.3648, "grad_norm": 0.26792117953300476, "learning_rate": 0.000127104, "loss": 1.1293, "step": 1140 }, { "epoch": 0.368, "grad_norm": 0.2474093735218048, "learning_rate": 0.00012646400000000001, "loss": 1.175, "step": 1150 }, { "epoch": 0.3712, "grad_norm": 0.2551768124103546, "learning_rate": 0.00012582400000000002, "loss": 1.1212, "step": 1160 }, { "epoch": 0.3744, "grad_norm": 0.2605915665626526, "learning_rate": 0.000125184, "loss": 1.1303, "step": 1170 }, { "epoch": 0.3776, "grad_norm": 0.30072781443595886, "learning_rate": 0.000124544, "loss": 1.1552, "step": 1180 }, { "epoch": 0.3808, "grad_norm": 0.2362007051706314, "learning_rate": 0.00012390399999999998, "loss": 1.1984, "step": 1190 }, { "epoch": 0.384, "grad_norm": 0.3113998472690582, "learning_rate": 0.000123264, "loss": 1.1239, "step": 1200 }, { "epoch": 0.3872, "grad_norm": 0.3142775893211365, "learning_rate": 0.000122624, "loss": 1.1089, "step": 1210 }, { "epoch": 0.3904, "grad_norm": 0.38791659474372864, "learning_rate": 0.000121984, "loss": 1.1532, "step": 1220 }, { "epoch": 0.3936, "grad_norm": 0.37536805868148804, "learning_rate": 0.00012134400000000001, "loss": 1.1333, "step": 1230 }, { "epoch": 0.3968, "grad_norm": 0.2762630879878998, "learning_rate": 0.00012070399999999999, "loss": 1.0189, "step": 1240 }, { "epoch": 0.4, "grad_norm": 0.25079619884490967, "learning_rate": 0.000120064, "loss": 1.1169, "step": 1250 }, { "epoch": 0.4032, "grad_norm": 0.2860804498195648, "learning_rate": 0.000119424, "loss": 1.0866, "step": 1260 }, { "epoch": 0.4064, "grad_norm": 0.2980159819126129, "learning_rate": 0.000118784, "loss": 1.1898, "step": 1270 }, { "epoch": 0.4096, "grad_norm": 0.336664080619812, "learning_rate": 0.00011814400000000001, "loss": 1.1332, "step": 1280 }, { "epoch": 0.4128, "grad_norm": 0.26283812522888184, "learning_rate": 0.00011750400000000002, "loss": 1.1367, "step": 1290 }, { "epoch": 0.416, "grad_norm": 0.27355343103408813, "learning_rate": 0.000116864, "loss": 1.0927, "step": 1300 }, { "epoch": 0.4192, "grad_norm": 0.2756136953830719, "learning_rate": 0.000116224, "loss": 1.1109, "step": 1310 }, { "epoch": 0.4224, "grad_norm": 0.26861876249313354, "learning_rate": 0.00011558400000000001, "loss": 1.1401, "step": 1320 }, { "epoch": 0.4256, "grad_norm": 0.23718690872192383, "learning_rate": 0.000114944, "loss": 1.1717, "step": 1330 }, { "epoch": 0.4288, "grad_norm": 0.2714873254299164, "learning_rate": 0.00011430400000000001, "loss": 1.1032, "step": 1340 }, { "epoch": 0.432, "grad_norm": 0.29829949140548706, "learning_rate": 0.00011366400000000001, "loss": 1.1129, "step": 1350 }, { "epoch": 0.4352, "grad_norm": 0.2577730417251587, "learning_rate": 0.000113024, "loss": 1.0978, "step": 1360 }, { "epoch": 0.4384, "grad_norm": 0.24967093765735626, "learning_rate": 0.000112384, "loss": 1.1533, "step": 1370 }, { "epoch": 0.4416, "grad_norm": 0.2530049681663513, "learning_rate": 0.000111744, "loss": 1.0921, "step": 1380 }, { "epoch": 0.4448, "grad_norm": 0.30380481481552124, "learning_rate": 0.00011110400000000001, "loss": 1.1773, "step": 1390 }, { "epoch": 0.448, "grad_norm": 0.25974929332733154, "learning_rate": 0.00011046400000000002, "loss": 1.1584, "step": 1400 }, { "epoch": 0.4512, "grad_norm": 0.37404870986938477, "learning_rate": 0.00010982400000000001, "loss": 1.1199, "step": 1410 }, { "epoch": 0.4544, "grad_norm": 0.32186731696128845, "learning_rate": 0.000109184, "loss": 1.1392, "step": 1420 }, { "epoch": 0.4576, "grad_norm": 0.2843293845653534, "learning_rate": 0.000108544, "loss": 1.1223, "step": 1430 }, { "epoch": 0.4608, "grad_norm": 0.310863733291626, "learning_rate": 0.000107904, "loss": 1.1112, "step": 1440 }, { "epoch": 0.464, "grad_norm": 0.2348843812942505, "learning_rate": 0.00010726400000000001, "loss": 1.227, "step": 1450 }, { "epoch": 0.4672, "grad_norm": 0.27787262201309204, "learning_rate": 0.00010662400000000001, "loss": 1.145, "step": 1460 }, { "epoch": 0.4704, "grad_norm": 0.26069822907447815, "learning_rate": 0.000105984, "loss": 1.1417, "step": 1470 }, { "epoch": 0.4736, "grad_norm": 0.27364885807037354, "learning_rate": 0.000105344, "loss": 1.0959, "step": 1480 }, { "epoch": 0.4768, "grad_norm": 0.24618731439113617, "learning_rate": 0.000104704, "loss": 1.1312, "step": 1490 }, { "epoch": 0.48, "grad_norm": 0.28439000248908997, "learning_rate": 0.00010406400000000001, "loss": 1.1506, "step": 1500 }, { "epoch": 0.4832, "grad_norm": 0.26154178380966187, "learning_rate": 0.000103424, "loss": 1.0225, "step": 1510 }, { "epoch": 0.4864, "grad_norm": 0.26929762959480286, "learning_rate": 0.00010278400000000001, "loss": 1.061, "step": 1520 }, { "epoch": 0.4896, "grad_norm": 0.2753245532512665, "learning_rate": 0.00010214399999999999, "loss": 1.1724, "step": 1530 }, { "epoch": 0.4928, "grad_norm": 0.23799718916416168, "learning_rate": 0.000101504, "loss": 1.1397, "step": 1540 }, { "epoch": 0.496, "grad_norm": 0.277057945728302, "learning_rate": 0.000100864, "loss": 1.1924, "step": 1550 }, { "epoch": 0.4992, "grad_norm": 0.27867016196250916, "learning_rate": 0.00010022400000000001, "loss": 1.1841, "step": 1560 }, { "epoch": 0.5024, "grad_norm": 0.2876740097999573, "learning_rate": 9.9584e-05, "loss": 1.1085, "step": 1570 }, { "epoch": 0.5056, "grad_norm": 0.2694062292575836, "learning_rate": 9.8944e-05, "loss": 1.0743, "step": 1580 }, { "epoch": 0.5088, "grad_norm": 0.27280256152153015, "learning_rate": 9.830400000000001e-05, "loss": 1.2189, "step": 1590 }, { "epoch": 0.512, "grad_norm": 0.3361833393573761, "learning_rate": 9.7664e-05, "loss": 1.0771, "step": 1600 }, { "epoch": 0.5152, "grad_norm": 0.2917429208755493, "learning_rate": 9.7024e-05, "loss": 1.1137, "step": 1610 }, { "epoch": 0.5184, "grad_norm": 0.306228905916214, "learning_rate": 9.6384e-05, "loss": 1.1708, "step": 1620 }, { "epoch": 0.5216, "grad_norm": 0.25011730194091797, "learning_rate": 9.5744e-05, "loss": 1.1042, "step": 1630 }, { "epoch": 0.5248, "grad_norm": 0.3258339464664459, "learning_rate": 9.5104e-05, "loss": 1.1781, "step": 1640 }, { "epoch": 0.528, "grad_norm": 0.3206620216369629, "learning_rate": 9.446400000000001e-05, "loss": 1.0991, "step": 1650 }, { "epoch": 0.5312, "grad_norm": 0.31144049763679504, "learning_rate": 9.3824e-05, "loss": 1.101, "step": 1660 }, { "epoch": 0.5344, "grad_norm": 0.2294250875711441, "learning_rate": 9.318400000000001e-05, "loss": 1.1346, "step": 1670 }, { "epoch": 0.5376, "grad_norm": 0.34881460666656494, "learning_rate": 9.254400000000001e-05, "loss": 1.1389, "step": 1680 }, { "epoch": 0.5408, "grad_norm": 0.27140548825263977, "learning_rate": 9.1904e-05, "loss": 1.1214, "step": 1690 }, { "epoch": 0.544, "grad_norm": 0.25193724036216736, "learning_rate": 9.1264e-05, "loss": 1.1273, "step": 1700 }, { "epoch": 0.5472, "grad_norm": 0.27552053332328796, "learning_rate": 9.0624e-05, "loss": 1.1523, "step": 1710 }, { "epoch": 0.5504, "grad_norm": 0.2877196967601776, "learning_rate": 8.9984e-05, "loss": 1.082, "step": 1720 }, { "epoch": 0.5536, "grad_norm": 0.23511843383312225, "learning_rate": 8.9344e-05, "loss": 1.1251, "step": 1730 }, { "epoch": 0.5568, "grad_norm": 0.28936073184013367, "learning_rate": 8.870400000000001e-05, "loss": 1.1742, "step": 1740 }, { "epoch": 0.56, "grad_norm": 0.28244665265083313, "learning_rate": 8.8064e-05, "loss": 1.1665, "step": 1750 }, { "epoch": 0.5632, "grad_norm": 0.22868531942367554, "learning_rate": 8.742400000000001e-05, "loss": 1.0733, "step": 1760 }, { "epoch": 0.5664, "grad_norm": 0.23258651793003082, "learning_rate": 8.6784e-05, "loss": 1.0538, "step": 1770 }, { "epoch": 0.5696, "grad_norm": 0.2886444330215454, "learning_rate": 8.614400000000001e-05, "loss": 1.0896, "step": 1780 }, { "epoch": 0.5728, "grad_norm": 0.26151803135871887, "learning_rate": 8.5504e-05, "loss": 1.0771, "step": 1790 }, { "epoch": 0.576, "grad_norm": 0.2829320430755615, "learning_rate": 8.486399999999999e-05, "loss": 1.0796, "step": 1800 }, { "epoch": 0.5792, "grad_norm": 0.2471112608909607, "learning_rate": 8.4224e-05, "loss": 1.1751, "step": 1810 }, { "epoch": 0.5824, "grad_norm": 0.35704588890075684, "learning_rate": 8.3584e-05, "loss": 1.1203, "step": 1820 }, { "epoch": 0.5856, "grad_norm": 0.3598352074623108, "learning_rate": 8.2944e-05, "loss": 1.1982, "step": 1830 }, { "epoch": 0.5888, "grad_norm": 0.24873049557209015, "learning_rate": 8.2304e-05, "loss": 1.11, "step": 1840 }, { "epoch": 0.592, "grad_norm": 0.2786683142185211, "learning_rate": 8.166400000000001e-05, "loss": 1.1045, "step": 1850 }, { "epoch": 0.5952, "grad_norm": 0.31083497405052185, "learning_rate": 8.1024e-05, "loss": 1.1094, "step": 1860 }, { "epoch": 0.5984, "grad_norm": 0.25888901948928833, "learning_rate": 8.038400000000001e-05, "loss": 1.0637, "step": 1870 }, { "epoch": 0.6016, "grad_norm": 0.27845245599746704, "learning_rate": 7.9744e-05, "loss": 1.1442, "step": 1880 }, { "epoch": 0.6048, "grad_norm": 0.32606974244117737, "learning_rate": 7.910400000000001e-05, "loss": 1.1409, "step": 1890 }, { "epoch": 0.608, "grad_norm": 0.28251323103904724, "learning_rate": 7.8464e-05, "loss": 1.1037, "step": 1900 }, { "epoch": 0.6112, "grad_norm": 0.2702641487121582, "learning_rate": 7.7824e-05, "loss": 1.11, "step": 1910 }, { "epoch": 0.6144, "grad_norm": 0.31664419174194336, "learning_rate": 7.7184e-05, "loss": 1.0634, "step": 1920 }, { "epoch": 0.6176, "grad_norm": 0.30974429845809937, "learning_rate": 7.6544e-05, "loss": 1.0884, "step": 1930 }, { "epoch": 0.6208, "grad_norm": 0.27011537551879883, "learning_rate": 7.590400000000001e-05, "loss": 1.0534, "step": 1940 }, { "epoch": 0.624, "grad_norm": 0.26978424191474915, "learning_rate": 7.5264e-05, "loss": 1.1019, "step": 1950 }, { "epoch": 0.6272, "grad_norm": 0.29052773118019104, "learning_rate": 7.462400000000001e-05, "loss": 1.1617, "step": 1960 }, { "epoch": 0.6304, "grad_norm": 0.2614887058734894, "learning_rate": 7.398400000000002e-05, "loss": 1.1562, "step": 1970 }, { "epoch": 0.6336, "grad_norm": 0.27294448018074036, "learning_rate": 7.334400000000001e-05, "loss": 1.1257, "step": 1980 }, { "epoch": 0.6368, "grad_norm": 0.24990952014923096, "learning_rate": 7.2704e-05, "loss": 1.0977, "step": 1990 }, { "epoch": 0.64, "grad_norm": 0.272666335105896, "learning_rate": 7.206399999999999e-05, "loss": 1.0731, "step": 2000 }, { "epoch": 0.6432, "grad_norm": 0.28734877705574036, "learning_rate": 7.1424e-05, "loss": 1.1062, "step": 2010 }, { "epoch": 0.6464, "grad_norm": 0.27861565351486206, "learning_rate": 7.0784e-05, "loss": 1.1435, "step": 2020 }, { "epoch": 0.6496, "grad_norm": 0.34537386894226074, "learning_rate": 7.0144e-05, "loss": 1.1227, "step": 2030 }, { "epoch": 0.6528, "grad_norm": 0.3273583650588989, "learning_rate": 6.9504e-05, "loss": 1.0856, "step": 2040 }, { "epoch": 0.656, "grad_norm": 0.3750070631504059, "learning_rate": 6.886400000000001e-05, "loss": 1.1438, "step": 2050 }, { "epoch": 0.6592, "grad_norm": 0.4590343236923218, "learning_rate": 6.8224e-05, "loss": 1.1532, "step": 2060 }, { "epoch": 0.6624, "grad_norm": 0.3549063503742218, "learning_rate": 6.758400000000001e-05, "loss": 1.1418, "step": 2070 }, { "epoch": 0.6656, "grad_norm": 0.33026474714279175, "learning_rate": 6.6944e-05, "loss": 1.0342, "step": 2080 }, { "epoch": 0.6688, "grad_norm": 0.37451469898223877, "learning_rate": 6.6304e-05, "loss": 1.104, "step": 2090 }, { "epoch": 0.672, "grad_norm": 0.32349246740341187, "learning_rate": 6.5664e-05, "loss": 1.1403, "step": 2100 }, { "epoch": 0.6752, "grad_norm": 0.3289710581302643, "learning_rate": 6.5024e-05, "loss": 1.1349, "step": 2110 }, { "epoch": 0.6784, "grad_norm": 0.24263447523117065, "learning_rate": 6.4384e-05, "loss": 1.1162, "step": 2120 }, { "epoch": 0.6816, "grad_norm": 0.3055990934371948, "learning_rate": 6.3744e-05, "loss": 1.0623, "step": 2130 }, { "epoch": 0.6848, "grad_norm": 0.26169636845588684, "learning_rate": 6.310400000000001e-05, "loss": 1.1566, "step": 2140 }, { "epoch": 0.688, "grad_norm": 0.2564956545829773, "learning_rate": 6.2464e-05, "loss": 1.1615, "step": 2150 }, { "epoch": 0.6912, "grad_norm": 0.2543143033981323, "learning_rate": 6.182400000000001e-05, "loss": 1.14, "step": 2160 }, { "epoch": 0.6944, "grad_norm": 0.28348684310913086, "learning_rate": 6.1184e-05, "loss": 1.1192, "step": 2170 }, { "epoch": 0.6976, "grad_norm": 0.25627613067626953, "learning_rate": 6.0544e-05, "loss": 1.1469, "step": 2180 }, { "epoch": 0.7008, "grad_norm": 0.39540451765060425, "learning_rate": 5.990400000000001e-05, "loss": 1.1497, "step": 2190 }, { "epoch": 0.704, "grad_norm": 0.2549257278442383, "learning_rate": 5.9264e-05, "loss": 1.077, "step": 2200 }, { "epoch": 0.7072, "grad_norm": 0.2789277136325836, "learning_rate": 5.8624e-05, "loss": 1.048, "step": 2210 }, { "epoch": 0.7104, "grad_norm": 0.28794965147972107, "learning_rate": 5.7984000000000006e-05, "loss": 1.14, "step": 2220 }, { "epoch": 0.7136, "grad_norm": 0.27266013622283936, "learning_rate": 5.7344e-05, "loss": 1.1882, "step": 2230 }, { "epoch": 0.7168, "grad_norm": 0.31794604659080505, "learning_rate": 5.6704000000000005e-05, "loss": 1.1174, "step": 2240 }, { "epoch": 0.72, "grad_norm": 0.26018059253692627, "learning_rate": 5.6064000000000004e-05, "loss": 1.0834, "step": 2250 }, { "epoch": 0.7232, "grad_norm": 0.30461081862449646, "learning_rate": 5.5423999999999997e-05, "loss": 1.0549, "step": 2260 }, { "epoch": 0.7264, "grad_norm": 0.38113224506378174, "learning_rate": 5.4784e-05, "loss": 1.1426, "step": 2270 }, { "epoch": 0.7296, "grad_norm": 0.2623901069164276, "learning_rate": 5.414400000000001e-05, "loss": 1.066, "step": 2280 }, { "epoch": 0.7328, "grad_norm": 0.2910241484642029, "learning_rate": 5.3504e-05, "loss": 1.0094, "step": 2290 }, { "epoch": 0.736, "grad_norm": 0.2720244228839874, "learning_rate": 5.2864e-05, "loss": 1.1315, "step": 2300 }, { "epoch": 0.7392, "grad_norm": 0.25109943747520447, "learning_rate": 5.222400000000001e-05, "loss": 1.0621, "step": 2310 }, { "epoch": 0.7424, "grad_norm": 0.2956967353820801, "learning_rate": 5.1584e-05, "loss": 1.0889, "step": 2320 }, { "epoch": 0.7456, "grad_norm": 0.2685677409172058, "learning_rate": 5.0944000000000006e-05, "loss": 1.0868, "step": 2330 }, { "epoch": 0.7488, "grad_norm": 0.3329203426837921, "learning_rate": 5.0304000000000005e-05, "loss": 1.1943, "step": 2340 }, { "epoch": 0.752, "grad_norm": 0.30616864562034607, "learning_rate": 4.9664000000000004e-05, "loss": 1.1168, "step": 2350 }, { "epoch": 0.7552, "grad_norm": 0.26512446999549866, "learning_rate": 4.9024000000000004e-05, "loss": 1.1155, "step": 2360 }, { "epoch": 0.7584, "grad_norm": 0.3897416591644287, "learning_rate": 4.8384e-05, "loss": 1.1451, "step": 2370 }, { "epoch": 0.7616, "grad_norm": 0.30231884121894836, "learning_rate": 4.7744e-05, "loss": 1.1502, "step": 2380 }, { "epoch": 0.7648, "grad_norm": 0.3417859673500061, "learning_rate": 4.7104e-05, "loss": 1.1075, "step": 2390 }, { "epoch": 0.768, "grad_norm": 0.2546481788158417, "learning_rate": 4.6464e-05, "loss": 1.1244, "step": 2400 }, { "epoch": 0.7712, "grad_norm": 0.2996661067008972, "learning_rate": 4.5824e-05, "loss": 1.0609, "step": 2410 }, { "epoch": 0.7744, "grad_norm": 0.3443852663040161, "learning_rate": 4.5184000000000006e-05, "loss": 1.0461, "step": 2420 }, { "epoch": 0.7776, "grad_norm": 0.4986393451690674, "learning_rate": 4.4544e-05, "loss": 1.1281, "step": 2430 }, { "epoch": 0.7808, "grad_norm": 0.4183667302131653, "learning_rate": 4.3904e-05, "loss": 1.1396, "step": 2440 }, { "epoch": 0.784, "grad_norm": 0.29799389839172363, "learning_rate": 4.3264000000000005e-05, "loss": 1.1093, "step": 2450 }, { "epoch": 0.7872, "grad_norm": 0.28885000944137573, "learning_rate": 4.2624000000000004e-05, "loss": 1.0716, "step": 2460 }, { "epoch": 0.7904, "grad_norm": 0.31649360060691833, "learning_rate": 4.1984e-05, "loss": 1.096, "step": 2470 }, { "epoch": 0.7936, "grad_norm": 0.2691904306411743, "learning_rate": 4.1344e-05, "loss": 1.1591, "step": 2480 }, { "epoch": 0.7968, "grad_norm": 0.31309962272644043, "learning_rate": 4.0704e-05, "loss": 1.1577, "step": 2490 }, { "epoch": 0.8, "grad_norm": 0.27012893557548523, "learning_rate": 4.0064e-05, "loss": 1.1719, "step": 2500 }, { "epoch": 0.8032, "grad_norm": 0.4025005102157593, "learning_rate": 3.9424e-05, "loss": 1.0755, "step": 2510 }, { "epoch": 0.8064, "grad_norm": 0.29118382930755615, "learning_rate": 3.878400000000001e-05, "loss": 1.1296, "step": 2520 }, { "epoch": 0.8096, "grad_norm": 0.26896369457244873, "learning_rate": 3.8144e-05, "loss": 1.1221, "step": 2530 }, { "epoch": 0.8128, "grad_norm": 0.3033294677734375, "learning_rate": 3.7504e-05, "loss": 1.1374, "step": 2540 }, { "epoch": 0.816, "grad_norm": 0.3265039026737213, "learning_rate": 3.6864000000000005e-05, "loss": 1.0935, "step": 2550 }, { "epoch": 0.8192, "grad_norm": 0.24130520224571228, "learning_rate": 3.6224000000000004e-05, "loss": 1.2369, "step": 2560 }, { "epoch": 0.8224, "grad_norm": 0.36245155334472656, "learning_rate": 3.5584000000000004e-05, "loss": 1.0706, "step": 2570 }, { "epoch": 0.8256, "grad_norm": 0.2817525267601013, "learning_rate": 3.4943999999999996e-05, "loss": 1.0652, "step": 2580 }, { "epoch": 0.8288, "grad_norm": 0.2730059027671814, "learning_rate": 3.4304e-05, "loss": 1.067, "step": 2590 }, { "epoch": 0.832, "grad_norm": 0.2850629389286041, "learning_rate": 3.3664e-05, "loss": 1.0898, "step": 2600 }, { "epoch": 0.8352, "grad_norm": 0.4017927348613739, "learning_rate": 3.3024e-05, "loss": 1.1364, "step": 2610 }, { "epoch": 0.8384, "grad_norm": 0.36761587858200073, "learning_rate": 3.2384e-05, "loss": 1.1171, "step": 2620 }, { "epoch": 0.8416, "grad_norm": 0.25396281480789185, "learning_rate": 3.1744e-05, "loss": 1.0557, "step": 2630 }, { "epoch": 0.8448, "grad_norm": 0.2851448059082031, "learning_rate": 3.1104e-05, "loss": 1.0673, "step": 2640 }, { "epoch": 0.848, "grad_norm": 0.2541842758655548, "learning_rate": 3.0464000000000005e-05, "loss": 1.1323, "step": 2650 }, { "epoch": 0.8512, "grad_norm": 0.29052332043647766, "learning_rate": 2.9824e-05, "loss": 1.114, "step": 2660 }, { "epoch": 0.8544, "grad_norm": 0.2773646414279938, "learning_rate": 2.9184e-05, "loss": 1.0973, "step": 2670 }, { "epoch": 0.8576, "grad_norm": 0.28323352336883545, "learning_rate": 2.8544000000000003e-05, "loss": 1.1452, "step": 2680 }, { "epoch": 0.8608, "grad_norm": 0.3044739365577698, "learning_rate": 2.7904000000000003e-05, "loss": 1.0615, "step": 2690 }, { "epoch": 0.864, "grad_norm": 0.2924152612686157, "learning_rate": 2.7264000000000002e-05, "loss": 1.1456, "step": 2700 }, { "epoch": 0.8672, "grad_norm": 0.2642782926559448, "learning_rate": 2.6623999999999998e-05, "loss": 1.1651, "step": 2710 }, { "epoch": 0.8704, "grad_norm": 0.3707656264305115, "learning_rate": 2.5984000000000004e-05, "loss": 1.1247, "step": 2720 }, { "epoch": 0.8736, "grad_norm": 0.4668145775794983, "learning_rate": 2.5344e-05, "loss": 1.1537, "step": 2730 }, { "epoch": 0.8768, "grad_norm": 0.34861406683921814, "learning_rate": 2.4704000000000003e-05, "loss": 1.1141, "step": 2740 }, { "epoch": 0.88, "grad_norm": 0.2636050283908844, "learning_rate": 2.4064000000000002e-05, "loss": 1.0715, "step": 2750 }, { "epoch": 0.8832, "grad_norm": 0.30254754424095154, "learning_rate": 2.3424e-05, "loss": 1.108, "step": 2760 }, { "epoch": 0.8864, "grad_norm": 0.298784077167511, "learning_rate": 2.2784e-05, "loss": 1.2063, "step": 2770 }, { "epoch": 0.8896, "grad_norm": 0.34004920721054077, "learning_rate": 2.2144e-05, "loss": 1.1275, "step": 2780 }, { "epoch": 0.8928, "grad_norm": 0.33883705735206604, "learning_rate": 2.1504000000000003e-05, "loss": 1.083, "step": 2790 }, { "epoch": 0.896, "grad_norm": 0.30469802021980286, "learning_rate": 2.0864e-05, "loss": 1.0508, "step": 2800 }, { "epoch": 0.8992, "grad_norm": 0.36350762844085693, "learning_rate": 2.0224e-05, "loss": 1.112, "step": 2810 }, { "epoch": 0.9024, "grad_norm": 0.27796122431755066, "learning_rate": 1.9584e-05, "loss": 1.1022, "step": 2820 }, { "epoch": 0.9056, "grad_norm": 0.2932034134864807, "learning_rate": 1.8944e-05, "loss": 1.0319, "step": 2830 }, { "epoch": 0.9088, "grad_norm": 0.394808828830719, "learning_rate": 1.8304000000000003e-05, "loss": 1.0978, "step": 2840 }, { "epoch": 0.912, "grad_norm": 0.34576427936553955, "learning_rate": 1.7664e-05, "loss": 1.1566, "step": 2850 }, { "epoch": 0.9152, "grad_norm": 0.32156696915626526, "learning_rate": 1.7024e-05, "loss": 1.1663, "step": 2860 }, { "epoch": 0.9184, "grad_norm": 0.3129923939704895, "learning_rate": 1.6384e-05, "loss": 1.0844, "step": 2870 }, { "epoch": 0.9216, "grad_norm": 0.24763771891593933, "learning_rate": 1.5744e-05, "loss": 1.12, "step": 2880 }, { "epoch": 0.9248, "grad_norm": 0.3140289783477783, "learning_rate": 1.5104000000000001e-05, "loss": 1.1582, "step": 2890 }, { "epoch": 0.928, "grad_norm": 0.2710714340209961, "learning_rate": 1.4463999999999999e-05, "loss": 1.0895, "step": 2900 }, { "epoch": 0.9312, "grad_norm": 0.3298426568508148, "learning_rate": 1.3824e-05, "loss": 1.0665, "step": 2910 }, { "epoch": 0.9344, "grad_norm": 0.24175623059272766, "learning_rate": 1.3184000000000001e-05, "loss": 1.0966, "step": 2920 }, { "epoch": 0.9376, "grad_norm": 0.31446129083633423, "learning_rate": 1.2544e-05, "loss": 1.0798, "step": 2930 }, { "epoch": 0.9408, "grad_norm": 0.32403141260147095, "learning_rate": 1.1904000000000002e-05, "loss": 1.1143, "step": 2940 }, { "epoch": 0.944, "grad_norm": 0.2862265408039093, "learning_rate": 1.1264000000000001e-05, "loss": 1.0839, "step": 2950 }, { "epoch": 0.9472, "grad_norm": 0.4909001588821411, "learning_rate": 1.0624e-05, "loss": 1.1413, "step": 2960 }, { "epoch": 0.9504, "grad_norm": 0.32599207758903503, "learning_rate": 9.984e-06, "loss": 1.1079, "step": 2970 }, { "epoch": 0.9536, "grad_norm": 0.29065415263175964, "learning_rate": 9.344e-06, "loss": 1.1232, "step": 2980 }, { "epoch": 0.9568, "grad_norm": 0.33668118715286255, "learning_rate": 8.704000000000002e-06, "loss": 1.2002, "step": 2990 }, { "epoch": 0.96, "grad_norm": 0.2757526636123657, "learning_rate": 8.064000000000001e-06, "loss": 1.1951, "step": 3000 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.8776953724928e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }