{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032,
      "grad_norm": 0.18818965554237366,
      "learning_rate": 0.00019942400000000002,
      "loss": 1.4398,
      "step": 10
    },
    {
      "epoch": 0.0064,
      "grad_norm": 0.20508623123168945,
      "learning_rate": 0.00019878400000000003,
      "loss": 1.3682,
      "step": 20
    },
    {
      "epoch": 0.0096,
      "grad_norm": 0.18839821219444275,
      "learning_rate": 0.000198144,
      "loss": 1.3981,
      "step": 30
    },
    {
      "epoch": 0.0128,
      "grad_norm": 0.20183835923671722,
      "learning_rate": 0.000197504,
      "loss": 1.3109,
      "step": 40
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.22559459507465363,
      "learning_rate": 0.000196864,
      "loss": 1.343,
      "step": 50
    },
    {
      "epoch": 0.0192,
      "grad_norm": 0.21955451369285583,
      "learning_rate": 0.000196224,
      "loss": 1.3645,
      "step": 60
    },
    {
      "epoch": 0.0224,
      "grad_norm": 0.2031077742576599,
      "learning_rate": 0.000195584,
      "loss": 1.2116,
      "step": 70
    },
    {
      "epoch": 0.0256,
      "grad_norm": 0.24542862176895142,
      "learning_rate": 0.000194944,
      "loss": 1.3466,
      "step": 80
    },
    {
      "epoch": 0.0288,
      "grad_norm": 0.24425864219665527,
      "learning_rate": 0.00019430400000000002,
      "loss": 1.2202,
      "step": 90
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.23464636504650116,
      "learning_rate": 0.000193664,
      "loss": 1.2862,
      "step": 100
    },
    {
      "epoch": 0.0352,
      "grad_norm": 0.22896084189414978,
      "learning_rate": 0.000193024,
      "loss": 1.2424,
      "step": 110
    },
    {
      "epoch": 0.0384,
      "grad_norm": 0.30132901668548584,
      "learning_rate": 0.000192384,
      "loss": 1.2674,
      "step": 120
    },
    {
      "epoch": 0.0416,
      "grad_norm": 0.2417932003736496,
      "learning_rate": 0.000191744,
      "loss": 1.2433,
      "step": 130
    },
    {
      "epoch": 0.0448,
      "grad_norm": 0.25476428866386414,
      "learning_rate": 0.00019110400000000002,
      "loss": 1.2969,
      "step": 140
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.28159213066101074,
      "learning_rate": 0.00019046400000000002,
      "loss": 1.2396,
      "step": 150
    },
    {
      "epoch": 0.0512,
      "grad_norm": 0.2914562225341797,
      "learning_rate": 0.000189824,
      "loss": 1.3067,
      "step": 160
    },
    {
      "epoch": 0.0544,
      "grad_norm": 0.2946033477783203,
      "learning_rate": 0.000189184,
      "loss": 1.1911,
      "step": 170
    },
    {
      "epoch": 0.0576,
      "grad_norm": 0.2544347941875458,
      "learning_rate": 0.00018854400000000002,
      "loss": 1.2062,
      "step": 180
    },
    {
      "epoch": 0.0608,
      "grad_norm": 0.2680202126502991,
      "learning_rate": 0.00018790400000000002,
      "loss": 1.1604,
      "step": 190
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.25478115677833557,
      "learning_rate": 0.00018726400000000003,
      "loss": 1.2515,
      "step": 200
    },
    {
      "epoch": 0.0672,
      "grad_norm": 0.26804226636886597,
      "learning_rate": 0.000186624,
      "loss": 1.2765,
      "step": 210
    },
    {
      "epoch": 0.0704,
      "grad_norm": 0.24587133526802063,
      "learning_rate": 0.00018598400000000001,
      "loss": 1.1768,
      "step": 220
    },
    {
      "epoch": 0.0736,
      "grad_norm": 0.27216073870658875,
      "learning_rate": 0.00018534400000000002,
      "loss": 1.1848,
      "step": 230
    },
    {
      "epoch": 0.0768,
      "grad_norm": 0.3114471733570099,
      "learning_rate": 0.000184704,
      "loss": 1.2086,
      "step": 240
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.3045384883880615,
      "learning_rate": 0.000184064,
      "loss": 1.1874,
      "step": 250
    },
    {
      "epoch": 0.0832,
      "grad_norm": 0.2793915271759033,
      "learning_rate": 0.000183424,
      "loss": 1.1977,
      "step": 260
    },
    {
      "epoch": 0.0864,
      "grad_norm": 0.34699785709381104,
      "learning_rate": 0.000182784,
      "loss": 1.2089,
      "step": 270
    },
    {
      "epoch": 0.0896,
      "grad_norm": 0.25291335582733154,
      "learning_rate": 0.000182144,
      "loss": 1.2459,
      "step": 280
    },
    {
      "epoch": 0.0928,
      "grad_norm": 0.2616818845272064,
      "learning_rate": 0.000181504,
      "loss": 1.2655,
      "step": 290
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.2569883167743683,
      "learning_rate": 0.000180864,
      "loss": 1.2479,
      "step": 300
    },
    {
      "epoch": 0.0992,
      "grad_norm": 0.33097052574157715,
      "learning_rate": 0.00018022400000000001,
      "loss": 1.1975,
      "step": 310
    },
    {
      "epoch": 0.1024,
      "grad_norm": 0.2557203471660614,
      "learning_rate": 0.00017958400000000002,
      "loss": 1.1423,
      "step": 320
    },
    {
      "epoch": 0.1056,
      "grad_norm": 0.24836640059947968,
      "learning_rate": 0.000178944,
      "loss": 1.197,
      "step": 330
    },
    {
      "epoch": 0.1088,
      "grad_norm": 0.25998368859291077,
      "learning_rate": 0.000178304,
      "loss": 1.195,
      "step": 340
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.29318585991859436,
      "learning_rate": 0.000177664,
      "loss": 1.1771,
      "step": 350
    },
    {
      "epoch": 0.1152,
      "grad_norm": 0.2819266617298126,
      "learning_rate": 0.00017702400000000002,
      "loss": 1.1113,
      "step": 360
    },
    {
      "epoch": 0.1184,
      "grad_norm": 0.24419356882572174,
      "learning_rate": 0.00017638400000000002,
      "loss": 1.2027,
      "step": 370
    },
    {
      "epoch": 0.1216,
      "grad_norm": 0.30148938298225403,
      "learning_rate": 0.000175744,
      "loss": 1.1583,
      "step": 380
    },
    {
      "epoch": 0.1248,
      "grad_norm": 0.2704978287220001,
      "learning_rate": 0.000175104,
      "loss": 1.2246,
      "step": 390
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.2642189562320709,
      "learning_rate": 0.00017446400000000002,
      "loss": 1.1543,
      "step": 400
    },
    {
      "epoch": 0.1312,
      "grad_norm": 0.31052809953689575,
      "learning_rate": 0.00017382400000000002,
      "loss": 1.2039,
      "step": 410
    },
    {
      "epoch": 0.1344,
      "grad_norm": 0.2596384286880493,
      "learning_rate": 0.000173184,
      "loss": 1.1693,
      "step": 420
    },
    {
      "epoch": 0.1376,
      "grad_norm": 0.2813952565193176,
      "learning_rate": 0.000172544,
      "loss": 1.2213,
      "step": 430
    },
    {
      "epoch": 0.1408,
      "grad_norm": 0.26021143794059753,
      "learning_rate": 0.00017190399999999999,
      "loss": 1.1611,
      "step": 440
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.3072742521762848,
      "learning_rate": 0.000171264,
      "loss": 1.1464,
      "step": 450
    },
    {
      "epoch": 0.1472,
      "grad_norm": 0.3125726580619812,
      "learning_rate": 0.000170624,
      "loss": 1.1901,
      "step": 460
    },
    {
      "epoch": 0.1504,
      "grad_norm": 0.25702011585235596,
      "learning_rate": 0.000169984,
      "loss": 1.1439,
      "step": 470
    },
    {
      "epoch": 0.1536,
      "grad_norm": 0.26658275723457336,
      "learning_rate": 0.000169344,
      "loss": 1.1256,
      "step": 480
    },
    {
      "epoch": 0.1568,
      "grad_norm": 0.2687830626964569,
      "learning_rate": 0.00016870400000000002,
      "loss": 1.233,
      "step": 490
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.30417656898498535,
      "learning_rate": 0.000168064,
      "loss": 1.1951,
      "step": 500
    },
    {
      "epoch": 0.1632,
      "grad_norm": 0.24493242800235748,
      "learning_rate": 0.000167424,
      "loss": 1.2125,
      "step": 510
    },
    {
      "epoch": 0.1664,
      "grad_norm": 0.2442469447851181,
      "learning_rate": 0.000166784,
      "loss": 1.2005,
      "step": 520
    },
    {
      "epoch": 0.1696,
      "grad_norm": 0.2598196566104889,
      "learning_rate": 0.00016614400000000001,
      "loss": 1.1361,
      "step": 530
    },
    {
      "epoch": 0.1728,
      "grad_norm": 0.2823512852191925,
      "learning_rate": 0.00016550400000000002,
      "loss": 1.2671,
      "step": 540
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.27284982800483704,
      "learning_rate": 0.00016486400000000003,
      "loss": 1.2019,
      "step": 550
    },
    {
      "epoch": 0.1792,
      "grad_norm": 0.4265678822994232,
      "learning_rate": 0.000164224,
      "loss": 1.1557,
      "step": 560
    },
    {
      "epoch": 0.1824,
      "grad_norm": 0.32852381467819214,
      "learning_rate": 0.000163584,
      "loss": 1.1792,
      "step": 570
    },
    {
      "epoch": 0.1856,
      "grad_norm": 0.25044888257980347,
      "learning_rate": 0.00016294400000000002,
      "loss": 1.1814,
      "step": 580
    },
    {
      "epoch": 0.1888,
      "grad_norm": 0.30278605222702026,
      "learning_rate": 0.00016230400000000002,
      "loss": 1.1865,
      "step": 590
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.259230375289917,
      "learning_rate": 0.000161664,
      "loss": 1.2012,
      "step": 600
    },
    {
      "epoch": 0.1952,
      "grad_norm": 0.3037353456020355,
      "learning_rate": 0.000161024,
      "loss": 1.1508,
      "step": 610
    },
    {
      "epoch": 0.1984,
      "grad_norm": 0.32500702142715454,
      "learning_rate": 0.000160384,
      "loss": 1.2302,
      "step": 620
    },
    {
      "epoch": 0.2016,
      "grad_norm": 0.3390319049358368,
      "learning_rate": 0.000159744,
      "loss": 1.1949,
      "step": 630
    },
    {
      "epoch": 0.2048,
      "grad_norm": 0.30461153388023376,
      "learning_rate": 0.000159104,
      "loss": 1.1659,
      "step": 640
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.2725638449192047,
      "learning_rate": 0.000158464,
      "loss": 1.1905,
      "step": 650
    },
    {
      "epoch": 0.2112,
      "grad_norm": 0.24364377558231354,
      "learning_rate": 0.000157824,
      "loss": 1.1194,
      "step": 660
    },
    {
      "epoch": 0.2144,
      "grad_norm": 0.2993292510509491,
      "learning_rate": 0.000157184,
      "loss": 1.165,
      "step": 670
    },
    {
      "epoch": 0.2176,
      "grad_norm": 0.22024789452552795,
      "learning_rate": 0.000156544,
      "loss": 1.1187,
      "step": 680
    },
    {
      "epoch": 0.2208,
      "grad_norm": 0.2543094754219055,
      "learning_rate": 0.000155904,
      "loss": 1.0779,
      "step": 690
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.31767502427101135,
      "learning_rate": 0.000155264,
      "loss": 1.1076,
      "step": 700
    },
    {
      "epoch": 0.2272,
      "grad_norm": 0.27651944756507874,
      "learning_rate": 0.00015462400000000002,
      "loss": 1.1748,
      "step": 710
    },
    {
      "epoch": 0.2304,
      "grad_norm": 0.2532593905925751,
      "learning_rate": 0.00015398400000000002,
      "loss": 1.1302,
      "step": 720
    },
    {
      "epoch": 0.2336,
      "grad_norm": 0.26502105593681335,
      "learning_rate": 0.000153344,
      "loss": 1.1652,
      "step": 730
    },
    {
      "epoch": 0.2368,
      "grad_norm": 0.22709833085536957,
      "learning_rate": 0.000152704,
      "loss": 1.1045,
      "step": 740
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.273548424243927,
      "learning_rate": 0.000152064,
      "loss": 1.1216,
      "step": 750
    },
    {
      "epoch": 0.2432,
      "grad_norm": 0.26936766505241394,
      "learning_rate": 0.00015142400000000002,
      "loss": 1.0981,
      "step": 760
    },
    {
      "epoch": 0.2464,
      "grad_norm": 0.2642468810081482,
      "learning_rate": 0.00015078400000000003,
      "loss": 1.1044,
      "step": 770
    },
    {
      "epoch": 0.2496,
      "grad_norm": 0.25724905729293823,
      "learning_rate": 0.000150144,
      "loss": 1.1474,
      "step": 780
    },
    {
      "epoch": 0.2528,
      "grad_norm": 0.27715614438056946,
      "learning_rate": 0.000149504,
      "loss": 1.102,
      "step": 790
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.25462499260902405,
      "learning_rate": 0.00014886400000000002,
      "loss": 1.1834,
      "step": 800
    },
    {
      "epoch": 0.2592,
      "grad_norm": 0.25248298048973083,
      "learning_rate": 0.000148224,
      "loss": 1.2023,
      "step": 810
    },
    {
      "epoch": 0.2624,
      "grad_norm": 0.3147791922092438,
      "learning_rate": 0.000147584,
      "loss": 1.1564,
      "step": 820
    },
    {
      "epoch": 0.2656,
      "grad_norm": 0.30874860286712646,
      "learning_rate": 0.000146944,
      "loss": 1.1722,
      "step": 830
    },
    {
      "epoch": 0.2688,
      "grad_norm": 0.2486383467912674,
      "learning_rate": 0.000146304,
      "loss": 1.1161,
      "step": 840
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.24819347262382507,
      "learning_rate": 0.000145664,
      "loss": 1.1813,
      "step": 850
    },
    {
      "epoch": 0.2752,
      "grad_norm": 0.2713952660560608,
      "learning_rate": 0.000145024,
      "loss": 1.0917,
      "step": 860
    },
    {
      "epoch": 0.2784,
      "grad_norm": 0.2632099986076355,
      "learning_rate": 0.000144384,
      "loss": 1.1806,
      "step": 870
    },
    {
      "epoch": 0.2816,
      "grad_norm": 0.24610574543476105,
      "learning_rate": 0.000143744,
      "loss": 1.1397,
      "step": 880
    },
    {
      "epoch": 0.2848,
      "grad_norm": 0.3086780607700348,
      "learning_rate": 0.00014310400000000002,
      "loss": 1.1338,
      "step": 890
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.25754594802856445,
      "learning_rate": 0.000142464,
      "loss": 1.148,
      "step": 900
    },
    {
      "epoch": 0.2912,
      "grad_norm": 0.2540118098258972,
      "learning_rate": 0.000141824,
      "loss": 1.0787,
      "step": 910
    },
    {
      "epoch": 0.2944,
      "grad_norm": 0.3876936733722687,
      "learning_rate": 0.000141184,
      "loss": 1.0841,
      "step": 920
    },
    {
      "epoch": 0.2976,
      "grad_norm": 0.27151429653167725,
      "learning_rate": 0.00014054400000000002,
      "loss": 1.0859,
      "step": 930
    },
    {
      "epoch": 0.3008,
      "grad_norm": 0.2533249855041504,
      "learning_rate": 0.00013990400000000002,
      "loss": 1.1347,
      "step": 940
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.2707951068878174,
      "learning_rate": 0.00013926400000000003,
      "loss": 1.1787,
      "step": 950
    },
    {
      "epoch": 0.3072,
      "grad_norm": 0.27499666810035706,
      "learning_rate": 0.000138624,
      "loss": 1.1991,
      "step": 960
    },
    {
      "epoch": 0.3104,
      "grad_norm": 0.31902188062667847,
      "learning_rate": 0.000137984,
      "loss": 1.2036,
      "step": 970
    },
    {
      "epoch": 0.3136,
      "grad_norm": 0.2782028913497925,
      "learning_rate": 0.00013734400000000002,
      "loss": 1.1654,
      "step": 980
    },
    {
      "epoch": 0.3168,
      "grad_norm": 0.3343225419521332,
      "learning_rate": 0.000136704,
      "loss": 1.1677,
      "step": 990
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.2642679810523987,
      "learning_rate": 0.000136064,
      "loss": 1.1241,
      "step": 1000
    },
    {
      "epoch": 0.3232,
      "grad_norm": 0.37036266922950745,
      "learning_rate": 0.000135424,
      "loss": 1.1454,
      "step": 1010
    },
    {
      "epoch": 0.3264,
      "grad_norm": 0.2647375464439392,
      "learning_rate": 0.000134784,
      "loss": 1.1288,
      "step": 1020
    },
    {
      "epoch": 0.3296,
      "grad_norm": 0.3140479028224945,
      "learning_rate": 0.000134144,
      "loss": 1.2205,
      "step": 1030
    },
    {
      "epoch": 0.3328,
      "grad_norm": 0.24704284965991974,
      "learning_rate": 0.000133504,
      "loss": 1.1341,
      "step": 1040
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.3063775300979614,
      "learning_rate": 0.000132864,
      "loss": 1.2261,
      "step": 1050
    },
    {
      "epoch": 0.3392,
      "grad_norm": 0.2879609167575836,
      "learning_rate": 0.000132224,
      "loss": 1.1676,
      "step": 1060
    },
    {
      "epoch": 0.3424,
      "grad_norm": 0.27018502354621887,
      "learning_rate": 0.000131584,
      "loss": 1.1237,
      "step": 1070
    },
    {
      "epoch": 0.3456,
      "grad_norm": 0.25358906388282776,
      "learning_rate": 0.000130944,
      "loss": 1.1034,
      "step": 1080
    },
    {
      "epoch": 0.3488,
      "grad_norm": 0.26398876309394836,
      "learning_rate": 0.000130304,
      "loss": 1.2041,
      "step": 1090
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.4267687499523163,
      "learning_rate": 0.000129664,
      "loss": 1.15,
      "step": 1100
    },
    {
      "epoch": 0.3552,
      "grad_norm": 0.26307064294815063,
      "learning_rate": 0.00012902400000000002,
      "loss": 1.0921,
      "step": 1110
    },
    {
      "epoch": 0.3584,
      "grad_norm": 0.38714703917503357,
      "learning_rate": 0.00012838400000000002,
      "loss": 1.225,
      "step": 1120
    },
    {
      "epoch": 0.3616,
      "grad_norm": 0.275495320558548,
      "learning_rate": 0.000127744,
      "loss": 1.1513,
      "step": 1130
    },
    {
      "epoch": 0.3648,
      "grad_norm": 0.26792117953300476,
      "learning_rate": 0.000127104,
      "loss": 1.1293,
      "step": 1140
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.2474093735218048,
      "learning_rate": 0.00012646400000000001,
      "loss": 1.175,
      "step": 1150
    },
    {
      "epoch": 0.3712,
      "grad_norm": 0.2551768124103546,
      "learning_rate": 0.00012582400000000002,
      "loss": 1.1212,
      "step": 1160
    },
    {
      "epoch": 0.3744,
      "grad_norm": 0.2605915665626526,
      "learning_rate": 0.000125184,
      "loss": 1.1303,
      "step": 1170
    },
    {
      "epoch": 0.3776,
      "grad_norm": 0.30072781443595886,
      "learning_rate": 0.000124544,
      "loss": 1.1552,
      "step": 1180
    },
    {
      "epoch": 0.3808,
      "grad_norm": 0.2362007051706314,
      "learning_rate": 0.00012390399999999998,
      "loss": 1.1984,
      "step": 1190
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.3113998472690582,
      "learning_rate": 0.000123264,
      "loss": 1.1239,
      "step": 1200
    },
    {
      "epoch": 0.3872,
      "grad_norm": 0.3142775893211365,
      "learning_rate": 0.000122624,
      "loss": 1.1089,
      "step": 1210
    },
    {
      "epoch": 0.3904,
      "grad_norm": 0.38791659474372864,
      "learning_rate": 0.000121984,
      "loss": 1.1532,
      "step": 1220
    },
    {
      "epoch": 0.3936,
      "grad_norm": 0.37536805868148804,
      "learning_rate": 0.00012134400000000001,
      "loss": 1.1333,
      "step": 1230
    },
    {
      "epoch": 0.3968,
      "grad_norm": 0.2762630879878998,
      "learning_rate": 0.00012070399999999999,
      "loss": 1.0189,
      "step": 1240
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.25079619884490967,
      "learning_rate": 0.000120064,
      "loss": 1.1169,
      "step": 1250
    },
    {
      "epoch": 0.4032,
      "grad_norm": 0.2860804498195648,
      "learning_rate": 0.000119424,
      "loss": 1.0866,
      "step": 1260
    },
    {
      "epoch": 0.4064,
      "grad_norm": 0.2980159819126129,
      "learning_rate": 0.000118784,
      "loss": 1.1898,
      "step": 1270
    },
    {
      "epoch": 0.4096,
      "grad_norm": 0.336664080619812,
      "learning_rate": 0.00011814400000000001,
      "loss": 1.1332,
      "step": 1280
    },
    {
      "epoch": 0.4128,
      "grad_norm": 0.26283812522888184,
      "learning_rate": 0.00011750400000000002,
      "loss": 1.1367,
      "step": 1290
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.27355343103408813,
      "learning_rate": 0.000116864,
      "loss": 1.0927,
      "step": 1300
    },
    {
      "epoch": 0.4192,
      "grad_norm": 0.2756136953830719,
      "learning_rate": 0.000116224,
      "loss": 1.1109,
      "step": 1310
    },
    {
      "epoch": 0.4224,
      "grad_norm": 0.26861876249313354,
      "learning_rate": 0.00011558400000000001,
      "loss": 1.1401,
      "step": 1320
    },
    {
      "epoch": 0.4256,
      "grad_norm": 0.23718690872192383,
      "learning_rate": 0.000114944,
      "loss": 1.1717,
      "step": 1330
    },
    {
      "epoch": 0.4288,
      "grad_norm": 0.2714873254299164,
      "learning_rate": 0.00011430400000000001,
      "loss": 1.1032,
      "step": 1340
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.29829949140548706,
      "learning_rate": 0.00011366400000000001,
      "loss": 1.1129,
      "step": 1350
    },
    {
      "epoch": 0.4352,
      "grad_norm": 0.2577730417251587,
      "learning_rate": 0.000113024,
      "loss": 1.0978,
      "step": 1360
    },
    {
      "epoch": 0.4384,
      "grad_norm": 0.24967093765735626,
      "learning_rate": 0.000112384,
      "loss": 1.1533,
      "step": 1370
    },
    {
      "epoch": 0.4416,
      "grad_norm": 0.2530049681663513,
      "learning_rate": 0.000111744,
      "loss": 1.0921,
      "step": 1380
    },
    {
      "epoch": 0.4448,
      "grad_norm": 0.30380481481552124,
      "learning_rate": 0.00011110400000000001,
      "loss": 1.1773,
      "step": 1390
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.25974929332733154,
      "learning_rate": 0.00011046400000000002,
      "loss": 1.1584,
      "step": 1400
    },
    {
      "epoch": 0.4512,
      "grad_norm": 0.37404870986938477,
      "learning_rate": 0.00010982400000000001,
      "loss": 1.1199,
      "step": 1410
    },
    {
      "epoch": 0.4544,
      "grad_norm": 0.32186731696128845,
      "learning_rate": 0.000109184,
      "loss": 1.1392,
      "step": 1420
    },
    {
      "epoch": 0.4576,
      "grad_norm": 0.2843293845653534,
      "learning_rate": 0.000108544,
      "loss": 1.1223,
      "step": 1430
    },
    {
      "epoch": 0.4608,
      "grad_norm": 0.310863733291626,
      "learning_rate": 0.000107904,
      "loss": 1.1112,
      "step": 1440
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.2348843812942505,
      "learning_rate": 0.00010726400000000001,
      "loss": 1.227,
      "step": 1450
    },
    {
      "epoch": 0.4672,
      "grad_norm": 0.27787262201309204,
      "learning_rate": 0.00010662400000000001,
      "loss": 1.145,
      "step": 1460
    },
    {
      "epoch": 0.4704,
      "grad_norm": 0.26069822907447815,
      "learning_rate": 0.000105984,
      "loss": 1.1417,
      "step": 1470
    },
    {
      "epoch": 0.4736,
      "grad_norm": 0.27364885807037354,
      "learning_rate": 0.000105344,
      "loss": 1.0959,
      "step": 1480
    },
    {
      "epoch": 0.4768,
      "grad_norm": 0.24618731439113617,
      "learning_rate": 0.000104704,
      "loss": 1.1312,
      "step": 1490
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.28439000248908997,
      "learning_rate": 0.00010406400000000001,
      "loss": 1.1506,
      "step": 1500
    },
    {
      "epoch": 0.4832,
      "grad_norm": 0.26154178380966187,
      "learning_rate": 0.000103424,
      "loss": 1.0225,
      "step": 1510
    },
    {
      "epoch": 0.4864,
      "grad_norm": 0.26929762959480286,
      "learning_rate": 0.00010278400000000001,
      "loss": 1.061,
      "step": 1520
    },
    {
      "epoch": 0.4896,
      "grad_norm": 0.2753245532512665,
      "learning_rate": 0.00010214399999999999,
      "loss": 1.1724,
      "step": 1530
    },
    {
      "epoch": 0.4928,
      "grad_norm": 0.23799718916416168,
      "learning_rate": 0.000101504,
      "loss": 1.1397,
      "step": 1540
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.277057945728302,
      "learning_rate": 0.000100864,
      "loss": 1.1924,
      "step": 1550
    },
    {
      "epoch": 0.4992,
      "grad_norm": 0.27867016196250916,
      "learning_rate": 0.00010022400000000001,
      "loss": 1.1841,
      "step": 1560
    },
    {
      "epoch": 0.5024,
      "grad_norm": 0.2876740097999573,
      "learning_rate": 9.9584e-05,
      "loss": 1.1085,
      "step": 1570
    },
    {
      "epoch": 0.5056,
      "grad_norm": 0.2694062292575836,
      "learning_rate": 9.8944e-05,
      "loss": 1.0743,
      "step": 1580
    },
    {
      "epoch": 0.5088,
      "grad_norm": 0.27280256152153015,
      "learning_rate": 9.830400000000001e-05,
      "loss": 1.2189,
      "step": 1590
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.3361833393573761,
      "learning_rate": 9.7664e-05,
      "loss": 1.0771,
      "step": 1600
    },
    {
      "epoch": 0.5152,
      "grad_norm": 0.2917429208755493,
      "learning_rate": 9.7024e-05,
      "loss": 1.1137,
      "step": 1610
    },
    {
      "epoch": 0.5184,
      "grad_norm": 0.306228905916214,
      "learning_rate": 9.6384e-05,
      "loss": 1.1708,
      "step": 1620
    },
    {
      "epoch": 0.5216,
      "grad_norm": 0.25011730194091797,
      "learning_rate": 9.5744e-05,
      "loss": 1.1042,
      "step": 1630
    },
    {
      "epoch": 0.5248,
      "grad_norm": 0.3258339464664459,
      "learning_rate": 9.5104e-05,
      "loss": 1.1781,
      "step": 1640
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.3206620216369629,
      "learning_rate": 9.446400000000001e-05,
      "loss": 1.0991,
      "step": 1650
    },
    {
      "epoch": 0.5312,
      "grad_norm": 0.31144049763679504,
      "learning_rate": 9.3824e-05,
      "loss": 1.101,
      "step": 1660
    },
    {
      "epoch": 0.5344,
      "grad_norm": 0.2294250875711441,
      "learning_rate": 9.318400000000001e-05,
      "loss": 1.1346,
      "step": 1670
    },
    {
      "epoch": 0.5376,
      "grad_norm": 0.34881460666656494,
      "learning_rate": 9.254400000000001e-05,
      "loss": 1.1389,
      "step": 1680
    },
    {
      "epoch": 0.5408,
      "grad_norm": 0.27140548825263977,
      "learning_rate": 9.1904e-05,
      "loss": 1.1214,
      "step": 1690
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.25193724036216736,
      "learning_rate": 9.1264e-05,
      "loss": 1.1273,
      "step": 1700
    },
    {
      "epoch": 0.5472,
      "grad_norm": 0.27552053332328796,
      "learning_rate": 9.0624e-05,
      "loss": 1.1523,
      "step": 1710
    },
    {
      "epoch": 0.5504,
      "grad_norm": 0.2877196967601776,
      "learning_rate": 8.9984e-05,
      "loss": 1.082,
      "step": 1720
    },
    {
      "epoch": 0.5536,
      "grad_norm": 0.23511843383312225,
      "learning_rate": 8.9344e-05,
      "loss": 1.1251,
      "step": 1730
    },
    {
      "epoch": 0.5568,
      "grad_norm": 0.28936073184013367,
      "learning_rate": 8.870400000000001e-05,
      "loss": 1.1742,
      "step": 1740
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.28244665265083313,
      "learning_rate": 8.8064e-05,
      "loss": 1.1665,
      "step": 1750
    },
    {
      "epoch": 0.5632,
      "grad_norm": 0.22868531942367554,
      "learning_rate": 8.742400000000001e-05,
      "loss": 1.0733,
      "step": 1760
    },
    {
      "epoch": 0.5664,
      "grad_norm": 0.23258651793003082,
      "learning_rate": 8.6784e-05,
      "loss": 1.0538,
      "step": 1770
    },
    {
      "epoch": 0.5696,
      "grad_norm": 0.2886444330215454,
      "learning_rate": 8.614400000000001e-05,
      "loss": 1.0896,
      "step": 1780
    },
    {
      "epoch": 0.5728,
      "grad_norm": 0.26151803135871887,
      "learning_rate": 8.5504e-05,
      "loss": 1.0771,
      "step": 1790
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.2829320430755615,
      "learning_rate": 8.486399999999999e-05,
      "loss": 1.0796,
      "step": 1800
    },
    {
      "epoch": 0.5792,
      "grad_norm": 0.2471112608909607,
      "learning_rate": 8.4224e-05,
      "loss": 1.1751,
      "step": 1810
    },
    {
      "epoch": 0.5824,
      "grad_norm": 0.35704588890075684,
      "learning_rate": 8.3584e-05,
      "loss": 1.1203,
      "step": 1820
    },
    {
      "epoch": 0.5856,
      "grad_norm": 0.3598352074623108,
      "learning_rate": 8.2944e-05,
      "loss": 1.1982,
      "step": 1830
    },
    {
      "epoch": 0.5888,
      "grad_norm": 0.24873049557209015,
      "learning_rate": 8.2304e-05,
      "loss": 1.11,
      "step": 1840
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.2786683142185211,
      "learning_rate": 8.166400000000001e-05,
      "loss": 1.1045,
      "step": 1850
    },
    {
      "epoch": 0.5952,
      "grad_norm": 0.31083497405052185,
      "learning_rate": 8.1024e-05,
      "loss": 1.1094,
      "step": 1860
    },
    {
      "epoch": 0.5984,
      "grad_norm": 0.25888901948928833,
      "learning_rate": 8.038400000000001e-05,
      "loss": 1.0637,
      "step": 1870
    },
    {
      "epoch": 0.6016,
      "grad_norm": 0.27845245599746704,
      "learning_rate": 7.9744e-05,
      "loss": 1.1442,
      "step": 1880
    },
    {
      "epoch": 0.6048,
      "grad_norm": 0.32606974244117737,
      "learning_rate": 7.910400000000001e-05,
      "loss": 1.1409,
      "step": 1890
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.28251323103904724,
      "learning_rate": 7.8464e-05,
      "loss": 1.1037,
      "step": 1900
    },
    {
      "epoch": 0.6112,
      "grad_norm": 0.2702641487121582,
      "learning_rate": 7.7824e-05,
      "loss": 1.11,
      "step": 1910
    },
    {
      "epoch": 0.6144,
      "grad_norm": 0.31664419174194336,
      "learning_rate": 7.7184e-05,
      "loss": 1.0634,
      "step": 1920
    },
    {
      "epoch": 0.6176,
      "grad_norm": 0.30974429845809937,
      "learning_rate": 7.6544e-05,
      "loss": 1.0884,
      "step": 1930
    },
    {
      "epoch": 0.6208,
      "grad_norm": 0.27011537551879883,
      "learning_rate": 7.590400000000001e-05,
      "loss": 1.0534,
      "step": 1940
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.26978424191474915,
      "learning_rate": 7.5264e-05,
      "loss": 1.1019,
      "step": 1950
    },
    {
      "epoch": 0.6272,
      "grad_norm": 0.29052773118019104,
      "learning_rate": 7.462400000000001e-05,
      "loss": 1.1617,
      "step": 1960
    },
    {
      "epoch": 0.6304,
      "grad_norm": 0.2614887058734894,
      "learning_rate": 7.398400000000002e-05,
      "loss": 1.1562,
      "step": 1970
    },
    {
      "epoch": 0.6336,
      "grad_norm": 0.27294448018074036,
      "learning_rate": 7.334400000000001e-05,
      "loss": 1.1257,
      "step": 1980
    },
    {
      "epoch": 0.6368,
      "grad_norm": 0.24990952014923096,
      "learning_rate": 7.2704e-05,
      "loss": 1.0977,
      "step": 1990
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.272666335105896,
      "learning_rate": 7.206399999999999e-05,
      "loss": 1.0731,
      "step": 2000
    },
    {
      "epoch": 0.6432,
      "grad_norm": 0.28734877705574036,
      "learning_rate": 7.1424e-05,
      "loss": 1.1062,
      "step": 2010
    },
    {
      "epoch": 0.6464,
      "grad_norm": 0.27861565351486206,
      "learning_rate": 7.0784e-05,
      "loss": 1.1435,
      "step": 2020
    },
    {
      "epoch": 0.6496,
      "grad_norm": 0.34537386894226074,
      "learning_rate": 7.0144e-05,
      "loss": 1.1227,
      "step": 2030
    },
    {
      "epoch": 0.6528,
      "grad_norm": 0.3273583650588989,
      "learning_rate": 6.9504e-05,
      "loss": 1.0856,
      "step": 2040
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.3750070631504059,
      "learning_rate": 6.886400000000001e-05,
      "loss": 1.1438,
      "step": 2050
    },
    {
      "epoch": 0.6592,
      "grad_norm": 0.4590343236923218,
      "learning_rate": 6.8224e-05,
      "loss": 1.1532,
      "step": 2060
    },
    {
      "epoch": 0.6624,
      "grad_norm": 0.3549063503742218,
      "learning_rate": 6.758400000000001e-05,
      "loss": 1.1418,
      "step": 2070
    },
    {
      "epoch": 0.6656,
      "grad_norm": 0.33026474714279175,
      "learning_rate": 6.6944e-05,
      "loss": 1.0342,
      "step": 2080
    },
    {
      "epoch": 0.6688,
      "grad_norm": 0.37451469898223877,
      "learning_rate": 6.6304e-05,
      "loss": 1.104,
      "step": 2090
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.32349246740341187,
      "learning_rate": 6.5664e-05,
      "loss": 1.1403,
      "step": 2100
    },
    {
      "epoch": 0.6752,
      "grad_norm": 0.3289710581302643,
      "learning_rate": 6.5024e-05,
      "loss": 1.1349,
      "step": 2110
    },
    {
      "epoch": 0.6784,
      "grad_norm": 0.24263447523117065,
      "learning_rate": 6.4384e-05,
      "loss": 1.1162,
      "step": 2120
    },
    {
      "epoch": 0.6816,
      "grad_norm": 0.3055990934371948,
      "learning_rate": 6.3744e-05,
      "loss": 1.0623,
      "step": 2130
    },
    {
      "epoch": 0.6848,
      "grad_norm": 0.26169636845588684,
      "learning_rate": 6.310400000000001e-05,
      "loss": 1.1566,
      "step": 2140
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.2564956545829773,
      "learning_rate": 6.2464e-05,
      "loss": 1.1615,
      "step": 2150
    },
    {
      "epoch": 0.6912,
      "grad_norm": 0.2543143033981323,
      "learning_rate": 6.182400000000001e-05,
      "loss": 1.14,
      "step": 2160
    },
    {
      "epoch": 0.6944,
      "grad_norm": 0.28348684310913086,
      "learning_rate": 6.1184e-05,
      "loss": 1.1192,
      "step": 2170
    },
    {
      "epoch": 0.6976,
      "grad_norm": 0.25627613067626953,
      "learning_rate": 6.0544e-05,
      "loss": 1.1469,
      "step": 2180
    },
    {
      "epoch": 0.7008,
      "grad_norm": 0.39540451765060425,
      "learning_rate": 5.990400000000001e-05,
      "loss": 1.1497,
      "step": 2190
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.2549257278442383,
      "learning_rate": 5.9264e-05,
      "loss": 1.077,
      "step": 2200
    },
    {
      "epoch": 0.7072,
      "grad_norm": 0.2789277136325836,
      "learning_rate": 5.8624e-05,
      "loss": 1.048,
      "step": 2210
    },
    {
      "epoch": 0.7104,
      "grad_norm": 0.28794965147972107,
      "learning_rate": 5.7984000000000006e-05,
      "loss": 1.14,
      "step": 2220
    },
    {
      "epoch": 0.7136,
      "grad_norm": 0.27266013622283936,
      "learning_rate": 5.7344e-05,
      "loss": 1.1882,
      "step": 2230
    },
    {
      "epoch": 0.7168,
      "grad_norm": 0.31794604659080505,
      "learning_rate": 5.6704000000000005e-05,
      "loss": 1.1174,
      "step": 2240
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.26018059253692627,
      "learning_rate": 5.6064000000000004e-05,
      "loss": 1.0834,
      "step": 2250
    },
    {
      "epoch": 0.7232,
      "grad_norm": 0.30461081862449646,
      "learning_rate": 5.5423999999999997e-05,
      "loss": 1.0549,
      "step": 2260
    },
    {
      "epoch": 0.7264,
      "grad_norm": 0.38113224506378174,
      "learning_rate": 5.4784e-05,
      "loss": 1.1426,
      "step": 2270
    },
    {
      "epoch": 0.7296,
      "grad_norm": 0.2623901069164276,
      "learning_rate": 5.414400000000001e-05,
      "loss": 1.066,
      "step": 2280
    },
    {
      "epoch": 0.7328,
      "grad_norm": 0.2910241484642029,
      "learning_rate": 5.3504e-05,
      "loss": 1.0094,
      "step": 2290
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.2720244228839874,
      "learning_rate": 5.2864e-05,
      "loss": 1.1315,
      "step": 2300
    },
    {
      "epoch": 0.7392,
      "grad_norm": 0.25109943747520447,
      "learning_rate": 5.222400000000001e-05,
      "loss": 1.0621,
      "step": 2310
    },
    {
      "epoch": 0.7424,
      "grad_norm": 0.2956967353820801,
      "learning_rate": 5.1584e-05,
      "loss": 1.0889,
      "step": 2320
    },
    {
      "epoch": 0.7456,
      "grad_norm": 0.2685677409172058,
      "learning_rate": 5.0944000000000006e-05,
      "loss": 1.0868,
      "step": 2330
    },
    {
      "epoch": 0.7488,
      "grad_norm": 0.3329203426837921,
      "learning_rate": 5.0304000000000005e-05,
      "loss": 1.1943,
      "step": 2340
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.30616864562034607,
      "learning_rate": 4.9664000000000004e-05,
      "loss": 1.1168,
      "step": 2350
    },
    {
      "epoch": 0.7552,
      "grad_norm": 0.26512446999549866,
      "learning_rate": 4.9024000000000004e-05,
      "loss": 1.1155,
      "step": 2360
    },
    {
      "epoch": 0.7584,
      "grad_norm": 0.3897416591644287,
      "learning_rate": 4.8384e-05,
      "loss": 1.1451,
      "step": 2370
    },
    {
      "epoch": 0.7616,
      "grad_norm": 0.30231884121894836,
      "learning_rate": 4.7744e-05,
      "loss": 1.1502,
      "step": 2380
    },
    {
      "epoch": 0.7648,
      "grad_norm": 0.3417859673500061,
      "learning_rate": 4.7104e-05,
      "loss": 1.1075,
      "step": 2390
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.2546481788158417,
      "learning_rate": 4.6464e-05,
      "loss": 1.1244,
      "step": 2400
    },
    {
      "epoch": 0.7712,
      "grad_norm": 0.2996661067008972,
      "learning_rate": 4.5824e-05,
      "loss": 1.0609,
      "step": 2410
    },
    {
      "epoch": 0.7744,
      "grad_norm": 0.3443852663040161,
      "learning_rate": 4.5184000000000006e-05,
      "loss": 1.0461,
      "step": 2420
    },
    {
      "epoch": 0.7776,
      "grad_norm": 0.4986393451690674,
      "learning_rate": 4.4544e-05,
      "loss": 1.1281,
      "step": 2430
    },
    {
      "epoch": 0.7808,
      "grad_norm": 0.4183667302131653,
      "learning_rate": 4.3904e-05,
      "loss": 1.1396,
      "step": 2440
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.29799389839172363,
      "learning_rate": 4.3264000000000005e-05,
      "loss": 1.1093,
      "step": 2450
    },
    {
      "epoch": 0.7872,
      "grad_norm": 0.28885000944137573,
      "learning_rate": 4.2624000000000004e-05,
      "loss": 1.0716,
      "step": 2460
    },
    {
      "epoch": 0.7904,
      "grad_norm": 0.31649360060691833,
      "learning_rate": 4.1984e-05,
      "loss": 1.096,
      "step": 2470
    },
    {
      "epoch": 0.7936,
      "grad_norm": 0.2691904306411743,
      "learning_rate": 4.1344e-05,
      "loss": 1.1591,
      "step": 2480
    },
    {
      "epoch": 0.7968,
      "grad_norm": 0.31309962272644043,
      "learning_rate": 4.0704e-05,
      "loss": 1.1577,
      "step": 2490
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.27012893557548523,
      "learning_rate": 4.0064e-05,
      "loss": 1.1719,
      "step": 2500
    },
    {
      "epoch": 0.8032,
      "grad_norm": 0.4025005102157593,
      "learning_rate": 3.9424e-05,
      "loss": 1.0755,
      "step": 2510
    },
    {
      "epoch": 0.8064,
      "grad_norm": 0.29118382930755615,
      "learning_rate": 3.878400000000001e-05,
      "loss": 1.1296,
      "step": 2520
    },
    {
      "epoch": 0.8096,
      "grad_norm": 0.26896369457244873,
      "learning_rate": 3.8144e-05,
      "loss": 1.1221,
      "step": 2530
    },
    {
      "epoch": 0.8128,
      "grad_norm": 0.3033294677734375,
      "learning_rate": 3.7504e-05,
      "loss": 1.1374,
      "step": 2540
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.3265039026737213,
      "learning_rate": 3.6864000000000005e-05,
      "loss": 1.0935,
      "step": 2550
    },
    {
      "epoch": 0.8192,
      "grad_norm": 0.24130520224571228,
      "learning_rate": 3.6224000000000004e-05,
      "loss": 1.2369,
      "step": 2560
    },
    {
      "epoch": 0.8224,
      "grad_norm": 0.36245155334472656,
      "learning_rate": 3.5584000000000004e-05,
      "loss": 1.0706,
      "step": 2570
    },
    {
      "epoch": 0.8256,
      "grad_norm": 0.2817525267601013,
      "learning_rate": 3.4943999999999996e-05,
      "loss": 1.0652,
      "step": 2580
    },
    {
      "epoch": 0.8288,
      "grad_norm": 0.2730059027671814,
      "learning_rate": 3.4304e-05,
      "loss": 1.067,
      "step": 2590
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.2850629389286041,
      "learning_rate": 3.3664e-05,
      "loss": 1.0898,
      "step": 2600
    },
    {
      "epoch": 0.8352,
      "grad_norm": 0.4017927348613739,
      "learning_rate": 3.3024e-05,
      "loss": 1.1364,
      "step": 2610
    },
    {
      "epoch": 0.8384,
      "grad_norm": 0.36761587858200073,
      "learning_rate": 3.2384e-05,
      "loss": 1.1171,
      "step": 2620
    },
    {
      "epoch": 0.8416,
      "grad_norm": 0.25396281480789185,
      "learning_rate": 3.1744e-05,
      "loss": 1.0557,
      "step": 2630
    },
    {
      "epoch": 0.8448,
      "grad_norm": 0.2851448059082031,
      "learning_rate": 3.1104e-05,
      "loss": 1.0673,
      "step": 2640
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.2541842758655548,
      "learning_rate": 3.0464000000000005e-05,
      "loss": 1.1323,
      "step": 2650
    },
    {
      "epoch": 0.8512,
      "grad_norm": 0.29052332043647766,
      "learning_rate": 2.9824e-05,
      "loss": 1.114,
      "step": 2660
    },
    {
      "epoch": 0.8544,
      "grad_norm": 0.2773646414279938,
      "learning_rate": 2.9184e-05,
      "loss": 1.0973,
      "step": 2670
    },
    {
      "epoch": 0.8576,
      "grad_norm": 0.28323352336883545,
      "learning_rate": 2.8544000000000003e-05,
      "loss": 1.1452,
      "step": 2680
    },
    {
      "epoch": 0.8608,
      "grad_norm": 0.3044739365577698,
      "learning_rate": 2.7904000000000003e-05,
      "loss": 1.0615,
      "step": 2690
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.2924152612686157,
      "learning_rate": 2.7264000000000002e-05,
      "loss": 1.1456,
      "step": 2700
    },
    {
      "epoch": 0.8672,
      "grad_norm": 0.2642782926559448,
      "learning_rate": 2.6623999999999998e-05,
      "loss": 1.1651,
      "step": 2710
    },
    {
      "epoch": 0.8704,
      "grad_norm": 0.3707656264305115,
      "learning_rate": 2.5984000000000004e-05,
      "loss": 1.1247,
      "step": 2720
    },
    {
      "epoch": 0.8736,
      "grad_norm": 0.4668145775794983,
      "learning_rate": 2.5344e-05,
      "loss": 1.1537,
      "step": 2730
    },
    {
      "epoch": 0.8768,
      "grad_norm": 0.34861406683921814,
      "learning_rate": 2.4704000000000003e-05,
      "loss": 1.1141,
      "step": 2740
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.2636050283908844,
      "learning_rate": 2.4064000000000002e-05,
      "loss": 1.0715,
      "step": 2750
    },
    {
      "epoch": 0.8832,
      "grad_norm": 0.30254754424095154,
      "learning_rate": 2.3424e-05,
      "loss": 1.108,
      "step": 2760
    },
    {
      "epoch": 0.8864,
      "grad_norm": 0.298784077167511,
      "learning_rate": 2.2784e-05,
      "loss": 1.2063,
      "step": 2770
    },
    {
      "epoch": 0.8896,
      "grad_norm": 0.34004920721054077,
      "learning_rate": 2.2144e-05,
      "loss": 1.1275,
      "step": 2780
    },
    {
      "epoch": 0.8928,
      "grad_norm": 0.33883705735206604,
      "learning_rate": 2.1504000000000003e-05,
      "loss": 1.083,
      "step": 2790
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.30469802021980286,
      "learning_rate": 2.0864e-05,
      "loss": 1.0508,
      "step": 2800
    },
    {
      "epoch": 0.8992,
      "grad_norm": 0.36350762844085693,
      "learning_rate": 2.0224e-05,
      "loss": 1.112,
      "step": 2810
    },
    {
      "epoch": 0.9024,
      "grad_norm": 0.27796122431755066,
      "learning_rate": 1.9584e-05,
      "loss": 1.1022,
      "step": 2820
    },
    {
      "epoch": 0.9056,
      "grad_norm": 0.2932034134864807,
      "learning_rate": 1.8944e-05,
      "loss": 1.0319,
      "step": 2830
    },
    {
      "epoch": 0.9088,
      "grad_norm": 0.394808828830719,
      "learning_rate": 1.8304000000000003e-05,
      "loss": 1.0978,
      "step": 2840
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.34576427936553955,
      "learning_rate": 1.7664e-05,
      "loss": 1.1566,
      "step": 2850
    },
    {
      "epoch": 0.9152,
      "grad_norm": 0.32156696915626526,
      "learning_rate": 1.7024e-05,
      "loss": 1.1663,
      "step": 2860
    },
    {
      "epoch": 0.9184,
      "grad_norm": 0.3129923939704895,
      "learning_rate": 1.6384e-05,
      "loss": 1.0844,
      "step": 2870
    },
    {
      "epoch": 0.9216,
      "grad_norm": 0.24763771891593933,
      "learning_rate": 1.5744e-05,
      "loss": 1.12,
      "step": 2880
    },
    {
      "epoch": 0.9248,
      "grad_norm": 0.3140289783477783,
      "learning_rate": 1.5104000000000001e-05,
      "loss": 1.1582,
      "step": 2890
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.2710714340209961,
      "learning_rate": 1.4463999999999999e-05,
      "loss": 1.0895,
      "step": 2900
    },
    {
      "epoch": 0.9312,
      "grad_norm": 0.3298426568508148,
      "learning_rate": 1.3824e-05,
      "loss": 1.0665,
      "step": 2910
    },
    {
      "epoch": 0.9344,
      "grad_norm": 0.24175623059272766,
      "learning_rate": 1.3184000000000001e-05,
      "loss": 1.0966,
      "step": 2920
    },
    {
      "epoch": 0.9376,
      "grad_norm": 0.31446129083633423,
      "learning_rate": 1.2544e-05,
      "loss": 1.0798,
      "step": 2930
    },
    {
      "epoch": 0.9408,
      "grad_norm": 0.32403141260147095,
      "learning_rate": 1.1904000000000002e-05,
      "loss": 1.1143,
      "step": 2940
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.2862265408039093,
      "learning_rate": 1.1264000000000001e-05,
      "loss": 1.0839,
      "step": 2950
    },
    {
      "epoch": 0.9472,
      "grad_norm": 0.4909001588821411,
      "learning_rate": 1.0624e-05,
      "loss": 1.1413,
      "step": 2960
    },
    {
      "epoch": 0.9504,
      "grad_norm": 0.32599207758903503,
      "learning_rate": 9.984e-06,
      "loss": 1.1079,
      "step": 2970
    },
    {
      "epoch": 0.9536,
      "grad_norm": 0.29065415263175964,
      "learning_rate": 9.344e-06,
      "loss": 1.1232,
      "step": 2980
    },
    {
      "epoch": 0.9568,
      "grad_norm": 0.33668118715286255,
      "learning_rate": 8.704000000000002e-06,
      "loss": 1.2002,
      "step": 2990
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.2757526636123657,
      "learning_rate": 8.064000000000001e-06,
      "loss": 1.1951,
      "step": 3000
    },
    {
      "epoch": 0.9632,
      "grad_norm": 0.2758793234825134,
      "learning_rate": 7.424e-06,
      "loss": 1.1531,
      "step": 3010
    },
    {
      "epoch": 0.9664,
      "grad_norm": 0.3347441852092743,
      "learning_rate": 6.784e-06,
      "loss": 1.0937,
      "step": 3020
    },
    {
      "epoch": 0.9696,
      "grad_norm": 0.4259791076183319,
      "learning_rate": 6.144000000000001e-06,
      "loss": 1.0273,
      "step": 3030
    },
    {
      "epoch": 0.9728,
      "grad_norm": 0.2677628993988037,
      "learning_rate": 5.504e-06,
      "loss": 1.1058,
      "step": 3040
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.29030516743659973,
      "learning_rate": 4.864e-06,
      "loss": 1.1056,
      "step": 3050
    },
    {
      "epoch": 0.9792,
      "grad_norm": 0.3703081011772156,
      "learning_rate": 4.2240000000000006e-06,
      "loss": 1.0774,
      "step": 3060
    },
    {
      "epoch": 0.9824,
      "grad_norm": 0.31592023372650146,
      "learning_rate": 3.584e-06,
      "loss": 1.1481,
      "step": 3070
    },
    {
      "epoch": 0.9856,
      "grad_norm": 0.3331587016582489,
      "learning_rate": 2.944e-06,
      "loss": 1.1113,
      "step": 3080
    },
    {
      "epoch": 0.9888,
      "grad_norm": 0.33121487498283386,
      "learning_rate": 2.3040000000000003e-06,
      "loss": 1.1043,
      "step": 3090
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.2973518967628479,
      "learning_rate": 1.6639999999999999e-06,
      "loss": 1.1307,
      "step": 3100
    },
    {
      "epoch": 0.9952,
      "grad_norm": 0.34271204471588135,
      "learning_rate": 1.024e-06,
      "loss": 1.0957,
      "step": 3110
    },
    {
      "epoch": 0.9984,
      "grad_norm": 0.4162846803665161,
      "learning_rate": 3.8400000000000005e-07,
      "loss": 1.2086,
      "step": 3120
    }
  ],
  "logging_steps": 10,
  "max_steps": 3125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.08093267968e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}