| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.96, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 0.18818965554237366, | |
| "learning_rate": 0.00019942400000000002, | |
| "loss": 1.4398, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 0.20508623123168945, | |
| "learning_rate": 0.00019878400000000003, | |
| "loss": 1.3682, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 0.18839821219444275, | |
| "learning_rate": 0.000198144, | |
| "loss": 1.3981, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 0.20183835923671722, | |
| "learning_rate": 0.000197504, | |
| "loss": 1.3109, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.22559459507465363, | |
| "learning_rate": 0.000196864, | |
| "loss": 1.343, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 0.21955451369285583, | |
| "learning_rate": 0.000196224, | |
| "loss": 1.3645, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 0.2031077742576599, | |
| "learning_rate": 0.000195584, | |
| "loss": 1.2116, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 0.24542862176895142, | |
| "learning_rate": 0.000194944, | |
| "loss": 1.3466, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 0.24425864219665527, | |
| "learning_rate": 0.00019430400000000002, | |
| "loss": 1.2202, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.23464636504650116, | |
| "learning_rate": 0.000193664, | |
| "loss": 1.2862, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 0.22896084189414978, | |
| "learning_rate": 0.000193024, | |
| "loss": 1.2424, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 0.30132901668548584, | |
| "learning_rate": 0.000192384, | |
| "loss": 1.2674, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 0.2417932003736496, | |
| "learning_rate": 0.000191744, | |
| "loss": 1.2433, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 0.25476428866386414, | |
| "learning_rate": 0.00019110400000000002, | |
| "loss": 1.2969, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.28159213066101074, | |
| "learning_rate": 0.00019046400000000002, | |
| "loss": 1.2396, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 0.2914562225341797, | |
| "learning_rate": 0.000189824, | |
| "loss": 1.3067, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 0.2946033477783203, | |
| "learning_rate": 0.000189184, | |
| "loss": 1.1911, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 0.2544347941875458, | |
| "learning_rate": 0.00018854400000000002, | |
| "loss": 1.2062, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 0.2680202126502991, | |
| "learning_rate": 0.00018790400000000002, | |
| "loss": 1.1604, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.25478115677833557, | |
| "learning_rate": 0.00018726400000000003, | |
| "loss": 1.2515, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 0.26804226636886597, | |
| "learning_rate": 0.000186624, | |
| "loss": 1.2765, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 0.24587133526802063, | |
| "learning_rate": 0.00018598400000000001, | |
| "loss": 1.1768, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 0.27216073870658875, | |
| "learning_rate": 0.00018534400000000002, | |
| "loss": 1.1848, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 0.3114471733570099, | |
| "learning_rate": 0.000184704, | |
| "loss": 1.2086, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.3045384883880615, | |
| "learning_rate": 0.000184064, | |
| "loss": 1.1874, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 0.2793915271759033, | |
| "learning_rate": 0.000183424, | |
| "loss": 1.1977, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 0.34699785709381104, | |
| "learning_rate": 0.000182784, | |
| "loss": 1.2089, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 0.25291335582733154, | |
| "learning_rate": 0.000182144, | |
| "loss": 1.2459, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 0.2616818845272064, | |
| "learning_rate": 0.000181504, | |
| "loss": 1.2655, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.2569883167743683, | |
| "learning_rate": 0.000180864, | |
| "loss": 1.2479, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 0.33097052574157715, | |
| "learning_rate": 0.00018022400000000001, | |
| "loss": 1.1975, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 0.2557203471660614, | |
| "learning_rate": 0.00017958400000000002, | |
| "loss": 1.1423, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 0.24836640059947968, | |
| "learning_rate": 0.000178944, | |
| "loss": 1.197, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 0.25998368859291077, | |
| "learning_rate": 0.000178304, | |
| "loss": 1.195, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.29318585991859436, | |
| "learning_rate": 0.000177664, | |
| "loss": 1.1771, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 0.2819266617298126, | |
| "learning_rate": 0.00017702400000000002, | |
| "loss": 1.1113, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 0.24419356882572174, | |
| "learning_rate": 0.00017638400000000002, | |
| "loss": 1.2027, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 0.30148938298225403, | |
| "learning_rate": 0.000175744, | |
| "loss": 1.1583, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 0.2704978287220001, | |
| "learning_rate": 0.000175104, | |
| "loss": 1.2246, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.2642189562320709, | |
| "learning_rate": 0.00017446400000000002, | |
| "loss": 1.1543, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 0.31052809953689575, | |
| "learning_rate": 0.00017382400000000002, | |
| "loss": 1.2039, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 0.2596384286880493, | |
| "learning_rate": 0.000173184, | |
| "loss": 1.1693, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 0.2813952565193176, | |
| "learning_rate": 0.000172544, | |
| "loss": 1.2213, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.26021143794059753, | |
| "learning_rate": 0.00017190399999999999, | |
| "loss": 1.1611, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.3072742521762848, | |
| "learning_rate": 0.000171264, | |
| "loss": 1.1464, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 0.3125726580619812, | |
| "learning_rate": 0.000170624, | |
| "loss": 1.1901, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 0.25702011585235596, | |
| "learning_rate": 0.000169984, | |
| "loss": 1.1439, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.26658275723457336, | |
| "learning_rate": 0.000169344, | |
| "loss": 1.1256, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 0.2687830626964569, | |
| "learning_rate": 0.00016870400000000002, | |
| "loss": 1.233, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.30417656898498535, | |
| "learning_rate": 0.000168064, | |
| "loss": 1.1951, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 0.24493242800235748, | |
| "learning_rate": 0.000167424, | |
| "loss": 1.2125, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 0.2442469447851181, | |
| "learning_rate": 0.000166784, | |
| "loss": 1.2005, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 0.2598196566104889, | |
| "learning_rate": 0.00016614400000000001, | |
| "loss": 1.1361, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.2823512852191925, | |
| "learning_rate": 0.00016550400000000002, | |
| "loss": 1.2671, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.27284982800483704, | |
| "learning_rate": 0.00016486400000000003, | |
| "loss": 1.2019, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 0.4265678822994232, | |
| "learning_rate": 0.000164224, | |
| "loss": 1.1557, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 0.32852381467819214, | |
| "learning_rate": 0.000163584, | |
| "loss": 1.1792, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.25044888257980347, | |
| "learning_rate": 0.00016294400000000002, | |
| "loss": 1.1814, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 0.30278605222702026, | |
| "learning_rate": 0.00016230400000000002, | |
| "loss": 1.1865, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.259230375289917, | |
| "learning_rate": 0.000161664, | |
| "loss": 1.2012, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 0.3037353456020355, | |
| "learning_rate": 0.000161024, | |
| "loss": 1.1508, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.32500702142715454, | |
| "learning_rate": 0.000160384, | |
| "loss": 1.2302, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 0.3390319049358368, | |
| "learning_rate": 0.000159744, | |
| "loss": 1.1949, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.30461153388023376, | |
| "learning_rate": 0.000159104, | |
| "loss": 1.1659, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.2725638449192047, | |
| "learning_rate": 0.000158464, | |
| "loss": 1.1905, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.24364377558231354, | |
| "learning_rate": 0.000157824, | |
| "loss": 1.1194, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 0.2993292510509491, | |
| "learning_rate": 0.000157184, | |
| "loss": 1.165, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.22024789452552795, | |
| "learning_rate": 0.000156544, | |
| "loss": 1.1187, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 0.2543094754219055, | |
| "learning_rate": 0.000155904, | |
| "loss": 1.0779, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.31767502427101135, | |
| "learning_rate": 0.000155264, | |
| "loss": 1.1076, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 0.27651944756507874, | |
| "learning_rate": 0.00015462400000000002, | |
| "loss": 1.1748, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.2532593905925751, | |
| "learning_rate": 0.00015398400000000002, | |
| "loss": 1.1302, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 0.26502105593681335, | |
| "learning_rate": 0.000153344, | |
| "loss": 1.1652, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.22709833085536957, | |
| "learning_rate": 0.000152704, | |
| "loss": 1.1045, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.273548424243927, | |
| "learning_rate": 0.000152064, | |
| "loss": 1.1216, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.26936766505241394, | |
| "learning_rate": 0.00015142400000000002, | |
| "loss": 1.0981, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 0.2642468810081482, | |
| "learning_rate": 0.00015078400000000003, | |
| "loss": 1.1044, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 0.25724905729293823, | |
| "learning_rate": 0.000150144, | |
| "loss": 1.1474, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 0.27715614438056946, | |
| "learning_rate": 0.000149504, | |
| "loss": 1.102, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.25462499260902405, | |
| "learning_rate": 0.00014886400000000002, | |
| "loss": 1.1834, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 0.25248298048973083, | |
| "learning_rate": 0.000148224, | |
| "loss": 1.2023, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.3147791922092438, | |
| "learning_rate": 0.000147584, | |
| "loss": 1.1564, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 0.30874860286712646, | |
| "learning_rate": 0.000146944, | |
| "loss": 1.1722, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.2486383467912674, | |
| "learning_rate": 0.000146304, | |
| "loss": 1.1161, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.24819347262382507, | |
| "learning_rate": 0.000145664, | |
| "loss": 1.1813, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.2713952660560608, | |
| "learning_rate": 0.000145024, | |
| "loss": 1.0917, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 0.2632099986076355, | |
| "learning_rate": 0.000144384, | |
| "loss": 1.1806, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.24610574543476105, | |
| "learning_rate": 0.000143744, | |
| "loss": 1.1397, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 0.3086780607700348, | |
| "learning_rate": 0.00014310400000000002, | |
| "loss": 1.1338, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.25754594802856445, | |
| "learning_rate": 0.000142464, | |
| "loss": 1.148, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 0.2540118098258972, | |
| "learning_rate": 0.000141824, | |
| "loss": 1.0787, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 0.3876936733722687, | |
| "learning_rate": 0.000141184, | |
| "loss": 1.0841, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 0.27151429653167725, | |
| "learning_rate": 0.00014054400000000002, | |
| "loss": 1.0859, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.2533249855041504, | |
| "learning_rate": 0.00013990400000000002, | |
| "loss": 1.1347, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.2707951068878174, | |
| "learning_rate": 0.00013926400000000003, | |
| "loss": 1.1787, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.27499666810035706, | |
| "learning_rate": 0.000138624, | |
| "loss": 1.1991, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 0.31902188062667847, | |
| "learning_rate": 0.000137984, | |
| "loss": 1.2036, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.2782028913497925, | |
| "learning_rate": 0.00013734400000000002, | |
| "loss": 1.1654, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 0.3343225419521332, | |
| "learning_rate": 0.000136704, | |
| "loss": 1.1677, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.2642679810523987, | |
| "learning_rate": 0.000136064, | |
| "loss": 1.1241, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 0.37036266922950745, | |
| "learning_rate": 0.000135424, | |
| "loss": 1.1454, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 0.2647375464439392, | |
| "learning_rate": 0.000134784, | |
| "loss": 1.1288, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 0.3140479028224945, | |
| "learning_rate": 0.000134144, | |
| "loss": 1.2205, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.24704284965991974, | |
| "learning_rate": 0.000133504, | |
| "loss": 1.1341, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.3063775300979614, | |
| "learning_rate": 0.000132864, | |
| "loss": 1.2261, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 0.2879609167575836, | |
| "learning_rate": 0.000132224, | |
| "loss": 1.1676, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 0.27018502354621887, | |
| "learning_rate": 0.000131584, | |
| "loss": 1.1237, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.25358906388282776, | |
| "learning_rate": 0.000130944, | |
| "loss": 1.1034, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 0.26398876309394836, | |
| "learning_rate": 0.000130304, | |
| "loss": 1.2041, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.4267687499523163, | |
| "learning_rate": 0.000129664, | |
| "loss": 1.15, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 0.26307064294815063, | |
| "learning_rate": 0.00012902400000000002, | |
| "loss": 1.0921, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.38714703917503357, | |
| "learning_rate": 0.00012838400000000002, | |
| "loss": 1.225, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 0.275495320558548, | |
| "learning_rate": 0.000127744, | |
| "loss": 1.1513, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.26792117953300476, | |
| "learning_rate": 0.000127104, | |
| "loss": 1.1293, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.2474093735218048, | |
| "learning_rate": 0.00012646400000000001, | |
| "loss": 1.175, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.2551768124103546, | |
| "learning_rate": 0.00012582400000000002, | |
| "loss": 1.1212, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 0.2605915665626526, | |
| "learning_rate": 0.000125184, | |
| "loss": 1.1303, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 0.30072781443595886, | |
| "learning_rate": 0.000124544, | |
| "loss": 1.1552, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 0.2362007051706314, | |
| "learning_rate": 0.00012390399999999998, | |
| "loss": 1.1984, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.3113998472690582, | |
| "learning_rate": 0.000123264, | |
| "loss": 1.1239, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 0.3142775893211365, | |
| "learning_rate": 0.000122624, | |
| "loss": 1.1089, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.38791659474372864, | |
| "learning_rate": 0.000121984, | |
| "loss": 1.1532, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 0.37536805868148804, | |
| "learning_rate": 0.00012134400000000001, | |
| "loss": 1.1333, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 0.2762630879878998, | |
| "learning_rate": 0.00012070399999999999, | |
| "loss": 1.0189, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.25079619884490967, | |
| "learning_rate": 0.000120064, | |
| "loss": 1.1169, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 0.2860804498195648, | |
| "learning_rate": 0.000119424, | |
| "loss": 1.0866, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 0.2980159819126129, | |
| "learning_rate": 0.000118784, | |
| "loss": 1.1898, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 0.336664080619812, | |
| "learning_rate": 0.00011814400000000001, | |
| "loss": 1.1332, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 0.26283812522888184, | |
| "learning_rate": 0.00011750400000000002, | |
| "loss": 1.1367, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.27355343103408813, | |
| "learning_rate": 0.000116864, | |
| "loss": 1.0927, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 0.2756136953830719, | |
| "learning_rate": 0.000116224, | |
| "loss": 1.1109, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 0.26861876249313354, | |
| "learning_rate": 0.00011558400000000001, | |
| "loss": 1.1401, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 0.23718690872192383, | |
| "learning_rate": 0.000114944, | |
| "loss": 1.1717, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 0.2714873254299164, | |
| "learning_rate": 0.00011430400000000001, | |
| "loss": 1.1032, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.29829949140548706, | |
| "learning_rate": 0.00011366400000000001, | |
| "loss": 1.1129, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 0.2577730417251587, | |
| "learning_rate": 0.000113024, | |
| "loss": 1.0978, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 0.24967093765735626, | |
| "learning_rate": 0.000112384, | |
| "loss": 1.1533, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 0.2530049681663513, | |
| "learning_rate": 0.000111744, | |
| "loss": 1.0921, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 0.30380481481552124, | |
| "learning_rate": 0.00011110400000000001, | |
| "loss": 1.1773, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.25974929332733154, | |
| "learning_rate": 0.00011046400000000002, | |
| "loss": 1.1584, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 0.37404870986938477, | |
| "learning_rate": 0.00010982400000000001, | |
| "loss": 1.1199, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 0.32186731696128845, | |
| "learning_rate": 0.000109184, | |
| "loss": 1.1392, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 0.2843293845653534, | |
| "learning_rate": 0.000108544, | |
| "loss": 1.1223, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.310863733291626, | |
| "learning_rate": 0.000107904, | |
| "loss": 1.1112, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.2348843812942505, | |
| "learning_rate": 0.00010726400000000001, | |
| "loss": 1.227, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 0.27787262201309204, | |
| "learning_rate": 0.00010662400000000001, | |
| "loss": 1.145, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 0.26069822907447815, | |
| "learning_rate": 0.000105984, | |
| "loss": 1.1417, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.27364885807037354, | |
| "learning_rate": 0.000105344, | |
| "loss": 1.0959, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 0.24618731439113617, | |
| "learning_rate": 0.000104704, | |
| "loss": 1.1312, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.28439000248908997, | |
| "learning_rate": 0.00010406400000000001, | |
| "loss": 1.1506, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 0.26154178380966187, | |
| "learning_rate": 0.000103424, | |
| "loss": 1.0225, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 0.26929762959480286, | |
| "learning_rate": 0.00010278400000000001, | |
| "loss": 1.061, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 0.2753245532512665, | |
| "learning_rate": 0.00010214399999999999, | |
| "loss": 1.1724, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 0.23799718916416168, | |
| "learning_rate": 0.000101504, | |
| "loss": 1.1397, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.277057945728302, | |
| "learning_rate": 0.000100864, | |
| "loss": 1.1924, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 0.27867016196250916, | |
| "learning_rate": 0.00010022400000000001, | |
| "loss": 1.1841, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 0.2876740097999573, | |
| "learning_rate": 9.9584e-05, | |
| "loss": 1.1085, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 0.2694062292575836, | |
| "learning_rate": 9.8944e-05, | |
| "loss": 1.0743, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 0.27280256152153015, | |
| "learning_rate": 9.830400000000001e-05, | |
| "loss": 1.2189, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.3361833393573761, | |
| "learning_rate": 9.7664e-05, | |
| "loss": 1.0771, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 0.2917429208755493, | |
| "learning_rate": 9.7024e-05, | |
| "loss": 1.1137, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 0.306228905916214, | |
| "learning_rate": 9.6384e-05, | |
| "loss": 1.1708, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 0.25011730194091797, | |
| "learning_rate": 9.5744e-05, | |
| "loss": 1.1042, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.3258339464664459, | |
| "learning_rate": 9.5104e-05, | |
| "loss": 1.1781, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.3206620216369629, | |
| "learning_rate": 9.446400000000001e-05, | |
| "loss": 1.0991, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 0.31144049763679504, | |
| "learning_rate": 9.3824e-05, | |
| "loss": 1.101, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 0.2294250875711441, | |
| "learning_rate": 9.318400000000001e-05, | |
| "loss": 1.1346, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 0.34881460666656494, | |
| "learning_rate": 9.254400000000001e-05, | |
| "loss": 1.1389, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 0.27140548825263977, | |
| "learning_rate": 9.1904e-05, | |
| "loss": 1.1214, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.25193724036216736, | |
| "learning_rate": 9.1264e-05, | |
| "loss": 1.1273, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 0.27552053332328796, | |
| "learning_rate": 9.0624e-05, | |
| "loss": 1.1523, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 0.2877196967601776, | |
| "learning_rate": 8.9984e-05, | |
| "loss": 1.082, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 0.23511843383312225, | |
| "learning_rate": 8.9344e-05, | |
| "loss": 1.1251, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 0.28936073184013367, | |
| "learning_rate": 8.870400000000001e-05, | |
| "loss": 1.1742, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.28244665265083313, | |
| "learning_rate": 8.8064e-05, | |
| "loss": 1.1665, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.22868531942367554, | |
| "learning_rate": 8.742400000000001e-05, | |
| "loss": 1.0733, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 0.23258651793003082, | |
| "learning_rate": 8.6784e-05, | |
| "loss": 1.0538, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.2886444330215454, | |
| "learning_rate": 8.614400000000001e-05, | |
| "loss": 1.0896, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 0.26151803135871887, | |
| "learning_rate": 8.5504e-05, | |
| "loss": 1.0771, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.2829320430755615, | |
| "learning_rate": 8.486399999999999e-05, | |
| "loss": 1.0796, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 0.2471112608909607, | |
| "learning_rate": 8.4224e-05, | |
| "loss": 1.1751, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 0.35704588890075684, | |
| "learning_rate": 8.3584e-05, | |
| "loss": 1.1203, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 0.3598352074623108, | |
| "learning_rate": 8.2944e-05, | |
| "loss": 1.1982, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 0.24873049557209015, | |
| "learning_rate": 8.2304e-05, | |
| "loss": 1.11, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.2786683142185211, | |
| "learning_rate": 8.166400000000001e-05, | |
| "loss": 1.1045, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 0.31083497405052185, | |
| "learning_rate": 8.1024e-05, | |
| "loss": 1.1094, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 0.25888901948928833, | |
| "learning_rate": 8.038400000000001e-05, | |
| "loss": 1.0637, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 0.27845245599746704, | |
| "learning_rate": 7.9744e-05, | |
| "loss": 1.1442, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 0.32606974244117737, | |
| "learning_rate": 7.910400000000001e-05, | |
| "loss": 1.1409, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.28251323103904724, | |
| "learning_rate": 7.8464e-05, | |
| "loss": 1.1037, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 0.2702641487121582, | |
| "learning_rate": 7.7824e-05, | |
| "loss": 1.11, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.31664419174194336, | |
| "learning_rate": 7.7184e-05, | |
| "loss": 1.0634, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 0.30974429845809937, | |
| "learning_rate": 7.6544e-05, | |
| "loss": 1.0884, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 0.27011537551879883, | |
| "learning_rate": 7.590400000000001e-05, | |
| "loss": 1.0534, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.26978424191474915, | |
| "learning_rate": 7.5264e-05, | |
| "loss": 1.1019, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 0.29052773118019104, | |
| "learning_rate": 7.462400000000001e-05, | |
| "loss": 1.1617, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.2614887058734894, | |
| "learning_rate": 7.398400000000002e-05, | |
| "loss": 1.1562, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.27294448018074036, | |
| "learning_rate": 7.334400000000001e-05, | |
| "loss": 1.1257, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 0.24990952014923096, | |
| "learning_rate": 7.2704e-05, | |
| "loss": 1.0977, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.272666335105896, | |
| "learning_rate": 7.206399999999999e-05, | |
| "loss": 1.0731, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 0.28734877705574036, | |
| "learning_rate": 7.1424e-05, | |
| "loss": 1.1062, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 0.27861565351486206, | |
| "learning_rate": 7.0784e-05, | |
| "loss": 1.1435, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 0.34537386894226074, | |
| "learning_rate": 7.0144e-05, | |
| "loss": 1.1227, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 0.3273583650588989, | |
| "learning_rate": 6.9504e-05, | |
| "loss": 1.0856, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.3750070631504059, | |
| "learning_rate": 6.886400000000001e-05, | |
| "loss": 1.1438, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.4590343236923218, | |
| "learning_rate": 6.8224e-05, | |
| "loss": 1.1532, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 0.3549063503742218, | |
| "learning_rate": 6.758400000000001e-05, | |
| "loss": 1.1418, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.33026474714279175, | |
| "learning_rate": 6.6944e-05, | |
| "loss": 1.0342, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 0.37451469898223877, | |
| "learning_rate": 6.6304e-05, | |
| "loss": 1.104, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.32349246740341187, | |
| "learning_rate": 6.5664e-05, | |
| "loss": 1.1403, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 0.3289710581302643, | |
| "learning_rate": 6.5024e-05, | |
| "loss": 1.1349, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 0.24263447523117065, | |
| "learning_rate": 6.4384e-05, | |
| "loss": 1.1162, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 0.3055990934371948, | |
| "learning_rate": 6.3744e-05, | |
| "loss": 1.0623, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.26169636845588684, | |
| "learning_rate": 6.310400000000001e-05, | |
| "loss": 1.1566, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.2564956545829773, | |
| "learning_rate": 6.2464e-05, | |
| "loss": 1.1615, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.2543143033981323, | |
| "learning_rate": 6.182400000000001e-05, | |
| "loss": 1.14, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 0.28348684310913086, | |
| "learning_rate": 6.1184e-05, | |
| "loss": 1.1192, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 0.25627613067626953, | |
| "learning_rate": 6.0544e-05, | |
| "loss": 1.1469, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 0.39540451765060425, | |
| "learning_rate": 5.990400000000001e-05, | |
| "loss": 1.1497, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.2549257278442383, | |
| "learning_rate": 5.9264e-05, | |
| "loss": 1.077, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 0.2789277136325836, | |
| "learning_rate": 5.8624e-05, | |
| "loss": 1.048, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.28794965147972107, | |
| "learning_rate": 5.7984000000000006e-05, | |
| "loss": 1.14, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 0.27266013622283936, | |
| "learning_rate": 5.7344e-05, | |
| "loss": 1.1882, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.31794604659080505, | |
| "learning_rate": 5.6704000000000005e-05, | |
| "loss": 1.1174, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.26018059253692627, | |
| "learning_rate": 5.6064000000000004e-05, | |
| "loss": 1.0834, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 0.30461081862449646, | |
| "learning_rate": 5.5423999999999997e-05, | |
| "loss": 1.0549, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 0.38113224506378174, | |
| "learning_rate": 5.4784e-05, | |
| "loss": 1.1426, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 0.2623901069164276, | |
| "learning_rate": 5.414400000000001e-05, | |
| "loss": 1.066, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 0.2910241484642029, | |
| "learning_rate": 5.3504e-05, | |
| "loss": 1.0094, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.2720244228839874, | |
| "learning_rate": 5.2864e-05, | |
| "loss": 1.1315, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 0.25109943747520447, | |
| "learning_rate": 5.222400000000001e-05, | |
| "loss": 1.0621, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 0.2956967353820801, | |
| "learning_rate": 5.1584e-05, | |
| "loss": 1.0889, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 0.2685677409172058, | |
| "learning_rate": 5.0944000000000006e-05, | |
| "loss": 1.0868, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 0.3329203426837921, | |
| "learning_rate": 5.0304000000000005e-05, | |
| "loss": 1.1943, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.30616864562034607, | |
| "learning_rate": 4.9664000000000004e-05, | |
| "loss": 1.1168, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.26512446999549866, | |
| "learning_rate": 4.9024000000000004e-05, | |
| "loss": 1.1155, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 0.3897416591644287, | |
| "learning_rate": 4.8384e-05, | |
| "loss": 1.1451, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 0.30231884121894836, | |
| "learning_rate": 4.7744e-05, | |
| "loss": 1.1502, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 0.3417859673500061, | |
| "learning_rate": 4.7104e-05, | |
| "loss": 1.1075, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.2546481788158417, | |
| "learning_rate": 4.6464e-05, | |
| "loss": 1.1244, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 0.2996661067008972, | |
| "learning_rate": 4.5824e-05, | |
| "loss": 1.0609, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 0.3443852663040161, | |
| "learning_rate": 4.5184000000000006e-05, | |
| "loss": 1.0461, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 0.4986393451690674, | |
| "learning_rate": 4.4544e-05, | |
| "loss": 1.1281, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 0.4183667302131653, | |
| "learning_rate": 4.3904e-05, | |
| "loss": 1.1396, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.29799389839172363, | |
| "learning_rate": 4.3264000000000005e-05, | |
| "loss": 1.1093, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 0.28885000944137573, | |
| "learning_rate": 4.2624000000000004e-05, | |
| "loss": 1.0716, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 0.31649360060691833, | |
| "learning_rate": 4.1984e-05, | |
| "loss": 1.096, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 0.2691904306411743, | |
| "learning_rate": 4.1344e-05, | |
| "loss": 1.1591, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 0.31309962272644043, | |
| "learning_rate": 4.0704e-05, | |
| "loss": 1.1577, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.27012893557548523, | |
| "learning_rate": 4.0064e-05, | |
| "loss": 1.1719, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 0.4025005102157593, | |
| "learning_rate": 3.9424e-05, | |
| "loss": 1.0755, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 0.29118382930755615, | |
| "learning_rate": 3.878400000000001e-05, | |
| "loss": 1.1296, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 0.26896369457244873, | |
| "learning_rate": 3.8144e-05, | |
| "loss": 1.1221, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 0.3033294677734375, | |
| "learning_rate": 3.7504e-05, | |
| "loss": 1.1374, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.3265039026737213, | |
| "learning_rate": 3.6864000000000005e-05, | |
| "loss": 1.0935, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.24130520224571228, | |
| "learning_rate": 3.6224000000000004e-05, | |
| "loss": 1.2369, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 0.36245155334472656, | |
| "learning_rate": 3.5584000000000004e-05, | |
| "loss": 1.0706, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 0.2817525267601013, | |
| "learning_rate": 3.4943999999999996e-05, | |
| "loss": 1.0652, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 0.2730059027671814, | |
| "learning_rate": 3.4304e-05, | |
| "loss": 1.067, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.2850629389286041, | |
| "learning_rate": 3.3664e-05, | |
| "loss": 1.0898, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 0.4017927348613739, | |
| "learning_rate": 3.3024e-05, | |
| "loss": 1.1364, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 0.36761587858200073, | |
| "learning_rate": 3.2384e-05, | |
| "loss": 1.1171, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 0.25396281480789185, | |
| "learning_rate": 3.1744e-05, | |
| "loss": 1.0557, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 0.2851448059082031, | |
| "learning_rate": 3.1104e-05, | |
| "loss": 1.0673, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.2541842758655548, | |
| "learning_rate": 3.0464000000000005e-05, | |
| "loss": 1.1323, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 0.29052332043647766, | |
| "learning_rate": 2.9824e-05, | |
| "loss": 1.114, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 0.2773646414279938, | |
| "learning_rate": 2.9184e-05, | |
| "loss": 1.0973, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 0.28323352336883545, | |
| "learning_rate": 2.8544000000000003e-05, | |
| "loss": 1.1452, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 0.3044739365577698, | |
| "learning_rate": 2.7904000000000003e-05, | |
| "loss": 1.0615, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.2924152612686157, | |
| "learning_rate": 2.7264000000000002e-05, | |
| "loss": 1.1456, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 0.2642782926559448, | |
| "learning_rate": 2.6623999999999998e-05, | |
| "loss": 1.1651, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.3707656264305115, | |
| "learning_rate": 2.5984000000000004e-05, | |
| "loss": 1.1247, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 0.4668145775794983, | |
| "learning_rate": 2.5344e-05, | |
| "loss": 1.1537, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 0.34861406683921814, | |
| "learning_rate": 2.4704000000000003e-05, | |
| "loss": 1.1141, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.2636050283908844, | |
| "learning_rate": 2.4064000000000002e-05, | |
| "loss": 1.0715, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 0.30254754424095154, | |
| "learning_rate": 2.3424e-05, | |
| "loss": 1.108, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 0.298784077167511, | |
| "learning_rate": 2.2784e-05, | |
| "loss": 1.2063, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 0.34004920721054077, | |
| "learning_rate": 2.2144e-05, | |
| "loss": 1.1275, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 0.33883705735206604, | |
| "learning_rate": 2.1504000000000003e-05, | |
| "loss": 1.083, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.30469802021980286, | |
| "learning_rate": 2.0864e-05, | |
| "loss": 1.0508, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 0.36350762844085693, | |
| "learning_rate": 2.0224e-05, | |
| "loss": 1.112, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.27796122431755066, | |
| "learning_rate": 1.9584e-05, | |
| "loss": 1.1022, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 0.2932034134864807, | |
| "learning_rate": 1.8944e-05, | |
| "loss": 1.0319, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 0.394808828830719, | |
| "learning_rate": 1.8304000000000003e-05, | |
| "loss": 1.0978, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.34576427936553955, | |
| "learning_rate": 1.7664e-05, | |
| "loss": 1.1566, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 0.32156696915626526, | |
| "learning_rate": 1.7024e-05, | |
| "loss": 1.1663, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 0.3129923939704895, | |
| "learning_rate": 1.6384e-05, | |
| "loss": 1.0844, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.24763771891593933, | |
| "learning_rate": 1.5744e-05, | |
| "loss": 1.12, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 0.3140289783477783, | |
| "learning_rate": 1.5104000000000001e-05, | |
| "loss": 1.1582, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.2710714340209961, | |
| "learning_rate": 1.4463999999999999e-05, | |
| "loss": 1.0895, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 0.3298426568508148, | |
| "learning_rate": 1.3824e-05, | |
| "loss": 1.0665, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 0.24175623059272766, | |
| "learning_rate": 1.3184000000000001e-05, | |
| "loss": 1.0966, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 0.31446129083633423, | |
| "learning_rate": 1.2544e-05, | |
| "loss": 1.0798, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 0.32403141260147095, | |
| "learning_rate": 1.1904000000000002e-05, | |
| "loss": 1.1143, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.2862265408039093, | |
| "learning_rate": 1.1264000000000001e-05, | |
| "loss": 1.0839, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 0.4909001588821411, | |
| "learning_rate": 1.0624e-05, | |
| "loss": 1.1413, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 0.32599207758903503, | |
| "learning_rate": 9.984e-06, | |
| "loss": 1.1079, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 0.29065415263175964, | |
| "learning_rate": 9.344e-06, | |
| "loss": 1.1232, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 0.33668118715286255, | |
| "learning_rate": 8.704000000000002e-06, | |
| "loss": 1.2002, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.2757526636123657, | |
| "learning_rate": 8.064000000000001e-06, | |
| "loss": 1.1951, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.8776953724928e+17, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |