diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_12": 8.984679698944092, + "ce_loss_17": 8.228514671325684, + "ce_loss_23": 2.923858165740967, + "ce_loss_3": 10.232482433319092, + "ce_loss_6": 9.46618127822876, + "epoch": 0.0001, + "grad_norm": 78848.0, + "kl_loss_12": 12958.4599609375, + "kl_loss_17": 11774.85009765625, + "kl_loss_3": 15033.50439453125, + "kl_loss_6": 13517.923828125, + "learning_rate": 1e-05, + "loss": 13900.9365, + "step": 1 + }, + { + "ce_loss_12": 7.761990600162083, + "ce_loss_17": 6.6741025182935925, + "ce_loss_23": 2.9863624970118203, + "ce_loss_3": 8.87770226266649, + "ce_loss_6": 8.089250379138523, + "epoch": 0.001, + "grad_norm": 15936.0, + "kl_loss_12": 9865.890814887152, + "kl_loss_17": 7934.239854600694, + "kl_loss_3": 11802.4423828125, + "kl_loss_6": 10247.960828993055, + "learning_rate": 0.0001, + "loss": 10005.8681, + "step": 10 + }, + { + "ce_loss_12": 5.701529169082642, + "ce_loss_17": 4.513425433635712, + "ce_loss_23": 2.9900435090065, + "ce_loss_3": 6.997890543937683, + "ce_loss_6": 6.422506928443909, + "epoch": 0.002, + "grad_norm": 5984.0, + "kl_loss_12": 5145.42998046875, + "kl_loss_17": 2980.8785400390625, + "kl_loss_3": 7627.4039306640625, + "kl_loss_6": 6525.088452148438, + "learning_rate": 0.0002, + "loss": 5611.0891, + "step": 20 + }, + { + "ce_loss_12": 4.5624031066894535, + "ce_loss_17": 3.581133687496185, + "ce_loss_23": 2.798030948638916, + "ce_loss_3": 6.184347796440124, + "ce_loss_6": 5.588561630249023, + "epoch": 0.003, + "grad_norm": 4224.0, + "kl_loss_12": 3387.587744140625, + "kl_loss_17": 1482.0941528320313, + "kl_loss_3": 6495.78447265625, + "kl_loss_6": 5366.8233642578125, + "learning_rate": 0.0003, + "loss": 4129.3695, + "step": 30 + }, + { + "ce_loss_12": 4.41057540178299, + "ce_loss_17": 3.541793167591095, + "ce_loss_23": 2.9648167848587037, + "ce_loss_3": 5.861827778816223, + "ce_loss_6": 5.288572502136231, + "epoch": 0.004, + "grad_norm": 2448.0, + "kl_loss_12": 2764.277685546875, + "kl_loss_17": 1081.5157287597656, + "kl_loss_3": 5570.806640625, + "kl_loss_6": 4490.887646484375, + "learning_rate": 0.0004, + "loss": 3491.3078, + "step": 40 + }, + { + "ce_loss_12": 4.215928149223328, + "ce_loss_17": 3.4013169527053835, + "ce_loss_23": 2.9214254140853884, + "ce_loss_3": 5.664847564697266, + "ce_loss_6": 5.066226124763489, + "epoch": 0.005, + "grad_norm": 3184.0, + "kl_loss_12": 2475.8977905273437, + "kl_loss_17": 908.4640960693359, + "kl_loss_3": 5343.652758789062, + "kl_loss_6": 4173.3721923828125, + "learning_rate": 0.0005, + "loss": 3207.8293, + "step": 50 + }, + { + "ce_loss_12": 4.149893510341644, + "ce_loss_17": 3.3778258681297304, + "ce_loss_23": 2.944227957725525, + "ce_loss_3": 5.561229419708252, + "ce_loss_6": 4.903868675231934, + "epoch": 0.006, + "grad_norm": 2784.0, + "kl_loss_12": 2308.4556579589844, + "kl_loss_17": 802.4649719238281, + "kl_loss_3": 5074.807861328125, + "kl_loss_6": 3844.8043823242188, + "learning_rate": 0.0006, + "loss": 3021.1594, + "step": 60 + }, + { + "ce_loss_12": 4.012069857120514, + "ce_loss_17": 3.29058153629303, + "ce_loss_23": 2.8642167270183565, + "ce_loss_3": 5.4240919828414915, + "ce_loss_6": 4.797049546241761, + "epoch": 0.007, + "grad_norm": 3136.0, + "kl_loss_12": 2239.0998779296874, + "kl_loss_17": 821.2534942626953, + "kl_loss_3": 4984.6885986328125, + "kl_loss_6": 3770.4134521484375, + "learning_rate": 0.0007, + "loss": 2951.9617, + "step": 70 + }, + { + "ce_loss_12": 3.902779221534729, + "ce_loss_17": 3.2490451097488404, + "ce_loss_23": 2.865977668762207, + "ce_loss_3": 5.336037421226502, + "ce_loss_6": 4.70197069644928, + "epoch": 0.008, + "grad_norm": 2560.0, + "kl_loss_12": 2005.7831726074219, + "kl_loss_17": 710.9724884033203, + "kl_loss_3": 4829.145092773438, + "kl_loss_6": 3625.3213500976562, + "learning_rate": 0.0008, + "loss": 2813.2012, + "step": 80 + }, + { + "ce_loss_12": 3.8207091331481933, + "ce_loss_17": 3.173818564414978, + "ce_loss_23": 2.8292922019958495, + "ce_loss_3": 5.318088078498841, + "ce_loss_6": 4.644946813583374, + "epoch": 0.009, + "grad_norm": 2512.0, + "kl_loss_12": 1962.1452026367188, + "kl_loss_17": 651.503466796875, + "kl_loss_3": 4869.3822021484375, + "kl_loss_6": 3603.5643798828123, + "learning_rate": 0.0009000000000000001, + "loss": 2764.109, + "step": 90 + }, + { + "ce_loss_12": 3.950306713581085, + "ce_loss_17": 3.2736669540405274, + "ce_loss_23": 2.9367344856262205, + "ce_loss_3": 5.409222531318664, + "ce_loss_6": 4.758617329597473, + "epoch": 0.01, + "grad_norm": 4160.0, + "kl_loss_12": 1967.5855773925782, + "kl_loss_17": 628.142446899414, + "kl_loss_3": 4859.098413085938, + "kl_loss_6": 3599.1250610351562, + "learning_rate": 0.001, + "loss": 2756.1687, + "step": 100 + }, + { + "ce_loss_12": 3.9447910666465758, + "ce_loss_17": 3.267642879486084, + "ce_loss_23": 2.8981249213218687, + "ce_loss_3": 5.361401796340942, + "ce_loss_6": 4.691844916343689, + "epoch": 0.011, + "grad_norm": 5312.0, + "kl_loss_12": 2046.7912536621093, + "kl_loss_17": 695.9502899169922, + "kl_loss_3": 4828.063256835938, + "kl_loss_6": 3559.6369018554688, + "learning_rate": 0.0009999974825027757, + "loss": 2774.8828, + "step": 110 + }, + { + "ce_loss_12": 4.0172742247581485, + "ce_loss_17": 3.3309351563453675, + "ce_loss_23": 2.9565608143806457, + "ce_loss_3": 5.29806661605835, + "ce_loss_6": 4.676268362998963, + "epoch": 0.012, + "grad_norm": 2008.0, + "kl_loss_12": 2119.2089904785157, + "kl_loss_17": 750.8786010742188, + "kl_loss_3": 4602.393579101563, + "kl_loss_6": 3419.493859863281, + "learning_rate": 0.0009999899300364532, + "loss": 2695.0137, + "step": 120 + }, + { + "ce_loss_12": 3.8635333776474, + "ce_loss_17": 3.271331822872162, + "ce_loss_23": 2.918014478683472, + "ce_loss_3": 5.266225385665893, + "ce_loss_6": 4.658939671516419, + "epoch": 0.013, + "grad_norm": 2304.0, + "kl_loss_12": 1867.9003845214843, + "kl_loss_17": 662.0226776123047, + "kl_loss_3": 4594.488525390625, + "kl_loss_6": 3444.8893188476563, + "learning_rate": 0.0009999773426770863, + "loss": 2656.184, + "step": 130 + }, + { + "ce_loss_12": 3.8631237983703612, + "ce_loss_17": 3.289353311061859, + "ce_loss_23": 2.958159697055817, + "ce_loss_3": 5.247981405258178, + "ce_loss_6": 4.653779530525208, + "epoch": 0.014, + "grad_norm": 2464.0, + "kl_loss_12": 1786.0798950195312, + "kl_loss_17": 641.3013549804688, + "kl_loss_3": 4512.30078125, + "kl_loss_6": 3373.8338256835937, + "learning_rate": 0.0009999597205514296, + "loss": 2597.8961, + "step": 140 + }, + { + "ce_loss_12": 3.882954454421997, + "ce_loss_17": 3.241767716407776, + "ce_loss_23": 2.916785204410553, + "ce_loss_3": 5.209557509422302, + "ce_loss_6": 4.584619235992432, + "epoch": 0.015, + "grad_norm": 2448.0, + "kl_loss_12": 1895.7537048339843, + "kl_loss_17": 611.0111389160156, + "kl_loss_3": 4499.608471679688, + "kl_loss_6": 3297.973840332031, + "learning_rate": 0.0009999370638369377, + "loss": 2589.0434, + "step": 150 + }, + { + "ce_loss_12": 3.843744170665741, + "ce_loss_17": 3.2519840955734254, + "ce_loss_23": 2.9549498915672303, + "ce_loss_3": 5.23559000492096, + "ce_loss_6": 4.561180639266968, + "epoch": 0.016, + "grad_norm": 2992.0, + "kl_loss_12": 1756.7025024414063, + "kl_loss_17": 560.4201248168945, + "kl_loss_3": 4481.395788574218, + "kl_loss_6": 3194.0174194335937, + "learning_rate": 0.000999909372761763, + "loss": 2494.0602, + "step": 160 + }, + { + "ce_loss_12": 3.7462135672569277, + "ce_loss_17": 3.1985986948013307, + "ce_loss_23": 2.895840346813202, + "ce_loss_3": 5.146639466285706, + "ce_loss_6": 4.5148893117904665, + "epoch": 0.017, + "grad_norm": 2528.0, + "kl_loss_12": 1707.9720642089844, + "kl_loss_17": 584.8632720947265, + "kl_loss_3": 4474.881689453125, + "kl_loss_6": 3256.7652221679687, + "learning_rate": 0.0009998766476047546, + "loss": 2522.8484, + "step": 170 + }, + { + "ce_loss_12": 3.8042855978012087, + "ce_loss_17": 3.23611958026886, + "ce_loss_23": 2.9328884959220884, + "ce_loss_3": 5.155977535247803, + "ce_loss_6": 4.598589849472046, + "epoch": 0.018, + "grad_norm": 1504.0, + "kl_loss_12": 1726.3201721191406, + "kl_loss_17": 563.3028015136719, + "kl_loss_3": 4367.144787597656, + "kl_loss_6": 3311.9459716796873, + "learning_rate": 0.0009998388886954545, + "loss": 2518.5184, + "step": 180 + }, + { + "ce_loss_12": 3.7386841416358947, + "ce_loss_17": 3.1715746402740477, + "ce_loss_23": 2.901406168937683, + "ce_loss_3": 5.0980103492736815, + "ce_loss_6": 4.514608645439148, + "epoch": 0.019, + "grad_norm": 1640.0, + "kl_loss_12": 1660.0748291015625, + "kl_loss_17": 520.0130477905274, + "kl_loss_3": 4330.074011230468, + "kl_loss_6": 3235.5750732421875, + "learning_rate": 0.0009997960964140947, + "loss": 2429.5939, + "step": 190 + }, + { + "ce_loss_12": 3.7214233040809632, + "ce_loss_17": 3.1937288522720335, + "ce_loss_23": 2.8934134602546693, + "ce_loss_3": 5.083844017982483, + "ce_loss_6": 4.526143264770508, + "epoch": 0.02, + "grad_norm": 1440.0, + "kl_loss_12": 1676.9260620117188, + "kl_loss_17": 579.1263031005859, + "kl_loss_3": 4365.177062988281, + "kl_loss_6": 3291.1619384765627, + "learning_rate": 0.0009997482711915926, + "loss": 2447.6918, + "step": 200 + }, + { + "ce_loss_12": 3.641778492927551, + "ce_loss_17": 3.1578980088233948, + "ce_loss_23": 2.8717514753341673, + "ce_loss_3": 4.998690748214722, + "ce_loss_6": 4.416792297363282, + "epoch": 0.021, + "grad_norm": 1976.0, + "kl_loss_12": 1559.5156188964843, + "kl_loss_17": 547.3028244018554, + "kl_loss_3": 4223.840380859375, + "kl_loss_6": 3111.50791015625, + "learning_rate": 0.0009996954135095479, + "loss": 2355.3707, + "step": 210 + }, + { + "ce_loss_12": 3.7064762353897094, + "ce_loss_17": 3.1981414914131165, + "ce_loss_23": 2.9429712533950805, + "ce_loss_3": 5.015900611877441, + "ce_loss_6": 4.4358889818191525, + "epoch": 0.022, + "grad_norm": 1288.0, + "kl_loss_12": 1532.7710327148438, + "kl_loss_17": 486.04315795898435, + "kl_loss_3": 4124.781677246094, + "kl_loss_6": 3000.6218017578126, + "learning_rate": 0.0009996375239002368, + "loss": 2285.4283, + "step": 220 + }, + { + "ce_loss_12": 3.7420567750930784, + "ce_loss_17": 3.2466408610343933, + "ce_loss_23": 3.009046232700348, + "ce_loss_3": 5.001981043815613, + "ce_loss_6": 4.436567592620849, + "epoch": 0.023, + "grad_norm": 1080.0, + "kl_loss_12": 1472.3418090820312, + "kl_loss_17": 456.5246276855469, + "kl_loss_3": 3961.3736328125, + "kl_loss_6": 2896.7666748046877, + "learning_rate": 0.0009995746029466072, + "loss": 2198.6166, + "step": 230 + }, + { + "ce_loss_12": 3.568900454044342, + "ce_loss_17": 3.0627544045448305, + "ce_loss_23": 2.814389008283615, + "ce_loss_3": 4.937308883666992, + "ce_loss_6": 4.359485399723053, + "epoch": 0.024, + "grad_norm": 1040.0, + "kl_loss_12": 1524.7522888183594, + "kl_loss_17": 474.90843200683594, + "kl_loss_3": 4222.970385742187, + "kl_loss_6": 3120.115588378906, + "learning_rate": 0.0009995066512822719, + "loss": 2266.3066, + "step": 240 + }, + { + "ce_loss_12": 3.64496111869812, + "ce_loss_17": 3.15869255065918, + "ce_loss_23": 2.915254366397858, + "ce_loss_3": 5.050478267669678, + "ce_loss_6": 4.511067032814026, + "epoch": 0.025, + "grad_norm": 1120.0, + "kl_loss_12": 1464.532373046875, + "kl_loss_17": 459.29986267089845, + "kl_loss_3": 4245.069055175782, + "kl_loss_6": 3209.020361328125, + "learning_rate": 0.000999433669591504, + "loss": 2258.5, + "step": 250 + }, + { + "ce_loss_12": 3.5552550554275513, + "ce_loss_17": 3.0593504071235658, + "ce_loss_23": 2.8213512778282164, + "ce_loss_3": 4.910255432128906, + "ce_loss_6": 4.372002005577087, + "epoch": 0.026, + "grad_norm": 1192.0, + "kl_loss_12": 1479.4033325195312, + "kl_loss_17": 456.9223114013672, + "kl_loss_3": 4184.671716308594, + "kl_loss_6": 3139.4490234375, + "learning_rate": 0.000999355658609228, + "loss": 2265.118, + "step": 260 + }, + { + "ce_loss_12": 3.640113389492035, + "ce_loss_17": 3.109154534339905, + "ce_loss_23": 2.844805729389191, + "ce_loss_3": 5.0083172082901, + "ce_loss_6": 4.421030402183533, + "epoch": 0.027, + "grad_norm": 1504.0, + "kl_loss_12": 1555.0505676269531, + "kl_loss_17": 488.3232131958008, + "kl_loss_3": 4257.785607910157, + "kl_loss_6": 3140.6220458984376, + "learning_rate": 0.0009992726191210138, + "loss": 2321.6281, + "step": 270 + }, + { + "ce_loss_12": 3.689191257953644, + "ce_loss_17": 3.173827016353607, + "ce_loss_23": 2.883969247341156, + "ce_loss_3": 4.926087856292725, + "ce_loss_6": 4.361985659599304, + "epoch": 0.028, + "grad_norm": 1472.0, + "kl_loss_12": 1613.7977416992187, + "kl_loss_17": 560.2190643310547, + "kl_loss_3": 4040.1806396484376, + "kl_loss_6": 2963.1459350585938, + "learning_rate": 0.0009991845519630679, + "loss": 2273.5676, + "step": 280 + }, + { + "ce_loss_12": 3.528163266181946, + "ce_loss_17": 3.0618820190429688, + "ce_loss_23": 2.777234375476837, + "ce_loss_3": 4.792423892021179, + "ce_loss_6": 4.247742331027984, + "epoch": 0.029, + "grad_norm": 1288.0, + "kl_loss_12": 1522.0548034667968, + "kl_loss_17": 556.7666839599609, + "kl_loss_3": 4016.4250610351564, + "kl_loss_6": 2974.031591796875, + "learning_rate": 0.0009990914580222257, + "loss": 2261.3219, + "step": 290 + }, + { + "ce_loss_12": 3.605647420883179, + "ce_loss_17": 3.1652499079704284, + "ce_loss_23": 2.907702088356018, + "ce_loss_3": 4.834259033203125, + "ce_loss_6": 4.3232193946838375, + "epoch": 0.03, + "grad_norm": 1456.0, + "kl_loss_12": 1414.5411987304688, + "kl_loss_17": 479.91837005615236, + "kl_loss_3": 3866.831677246094, + "kl_loss_6": 2886.284387207031, + "learning_rate": 0.0009989933382359422, + "loss": 2207.8055, + "step": 300 + }, + { + "ce_loss_12": 3.5743016839027404, + "ce_loss_17": 3.1409225940704344, + "ce_loss_23": 2.921179413795471, + "ce_loss_3": 4.825497627258301, + "ce_loss_6": 4.308507108688355, + "epoch": 0.031, + "grad_norm": 1016.0, + "kl_loss_12": 1328.5848266601563, + "kl_loss_17": 429.2651168823242, + "kl_loss_3": 3800.1288330078123, + "kl_loss_6": 2814.2914916992186, + "learning_rate": 0.0009988901935922825, + "loss": 2121.6258, + "step": 310 + }, + { + "ce_loss_12": 3.4775821685791017, + "ce_loss_17": 2.9991234064102175, + "ce_loss_23": 2.775858294963837, + "ce_loss_3": 4.808964085578919, + "ce_loss_6": 4.253057157993316, + "epoch": 0.032, + "grad_norm": 1120.0, + "kl_loss_12": 1430.71904296875, + "kl_loss_17": 436.44429473876954, + "kl_loss_3": 4076.3151733398436, + "kl_loss_6": 3015.4330078125, + "learning_rate": 0.0009987820251299122, + "loss": 2176.4867, + "step": 320 + }, + { + "ce_loss_12": 3.56341667175293, + "ce_loss_17": 3.1153499126434325, + "ce_loss_23": 2.8936220049858092, + "ce_loss_3": 4.80550901889801, + "ce_loss_6": 4.265205645561219, + "epoch": 0.033, + "grad_norm": 1024.0, + "kl_loss_12": 1371.9511108398438, + "kl_loss_17": 423.1871704101562, + "kl_loss_3": 3859.571325683594, + "kl_loss_6": 2808.06484375, + "learning_rate": 0.0009986688339380862, + "loss": 2110.959, + "step": 330 + }, + { + "ce_loss_12": 3.4845592975616455, + "ce_loss_17": 3.065734314918518, + "ce_loss_23": 2.8524131059646605, + "ce_loss_3": 4.71535587310791, + "ce_loss_6": 4.1842347383499146, + "epoch": 0.034, + "grad_norm": 1008.0, + "kl_loss_12": 1298.608740234375, + "kl_loss_17": 409.944596862793, + "kl_loss_3": 3721.3015625, + "kl_loss_6": 2715.799694824219, + "learning_rate": 0.0009985506211566387, + "loss": 2065.8744, + "step": 340 + }, + { + "ce_loss_12": 3.5067627310752867, + "ce_loss_17": 3.0808719635009765, + "ce_loss_23": 2.8782512664794924, + "ce_loss_3": 4.718695211410522, + "ce_loss_6": 4.1888648748397825, + "epoch": 0.035, + "grad_norm": 1416.0, + "kl_loss_12": 1294.1714233398438, + "kl_loss_17": 393.9737945556641, + "kl_loss_3": 3701.8514770507813, + "kl_loss_6": 2681.097619628906, + "learning_rate": 0.0009984273879759713, + "loss": 2026.5016, + "step": 350 + }, + { + "ce_loss_12": 3.592199468612671, + "ce_loss_17": 3.1149800181388856, + "ce_loss_23": 2.9053641080856325, + "ce_loss_3": 4.82613046169281, + "ce_loss_6": 4.262071800231934, + "epoch": 0.036, + "grad_norm": 1040.0, + "kl_loss_12": 1391.2776611328125, + "kl_loss_17": 406.1607604980469, + "kl_loss_3": 3815.022607421875, + "kl_loss_6": 2743.4274780273436, + "learning_rate": 0.0009982991356370402, + "loss": 2114.5828, + "step": 360 + }, + { + "ce_loss_12": 3.5843002557754517, + "ce_loss_17": 3.0865729212760926, + "ce_loss_23": 2.884261870384216, + "ce_loss_3": 4.74839928150177, + "ce_loss_6": 4.211460220813751, + "epoch": 0.037, + "grad_norm": 1464.0, + "kl_loss_12": 1406.7499267578125, + "kl_loss_17": 386.5169387817383, + "kl_loss_3": 3740.9017333984375, + "kl_loss_6": 2711.004187011719, + "learning_rate": 0.0009981658654313456, + "loss": 2078.2295, + "step": 370 + }, + { + "ce_loss_12": 3.6554232001304627, + "ce_loss_17": 3.1450827717781067, + "ce_loss_23": 2.9506700158119203, + "ce_loss_3": 4.768363070487976, + "ce_loss_6": 4.236017107963562, + "epoch": 0.038, + "grad_norm": 1320.0, + "kl_loss_12": 1433.2474243164063, + "kl_loss_17": 370.44485626220705, + "kl_loss_3": 3631.6053466796875, + "kl_loss_6": 2617.8675903320313, + "learning_rate": 0.000998027578700917, + "loss": 2042.9258, + "step": 380 + }, + { + "ce_loss_12": 3.5732706069946287, + "ce_loss_17": 3.1007129430770872, + "ce_loss_23": 2.902851903438568, + "ce_loss_3": 4.731427311897278, + "ce_loss_6": 4.200420486927032, + "epoch": 0.039, + "grad_norm": 1012.0, + "kl_loss_12": 1337.565850830078, + "kl_loss_17": 372.07286834716797, + "kl_loss_3": 3656.5489135742187, + "kl_loss_6": 2639.2027587890625, + "learning_rate": 0.0009978842768382998, + "loss": 2025.0105, + "step": 390 + }, + { + "ce_loss_12": 3.5375619530677795, + "ce_loss_17": 3.0942453861236574, + "ce_loss_23": 2.9124602317810058, + "ce_loss_3": 4.689311552047729, + "ce_loss_6": 4.180230748653412, + "epoch": 0.04, + "grad_norm": 860.0, + "kl_loss_12": 1273.6886169433594, + "kl_loss_17": 354.69336090087893, + "kl_loss_3": 3551.566271972656, + "kl_loss_6": 2586.4600219726562, + "learning_rate": 0.0009977359612865424, + "loss": 1957.9037, + "step": 400 + }, + { + "ce_loss_12": 3.556220817565918, + "ce_loss_17": 3.12451456785202, + "ce_loss_23": 2.9229920387268065, + "ce_loss_3": 4.728884696960449, + "ce_loss_6": 4.222829926013946, + "epoch": 0.041, + "grad_norm": 972.0, + "kl_loss_12": 1299.3871643066407, + "kl_loss_17": 383.50810546875, + "kl_loss_3": 3631.7089721679686, + "kl_loss_6": 2658.399853515625, + "learning_rate": 0.0009975826335391806, + "loss": 1974.6064, + "step": 410 + }, + { + "ce_loss_12": 3.543349266052246, + "ce_loss_17": 3.154525864124298, + "ce_loss_23": 2.9399210810661316, + "ce_loss_3": 4.703366541862488, + "ce_loss_6": 4.223559427261352, + "epoch": 0.042, + "grad_norm": 948.0, + "kl_loss_12": 1242.3375122070313, + "kl_loss_17": 414.13100280761716, + "kl_loss_3": 3557.8230224609374, + "kl_loss_6": 2619.29794921875, + "learning_rate": 0.0009974242951402235, + "loss": 1966.6234, + "step": 420 + }, + { + "ce_loss_12": 3.566603493690491, + "ce_loss_17": 3.158625400066376, + "ce_loss_23": 2.9416210055351257, + "ce_loss_3": 4.736204338073731, + "ce_loss_6": 4.223373401165008, + "epoch": 0.043, + "grad_norm": 968.0, + "kl_loss_12": 1265.5928588867187, + "kl_loss_17": 413.29832458496094, + "kl_loss_3": 3618.9629150390624, + "kl_loss_6": 2619.972277832031, + "learning_rate": 0.0009972609476841367, + "loss": 1957.9562, + "step": 430 + }, + { + "ce_loss_12": 3.4807218074798585, + "ce_loss_17": 3.069835388660431, + "ce_loss_23": 2.863972246646881, + "ce_loss_3": 4.696232414245605, + "ce_loss_6": 4.172122418880463, + "epoch": 0.044, + "grad_norm": 968.0, + "kl_loss_12": 1249.1537475585938, + "kl_loss_17": 389.73670196533203, + "kl_loss_3": 3656.73857421875, + "kl_loss_6": 2652.6269409179686, + "learning_rate": 0.0009970925928158272, + "loss": 1989.065, + "step": 440 + }, + { + "ce_loss_12": 3.4308357000350953, + "ce_loss_17": 3.0189342141151427, + "ce_loss_23": 2.8129969954490663, + "ce_loss_3": 4.649441719055176, + "ce_loss_6": 4.141774463653564, + "epoch": 0.045, + "grad_norm": 1048.0, + "kl_loss_12": 1252.9172058105469, + "kl_loss_17": 389.16773986816406, + "kl_loss_3": 3700.7037109375, + "kl_loss_6": 2713.5445190429687, + "learning_rate": 0.000996919232230627, + "loss": 2001.5621, + "step": 450 + }, + { + "ce_loss_12": 3.469963800907135, + "ce_loss_17": 3.0809281349182127, + "ce_loss_23": 2.898935317993164, + "ce_loss_3": 4.651313662528992, + "ce_loss_6": 4.148461163043976, + "epoch": 0.046, + "grad_norm": 1032.0, + "kl_loss_12": 1198.188165283203, + "kl_loss_17": 359.42144775390625, + "kl_loss_3": 3564.2315307617187, + "kl_loss_6": 2599.220666503906, + "learning_rate": 0.0009967408676742752, + "loss": 1892.0246, + "step": 460 + }, + { + "ce_loss_12": 3.609225368499756, + "ce_loss_17": 3.2187411546707154, + "ce_loss_23": 3.031229591369629, + "ce_loss_3": 4.750286006927491, + "ce_loss_6": 4.243654370307922, + "epoch": 0.047, + "grad_norm": 872.0, + "kl_loss_12": 1202.05654296875, + "kl_loss_17": 362.01676483154296, + "kl_loss_3": 3482.6361572265623, + "kl_loss_6": 2512.702587890625, + "learning_rate": 0.0009965575009429006, + "loss": 1944.2992, + "step": 470 + }, + { + "ce_loss_12": 3.4212846636772154, + "ce_loss_17": 3.0093454837799074, + "ce_loss_23": 2.818720281124115, + "ce_loss_3": 4.620055222511292, + "ce_loss_6": 4.106726181507111, + "epoch": 0.048, + "grad_norm": 912.0, + "kl_loss_12": 1227.6209716796875, + "kl_loss_17": 358.4106140136719, + "kl_loss_3": 3604.5172119140625, + "kl_loss_6": 2607.6687866210937, + "learning_rate": 0.0009963691338830043, + "loss": 1925.5033, + "step": 480 + }, + { + "ce_loss_12": 3.4801173567771913, + "ce_loss_17": 3.0850799679756165, + "ce_loss_23": 2.9135042071342467, + "ce_loss_3": 4.646547818183899, + "ce_loss_6": 4.14217598438263, + "epoch": 0.049, + "grad_norm": 988.0, + "kl_loss_12": 1183.8849060058594, + "kl_loss_17": 336.66559143066405, + "kl_loss_3": 3537.405712890625, + "kl_loss_6": 2554.581188964844, + "learning_rate": 0.0009961757683914405, + "loss": 1886.1242, + "step": 490 + }, + { + "ce_loss_12": 3.499699795246124, + "ce_loss_17": 3.0934208989143372, + "ce_loss_23": 2.9036751270294188, + "ce_loss_3": 4.612337923049926, + "ce_loss_6": 4.11277426481247, + "epoch": 0.05, + "grad_norm": 916.0, + "kl_loss_12": 1241.4071655273438, + "kl_loss_17": 359.36790618896487, + "kl_loss_3": 3465.6534057617187, + "kl_loss_6": 2505.8146850585936, + "learning_rate": 0.0009959774064153978, + "loss": 1906.5914, + "step": 500 + }, + { + "ce_loss_12": 3.4712071061134337, + "ce_loss_17": 3.0930668592453, + "ce_loss_23": 2.917718541622162, + "ce_loss_3": 4.59089343547821, + "ce_loss_6": 4.090043306350708, + "epoch": 0.051, + "grad_norm": 1056.0, + "kl_loss_12": 1155.3846740722656, + "kl_loss_17": 342.849055480957, + "kl_loss_3": 3386.1743286132814, + "kl_loss_6": 2427.04423828125, + "learning_rate": 0.0009957740499523787, + "loss": 1867.041, + "step": 510 + }, + { + "ce_loss_12": 3.4931976437568664, + "ce_loss_17": 3.1047329187393187, + "ce_loss_23": 2.929399383068085, + "ce_loss_3": 4.618403267860413, + "ce_loss_6": 4.116227996349335, + "epoch": 0.052, + "grad_norm": 1024.0, + "kl_loss_12": 1160.737646484375, + "kl_loss_17": 340.6015426635742, + "kl_loss_3": 3408.1232788085936, + "kl_loss_6": 2438.4163452148437, + "learning_rate": 0.0009955657010501807, + "loss": 1855.0543, + "step": 520 + }, + { + "ce_loss_12": 3.4520660042762756, + "ce_loss_17": 3.0659743189811706, + "ce_loss_23": 2.884491264820099, + "ce_loss_3": 4.6060174942016605, + "ce_loss_6": 4.099496901035309, + "epoch": 0.053, + "grad_norm": 980.0, + "kl_loss_12": 1162.1714721679687, + "kl_loss_17": 337.9858093261719, + "kl_loss_3": 3480.6664428710938, + "kl_loss_6": 2493.27978515625, + "learning_rate": 0.000995352361806875, + "loss": 1855.2234, + "step": 530 + }, + { + "ce_loss_12": 3.491305911540985, + "ce_loss_17": 3.1029157042503357, + "ce_loss_23": 2.926457929611206, + "ce_loss_3": 4.627124905586243, + "ce_loss_6": 4.126366591453552, + "epoch": 0.054, + "grad_norm": 1176.0, + "kl_loss_12": 1182.6985778808594, + "kl_loss_17": 343.03696899414064, + "kl_loss_3": 3461.5344970703127, + "kl_loss_6": 2494.033410644531, + "learning_rate": 0.0009951340343707852, + "loss": 1879.4984, + "step": 540 + }, + { + "ce_loss_12": 3.5298919558525084, + "ce_loss_17": 3.1447739481925963, + "ce_loss_23": 2.974556732177734, + "ce_loss_3": 4.683878397941589, + "ce_loss_6": 4.181302392482758, + "epoch": 0.055, + "grad_norm": 928.0, + "kl_loss_12": 1132.3488525390626, + "kl_loss_17": 321.0900192260742, + "kl_loss_3": 3452.663916015625, + "kl_loss_6": 2489.4758911132812, + "learning_rate": 0.0009949107209404665, + "loss": 1865.5023, + "step": 550 + }, + { + "ce_loss_12": 3.437225317955017, + "ce_loss_17": 3.0596383571624757, + "ce_loss_23": 2.89289790391922, + "ce_loss_3": 4.561627984046936, + "ce_loss_6": 4.073839998245239, + "epoch": 0.056, + "grad_norm": 1056.0, + "kl_loss_12": 1127.2184448242188, + "kl_loss_17": 320.5839477539063, + "kl_loss_3": 3384.0282348632813, + "kl_loss_6": 2438.0457153320312, + "learning_rate": 0.0009946824237646824, + "loss": 1824.1559, + "step": 560 + }, + { + "ce_loss_12": 3.403595268726349, + "ce_loss_17": 3.0332306146621706, + "ce_loss_23": 2.8478466033935548, + "ce_loss_3": 4.554531359672547, + "ce_loss_6": 4.0524480700492855, + "epoch": 0.057, + "grad_norm": 1080.0, + "kl_loss_12": 1152.7150085449218, + "kl_loss_17": 354.2860137939453, + "kl_loss_3": 3450.8307495117188, + "kl_loss_6": 2486.5100708007812, + "learning_rate": 0.0009944491451423828, + "loss": 1890.2486, + "step": 570 + }, + { + "ce_loss_12": 3.4272465348243712, + "ce_loss_17": 3.028773379325867, + "ce_loss_23": 2.8409266591072084, + "ce_loss_3": 4.570419478416443, + "ce_loss_6": 4.065177631378174, + "epoch": 0.058, + "grad_norm": 976.0, + "kl_loss_12": 1192.8691772460938, + "kl_loss_17": 360.36272583007815, + "kl_loss_3": 3500.0014404296876, + "kl_loss_6": 2518.4212158203127, + "learning_rate": 0.0009942108874226813, + "loss": 1853.7166, + "step": 580 + }, + { + "ce_loss_12": 3.4888222932815554, + "ce_loss_17": 3.1163946747779847, + "ce_loss_23": 2.9453216910362245, + "ce_loss_3": 4.574800729751587, + "ce_loss_6": 4.082886636257172, + "epoch": 0.059, + "grad_norm": 1080.0, + "kl_loss_12": 1124.1027862548829, + "kl_loss_17": 326.6534820556641, + "kl_loss_3": 3309.919189453125, + "kl_loss_6": 2351.59560546875, + "learning_rate": 0.00099396765300483, + "loss": 1774.2744, + "step": 590 + }, + { + "ce_loss_12": 3.4706664443016053, + "ce_loss_17": 3.1086565136909483, + "ce_loss_23": 2.9299388766288756, + "ce_loss_3": 4.5702659606933596, + "ce_loss_6": 4.071904695034027, + "epoch": 0.06, + "grad_norm": 1088.0, + "kl_loss_12": 1110.3434112548828, + "kl_loss_17": 339.3138854980469, + "kl_loss_3": 3331.552526855469, + "kl_loss_6": 2361.865966796875, + "learning_rate": 0.0009937194443381972, + "loss": 1791.5111, + "step": 600 + }, + { + "ce_loss_12": 3.4855493545532226, + "ce_loss_17": 3.1413748025894166, + "ce_loss_23": 2.9650806307792665, + "ce_loss_3": 4.578087067604065, + "ce_loss_6": 4.082219207286835, + "epoch": 0.061, + "grad_norm": 1088.0, + "kl_loss_12": 1089.8639190673828, + "kl_loss_17": 342.9082061767578, + "kl_loss_3": 3284.514855957031, + "kl_loss_6": 2323.494982910156, + "learning_rate": 0.0009934662639222412, + "loss": 1806.4406, + "step": 610 + }, + { + "ce_loss_12": 3.451452672481537, + "ce_loss_17": 3.109323561191559, + "ce_loss_23": 2.9135228753089906, + "ce_loss_3": 4.588198184967041, + "ce_loss_6": 4.1074150681495665, + "epoch": 0.062, + "grad_norm": 1200.0, + "kl_loss_12": 1131.1844268798827, + "kl_loss_17": 387.26836547851565, + "kl_loss_3": 3419.529333496094, + "kl_loss_6": 2492.799841308594, + "learning_rate": 0.000993208114306486, + "loss": 1842.5354, + "step": 620 + }, + { + "ce_loss_12": 3.382601237297058, + "ce_loss_17": 3.0407912492752076, + "ce_loss_23": 2.843443822860718, + "ce_loss_3": 4.523557734489441, + "ce_loss_6": 4.0556340456008915, + "epoch": 0.063, + "grad_norm": 972.0, + "kl_loss_12": 1130.2154022216796, + "kl_loss_17": 385.3479629516602, + "kl_loss_3": 3416.6129150390625, + "kl_loss_6": 2508.2321899414064, + "learning_rate": 0.0009929449980904952, + "loss": 1818.0863, + "step": 630 + }, + { + "ce_loss_12": 3.425255465507507, + "ce_loss_17": 3.0779421091079713, + "ce_loss_23": 2.901267647743225, + "ce_loss_3": 4.548525047302246, + "ce_loss_6": 4.069075584411621, + "epoch": 0.064, + "grad_norm": 908.0, + "kl_loss_12": 1087.3882110595703, + "kl_loss_17": 342.4493881225586, + "kl_loss_3": 3351.716906738281, + "kl_loss_6": 2424.8897338867187, + "learning_rate": 0.0009926769179238466, + "loss": 1782.8191, + "step": 640 + }, + { + "ce_loss_12": 3.482888376712799, + "ce_loss_17": 3.113267958164215, + "ce_loss_23": 2.9342373251914977, + "ce_loss_3": 4.577497673034668, + "ce_loss_6": 4.098145663738251, + "epoch": 0.065, + "grad_norm": 1192.0, + "kl_loss_12": 1132.436993408203, + "kl_loss_17": 342.9522430419922, + "kl_loss_3": 3347.3211303710937, + "kl_loss_6": 2419.1859619140623, + "learning_rate": 0.000992403876506104, + "loss": 1802.1236, + "step": 650 + }, + { + "ce_loss_12": 3.398992598056793, + "ce_loss_17": 3.046174943447113, + "ce_loss_23": 2.8785677909851075, + "ce_loss_3": 4.516813969612121, + "ce_loss_6": 4.031654417514801, + "epoch": 0.066, + "grad_norm": 1008.0, + "kl_loss_12": 1100.7386840820313, + "kl_loss_17": 323.626286315918, + "kl_loss_3": 3347.7080810546877, + "kl_loss_6": 2399.8117553710936, + "learning_rate": 0.0009921258765867918, + "loss": 1783.9176, + "step": 660 + }, + { + "ce_loss_12": 3.3697550654411317, + "ce_loss_17": 3.0111876368522643, + "ce_loss_23": 2.8513786673545836, + "ce_loss_3": 4.516186356544495, + "ce_loss_6": 4.030804753303528, + "epoch": 0.067, + "grad_norm": 1008.0, + "kl_loss_12": 1083.3695251464844, + "kl_loss_17": 314.67342681884764, + "kl_loss_3": 3411.7294799804686, + "kl_loss_6": 2445.235070800781, + "learning_rate": 0.0009918429209653662, + "loss": 1784.7109, + "step": 670 + }, + { + "ce_loss_12": 3.4050143241882322, + "ce_loss_17": 3.0547261357307436, + "ce_loss_23": 2.898942744731903, + "ce_loss_3": 4.546445536613464, + "ce_loss_6": 4.047844111919403, + "epoch": 0.068, + "grad_norm": 1064.0, + "kl_loss_12": 1068.5941101074218, + "kl_loss_17": 307.52843170166017, + "kl_loss_3": 3371.168798828125, + "kl_loss_6": 2397.2773681640624, + "learning_rate": 0.0009915550124911866, + "loss": 1744.7219, + "step": 680 + }, + { + "ce_loss_12": 3.412628734111786, + "ce_loss_17": 3.0702058911323546, + "ce_loss_23": 2.9016688108444213, + "ce_loss_3": 4.517101573944092, + "ce_loss_6": 4.032331478595734, + "epoch": 0.069, + "grad_norm": 1072.0, + "kl_loss_12": 1052.8324340820313, + "kl_loss_17": 316.4966979980469, + "kl_loss_3": 3269.573693847656, + "kl_loss_6": 2327.179064941406, + "learning_rate": 0.0009912621540634887, + "loss": 1747.8387, + "step": 690 + }, + { + "ce_loss_12": 3.428032374382019, + "ce_loss_17": 3.0963001370429994, + "ce_loss_23": 2.941784918308258, + "ce_loss_3": 4.498676443099976, + "ce_loss_6": 4.022723865509033, + "epoch": 0.07, + "grad_norm": 920.0, + "kl_loss_12": 1022.0572509765625, + "kl_loss_17": 298.4063545227051, + "kl_loss_3": 3197.440222167969, + "kl_loss_6": 2262.5399536132813, + "learning_rate": 0.0009909643486313534, + "loss": 1711.5582, + "step": 700 + }, + { + "ce_loss_12": 3.3527217149734496, + "ce_loss_17": 3.0008968114852905, + "ce_loss_23": 2.842929780483246, + "ce_loss_3": 4.486680507659912, + "ce_loss_6": 3.979134178161621, + "epoch": 0.071, + "grad_norm": 1040.0, + "kl_loss_12": 1057.8290649414062, + "kl_loss_17": 299.24674224853516, + "kl_loss_3": 3346.7761352539064, + "kl_loss_6": 2364.9537719726563, + "learning_rate": 0.000990661599193678, + "loss": 1793.3203, + "step": 710 + }, + { + "ce_loss_12": 3.446326529979706, + "ce_loss_17": 3.1024960041046143, + "ce_loss_23": 2.946468675136566, + "ce_loss_3": 4.533236646652222, + "ce_loss_6": 4.060657846927643, + "epoch": 0.072, + "grad_norm": 1288.0, + "kl_loss_12": 1036.418603515625, + "kl_loss_17": 292.1839904785156, + "kl_loss_3": 3216.317138671875, + "kl_loss_6": 2302.488073730469, + "learning_rate": 0.0009903539087991462, + "loss": 1718.2531, + "step": 720 + }, + { + "ce_loss_12": 3.4354040145874025, + "ce_loss_17": 3.084610116481781, + "ce_loss_23": 2.933627998828888, + "ce_loss_3": 4.517619776725769, + "ce_loss_6": 4.039964389801026, + "epoch": 0.073, + "grad_norm": 968.0, + "kl_loss_12": 1051.909390258789, + "kl_loss_17": 294.1747932434082, + "kl_loss_3": 3248.3892211914062, + "kl_loss_6": 2321.465185546875, + "learning_rate": 0.0009900412805461966, + "loss": 1732.8176, + "step": 730 + }, + { + "ce_loss_12": 3.482883536815643, + "ce_loss_17": 3.142165744304657, + "ce_loss_23": 2.997369420528412, + "ce_loss_3": 4.546565818786621, + "ce_loss_6": 4.065502631664276, + "epoch": 0.074, + "grad_norm": 1024.0, + "kl_loss_12": 1026.4333953857422, + "kl_loss_17": 281.98936614990237, + "kl_loss_3": 3187.9749267578127, + "kl_loss_6": 2253.8543701171875, + "learning_rate": 0.0009897237175829927, + "loss": 1710.5418, + "step": 740 + }, + { + "ce_loss_12": 3.39098174571991, + "ce_loss_17": 3.0333335757255555, + "ce_loss_23": 2.8873408675193786, + "ce_loss_3": 4.4851244449615475, + "ce_loss_6": 4.002402055263519, + "epoch": 0.075, + "grad_norm": 1008.0, + "kl_loss_12": 1060.9873962402344, + "kl_loss_17": 287.43994064331054, + "kl_loss_3": 3271.13193359375, + "kl_loss_6": 2322.4891357421875, + "learning_rate": 0.0009894012231073895, + "loss": 1722.3041, + "step": 750 + }, + { + "ce_loss_12": 3.424803137779236, + "ce_loss_17": 3.0802384853363036, + "ce_loss_23": 2.931328672170639, + "ce_loss_3": 4.510021328926086, + "ce_loss_6": 4.023956918716431, + "epoch": 0.076, + "grad_norm": 964.0, + "kl_loss_12": 1030.1895233154296, + "kl_loss_17": 281.4108459472656, + "kl_loss_3": 3218.834765625, + "kl_loss_6": 2276.3154174804686, + "learning_rate": 0.0009890738003669028, + "loss": 1720.393, + "step": 760 + }, + { + "ce_loss_12": 3.4032132148742678, + "ce_loss_17": 3.055464744567871, + "ce_loss_23": 2.904374885559082, + "ce_loss_3": 4.515720820426941, + "ce_loss_6": 4.020367431640625, + "epoch": 0.077, + "grad_norm": 1032.0, + "kl_loss_12": 1052.8801330566407, + "kl_loss_17": 296.8541961669922, + "kl_loss_3": 3311.7972900390623, + "kl_loss_6": 2353.563269042969, + "learning_rate": 0.0009887414526586764, + "loss": 1706.2574, + "step": 770 + }, + { + "ce_loss_12": 3.431311821937561, + "ce_loss_17": 3.097555434703827, + "ce_loss_23": 2.953240728378296, + "ce_loss_3": 4.522035431861878, + "ce_loss_6": 4.0329923868179325, + "epoch": 0.078, + "grad_norm": 1408.0, + "kl_loss_12": 1007.6760467529297, + "kl_loss_17": 283.70689926147463, + "kl_loss_3": 3212.4423217773438, + "kl_loss_6": 2261.5300170898436, + "learning_rate": 0.0009884041833294476, + "loss": 1665.1531, + "step": 780 + }, + { + "ce_loss_12": 3.435549521446228, + "ce_loss_17": 3.1065102934837343, + "ce_loss_23": 2.9610527753829956, + "ce_loss_3": 4.498700094223023, + "ce_loss_6": 4.023032128810883, + "epoch": 0.079, + "grad_norm": 1144.0, + "kl_loss_12": 993.2736663818359, + "kl_loss_17": 279.1837074279785, + "kl_loss_3": 3154.5905639648436, + "kl_loss_6": 2226.1057067871093, + "learning_rate": 0.000988061995775515, + "loss": 1715.5586, + "step": 790 + }, + { + "ce_loss_12": 3.3710269212722777, + "ce_loss_17": 3.043732559680939, + "ce_loss_23": 2.8962334394454956, + "ce_loss_3": 4.4325767993927006, + "ce_loss_6": 3.9653960943222044, + "epoch": 0.08, + "grad_norm": 968.0, + "kl_loss_12": 1007.3277313232422, + "kl_loss_17": 288.94420166015624, + "kl_loss_3": 3166.8593994140624, + "kl_loss_6": 2252.780651855469, + "learning_rate": 0.0009877148934427035, + "loss": 1677.0441, + "step": 800 + }, + { + "ce_loss_12": 3.4022730112075807, + "ce_loss_17": 3.0723152995109557, + "ce_loss_23": 2.932292175292969, + "ce_loss_3": 4.493832111358643, + "ce_loss_6": 4.0058788895607, + "epoch": 0.081, + "grad_norm": 1004.0, + "kl_loss_12": 995.3911376953125, + "kl_loss_17": 270.32576446533204, + "kl_loss_3": 3203.4495239257812, + "kl_loss_6": 2255.8701599121096, + "learning_rate": 0.0009873628798263297, + "loss": 1663.8143, + "step": 810 + }, + { + "ce_loss_12": 3.3567355275154114, + "ce_loss_17": 3.032731032371521, + "ce_loss_23": 2.8951220154762267, + "ce_loss_3": 4.419554686546325, + "ce_loss_6": 3.9450634121894836, + "epoch": 0.082, + "grad_norm": 924.0, + "kl_loss_12": 981.8002044677735, + "kl_loss_17": 268.83979949951174, + "kl_loss_3": 3129.752062988281, + "kl_loss_6": 2206.8885803222656, + "learning_rate": 0.0009870059584711668, + "loss": 1689.6277, + "step": 820 + }, + { + "ce_loss_12": 3.3853783130645754, + "ce_loss_17": 3.0407726764678955, + "ce_loss_23": 2.900382101535797, + "ce_loss_3": 4.434931969642639, + "ce_loss_6": 3.966839051246643, + "epoch": 0.083, + "grad_norm": 1040.0, + "kl_loss_12": 992.5730743408203, + "kl_loss_17": 269.47634811401366, + "kl_loss_3": 3136.5814453125, + "kl_loss_6": 2225.992724609375, + "learning_rate": 0.000986644132971409, + "loss": 1660.5869, + "step": 830 + }, + { + "ce_loss_12": 3.3898305535316466, + "ce_loss_17": 3.033678078651428, + "ce_loss_23": 2.88665372133255, + "ce_loss_3": 4.466124033927917, + "ce_loss_6": 3.992603433132172, + "epoch": 0.084, + "grad_norm": 1024.0, + "kl_loss_12": 1025.6287628173827, + "kl_loss_17": 276.9879234313965, + "kl_loss_3": 3219.1418579101564, + "kl_loss_6": 2285.3357543945312, + "learning_rate": 0.0009862774069706345, + "loss": 1684.185, + "step": 840 + }, + { + "ce_loss_12": 3.4720041871070864, + "ce_loss_17": 3.152042555809021, + "ce_loss_23": 3.0148459553718565, + "ce_loss_3": 4.498300981521607, + "ce_loss_6": 4.030324721336365, + "epoch": 0.085, + "grad_norm": 1088.0, + "kl_loss_12": 985.2348510742188, + "kl_loss_17": 273.5748977661133, + "kl_loss_3": 3094.7895751953124, + "kl_loss_6": 2177.141778564453, + "learning_rate": 0.000985905784161771, + "loss": 1652.6162, + "step": 850 + }, + { + "ce_loss_12": 3.398676311969757, + "ce_loss_17": 3.0831673979759215, + "ce_loss_23": 2.9472405195236204, + "ce_loss_3": 4.458824563026428, + "ce_loss_6": 3.9770350813865663, + "epoch": 0.086, + "grad_norm": 1096.0, + "kl_loss_12": 965.9356964111328, + "kl_loss_17": 269.42513885498045, + "kl_loss_3": 3123.1033569335937, + "kl_loss_6": 2187.9228393554686, + "learning_rate": 0.000985529268287055, + "loss": 1627.1362, + "step": 860 + }, + { + "ce_loss_12": 3.3542402386665344, + "ce_loss_17": 3.0211271166801454, + "ce_loss_23": 2.8778584122657778, + "ce_loss_3": 4.438820695877075, + "ce_loss_6": 3.953563117980957, + "epoch": 0.087, + "grad_norm": 1012.0, + "kl_loss_12": 994.7591278076172, + "kl_loss_17": 277.0214416503906, + "kl_loss_3": 3177.5056396484374, + "kl_loss_6": 2236.740026855469, + "learning_rate": 0.0009851478631379982, + "loss": 1673.2793, + "step": 870 + }, + { + "ce_loss_12": 3.4064401268959044, + "ce_loss_17": 3.085164284706116, + "ce_loss_23": 2.9355137586593627, + "ce_loss_3": 4.463466334342956, + "ce_loss_6": 3.9813445806503296, + "epoch": 0.088, + "grad_norm": 1064.0, + "kl_loss_12": 984.8004760742188, + "kl_loss_17": 279.44955215454104, + "kl_loss_3": 3143.6706420898436, + "kl_loss_6": 2194.5278381347657, + "learning_rate": 0.0009847615725553456, + "loss": 1651.8555, + "step": 880 + }, + { + "ce_loss_12": 3.418182611465454, + "ce_loss_17": 3.120779263973236, + "ce_loss_23": 2.9867815256118773, + "ce_loss_3": 4.439004111289978, + "ce_loss_6": 3.9693766236305237, + "epoch": 0.089, + "grad_norm": 1008.0, + "kl_loss_12": 926.061654663086, + "kl_loss_17": 266.98855056762693, + "kl_loss_3": 2992.9824096679686, + "kl_loss_6": 2086.2604248046873, + "learning_rate": 0.0009843704004290394, + "loss": 1630.8486, + "step": 890 + }, + { + "ce_loss_12": 3.355166864395142, + "ce_loss_17": 3.0378090381622314, + "ce_loss_23": 2.8963677406311037, + "ce_loss_3": 4.414576816558838, + "ce_loss_6": 3.9384005665779114, + "epoch": 0.09, + "grad_norm": 944.0, + "kl_loss_12": 984.4168182373047, + "kl_loss_17": 277.1652038574219, + "kl_loss_3": 3144.8462036132814, + "kl_loss_6": 2210.351110839844, + "learning_rate": 0.0009839743506981783, + "loss": 1647.625, + "step": 900 + }, + { + "ce_loss_12": 3.3013385415077208, + "ce_loss_17": 2.9638872504234315, + "ce_loss_23": 2.8220457673072814, + "ce_loss_3": 4.408076620101928, + "ce_loss_6": 3.926641547679901, + "epoch": 0.091, + "grad_norm": 1016.0, + "kl_loss_12": 1005.3836059570312, + "kl_loss_17": 279.73329162597656, + "kl_loss_3": 3252.30029296875, + "kl_loss_6": 2299.349365234375, + "learning_rate": 0.0009835734273509786, + "loss": 1675.882, + "step": 910 + }, + { + "ce_loss_12": 3.381470000743866, + "ce_loss_17": 3.0562122464179993, + "ce_loss_23": 2.9091169953346254, + "ce_loss_3": 4.449205493927002, + "ce_loss_6": 3.9539331078529356, + "epoch": 0.092, + "grad_norm": 1208.0, + "kl_loss_12": 975.6916809082031, + "kl_loss_17": 274.71624221801756, + "kl_loss_3": 3144.822131347656, + "kl_loss_6": 2177.753350830078, + "learning_rate": 0.0009831676344247342, + "loss": 1643.0232, + "step": 920 + }, + { + "ce_loss_12": 3.3871499061584474, + "ce_loss_17": 3.068781626224518, + "ce_loss_23": 2.9382851362228393, + "ce_loss_3": 4.409728288650513, + "ce_loss_6": 3.929253375530243, + "epoch": 0.093, + "grad_norm": 1248.0, + "kl_loss_12": 956.8896667480469, + "kl_loss_17": 255.23201599121094, + "kl_loss_3": 3049.1298095703123, + "kl_loss_6": 2109.1454223632813, + "learning_rate": 0.0009827569760057755, + "loss": 1622.0775, + "step": 930 + }, + { + "ce_loss_12": 3.3510371327400206, + "ce_loss_17": 2.997589075565338, + "ce_loss_23": 2.853141355514526, + "ce_loss_3": 4.453029465675354, + "ce_loss_6": 3.9511353969573975, + "epoch": 0.094, + "grad_norm": 1168.0, + "kl_loss_12": 1029.933401489258, + "kl_loss_17": 276.05516586303713, + "kl_loss_3": 3270.2495849609377, + "kl_loss_6": 2290.6919494628905, + "learning_rate": 0.000982341456229428, + "loss": 1661.4086, + "step": 940 + }, + { + "ce_loss_12": 3.416623318195343, + "ce_loss_17": 3.0854893565177917, + "ce_loss_23": 2.947428512573242, + "ce_loss_3": 4.475690817832946, + "ce_loss_6": 4.002807903289795, + "epoch": 0.095, + "grad_norm": 1144.0, + "kl_loss_12": 1006.6869445800781, + "kl_loss_17": 273.9578086853027, + "kl_loss_3": 3156.6865844726562, + "kl_loss_6": 2236.0271118164064, + "learning_rate": 0.000981921079279971, + "loss": 1628.7965, + "step": 950 + }, + { + "ce_loss_12": 3.384030747413635, + "ce_loss_17": 3.0803101658821106, + "ce_loss_23": 2.9525837182998655, + "ce_loss_3": 4.387427949905396, + "ce_loss_6": 3.920593464374542, + "epoch": 0.096, + "grad_norm": 1120.0, + "kl_loss_12": 939.6712066650391, + "kl_loss_17": 261.59261322021484, + "kl_loss_3": 2991.079345703125, + "kl_loss_6": 2089.5918212890624, + "learning_rate": 0.0009814958493905962, + "loss": 1588.0285, + "step": 960 + }, + { + "ce_loss_12": 3.3699322938919067, + "ce_loss_17": 3.0514130353927613, + "ce_loss_23": 2.917893576622009, + "ce_loss_3": 4.442546010017395, + "ce_loss_6": 3.958000659942627, + "epoch": 0.097, + "grad_norm": 1136.0, + "kl_loss_12": 965.3707580566406, + "kl_loss_17": 267.0235397338867, + "kl_loss_3": 3129.030517578125, + "kl_loss_6": 2193.5827026367188, + "learning_rate": 0.0009810657708433637, + "loss": 1668.0988, + "step": 970 + }, + { + "ce_loss_12": 3.42313472032547, + "ce_loss_17": 3.1251712679862975, + "ce_loss_23": 2.986611008644104, + "ce_loss_3": 4.429538011550903, + "ce_loss_6": 3.975835406780243, + "epoch": 0.098, + "grad_norm": 1456.0, + "kl_loss_12": 925.4827178955078, + "kl_loss_17": 263.2483726501465, + "kl_loss_3": 2960.0764404296874, + "kl_loss_6": 2084.235943603516, + "learning_rate": 0.0009806308479691594, + "loss": 1575.2449, + "step": 980 + }, + { + "ce_loss_12": 3.459168326854706, + "ce_loss_17": 3.1265910387039186, + "ce_loss_23": 2.9830366373062134, + "ce_loss_3": 4.475966286659241, + "ce_loss_6": 4.010424196720123, + "epoch": 0.099, + "grad_norm": 1296.0, + "kl_loss_12": 992.4306762695312, + "kl_loss_17": 274.3281639099121, + "kl_loss_3": 3074.587536621094, + "kl_loss_6": 2162.7348876953124, + "learning_rate": 0.0009801910851476522, + "loss": 1613.9108, + "step": 990 + }, + { + "ce_loss_12": 3.3743842124938963, + "ce_loss_17": 3.055145001411438, + "ce_loss_23": 2.921414840221405, + "ce_loss_3": 4.455966448783874, + "ce_loss_6": 3.969641697406769, + "epoch": 0.1, + "grad_norm": 1064.0, + "kl_loss_12": 986.4201354980469, + "kl_loss_17": 261.8215835571289, + "kl_loss_3": 3182.0975708007813, + "kl_loss_6": 2227.5828186035155, + "learning_rate": 0.0009797464868072487, + "loss": 1625.0432, + "step": 1000 + }, + { + "ce_loss_12": 3.360707724094391, + "ce_loss_17": 3.0440573573112486, + "ce_loss_23": 2.9084664463996885, + "ce_loss_3": 4.410605311393738, + "ce_loss_6": 3.940072011947632, + "epoch": 0.101, + "grad_norm": 1192.0, + "kl_loss_12": 977.5397827148438, + "kl_loss_17": 263.3533248901367, + "kl_loss_3": 3114.2360229492188, + "kl_loss_6": 2192.034112548828, + "learning_rate": 0.0009792970574250492, + "loss": 1629.8252, + "step": 1010 + }, + { + "ce_loss_12": 3.3747955203056335, + "ce_loss_17": 3.0566949009895326, + "ce_loss_23": 2.9263578057289124, + "ce_loss_3": 4.41445894241333, + "ce_loss_6": 3.947386014461517, + "epoch": 0.102, + "grad_norm": 972.0, + "kl_loss_12": 945.7973480224609, + "kl_loss_17": 251.75567932128905, + "kl_loss_3": 3056.5161987304687, + "kl_loss_6": 2145.8559509277343, + "learning_rate": 0.0009788428015268028, + "loss": 1580.0807, + "step": 1020 + }, + { + "ce_loss_12": 3.368866300582886, + "ce_loss_17": 3.0569016098976136, + "ce_loss_23": 2.9333403587341307, + "ce_loss_3": 4.400923776626587, + "ce_loss_6": 3.9347620964050294, + "epoch": 0.103, + "grad_norm": 1256.0, + "kl_loss_12": 933.2007629394532, + "kl_loss_17": 248.47305450439453, + "kl_loss_3": 3033.2176879882813, + "kl_loss_6": 2127.818988037109, + "learning_rate": 0.0009783837236868609, + "loss": 1584.0074, + "step": 1030 + }, + { + "ce_loss_12": 3.33066782951355, + "ce_loss_17": 3.0230842351913454, + "ce_loss_23": 2.8921628475189207, + "ce_loss_3": 4.374507999420166, + "ce_loss_6": 3.908428359031677, + "epoch": 0.104, + "grad_norm": 1088.0, + "kl_loss_12": 937.0115997314454, + "kl_loss_17": 263.56311264038084, + "kl_loss_3": 3038.81962890625, + "kl_loss_6": 2133.8191467285155, + "learning_rate": 0.0009779198285281327, + "loss": 1584.0922, + "step": 1040 + }, + { + "ce_loss_12": 3.338231694698334, + "ce_loss_17": 3.0326517939567568, + "ce_loss_23": 2.895213079452515, + "ce_loss_3": 4.402967762947083, + "ce_loss_6": 3.9249982714653013, + "epoch": 0.105, + "grad_norm": 1120.0, + "kl_loss_12": 931.6781677246094, + "kl_loss_17": 263.748120880127, + "kl_loss_3": 3098.784777832031, + "kl_loss_6": 2160.467742919922, + "learning_rate": 0.0009774511207220368, + "loss": 1610.0279, + "step": 1050 + }, + { + "ce_loss_12": 3.376241135597229, + "ce_loss_17": 3.072509837150574, + "ce_loss_23": 2.9356080651283265, + "ce_loss_3": 4.441791725158692, + "ce_loss_6": 3.9527193784713743, + "epoch": 0.106, + "grad_norm": 1056.0, + "kl_loss_12": 929.0532318115235, + "kl_loss_17": 262.6850051879883, + "kl_loss_3": 3089.88740234375, + "kl_loss_6": 2132.646368408203, + "learning_rate": 0.0009769776049884564, + "loss": 1590.4035, + "step": 1060 + }, + { + "ce_loss_12": 3.2999967694282533, + "ce_loss_17": 2.9857473134994508, + "ce_loss_23": 2.850896108150482, + "ce_loss_3": 4.3774131536483765, + "ce_loss_6": 3.8904701471328735, + "epoch": 0.107, + "grad_norm": 1200.0, + "kl_loss_12": 951.7590454101562, + "kl_loss_17": 259.9833282470703, + "kl_loss_3": 3161.69931640625, + "kl_loss_6": 2198.422497558594, + "learning_rate": 0.0009764992860956889, + "loss": 1646.2891, + "step": 1070 + }, + { + "ce_loss_12": 3.3937099099159242, + "ce_loss_17": 3.113642930984497, + "ce_loss_23": 2.995354640483856, + "ce_loss_3": 4.38458936214447, + "ce_loss_6": 3.9279677510261535, + "epoch": 0.108, + "grad_norm": 1020.0, + "kl_loss_12": 881.066519165039, + "kl_loss_17": 242.9081069946289, + "kl_loss_3": 2896.6496826171874, + "kl_loss_6": 2003.4961853027344, + "learning_rate": 0.0009760161688604008, + "loss": 1536.7873, + "step": 1080 + }, + { + "ce_loss_12": 3.4213764667510986, + "ce_loss_17": 3.1238073348999023, + "ce_loss_23": 2.993813896179199, + "ce_loss_3": 4.448131680488586, + "ce_loss_6": 3.9845832109451296, + "epoch": 0.109, + "grad_norm": 1448.0, + "kl_loss_12": 919.0751220703125, + "kl_loss_17": 251.6627975463867, + "kl_loss_3": 3004.3504028320312, + "kl_loss_6": 2093.5984497070312, + "learning_rate": 0.0009755282581475768, + "loss": 1585.702, + "step": 1090 + }, + { + "ce_loss_12": 3.460730504989624, + "ce_loss_17": 3.1601850152015687, + "ce_loss_23": 3.025113046169281, + "ce_loss_3": 4.463287663459778, + "ce_loss_6": 3.997715425491333, + "epoch": 0.11, + "grad_norm": 1040.0, + "kl_loss_12": 931.5763977050781, + "kl_loss_17": 259.5204605102539, + "kl_loss_3": 2964.5007934570312, + "kl_loss_6": 2056.313586425781, + "learning_rate": 0.0009750355588704727, + "loss": 1546.6025, + "step": 1100 + }, + { + "ce_loss_12": 3.320233464241028, + "ce_loss_17": 3.010184943675995, + "ce_loss_23": 2.8826860785484314, + "ce_loss_3": 4.3593987107276915, + "ce_loss_6": 3.8807163000106812, + "epoch": 0.111, + "grad_norm": 996.0, + "kl_loss_12": 916.7739654541016, + "kl_loss_17": 248.07165603637696, + "kl_loss_3": 3027.491174316406, + "kl_loss_6": 2100.871795654297, + "learning_rate": 0.0009745380759905647, + "loss": 1598.6758, + "step": 1110 + }, + { + "ce_loss_12": 3.2762816071510317, + "ce_loss_17": 2.9711276292800903, + "ce_loss_23": 2.8430217027664186, + "ce_loss_3": 4.327974188327789, + "ce_loss_6": 3.86414635181427, + "epoch": 0.112, + "grad_norm": 1224.0, + "kl_loss_12": 912.6159393310547, + "kl_loss_17": 245.85351943969727, + "kl_loss_3": 3057.375622558594, + "kl_loss_6": 2136.870953369141, + "learning_rate": 0.0009740358145174998, + "loss": 1608.4203, + "step": 1120 + }, + { + "ce_loss_12": 3.3994181156158447, + "ce_loss_17": 3.107448387145996, + "ce_loss_23": 2.9838266491889955, + "ce_loss_3": 4.387881541252137, + "ce_loss_6": 3.928562307357788, + "epoch": 0.113, + "grad_norm": 1000.0, + "kl_loss_12": 896.2097869873047, + "kl_loss_17": 241.3724739074707, + "kl_loss_3": 2936.2065185546876, + "kl_loss_6": 2022.335791015625, + "learning_rate": 0.0009735287795090455, + "loss": 1545.7097, + "step": 1130 + }, + { + "ce_loss_12": 3.3220792770385743, + "ce_loss_17": 3.0120572090148925, + "ce_loss_23": 2.8869168281555178, + "ce_loss_3": 4.3510067701339725, + "ce_loss_6": 3.8771968603134157, + "epoch": 0.114, + "grad_norm": 1160.0, + "kl_loss_12": 934.3572448730469, + "kl_loss_17": 241.42439727783204, + "kl_loss_3": 3030.8449096679688, + "kl_loss_6": 2103.1423583984374, + "learning_rate": 0.0009730169760710386, + "loss": 1571.8191, + "step": 1140 + }, + { + "ce_loss_12": 3.3827139258384706, + "ce_loss_17": 3.0824633479118346, + "ce_loss_23": 2.952405309677124, + "ce_loss_3": 4.405815839767456, + "ce_loss_6": 3.9335633516311646, + "epoch": 0.115, + "grad_norm": 1320.0, + "kl_loss_12": 900.9095245361328, + "kl_loss_17": 260.4913131713867, + "kl_loss_3": 2963.3571166992188, + "kl_loss_6": 2042.1772155761719, + "learning_rate": 0.0009725004093573342, + "loss": 1564.9115, + "step": 1150 + }, + { + "ce_loss_12": 3.3458577156066895, + "ce_loss_17": 3.068521296977997, + "ce_loss_23": 2.9059925436973573, + "ce_loss_3": 4.370245170593262, + "ce_loss_6": 3.88500257730484, + "epoch": 0.116, + "grad_norm": 1272.0, + "kl_loss_12": 915.9109344482422, + "kl_loss_17": 307.17549896240234, + "kl_loss_3": 2986.1877685546874, + "kl_loss_6": 2042.9764526367187, + "learning_rate": 0.0009719790845697534, + "loss": 1557.8625, + "step": 1160 + }, + { + "ce_loss_12": 3.275306785106659, + "ce_loss_17": 3.0202790021896364, + "ce_loss_23": 2.8706357717514037, + "ce_loss_3": 4.282593977451325, + "ce_loss_6": 3.8117279291152952, + "epoch": 0.117, + "grad_norm": 1020.0, + "kl_loss_12": 862.7552917480468, + "kl_loss_17": 289.40465087890624, + "kl_loss_3": 2916.200549316406, + "kl_loss_6": 1996.2123596191407, + "learning_rate": 0.0009714530069580309, + "loss": 1528.9826, + "step": 1170 + }, + { + "ce_loss_12": 3.375526821613312, + "ce_loss_17": 3.0823593854904177, + "ce_loss_23": 2.9396392107009888, + "ce_loss_3": 4.404994821548462, + "ce_loss_6": 3.924890732765198, + "epoch": 0.118, + "grad_norm": 1208.0, + "kl_loss_12": 922.4925109863282, + "kl_loss_17": 283.04794082641604, + "kl_loss_3": 3010.2740478515625, + "kl_loss_6": 2069.5090759277346, + "learning_rate": 0.0009709221818197624, + "loss": 1561.6851, + "step": 1180 + }, + { + "ce_loss_12": 3.4169281005859373, + "ce_loss_17": 3.1253913164138796, + "ce_loss_23": 2.9899161219596864, + "ce_loss_3": 4.451602101325989, + "ce_loss_6": 3.970979058742523, + "epoch": 0.119, + "grad_norm": 1208.0, + "kl_loss_12": 912.0826873779297, + "kl_loss_17": 265.7390960693359, + "kl_loss_3": 3018.5254638671877, + "kl_loss_6": 2083.6102600097656, + "learning_rate": 0.0009703866145003512, + "loss": 1566.0104, + "step": 1190 + }, + { + "ce_loss_12": 3.3803894400596617, + "ce_loss_17": 3.0914229273796083, + "ce_loss_23": 2.9621423721313476, + "ce_loss_3": 4.384731888771057, + "ce_loss_6": 3.9111488461494446, + "epoch": 0.12, + "grad_norm": 1088.0, + "kl_loss_12": 893.22109375, + "kl_loss_17": 246.46930541992188, + "kl_loss_3": 2961.7060546875, + "kl_loss_6": 2032.8304382324218, + "learning_rate": 0.0009698463103929542, + "loss": 1564.7995, + "step": 1200 + }, + { + "ce_loss_12": 3.3435301184654236, + "ce_loss_17": 3.0554117798805236, + "ce_loss_23": 2.920744204521179, + "ce_loss_3": 4.385138463973999, + "ce_loss_6": 3.910277771949768, + "epoch": 0.121, + "grad_norm": 1304.0, + "kl_loss_12": 905.751821899414, + "kl_loss_17": 262.57847747802737, + "kl_loss_3": 3004.2045776367186, + "kl_loss_6": 2081.385791015625, + "learning_rate": 0.0009693012749384279, + "loss": 1570.0865, + "step": 1210 + }, + { + "ce_loss_12": 3.3524824380874634, + "ce_loss_17": 3.055639898777008, + "ce_loss_23": 2.9259533882141113, + "ce_loss_3": 4.350755381584167, + "ce_loss_6": 3.8888396263122558, + "epoch": 0.122, + "grad_norm": 1136.0, + "kl_loss_12": 902.4378936767578, + "kl_loss_17": 254.01562118530273, + "kl_loss_3": 2949.1076049804688, + "kl_loss_6": 2038.2358947753905, + "learning_rate": 0.0009687515136252732, + "loss": 1528.6016, + "step": 1220 + }, + { + "ce_loss_12": 3.335651671886444, + "ce_loss_17": 3.0272030591964723, + "ce_loss_23": 2.897006368637085, + "ce_loss_3": 4.386695981025696, + "ce_loss_6": 3.907988655567169, + "epoch": 0.123, + "grad_norm": 1504.0, + "kl_loss_12": 941.971224975586, + "kl_loss_17": 248.676749420166, + "kl_loss_3": 3086.333532714844, + "kl_loss_6": 2138.5221313476563, + "learning_rate": 0.0009681970319895803, + "loss": 1615.9452, + "step": 1230 + }, + { + "ce_loss_12": 3.398023796081543, + "ce_loss_17": 3.098674750328064, + "ce_loss_23": 2.9762393712997435, + "ce_loss_3": 4.392714786529541, + "ce_loss_6": 3.9301032185554505, + "epoch": 0.124, + "grad_norm": 1168.0, + "kl_loss_12": 901.2606384277344, + "kl_loss_17": 243.77872314453126, + "kl_loss_3": 2938.1956420898437, + "kl_loss_6": 2024.7957946777344, + "learning_rate": 0.0009676378356149733, + "loss": 1522.3184, + "step": 1240 + }, + { + "ce_loss_12": 3.348244881629944, + "ce_loss_17": 3.064131224155426, + "ce_loss_23": 2.940069627761841, + "ce_loss_3": 4.336971664428711, + "ce_loss_6": 3.8650328040122988, + "epoch": 0.125, + "grad_norm": 1008.0, + "kl_loss_12": 876.7109588623047, + "kl_loss_17": 246.56452255249025, + "kl_loss_3": 2892.247216796875, + "kl_loss_6": 1980.359063720703, + "learning_rate": 0.0009670739301325534, + "loss": 1512.8385, + "step": 1250 + }, + { + "ce_loss_12": 3.3288076877593995, + "ce_loss_17": 3.0383694767951965, + "ce_loss_23": 2.9099366903305053, + "ce_loss_3": 4.317926383018493, + "ce_loss_6": 3.8465208292007445, + "epoch": 0.126, + "grad_norm": 1240.0, + "kl_loss_12": 902.7433868408203, + "kl_loss_17": 258.2109687805176, + "kl_loss_3": 2916.2613891601563, + "kl_loss_6": 2000.0012634277343, + "learning_rate": 0.0009665053212208426, + "loss": 1542.7377, + "step": 1260 + }, + { + "ce_loss_12": 3.3632496118545534, + "ce_loss_17": 3.066045308113098, + "ce_loss_23": 2.9375475525856016, + "ce_loss_3": 4.379070150852203, + "ce_loss_6": 3.8987871289253233, + "epoch": 0.127, + "grad_norm": 1680.0, + "kl_loss_12": 918.7768859863281, + "kl_loss_17": 252.96115264892578, + "kl_loss_3": 2991.537243652344, + "kl_loss_6": 2043.5452209472655, + "learning_rate": 0.0009659320146057262, + "loss": 1544.7102, + "step": 1270 + }, + { + "ce_loss_12": 3.3732680678367615, + "ce_loss_17": 3.0801384329795836, + "ce_loss_23": 2.955021917819977, + "ce_loss_3": 4.353860938549042, + "ce_loss_6": 3.8984060645103455, + "epoch": 0.128, + "grad_norm": 1064.0, + "kl_loss_12": 888.1411804199219, + "kl_loss_17": 240.9340576171875, + "kl_loss_3": 2913.7650390625, + "kl_loss_6": 2009.7888427734374, + "learning_rate": 0.0009653540160603955, + "loss": 1519.4078, + "step": 1280 + }, + { + "ce_loss_12": 3.3622668504714968, + "ce_loss_17": 3.07658052444458, + "ce_loss_23": 2.957567012310028, + "ce_loss_3": 4.34757616519928, + "ce_loss_6": 3.8929473042488096, + "epoch": 0.129, + "grad_norm": 1152.0, + "kl_loss_12": 881.5169555664063, + "kl_loss_17": 235.51721801757813, + "kl_loss_3": 2916.9619262695314, + "kl_loss_6": 2009.5466003417969, + "learning_rate": 0.0009647713314052896, + "loss": 1500.7032, + "step": 1290 + }, + { + "ce_loss_12": 3.3372119426727296, + "ce_loss_17": 3.0263918161392214, + "ce_loss_23": 2.898031508922577, + "ce_loss_3": 4.367906975746155, + "ce_loss_6": 3.913242280483246, + "epoch": 0.13, + "grad_norm": 1008.0, + "kl_loss_12": 929.2950592041016, + "kl_loss_17": 245.21987762451172, + "kl_loss_3": 3032.432580566406, + "kl_loss_6": 2117.933239746094, + "learning_rate": 0.0009641839665080363, + "loss": 1564.207, + "step": 1300 + }, + { + "ce_loss_12": 3.2963956356048585, + "ce_loss_17": 3.0004076957702637, + "ce_loss_23": 2.878360021114349, + "ce_loss_3": 4.309890949726105, + "ce_loss_6": 3.835643780231476, + "epoch": 0.131, + "grad_norm": 1168.0, + "kl_loss_12": 888.8966857910157, + "kl_loss_17": 234.6859046936035, + "kl_loss_3": 2960.3671875, + "kl_loss_6": 2018.6436645507813, + "learning_rate": 0.0009635919272833937, + "loss": 1510.3238, + "step": 1310 + }, + { + "ce_loss_12": 3.32631973028183, + "ce_loss_17": 3.0244080305099486, + "ce_loss_23": 2.9018367528915405, + "ce_loss_3": 4.342757534980774, + "ce_loss_6": 3.8605214953422546, + "epoch": 0.132, + "grad_norm": 1376.0, + "kl_loss_12": 902.885546875, + "kl_loss_17": 241.0728988647461, + "kl_loss_3": 2962.0408935546875, + "kl_loss_6": 2010.6015258789062, + "learning_rate": 0.0009629952196931902, + "loss": 1499.0007, + "step": 1320 + }, + { + "ce_loss_12": 3.298629081249237, + "ce_loss_17": 3.0122446179389955, + "ce_loss_23": 2.8936039984226225, + "ce_loss_3": 4.3187672138214115, + "ce_loss_6": 3.837043654918671, + "epoch": 0.133, + "grad_norm": 1080.0, + "kl_loss_12": 866.9013549804688, + "kl_loss_17": 233.5844757080078, + "kl_loss_3": 2945.208361816406, + "kl_loss_6": 2010.2642578125, + "learning_rate": 0.0009623938497462645, + "loss": 1505.8475, + "step": 1330 + }, + { + "ce_loss_12": 3.2931827902793884, + "ce_loss_17": 3.005644714832306, + "ce_loss_23": 2.8875430822372437, + "ce_loss_3": 4.305406546592712, + "ce_loss_6": 3.8357440710067747, + "epoch": 0.134, + "grad_norm": 1256.0, + "kl_loss_12": 872.3399932861328, + "kl_loss_17": 233.73633880615233, + "kl_loss_3": 2938.6026123046877, + "kl_loss_6": 2015.0488647460938, + "learning_rate": 0.0009617878234984055, + "loss": 1522.1096, + "step": 1340 + }, + { + "ce_loss_12": 3.361295449733734, + "ce_loss_17": 3.0917954325675963, + "ce_loss_23": 2.9757425785064697, + "ce_loss_3": 4.341284513473511, + "ce_loss_6": 3.872436785697937, + "epoch": 0.135, + "grad_norm": 1256.0, + "kl_loss_12": 841.0878295898438, + "kl_loss_17": 228.47272262573242, + "kl_loss_3": 2845.256799316406, + "kl_loss_6": 1924.9097717285156, + "learning_rate": 0.0009611771470522907, + "loss": 1484.9016, + "step": 1350 + }, + { + "ce_loss_12": 3.31159029006958, + "ce_loss_17": 3.0249439001083376, + "ce_loss_23": 2.903464639186859, + "ce_loss_3": 4.321415758132934, + "ce_loss_6": 3.835740602016449, + "epoch": 0.136, + "grad_norm": 1240.0, + "kl_loss_12": 857.112924194336, + "kl_loss_17": 233.4657760620117, + "kl_loss_3": 2906.2755737304688, + "kl_loss_6": 1957.5037475585937, + "learning_rate": 0.0009605618265574251, + "loss": 1477.5262, + "step": 1360 + }, + { + "ce_loss_12": 3.299883234500885, + "ce_loss_17": 3.002118504047394, + "ce_loss_23": 2.880831480026245, + "ce_loss_3": 4.321231853961945, + "ce_loss_6": 3.8664443135261535, + "epoch": 0.137, + "grad_norm": 1352.0, + "kl_loss_12": 893.3223236083984, + "kl_loss_17": 239.58416213989258, + "kl_loss_3": 2996.390173339844, + "kl_loss_6": 2092.887841796875, + "learning_rate": 0.0009599418682100792, + "loss": 1531.8381, + "step": 1370 + }, + { + "ce_loss_12": 3.3143301963806153, + "ce_loss_17": 3.0341737985610964, + "ce_loss_23": 2.9149271130561827, + "ce_loss_3": 4.3402428150177, + "ce_loss_6": 3.854619801044464, + "epoch": 0.138, + "grad_norm": 1120.0, + "kl_loss_12": 860.9669158935546, + "kl_loss_17": 232.6212287902832, + "kl_loss_3": 2954.685266113281, + "kl_loss_6": 2010.193621826172, + "learning_rate": 0.0009593172782532268, + "loss": 1512.1051, + "step": 1380 + }, + { + "ce_loss_12": 3.3477386832237244, + "ce_loss_17": 3.064415216445923, + "ce_loss_23": 2.941612994670868, + "ce_loss_3": 4.350572264194488, + "ce_loss_6": 3.869351398944855, + "epoch": 0.139, + "grad_norm": 1184.0, + "kl_loss_12": 859.5776611328125, + "kl_loss_17": 242.2350700378418, + "kl_loss_3": 2904.505615234375, + "kl_loss_6": 1965.819189453125, + "learning_rate": 0.0009586880629764817, + "loss": 1491.527, + "step": 1390 + }, + { + "ce_loss_12": 3.292214822769165, + "ce_loss_17": 3.017342722415924, + "ce_loss_23": 2.885129392147064, + "ce_loss_3": 4.305712127685547, + "ce_loss_6": 3.8401988983154296, + "epoch": 0.14, + "grad_norm": 1424.0, + "kl_loss_12": 869.0703979492188, + "kl_loss_17": 259.43865356445315, + "kl_loss_3": 2921.1829345703127, + "kl_loss_6": 1998.2489196777344, + "learning_rate": 0.0009580542287160348, + "loss": 1487.2192, + "step": 1400 + }, + { + "ce_loss_12": 3.251637268066406, + "ce_loss_17": 2.977019262313843, + "ce_loss_23": 2.8478209018707275, + "ce_loss_3": 4.271168637275696, + "ce_loss_6": 3.7906409740447997, + "epoch": 0.141, + "grad_norm": 1944.0, + "kl_loss_12": 859.4048309326172, + "kl_loss_17": 250.33745803833008, + "kl_loss_3": 2948.1715576171873, + "kl_loss_6": 1991.5314575195312, + "learning_rate": 0.0009574157818545901, + "loss": 1486.7772, + "step": 1410 + }, + { + "ce_loss_12": 3.3089492201805113, + "ce_loss_17": 3.0406702399253844, + "ce_loss_23": 2.923642265796661, + "ce_loss_3": 4.295652580261231, + "ce_loss_6": 3.8324657559394835, + "epoch": 0.142, + "grad_norm": 1368.0, + "kl_loss_12": 831.2928436279296, + "kl_loss_17": 233.49320373535156, + "kl_loss_3": 2856.494677734375, + "kl_loss_6": 1937.763165283203, + "learning_rate": 0.0009567727288213005, + "loss": 1498.7279, + "step": 1420 + }, + { + "ce_loss_12": 3.2987663745880127, + "ce_loss_17": 3.0193196058273317, + "ce_loss_23": 2.8958592295646666, + "ce_loss_3": 4.320412039756775, + "ce_loss_6": 3.8408318996429442, + "epoch": 0.143, + "grad_norm": 1392.0, + "kl_loss_12": 862.813589477539, + "kl_loss_17": 238.17898406982422, + "kl_loss_3": 2939.9102294921877, + "kl_loss_6": 2009.5839050292968, + "learning_rate": 0.0009561250760917027, + "loss": 1494.0355, + "step": 1430 + }, + { + "ce_loss_12": 3.311892545223236, + "ce_loss_17": 3.030276918411255, + "ce_loss_23": 2.9070225238800047, + "ce_loss_3": 4.316233801841736, + "ce_loss_6": 3.8451044082641603, + "epoch": 0.144, + "grad_norm": 1384.0, + "kl_loss_12": 868.576318359375, + "kl_loss_17": 237.70533142089843, + "kl_loss_3": 2930.186877441406, + "kl_loss_6": 2013.4018005371095, + "learning_rate": 0.0009554728301876525, + "loss": 1474.7908, + "step": 1440 + }, + { + "ce_loss_12": 3.352381670475006, + "ce_loss_17": 3.0663819193840025, + "ce_loss_23": 2.9476805090904237, + "ce_loss_3": 4.337472128868103, + "ce_loss_6": 3.8769607067108156, + "epoch": 0.145, + "grad_norm": 1004.0, + "kl_loss_12": 873.2269561767578, + "kl_loss_17": 233.07788543701173, + "kl_loss_3": 2882.8794189453124, + "kl_loss_6": 1971.3247436523438, + "learning_rate": 0.0009548159976772592, + "loss": 1529.3137, + "step": 1450 + }, + { + "ce_loss_12": 3.3141078472137453, + "ce_loss_17": 3.0283764123916628, + "ce_loss_23": 2.9071272253990172, + "ce_loss_3": 4.340900087356568, + "ce_loss_6": 3.8532317757606505, + "epoch": 0.146, + "grad_norm": 1264.0, + "kl_loss_12": 869.4196044921875, + "kl_loss_17": 232.49502258300782, + "kl_loss_3": 2958.092810058594, + "kl_loss_6": 1998.0352722167968, + "learning_rate": 0.0009541545851748186, + "loss": 1498.915, + "step": 1460 + }, + { + "ce_loss_12": 3.1964035868644713, + "ce_loss_17": 2.901301550865173, + "ce_loss_23": 2.784227430820465, + "ce_loss_3": 4.248557126522064, + "ce_loss_6": 3.7501466155052183, + "epoch": 0.147, + "grad_norm": 1400.0, + "kl_loss_12": 874.8744964599609, + "kl_loss_17": 226.8998664855957, + "kl_loss_3": 3002.2226318359376, + "kl_loss_6": 2033.5486083984374, + "learning_rate": 0.0009534885993407473, + "loss": 1513.8453, + "step": 1470 + }, + { + "ce_loss_12": 3.331232464313507, + "ce_loss_17": 3.0541473150253298, + "ce_loss_23": 2.9391016602516173, + "ce_loss_3": 4.347428560256958, + "ce_loss_6": 3.884372317790985, + "epoch": 0.148, + "grad_norm": 1744.0, + "kl_loss_12": 848.6544342041016, + "kl_loss_17": 223.4082130432129, + "kl_loss_3": 2913.6508544921876, + "kl_loss_6": 2003.9459106445313, + "learning_rate": 0.0009528180468815154, + "loss": 1500.877, + "step": 1480 + }, + { + "ce_loss_12": 3.3899044036865233, + "ce_loss_17": 3.1017459988594056, + "ce_loss_23": 2.988395261764526, + "ce_loss_3": 4.358977675437927, + "ce_loss_6": 3.8890286922454833, + "epoch": 0.149, + "grad_norm": 1760.0, + "kl_loss_12": 872.0698760986328, + "kl_loss_17": 223.78531875610352, + "kl_loss_3": 2834.707238769531, + "kl_loss_6": 1919.793634033203, + "learning_rate": 0.0009521429345495787, + "loss": 1470.1632, + "step": 1490 + }, + { + "ce_loss_12": 3.366633725166321, + "ce_loss_17": 3.080304205417633, + "ce_loss_23": 2.9673553824424745, + "ce_loss_3": 4.328233468532562, + "ce_loss_6": 3.853518569469452, + "epoch": 0.15, + "grad_norm": 1480.0, + "kl_loss_12": 856.5237091064453, + "kl_loss_17": 214.42600326538087, + "kl_loss_3": 2845.6985107421874, + "kl_loss_6": 1915.26474609375, + "learning_rate": 0.0009514632691433108, + "loss": 1476.083, + "step": 1500 + }, + { + "ce_loss_12": 3.3294340491294863, + "ce_loss_17": 3.048425531387329, + "ce_loss_23": 2.9328495979309084, + "ce_loss_3": 4.322175335884094, + "ce_loss_6": 3.8507775187492372, + "epoch": 0.151, + "grad_norm": 1288.0, + "kl_loss_12": 862.7704681396484, + "kl_loss_17": 225.10146713256836, + "kl_loss_3": 2895.6834350585937, + "kl_loss_6": 1963.930975341797, + "learning_rate": 0.0009507790575069346, + "loss": 1480.7919, + "step": 1510 + }, + { + "ce_loss_12": 3.319838809967041, + "ce_loss_17": 3.018332529067993, + "ce_loss_23": 2.8951820373535155, + "ce_loss_3": 4.331275558471679, + "ce_loss_6": 3.8573664903640745, + "epoch": 0.152, + "grad_norm": 1048.0, + "kl_loss_12": 874.026333618164, + "kl_loss_17": 228.92938156127929, + "kl_loss_3": 2936.3609008789062, + "kl_loss_6": 1998.9308288574218, + "learning_rate": 0.0009500903065304539, + "loss": 1518.75, + "step": 1520 + }, + { + "ce_loss_12": 3.321444344520569, + "ce_loss_17": 3.0558812379837037, + "ce_loss_23": 2.944769334793091, + "ce_loss_3": 4.301256394386291, + "ce_loss_6": 3.818336236476898, + "epoch": 0.153, + "grad_norm": 1504.0, + "kl_loss_12": 814.5647064208985, + "kl_loss_17": 215.27584228515624, + "kl_loss_3": 2813.6534790039063, + "kl_loss_6": 1879.0197326660157, + "learning_rate": 0.0009493970231495835, + "loss": 1462.6092, + "step": 1530 + }, + { + "ce_loss_12": 3.2664665579795837, + "ce_loss_17": 2.998563539981842, + "ce_loss_23": 2.8953148245811464, + "ce_loss_3": 4.23151820898056, + "ce_loss_6": 3.7709195971488954, + "epoch": 0.154, + "grad_norm": 1248.0, + "kl_loss_12": 808.3708679199219, + "kl_loss_17": 210.40441131591797, + "kl_loss_3": 2800.541455078125, + "kl_loss_6": 1898.972589111328, + "learning_rate": 0.0009486992143456792, + "loss": 1438.5279, + "step": 1540 + }, + { + "ce_loss_12": 3.319480133056641, + "ce_loss_17": 3.0265442609786986, + "ce_loss_23": 2.9047950506210327, + "ce_loss_3": 4.378856492042542, + "ce_loss_6": 3.8882370591163635, + "epoch": 0.155, + "grad_norm": 1312.0, + "kl_loss_12": 882.6671630859375, + "kl_loss_17": 231.51347122192382, + "kl_loss_3": 3036.104455566406, + "kl_loss_6": 2071.101397705078, + "learning_rate": 0.0009479968871456679, + "loss": 1505.0044, + "step": 1550 + }, + { + "ce_loss_12": 3.2715526580810548, + "ce_loss_17": 2.9901047587394713, + "ce_loss_23": 2.8788031339645386, + "ce_loss_3": 4.3019544124603275, + "ce_loss_6": 3.8024895429611205, + "epoch": 0.156, + "grad_norm": 1096.0, + "kl_loss_12": 846.2780609130859, + "kl_loss_17": 222.8622169494629, + "kl_loss_3": 2946.623815917969, + "kl_loss_6": 1964.7590576171874, + "learning_rate": 0.0009472900486219768, + "loss": 1463.0236, + "step": 1560 + }, + { + "ce_loss_12": 3.2607172608375548, + "ce_loss_17": 2.989371109008789, + "ce_loss_23": 2.876069664955139, + "ce_loss_3": 4.2608747482299805, + "ce_loss_6": 3.7734758615493775, + "epoch": 0.157, + "grad_norm": 952.0, + "kl_loss_12": 825.6946960449219, + "kl_loss_17": 219.342626953125, + "kl_loss_3": 2875.1432250976563, + "kl_loss_6": 1919.1255493164062, + "learning_rate": 0.000946578705892462, + "loss": 1466.0156, + "step": 1570 + }, + { + "ce_loss_12": 3.2860172152519227, + "ce_loss_17": 3.0196128726005553, + "ce_loss_23": 2.9074905157089233, + "ce_loss_3": 4.2808654546737674, + "ce_loss_6": 3.794986295700073, + "epoch": 0.158, + "grad_norm": 1216.0, + "kl_loss_12": 806.870248413086, + "kl_loss_17": 217.5166389465332, + "kl_loss_3": 2823.8129638671876, + "kl_loss_6": 1876.8592712402344, + "learning_rate": 0.0009458628661203367, + "loss": 1458.6794, + "step": 1580 + }, + { + "ce_loss_12": 3.316411054134369, + "ce_loss_17": 3.0272481441497803, + "ce_loss_23": 2.9105294942855835, + "ce_loss_3": 4.336752998828888, + "ce_loss_6": 3.8515880346298217, + "epoch": 0.159, + "grad_norm": 1224.0, + "kl_loss_12": 848.1409912109375, + "kl_loss_17": 224.56259536743164, + "kl_loss_3": 2940.787548828125, + "kl_loss_6": 1978.9293884277345, + "learning_rate": 0.0009451425365140996, + "loss": 1450.2014, + "step": 1590 + }, + { + "ce_loss_12": 3.356225883960724, + "ce_loss_17": 3.0905832052230835, + "ce_loss_23": 2.9780961751937864, + "ce_loss_3": 4.327465915679932, + "ce_loss_6": 3.852176809310913, + "epoch": 0.16, + "grad_norm": 1392.0, + "kl_loss_12": 815.6587585449219, + "kl_loss_17": 221.48415145874023, + "kl_loss_3": 2803.952099609375, + "kl_loss_6": 1875.7579223632813, + "learning_rate": 0.0009444177243274617, + "loss": 1423.3404, + "step": 1600 + }, + { + "ce_loss_12": 3.246622681617737, + "ce_loss_17": 2.9689454555511476, + "ce_loss_23": 2.8456170320510865, + "ce_loss_3": 4.260777032375335, + "ce_loss_6": 3.777989220619202, + "epoch": 0.161, + "grad_norm": 1384.0, + "kl_loss_12": 855.178598022461, + "kl_loss_17": 233.64147033691407, + "kl_loss_3": 2918.9806518554688, + "kl_loss_6": 1983.483837890625, + "learning_rate": 0.0009436884368592739, + "loss": 1477.2695, + "step": 1610 + }, + { + "ce_loss_12": 3.2852012157440185, + "ce_loss_17": 3.0092008233070375, + "ce_loss_23": 2.8976743459701537, + "ce_loss_3": 4.2525951862335205, + "ce_loss_6": 3.782522237300873, + "epoch": 0.162, + "grad_norm": 1088.0, + "kl_loss_12": 828.1313049316407, + "kl_loss_17": 225.7959991455078, + "kl_loss_3": 2816.5077514648438, + "kl_loss_6": 1884.6949340820313, + "learning_rate": 0.0009429546814534529, + "loss": 1468.4939, + "step": 1620 + }, + { + "ce_loss_12": 3.281573009490967, + "ce_loss_17": 3.014826846122742, + "ce_loss_23": 2.9038607597351076, + "ce_loss_3": 4.268366956710816, + "ce_loss_6": 3.792364406585693, + "epoch": 0.163, + "grad_norm": 920.0, + "kl_loss_12": 830.5794372558594, + "kl_loss_17": 222.38085021972657, + "kl_loss_3": 2831.4859008789062, + "kl_loss_6": 1903.0419677734376, + "learning_rate": 0.0009422164654989072, + "loss": 1432.2832, + "step": 1630 + }, + { + "ce_loss_12": 3.390706717967987, + "ce_loss_17": 3.1266611218452454, + "ce_loss_23": 3.0065022110939026, + "ce_loss_3": 4.3509008407592775, + "ce_loss_6": 3.88686203956604, + "epoch": 0.164, + "grad_norm": 1256.0, + "kl_loss_12": 831.9651641845703, + "kl_loss_17": 238.26752624511718, + "kl_loss_3": 2801.181994628906, + "kl_loss_6": 1888.4372192382812, + "learning_rate": 0.0009414737964294635, + "loss": 1449.9193, + "step": 1640 + }, + { + "ce_loss_12": 3.312992036342621, + "ce_loss_17": 3.0648077011108397, + "ce_loss_23": 2.9473799586296083, + "ce_loss_3": 4.255030500888824, + "ce_loss_6": 3.799005913734436, + "epoch": 0.165, + "grad_norm": 1648.0, + "kl_loss_12": 789.7920562744141, + "kl_loss_17": 240.65155105590821, + "kl_loss_3": 2705.621618652344, + "kl_loss_6": 1814.280419921875, + "learning_rate": 0.000940726681723791, + "loss": 1433.8896, + "step": 1650 + }, + { + "ce_loss_12": 3.197109341621399, + "ce_loss_17": 2.924350929260254, + "ce_loss_23": 2.8034530520439147, + "ce_loss_3": 4.219123911857605, + "ce_loss_6": 3.7377219080924986, + "epoch": 0.166, + "grad_norm": 1216.0, + "kl_loss_12": 848.9781066894532, + "kl_loss_17": 244.35307540893555, + "kl_loss_3": 2947.090185546875, + "kl_loss_6": 1988.9083862304688, + "learning_rate": 0.0009399751289053266, + "loss": 1446.2516, + "step": 1660 + }, + { + "ce_loss_12": 3.3627843022346497, + "ce_loss_17": 3.105271542072296, + "ce_loss_23": 2.9874876737594604, + "ce_loss_3": 4.3334238767623905, + "ce_loss_6": 3.860498380661011, + "epoch": 0.167, + "grad_norm": 1336.0, + "kl_loss_12": 800.2411010742187, + "kl_loss_17": 230.9557662963867, + "kl_loss_3": 2787.5281982421875, + "kl_loss_6": 1856.9417358398437, + "learning_rate": 0.0009392191455421988, + "loss": 1445.9791, + "step": 1670 + }, + { + "ce_loss_12": 3.3555013060569765, + "ce_loss_17": 3.094896376132965, + "ce_loss_23": 2.9762457609176636, + "ce_loss_3": 4.328198766708374, + "ce_loss_6": 3.8430837988853455, + "epoch": 0.168, + "grad_norm": 1232.0, + "kl_loss_12": 829.3635925292969, + "kl_loss_17": 234.05589065551757, + "kl_loss_3": 2830.514123535156, + "kl_loss_6": 1874.6403503417969, + "learning_rate": 0.0009384587392471515, + "loss": 1411.1933, + "step": 1680 + }, + { + "ce_loss_12": 3.3350028157234193, + "ce_loss_17": 3.0776092767715455, + "ce_loss_23": 2.968519389629364, + "ce_loss_3": 4.272547805309296, + "ce_loss_6": 3.8146313190460206, + "epoch": 0.169, + "grad_norm": 1240.0, + "kl_loss_12": 799.5876861572266, + "kl_loss_17": 214.26291427612304, + "kl_loss_3": 2744.126611328125, + "kl_loss_6": 1833.7157287597656, + "learning_rate": 0.0009376939176774678, + "loss": 1400.9363, + "step": 1690 + }, + { + "ce_loss_12": 3.3187434673309326, + "ce_loss_17": 3.0456138134002684, + "ce_loss_23": 2.934523367881775, + "ce_loss_3": 4.282467603683472, + "ce_loss_6": 3.815245735645294, + "epoch": 0.17, + "grad_norm": 1328.0, + "kl_loss_12": 808.9097473144532, + "kl_loss_17": 216.57661361694335, + "kl_loss_3": 2789.231591796875, + "kl_loss_6": 1862.6118041992188, + "learning_rate": 0.0009369246885348925, + "loss": 1448.727, + "step": 1700 + }, + { + "ce_loss_12": 3.309137487411499, + "ce_loss_17": 3.040472149848938, + "ce_loss_23": 2.9257946729660036, + "ce_loss_3": 4.315243148803711, + "ce_loss_6": 3.8397695422172546, + "epoch": 0.171, + "grad_norm": 1168.0, + "kl_loss_12": 825.0207580566406, + "kl_loss_17": 219.96196365356445, + "kl_loss_3": 2880.1123291015624, + "kl_loss_6": 1938.305194091797, + "learning_rate": 0.0009361510595655545, + "loss": 1454.2666, + "step": 1710 + }, + { + "ce_loss_12": 3.279538094997406, + "ce_loss_17": 3.006372034549713, + "ce_loss_23": 2.8868885159492494, + "ce_loss_3": 4.248486363887787, + "ce_loss_6": 3.7824005246162415, + "epoch": 0.172, + "grad_norm": 1376.0, + "kl_loss_12": 834.2845306396484, + "kl_loss_17": 230.54197692871094, + "kl_loss_3": 2836.3697875976563, + "kl_loss_6": 1909.7118041992187, + "learning_rate": 0.0009353730385598887, + "loss": 1437.4458, + "step": 1720 + }, + { + "ce_loss_12": 3.207744026184082, + "ce_loss_17": 2.9387964844703673, + "ce_loss_23": 2.8279772877693174, + "ce_loss_3": 4.2246463894844055, + "ce_loss_6": 3.734726536273956, + "epoch": 0.173, + "grad_norm": 1320.0, + "kl_loss_12": 819.5813079833985, + "kl_loss_17": 220.05044860839843, + "kl_loss_3": 2892.9287353515624, + "kl_loss_6": 1924.0830078125, + "learning_rate": 0.0009345906333525581, + "loss": 1460.6379, + "step": 1730 + }, + { + "ce_loss_12": 3.245152676105499, + "ce_loss_17": 2.9810129046440124, + "ce_loss_23": 2.8624545097351075, + "ce_loss_3": 4.247416305541992, + "ce_loss_6": 3.763348925113678, + "epoch": 0.174, + "grad_norm": 1448.0, + "kl_loss_12": 826.6671295166016, + "kl_loss_17": 230.10610885620116, + "kl_loss_3": 2867.794921875, + "kl_loss_6": 1919.4322570800782, + "learning_rate": 0.0009338038518223745, + "loss": 1437.9183, + "step": 1740 + }, + { + "ce_loss_12": 3.3130956649780274, + "ce_loss_17": 3.0388700008392333, + "ce_loss_23": 2.918446886539459, + "ce_loss_3": 4.301969075202942, + "ce_loss_6": 3.8242091655731203, + "epoch": 0.175, + "grad_norm": 1328.0, + "kl_loss_12": 843.7064819335938, + "kl_loss_17": 233.51200790405272, + "kl_loss_3": 2894.5472412109375, + "kl_loss_6": 1943.9509155273438, + "learning_rate": 0.0009330127018922195, + "loss": 1498.6205, + "step": 1750 + }, + { + "ce_loss_12": 3.2673400759696962, + "ce_loss_17": 3.000726819038391, + "ce_loss_23": 2.885044813156128, + "ce_loss_3": 4.255298697948456, + "ce_loss_6": 3.7829174280166624, + "epoch": 0.176, + "grad_norm": 1184.0, + "kl_loss_12": 817.8572479248047, + "kl_loss_17": 226.14935226440429, + "kl_loss_3": 2856.8099609375, + "kl_loss_6": 1912.9068420410156, + "learning_rate": 0.0009322171915289634, + "loss": 1454.0908, + "step": 1760 + }, + { + "ce_loss_12": 3.292384111881256, + "ce_loss_17": 3.036431038379669, + "ce_loss_23": 2.924695813655853, + "ce_loss_3": 4.247577679157257, + "ce_loss_6": 3.782546889781952, + "epoch": 0.177, + "grad_norm": 1528.0, + "kl_loss_12": 814.5729064941406, + "kl_loss_17": 224.25176086425782, + "kl_loss_3": 2795.0510864257812, + "kl_loss_6": 1871.2465942382812, + "learning_rate": 0.0009314173287433873, + "loss": 1412.1355, + "step": 1770 + }, + { + "ce_loss_12": 3.27299427986145, + "ce_loss_17": 3.012584948539734, + "ce_loss_23": 2.8959354162216187, + "ce_loss_3": 4.245269799232483, + "ce_loss_6": 3.777409863471985, + "epoch": 0.178, + "grad_norm": 1568.0, + "kl_loss_12": 829.3791748046875, + "kl_loss_17": 231.68452835083008, + "kl_loss_3": 2829.4614501953124, + "kl_loss_6": 1899.1982788085938, + "learning_rate": 0.0009306131215901003, + "loss": 1420.7092, + "step": 1780 + }, + { + "ce_loss_12": 3.3116036295890807, + "ce_loss_17": 3.051762104034424, + "ce_loss_23": 2.9339508533477785, + "ce_loss_3": 4.274141776561737, + "ce_loss_6": 3.8060096144676208, + "epoch": 0.179, + "grad_norm": 1280.0, + "kl_loss_12": 825.7397613525391, + "kl_loss_17": 231.49067916870118, + "kl_loss_3": 2789.6765869140627, + "kl_loss_6": 1874.7170837402343, + "learning_rate": 0.0009298045781674596, + "loss": 1401.5098, + "step": 1790 + }, + { + "ce_loss_12": 3.290810251235962, + "ce_loss_17": 3.030768263339996, + "ce_loss_23": 2.915701675415039, + "ce_loss_3": 4.241096484661102, + "ce_loss_6": 3.774291145801544, + "epoch": 0.18, + "grad_norm": 1576.0, + "kl_loss_12": 800.6549865722657, + "kl_loss_17": 220.91228637695312, + "kl_loss_3": 2774.0736572265623, + "kl_loss_6": 1845.7342529296875, + "learning_rate": 0.0009289917066174886, + "loss": 1427.2309, + "step": 1800 + }, + { + "ce_loss_12": 3.2650416612625124, + "ce_loss_17": 3.0218977689743043, + "ce_loss_23": 2.9145373463630677, + "ce_loss_3": 4.202980744838714, + "ce_loss_6": 3.7441303372383117, + "epoch": 0.181, + "grad_norm": 1184.0, + "kl_loss_12": 771.3196075439453, + "kl_loss_17": 214.48935470581054, + "kl_loss_3": 2712.9664794921873, + "kl_loss_6": 1801.353857421875, + "learning_rate": 0.0009281745151257945, + "loss": 1386.1813, + "step": 1810 + }, + { + "ce_loss_12": 3.2988988161087036, + "ce_loss_17": 3.04592365026474, + "ce_loss_23": 2.930855906009674, + "ce_loss_3": 4.2782234907150265, + "ce_loss_6": 3.8062909841537476, + "epoch": 0.182, + "grad_norm": 1240.0, + "kl_loss_12": 787.0630126953125, + "kl_loss_17": 221.52530364990236, + "kl_loss_3": 2781.5021118164063, + "kl_loss_6": 1860.2128845214843, + "learning_rate": 0.0009273530119214868, + "loss": 1426.3877, + "step": 1820 + }, + { + "ce_loss_12": 3.374634265899658, + "ce_loss_17": 3.1234985828399657, + "ce_loss_23": 3.0172491669654846, + "ce_loss_3": 4.339367198944092, + "ce_loss_6": 3.880409133434296, + "epoch": 0.183, + "grad_norm": 1232.0, + "kl_loss_12": 785.4364990234375, + "kl_loss_17": 213.38180923461914, + "kl_loss_3": 2767.317590332031, + "kl_loss_6": 1863.320458984375, + "learning_rate": 0.0009265272052770935, + "loss": 1389.6257, + "step": 1830 + }, + { + "ce_loss_12": 3.23532772064209, + "ce_loss_17": 2.969554400444031, + "ce_loss_23": 2.8575196266174316, + "ce_loss_3": 4.234844088554382, + "ce_loss_6": 3.7532593846321105, + "epoch": 0.184, + "grad_norm": 1584.0, + "kl_loss_12": 791.7487670898438, + "kl_loss_17": 211.72332992553712, + "kl_loss_3": 2818.5214233398438, + "kl_loss_6": 1884.60966796875, + "learning_rate": 0.0009256971035084784, + "loss": 1418.0409, + "step": 1840 + }, + { + "ce_loss_12": 3.1916938424110413, + "ce_loss_17": 2.91089289188385, + "ce_loss_23": 2.794518756866455, + "ce_loss_3": 4.19860999584198, + "ce_loss_6": 3.711833357810974, + "epoch": 0.185, + "grad_norm": 1480.0, + "kl_loss_12": 838.5909271240234, + "kl_loss_17": 218.6007522583008, + "kl_loss_3": 2895.4562377929688, + "kl_loss_6": 1940.0254455566405, + "learning_rate": 0.0009248627149747573, + "loss": 1445.7287, + "step": 1850 + }, + { + "ce_loss_12": 3.3483711004257204, + "ce_loss_17": 3.093137788772583, + "ce_loss_23": 2.9884451985359193, + "ce_loss_3": 4.302263617515564, + "ce_loss_6": 3.838647758960724, + "epoch": 0.186, + "grad_norm": 1216.0, + "kl_loss_12": 801.7537963867187, + "kl_loss_17": 211.8816345214844, + "kl_loss_3": 2771.502001953125, + "kl_loss_6": 1855.2513854980468, + "learning_rate": 0.0009240240480782129, + "loss": 1410.6666, + "step": 1860 + }, + { + "ce_loss_12": 3.2653139352798464, + "ce_loss_17": 2.996794414520264, + "ce_loss_23": 2.890926492214203, + "ce_loss_3": 4.250155305862426, + "ce_loss_6": 3.7754549026489257, + "epoch": 0.187, + "grad_norm": 1272.0, + "kl_loss_12": 816.9411773681641, + "kl_loss_17": 212.1212501525879, + "kl_loss_3": 2823.0911865234375, + "kl_loss_6": 1898.2119262695312, + "learning_rate": 0.0009231811112642122, + "loss": 1414.3244, + "step": 1870 + }, + { + "ce_loss_12": 3.3026415705680847, + "ce_loss_17": 3.0429324388504027, + "ce_loss_23": 2.9370568990707397, + "ce_loss_3": 4.239575839042663, + "ce_loss_6": 3.7796589851379396, + "epoch": 0.188, + "grad_norm": 1408.0, + "kl_loss_12": 800.7113494873047, + "kl_loss_17": 211.73002395629882, + "kl_loss_3": 2733.2020629882813, + "kl_loss_6": 1829.6883544921875, + "learning_rate": 0.0009223339130211192, + "loss": 1393.907, + "step": 1880 + }, + { + "ce_loss_12": 3.178125536441803, + "ce_loss_17": 2.911378359794617, + "ce_loss_23": 2.8056634843349455, + "ce_loss_3": 4.178744328022003, + "ce_loss_6": 3.689058315753937, + "epoch": 0.189, + "grad_norm": 1304.0, + "kl_loss_12": 798.3189514160156, + "kl_loss_17": 208.87499465942383, + "kl_loss_3": 2850.0263427734376, + "kl_loss_6": 1886.8675842285156, + "learning_rate": 0.0009214824618802108, + "loss": 1424.7606, + "step": 1890 + }, + { + "ce_loss_12": 3.339543855190277, + "ce_loss_17": 3.0779946565628054, + "ce_loss_23": 2.9706859588623047, + "ce_loss_3": 4.303978550434112, + "ce_loss_6": 3.8317994713783263, + "epoch": 0.19, + "grad_norm": 1208.0, + "kl_loss_12": 810.0784576416015, + "kl_loss_17": 213.61787796020508, + "kl_loss_3": 2769.9716552734376, + "kl_loss_6": 1842.8320434570312, + "learning_rate": 0.0009206267664155906, + "loss": 1442.9592, + "step": 1900 + }, + { + "ce_loss_12": 3.2774033308029176, + "ce_loss_17": 3.0128692269325255, + "ce_loss_23": 2.9003699660301208, + "ce_loss_3": 4.249328136444092, + "ce_loss_6": 3.773828589916229, + "epoch": 0.191, + "grad_norm": 1280.0, + "kl_loss_12": 797.3476531982421, + "kl_loss_17": 212.8129913330078, + "kl_loss_3": 2789.0569580078127, + "kl_loss_6": 1851.9897399902343, + "learning_rate": 0.0009197668352441024, + "loss": 1417.5857, + "step": 1910 + }, + { + "ce_loss_12": 3.3080789923667906, + "ce_loss_17": 3.0548011660575867, + "ce_loss_23": 2.945826530456543, + "ce_loss_3": 4.280576181411743, + "ce_loss_6": 3.803591275215149, + "epoch": 0.192, + "grad_norm": 1288.0, + "kl_loss_12": 790.8529418945312, + "kl_loss_17": 215.02757263183594, + "kl_loss_3": 2774.336535644531, + "kl_loss_6": 1845.1325622558593, + "learning_rate": 0.0009189026770252437, + "loss": 1410.9227, + "step": 1920 + }, + { + "ce_loss_12": 3.335878050327301, + "ce_loss_17": 3.078634560108185, + "ce_loss_23": 2.9686517477035523, + "ce_loss_3": 4.294387483596802, + "ce_loss_6": 3.827136993408203, + "epoch": 0.193, + "grad_norm": 1448.0, + "kl_loss_12": 793.4487915039062, + "kl_loss_17": 213.7540054321289, + "kl_loss_3": 2755.5981689453124, + "kl_loss_6": 1827.88701171875, + "learning_rate": 0.000918034300461078, + "loss": 1444.1677, + "step": 1930 + }, + { + "ce_loss_12": 3.354683554172516, + "ce_loss_17": 3.096818745136261, + "ce_loss_23": 2.9910334944725037, + "ce_loss_3": 4.28806164264679, + "ce_loss_6": 3.8329141616821287, + "epoch": 0.194, + "grad_norm": 1216.0, + "kl_loss_12": 787.0794219970703, + "kl_loss_17": 213.68303756713868, + "kl_loss_3": 2712.944299316406, + "kl_loss_6": 1813.1900573730468, + "learning_rate": 0.0009171617142961477, + "loss": 1392.3291, + "step": 1940 + }, + { + "ce_loss_12": 3.3147756099700927, + "ce_loss_17": 3.065266180038452, + "ce_loss_23": 2.95823312997818, + "ce_loss_3": 4.272750425338745, + "ce_loss_6": 3.803443992137909, + "epoch": 0.195, + "grad_norm": 1400.0, + "kl_loss_12": 783.5265594482422, + "kl_loss_17": 206.5057601928711, + "kl_loss_3": 2734.9050537109374, + "kl_loss_6": 1812.6223083496093, + "learning_rate": 0.0009162849273173857, + "loss": 1392.0568, + "step": 1950 + }, + { + "ce_loss_12": 3.273218238353729, + "ce_loss_17": 3.0155712246894835, + "ce_loss_23": 2.9083391427993774, + "ce_loss_3": 4.228793978691101, + "ce_loss_6": 3.7560607314109804, + "epoch": 0.196, + "grad_norm": 1376.0, + "kl_loss_12": 785.6598205566406, + "kl_loss_17": 212.21677474975587, + "kl_loss_3": 2741.75830078125, + "kl_loss_6": 1812.2293518066406, + "learning_rate": 0.0009154039483540273, + "loss": 1402.1262, + "step": 1960 + }, + { + "ce_loss_12": 3.249917209148407, + "ce_loss_17": 2.992921471595764, + "ce_loss_23": 2.8863423466682434, + "ce_loss_3": 4.221356511116028, + "ce_loss_6": 3.7359424829483032, + "epoch": 0.197, + "grad_norm": 1272.0, + "kl_loss_12": 793.6822845458985, + "kl_loss_17": 214.4604591369629, + "kl_loss_3": 2779.8605834960936, + "kl_loss_6": 1822.9128112792969, + "learning_rate": 0.0009145187862775209, + "loss": 1398.8131, + "step": 1970 + }, + { + "ce_loss_12": 3.275111532211304, + "ce_loss_17": 3.021639096736908, + "ce_loss_23": 2.9154768347740174, + "ce_loss_3": 4.222866582870483, + "ce_loss_6": 3.760035479068756, + "epoch": 0.198, + "grad_norm": 1320.0, + "kl_loss_12": 780.6487457275391, + "kl_loss_17": 217.9638526916504, + "kl_loss_3": 2732.1581909179686, + "kl_loss_6": 1816.9951904296875, + "learning_rate": 0.0009136294500014386, + "loss": 1386.9706, + "step": 1980 + }, + { + "ce_loss_12": 3.243885362148285, + "ce_loss_17": 2.982141208648682, + "ce_loss_23": 2.8703251242637635, + "ce_loss_3": 4.262989103794098, + "ce_loss_6": 3.7759433507919313, + "epoch": 0.199, + "grad_norm": 1288.0, + "kl_loss_12": 800.5297271728516, + "kl_loss_17": 223.0495346069336, + "kl_loss_3": 2867.4418701171876, + "kl_loss_6": 1904.5263061523438, + "learning_rate": 0.000912735948481387, + "loss": 1440.5543, + "step": 1990 + }, + { + "ce_loss_12": 3.2660236597061156, + "ce_loss_17": 3.006909728050232, + "ce_loss_23": 2.900590443611145, + "ce_loss_3": 4.216969156265259, + "ce_loss_6": 3.753326213359833, + "epoch": 0.2, + "grad_norm": 1464.0, + "kl_loss_12": 800.6086486816406, + "kl_loss_17": 216.52916259765624, + "kl_loss_3": 2766.7096557617188, + "kl_loss_6": 1848.5642028808593, + "learning_rate": 0.0009118382907149164, + "loss": 1383.1088, + "step": 2000 + }, + { + "ce_loss_12": 3.284585154056549, + "ce_loss_17": 3.0366219162940977, + "ce_loss_23": 2.9253772735595702, + "ce_loss_3": 4.232977199554443, + "ce_loss_6": 3.7740879535675047, + "epoch": 0.201, + "grad_norm": 1472.0, + "kl_loss_12": 797.0703948974609, + "kl_loss_17": 220.19276428222656, + "kl_loss_3": 2723.853857421875, + "kl_loss_6": 1822.0363464355469, + "learning_rate": 0.0009109364857414306, + "loss": 1380.6947, + "step": 2010 + }, + { + "ce_loss_12": 3.2520282745361326, + "ce_loss_17": 3.0007045030593873, + "ce_loss_23": 2.888823854923248, + "ce_loss_3": 4.200846481323242, + "ce_loss_6": 3.7307973384857176, + "epoch": 0.202, + "grad_norm": 1488.0, + "kl_loss_12": 791.588168334961, + "kl_loss_17": 222.9745346069336, + "kl_loss_3": 2748.2218627929688, + "kl_loss_6": 1811.2208923339845, + "learning_rate": 0.0009100305426420956, + "loss": 1422.4186, + "step": 2020 + }, + { + "ce_loss_12": 3.2231249690055845, + "ce_loss_17": 2.97230703830719, + "ce_loss_23": 2.863428020477295, + "ce_loss_3": 4.245284426212311, + "ce_loss_6": 3.7391476035118103, + "epoch": 0.203, + "grad_norm": 2208.0, + "kl_loss_12": 796.4809753417969, + "kl_loss_17": 222.501375579834, + "kl_loss_3": 2898.3029174804688, + "kl_loss_6": 1883.9183410644532, + "learning_rate": 0.0009091204705397484, + "loss": 1416.9577, + "step": 2030 + }, + { + "ce_loss_12": 3.2175127506256103, + "ce_loss_17": 2.955952513217926, + "ce_loss_23": 2.8449599504470826, + "ce_loss_3": 4.22771806716919, + "ce_loss_6": 3.750357484817505, + "epoch": 0.204, + "grad_norm": 1136.0, + "kl_loss_12": 806.6401519775391, + "kl_loss_17": 220.58082427978516, + "kl_loss_3": 2889.6021118164062, + "kl_loss_6": 1927.42529296875, + "learning_rate": 0.0009082062785988049, + "loss": 1435.6596, + "step": 2040 + }, + { + "ce_loss_12": 3.32594176530838, + "ce_loss_17": 3.0832242012023925, + "ce_loss_23": 2.9809635281562805, + "ce_loss_3": 4.2515133142471315, + "ce_loss_6": 3.8101436614990236, + "epoch": 0.205, + "grad_norm": 1280.0, + "kl_loss_12": 774.9831390380859, + "kl_loss_17": 206.38177642822265, + "kl_loss_3": 2704.8578857421876, + "kl_loss_6": 1808.8279541015625, + "learning_rate": 0.0009072879760251679, + "loss": 1392.7363, + "step": 2050 + }, + { + "ce_loss_12": 3.2944729328155518, + "ce_loss_17": 3.0345913648605345, + "ce_loss_23": 2.9275845885276794, + "ce_loss_3": 4.279146134853363, + "ce_loss_6": 3.807130527496338, + "epoch": 0.206, + "grad_norm": 1400.0, + "kl_loss_12": 794.5384826660156, + "kl_loss_17": 210.71795120239258, + "kl_loss_3": 2833.31123046875, + "kl_loss_6": 1893.0530700683594, + "learning_rate": 0.0009063655720661341, + "loss": 1406.8914, + "step": 2060 + }, + { + "ce_loss_12": 3.3217201709747313, + "ce_loss_17": 3.0753472089767455, + "ce_loss_23": 2.9684493780136108, + "ce_loss_3": 4.257889258861542, + "ce_loss_6": 3.8021424770355225, + "epoch": 0.207, + "grad_norm": 1320.0, + "kl_loss_12": 784.0586700439453, + "kl_loss_17": 209.1168411254883, + "kl_loss_3": 2704.4903930664063, + "kl_loss_6": 1804.3248657226563, + "learning_rate": 0.000905439076010301, + "loss": 1384.1061, + "step": 2070 + }, + { + "ce_loss_12": 3.2956433057785035, + "ce_loss_17": 3.0315235018730164, + "ce_loss_23": 2.9202005863189697, + "ce_loss_3": 4.265854740142823, + "ce_loss_6": 3.7907448053359984, + "epoch": 0.208, + "grad_norm": 1144.0, + "kl_loss_12": 798.5905914306641, + "kl_loss_17": 214.00121459960937, + "kl_loss_3": 2773.0674682617187, + "kl_loss_6": 1839.7169311523437, + "learning_rate": 0.0009045084971874737, + "loss": 1375.9626, + "step": 2080 + }, + { + "ce_loss_12": 3.2762994289398195, + "ce_loss_17": 3.0212566256523132, + "ce_loss_23": 2.9129729986190798, + "ce_loss_3": 4.229314315319061, + "ce_loss_6": 3.7623565196990967, + "epoch": 0.209, + "grad_norm": 1784.0, + "kl_loss_12": 796.1528045654297, + "kl_loss_17": 214.5410514831543, + "kl_loss_3": 2751.5361572265624, + "kl_loss_6": 1833.2583068847657, + "learning_rate": 0.0009035738449685707, + "loss": 1418.4584, + "step": 2090 + }, + { + "ce_loss_12": 3.220756542682648, + "ce_loss_17": 2.9494006514549254, + "ce_loss_23": 2.841488039493561, + "ce_loss_3": 4.22071681022644, + "ce_loss_6": 3.7321442127227784, + "epoch": 0.21, + "grad_norm": 2112.0, + "kl_loss_12": 809.575830078125, + "kl_loss_17": 209.3122985839844, + "kl_loss_3": 2857.9162475585936, + "kl_loss_6": 1892.9190002441405, + "learning_rate": 0.0009026351287655293, + "loss": 1401.2596, + "step": 2100 + }, + { + "ce_loss_12": 3.3801162958145143, + "ce_loss_17": 3.138319969177246, + "ce_loss_23": 3.042530620098114, + "ce_loss_3": 4.262100148200989, + "ce_loss_6": 3.8183608770370485, + "epoch": 0.211, + "grad_norm": 1168.0, + "kl_loss_12": 756.0740661621094, + "kl_loss_17": 195.61113357543945, + "kl_loss_3": 2584.5614379882813, + "kl_loss_6": 1704.242449951172, + "learning_rate": 0.0009016923580312113, + "loss": 1321.6908, + "step": 2110 + }, + { + "ce_loss_12": 3.252412164211273, + "ce_loss_17": 3.006903553009033, + "ce_loss_23": 2.903613269329071, + "ce_loss_3": 4.197078967094422, + "ce_loss_6": 3.7228079557418825, + "epoch": 0.212, + "grad_norm": 1328.0, + "kl_loss_12": 777.3219177246094, + "kl_loss_17": 202.30859909057617, + "kl_loss_3": 2715.3901977539062, + "kl_loss_6": 1771.5765502929687, + "learning_rate": 0.0009007455422593077, + "loss": 1396.4434, + "step": 2120 + }, + { + "ce_loss_12": 3.283956015110016, + "ce_loss_17": 3.0239007830619813, + "ce_loss_23": 2.9164740085601806, + "ce_loss_3": 4.264662671089172, + "ce_loss_6": 3.7788233041763304, + "epoch": 0.213, + "grad_norm": 1216.0, + "kl_loss_12": 800.8696929931641, + "kl_loss_17": 212.63365936279297, + "kl_loss_3": 2810.420617675781, + "kl_loss_6": 1852.5259765625, + "learning_rate": 0.0008997946909842425, + "loss": 1414.5421, + "step": 2130 + }, + { + "ce_loss_12": 3.3082870483398437, + "ce_loss_17": 3.0352408289909363, + "ce_loss_23": 2.920850145816803, + "ce_loss_3": 4.320699071884155, + "ce_loss_6": 3.8336455464363097, + "epoch": 0.214, + "grad_norm": 1568.0, + "kl_loss_12": 822.8223999023437, + "kl_loss_17": 220.4384895324707, + "kl_loss_3": 2897.794787597656, + "kl_loss_6": 1938.5953796386718, + "learning_rate": 0.0008988398137810777, + "loss": 1410.1963, + "step": 2140 + }, + { + "ce_loss_12": 3.317046272754669, + "ce_loss_17": 3.067119073867798, + "ce_loss_23": 2.959135901927948, + "ce_loss_3": 4.2535442352294925, + "ce_loss_6": 3.7930386662483215, + "epoch": 0.215, + "grad_norm": 1448.0, + "kl_loss_12": 769.2138397216797, + "kl_loss_17": 212.8603088378906, + "kl_loss_3": 2707.310168457031, + "kl_loss_6": 1789.6441650390625, + "learning_rate": 0.0008978809202654162, + "loss": 1362.0427, + "step": 2150 + }, + { + "ce_loss_12": 3.3021875619888306, + "ce_loss_17": 3.049200105667114, + "ce_loss_23": 2.9374216318130495, + "ce_loss_3": 4.250543379783631, + "ce_loss_6": 3.7779937982559204, + "epoch": 0.216, + "grad_norm": 1344.0, + "kl_loss_12": 776.3447540283203, + "kl_loss_17": 225.30978622436524, + "kl_loss_3": 2715.261511230469, + "kl_loss_6": 1780.8355224609375, + "learning_rate": 0.0008969180200933046, + "loss": 1391.7432, + "step": 2160 + }, + { + "ce_loss_12": 3.2748268008232118, + "ce_loss_17": 3.0194689512252806, + "ce_loss_23": 2.897624909877777, + "ce_loss_3": 4.253852486610413, + "ce_loss_6": 3.7766872525215147, + "epoch": 0.217, + "grad_norm": 1440.0, + "kl_loss_12": 798.7320251464844, + "kl_loss_17": 234.73491363525392, + "kl_loss_3": 2779.512072753906, + "kl_loss_6": 1841.3709289550782, + "learning_rate": 0.0008959511229611376, + "loss": 1415.2232, + "step": 2170 + }, + { + "ce_loss_12": 3.326402735710144, + "ce_loss_17": 3.084159243106842, + "ce_loss_23": 2.9727144956588747, + "ce_loss_3": 4.287988996505737, + "ce_loss_6": 3.806883656978607, + "epoch": 0.218, + "grad_norm": 1304.0, + "kl_loss_12": 772.4044616699218, + "kl_loss_17": 245.79975051879882, + "kl_loss_3": 2754.1268798828123, + "kl_loss_6": 1809.63154296875, + "learning_rate": 0.0008949802386055581, + "loss": 1392.9225, + "step": 2180 + }, + { + "ce_loss_12": 3.204367291927338, + "ce_loss_17": 2.9835811376571657, + "ce_loss_23": 2.8466378808021546, + "ce_loss_3": 4.16232339143753, + "ce_loss_6": 3.6835678458213805, + "epoch": 0.219, + "grad_norm": 1352.0, + "kl_loss_12": 763.1388305664062, + "kl_loss_17": 258.0673347473145, + "kl_loss_3": 2716.5968383789063, + "kl_loss_6": 1774.9647338867187, + "learning_rate": 0.0008940053768033609, + "loss": 1408.7924, + "step": 2190 + }, + { + "ce_loss_12": 3.2718021631240846, + "ce_loss_17": 3.0378584384918215, + "ce_loss_23": 2.926106798648834, + "ce_loss_3": 4.214753663539886, + "ce_loss_6": 3.7463977098464967, + "epoch": 0.22, + "grad_norm": 1168.0, + "kl_loss_12": 757.7002105712891, + "kl_loss_17": 224.0239601135254, + "kl_loss_3": 2703.5033569335938, + "kl_loss_6": 1772.026416015625, + "learning_rate": 0.0008930265473713938, + "loss": 1366.7803, + "step": 2200 + }, + { + "ce_loss_12": 3.2438246488571165, + "ce_loss_17": 3.003926396369934, + "ce_loss_23": 2.888247859477997, + "ce_loss_3": 4.2010880470275875, + "ce_loss_6": 3.7235129475593567, + "epoch": 0.221, + "grad_norm": 1480.0, + "kl_loss_12": 762.0072631835938, + "kl_loss_17": 219.94583282470703, + "kl_loss_3": 2713.9848022460938, + "kl_loss_6": 1777.1114379882813, + "learning_rate": 0.0008920437601664579, + "loss": 1345.4111, + "step": 2210 + }, + { + "ce_loss_12": 3.2488271832466125, + "ce_loss_17": 3.000194180011749, + "ce_loss_23": 2.8827094435691833, + "ce_loss_3": 4.2079703450202945, + "ce_loss_6": 3.737367570400238, + "epoch": 0.222, + "grad_norm": 1400.0, + "kl_loss_12": 786.0016723632813, + "kl_loss_17": 228.3734375, + "kl_loss_3": 2753.2352905273438, + "kl_loss_6": 1828.0585388183595, + "learning_rate": 0.0008910570250852097, + "loss": 1365.5415, + "step": 2220 + }, + { + "ce_loss_12": 3.313809859752655, + "ce_loss_17": 3.086100924015045, + "ce_loss_23": 2.9755563855171205, + "ce_loss_3": 4.227397823333741, + "ce_loss_6": 3.767635691165924, + "epoch": 0.223, + "grad_norm": 1504.0, + "kl_loss_12": 736.7527008056641, + "kl_loss_17": 217.9220283508301, + "kl_loss_3": 2626.90498046875, + "kl_loss_6": 1716.7175415039062, + "learning_rate": 0.0008900663520640604, + "loss": 1328.9883, + "step": 2230 + }, + { + "ce_loss_12": 3.279825234413147, + "ce_loss_17": 3.041073262691498, + "ce_loss_23": 2.93398494720459, + "ce_loss_3": 4.234403324127197, + "ce_loss_6": 3.756113088130951, + "epoch": 0.224, + "grad_norm": 1472.0, + "kl_loss_12": 760.3786315917969, + "kl_loss_17": 213.48066482543945, + "kl_loss_3": 2720.758154296875, + "kl_loss_6": 1772.4732238769532, + "learning_rate": 0.0008890717510790764, + "loss": 1368.9785, + "step": 2240 + }, + { + "ce_loss_12": 3.2426836848258973, + "ce_loss_17": 3.0035343766212463, + "ce_loss_23": 2.9006335139274597, + "ce_loss_3": 4.213583302497864, + "ce_loss_6": 3.731015908718109, + "epoch": 0.225, + "grad_norm": 1296.0, + "kl_loss_12": 757.3236785888672, + "kl_loss_17": 205.8685417175293, + "kl_loss_3": 2738.9389404296876, + "kl_loss_6": 1793.2860595703125, + "learning_rate": 0.0008880732321458784, + "loss": 1382.7258, + "step": 2250 + }, + { + "ce_loss_12": 3.2711044907569886, + "ce_loss_17": 3.0255207896232603, + "ce_loss_23": 2.9230571389198303, + "ce_loss_3": 4.2078168272972105, + "ce_loss_6": 3.7409843802452087, + "epoch": 0.226, + "grad_norm": 1424.0, + "kl_loss_12": 758.025503540039, + "kl_loss_17": 201.38440322875977, + "kl_loss_3": 2679.084912109375, + "kl_loss_6": 1762.8384155273438, + "learning_rate": 0.0008870708053195413, + "loss": 1372.8844, + "step": 2260 + }, + { + "ce_loss_12": 3.284177005290985, + "ce_loss_17": 3.0465020895004273, + "ce_loss_23": 2.9497544050216673, + "ce_loss_3": 4.2041588306427, + "ce_loss_6": 3.752433407306671, + "epoch": 0.227, + "grad_norm": 1048.0, + "kl_loss_12": 739.420751953125, + "kl_loss_17": 195.02377853393554, + "kl_loss_3": 2645.383654785156, + "kl_loss_6": 1743.099432373047, + "learning_rate": 0.0008860644806944918, + "loss": 1343.2056, + "step": 2270 + }, + { + "ce_loss_12": 3.2478358864784242, + "ce_loss_17": 2.9956758499145506, + "ce_loss_23": 2.8902077198028566, + "ce_loss_3": 4.2102713823318485, + "ce_loss_6": 3.7266456723213195, + "epoch": 0.228, + "grad_norm": 1048.0, + "kl_loss_12": 772.7948638916016, + "kl_loss_17": 203.88796768188476, + "kl_loss_3": 2750.5269165039062, + "kl_loss_6": 1799.3314453125, + "learning_rate": 0.0008850542684044079, + "loss": 1347.4766, + "step": 2280 + }, + { + "ce_loss_12": 3.2202987551689146, + "ce_loss_17": 2.9613712072372436, + "ce_loss_23": 2.8506805300712585, + "ce_loss_3": 4.227268242835999, + "ce_loss_6": 3.7271655440330504, + "epoch": 0.229, + "grad_norm": 1880.0, + "kl_loss_12": 802.2013427734375, + "kl_loss_17": 212.1362106323242, + "kl_loss_3": 2867.6884887695314, + "kl_loss_6": 1870.6247192382812, + "learning_rate": 0.0008840401786221159, + "loss": 1391.3645, + "step": 2290 + }, + { + "ce_loss_12": 3.334716284275055, + "ce_loss_17": 3.09200896024704, + "ce_loss_23": 2.99348384141922, + "ce_loss_3": 4.255949902534485, + "ce_loss_6": 3.791596162319183, + "epoch": 0.23, + "grad_norm": 1208.0, + "kl_loss_12": 745.1397399902344, + "kl_loss_17": 190.49222869873046, + "kl_loss_3": 2647.28955078125, + "kl_loss_6": 1736.7224426269531, + "learning_rate": 0.000883022221559489, + "loss": 1328.1979, + "step": 2300 + }, + { + "ce_loss_12": 3.300665628910065, + "ce_loss_17": 3.060560202598572, + "ce_loss_23": 2.9594492673873902, + "ce_loss_3": 4.2628024339675905, + "ce_loss_6": 3.780871844291687, + "epoch": 0.231, + "grad_norm": 1584.0, + "kl_loss_12": 756.107797241211, + "kl_loss_17": 196.85059051513673, + "kl_loss_3": 2730.4540283203123, + "kl_loss_6": 1779.80791015625, + "learning_rate": 0.0008820004074673434, + "loss": 1398.8628, + "step": 2310 + }, + { + "ce_loss_12": 3.2197547435760496, + "ce_loss_17": 2.9663180351257323, + "ce_loss_23": 2.869211399555206, + "ce_loss_3": 4.151427733898163, + "ce_loss_6": 3.6867988228797914, + "epoch": 0.232, + "grad_norm": 1328.0, + "kl_loss_12": 759.1753936767578, + "kl_loss_17": 192.56471099853516, + "kl_loss_3": 2701.675634765625, + "kl_loss_6": 1773.2933288574218, + "learning_rate": 0.0008809747466353355, + "loss": 1338.3229, + "step": 2320 + }, + { + "ce_loss_12": 3.2194170236587523, + "ce_loss_17": 2.974473536014557, + "ce_loss_23": 2.872840905189514, + "ce_loss_3": 4.179217076301574, + "ce_loss_6": 3.6907984256744384, + "epoch": 0.233, + "grad_norm": 1336.0, + "kl_loss_12": 743.9783752441406, + "kl_loss_17": 195.2461311340332, + "kl_loss_3": 2732.4534301757812, + "kl_loss_6": 1756.7269653320313, + "learning_rate": 0.0008799452493918585, + "loss": 1367.2255, + "step": 2330 + }, + { + "ce_loss_12": 3.2832246899604796, + "ce_loss_17": 3.039524793624878, + "ce_loss_23": 2.9446143388748167, + "ce_loss_3": 4.216864311695099, + "ce_loss_6": 3.7657551407814025, + "epoch": 0.234, + "grad_norm": 1472.0, + "kl_loss_12": 747.0130462646484, + "kl_loss_17": 192.51783828735353, + "kl_loss_3": 2681.032360839844, + "kl_loss_6": 1775.4697387695312, + "learning_rate": 0.0008789119261039385, + "loss": 1390.376, + "step": 2340 + }, + { + "ce_loss_12": 3.2080708622932432, + "ce_loss_17": 2.958297300338745, + "ce_loss_23": 2.8614233791828156, + "ce_loss_3": 4.164182078838349, + "ce_loss_6": 3.6893629670143127, + "epoch": 0.235, + "grad_norm": 1064.0, + "kl_loss_12": 752.9404388427735, + "kl_loss_17": 190.90708923339844, + "kl_loss_3": 2706.8750244140624, + "kl_loss_6": 1778.0380737304688, + "learning_rate": 0.0008778747871771292, + "loss": 1335.748, + "step": 2350 + }, + { + "ce_loss_12": 3.2462808966636656, + "ce_loss_17": 3.011286127567291, + "ce_loss_23": 2.9157028436660766, + "ce_loss_3": 4.172871029376983, + "ce_loss_6": 3.718285655975342, + "epoch": 0.236, + "grad_norm": 1360.0, + "kl_loss_12": 733.8823089599609, + "kl_loss_17": 186.25939254760743, + "kl_loss_3": 2635.8314819335938, + "kl_loss_6": 1738.221502685547, + "learning_rate": 0.0008768338430554083, + "loss": 1320.7639, + "step": 2360 + }, + { + "ce_loss_12": 3.2746780037879946, + "ce_loss_17": 3.0209306478500366, + "ce_loss_23": 2.918417739868164, + "ce_loss_3": 4.2050821185112, + "ce_loss_6": 3.738642954826355, + "epoch": 0.237, + "grad_norm": 1640.0, + "kl_loss_12": 755.3820190429688, + "kl_loss_17": 196.90067367553712, + "kl_loss_3": 2672.4424560546877, + "kl_loss_6": 1742.3179504394532, + "learning_rate": 0.0008757891042210713, + "loss": 1353.3967, + "step": 2370 + }, + { + "ce_loss_12": 3.2774656653404235, + "ce_loss_17": 3.0383931517601015, + "ce_loss_23": 2.938407635688782, + "ce_loss_3": 4.2089228391647335, + "ce_loss_6": 3.7463168621063234, + "epoch": 0.238, + "grad_norm": 1280.0, + "kl_loss_12": 736.635708618164, + "kl_loss_17": 195.7603958129883, + "kl_loss_3": 2657.1938720703124, + "kl_loss_6": 1746.2701599121094, + "learning_rate": 0.0008747405811946271, + "loss": 1340.6756, + "step": 2380 + }, + { + "ce_loss_12": 3.199791061878204, + "ce_loss_17": 2.9455369830131533, + "ce_loss_23": 2.841073679924011, + "ce_loss_3": 4.184807789325714, + "ce_loss_6": 3.703061819076538, + "epoch": 0.239, + "grad_norm": 1296.0, + "kl_loss_12": 771.5895477294922, + "kl_loss_17": 201.72363662719727, + "kl_loss_3": 2803.9154296875, + "kl_loss_6": 1836.1460327148438, + "learning_rate": 0.0008736882845346905, + "loss": 1354.4574, + "step": 2390 + }, + { + "ce_loss_12": 3.2723363280296325, + "ce_loss_17": 3.0333364009857178, + "ce_loss_23": 2.9263649106025698, + "ce_loss_3": 4.228648591041565, + "ce_loss_6": 3.7488921880722046, + "epoch": 0.24, + "grad_norm": 1272.0, + "kl_loss_12": 751.8863342285156, + "kl_loss_17": 206.7871238708496, + "kl_loss_3": 2706.067492675781, + "kl_loss_6": 1758.2299743652343, + "learning_rate": 0.0008726322248378774, + "loss": 1340.971, + "step": 2400 + }, + { + "ce_loss_12": 3.2682889342308044, + "ce_loss_17": 3.0341816663742067, + "ce_loss_23": 2.9340709686279296, + "ce_loss_3": 4.256609678268433, + "ce_loss_6": 3.769345486164093, + "epoch": 0.241, + "grad_norm": 1176.0, + "kl_loss_12": 741.8277923583985, + "kl_loss_17": 200.1978973388672, + "kl_loss_3": 2778.068603515625, + "kl_loss_6": 1803.1031921386718, + "learning_rate": 0.0008715724127386971, + "loss": 1389.8736, + "step": 2410 + }, + { + "ce_loss_12": 3.33016916513443, + "ce_loss_17": 3.1028114438056944, + "ce_loss_23": 3.0000047087669373, + "ce_loss_3": 4.261696720123291, + "ce_loss_6": 3.790807771682739, + "epoch": 0.242, + "grad_norm": 1216.0, + "kl_loss_12": 728.3110717773437, + "kl_loss_17": 201.23279266357423, + "kl_loss_3": 2662.35673828125, + "kl_loss_6": 1726.7381774902344, + "learning_rate": 0.0008705088589094458, + "loss": 1342.7444, + "step": 2420 + }, + { + "ce_loss_12": 3.3445237278938293, + "ce_loss_17": 3.102230954170227, + "ce_loss_23": 3.012040579319, + "ce_loss_3": 4.280753767490387, + "ce_loss_6": 3.8091434478759765, + "epoch": 0.243, + "grad_norm": 1336.0, + "kl_loss_12": 741.5652801513672, + "kl_loss_17": 202.57024688720702, + "kl_loss_3": 2683.318249511719, + "kl_loss_6": 1738.7669006347655, + "learning_rate": 0.0008694415740600988, + "loss": 1352.136, + "step": 2430 + }, + { + "ce_loss_12": 3.207241451740265, + "ce_loss_17": 2.9795133471488953, + "ce_loss_23": 2.876672852039337, + "ce_loss_3": 4.191270697116852, + "ce_loss_6": 3.7183370351791383, + "epoch": 0.244, + "grad_norm": 1480.0, + "kl_loss_12": 741.0047424316406, + "kl_loss_17": 209.1358772277832, + "kl_loss_3": 2760.043310546875, + "kl_loss_6": 1821.8199890136718, + "learning_rate": 0.0008683705689382025, + "loss": 1360.9362, + "step": 2440 + }, + { + "ce_loss_12": 3.287972331047058, + "ce_loss_17": 3.0538623571395873, + "ce_loss_23": 2.956281900405884, + "ce_loss_3": 4.20314621925354, + "ce_loss_6": 3.7521802186965942, + "epoch": 0.245, + "grad_norm": 1200.0, + "kl_loss_12": 725.0128631591797, + "kl_loss_17": 195.95231246948242, + "kl_loss_3": 2643.5307861328124, + "kl_loss_6": 1735.7429321289062, + "learning_rate": 0.0008672958543287666, + "loss": 1356.2043, + "step": 2450 + }, + { + "ce_loss_12": 3.295141100883484, + "ce_loss_17": 3.0591702222824098, + "ce_loss_23": 2.9594735980033873, + "ce_loss_3": 4.211972963809967, + "ce_loss_6": 3.7494354844093323, + "epoch": 0.246, + "grad_norm": 1296.0, + "kl_loss_12": 741.9841430664062, + "kl_loss_17": 196.04131546020508, + "kl_loss_3": 2631.139404296875, + "kl_loss_6": 1714.9184997558593, + "learning_rate": 0.0008662174410541554, + "loss": 1321.3993, + "step": 2460 + }, + { + "ce_loss_12": 3.257572686672211, + "ce_loss_17": 3.025375556945801, + "ce_loss_23": 2.9310835719108583, + "ce_loss_3": 4.172731947898865, + "ce_loss_6": 3.7115997433662415, + "epoch": 0.247, + "grad_norm": 1104.0, + "kl_loss_12": 722.4749328613282, + "kl_loss_17": 190.3153823852539, + "kl_loss_3": 2626.8435668945312, + "kl_loss_6": 1705.6855590820312, + "learning_rate": 0.0008651353399739787, + "loss": 1353.1453, + "step": 2470 + }, + { + "ce_loss_12": 3.2865932106971742, + "ce_loss_17": 3.0495383024215696, + "ce_loss_23": 2.951648223400116, + "ce_loss_3": 4.223213315010071, + "ce_loss_6": 3.751573157310486, + "epoch": 0.248, + "grad_norm": 1184.0, + "kl_loss_12": 734.734326171875, + "kl_loss_17": 192.1750617980957, + "kl_loss_3": 2651.808801269531, + "kl_loss_6": 1723.8331176757813, + "learning_rate": 0.0008640495619849821, + "loss": 1333.3133, + "step": 2480 + }, + { + "ce_loss_12": 3.2500027775764466, + "ce_loss_17": 3.0080092668533327, + "ce_loss_23": 2.9124353170394897, + "ce_loss_3": 4.174582135677338, + "ce_loss_6": 3.6984178900718687, + "epoch": 0.249, + "grad_norm": 1336.0, + "kl_loss_12": 730.3832427978516, + "kl_loss_17": 191.85937957763673, + "kl_loss_3": 2647.4756591796877, + "kl_loss_6": 1711.3130432128905, + "learning_rate": 0.0008629601180209381, + "loss": 1319.4918, + "step": 2490 + }, + { + "ce_loss_12": 3.245604455471039, + "ce_loss_17": 3.002720868587494, + "ce_loss_23": 2.9027483463287354, + "ce_loss_3": 4.170182287693024, + "ce_loss_6": 3.6976698637008667, + "epoch": 0.25, + "grad_norm": 1824.0, + "kl_loss_12": 735.5652648925782, + "kl_loss_17": 189.70375900268556, + "kl_loss_3": 2622.413708496094, + "kl_loss_6": 1689.7849853515625, + "learning_rate": 0.000861867019052535, + "loss": 1340.4617, + "step": 2500 + }, + { + "ce_loss_12": 3.1825204610824587, + "ce_loss_17": 2.9324156284332275, + "ce_loss_23": 2.8359997272491455, + "ce_loss_3": 4.1463774800300595, + "ce_loss_6": 3.669266200065613, + "epoch": 0.251, + "grad_norm": 1200.0, + "kl_loss_12": 740.7624694824219, + "kl_loss_17": 189.69065551757814, + "kl_loss_3": 2712.4296630859376, + "kl_loss_6": 1767.862969970703, + "learning_rate": 0.0008607702760872678, + "loss": 1354.3875, + "step": 2510 + }, + { + "ce_loss_12": 3.269958865642548, + "ce_loss_17": 3.0331807374954223, + "ce_loss_23": 2.9399582028388975, + "ce_loss_3": 4.184882915019989, + "ce_loss_6": 3.7285060048103333, + "epoch": 0.252, + "grad_norm": 1432.0, + "kl_loss_12": 721.8887329101562, + "kl_loss_17": 184.12346801757812, + "kl_loss_3": 2600.9268432617187, + "kl_loss_6": 1702.8171997070312, + "learning_rate": 0.0008596699001693256, + "loss": 1345.4953, + "step": 2520 + }, + { + "ce_loss_12": 3.2692631840705872, + "ce_loss_17": 3.037832224369049, + "ce_loss_23": 2.950229322910309, + "ce_loss_3": 4.173362863063812, + "ce_loss_6": 3.7151461005210877, + "epoch": 0.253, + "grad_norm": 1512.0, + "kl_loss_12": 715.8109771728516, + "kl_loss_17": 181.9451873779297, + "kl_loss_3": 2598.9486694335938, + "kl_loss_6": 1684.0447570800782, + "learning_rate": 0.0008585659023794818, + "loss": 1337.3438, + "step": 2530 + }, + { + "ce_loss_12": 3.2618510007858275, + "ce_loss_17": 3.013910210132599, + "ce_loss_23": 2.913906764984131, + "ce_loss_3": 4.225406587123871, + "ce_loss_6": 3.7535743355751037, + "epoch": 0.254, + "grad_norm": 1448.0, + "kl_loss_12": 749.8170837402344, + "kl_loss_17": 191.1927291870117, + "kl_loss_3": 2725.9212646484375, + "kl_loss_6": 1783.3548706054687, + "learning_rate": 0.0008574582938349817, + "loss": 1347.65, + "step": 2540 + }, + { + "ce_loss_12": 3.2526952981948853, + "ce_loss_17": 2.9959097027778627, + "ce_loss_23": 2.8899311423301697, + "ce_loss_3": 4.210918033123017, + "ce_loss_6": 3.7343856573104857, + "epoch": 0.255, + "grad_norm": 1112.0, + "kl_loss_12": 771.1574676513671, + "kl_loss_17": 201.62697830200196, + "kl_loss_3": 2730.968017578125, + "kl_loss_6": 1795.5943908691406, + "learning_rate": 0.0008563470856894315, + "loss": 1334.5916, + "step": 2550 + }, + { + "ce_loss_12": 3.2368006229400637, + "ce_loss_17": 2.992919373512268, + "ce_loss_23": 2.8943570494651794, + "ce_loss_3": 4.182651102542877, + "ce_loss_6": 3.7243578314781187, + "epoch": 0.256, + "grad_norm": 1312.0, + "kl_loss_12": 745.0476654052734, + "kl_loss_17": 187.45094604492186, + "kl_loss_3": 2690.3960815429687, + "kl_loss_6": 1766.5554809570312, + "learning_rate": 0.0008552322891326845, + "loss": 1339.0016, + "step": 2560 + }, + { + "ce_loss_12": 3.2059734225273133, + "ce_loss_17": 2.963773548603058, + "ce_loss_23": 2.8643307089805603, + "ce_loss_3": 4.164939868450165, + "ce_loss_6": 3.686858570575714, + "epoch": 0.257, + "grad_norm": 1312.0, + "kl_loss_12": 737.233609008789, + "kl_loss_17": 191.0440788269043, + "kl_loss_3": 2699.5515258789064, + "kl_loss_6": 1752.0683654785157, + "learning_rate": 0.0008541139153907296, + "loss": 1320.2535, + "step": 2570 + }, + { + "ce_loss_12": 3.1582564234733583, + "ce_loss_17": 2.923947513103485, + "ce_loss_23": 2.829418158531189, + "ce_loss_3": 4.104415500164032, + "ce_loss_6": 3.6283998847007752, + "epoch": 0.258, + "grad_norm": 1320.0, + "kl_loss_12": 717.5931945800781, + "kl_loss_17": 183.29727096557616, + "kl_loss_3": 2659.9623168945313, + "kl_loss_6": 1722.821954345703, + "learning_rate": 0.0008529919757255782, + "loss": 1339.5538, + "step": 2580 + }, + { + "ce_loss_12": 3.174536180496216, + "ce_loss_17": 2.954156446456909, + "ce_loss_23": 2.864199459552765, + "ce_loss_3": 4.083700370788574, + "ce_loss_6": 3.6268170833587647, + "epoch": 0.259, + "grad_norm": 1432.0, + "kl_loss_12": 699.1353759765625, + "kl_loss_17": 181.89709854125977, + "kl_loss_3": 2583.3354248046876, + "kl_loss_6": 1667.4333862304688, + "learning_rate": 0.0008518664814351503, + "loss": 1297.9937, + "step": 2590 + }, + { + "ce_loss_12": 3.165991497039795, + "ce_loss_17": 2.923891270160675, + "ce_loss_23": 2.828059136867523, + "ce_loss_3": 4.127615463733673, + "ce_loss_6": 3.642699182033539, + "epoch": 0.26, + "grad_norm": 2720.0, + "kl_loss_12": 744.4950927734375, + "kl_loss_17": 194.33282699584962, + "kl_loss_3": 2720.434289550781, + "kl_loss_6": 1763.722235107422, + "learning_rate": 0.0008507374438531607, + "loss": 1388.6477, + "step": 2600 + }, + { + "ce_loss_12": 3.145507049560547, + "ce_loss_17": 2.906841003894806, + "ce_loss_23": 2.8138745784759522, + "ce_loss_3": 4.091469419002533, + "ce_loss_6": 3.6091453433036804, + "epoch": 0.261, + "grad_norm": 1096.0, + "kl_loss_12": 732.8025054931641, + "kl_loss_17": 189.48067474365234, + "kl_loss_3": 2663.9505004882812, + "kl_loss_6": 1725.3874877929688, + "learning_rate": 0.0008496048743490053, + "loss": 1325.5436, + "step": 2610 + }, + { + "ce_loss_12": 3.287692499160767, + "ce_loss_17": 3.0543363809585573, + "ce_loss_23": 2.955253612995148, + "ce_loss_3": 4.185577619075775, + "ce_loss_6": 3.732229971885681, + "epoch": 0.262, + "grad_norm": 1496.0, + "kl_loss_12": 721.2437561035156, + "kl_loss_17": 190.96459732055663, + "kl_loss_3": 2586.6703491210938, + "kl_loss_6": 1684.371563720703, + "learning_rate": 0.0008484687843276469, + "loss": 1310.6916, + "step": 2620 + }, + { + "ce_loss_12": 3.222010779380798, + "ce_loss_17": 2.9946094870567324, + "ce_loss_23": 2.888150525093079, + "ce_loss_3": 4.163958895206451, + "ce_loss_6": 3.6860865235328673, + "epoch": 0.263, + "grad_norm": 1384.0, + "kl_loss_12": 734.8125305175781, + "kl_loss_17": 209.23648681640626, + "kl_loss_3": 2669.965954589844, + "kl_loss_6": 1726.131591796875, + "learning_rate": 0.0008473291852294987, + "loss": 1349.202, + "step": 2630 + }, + { + "ce_loss_12": 3.2338871598243712, + "ce_loss_17": 3.003847348690033, + "ce_loss_23": 2.8962838411331178, + "ce_loss_3": 4.159780132770538, + "ce_loss_6": 3.6938066601753237, + "epoch": 0.264, + "grad_norm": 1344.0, + "kl_loss_12": 740.8481842041016, + "kl_loss_17": 235.8197814941406, + "kl_loss_3": 2672.1052856445312, + "kl_loss_6": 1734.7486999511718, + "learning_rate": 0.0008461860885303114, + "loss": 1336.9324, + "step": 2640 + }, + { + "ce_loss_12": 3.262851691246033, + "ce_loss_17": 3.0432870388031006, + "ce_loss_23": 2.926355171203613, + "ce_loss_3": 4.1671063780784605, + "ce_loss_6": 3.7086267828941346, + "epoch": 0.265, + "grad_norm": 1080.0, + "kl_loss_12": 725.8805877685547, + "kl_loss_17": 229.145906829834, + "kl_loss_3": 2597.6031494140625, + "kl_loss_6": 1684.2327880859375, + "learning_rate": 0.000845039505741056, + "loss": 1332.923, + "step": 2650 + }, + { + "ce_loss_12": 3.2442399501800536, + "ce_loss_17": 3.0072985649108888, + "ce_loss_23": 2.902949857711792, + "ce_loss_3": 4.180326998233795, + "ce_loss_6": 3.7002379536628722, + "epoch": 0.266, + "grad_norm": 1496.0, + "kl_loss_12": 762.3745544433593, + "kl_loss_17": 219.21652069091797, + "kl_loss_3": 2699.570947265625, + "kl_loss_6": 1747.6187866210937, + "learning_rate": 0.0008438894484078086, + "loss": 1386.7472, + "step": 2660 + }, + { + "ce_loss_12": 3.2508215069770814, + "ce_loss_17": 3.0207146883010862, + "ce_loss_23": 2.9180808544158934, + "ce_loss_3": 4.169582200050354, + "ce_loss_6": 3.7001988172531126, + "epoch": 0.267, + "grad_norm": 1568.0, + "kl_loss_12": 730.1839599609375, + "kl_loss_17": 202.67346878051757, + "kl_loss_3": 2626.172265625, + "kl_loss_6": 1690.5539245605469, + "learning_rate": 0.0008427359281116334, + "loss": 1322.8947, + "step": 2670 + }, + { + "ce_loss_12": 3.1615299701690676, + "ce_loss_17": 2.917899823188782, + "ce_loss_23": 2.8179174184799196, + "ce_loss_3": 4.110739576816559, + "ce_loss_6": 3.6515835881233216, + "epoch": 0.268, + "grad_norm": 1256.0, + "kl_loss_12": 747.1206237792969, + "kl_loss_17": 199.23356704711915, + "kl_loss_3": 2709.5385131835938, + "kl_loss_6": 1776.6613220214845, + "learning_rate": 0.0008415789564684673, + "loss": 1344.7199, + "step": 2680 + }, + { + "ce_loss_12": 3.37757648229599, + "ce_loss_17": 3.1467122197151185, + "ce_loss_23": 3.047198307514191, + "ce_loss_3": 4.283508396148681, + "ce_loss_6": 3.8278584480285645, + "epoch": 0.269, + "grad_norm": 1192.0, + "kl_loss_12": 729.712191772461, + "kl_loss_17": 196.99927520751953, + "kl_loss_3": 2565.828076171875, + "kl_loss_6": 1665.7049255371094, + "learning_rate": 0.0008404185451290017, + "loss": 1294.8127, + "step": 2690 + }, + { + "ce_loss_12": 3.2593582391738893, + "ce_loss_17": 3.0237264156341555, + "ce_loss_23": 2.9275999665260315, + "ce_loss_3": 4.178876841068268, + "ce_loss_6": 3.711337113380432, + "epoch": 0.27, + "grad_norm": 1216.0, + "kl_loss_12": 718.8045013427734, + "kl_loss_17": 188.78227767944335, + "kl_loss_3": 2614.6119018554687, + "kl_loss_6": 1683.182635498047, + "learning_rate": 0.0008392547057785661, + "loss": 1304.2764, + "step": 2700 + }, + { + "ce_loss_12": 3.2071892619132996, + "ce_loss_17": 2.961407685279846, + "ce_loss_23": 2.868341851234436, + "ce_loss_3": 4.171078884601593, + "ce_loss_6": 3.694565212726593, + "epoch": 0.271, + "grad_norm": 1320.0, + "kl_loss_12": 743.917333984375, + "kl_loss_17": 190.58706970214843, + "kl_loss_3": 2747.1872314453126, + "kl_loss_6": 1784.6308227539062, + "learning_rate": 0.0008380874501370098, + "loss": 1313.7463, + "step": 2710 + }, + { + "ce_loss_12": 3.1976038217544556, + "ce_loss_17": 2.950633406639099, + "ce_loss_23": 2.854086124897003, + "ce_loss_3": 4.162517547607422, + "ce_loss_6": 3.6708931803703306, + "epoch": 0.272, + "grad_norm": 1568.0, + "kl_loss_12": 749.6916198730469, + "kl_loss_17": 194.3929656982422, + "kl_loss_3": 2733.24736328125, + "kl_loss_6": 1759.3761169433594, + "learning_rate": 0.0008369167899585841, + "loss": 1342.7332, + "step": 2720 + }, + { + "ce_loss_12": 3.2849605321884154, + "ce_loss_17": 3.063574159145355, + "ce_loss_23": 2.9703721046447753, + "ce_loss_3": 4.180538165569305, + "ce_loss_6": 3.716990053653717, + "epoch": 0.273, + "grad_norm": 1424.0, + "kl_loss_12": 707.3694671630859, + "kl_loss_17": 186.6881980895996, + "kl_loss_3": 2562.8335083007814, + "kl_loss_6": 1644.158380126953, + "learning_rate": 0.0008357427370318238, + "loss": 1321.8501, + "step": 2730 + }, + { + "ce_loss_12": 3.251715886592865, + "ce_loss_17": 3.014530289173126, + "ce_loss_23": 2.9223742961883543, + "ce_loss_3": 4.193988764286042, + "ce_loss_6": 3.720304584503174, + "epoch": 0.274, + "grad_norm": 1512.0, + "kl_loss_12": 723.0832946777343, + "kl_loss_17": 185.57211303710938, + "kl_loss_3": 2666.404895019531, + "kl_loss_6": 1723.0100708007812, + "learning_rate": 0.0008345653031794292, + "loss": 1329.8638, + "step": 2740 + }, + { + "ce_loss_12": 3.2534584522247316, + "ce_loss_17": 3.019420337677002, + "ce_loss_23": 2.9230307936668396, + "ce_loss_3": 4.1833734035491945, + "ce_loss_6": 3.709990406036377, + "epoch": 0.275, + "grad_norm": 1336.0, + "kl_loss_12": 723.5877197265625, + "kl_loss_17": 189.66986541748048, + "kl_loss_3": 2639.2673217773436, + "kl_loss_6": 1699.3374938964844, + "learning_rate": 0.0008333845002581458, + "loss": 1312.4343, + "step": 2750 + }, + { + "ce_loss_12": 3.1938255786895753, + "ce_loss_17": 2.954642677307129, + "ce_loss_23": 2.8567110300064087, + "ce_loss_3": 4.144112658500672, + "ce_loss_6": 3.6723486185073853, + "epoch": 0.276, + "grad_norm": 1152.0, + "kl_loss_12": 743.0981384277344, + "kl_loss_17": 197.25850677490234, + "kl_loss_3": 2716.0939086914063, + "kl_loss_6": 1768.1518981933593, + "learning_rate": 0.0008322003401586462, + "loss": 1346.3904, + "step": 2760 + }, + { + "ce_loss_12": 3.206464159488678, + "ce_loss_17": 2.985082983970642, + "ce_loss_23": 2.8910439133644106, + "ce_loss_3": 4.114750480651855, + "ce_loss_6": 3.653266930580139, + "epoch": 0.277, + "grad_norm": 1000.0, + "kl_loss_12": 703.1947845458984, + "kl_loss_17": 192.4415397644043, + "kl_loss_3": 2582.096875, + "kl_loss_6": 1655.830126953125, + "learning_rate": 0.0008310128348054094, + "loss": 1268.9006, + "step": 2770 + }, + { + "ce_loss_12": 3.179832136631012, + "ce_loss_17": 2.950792682170868, + "ce_loss_23": 2.8574747800827027, + "ce_loss_3": 4.115865409374237, + "ce_loss_6": 3.647394621372223, + "epoch": 0.278, + "grad_norm": 2256.0, + "kl_loss_12": 719.2048980712891, + "kl_loss_17": 189.4114990234375, + "kl_loss_3": 2630.2521240234373, + "kl_loss_6": 1707.5929931640626, + "learning_rate": 0.0008298219961566008, + "loss": 1312.3221, + "step": 2780 + }, + { + "ce_loss_12": 3.1586066246032716, + "ce_loss_17": 2.9223502397537233, + "ce_loss_23": 2.826524806022644, + "ce_loss_3": 4.119323110580444, + "ce_loss_6": 3.6441922426223754, + "epoch": 0.279, + "grad_norm": 1088.0, + "kl_loss_12": 743.4344818115235, + "kl_loss_17": 195.6804672241211, + "kl_loss_3": 2735.8405517578126, + "kl_loss_6": 1786.5430541992187, + "learning_rate": 0.0008286278362039527, + "loss": 1324.4959, + "step": 2790 + }, + { + "ce_loss_12": 3.1875088691711424, + "ce_loss_17": 2.9585258960723877, + "ce_loss_23": 2.8505748391151426, + "ce_loss_3": 4.157631981372833, + "ce_loss_6": 3.6760962843894958, + "epoch": 0.28, + "grad_norm": 1600.0, + "kl_loss_12": 738.4578369140625, + "kl_loss_17": 207.52910079956055, + "kl_loss_3": 2740.503869628906, + "kl_loss_6": 1778.6897888183594, + "learning_rate": 0.0008274303669726426, + "loss": 1323.8701, + "step": 2800 + }, + { + "ce_loss_12": 3.10077189207077, + "ce_loss_17": 2.864953410625458, + "ce_loss_23": 2.765638995170593, + "ce_loss_3": 4.091872203350067, + "ce_loss_6": 3.593111681938171, + "epoch": 0.281, + "grad_norm": 1448.0, + "kl_loss_12": 734.8463958740234, + "kl_loss_17": 200.3710464477539, + "kl_loss_3": 2769.092578125, + "kl_loss_6": 1774.5084228515625, + "learning_rate": 0.0008262296005211721, + "loss": 1322.1288, + "step": 2810 + }, + { + "ce_loss_12": 3.217443346977234, + "ce_loss_17": 2.980482351779938, + "ce_loss_23": 2.88399224281311, + "ce_loss_3": 4.165814614295959, + "ce_loss_6": 3.692637538909912, + "epoch": 0.282, + "grad_norm": 1248.0, + "kl_loss_12": 731.9698852539062, + "kl_loss_17": 193.3557373046875, + "kl_loss_3": 2678.65576171875, + "kl_loss_6": 1738.8185180664063, + "learning_rate": 0.0008250255489412463, + "loss": 1315.8705, + "step": 2820 + }, + { + "ce_loss_12": 3.302238166332245, + "ce_loss_17": 3.0774773359298706, + "ce_loss_23": 2.980461800098419, + "ce_loss_3": 4.230216920375824, + "ce_loss_6": 3.7631787419319154, + "epoch": 0.283, + "grad_norm": 1840.0, + "kl_loss_12": 718.8991790771485, + "kl_loss_17": 195.19951171875, + "kl_loss_3": 2629.9222412109375, + "kl_loss_6": 1702.7022888183594, + "learning_rate": 0.0008238182243576511, + "loss": 1313.1373, + "step": 2830 + }, + { + "ce_loss_12": 3.2571099162101746, + "ce_loss_17": 3.0421578764915465, + "ce_loss_23": 2.945399808883667, + "ce_loss_3": 4.120764565467835, + "ce_loss_6": 3.6864093422889708, + "epoch": 0.284, + "grad_norm": 1576.0, + "kl_loss_12": 704.773062133789, + "kl_loss_17": 197.55138092041017, + "kl_loss_3": 2486.9965942382814, + "kl_loss_6": 1617.7297912597655, + "learning_rate": 0.0008226076389281315, + "loss": 1278.3004, + "step": 2840 + }, + { + "ce_loss_12": 3.3284648776054384, + "ce_loss_17": 3.093518555164337, + "ce_loss_23": 2.9918949365615846, + "ce_loss_3": 4.214064216613769, + "ce_loss_6": 3.745799195766449, + "epoch": 0.285, + "grad_norm": 1376.0, + "kl_loss_12": 755.4556213378906, + "kl_loss_17": 207.63405532836913, + "kl_loss_3": 2600.1022216796873, + "kl_loss_6": 1668.0916687011718, + "learning_rate": 0.0008213938048432696, + "loss": 1291.1268, + "step": 2850 + }, + { + "ce_loss_12": 3.253906548023224, + "ce_loss_17": 3.01155344247818, + "ce_loss_23": 2.9103656888008116, + "ce_loss_3": 4.147986555099488, + "ce_loss_6": 3.6820557951927184, + "epoch": 0.286, + "grad_norm": 1920.0, + "kl_loss_12": 755.4055480957031, + "kl_loss_17": 208.6363540649414, + "kl_loss_3": 2604.3248779296873, + "kl_loss_6": 1675.7506896972657, + "learning_rate": 0.0008201767343263612, + "loss": 1324.3863, + "step": 2860 + }, + { + "ce_loss_12": 3.207475483417511, + "ce_loss_17": 2.957265591621399, + "ce_loss_23": 2.860222041606903, + "ce_loss_3": 4.1249484658241276, + "ce_loss_6": 3.6550333380699156, + "epoch": 0.287, + "grad_norm": 1488.0, + "kl_loss_12": 772.2817749023437, + "kl_loss_17": 198.63464965820313, + "kl_loss_3": 2653.582800292969, + "kl_loss_6": 1722.738299560547, + "learning_rate": 0.0008189564396332927, + "loss": 1294.6703, + "step": 2870 + }, + { + "ce_loss_12": 3.206028151512146, + "ce_loss_17": 2.95153146982193, + "ce_loss_23": 2.849563407897949, + "ce_loss_3": 4.126707303524017, + "ce_loss_6": 3.641816198825836, + "epoch": 0.288, + "grad_norm": 1304.0, + "kl_loss_12": 765.9557800292969, + "kl_loss_17": 194.0538459777832, + "kl_loss_3": 2655.191064453125, + "kl_loss_6": 1697.8934020996094, + "learning_rate": 0.0008177329330524181, + "loss": 1329.9287, + "step": 2880 + }, + { + "ce_loss_12": 3.2430716037750242, + "ce_loss_17": 2.993469214439392, + "ce_loss_23": 2.8990333795547487, + "ce_loss_3": 4.125063967704773, + "ce_loss_6": 3.6682780027389525, + "epoch": 0.289, + "grad_norm": 1480.0, + "kl_loss_12": 735.3319274902344, + "kl_loss_17": 189.49797439575195, + "kl_loss_3": 2550.7237915039063, + "kl_loss_6": 1647.8123718261718, + "learning_rate": 0.0008165062269044352, + "loss": 1295.6047, + "step": 2890 + }, + { + "ce_loss_12": 3.1921915769577027, + "ce_loss_17": 2.9539024472236632, + "ce_loss_23": 2.8567499995231627, + "ce_loss_3": 4.125063931941986, + "ce_loss_6": 3.640816259384155, + "epoch": 0.29, + "grad_norm": 1568.0, + "kl_loss_12": 743.7144348144532, + "kl_loss_17": 191.56704406738282, + "kl_loss_3": 2669.54541015625, + "kl_loss_6": 1707.029766845703, + "learning_rate": 0.0008152763335422613, + "loss": 1328.2095, + "step": 2900 + }, + { + "ce_loss_12": 3.182156467437744, + "ce_loss_17": 2.939117431640625, + "ce_loss_23": 2.8404495120048523, + "ce_loss_3": 4.111138701438904, + "ce_loss_6": 3.6335360765457154, + "epoch": 0.291, + "grad_norm": 1496.0, + "kl_loss_12": 741.1706695556641, + "kl_loss_17": 189.98179397583007, + "kl_loss_3": 2651.492712402344, + "kl_loss_6": 1711.0212951660155, + "learning_rate": 0.0008140432653509088, + "loss": 1305.6396, + "step": 2910 + }, + { + "ce_loss_12": 3.226067852973938, + "ce_loss_17": 2.9837878346443176, + "ce_loss_23": 2.886159098148346, + "ce_loss_3": 4.133246636390686, + "ce_loss_6": 3.665525996685028, + "epoch": 0.292, + "grad_norm": 1616.0, + "kl_loss_12": 738.4330993652344, + "kl_loss_17": 188.00680923461914, + "kl_loss_3": 2629.9974609375, + "kl_loss_6": 1691.3132446289062, + "learning_rate": 0.0008128070347473608, + "loss": 1304.7395, + "step": 2920 + }, + { + "ce_loss_12": 3.2389190077781675, + "ce_loss_17": 3.000583600997925, + "ce_loss_23": 2.903918969631195, + "ce_loss_3": 4.187899720668793, + "ce_loss_6": 3.700623893737793, + "epoch": 0.293, + "grad_norm": 1416.0, + "kl_loss_12": 739.0092895507812, + "kl_loss_17": 188.44521789550782, + "kl_loss_3": 2695.9560546875, + "kl_loss_6": 1732.5191284179687, + "learning_rate": 0.0008115676541804455, + "loss": 1312.3934, + "step": 2930 + }, + { + "ce_loss_12": 3.2302377104759215, + "ce_loss_17": 2.9969589352607726, + "ce_loss_23": 2.906096911430359, + "ce_loss_3": 4.135867977142334, + "ce_loss_6": 3.672209370136261, + "epoch": 0.294, + "grad_norm": 1224.0, + "kl_loss_12": 711.2619415283203, + "kl_loss_17": 182.30578384399413, + "kl_loss_3": 2607.3838623046877, + "kl_loss_6": 1670.5546630859376, + "learning_rate": 0.0008103251361307119, + "loss": 1308.172, + "step": 2940 + }, + { + "ce_loss_12": 3.265262854099274, + "ce_loss_17": 3.027577888965607, + "ce_loss_23": 2.9357951879501343, + "ce_loss_3": 4.187027478218079, + "ce_loss_6": 3.7105315566062926, + "epoch": 0.295, + "grad_norm": 1536.0, + "kl_loss_12": 726.9580688476562, + "kl_loss_17": 186.03650360107423, + "kl_loss_3": 2650.1802612304687, + "kl_loss_6": 1708.4851806640625, + "learning_rate": 0.0008090794931103026, + "loss": 1300.7279, + "step": 2950 + }, + { + "ce_loss_12": 3.2232770323753357, + "ce_loss_17": 3.00313116312027, + "ce_loss_23": 2.9133943557739257, + "ce_loss_3": 4.136290061473846, + "ce_loss_6": 3.6703510642051698, + "epoch": 0.296, + "grad_norm": 1352.0, + "kl_loss_12": 700.5950714111328, + "kl_loss_17": 179.31483535766603, + "kl_loss_3": 2576.780908203125, + "kl_loss_6": 1658.1287353515625, + "learning_rate": 0.0008078307376628291, + "loss": 1289.8848, + "step": 2960 + }, + { + "ce_loss_12": 3.2900136351585387, + "ce_loss_17": 3.0622668147087095, + "ce_loss_23": 2.9745339155197144, + "ce_loss_3": 4.168375945091247, + "ce_loss_6": 3.721718990802765, + "epoch": 0.297, + "grad_norm": 1456.0, + "kl_loss_12": 683.8927856445313, + "kl_loss_17": 175.0184425354004, + "kl_loss_3": 2501.225134277344, + "kl_loss_6": 1614.7021057128907, + "learning_rate": 0.000806578882363245, + "loss": 1251.6572, + "step": 2970 + }, + { + "ce_loss_12": 3.215678596496582, + "ce_loss_17": 2.984221637248993, + "ce_loss_23": 2.896929371356964, + "ce_loss_3": 4.107896149158478, + "ce_loss_6": 3.6536261081695556, + "epoch": 0.298, + "grad_norm": 1344.0, + "kl_loss_12": 700.3212341308594, + "kl_loss_17": 177.6427230834961, + "kl_loss_3": 2580.3680419921875, + "kl_loss_6": 1669.9347412109375, + "learning_rate": 0.0008053239398177191, + "loss": 1308.9184, + "step": 2980 + }, + { + "ce_loss_12": 3.2067503094673158, + "ce_loss_17": 2.97095730304718, + "ce_loss_23": 2.8793780326843263, + "ce_loss_3": 4.128289008140564, + "ce_loss_6": 3.6655551314353945, + "epoch": 0.299, + "grad_norm": 1312.0, + "kl_loss_12": 707.6057037353515, + "kl_loss_17": 180.79422760009766, + "kl_loss_3": 2616.2923095703127, + "kl_loss_6": 1685.277392578125, + "learning_rate": 0.0008040659226635089, + "loss": 1325.6742, + "step": 2990 + }, + { + "ce_loss_12": 3.3263411283493043, + "ce_loss_17": 3.093203127384186, + "ce_loss_23": 2.994673478603363, + "ce_loss_3": 4.216775393486023, + "ce_loss_6": 3.767217791080475, + "epoch": 0.3, + "grad_norm": 1336.0, + "kl_loss_12": 730.0472717285156, + "kl_loss_17": 191.39580917358398, + "kl_loss_3": 2584.9020751953126, + "kl_loss_6": 1672.3069152832031, + "learning_rate": 0.0008028048435688333, + "loss": 1283.5127, + "step": 3000 + }, + { + "ce_loss_12": 3.210022675991058, + "ce_loss_17": 2.974554407596588, + "ce_loss_23": 2.884077084064484, + "ce_loss_3": 4.154890215396881, + "ce_loss_6": 3.676375377178192, + "epoch": 0.301, + "grad_norm": 1432.0, + "kl_loss_12": 724.8402709960938, + "kl_loss_17": 185.38192596435547, + "kl_loss_3": 2683.6630615234376, + "kl_loss_6": 1731.5496398925782, + "learning_rate": 0.0008015407152327448, + "loss": 1312.2201, + "step": 3010 + }, + { + "ce_loss_12": 3.2449921011924743, + "ce_loss_17": 3.015483093261719, + "ce_loss_23": 2.9206595540046694, + "ce_loss_3": 4.174470937252044, + "ce_loss_6": 3.6962149143218994, + "epoch": 0.302, + "grad_norm": 1504.0, + "kl_loss_12": 718.8039825439453, + "kl_loss_17": 183.68963470458985, + "kl_loss_3": 2661.935192871094, + "kl_loss_6": 1699.5322021484376, + "learning_rate": 0.0008002735503850016, + "loss": 1309.9627, + "step": 3020 + }, + { + "ce_loss_12": 3.155445086956024, + "ce_loss_17": 2.9152156472206117, + "ce_loss_23": 2.820170259475708, + "ce_loss_3": 4.101059782505035, + "ce_loss_6": 3.62444384098053, + "epoch": 0.303, + "grad_norm": 1736.0, + "kl_loss_12": 735.0263610839844, + "kl_loss_17": 188.96228256225587, + "kl_loss_3": 2687.139697265625, + "kl_loss_6": 1733.0789978027344, + "learning_rate": 0.0007990033617859396, + "loss": 1329.7411, + "step": 3030 + }, + { + "ce_loss_12": 3.1969924449920653, + "ce_loss_17": 2.9681341648101807, + "ce_loss_23": 2.8718268156051634, + "ce_loss_3": 4.10785653591156, + "ce_loss_6": 3.641191875934601, + "epoch": 0.304, + "grad_norm": 1224.0, + "kl_loss_12": 708.8882843017578, + "kl_loss_17": 195.2229965209961, + "kl_loss_3": 2582.933703613281, + "kl_loss_6": 1657.4207885742187, + "learning_rate": 0.000797730162226344, + "loss": 1257.9271, + "step": 3040 + }, + { + "ce_loss_12": 3.218884325027466, + "ce_loss_17": 2.9874356985092163, + "ce_loss_23": 2.8883252143859863, + "ce_loss_3": 4.132557225227356, + "ce_loss_6": 3.662423384189606, + "epoch": 0.305, + "grad_norm": 1232.0, + "kl_loss_12": 713.8391510009766, + "kl_loss_17": 198.42729873657225, + "kl_loss_3": 2602.8368530273438, + "kl_loss_6": 1676.0867919921875, + "learning_rate": 0.0007964539645273203, + "loss": 1286.1518, + "step": 3050 + }, + { + "ce_loss_12": 3.217023992538452, + "ce_loss_17": 3.0118243217468263, + "ce_loss_23": 2.911352443695068, + "ce_loss_3": 4.115066957473755, + "ce_loss_6": 3.662225902080536, + "epoch": 0.306, + "grad_norm": 1104.0, + "kl_loss_12": 685.2395874023438, + "kl_loss_17": 195.61914443969727, + "kl_loss_3": 2540.3358764648438, + "kl_loss_6": 1634.1998291015625, + "learning_rate": 0.000795174781540165, + "loss": 1285.0721, + "step": 3060 + }, + { + "ce_loss_12": 3.2873870611190794, + "ce_loss_17": 3.0870782017707823, + "ce_loss_23": 2.981433629989624, + "ce_loss_3": 4.164851725101471, + "ce_loss_6": 3.7137848019599913, + "epoch": 0.307, + "grad_norm": 1088.0, + "kl_loss_12": 685.9832824707031, + "kl_loss_17": 225.74871520996095, + "kl_loss_3": 2501.773645019531, + "kl_loss_6": 1609.5473571777343, + "learning_rate": 0.0007938926261462366, + "loss": 1287.3006, + "step": 3070 + }, + { + "ce_loss_12": 3.242907667160034, + "ce_loss_17": 3.0306016206741333, + "ce_loss_23": 2.930326449871063, + "ce_loss_3": 4.126864385604859, + "ce_loss_6": 3.6658257126808165, + "epoch": 0.308, + "grad_norm": 1416.0, + "kl_loss_12": 691.0276092529297, + "kl_loss_17": 204.1019416809082, + "kl_loss_3": 2545.3953247070312, + "kl_loss_6": 1630.110577392578, + "learning_rate": 0.0007926075112568258, + "loss": 1297.815, + "step": 3080 + }, + { + "ce_loss_12": 3.240575063228607, + "ce_loss_17": 3.0216240525245666, + "ce_loss_23": 2.926092779636383, + "ce_loss_3": 4.147086489200592, + "ce_loss_6": 3.6847408771514893, + "epoch": 0.309, + "grad_norm": 1264.0, + "kl_loss_12": 706.5309020996094, + "kl_loss_17": 193.5285789489746, + "kl_loss_3": 2578.653430175781, + "kl_loss_6": 1659.9425231933594, + "learning_rate": 0.0007913194498130252, + "loss": 1266.5773, + "step": 3090 + }, + { + "ce_loss_12": 3.18501592874527, + "ce_loss_17": 2.95448499917984, + "ce_loss_23": 2.855408418178558, + "ce_loss_3": 4.105933094024659, + "ce_loss_6": 3.6342186093330384, + "epoch": 0.31, + "grad_norm": 2800.0, + "kl_loss_12": 705.5955200195312, + "kl_loss_17": 189.90459060668945, + "kl_loss_3": 2598.2001342773438, + "kl_loss_6": 1666.8767150878907, + "learning_rate": 0.0007900284547855992, + "loss": 1296.8383, + "step": 3100 + }, + { + "ce_loss_12": 3.1849052548408507, + "ce_loss_17": 2.965952789783478, + "ce_loss_23": 2.876673400402069, + "ce_loss_3": 4.0776886463165285, + "ce_loss_6": 3.616372013092041, + "epoch": 0.311, + "grad_norm": 1296.0, + "kl_loss_12": 694.897573852539, + "kl_loss_17": 184.4494400024414, + "kl_loss_3": 2557.008825683594, + "kl_loss_6": 1641.6108154296876, + "learning_rate": 0.0007887345391748532, + "loss": 1293.7499, + "step": 3110 + }, + { + "ce_loss_12": 3.282544469833374, + "ce_loss_17": 3.067942702770233, + "ce_loss_23": 2.9785973906517027, + "ce_loss_3": 4.157709920406342, + "ce_loss_6": 3.7043622016906737, + "epoch": 0.312, + "grad_norm": 1056.0, + "kl_loss_12": 681.447378540039, + "kl_loss_17": 179.30100173950194, + "kl_loss_3": 2500.595910644531, + "kl_loss_6": 1598.087335205078, + "learning_rate": 0.0007874377160105036, + "loss": 1237.0448, + "step": 3120 + }, + { + "ce_loss_12": 3.198904585838318, + "ce_loss_17": 2.9811079144477843, + "ce_loss_23": 2.8912189245224, + "ce_loss_3": 4.140334153175354, + "ce_loss_6": 3.678908574581146, + "epoch": 0.313, + "grad_norm": 1272.0, + "kl_loss_12": 686.0269927978516, + "kl_loss_17": 178.5546676635742, + "kl_loss_3": 2648.404260253906, + "kl_loss_6": 1728.5712341308595, + "learning_rate": 0.0007861379983515449, + "loss": 1332.7811, + "step": 3130 + }, + { + "ce_loss_12": 3.2717678546905518, + "ce_loss_17": 3.0484447836875916, + "ce_loss_23": 2.9575275182724, + "ce_loss_3": 4.181915581226349, + "ce_loss_6": 3.7253427028656008, + "epoch": 0.314, + "grad_norm": 1480.0, + "kl_loss_12": 697.9916107177735, + "kl_loss_17": 179.39368896484376, + "kl_loss_3": 2580.733349609375, + "kl_loss_6": 1677.2579040527344, + "learning_rate": 0.0007848353992861195, + "loss": 1275.3266, + "step": 3140 + }, + { + "ce_loss_12": 3.3690295577049256, + "ce_loss_17": 3.127369058132172, + "ce_loss_23": 3.0289427757263185, + "ce_loss_3": 4.267640101909637, + "ce_loss_6": 3.810963344573975, + "epoch": 0.315, + "grad_norm": 1080.0, + "kl_loss_12": 724.3310852050781, + "kl_loss_17": 190.38062744140626, + "kl_loss_3": 2579.9655883789064, + "kl_loss_6": 1680.8723693847655, + "learning_rate": 0.0007835299319313853, + "loss": 1300.433, + "step": 3150 + }, + { + "ce_loss_12": 3.2413835883140565, + "ce_loss_17": 3.02094886302948, + "ce_loss_23": 2.9331553101539614, + "ce_loss_3": 4.131970524787903, + "ce_loss_6": 3.6723185420036315, + "epoch": 0.316, + "grad_norm": 1640.0, + "kl_loss_12": 692.0519561767578, + "kl_loss_17": 178.92868041992188, + "kl_loss_3": 2534.8064575195312, + "kl_loss_6": 1626.7398010253905, + "learning_rate": 0.0007822216094333848, + "loss": 1303.1312, + "step": 3160 + }, + { + "ce_loss_12": 3.256673276424408, + "ce_loss_17": 3.0264561772346497, + "ce_loss_23": 2.9382986426353455, + "ce_loss_3": 4.18041627407074, + "ce_loss_6": 3.712412190437317, + "epoch": 0.317, + "grad_norm": 1112.0, + "kl_loss_12": 706.8246185302735, + "kl_loss_17": 182.26864318847657, + "kl_loss_3": 2609.5548583984373, + "kl_loss_6": 1678.9423522949219, + "learning_rate": 0.0007809104449669101, + "loss": 1279.1823, + "step": 3170 + }, + { + "ce_loss_12": 3.1929746150970457, + "ce_loss_17": 2.9713391065597534, + "ce_loss_23": 2.8822572350502016, + "ce_loss_3": 4.085263776779175, + "ce_loss_6": 3.6272388219833376, + "epoch": 0.318, + "grad_norm": 1240.0, + "kl_loss_12": 689.0186950683594, + "kl_loss_17": 180.23409118652344, + "kl_loss_3": 2527.595251464844, + "kl_loss_6": 1631.381817626953, + "learning_rate": 0.0007795964517353734, + "loss": 1263.9521, + "step": 3180 + }, + { + "ce_loss_12": 3.2025929570198057, + "ce_loss_17": 2.9794071316719055, + "ce_loss_23": 2.888680374622345, + "ce_loss_3": 4.129607903957367, + "ce_loss_6": 3.6572258710861205, + "epoch": 0.319, + "grad_norm": 1528.0, + "kl_loss_12": 702.7746612548829, + "kl_loss_17": 186.2022560119629, + "kl_loss_3": 2622.777136230469, + "kl_loss_6": 1677.1491333007812, + "learning_rate": 0.000778279642970672, + "loss": 1261.2394, + "step": 3190 + }, + { + "ce_loss_12": 3.201473259925842, + "ce_loss_17": 2.9837117552757264, + "ce_loss_23": 2.893271243572235, + "ce_loss_3": 4.092443692684173, + "ce_loss_6": 3.6404433250427246, + "epoch": 0.32, + "grad_norm": 1512.0, + "kl_loss_12": 695.3761199951172, + "kl_loss_17": 183.10058822631837, + "kl_loss_3": 2549.80869140625, + "kl_loss_6": 1643.3399658203125, + "learning_rate": 0.0007769600319330552, + "loss": 1253.8471, + "step": 3200 + }, + { + "ce_loss_12": 3.236275517940521, + "ce_loss_17": 3.002731680870056, + "ce_loss_23": 2.913220691680908, + "ce_loss_3": 4.180563879013062, + "ce_loss_6": 3.701882815361023, + "epoch": 0.321, + "grad_norm": 2032.0, + "kl_loss_12": 709.1263793945312, + "kl_loss_17": 187.23024673461913, + "kl_loss_3": 2652.1067626953127, + "kl_loss_6": 1701.4243713378905, + "learning_rate": 0.0007756376319109917, + "loss": 1291.4088, + "step": 3210 + }, + { + "ce_loss_12": 3.271826946735382, + "ce_loss_17": 3.0468537330627443, + "ce_loss_23": 2.956276309490204, + "ce_loss_3": 4.156561923027039, + "ce_loss_6": 3.7022555470466614, + "epoch": 0.322, + "grad_norm": 1384.0, + "kl_loss_12": 700.7278533935547, + "kl_loss_17": 182.38882293701172, + "kl_loss_3": 2540.858837890625, + "kl_loss_6": 1635.3891052246095, + "learning_rate": 0.0007743124562210351, + "loss": 1243.3936, + "step": 3220 + }, + { + "ce_loss_12": 3.283222794532776, + "ce_loss_17": 3.0634321093559267, + "ce_loss_23": 2.972226691246033, + "ce_loss_3": 4.1698635697364805, + "ce_loss_6": 3.7179332256317137, + "epoch": 0.323, + "grad_norm": 1552.0, + "kl_loss_12": 693.1977661132812, + "kl_loss_17": 181.76399841308594, + "kl_loss_3": 2554.07998046875, + "kl_loss_6": 1639.9646362304688, + "learning_rate": 0.0007729845182076895, + "loss": 1271.4533, + "step": 3230 + }, + { + "ce_loss_12": 3.21431725025177, + "ce_loss_17": 2.994157004356384, + "ce_loss_23": 2.908644068241119, + "ce_loss_3": 4.099689674377442, + "ce_loss_6": 3.6397662997245788, + "epoch": 0.324, + "grad_norm": 1392.0, + "kl_loss_12": 688.5107879638672, + "kl_loss_17": 176.1745262145996, + "kl_loss_3": 2516.702099609375, + "kl_loss_6": 1616.052734375, + "learning_rate": 0.0007716538312432765, + "loss": 1282.8004, + "step": 3240 + }, + { + "ce_loss_12": 3.185789632797241, + "ce_loss_17": 2.956033003330231, + "ce_loss_23": 2.864128386974335, + "ce_loss_3": 4.1107929110527035, + "ce_loss_6": 3.636369824409485, + "epoch": 0.325, + "grad_norm": 1216.0, + "kl_loss_12": 711.5099182128906, + "kl_loss_17": 183.86663436889648, + "kl_loss_3": 2620.814343261719, + "kl_loss_6": 1682.6124145507813, + "learning_rate": 0.0007703204087277988, + "loss": 1294.602, + "step": 3250 + }, + { + "ce_loss_12": 3.261729085445404, + "ce_loss_17": 3.041890096664429, + "ce_loss_23": 2.956402611732483, + "ce_loss_3": 4.137962424755097, + "ce_loss_6": 3.6820000886917112, + "epoch": 0.326, + "grad_norm": 1248.0, + "kl_loss_12": 679.4031799316406, + "kl_loss_17": 173.54660720825194, + "kl_loss_3": 2491.391455078125, + "kl_loss_6": 1592.1739929199218, + "learning_rate": 0.0007689842640888063, + "loss": 1245.8531, + "step": 3260 + }, + { + "ce_loss_12": 3.263756537437439, + "ce_loss_17": 3.0447650790214538, + "ce_loss_23": 2.9546585202217104, + "ce_loss_3": 4.146595048904419, + "ce_loss_6": 3.692533540725708, + "epoch": 0.327, + "grad_norm": 1288.0, + "kl_loss_12": 691.6199554443359, + "kl_loss_17": 177.55080032348633, + "kl_loss_3": 2488.0351318359376, + "kl_loss_6": 1607.0404113769532, + "learning_rate": 0.0007676454107812607, + "loss": 1254.7438, + "step": 3270 + }, + { + "ce_loss_12": 3.2167189359664916, + "ce_loss_17": 2.9856369495391846, + "ce_loss_23": 2.896033489704132, + "ce_loss_3": 4.135352909564972, + "ce_loss_6": 3.658247637748718, + "epoch": 0.328, + "grad_norm": 1736.0, + "kl_loss_12": 701.1315216064453, + "kl_loss_17": 179.1365234375, + "kl_loss_3": 2609.4743774414064, + "kl_loss_6": 1656.1321044921874, + "learning_rate": 0.0007663038622873999, + "loss": 1264.0107, + "step": 3280 + }, + { + "ce_loss_12": 3.255653953552246, + "ce_loss_17": 3.034635305404663, + "ce_loss_23": 2.945936155319214, + "ce_loss_3": 4.157212674617767, + "ce_loss_6": 3.6950213432312013, + "epoch": 0.329, + "grad_norm": 1336.0, + "kl_loss_12": 692.009228515625, + "kl_loss_17": 176.38537139892577, + "kl_loss_3": 2562.4224609375, + "kl_loss_6": 1639.9743591308593, + "learning_rate": 0.0007649596321166025, + "loss": 1248.229, + "step": 3290 + }, + { + "ce_loss_12": 3.1578006744384766, + "ce_loss_17": 2.933754026889801, + "ce_loss_23": 2.8455958724021913, + "ce_loss_3": 4.034773278236389, + "ce_loss_6": 3.5791631937026978, + "epoch": 0.33, + "grad_norm": 1472.0, + "kl_loss_12": 680.4518218994141, + "kl_loss_17": 171.55679092407226, + "kl_loss_3": 2492.165576171875, + "kl_loss_6": 1597.7407775878905, + "learning_rate": 0.0007636127338052513, + "loss": 1258.4492, + "step": 3300 + }, + { + "ce_loss_12": 3.2585213422775268, + "ce_loss_17": 3.0327520966529846, + "ce_loss_23": 2.942645025253296, + "ce_loss_3": 4.181741786003113, + "ce_loss_6": 3.7107974171638487, + "epoch": 0.331, + "grad_norm": 1120.0, + "kl_loss_12": 706.8980224609375, + "kl_loss_17": 179.87042694091798, + "kl_loss_3": 2614.3381225585936, + "kl_loss_6": 1678.1480651855468, + "learning_rate": 0.0007622631809165971, + "loss": 1264.555, + "step": 3310 + }, + { + "ce_loss_12": 3.2314629554748535, + "ce_loss_17": 3.0195695042610167, + "ce_loss_23": 2.935496175289154, + "ce_loss_3": 4.101777184009552, + "ce_loss_6": 3.648426377773285, + "epoch": 0.332, + "grad_norm": 1296.0, + "kl_loss_12": 658.7951812744141, + "kl_loss_17": 164.9975357055664, + "kl_loss_3": 2434.165576171875, + "kl_loss_6": 1549.5012145996093, + "learning_rate": 0.000760910987040623, + "loss": 1228.0617, + "step": 3320 + }, + { + "ce_loss_12": 3.251615858078003, + "ce_loss_17": 3.014969897270203, + "ce_loss_23": 2.9256165981292725, + "ce_loss_3": 4.170702481269837, + "ce_loss_6": 3.6970121264457703, + "epoch": 0.333, + "grad_norm": 1160.0, + "kl_loss_12": 710.6054077148438, + "kl_loss_17": 178.40473556518555, + "kl_loss_3": 2621.5610961914062, + "kl_loss_6": 1686.7944213867188, + "learning_rate": 0.000759556165793906, + "loss": 1261.5679, + "step": 3330 + }, + { + "ce_loss_12": 3.2469671607017516, + "ce_loss_17": 3.019290053844452, + "ce_loss_23": 2.9311923027038573, + "ce_loss_3": 4.1470073699951175, + "ce_loss_6": 3.6872513771057127, + "epoch": 0.334, + "grad_norm": 1208.0, + "kl_loss_12": 697.2853240966797, + "kl_loss_17": 175.45677642822267, + "kl_loss_3": 2558.788195800781, + "kl_loss_6": 1639.9633178710938, + "learning_rate": 0.000758198730819481, + "loss": 1278.1193, + "step": 3340 + }, + { + "ce_loss_12": 3.2000674843788146, + "ce_loss_17": 2.9849128246307375, + "ce_loss_23": 2.9033224821090697, + "ce_loss_3": 4.107236564159393, + "ce_loss_6": 3.6453574776649473, + "epoch": 0.335, + "grad_norm": 1112.0, + "kl_loss_12": 669.4688842773437, + "kl_loss_17": 170.60084915161133, + "kl_loss_3": 2537.7765380859373, + "kl_loss_6": 1621.6796264648438, + "learning_rate": 0.0007568386957867032, + "loss": 1253.9214, + "step": 3350 + }, + { + "ce_loss_12": 3.2591148614883423, + "ce_loss_17": 3.034511923789978, + "ce_loss_23": 2.944902002811432, + "ce_loss_3": 4.14842437505722, + "ce_loss_6": 3.6896276235580445, + "epoch": 0.336, + "grad_norm": 1320.0, + "kl_loss_12": 693.616488647461, + "kl_loss_17": 175.55592575073243, + "kl_loss_3": 2526.75390625, + "kl_loss_6": 1627.826483154297, + "learning_rate": 0.0007554760743911103, + "loss": 1270.8844, + "step": 3360 + }, + { + "ce_loss_12": 3.183031690120697, + "ce_loss_17": 2.967649185657501, + "ce_loss_23": 2.881660187244415, + "ce_loss_3": 4.073859429359436, + "ce_loss_6": 3.619278872013092, + "epoch": 0.337, + "grad_norm": 1112.0, + "kl_loss_12": 669.0850189208984, + "kl_loss_17": 170.41897354125976, + "kl_loss_3": 2526.225146484375, + "kl_loss_6": 1611.0459899902344, + "learning_rate": 0.0007541108803542846, + "loss": 1283.7165, + "step": 3370 + }, + { + "ce_loss_12": 3.212652266025543, + "ce_loss_17": 2.996922516822815, + "ce_loss_23": 2.9127749681472777, + "ce_loss_3": 4.11750476360321, + "ce_loss_6": 3.647727131843567, + "epoch": 0.338, + "grad_norm": 1440.0, + "kl_loss_12": 682.5028564453125, + "kl_loss_17": 172.18799591064453, + "kl_loss_3": 2565.983251953125, + "kl_loss_6": 1622.7668823242188, + "learning_rate": 0.0007527431274237149, + "loss": 1317.9412, + "step": 3380 + }, + { + "ce_loss_12": 3.197421908378601, + "ce_loss_17": 2.976374638080597, + "ce_loss_23": 2.892578399181366, + "ce_loss_3": 4.078976202011108, + "ce_loss_6": 3.6260195136070252, + "epoch": 0.339, + "grad_norm": 1072.0, + "kl_loss_12": 672.5755462646484, + "kl_loss_17": 172.84192657470703, + "kl_loss_3": 2514.9993530273437, + "kl_loss_6": 1604.3666870117188, + "learning_rate": 0.0007513728293726579, + "loss": 1248.9244, + "step": 3390 + }, + { + "ce_loss_12": 3.295226848125458, + "ce_loss_17": 3.073020327091217, + "ce_loss_23": 2.9877483010292054, + "ce_loss_3": 4.1686977505683895, + "ce_loss_6": 3.7176806807518004, + "epoch": 0.34, + "grad_norm": 1376.0, + "kl_loss_12": 684.651318359375, + "kl_loss_17": 174.42405395507814, + "kl_loss_3": 2515.1052490234374, + "kl_loss_6": 1609.3935485839843, + "learning_rate": 0.00075, + "loss": 1242.0636, + "step": 3400 + }, + { + "ce_loss_12": 3.2965719938278197, + "ce_loss_17": 3.0647927641868593, + "ce_loss_23": 2.97651948928833, + "ce_loss_3": 4.199360990524292, + "ce_loss_6": 3.725625455379486, + "epoch": 0.341, + "grad_norm": 1448.0, + "kl_loss_12": 697.323843383789, + "kl_loss_17": 175.5424606323242, + "kl_loss_3": 2570.543505859375, + "kl_loss_6": 1631.5912353515625, + "learning_rate": 0.0007486246531301177, + "loss": 1254.5671, + "step": 3410 + }, + { + "ce_loss_12": 3.1126665830612184, + "ce_loss_17": 2.8926371693611146, + "ce_loss_23": 2.805622935295105, + "ce_loss_3": 4.008318614959717, + "ce_loss_6": 3.5567113280296327, + "epoch": 0.342, + "grad_norm": 1168.0, + "kl_loss_12": 673.064111328125, + "kl_loss_17": 170.77882537841796, + "kl_loss_3": 2524.815466308594, + "kl_loss_6": 1626.3312133789063, + "learning_rate": 0.0007472468026127384, + "loss": 1235.8322, + "step": 3420 + }, + { + "ce_loss_12": 3.2668772459030153, + "ce_loss_17": 3.0327144980430605, + "ce_loss_23": 2.9397048711776734, + "ce_loss_3": 4.189107143878937, + "ce_loss_6": 3.7113179564476013, + "epoch": 0.343, + "grad_norm": 1328.0, + "kl_loss_12": 713.0418273925782, + "kl_loss_17": 182.29576263427734, + "kl_loss_3": 2645.8495971679686, + "kl_loss_6": 1687.5421203613282, + "learning_rate": 0.000745866462322802, + "loss": 1284.9364, + "step": 3430 + }, + { + "ce_loss_12": 3.2224403619766235, + "ce_loss_17": 3.0063932299613954, + "ce_loss_23": 2.922226667404175, + "ce_loss_3": 4.104392576217651, + "ce_loss_6": 3.6443408131599426, + "epoch": 0.344, + "grad_norm": 1288.0, + "kl_loss_12": 669.1014770507812, + "kl_loss_17": 169.69496231079103, + "kl_loss_3": 2501.728381347656, + "kl_loss_6": 1581.1097290039063, + "learning_rate": 0.0007444836461603195, + "loss": 1239.8873, + "step": 3440 + }, + { + "ce_loss_12": 3.298066568374634, + "ce_loss_17": 3.072697710990906, + "ce_loss_23": 2.9792401432991027, + "ce_loss_3": 4.1962895154953, + "ce_loss_6": 3.735168826580048, + "epoch": 0.345, + "grad_norm": 1424.0, + "kl_loss_12": 719.5765075683594, + "kl_loss_17": 185.3255714416504, + "kl_loss_3": 2590.4420776367188, + "kl_loss_6": 1670.0243530273438, + "learning_rate": 0.0007430983680502344, + "loss": 1288.9348, + "step": 3450 + }, + { + "ce_loss_12": 3.140820288658142, + "ce_loss_17": 2.9176002502441407, + "ce_loss_23": 2.8298457622528077, + "ce_loss_3": 4.062877249717713, + "ce_loss_6": 3.5874500393867494, + "epoch": 0.346, + "grad_norm": 1440.0, + "kl_loss_12": 687.932275390625, + "kl_loss_17": 179.16093215942382, + "kl_loss_3": 2590.4509521484374, + "kl_loss_6": 1650.5429626464843, + "learning_rate": 0.0007417106419422819, + "loss": 1276.0914, + "step": 3460 + }, + { + "ce_loss_12": 3.2299922466278077, + "ce_loss_17": 3.01367164850235, + "ce_loss_23": 2.920131766796112, + "ce_loss_3": 4.117225778102875, + "ce_loss_6": 3.6604254961013796, + "epoch": 0.347, + "grad_norm": 1480.0, + "kl_loss_12": 680.9697845458984, + "kl_loss_17": 183.2493438720703, + "kl_loss_3": 2504.739465332031, + "kl_loss_6": 1599.7462829589845, + "learning_rate": 0.0007403204818108486, + "loss": 1263.8809, + "step": 3470 + }, + { + "ce_loss_12": 3.2155398964881896, + "ce_loss_17": 3.003265452384949, + "ce_loss_23": 2.9128665685653687, + "ce_loss_3": 4.111185204982758, + "ce_loss_6": 3.6440620183944703, + "epoch": 0.348, + "grad_norm": 1064.0, + "kl_loss_12": 682.5471984863282, + "kl_loss_17": 176.46894760131835, + "kl_loss_3": 2570.6810791015623, + "kl_loss_6": 1634.1306274414062, + "learning_rate": 0.0007389279016548316, + "loss": 1229.5125, + "step": 3480 + }, + { + "ce_loss_12": 3.224693238735199, + "ce_loss_17": 2.99582279920578, + "ce_loss_23": 2.904498505592346, + "ce_loss_3": 4.1740000486373905, + "ce_loss_6": 3.6846797585487367, + "epoch": 0.349, + "grad_norm": 1152.0, + "kl_loss_12": 702.2427368164062, + "kl_loss_17": 180.65175704956056, + "kl_loss_3": 2652.2956787109374, + "kl_loss_6": 1684.2456481933593, + "learning_rate": 0.0007375329154974975, + "loss": 1284.0389, + "step": 3490 + }, + { + "ce_loss_12": 3.174767458438873, + "ce_loss_17": 2.961744463443756, + "ce_loss_23": 2.875151014328003, + "ce_loss_3": 4.056888246536255, + "ce_loss_6": 3.6028032422065737, + "epoch": 0.35, + "grad_norm": 1312.0, + "kl_loss_12": 666.8886749267579, + "kl_loss_17": 176.2197494506836, + "kl_loss_3": 2488.2426025390623, + "kl_loss_6": 1584.5100158691407, + "learning_rate": 0.0007361355373863414, + "loss": 1265.6813, + "step": 3500 + }, + { + "ce_loss_12": 3.223465931415558, + "ce_loss_17": 3.005623769760132, + "ce_loss_23": 2.92046400308609, + "ce_loss_3": 4.106686019897461, + "ce_loss_6": 3.642595362663269, + "epoch": 0.351, + "grad_norm": 1400.0, + "kl_loss_12": 664.1003723144531, + "kl_loss_17": 172.3474266052246, + "kl_loss_3": 2488.0405151367186, + "kl_loss_6": 1577.5601440429687, + "learning_rate": 0.0007347357813929454, + "loss": 1261.238, + "step": 3510 + }, + { + "ce_loss_12": 3.175387609004974, + "ce_loss_17": 2.9665459513664247, + "ce_loss_23": 2.8761163115501405, + "ce_loss_3": 4.05610990524292, + "ce_loss_6": 3.604305791854858, + "epoch": 0.352, + "grad_norm": 1176.0, + "kl_loss_12": 662.9545318603516, + "kl_loss_17": 173.22559509277343, + "kl_loss_3": 2480.9940185546875, + "kl_loss_6": 1575.9367980957031, + "learning_rate": 0.0007333336616128369, + "loss": 1258.5035, + "step": 3520 + }, + { + "ce_loss_12": 3.154016065597534, + "ce_loss_17": 2.933714139461517, + "ce_loss_23": 2.842202401161194, + "ce_loss_3": 4.078752171993256, + "ce_loss_6": 3.606439673900604, + "epoch": 0.353, + "grad_norm": 1464.0, + "kl_loss_12": 686.2472900390625, + "kl_loss_17": 177.2117446899414, + "kl_loss_3": 2575.9606323242188, + "kl_loss_6": 1635.5638427734375, + "learning_rate": 0.0007319291921653463, + "loss": 1266.0604, + "step": 3530 + }, + { + "ce_loss_12": 3.244112253189087, + "ce_loss_17": 3.0184548616409304, + "ce_loss_23": 2.927817130088806, + "ce_loss_3": 4.154715037345886, + "ce_loss_6": 3.6804611444473267, + "epoch": 0.354, + "grad_norm": 1712.0, + "kl_loss_12": 695.5317565917969, + "kl_loss_17": 179.87535858154297, + "kl_loss_3": 2573.4360107421876, + "kl_loss_6": 1637.6216186523438, + "learning_rate": 0.0007305223871934656, + "loss": 1249.0599, + "step": 3540 + }, + { + "ce_loss_12": 3.1960132479667664, + "ce_loss_17": 2.9835826635360716, + "ce_loss_23": 2.893903136253357, + "ce_loss_3": 4.0905327796936035, + "ce_loss_6": 3.6329979300498962, + "epoch": 0.355, + "grad_norm": 1288.0, + "kl_loss_12": 676.8251953125, + "kl_loss_17": 179.56095199584962, + "kl_loss_3": 2530.212939453125, + "kl_loss_6": 1613.695166015625, + "learning_rate": 0.0007291132608637052, + "loss": 1253.3357, + "step": 3550 + }, + { + "ce_loss_12": 3.173990178108215, + "ce_loss_17": 2.9597131669521333, + "ce_loss_23": 2.8744539499282835, + "ce_loss_3": 4.148865497112274, + "ce_loss_6": 3.6403673768043516, + "epoch": 0.356, + "grad_norm": 1048.0, + "kl_loss_12": 661.6298370361328, + "kl_loss_17": 171.65505676269532, + "kl_loss_3": 2680.3787231445312, + "kl_loss_6": 1663.47939453125, + "learning_rate": 0.0007277018273659516, + "loss": 1289.8501, + "step": 3560 + }, + { + "ce_loss_12": 3.2997384309768676, + "ce_loss_17": 3.070998930931091, + "ce_loss_23": 2.976192998886108, + "ce_loss_3": 4.187009930610657, + "ce_loss_6": 3.743383991718292, + "epoch": 0.357, + "grad_norm": 1392.0, + "kl_loss_12": 707.4304107666015, + "kl_loss_17": 183.98058853149413, + "kl_loss_3": 2556.8765625, + "kl_loss_6": 1671.6949035644532, + "learning_rate": 0.0007262881009133242, + "loss": 1268.3045, + "step": 3570 + }, + { + "ce_loss_12": 3.2091238856315614, + "ce_loss_17": 2.988382303714752, + "ce_loss_23": 2.90533766746521, + "ce_loss_3": 4.095027208328247, + "ce_loss_6": 3.641123652458191, + "epoch": 0.358, + "grad_norm": 1352.0, + "kl_loss_12": 665.867074584961, + "kl_loss_17": 171.1517547607422, + "kl_loss_3": 2506.3150329589844, + "kl_loss_6": 1609.4116271972657, + "learning_rate": 0.0007248720957420329, + "loss": 1233.9654, + "step": 3580 + }, + { + "ce_loss_12": 3.1986087441444395, + "ce_loss_17": 2.989278721809387, + "ce_loss_23": 2.9084664225578307, + "ce_loss_3": 4.078347492218017, + "ce_loss_6": 3.627185559272766, + "epoch": 0.359, + "grad_norm": 1608.0, + "kl_loss_12": 662.6515014648437, + "kl_loss_17": 170.73527755737305, + "kl_loss_3": 2484.07275390625, + "kl_loss_6": 1581.685772705078, + "learning_rate": 0.0007234538261112341, + "loss": 1291.7923, + "step": 3590 + }, + { + "ce_loss_12": 3.25000581741333, + "ce_loss_17": 3.0334473848342896, + "ce_loss_23": 2.9443798422813416, + "ce_loss_3": 4.159439659118652, + "ce_loss_6": 3.6893228888511658, + "epoch": 0.36, + "grad_norm": 1200.0, + "kl_loss_12": 677.5771301269531, + "kl_loss_17": 175.92178649902343, + "kl_loss_3": 2548.3970336914062, + "kl_loss_6": 1625.2643127441406, + "learning_rate": 0.0007220333063028871, + "loss": 1245.0543, + "step": 3600 + }, + { + "ce_loss_12": 3.293718934059143, + "ce_loss_17": 3.071032238006592, + "ce_loss_23": 2.980521869659424, + "ce_loss_3": 4.245906913280487, + "ce_loss_6": 3.805212640762329, + "epoch": 0.361, + "grad_norm": 1552.0, + "kl_loss_12": 703.2577209472656, + "kl_loss_17": 183.09868621826172, + "kl_loss_3": 2684.7048095703126, + "kl_loss_6": 1793.4456726074218, + "learning_rate": 0.0007206105506216106, + "loss": 1308.6845, + "step": 3610 + }, + { + "ce_loss_12": 3.1581379532814027, + "ce_loss_17": 2.9497724771499634, + "ce_loss_23": 2.8643031477928163, + "ce_loss_3": 4.041967523097992, + "ce_loss_6": 3.5881839752197267, + "epoch": 0.362, + "grad_norm": 1312.0, + "kl_loss_12": 661.0356536865235, + "kl_loss_17": 169.57022018432616, + "kl_loss_3": 2483.1566528320313, + "kl_loss_6": 1591.054949951172, + "learning_rate": 0.0007191855733945387, + "loss": 1217.9996, + "step": 3620 + }, + { + "ce_loss_12": 3.2457900404930116, + "ce_loss_17": 3.0324423789978026, + "ce_loss_23": 2.945065474510193, + "ce_loss_3": 4.1396326422691345, + "ce_loss_6": 3.6858662724494935, + "epoch": 0.363, + "grad_norm": 1272.0, + "kl_loss_12": 666.4612762451172, + "kl_loss_17": 172.11029586791992, + "kl_loss_3": 2513.3689819335937, + "kl_loss_6": 1605.7999206542968, + "learning_rate": 0.0007177583889711762, + "loss": 1234.0895, + "step": 3630 + }, + { + "ce_loss_12": 3.1683589816093445, + "ce_loss_17": 2.9484509587287904, + "ce_loss_23": 2.8645978569984436, + "ce_loss_3": 4.0660979628562925, + "ce_loss_6": 3.605272591114044, + "epoch": 0.364, + "grad_norm": 1248.0, + "kl_loss_12": 678.2521392822266, + "kl_loss_17": 172.29432678222656, + "kl_loss_3": 2544.167138671875, + "kl_loss_6": 1619.9976318359375, + "learning_rate": 0.0007163290117232541, + "loss": 1250.5528, + "step": 3640 + }, + { + "ce_loss_12": 3.2618592262268065, + "ce_loss_17": 3.0550758957862856, + "ce_loss_23": 2.9710198998451234, + "ce_loss_3": 4.113152432441711, + "ce_loss_6": 3.6700500726699827, + "epoch": 0.365, + "grad_norm": 1304.0, + "kl_loss_12": 657.0813995361328, + "kl_loss_17": 168.57955169677734, + "kl_loss_3": 2453.638525390625, + "kl_loss_6": 1553.8087097167968, + "learning_rate": 0.0007148974560445859, + "loss": 1227.0289, + "step": 3650 + }, + { + "ce_loss_12": 3.20428751707077, + "ce_loss_17": 2.9895589351654053, + "ce_loss_23": 2.9060996413230895, + "ce_loss_3": 4.07148152589798, + "ce_loss_6": 3.6159334897994997, + "epoch": 0.366, + "grad_norm": 1240.0, + "kl_loss_12": 663.6022491455078, + "kl_loss_17": 167.72595138549804, + "kl_loss_3": 2461.8028930664063, + "kl_loss_6": 1564.8658630371094, + "learning_rate": 0.0007134637363509209, + "loss": 1212.9137, + "step": 3660 + }, + { + "ce_loss_12": 3.308513867855072, + "ce_loss_17": 3.095934271812439, + "ce_loss_23": 3.014577269554138, + "ce_loss_3": 4.175627636909485, + "ce_loss_6": 3.721687173843384, + "epoch": 0.367, + "grad_norm": 1424.0, + "kl_loss_12": 658.9216094970703, + "kl_loss_17": 165.24458236694335, + "kl_loss_3": 2444.9187072753907, + "kl_loss_6": 1562.8463012695313, + "learning_rate": 0.0007120278670798009, + "loss": 1231.3819, + "step": 3670 + }, + { + "ce_loss_12": 3.1433789849281313, + "ce_loss_17": 2.911480498313904, + "ce_loss_23": 2.8245518803596497, + "ce_loss_3": 4.091946232318878, + "ce_loss_6": 3.609583258628845, + "epoch": 0.368, + "grad_norm": 1760.0, + "kl_loss_12": 705.2106689453125, + "kl_loss_17": 174.5452033996582, + "kl_loss_3": 2662.1167236328124, + "kl_loss_6": 1698.6553161621093, + "learning_rate": 0.0007105898626904133, + "loss": 1300.6306, + "step": 3680 + }, + { + "ce_loss_12": 3.2091620683670046, + "ce_loss_17": 2.9982989549636843, + "ce_loss_23": 2.914182126522064, + "ce_loss_3": 4.118845117092133, + "ce_loss_6": 3.6490070939064028, + "epoch": 0.369, + "grad_norm": 1056.0, + "kl_loss_12": 660.9015686035157, + "kl_loss_17": 168.80124893188477, + "kl_loss_3": 2519.1709228515624, + "kl_loss_6": 1590.7595642089843, + "learning_rate": 0.0007091497376634463, + "loss": 1227.2898, + "step": 3690 + }, + { + "ce_loss_12": 3.1640050292015074, + "ce_loss_17": 2.9496522307395936, + "ce_loss_23": 2.863178324699402, + "ce_loss_3": 4.0586523652076725, + "ce_loss_6": 3.5967571139335632, + "epoch": 0.37, + "grad_norm": 1192.0, + "kl_loss_12": 664.4693145751953, + "kl_loss_17": 171.1860595703125, + "kl_loss_3": 2494.8483642578126, + "kl_loss_6": 1591.562060546875, + "learning_rate": 0.0007077075065009433, + "loss": 1255.5549, + "step": 3700 + }, + { + "ce_loss_12": 3.269791603088379, + "ce_loss_17": 3.045477032661438, + "ce_loss_23": 2.955088996887207, + "ce_loss_3": 4.169425082206726, + "ce_loss_6": 3.708793246746063, + "epoch": 0.371, + "grad_norm": 1328.0, + "kl_loss_12": 692.2478515625, + "kl_loss_17": 184.14117126464845, + "kl_loss_3": 2564.8384643554687, + "kl_loss_6": 1640.535137939453, + "learning_rate": 0.0007062631837261557, + "loss": 1259.7139, + "step": 3710 + }, + { + "ce_loss_12": 3.1447814106941223, + "ce_loss_17": 2.9327072620391847, + "ce_loss_23": 2.8490753054618834, + "ce_loss_3": 4.037320530414581, + "ce_loss_6": 3.571010208129883, + "epoch": 0.372, + "grad_norm": 1344.0, + "kl_loss_12": 659.5080200195313, + "kl_loss_17": 165.9877166748047, + "kl_loss_3": 2498.7698303222655, + "kl_loss_6": 1581.0278564453124, + "learning_rate": 0.0007048167838833977, + "loss": 1259.9834, + "step": 3720 + }, + { + "ce_loss_12": 3.225725495815277, + "ce_loss_17": 3.0078166246414186, + "ce_loss_23": 2.922504723072052, + "ce_loss_3": 4.098983669281006, + "ce_loss_6": 3.640908944606781, + "epoch": 0.373, + "grad_norm": 2320.0, + "kl_loss_12": 664.5679473876953, + "kl_loss_17": 172.4445037841797, + "kl_loss_3": 2484.5310546875, + "kl_loss_6": 1567.2558532714843, + "learning_rate": 0.0007033683215379002, + "loss": 1228.0563, + "step": 3730 + }, + { + "ce_loss_12": 3.2036506056785585, + "ce_loss_17": 3.000447142124176, + "ce_loss_23": 2.9124128341674806, + "ce_loss_3": 4.099389910697937, + "ce_loss_6": 3.63028404712677, + "epoch": 0.374, + "grad_norm": 1656.0, + "kl_loss_12": 654.6248901367187, + "kl_loss_17": 174.84245681762695, + "kl_loss_3": 2501.230078125, + "kl_loss_6": 1569.0875061035156, + "learning_rate": 0.0007019178112756625, + "loss": 1245.0402, + "step": 3740 + }, + { + "ce_loss_12": 3.1852274894714356, + "ce_loss_17": 2.977396011352539, + "ce_loss_23": 2.8892534017562865, + "ce_loss_3": 4.071548402309418, + "ce_loss_6": 3.6089416623115538, + "epoch": 0.375, + "grad_norm": 1568.0, + "kl_loss_12": 657.2333770751953, + "kl_loss_17": 181.52922439575195, + "kl_loss_3": 2474.169641113281, + "kl_loss_6": 1566.2497192382812, + "learning_rate": 0.0007004652677033068, + "loss": 1235.8654, + "step": 3750 + }, + { + "ce_loss_12": 3.246406066417694, + "ce_loss_17": 3.0545617818832396, + "ce_loss_23": 2.9658969402313233, + "ce_loss_3": 4.111606228351593, + "ce_loss_6": 3.660529816150665, + "epoch": 0.376, + "grad_norm": 1160.0, + "kl_loss_12": 636.2628814697266, + "kl_loss_17": 186.81802062988282, + "kl_loss_3": 2432.5833740234375, + "kl_loss_6": 1533.5833374023437, + "learning_rate": 0.0006990107054479312, + "loss": 1217.305, + "step": 3760 + }, + { + "ce_loss_12": 3.231976664066315, + "ce_loss_17": 3.0275618195533753, + "ce_loss_23": 2.9329543590545653, + "ce_loss_3": 4.106707489490509, + "ce_loss_6": 3.6681945443153383, + "epoch": 0.377, + "grad_norm": 1184.0, + "kl_loss_12": 663.0146911621093, + "kl_loss_17": 192.4864356994629, + "kl_loss_3": 2478.7212524414062, + "kl_loss_6": 1597.8008972167968, + "learning_rate": 0.000697554139156961, + "loss": 1238.799, + "step": 3770 + }, + { + "ce_loss_12": 3.23291597366333, + "ce_loss_17": 3.0283109664916994, + "ce_loss_23": 2.9323471069335936, + "ce_loss_3": 4.128734397888183, + "ce_loss_6": 3.654512882232666, + "epoch": 0.378, + "grad_norm": 2368.0, + "kl_loss_12": 677.5888610839844, + "kl_loss_17": 192.30858840942383, + "kl_loss_3": 2538.2166381835937, + "kl_loss_6": 1599.0732421875, + "learning_rate": 0.0006960955834980027, + "loss": 1224.4588, + "step": 3780 + }, + { + "ce_loss_12": 3.1920592904090883, + "ce_loss_17": 2.9854776501655578, + "ce_loss_23": 2.8947471618652343, + "ce_loss_3": 4.071837854385376, + "ce_loss_6": 3.62058881521225, + "epoch": 0.379, + "grad_norm": 1208.0, + "kl_loss_12": 655.1255462646484, + "kl_loss_17": 182.6277877807617, + "kl_loss_3": 2471.8943725585937, + "kl_loss_6": 1572.78779296875, + "learning_rate": 0.0006946350531586958, + "loss": 1228.286, + "step": 3790 + }, + { + "ce_loss_12": 3.2267534732818604, + "ce_loss_17": 3.0250037789344786, + "ce_loss_23": 2.9308645248413088, + "ce_loss_3": 4.103527474403381, + "ce_loss_6": 3.651949441432953, + "epoch": 0.38, + "grad_norm": 1192.0, + "kl_loss_12": 660.5157470703125, + "kl_loss_17": 184.3563117980957, + "kl_loss_3": 2489.6172607421877, + "kl_loss_6": 1582.6506652832031, + "learning_rate": 0.0006931725628465643, + "loss": 1253.1607, + "step": 3800 + }, + { + "ce_loss_12": 3.2306085586547852, + "ce_loss_17": 3.015321934223175, + "ce_loss_23": 2.9284037232398985, + "ce_loss_3": 4.1283272981643675, + "ce_loss_6": 3.6559887886047364, + "epoch": 0.381, + "grad_norm": 1136.0, + "kl_loss_12": 666.8306457519532, + "kl_loss_17": 176.56946716308593, + "kl_loss_3": 2508.562390136719, + "kl_loss_6": 1585.318896484375, + "learning_rate": 0.0006917081272888696, + "loss": 1239.0171, + "step": 3810 + }, + { + "ce_loss_12": 3.1532178282737733, + "ce_loss_17": 2.94263699054718, + "ce_loss_23": 2.854952907562256, + "ce_loss_3": 4.075488984584808, + "ce_loss_6": 3.596358561515808, + "epoch": 0.382, + "grad_norm": 1688.0, + "kl_loss_12": 665.3003204345703, + "kl_loss_17": 175.64662475585936, + "kl_loss_3": 2575.84931640625, + "kl_loss_6": 1632.7892517089845, + "learning_rate": 0.0006902417612324615, + "loss": 1236.6838, + "step": 3820 + }, + { + "ce_loss_12": 3.282550072669983, + "ce_loss_17": 3.054612565040588, + "ce_loss_23": 2.9622069239616393, + "ce_loss_3": 4.201048421859741, + "ce_loss_6": 3.7223628044128416, + "epoch": 0.383, + "grad_norm": 1248.0, + "kl_loss_12": 693.9081359863281, + "kl_loss_17": 180.50396270751952, + "kl_loss_3": 2589.1482543945312, + "kl_loss_6": 1641.8155029296875, + "learning_rate": 0.00068877347944363, + "loss": 1262.4301, + "step": 3830 + }, + { + "ce_loss_12": 3.26390939950943, + "ce_loss_17": 3.054534149169922, + "ce_loss_23": 2.9683485984802247, + "ce_loss_3": 4.131193685531616, + "ce_loss_6": 3.6752264499664307, + "epoch": 0.384, + "grad_norm": 1792.0, + "kl_loss_12": 659.4387176513671, + "kl_loss_17": 171.06110916137695, + "kl_loss_3": 2466.495556640625, + "kl_loss_6": 1567.6346740722656, + "learning_rate": 0.0006873032967079561, + "loss": 1237.3532, + "step": 3840 + }, + { + "ce_loss_12": 3.238120436668396, + "ce_loss_17": 3.034601593017578, + "ce_loss_23": 2.953378200531006, + "ce_loss_3": 4.100418388843536, + "ce_loss_6": 3.6456416606903077, + "epoch": 0.385, + "grad_norm": 1696.0, + "kl_loss_12": 648.6404235839843, + "kl_loss_17": 168.19820404052734, + "kl_loss_3": 2453.4826538085936, + "kl_loss_6": 1545.4518005371094, + "learning_rate": 0.0006858312278301637, + "loss": 1206.5986, + "step": 3850 + }, + { + "ce_loss_12": 3.275178110599518, + "ce_loss_17": 3.072287392616272, + "ce_loss_23": 2.990156865119934, + "ce_loss_3": 4.126061356067657, + "ce_loss_6": 3.673665976524353, + "epoch": 0.386, + "grad_norm": 1200.0, + "kl_loss_12": 652.5694366455078, + "kl_loss_17": 167.2362190246582, + "kl_loss_3": 2431.8782348632812, + "kl_loss_6": 1535.0179443359375, + "learning_rate": 0.0006843572876339704, + "loss": 1205.3057, + "step": 3860 + }, + { + "ce_loss_12": 3.1974708080291747, + "ce_loss_17": 2.994239926338196, + "ce_loss_23": 2.912939977645874, + "ce_loss_3": 4.043704795837402, + "ce_loss_6": 3.605368709564209, + "epoch": 0.387, + "grad_norm": 1424.0, + "kl_loss_12": 647.3463500976562, + "kl_loss_17": 162.89837036132812, + "kl_loss_3": 2410.9630615234373, + "kl_loss_6": 1529.901220703125, + "learning_rate": 0.0006828814909619373, + "loss": 1243.1787, + "step": 3870 + }, + { + "ce_loss_12": 3.324825716018677, + "ce_loss_17": 3.115768647193909, + "ce_loss_23": 3.0285945534706116, + "ce_loss_3": 4.195588719844818, + "ce_loss_6": 3.7296697974205015, + "epoch": 0.388, + "grad_norm": 1032.0, + "kl_loss_12": 660.1310791015625, + "kl_loss_17": 171.97688674926758, + "kl_loss_3": 2462.1822204589844, + "kl_loss_6": 1541.022509765625, + "learning_rate": 0.0006814038526753205, + "loss": 1199.0514, + "step": 3880 + }, + { + "ce_loss_12": 3.2285571575164793, + "ce_loss_17": 3.017014193534851, + "ce_loss_23": 2.9318854570388795, + "ce_loss_3": 4.104637730121612, + "ce_loss_6": 3.6409971833229067, + "epoch": 0.389, + "grad_norm": 1192.0, + "kl_loss_12": 658.3097137451172, + "kl_loss_17": 170.004736328125, + "kl_loss_3": 2459.505303955078, + "kl_loss_6": 1551.935888671875, + "learning_rate": 0.0006799243876539213, + "loss": 1220.9375, + "step": 3890 + }, + { + "ce_loss_12": 3.1611582159996034, + "ce_loss_17": 2.9470573544502257, + "ce_loss_23": 2.8648053288459776, + "ce_loss_3": 4.075111293792725, + "ce_loss_6": 3.5930355310440065, + "epoch": 0.39, + "grad_norm": 1592.0, + "kl_loss_12": 655.8491027832031, + "kl_loss_17": 167.96773834228514, + "kl_loss_3": 2543.456921386719, + "kl_loss_6": 1586.5950866699218, + "learning_rate": 0.0006784431107959359, + "loss": 1247.7159, + "step": 3900 + }, + { + "ce_loss_12": 3.2199861884117125, + "ce_loss_17": 2.9985650062561033, + "ce_loss_23": 2.9124227643013, + "ce_loss_3": 4.138576877117157, + "ce_loss_6": 3.658678352832794, + "epoch": 0.391, + "grad_norm": 1400.0, + "kl_loss_12": 677.0872192382812, + "kl_loss_17": 171.80386199951172, + "kl_loss_3": 2584.7616943359376, + "kl_loss_6": 1617.6906372070312, + "learning_rate": 0.0006769600370178059, + "loss": 1242.6135, + "step": 3910 + }, + { + "ce_loss_12": 3.1826632142066957, + "ce_loss_17": 2.970789670944214, + "ce_loss_23": 2.8851314187049866, + "ce_loss_3": 4.064342510700226, + "ce_loss_6": 3.6079466462135317, + "epoch": 0.392, + "grad_norm": 1152.0, + "kl_loss_12": 666.1765075683594, + "kl_loss_17": 167.19976959228515, + "kl_loss_3": 2490.077282714844, + "kl_loss_6": 1582.7072326660157, + "learning_rate": 0.0006754751812540679, + "loss": 1207.856, + "step": 3920 + }, + { + "ce_loss_12": 3.2270470380783083, + "ce_loss_17": 3.0110315442085267, + "ce_loss_23": 2.9247236132621763, + "ce_loss_3": 4.1250906705856325, + "ce_loss_6": 3.650589609146118, + "epoch": 0.393, + "grad_norm": 1312.0, + "kl_loss_12": 670.739956665039, + "kl_loss_17": 170.75859451293945, + "kl_loss_3": 2539.429833984375, + "kl_loss_6": 1595.901593017578, + "learning_rate": 0.0006739885584572025, + "loss": 1246.0492, + "step": 3930 + }, + { + "ce_loss_12": 3.2458897948265077, + "ce_loss_17": 3.0323882341384887, + "ce_loss_23": 2.9483110189437864, + "ce_loss_3": 4.162954318523407, + "ce_loss_6": 3.6844339966773987, + "epoch": 0.394, + "grad_norm": 1624.0, + "kl_loss_12": 675.2978454589844, + "kl_loss_17": 173.9802101135254, + "kl_loss_3": 2603.865686035156, + "kl_loss_6": 1632.4938659667969, + "learning_rate": 0.0006725001835974853, + "loss": 1237.2094, + "step": 3940 + }, + { + "ce_loss_12": 3.248879384994507, + "ce_loss_17": 3.030513954162598, + "ce_loss_23": 2.9449787974357604, + "ce_loss_3": 4.149642360210419, + "ce_loss_6": 3.675096559524536, + "epoch": 0.395, + "grad_norm": 1272.0, + "kl_loss_12": 676.2238159179688, + "kl_loss_17": 172.1310821533203, + "kl_loss_3": 2556.122448730469, + "kl_loss_6": 1599.4682495117188, + "learning_rate": 0.0006710100716628344, + "loss": 1221.427, + "step": 3950 + }, + { + "ce_loss_12": 3.2323180079460143, + "ce_loss_17": 3.0159741401672364, + "ce_loss_23": 2.9305651664733885, + "ce_loss_3": 4.120647394657135, + "ce_loss_6": 3.66931232213974, + "epoch": 0.396, + "grad_norm": 1120.0, + "kl_loss_12": 664.6331909179687, + "kl_loss_17": 168.89699172973633, + "kl_loss_3": 2503.3612670898438, + "kl_loss_6": 1598.739678955078, + "learning_rate": 0.0006695182376586602, + "loss": 1242.2291, + "step": 3960 + }, + { + "ce_loss_12": 3.238358223438263, + "ce_loss_17": 3.0327432632446287, + "ce_loss_23": 2.9521273970603943, + "ce_loss_3": 4.076607358455658, + "ce_loss_6": 3.641979396343231, + "epoch": 0.397, + "grad_norm": 1544.0, + "kl_loss_12": 632.5765899658203, + "kl_loss_17": 161.50315856933594, + "kl_loss_3": 2382.7819641113283, + "kl_loss_6": 1499.8943115234374, + "learning_rate": 0.000668024696607715, + "loss": 1247.5514, + "step": 3970 + }, + { + "ce_loss_12": 3.2161014318466186, + "ce_loss_17": 3.007648026943207, + "ce_loss_23": 2.9248135685920715, + "ce_loss_3": 4.094186675548554, + "ce_loss_6": 3.637202739715576, + "epoch": 0.398, + "grad_norm": 1304.0, + "kl_loss_12": 653.1179656982422, + "kl_loss_17": 169.28603439331056, + "kl_loss_3": 2478.8356811523436, + "kl_loss_6": 1564.0840270996093, + "learning_rate": 0.0006665294635499404, + "loss": 1220.7902, + "step": 3980 + }, + { + "ce_loss_12": 3.2357009768486025, + "ce_loss_17": 3.0125701904296873, + "ce_loss_23": 2.9224282026290895, + "ce_loss_3": 4.15496723651886, + "ce_loss_6": 3.675817370414734, + "epoch": 0.399, + "grad_norm": 1640.0, + "kl_loss_12": 691.6109039306641, + "kl_loss_17": 176.89354705810547, + "kl_loss_3": 2617.45673828125, + "kl_loss_6": 1653.7774475097656, + "learning_rate": 0.0006650325535423167, + "loss": 1254.449, + "step": 3990 + }, + { + "ce_loss_12": 3.2309993147850036, + "ce_loss_17": 3.0298078536987303, + "ce_loss_23": 2.9468361496925355, + "ce_loss_3": 4.0738466620445255, + "ce_loss_6": 3.6328693866729735, + "epoch": 0.4, + "grad_norm": 1584.0, + "kl_loss_12": 629.2378356933593, + "kl_loss_17": 162.09895782470704, + "kl_loss_3": 2373.6684814453124, + "kl_loss_6": 1495.4029846191406, + "learning_rate": 0.0006635339816587109, + "loss": 1207.8127, + "step": 4000 + }, + { + "ce_loss_12": 3.1798944234848023, + "ce_loss_17": 2.9722426891326905, + "ce_loss_23": 2.8861594915390016, + "ce_loss_3": 4.098603010177612, + "ce_loss_6": 3.6162397265434265, + "epoch": 0.401, + "grad_norm": 1256.0, + "kl_loss_12": 661.1082214355469, + "kl_loss_17": 171.99208374023436, + "kl_loss_3": 2558.653674316406, + "kl_loss_6": 1598.7052429199218, + "learning_rate": 0.0006620337629897252, + "loss": 1225.1979, + "step": 4010 + }, + { + "ce_loss_12": 3.1930771708488463, + "ce_loss_17": 2.9785884618759155, + "ce_loss_23": 2.890448606014252, + "ce_loss_3": 4.074082720279693, + "ce_loss_6": 3.612712931632996, + "epoch": 0.402, + "grad_norm": 924.0, + "kl_loss_12": 657.1040618896484, + "kl_loss_17": 170.4453887939453, + "kl_loss_3": 2482.5481689453127, + "kl_loss_6": 1571.464794921875, + "learning_rate": 0.0006605319126425454, + "loss": 1241.9932, + "step": 4020 + }, + { + "ce_loss_12": 3.104722809791565, + "ce_loss_17": 2.8953160881996154, + "ce_loss_23": 2.8121333599090574, + "ce_loss_3": 4.031231367588044, + "ce_loss_6": 3.550671195983887, + "epoch": 0.403, + "grad_norm": 1264.0, + "kl_loss_12": 660.7683258056641, + "kl_loss_17": 170.40650787353516, + "kl_loss_3": 2578.062951660156, + "kl_loss_6": 1623.105157470703, + "learning_rate": 0.0006590284457407876, + "loss": 1246.72, + "step": 4030 + }, + { + "ce_loss_12": 3.196132016181946, + "ce_loss_17": 2.986305999755859, + "ce_loss_23": 2.9007562398910522, + "ce_loss_3": 4.083922207355499, + "ce_loss_6": 3.6187917947769166, + "epoch": 0.404, + "grad_norm": 1216.0, + "kl_loss_12": 662.721957397461, + "kl_loss_17": 170.88162689208986, + "kl_loss_3": 2499.286340332031, + "kl_loss_6": 1571.6255126953124, + "learning_rate": 0.0006575233774243465, + "loss": 1223.1504, + "step": 4040 + }, + { + "ce_loss_12": 3.1954822540283203, + "ce_loss_17": 2.9834508061408997, + "ce_loss_23": 2.8960728764534, + "ce_loss_3": 4.081044781208038, + "ce_loss_6": 3.6214296221733093, + "epoch": 0.405, + "grad_norm": 1328.0, + "kl_loss_12": 665.4557220458985, + "kl_loss_17": 172.1760581970215, + "kl_loss_3": 2531.9148193359374, + "kl_loss_6": 1588.3802734375, + "learning_rate": 0.0006560167228492435, + "loss": 1234.3808, + "step": 4050 + }, + { + "ce_loss_12": 3.2196859240531923, + "ce_loss_17": 3.0143555283546446, + "ce_loss_23": 2.9314361333847048, + "ce_loss_3": 4.077447247505188, + "ce_loss_6": 3.628817582130432, + "epoch": 0.406, + "grad_norm": 1272.0, + "kl_loss_12": 640.3491271972656, + "kl_loss_17": 162.64488525390624, + "kl_loss_3": 2424.965771484375, + "kl_loss_6": 1523.4903991699218, + "learning_rate": 0.0006545084971874737, + "loss": 1218.717, + "step": 4060 + }, + { + "ce_loss_12": 3.2052250266075135, + "ce_loss_17": 2.9820220708847045, + "ce_loss_23": 2.8938833713531493, + "ce_loss_3": 4.10874879360199, + "ce_loss_6": 3.6375827312469484, + "epoch": 0.407, + "grad_norm": 1264.0, + "kl_loss_12": 684.7325439453125, + "kl_loss_17": 174.89648895263673, + "kl_loss_3": 2559.311767578125, + "kl_loss_6": 1616.524188232422, + "learning_rate": 0.0006529987156268526, + "loss": 1226.4737, + "step": 4070 + }, + { + "ce_loss_12": 3.124229669570923, + "ce_loss_17": 2.9031473875045775, + "ce_loss_23": 2.8176316142082216, + "ce_loss_3": 4.029061126708984, + "ce_loss_6": 3.551256704330444, + "epoch": 0.408, + "grad_norm": 1184.0, + "kl_loss_12": 668.159292602539, + "kl_loss_17": 169.3333953857422, + "kl_loss_3": 2526.0435424804687, + "kl_loss_6": 1581.6471008300782, + "learning_rate": 0.0006514873933708637, + "loss": 1257.8275, + "step": 4080 + }, + { + "ce_loss_12": 3.2262884378433228, + "ce_loss_17": 3.0169947266578676, + "ce_loss_23": 2.9342875361442564, + "ce_loss_3": 4.106362903118134, + "ce_loss_6": 3.6482708930969237, + "epoch": 0.409, + "grad_norm": 1456.0, + "kl_loss_12": 645.7553192138672, + "kl_loss_17": 163.7945083618164, + "kl_loss_3": 2480.0184936523438, + "kl_loss_6": 1556.7129272460938, + "learning_rate": 0.0006499745456385053, + "loss": 1215.4479, + "step": 4090 + }, + { + "ce_loss_12": 3.1975077271461485, + "ce_loss_17": 2.986157011985779, + "ce_loss_23": 2.9001713037490844, + "ce_loss_3": 4.084038853645325, + "ce_loss_6": 3.6129075288772583, + "epoch": 0.41, + "grad_norm": 1656.0, + "kl_loss_12": 661.6132110595703, + "kl_loss_17": 167.67526931762694, + "kl_loss_3": 2496.7949340820314, + "kl_loss_6": 1569.0898498535157, + "learning_rate": 0.0006484601876641375, + "loss": 1231.4829, + "step": 4100 + }, + { + "ce_loss_12": 3.1768134355545046, + "ce_loss_17": 2.9751313447952272, + "ce_loss_23": 2.8926531076431274, + "ce_loss_3": 4.028258752822876, + "ce_loss_6": 3.5876180052757265, + "epoch": 0.411, + "grad_norm": 1568.0, + "kl_loss_12": 634.2423522949218, + "kl_loss_17": 164.02059326171874, + "kl_loss_3": 2402.484619140625, + "kl_loss_6": 1523.3515686035157, + "learning_rate": 0.000646944334697328, + "loss": 1196.4126, + "step": 4110 + }, + { + "ce_loss_12": 3.2789846658706665, + "ce_loss_17": 3.0728031039237975, + "ce_loss_23": 2.99023118019104, + "ce_loss_3": 4.115758645534515, + "ce_loss_6": 3.6755104422569276, + "epoch": 0.412, + "grad_norm": 1160.0, + "kl_loss_12": 640.690640258789, + "kl_loss_17": 164.9263900756836, + "kl_loss_3": 2379.64365234375, + "kl_loss_6": 1506.9677612304688, + "learning_rate": 0.0006454270020026995, + "loss": 1177.5336, + "step": 4120 + }, + { + "ce_loss_12": 3.2431312799453735, + "ce_loss_17": 3.045191490650177, + "ce_loss_23": 2.9658806800842283, + "ce_loss_3": 4.085465252399445, + "ce_loss_6": 3.6430668354034426, + "epoch": 0.413, + "grad_norm": 924.0, + "kl_loss_12": 624.2956146240234, + "kl_loss_17": 157.6657402038574, + "kl_loss_3": 2371.8968994140623, + "kl_loss_6": 1491.1062805175782, + "learning_rate": 0.0006439082048597755, + "loss": 1175.53, + "step": 4130 + }, + { + "ce_loss_12": 3.243357074260712, + "ce_loss_17": 3.0340223908424377, + "ce_loss_23": 2.954066526889801, + "ce_loss_3": 4.123746228218079, + "ce_loss_6": 3.6575263261795046, + "epoch": 0.414, + "grad_norm": 1336.0, + "kl_loss_12": 659.6564147949218, + "kl_loss_17": 165.08272247314454, + "kl_loss_3": 2486.011584472656, + "kl_loss_6": 1569.0974975585937, + "learning_rate": 0.0006423879585628261, + "loss": 1223.5892, + "step": 4140 + }, + { + "ce_loss_12": 3.206614577770233, + "ce_loss_17": 2.989106571674347, + "ce_loss_23": 2.9063380002975463, + "ce_loss_3": 4.107674717903137, + "ce_loss_6": 3.6339136242866514, + "epoch": 0.415, + "grad_norm": 1888.0, + "kl_loss_12": 671.3660461425782, + "kl_loss_17": 169.32141799926757, + "kl_loss_3": 2529.765869140625, + "kl_loss_6": 1599.0236328125, + "learning_rate": 0.0006408662784207149, + "loss": 1238.7689, + "step": 4150 + }, + { + "ce_loss_12": 3.17088041305542, + "ce_loss_17": 2.966404151916504, + "ce_loss_23": 2.884509301185608, + "ce_loss_3": 4.0522378325462345, + "ce_loss_6": 3.598076891899109, + "epoch": 0.416, + "grad_norm": 1720.0, + "kl_loss_12": 649.9595794677734, + "kl_loss_17": 162.5530792236328, + "kl_loss_3": 2471.0245727539063, + "kl_loss_6": 1560.0284362792968, + "learning_rate": 0.0006393431797567439, + "loss": 1217.1744, + "step": 4160 + }, + { + "ce_loss_12": 3.2344939947128295, + "ce_loss_17": 3.041569209098816, + "ce_loss_23": 2.9603399634361267, + "ce_loss_3": 4.074980568885803, + "ce_loss_6": 3.6258366823196413, + "epoch": 0.417, + "grad_norm": 1352.0, + "kl_loss_12": 637.3605712890625, + "kl_loss_17": 162.68829727172852, + "kl_loss_3": 2389.82490234375, + "kl_loss_6": 1491.63798828125, + "learning_rate": 0.0006378186779084996, + "loss": 1161.8198, + "step": 4170 + }, + { + "ce_loss_12": 3.0947540402412415, + "ce_loss_17": 2.879651737213135, + "ce_loss_23": 2.797255277633667, + "ce_loss_3": 3.9948148369789123, + "ce_loss_6": 3.527016890048981, + "epoch": 0.418, + "grad_norm": 1576.0, + "kl_loss_12": 661.7118896484375, + "kl_loss_17": 166.3891471862793, + "kl_loss_3": 2505.2608032226562, + "kl_loss_6": 1579.3343322753906, + "learning_rate": 0.0006362927882276989, + "loss": 1233.6448, + "step": 4180 + }, + { + "ce_loss_12": 3.251532554626465, + "ce_loss_17": 3.056484353542328, + "ce_loss_23": 2.9750721573829653, + "ce_loss_3": 4.106284773349762, + "ce_loss_6": 3.6547441124916076, + "epoch": 0.419, + "grad_norm": 1288.0, + "kl_loss_12": 630.480712890625, + "kl_loss_17": 163.11334075927735, + "kl_loss_3": 2408.3671875, + "kl_loss_6": 1510.8880310058594, + "learning_rate": 0.000634765526080034, + "loss": 1170.4914, + "step": 4190 + }, + { + "ce_loss_12": 3.267170262336731, + "ce_loss_17": 3.063349890708923, + "ce_loss_23": 2.9791977047920226, + "ce_loss_3": 4.1225879073143, + "ce_loss_6": 3.681334388256073, + "epoch": 0.42, + "grad_norm": 1000.0, + "kl_loss_12": 648.9904571533203, + "kl_loss_17": 169.29965057373047, + "kl_loss_3": 2425.1775634765627, + "kl_loss_6": 1537.0394653320313, + "learning_rate": 0.0006332369068450174, + "loss": 1190.9547, + "step": 4200 + }, + { + "ce_loss_12": 3.2161875009536742, + "ce_loss_17": 3.0059170961380004, + "ce_loss_23": 2.924168360233307, + "ce_loss_3": 4.089441418647766, + "ce_loss_6": 3.636850118637085, + "epoch": 0.421, + "grad_norm": 1144.0, + "kl_loss_12": 653.8730834960937, + "kl_loss_17": 169.04387588500975, + "kl_loss_3": 2467.079138183594, + "kl_loss_6": 1567.153143310547, + "learning_rate": 0.0006317069459158283, + "loss": 1203.6819, + "step": 4210 + }, + { + "ce_loss_12": 3.299080491065979, + "ce_loss_17": 3.100344717502594, + "ce_loss_23": 3.017459952831268, + "ce_loss_3": 4.139323222637176, + "ce_loss_6": 3.6874857187271117, + "epoch": 0.422, + "grad_norm": 1808.0, + "kl_loss_12": 640.5180389404297, + "kl_loss_17": 168.46556854248047, + "kl_loss_3": 2405.2505615234377, + "kl_loss_6": 1504.3184020996093, + "learning_rate": 0.0006301756586991561, + "loss": 1186.276, + "step": 4220 + }, + { + "ce_loss_12": 3.1177937150001527, + "ce_loss_17": 2.9004759073257445, + "ce_loss_23": 2.8152867436408995, + "ce_loss_3": 4.018686580657959, + "ce_loss_6": 3.5499990940093995, + "epoch": 0.423, + "grad_norm": 1304.0, + "kl_loss_12": 670.5303588867188, + "kl_loss_17": 170.51752090454102, + "kl_loss_3": 2563.4893188476562, + "kl_loss_6": 1611.693865966797, + "learning_rate": 0.0006286430606150459, + "loss": 1235.9328, + "step": 4230 + }, + { + "ce_loss_12": 3.3009178400039674, + "ce_loss_17": 3.0949097156524656, + "ce_loss_23": 3.011760425567627, + "ce_loss_3": 4.1611486077308655, + "ce_loss_6": 3.7066882848739624, + "epoch": 0.424, + "grad_norm": 1344.0, + "kl_loss_12": 653.6160919189454, + "kl_loss_17": 167.27356414794923, + "kl_loss_3": 2442.5386352539062, + "kl_loss_6": 1538.9455871582031, + "learning_rate": 0.0006271091670967436, + "loss": 1203.5207, + "step": 4240 + }, + { + "ce_loss_12": 3.235174858570099, + "ce_loss_17": 3.0109243631362914, + "ce_loss_23": 2.923917555809021, + "ce_loss_3": 4.137778735160827, + "ce_loss_6": 3.669788134098053, + "epoch": 0.425, + "grad_norm": 1280.0, + "kl_loss_12": 689.6631042480469, + "kl_loss_17": 173.62762527465821, + "kl_loss_3": 2564.1671875, + "kl_loss_6": 1628.9932861328125, + "learning_rate": 0.0006255739935905395, + "loss": 1234.8453, + "step": 4250 + }, + { + "ce_loss_12": 3.2523061633110046, + "ce_loss_17": 3.04565349817276, + "ce_loss_23": 2.961494970321655, + "ce_loss_3": 4.108072769641876, + "ce_loss_6": 3.6445208549499513, + "epoch": 0.426, + "grad_norm": 1320.0, + "kl_loss_12": 646.4882995605469, + "kl_loss_17": 166.65640563964843, + "kl_loss_3": 2437.322888183594, + "kl_loss_6": 1518.1019897460938, + "learning_rate": 0.0006240375555556145, + "loss": 1236.5479, + "step": 4260 + }, + { + "ce_loss_12": 3.259161615371704, + "ce_loss_17": 3.046313989162445, + "ce_loss_23": 2.961641252040863, + "ce_loss_3": 4.163806760311127, + "ce_loss_6": 3.6912804365158083, + "epoch": 0.427, + "grad_norm": 1440.0, + "kl_loss_12": 667.9634338378906, + "kl_loss_17": 168.58839416503906, + "kl_loss_3": 2539.6944580078125, + "kl_loss_6": 1599.2546325683593, + "learning_rate": 0.000622499868463882, + "loss": 1226.9559, + "step": 4270 + }, + { + "ce_loss_12": 3.214144563674927, + "ce_loss_17": 3.0191205859184267, + "ce_loss_23": 2.937934911251068, + "ce_loss_3": 4.046278321743012, + "ce_loss_6": 3.6019288301467896, + "epoch": 0.428, + "grad_norm": 1336.0, + "kl_loss_12": 633.3546997070313, + "kl_loss_17": 163.72403945922852, + "kl_loss_3": 2392.6220581054686, + "kl_loss_6": 1501.1644958496095, + "learning_rate": 0.0006209609477998338, + "loss": 1190.2975, + "step": 4280 + }, + { + "ce_loss_12": 3.2828716039657593, + "ce_loss_17": 3.0704099774360656, + "ce_loss_23": 2.9856497645378113, + "ce_loss_3": 4.1405447840690615, + "ce_loss_6": 3.6942772030830384, + "epoch": 0.429, + "grad_norm": 1400.0, + "kl_loss_12": 653.5921051025391, + "kl_loss_17": 167.50534133911134, + "kl_loss_3": 2436.593157958984, + "kl_loss_6": 1550.3903076171875, + "learning_rate": 0.0006194208090603844, + "loss": 1219.9226, + "step": 4290 + }, + { + "ce_loss_12": 3.198684871196747, + "ce_loss_17": 2.9946710228919984, + "ce_loss_23": 2.915704309940338, + "ce_loss_3": 4.058174884319305, + "ce_loss_6": 3.602585482597351, + "epoch": 0.43, + "grad_norm": 1112.0, + "kl_loss_12": 633.5590362548828, + "kl_loss_17": 159.19326934814453, + "kl_loss_3": 2418.2192260742186, + "kl_loss_6": 1507.1767578125, + "learning_rate": 0.0006178794677547138, + "loss": 1175.0201, + "step": 4300 + }, + { + "ce_loss_12": 3.2314225912094114, + "ce_loss_17": 3.01648725271225, + "ce_loss_23": 2.9362313985824584, + "ce_loss_3": 4.111103582382202, + "ce_loss_6": 3.655687725543976, + "epoch": 0.431, + "grad_norm": 1208.0, + "kl_loss_12": 660.4817016601562, + "kl_loss_17": 166.56691055297853, + "kl_loss_3": 2481.569189453125, + "kl_loss_6": 1570.9040283203126, + "learning_rate": 0.0006163369394041111, + "loss": 1211.6635, + "step": 4310 + }, + { + "ce_loss_12": 3.1613443970680235, + "ce_loss_17": 2.949456202983856, + "ce_loss_23": 2.8689070463180544, + "ce_loss_3": 4.059965026378632, + "ce_loss_6": 3.5966880559921264, + "epoch": 0.432, + "grad_norm": 1968.0, + "kl_loss_12": 647.4236877441406, + "kl_loss_17": 162.88763275146485, + "kl_loss_3": 2501.3803466796876, + "kl_loss_6": 1571.36474609375, + "learning_rate": 0.0006147932395418205, + "loss": 1242.7928, + "step": 4320 + }, + { + "ce_loss_12": 3.2022632360458374, + "ce_loss_17": 2.997159421443939, + "ce_loss_23": 2.9146835923194887, + "ce_loss_3": 4.049819684028625, + "ce_loss_6": 3.5966903924942017, + "epoch": 0.433, + "grad_norm": 1488.0, + "kl_loss_12": 644.1645294189453, + "kl_loss_17": 164.08428344726562, + "kl_loss_3": 2429.1185791015623, + "kl_loss_6": 1525.9104248046874, + "learning_rate": 0.0006132483837128823, + "loss": 1188.8746, + "step": 4330 + }, + { + "ce_loss_12": 3.1741002798080444, + "ce_loss_17": 2.9674150347709656, + "ce_loss_23": 2.8911412954330444, + "ce_loss_3": 4.052277541160583, + "ce_loss_6": 3.584049391746521, + "epoch": 0.434, + "grad_norm": 1128.0, + "kl_loss_12": 642.8700103759766, + "kl_loss_17": 162.74801635742188, + "kl_loss_3": 2470.38583984375, + "kl_loss_6": 1534.222216796875, + "learning_rate": 0.0006117023874739772, + "loss": 1203.4758, + "step": 4340 + }, + { + "ce_loss_12": 3.1774375438690186, + "ce_loss_17": 2.9687968730926513, + "ce_loss_23": 2.8868483185768126, + "ce_loss_3": 4.05833625793457, + "ce_loss_6": 3.585376000404358, + "epoch": 0.435, + "grad_norm": 1848.0, + "kl_loss_12": 650.7925689697265, + "kl_loss_17": 165.023934173584, + "kl_loss_3": 2497.553576660156, + "kl_loss_6": 1556.6371948242188, + "learning_rate": 0.0006101552663932703, + "loss": 1228.8588, + "step": 4350 + }, + { + "ce_loss_12": 3.21059513092041, + "ce_loss_17": 3.002040982246399, + "ce_loss_23": 2.917381763458252, + "ce_loss_3": 4.075988399982452, + "ce_loss_6": 3.6184175610542297, + "epoch": 0.436, + "grad_norm": 1056.0, + "kl_loss_12": 649.2522644042969, + "kl_loss_17": 168.29192047119142, + "kl_loss_3": 2451.184521484375, + "kl_loss_6": 1536.5940795898437, + "learning_rate": 0.0006086070360502539, + "loss": 1200.7145, + "step": 4360 + }, + { + "ce_loss_12": 3.208553969860077, + "ce_loss_17": 3.00421199798584, + "ce_loss_23": 2.920252966880798, + "ce_loss_3": 4.077391064167022, + "ce_loss_6": 3.6188166737556458, + "epoch": 0.437, + "grad_norm": 1216.0, + "kl_loss_12": 644.7482696533203, + "kl_loss_17": 163.39582138061525, + "kl_loss_3": 2457.9745239257813, + "kl_loss_6": 1541.4103088378906, + "learning_rate": 0.0006070577120355903, + "loss": 1205.3301, + "step": 4370 + }, + { + "ce_loss_12": 3.209773933887482, + "ce_loss_17": 3.0028355002403258, + "ce_loss_23": 2.9219225525856016, + "ce_loss_3": 4.058059203624725, + "ce_loss_6": 3.6191796541213987, + "epoch": 0.438, + "grad_norm": 1144.0, + "kl_loss_12": 628.1895538330078, + "kl_loss_17": 159.51243209838867, + "kl_loss_3": 2378.0887939453123, + "kl_loss_6": 1508.2755065917968, + "learning_rate": 0.0006055073099509549, + "loss": 1191.6818, + "step": 4380 + }, + { + "ce_loss_12": 3.2622617363929747, + "ce_loss_17": 3.0624248027801513, + "ce_loss_23": 2.9811885356903076, + "ce_loss_3": 4.111277318000793, + "ce_loss_6": 3.655835950374603, + "epoch": 0.439, + "grad_norm": 1952.0, + "kl_loss_12": 638.9650054931641, + "kl_loss_17": 164.0619644165039, + "kl_loss_3": 2425.60546875, + "kl_loss_6": 1510.8873718261718, + "learning_rate": 0.0006039558454088796, + "loss": 1209.1945, + "step": 4390 + }, + { + "ce_loss_12": 3.237137520313263, + "ce_loss_17": 3.0239644765853884, + "ce_loss_23": 2.9428634524345396, + "ce_loss_3": 4.1054279088974, + "ce_loss_6": 3.6489357113838197, + "epoch": 0.44, + "grad_norm": 1064.0, + "kl_loss_12": 653.6253509521484, + "kl_loss_17": 165.1566955566406, + "kl_loss_3": 2457.682080078125, + "kl_loss_6": 1551.16640625, + "learning_rate": 0.0006024033340325954, + "loss": 1179.8697, + "step": 4400 + }, + { + "ce_loss_12": 3.2892482042312623, + "ce_loss_17": 3.094030725955963, + "ce_loss_23": 3.014371383190155, + "ce_loss_3": 4.116533076763153, + "ce_loss_6": 3.6817184329032897, + "epoch": 0.441, + "grad_norm": 1020.0, + "kl_loss_12": 616.5719543457031, + "kl_loss_17": 158.1498680114746, + "kl_loss_3": 2336.163928222656, + "kl_loss_6": 1473.74609375, + "learning_rate": 0.0006008497914558743, + "loss": 1168.5965, + "step": 4410 + }, + { + "ce_loss_12": 3.256694257259369, + "ce_loss_17": 3.043798232078552, + "ce_loss_23": 2.95779926776886, + "ce_loss_3": 4.126014041900635, + "ce_loss_6": 3.6652104020118714, + "epoch": 0.442, + "grad_norm": 1200.0, + "kl_loss_12": 661.890103149414, + "kl_loss_17": 175.54345626831054, + "kl_loss_3": 2490.40830078125, + "kl_loss_6": 1560.8852844238281, + "learning_rate": 0.0005992952333228728, + "loss": 1215.0305, + "step": 4420 + }, + { + "ce_loss_12": 3.186796712875366, + "ce_loss_17": 2.9837971210479735, + "ce_loss_23": 2.9028133869171144, + "ce_loss_3": 4.062563931941986, + "ce_loss_6": 3.6060952067375185, + "epoch": 0.443, + "grad_norm": 1456.0, + "kl_loss_12": 638.3213073730469, + "kl_loss_17": 163.08951950073242, + "kl_loss_3": 2466.0949829101564, + "kl_loss_6": 1557.3776916503907, + "learning_rate": 0.0005977396752879741, + "loss": 1196.3525, + "step": 4430 + }, + { + "ce_loss_12": 3.125974440574646, + "ce_loss_17": 2.9124962091445923, + "ce_loss_23": 2.83063462972641, + "ce_loss_3": 3.99697060585022, + "ce_loss_6": 3.5424386978149416, + "epoch": 0.444, + "grad_norm": 1624.0, + "kl_loss_12": 652.8073974609375, + "kl_loss_17": 162.03636627197267, + "kl_loss_3": 2479.3157348632812, + "kl_loss_6": 1570.6985961914063, + "learning_rate": 0.0005961831330156305, + "loss": 1196.4949, + "step": 4440 + }, + { + "ce_loss_12": 3.258762800693512, + "ce_loss_17": 3.0480078935623167, + "ce_loss_23": 2.9659180998802186, + "ce_loss_3": 4.14400018453598, + "ce_loss_6": 3.6833078145980833, + "epoch": 0.445, + "grad_norm": 1136.0, + "kl_loss_12": 641.7249816894531, + "kl_loss_17": 162.82620162963866, + "kl_loss_3": 2477.761853027344, + "kl_loss_6": 1554.4641235351562, + "learning_rate": 0.0005946256221802051, + "loss": 1224.8336, + "step": 4450 + }, + { + "ce_loss_12": 3.2107276558876037, + "ce_loss_17": 3.018704891204834, + "ce_loss_23": 2.9421164989471436, + "ce_loss_3": 4.045899331569672, + "ce_loss_6": 3.599087679386139, + "epoch": 0.446, + "grad_norm": 1360.0, + "kl_loss_12": 614.8436950683594, + "kl_loss_17": 157.52420349121093, + "kl_loss_3": 2342.65703125, + "kl_loss_6": 1471.423095703125, + "learning_rate": 0.0005930671584658151, + "loss": 1218.3521, + "step": 4460 + }, + { + "ce_loss_12": 3.237060749530792, + "ce_loss_17": 3.0288569808006285, + "ce_loss_23": 2.9500340700149534, + "ce_loss_3": 4.092326283454895, + "ce_loss_6": 3.642471969127655, + "epoch": 0.447, + "grad_norm": 1840.0, + "kl_loss_12": 643.4912506103516, + "kl_loss_17": 162.06564025878907, + "kl_loss_3": 2438.0846557617188, + "kl_loss_6": 1528.9453491210938, + "learning_rate": 0.0005915077575661722, + "loss": 1208.8037, + "step": 4470 + }, + { + "ce_loss_12": 3.2556816220283507, + "ce_loss_17": 3.0458347320556642, + "ce_loss_23": 2.963720417022705, + "ce_loss_3": 4.116921710968017, + "ce_loss_6": 3.6561305999755858, + "epoch": 0.448, + "grad_norm": 1200.0, + "kl_loss_12": 655.64404296875, + "kl_loss_17": 166.30454788208007, + "kl_loss_3": 2469.18212890625, + "kl_loss_6": 1545.205877685547, + "learning_rate": 0.000589947435184427, + "loss": 1191.7271, + "step": 4480 + }, + { + "ce_loss_12": 3.299931359291077, + "ce_loss_17": 3.1044653058052063, + "ce_loss_23": 3.0236785292625425, + "ce_loss_3": 4.114307379722595, + "ce_loss_6": 3.6830551862716674, + "epoch": 0.449, + "grad_norm": 1536.0, + "kl_loss_12": 630.1134094238281, + "kl_loss_17": 161.1545211791992, + "kl_loss_3": 2351.4382934570312, + "kl_loss_6": 1485.6003662109374, + "learning_rate": 0.0005883862070330078, + "loss": 1180.9428, + "step": 4490 + }, + { + "ce_loss_12": 3.2484289050102233, + "ce_loss_17": 3.0422987818717955, + "ce_loss_23": 2.9627413749694824, + "ce_loss_3": 4.105246496200562, + "ce_loss_6": 3.656101632118225, + "epoch": 0.45, + "grad_norm": 1208.0, + "kl_loss_12": 648.1339263916016, + "kl_loss_17": 162.97141952514647, + "kl_loss_3": 2441.7771850585937, + "kl_loss_6": 1535.2929260253907, + "learning_rate": 0.0005868240888334653, + "loss": 1190.7193, + "step": 4500 + }, + { + "ce_loss_12": 3.1482197403907777, + "ce_loss_17": 2.9354174375534057, + "ce_loss_23": 2.8536072254180906, + "ce_loss_3": 4.035489583015442, + "ce_loss_6": 3.5650068044662477, + "epoch": 0.451, + "grad_norm": 1328.0, + "kl_loss_12": 655.2563629150391, + "kl_loss_17": 164.87384109497071, + "kl_loss_3": 2491.76171875, + "kl_loss_6": 1554.5841247558594, + "learning_rate": 0.0005852610963163119, + "loss": 1210.5104, + "step": 4510 + }, + { + "ce_loss_12": 3.1546201705932617, + "ce_loss_17": 2.955691623687744, + "ce_loss_23": 2.875958788394928, + "ce_loss_3": 4.0057038307189945, + "ce_loss_6": 3.5569921135902405, + "epoch": 0.452, + "grad_norm": 1160.0, + "kl_loss_12": 636.9841186523438, + "kl_loss_17": 159.1092071533203, + "kl_loss_3": 2414.3504943847656, + "kl_loss_6": 1519.0669189453124, + "learning_rate": 0.0005836972452208654, + "loss": 1173.0025, + "step": 4520 + }, + { + "ce_loss_12": 3.1631070137023927, + "ce_loss_17": 2.958298307657242, + "ce_loss_23": 2.879790633916855, + "ce_loss_3": 4.041290128231049, + "ce_loss_6": 3.57336847782135, + "epoch": 0.453, + "grad_norm": 1256.0, + "kl_loss_12": 641.2759063720703, + "kl_loss_17": 160.77180252075195, + "kl_loss_3": 2469.1424560546875, + "kl_loss_6": 1540.08271484375, + "learning_rate": 0.0005821325512950885, + "loss": 1202.4078, + "step": 4530 + }, + { + "ce_loss_12": 3.185246980190277, + "ce_loss_17": 2.9863064646720887, + "ce_loss_23": 2.904992377758026, + "ce_loss_3": 4.04438099861145, + "ce_loss_6": 3.583273410797119, + "epoch": 0.454, + "grad_norm": 1696.0, + "kl_loss_12": 618.6039413452148, + "kl_loss_17": 155.50898513793945, + "kl_loss_3": 2399.7892883300783, + "kl_loss_6": 1490.1276489257812, + "learning_rate": 0.0005805670302954321, + "loss": 1185.6248, + "step": 4540 + }, + { + "ce_loss_12": 3.1912306547164917, + "ce_loss_17": 2.9909960269927978, + "ce_loss_23": 2.9128016948699953, + "ce_loss_3": 4.045046019554138, + "ce_loss_6": 3.588426637649536, + "epoch": 0.455, + "grad_norm": 1248.0, + "kl_loss_12": 625.5916427612304, + "kl_loss_17": 156.7047134399414, + "kl_loss_3": 2409.2196655273438, + "kl_loss_6": 1500.3096008300781, + "learning_rate": 0.000579000697986675, + "loss": 1170.1033, + "step": 4550 + }, + { + "ce_loss_12": 3.1679309964179994, + "ce_loss_17": 2.9524415612220762, + "ce_loss_23": 2.868469202518463, + "ce_loss_3": 4.056581676006317, + "ce_loss_6": 3.5980002880096436, + "epoch": 0.456, + "grad_norm": 1192.0, + "kl_loss_12": 663.2724822998047, + "kl_loss_17": 165.01891555786133, + "kl_loss_3": 2508.131970214844, + "kl_loss_6": 1586.9396057128906, + "learning_rate": 0.0005774335701417662, + "loss": 1205.8127, + "step": 4560 + }, + { + "ce_loss_12": 3.150467586517334, + "ce_loss_17": 2.946517300605774, + "ce_loss_23": 2.866714262962341, + "ce_loss_3": 4.047620594501495, + "ce_loss_6": 3.5786996126174926, + "epoch": 0.457, + "grad_norm": 1376.0, + "kl_loss_12": 641.4974700927735, + "kl_loss_17": 156.94892120361328, + "kl_loss_3": 2521.1830200195313, + "kl_loss_6": 1573.9168090820312, + "learning_rate": 0.0005758656625416658, + "loss": 1206.3526, + "step": 4570 + }, + { + "ce_loss_12": 3.201842391490936, + "ce_loss_17": 2.9998937368392946, + "ce_loss_23": 2.920228159427643, + "ce_loss_3": 4.064460682868957, + "ce_loss_6": 3.6110380411148073, + "epoch": 0.458, + "grad_norm": 1080.0, + "kl_loss_12": 638.4472229003907, + "kl_loss_17": 163.9334915161133, + "kl_loss_3": 2430.582653808594, + "kl_loss_6": 1528.050634765625, + "learning_rate": 0.0005742969909751859, + "loss": 1177.8109, + "step": 4580 + }, + { + "ce_loss_12": 3.215990889072418, + "ce_loss_17": 3.012350058555603, + "ce_loss_23": 2.930982935428619, + "ce_loss_3": 4.088036739826203, + "ce_loss_6": 3.6223819851875305, + "epoch": 0.459, + "grad_norm": 1032.0, + "kl_loss_12": 640.5331756591797, + "kl_loss_17": 161.85816497802733, + "kl_loss_3": 2471.1506103515626, + "kl_loss_6": 1538.333563232422, + "learning_rate": 0.0005727275712388318, + "loss": 1206.5072, + "step": 4590 + }, + { + "ce_loss_12": 3.2197107672691345, + "ce_loss_17": 3.0256712198257447, + "ce_loss_23": 2.9480438113212584, + "ce_loss_3": 4.056862235069275, + "ce_loss_6": 3.620411014556885, + "epoch": 0.46, + "grad_norm": 1168.0, + "kl_loss_12": 622.0484039306641, + "kl_loss_17": 156.7641174316406, + "kl_loss_3": 2370.812939453125, + "kl_loss_6": 1489.7005432128906, + "learning_rate": 0.0005711574191366427, + "loss": 1173.6023, + "step": 4600 + }, + { + "ce_loss_12": 3.182412791252136, + "ce_loss_17": 2.9851343274116515, + "ce_loss_23": 2.9051843523979186, + "ce_loss_3": 4.031421196460724, + "ce_loss_6": 3.5838393926620484, + "epoch": 0.461, + "grad_norm": 1312.0, + "kl_loss_12": 625.0085205078125, + "kl_loss_17": 157.94583282470703, + "kl_loss_3": 2407.302880859375, + "kl_loss_6": 1504.4279418945312, + "learning_rate": 0.0005695865504800327, + "loss": 1171.1639, + "step": 4610 + }, + { + "ce_loss_12": 3.150743865966797, + "ce_loss_17": 2.927154815196991, + "ce_loss_23": 2.8416743516921996, + "ce_loss_3": 4.090725040435791, + "ce_loss_6": 3.6003042697906493, + "epoch": 0.462, + "grad_norm": 1232.0, + "kl_loss_12": 679.3878112792969, + "kl_loss_17": 168.15841064453124, + "kl_loss_3": 2633.0466064453126, + "kl_loss_6": 1651.2817749023438, + "learning_rate": 0.0005680149810876322, + "loss": 1231.4214, + "step": 4620 + }, + { + "ce_loss_12": 3.174549031257629, + "ce_loss_17": 2.976775574684143, + "ce_loss_23": 2.8976654291152952, + "ce_loss_3": 4.060347425937652, + "ce_loss_6": 3.588455593585968, + "epoch": 0.463, + "grad_norm": 1784.0, + "kl_loss_12": 632.8175598144531, + "kl_loss_17": 159.72229766845703, + "kl_loss_3": 2455.2784790039063, + "kl_loss_6": 1525.344952392578, + "learning_rate": 0.0005664427267851271, + "loss": 1187.507, + "step": 4630 + }, + { + "ce_loss_12": 3.101553475856781, + "ce_loss_17": 2.895111048221588, + "ce_loss_23": 2.81779865026474, + "ce_loss_3": 3.978833782672882, + "ce_loss_6": 3.5145297169685366, + "epoch": 0.464, + "grad_norm": 1184.0, + "kl_loss_12": 627.3488189697266, + "kl_loss_17": 157.98582382202147, + "kl_loss_3": 2433.0452392578127, + "kl_loss_6": 1518.2941955566407, + "learning_rate": 0.0005648698034051009, + "loss": 1178.09, + "step": 4640 + }, + { + "ce_loss_12": 3.212950658798218, + "ce_loss_17": 3.0110865592956544, + "ce_loss_23": 2.9278042793273924, + "ce_loss_3": 4.108318436145782, + "ce_loss_6": 3.6375867247581484, + "epoch": 0.465, + "grad_norm": 1392.0, + "kl_loss_12": 631.4466186523438, + "kl_loss_17": 159.22352905273436, + "kl_loss_3": 2484.774365234375, + "kl_loss_6": 1552.065899658203, + "learning_rate": 0.0005632962267868747, + "loss": 1182.3302, + "step": 4650 + }, + { + "ce_loss_12": 3.147960638999939, + "ce_loss_17": 2.9476505160331725, + "ce_loss_23": 2.8705123901367187, + "ce_loss_3": 4.005294620990753, + "ce_loss_6": 3.5595455527305604, + "epoch": 0.466, + "grad_norm": 1488.0, + "kl_loss_12": 616.3500366210938, + "kl_loss_17": 154.47217254638673, + "kl_loss_3": 2412.197546386719, + "kl_loss_6": 1517.6033508300782, + "learning_rate": 0.0005617220127763474, + "loss": 1185.2042, + "step": 4660 + }, + { + "ce_loss_12": 3.2249313235282897, + "ce_loss_17": 3.024825835227966, + "ce_loss_23": 2.9437759637832643, + "ce_loss_3": 4.074070203304291, + "ce_loss_6": 3.630169630050659, + "epoch": 0.467, + "grad_norm": 1208.0, + "kl_loss_12": 628.17041015625, + "kl_loss_17": 160.3174903869629, + "kl_loss_3": 2400.061901855469, + "kl_loss_6": 1509.721533203125, + "learning_rate": 0.0005601471772258368, + "loss": 1187.7549, + "step": 4670 + }, + { + "ce_loss_12": 3.206970751285553, + "ce_loss_17": 3.0103575348854066, + "ce_loss_23": 2.933112585544586, + "ce_loss_3": 4.055021023750305, + "ce_loss_6": 3.610816144943237, + "epoch": 0.468, + "grad_norm": 1592.0, + "kl_loss_12": 617.1227569580078, + "kl_loss_17": 160.79261436462403, + "kl_loss_3": 2364.9230224609373, + "kl_loss_6": 1479.913037109375, + "learning_rate": 0.0005585717359939192, + "loss": 1184.5354, + "step": 4680 + }, + { + "ce_loss_12": 3.1215055108070375, + "ce_loss_17": 2.924091303348541, + "ce_loss_23": 2.8455187678337097, + "ce_loss_3": 3.966324496269226, + "ce_loss_6": 3.5314618825912474, + "epoch": 0.469, + "grad_norm": 1552.0, + "kl_loss_12": 617.8912048339844, + "kl_loss_17": 157.79645767211915, + "kl_loss_3": 2370.995983886719, + "kl_loss_6": 1497.4713623046875, + "learning_rate": 0.0005569957049452703, + "loss": 1192.5916, + "step": 4690 + }, + { + "ce_loss_12": 3.180953764915466, + "ce_loss_17": 2.9790743708610536, + "ce_loss_23": 2.896568977832794, + "ce_loss_3": 4.062672364711761, + "ce_loss_6": 3.5977124094963076, + "epoch": 0.47, + "grad_norm": 1368.0, + "kl_loss_12": 640.2159027099609, + "kl_loss_17": 162.89459533691405, + "kl_loss_3": 2463.876123046875, + "kl_loss_6": 1536.5316650390625, + "learning_rate": 0.0005554190999505056, + "loss": 1201.356, + "step": 4700 + }, + { + "ce_loss_12": 3.3025781512260437, + "ce_loss_17": 3.0925748705863954, + "ce_loss_23": 3.01226749420166, + "ce_loss_3": 4.1633231282234195, + "ce_loss_6": 3.713399791717529, + "epoch": 0.471, + "grad_norm": 1576.0, + "kl_loss_12": 654.6351318359375, + "kl_loss_17": 165.21662445068358, + "kl_loss_3": 2447.6072021484374, + "kl_loss_6": 1546.1245971679687, + "learning_rate": 0.0005538419368860196, + "loss": 1158.7656, + "step": 4710 + }, + { + "ce_loss_12": 3.2218399047851562, + "ce_loss_17": 3.0213634967803955, + "ce_loss_23": 2.9408982038497924, + "ce_loss_3": 4.07634162902832, + "ce_loss_6": 3.623002791404724, + "epoch": 0.472, + "grad_norm": 1152.0, + "kl_loss_12": 633.4337219238281, + "kl_loss_17": 160.93989334106445, + "kl_loss_3": 2422.4453125, + "kl_loss_6": 1513.5301513671875, + "learning_rate": 0.0005522642316338268, + "loss": 1208.8412, + "step": 4720 + }, + { + "ce_loss_12": 3.2332789063453675, + "ce_loss_17": 3.037247633934021, + "ce_loss_23": 2.964172291755676, + "ce_loss_3": 4.087936019897461, + "ce_loss_6": 3.636195111274719, + "epoch": 0.473, + "grad_norm": 1592.0, + "kl_loss_12": 630.7929901123047, + "kl_loss_17": 160.92865219116212, + "kl_loss_3": 2417.127062988281, + "kl_loss_6": 1508.9412719726563, + "learning_rate": 0.0005506860000814017, + "loss": 1207.6505, + "step": 4730 + }, + { + "ce_loss_12": 3.2517629265785217, + "ce_loss_17": 3.0571023344993593, + "ce_loss_23": 2.9811705946922302, + "ce_loss_3": 4.078539133071899, + "ce_loss_6": 3.640167605876923, + "epoch": 0.474, + "grad_norm": 1376.0, + "kl_loss_12": 616.863818359375, + "kl_loss_17": 154.07796096801758, + "kl_loss_3": 2360.0311767578123, + "kl_loss_6": 1482.3599182128905, + "learning_rate": 0.0005491072581215186, + "loss": 1177.4945, + "step": 4740 + }, + { + "ce_loss_12": 3.243813467025757, + "ce_loss_17": 3.048021924495697, + "ce_loss_23": 2.9640570402145388, + "ce_loss_3": 4.092909860610962, + "ce_loss_6": 3.6491681814193724, + "epoch": 0.475, + "grad_norm": 1728.0, + "kl_loss_12": 640.3056610107421, + "kl_loss_17": 167.65764312744142, + "kl_loss_3": 2424.3011108398437, + "kl_loss_6": 1531.63828125, + "learning_rate": 0.0005475280216520913, + "loss": 1166.2744, + "step": 4750 + }, + { + "ce_loss_12": 3.177130949497223, + "ce_loss_17": 2.9823553442955015, + "ce_loss_23": 2.902591681480408, + "ce_loss_3": 4.017650127410889, + "ce_loss_6": 3.580471134185791, + "epoch": 0.476, + "grad_norm": 1328.0, + "kl_loss_12": 619.7025665283203, + "kl_loss_17": 157.63373794555665, + "kl_loss_3": 2367.5134826660155, + "kl_loss_6": 1489.2049865722656, + "learning_rate": 0.0005459483065760138, + "loss": 1194.0834, + "step": 4760 + }, + { + "ce_loss_12": 3.1291027784347536, + "ce_loss_17": 2.923326003551483, + "ce_loss_23": 2.8447796583175657, + "ce_loss_3": 4.033735084533691, + "ce_loss_6": 3.567008364200592, + "epoch": 0.477, + "grad_norm": 1312.0, + "kl_loss_12": 640.9544830322266, + "kl_loss_17": 158.97169876098633, + "kl_loss_3": 2522.970526123047, + "kl_loss_6": 1591.4856994628906, + "learning_rate": 0.0005443681288009991, + "loss": 1201.4441, + "step": 4770 + }, + { + "ce_loss_12": 3.169213426113129, + "ce_loss_17": 2.9707138419151304, + "ce_loss_23": 2.892877531051636, + "ce_loss_3": 4.032275760173798, + "ce_loss_6": 3.5805038452148437, + "epoch": 0.478, + "grad_norm": 1336.0, + "kl_loss_12": 625.7367645263672, + "kl_loss_17": 158.08297805786134, + "kl_loss_3": 2432.266711425781, + "kl_loss_6": 1530.6936645507812, + "learning_rate": 0.0005427875042394199, + "loss": 1189.791, + "step": 4780 + }, + { + "ce_loss_12": 3.2083109736442568, + "ce_loss_17": 3.0140113115310667, + "ce_loss_23": 2.931912732124329, + "ce_loss_3": 4.046906161308288, + "ce_loss_6": 3.6133265018463137, + "epoch": 0.479, + "grad_norm": 1320.0, + "kl_loss_12": 629.1389862060547, + "kl_loss_17": 163.8302314758301, + "kl_loss_3": 2378.247174072266, + "kl_loss_6": 1515.6248962402344, + "learning_rate": 0.0005412064488081482, + "loss": 1192.5863, + "step": 4790 + }, + { + "ce_loss_12": 3.1922692894935607, + "ce_loss_17": 2.998089075088501, + "ce_loss_23": 2.920244073867798, + "ce_loss_3": 4.036051690578461, + "ce_loss_6": 3.5894766449928284, + "epoch": 0.48, + "grad_norm": 1384.0, + "kl_loss_12": 606.6053298950195, + "kl_loss_17": 154.59100418090821, + "kl_loss_3": 2355.809326171875, + "kl_loss_6": 1466.8870239257812, + "learning_rate": 0.0005396249784283942, + "loss": 1154.8229, + "step": 4800 + }, + { + "ce_loss_12": 3.2233909606933593, + "ce_loss_17": 3.020741271972656, + "ce_loss_23": 2.9392097353935243, + "ce_loss_3": 4.105383563041687, + "ce_loss_6": 3.64882515668869, + "epoch": 0.481, + "grad_norm": 1432.0, + "kl_loss_12": 642.4943237304688, + "kl_loss_17": 161.35825271606444, + "kl_loss_3": 2480.23994140625, + "kl_loss_6": 1553.8017883300781, + "learning_rate": 0.0005380431090255476, + "loss": 1200.9545, + "step": 4810 + }, + { + "ce_loss_12": 3.2107723474502565, + "ce_loss_17": 3.0163851976394653, + "ce_loss_23": 2.943860149383545, + "ce_loss_3": 4.046610677242279, + "ce_loss_6": 3.6110605597496033, + "epoch": 0.482, + "grad_norm": 1064.0, + "kl_loss_12": 608.278759765625, + "kl_loss_17": 150.86255111694337, + "kl_loss_3": 2350.0280334472654, + "kl_loss_6": 1480.5289916992188, + "learning_rate": 0.0005364608565290155, + "loss": 1158.0291, + "step": 4820 + }, + { + "ce_loss_12": 3.232293450832367, + "ce_loss_17": 3.028693342208862, + "ce_loss_23": 2.9507614016532897, + "ce_loss_3": 4.080292189121247, + "ce_loss_6": 3.6320470571517944, + "epoch": 0.483, + "grad_norm": 1344.0, + "kl_loss_12": 628.4602233886719, + "kl_loss_17": 159.22116661071777, + "kl_loss_3": 2406.576037597656, + "kl_loss_6": 1513.3693908691407, + "learning_rate": 0.0005348782368720626, + "loss": 1178.7437, + "step": 4830 + }, + { + "ce_loss_12": 3.161022257804871, + "ce_loss_17": 2.9622381567955016, + "ce_loss_23": 2.8851508855819703, + "ce_loss_3": 4.004925203323364, + "ce_loss_6": 3.550952982902527, + "epoch": 0.484, + "grad_norm": 1144.0, + "kl_loss_12": 608.7730895996094, + "kl_loss_17": 153.5218879699707, + "kl_loss_3": 2357.715283203125, + "kl_loss_6": 1467.4348754882812, + "learning_rate": 0.000533295265991652, + "loss": 1169.3563, + "step": 4840 + }, + { + "ce_loss_12": 3.2256028175354006, + "ce_loss_17": 3.0272963285446166, + "ce_loss_23": 2.9493767857551574, + "ce_loss_3": 4.058467495441437, + "ce_loss_6": 3.626989758014679, + "epoch": 0.485, + "grad_norm": 1520.0, + "kl_loss_12": 616.5258331298828, + "kl_loss_17": 155.4004020690918, + "kl_loss_3": 2356.535711669922, + "kl_loss_6": 1479.7210266113282, + "learning_rate": 0.0005317119598282822, + "loss": 1154.3794, + "step": 4850 + }, + { + "ce_loss_12": 3.229808497428894, + "ce_loss_17": 3.0339391946792604, + "ce_loss_23": 2.9537967324256895, + "ce_loss_3": 4.076095080375671, + "ce_loss_6": 3.632161784172058, + "epoch": 0.486, + "grad_norm": 1320.0, + "kl_loss_12": 631.5225708007813, + "kl_loss_17": 158.15500564575194, + "kl_loss_3": 2388.7497802734374, + "kl_loss_6": 1500.3150573730468, + "learning_rate": 0.0005301283343258293, + "loss": 1169.4148, + "step": 4860 + }, + { + "ce_loss_12": 3.27712562084198, + "ce_loss_17": 3.0884708404541015, + "ce_loss_23": 3.010235869884491, + "ce_loss_3": 4.1028130650520325, + "ce_loss_6": 3.6795893907546997, + "epoch": 0.487, + "grad_norm": 1512.0, + "kl_loss_12": 618.7816528320312, + "kl_loss_17": 158.00575103759766, + "kl_loss_3": 2343.5960083007812, + "kl_loss_6": 1490.7032409667968, + "learning_rate": 0.000528544405431384, + "loss": 1152.7285, + "step": 4870 + }, + { + "ce_loss_12": 3.17356424331665, + "ce_loss_17": 2.9689354300498962, + "ce_loss_23": 2.885971963405609, + "ce_loss_3": 4.02713428735733, + "ce_loss_6": 3.590455937385559, + "epoch": 0.488, + "grad_norm": 1704.0, + "kl_loss_12": 640.2501586914062, + "kl_loss_17": 162.25452423095703, + "kl_loss_3": 2433.520947265625, + "kl_loss_6": 1546.4126037597657, + "learning_rate": 0.000526960189095093, + "loss": 1193.9695, + "step": 4880 + }, + { + "ce_loss_12": 3.156309223175049, + "ce_loss_17": 2.9626466035842896, + "ce_loss_23": 2.8836700916290283, + "ce_loss_3": 4.014741611480713, + "ce_loss_6": 3.5583111882209777, + "epoch": 0.489, + "grad_norm": 1544.0, + "kl_loss_12": 609.2802368164063, + "kl_loss_17": 155.42310028076173, + "kl_loss_3": 2381.7218688964845, + "kl_loss_6": 1482.0472900390625, + "learning_rate": 0.0005253757012699972, + "loss": 1162.8098, + "step": 4890 + }, + { + "ce_loss_12": 3.2240451335906983, + "ce_loss_17": 3.027480626106262, + "ce_loss_23": 2.9526946187019347, + "ce_loss_3": 4.06362019777298, + "ce_loss_6": 3.6180875420570375, + "epoch": 0.49, + "grad_norm": 1568.0, + "kl_loss_12": 621.5676177978515, + "kl_loss_17": 155.44562911987305, + "kl_loss_3": 2374.916650390625, + "kl_loss_6": 1487.183331298828, + "learning_rate": 0.0005237909579118712, + "loss": 1179.1473, + "step": 4900 + }, + { + "ce_loss_12": 3.204844522476196, + "ce_loss_17": 2.998248517513275, + "ce_loss_23": 2.9153013467788695, + "ce_loss_3": 4.079318523406982, + "ce_loss_6": 3.6167992115020753, + "epoch": 0.491, + "grad_norm": 1240.0, + "kl_loss_12": 638.6949798583985, + "kl_loss_17": 163.68312225341796, + "kl_loss_3": 2471.2435791015623, + "kl_loss_6": 1535.6149047851563, + "learning_rate": 0.0005222059749790631, + "loss": 1193.4671, + "step": 4910 + }, + { + "ce_loss_12": 3.243853008747101, + "ce_loss_17": 3.0543193221092224, + "ce_loss_23": 2.97930725812912, + "ce_loss_3": 4.062484884262085, + "ce_loss_6": 3.6296430587768556, + "epoch": 0.492, + "grad_norm": 1072.0, + "kl_loss_12": 611.8506225585937, + "kl_loss_17": 155.02143783569335, + "kl_loss_3": 2325.0894165039062, + "kl_loss_6": 1460.1129760742188, + "learning_rate": 0.0005206207684323337, + "loss": 1139.8899, + "step": 4920 + }, + { + "ce_loss_12": 3.23418208360672, + "ce_loss_17": 3.033683145046234, + "ce_loss_23": 2.956132411956787, + "ce_loss_3": 4.078335225582123, + "ce_loss_6": 3.6333807587623594, + "epoch": 0.493, + "grad_norm": 1200.0, + "kl_loss_12": 625.106005859375, + "kl_loss_17": 159.5037414550781, + "kl_loss_3": 2385.7584594726563, + "kl_loss_6": 1500.72548828125, + "learning_rate": 0.000519035354234695, + "loss": 1187.7187, + "step": 4930 + }, + { + "ce_loss_12": 3.221453142166138, + "ce_loss_17": 3.0156876921653746, + "ce_loss_23": 2.9304900646209715, + "ce_loss_3": 4.065563333034516, + "ce_loss_6": 3.623839497566223, + "epoch": 0.494, + "grad_norm": 1312.0, + "kl_loss_12": 631.9784317016602, + "kl_loss_17": 163.04226608276366, + "kl_loss_3": 2379.5922790527343, + "kl_loss_6": 1495.0668151855468, + "learning_rate": 0.0005174497483512506, + "loss": 1155.3355, + "step": 4940 + }, + { + "ce_loss_12": 3.250180208683014, + "ce_loss_17": 3.0572829842567444, + "ce_loss_23": 2.9815022587776183, + "ce_loss_3": 4.084192478656769, + "ce_loss_6": 3.6426392197608948, + "epoch": 0.495, + "grad_norm": 1928.0, + "kl_loss_12": 613.2151184082031, + "kl_loss_17": 155.47557220458984, + "kl_loss_3": 2378.283447265625, + "kl_loss_6": 1486.3792419433594, + "learning_rate": 0.0005158639667490339, + "loss": 1179.6014, + "step": 4950 + }, + { + "ce_loss_12": 3.172983002662659, + "ce_loss_17": 2.9710723996162414, + "ce_loss_23": 2.893413710594177, + "ce_loss_3": 4.014413499832154, + "ce_loss_6": 3.5701647758483888, + "epoch": 0.496, + "grad_norm": 1336.0, + "kl_loss_12": 622.7073608398438, + "kl_loss_17": 157.3909996032715, + "kl_loss_3": 2391.574560546875, + "kl_loss_6": 1500.2932250976562, + "learning_rate": 0.0005142780253968481, + "loss": 1170.5403, + "step": 4960 + }, + { + "ce_loss_12": 3.112742531299591, + "ce_loss_17": 2.9256129026412965, + "ce_loss_23": 2.850318455696106, + "ce_loss_3": 3.9490036368370056, + "ce_loss_6": 3.5061758756637573, + "epoch": 0.497, + "grad_norm": 1312.0, + "kl_loss_12": 602.9799438476563, + "kl_loss_17": 152.73908004760742, + "kl_loss_3": 2332.318981933594, + "kl_loss_6": 1447.7930541992187, + "learning_rate": 0.0005126919402651053, + "loss": 1129.6568, + "step": 4970 + }, + { + "ce_loss_12": 3.19446576833725, + "ce_loss_17": 2.9870991706848145, + "ce_loss_23": 2.904223990440369, + "ce_loss_3": 4.050873291492462, + "ce_loss_6": 3.601293611526489, + "epoch": 0.498, + "grad_norm": 1488.0, + "kl_loss_12": 632.0435089111328, + "kl_loss_17": 161.6195495605469, + "kl_loss_3": 2404.96845703125, + "kl_loss_6": 1518.4857971191407, + "learning_rate": 0.0005111057273256647, + "loss": 1178.4184, + "step": 4980 + }, + { + "ce_loss_12": 3.2594648718833925, + "ce_loss_17": 3.0783557653427125, + "ce_loss_23": 3.006947600841522, + "ce_loss_3": 4.052989423274994, + "ce_loss_6": 3.6285780549049376, + "epoch": 0.499, + "grad_norm": 1160.0, + "kl_loss_12": 591.1621826171875, + "kl_loss_17": 149.88751144409179, + "kl_loss_3": 2255.0285034179688, + "kl_loss_6": 1416.2771301269531, + "learning_rate": 0.0005095194025516733, + "loss": 1120.7145, + "step": 4990 + }, + { + "ce_loss_12": 3.2018948078155516, + "ce_loss_17": 3.010780358314514, + "ce_loss_23": 2.9380059957504274, + "ce_loss_3": 4.035202991962433, + "ce_loss_6": 3.5946489214897155, + "epoch": 0.5, + "grad_norm": 1480.0, + "kl_loss_12": 611.1847503662109, + "kl_loss_17": 151.75913848876954, + "kl_loss_3": 2353.3317260742188, + "kl_loss_6": 1474.8636840820313, + "learning_rate": 0.000507932981917404, + "loss": 1187.5797, + "step": 5000 + }, + { + "ce_loss_12": 3.168850815296173, + "ce_loss_17": 2.9612261295318603, + "ce_loss_23": 2.8815466165542603, + "ce_loss_3": 4.061691415309906, + "ce_loss_6": 3.5973922848701476, + "epoch": 0.501, + "grad_norm": 1352.0, + "kl_loss_12": 649.269076538086, + "kl_loss_17": 162.09185409545898, + "kl_loss_3": 2494.731311035156, + "kl_loss_6": 1573.7349243164062, + "learning_rate": 0.0005063464813980949, + "loss": 1211.8479, + "step": 5010 + }, + { + "ce_loss_12": 3.1469634652137755, + "ce_loss_17": 2.949883961677551, + "ce_loss_23": 2.87452654838562, + "ce_loss_3": 3.999550521373749, + "ce_loss_6": 3.554929995536804, + "epoch": 0.502, + "grad_norm": 1520.0, + "kl_loss_12": 621.3834503173828, + "kl_loss_17": 154.7421775817871, + "kl_loss_3": 2403.564245605469, + "kl_loss_6": 1519.1332580566407, + "learning_rate": 0.0005047599169697884, + "loss": 1166.2583, + "step": 5020 + }, + { + "ce_loss_12": 3.0992706537246706, + "ce_loss_17": 2.893956518173218, + "ce_loss_23": 2.816172742843628, + "ce_loss_3": 3.962613356113434, + "ce_loss_6": 3.5066237807273866, + "epoch": 0.503, + "grad_norm": 1584.0, + "kl_loss_12": 621.5045135498046, + "kl_loss_17": 155.74066162109375, + "kl_loss_3": 2408.344482421875, + "kl_loss_6": 1497.9727783203125, + "learning_rate": 0.000503173304609171, + "loss": 1146.7187, + "step": 5030 + }, + { + "ce_loss_12": 3.2114111304283144, + "ce_loss_17": 3.00760133266449, + "ce_loss_23": 2.929845404624939, + "ce_loss_3": 4.0550275325775145, + "ce_loss_6": 3.614953136444092, + "epoch": 0.504, + "grad_norm": 1392.0, + "kl_loss_12": 616.6142700195312, + "kl_loss_17": 153.8737907409668, + "kl_loss_3": 2371.5414184570313, + "kl_loss_6": 1496.425360107422, + "learning_rate": 0.0005015866602934111, + "loss": 1149.2646, + "step": 5040 + }, + { + "ce_loss_12": 3.192697358131409, + "ce_loss_17": 2.9853726983070374, + "ce_loss_23": 2.9032091379165648, + "ce_loss_3": 4.063418805599213, + "ce_loss_6": 3.615345025062561, + "epoch": 0.505, + "grad_norm": 1352.0, + "kl_loss_12": 647.4493804931641, + "kl_loss_17": 163.64679336547852, + "kl_loss_3": 2449.2982788085938, + "kl_loss_6": 1550.4903137207032, + "learning_rate": 0.0005, + "loss": 1182.5582, + "step": 5050 + }, + { + "ce_loss_12": 3.176822340488434, + "ce_loss_17": 2.9773082733154297, + "ce_loss_23": 2.8989724278450013, + "ce_loss_3": 4.026598381996155, + "ce_loss_6": 3.575958526134491, + "epoch": 0.506, + "grad_norm": 1360.0, + "kl_loss_12": 629.1418579101562, + "kl_loss_17": 159.41609840393068, + "kl_loss_3": 2382.302380371094, + "kl_loss_6": 1493.960711669922, + "learning_rate": 0.0004984133397065889, + "loss": 1151.1613, + "step": 5060 + }, + { + "ce_loss_12": 3.1813357710838317, + "ce_loss_17": 2.978407895565033, + "ce_loss_23": 2.897935354709625, + "ce_loss_3": 4.0521345138549805, + "ce_loss_6": 3.594593644142151, + "epoch": 0.507, + "grad_norm": 1024.0, + "kl_loss_12": 628.001220703125, + "kl_loss_17": 156.38430099487306, + "kl_loss_3": 2419.7817504882814, + "kl_loss_6": 1517.1355041503907, + "learning_rate": 0.0004968266953908291, + "loss": 1157.7264, + "step": 5070 + }, + { + "ce_loss_12": 3.2114148855209352, + "ce_loss_17": 3.0141122221946715, + "ce_loss_23": 2.939532482624054, + "ce_loss_3": 4.073699676990509, + "ce_loss_6": 3.6233365297317506, + "epoch": 0.508, + "grad_norm": 1256.0, + "kl_loss_12": 617.4212463378906, + "kl_loss_17": 153.03925399780275, + "kl_loss_3": 2417.5599975585938, + "kl_loss_6": 1511.7976196289062, + "learning_rate": 0.0004952400830302117, + "loss": 1168.6787, + "step": 5080 + }, + { + "ce_loss_12": 3.1521151065826416, + "ce_loss_17": 2.9467004776000976, + "ce_loss_23": 2.868589723110199, + "ce_loss_3": 4.024004089832306, + "ce_loss_6": 3.568022918701172, + "epoch": 0.509, + "grad_norm": 1256.0, + "kl_loss_12": 634.872933959961, + "kl_loss_17": 157.10798721313478, + "kl_loss_3": 2442.0026611328126, + "kl_loss_6": 1529.1586181640625, + "learning_rate": 0.0004936535186019053, + "loss": 1166.7918, + "step": 5090 + }, + { + "ce_loss_12": 3.2297829389572144, + "ce_loss_17": 3.040994679927826, + "ce_loss_23": 2.9676496148109437, + "ce_loss_3": 4.055515456199646, + "ce_loss_6": 3.6191888570785524, + "epoch": 0.51, + "grad_norm": 1024.0, + "kl_loss_12": 593.3225189208985, + "kl_loss_17": 148.11423950195314, + "kl_loss_3": 2310.919567871094, + "kl_loss_6": 1443.4416381835938, + "learning_rate": 0.000492067018082596, + "loss": 1142.1032, + "step": 5100 + }, + { + "ce_loss_12": 3.1935842633247375, + "ce_loss_17": 2.980956423282623, + "ce_loss_23": 2.9005528569221495, + "ce_loss_3": 4.082455849647522, + "ce_loss_6": 3.6126163482666014, + "epoch": 0.511, + "grad_norm": 1240.0, + "kl_loss_12": 640.4228515625, + "kl_loss_17": 160.77752609252929, + "kl_loss_3": 2484.1706665039064, + "kl_loss_6": 1551.0786743164062, + "learning_rate": 0.0004904805974483267, + "loss": 1208.9242, + "step": 5110 + }, + { + "ce_loss_12": 3.299477529525757, + "ce_loss_17": 3.091015672683716, + "ce_loss_23": 3.007502889633179, + "ce_loss_3": 4.168566477298737, + "ce_loss_6": 3.719189095497131, + "epoch": 0.512, + "grad_norm": 1664.0, + "kl_loss_12": 667.1521362304687, + "kl_loss_17": 170.16158065795898, + "kl_loss_3": 2471.5020751953125, + "kl_loss_6": 1574.2832885742187, + "learning_rate": 0.0004888942726743353, + "loss": 1230.5736, + "step": 5120 + }, + { + "ce_loss_12": 3.16829628944397, + "ce_loss_17": 2.9704806327819826, + "ce_loss_23": 2.890995967388153, + "ce_loss_3": 4.033894419670105, + "ce_loss_6": 3.5718558073043822, + "epoch": 0.513, + "grad_norm": 1488.0, + "kl_loss_12": 630.4835922241211, + "kl_loss_17": 157.3284164428711, + "kl_loss_3": 2447.2002685546877, + "kl_loss_6": 1523.4559631347656, + "learning_rate": 0.0004873080597348947, + "loss": 1187.4595, + "step": 5130 + }, + { + "ce_loss_12": 3.0680445313453673, + "ce_loss_17": 2.8610573053359984, + "ce_loss_23": 2.782891494035721, + "ce_loss_3": 3.9727707147598266, + "ce_loss_6": 3.504224932193756, + "epoch": 0.514, + "grad_norm": 1040.0, + "kl_loss_12": 635.2664459228515, + "kl_loss_17": 153.44505653381347, + "kl_loss_3": 2509.3319580078123, + "kl_loss_6": 1573.8511840820313, + "learning_rate": 0.0004857219746031519, + "loss": 1189.5951, + "step": 5140 + }, + { + "ce_loss_12": 3.215253460407257, + "ce_loss_17": 3.0270629644393923, + "ce_loss_23": 2.9507899045944215, + "ce_loss_3": 4.048285496234894, + "ce_loss_6": 3.607727384567261, + "epoch": 0.515, + "grad_norm": 1184.0, + "kl_loss_12": 612.8102447509766, + "kl_loss_17": 155.12429580688476, + "kl_loss_3": 2349.26357421875, + "kl_loss_6": 1470.0365478515625, + "learning_rate": 0.0004841360332509663, + "loss": 1161.8135, + "step": 5150 + }, + { + "ce_loss_12": 3.1750805974006653, + "ce_loss_17": 2.9839065790176393, + "ce_loss_23": 2.909264051914215, + "ce_loss_3": 4.012336456775666, + "ce_loss_6": 3.570148003101349, + "epoch": 0.516, + "grad_norm": 1320.0, + "kl_loss_12": 603.9526214599609, + "kl_loss_17": 149.85967941284179, + "kl_loss_3": 2344.778875732422, + "kl_loss_6": 1457.8424560546875, + "learning_rate": 0.0004825502516487497, + "loss": 1113.3957, + "step": 5160 + }, + { + "ce_loss_12": 3.1396324634552, + "ce_loss_17": 2.939078176021576, + "ce_loss_23": 2.8661806344985963, + "ce_loss_3": 4.002681982517243, + "ce_loss_6": 3.5561633348464965, + "epoch": 0.517, + "grad_norm": 1528.0, + "kl_loss_12": 628.6352111816407, + "kl_loss_17": 154.36906967163085, + "kl_loss_3": 2431.3240234375, + "kl_loss_6": 1526.0999755859375, + "learning_rate": 0.00048096464576530507, + "loss": 1186.7277, + "step": 5170 + }, + { + "ce_loss_12": 3.228849673271179, + "ce_loss_17": 3.045258629322052, + "ce_loss_23": 2.9680622220039368, + "ce_loss_3": 4.044418549537658, + "ce_loss_6": 3.6099117040634154, + "epoch": 0.518, + "grad_norm": 1152.0, + "kl_loss_12": 603.8460083007812, + "kl_loss_17": 154.73146209716796, + "kl_loss_3": 2300.7780151367188, + "kl_loss_6": 1438.4257019042968, + "learning_rate": 0.00047937923156766646, + "loss": 1133.8398, + "step": 5180 + }, + { + "ce_loss_12": 3.267943871021271, + "ce_loss_17": 3.0870012402534486, + "ce_loss_23": 3.012511909008026, + "ce_loss_3": 4.066486668586731, + "ce_loss_6": 3.6441221475601195, + "epoch": 0.519, + "grad_norm": 1232.0, + "kl_loss_12": 601.1100494384766, + "kl_loss_17": 151.7480926513672, + "kl_loss_3": 2295.0484985351563, + "kl_loss_6": 1444.3046752929688, + "learning_rate": 0.00047779402502093696, + "loss": 1138.6594, + "step": 5190 + }, + { + "ce_loss_12": 3.247831332683563, + "ce_loss_17": 3.0561190247535706, + "ce_loss_23": 2.98013778924942, + "ce_loss_3": 4.0801305890083315, + "ce_loss_6": 3.637748634815216, + "epoch": 0.52, + "grad_norm": 1248.0, + "kl_loss_12": 612.1767059326172, + "kl_loss_17": 153.50031814575195, + "kl_loss_3": 2341.7104736328124, + "kl_loss_6": 1458.6237915039062, + "learning_rate": 0.0004762090420881289, + "loss": 1154.5121, + "step": 5200 + }, + { + "ce_loss_12": 3.1620102167129516, + "ce_loss_17": 2.971610796451569, + "ce_loss_23": 2.8964218378067015, + "ce_loss_3": 3.98571834564209, + "ce_loss_6": 3.562778389453888, + "epoch": 0.521, + "grad_norm": 1496.0, + "kl_loss_12": 611.9804290771484, + "kl_loss_17": 154.26113586425782, + "kl_loss_3": 2339.6232177734373, + "kl_loss_6": 1482.629901123047, + "learning_rate": 0.00047462429873000296, + "loss": 1134.4658, + "step": 5210 + }, + { + "ce_loss_12": 3.2483769059181213, + "ce_loss_17": 3.0610496520996096, + "ce_loss_23": 2.9822338104248045, + "ce_loss_3": 4.0718208193778995, + "ce_loss_6": 3.6312170386314393, + "epoch": 0.522, + "grad_norm": 1328.0, + "kl_loss_12": 611.2712203979493, + "kl_loss_17": 157.38566665649415, + "kl_loss_3": 2337.005651855469, + "kl_loss_6": 1462.1126342773437, + "learning_rate": 0.0004730398109049071, + "loss": 1139.7597, + "step": 5220 + }, + { + "ce_loss_12": 3.186510670185089, + "ce_loss_17": 2.982706034183502, + "ce_loss_23": 2.9045578479766845, + "ce_loss_3": 4.0648169159889225, + "ce_loss_6": 3.6114538311958313, + "epoch": 0.523, + "grad_norm": 1712.0, + "kl_loss_12": 641.9009521484375, + "kl_loss_17": 159.46908073425294, + "kl_loss_3": 2457.0918823242187, + "kl_loss_6": 1559.2233764648438, + "learning_rate": 0.000471455594568616, + "loss": 1175.8197, + "step": 5230 + }, + { + "ce_loss_12": 3.237089014053345, + "ce_loss_17": 3.050284469127655, + "ce_loss_23": 2.9735464096069335, + "ce_loss_3": 4.047674942016601, + "ce_loss_6": 3.6242913603782654, + "epoch": 0.524, + "grad_norm": 1712.0, + "kl_loss_12": 605.2679107666015, + "kl_loss_17": 153.3778953552246, + "kl_loss_3": 2295.2422485351562, + "kl_loss_6": 1445.4991394042968, + "learning_rate": 0.00046987166567417086, + "loss": 1149.0238, + "step": 5240 + }, + { + "ce_loss_12": 3.167953300476074, + "ce_loss_17": 2.974595832824707, + "ce_loss_23": 2.901353621482849, + "ce_loss_3": 4.015729677677155, + "ce_loss_6": 3.566538155078888, + "epoch": 0.525, + "grad_norm": 1352.0, + "kl_loss_12": 610.1307922363281, + "kl_loss_17": 151.42123947143554, + "kl_loss_3": 2369.18935546875, + "kl_loss_6": 1472.7845275878906, + "learning_rate": 0.00046828804017171776, + "loss": 1122.4889, + "step": 5250 + }, + { + "ce_loss_12": 3.2159897685050964, + "ce_loss_17": 3.016587197780609, + "ce_loss_23": 2.934138369560242, + "ce_loss_3": 4.09003005027771, + "ce_loss_6": 3.626188361644745, + "epoch": 0.526, + "grad_norm": 1496.0, + "kl_loss_12": 623.1038421630859, + "kl_loss_17": 155.54010925292968, + "kl_loss_3": 2416.110217285156, + "kl_loss_6": 1499.7231750488281, + "learning_rate": 0.00046670473400834805, + "loss": 1177.702, + "step": 5260 + }, + { + "ce_loss_12": 3.142614114284515, + "ce_loss_17": 2.9521865844726562, + "ce_loss_23": 2.8796603083610535, + "ce_loss_3": 3.9801135301589965, + "ce_loss_6": 3.5341135025024415, + "epoch": 0.527, + "grad_norm": 1504.0, + "kl_loss_12": 600.9233627319336, + "kl_loss_17": 148.42291259765625, + "kl_loss_3": 2336.1134399414063, + "kl_loss_6": 1451.7852172851562, + "learning_rate": 0.00046512176312793734, + "loss": 1179.2488, + "step": 5270 + }, + { + "ce_loss_12": 3.140128016471863, + "ce_loss_17": 2.948225402832031, + "ce_loss_23": 2.8701087832450867, + "ce_loss_3": 3.9780079245567324, + "ce_loss_6": 3.537747693061829, + "epoch": 0.528, + "grad_norm": 1456.0, + "kl_loss_12": 608.3226043701172, + "kl_loss_17": 152.69099807739258, + "kl_loss_3": 2351.3090087890623, + "kl_loss_6": 1478.2592712402343, + "learning_rate": 0.00046353914347098467, + "loss": 1159.9852, + "step": 5280 + }, + { + "ce_loss_12": 3.240424120426178, + "ce_loss_17": 3.0438877940177917, + "ce_loss_23": 2.968979835510254, + "ce_loss_3": 4.078971469402314, + "ce_loss_6": 3.6362814903259277, + "epoch": 0.529, + "grad_norm": 1456.0, + "kl_loss_12": 602.2308624267578, + "kl_loss_17": 151.3656234741211, + "kl_loss_3": 2341.065155029297, + "kl_loss_6": 1462.9760620117188, + "learning_rate": 0.0004619568909744524, + "loss": 1160.6302, + "step": 5290 + }, + { + "ce_loss_12": 3.2395084261894227, + "ce_loss_17": 3.0485966920852663, + "ce_loss_23": 2.9723729729652404, + "ce_loss_3": 4.065326976776123, + "ce_loss_6": 3.6257777571678163, + "epoch": 0.53, + "grad_norm": 1248.0, + "kl_loss_12": 606.4155975341797, + "kl_loss_17": 152.69616012573243, + "kl_loss_3": 2335.6730346679688, + "kl_loss_6": 1465.7213806152345, + "learning_rate": 0.00046037502157160573, + "loss": 1155.7611, + "step": 5300 + }, + { + "ce_loss_12": 3.12794451713562, + "ce_loss_17": 2.931117033958435, + "ce_loss_23": 2.85086829662323, + "ce_loss_3": 3.966094410419464, + "ce_loss_6": 3.5300044775009156, + "epoch": 0.531, + "grad_norm": 1048.0, + "kl_loss_12": 615.4203247070312, + "kl_loss_17": 155.38886947631835, + "kl_loss_3": 2372.1928100585938, + "kl_loss_6": 1473.2394470214845, + "learning_rate": 0.00045879355119185207, + "loss": 1158.3116, + "step": 5310 + }, + { + "ce_loss_12": 3.211964750289917, + "ce_loss_17": 3.0098439931869505, + "ce_loss_23": 2.93164439201355, + "ce_loss_3": 4.059649121761322, + "ce_loss_6": 3.613554799556732, + "epoch": 0.532, + "grad_norm": 1224.0, + "kl_loss_12": 631.3654266357422, + "kl_loss_17": 157.1368392944336, + "kl_loss_3": 2409.542370605469, + "kl_loss_6": 1514.180194091797, + "learning_rate": 0.0004572124957605803, + "loss": 1181.2221, + "step": 5320 + }, + { + "ce_loss_12": 3.214549386501312, + "ce_loss_17": 3.0166999816894533, + "ce_loss_23": 2.93872389793396, + "ce_loss_3": 4.0620667934417725, + "ce_loss_6": 3.61503267288208, + "epoch": 0.533, + "grad_norm": 1640.0, + "kl_loss_12": 616.4297027587891, + "kl_loss_17": 155.38391342163087, + "kl_loss_3": 2374.2799072265625, + "kl_loss_6": 1490.559344482422, + "learning_rate": 0.00045563187119900103, + "loss": 1147.4326, + "step": 5330 + }, + { + "ce_loss_12": 3.06915682554245, + "ce_loss_17": 2.8725393414497375, + "ce_loss_23": 2.7969800353050234, + "ce_loss_3": 3.9451221227645874, + "ce_loss_6": 3.4815810322761536, + "epoch": 0.534, + "grad_norm": 1472.0, + "kl_loss_12": 620.3168975830079, + "kl_loss_17": 154.61506118774415, + "kl_loss_3": 2445.430871582031, + "kl_loss_6": 1517.9400695800782, + "learning_rate": 0.00045405169342398633, + "loss": 1176.2085, + "step": 5340 + }, + { + "ce_loss_12": 3.160827076435089, + "ce_loss_17": 2.961031210422516, + "ce_loss_23": 2.8817851185798644, + "ce_loss_3": 4.027056002616883, + "ce_loss_6": 3.5688594460487364, + "epoch": 0.535, + "grad_norm": 1144.0, + "kl_loss_12": 625.2731628417969, + "kl_loss_17": 158.35914306640626, + "kl_loss_3": 2426.1759643554688, + "kl_loss_6": 1511.896221923828, + "learning_rate": 0.0004524719783479088, + "loss": 1152.8082, + "step": 5350 + }, + { + "ce_loss_12": 3.1208131194114683, + "ce_loss_17": 2.918009865283966, + "ce_loss_23": 2.8386128544807434, + "ce_loss_3": 4.007599294185638, + "ce_loss_6": 3.5410866260528566, + "epoch": 0.536, + "grad_norm": 1064.0, + "kl_loss_12": 629.1234069824219, + "kl_loss_17": 158.10762481689454, + "kl_loss_3": 2474.5676025390626, + "kl_loss_6": 1533.2588256835938, + "learning_rate": 0.00045089274187848144, + "loss": 1160.9585, + "step": 5360 + }, + { + "ce_loss_12": 3.2206862330436707, + "ce_loss_17": 3.032423961162567, + "ce_loss_23": 2.956794571876526, + "ce_loss_3": 4.06341255903244, + "ce_loss_6": 3.610382318496704, + "epoch": 0.537, + "grad_norm": 1320.0, + "kl_loss_12": 609.552490234375, + "kl_loss_17": 154.00097732543946, + "kl_loss_3": 2372.342419433594, + "kl_loss_6": 1459.6657836914062, + "learning_rate": 0.00044931399991859835, + "loss": 1143.2369, + "step": 5370 + }, + { + "ce_loss_12": 3.0889888763427735, + "ce_loss_17": 2.8907247304916384, + "ce_loss_23": 2.8156118273735045, + "ce_loss_3": 3.93492271900177, + "ce_loss_6": 3.4845082759857178, + "epoch": 0.538, + "grad_norm": 1104.0, + "kl_loss_12": 614.6871841430664, + "kl_loss_17": 152.0139019012451, + "kl_loss_3": 2381.980810546875, + "kl_loss_6": 1484.2903747558594, + "learning_rate": 0.00044773576836617336, + "loss": 1142.4948, + "step": 5380 + }, + { + "ce_loss_12": 3.1936163306236267, + "ce_loss_17": 2.987152099609375, + "ce_loss_23": 2.90910884141922, + "ce_loss_3": 4.039672362804413, + "ce_loss_6": 3.6037211179733277, + "epoch": 0.539, + "grad_norm": 1216.0, + "kl_loss_12": 635.0668792724609, + "kl_loss_17": 157.40247192382813, + "kl_loss_3": 2411.213171386719, + "kl_loss_6": 1533.703955078125, + "learning_rate": 0.00044615806311398056, + "loss": 1193.1957, + "step": 5390 + }, + { + "ce_loss_12": 3.2351110339164735, + "ce_loss_17": 3.052011275291443, + "ce_loss_23": 2.978208839893341, + "ce_loss_3": 4.019779253005981, + "ce_loss_6": 3.603326416015625, + "epoch": 0.54, + "grad_norm": 1184.0, + "kl_loss_12": 591.3331329345704, + "kl_loss_17": 148.71628608703614, + "kl_loss_3": 2251.4184143066404, + "kl_loss_6": 1410.5655151367187, + "learning_rate": 0.00044458090004949454, + "loss": 1145.5472, + "step": 5400 + }, + { + "ce_loss_12": 3.138357937335968, + "ce_loss_17": 2.925539791584015, + "ce_loss_23": 2.8439600586891176, + "ce_loss_3": 4.031644797325134, + "ce_loss_6": 3.570351254940033, + "epoch": 0.541, + "grad_norm": 2040.0, + "kl_loss_12": 648.4575073242188, + "kl_loss_17": 163.3741195678711, + "kl_loss_3": 2506.7625, + "kl_loss_6": 1577.8297485351563, + "learning_rate": 0.0004430042950547297, + "loss": 1174.1352, + "step": 5410 + }, + { + "ce_loss_12": 3.2109811425209047, + "ce_loss_17": 3.008511447906494, + "ce_loss_23": 2.927990770339966, + "ce_loss_3": 4.0686198472976685, + "ce_loss_6": 3.6101776242256163, + "epoch": 0.542, + "grad_norm": 1384.0, + "kl_loss_12": 634.0676239013671, + "kl_loss_17": 160.0429702758789, + "kl_loss_3": 2424.7688720703127, + "kl_loss_6": 1507.9207885742187, + "learning_rate": 0.0004414282640060809, + "loss": 1165.1768, + "step": 5420 + }, + { + "ce_loss_12": 3.3067090272903443, + "ce_loss_17": 3.1117973566055297, + "ce_loss_23": 3.0120622158050536, + "ce_loss_3": 4.103489708900452, + "ce_loss_6": 3.6864388465881346, + "epoch": 0.543, + "grad_norm": 1368.0, + "kl_loss_12": 651.9807342529297, + "kl_loss_17": 198.02007904052735, + "kl_loss_3": 2302.978656005859, + "kl_loss_6": 1468.1529052734375, + "learning_rate": 0.0004398528227741633, + "loss": 1182.8952, + "step": 5430 + }, + { + "ce_loss_12": 3.175755870342255, + "ce_loss_17": 2.9795714020729065, + "ce_loss_23": 2.8910858273506164, + "ce_loss_3": 4.017914116382599, + "ce_loss_6": 3.573156309127808, + "epoch": 0.544, + "grad_norm": 1296.0, + "kl_loss_12": 635.0279296875, + "kl_loss_17": 180.33490676879882, + "kl_loss_3": 2359.365093994141, + "kl_loss_6": 1486.6942016601563, + "learning_rate": 0.00043827798722365264, + "loss": 1175.7504, + "step": 5440 + }, + { + "ce_loss_12": 3.2720242977142333, + "ce_loss_17": 3.0850940346717834, + "ce_loss_23": 3.009948170185089, + "ce_loss_3": 4.080536115169525, + "ce_loss_6": 3.645680582523346, + "epoch": 0.545, + "grad_norm": 1264.0, + "kl_loss_12": 615.475341796875, + "kl_loss_17": 168.24811401367188, + "kl_loss_3": 2308.9675903320312, + "kl_loss_6": 1450.685693359375, + "learning_rate": 0.00043670377321312535, + "loss": 1132.384, + "step": 5450 + }, + { + "ce_loss_12": 3.2696519255638123, + "ce_loss_17": 3.084871006011963, + "ce_loss_23": 3.0106928825378416, + "ce_loss_3": 4.078706085681915, + "ce_loss_6": 3.6569113969802856, + "epoch": 0.546, + "grad_norm": 1152.0, + "kl_loss_12": 605.2304107666016, + "kl_loss_17": 158.32353515625, + "kl_loss_3": 2287.594561767578, + "kl_loss_6": 1445.4706176757813, + "learning_rate": 0.0004351301965948991, + "loss": 1150.127, + "step": 5460 + }, + { + "ce_loss_12": 3.1863266587257386, + "ce_loss_17": 2.9979047179222107, + "ce_loss_23": 2.9250873923301697, + "ce_loss_3": 3.999111866950989, + "ce_loss_6": 3.559188687801361, + "epoch": 0.547, + "grad_norm": 1320.0, + "kl_loss_12": 601.9108062744141, + "kl_loss_17": 155.52551651000977, + "kl_loss_3": 2293.778112792969, + "kl_loss_6": 1427.9961975097656, + "learning_rate": 0.000433557273214873, + "loss": 1136.3562, + "step": 5470 + }, + { + "ce_loss_12": 3.1800923466682436, + "ce_loss_17": 2.9848230957984923, + "ce_loss_23": 2.9074708938598635, + "ce_loss_3": 4.013649213314056, + "ce_loss_6": 3.5686616778373716, + "epoch": 0.548, + "grad_norm": 1448.0, + "kl_loss_12": 608.5576141357421, + "kl_loss_17": 159.38774757385255, + "kl_loss_3": 2328.231311035156, + "kl_loss_6": 1453.5774169921874, + "learning_rate": 0.000431985018912368, + "loss": 1128.23, + "step": 5480 + }, + { + "ce_loss_12": 3.1537001848220827, + "ce_loss_17": 2.957874858379364, + "ce_loss_23": 2.882986044883728, + "ce_loss_3": 4.026316070556641, + "ce_loss_6": 3.571719026565552, + "epoch": 0.549, + "grad_norm": 1144.0, + "kl_loss_12": 622.0732177734375, + "kl_loss_17": 159.61335983276368, + "kl_loss_3": 2414.383020019531, + "kl_loss_6": 1514.8785278320313, + "learning_rate": 0.0004304134495199674, + "loss": 1144.274, + "step": 5490 + }, + { + "ce_loss_12": 3.1874389052391052, + "ce_loss_17": 2.987323296070099, + "ce_loss_23": 2.9131912350654603, + "ce_loss_3": 4.03360345363617, + "ce_loss_6": 3.5936038970947264, + "epoch": 0.55, + "grad_norm": 1336.0, + "kl_loss_12": 632.0410430908203, + "kl_loss_17": 158.54720077514648, + "kl_loss_3": 2405.1260620117187, + "kl_loss_6": 1525.2022094726562, + "learning_rate": 0.0004288425808633575, + "loss": 1158.6854, + "step": 5500 + }, + { + "ce_loss_12": 3.159614682197571, + "ce_loss_17": 2.9681888461112975, + "ce_loss_23": 2.8953501105308534, + "ce_loss_3": 4.004878151416778, + "ce_loss_6": 3.555284786224365, + "epoch": 0.551, + "grad_norm": 1480.0, + "kl_loss_12": 610.0143768310547, + "kl_loss_17": 155.91504287719727, + "kl_loss_3": 2373.51201171875, + "kl_loss_6": 1479.2927673339843, + "learning_rate": 0.0004272724287611684, + "loss": 1158.9906, + "step": 5510 + }, + { + "ce_loss_12": 3.1412819385528565, + "ce_loss_17": 2.9464999437332153, + "ce_loss_23": 2.8689870834350586, + "ce_loss_3": 4.012474822998047, + "ce_loss_6": 3.54014390707016, + "epoch": 0.552, + "grad_norm": 1080.0, + "kl_loss_12": 620.205453491211, + "kl_loss_17": 158.95667114257813, + "kl_loss_3": 2432.532849121094, + "kl_loss_6": 1504.3034973144531, + "learning_rate": 0.00042570300902481425, + "loss": 1162.9969, + "step": 5520 + }, + { + "ce_loss_12": 3.164325165748596, + "ce_loss_17": 2.974717545509338, + "ce_loss_23": 2.9020156264305115, + "ce_loss_3": 3.9918417096138, + "ce_loss_6": 3.5549869298934937, + "epoch": 0.553, + "grad_norm": 1160.0, + "kl_loss_12": 609.5499053955078, + "kl_loss_17": 153.8822929382324, + "kl_loss_3": 2336.972705078125, + "kl_loss_6": 1457.08447265625, + "learning_rate": 0.00042413433745833423, + "loss": 1141.2318, + "step": 5530 + }, + { + "ce_loss_12": 3.166851353645325, + "ce_loss_17": 2.9693361639976503, + "ce_loss_23": 2.896143317222595, + "ce_loss_3": 4.017174100875854, + "ce_loss_6": 3.564766192436218, + "epoch": 0.554, + "grad_norm": 1736.0, + "kl_loss_12": 618.8857513427735, + "kl_loss_17": 154.3293800354004, + "kl_loss_3": 2388.2908447265627, + "kl_loss_6": 1476.0225463867187, + "learning_rate": 0.0004225664298582339, + "loss": 1125.2438, + "step": 5540 + }, + { + "ce_loss_12": 3.241768956184387, + "ce_loss_17": 3.04884078502655, + "ce_loss_23": 2.973956620693207, + "ce_loss_3": 4.063172233104706, + "ce_loss_6": 3.624587297439575, + "epoch": 0.555, + "grad_norm": 1384.0, + "kl_loss_12": 603.5793518066406, + "kl_loss_17": 152.45914840698242, + "kl_loss_3": 2303.410919189453, + "kl_loss_6": 1444.3580261230468, + "learning_rate": 0.000420999302013325, + "loss": 1128.708, + "step": 5550 + }, + { + "ce_loss_12": 3.1585689306259157, + "ce_loss_17": 2.9541547536849975, + "ce_loss_23": 2.8743054032325746, + "ce_loss_3": 4.05275651216507, + "ce_loss_6": 3.570194947719574, + "epoch": 0.556, + "grad_norm": 1472.0, + "kl_loss_12": 636.6203643798829, + "kl_loss_17": 162.817537689209, + "kl_loss_3": 2476.5192138671873, + "kl_loss_6": 1521.946514892578, + "learning_rate": 0.000419432969704568, + "loss": 1157.0515, + "step": 5560 + }, + { + "ce_loss_12": 3.178593170642853, + "ce_loss_17": 2.98986976146698, + "ce_loss_23": 2.913978326320648, + "ce_loss_3": 4.005271148681641, + "ce_loss_6": 3.569079780578613, + "epoch": 0.557, + "grad_norm": 1136.0, + "kl_loss_12": 606.7872985839844, + "kl_loss_17": 154.8161392211914, + "kl_loss_3": 2327.8378295898438, + "kl_loss_6": 1447.2394592285157, + "learning_rate": 0.00041786744870491154, + "loss": 1168.4526, + "step": 5570 + }, + { + "ce_loss_12": 3.136245906352997, + "ce_loss_17": 2.9316209077835085, + "ce_loss_23": 2.853544735908508, + "ce_loss_3": 3.9692373871803284, + "ce_loss_6": 3.537553036212921, + "epoch": 0.558, + "grad_norm": 1272.0, + "kl_loss_12": 626.1948120117188, + "kl_loss_17": 158.22349243164064, + "kl_loss_3": 2367.86435546875, + "kl_loss_6": 1497.551934814453, + "learning_rate": 0.0004163027547791347, + "loss": 1150.2924, + "step": 5580 + }, + { + "ce_loss_12": 3.123509931564331, + "ce_loss_17": 2.924563002586365, + "ce_loss_23": 2.8449893236160277, + "ce_loss_3": 4.011238420009613, + "ce_loss_6": 3.5435333847999573, + "epoch": 0.559, + "grad_norm": 1696.0, + "kl_loss_12": 623.3840209960938, + "kl_loss_17": 158.15001754760743, + "kl_loss_3": 2449.357177734375, + "kl_loss_6": 1514.9872009277344, + "learning_rate": 0.0004147389036836881, + "loss": 1166.2543, + "step": 5590 + }, + { + "ce_loss_12": 3.1620707154273986, + "ce_loss_17": 2.960676372051239, + "ce_loss_23": 2.8864513635635376, + "ce_loss_3": 4.014166557788849, + "ce_loss_6": 3.5708006501197813, + "epoch": 0.56, + "grad_norm": 1456.0, + "kl_loss_12": 617.8463104248046, + "kl_loss_17": 157.17931900024413, + "kl_loss_3": 2374.1302795410156, + "kl_loss_6": 1500.4806701660157, + "learning_rate": 0.00041317591116653486, + "loss": 1178.2407, + "step": 5600 + }, + { + "ce_loss_12": 3.196033942699432, + "ce_loss_17": 2.9927977442741396, + "ce_loss_23": 2.9205702900886537, + "ce_loss_3": 4.053261566162109, + "ce_loss_6": 3.5983447074890136, + "epoch": 0.561, + "grad_norm": 1312.0, + "kl_loss_12": 630.828955078125, + "kl_loss_17": 159.6180679321289, + "kl_loss_3": 2410.134558105469, + "kl_loss_6": 1503.6664672851562, + "learning_rate": 0.0004116137929669921, + "loss": 1153.1006, + "step": 5610 + }, + { + "ce_loss_12": 3.1844432234764097, + "ce_loss_17": 2.9904911518096924, + "ce_loss_23": 2.916596531867981, + "ce_loss_3": 4.0213427901268, + "ce_loss_6": 3.581937515735626, + "epoch": 0.562, + "grad_norm": 1584.0, + "kl_loss_12": 610.8443328857422, + "kl_loss_17": 153.3087303161621, + "kl_loss_3": 2351.8536376953125, + "kl_loss_6": 1478.7730224609375, + "learning_rate": 0.00041005256481557305, + "loss": 1135.6671, + "step": 5620 + }, + { + "ce_loss_12": 3.2639884471893312, + "ce_loss_17": 3.0820303201675414, + "ce_loss_23": 3.0073699235916136, + "ce_loss_3": 4.060350477695465, + "ce_loss_6": 3.637803590297699, + "epoch": 0.563, + "grad_norm": 1520.0, + "kl_loss_12": 590.2365341186523, + "kl_loss_17": 149.79729652404785, + "kl_loss_3": 2250.507531738281, + "kl_loss_6": 1411.2151184082031, + "learning_rate": 0.00040849224243382767, + "loss": 1121.147, + "step": 5630 + }, + { + "ce_loss_12": 3.1370453596115113, + "ce_loss_17": 2.945245790481567, + "ce_loss_23": 2.8680389404296873, + "ce_loss_3": 3.991039717197418, + "ce_loss_6": 3.548636627197266, + "epoch": 0.564, + "grad_norm": 1304.0, + "kl_loss_12": 611.6623901367187, + "kl_loss_17": 154.0716766357422, + "kl_loss_3": 2369.054821777344, + "kl_loss_6": 1491.377264404297, + "learning_rate": 0.000406932841534185, + "loss": 1132.6218, + "step": 5640 + }, + { + "ce_loss_12": 3.112733340263367, + "ce_loss_17": 2.9161797881126406, + "ce_loss_23": 2.8394733667373657, + "ce_loss_3": 3.9644450902938844, + "ce_loss_6": 3.5196509838104246, + "epoch": 0.565, + "grad_norm": 1384.0, + "kl_loss_12": 620.3893859863281, + "kl_loss_17": 156.73301696777344, + "kl_loss_3": 2388.0477783203123, + "kl_loss_6": 1497.893115234375, + "learning_rate": 0.0004053743778197951, + "loss": 1182.3423, + "step": 5650 + }, + { + "ce_loss_12": 3.2095742225646973, + "ce_loss_17": 3.009103775024414, + "ce_loss_23": 2.934436392784119, + "ce_loss_3": 4.047181522846222, + "ce_loss_6": 3.6072173953056335, + "epoch": 0.566, + "grad_norm": 1360.0, + "kl_loss_12": 625.52333984375, + "kl_loss_17": 159.00185623168946, + "kl_loss_3": 2350.4621398925783, + "kl_loss_6": 1480.360858154297, + "learning_rate": 0.0004038168669843697, + "loss": 1164.518, + "step": 5660 + }, + { + "ce_loss_12": 3.154650020599365, + "ce_loss_17": 2.968583071231842, + "ce_loss_23": 2.893648374080658, + "ce_loss_3": 3.968627941608429, + "ce_loss_6": 3.542752408981323, + "epoch": 0.567, + "grad_norm": 1376.0, + "kl_loss_12": 601.3987396240234, + "kl_loss_17": 153.14722862243653, + "kl_loss_3": 2304.2887268066406, + "kl_loss_6": 1447.1326049804688, + "learning_rate": 0.000402260324712026, + "loss": 1153.7849, + "step": 5670 + }, + { + "ce_loss_12": 3.203722608089447, + "ce_loss_17": 3.0093868017196654, + "ce_loss_23": 2.9355448961257933, + "ce_loss_3": 4.062652409076691, + "ce_loss_6": 3.607566201686859, + "epoch": 0.568, + "grad_norm": 1472.0, + "kl_loss_12": 612.4659759521485, + "kl_loss_17": 152.28708038330078, + "kl_loss_3": 2393.439862060547, + "kl_loss_6": 1497.2868591308593, + "learning_rate": 0.00040070476667712743, + "loss": 1140.8777, + "step": 5680 + }, + { + "ce_loss_12": 3.226268935203552, + "ce_loss_17": 3.034188437461853, + "ce_loss_23": 2.9592732310295107, + "ce_loss_3": 4.057484591007233, + "ce_loss_6": 3.622624135017395, + "epoch": 0.569, + "grad_norm": 1664.0, + "kl_loss_12": 609.1282775878906, + "kl_loss_17": 154.65127334594726, + "kl_loss_3": 2351.16806640625, + "kl_loss_6": 1479.722100830078, + "learning_rate": 0.0003991502085441259, + "loss": 1153.9484, + "step": 5690 + }, + { + "ce_loss_12": 3.253240728378296, + "ce_loss_17": 3.0729852318763733, + "ce_loss_23": 3.0002264022827148, + "ce_loss_3": 4.052351331710815, + "ce_loss_6": 3.6278217792510987, + "epoch": 0.57, + "grad_norm": 1056.0, + "kl_loss_12": 594.1734252929688, + "kl_loss_17": 151.32156372070312, + "kl_loss_3": 2258.421423339844, + "kl_loss_6": 1413.1331481933594, + "learning_rate": 0.0003975966659674047, + "loss": 1134.815, + "step": 5700 + }, + { + "ce_loss_12": 3.2350077748298647, + "ce_loss_17": 3.048025059700012, + "ce_loss_23": 2.972712218761444, + "ce_loss_3": 4.07706515789032, + "ce_loss_6": 3.623251235485077, + "epoch": 0.571, + "grad_norm": 1456.0, + "kl_loss_12": 605.6468902587891, + "kl_loss_17": 154.75096168518067, + "kl_loss_3": 2340.8067932128906, + "kl_loss_6": 1448.8978088378906, + "learning_rate": 0.0003960441545911204, + "loss": 1130.0355, + "step": 5710 + }, + { + "ce_loss_12": 3.216277575492859, + "ce_loss_17": 3.027455914020538, + "ce_loss_23": 2.954277515411377, + "ce_loss_3": 4.042887318134308, + "ce_loss_6": 3.608815836906433, + "epoch": 0.572, + "grad_norm": 1264.0, + "kl_loss_12": 609.4589202880859, + "kl_loss_17": 153.23536071777343, + "kl_loss_3": 2345.3074157714846, + "kl_loss_6": 1468.869970703125, + "learning_rate": 0.0003944926900490452, + "loss": 1139.2996, + "step": 5720 + }, + { + "ce_loss_12": 3.1584270715713503, + "ce_loss_17": 2.95372451543808, + "ce_loss_23": 2.877091670036316, + "ce_loss_3": 4.0117509126663204, + "ce_loss_6": 3.5662245988845824, + "epoch": 0.573, + "grad_norm": 1104.0, + "kl_loss_12": 624.748178100586, + "kl_loss_17": 156.7990867614746, + "kl_loss_3": 2389.1001098632814, + "kl_loss_6": 1502.028076171875, + "learning_rate": 0.0003929422879644099, + "loss": 1145.1283, + "step": 5730 + }, + { + "ce_loss_12": 3.1456753730773928, + "ce_loss_17": 2.9637307286262513, + "ce_loss_23": 2.890525197982788, + "ce_loss_3": 3.970808744430542, + "ce_loss_6": 3.5373560428619384, + "epoch": 0.574, + "grad_norm": 1464.0, + "kl_loss_12": 594.8839279174805, + "kl_loss_17": 149.4309772491455, + "kl_loss_3": 2317.085040283203, + "kl_loss_6": 1446.6775451660155, + "learning_rate": 0.0003913929639497462, + "loss": 1108.5066, + "step": 5740 + }, + { + "ce_loss_12": 3.1146702647209166, + "ce_loss_17": 2.9165299773216247, + "ce_loss_23": 2.8424209713935853, + "ce_loss_3": 3.975343644618988, + "ce_loss_6": 3.523813855648041, + "epoch": 0.575, + "grad_norm": 1512.0, + "kl_loss_12": 605.5942993164062, + "kl_loss_17": 151.10165100097657, + "kl_loss_3": 2400.0088134765624, + "kl_loss_6": 1497.5197509765626, + "learning_rate": 0.00038984473360672965, + "loss": 1136.1696, + "step": 5750 + }, + { + "ce_loss_12": 3.116701638698578, + "ce_loss_17": 2.925139605998993, + "ce_loss_23": 2.8473670959472654, + "ce_loss_3": 3.9752248883247376, + "ce_loss_6": 3.5281368851661683, + "epoch": 0.576, + "grad_norm": 1064.0, + "kl_loss_12": 603.5871429443359, + "kl_loss_17": 150.55634155273438, + "kl_loss_3": 2375.617053222656, + "kl_loss_6": 1483.469708251953, + "learning_rate": 0.0003882976125260229, + "loss": 1127.8689, + "step": 5760 + }, + { + "ce_loss_12": 3.1820274710655214, + "ce_loss_17": 2.9905288577079774, + "ce_loss_23": 2.9172300934791564, + "ce_loss_3": 4.021161353588104, + "ce_loss_6": 3.5737295508384705, + "epoch": 0.577, + "grad_norm": 1392.0, + "kl_loss_12": 608.3142395019531, + "kl_loss_17": 153.50858612060546, + "kl_loss_3": 2349.4509094238283, + "kl_loss_6": 1464.999334716797, + "learning_rate": 0.00038675161628711776, + "loss": 1145.1527, + "step": 5770 + }, + { + "ce_loss_12": 3.212972104549408, + "ce_loss_17": 3.0255725383758545, + "ce_loss_23": 2.9506805896759034, + "ce_loss_3": 4.036686909198761, + "ce_loss_6": 3.5992730140686033, + "epoch": 0.578, + "grad_norm": 1336.0, + "kl_loss_12": 603.1133209228516, + "kl_loss_17": 152.6038848876953, + "kl_loss_3": 2314.387731933594, + "kl_loss_6": 1444.664453125, + "learning_rate": 0.0003852067604581794, + "loss": 1164.7646, + "step": 5780 + }, + { + "ce_loss_12": 3.1712546825408934, + "ce_loss_17": 2.9766078114509584, + "ce_loss_23": 2.901553583145142, + "ce_loss_3": 4.013803851604462, + "ce_loss_6": 3.5649865865707397, + "epoch": 0.579, + "grad_norm": 1632.0, + "kl_loss_12": 612.6786087036132, + "kl_loss_17": 151.10699539184571, + "kl_loss_3": 2381.509417724609, + "kl_loss_6": 1479.929248046875, + "learning_rate": 0.0003836630605958888, + "loss": 1138.6457, + "step": 5790 + }, + { + "ce_loss_12": 3.220193159580231, + "ce_loss_17": 3.034937250614166, + "ce_loss_23": 2.9608628392219543, + "ce_loss_3": 4.047442483901977, + "ce_loss_6": 3.6091946363449097, + "epoch": 0.58, + "grad_norm": 1480.0, + "kl_loss_12": 613.7919372558594, + "kl_loss_17": 153.43617324829103, + "kl_loss_3": 2354.0041259765626, + "kl_loss_6": 1478.952703857422, + "learning_rate": 0.0003821205322452863, + "loss": 1186.6608, + "step": 5800 + }, + { + "ce_loss_12": 3.1956213474273683, + "ce_loss_17": 3.013074266910553, + "ce_loss_23": 2.9406906962394714, + "ce_loss_3": 4.023329126834869, + "ce_loss_6": 3.5796982526779173, + "epoch": 0.581, + "grad_norm": 1184.0, + "kl_loss_12": 603.7600204467774, + "kl_loss_17": 150.3345703125, + "kl_loss_3": 2334.2036071777343, + "kl_loss_6": 1453.7977111816406, + "learning_rate": 0.0003805791909396155, + "loss": 1142.1789, + "step": 5810 + }, + { + "ce_loss_12": 3.152891659736633, + "ce_loss_17": 2.958096611499786, + "ce_loss_23": 2.8864975810050963, + "ce_loss_3": 3.990143322944641, + "ce_loss_6": 3.55165935754776, + "epoch": 0.582, + "grad_norm": 1440.0, + "kl_loss_12": 601.065625, + "kl_loss_17": 149.30062942504884, + "kl_loss_3": 2348.582684326172, + "kl_loss_6": 1460.9480102539062, + "learning_rate": 0.0003790390522001662, + "loss": 1147.1742, + "step": 5820 + }, + { + "ce_loss_12": 3.0979885935783384, + "ce_loss_17": 2.910538339614868, + "ce_loss_23": 2.839175593852997, + "ce_loss_3": 3.9419143199920654, + "ce_loss_6": 3.4933337450027464, + "epoch": 0.583, + "grad_norm": 1432.0, + "kl_loss_12": 599.3214111328125, + "kl_loss_17": 147.92266845703125, + "kl_loss_3": 2364.748254394531, + "kl_loss_6": 1473.270654296875, + "learning_rate": 0.0003775001315361183, + "loss": 1131.8652, + "step": 5830 + }, + { + "ce_loss_12": 3.195239317417145, + "ce_loss_17": 2.9988956689834594, + "ce_loss_23": 2.9231552362442015, + "ce_loss_3": 4.044886898994446, + "ce_loss_6": 3.590891981124878, + "epoch": 0.584, + "grad_norm": 1256.0, + "kl_loss_12": 604.733544921875, + "kl_loss_17": 153.50776290893555, + "kl_loss_3": 2353.9761474609377, + "kl_loss_6": 1461.015869140625, + "learning_rate": 0.0003759624444443858, + "loss": 1144.9896, + "step": 5840 + }, + { + "ce_loss_12": 3.2173043370246885, + "ce_loss_17": 3.0344321608543394, + "ce_loss_23": 2.961859107017517, + "ce_loss_3": 4.042249095439911, + "ce_loss_6": 3.6024664402008058, + "epoch": 0.585, + "grad_norm": 1120.0, + "kl_loss_12": 602.2097274780274, + "kl_loss_17": 150.3396198272705, + "kl_loss_3": 2335.1426208496096, + "kl_loss_6": 1460.1220397949219, + "learning_rate": 0.00037442600640946044, + "loss": 1127.3688, + "step": 5850 + }, + { + "ce_loss_12": 3.1868869066238403, + "ce_loss_17": 3.0009064435958863, + "ce_loss_23": 2.9291869044303893, + "ce_loss_3": 4.007125806808472, + "ce_loss_6": 3.574330222606659, + "epoch": 0.586, + "grad_norm": 1376.0, + "kl_loss_12": 598.3401458740234, + "kl_loss_17": 149.0207206726074, + "kl_loss_3": 2315.733856201172, + "kl_loss_6": 1440.1342041015625, + "learning_rate": 0.00037289083290325663, + "loss": 1108.8477, + "step": 5860 + }, + { + "ce_loss_12": 3.1647374510765074, + "ce_loss_17": 2.974149215221405, + "ce_loss_23": 2.9004238486289977, + "ce_loss_3": 3.9901832222938536, + "ce_loss_6": 3.549623227119446, + "epoch": 0.587, + "grad_norm": 1416.0, + "kl_loss_12": 587.2964050292969, + "kl_loss_17": 149.29082946777345, + "kl_loss_3": 2292.4321899414062, + "kl_loss_6": 1422.3937255859375, + "learning_rate": 0.0003713569393849543, + "loss": 1121.9459, + "step": 5870 + }, + { + "ce_loss_12": 3.2203643798828123, + "ce_loss_17": 3.0308145403862, + "ce_loss_23": 2.9575372338294983, + "ce_loss_3": 4.051881039142609, + "ce_loss_6": 3.606668543815613, + "epoch": 0.588, + "grad_norm": 1488.0, + "kl_loss_12": 608.5747924804688, + "kl_loss_17": 152.33790664672853, + "kl_loss_3": 2340.599249267578, + "kl_loss_6": 1460.192236328125, + "learning_rate": 0.00036982434130084397, + "loss": 1139.0371, + "step": 5880 + }, + { + "ce_loss_12": 3.1405606150627134, + "ce_loss_17": 2.9459444642066956, + "ce_loss_23": 2.871601629257202, + "ce_loss_3": 3.963181185722351, + "ce_loss_6": 3.526632297039032, + "epoch": 0.589, + "grad_norm": 1272.0, + "kl_loss_12": 612.6804351806641, + "kl_loss_17": 153.99071502685547, + "kl_loss_3": 2329.165905761719, + "kl_loss_6": 1457.7456909179687, + "learning_rate": 0.00036829305408417166, + "loss": 1146.7389, + "step": 5890 + }, + { + "ce_loss_12": 3.1303426861763, + "ce_loss_17": 2.9341423153877257, + "ce_loss_23": 2.857453668117523, + "ce_loss_3": 3.9833008646965027, + "ce_loss_6": 3.537202799320221, + "epoch": 0.59, + "grad_norm": 1216.0, + "kl_loss_12": 610.0974426269531, + "kl_loss_17": 153.63433265686035, + "kl_loss_3": 2385.1614379882812, + "kl_loss_6": 1491.5378967285155, + "learning_rate": 0.0003667630931549826, + "loss": 1144.5164, + "step": 5900 + }, + { + "ce_loss_12": 3.1039226174354555, + "ce_loss_17": 2.904790771007538, + "ce_loss_23": 2.83035945892334, + "ce_loss_3": 3.9925180077552795, + "ce_loss_6": 3.518354058265686, + "epoch": 0.591, + "grad_norm": 1448.0, + "kl_loss_12": 627.5811126708984, + "kl_loss_17": 153.9984390258789, + "kl_loss_3": 2474.5466064453126, + "kl_loss_6": 1526.5736022949218, + "learning_rate": 0.00036523447391996613, + "loss": 1169.5881, + "step": 5910 + }, + { + "ce_loss_12": 3.176124703884125, + "ce_loss_17": 2.9896682262420655, + "ce_loss_23": 2.9186110138893127, + "ce_loss_3": 4.004823994636536, + "ce_loss_6": 3.567076015472412, + "epoch": 0.592, + "grad_norm": 1168.0, + "kl_loss_12": 592.742041015625, + "kl_loss_17": 147.32718963623046, + "kl_loss_3": 2302.6177368164062, + "kl_loss_6": 1434.5437927246094, + "learning_rate": 0.00036370721177230114, + "loss": 1121.4209, + "step": 5920 + }, + { + "ce_loss_12": 3.184492063522339, + "ce_loss_17": 2.99191175699234, + "ce_loss_23": 2.9177701115608214, + "ce_loss_3": 4.032565438747406, + "ce_loss_6": 3.587347114086151, + "epoch": 0.593, + "grad_norm": 1240.0, + "kl_loss_12": 614.3560852050781, + "kl_loss_17": 155.10681838989257, + "kl_loss_3": 2378.0327758789062, + "kl_loss_6": 1490.3172973632813, + "learning_rate": 0.00036218132209150044, + "loss": 1146.4869, + "step": 5930 + }, + { + "ce_loss_12": 3.149495279788971, + "ce_loss_17": 2.944100499153137, + "ce_loss_23": 2.864597165584564, + "ce_loss_3": 4.026948845386505, + "ce_loss_6": 3.567388117313385, + "epoch": 0.594, + "grad_norm": 1224.0, + "kl_loss_12": 635.0625183105469, + "kl_loss_17": 160.64792404174804, + "kl_loss_3": 2469.66064453125, + "kl_loss_6": 1542.0163940429688, + "learning_rate": 0.0003606568202432562, + "loss": 1169.6428, + "step": 5940 + }, + { + "ce_loss_12": 3.2069575071334837, + "ce_loss_17": 3.019283354282379, + "ce_loss_23": 2.943927586078644, + "ce_loss_3": 4.075339150428772, + "ce_loss_6": 3.610486102104187, + "epoch": 0.595, + "grad_norm": 1384.0, + "kl_loss_12": 609.5270141601562, + "kl_loss_17": 153.40417938232423, + "kl_loss_3": 2420.6677978515627, + "kl_loss_6": 1490.734033203125, + "learning_rate": 0.0003591337215792851, + "loss": 1137.5319, + "step": 5950 + }, + { + "ce_loss_12": 3.2310115337371825, + "ce_loss_17": 3.051220166683197, + "ce_loss_23": 2.981062948703766, + "ce_loss_3": 4.030495893955231, + "ce_loss_6": 3.6143756747245788, + "epoch": 0.596, + "grad_norm": 1120.0, + "kl_loss_12": 591.7509918212891, + "kl_loss_17": 147.1378318786621, + "kl_loss_3": 2284.257049560547, + "kl_loss_6": 1437.5657470703125, + "learning_rate": 0.00035761204143717383, + "loss": 1130.8195, + "step": 5960 + }, + { + "ce_loss_12": 3.1949357390403748, + "ce_loss_17": 3.00533322095871, + "ce_loss_23": 2.9311126232147218, + "ce_loss_3": 4.0243856549263, + "ce_loss_6": 3.584021067619324, + "epoch": 0.597, + "grad_norm": 1416.0, + "kl_loss_12": 608.8888397216797, + "kl_loss_17": 154.51462936401367, + "kl_loss_3": 2345.8383850097657, + "kl_loss_6": 1462.9470703125, + "learning_rate": 0.0003560917951402245, + "loss": 1163.4166, + "step": 5970 + }, + { + "ce_loss_12": 3.176355230808258, + "ce_loss_17": 2.987331819534302, + "ce_loss_23": 2.914967715740204, + "ce_loss_3": 4.010832035541535, + "ce_loss_6": 3.5688653111457826, + "epoch": 0.598, + "grad_norm": 1384.0, + "kl_loss_12": 599.978385925293, + "kl_loss_17": 149.56870956420897, + "kl_loss_3": 2336.5320678710937, + "kl_loss_6": 1451.1222106933594, + "learning_rate": 0.00035457299799730046, + "loss": 1134.7748, + "step": 5980 + }, + { + "ce_loss_12": 3.234948694705963, + "ce_loss_17": 3.045933425426483, + "ce_loss_23": 2.974212908744812, + "ce_loss_3": 4.054404509067536, + "ce_loss_6": 3.619036543369293, + "epoch": 0.599, + "grad_norm": 1544.0, + "kl_loss_12": 602.2896270751953, + "kl_loss_17": 151.2698760986328, + "kl_loss_3": 2319.4597351074217, + "kl_loss_6": 1446.1132751464843, + "learning_rate": 0.0003530556653026721, + "loss": 1139.9934, + "step": 5990 + }, + { + "ce_loss_12": 3.157231020927429, + "ce_loss_17": 2.969841682910919, + "ce_loss_23": 2.8940274357795714, + "ce_loss_3": 4.010360980033875, + "ce_loss_6": 3.559044063091278, + "epoch": 0.6, + "grad_norm": 1504.0, + "kl_loss_12": 591.1567291259765, + "kl_loss_17": 148.09835624694824, + "kl_loss_3": 2353.2507446289064, + "kl_loss_6": 1458.3669311523438, + "learning_rate": 0.00035153981233586274, + "loss": 1146.9252, + "step": 6000 + }, + { + "ce_loss_12": 3.128659951686859, + "ce_loss_17": 2.934162414073944, + "ce_loss_23": 2.862848687171936, + "ce_loss_3": 3.975719666481018, + "ce_loss_6": 3.5243224501609802, + "epoch": 0.601, + "grad_norm": 1312.0, + "kl_loss_12": 599.4510681152344, + "kl_loss_17": 145.699942779541, + "kl_loss_3": 2355.63330078125, + "kl_loss_6": 1463.7941650390626, + "learning_rate": 0.00035002545436149473, + "loss": 1179.7224, + "step": 6010 + }, + { + "ce_loss_12": 3.1527183294296264, + "ce_loss_17": 2.9563739776611326, + "ce_loss_23": 2.8834879636764525, + "ce_loss_3": 4.005002570152283, + "ce_loss_6": 3.5590574741363525, + "epoch": 0.602, + "grad_norm": 1224.0, + "kl_loss_12": 619.0013916015625, + "kl_loss_17": 155.90919570922853, + "kl_loss_3": 2394.2220092773437, + "kl_loss_6": 1506.751025390625, + "learning_rate": 0.0003485126066291364, + "loss": 1136.7865, + "step": 6020 + }, + { + "ce_loss_12": 3.179857540130615, + "ce_loss_17": 2.984136152267456, + "ce_loss_23": 2.910652816295624, + "ce_loss_3": 4.023264217376709, + "ce_loss_6": 3.581345570087433, + "epoch": 0.603, + "grad_norm": 1248.0, + "kl_loss_12": 594.2115051269532, + "kl_loss_17": 148.24975967407227, + "kl_loss_3": 2348.5629150390623, + "kl_loss_6": 1469.4192687988282, + "learning_rate": 0.0003470012843731476, + "loss": 1144.6861, + "step": 6030 + }, + { + "ce_loss_12": 3.12680469751358, + "ce_loss_17": 2.9382981777191164, + "ce_loss_23": 2.8633129358291627, + "ce_loss_3": 3.98297358751297, + "ce_loss_6": 3.539493441581726, + "epoch": 0.604, + "grad_norm": 1176.0, + "kl_loss_12": 604.3844543457031, + "kl_loss_17": 150.13841400146484, + "kl_loss_3": 2366.5043212890623, + "kl_loss_6": 1483.6065490722656, + "learning_rate": 0.00034549150281252633, + "loss": 1171.1531, + "step": 6040 + }, + { + "ce_loss_12": 3.109435427188873, + "ce_loss_17": 2.926034116744995, + "ce_loss_23": 2.849857747554779, + "ce_loss_3": 3.928074502944946, + "ce_loss_6": 3.498618996143341, + "epoch": 0.605, + "grad_norm": 1608.0, + "kl_loss_12": 592.2354248046875, + "kl_loss_17": 151.66972579956055, + "kl_loss_3": 2292.69853515625, + "kl_loss_6": 1438.456610107422, + "learning_rate": 0.0003439832771507565, + "loss": 1122.3068, + "step": 6050 + }, + { + "ce_loss_12": 3.124900698661804, + "ce_loss_17": 2.931507778167725, + "ce_loss_23": 2.856498110294342, + "ce_loss_3": 3.9654696106910707, + "ce_loss_6": 3.519077789783478, + "epoch": 0.606, + "grad_norm": 1048.0, + "kl_loss_12": 605.8426788330078, + "kl_loss_17": 150.15861396789552, + "kl_loss_3": 2368.8140502929687, + "kl_loss_6": 1475.5860717773437, + "learning_rate": 0.0003424766225756537, + "loss": 1129.8809, + "step": 6060 + }, + { + "ce_loss_12": 3.1771543860435485, + "ce_loss_17": 2.985655927658081, + "ce_loss_23": 2.911463403701782, + "ce_loss_3": 4.007305264472961, + "ce_loss_6": 3.5662036895751954, + "epoch": 0.607, + "grad_norm": 1272.0, + "kl_loss_12": 608.8062805175781, + "kl_loss_17": 150.2465950012207, + "kl_loss_3": 2340.9812622070312, + "kl_loss_6": 1462.123797607422, + "learning_rate": 0.00034097155425921255, + "loss": 1121.8317, + "step": 6070 + }, + { + "ce_loss_12": 3.0874522805213926, + "ce_loss_17": 2.8925950050354006, + "ce_loss_23": 2.816449236869812, + "ce_loss_3": 3.9341826319694517, + "ce_loss_6": 3.486039936542511, + "epoch": 0.608, + "grad_norm": 1352.0, + "kl_loss_12": 612.0148345947266, + "kl_loss_17": 152.9915786743164, + "kl_loss_3": 2393.93974609375, + "kl_loss_6": 1485.4637573242187, + "learning_rate": 0.0003394680873574546, + "loss": 1142.242, + "step": 6080 + }, + { + "ce_loss_12": 3.1850800275802613, + "ce_loss_17": 2.989072859287262, + "ce_loss_23": 2.9126479387283326, + "ce_loss_3": 4.03468519449234, + "ce_loss_6": 3.579446589946747, + "epoch": 0.609, + "grad_norm": 1696.0, + "kl_loss_12": 611.8661071777344, + "kl_loss_17": 152.29093399047852, + "kl_loss_3": 2403.592901611328, + "kl_loss_6": 1492.8733154296874, + "learning_rate": 0.0003379662370102747, + "loss": 1137.6554, + "step": 6090 + }, + { + "ce_loss_12": 3.1844406366348266, + "ce_loss_17": 2.9984473824501037, + "ce_loss_23": 2.9271413803100588, + "ce_loss_3": 3.9991069436073303, + "ce_loss_6": 3.5708351850509645, + "epoch": 0.61, + "grad_norm": 1200.0, + "kl_loss_12": 603.7627899169922, + "kl_loss_17": 150.33085174560546, + "kl_loss_3": 2321.7859497070312, + "kl_loss_6": 1456.6421813964844, + "learning_rate": 0.0003364660183412892, + "loss": 1136.9385, + "step": 6100 + }, + { + "ce_loss_12": 3.171545147895813, + "ce_loss_17": 2.98282972574234, + "ce_loss_23": 2.9057871460914613, + "ce_loss_3": 3.994250988960266, + "ce_loss_6": 3.5580520749092104, + "epoch": 0.611, + "grad_norm": 1312.0, + "kl_loss_12": 610.9669769287109, + "kl_loss_17": 151.82451553344725, + "kl_loss_3": 2345.481341552734, + "kl_loss_6": 1464.9674865722657, + "learning_rate": 0.0003349674464576834, + "loss": 1152.2445, + "step": 6110 + }, + { + "ce_loss_12": 3.1259689927101135, + "ce_loss_17": 2.9322103381156923, + "ce_loss_23": 2.857883644104004, + "ce_loss_3": 3.975605821609497, + "ce_loss_6": 3.5234857201576233, + "epoch": 0.612, + "grad_norm": 1216.0, + "kl_loss_12": 604.9526611328125, + "kl_loss_17": 152.25790100097657, + "kl_loss_3": 2390.8796508789064, + "kl_loss_6": 1477.9359069824218, + "learning_rate": 0.00033347053645005966, + "loss": 1121.5988, + "step": 6120 + }, + { + "ce_loss_12": 3.209582006931305, + "ce_loss_17": 3.023663640022278, + "ce_loss_23": 2.9509991884231566, + "ce_loss_3": 4.024103116989136, + "ce_loss_6": 3.5973620772361756, + "epoch": 0.613, + "grad_norm": 1448.0, + "kl_loss_12": 590.1886627197266, + "kl_loss_17": 146.934236907959, + "kl_loss_3": 2285.4545166015623, + "kl_loss_6": 1437.0543029785156, + "learning_rate": 0.00033197530339228485, + "loss": 1125.8941, + "step": 6130 + }, + { + "ce_loss_12": 3.1829272985458372, + "ce_loss_17": 2.987203872203827, + "ce_loss_23": 2.9105037689208983, + "ce_loss_3": 4.009172821044922, + "ce_loss_6": 3.5784335494041444, + "epoch": 0.614, + "grad_norm": 1160.0, + "kl_loss_12": 608.9163391113282, + "kl_loss_17": 153.8809036254883, + "kl_loss_3": 2322.467742919922, + "kl_loss_6": 1457.5041198730469, + "learning_rate": 0.00033048176234133967, + "loss": 1128.7283, + "step": 6140 + }, + { + "ce_loss_12": 3.1708322763442993, + "ce_loss_17": 2.9837953567504885, + "ce_loss_23": 2.911977767944336, + "ce_loss_3": 3.9975093483924864, + "ce_loss_6": 3.5627474784851074, + "epoch": 0.615, + "grad_norm": 1336.0, + "kl_loss_12": 609.0241882324219, + "kl_loss_17": 151.88128700256348, + "kl_loss_3": 2350.5504638671873, + "kl_loss_6": 1470.2871520996093, + "learning_rate": 0.0003289899283371657, + "loss": 1146.8797, + "step": 6150 + }, + { + "ce_loss_12": 3.1794658422470095, + "ce_loss_17": 2.9889869689941406, + "ce_loss_23": 2.916124200820923, + "ce_loss_3": 4.025843060016632, + "ce_loss_6": 3.573673188686371, + "epoch": 0.616, + "grad_norm": 1424.0, + "kl_loss_12": 586.6110168457031, + "kl_loss_17": 148.034024810791, + "kl_loss_3": 2350.829797363281, + "kl_loss_6": 1451.2182983398438, + "learning_rate": 0.0003274998164025148, + "loss": 1149.0533, + "step": 6160 + }, + { + "ce_loss_12": 3.215155506134033, + "ce_loss_17": 3.0253263831138613, + "ce_loss_23": 2.9493968963623045, + "ce_loss_3": 4.043154263496399, + "ce_loss_6": 3.6066914439201354, + "epoch": 0.617, + "grad_norm": 1312.0, + "kl_loss_12": 607.980241394043, + "kl_loss_17": 152.23887901306153, + "kl_loss_3": 2323.497283935547, + "kl_loss_6": 1456.3749694824219, + "learning_rate": 0.0003260114415427975, + "loss": 1161.5605, + "step": 6170 + }, + { + "ce_loss_12": 3.140718376636505, + "ce_loss_17": 2.9506392121315, + "ce_loss_23": 2.8773612856864927, + "ce_loss_3": 4.008118355274201, + "ce_loss_6": 3.5459144592285154, + "epoch": 0.618, + "grad_norm": 1408.0, + "kl_loss_12": 598.2550018310546, + "kl_loss_17": 149.49109649658203, + "kl_loss_3": 2396.45498046875, + "kl_loss_6": 1475.0368041992188, + "learning_rate": 0.0003245248187459323, + "loss": 1160.7645, + "step": 6180 + }, + { + "ce_loss_12": 3.1301306247711183, + "ce_loss_17": 2.9471630215644837, + "ce_loss_23": 2.8765426874160767, + "ce_loss_3": 3.94159791469574, + "ce_loss_6": 3.507413148880005, + "epoch": 0.619, + "grad_norm": 1112.0, + "kl_loss_12": 581.0814636230468, + "kl_loss_17": 144.73041152954102, + "kl_loss_3": 2278.9553833007812, + "kl_loss_6": 1409.4451171875, + "learning_rate": 0.00032303996298219416, + "loss": 1107.7658, + "step": 6190 + }, + { + "ce_loss_12": 3.2049764752388, + "ce_loss_17": 3.026884377002716, + "ce_loss_23": 2.9530259490013124, + "ce_loss_3": 4.021306729316711, + "ce_loss_6": 3.588157629966736, + "epoch": 0.62, + "grad_norm": 1224.0, + "kl_loss_12": 579.8495864868164, + "kl_loss_17": 146.9240924835205, + "kl_loss_3": 2267.1603881835936, + "kl_loss_6": 1408.4067260742188, + "learning_rate": 0.00032155688920406414, + "loss": 1105.2465, + "step": 6200 + }, + { + "ce_loss_12": 3.121407675743103, + "ce_loss_17": 2.929682528972626, + "ce_loss_23": 2.855285167694092, + "ce_loss_3": 3.995857799053192, + "ce_loss_6": 3.5331621408462524, + "epoch": 0.621, + "grad_norm": 1120.0, + "kl_loss_12": 601.7544982910156, + "kl_loss_17": 150.88608016967774, + "kl_loss_3": 2400.5302612304686, + "kl_loss_6": 1472.9722839355468, + "learning_rate": 0.0003200756123460788, + "loss": 1165.9913, + "step": 6210 + }, + { + "ce_loss_12": 3.167928087711334, + "ce_loss_17": 2.972314703464508, + "ce_loss_23": 2.894737720489502, + "ce_loss_3": 4.021480929851532, + "ce_loss_6": 3.567240130901337, + "epoch": 0.622, + "grad_norm": 1440.0, + "kl_loss_12": 616.119351196289, + "kl_loss_17": 153.373934173584, + "kl_loss_3": 2405.947863769531, + "kl_loss_6": 1489.7071838378906, + "learning_rate": 0.00031859614732467957, + "loss": 1159.7865, + "step": 6220 + }, + { + "ce_loss_12": 3.2072643995285035, + "ce_loss_17": 3.0212273478507994, + "ce_loss_23": 2.9487449049949648, + "ce_loss_3": 4.019171059131622, + "ce_loss_6": 3.587374818325043, + "epoch": 0.623, + "grad_norm": 1056.0, + "kl_loss_12": 587.6943115234375, + "kl_loss_17": 146.84216384887696, + "kl_loss_3": 2285.8351196289063, + "kl_loss_6": 1420.5199340820313, + "learning_rate": 0.00031711850903806275, + "loss": 1112.425, + "step": 6230 + }, + { + "ce_loss_12": 3.126500689983368, + "ce_loss_17": 2.9322530746459963, + "ce_loss_23": 2.8578232526779175, + "ce_loss_3": 3.983571135997772, + "ce_loss_6": 3.5279215216636657, + "epoch": 0.624, + "grad_norm": 1176.0, + "kl_loss_12": 613.6227737426758, + "kl_loss_17": 153.28499908447264, + "kl_loss_3": 2400.1694946289062, + "kl_loss_6": 1488.192919921875, + "learning_rate": 0.0003156427123660297, + "loss": 1135.8245, + "step": 6240 + }, + { + "ce_loss_12": 3.196655297279358, + "ce_loss_17": 3.0089795351028443, + "ce_loss_23": 2.9388261318206785, + "ce_loss_3": 4.000034284591675, + "ce_loss_6": 3.582012724876404, + "epoch": 0.625, + "grad_norm": 1560.0, + "kl_loss_12": 596.1258331298828, + "kl_loss_17": 148.52873306274415, + "kl_loss_3": 2285.386181640625, + "kl_loss_6": 1436.193377685547, + "learning_rate": 0.0003141687721698363, + "loss": 1134.0557, + "step": 6250 + }, + { + "ce_loss_12": 3.1654615879058836, + "ce_loss_17": 2.9897891521453857, + "ce_loss_23": 2.9180311799049377, + "ce_loss_3": 3.9630396962165833, + "ce_loss_6": 3.5340129017829893, + "epoch": 0.626, + "grad_norm": 1064.0, + "kl_loss_12": 565.9640930175781, + "kl_loss_17": 141.93714752197266, + "kl_loss_3": 2219.7123840332033, + "kl_loss_6": 1378.172296142578, + "learning_rate": 0.00031269670329204396, + "loss": 1106.1096, + "step": 6260 + }, + { + "ce_loss_12": 3.216164839267731, + "ce_loss_17": 3.030501461029053, + "ce_loss_23": 2.955772066116333, + "ce_loss_3": 4.008867633342743, + "ce_loss_6": 3.5905953645706177, + "epoch": 0.627, + "grad_norm": 1336.0, + "kl_loss_12": 592.2648681640625, + "kl_loss_17": 148.63665695190429, + "kl_loss_3": 2268.0641845703126, + "kl_loss_6": 1425.7196228027344, + "learning_rate": 0.00031122652055637015, + "loss": 1125.5658, + "step": 6270 + }, + { + "ce_loss_12": 3.1709736585617065, + "ce_loss_17": 2.9849471926689146, + "ce_loss_23": 2.9148017048835753, + "ce_loss_3": 4.019098281860352, + "ce_loss_6": 3.567766749858856, + "epoch": 0.628, + "grad_norm": 1080.0, + "kl_loss_12": 601.929150390625, + "kl_loss_17": 150.09540405273438, + "kl_loss_3": 2373.2475158691404, + "kl_loss_6": 1467.5196044921875, + "learning_rate": 0.0003097582387675385, + "loss": 1122.9711, + "step": 6280 + }, + { + "ce_loss_12": 3.213307094573975, + "ce_loss_17": 3.0226985931396486, + "ce_loss_23": 2.9502960085868835, + "ce_loss_3": 4.035060441493988, + "ce_loss_6": 3.5958678007125853, + "epoch": 0.629, + "grad_norm": 1152.0, + "kl_loss_12": 598.458171081543, + "kl_loss_17": 148.96140213012694, + "kl_loss_3": 2318.7240600585938, + "kl_loss_6": 1443.7019104003907, + "learning_rate": 0.00030829187271113034, + "loss": 1122.701, + "step": 6290 + }, + { + "ce_loss_12": 3.184871768951416, + "ce_loss_17": 3.0090969800949097, + "ce_loss_23": 2.9364115834236144, + "ce_loss_3": 4.002640330791474, + "ce_loss_6": 3.570776116847992, + "epoch": 0.63, + "grad_norm": 1128.0, + "kl_loss_12": 578.4871795654296, + "kl_loss_17": 145.35349922180177, + "kl_loss_3": 2268.692517089844, + "kl_loss_6": 1417.2647277832032, + "learning_rate": 0.00030682743715343565, + "loss": 1127.8535, + "step": 6300 + }, + { + "ce_loss_12": 3.157248556613922, + "ce_loss_17": 2.9611279249191282, + "ce_loss_23": 2.88759206533432, + "ce_loss_3": 3.994263780117035, + "ce_loss_6": 3.5590861797332765, + "epoch": 0.631, + "grad_norm": 1328.0, + "kl_loss_12": 603.9541046142579, + "kl_loss_17": 152.56044921875, + "kl_loss_3": 2347.0472778320313, + "kl_loss_6": 1476.6657836914062, + "learning_rate": 0.0003053649468413043, + "loss": 1155.5088, + "step": 6310 + }, + { + "ce_loss_12": 3.2615909457206724, + "ce_loss_17": 3.0738922357559204, + "ce_loss_23": 2.9989441514015196, + "ce_loss_3": 4.070518684387207, + "ce_loss_6": 3.650091600418091, + "epoch": 0.632, + "grad_norm": 1824.0, + "kl_loss_12": 598.5535140991211, + "kl_loss_17": 150.5441162109375, + "kl_loss_3": 2293.541394042969, + "kl_loss_6": 1446.69013671875, + "learning_rate": 0.00030390441650199725, + "loss": 1119.5223, + "step": 6320 + }, + { + "ce_loss_12": 3.165328633785248, + "ce_loss_17": 2.9779279112815855, + "ce_loss_23": 2.9034200429916384, + "ce_loss_3": 3.984565556049347, + "ce_loss_6": 3.5544453859329224, + "epoch": 0.633, + "grad_norm": 1160.0, + "kl_loss_12": 591.5257629394531, + "kl_loss_17": 147.43704071044922, + "kl_loss_3": 2294.754736328125, + "kl_loss_6": 1435.9236389160155, + "learning_rate": 0.00030244586084303903, + "loss": 1112.9562, + "step": 6330 + }, + { + "ce_loss_12": 3.146142864227295, + "ce_loss_17": 2.9525272369384767, + "ce_loss_23": 2.879060161113739, + "ce_loss_3": 3.98732351064682, + "ce_loss_6": 3.556743288040161, + "epoch": 0.634, + "grad_norm": 964.0, + "kl_loss_12": 612.9192779541015, + "kl_loss_17": 151.7993896484375, + "kl_loss_3": 2369.3794189453124, + "kl_loss_6": 1497.5594482421875, + "learning_rate": 0.00030098929455206903, + "loss": 1128.7233, + "step": 6340 + }, + { + "ce_loss_12": 3.1373493790626528, + "ce_loss_17": 2.951869750022888, + "ce_loss_23": 2.87964323759079, + "ce_loss_3": 3.972902524471283, + "ce_loss_6": 3.526309645175934, + "epoch": 0.635, + "grad_norm": 1208.0, + "kl_loss_12": 590.8793487548828, + "kl_loss_17": 145.87656936645507, + "kl_loss_3": 2343.8325744628905, + "kl_loss_6": 1446.49482421875, + "learning_rate": 0.00029953473229669324, + "loss": 1165.4348, + "step": 6350 + }, + { + "ce_loss_12": 3.1724481105804445, + "ce_loss_17": 2.976711869239807, + "ce_loss_23": 2.906025779247284, + "ce_loss_3": 4.003190112113953, + "ce_loss_6": 3.571776270866394, + "epoch": 0.636, + "grad_norm": 1048.0, + "kl_loss_12": 608.5196533203125, + "kl_loss_17": 147.74198837280272, + "kl_loss_3": 2350.3733459472655, + "kl_loss_6": 1479.3622009277344, + "learning_rate": 0.00029808218872433767, + "loss": 1128.2008, + "step": 6360 + }, + { + "ce_loss_12": 3.2232914566993713, + "ce_loss_17": 3.0327231764793394, + "ce_loss_23": 2.9622638821601868, + "ce_loss_3": 4.034514009952545, + "ce_loss_6": 3.605409836769104, + "epoch": 0.637, + "grad_norm": 1096.0, + "kl_loss_12": 592.4508743286133, + "kl_loss_17": 145.98952713012696, + "kl_loss_3": 2294.2256591796877, + "kl_loss_6": 1436.6225830078124, + "learning_rate": 0.0002966316784621, + "loss": 1110.1354, + "step": 6370 + }, + { + "ce_loss_12": 3.15024710893631, + "ce_loss_17": 2.9527449727058412, + "ce_loss_23": 2.8774781823158264, + "ce_loss_3": 3.998103404045105, + "ce_loss_6": 3.549445116519928, + "epoch": 0.638, + "grad_norm": 1104.0, + "kl_loss_12": 609.9254791259766, + "kl_loss_17": 151.64585189819337, + "kl_loss_3": 2374.343786621094, + "kl_loss_6": 1481.7275695800781, + "learning_rate": 0.0002951832161166024, + "loss": 1124.3397, + "step": 6380 + }, + { + "ce_loss_12": 3.213831353187561, + "ce_loss_17": 3.022569644451141, + "ce_loss_23": 2.9454912304878236, + "ce_loss_3": 4.0392385721206665, + "ce_loss_6": 3.605069971084595, + "epoch": 0.639, + "grad_norm": 1072.0, + "kl_loss_12": 605.1690795898437, + "kl_loss_17": 152.21279106140136, + "kl_loss_3": 2328.7935546875, + "kl_loss_6": 1465.1787719726562, + "learning_rate": 0.0002937368162738445, + "loss": 1114.6975, + "step": 6390 + }, + { + "ce_loss_12": 3.1600446581840513, + "ce_loss_17": 2.979280388355255, + "ce_loss_23": 2.9099658846855165, + "ce_loss_3": 3.9755544304847716, + "ce_loss_6": 3.5501636147499083, + "epoch": 0.64, + "grad_norm": 1216.0, + "kl_loss_12": 580.9705139160156, + "kl_loss_17": 143.8193145751953, + "kl_loss_3": 2297.1350341796874, + "kl_loss_6": 1437.3939392089844, + "learning_rate": 0.0002922924934990568, + "loss": 1134.1188, + "step": 6400 + }, + { + "ce_loss_12": 3.108557164669037, + "ce_loss_17": 2.916198897361755, + "ce_loss_23": 2.842394769191742, + "ce_loss_3": 3.9649079561233522, + "ce_loss_6": 3.516673135757446, + "epoch": 0.641, + "grad_norm": 1200.0, + "kl_loss_12": 600.0068923950196, + "kl_loss_17": 148.4601577758789, + "kl_loss_3": 2399.68701171875, + "kl_loss_6": 1498.5277709960938, + "learning_rate": 0.0002908502623365536, + "loss": 1143.1725, + "step": 6410 + }, + { + "ce_loss_12": 3.044642722606659, + "ce_loss_17": 2.8523294806480406, + "ce_loss_23": 2.7781936526298523, + "ce_loss_3": 3.919729804992676, + "ce_loss_6": 3.457630491256714, + "epoch": 0.642, + "grad_norm": 3200.0, + "kl_loss_12": 598.8210571289062, + "kl_loss_17": 147.02215728759765, + "kl_loss_3": 2417.6473999023438, + "kl_loss_6": 1497.975439453125, + "learning_rate": 0.0002894101373095867, + "loss": 1146.0413, + "step": 6420 + }, + { + "ce_loss_12": 3.2444644451141356, + "ce_loss_17": 3.0542767524719237, + "ce_loss_23": 2.9808418273925783, + "ce_loss_3": 4.047118508815766, + "ce_loss_6": 3.6216217398643495, + "epoch": 0.643, + "grad_norm": 1416.0, + "kl_loss_12": 610.3435943603515, + "kl_loss_17": 150.961669921875, + "kl_loss_3": 2293.247906494141, + "kl_loss_6": 1440.0751953125, + "learning_rate": 0.00028797213292019926, + "loss": 1124.4115, + "step": 6430 + }, + { + "ce_loss_12": 3.223282754421234, + "ce_loss_17": 3.033059549331665, + "ce_loss_23": 2.957798421382904, + "ce_loss_3": 4.038387930393219, + "ce_loss_6": 3.608675718307495, + "epoch": 0.644, + "grad_norm": 948.0, + "kl_loss_12": 602.3243682861328, + "kl_loss_17": 150.3400001525879, + "kl_loss_3": 2312.091912841797, + "kl_loss_6": 1452.2079223632813, + "learning_rate": 0.0002865362636490791, + "loss": 1151.496, + "step": 6440 + }, + { + "ce_loss_12": 3.2280746579170225, + "ce_loss_17": 3.047099208831787, + "ce_loss_23": 2.976475703716278, + "ce_loss_3": 4.046898591518402, + "ce_loss_6": 3.6178045153617857, + "epoch": 0.645, + "grad_norm": 1344.0, + "kl_loss_12": 590.5327224731445, + "kl_loss_17": 147.85210762023925, + "kl_loss_3": 2296.6646057128905, + "kl_loss_6": 1439.5161071777343, + "learning_rate": 0.0002851025439554142, + "loss": 1116.2505, + "step": 6450 + }, + { + "ce_loss_12": 3.213414740562439, + "ce_loss_17": 3.0239868998527526, + "ce_loss_23": 2.9495511770248415, + "ce_loss_3": 4.0160266041755674, + "ce_loss_6": 3.602611756324768, + "epoch": 0.646, + "grad_norm": 1248.0, + "kl_loss_12": 590.5889404296875, + "kl_loss_17": 147.70597915649415, + "kl_loss_3": 2252.881640625, + "kl_loss_6": 1425.1687744140625, + "learning_rate": 0.00028367098827674573, + "loss": 1108.9879, + "step": 6460 + }, + { + "ce_loss_12": 3.150103771686554, + "ce_loss_17": 2.9648916363716125, + "ce_loss_23": 2.893376624584198, + "ce_loss_3": 3.979960310459137, + "ce_loss_6": 3.532588255405426, + "epoch": 0.647, + "grad_norm": 1312.0, + "kl_loss_12": 588.588998413086, + "kl_loss_17": 146.35768966674806, + "kl_loss_3": 2310.770294189453, + "kl_loss_6": 1421.0911865234375, + "learning_rate": 0.00028224161102882397, + "loss": 1128.1974, + "step": 6470 + }, + { + "ce_loss_12": 3.124748420715332, + "ce_loss_17": 2.94069367647171, + "ce_loss_23": 2.873618412017822, + "ce_loss_3": 3.9308391571044923, + "ce_loss_6": 3.506719410419464, + "epoch": 0.648, + "grad_norm": 1432.0, + "kl_loss_12": 587.4733581542969, + "kl_loss_17": 142.98950424194337, + "kl_loss_3": 2271.9211547851564, + "kl_loss_6": 1421.7393371582032, + "learning_rate": 0.00028081442660546124, + "loss": 1119.6853, + "step": 6480 + }, + { + "ce_loss_12": 3.1836213946342466, + "ce_loss_17": 2.9990946412086488, + "ce_loss_23": 2.9259018540382384, + "ce_loss_3": 3.999267888069153, + "ce_loss_6": 3.5609941363334654, + "epoch": 0.649, + "grad_norm": 1752.0, + "kl_loss_12": 593.3670318603515, + "kl_loss_17": 150.30060958862305, + "kl_loss_3": 2291.286004638672, + "kl_loss_6": 1419.9478332519532, + "learning_rate": 0.0002793894493783892, + "loss": 1122.9418, + "step": 6490 + }, + { + "ce_loss_12": 3.200083112716675, + "ce_loss_17": 3.0166388154029846, + "ce_loss_23": 2.946355402469635, + "ce_loss_3": 4.015068662166596, + "ce_loss_6": 3.58471964597702, + "epoch": 0.65, + "grad_norm": 1240.0, + "kl_loss_12": 577.6244537353516, + "kl_loss_17": 143.19990081787108, + "kl_loss_3": 2275.6525634765626, + "kl_loss_6": 1418.7616638183595, + "learning_rate": 0.0002779666936971129, + "loss": 1109.6523, + "step": 6500 + }, + { + "ce_loss_12": 3.22410135269165, + "ce_loss_17": 3.034737467765808, + "ce_loss_23": 2.9630566596984864, + "ce_loss_3": 4.054137790203095, + "ce_loss_6": 3.6148640871047975, + "epoch": 0.651, + "grad_norm": 1088.0, + "kl_loss_12": 598.0519134521485, + "kl_loss_17": 147.48272628784179, + "kl_loss_3": 2338.8424743652345, + "kl_loss_6": 1454.3167419433594, + "learning_rate": 0.00027654617388876614, + "loss": 1136.659, + "step": 6510 + }, + { + "ce_loss_12": 3.232533669471741, + "ce_loss_17": 3.0474787831306456, + "ce_loss_23": 2.9730993270874024, + "ce_loss_3": 4.049802815914154, + "ce_loss_6": 3.6165407419204714, + "epoch": 0.652, + "grad_norm": 1360.0, + "kl_loss_12": 592.7881439208984, + "kl_loss_17": 149.90747146606446, + "kl_loss_3": 2309.7993225097657, + "kl_loss_6": 1438.9326171875, + "learning_rate": 0.0002751279042579672, + "loss": 1127.0908, + "step": 6520 + }, + { + "ce_loss_12": 3.176752769947052, + "ce_loss_17": 2.9937629222869875, + "ce_loss_23": 2.924986481666565, + "ce_loss_3": 3.9907779455184937, + "ce_loss_6": 3.5558727622032165, + "epoch": 0.653, + "grad_norm": 1352.0, + "kl_loss_12": 581.5867752075195, + "kl_loss_17": 143.07452926635742, + "kl_loss_3": 2284.7026916503905, + "kl_loss_6": 1414.8037902832032, + "learning_rate": 0.00027371189908667604, + "loss": 1130.4285, + "step": 6530 + }, + { + "ce_loss_12": 3.2427014231681826, + "ce_loss_17": 3.0470166921615602, + "ce_loss_23": 2.971630573272705, + "ce_loss_3": 4.092951714992523, + "ce_loss_6": 3.6393123149871824, + "epoch": 0.654, + "grad_norm": 1456.0, + "kl_loss_12": 604.3457977294922, + "kl_loss_17": 153.89494667053222, + "kl_loss_3": 2381.4398864746095, + "kl_loss_6": 1469.5702087402344, + "learning_rate": 0.00027229817263404863, + "loss": 1154.1786, + "step": 6540 + }, + { + "ce_loss_12": 3.20701208114624, + "ce_loss_17": 3.0258082151412964, + "ce_loss_23": 2.9565651059150695, + "ce_loss_3": 3.9878334045410155, + "ce_loss_6": 3.5799823880195616, + "epoch": 0.655, + "grad_norm": 1232.0, + "kl_loss_12": 575.495622253418, + "kl_loss_17": 144.19720306396485, + "kl_loss_3": 2215.0460876464845, + "kl_loss_6": 1402.7090881347656, + "learning_rate": 0.0002708867391362948, + "loss": 1106.6969, + "step": 6550 + }, + { + "ce_loss_12": 3.185906779766083, + "ce_loss_17": 3.0091296672821044, + "ce_loss_23": 2.9405287146568297, + "ce_loss_3": 3.981113874912262, + "ce_loss_6": 3.5535893082618712, + "epoch": 0.656, + "grad_norm": 1104.0, + "kl_loss_12": 562.3880432128906, + "kl_loss_17": 142.22967758178712, + "kl_loss_3": 2219.839807128906, + "kl_loss_6": 1364.049249267578, + "learning_rate": 0.0002694776128065345, + "loss": 1104.3037, + "step": 6560 + }, + { + "ce_loss_12": 3.138961064815521, + "ce_loss_17": 2.941958963871002, + "ce_loss_23": 2.8727088570594788, + "ce_loss_3": 3.9581244349479676, + "ce_loss_6": 3.5262621879577636, + "epoch": 0.657, + "grad_norm": 1368.0, + "kl_loss_12": 599.2472259521485, + "kl_loss_17": 148.40215988159179, + "kl_loss_3": 2321.914453125, + "kl_loss_6": 1458.2103759765625, + "learning_rate": 0.00026807080783465374, + "loss": 1110.8821, + "step": 6570 + }, + { + "ce_loss_12": 3.2356321930885317, + "ce_loss_17": 3.0464752554893493, + "ce_loss_23": 2.9742892026901244, + "ce_loss_3": 4.063664793968201, + "ce_loss_6": 3.637293744087219, + "epoch": 0.658, + "grad_norm": 1000.0, + "kl_loss_12": 602.9461944580078, + "kl_loss_17": 149.45880355834962, + "kl_loss_3": 2330.4873901367187, + "kl_loss_6": 1463.7465087890625, + "learning_rate": 0.00026666633838716316, + "loss": 1140.2938, + "step": 6580 + }, + { + "ce_loss_12": 3.1436243057250977, + "ce_loss_17": 2.9555483937263487, + "ce_loss_23": 2.881186819076538, + "ce_loss_3": 3.9724827885627745, + "ce_loss_6": 3.537186050415039, + "epoch": 0.659, + "grad_norm": 1392.0, + "kl_loss_12": 605.7148712158203, + "kl_loss_17": 152.03119735717775, + "kl_loss_3": 2340.710888671875, + "kl_loss_6": 1465.8653442382813, + "learning_rate": 0.00026526421860705474, + "loss": 1144.2678, + "step": 6590 + }, + { + "ce_loss_12": 3.1641727924346923, + "ce_loss_17": 2.973479175567627, + "ce_loss_23": 2.8982677459716797, + "ce_loss_3": 4.001079571247101, + "ce_loss_6": 3.568039798736572, + "epoch": 0.66, + "grad_norm": 1296.0, + "kl_loss_12": 603.3994934082032, + "kl_loss_17": 151.27597427368164, + "kl_loss_3": 2330.6815490722656, + "kl_loss_6": 1461.9353454589843, + "learning_rate": 0.0002638644626136587, + "loss": 1121.041, + "step": 6600 + }, + { + "ce_loss_12": 3.1769190669059753, + "ce_loss_17": 2.9900643467903136, + "ce_loss_23": 2.9231056213378905, + "ce_loss_3": 3.994716131687164, + "ce_loss_6": 3.56888769865036, + "epoch": 0.661, + "grad_norm": 1376.0, + "kl_loss_12": 591.1800964355468, + "kl_loss_17": 145.49627571105958, + "kl_loss_3": 2285.549725341797, + "kl_loss_6": 1434.5020751953125, + "learning_rate": 0.00026246708450250255, + "loss": 1120.9137, + "step": 6610 + }, + { + "ce_loss_12": 3.15739620923996, + "ce_loss_17": 2.972544002532959, + "ce_loss_23": 2.9045681595802306, + "ce_loss_3": 3.9569130659103395, + "ce_loss_6": 3.539802873134613, + "epoch": 0.662, + "grad_norm": 1440.0, + "kl_loss_12": 582.6712707519531, + "kl_loss_17": 145.07760772705078, + "kl_loss_3": 2261.2706298828125, + "kl_loss_6": 1423.3890197753906, + "learning_rate": 0.00026107209834516854, + "loss": 1110.5779, + "step": 6620 + }, + { + "ce_loss_12": 3.1285615086555483, + "ce_loss_17": 2.9375810623168945, + "ce_loss_23": 2.8670949935913086, + "ce_loss_3": 3.987957274913788, + "ce_loss_6": 3.540684628486633, + "epoch": 0.663, + "grad_norm": 1184.0, + "kl_loss_12": 597.255239868164, + "kl_loss_17": 148.15191955566405, + "kl_loss_3": 2383.203448486328, + "kl_loss_6": 1482.990771484375, + "learning_rate": 0.0002596795181891514, + "loss": 1148.891, + "step": 6630 + }, + { + "ce_loss_12": 3.1358890414237974, + "ce_loss_17": 2.9432680010795593, + "ce_loss_23": 2.868211197853088, + "ce_loss_3": 3.9691487431526182, + "ce_loss_6": 3.5316598892211912, + "epoch": 0.664, + "grad_norm": 1616.0, + "kl_loss_12": 606.9677642822265, + "kl_loss_17": 152.69766693115236, + "kl_loss_3": 2344.5061767578127, + "kl_loss_6": 1471.2357299804687, + "learning_rate": 0.000258289358057718, + "loss": 1173.8878, + "step": 6640 + }, + { + "ce_loss_12": 3.2025445699691772, + "ce_loss_17": 3.0105114579200745, + "ce_loss_23": 2.9322133660316467, + "ce_loss_3": 4.045162808895111, + "ce_loss_6": 3.597779726982117, + "epoch": 0.665, + "grad_norm": 1248.0, + "kl_loss_12": 607.0796997070313, + "kl_loss_17": 154.87155456542968, + "kl_loss_3": 2362.5149047851564, + "kl_loss_6": 1467.0385498046876, + "learning_rate": 0.0002569016319497657, + "loss": 1145.5117, + "step": 6650 + }, + { + "ce_loss_12": 3.192644727230072, + "ce_loss_17": 2.9978889346122743, + "ce_loss_23": 2.9197690725326537, + "ce_loss_3": 4.026680159568786, + "ce_loss_6": 3.5872318506240846, + "epoch": 0.666, + "grad_norm": 1312.0, + "kl_loss_12": 611.0297134399414, + "kl_loss_17": 154.25914001464844, + "kl_loss_3": 2363.039794921875, + "kl_loss_6": 1478.5891235351562, + "learning_rate": 0.00025551635383968066, + "loss": 1158.082, + "step": 6660 + }, + { + "ce_loss_12": 3.1067636609077454, + "ce_loss_17": 2.9205042839050295, + "ce_loss_23": 2.8470686197280886, + "ce_loss_3": 3.942138361930847, + "ce_loss_6": 3.496360182762146, + "epoch": 0.667, + "grad_norm": 1456.0, + "kl_loss_12": 601.7290283203125, + "kl_loss_17": 149.90078125, + "kl_loss_3": 2356.6501403808593, + "kl_loss_6": 1464.00458984375, + "learning_rate": 0.00025413353767719804, + "loss": 1142.4888, + "step": 6670 + }, + { + "ce_loss_12": 3.158530426025391, + "ce_loss_17": 2.9713932633399964, + "ce_loss_23": 2.9024715542793276, + "ce_loss_3": 3.9879515528678895, + "ce_loss_6": 3.552604818344116, + "epoch": 0.668, + "grad_norm": 1632.0, + "kl_loss_12": 593.8800552368164, + "kl_loss_17": 145.14826431274415, + "kl_loss_3": 2332.1615966796876, + "kl_loss_6": 1458.469189453125, + "learning_rate": 0.0002527531973872617, + "loss": 1132.6044, + "step": 6680 + }, + { + "ce_loss_12": 3.171540653705597, + "ce_loss_17": 2.9899519324302672, + "ce_loss_23": 2.917602562904358, + "ce_loss_3": 3.9797378540039063, + "ce_loss_6": 3.5506080865859984, + "epoch": 0.669, + "grad_norm": 1576.0, + "kl_loss_12": 588.2021850585937, + "kl_loss_17": 146.58099365234375, + "kl_loss_3": 2298.2243774414064, + "kl_loss_6": 1430.4866027832031, + "learning_rate": 0.0002513753468698826, + "loss": 1117.3383, + "step": 6690 + }, + { + "ce_loss_12": 3.143074405193329, + "ce_loss_17": 2.9521414399147035, + "ce_loss_23": 2.880590558052063, + "ce_loss_3": 3.9751343846321108, + "ce_loss_6": 3.5359527111053466, + "epoch": 0.67, + "grad_norm": 1152.0, + "kl_loss_12": 601.9620422363281, + "kl_loss_17": 149.78503875732423, + "kl_loss_3": 2354.22490234375, + "kl_loss_6": 1465.5066467285155, + "learning_rate": 0.0002500000000000001, + "loss": 1136.8999, + "step": 6700 + }, + { + "ce_loss_12": 3.2374098777770994, + "ce_loss_17": 3.0599971771240235, + "ce_loss_23": 2.9913960814476015, + "ce_loss_3": 4.014524698257446, + "ce_loss_6": 3.598774230480194, + "epoch": 0.671, + "grad_norm": 1808.0, + "kl_loss_12": 577.1859619140625, + "kl_loss_17": 142.84659118652343, + "kl_loss_3": 2219.120251464844, + "kl_loss_6": 1389.6295349121094, + "learning_rate": 0.0002486271706273421, + "loss": 1138.1179, + "step": 6710 + }, + { + "ce_loss_12": 3.175167953968048, + "ce_loss_17": 2.9992194294929506, + "ce_loss_23": 2.932393181324005, + "ce_loss_3": 3.957785654067993, + "ce_loss_6": 3.5485648989677427, + "epoch": 0.672, + "grad_norm": 1656.0, + "kl_loss_12": 573.5445266723633, + "kl_loss_17": 142.3242702484131, + "kl_loss_3": 2217.2351928710937, + "kl_loss_6": 1394.2926696777345, + "learning_rate": 0.0002472568725762853, + "loss": 1113.6104, + "step": 6720 + }, + { + "ce_loss_12": 3.1725917220115663, + "ce_loss_17": 2.9979816913604735, + "ce_loss_23": 2.927784335613251, + "ce_loss_3": 3.9532659888267516, + "ce_loss_6": 3.541015100479126, + "epoch": 0.673, + "grad_norm": 1704.0, + "kl_loss_12": 565.4108581542969, + "kl_loss_17": 141.81959800720216, + "kl_loss_3": 2226.5724914550783, + "kl_loss_6": 1387.7454345703125, + "learning_rate": 0.00024588911964571554, + "loss": 1101.4466, + "step": 6730 + }, + { + "ce_loss_12": 3.2031340479850767, + "ce_loss_17": 3.005861687660217, + "ce_loss_23": 2.9257205724716187, + "ce_loss_3": 4.046345973014832, + "ce_loss_6": 3.6056156754493713, + "epoch": 0.674, + "grad_norm": 1248.0, + "kl_loss_12": 618.6064208984375, + "kl_loss_17": 156.18606986999512, + "kl_loss_3": 2352.5917114257813, + "kl_loss_6": 1489.669677734375, + "learning_rate": 0.00024452392560888974, + "loss": 1133.0781, + "step": 6740 + }, + { + "ce_loss_12": 3.0902359008789064, + "ce_loss_17": 2.902163362503052, + "ce_loss_23": 2.8289408922195434, + "ce_loss_3": 3.9031673073768616, + "ce_loss_6": 3.4809311509132383, + "epoch": 0.675, + "grad_norm": 932.0, + "kl_loss_12": 581.6207214355469, + "kl_loss_17": 142.64251518249512, + "kl_loss_3": 2299.742852783203, + "kl_loss_6": 1443.7633056640625, + "learning_rate": 0.00024316130421329695, + "loss": 1109.1988, + "step": 6750 + }, + { + "ce_loss_12": 3.1604875445365908, + "ce_loss_17": 2.9751969516277312, + "ce_loss_23": 2.9060828745365144, + "ce_loss_3": 3.9672070026397703, + "ce_loss_6": 3.5406667947769166, + "epoch": 0.676, + "grad_norm": 1144.0, + "kl_loss_12": 582.4213714599609, + "kl_loss_17": 144.71898345947267, + "kl_loss_3": 2288.937127685547, + "kl_loss_6": 1428.474053955078, + "learning_rate": 0.00024180126918051909, + "loss": 1120.6777, + "step": 6760 + }, + { + "ce_loss_12": 3.208397078514099, + "ce_loss_17": 3.022159683704376, + "ce_loss_23": 2.948068857192993, + "ce_loss_3": 4.01270581483841, + "ce_loss_6": 3.5845556616783143, + "epoch": 0.677, + "grad_norm": 1328.0, + "kl_loss_12": 589.6311950683594, + "kl_loss_17": 147.20864715576172, + "kl_loss_3": 2279.206201171875, + "kl_loss_6": 1416.8634216308594, + "learning_rate": 0.00024044383420609406, + "loss": 1104.9541, + "step": 6770 + }, + { + "ce_loss_12": 3.209148478507996, + "ce_loss_17": 3.033456230163574, + "ce_loss_23": 2.964026403427124, + "ce_loss_3": 3.992376971244812, + "ce_loss_6": 3.5765359163284303, + "epoch": 0.678, + "grad_norm": 1416.0, + "kl_loss_12": 576.0615325927735, + "kl_loss_17": 142.7035717010498, + "kl_loss_3": 2249.72431640625, + "kl_loss_6": 1399.3908142089845, + "learning_rate": 0.00023908901295937712, + "loss": 1123.5682, + "step": 6780 + }, + { + "ce_loss_12": 3.19677255153656, + "ce_loss_17": 3.0181459307670595, + "ce_loss_23": 2.94276225566864, + "ce_loss_3": 4.002016091346741, + "ce_loss_6": 3.5729544520378114, + "epoch": 0.679, + "grad_norm": 1368.0, + "kl_loss_12": 576.8889617919922, + "kl_loss_17": 144.82299575805663, + "kl_loss_3": 2257.2669372558594, + "kl_loss_6": 1399.4642395019532, + "learning_rate": 0.00023773681908340283, + "loss": 1128.3949, + "step": 6790 + }, + { + "ce_loss_12": 3.1967212319374085, + "ce_loss_17": 2.998736011981964, + "ce_loss_23": 2.921932280063629, + "ce_loss_3": 4.031246483325958, + "ce_loss_6": 3.5901097416877747, + "epoch": 0.68, + "grad_norm": 1552.0, + "kl_loss_12": 618.5885345458985, + "kl_loss_17": 155.23205490112304, + "kl_loss_3": 2373.9909118652345, + "kl_loss_6": 1489.2642150878905, + "learning_rate": 0.00023638726619474876, + "loss": 1160.0055, + "step": 6800 + }, + { + "ce_loss_12": 3.1839903354644776, + "ce_loss_17": 2.9923721075057985, + "ce_loss_23": 2.915251064300537, + "ce_loss_3": 4.039820122718811, + "ce_loss_6": 3.5939448714256286, + "epoch": 0.681, + "grad_norm": 1568.0, + "kl_loss_12": 604.9242568969727, + "kl_loss_17": 151.71022109985353, + "kl_loss_3": 2384.5748291015625, + "kl_loss_6": 1487.0956420898438, + "learning_rate": 0.0002350403678833976, + "loss": 1141.8998, + "step": 6810 + }, + { + "ce_loss_12": 3.1043522715568543, + "ce_loss_17": 2.916385734081268, + "ce_loss_23": 2.84534512758255, + "ce_loss_3": 3.937952661514282, + "ce_loss_6": 3.502272379398346, + "epoch": 0.682, + "grad_norm": 1296.0, + "kl_loss_12": 584.9004409790039, + "kl_loss_17": 144.92746849060057, + "kl_loss_3": 2332.2755493164063, + "kl_loss_6": 1461.111065673828, + "learning_rate": 0.00023369613771260007, + "loss": 1119.822, + "step": 6820 + }, + { + "ce_loss_12": 3.2144542694091798, + "ce_loss_17": 3.029024124145508, + "ce_loss_23": 2.955704939365387, + "ce_loss_3": 4.042465269565582, + "ce_loss_6": 3.6094631791114806, + "epoch": 0.683, + "grad_norm": 1440.0, + "kl_loss_12": 596.6627593994141, + "kl_loss_17": 149.4819065093994, + "kl_loss_3": 2335.4446228027346, + "kl_loss_6": 1454.4375061035157, + "learning_rate": 0.00023235458921873925, + "loss": 1139.2387, + "step": 6830 + }, + { + "ce_loss_12": 3.1983043551445007, + "ce_loss_17": 2.9906994700431824, + "ce_loss_23": 2.911998617649078, + "ce_loss_3": 4.063765621185302, + "ce_loss_6": 3.60615394115448, + "epoch": 0.684, + "grad_norm": 2112.0, + "kl_loss_12": 625.3911346435547, + "kl_loss_17": 156.50121154785157, + "kl_loss_3": 2434.817175292969, + "kl_loss_6": 1516.2722412109374, + "learning_rate": 0.0002310157359111938, + "loss": 1176.7929, + "step": 6840 + }, + { + "ce_loss_12": 3.089562547206879, + "ce_loss_17": 2.884237289428711, + "ce_loss_23": 2.8070345878601075, + "ce_loss_3": 3.99581755399704, + "ce_loss_6": 3.5228491544723513, + "epoch": 0.685, + "grad_norm": 1376.0, + "kl_loss_12": 615.8566284179688, + "kl_loss_17": 151.1296371459961, + "kl_loss_3": 2489.9639892578125, + "kl_loss_6": 1544.7711181640625, + "learning_rate": 0.0002296795912722014, + "loss": 1170.5421, + "step": 6850 + }, + { + "ce_loss_12": 3.2020869612693788, + "ce_loss_17": 3.016293668746948, + "ce_loss_23": 2.94240859746933, + "ce_loss_3": 3.9994003772735596, + "ce_loss_6": 3.5754223704338073, + "epoch": 0.686, + "grad_norm": 1128.0, + "kl_loss_12": 587.4125350952148, + "kl_loss_17": 146.60589141845702, + "kl_loss_3": 2264.4470092773436, + "kl_loss_6": 1416.0138854980469, + "learning_rate": 0.0002283461687567236, + "loss": 1095.1857, + "step": 6860 + }, + { + "ce_loss_12": 3.250152349472046, + "ce_loss_17": 3.0724654316902162, + "ce_loss_23": 3.000766944885254, + "ce_loss_3": 4.037157213687896, + "ce_loss_6": 3.6243523478507997, + "epoch": 0.687, + "grad_norm": 1128.0, + "kl_loss_12": 573.4170837402344, + "kl_loss_17": 145.43106575012206, + "kl_loss_3": 2224.8468505859373, + "kl_loss_6": 1396.8940673828124, + "learning_rate": 0.00022701548179231045, + "loss": 1117.4321, + "step": 6870 + }, + { + "ce_loss_12": 3.2201813578605654, + "ce_loss_17": 3.031428003311157, + "ce_loss_23": 2.954519069194794, + "ce_loss_3": 4.044415283203125, + "ce_loss_6": 3.5991448640823362, + "epoch": 0.688, + "grad_norm": 1344.0, + "kl_loss_12": 592.9102249145508, + "kl_loss_17": 150.1623119354248, + "kl_loss_3": 2327.7541625976564, + "kl_loss_6": 1440.0543701171875, + "learning_rate": 0.00022568754377896516, + "loss": 1110.0392, + "step": 6880 + }, + { + "ce_loss_12": 3.209094429016113, + "ce_loss_17": 3.0246420383453367, + "ce_loss_23": 2.952113616466522, + "ce_loss_3": 4.003854358196259, + "ce_loss_6": 3.5801191091537476, + "epoch": 0.689, + "grad_norm": 1720.0, + "kl_loss_12": 592.9092987060546, + "kl_loss_17": 147.33820648193358, + "kl_loss_3": 2274.3667724609377, + "kl_loss_6": 1413.89697265625, + "learning_rate": 0.00022436236808900844, + "loss": 1107.8881, + "step": 6890 + }, + { + "ce_loss_12": 3.108115577697754, + "ce_loss_17": 2.91782306432724, + "ce_loss_23": 2.844039022922516, + "ce_loss_3": 3.9408508062362673, + "ce_loss_6": 3.498040962219238, + "epoch": 0.69, + "grad_norm": 1200.0, + "kl_loss_12": 594.958203125, + "kl_loss_17": 148.14904098510743, + "kl_loss_3": 2346.5946533203123, + "kl_loss_6": 1460.1141479492187, + "learning_rate": 0.00022303996806694487, + "loss": 1125.8866, + "step": 6900 + }, + { + "ce_loss_12": 3.1772311329841614, + "ce_loss_17": 2.9900581002235413, + "ce_loss_23": 2.921220266819, + "ce_loss_3": 4.001453697681427, + "ce_loss_6": 3.5635480999946596, + "epoch": 0.691, + "grad_norm": 1208.0, + "kl_loss_12": 586.475065612793, + "kl_loss_17": 143.4232997894287, + "kl_loss_3": 2316.4053588867187, + "kl_loss_6": 1432.9564514160156, + "learning_rate": 0.00022172035702932823, + "loss": 1117.1566, + "step": 6910 + }, + { + "ce_loss_12": 3.2220534086227417, + "ce_loss_17": 3.040873372554779, + "ce_loss_23": 2.9694732785224915, + "ce_loss_3": 4.017025899887085, + "ce_loss_6": 3.600583755970001, + "epoch": 0.692, + "grad_norm": 1336.0, + "kl_loss_12": 579.6053695678711, + "kl_loss_17": 148.07587966918945, + "kl_loss_3": 2232.5748901367188, + "kl_loss_6": 1407.3253784179688, + "learning_rate": 0.00022040354826462666, + "loss": 1099.7223, + "step": 6920 + }, + { + "ce_loss_12": 3.147498941421509, + "ce_loss_17": 2.965992307662964, + "ce_loss_23": 2.8967095255851745, + "ce_loss_3": 3.9676818251609802, + "ce_loss_6": 3.537060356140137, + "epoch": 0.693, + "grad_norm": 1400.0, + "kl_loss_12": 575.1736312866211, + "kl_loss_17": 142.63258590698243, + "kl_loss_3": 2292.0581909179687, + "kl_loss_6": 1423.503076171875, + "learning_rate": 0.0002190895550330899, + "loss": 1126.8942, + "step": 6930 + }, + { + "ce_loss_12": 3.1000084280967712, + "ce_loss_17": 2.9018681764602663, + "ce_loss_23": 2.8272014617919923, + "ce_loss_3": 3.9402028799057005, + "ce_loss_6": 3.503280687332153, + "epoch": 0.694, + "grad_norm": 1400.0, + "kl_loss_12": 605.8569915771484, + "kl_loss_17": 150.15897674560546, + "kl_loss_3": 2352.3010498046874, + "kl_loss_6": 1482.665869140625, + "learning_rate": 0.00021777839056661552, + "loss": 1121.7594, + "step": 6940 + }, + { + "ce_loss_12": 3.1669057846069335, + "ce_loss_17": 2.983146142959595, + "ce_loss_23": 2.9131749987602236, + "ce_loss_3": 3.9779937624931336, + "ce_loss_6": 3.542782926559448, + "epoch": 0.695, + "grad_norm": 1184.0, + "kl_loss_12": 585.135107421875, + "kl_loss_17": 146.01019325256348, + "kl_loss_3": 2290.107806396484, + "kl_loss_6": 1422.4843505859376, + "learning_rate": 0.0002164700680686147, + "loss": 1102.3137, + "step": 6950 + }, + { + "ce_loss_12": 3.20974086523056, + "ce_loss_17": 3.0271469235420225, + "ce_loss_23": 2.9553972721099853, + "ce_loss_3": 4.0066024422645565, + "ce_loss_6": 3.5798150181770323, + "epoch": 0.696, + "grad_norm": 1432.0, + "kl_loss_12": 578.3921051025391, + "kl_loss_17": 147.39475708007814, + "kl_loss_3": 2247.7211364746095, + "kl_loss_6": 1400.5633666992187, + "learning_rate": 0.0002151646007138806, + "loss": 1101.2953, + "step": 6960 + }, + { + "ce_loss_12": 3.1077236413955687, + "ce_loss_17": 2.9199201107025146, + "ce_loss_23": 2.8468748211860655, + "ce_loss_3": 3.9442235946655275, + "ce_loss_6": 3.5072736620903013, + "epoch": 0.697, + "grad_norm": 1128.0, + "kl_loss_12": 594.4067352294921, + "kl_loss_17": 148.54014892578124, + "kl_loss_3": 2346.5884643554687, + "kl_loss_6": 1464.653057861328, + "learning_rate": 0.00021386200164845526, + "loss": 1122.6799, + "step": 6970 + }, + { + "ce_loss_12": 3.258119261264801, + "ce_loss_17": 3.0787838459014893, + "ce_loss_23": 3.0076930999755858, + "ce_loss_3": 4.033900547027588, + "ce_loss_6": 3.622900068759918, + "epoch": 0.698, + "grad_norm": 1256.0, + "kl_loss_12": 576.0225189208984, + "kl_loss_17": 144.2971618652344, + "kl_loss_3": 2218.9604919433596, + "kl_loss_6": 1390.7083740234375, + "learning_rate": 0.0002125622839894964, + "loss": 1091.1894, + "step": 6980 + }, + { + "ce_loss_12": 3.203335177898407, + "ce_loss_17": 3.028899538516998, + "ce_loss_23": 2.959743618965149, + "ce_loss_3": 4.007862174510956, + "ce_loss_6": 3.584617245197296, + "epoch": 0.699, + "grad_norm": 1232.0, + "kl_loss_12": 568.6464782714844, + "kl_loss_17": 142.77477569580077, + "kl_loss_3": 2233.1903686523438, + "kl_loss_6": 1392.2311157226563, + "learning_rate": 0.00021126546082514663, + "loss": 1094.3217, + "step": 6990 + }, + { + "ce_loss_12": 3.2287532091140747, + "ce_loss_17": 3.0504186034202574, + "ce_loss_23": 2.979254949092865, + "ce_loss_3": 4.0115339159965515, + "ce_loss_6": 3.6003906965255736, + "epoch": 0.7, + "grad_norm": 1152.0, + "kl_loss_12": 580.096305847168, + "kl_loss_17": 144.75360679626465, + "kl_loss_3": 2235.0508361816405, + "kl_loss_6": 1406.2971618652343, + "learning_rate": 0.00020997154521440098, + "loss": 1092.5399, + "step": 7000 + }, + { + "ce_loss_12": 3.1771739721298218, + "ce_loss_17": 2.99692097902298, + "ce_loss_23": 2.9295363426208496, + "ce_loss_3": 3.9901798963546753, + "ce_loss_6": 3.5643086552619936, + "epoch": 0.701, + "grad_norm": 1312.0, + "kl_loss_12": 581.0184539794922, + "kl_loss_17": 143.7947853088379, + "kl_loss_3": 2281.5001831054688, + "kl_loss_6": 1420.4745666503907, + "learning_rate": 0.0002086805501869749, + "loss": 1099.1198, + "step": 7010 + }, + { + "ce_loss_12": 3.1693623661994934, + "ce_loss_17": 2.972110378742218, + "ce_loss_23": 2.8974018573760985, + "ce_loss_3": 4.006498980522156, + "ce_loss_6": 3.5661180138587953, + "epoch": 0.702, + "grad_norm": 1328.0, + "kl_loss_12": 606.0820495605469, + "kl_loss_17": 150.7113815307617, + "kl_loss_3": 2364.7195556640627, + "kl_loss_6": 1468.3957214355469, + "learning_rate": 0.0002073924887431744, + "loss": 1125.9121, + "step": 7020 + }, + { + "ce_loss_12": 3.1665372014045716, + "ce_loss_17": 2.9757155537605287, + "ce_loss_23": 2.9051700711250303, + "ce_loss_3": 3.9907246470451354, + "ce_loss_6": 3.551064443588257, + "epoch": 0.703, + "grad_norm": 1416.0, + "kl_loss_12": 592.3192321777344, + "kl_loss_17": 145.40274963378906, + "kl_loss_3": 2334.407586669922, + "kl_loss_6": 1456.1877990722655, + "learning_rate": 0.00020610737385376348, + "loss": 1151.4578, + "step": 7030 + }, + { + "ce_loss_12": 3.2095029950141907, + "ce_loss_17": 3.025873637199402, + "ce_loss_23": 2.95588641166687, + "ce_loss_3": 3.990134584903717, + "ce_loss_6": 3.5787421226501466, + "epoch": 0.704, + "grad_norm": 1512.0, + "kl_loss_12": 578.5413848876954, + "kl_loss_17": 145.04814453125, + "kl_loss_3": 2222.1506408691407, + "kl_loss_6": 1396.5619079589844, + "learning_rate": 0.00020482521845983521, + "loss": 1116.9852, + "step": 7040 + }, + { + "ce_loss_12": 3.2187553524971007, + "ce_loss_17": 3.0291430354118347, + "ce_loss_23": 2.9548076152801515, + "ce_loss_3": 4.0312185168266295, + "ce_loss_6": 3.5974926352500916, + "epoch": 0.705, + "grad_norm": 1536.0, + "kl_loss_12": 598.0195999145508, + "kl_loss_17": 150.73707084655763, + "kl_loss_3": 2315.3036499023438, + "kl_loss_6": 1432.486279296875, + "learning_rate": 0.00020354603547267987, + "loss": 1131.4294, + "step": 7050 + }, + { + "ce_loss_12": 3.208362877368927, + "ce_loss_17": 3.014052724838257, + "ce_loss_23": 2.9381724119186403, + "ce_loss_3": 4.033243632316589, + "ce_loss_6": 3.602437198162079, + "epoch": 0.706, + "grad_norm": 1056.0, + "kl_loss_12": 599.1079956054688, + "kl_loss_17": 149.3365219116211, + "kl_loss_3": 2316.6187255859377, + "kl_loss_6": 1450.9468078613281, + "learning_rate": 0.00020226983777365604, + "loss": 1150.7684, + "step": 7060 + }, + { + "ce_loss_12": 3.1097822904586794, + "ce_loss_17": 2.9277292609214784, + "ce_loss_23": 2.8572330713272094, + "ce_loss_3": 3.970816802978516, + "ce_loss_6": 3.528963398933411, + "epoch": 0.707, + "grad_norm": 1104.0, + "kl_loss_12": 575.9199295043945, + "kl_loss_17": 142.6550651550293, + "kl_loss_3": 2367.4453247070314, + "kl_loss_6": 1481.1780944824218, + "learning_rate": 0.00020099663821406056, + "loss": 1124.9882, + "step": 7070 + }, + { + "ce_loss_12": 3.201669692993164, + "ce_loss_17": 3.0207114934921266, + "ce_loss_23": 2.950303483009338, + "ce_loss_3": 3.99226975440979, + "ce_loss_6": 3.581176018714905, + "epoch": 0.708, + "grad_norm": 1424.0, + "kl_loss_12": 572.6087646484375, + "kl_loss_17": 142.79727745056152, + "kl_loss_3": 2232.3437744140624, + "kl_loss_6": 1398.9955017089844, + "learning_rate": 0.00019972644961499853, + "loss": 1115.7543, + "step": 7080 + }, + { + "ce_loss_12": 3.1833733320236206, + "ce_loss_17": 2.9917277693748474, + "ce_loss_23": 2.9189423322677612, + "ce_loss_3": 4.023042452335358, + "ce_loss_6": 3.582495379447937, + "epoch": 0.709, + "grad_norm": 2096.0, + "kl_loss_12": 602.1402572631836, + "kl_loss_17": 148.9338066101074, + "kl_loss_3": 2353.1352416992186, + "kl_loss_6": 1466.5263977050781, + "learning_rate": 0.00019845928476725522, + "loss": 1130.4533, + "step": 7090 + }, + { + "ce_loss_12": 3.2478617787361146, + "ce_loss_17": 3.0631136178970335, + "ce_loss_23": 2.987545442581177, + "ce_loss_3": 4.052975380420685, + "ce_loss_6": 3.630919361114502, + "epoch": 0.71, + "grad_norm": 1128.0, + "kl_loss_12": 595.8744415283203, + "kl_loss_17": 149.51752853393555, + "kl_loss_3": 2277.5608825683594, + "kl_loss_6": 1439.5630126953124, + "learning_rate": 0.00019719515643116677, + "loss": 1149.5734, + "step": 7100 + }, + { + "ce_loss_12": 3.1794224858283995, + "ce_loss_17": 3.0005685448646546, + "ce_loss_23": 2.928575849533081, + "ce_loss_3": 3.98095588684082, + "ce_loss_6": 3.552529680728912, + "epoch": 0.711, + "grad_norm": 1176.0, + "kl_loss_12": 581.0087188720703, + "kl_loss_17": 145.46866683959962, + "kl_loss_3": 2260.6100463867188, + "kl_loss_6": 1397.145037841797, + "learning_rate": 0.0001959340773364911, + "loss": 1117.3887, + "step": 7110 + }, + { + "ce_loss_12": 3.203937256336212, + "ce_loss_17": 3.017394721508026, + "ce_loss_23": 2.9459567189216616, + "ce_loss_3": 4.019110321998596, + "ce_loss_6": 3.585983669757843, + "epoch": 0.712, + "grad_norm": 1352.0, + "kl_loss_12": 592.6168884277344, + "kl_loss_17": 148.39927024841307, + "kl_loss_3": 2298.1994018554688, + "kl_loss_6": 1427.4383544921875, + "learning_rate": 0.0001946760601822809, + "loss": 1099.9972, + "step": 7120 + }, + { + "ce_loss_12": 3.252336490154266, + "ce_loss_17": 3.0694833278656004, + "ce_loss_23": 2.998616576194763, + "ce_loss_3": 4.036021447181701, + "ce_loss_6": 3.618166470527649, + "epoch": 0.713, + "grad_norm": 1352.0, + "kl_loss_12": 583.6036193847656, + "kl_loss_17": 145.45759658813478, + "kl_loss_3": 2245.6363159179687, + "kl_loss_6": 1390.04169921875, + "learning_rate": 0.00019342111763675512, + "loss": 1080.6883, + "step": 7130 + }, + { + "ce_loss_12": 3.2442177057266237, + "ce_loss_17": 3.067750263214111, + "ce_loss_23": 2.995485985279083, + "ce_loss_3": 4.031316709518433, + "ce_loss_6": 3.6112120509147645, + "epoch": 0.714, + "grad_norm": 1504.0, + "kl_loss_12": 578.5055465698242, + "kl_loss_17": 145.30845413208007, + "kl_loss_3": 2234.8689147949217, + "kl_loss_6": 1396.4184204101562, + "learning_rate": 0.00019216926233717085, + "loss": 1089.952, + "step": 7140 + }, + { + "ce_loss_12": 3.1445976376533507, + "ce_loss_17": 2.9616607785224915, + "ce_loss_23": 2.889950382709503, + "ce_loss_3": 4.022900247573853, + "ce_loss_6": 3.572395408153534, + "epoch": 0.715, + "grad_norm": 1528.0, + "kl_loss_12": 581.4880859375, + "kl_loss_17": 143.35638961791992, + "kl_loss_3": 2402.4765563964843, + "kl_loss_6": 1503.399151611328, + "learning_rate": 0.00019092050688969737, + "loss": 1145.067, + "step": 7150 + }, + { + "ce_loss_12": 3.207192873954773, + "ce_loss_17": 3.0271405935287476, + "ce_loss_23": 2.9592058658599854, + "ce_loss_3": 4.003524768352508, + "ce_loss_6": 3.5840381622314452, + "epoch": 0.716, + "grad_norm": 1184.0, + "kl_loss_12": 575.2487243652344, + "kl_loss_17": 143.97141494750977, + "kl_loss_3": 2269.0265625, + "kl_loss_6": 1419.7068603515625, + "learning_rate": 0.00018967486386928817, + "loss": 1102.258, + "step": 7160 + }, + { + "ce_loss_12": 3.099281966686249, + "ce_loss_17": 2.9145707488059998, + "ce_loss_23": 2.838303065299988, + "ce_loss_3": 3.9301305770874024, + "ce_loss_6": 3.498118782043457, + "epoch": 0.717, + "grad_norm": 1248.0, + "kl_loss_12": 593.0745239257812, + "kl_loss_17": 146.49713897705078, + "kl_loss_3": 2328.5628295898437, + "kl_loss_6": 1448.4362487792969, + "learning_rate": 0.00018843234581955443, + "loss": 1158.2389, + "step": 7170 + }, + { + "ce_loss_12": 3.108709764480591, + "ce_loss_17": 2.919401240348816, + "ce_loss_23": 2.8431138396263123, + "ce_loss_3": 3.933067190647125, + "ce_loss_6": 3.5124778032302855, + "epoch": 0.718, + "grad_norm": 1120.0, + "kl_loss_12": 597.4008911132812, + "kl_loss_17": 147.00185546875, + "kl_loss_3": 2323.1351928710938, + "kl_loss_6": 1465.429376220703, + "learning_rate": 0.00018719296525263924, + "loss": 1130.7549, + "step": 7180 + }, + { + "ce_loss_12": 3.1889859080314635, + "ce_loss_17": 3.0144015312194825, + "ce_loss_23": 2.943605732917786, + "ce_loss_3": 3.9734396815299986, + "ce_loss_6": 3.5531632900238037, + "epoch": 0.719, + "grad_norm": 1072.0, + "kl_loss_12": 572.4617446899414, + "kl_loss_17": 144.9117515563965, + "kl_loss_3": 2214.2423400878906, + "kl_loss_6": 1372.7084411621095, + "learning_rate": 0.0001859567346490913, + "loss": 1084.4133, + "step": 7190 + }, + { + "ce_loss_12": 3.1837105631828306, + "ce_loss_17": 2.998974609375, + "ce_loss_23": 2.9240657448768617, + "ce_loss_3": 4.012367558479309, + "ce_loss_6": 3.5784144282341, + "epoch": 0.72, + "grad_norm": 1472.0, + "kl_loss_12": 601.2089935302735, + "kl_loss_17": 152.0529815673828, + "kl_loss_3": 2333.0523010253905, + "kl_loss_6": 1457.9123229980469, + "learning_rate": 0.0001847236664577389, + "loss": 1114.916, + "step": 7200 + }, + { + "ce_loss_12": 3.188700783252716, + "ce_loss_17": 3.014432442188263, + "ce_loss_23": 2.94440495967865, + "ce_loss_3": 3.970227265357971, + "ce_loss_6": 3.5592984199523925, + "epoch": 0.721, + "grad_norm": 968.0, + "kl_loss_12": 566.3043823242188, + "kl_loss_17": 143.4260395050049, + "kl_loss_3": 2203.2349060058596, + "kl_loss_6": 1379.9577514648438, + "learning_rate": 0.00018349377309556487, + "loss": 1078.4733, + "step": 7210 + }, + { + "ce_loss_12": 3.151521551609039, + "ce_loss_17": 2.965848422050476, + "ce_loss_23": 2.892253887653351, + "ce_loss_3": 4.010136902332306, + "ce_loss_6": 3.55112361907959, + "epoch": 0.722, + "grad_norm": 1208.0, + "kl_loss_12": 601.6816268920899, + "kl_loss_17": 149.03083419799805, + "kl_loss_3": 2394.7946655273436, + "kl_loss_6": 1489.086114501953, + "learning_rate": 0.00018226706694758193, + "loss": 1139.2559, + "step": 7220 + }, + { + "ce_loss_12": 3.2205862522125246, + "ce_loss_17": 3.03840092420578, + "ce_loss_23": 2.969194233417511, + "ce_loss_3": 4.030839443206787, + "ce_loss_6": 3.6026239514350893, + "epoch": 0.723, + "grad_norm": 1640.0, + "kl_loss_12": 584.8420654296875, + "kl_loss_17": 144.78758735656737, + "kl_loss_3": 2293.312860107422, + "kl_loss_6": 1431.7969970703125, + "learning_rate": 0.0001810435603667075, + "loss": 1141.4186, + "step": 7230 + }, + { + "ce_loss_12": 3.0781525492668154, + "ce_loss_17": 2.892492079734802, + "ce_loss_23": 2.821278524398804, + "ce_loss_3": 3.905537283420563, + "ce_loss_6": 3.470686209201813, + "epoch": 0.724, + "grad_norm": 1224.0, + "kl_loss_12": 578.4883483886719, + "kl_loss_17": 143.28313026428222, + "kl_loss_3": 2293.3400268554688, + "kl_loss_6": 1430.2943786621095, + "learning_rate": 0.0001798232656736389, + "loss": 1136.9285, + "step": 7240 + }, + { + "ce_loss_12": 3.232522702217102, + "ce_loss_17": 3.0540451884269713, + "ce_loss_23": 2.980920839309692, + "ce_loss_3": 4.012105321884155, + "ce_loss_6": 3.5947370648384096, + "epoch": 0.725, + "grad_norm": 1216.0, + "kl_loss_12": 573.4094543457031, + "kl_loss_17": 145.14729194641114, + "kl_loss_3": 2195.9948669433593, + "kl_loss_6": 1368.5669921875, + "learning_rate": 0.0001786061951567303, + "loss": 1095.492, + "step": 7250 + }, + { + "ce_loss_12": 3.1569305181503298, + "ce_loss_17": 2.970924460887909, + "ce_loss_23": 2.899139416217804, + "ce_loss_3": 3.9716181993484496, + "ce_loss_6": 3.545856165885925, + "epoch": 0.726, + "grad_norm": 1520.0, + "kl_loss_12": 589.0374481201172, + "kl_loss_17": 147.1114543914795, + "kl_loss_3": 2295.5287536621095, + "kl_loss_6": 1434.188897705078, + "learning_rate": 0.00017739236107186857, + "loss": 1124.3455, + "step": 7260 + }, + { + "ce_loss_12": 3.2377625346183776, + "ce_loss_17": 3.066043329238892, + "ce_loss_23": 2.9976449489593504, + "ce_loss_3": 4.003402233123779, + "ce_loss_6": 3.5984499335289, + "epoch": 0.727, + "grad_norm": 940.0, + "kl_loss_12": 564.48974609375, + "kl_loss_17": 140.15952224731444, + "kl_loss_3": 2170.3753051757812, + "kl_loss_6": 1369.9614013671876, + "learning_rate": 0.00017618177564234904, + "loss": 1083.0557, + "step": 7270 + }, + { + "ce_loss_12": 3.2049625515937805, + "ce_loss_17": 3.030275619029999, + "ce_loss_23": 2.9614447474479677, + "ce_loss_3": 3.986129355430603, + "ce_loss_6": 3.5735137104988097, + "epoch": 0.728, + "grad_norm": 1160.0, + "kl_loss_12": 555.6091201782226, + "kl_loss_17": 138.71613235473632, + "kl_loss_3": 2169.0173767089846, + "kl_loss_6": 1353.0901977539063, + "learning_rate": 0.00017497445105875377, + "loss": 1080.5398, + "step": 7280 + }, + { + "ce_loss_12": 3.140712082386017, + "ce_loss_17": 2.9468762159347532, + "ce_loss_23": 2.876932406425476, + "ce_loss_3": 3.9740896821022034, + "ce_loss_6": 3.534068500995636, + "epoch": 0.729, + "grad_norm": 1200.0, + "kl_loss_12": 597.3080123901367, + "kl_loss_17": 147.67110023498535, + "kl_loss_3": 2346.868408203125, + "kl_loss_6": 1457.0498901367187, + "learning_rate": 0.000173770399478828, + "loss": 1122.9779, + "step": 7290 + }, + { + "ce_loss_12": 3.058976912498474, + "ce_loss_17": 2.8826879620552064, + "ce_loss_23": 2.8132660746574403, + "ce_loss_3": 3.8742360711097716, + "ce_loss_6": 3.438892364501953, + "epoch": 0.73, + "grad_norm": 1264.0, + "kl_loss_12": 570.9928634643554, + "kl_loss_17": 140.67396697998046, + "kl_loss_3": 2276.644677734375, + "kl_loss_6": 1412.6644165039063, + "learning_rate": 0.0001725696330273575, + "loss": 1131.4014, + "step": 7300 + }, + { + "ce_loss_12": 3.227369546890259, + "ce_loss_17": 3.0480236291885374, + "ce_loss_23": 2.978104054927826, + "ce_loss_3": 4.010425055027008, + "ce_loss_6": 3.603419530391693, + "epoch": 0.731, + "grad_norm": 1304.0, + "kl_loss_12": 564.4230331420898, + "kl_loss_17": 141.29519424438476, + "kl_loss_3": 2205.4941833496096, + "kl_loss_6": 1380.6068481445313, + "learning_rate": 0.00017137216379604724, + "loss": 1080.4908, + "step": 7310 + }, + { + "ce_loss_12": 3.116891658306122, + "ce_loss_17": 2.933759880065918, + "ce_loss_23": 2.862773048877716, + "ce_loss_3": 3.94740868806839, + "ce_loss_6": 3.502023792266846, + "epoch": 0.732, + "grad_norm": 1440.0, + "kl_loss_12": 575.4544143676758, + "kl_loss_17": 144.30592575073243, + "kl_loss_3": 2295.069012451172, + "kl_loss_6": 1417.2135070800782, + "learning_rate": 0.00017017800384339925, + "loss": 1114.4125, + "step": 7320 + }, + { + "ce_loss_12": 3.083300220966339, + "ce_loss_17": 2.893224263191223, + "ce_loss_23": 2.82099050283432, + "ce_loss_3": 3.9307360887527465, + "ce_loss_6": 3.4917693614959715, + "epoch": 0.733, + "grad_norm": 1216.0, + "kl_loss_12": 593.4081375122071, + "kl_loss_17": 145.57610702514648, + "kl_loss_3": 2350.2811889648438, + "kl_loss_6": 1473.9288024902344, + "learning_rate": 0.00016898716519459073, + "loss": 1109.7352, + "step": 7330 + }, + { + "ce_loss_12": 3.208773601055145, + "ce_loss_17": 3.0135222554206846, + "ce_loss_23": 2.9373353004455565, + "ce_loss_3": 4.047830474376679, + "ce_loss_6": 3.6058629393577575, + "epoch": 0.734, + "grad_norm": 1216.0, + "kl_loss_12": 609.4722335815429, + "kl_loss_17": 152.3070224761963, + "kl_loss_3": 2354.237579345703, + "kl_loss_6": 1459.6238159179688, + "learning_rate": 0.00016779965984135375, + "loss": 1126.5805, + "step": 7340 + }, + { + "ce_loss_12": 3.1197221755981444, + "ce_loss_17": 2.937836170196533, + "ce_loss_23": 2.8669689655303956, + "ce_loss_3": 3.9445420265197755, + "ce_loss_6": 3.505992555618286, + "epoch": 0.735, + "grad_norm": 1272.0, + "kl_loss_12": 572.8582366943359, + "kl_loss_17": 143.15871887207032, + "kl_loss_3": 2274.748468017578, + "kl_loss_6": 1415.9035888671874, + "learning_rate": 0.00016661549974185424, + "loss": 1106.4482, + "step": 7350 + }, + { + "ce_loss_12": 3.1488837718963625, + "ce_loss_17": 2.962867999076843, + "ce_loss_23": 2.8920888781547545, + "ce_loss_3": 3.9611996173858643, + "ce_loss_6": 3.538123643398285, + "epoch": 0.736, + "grad_norm": 1592.0, + "kl_loss_12": 583.8536376953125, + "kl_loss_17": 145.90351219177245, + "kl_loss_3": 2276.6626098632814, + "kl_loss_6": 1423.073419189453, + "learning_rate": 0.00016543469682057105, + "loss": 1097.6937, + "step": 7360 + }, + { + "ce_loss_12": 3.1773533821105957, + "ce_loss_17": 2.9906538844108583, + "ce_loss_23": 2.917534816265106, + "ce_loss_3": 3.990094232559204, + "ce_loss_6": 3.5587793827056884, + "epoch": 0.737, + "grad_norm": 1656.0, + "kl_loss_12": 591.9924346923829, + "kl_loss_17": 148.6124095916748, + "kl_loss_3": 2289.5767211914062, + "kl_loss_6": 1421.1706970214843, + "learning_rate": 0.00016425726296817632, + "loss": 1107.2316, + "step": 7370 + }, + { + "ce_loss_12": 3.1852425217628477, + "ce_loss_17": 3.004103994369507, + "ce_loss_23": 2.9343242764472963, + "ce_loss_3": 3.981669914722443, + "ce_loss_6": 3.561198091506958, + "epoch": 0.738, + "grad_norm": 1256.0, + "kl_loss_12": 569.1318725585937, + "kl_loss_17": 143.4800178527832, + "kl_loss_3": 2231.682751464844, + "kl_loss_6": 1396.7402954101562, + "learning_rate": 0.00016308321004141607, + "loss": 1100.6882, + "step": 7380 + }, + { + "ce_loss_12": 3.148279881477356, + "ce_loss_17": 2.95694876909256, + "ce_loss_23": 2.8799212694168093, + "ce_loss_3": 3.969662404060364, + "ce_loss_6": 3.5375774264335633, + "epoch": 0.739, + "grad_norm": 1256.0, + "kl_loss_12": 595.9786041259765, + "kl_loss_17": 150.35715103149414, + "kl_loss_3": 2306.9234130859377, + "kl_loss_6": 1443.4886779785156, + "learning_rate": 0.00016191254986299043, + "loss": 1103.6375, + "step": 7390 + }, + { + "ce_loss_12": 3.1669175028800964, + "ce_loss_17": 2.994936764240265, + "ce_loss_23": 2.925759744644165, + "ce_loss_3": 3.961804914474487, + "ce_loss_6": 3.5510587096214294, + "epoch": 0.74, + "grad_norm": 1648.0, + "kl_loss_12": 568.7914382934571, + "kl_loss_17": 140.78036155700684, + "kl_loss_3": 2246.350238037109, + "kl_loss_6": 1410.0535949707032, + "learning_rate": 0.00016074529422143398, + "loss": 1114.4383, + "step": 7400 + }, + { + "ce_loss_12": 3.147038972377777, + "ce_loss_17": 2.9591886162757874, + "ce_loss_23": 2.888611650466919, + "ce_loss_3": 3.9801909923553467, + "ce_loss_6": 3.5400409579277037, + "epoch": 0.741, + "grad_norm": 1432.0, + "kl_loss_12": 591.3722869873047, + "kl_loss_17": 147.95472526550293, + "kl_loss_3": 2320.990362548828, + "kl_loss_6": 1443.7143188476562, + "learning_rate": 0.0001595814548709983, + "loss": 1131.6094, + "step": 7410 + }, + { + "ce_loss_12": 3.2124876499176027, + "ce_loss_17": 3.020291340351105, + "ce_loss_23": 2.94623681306839, + "ce_loss_3": 4.042755770683288, + "ce_loss_6": 3.5979917287826537, + "epoch": 0.742, + "grad_norm": 1120.0, + "kl_loss_12": 604.6123016357421, + "kl_loss_17": 151.30472259521486, + "kl_loss_3": 2353.1304321289062, + "kl_loss_6": 1456.8755493164062, + "learning_rate": 0.00015842104353153285, + "loss": 1128.1342, + "step": 7420 + }, + { + "ce_loss_12": 3.2141287565231322, + "ce_loss_17": 3.0256335973739623, + "ce_loss_23": 2.9571072816848756, + "ce_loss_3": 4.025741386413574, + "ce_loss_6": 3.604836130142212, + "epoch": 0.743, + "grad_norm": 1232.0, + "kl_loss_12": 585.7257080078125, + "kl_loss_17": 146.6546787261963, + "kl_loss_3": 2293.339666748047, + "kl_loss_6": 1436.2982543945313, + "learning_rate": 0.0001572640718883667, + "loss": 1135.7382, + "step": 7430 + }, + { + "ce_loss_12": 3.149288332462311, + "ce_loss_17": 2.9716370582580565, + "ce_loss_23": 2.905295217037201, + "ce_loss_3": 3.948617625236511, + "ce_loss_6": 3.524720585346222, + "epoch": 0.744, + "grad_norm": 1128.0, + "kl_loss_12": 574.8381134033203, + "kl_loss_17": 141.23406295776368, + "kl_loss_3": 2242.906658935547, + "kl_loss_6": 1400.0741394042968, + "learning_rate": 0.0001561105515921915, + "loss": 1121.8488, + "step": 7440 + }, + { + "ce_loss_12": 3.024454426765442, + "ce_loss_17": 2.8424919843673706, + "ce_loss_23": 2.772695004940033, + "ce_loss_3": 3.887095332145691, + "ce_loss_6": 3.433519518375397, + "epoch": 0.745, + "grad_norm": 1152.0, + "kl_loss_12": 579.8395751953125, + "kl_loss_17": 140.73728218078614, + "kl_loss_3": 2375.4398681640623, + "kl_loss_6": 1464.7660583496095, + "learning_rate": 0.0001549604942589441, + "loss": 1117.5556, + "step": 7450 + }, + { + "ce_loss_12": 3.1888160347938537, + "ce_loss_17": 3.0143482089042664, + "ce_loss_23": 2.9467873096466066, + "ce_loss_3": 3.954085910320282, + "ce_loss_6": 3.5487560749053957, + "epoch": 0.746, + "grad_norm": 1256.0, + "kl_loss_12": 558.4238723754883, + "kl_loss_17": 139.93923835754396, + "kl_loss_3": 2171.600372314453, + "kl_loss_6": 1351.2838012695313, + "learning_rate": 0.00015381391146968864, + "loss": 1081.5531, + "step": 7460 + }, + { + "ce_loss_12": 3.160641372203827, + "ce_loss_17": 2.979509401321411, + "ce_loss_23": 2.912901020050049, + "ce_loss_3": 3.981812298297882, + "ce_loss_6": 3.5481289267539977, + "epoch": 0.747, + "grad_norm": 1264.0, + "kl_loss_12": 567.4259902954102, + "kl_loss_17": 139.06588706970214, + "kl_loss_3": 2265.0782836914063, + "kl_loss_6": 1410.2451721191405, + "learning_rate": 0.00015267081477050133, + "loss": 1106.7512, + "step": 7470 + }, + { + "ce_loss_12": 3.258499574661255, + "ce_loss_17": 3.0776013016700743, + "ce_loss_23": 3.0025953531265257, + "ce_loss_3": 4.046333730220795, + "ce_loss_6": 3.6256653308868407, + "epoch": 0.748, + "grad_norm": 1008.0, + "kl_loss_12": 583.4841796875, + "kl_loss_17": 147.9802734375, + "kl_loss_3": 2234.120538330078, + "kl_loss_6": 1391.3614807128906, + "learning_rate": 0.00015153121567235335, + "loss": 1083.2256, + "step": 7480 + }, + { + "ce_loss_12": 3.1553537011146546, + "ce_loss_17": 2.9760383009910583, + "ce_loss_23": 2.906737780570984, + "ce_loss_3": 3.9737986087799073, + "ce_loss_6": 3.5447561144828796, + "epoch": 0.749, + "grad_norm": 960.0, + "kl_loss_12": 583.4796676635742, + "kl_loss_17": 145.632071685791, + "kl_loss_3": 2306.9868774414062, + "kl_loss_6": 1438.073828125, + "learning_rate": 0.00015039512565099468, + "loss": 1088.3311, + "step": 7490 + }, + { + "ce_loss_12": 3.2105521678924562, + "ce_loss_17": 3.0372775912284853, + "ce_loss_23": 2.9659903049468994, + "ce_loss_3": 4.011225068569184, + "ce_loss_6": 3.5905876040458677, + "epoch": 0.75, + "grad_norm": 1232.0, + "kl_loss_12": 577.4969635009766, + "kl_loss_17": 144.20574073791505, + "kl_loss_3": 2254.465222167969, + "kl_loss_6": 1404.108416748047, + "learning_rate": 0.00014926255614683932, + "loss": 1139.2535, + "step": 7500 + }, + { + "ce_loss_12": 3.156857895851135, + "ce_loss_17": 2.9743181586265566, + "ce_loss_23": 2.9030750036239623, + "ce_loss_3": 3.9559382677078245, + "ce_loss_6": 3.524167549610138, + "epoch": 0.751, + "grad_norm": 1240.0, + "kl_loss_12": 573.277944946289, + "kl_loss_17": 143.0907730102539, + "kl_loss_3": 2269.5970092773437, + "kl_loss_6": 1393.0914978027345, + "learning_rate": 0.0001481335185648498, + "loss": 1106.7481, + "step": 7510 + }, + { + "ce_loss_12": 3.1793925046920775, + "ce_loss_17": 2.9973050355911255, + "ce_loss_23": 2.927828311920166, + "ce_loss_3": 3.9792739033699034, + "ce_loss_6": 3.556736075878143, + "epoch": 0.752, + "grad_norm": 1296.0, + "kl_loss_12": 577.0787658691406, + "kl_loss_17": 143.2110652923584, + "kl_loss_3": 2254.5094604492188, + "kl_loss_6": 1404.6462707519531, + "learning_rate": 0.0001470080242744218, + "loss": 1089.9562, + "step": 7520 + }, + { + "ce_loss_12": 3.1780881762504576, + "ce_loss_17": 2.9967373251914977, + "ce_loss_23": 2.9308495044708254, + "ce_loss_3": 3.989187252521515, + "ce_loss_6": 3.5582465291023255, + "epoch": 0.753, + "grad_norm": 1080.0, + "kl_loss_12": 566.506803894043, + "kl_loss_17": 140.71898460388184, + "kl_loss_3": 2270.319207763672, + "kl_loss_6": 1415.9373596191406, + "learning_rate": 0.0001458860846092705, + "loss": 1108.6918, + "step": 7530 + }, + { + "ce_loss_12": 3.212061035633087, + "ce_loss_17": 3.037975001335144, + "ce_loss_23": 2.967976760864258, + "ce_loss_3": 3.991036283969879, + "ce_loss_6": 3.5845932602882384, + "epoch": 0.754, + "grad_norm": 1472.0, + "kl_loss_12": 566.121485900879, + "kl_loss_17": 143.4044864654541, + "kl_loss_3": 2191.715924072266, + "kl_loss_6": 1378.2916015625, + "learning_rate": 0.00014476771086731566, + "loss": 1072.3135, + "step": 7540 + }, + { + "ce_loss_12": 3.301655721664429, + "ce_loss_17": 3.1161754846572878, + "ce_loss_23": 3.040296173095703, + "ce_loss_3": 4.090375530719757, + "ce_loss_6": 3.6660074591636658, + "epoch": 0.755, + "grad_norm": 1080.0, + "kl_loss_12": 585.9059158325196, + "kl_loss_17": 152.66415138244628, + "kl_loss_3": 2234.569793701172, + "kl_loss_6": 1399.7505004882812, + "learning_rate": 0.00014365291431056872, + "loss": 1124.2975, + "step": 7550 + }, + { + "ce_loss_12": 3.1535741806030275, + "ce_loss_17": 2.96162930727005, + "ce_loss_23": 2.8849114894866945, + "ce_loss_3": 3.971159243583679, + "ce_loss_6": 3.534115183353424, + "epoch": 0.756, + "grad_norm": 1296.0, + "kl_loss_12": 602.309245300293, + "kl_loss_17": 150.6013381958008, + "kl_loss_3": 2330.9420532226563, + "kl_loss_6": 1448.74794921875, + "learning_rate": 0.00014254170616501827, + "loss": 1121.1361, + "step": 7560 + }, + { + "ce_loss_12": 3.10304309129715, + "ce_loss_17": 2.9030748844146728, + "ce_loss_23": 2.8271848559379578, + "ce_loss_3": 3.959608733654022, + "ce_loss_6": 3.5100831270217894, + "epoch": 0.757, + "grad_norm": 1496.0, + "kl_loss_12": 618.4882049560547, + "kl_loss_17": 151.1617515563965, + "kl_loss_3": 2398.7765869140626, + "kl_loss_6": 1508.4453002929688, + "learning_rate": 0.0001414340976205183, + "loss": 1158.8504, + "step": 7570 + }, + { + "ce_loss_12": 3.1013665199279785, + "ce_loss_17": 2.9130101442337035, + "ce_loss_23": 2.8398788452148436, + "ce_loss_3": 3.9428343892097475, + "ce_loss_6": 3.497762608528137, + "epoch": 0.758, + "grad_norm": 1296.0, + "kl_loss_12": 583.5838302612304, + "kl_loss_17": 144.6000778198242, + "kl_loss_3": 2343.6672424316407, + "kl_loss_6": 1444.6453247070312, + "learning_rate": 0.00014033009983067452, + "loss": 1113.1201, + "step": 7580 + }, + { + "ce_loss_12": 3.2369961500167848, + "ce_loss_17": 3.0648522853851317, + "ce_loss_23": 2.99524964094162, + "ce_loss_3": 4.009769260883331, + "ce_loss_6": 3.5936578631401064, + "epoch": 0.759, + "grad_norm": 1004.0, + "kl_loss_12": 562.3574142456055, + "kl_loss_17": 140.76032257080078, + "kl_loss_3": 2182.551416015625, + "kl_loss_6": 1360.4345947265624, + "learning_rate": 0.00013922972391273224, + "loss": 1085.3129, + "step": 7590 + }, + { + "ce_loss_12": 3.241999626159668, + "ce_loss_17": 3.066638720035553, + "ce_loss_23": 2.9950135946273804, + "ce_loss_3": 4.057851004600525, + "ce_loss_6": 3.6200713872909547, + "epoch": 0.76, + "grad_norm": 1480.0, + "kl_loss_12": 571.0613189697266, + "kl_loss_17": 145.2515625, + "kl_loss_3": 2271.0092163085938, + "kl_loss_6": 1395.415838623047, + "learning_rate": 0.0001381329809474649, + "loss": 1105.9053, + "step": 7600 + }, + { + "ce_loss_12": 3.169899654388428, + "ce_loss_17": 2.978970980644226, + "ce_loss_23": 2.9027320623397825, + "ce_loss_3": 4.01324679851532, + "ce_loss_6": 3.568336808681488, + "epoch": 0.761, + "grad_norm": 1512.0, + "kl_loss_12": 606.602799987793, + "kl_loss_17": 149.62340240478517, + "kl_loss_3": 2372.2795593261717, + "kl_loss_6": 1475.2612731933593, + "learning_rate": 0.0001370398819790621, + "loss": 1135.6359, + "step": 7610 + }, + { + "ce_loss_12": 3.278676617145538, + "ce_loss_17": 3.1012438535690308, + "ce_loss_23": 3.0304234266281127, + "ce_loss_3": 4.068235874176025, + "ce_loss_6": 3.648307228088379, + "epoch": 0.762, + "grad_norm": 2128.0, + "kl_loss_12": 571.0301834106446, + "kl_loss_17": 144.67262840270996, + "kl_loss_3": 2207.5101623535156, + "kl_loss_6": 1374.4108947753907, + "learning_rate": 0.00013595043801501794, + "loss": 1080.6941, + "step": 7620 + }, + { + "ce_loss_12": 3.108250892162323, + "ce_loss_17": 2.917290711402893, + "ce_loss_23": 2.841147005558014, + "ce_loss_3": 3.989256405830383, + "ce_loss_6": 3.525123429298401, + "epoch": 0.763, + "grad_norm": 2176.0, + "kl_loss_12": 597.8586959838867, + "kl_loss_17": 149.18791084289552, + "kl_loss_3": 2424.509356689453, + "kl_loss_6": 1504.0890197753906, + "learning_rate": 0.00013486466002602133, + "loss": 1135.434, + "step": 7630 + }, + { + "ce_loss_12": 3.193066394329071, + "ce_loss_17": 3.0153016090393066, + "ce_loss_23": 2.9443026423454284, + "ce_loss_3": 3.9735458493232727, + "ce_loss_6": 3.563677763938904, + "epoch": 0.764, + "grad_norm": 1752.0, + "kl_loss_12": 569.2323852539063, + "kl_loss_17": 142.40617179870605, + "kl_loss_3": 2224.7341735839846, + "kl_loss_6": 1391.6013488769531, + "learning_rate": 0.00013378255894584462, + "loss": 1120.4004, + "step": 7640 + }, + { + "ce_loss_12": 3.1473358988761904, + "ce_loss_17": 2.9575576305389406, + "ce_loss_23": 2.8846905708312987, + "ce_loss_3": 3.982984387874603, + "ce_loss_6": 3.537565624713898, + "epoch": 0.765, + "grad_norm": 1680.0, + "kl_loss_12": 591.0188079833985, + "kl_loss_17": 147.65668258666992, + "kl_loss_3": 2337.673974609375, + "kl_loss_6": 1449.8600463867188, + "learning_rate": 0.0001327041456712334, + "loss": 1126.1551, + "step": 7650 + }, + { + "ce_loss_12": 3.1835631370544433, + "ce_loss_17": 3.0017666697502134, + "ce_loss_23": 2.9280351758003236, + "ce_loss_3": 3.9966321110725405, + "ce_loss_6": 3.5685526967048644, + "epoch": 0.766, + "grad_norm": 1416.0, + "kl_loss_12": 586.9018341064453, + "kl_loss_17": 146.2630973815918, + "kl_loss_3": 2289.165563964844, + "kl_loss_6": 1428.6635864257812, + "learning_rate": 0.00013162943106179747, + "loss": 1120.1879, + "step": 7660 + }, + { + "ce_loss_12": 3.153433895111084, + "ce_loss_17": 2.9747941851615907, + "ce_loss_23": 2.905145025253296, + "ce_loss_3": 3.9545072674751283, + "ce_loss_6": 3.537098562717438, + "epoch": 0.767, + "grad_norm": 1152.0, + "kl_loss_12": 575.4893569946289, + "kl_loss_17": 143.7412311553955, + "kl_loss_3": 2260.8550231933596, + "kl_loss_6": 1416.2470336914062, + "learning_rate": 0.00013055842593990132, + "loss": 1103.8006, + "step": 7670 + }, + { + "ce_loss_12": 3.1097405552864075, + "ce_loss_17": 2.927348804473877, + "ce_loss_23": 2.858966362476349, + "ce_loss_3": 3.9165873646736147, + "ce_loss_6": 3.4887708902359007, + "epoch": 0.768, + "grad_norm": 1240.0, + "kl_loss_12": 570.0885345458985, + "kl_loss_17": 141.93867988586425, + "kl_loss_3": 2243.9757446289063, + "kl_loss_6": 1392.8080627441407, + "learning_rate": 0.00012949114109055414, + "loss": 1117.4389, + "step": 7680 + }, + { + "ce_loss_12": 3.1493123054504393, + "ce_loss_17": 2.9641464948654175, + "ce_loss_23": 2.892187166213989, + "ce_loss_3": 3.962111532688141, + "ce_loss_6": 3.539284372329712, + "epoch": 0.769, + "grad_norm": 1200.0, + "kl_loss_12": 589.2413375854492, + "kl_loss_17": 146.9356632232666, + "kl_loss_3": 2291.6872436523436, + "kl_loss_6": 1432.8521789550782, + "learning_rate": 0.00012842758726130281, + "loss": 1126.0098, + "step": 7690 + }, + { + "ce_loss_12": 3.2033556580543516, + "ce_loss_17": 3.0117141604423523, + "ce_loss_23": 2.9377166032791138, + "ce_loss_3": 4.038209271430969, + "ce_loss_6": 3.5961333394050596, + "epoch": 0.77, + "grad_norm": 1600.0, + "kl_loss_12": 597.1304321289062, + "kl_loss_17": 147.96473426818847, + "kl_loss_3": 2323.345587158203, + "kl_loss_6": 1451.0074890136718, + "learning_rate": 0.00012736777516212267, + "loss": 1111.0631, + "step": 7700 + }, + { + "ce_loss_12": 3.1941524147987366, + "ce_loss_17": 3.009080731868744, + "ce_loss_23": 2.9352129101753235, + "ce_loss_3": 4.007384192943573, + "ce_loss_6": 3.578681743144989, + "epoch": 0.771, + "grad_norm": 1888.0, + "kl_loss_12": 595.233056640625, + "kl_loss_17": 147.88577880859376, + "kl_loss_3": 2300.504150390625, + "kl_loss_6": 1432.0664123535157, + "learning_rate": 0.00012631171546530968, + "loss": 1102.1405, + "step": 7710 + }, + { + "ce_loss_12": 3.202558135986328, + "ce_loss_17": 3.014161264896393, + "ce_loss_23": 2.939709556102753, + "ce_loss_3": 4.0079020142555235, + "ce_loss_6": 3.5834249019622804, + "epoch": 0.772, + "grad_norm": 1264.0, + "kl_loss_12": 593.998779296875, + "kl_loss_17": 148.89095573425294, + "kl_loss_3": 2292.7206176757813, + "kl_loss_6": 1440.4880065917969, + "learning_rate": 0.00012525941880537307, + "loss": 1129.073, + "step": 7720 + }, + { + "ce_loss_12": 3.226823461055756, + "ce_loss_17": 3.046484351158142, + "ce_loss_23": 2.9750453352928163, + "ce_loss_3": 4.030374526977539, + "ce_loss_6": 3.6044886827468874, + "epoch": 0.773, + "grad_norm": 1768.0, + "kl_loss_12": 578.7604827880859, + "kl_loss_17": 143.64957084655762, + "kl_loss_3": 2262.451580810547, + "kl_loss_6": 1409.8765258789062, + "learning_rate": 0.00012421089577892869, + "loss": 1104.2249, + "step": 7730 + }, + { + "ce_loss_12": 3.1875843524932863, + "ce_loss_17": 2.998730957508087, + "ce_loss_23": 2.926790475845337, + "ce_loss_3": 4.0098074078559875, + "ce_loss_6": 3.5727877974510194, + "epoch": 0.774, + "grad_norm": 1544.0, + "kl_loss_12": 592.0867767333984, + "kl_loss_17": 145.81984939575196, + "kl_loss_3": 2326.2348388671876, + "kl_loss_6": 1440.2564636230468, + "learning_rate": 0.0001231661569445919, + "loss": 1120.5097, + "step": 7740 + }, + { + "ce_loss_12": 3.0565276861190798, + "ce_loss_17": 2.872712802886963, + "ce_loss_23": 2.801181507110596, + "ce_loss_3": 3.8839349031448362, + "ce_loss_6": 3.443760085105896, + "epoch": 0.775, + "grad_norm": 1464.0, + "kl_loss_12": 583.5263763427735, + "kl_loss_17": 144.68877258300782, + "kl_loss_3": 2318.9962524414063, + "kl_loss_6": 1432.9780212402343, + "learning_rate": 0.00012212521282287093, + "loss": 1136.4404, + "step": 7750 + }, + { + "ce_loss_12": 3.193848180770874, + "ce_loss_17": 3.005165958404541, + "ce_loss_23": 2.93333922624588, + "ce_loss_3": 3.9853929042816163, + "ce_loss_6": 3.574661636352539, + "epoch": 0.776, + "grad_norm": 1392.0, + "kl_loss_12": 588.491716003418, + "kl_loss_17": 147.5692222595215, + "kl_loss_3": 2248.444256591797, + "kl_loss_6": 1412.8768005371094, + "learning_rate": 0.00012108807389606158, + "loss": 1125.0908, + "step": 7760 + }, + { + "ce_loss_12": 3.1853034496307373, + "ce_loss_17": 3.0101011633872985, + "ce_loss_23": 2.9394747376441956, + "ce_loss_3": 3.9992923855781557, + "ce_loss_6": 3.570225441455841, + "epoch": 0.777, + "grad_norm": 1768.0, + "kl_loss_12": 573.4608947753907, + "kl_loss_17": 141.5246440887451, + "kl_loss_3": 2273.6357299804686, + "kl_loss_6": 1414.4267822265624, + "learning_rate": 0.00012005475060814159, + "loss": 1103.6802, + "step": 7770 + }, + { + "ce_loss_12": 3.126719558238983, + "ce_loss_17": 2.943731892108917, + "ce_loss_23": 2.8726248383522033, + "ce_loss_3": 3.9623207807540894, + "ce_loss_6": 3.5243662714958193, + "epoch": 0.778, + "grad_norm": 1496.0, + "kl_loss_12": 591.7267028808594, + "kl_loss_17": 146.49259567260742, + "kl_loss_3": 2338.225732421875, + "kl_loss_6": 1456.5484680175782, + "learning_rate": 0.00011902525336466464, + "loss": 1125.9965, + "step": 7780 + }, + { + "ce_loss_12": 3.127916467189789, + "ce_loss_17": 2.933424413204193, + "ce_loss_23": 2.8582921385765077, + "ce_loss_3": 3.969379949569702, + "ce_loss_6": 3.522745895385742, + "epoch": 0.779, + "grad_norm": 1424.0, + "kl_loss_12": 601.2532257080078, + "kl_loss_17": 149.19645347595215, + "kl_loss_3": 2357.2268798828127, + "kl_loss_6": 1463.6623596191407, + "learning_rate": 0.00011799959253265668, + "loss": 1123.1512, + "step": 7790 + }, + { + "ce_loss_12": 3.1708114981651305, + "ce_loss_17": 2.9871245503425596, + "ce_loss_23": 2.916107642650604, + "ce_loss_3": 4.001013469696045, + "ce_loss_6": 3.560157287120819, + "epoch": 0.78, + "grad_norm": 1496.0, + "kl_loss_12": 590.2251251220703, + "kl_loss_17": 148.66517486572266, + "kl_loss_3": 2325.2627197265624, + "kl_loss_6": 1449.2373352050781, + "learning_rate": 0.00011697777844051105, + "loss": 1120.4944, + "step": 7800 + }, + { + "ce_loss_12": 3.1667218923568727, + "ce_loss_17": 2.980285406112671, + "ce_loss_23": 2.904732513427734, + "ce_loss_3": 4.023805069923401, + "ce_loss_6": 3.5702453374862673, + "epoch": 0.781, + "grad_norm": 1648.0, + "kl_loss_12": 590.4723083496094, + "kl_loss_17": 149.11718254089357, + "kl_loss_3": 2376.918103027344, + "kl_loss_6": 1475.6016967773437, + "learning_rate": 0.00011595982137788402, + "loss": 1133.0325, + "step": 7810 + }, + { + "ce_loss_12": 3.135530209541321, + "ce_loss_17": 2.960279083251953, + "ce_loss_23": 2.8913881301879885, + "ce_loss_3": 3.938090467453003, + "ce_loss_6": 3.510548102855682, + "epoch": 0.782, + "grad_norm": 1464.0, + "kl_loss_12": 570.5912612915039, + "kl_loss_17": 142.605171585083, + "kl_loss_3": 2234.8112365722654, + "kl_loss_6": 1390.4976623535156, + "learning_rate": 0.00011494573159559212, + "loss": 1103.2514, + "step": 7820 + }, + { + "ce_loss_12": 3.122647488117218, + "ce_loss_17": 2.943215036392212, + "ce_loss_23": 2.8696703553199767, + "ce_loss_3": 3.951231670379639, + "ce_loss_6": 3.5149996638298036, + "epoch": 0.783, + "grad_norm": 1032.0, + "kl_loss_12": 582.6005279541016, + "kl_loss_17": 146.78236503601073, + "kl_loss_3": 2295.499377441406, + "kl_loss_6": 1435.052508544922, + "learning_rate": 0.00011393551930550828, + "loss": 1132.6006, + "step": 7830 + }, + { + "ce_loss_12": 3.2461665868759155, + "ce_loss_17": 3.0698845982551575, + "ce_loss_23": 2.998344969749451, + "ce_loss_3": 4.034614992141724, + "ce_loss_6": 3.6171064019203185, + "epoch": 0.784, + "grad_norm": 1128.0, + "kl_loss_12": 579.5715423583985, + "kl_loss_17": 146.70325202941893, + "kl_loss_3": 2228.7357788085938, + "kl_loss_6": 1396.08271484375, + "learning_rate": 0.00011292919468045875, + "loss": 1099.162, + "step": 7840 + }, + { + "ce_loss_12": 3.207807648181915, + "ce_loss_17": 3.024890828132629, + "ce_loss_23": 2.9529799103736876, + "ce_loss_3": 4.01508207321167, + "ce_loss_6": 3.581845533847809, + "epoch": 0.785, + "grad_norm": 1024.0, + "kl_loss_12": 583.2962356567383, + "kl_loss_17": 145.15651016235353, + "kl_loss_3": 2266.380047607422, + "kl_loss_6": 1409.449639892578, + "learning_rate": 0.00011192676785412154, + "loss": 1098.5452, + "step": 7850 + }, + { + "ce_loss_12": 3.1559677720069885, + "ce_loss_17": 2.9682850360870363, + "ce_loss_23": 2.893582081794739, + "ce_loss_3": 4.001304793357849, + "ce_loss_6": 3.553962767124176, + "epoch": 0.786, + "grad_norm": 1264.0, + "kl_loss_12": 587.6689178466797, + "kl_loss_17": 148.94214668273926, + "kl_loss_3": 2338.846575927734, + "kl_loss_6": 1454.2427062988281, + "learning_rate": 0.00011092824892092374, + "loss": 1125.9066, + "step": 7860 + }, + { + "ce_loss_12": 3.0928285360336303, + "ce_loss_17": 2.9074668288230896, + "ce_loss_23": 2.83629949092865, + "ce_loss_3": 3.939790117740631, + "ce_loss_6": 3.4982715368270876, + "epoch": 0.787, + "grad_norm": 1440.0, + "kl_loss_12": 590.170278930664, + "kl_loss_17": 144.4903522491455, + "kl_loss_3": 2361.683386230469, + "kl_loss_6": 1468.2724670410157, + "learning_rate": 0.0001099336479359398, + "loss": 1114.6393, + "step": 7870 + }, + { + "ce_loss_12": 3.199421751499176, + "ce_loss_17": 3.024543786048889, + "ce_loss_23": 2.953835356235504, + "ce_loss_3": 3.9906187534332274, + "ce_loss_6": 3.57261518239975, + "epoch": 0.788, + "grad_norm": 1560.0, + "kl_loss_12": 574.4062911987305, + "kl_loss_17": 143.5504535675049, + "kl_loss_3": 2242.3897094726562, + "kl_loss_6": 1404.6826171875, + "learning_rate": 0.00010894297491479043, + "loss": 1106.2311, + "step": 7880 + }, + { + "ce_loss_12": 3.187730407714844, + "ce_loss_17": 3.0060059309005736, + "ce_loss_23": 2.936438286304474, + "ce_loss_3": 3.9977443695068358, + "ce_loss_6": 3.568432259559631, + "epoch": 0.789, + "grad_norm": 1224.0, + "kl_loss_12": 581.9048828125, + "kl_loss_17": 143.8780460357666, + "kl_loss_3": 2277.025604248047, + "kl_loss_6": 1424.236395263672, + "learning_rate": 0.00010795623983354214, + "loss": 1099.8393, + "step": 7890 + }, + { + "ce_loss_12": 3.10813227891922, + "ce_loss_17": 2.916833019256592, + "ce_loss_23": 2.8418298721313477, + "ce_loss_3": 3.9271308898925783, + "ce_loss_6": 3.4919618248939512, + "epoch": 0.79, + "grad_norm": 1488.0, + "kl_loss_12": 597.6287445068359, + "kl_loss_17": 150.24290161132814, + "kl_loss_3": 2335.408770751953, + "kl_loss_6": 1458.048272705078, + "learning_rate": 0.00010697345262860636, + "loss": 1118.3721, + "step": 7900 + }, + { + "ce_loss_12": 3.2186970472335816, + "ce_loss_17": 3.0451616168022158, + "ce_loss_23": 2.976191544532776, + "ce_loss_3": 4.004420411586762, + "ce_loss_6": 3.581884229183197, + "epoch": 0.791, + "grad_norm": 1064.0, + "kl_loss_12": 576.2975189208985, + "kl_loss_17": 143.90794258117677, + "kl_loss_3": 2230.136193847656, + "kl_loss_6": 1375.139794921875, + "learning_rate": 0.00010599462319663906, + "loss": 1085.8157, + "step": 7910 + }, + { + "ce_loss_12": 3.1901622653007506, + "ce_loss_17": 3.015137493610382, + "ce_loss_23": 2.9455866694450377, + "ce_loss_3": 3.972465181350708, + "ce_loss_6": 3.556162619590759, + "epoch": 0.792, + "grad_norm": 1288.0, + "kl_loss_12": 566.1257873535156, + "kl_loss_17": 142.95177536010743, + "kl_loss_3": 2209.9227783203123, + "kl_loss_6": 1372.5943603515625, + "learning_rate": 0.00010501976139444191, + "loss": 1080.5205, + "step": 7920 + }, + { + "ce_loss_12": 3.2104328751564024, + "ce_loss_17": 3.034326171875, + "ce_loss_23": 2.9652673244476317, + "ce_loss_3": 4.009120631217956, + "ce_loss_6": 3.5926668524742125, + "epoch": 0.793, + "grad_norm": 1712.0, + "kl_loss_12": 566.6971725463867, + "kl_loss_17": 143.57656860351562, + "kl_loss_3": 2236.0341735839843, + "kl_loss_6": 1402.0322265625, + "learning_rate": 0.0001040488770388625, + "loss": 1109.2703, + "step": 7930 + }, + { + "ce_loss_12": 3.173999750614166, + "ce_loss_17": 2.99107563495636, + "ce_loss_23": 2.9221983909606934, + "ce_loss_3": 3.990390157699585, + "ce_loss_6": 3.553859841823578, + "epoch": 0.794, + "grad_norm": 1376.0, + "kl_loss_12": 588.7955169677734, + "kl_loss_17": 144.7465305328369, + "kl_loss_3": 2311.021765136719, + "kl_loss_6": 1438.6768127441405, + "learning_rate": 0.00010308197990669538, + "loss": 1111.9271, + "step": 7940 + }, + { + "ce_loss_12": 3.2868431329727175, + "ce_loss_17": 3.1038758635520933, + "ce_loss_23": 3.028200459480286, + "ce_loss_3": 4.08767261505127, + "ce_loss_6": 3.662339985370636, + "epoch": 0.795, + "grad_norm": 1176.0, + "kl_loss_12": 589.1928344726563, + "kl_loss_17": 149.45320930480958, + "kl_loss_3": 2279.2579650878906, + "kl_loss_6": 1420.1681518554688, + "learning_rate": 0.0001021190797345839, + "loss": 1101.1299, + "step": 7950 + }, + { + "ce_loss_12": 3.038166892528534, + "ce_loss_17": 2.844325542449951, + "ce_loss_23": 2.7677196502685546, + "ce_loss_3": 3.8836674213409426, + "ce_loss_6": 3.4468006014823915, + "epoch": 0.796, + "grad_norm": 1216.0, + "kl_loss_12": 608.7845748901367, + "kl_loss_17": 150.89987487792968, + "kl_loss_3": 2379.402459716797, + "kl_loss_6": 1489.8852416992188, + "learning_rate": 0.00010116018621892236, + "loss": 1130.7251, + "step": 7960 + }, + { + "ce_loss_12": 3.2337188839912416, + "ce_loss_17": 3.0451438426971436, + "ce_loss_23": 2.971688520908356, + "ce_loss_3": 4.049557948112488, + "ce_loss_6": 3.6113546967506407, + "epoch": 0.797, + "grad_norm": 1640.0, + "kl_loss_12": 609.7105682373046, + "kl_loss_17": 154.39617004394532, + "kl_loss_3": 2322.890661621094, + "kl_loss_6": 1448.7145568847657, + "learning_rate": 0.00010020530901575753, + "loss": 1102.8385, + "step": 7970 + }, + { + "ce_loss_12": 3.2448107600212097, + "ce_loss_17": 3.0599387168884276, + "ce_loss_23": 2.989718186855316, + "ce_loss_3": 4.044394028186798, + "ce_loss_6": 3.6220335364341736, + "epoch": 0.798, + "grad_norm": 1104.0, + "kl_loss_12": 588.3841613769531, + "kl_loss_17": 146.60692825317383, + "kl_loss_3": 2274.386651611328, + "kl_loss_6": 1414.5964416503907, + "learning_rate": 9.925445774069231e-05, + "loss": 1089.287, + "step": 7980 + }, + { + "ce_loss_12": 3.2004425048828127, + "ce_loss_17": 3.016433036327362, + "ce_loss_23": 2.9427908420562745, + "ce_loss_3": 4.0063443899154665, + "ce_loss_6": 3.581237316131592, + "epoch": 0.799, + "grad_norm": 1224.0, + "kl_loss_12": 578.0600479125976, + "kl_loss_17": 145.65839042663575, + "kl_loss_3": 2246.4029907226563, + "kl_loss_6": 1404.698797607422, + "learning_rate": 9.830764196878872e-05, + "loss": 1078.946, + "step": 7990 + }, + { + "ce_loss_12": 3.1517406582832335, + "ce_loss_17": 2.967311954498291, + "ce_loss_23": 2.897081661224365, + "ce_loss_3": 3.969217538833618, + "ce_loss_6": 3.530978775024414, + "epoch": 0.8, + "grad_norm": 1184.0, + "kl_loss_12": 583.9973236083985, + "kl_loss_17": 143.61578063964845, + "kl_loss_3": 2322.5286376953127, + "kl_loss_6": 1439.1912536621094, + "learning_rate": 9.736487123447069e-05, + "loss": 1113.2518, + "step": 8000 + }, + { + "ce_loss_12": 3.108854520320892, + "ce_loss_17": 2.924087369441986, + "ce_loss_23": 2.8529945492744444, + "ce_loss_3": 3.980188012123108, + "ce_loss_6": 3.537500786781311, + "epoch": 0.801, + "grad_norm": 1344.0, + "kl_loss_12": 595.7058074951171, + "kl_loss_17": 147.8631378173828, + "kl_loss_3": 2427.1471618652345, + "kl_loss_6": 1536.2865356445313, + "learning_rate": 9.642615503142926e-05, + "loss": 1147.1516, + "step": 8010 + }, + { + "ce_loss_12": 3.166256773471832, + "ce_loss_17": 2.9796546459198, + "ce_loss_23": 2.910013258457184, + "ce_loss_3": 3.989797592163086, + "ce_loss_6": 3.5497280240058897, + "epoch": 0.802, + "grad_norm": 1264.0, + "kl_loss_12": 579.2446594238281, + "kl_loss_17": 145.01176528930665, + "kl_loss_3": 2317.938995361328, + "kl_loss_6": 1423.7095642089844, + "learning_rate": 9.549150281252633e-05, + "loss": 1100.4728, + "step": 8020 + }, + { + "ce_loss_12": 3.1893199682235718, + "ce_loss_17": 3.0053165555000305, + "ce_loss_23": 2.932640087604523, + "ce_loss_3": 4.004192996025085, + "ce_loss_6": 3.5693791270256043, + "epoch": 0.803, + "grad_norm": 1352.0, + "kl_loss_12": 586.4850799560547, + "kl_loss_17": 146.8351272583008, + "kl_loss_3": 2301.4699523925783, + "kl_loss_6": 1417.170965576172, + "learning_rate": 9.4560923989699e-05, + "loss": 1125.9923, + "step": 8030 + }, + { + "ce_loss_12": 3.181557857990265, + "ce_loss_17": 2.99809535741806, + "ce_loss_23": 2.9256274342536925, + "ce_loss_3": 3.9848276019096374, + "ce_loss_6": 3.5591329097747804, + "epoch": 0.804, + "grad_norm": 1328.0, + "kl_loss_12": 583.3985107421875, + "kl_loss_17": 146.57659759521485, + "kl_loss_3": 2264.577001953125, + "kl_loss_6": 1412.1575256347655, + "learning_rate": 9.363442793386607e-05, + "loss": 1128.8498, + "step": 8040 + }, + { + "ce_loss_12": 3.1663032174110413, + "ce_loss_17": 2.9711507201194762, + "ce_loss_23": 2.89576895236969, + "ce_loss_3": 4.003983414173126, + "ce_loss_6": 3.563196539878845, + "epoch": 0.805, + "grad_norm": 1736.0, + "kl_loss_12": 603.7290191650391, + "kl_loss_17": 149.50324897766114, + "kl_loss_3": 2350.3124877929686, + "kl_loss_6": 1470.9401794433593, + "learning_rate": 9.271202397483213e-05, + "loss": 1107.8284, + "step": 8050 + }, + { + "ce_loss_12": 3.1769514799118044, + "ce_loss_17": 3.001621627807617, + "ce_loss_23": 2.9336405396461487, + "ce_loss_3": 3.970787966251373, + "ce_loss_6": 3.5444902658462523, + "epoch": 0.806, + "grad_norm": 1664.0, + "kl_loss_12": 570.4025726318359, + "kl_loss_17": 142.5333168029785, + "kl_loss_3": 2242.686627197266, + "kl_loss_6": 1388.724920654297, + "learning_rate": 9.179372140119524e-05, + "loss": 1110.6841, + "step": 8060 + }, + { + "ce_loss_12": 3.1296725630760194, + "ce_loss_17": 2.948356294631958, + "ce_loss_23": 2.8794933319091798, + "ce_loss_3": 3.9368799805641173, + "ce_loss_6": 3.5070855975151063, + "epoch": 0.807, + "grad_norm": 1320.0, + "kl_loss_12": 571.2600555419922, + "kl_loss_17": 143.2032382965088, + "kl_loss_3": 2268.2776000976564, + "kl_loss_6": 1407.6453186035155, + "learning_rate": 9.087952946025175e-05, + "loss": 1119.4178, + "step": 8070 + }, + { + "ce_loss_12": 3.2234219908714294, + "ce_loss_17": 3.0531278967857363, + "ce_loss_23": 2.982680094242096, + "ce_loss_3": 3.997486340999603, + "ce_loss_6": 3.582521307468414, + "epoch": 0.808, + "grad_norm": 1592.0, + "kl_loss_12": 557.1702667236328, + "kl_loss_17": 140.71428451538085, + "kl_loss_3": 2192.4074279785154, + "kl_loss_6": 1356.8794982910156, + "learning_rate": 8.996945735790446e-05, + "loss": 1100.1039, + "step": 8080 + }, + { + "ce_loss_12": 3.1345752835273744, + "ce_loss_17": 2.9529730439186097, + "ce_loss_23": 2.8853927135467528, + "ce_loss_3": 3.9299724102020264, + "ce_loss_6": 3.5153716087341307, + "epoch": 0.809, + "grad_norm": 1584.0, + "kl_loss_12": 578.4022567749023, + "kl_loss_17": 142.59668655395507, + "kl_loss_3": 2262.134716796875, + "kl_loss_6": 1423.9153991699218, + "learning_rate": 8.906351425856951e-05, + "loss": 1113.9498, + "step": 8090 + }, + { + "ce_loss_12": 3.122279465198517, + "ce_loss_17": 2.938877081871033, + "ce_loss_23": 2.8691246151924132, + "ce_loss_3": 3.953269875049591, + "ce_loss_6": 3.51852263212204, + "epoch": 0.81, + "grad_norm": 1712.0, + "kl_loss_12": 585.718653869629, + "kl_loss_17": 144.48522872924804, + "kl_loss_3": 2343.4195922851563, + "kl_loss_6": 1460.1082458496094, + "learning_rate": 8.816170928508365e-05, + "loss": 1131.4787, + "step": 8100 + }, + { + "ce_loss_12": 3.0904855251312258, + "ce_loss_17": 2.9062983989715576, + "ce_loss_23": 2.8338136434555055, + "ce_loss_3": 3.94469074010849, + "ce_loss_6": 3.4927369713783265, + "epoch": 0.811, + "grad_norm": 1192.0, + "kl_loss_12": 592.2267547607422, + "kl_loss_17": 145.41789321899415, + "kl_loss_3": 2381.3392028808594, + "kl_loss_6": 1464.616064453125, + "learning_rate": 8.7264051518613e-05, + "loss": 1122.3471, + "step": 8110 + }, + { + "ce_loss_12": 3.1645127296447755, + "ce_loss_17": 2.9843564867973327, + "ce_loss_23": 2.915467309951782, + "ce_loss_3": 3.9609650492668154, + "ce_loss_6": 3.5383892178535463, + "epoch": 0.812, + "grad_norm": 1440.0, + "kl_loss_12": 567.5652648925782, + "kl_loss_17": 140.82563972473145, + "kl_loss_3": 2232.4432006835937, + "kl_loss_6": 1387.62041015625, + "learning_rate": 8.637054999856148e-05, + "loss": 1101.3601, + "step": 8120 + }, + { + "ce_loss_12": 3.1670919060707092, + "ce_loss_17": 2.9802419543266296, + "ce_loss_23": 2.9051790118217466, + "ce_loss_3": 3.9901094317436216, + "ce_loss_6": 3.552306079864502, + "epoch": 0.813, + "grad_norm": 1080.0, + "kl_loss_12": 582.9289855957031, + "kl_loss_17": 146.94060516357422, + "kl_loss_3": 2298.3286865234377, + "kl_loss_6": 1424.87265625, + "learning_rate": 8.548121372247918e-05, + "loss": 1125.2434, + "step": 8130 + }, + { + "ce_loss_12": 3.2213086128234862, + "ce_loss_17": 3.049425995349884, + "ce_loss_23": 2.9792168021202086, + "ce_loss_3": 4.012645065784454, + "ce_loss_6": 3.588515615463257, + "epoch": 0.814, + "grad_norm": 1528.0, + "kl_loss_12": 571.1836853027344, + "kl_loss_17": 144.43416442871094, + "kl_loss_3": 2263.109686279297, + "kl_loss_6": 1401.958349609375, + "learning_rate": 8.459605164597267e-05, + "loss": 1098.2332, + "step": 8140 + }, + { + "ce_loss_12": 3.1174206852912905, + "ce_loss_17": 2.9347835898399355, + "ce_loss_23": 2.8660162448883058, + "ce_loss_3": 3.939826858043671, + "ce_loss_6": 3.5120959639549256, + "epoch": 0.815, + "grad_norm": 1160.0, + "kl_loss_12": 577.2853576660157, + "kl_loss_17": 142.1588333129883, + "kl_loss_3": 2292.7109558105467, + "kl_loss_6": 1427.1336730957032, + "learning_rate": 8.371507268261436e-05, + "loss": 1120.7432, + "step": 8150 + }, + { + "ce_loss_12": 3.1832391023635864, + "ce_loss_17": 3.0008301734924316, + "ce_loss_23": 2.927248704433441, + "ce_loss_3": 3.9983168244361877, + "ce_loss_6": 3.5687580823898317, + "epoch": 0.816, + "grad_norm": 1104.0, + "kl_loss_12": 578.5268798828125, + "kl_loss_17": 144.10266876220703, + "kl_loss_3": 2272.6112060546875, + "kl_loss_6": 1411.9078552246094, + "learning_rate": 8.283828570385238e-05, + "loss": 1085.3471, + "step": 8160 + }, + { + "ce_loss_12": 3.1839546084403993, + "ce_loss_17": 3.002742660045624, + "ce_loss_23": 2.9315690994262695, + "ce_loss_3": 3.993039095401764, + "ce_loss_6": 3.5616119384765623, + "epoch": 0.817, + "grad_norm": 1208.0, + "kl_loss_12": 576.1334503173828, + "kl_loss_17": 145.7689308166504, + "kl_loss_3": 2252.5451782226564, + "kl_loss_6": 1399.2629638671874, + "learning_rate": 8.196569953892202e-05, + "loss": 1100.5814, + "step": 8170 + }, + { + "ce_loss_12": 3.1222561597824097, + "ce_loss_17": 2.934907341003418, + "ce_loss_23": 2.8655828475952148, + "ce_loss_3": 3.9398762702941896, + "ce_loss_6": 3.5047176122665404, + "epoch": 0.818, + "grad_norm": 1336.0, + "kl_loss_12": 590.4174057006836, + "kl_loss_17": 145.5976676940918, + "kl_loss_3": 2284.329034423828, + "kl_loss_6": 1428.17099609375, + "learning_rate": 8.109732297475635e-05, + "loss": 1102.7464, + "step": 8180 + }, + { + "ce_loss_12": 3.108860397338867, + "ce_loss_17": 2.9075496077537535, + "ce_loss_23": 2.833207094669342, + "ce_loss_3": 3.978651523590088, + "ce_loss_6": 3.52808883190155, + "epoch": 0.819, + "grad_norm": 1384.0, + "kl_loss_12": 609.459033203125, + "kl_loss_17": 149.10407562255858, + "kl_loss_3": 2399.7066040039062, + "kl_loss_6": 1502.5516845703125, + "learning_rate": 8.023316475589754e-05, + "loss": 1143.5553, + "step": 8190 + }, + { + "ce_loss_12": 3.0797939777374266, + "ce_loss_17": 2.8759528040885924, + "ce_loss_23": 2.7991921067237855, + "ce_loss_3": 3.976499283313751, + "ce_loss_6": 3.499998915195465, + "epoch": 0.82, + "grad_norm": 1504.0, + "kl_loss_12": 621.8446380615235, + "kl_loss_17": 155.72899703979493, + "kl_loss_3": 2481.224475097656, + "kl_loss_6": 1543.8018615722656, + "learning_rate": 7.937323358440934e-05, + "loss": 1164.697, + "step": 8200 + }, + { + "ce_loss_12": 3.163694477081299, + "ce_loss_17": 2.9910318493843078, + "ce_loss_23": 2.923853099346161, + "ce_loss_3": 3.9446559071540834, + "ce_loss_6": 3.5332563638687136, + "epoch": 0.821, + "grad_norm": 1280.0, + "kl_loss_12": 569.1499160766601, + "kl_loss_17": 141.1395122528076, + "kl_loss_3": 2207.359051513672, + "kl_loss_6": 1381.9457214355468, + "learning_rate": 7.851753811978923e-05, + "loss": 1094.5141, + "step": 8210 + }, + { + "ce_loss_12": 3.189778006076813, + "ce_loss_17": 3.005267596244812, + "ce_loss_23": 2.9345471978187563, + "ce_loss_3": 4.013755440711975, + "ce_loss_6": 3.577826511859894, + "epoch": 0.822, + "grad_norm": 1096.0, + "kl_loss_12": 582.4062759399415, + "kl_loss_17": 145.88867416381837, + "kl_loss_3": 2304.8142822265627, + "kl_loss_6": 1445.2988586425781, + "learning_rate": 7.766608697888095e-05, + "loss": 1104.6173, + "step": 8220 + }, + { + "ce_loss_12": 3.2018064975738527, + "ce_loss_17": 3.015978515148163, + "ce_loss_23": 2.943447482585907, + "ce_loss_3": 4.017671203613281, + "ce_loss_6": 3.579065716266632, + "epoch": 0.823, + "grad_norm": 1128.0, + "kl_loss_12": 591.9320648193359, + "kl_loss_17": 147.93717842102052, + "kl_loss_3": 2317.0777893066406, + "kl_loss_6": 1440.9440124511718, + "learning_rate": 7.681888873578785e-05, + "loss": 1130.5294, + "step": 8230 + }, + { + "ce_loss_12": 3.139638078212738, + "ce_loss_17": 2.944541811943054, + "ce_loss_23": 2.868843162059784, + "ce_loss_3": 3.9680713653564452, + "ce_loss_6": 3.5303648948669433, + "epoch": 0.824, + "grad_norm": 1192.0, + "kl_loss_12": 600.1248504638672, + "kl_loss_17": 151.06409645080566, + "kl_loss_3": 2342.7918701171875, + "kl_loss_6": 1459.1837097167968, + "learning_rate": 7.597595192178702e-05, + "loss": 1116.1597, + "step": 8240 + }, + { + "ce_loss_12": 3.142949867248535, + "ce_loss_17": 2.9508732080459597, + "ce_loss_23": 2.875047504901886, + "ce_loss_3": 3.9895151257514954, + "ce_loss_6": 3.5404590725898744, + "epoch": 0.825, + "grad_norm": 1608.0, + "kl_loss_12": 601.725863647461, + "kl_loss_17": 149.29254531860352, + "kl_loss_3": 2395.7788024902343, + "kl_loss_6": 1489.7219360351562, + "learning_rate": 7.513728502524286e-05, + "loss": 1143.15, + "step": 8250 + }, + { + "ce_loss_12": 3.1205203771591186, + "ce_loss_17": 2.9424642443656923, + "ce_loss_23": 2.8756930828094482, + "ce_loss_3": 3.9297197222709657, + "ce_loss_6": 3.494407081604004, + "epoch": 0.826, + "grad_norm": 976.0, + "kl_loss_12": 566.674186706543, + "kl_loss_17": 139.45149269104004, + "kl_loss_3": 2244.5056274414064, + "kl_loss_6": 1381.1739562988282, + "learning_rate": 7.430289649152156e-05, + "loss": 1108.4273, + "step": 8260 + }, + { + "ce_loss_12": 3.051215946674347, + "ce_loss_17": 2.857154166698456, + "ce_loss_23": 2.785940408706665, + "ce_loss_3": 3.9020636320114135, + "ce_loss_6": 3.4623627066612244, + "epoch": 0.827, + "grad_norm": 1240.0, + "kl_loss_12": 596.697541809082, + "kl_loss_17": 145.13237800598145, + "kl_loss_3": 2389.1919555664062, + "kl_loss_6": 1498.638653564453, + "learning_rate": 7.347279472290646e-05, + "loss": 1121.9785, + "step": 8270 + }, + { + "ce_loss_12": 3.171514904499054, + "ce_loss_17": 2.9911115646362303, + "ce_loss_23": 2.9191166639328, + "ce_loss_3": 3.9974769592285155, + "ce_loss_6": 3.562214803695679, + "epoch": 0.828, + "grad_norm": 1376.0, + "kl_loss_12": 586.6393783569335, + "kl_loss_17": 145.4167694091797, + "kl_loss_3": 2318.9116638183596, + "kl_loss_6": 1439.5349731445312, + "learning_rate": 7.264698807851328e-05, + "loss": 1125.8037, + "step": 8280 + }, + { + "ce_loss_12": 3.1453389286994935, + "ce_loss_17": 2.968476486206055, + "ce_loss_23": 2.9005788803100585, + "ce_loss_3": 3.94170058965683, + "ce_loss_6": 3.5211100339889527, + "epoch": 0.829, + "grad_norm": 1672.0, + "kl_loss_12": 571.3219741821289, + "kl_loss_17": 141.82576370239258, + "kl_loss_3": 2241.121954345703, + "kl_loss_6": 1391.2749145507812, + "learning_rate": 7.182548487420554e-05, + "loss": 1100.5468, + "step": 8290 + }, + { + "ce_loss_12": 3.1946922898292542, + "ce_loss_17": 3.012455129623413, + "ce_loss_23": 2.9442262768745424, + "ce_loss_3": 3.9918017983436584, + "ce_loss_6": 3.5774158477783202, + "epoch": 0.83, + "grad_norm": 1496.0, + "kl_loss_12": 588.873046875, + "kl_loss_17": 146.08888092041016, + "kl_loss_3": 2280.5714782714845, + "kl_loss_6": 1430.998712158203, + "learning_rate": 7.100829338251146e-05, + "loss": 1105.674, + "step": 8300 + }, + { + "ce_loss_12": 3.1336124777793883, + "ce_loss_17": 2.9436112999916078, + "ce_loss_23": 2.8674718499183656, + "ce_loss_3": 3.9784403085708617, + "ce_loss_6": 3.5349905490875244, + "epoch": 0.831, + "grad_norm": 1816.0, + "kl_loss_12": 595.4360076904297, + "kl_loss_17": 149.88540954589843, + "kl_loss_3": 2332.4470458984374, + "kl_loss_6": 1458.7423767089845, + "learning_rate": 7.019542183254046e-05, + "loss": 1110.8072, + "step": 8310 + }, + { + "ce_loss_12": 3.166594052314758, + "ce_loss_17": 2.982927882671356, + "ce_loss_23": 2.9061940908432007, + "ce_loss_3": 3.9700739979743958, + "ce_loss_6": 3.5450675249099732, + "epoch": 0.832, + "grad_norm": 1960.0, + "kl_loss_12": 595.8716079711915, + "kl_loss_17": 152.76154747009278, + "kl_loss_3": 2281.1293090820313, + "kl_loss_6": 1424.6028869628906, + "learning_rate": 6.938687840989971e-05, + "loss": 1110.1248, + "step": 8320 + }, + { + "ce_loss_12": 3.117111122608185, + "ce_loss_17": 2.9309823870658875, + "ce_loss_23": 2.8565820455551147, + "ce_loss_3": 3.9307853102684023, + "ce_loss_6": 3.504629969596863, + "epoch": 0.833, + "grad_norm": 1344.0, + "kl_loss_12": 591.458935546875, + "kl_loss_17": 148.20268898010255, + "kl_loss_3": 2284.296240234375, + "kl_loss_6": 1423.8598999023438, + "learning_rate": 6.858267125661271e-05, + "loss": 1124.1814, + "step": 8330 + }, + { + "ce_loss_12": 3.166881597042084, + "ce_loss_17": 2.9819628596305847, + "ce_loss_23": 2.9103707790374758, + "ce_loss_3": 3.9883024096488953, + "ce_loss_6": 3.557008695602417, + "epoch": 0.834, + "grad_norm": 1608.0, + "kl_loss_12": 579.8424926757813, + "kl_loss_17": 143.51333312988282, + "kl_loss_3": 2292.975341796875, + "kl_loss_6": 1430.6951477050782, + "learning_rate": 6.778280847103668e-05, + "loss": 1132.3985, + "step": 8340 + }, + { + "ce_loss_12": 3.1806155323982237, + "ce_loss_17": 2.988746666908264, + "ce_loss_23": 2.916960620880127, + "ce_loss_3": 3.9827450037002565, + "ce_loss_6": 3.5612888813018797, + "epoch": 0.835, + "grad_norm": 1192.0, + "kl_loss_12": 600.9124450683594, + "kl_loss_17": 148.11241302490234, + "kl_loss_3": 2297.2760620117188, + "kl_loss_6": 1448.2837646484375, + "learning_rate": 6.698729810778065e-05, + "loss": 1111.2381, + "step": 8350 + }, + { + "ce_loss_12": 3.094005656242371, + "ce_loss_17": 2.9059388995170594, + "ce_loss_23": 2.8351017355918886, + "ce_loss_3": 3.9261900424957275, + "ce_loss_6": 3.483336842060089, + "epoch": 0.836, + "grad_norm": 1808.0, + "kl_loss_12": 577.0751876831055, + "kl_loss_17": 141.79758338928224, + "kl_loss_3": 2304.9660217285154, + "kl_loss_6": 1416.2479858398438, + "learning_rate": 6.619614817762538e-05, + "loss": 1116.3715, + "step": 8360 + }, + { + "ce_loss_12": 3.076896774768829, + "ce_loss_17": 2.88525732755661, + "ce_loss_23": 2.8103936910629272, + "ce_loss_3": 3.944066882133484, + "ce_loss_6": 3.4942168831825255, + "epoch": 0.837, + "grad_norm": 1352.0, + "kl_loss_12": 597.8021301269531, + "kl_loss_17": 145.08125419616698, + "kl_loss_3": 2404.910748291016, + "kl_loss_6": 1493.6878356933594, + "learning_rate": 6.540936664744196e-05, + "loss": 1140.2389, + "step": 8370 + }, + { + "ce_loss_12": 3.1969696402549745, + "ce_loss_17": 3.012162458896637, + "ce_loss_23": 2.935665822029114, + "ce_loss_3": 4.026661574840546, + "ce_loss_6": 3.588542306423187, + "epoch": 0.838, + "grad_norm": 1112.0, + "kl_loss_12": 587.9847961425781, + "kl_loss_17": 146.612784576416, + "kl_loss_3": 2310.599072265625, + "kl_loss_6": 1440.4951171875, + "learning_rate": 6.462696144011149e-05, + "loss": 1104.9021, + "step": 8380 + }, + { + "ce_loss_12": 3.1555564284324644, + "ce_loss_17": 2.9739543437957763, + "ce_loss_23": 2.9015734672546385, + "ce_loss_3": 3.9582534790039063, + "ce_loss_6": 3.53470242023468, + "epoch": 0.839, + "grad_norm": 1020.0, + "kl_loss_12": 590.8940887451172, + "kl_loss_17": 148.50811920166015, + "kl_loss_3": 2272.6828735351564, + "kl_loss_6": 1426.033673095703, + "learning_rate": 6.384894043444567e-05, + "loss": 1095.9856, + "step": 8390 + }, + { + "ce_loss_12": 3.180431544780731, + "ce_loss_17": 2.9897021770477297, + "ce_loss_23": 2.9183462262153625, + "ce_loss_3": 4.010989665985107, + "ce_loss_6": 3.573097813129425, + "epoch": 0.84, + "grad_norm": 1896.0, + "kl_loss_12": 592.878450012207, + "kl_loss_17": 147.2418525695801, + "kl_loss_3": 2311.8345581054687, + "kl_loss_6": 1441.7733642578125, + "learning_rate": 6.307531146510753e-05, + "loss": 1111.0891, + "step": 8400 + }, + { + "ce_loss_12": 3.152303433418274, + "ce_loss_17": 2.971992623806, + "ce_loss_23": 2.899820351600647, + "ce_loss_3": 3.9444443464279173, + "ce_loss_6": 3.5234825372695924, + "epoch": 0.841, + "grad_norm": 984.0, + "kl_loss_12": 576.4628295898438, + "kl_loss_17": 145.34779739379883, + "kl_loss_3": 2229.6512451171875, + "kl_loss_6": 1396.1082397460937, + "learning_rate": 6.230608232253226e-05, + "loss": 1085.9627, + "step": 8410 + }, + { + "ce_loss_12": 3.1236330389976503, + "ce_loss_17": 2.9306414484977723, + "ce_loss_23": 2.856862449645996, + "ce_loss_3": 3.9838555335998533, + "ce_loss_6": 3.54256409406662, + "epoch": 0.842, + "grad_norm": 1360.0, + "kl_loss_12": 598.253190612793, + "kl_loss_17": 146.61602249145508, + "kl_loss_3": 2374.219549560547, + "kl_loss_6": 1488.299090576172, + "learning_rate": 6.154126075284855e-05, + "loss": 1117.1647, + "step": 8420 + }, + { + "ce_loss_12": 3.2005908370018004, + "ce_loss_17": 3.0218408346176147, + "ce_loss_23": 2.952148914337158, + "ce_loss_3": 3.98382648229599, + "ce_loss_6": 3.5727873921394346, + "epoch": 0.843, + "grad_norm": 1488.0, + "kl_loss_12": 572.6099502563477, + "kl_loss_17": 141.8627487182617, + "kl_loss_3": 2217.095245361328, + "kl_loss_6": 1395.0587524414063, + "learning_rate": 6.078085445780129e-05, + "loss": 1080.7017, + "step": 8430 + }, + { + "ce_loss_12": 3.2066327333450317, + "ce_loss_17": 3.022295427322388, + "ce_loss_23": 2.9517967462539674, + "ce_loss_3": 4.03835631608963, + "ce_loss_6": 3.593828630447388, + "epoch": 0.844, + "grad_norm": 1552.0, + "kl_loss_12": 584.0733032226562, + "kl_loss_17": 145.30355567932128, + "kl_loss_3": 2326.1559936523436, + "kl_loss_6": 1435.4254028320313, + "learning_rate": 6.002487109467347e-05, + "loss": 1099.6542, + "step": 8440 + }, + { + "ce_loss_12": 3.2176345586776733, + "ce_loss_17": 3.031240129470825, + "ce_loss_23": 2.9586237907409667, + "ce_loss_3": 4.014890730381012, + "ce_loss_6": 3.5906749844551085, + "epoch": 0.845, + "grad_norm": 1312.0, + "kl_loss_12": 594.9793701171875, + "kl_loss_17": 150.60174942016602, + "kl_loss_3": 2280.4541870117187, + "kl_loss_6": 1426.2670654296876, + "learning_rate": 5.927331827620902e-05, + "loss": 1099.7339, + "step": 8450 + }, + { + "ce_loss_12": 3.1921154141426085, + "ce_loss_17": 3.008522391319275, + "ce_loss_23": 2.9371198296546934, + "ce_loss_3": 3.9657519578933718, + "ce_loss_6": 3.5528266072273254, + "epoch": 0.846, + "grad_norm": 1096.0, + "kl_loss_12": 571.7378692626953, + "kl_loss_17": 142.95513916015625, + "kl_loss_3": 2193.6331298828127, + "kl_loss_6": 1370.4137329101563, + "learning_rate": 5.852620357053651e-05, + "loss": 1092.7303, + "step": 8460 + }, + { + "ce_loss_12": 3.2279970169067385, + "ce_loss_17": 3.0521787762641908, + "ce_loss_23": 2.984923815727234, + "ce_loss_3": 4.020310306549073, + "ce_loss_6": 3.6034207463264467, + "epoch": 0.847, + "grad_norm": 1104.0, + "kl_loss_12": 572.8000030517578, + "kl_loss_17": 142.0571964263916, + "kl_loss_3": 2229.488739013672, + "kl_loss_6": 1393.1040954589844, + "learning_rate": 5.778353450109286e-05, + "loss": 1094.8873, + "step": 8470 + }, + { + "ce_loss_12": 3.263732075691223, + "ce_loss_17": 3.080262279510498, + "ce_loss_23": 3.0068118929862977, + "ce_loss_3": 4.08725289106369, + "ce_loss_6": 3.647321677207947, + "epoch": 0.848, + "grad_norm": 1600.0, + "kl_loss_12": 588.9251647949219, + "kl_loss_17": 148.03672370910644, + "kl_loss_3": 2293.0857788085937, + "kl_loss_6": 1425.0013000488282, + "learning_rate": 5.7045318546547206e-05, + "loss": 1102.1039, + "step": 8480 + }, + { + "ce_loss_12": 3.161975991725922, + "ce_loss_17": 2.979814553260803, + "ce_loss_23": 2.9075967669487, + "ce_loss_3": 3.995291531085968, + "ce_loss_6": 3.546854817867279, + "epoch": 0.849, + "grad_norm": 1168.0, + "kl_loss_12": 582.0340927124023, + "kl_loss_17": 144.6420669555664, + "kl_loss_3": 2319.0878173828123, + "kl_loss_6": 1431.592059326172, + "learning_rate": 5.631156314072605e-05, + "loss": 1105.4393, + "step": 8490 + }, + { + "ce_loss_12": 3.1869612336158752, + "ce_loss_17": 3.0106932759284972, + "ce_loss_23": 2.9394050359725954, + "ce_loss_3": 3.9711960554122925, + "ce_loss_6": 3.551208019256592, + "epoch": 0.85, + "grad_norm": 1280.0, + "kl_loss_12": 573.1393142700196, + "kl_loss_17": 145.41687927246093, + "kl_loss_3": 2234.1766723632813, + "kl_loss_6": 1387.2991516113282, + "learning_rate": 5.5582275672538315e-05, + "loss": 1088.0804, + "step": 8500 + }, + { + "ce_loss_12": 3.1155901670455934, + "ce_loss_17": 2.921160626411438, + "ce_loss_23": 2.84966185092926, + "ce_loss_3": 3.977519929409027, + "ce_loss_6": 3.5322319507598876, + "epoch": 0.851, + "grad_norm": 1208.0, + "kl_loss_12": 604.7118682861328, + "kl_loss_17": 149.9787338256836, + "kl_loss_3": 2407.1728332519533, + "kl_loss_6": 1502.0127014160157, + "learning_rate": 5.4857463485900484e-05, + "loss": 1136.5094, + "step": 8510 + }, + { + "ce_loss_12": 3.160780608654022, + "ce_loss_17": 2.9806625962257387, + "ce_loss_23": 2.905897092819214, + "ce_loss_3": 3.9615924835205076, + "ce_loss_6": 3.5400408267974854, + "epoch": 0.852, + "grad_norm": 1120.0, + "kl_loss_12": 579.8729110717774, + "kl_loss_17": 143.8041172027588, + "kl_loss_3": 2269.61416015625, + "kl_loss_6": 1410.9498962402345, + "learning_rate": 5.413713387966329e-05, + "loss": 1101.6381, + "step": 8520 + }, + { + "ce_loss_12": 3.1798332571983337, + "ce_loss_17": 2.9922619700431823, + "ce_loss_23": 2.921483409404755, + "ce_loss_3": 3.9979071974754334, + "ce_loss_6": 3.5667684197425844, + "epoch": 0.853, + "grad_norm": 1632.0, + "kl_loss_12": 582.9076202392578, + "kl_loss_17": 144.66502304077147, + "kl_loss_3": 2311.5151245117186, + "kl_loss_6": 1439.8556213378906, + "learning_rate": 5.34212941075381e-05, + "loss": 1113.2516, + "step": 8530 + }, + { + "ce_loss_12": 3.1774136900901793, + "ce_loss_17": 3.007692849636078, + "ce_loss_23": 2.939822018146515, + "ce_loss_3": 3.970205819606781, + "ce_loss_6": 3.544040083885193, + "epoch": 0.854, + "grad_norm": 1128.0, + "kl_loss_12": 552.7987213134766, + "kl_loss_17": 139.36349868774414, + "kl_loss_3": 2221.3649780273436, + "kl_loss_6": 1372.9589233398438, + "learning_rate": 5.270995137802315e-05, + "loss": 1088.395, + "step": 8540 + }, + { + "ce_loss_12": 3.1294121980667113, + "ce_loss_17": 2.9471004366874696, + "ce_loss_23": 2.8777196526527407, + "ce_loss_3": 3.944415974617004, + "ce_loss_6": 3.5077300667762756, + "epoch": 0.855, + "grad_norm": 1176.0, + "kl_loss_12": 576.7997192382812, + "kl_loss_17": 142.75135650634766, + "kl_loss_3": 2294.3022277832033, + "kl_loss_6": 1419.12314453125, + "learning_rate": 5.2003112854332125e-05, + "loss": 1116.2262, + "step": 8550 + }, + { + "ce_loss_12": 3.1126842498779297, + "ce_loss_17": 2.9394203424453735, + "ce_loss_23": 2.872460675239563, + "ce_loss_3": 3.917052447795868, + "ce_loss_6": 3.493591618537903, + "epoch": 0.856, + "grad_norm": 1112.0, + "kl_loss_12": 567.4592391967774, + "kl_loss_17": 140.08689804077147, + "kl_loss_3": 2261.3334411621095, + "kl_loss_6": 1411.8925170898438, + "learning_rate": 5.130078565432089e-05, + "loss": 1080.4924, + "step": 8560 + }, + { + "ce_loss_12": 3.1774247765541075, + "ce_loss_17": 3.004711401462555, + "ce_loss_23": 2.9378722071647645, + "ce_loss_3": 3.964539682865143, + "ce_loss_6": 3.548519825935364, + "epoch": 0.857, + "grad_norm": 1128.0, + "kl_loss_12": 568.490444946289, + "kl_loss_17": 139.40370483398436, + "kl_loss_3": 2232.951800537109, + "kl_loss_6": 1394.6330444335938, + "learning_rate": 5.060297685041659e-05, + "loss": 1075.8057, + "step": 8570 + }, + { + "ce_loss_12": 3.127906584739685, + "ce_loss_17": 2.9368413925170898, + "ce_loss_23": 2.8627336859703063, + "ce_loss_3": 3.952434945106506, + "ce_loss_6": 3.514178156852722, + "epoch": 0.858, + "grad_norm": 1072.0, + "kl_loss_12": 593.2263397216797, + "kl_loss_17": 149.17412948608398, + "kl_loss_3": 2324.631884765625, + "kl_loss_6": 1445.1130859375, + "learning_rate": 4.99096934695461e-05, + "loss": 1125.4021, + "step": 8580 + }, + { + "ce_loss_12": 3.1838493227958677, + "ce_loss_17": 2.9971067309379578, + "ce_loss_23": 2.9266952991485597, + "ce_loss_3": 3.990589106082916, + "ce_loss_6": 3.5651594519615175, + "epoch": 0.859, + "grad_norm": 1184.0, + "kl_loss_12": 574.5701309204102, + "kl_loss_17": 141.33912353515626, + "kl_loss_3": 2263.241564941406, + "kl_loss_6": 1406.6405212402344, + "learning_rate": 4.922094249306558e-05, + "loss": 1088.1566, + "step": 8590 + }, + { + "ce_loss_12": 3.217962348461151, + "ce_loss_17": 3.0296669721603395, + "ce_loss_23": 2.958587646484375, + "ce_loss_3": 4.027351129055023, + "ce_loss_6": 3.5949318766593934, + "epoch": 0.86, + "grad_norm": 1192.0, + "kl_loss_12": 591.8625091552734, + "kl_loss_17": 148.62201843261718, + "kl_loss_3": 2292.3825927734374, + "kl_loss_6": 1419.3357971191406, + "learning_rate": 4.853673085668947e-05, + "loss": 1090.5944, + "step": 8600 + }, + { + "ce_loss_12": 3.224953126907349, + "ce_loss_17": 3.0404186248779297, + "ce_loss_23": 2.9691394686698915, + "ce_loss_3": 4.035121858119965, + "ce_loss_6": 3.6093873739242555, + "epoch": 0.861, + "grad_norm": 1320.0, + "kl_loss_12": 579.5304275512696, + "kl_loss_17": 142.73463401794433, + "kl_loss_3": 2281.0828186035155, + "kl_loss_6": 1424.9526123046876, + "learning_rate": 4.78570654504214e-05, + "loss": 1107.402, + "step": 8610 + }, + { + "ce_loss_12": 3.175167644023895, + "ce_loss_17": 2.996447467803955, + "ce_loss_23": 2.92565678358078, + "ce_loss_3": 3.9797300577163695, + "ce_loss_6": 3.5572490453720094, + "epoch": 0.862, + "grad_norm": 844.0, + "kl_loss_12": 570.9628707885743, + "kl_loss_17": 142.35733489990236, + "kl_loss_3": 2269.9085327148437, + "kl_loss_6": 1417.7039611816406, + "learning_rate": 4.7181953118484556e-05, + "loss": 1103.3342, + "step": 8620 + }, + { + "ce_loss_12": 3.198027265071869, + "ce_loss_17": 3.018433129787445, + "ce_loss_23": 2.950749433040619, + "ce_loss_3": 3.995873987674713, + "ce_loss_6": 3.576699137687683, + "epoch": 0.863, + "grad_norm": 1752.0, + "kl_loss_12": 575.5412963867187, + "kl_loss_17": 141.98919525146485, + "kl_loss_3": 2234.7420288085937, + "kl_loss_6": 1400.40283203125, + "learning_rate": 4.651140065925269e-05, + "loss": 1115.0733, + "step": 8630 + }, + { + "ce_loss_12": 3.1308565497398377, + "ce_loss_17": 2.9508320331573485, + "ce_loss_23": 2.877324306964874, + "ce_loss_3": 3.946554684638977, + "ce_loss_6": 3.5142532348632813, + "epoch": 0.864, + "grad_norm": 1648.0, + "kl_loss_12": 576.0889328002929, + "kl_loss_17": 144.31983528137206, + "kl_loss_3": 2291.02646484375, + "kl_loss_6": 1423.6650085449219, + "learning_rate": 4.58454148251814e-05, + "loss": 1113.8387, + "step": 8640 + }, + { + "ce_loss_12": 3.1479450821876527, + "ce_loss_17": 2.9588719725608827, + "ce_loss_23": 2.886811625957489, + "ce_loss_3": 3.9861248254776003, + "ce_loss_6": 3.5503336071968077, + "epoch": 0.865, + "grad_norm": 1176.0, + "kl_loss_12": 581.4473831176758, + "kl_loss_17": 142.7117492675781, + "kl_loss_3": 2323.2736877441407, + "kl_loss_6": 1445.4842407226563, + "learning_rate": 4.518400232274078e-05, + "loss": 1105.7314, + "step": 8650 + }, + { + "ce_loss_12": 3.1681225895881653, + "ce_loss_17": 2.9916425704956056, + "ce_loss_23": 2.918812358379364, + "ce_loss_3": 3.982468605041504, + "ce_loss_6": 3.5500745296478273, + "epoch": 0.866, + "grad_norm": 1080.0, + "kl_loss_12": 576.0150848388672, + "kl_loss_17": 146.00164642333985, + "kl_loss_3": 2261.0727966308596, + "kl_loss_6": 1400.8898498535157, + "learning_rate": 4.452716981234745e-05, + "loss": 1073.9408, + "step": 8660 + }, + { + "ce_loss_12": 3.1406161308288576, + "ce_loss_17": 2.96300151348114, + "ce_loss_23": 2.894453203678131, + "ce_loss_3": 3.940590834617615, + "ce_loss_6": 3.517574894428253, + "epoch": 0.867, + "grad_norm": 1012.0, + "kl_loss_12": 569.3233200073242, + "kl_loss_17": 140.169140625, + "kl_loss_3": 2242.9851501464846, + "kl_loss_6": 1399.3250549316406, + "learning_rate": 4.3874923908297335e-05, + "loss": 1075.665, + "step": 8670 + }, + { + "ce_loss_12": 3.2043825507164003, + "ce_loss_17": 3.0199650645256044, + "ce_loss_23": 2.948937273025513, + "ce_loss_3": 4.026477837562561, + "ce_loss_6": 3.5921675324440003, + "epoch": 0.868, + "grad_norm": 1624.0, + "kl_loss_12": 586.2451538085937, + "kl_loss_17": 145.81369247436524, + "kl_loss_3": 2317.316552734375, + "kl_loss_6": 1441.9366943359375, + "learning_rate": 4.322727117869951e-05, + "loss": 1101.8223, + "step": 8680 + }, + { + "ce_loss_12": 3.202632784843445, + "ce_loss_17": 3.0219478249549865, + "ce_loss_23": 2.952727460861206, + "ce_loss_3": 4.023267018795013, + "ce_loss_6": 3.589569592475891, + "epoch": 0.869, + "grad_norm": 1976.0, + "kl_loss_12": 577.7976440429687, + "kl_loss_17": 143.98717498779297, + "kl_loss_3": 2304.5221618652345, + "kl_loss_6": 1424.5413330078125, + "learning_rate": 4.2584218145409916e-05, + "loss": 1100.8338, + "step": 8690 + }, + { + "ce_loss_12": 3.229657542705536, + "ce_loss_17": 3.0619057297706602, + "ce_loss_23": 2.9937222719192507, + "ce_loss_3": 4.008101737499237, + "ce_loss_6": 3.5956004738807676, + "epoch": 0.87, + "grad_norm": 1312.0, + "kl_loss_12": 558.1629928588867, + "kl_loss_17": 139.7387897491455, + "kl_loss_3": 2204.8014892578126, + "kl_loss_6": 1372.325421142578, + "learning_rate": 4.194577128396521e-05, + "loss": 1074.2211, + "step": 8700 + }, + { + "ce_loss_12": 3.1359732866287233, + "ce_loss_17": 2.95313845872879, + "ce_loss_23": 2.8832003831863404, + "ce_loss_3": 3.938497769832611, + "ce_loss_6": 3.5137906432151795, + "epoch": 0.871, + "grad_norm": 1080.0, + "kl_loss_12": 565.1838592529297, + "kl_loss_17": 140.53310203552246, + "kl_loss_3": 2263.5222351074217, + "kl_loss_6": 1411.5387634277345, + "learning_rate": 4.1311937023518264e-05, + "loss": 1109.5805, + "step": 8710 + }, + { + "ce_loss_12": 3.143942928314209, + "ce_loss_17": 2.970666193962097, + "ce_loss_23": 2.9042790651321413, + "ce_loss_3": 4.002935862541198, + "ce_loss_6": 3.5461191058158876, + "epoch": 0.872, + "grad_norm": 1176.0, + "kl_loss_12": 555.6151596069336, + "kl_loss_17": 137.86699371337892, + "kl_loss_3": 2360.5023193359375, + "kl_loss_6": 1443.6187255859375, + "learning_rate": 4.0682721746773344e-05, + "loss": 1104.3949, + "step": 8720 + }, + { + "ce_loss_12": 3.0327890515327454, + "ce_loss_17": 2.846096920967102, + "ce_loss_23": 2.7767909169197083, + "ce_loss_3": 3.8801970958709715, + "ce_loss_6": 3.4383257031440735, + "epoch": 0.873, + "grad_norm": 1280.0, + "kl_loss_12": 581.7624359130859, + "kl_loss_17": 141.84159965515136, + "kl_loss_3": 2346.536975097656, + "kl_loss_6": 1451.5994262695312, + "learning_rate": 4.0058131789920904e-05, + "loss": 1097.1305, + "step": 8730 + }, + { + "ce_loss_12": 3.1715624570846557, + "ce_loss_17": 2.990919554233551, + "ce_loss_23": 2.9208243608474733, + "ce_loss_3": 3.9677069544792176, + "ce_loss_6": 3.5485462307929994, + "epoch": 0.874, + "grad_norm": 1088.0, + "kl_loss_12": 571.050749206543, + "kl_loss_17": 139.16770629882814, + "kl_loss_3": 2268.381427001953, + "kl_loss_6": 1413.6592590332032, + "learning_rate": 3.9438173442575e-05, + "loss": 1129.0674, + "step": 8740 + }, + { + "ce_loss_12": 3.1989192724227906, + "ce_loss_17": 3.015646827220917, + "ce_loss_23": 2.9440499424934385, + "ce_loss_3": 3.991265523433685, + "ce_loss_6": 3.56772917509079, + "epoch": 0.875, + "grad_norm": 1160.0, + "kl_loss_12": 572.0689254760742, + "kl_loss_17": 141.18715057373046, + "kl_loss_3": 2222.944921875, + "kl_loss_6": 1381.3336486816406, + "learning_rate": 3.882285294770937e-05, + "loss": 1087.6204, + "step": 8750 + }, + { + "ce_loss_12": 3.151508927345276, + "ce_loss_17": 2.9730342268943786, + "ce_loss_23": 2.9029712200164797, + "ce_loss_3": 3.939802515506744, + "ce_loss_6": 3.5210021257400514, + "epoch": 0.876, + "grad_norm": 1128.0, + "kl_loss_12": 565.9525894165039, + "kl_loss_17": 140.94870376586914, + "kl_loss_3": 2218.5689025878905, + "kl_loss_6": 1382.3477416992187, + "learning_rate": 3.821217650159453e-05, + "loss": 1101.3357, + "step": 8760 + }, + { + "ce_loss_12": 3.052991545200348, + "ce_loss_17": 2.864273762702942, + "ce_loss_23": 2.7928737878799437, + "ce_loss_3": 3.9019633650779726, + "ce_loss_6": 3.4615947723388674, + "epoch": 0.877, + "grad_norm": 1432.0, + "kl_loss_12": 585.8959518432617, + "kl_loss_17": 143.41930541992187, + "kl_loss_3": 2347.320965576172, + "kl_loss_6": 1475.3373657226562, + "learning_rate": 3.760615025373543e-05, + "loss": 1114.6322, + "step": 8770 + }, + { + "ce_loss_12": 3.2168009877204895, + "ce_loss_17": 3.027889835834503, + "ce_loss_23": 2.9533602833747863, + "ce_loss_3": 4.035237908363342, + "ce_loss_6": 3.606297481060028, + "epoch": 0.878, + "grad_norm": 1152.0, + "kl_loss_12": 592.520458984375, + "kl_loss_17": 148.62770309448243, + "kl_loss_3": 2314.968133544922, + "kl_loss_6": 1441.3265686035156, + "learning_rate": 3.700478030680987e-05, + "loss": 1128.3713, + "step": 8780 + }, + { + "ce_loss_12": 3.2068337321281435, + "ce_loss_17": 3.0260632753372194, + "ce_loss_23": 2.956136953830719, + "ce_loss_3": 4.020135986804962, + "ce_loss_6": 3.580900454521179, + "epoch": 0.879, + "grad_norm": 1004.0, + "kl_loss_12": 573.6394485473633, + "kl_loss_17": 141.99930267333986, + "kl_loss_3": 2273.6502685546875, + "kl_loss_6": 1410.6490173339844, + "learning_rate": 3.6408072716606344e-05, + "loss": 1094.7658, + "step": 8790 + }, + { + "ce_loss_12": 3.143653464317322, + "ce_loss_17": 2.9556998133659365, + "ce_loss_23": 2.883739101886749, + "ce_loss_3": 3.983898949623108, + "ce_loss_6": 3.5392743229866026, + "epoch": 0.88, + "grad_norm": 1488.0, + "kl_loss_12": 593.0009582519531, + "kl_loss_17": 145.94639472961427, + "kl_loss_3": 2355.1291748046874, + "kl_loss_6": 1462.3504699707032, + "learning_rate": 3.5816033491963716e-05, + "loss": 1144.8818, + "step": 8800 + }, + { + "ce_loss_12": 3.0133171916007995, + "ce_loss_17": 2.8266145408153536, + "ce_loss_23": 2.758497190475464, + "ce_loss_3": 3.875349199771881, + "ce_loss_6": 3.4153018474578856, + "epoch": 0.881, + "grad_norm": 1256.0, + "kl_loss_12": 582.8653411865234, + "kl_loss_17": 142.17961196899415, + "kl_loss_3": 2382.8354614257814, + "kl_loss_6": 1465.045965576172, + "learning_rate": 3.522866859471047e-05, + "loss": 1119.4389, + "step": 8810 + }, + { + "ce_loss_12": 3.207599139213562, + "ce_loss_17": 3.040723407268524, + "ce_loss_23": 2.9749446511268616, + "ce_loss_3": 3.980712521076202, + "ce_loss_6": 3.5772807717323305, + "epoch": 0.882, + "grad_norm": 1352.0, + "kl_loss_12": 552.4797805786133, + "kl_loss_17": 137.82118072509766, + "kl_loss_3": 2169.927130126953, + "kl_loss_6": 1353.0986389160157, + "learning_rate": 3.46459839396045e-05, + "loss": 1076.6996, + "step": 8820 + }, + { + "ce_loss_12": 3.138606333732605, + "ce_loss_17": 2.9527758955955505, + "ce_loss_23": 2.881995415687561, + "ce_loss_3": 3.9630982995033266, + "ce_loss_6": 3.535629165172577, + "epoch": 0.883, + "grad_norm": 1384.0, + "kl_loss_12": 575.5853576660156, + "kl_loss_17": 142.77523384094238, + "kl_loss_3": 2280.493377685547, + "kl_loss_6": 1425.925, + "learning_rate": 3.406798539427386e-05, + "loss": 1126.7855, + "step": 8830 + }, + { + "ce_loss_12": 3.201924538612366, + "ce_loss_17": 3.023479151725769, + "ce_loss_23": 2.9534876942634583, + "ce_loss_3": 4.0110024333000185, + "ce_loss_6": 3.580251407623291, + "epoch": 0.884, + "grad_norm": 1664.0, + "kl_loss_12": 576.2884735107422, + "kl_loss_17": 141.3286678314209, + "kl_loss_3": 2289.6893310546875, + "kl_loss_6": 1425.62265625, + "learning_rate": 3.349467877915746e-05, + "loss": 1107.0573, + "step": 8840 + }, + { + "ce_loss_12": 3.171042811870575, + "ce_loss_17": 2.987072694301605, + "ce_loss_23": 2.9178980588912964, + "ce_loss_3": 4.000065970420837, + "ce_loss_6": 3.5598766565322877, + "epoch": 0.885, + "grad_norm": 1808.0, + "kl_loss_12": 585.4442535400391, + "kl_loss_17": 144.81348342895507, + "kl_loss_3": 2338.6510986328126, + "kl_loss_6": 1453.5890319824218, + "learning_rate": 3.292606986744667e-05, + "loss": 1141.7707, + "step": 8850 + }, + { + "ce_loss_12": 3.1157912969589234, + "ce_loss_17": 2.940700352191925, + "ce_loss_23": 2.872869074344635, + "ce_loss_3": 3.9454143762588503, + "ce_loss_6": 3.5097142457962036, + "epoch": 0.886, + "grad_norm": 1440.0, + "kl_loss_12": 570.6747161865235, + "kl_loss_17": 139.37343406677246, + "kl_loss_3": 2311.179510498047, + "kl_loss_6": 1435.1692626953125, + "learning_rate": 3.23621643850267e-05, + "loss": 1108.4781, + "step": 8860 + }, + { + "ce_loss_12": 3.1907800674438476, + "ce_loss_17": 3.012934935092926, + "ce_loss_23": 2.9429600715637205, + "ce_loss_3": 3.997164797782898, + "ce_loss_6": 3.570061945915222, + "epoch": 0.887, + "grad_norm": 1808.0, + "kl_loss_12": 588.4341278076172, + "kl_loss_17": 146.12384529113768, + "kl_loss_3": 2284.929650878906, + "kl_loss_6": 1431.1104370117187, + "learning_rate": 3.180296801041971e-05, + "loss": 1091.9624, + "step": 8870 + }, + { + "ce_loss_12": 3.2134608030319214, + "ce_loss_17": 3.0334770321846007, + "ce_loss_23": 2.9668296813964843, + "ce_loss_3": 4.027132308483123, + "ce_loss_6": 3.5883197665214537, + "epoch": 0.888, + "grad_norm": 1208.0, + "kl_loss_12": 568.7455657958984, + "kl_loss_17": 139.985436630249, + "kl_loss_3": 2280.6083984375, + "kl_loss_6": 1402.873974609375, + "learning_rate": 3.124848637472688e-05, + "loss": 1078.6188, + "step": 8880 + }, + { + "ce_loss_12": 3.0495067715644835, + "ce_loss_17": 2.8635574698448183, + "ce_loss_23": 2.795310652256012, + "ce_loss_3": 3.8754838943481444, + "ce_loss_6": 3.438282108306885, + "epoch": 0.889, + "grad_norm": 1504.0, + "kl_loss_12": 561.9246047973633, + "kl_loss_17": 137.8976535797119, + "kl_loss_3": 2283.874896240234, + "kl_loss_6": 1423.3114990234376, + "learning_rate": 3.069872506157212e-05, + "loss": 1093.7986, + "step": 8890 + }, + { + "ce_loss_12": 3.1471518754959105, + "ce_loss_17": 2.964433026313782, + "ce_loss_23": 2.8955094456672668, + "ce_loss_3": 3.951979339122772, + "ce_loss_6": 3.5208448648452757, + "epoch": 0.89, + "grad_norm": 1768.0, + "kl_loss_12": 574.7976623535156, + "kl_loss_17": 140.52447242736815, + "kl_loss_3": 2277.884588623047, + "kl_loss_6": 1407.819873046875, + "learning_rate": 3.0153689607045842e-05, + "loss": 1092.9762, + "step": 8900 + }, + { + "ce_loss_12": 3.0624502897262573, + "ce_loss_17": 2.8712844431400297, + "ce_loss_23": 2.800961834192276, + "ce_loss_3": 3.938171899318695, + "ce_loss_6": 3.4819725155830383, + "epoch": 0.891, + "grad_norm": 1520.0, + "kl_loss_12": 600.0223052978515, + "kl_loss_17": 145.49247665405272, + "kl_loss_3": 2433.209509277344, + "kl_loss_6": 1518.5408630371094, + "learning_rate": 2.9613385499648926e-05, + "loss": 1120.3951, + "step": 8910 + }, + { + "ce_loss_12": 3.1023963928222655, + "ce_loss_17": 2.917354369163513, + "ce_loss_23": 2.8503448128700257, + "ce_loss_3": 3.9019821763038633, + "ce_loss_6": 3.4850382566452027, + "epoch": 0.892, + "grad_norm": 1392.0, + "kl_loss_12": 569.1156387329102, + "kl_loss_17": 141.6527893066406, + "kl_loss_3": 2246.032568359375, + "kl_loss_6": 1404.1896179199218, + "learning_rate": 2.9077818180237692e-05, + "loss": 1101.0201, + "step": 8920 + }, + { + "ce_loss_12": 3.140770733356476, + "ce_loss_17": 2.9576468110084533, + "ce_loss_23": 2.885908079147339, + "ce_loss_3": 3.9712109088897707, + "ce_loss_6": 3.5452078700065615, + "epoch": 0.893, + "grad_norm": 1328.0, + "kl_loss_12": 573.0570495605468, + "kl_loss_17": 141.92409553527833, + "kl_loss_3": 2277.2909423828123, + "kl_loss_6": 1428.9821228027345, + "learning_rate": 2.8546993041969172e-05, + "loss": 1103.4523, + "step": 8930 + }, + { + "ce_loss_12": 3.1720087051391603, + "ce_loss_17": 3.0010063529014586, + "ce_loss_23": 2.9312214612960816, + "ce_loss_3": 3.9582890391349794, + "ce_loss_6": 3.5403059005737303, + "epoch": 0.894, + "grad_norm": 1184.0, + "kl_loss_12": 562.5646118164062, + "kl_loss_17": 139.52182235717774, + "kl_loss_3": 2220.4663696289062, + "kl_loss_6": 1379.2326599121093, + "learning_rate": 2.802091543024671e-05, + "loss": 1091.9836, + "step": 8940 + }, + { + "ce_loss_12": 3.1763763666152953, + "ce_loss_17": 2.9964365363121033, + "ce_loss_23": 2.925124967098236, + "ce_loss_3": 4.00547845363617, + "ce_loss_6": 3.5667017459869386, + "epoch": 0.895, + "grad_norm": 1280.0, + "kl_loss_12": 574.3386627197266, + "kl_loss_17": 142.8935531616211, + "kl_loss_3": 2320.6210083007813, + "kl_loss_6": 1437.8731384277344, + "learning_rate": 2.7499590642665774e-05, + "loss": 1126.5984, + "step": 8950 + }, + { + "ce_loss_12": 3.196235418319702, + "ce_loss_17": 3.006606698036194, + "ce_loss_23": 2.940862798690796, + "ce_loss_3": 3.9963621139526366, + "ce_loss_6": 3.5574281811714172, + "epoch": 0.896, + "grad_norm": 1376.0, + "kl_loss_12": 600.9005172729492, + "kl_loss_17": 144.96383399963378, + "kl_loss_3": 2270.6853088378907, + "kl_loss_6": 1394.1507629394532, + "learning_rate": 2.6983023928961405e-05, + "loss": 1094.6697, + "step": 8960 + }, + { + "ce_loss_12": 3.1587356090545655, + "ce_loss_17": 2.975158321857452, + "ce_loss_23": 2.905811440944672, + "ce_loss_3": 3.9668765664100647, + "ce_loss_6": 3.541996717453003, + "epoch": 0.897, + "grad_norm": 1328.0, + "kl_loss_12": 574.8809600830078, + "kl_loss_17": 142.63181800842284, + "kl_loss_3": 2260.277233886719, + "kl_loss_6": 1414.5728637695313, + "learning_rate": 2.6471220490954628e-05, + "loss": 1112.7987, + "step": 8970 + }, + { + "ce_loss_12": 3.1388833999633787, + "ce_loss_17": 2.9679617881774902, + "ce_loss_23": 2.90387909412384, + "ce_loss_3": 3.959093451499939, + "ce_loss_6": 3.513088059425354, + "epoch": 0.898, + "grad_norm": 1152.0, + "kl_loss_12": 563.8492416381836, + "kl_loss_17": 139.28057365417482, + "kl_loss_3": 2266.877862548828, + "kl_loss_6": 1390.3409851074218, + "learning_rate": 2.596418548250029e-05, + "loss": 1096.2148, + "step": 8980 + }, + { + "ce_loss_12": 3.1796159386634826, + "ce_loss_17": 3.004649484157562, + "ce_loss_23": 2.933781862258911, + "ce_loss_3": 3.988151228427887, + "ce_loss_6": 3.5624248504638674, + "epoch": 0.899, + "grad_norm": 1192.0, + "kl_loss_12": 577.6036361694336, + "kl_loss_17": 144.3431022644043, + "kl_loss_3": 2283.327990722656, + "kl_loss_6": 1423.6157470703124, + "learning_rate": 2.5461924009435368e-05, + "loss": 1090.8762, + "step": 8990 + }, + { + "ce_loss_12": 3.1727102279663084, + "ce_loss_17": 2.9933549761772156, + "ce_loss_23": 2.9242831587791445, + "ce_loss_3": 3.9769126534461976, + "ce_loss_6": 3.5508437037467955, + "epoch": 0.9, + "grad_norm": 1208.0, + "kl_loss_12": 577.268571472168, + "kl_loss_17": 144.11421699523925, + "kl_loss_3": 2256.0469665527344, + "kl_loss_6": 1406.268896484375, + "learning_rate": 2.4964441129527336e-05, + "loss": 1115.5373, + "step": 9000 + }, + { + "ce_loss_12": 3.1747339844703673, + "ce_loss_17": 2.998010790348053, + "ce_loss_23": 2.9308466792106627, + "ce_loss_3": 3.9608139991760254, + "ce_loss_6": 3.5379756569862364, + "epoch": 0.901, + "grad_norm": 1096.0, + "kl_loss_12": 557.693244934082, + "kl_loss_17": 138.3488021850586, + "kl_loss_3": 2215.690496826172, + "kl_loss_6": 1369.4028015136719, + "learning_rate": 2.4471741852423235e-05, + "loss": 1080.0834, + "step": 9010 + }, + { + "ce_loss_12": 3.224750006198883, + "ce_loss_17": 3.0440508127212524, + "ce_loss_23": 2.972705078125, + "ce_loss_3": 4.0271160364151, + "ce_loss_6": 3.605271244049072, + "epoch": 0.902, + "grad_norm": 1256.0, + "kl_loss_12": 569.5615295410156, + "kl_loss_17": 142.0729030609131, + "kl_loss_3": 2235.7255310058595, + "kl_loss_6": 1400.379168701172, + "learning_rate": 2.3983831139599287e-05, + "loss": 1091.273, + "step": 9020 + }, + { + "ce_loss_12": 3.1377035856246946, + "ce_loss_17": 2.9660804271698, + "ce_loss_23": 2.8988550782203673, + "ce_loss_3": 3.945742964744568, + "ce_loss_6": 3.515814507007599, + "epoch": 0.903, + "grad_norm": 1184.0, + "kl_loss_12": 553.5863739013672, + "kl_loss_17": 139.5826530456543, + "kl_loss_3": 2236.3738403320312, + "kl_loss_6": 1376.8331909179688, + "learning_rate": 2.3500713904311022e-05, + "loss": 1065.7949, + "step": 9030 + }, + { + "ce_loss_12": 3.1767478704452516, + "ce_loss_17": 3.004851484298706, + "ce_loss_23": 2.9408997654914857, + "ce_loss_3": 3.950446879863739, + "ce_loss_6": 3.5378254652023315, + "epoch": 0.904, + "grad_norm": 1176.0, + "kl_loss_12": 550.520964050293, + "kl_loss_17": 136.76641120910645, + "kl_loss_3": 2169.320031738281, + "kl_loss_6": 1355.3678344726563, + "learning_rate": 2.3022395011543685e-05, + "loss": 1063.1572, + "step": 9040 + }, + { + "ce_loss_12": 3.218639242649078, + "ce_loss_17": 3.028334105014801, + "ce_loss_23": 2.9564494013786318, + "ce_loss_3": 4.017892599105835, + "ce_loss_6": 3.594402241706848, + "epoch": 0.905, + "grad_norm": 1320.0, + "kl_loss_12": 587.4106262207031, + "kl_loss_17": 146.13946876525878, + "kl_loss_3": 2273.7346374511717, + "kl_loss_6": 1420.69189453125, + "learning_rate": 2.2548879277963063e-05, + "loss": 1120.748, + "step": 9050 + }, + { + "ce_loss_12": 3.1238484501838686, + "ce_loss_17": 2.9536948800086975, + "ce_loss_23": 2.884401059150696, + "ce_loss_3": 3.9171729922294616, + "ce_loss_6": 3.503283417224884, + "epoch": 0.906, + "grad_norm": 1320.0, + "kl_loss_12": 563.0601135253906, + "kl_loss_17": 139.91237602233886, + "kl_loss_3": 2230.7384643554688, + "kl_loss_6": 1391.4769958496095, + "learning_rate": 2.208017147186736e-05, + "loss": 1063.8878, + "step": 9060 + }, + { + "ce_loss_12": 3.1227041482925415, + "ce_loss_17": 2.9394129514694214, + "ce_loss_23": 2.8684030890464784, + "ce_loss_3": 3.9229846715927126, + "ce_loss_6": 3.497808504104614, + "epoch": 0.907, + "grad_norm": 1328.0, + "kl_loss_12": 566.8505737304688, + "kl_loss_17": 139.52213935852052, + "kl_loss_3": 2248.106579589844, + "kl_loss_6": 1396.276318359375, + "learning_rate": 2.1616276313139227e-05, + "loss": 1083.2905, + "step": 9070 + }, + { + "ce_loss_12": 3.166498911380768, + "ce_loss_17": 2.9832795023918153, + "ce_loss_23": 2.9130069255828857, + "ce_loss_3": 3.9726845741271974, + "ce_loss_6": 3.5516761779785155, + "epoch": 0.908, + "grad_norm": 1160.0, + "kl_loss_12": 571.0766891479492, + "kl_loss_17": 142.20061111450195, + "kl_loss_3": 2263.641534423828, + "kl_loss_6": 1413.3831909179687, + "learning_rate": 2.1157198473197415e-05, + "loss": 1108.6984, + "step": 9080 + }, + { + "ce_loss_12": 3.2202314376831054, + "ce_loss_17": 3.039029121398926, + "ce_loss_23": 2.9674927830696105, + "ce_loss_3": 4.036566078662872, + "ce_loss_6": 3.599036192893982, + "epoch": 0.909, + "grad_norm": 1848.0, + "kl_loss_12": 582.7767959594727, + "kl_loss_17": 144.65241088867188, + "kl_loss_3": 2281.345770263672, + "kl_loss_6": 1415.5024841308593, + "learning_rate": 2.0702942574950812e-05, + "loss": 1102.0615, + "step": 9090 + }, + { + "ce_loss_12": 3.16324383020401, + "ce_loss_17": 2.9806368827819822, + "ce_loss_23": 2.9070268034935, + "ce_loss_3": 3.980290472507477, + "ce_loss_6": 3.54543274641037, + "epoch": 0.91, + "grad_norm": 936.0, + "kl_loss_12": 584.4819976806641, + "kl_loss_17": 146.09104804992677, + "kl_loss_3": 2285.574676513672, + "kl_loss_6": 1430.7371948242187, + "learning_rate": 2.025351319275137e-05, + "loss": 1102.6951, + "step": 9100 + }, + { + "ce_loss_12": 3.2741725206375123, + "ce_loss_17": 3.0896798014640807, + "ce_loss_23": 3.016503632068634, + "ce_loss_3": 4.07792466878891, + "ce_loss_6": 3.6508442163467407, + "epoch": 0.911, + "grad_norm": 1336.0, + "kl_loss_12": 594.4164245605468, + "kl_loss_17": 145.94216613769532, + "kl_loss_3": 2303.3822387695313, + "kl_loss_6": 1443.0927124023438, + "learning_rate": 1.9808914852347816e-05, + "loss": 1134.358, + "step": 9110 + }, + { + "ce_loss_12": 3.124447190761566, + "ce_loss_17": 2.9432847023010256, + "ce_loss_23": 2.8710927724838258, + "ce_loss_3": 3.928308629989624, + "ce_loss_6": 3.5085084676742553, + "epoch": 0.912, + "grad_norm": 1256.0, + "kl_loss_12": 568.8922149658204, + "kl_loss_17": 142.07119445800782, + "kl_loss_3": 2238.437371826172, + "kl_loss_6": 1394.7023681640626, + "learning_rate": 1.9369152030840554e-05, + "loss": 1085.0721, + "step": 9120 + }, + { + "ce_loss_12": 3.2017552852630615, + "ce_loss_17": 3.0240061283111572, + "ce_loss_23": 2.957836413383484, + "ce_loss_3": 4.012298822402954, + "ce_loss_6": 3.5821467876434325, + "epoch": 0.913, + "grad_norm": 1168.0, + "kl_loss_12": 573.5562255859375, + "kl_loss_17": 142.86845626831055, + "kl_loss_3": 2296.6152099609376, + "kl_loss_6": 1425.6638244628907, + "learning_rate": 1.893422915663645e-05, + "loss": 1105.5535, + "step": 9130 + }, + { + "ce_loss_12": 3.0918240547180176, + "ce_loss_17": 2.903041398525238, + "ce_loss_23": 2.830662488937378, + "ce_loss_3": 3.9411617040634157, + "ce_loss_6": 3.49194792509079, + "epoch": 0.914, + "grad_norm": 1128.0, + "kl_loss_12": 588.6283416748047, + "kl_loss_17": 142.85717430114747, + "kl_loss_3": 2352.675671386719, + "kl_loss_6": 1455.5821411132813, + "learning_rate": 1.850415060940386e-05, + "loss": 1122.8133, + "step": 9140 + }, + { + "ce_loss_12": 3.193700540065765, + "ce_loss_17": 3.0201195597648622, + "ce_loss_23": 2.9516518115997314, + "ce_loss_3": 3.9756213068962096, + "ce_loss_6": 3.563271164894104, + "epoch": 0.915, + "grad_norm": 1320.0, + "kl_loss_12": 570.439616394043, + "kl_loss_17": 142.03744583129884, + "kl_loss_3": 2215.4312072753905, + "kl_loss_6": 1383.9830505371094, + "learning_rate": 1.8078920720028978e-05, + "loss": 1093.2822, + "step": 9150 + }, + { + "ce_loss_12": 3.1198755383491514, + "ce_loss_17": 2.948425328731537, + "ce_loss_23": 2.881678056716919, + "ce_loss_3": 3.909556198120117, + "ce_loss_6": 3.4925370097160338, + "epoch": 0.916, + "grad_norm": 1416.0, + "kl_loss_12": 558.5831649780273, + "kl_loss_17": 137.62323989868165, + "kl_loss_3": 2209.2303161621094, + "kl_loss_6": 1378.3816833496094, + "learning_rate": 1.765854377057219e-05, + "loss": 1093.5438, + "step": 9160 + }, + { + "ce_loss_12": 3.0918163418769837, + "ce_loss_17": 2.923488640785217, + "ce_loss_23": 2.857994091510773, + "ce_loss_3": 3.8988531112670897, + "ce_loss_6": 3.4716381311416624, + "epoch": 0.917, + "grad_norm": 1280.0, + "kl_loss_12": 554.4416610717774, + "kl_loss_17": 136.77627487182616, + "kl_loss_3": 2239.9227783203123, + "kl_loss_6": 1389.2361389160155, + "learning_rate": 1.724302399422456e-05, + "loss": 1087.9522, + "step": 9170 + }, + { + "ce_loss_12": 3.0822932839393617, + "ce_loss_17": 2.9033358573913572, + "ce_loss_23": 2.828555905818939, + "ce_loss_3": 3.892942178249359, + "ce_loss_6": 3.4676493406295776, + "epoch": 0.918, + "grad_norm": 952.0, + "kl_loss_12": 581.4710708618164, + "kl_loss_17": 145.52764968872071, + "kl_loss_3": 2287.389471435547, + "kl_loss_6": 1429.6156860351562, + "learning_rate": 1.683236557526574e-05, + "loss": 1106.5897, + "step": 9180 + }, + { + "ce_loss_12": 3.167659103870392, + "ce_loss_17": 2.995775747299194, + "ce_loss_23": 2.931483507156372, + "ce_loss_3": 3.937394344806671, + "ce_loss_6": 3.524160325527191, + "epoch": 0.919, + "grad_norm": 1192.0, + "kl_loss_12": 550.8754241943359, + "kl_loss_17": 136.88702011108398, + "kl_loss_3": 2165.176025390625, + "kl_loss_6": 1345.518359375, + "learning_rate": 1.6426572649021475e-05, + "loss": 1079.3829, + "step": 9190 + }, + { + "ce_loss_12": 3.2008050322532653, + "ce_loss_17": 3.031719481945038, + "ce_loss_23": 2.9624359726905825, + "ce_loss_3": 3.962590718269348, + "ce_loss_6": 3.5502659916877746, + "epoch": 0.92, + "grad_norm": 1344.0, + "kl_loss_12": 563.794337463379, + "kl_loss_17": 142.30584411621095, + "kl_loss_3": 2185.1565856933594, + "kl_loss_6": 1358.118389892578, + "learning_rate": 1.6025649301821876e-05, + "loss": 1078.7441, + "step": 9200 + }, + { + "ce_loss_12": 3.1995946049690245, + "ce_loss_17": 3.0219401597976683, + "ce_loss_23": 2.9507028460502625, + "ce_loss_3": 3.975095748901367, + "ce_loss_6": 3.5622946858406066, + "epoch": 0.921, + "grad_norm": 2768.0, + "kl_loss_12": 570.4993179321289, + "kl_loss_17": 143.03371391296386, + "kl_loss_3": 2209.851190185547, + "kl_loss_6": 1378.5418823242187, + "learning_rate": 1.5629599570960716e-05, + "loss": 1073.4479, + "step": 9210 + }, + { + "ce_loss_12": 3.1116721272468566, + "ce_loss_17": 2.940151798725128, + "ce_loss_23": 2.869356560707092, + "ce_loss_3": 3.927424907684326, + "ce_loss_6": 3.4975394368171693, + "epoch": 0.922, + "grad_norm": 1152.0, + "kl_loss_12": 568.6913955688476, + "kl_loss_17": 140.26254806518554, + "kl_loss_3": 2283.1548706054687, + "kl_loss_6": 1417.19638671875, + "learning_rate": 1.5238427444654367e-05, + "loss": 1091.6278, + "step": 9220 + }, + { + "ce_loss_12": 3.1642475366592406, + "ce_loss_17": 2.9866513133049013, + "ce_loss_23": 2.9176761746406554, + "ce_loss_3": 3.957422173023224, + "ce_loss_6": 3.533956062793732, + "epoch": 0.923, + "grad_norm": 1312.0, + "kl_loss_12": 561.3609939575196, + "kl_loss_17": 140.5565975189209, + "kl_loss_3": 2228.4589599609376, + "kl_loss_6": 1376.9875793457031, + "learning_rate": 1.4852136862001764e-05, + "loss": 1080.0947, + "step": 9230 + }, + { + "ce_loss_12": 3.136372518539429, + "ce_loss_17": 2.958689272403717, + "ce_loss_23": 2.8898529171943665, + "ce_loss_3": 3.9163593292236327, + "ce_loss_6": 3.504320228099823, + "epoch": 0.924, + "grad_norm": 1192.0, + "kl_loss_12": 557.1435043334961, + "kl_loss_17": 137.54037590026854, + "kl_loss_3": 2205.422784423828, + "kl_loss_6": 1378.7695922851562, + "learning_rate": 1.4470731712944884e-05, + "loss": 1092.7418, + "step": 9240 + }, + { + "ce_loss_12": 3.159243071079254, + "ce_loss_17": 2.978089451789856, + "ce_loss_23": 2.9081845045089723, + "ce_loss_3": 3.955523502826691, + "ce_loss_6": 3.5331156134605406, + "epoch": 0.925, + "grad_norm": 1064.0, + "kl_loss_12": 573.0755340576172, + "kl_loss_17": 142.85941848754882, + "kl_loss_3": 2231.54228515625, + "kl_loss_6": 1392.987042236328, + "learning_rate": 1.4094215838229174e-05, + "loss": 1109.1504, + "step": 9250 + }, + { + "ce_loss_12": 3.1348060131073, + "ce_loss_17": 2.953113579750061, + "ce_loss_23": 2.8837756752967834, + "ce_loss_3": 3.949278748035431, + "ce_loss_6": 3.516173839569092, + "epoch": 0.926, + "grad_norm": 1368.0, + "kl_loss_12": 572.2881134033203, + "kl_loss_17": 141.6354190826416, + "kl_loss_3": 2281.9481567382813, + "kl_loss_6": 1415.3080078125, + "learning_rate": 1.372259302936546e-05, + "loss": 1133.9979, + "step": 9260 + }, + { + "ce_loss_12": 3.232729196548462, + "ce_loss_17": 3.049306321144104, + "ce_loss_23": 2.9741173505783083, + "ce_loss_3": 4.02828049659729, + "ce_loss_6": 3.6032612323760986, + "epoch": 0.927, + "grad_norm": 1408.0, + "kl_loss_12": 583.2842529296875, + "kl_loss_17": 148.82182655334472, + "kl_loss_3": 2254.1648559570312, + "kl_loss_6": 1400.4453918457032, + "learning_rate": 1.3355867028591206e-05, + "loss": 1085.9334, + "step": 9270 + }, + { + "ce_loss_12": 3.1285869121551513, + "ce_loss_17": 2.9587782263755797, + "ce_loss_23": 2.8916184306144714, + "ce_loss_3": 3.9068692564964294, + "ce_loss_6": 3.4927313566207885, + "epoch": 0.928, + "grad_norm": 1224.0, + "kl_loss_12": 556.6924621582032, + "kl_loss_17": 138.16119041442872, + "kl_loss_3": 2195.2857666015625, + "kl_loss_6": 1365.2273010253907, + "learning_rate": 1.2994041528833267e-05, + "loss": 1074.7875, + "step": 9280 + }, + { + "ce_loss_12": 3.1338427305221557, + "ce_loss_17": 2.9578555464744567, + "ce_loss_23": 2.887498342990875, + "ce_loss_3": 3.933479642868042, + "ce_loss_6": 3.5059791684150694, + "epoch": 0.929, + "grad_norm": 1768.0, + "kl_loss_12": 565.173583984375, + "kl_loss_17": 138.6213806152344, + "kl_loss_3": 2250.180096435547, + "kl_loss_6": 1393.6943969726562, + "learning_rate": 1.2637120173670358e-05, + "loss": 1079.2549, + "step": 9290 + }, + { + "ce_loss_12": 3.160716378688812, + "ce_loss_17": 2.976524293422699, + "ce_loss_23": 2.90571870803833, + "ce_loss_3": 3.969735598564148, + "ce_loss_6": 3.5413543820381164, + "epoch": 0.93, + "grad_norm": 1416.0, + "kl_loss_12": 571.4928176879882, + "kl_loss_17": 142.84209671020508, + "kl_loss_3": 2266.608044433594, + "kl_loss_6": 1409.5825256347657, + "learning_rate": 1.2285106557296478e-05, + "loss": 1091.1059, + "step": 9300 + }, + { + "ce_loss_12": 3.0556710720062257, + "ce_loss_17": 2.8692134499549864, + "ce_loss_23": 2.80194947719574, + "ce_loss_3": 3.9183158397674562, + "ce_loss_6": 3.461040568351746, + "epoch": 0.931, + "grad_norm": 1288.0, + "kl_loss_12": 577.5393493652343, + "kl_loss_17": 141.31912994384766, + "kl_loss_3": 2367.245227050781, + "kl_loss_6": 1463.9541076660157, + "learning_rate": 1.1938004224484989e-05, + "loss": 1114.1457, + "step": 9310 + }, + { + "ce_loss_12": 3.2702009081840515, + "ce_loss_17": 3.0906524658203125, + "ce_loss_23": 3.019772231578827, + "ce_loss_3": 4.06068868637085, + "ce_loss_6": 3.641372537612915, + "epoch": 0.932, + "grad_norm": 1464.0, + "kl_loss_12": 576.6890625, + "kl_loss_17": 143.3515167236328, + "kl_loss_3": 2251.400598144531, + "kl_loss_6": 1405.9549194335937, + "learning_rate": 1.1595816670552429e-05, + "loss": 1113.5938, + "step": 9320 + }, + { + "ce_loss_12": 3.183160388469696, + "ce_loss_17": 3.012767970561981, + "ce_loss_23": 2.942166805267334, + "ce_loss_3": 3.97890499830246, + "ce_loss_6": 3.5550689578056334, + "epoch": 0.933, + "grad_norm": 1272.0, + "kl_loss_12": 562.0327362060547, + "kl_loss_17": 142.5811809539795, + "kl_loss_3": 2224.3913330078126, + "kl_loss_6": 1383.7475524902343, + "learning_rate": 1.1258547341323699e-05, + "loss": 1074.1326, + "step": 9330 + }, + { + "ce_loss_12": 3.2246156096458436, + "ce_loss_17": 3.04879025220871, + "ce_loss_23": 2.976650893688202, + "ce_loss_3": 4.006984710693359, + "ce_loss_6": 3.5918793439865113, + "epoch": 0.934, + "grad_norm": 1744.0, + "kl_loss_12": 570.4189254760743, + "kl_loss_17": 142.10177764892578, + "kl_loss_3": 2235.414031982422, + "kl_loss_6": 1388.5227294921874, + "learning_rate": 1.0926199633097156e-05, + "loss": 1084.0532, + "step": 9340 + }, + { + "ce_loss_12": 3.221409249305725, + "ce_loss_17": 3.0520886540412904, + "ce_loss_23": 2.9852556109428408, + "ce_loss_3": 3.985976588726044, + "ce_loss_6": 3.5706624031066894, + "epoch": 0.935, + "grad_norm": 1168.0, + "kl_loss_12": 552.4551422119141, + "kl_loss_17": 138.07305488586425, + "kl_loss_3": 2189.827197265625, + "kl_loss_6": 1352.4185485839844, + "learning_rate": 1.0598776892610684e-05, + "loss": 1097.9495, + "step": 9350 + }, + { + "ce_loss_12": 3.05479975938797, + "ce_loss_17": 2.877264070510864, + "ce_loss_23": 2.81108136177063, + "ce_loss_3": 3.872506487369537, + "ce_loss_6": 3.443566989898682, + "epoch": 0.936, + "grad_norm": 1360.0, + "kl_loss_12": 564.5641067504882, + "kl_loss_17": 138.46149826049805, + "kl_loss_3": 2271.185723876953, + "kl_loss_6": 1414.4029724121094, + "learning_rate": 1.0276282417007399e-05, + "loss": 1082.4025, + "step": 9360 + }, + { + "ce_loss_12": 3.1909990668296815, + "ce_loss_17": 3.018402063846588, + "ce_loss_23": 2.9506608486175536, + "ce_loss_3": 3.968773341178894, + "ce_loss_6": 3.55495970249176, + "epoch": 0.937, + "grad_norm": 1656.0, + "kl_loss_12": 558.7230865478516, + "kl_loss_17": 137.2345790863037, + "kl_loss_3": 2196.6784545898436, + "kl_loss_6": 1368.8283325195312, + "learning_rate": 9.958719453803277e-06, + "loss": 1081.8428, + "step": 9370 + }, + { + "ce_loss_12": 3.196353805065155, + "ce_loss_17": 3.016882526874542, + "ce_loss_23": 2.9470358014106752, + "ce_loss_3": 4.0054005026817325, + "ce_loss_6": 3.582688271999359, + "epoch": 0.938, + "grad_norm": 1336.0, + "kl_loss_12": 574.5041320800781, + "kl_loss_17": 140.950936126709, + "kl_loss_3": 2263.826794433594, + "kl_loss_6": 1420.0086730957032, + "learning_rate": 9.646091200853802e-06, + "loss": 1086.8387, + "step": 9380 + }, + { + "ce_loss_12": 3.151876986026764, + "ce_loss_17": 2.9755756497383117, + "ce_loss_23": 2.9081675529479982, + "ce_loss_3": 3.9371665716171265, + "ce_loss_6": 3.5238523721694945, + "epoch": 0.939, + "grad_norm": 1232.0, + "kl_loss_12": 562.5504806518554, + "kl_loss_17": 138.05519409179686, + "kl_loss_3": 2198.64248046875, + "kl_loss_6": 1379.655517578125, + "learning_rate": 9.338400806321978e-06, + "loss": 1055.6605, + "step": 9390 + }, + { + "ce_loss_12": 3.1880789279937742, + "ce_loss_17": 3.0081345677375793, + "ce_loss_23": 2.937654137611389, + "ce_loss_3": 3.978235685825348, + "ce_loss_6": 3.561854135990143, + "epoch": 0.94, + "grad_norm": 1064.0, + "kl_loss_12": 573.3843231201172, + "kl_loss_17": 144.109534072876, + "kl_loss_3": 2226.9682800292967, + "kl_loss_6": 1392.0303039550781, + "learning_rate": 9.035651368646646e-06, + "loss": 1076.0094, + "step": 9400 + }, + { + "ce_loss_12": 3.1873153328895567, + "ce_loss_17": 3.0186848282814025, + "ce_loss_23": 2.9502647638320925, + "ce_loss_3": 3.9735482692718507, + "ce_loss_6": 3.565557610988617, + "epoch": 0.941, + "grad_norm": 1296.0, + "kl_loss_12": 556.7777435302735, + "kl_loss_17": 138.17250480651856, + "kl_loss_3": 2211.0478515625, + "kl_loss_6": 1386.376727294922, + "learning_rate": 8.737845936511335e-06, + "loss": 1084.0635, + "step": 9410 + }, + { + "ce_loss_12": 3.149757242202759, + "ce_loss_17": 2.968151795864105, + "ce_loss_23": 2.8966172456741335, + "ce_loss_3": 3.9596046447753905, + "ce_loss_6": 3.52842036485672, + "epoch": 0.942, + "grad_norm": 1136.0, + "kl_loss_12": 574.9564529418946, + "kl_loss_17": 144.03371810913086, + "kl_loss_3": 2288.7548889160157, + "kl_loss_6": 1410.6072082519531, + "learning_rate": 8.444987508813451e-06, + "loss": 1090.6997, + "step": 9420 + }, + { + "ce_loss_12": 3.117500162124634, + "ce_loss_17": 2.926789343357086, + "ce_loss_23": 2.8557647585868837, + "ce_loss_3": 3.9515535950660707, + "ce_loss_6": 3.5129860162734987, + "epoch": 0.943, + "grad_norm": 1616.0, + "kl_loss_12": 588.9390014648437, + "kl_loss_17": 144.02214965820312, + "kl_loss_3": 2352.1965087890626, + "kl_loss_6": 1475.5303894042968, + "learning_rate": 8.157079034633974e-06, + "loss": 1114.2029, + "step": 9430 + }, + { + "ce_loss_12": 3.09841548204422, + "ce_loss_17": 2.918646454811096, + "ce_loss_23": 2.8522272944450378, + "ce_loss_3": 3.902868723869324, + "ce_loss_6": 3.4759815335273743, + "epoch": 0.944, + "grad_norm": 1112.0, + "kl_loss_12": 572.3801406860351, + "kl_loss_17": 140.67530975341796, + "kl_loss_3": 2286.2723999023438, + "kl_loss_6": 1426.1576354980468, + "learning_rate": 7.874123413208145e-06, + "loss": 1092.9133, + "step": 9440 + }, + { + "ce_loss_12": 3.0825069665908815, + "ce_loss_17": 2.8904207587242126, + "ce_loss_23": 2.822537696361542, + "ce_loss_3": 3.9040890097618104, + "ce_loss_6": 3.4774235248565675, + "epoch": 0.945, + "grad_norm": 1232.0, + "kl_loss_12": 572.2075897216797, + "kl_loss_17": 139.49128265380858, + "kl_loss_3": 2288.7567504882813, + "kl_loss_6": 1428.809637451172, + "learning_rate": 7.59612349389599e-06, + "loss": 1105.5749, + "step": 9450 + }, + { + "ce_loss_12": 3.1537629127502442, + "ce_loss_17": 2.9811459302902223, + "ce_loss_23": 2.9168223261833193, + "ce_loss_3": 3.932907783985138, + "ce_loss_6": 3.5217889189720153, + "epoch": 0.946, + "grad_norm": 1288.0, + "kl_loss_12": 552.6924682617188, + "kl_loss_17": 136.8698528289795, + "kl_loss_3": 2176.718395996094, + "kl_loss_6": 1352.736553955078, + "learning_rate": 7.323082076153509e-06, + "loss": 1075.3162, + "step": 9460 + }, + { + "ce_loss_12": 3.1922634482383727, + "ce_loss_17": 3.0193036556243897, + "ce_loss_23": 2.9505120396614073, + "ce_loss_3": 3.9770920157432554, + "ce_loss_6": 3.562787342071533, + "epoch": 0.947, + "grad_norm": 1392.0, + "kl_loss_12": 567.2095199584961, + "kl_loss_17": 143.54181022644042, + "kl_loss_3": 2199.2372436523438, + "kl_loss_6": 1374.141180419922, + "learning_rate": 7.055001909504755e-06, + "loss": 1096.5682, + "step": 9470 + }, + { + "ce_loss_12": 3.2373849511146546, + "ce_loss_17": 3.057986521720886, + "ce_loss_23": 2.9876526832580566, + "ce_loss_3": 4.023206043243408, + "ce_loss_6": 3.6000648617744444, + "epoch": 0.948, + "grad_norm": 1320.0, + "kl_loss_12": 574.0479385375977, + "kl_loss_17": 141.67123413085938, + "kl_loss_3": 2233.8025024414064, + "kl_loss_6": 1388.5472778320313, + "learning_rate": 6.791885693514133e-06, + "loss": 1090.4167, + "step": 9480 + }, + { + "ce_loss_12": 3.146625804901123, + "ce_loss_17": 2.968444359302521, + "ce_loss_23": 2.899620831012726, + "ce_loss_3": 3.959327292442322, + "ce_loss_6": 3.52789853811264, + "epoch": 0.949, + "grad_norm": 1144.0, + "kl_loss_12": 571.9940902709961, + "kl_loss_17": 142.52353057861328, + "kl_loss_3": 2291.0288146972657, + "kl_loss_6": 1423.9354431152344, + "learning_rate": 6.533736077758867e-06, + "loss": 1108.4849, + "step": 9490 + }, + { + "ce_loss_12": 3.1210038542747496, + "ce_loss_17": 2.93543461561203, + "ce_loss_23": 2.863549029827118, + "ce_loss_3": 3.959295082092285, + "ce_loss_6": 3.506491792201996, + "epoch": 0.95, + "grad_norm": 1536.0, + "kl_loss_12": 587.0334289550781, + "kl_loss_17": 145.91167297363282, + "kl_loss_3": 2348.747198486328, + "kl_loss_6": 1444.5876403808593, + "learning_rate": 6.2805556618028556e-06, + "loss": 1103.8811, + "step": 9500 + }, + { + "ce_loss_12": 3.178951954841614, + "ce_loss_17": 3.012415373325348, + "ce_loss_23": 2.9441534757614134, + "ce_loss_3": 3.9545786142349244, + "ce_loss_6": 3.5391871690750123, + "epoch": 0.951, + "grad_norm": 1200.0, + "kl_loss_12": 546.3071563720703, + "kl_loss_17": 138.0149948120117, + "kl_loss_3": 2174.970806884766, + "kl_loss_6": 1346.4740905761719, + "learning_rate": 6.032346995169968e-06, + "loss": 1041.4939, + "step": 9510 + }, + { + "ce_loss_12": 3.191404628753662, + "ce_loss_17": 3.0147984504699705, + "ce_loss_23": 2.947719120979309, + "ce_loss_3": 3.9881349205970764, + "ce_loss_6": 3.5655474066734314, + "epoch": 0.952, + "grad_norm": 1400.0, + "kl_loss_12": 564.9931838989257, + "kl_loss_17": 141.5421936035156, + "kl_loss_3": 2245.5843994140623, + "kl_loss_6": 1398.5834228515625, + "learning_rate": 5.789112577318789e-06, + "loss": 1081.0586, + "step": 9520 + }, + { + "ce_loss_12": 3.1786576747894286, + "ce_loss_17": 3.0030580163002014, + "ce_loss_23": 2.9331778049468995, + "ce_loss_3": 3.993498134613037, + "ce_loss_6": 3.5653710842132567, + "epoch": 0.953, + "grad_norm": 1016.0, + "kl_loss_12": 576.1721008300781, + "kl_loss_17": 143.14058609008788, + "kl_loss_3": 2286.6453857421875, + "kl_loss_6": 1422.6028137207031, + "learning_rate": 5.550854857617194e-06, + "loss": 1085.1561, + "step": 9530 + }, + { + "ce_loss_12": 3.1633151888847353, + "ce_loss_17": 2.982179653644562, + "ce_loss_23": 2.909160816669464, + "ce_loss_3": 4.002788579463958, + "ce_loss_6": 3.5556951642036436, + "epoch": 0.954, + "grad_norm": 1416.0, + "kl_loss_12": 589.4462615966797, + "kl_loss_17": 146.34397735595704, + "kl_loss_3": 2331.482177734375, + "kl_loss_6": 1442.4031555175782, + "learning_rate": 5.317576235317756e-06, + "loss": 1115.7521, + "step": 9540 + }, + { + "ce_loss_12": 3.17785267829895, + "ce_loss_17": 3.010032832622528, + "ce_loss_23": 2.9419298648834227, + "ce_loss_3": 3.9465092420578003, + "ce_loss_6": 3.5357933402061463, + "epoch": 0.955, + "grad_norm": 1376.0, + "kl_loss_12": 552.7575500488281, + "kl_loss_17": 138.37635917663573, + "kl_loss_3": 2173.8107421875, + "kl_loss_6": 1348.920098876953, + "learning_rate": 5.089279059533658e-06, + "loss": 1085.1736, + "step": 9550 + }, + { + "ce_loss_12": 3.2433340787887572, + "ce_loss_17": 3.0595552921295166, + "ce_loss_23": 2.9883474588394163, + "ce_loss_3": 4.016605389118195, + "ce_loss_6": 3.606806826591492, + "epoch": 0.956, + "grad_norm": 1088.0, + "kl_loss_12": 581.2642028808593, + "kl_loss_17": 144.9947322845459, + "kl_loss_3": 2217.130999755859, + "kl_loss_6": 1397.3179748535156, + "learning_rate": 4.865965629214819e-06, + "loss": 1080.5666, + "step": 9560 + }, + { + "ce_loss_12": 3.1855320811271666, + "ce_loss_17": 3.00831778049469, + "ce_loss_23": 2.938026010990143, + "ce_loss_3": 3.987771379947662, + "ce_loss_6": 3.565508818626404, + "epoch": 0.957, + "grad_norm": 1144.0, + "kl_loss_12": 576.47158203125, + "kl_loss_17": 143.28992500305176, + "kl_loss_3": 2279.5961730957033, + "kl_loss_6": 1420.334716796875, + "learning_rate": 4.6476381931251366e-06, + "loss": 1079.9785, + "step": 9570 + }, + { + "ce_loss_12": 3.1721609950065615, + "ce_loss_17": 2.9947481989860534, + "ce_loss_23": 2.9272478461265563, + "ce_loss_3": 3.9524666905403136, + "ce_loss_6": 3.5414546608924864, + "epoch": 0.958, + "grad_norm": 996.0, + "kl_loss_12": 558.231167602539, + "kl_loss_17": 139.46002197265625, + "kl_loss_3": 2199.8872131347657, + "kl_loss_6": 1371.4146362304687, + "learning_rate": 4.434298949819449e-06, + "loss": 1077.8086, + "step": 9580 + }, + { + "ce_loss_12": 3.150679886341095, + "ce_loss_17": 2.965765857696533, + "ce_loss_23": 2.89153196811676, + "ce_loss_3": 3.97977819442749, + "ce_loss_6": 3.5495267033576967, + "epoch": 0.959, + "grad_norm": 1112.0, + "kl_loss_12": 597.0422882080078, + "kl_loss_17": 149.77814559936525, + "kl_loss_3": 2361.7612182617186, + "kl_loss_6": 1485.7872192382813, + "learning_rate": 4.2259500476214406e-06, + "loss": 1116.4518, + "step": 9590 + }, + { + "ce_loss_12": 3.119322323799133, + "ce_loss_17": 2.9391478180885313, + "ce_loss_23": 2.869364786148071, + "ce_loss_3": 3.931707274913788, + "ce_loss_6": 3.5022275924682615, + "epoch": 0.96, + "grad_norm": 1016.0, + "kl_loss_12": 571.7809692382813, + "kl_loss_17": 142.30400886535645, + "kl_loss_3": 2293.1997253417967, + "kl_loss_6": 1421.6312622070313, + "learning_rate": 4.02259358460233e-06, + "loss": 1090.1605, + "step": 9600 + }, + { + "ce_loss_12": 3.1802860736846923, + "ce_loss_17": 3.003819525241852, + "ce_loss_23": 2.9309369921684265, + "ce_loss_3": 3.971915304660797, + "ce_loss_6": 3.5549774169921875, + "epoch": 0.961, + "grad_norm": 1600.0, + "kl_loss_12": 563.6734725952149, + "kl_loss_17": 143.5174545288086, + "kl_loss_3": 2203.2087463378907, + "kl_loss_6": 1372.5609191894532, + "learning_rate": 3.8242316085594916e-06, + "loss": 1076.3301, + "step": 9610 + }, + { + "ce_loss_12": 3.0840321660041807, + "ce_loss_17": 2.896312749385834, + "ce_loss_23": 2.8244978189468384, + "ce_loss_3": 3.9476155757904055, + "ce_loss_6": 3.483164632320404, + "epoch": 0.962, + "grad_norm": 1384.0, + "kl_loss_12": 592.8782974243164, + "kl_loss_17": 145.58575363159179, + "kl_loss_3": 2403.8203125, + "kl_loss_6": 1472.9635925292969, + "learning_rate": 3.630866116995757e-06, + "loss": 1131.7562, + "step": 9620 + }, + { + "ce_loss_12": 3.210359513759613, + "ce_loss_17": 3.037873101234436, + "ce_loss_23": 2.9704142928123476, + "ce_loss_3": 3.9840270042419434, + "ce_loss_6": 3.568054986000061, + "epoch": 0.963, + "grad_norm": 1104.0, + "kl_loss_12": 557.3888397216797, + "kl_loss_17": 139.5612030029297, + "kl_loss_3": 2199.9749450683594, + "kl_loss_6": 1360.462451171875, + "learning_rate": 3.4424990570994797e-06, + "loss": 1095.2693, + "step": 9630 + }, + { + "ce_loss_12": 3.2016509532928468, + "ce_loss_17": 3.028516483306885, + "ce_loss_23": 2.956979250907898, + "ce_loss_3": 3.9896968841552733, + "ce_loss_6": 3.5759765028953554, + "epoch": 0.964, + "grad_norm": 1088.0, + "kl_loss_12": 564.2158920288086, + "kl_loss_17": 140.01034622192384, + "kl_loss_3": 2225.219024658203, + "kl_loss_6": 1392.4480834960937, + "learning_rate": 3.2591323257248896e-06, + "loss": 1083.4895, + "step": 9640 + }, + { + "ce_loss_12": 3.0668023824691772, + "ce_loss_17": 2.89301438331604, + "ce_loss_23": 2.8247917175292967, + "ce_loss_3": 3.8839977025985717, + "ce_loss_6": 3.457288372516632, + "epoch": 0.965, + "grad_norm": 1200.0, + "kl_loss_12": 569.8777862548828, + "kl_loss_17": 140.01002349853516, + "kl_loss_3": 2277.7728637695313, + "kl_loss_6": 1424.39072265625, + "learning_rate": 3.0807677693729385e-06, + "loss": 1106.5152, + "step": 9650 + }, + { + "ce_loss_12": 3.2410813212394713, + "ce_loss_17": 3.0613807320594786, + "ce_loss_23": 2.993683362007141, + "ce_loss_3": 4.020160353183746, + "ce_loss_6": 3.606537926197052, + "epoch": 0.966, + "grad_norm": 1240.0, + "kl_loss_12": 559.5683502197265, + "kl_loss_17": 139.44141578674316, + "kl_loss_3": 2204.943395996094, + "kl_loss_6": 1372.416357421875, + "learning_rate": 2.9074071841727055e-06, + "loss": 1065.3305, + "step": 9660 + }, + { + "ce_loss_12": 3.176542055606842, + "ce_loss_17": 2.996139633655548, + "ce_loss_23": 2.9270712018013, + "ce_loss_3": 3.9607030749320984, + "ce_loss_6": 3.546536076068878, + "epoch": 0.967, + "grad_norm": 1280.0, + "kl_loss_12": 571.554232788086, + "kl_loss_17": 141.04352188110352, + "kl_loss_3": 2236.0992919921873, + "kl_loss_6": 1394.805859375, + "learning_rate": 2.739052315863355e-06, + "loss": 1065.0479, + "step": 9670 + }, + { + "ce_loss_12": 3.145386826992035, + "ce_loss_17": 2.9713452219963075, + "ce_loss_23": 2.9046486973762513, + "ce_loss_3": 3.9547670125961303, + "ce_loss_6": 3.5205410599708555, + "epoch": 0.968, + "grad_norm": 1264.0, + "kl_loss_12": 567.9315902709961, + "kl_loss_17": 139.7953674316406, + "kl_loss_3": 2270.7963317871095, + "kl_loss_6": 1403.4765014648438, + "learning_rate": 2.5757048597765396e-06, + "loss": 1077.185, + "step": 9680 + }, + { + "ce_loss_12": 3.16786447763443, + "ce_loss_17": 2.985814690589905, + "ce_loss_23": 2.917538571357727, + "ce_loss_3": 3.9775120854377746, + "ce_loss_6": 3.5524555683135985, + "epoch": 0.969, + "grad_norm": 2144.0, + "kl_loss_12": 572.3357543945312, + "kl_loss_17": 140.7734790802002, + "kl_loss_3": 2259.3738037109374, + "kl_loss_6": 1412.3394409179687, + "learning_rate": 2.417366460819359e-06, + "loss": 1092.2597, + "step": 9690 + }, + { + "ce_loss_12": 3.183715283870697, + "ce_loss_17": 3.000692093372345, + "ce_loss_23": 2.9283769130706787, + "ce_loss_3": 4.005233979225158, + "ce_loss_6": 3.5692750930786135, + "epoch": 0.97, + "grad_norm": 1200.0, + "kl_loss_12": 579.0124114990234, + "kl_loss_17": 144.61399765014647, + "kl_loss_3": 2297.602227783203, + "kl_loss_6": 1426.8571472167969, + "learning_rate": 2.2640387134577057e-06, + "loss": 1087.3692, + "step": 9700 + }, + { + "ce_loss_12": 3.094398832321167, + "ce_loss_17": 2.930269551277161, + "ce_loss_23": 2.8638766765594483, + "ce_loss_3": 3.866019880771637, + "ce_loss_6": 3.454293668270111, + "epoch": 0.971, + "grad_norm": 1272.0, + "kl_loss_12": 534.4760848999024, + "kl_loss_17": 133.8003993988037, + "kl_loss_3": 2132.0041442871093, + "kl_loss_6": 1319.9703063964844, + "learning_rate": 2.115723161700278e-06, + "loss": 1054.8391, + "step": 9710 + }, + { + "ce_loss_12": 3.0973223447799683, + "ce_loss_17": 2.9092549443244935, + "ce_loss_23": 2.8401790976524355, + "ce_loss_3": 3.9219605565071105, + "ce_loss_6": 3.4790166139602663, + "epoch": 0.972, + "grad_norm": 1424.0, + "kl_loss_12": 578.4467071533203, + "kl_loss_17": 144.3120086669922, + "kl_loss_3": 2312.3845458984374, + "kl_loss_6": 1429.7915771484375, + "learning_rate": 1.9724212990830937e-06, + "loss": 1107.4664, + "step": 9720 + }, + { + "ce_loss_12": 3.218557965755463, + "ce_loss_17": 3.0418383717536925, + "ce_loss_23": 2.9717555403709413, + "ce_loss_3": 4.036358904838562, + "ce_loss_6": 3.6048622369766234, + "epoch": 0.973, + "grad_norm": 1304.0, + "kl_loss_12": 577.412744140625, + "kl_loss_17": 142.9114616394043, + "kl_loss_3": 2285.247198486328, + "kl_loss_6": 1417.2271362304687, + "learning_rate": 1.8341345686543331e-06, + "loss": 1095.3488, + "step": 9730 + }, + { + "ce_loss_12": 3.201361894607544, + "ce_loss_17": 3.0281089544296265, + "ce_loss_23": 2.9613715410232544, + "ce_loss_3": 3.9689332246780396, + "ce_loss_6": 3.564466345310211, + "epoch": 0.974, + "grad_norm": 1240.0, + "kl_loss_12": 563.000537109375, + "kl_loss_17": 139.48666038513184, + "kl_loss_3": 2177.404364013672, + "kl_loss_6": 1362.3533264160155, + "learning_rate": 1.7008643629596864e-06, + "loss": 1093.4836, + "step": 9740 + }, + { + "ce_loss_12": 3.1929385900497436, + "ce_loss_17": 3.012971067428589, + "ce_loss_23": 2.9438130855560303, + "ce_loss_3": 3.983791542053223, + "ce_loss_6": 3.551871454715729, + "epoch": 0.975, + "grad_norm": 1072.0, + "kl_loss_12": 565.6134002685546, + "kl_loss_17": 142.52141304016112, + "kl_loss_3": 2257.5871032714845, + "kl_loss_6": 1384.605078125, + "learning_rate": 1.5726120240288633e-06, + "loss": 1100.0057, + "step": 9750 + }, + { + "ce_loss_12": 3.099290895462036, + "ce_loss_17": 2.9217312216758726, + "ce_loss_23": 2.8552627086639406, + "ce_loss_3": 3.891197717189789, + "ce_loss_6": 3.471648406982422, + "epoch": 0.976, + "grad_norm": 1392.0, + "kl_loss_12": 564.18046875, + "kl_loss_17": 138.4033515930176, + "kl_loss_3": 2235.2678833007812, + "kl_loss_6": 1394.619287109375, + "learning_rate": 1.4493788433612708e-06, + "loss": 1080.4131, + "step": 9760 + }, + { + "ce_loss_12": 3.2073326230049135, + "ce_loss_17": 3.0262670040130617, + "ce_loss_23": 2.958265542984009, + "ce_loss_3": 4.009546434879303, + "ce_loss_6": 3.5924960136413575, + "epoch": 0.977, + "grad_norm": 1288.0, + "kl_loss_12": 575.0556549072265, + "kl_loss_17": 140.97935523986817, + "kl_loss_3": 2266.6252502441407, + "kl_loss_6": 1418.9163452148437, + "learning_rate": 1.3311660619138578e-06, + "loss": 1099.3467, + "step": 9770 + }, + { + "ce_loss_12": 3.2005524158477785, + "ce_loss_17": 3.027380907535553, + "ce_loss_23": 2.957142150402069, + "ce_loss_3": 3.960201692581177, + "ce_loss_6": 3.556391179561615, + "epoch": 0.978, + "grad_norm": 916.0, + "kl_loss_12": 558.7214614868165, + "kl_loss_17": 142.58913536071776, + "kl_loss_3": 2156.0397766113283, + "kl_loss_6": 1354.8435607910155, + "learning_rate": 1.2179748700879012e-06, + "loss": 1076.5953, + "step": 9780 + }, + { + "ce_loss_12": 3.138460564613342, + "ce_loss_17": 2.959139549732208, + "ce_loss_23": 2.888117957115173, + "ce_loss_3": 3.9335604310035706, + "ce_loss_6": 3.512487268447876, + "epoch": 0.979, + "grad_norm": 1192.0, + "kl_loss_12": 559.8058776855469, + "kl_loss_17": 140.0412868499756, + "kl_loss_3": 2227.7612548828124, + "kl_loss_6": 1383.373291015625, + "learning_rate": 1.1098064077174619e-06, + "loss": 1085.6109, + "step": 9790 + }, + { + "ce_loss_12": 3.1648455619812013, + "ce_loss_17": 2.9841744542121886, + "ce_loss_23": 2.9151817083358766, + "ce_loss_3": 3.995981550216675, + "ce_loss_6": 3.558866500854492, + "epoch": 0.98, + "grad_norm": 1216.0, + "kl_loss_12": 570.8790130615234, + "kl_loss_17": 140.18775787353516, + "kl_loss_3": 2304.3897338867187, + "kl_loss_6": 1428.1238586425782, + "learning_rate": 1.006661764057837e-06, + "loss": 1093.3727, + "step": 9800 + }, + { + "ce_loss_12": 3.17210830450058, + "ce_loss_17": 2.9936544418334963, + "ce_loss_23": 2.9235389471054076, + "ce_loss_3": 3.9661587357521055, + "ce_loss_6": 3.5450394988059997, + "epoch": 0.981, + "grad_norm": 1112.0, + "kl_loss_12": 565.6225311279297, + "kl_loss_17": 139.11999893188477, + "kl_loss_3": 2233.8164794921877, + "kl_loss_6": 1397.879736328125, + "learning_rate": 9.085419777743465e-07, + "loss": 1076.6973, + "step": 9810 + }, + { + "ce_loss_12": 3.123118817806244, + "ce_loss_17": 2.9490628600120545, + "ce_loss_23": 2.883944594860077, + "ce_loss_3": 3.9237585186958315, + "ce_loss_6": 3.5086126923561096, + "epoch": 0.982, + "grad_norm": 1208.0, + "kl_loss_12": 560.7953918457031, + "kl_loss_17": 136.67298736572266, + "kl_loss_3": 2240.8614501953125, + "kl_loss_6": 1406.42685546875, + "learning_rate": 8.15448036932176e-07, + "loss": 1067.3266, + "step": 9820 + }, + { + "ce_loss_12": 3.1652329325675965, + "ce_loss_17": 2.9871429324150087, + "ce_loss_23": 2.917445969581604, + "ce_loss_3": 3.9597526669502257, + "ce_loss_6": 3.540745508670807, + "epoch": 0.983, + "grad_norm": 1032.0, + "kl_loss_12": 570.1407669067382, + "kl_loss_17": 139.7938346862793, + "kl_loss_3": 2250.53916015625, + "kl_loss_6": 1410.855517578125, + "learning_rate": 7.273808789862724e-07, + "loss": 1099.9738, + "step": 9830 + }, + { + "ce_loss_12": 3.2317925453186036, + "ce_loss_17": 3.0565574049949644, + "ce_loss_23": 2.9858224153518678, + "ce_loss_3": 4.018745148181916, + "ce_loss_6": 3.601056897640228, + "epoch": 0.984, + "grad_norm": 1288.0, + "kl_loss_12": 571.2086029052734, + "kl_loss_17": 141.71254806518556, + "kl_loss_3": 2230.4893310546877, + "kl_loss_6": 1392.4493469238282, + "learning_rate": 6.443413907720186e-07, + "loss": 1079.042, + "step": 9840 + }, + { + "ce_loss_12": 3.1713022589683533, + "ce_loss_17": 2.998173379898071, + "ce_loss_23": 2.929114842414856, + "ce_loss_3": 3.9736075520515444, + "ce_loss_6": 3.5422671914100645, + "epoch": 0.985, + "grad_norm": 1120.0, + "kl_loss_12": 560.0438995361328, + "kl_loss_17": 139.74670486450196, + "kl_loss_3": 2217.0209899902343, + "kl_loss_6": 1372.0538940429688, + "learning_rate": 5.663304084960185e-07, + "loss": 1075.6818, + "step": 9850 + }, + { + "ce_loss_12": 3.1089388132095337, + "ce_loss_17": 2.9285882234573366, + "ce_loss_23": 2.860867071151733, + "ce_loss_3": 3.9209674477577208, + "ce_loss_6": 3.4886171579360963, + "epoch": 0.986, + "grad_norm": 1160.0, + "kl_loss_12": 573.1731323242187, + "kl_loss_17": 141.9664836883545, + "kl_loss_3": 2278.8267578125, + "kl_loss_6": 1418.0163940429688, + "learning_rate": 4.933487177280482e-07, + "loss": 1074.8143, + "step": 9860 + }, + { + "ce_loss_12": 3.1958683490753175, + "ce_loss_17": 3.0272325396537783, + "ce_loss_23": 2.9600538849830627, + "ce_loss_3": 3.9845810890197755, + "ce_loss_6": 3.563033866882324, + "epoch": 0.987, + "grad_norm": 1472.0, + "kl_loss_12": 554.428938293457, + "kl_loss_17": 136.6791862487793, + "kl_loss_3": 2213.4049560546873, + "kl_loss_6": 1366.6476318359375, + "learning_rate": 4.2539705339295075e-07, + "loss": 1068.2839, + "step": 9870 + }, + { + "ce_loss_12": 3.069399726390839, + "ce_loss_17": 2.8871357917785643, + "ce_loss_23": 2.819878804683685, + "ce_loss_3": 3.8779906034469604, + "ce_loss_6": 3.451492929458618, + "epoch": 0.988, + "grad_norm": 1768.0, + "kl_loss_12": 573.8369171142579, + "kl_loss_17": 140.35310707092285, + "kl_loss_3": 2272.8044921875, + "kl_loss_6": 1411.7773986816405, + "learning_rate": 3.6247609976319816e-07, + "loss": 1079.8621, + "step": 9880 + }, + { + "ce_loss_12": 3.1557024359703063, + "ce_loss_17": 2.9749324560165404, + "ce_loss_23": 2.9044813990592955, + "ce_loss_3": 3.975613594055176, + "ce_loss_6": 3.542822539806366, + "epoch": 0.989, + "grad_norm": 1392.0, + "kl_loss_12": 576.6555145263671, + "kl_loss_17": 142.19445838928223, + "kl_loss_3": 2278.8561584472654, + "kl_loss_6": 1421.6364135742188, + "learning_rate": 3.0458649045211895e-07, + "loss": 1116.6603, + "step": 9890 + }, + { + "ce_loss_12": 3.132976603507996, + "ce_loss_17": 2.9434738397598266, + "ce_loss_23": 2.8695745706558227, + "ce_loss_3": 3.9394386887550352, + "ce_loss_6": 3.5215404748916628, + "epoch": 0.99, + "grad_norm": 1264.0, + "kl_loss_12": 583.2439514160156, + "kl_loss_17": 145.68852615356445, + "kl_loss_3": 2269.205242919922, + "kl_loss_6": 1432.1942932128907, + "learning_rate": 2.517288084074587e-07, + "loss": 1110.5715, + "step": 9900 + }, + { + "ce_loss_12": 3.1796838283538817, + "ce_loss_17": 2.9893929719924928, + "ce_loss_23": 2.914987099170685, + "ce_loss_3": 4.009390783309937, + "ce_loss_6": 3.5760415315628054, + "epoch": 0.991, + "grad_norm": 1168.0, + "kl_loss_12": 594.3054718017578, + "kl_loss_17": 146.74233474731446, + "kl_loss_3": 2322.8610900878907, + "kl_loss_6": 1455.1484313964843, + "learning_rate": 2.0390358590538505e-07, + "loss": 1105.3267, + "step": 9910 + }, + { + "ce_loss_12": 3.16956467628479, + "ce_loss_17": 2.9881795763969423, + "ce_loss_23": 2.9189654111862184, + "ce_loss_3": 3.9762886643409727, + "ce_loss_6": 3.549886417388916, + "epoch": 0.992, + "grad_norm": 984.0, + "kl_loss_12": 580.1301849365234, + "kl_loss_17": 143.14102058410646, + "kl_loss_3": 2271.9272216796876, + "kl_loss_6": 1423.6721069335938, + "learning_rate": 1.61111304545436e-07, + "loss": 1087.4358, + "step": 9920 + }, + { + "ce_loss_12": 3.1379828572273256, + "ce_loss_17": 2.961352360248566, + "ce_loss_23": 2.893160116672516, + "ce_loss_3": 3.9378134489059446, + "ce_loss_6": 3.513296627998352, + "epoch": 0.993, + "grad_norm": 1120.0, + "kl_loss_12": 568.2974365234375, + "kl_loss_17": 139.27664604187012, + "kl_loss_3": 2249.902557373047, + "kl_loss_6": 1397.9799865722657, + "learning_rate": 1.2335239524541298e-07, + "loss": 1074.8127, + "step": 9930 + }, + { + "ce_loss_12": 3.1109976887702944, + "ce_loss_17": 2.9319659233093263, + "ce_loss_23": 2.8629417061805724, + "ce_loss_3": 3.9133081912994383, + "ce_loss_6": 3.487721061706543, + "epoch": 0.994, + "grad_norm": 1136.0, + "kl_loss_12": 564.0472381591796, + "kl_loss_17": 140.5075912475586, + "kl_loss_3": 2240.4443786621096, + "kl_loss_6": 1389.5944091796875, + "learning_rate": 9.06272382371065e-08, + "loss": 1088.2604, + "step": 9940 + }, + { + "ce_loss_12": 3.179794526100159, + "ce_loss_17": 2.996285843849182, + "ce_loss_23": 2.9301026582717897, + "ce_loss_3": 3.989123558998108, + "ce_loss_6": 3.562809419631958, + "epoch": 0.995, + "grad_norm": 960.0, + "kl_loss_12": 582.4565826416016, + "kl_loss_17": 141.76634674072267, + "kl_loss_3": 2293.5713134765624, + "kl_loss_6": 1433.8289489746094, + "learning_rate": 6.293616306246586e-08, + "loss": 1097.1456, + "step": 9950 + }, + { + "ce_loss_12": 3.1582908034324646, + "ce_loss_17": 2.985929048061371, + "ce_loss_23": 2.920081615447998, + "ce_loss_3": 3.9357052087783813, + "ce_loss_6": 3.51796350479126, + "epoch": 0.996, + "grad_norm": 1376.0, + "kl_loss_12": 552.5988143920898, + "kl_loss_17": 137.1218578338623, + "kl_loss_3": 2184.5366088867186, + "kl_loss_6": 1357.4372802734374, + "learning_rate": 4.027944857032395e-08, + "loss": 1050.7697, + "step": 9960 + }, + { + "ce_loss_12": 3.1540108561515807, + "ce_loss_17": 2.9885053396224976, + "ce_loss_23": 2.9229701161384583, + "ce_loss_3": 3.9114290356636046, + "ce_loss_6": 3.5061278700828553, + "epoch": 0.997, + "grad_norm": 1352.0, + "kl_loss_12": 536.3432586669921, + "kl_loss_17": 133.10860023498535, + "kl_loss_3": 2136.299206542969, + "kl_loss_6": 1313.874041748047, + "learning_rate": 2.265732291356626e-08, + "loss": 1041.3225, + "step": 9970 + }, + { + "ce_loss_12": 3.2016385912895204, + "ce_loss_17": 3.028936493396759, + "ce_loss_23": 2.960548794269562, + "ce_loss_3": 3.9737475991249083, + "ce_loss_6": 3.5642799854278566, + "epoch": 0.998, + "grad_norm": 1232.0, + "kl_loss_12": 558.7162109375, + "kl_loss_17": 138.92991981506347, + "kl_loss_3": 2174.8899475097655, + "kl_loss_6": 1359.594744873047, + "learning_rate": 1.0069963546743833e-08, + "loss": 1084.9953, + "step": 9980 + }, + { + "ce_loss_12": 3.18752703666687, + "ce_loss_17": 3.0042120099067686, + "ce_loss_23": 2.933737003803253, + "ce_loss_3": 3.983630645275116, + "ce_loss_6": 3.556314158439636, + "epoch": 0.999, + "grad_norm": 1608.0, + "kl_loss_12": 573.6374496459961, + "kl_loss_17": 141.28692550659179, + "kl_loss_3": 2248.137664794922, + "kl_loss_6": 1396.3108642578125, + "learning_rate": 2.517497224463483e-09, + "loss": 1080.3868, + "step": 9990 + }, + { + "ce_loss_12": 3.1513269662857057, + "ce_loss_17": 2.9621216654777527, + "ce_loss_23": 2.8894708275794985, + "ce_loss_3": 3.9925246834754944, + "ce_loss_6": 3.5433329701423646, + "epoch": 1.0, + "grad_norm": 2096.0, + "kl_loss_12": 586.7055053710938, + "kl_loss_17": 144.41668090820312, + "kl_loss_3": 2356.4806884765626, + "kl_loss_6": 1458.015887451172, + "learning_rate": 0.0, + "loss": 1117.4484, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.502582338838856e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}