{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_10": 5.479339599609375, "ce_loss_13": 3.4827667474746704, "ce_loss_2": 13.979248523712158, "ce_loss_3": 13.771953105926514, "ce_loss_7": 7.430000305175781, "epoch": 0.0001, "grad_norm": 81408.0, "kl_loss_10": 4489.56494140625, "kl_loss_2": 22049.2119140625, "kl_loss_3": 21566.693359375, "kl_loss_7": 7499.004150390625, "learning_rate": 1e-05, "loss": 14123.4883, "step": 1 }, { "ce_loss_10": 5.119714260101318, "ce_loss_13": 3.53999932607015, "ce_loss_2": 11.240631209479439, "ce_loss_3": 10.95585854848226, "ce_loss_7": 6.906492206785414, "epoch": 0.001, "grad_norm": 37888.0, "kl_loss_10": 3271.7383083767363, "kl_loss_2": 15601.7939453125, "kl_loss_3": 14746.902018229166, "kl_loss_7": 6279.269124348958, "learning_rate": 0.0001, "loss": 9990.5972, "step": 10 }, { "ce_loss_10": 4.4336272239685055, "ce_loss_13": 3.5471752166748045, "ce_loss_2": 7.957242059707641, "ce_loss_3": 7.463982796669006, "ce_loss_7": 5.84502854347229, "epoch": 0.002, "grad_norm": 9216.0, "kl_loss_10": 1664.693670654297, "kl_loss_2": 8140.2906494140625, "kl_loss_3": 7229.052661132812, "kl_loss_7": 4268.229284667968, "learning_rate": 0.0002, "loss": 5408.8828, "step": 20 }, { "ce_loss_10": 3.938058543205261, "ce_loss_13": 3.333306384086609, "ce_loss_2": 6.773653674125671, "ce_loss_3": 6.433317041397094, "ce_loss_7": 5.134593844413757, "epoch": 0.003, "grad_norm": 3248.0, "kl_loss_10": 1150.4377380371093, "kl_loss_2": 6385.490258789063, "kl_loss_3": 5771.109155273438, "kl_loss_7": 3370.8108642578127, "learning_rate": 0.0003, "loss": 4090.45, "step": 30 }, { "ce_loss_10": 3.9883425116539, "ce_loss_13": 3.5081888794898988, "ce_loss_2": 6.318326234817505, "ce_loss_3": 6.045081973075867, "ce_loss_7": 4.969816541671753, "epoch": 0.004, "grad_norm": 6240.0, "kl_loss_10": 955.3555145263672, "kl_loss_2": 5198.101293945312, "kl_loss_3": 4741.139855957032, "kl_loss_7": 2762.6712036132812, "learning_rate": 0.0004, "loss": 3452.1297, "step": 40 }, { "ce_loss_10": 3.94427330493927, "ce_loss_13": 3.4815958857536318, "ce_loss_2": 6.088573956489563, "ce_loss_3": 5.815527606010437, "ce_loss_7": 4.780157661437988, "epoch": 0.005, "grad_norm": 4768.0, "kl_loss_10": 882.2040557861328, "kl_loss_2": 4868.76328125, "kl_loss_3": 4416.1666259765625, "kl_loss_7": 2467.264501953125, "learning_rate": 0.0005, "loss": 3156.5504, "step": 50 }, { "ce_loss_10": 3.877071762084961, "ce_loss_13": 3.493886411190033, "ce_loss_2": 5.86357319355011, "ce_loss_3": 5.616779232025147, "ce_loss_7": 4.6486598491668705, "epoch": 0.006, "grad_norm": 4768.0, "kl_loss_10": 781.2706634521485, "kl_loss_2": 4452.719311523438, "kl_loss_3": 4034.9894165039063, "kl_loss_7": 2237.3867736816405, "learning_rate": 0.0006, "loss": 2874.2984, "step": 60 }, { "ce_loss_10": 3.7734105229377746, "ce_loss_13": 3.408212423324585, "ce_loss_2": 5.7149782419204715, "ce_loss_3": 5.489736318588257, "ce_loss_7": 4.5027463555336, "epoch": 0.007, "grad_norm": 2896.0, "kl_loss_10": 747.7415161132812, "kl_loss_2": 4370.160473632813, "kl_loss_3": 3989.598352050781, "kl_loss_7": 2142.413153076172, "learning_rate": 0.0007, "loss": 2776.958, "step": 70 }, { "ce_loss_10": 3.7659215092658997, "ce_loss_13": 3.409997522830963, "ce_loss_2": 5.646710276603699, "ce_loss_3": 5.404528284072876, "ce_loss_7": 4.458205795288086, "epoch": 0.008, "grad_norm": 2256.0, "kl_loss_10": 717.0317596435547, "kl_loss_2": 4256.561865234375, "kl_loss_3": 3846.635607910156, "kl_loss_7": 2051.1767517089843, "learning_rate": 0.0008, "loss": 2710.7359, "step": 80 }, { "ce_loss_10": 3.6950827717781065, "ce_loss_13": 3.3681079149246216, "ce_loss_2": 5.57774977684021, "ce_loss_3": 5.358799338340759, "ce_loss_7": 4.384551775455475, "epoch": 0.009, "grad_norm": 2784.0, "kl_loss_10": 674.085107421875, "kl_loss_2": 4234.844958496094, "kl_loss_3": 3849.5977783203125, "kl_loss_7": 2047.007257080078, "learning_rate": 0.0009000000000000001, "loss": 2670.6148, "step": 90 }, { "ce_loss_10": 3.831665110588074, "ce_loss_13": 3.493525803089142, "ce_loss_2": 5.601197052001953, "ce_loss_3": 5.398815608024597, "ce_loss_7": 4.526931476593018, "epoch": 0.01, "grad_norm": 2752.0, "kl_loss_10": 672.117709350586, "kl_loss_2": 4021.284912109375, "kl_loss_3": 3663.468957519531, "kl_loss_7": 2026.185284423828, "learning_rate": 0.001, "loss": 2598.8504, "step": 100 }, { "ce_loss_10": 3.7488776206970216, "ce_loss_13": 3.4458404183387756, "ce_loss_2": 5.519205498695373, "ce_loss_3": 5.296326518058777, "ce_loss_7": 4.427326142787933, "epoch": 0.011, "grad_norm": 1728.0, "kl_loss_10": 619.106655883789, "kl_loss_2": 3960.856494140625, "kl_loss_3": 3572.990026855469, "kl_loss_7": 1934.6764282226563, "learning_rate": 0.0009999974825027757, "loss": 2513.0074, "step": 110 }, { "ce_loss_10": 3.805395770072937, "ce_loss_13": 3.503445029258728, "ce_loss_2": 5.498133254051209, "ce_loss_3": 5.255028605461121, "ce_loss_7": 4.415169513225555, "epoch": 0.012, "grad_norm": 2040.0, "kl_loss_10": 603.263638305664, "kl_loss_2": 3843.529895019531, "kl_loss_3": 3407.914306640625, "kl_loss_7": 1813.6988830566406, "learning_rate": 0.0009999899300364532, "loss": 2390.1301, "step": 120 }, { "ce_loss_10": 3.7707916855812074, "ce_loss_13": 3.475922393798828, "ce_loss_2": 5.500737500190735, "ce_loss_3": 5.266239738464355, "ce_loss_7": 4.383851003646851, "epoch": 0.013, "grad_norm": 2624.0, "kl_loss_10": 587.465219116211, "kl_loss_2": 3864.1528442382814, "kl_loss_3": 3434.2073608398437, "kl_loss_7": 1783.62373046875, "learning_rate": 0.0009999773426770863, "loss": 2449.8629, "step": 130 }, { "ce_loss_10": 3.867912781238556, "ce_loss_13": 3.509277641773224, "ce_loss_2": 5.469627714157104, "ce_loss_3": 5.194933176040649, "ce_loss_7": 4.379399788379669, "epoch": 0.014, "grad_norm": 1992.0, "kl_loss_10": 725.8971160888672, "kl_loss_2": 3778.5274291992187, "kl_loss_3": 3296.354528808594, "kl_loss_7": 1732.4250732421874, "learning_rate": 0.0009999597205514296, "loss": 2405.5832, "step": 140 }, { "ce_loss_10": 3.77968590259552, "ce_loss_13": 3.471135640144348, "ce_loss_2": 5.36739604473114, "ce_loss_3": 5.106346774101257, "ce_loss_7": 4.320439124107361, "epoch": 0.015, "grad_norm": 1360.0, "kl_loss_10": 624.2580291748047, "kl_loss_2": 3632.6698486328123, "kl_loss_3": 3176.562145996094, "kl_loss_7": 1695.959698486328, "learning_rate": 0.0009999370638369377, "loss": 2293.6836, "step": 150 }, { "ce_loss_10": 3.8028363585472107, "ce_loss_13": 3.5099030256271364, "ce_loss_2": 5.394668865203857, "ce_loss_3": 5.231132960319519, "ce_loss_7": 4.338041806221009, "epoch": 0.016, "grad_norm": 3296.0, "kl_loss_10": 591.1725463867188, "kl_loss_2": 3644.2818969726563, "kl_loss_3": 3364.4876342773437, "kl_loss_7": 1640.8858154296875, "learning_rate": 0.000999909372761763, "loss": 2313.8473, "step": 160 }, { "ce_loss_10": 3.715697240829468, "ce_loss_13": 3.4447871685028075, "ce_loss_2": 5.341588139533997, "ce_loss_3": 5.263697862625122, "ce_loss_7": 4.2574918389320375, "epoch": 0.017, "grad_norm": 3008.0, "kl_loss_10": 555.6443099975586, "kl_loss_2": 3670.3303466796874, "kl_loss_3": 3553.7742431640627, "kl_loss_7": 1629.572772216797, "learning_rate": 0.0009998766476047546, "loss": 2372.3059, "step": 170 }, { "ce_loss_10": 3.7622690200805664, "ce_loss_13": 3.4889180302619933, "ce_loss_2": 5.369840741157532, "ce_loss_3": 5.276954698562622, "ce_loss_7": 4.275516867637634, "epoch": 0.018, "grad_norm": 2040.0, "kl_loss_10": 565.5127258300781, "kl_loss_2": 3642.269982910156, "kl_loss_3": 3495.3918212890626, "kl_loss_7": 1571.4158569335937, "learning_rate": 0.0009998388886954545, "loss": 2349.4688, "step": 180 }, { "ce_loss_10": 3.712801456451416, "ce_loss_13": 3.4555400371551515, "ce_loss_2": 5.312930059432984, "ce_loss_3": 5.148007488250732, "ce_loss_7": 4.23377754688263, "epoch": 0.019, "grad_norm": 1328.0, "kl_loss_10": 534.2748489379883, "kl_loss_2": 3599.4434326171877, "kl_loss_3": 3312.2544921875, "kl_loss_7": 1559.4685668945312, "learning_rate": 0.0009997960964140947, "loss": 2241.091, "step": 190 }, { "ce_loss_10": 3.6890772104263307, "ce_loss_13": 3.4474449634552, "ce_loss_2": 5.328355288505554, "ce_loss_3": 5.103678369522095, "ce_loss_7": 4.223123550415039, "epoch": 0.02, "grad_norm": 1584.0, "kl_loss_10": 498.3182800292969, "kl_loss_2": 3626.2685302734376, "kl_loss_3": 3230.7111083984373, "kl_loss_7": 1544.03125, "learning_rate": 0.0009997482711915926, "loss": 2212.8523, "step": 200 }, { "ce_loss_10": 3.643280267715454, "ce_loss_13": 3.4110453128814697, "ce_loss_2": 5.262782073020935, "ce_loss_3": 5.006648206710816, "ce_loss_7": 4.161720204353332, "epoch": 0.021, "grad_norm": 1320.0, "kl_loss_10": 468.05088500976564, "kl_loss_2": 3600.3508911132812, "kl_loss_3": 3146.037072753906, "kl_loss_7": 1514.3593139648438, "learning_rate": 0.0009996954135095479, "loss": 2163.3328, "step": 210 }, { "ce_loss_10": 3.743840980529785, "ce_loss_13": 3.495615518093109, "ce_loss_2": 5.276893544197082, "ce_loss_3": 5.026828193664551, "ce_loss_7": 4.215770494937897, "epoch": 0.022, "grad_norm": 952.0, "kl_loss_10": 494.9872482299805, "kl_loss_2": 3434.308557128906, "kl_loss_3": 2996.470593261719, "kl_loss_7": 1447.3476196289062, "learning_rate": 0.0009996375239002368, "loss": 2094.248, "step": 220 }, { "ce_loss_10": 3.8117304921150206, "ce_loss_13": 3.5717169761657717, "ce_loss_2": 5.300674176216125, "ce_loss_3": 5.045718550682068, "ce_loss_7": 4.271833729743958, "epoch": 0.023, "grad_norm": 1064.0, "kl_loss_10": 491.1131820678711, "kl_loss_2": 3352.0796875, "kl_loss_3": 2909.836950683594, "kl_loss_7": 1405.5986450195312, "learning_rate": 0.0009995746029466072, "loss": 2050.6086, "step": 230 }, { "ce_loss_10": 3.6075421810150146, "ce_loss_13": 3.3550766468048097, "ce_loss_2": 5.39588577747345, "ce_loss_3": 4.985904622077942, "ce_loss_7": 4.14452086687088, "epoch": 0.024, "grad_norm": 1496.0, "kl_loss_10": 521.684194946289, "kl_loss_2": 3944.3539916992186, "kl_loss_3": 3201.487194824219, "kl_loss_7": 1583.0420288085938, "learning_rate": 0.0009995066512822719, "loss": 2234.7746, "step": 240 }, { "ce_loss_10": 3.6849907636642456, "ce_loss_13": 3.461445081233978, "ce_loss_2": 5.414009666442871, "ce_loss_3": 5.085514402389526, "ce_loss_7": 4.184376835823059, "epoch": 0.025, "grad_norm": 1800.0, "kl_loss_10": 465.4432067871094, "kl_loss_2": 3782.4762451171873, "kl_loss_3": 3206.6178466796873, "kl_loss_7": 1450.9975891113281, "learning_rate": 0.000999433669591504, "loss": 2142.3535, "step": 250 }, { "ce_loss_10": 3.6025625109672545, "ce_loss_13": 3.360257649421692, "ce_loss_2": 5.237245011329651, "ce_loss_3": 4.9437507629394535, "ce_loss_7": 4.080421531200409, "epoch": 0.026, "grad_norm": 1408.0, "kl_loss_10": 503.2585876464844, "kl_loss_2": 3655.8213134765624, "kl_loss_3": 3140.7313842773438, "kl_loss_7": 1460.5682739257813, "learning_rate": 0.000999355658609228, "loss": 2133.6004, "step": 260 }, { "ce_loss_10": 3.6813029885292052, "ce_loss_13": 3.395027530193329, "ce_loss_2": 5.295657467842102, "ce_loss_3": 5.023426985740661, "ce_loss_7": 4.133508098125458, "epoch": 0.027, "grad_norm": 1416.0, "kl_loss_10": 572.3903137207031, "kl_loss_2": 3669.314978027344, "kl_loss_3": 3183.432019042969, "kl_loss_7": 1464.0882202148437, "learning_rate": 0.0009992726191210138, "loss": 2179.2967, "step": 270 }, { "ce_loss_10": 3.696367251873016, "ce_loss_13": 3.433962404727936, "ce_loss_2": 5.21666829586029, "ce_loss_3": 4.99695348739624, "ce_loss_7": 4.169408094882965, "epoch": 0.028, "grad_norm": 1880.0, "kl_loss_10": 529.3393615722656, "kl_loss_2": 3457.8086547851562, "kl_loss_3": 3089.980187988281, "kl_loss_7": 1482.3380798339845, "learning_rate": 0.0009991845519630679, "loss": 2115.8172, "step": 280 }, { "ce_loss_10": 3.556672739982605, "ce_loss_13": 3.3172685623168947, "ce_loss_2": 5.112358474731446, "ce_loss_3": 4.917420530319214, "ce_loss_7": 4.036571848392486, "epoch": 0.029, "grad_norm": 2000.0, "kl_loss_10": 477.5372833251953, "kl_loss_2": 3475.2529418945314, "kl_loss_3": 3146.291943359375, "kl_loss_7": 1444.9558898925782, "learning_rate": 0.0009990914580222257, "loss": 2130.9104, "step": 290 }, { "ce_loss_10": 3.6650490760803223, "ce_loss_13": 3.455529069900513, "ce_loss_2": 5.149494194984436, "ce_loss_3": 4.940038657188415, "ce_loss_7": 4.130682170391083, "epoch": 0.03, "grad_norm": 1560.0, "kl_loss_10": 441.9183044433594, "kl_loss_2": 3299.1113159179686, "kl_loss_3": 2933.9962768554688, "kl_loss_7": 1369.8260986328125, "learning_rate": 0.0009989933382359422, "loss": 2069.7893, "step": 300 }, { "ce_loss_10": 3.6985942125320435, "ce_loss_13": 3.465806806087494, "ce_loss_2": 5.143499898910522, "ce_loss_3": 4.909449362754822, "ce_loss_7": 4.1382394433021545, "epoch": 0.031, "grad_norm": 1120.0, "kl_loss_10": 486.39428558349607, "kl_loss_2": 3301.6126342773437, "kl_loss_3": 2880.7590454101564, "kl_loss_7": 1392.4209594726562, "learning_rate": 0.0009988901935922825, "loss": 2022.2506, "step": 310 }, { "ce_loss_10": 3.544218695163727, "ce_loss_13": 3.314011883735657, "ce_loss_2": 5.10150101184845, "ce_loss_3": 4.842008900642395, "ce_loss_7": 4.024464392662049, "epoch": 0.032, "grad_norm": 1472.0, "kl_loss_10": 469.5253311157227, "kl_loss_2": 3487.786022949219, "kl_loss_3": 3028.9456176757812, "kl_loss_7": 1444.1999450683593, "learning_rate": 0.0009987820251299122, "loss": 2047.4186, "step": 320 }, { "ce_loss_10": 3.67177551984787, "ce_loss_13": 3.4466672420501707, "ce_loss_2": 5.135675239562988, "ce_loss_3": 4.8595945835113525, "ce_loss_7": 4.121568500995636, "epoch": 0.033, "grad_norm": 1012.0, "kl_loss_10": 450.39354553222654, "kl_loss_2": 3306.36376953125, "kl_loss_3": 2827.274365234375, "kl_loss_7": 1387.9264587402345, "learning_rate": 0.0009986688339380862, "loss": 1975.759, "step": 330 }, { "ce_loss_10": 3.6029638409614564, "ce_loss_13": 3.397365379333496, "ce_loss_2": 5.057221698760986, "ce_loss_3": 4.7936498641967775, "ce_loss_7": 4.015895903110504, "epoch": 0.034, "grad_norm": 964.0, "kl_loss_10": 436.06551971435545, "kl_loss_2": 3221.0221435546873, "kl_loss_3": 2750.7028198242188, "kl_loss_7": 1266.2399963378907, "learning_rate": 0.0009985506211566387, "loss": 1936.1705, "step": 340 }, { "ce_loss_10": 3.6370130658149717, "ce_loss_13": 3.4315125226974486, "ce_loss_2": 5.061046314239502, "ce_loss_3": 4.7848950862884525, "ce_loss_7": 4.02301949262619, "epoch": 0.035, "grad_norm": 908.0, "kl_loss_10": 422.44232482910155, "kl_loss_2": 3177.237683105469, "kl_loss_3": 2690.365087890625, "kl_loss_7": 1217.3221435546875, "learning_rate": 0.0009984273879759713, "loss": 1896.4475, "step": 350 }, { "ce_loss_10": 3.656745362281799, "ce_loss_13": 3.4570682406425477, "ce_loss_2": 5.137815022468567, "ce_loss_3": 4.873619461059571, "ce_loss_7": 4.083463799953461, "epoch": 0.036, "grad_norm": 860.0, "kl_loss_10": 423.36531524658204, "kl_loss_2": 3273.7410522460937, "kl_loss_3": 2798.973498535156, "kl_loss_7": 1268.806689453125, "learning_rate": 0.0009982991356370402, "loss": 1973.0059, "step": 360 }, { "ce_loss_10": 3.631060302257538, "ce_loss_13": 3.4341819286346436, "ce_loss_2": 5.11769585609436, "ce_loss_3": 4.834220147132873, "ce_loss_7": 4.047011601924896, "epoch": 0.037, "grad_norm": 908.0, "kl_loss_10": 402.81113891601564, "kl_loss_2": 3276.052880859375, "kl_loss_3": 2791.4117065429687, "kl_loss_7": 1245.3732849121093, "learning_rate": 0.0009981658654313456, "loss": 1941.0666, "step": 370 }, { "ce_loss_10": 3.7020971179008484, "ce_loss_13": 3.5137467861175535, "ce_loss_2": 5.156114864349365, "ce_loss_3": 4.874492716789246, "ce_loss_7": 4.092896187305451, "epoch": 0.038, "grad_norm": 744.0, "kl_loss_10": 382.19567413330077, "kl_loss_2": 3216.3584228515624, "kl_loss_3": 2713.040576171875, "kl_loss_7": 1200.0062133789063, "learning_rate": 0.000998027578700917, "loss": 1916.7457, "step": 380 }, { "ce_loss_10": 3.629340207576752, "ce_loss_13": 3.4466560959815977, "ce_loss_2": 5.104858756065369, "ce_loss_3": 4.827202153205872, "ce_loss_7": 4.051906526088715, "epoch": 0.039, "grad_norm": 768.0, "kl_loss_10": 387.5618530273438, "kl_loss_2": 3239.9035766601564, "kl_loss_3": 2754.5089477539063, "kl_loss_7": 1245.49443359375, "learning_rate": 0.0009978842768382998, "loss": 1919.6182, "step": 390 }, { "ce_loss_10": 3.6458646416664124, "ce_loss_13": 3.4677427411079407, "ce_loss_2": 5.076069569587707, "ce_loss_3": 4.798897671699524, "ce_loss_7": 4.036490082740784, "epoch": 0.04, "grad_norm": 820.0, "kl_loss_10": 365.37960052490234, "kl_loss_2": 3139.388073730469, "kl_loss_3": 2645.03125, "kl_loss_7": 1170.367169189453, "learning_rate": 0.0009977359612865424, "loss": 1848.2086, "step": 400 }, { "ce_loss_10": 3.6510703682899477, "ce_loss_13": 3.472544801235199, "ce_loss_2": 5.0927152872085575, "ce_loss_3": 4.818557095527649, "ce_loss_7": 4.048879408836365, "epoch": 0.041, "grad_norm": 752.0, "kl_loss_10": 376.4607299804687, "kl_loss_2": 3183.3623168945314, "kl_loss_3": 2696.904638671875, "kl_loss_7": 1198.10205078125, "learning_rate": 0.0009975826335391806, "loss": 1850.6066, "step": 410 }, { "ce_loss_10": 3.664944088459015, "ce_loss_13": 3.4915570259094237, "ce_loss_2": 5.092843031883239, "ce_loss_3": 4.81715497970581, "ce_loss_7": 4.06898148059845, "epoch": 0.042, "grad_norm": 1072.0, "kl_loss_10": 367.5922546386719, "kl_loss_2": 3121.2123901367186, "kl_loss_3": 2637.7326782226564, "kl_loss_7": 1178.5750122070312, "learning_rate": 0.0009974242951402235, "loss": 1847.4906, "step": 420 }, { "ce_loss_10": 3.6901652693748472, "ce_loss_13": 3.5015287518501284, "ce_loss_2": 5.113956260681152, "ce_loss_3": 4.831623649597168, "ce_loss_7": 4.073742997646332, "epoch": 0.043, "grad_norm": 932.0, "kl_loss_10": 391.2719299316406, "kl_loss_2": 3171.550817871094, "kl_loss_3": 2672.8351318359373, "kl_loss_7": 1198.8372039794922, "learning_rate": 0.0009972609476841367, "loss": 1839.4168, "step": 430 }, { "ce_loss_10": 3.592795264720917, "ce_loss_13": 3.407615542411804, "ce_loss_2": 5.048645877838135, "ce_loss_3": 4.779121279716492, "ce_loss_7": 3.982761597633362, "epoch": 0.044, "grad_norm": 928.0, "kl_loss_10": 377.92359313964846, "kl_loss_2": 3195.713342285156, "kl_loss_3": 2713.7881591796877, "kl_loss_7": 1185.5625, "learning_rate": 0.0009970925928158272, "loss": 1868.092, "step": 440 }, { "ce_loss_10": 3.542843294143677, "ce_loss_13": 3.354374420642853, "ce_loss_2": 5.013250637054443, "ce_loss_3": 4.739123964309693, "ce_loss_7": 3.935924601554871, "epoch": 0.045, "grad_norm": 740.0, "kl_loss_10": 385.2865692138672, "kl_loss_2": 3278.8071044921876, "kl_loss_3": 2790.4721435546876, "kl_loss_7": 1226.6742797851562, "learning_rate": 0.000996919232230627, "loss": 1885.8758, "step": 450 }, { "ce_loss_10": 3.609917199611664, "ce_loss_13": 3.4386712551116942, "ce_loss_2": 5.020998239517212, "ce_loss_3": 4.756829810142517, "ce_loss_7": 4.001234555244446, "epoch": 0.046, "grad_norm": 872.0, "kl_loss_10": 358.4470748901367, "kl_loss_2": 3100.1795166015627, "kl_loss_3": 2620.8273803710936, "kl_loss_7": 1157.8196044921874, "learning_rate": 0.0009967408676742752, "loss": 1772.8766, "step": 460 }, { "ce_loss_10": 3.7562451124191285, "ce_loss_13": 3.5811493396759033, "ce_loss_2": 5.11839497089386, "ce_loss_3": 4.844864320755005, "ce_loss_7": 4.1195793628692625, "epoch": 0.047, "grad_norm": 968.0, "kl_loss_10": 364.69328155517576, "kl_loss_2": 3032.6340087890626, "kl_loss_3": 2548.6266967773436, "kl_loss_7": 1130.8773040771484, "learning_rate": 0.0009965575009429006, "loss": 1825.8629, "step": 470 }, { "ce_loss_10": 3.542626643180847, "ce_loss_13": 3.364771544933319, "ce_loss_2": 4.9806403636932375, "ce_loss_3": 4.703183531761169, "ce_loss_7": 3.9297071576118467, "epoch": 0.048, "grad_norm": 772.0, "kl_loss_10": 368.38177795410155, "kl_loss_2": 3172.022900390625, "kl_loss_3": 2678.579626464844, "kl_loss_7": 1172.0243133544923, "learning_rate": 0.0009963691338830043, "loss": 1818.5924, "step": 480 }, { "ce_loss_10": 3.6282991647720335, "ce_loss_13": 3.4611623764038084, "ce_loss_2": 5.030923771858215, "ce_loss_3": 4.765255475044251, "ce_loss_7": 3.995365762710571, "epoch": 0.049, "grad_norm": 944.0, "kl_loss_10": 346.68406372070314, "kl_loss_2": 3111.6420288085938, "kl_loss_3": 2633.9958740234374, "kl_loss_7": 1125.4209197998048, "learning_rate": 0.0009961757683914405, "loss": 1782.6619, "step": 490 }, { "ce_loss_10": 3.6188631772994997, "ce_loss_13": 3.450295829772949, "ce_loss_2": 4.988259315490723, "ce_loss_3": 4.726764726638794, "ce_loss_7": 4.00168125629425, "epoch": 0.05, "grad_norm": 1184.0, "kl_loss_10": 362.3049346923828, "kl_loss_2": 3035.001806640625, "kl_loss_3": 2588.0874145507814, "kl_loss_7": 1166.9710693359375, "learning_rate": 0.0009959774064153978, "loss": 1805.0438, "step": 500 }, { "ce_loss_10": 3.623943197727203, "ce_loss_13": 3.4620243430137636, "ce_loss_2": 4.959137892723083, "ce_loss_3": 4.687646722793579, "ce_loss_7": 3.976128029823303, "epoch": 0.051, "grad_norm": 856.0, "kl_loss_10": 343.2813385009766, "kl_loss_2": 2963.6288208007813, "kl_loss_3": 2485.8935424804686, "kl_loss_7": 1086.964697265625, "learning_rate": 0.0009957740499523787, "loss": 1751.4643, "step": 510 }, { "ce_loss_10": 3.6490553617477417, "ce_loss_13": 3.476555550098419, "ce_loss_2": 4.994476556777954, "ce_loss_3": 4.725382924079895, "ce_loss_7": 4.001345467567444, "epoch": 0.052, "grad_norm": 808.0, "kl_loss_10": 347.32325134277346, "kl_loss_2": 2968.236572265625, "kl_loss_3": 2495.7832275390624, "kl_loss_7": 1099.3744354248047, "learning_rate": 0.0009955657010501807, "loss": 1740.4176, "step": 520 }, { "ce_loss_10": 3.6094146251678465, "ce_loss_13": 3.4360305190086367, "ce_loss_2": 4.987359571456909, "ce_loss_3": 4.711909174919128, "ce_loss_7": 3.96878160238266, "epoch": 0.053, "grad_norm": 732.0, "kl_loss_10": 356.96947326660154, "kl_loss_2": 3066.1166015625, "kl_loss_3": 2574.2064819335938, "kl_loss_7": 1113.183071899414, "learning_rate": 0.000995352361806875, "loss": 1757.3914, "step": 530 }, { "ce_loss_10": 3.6483884930610655, "ce_loss_13": 3.4761168599128722, "ce_loss_2": 5.01164448261261, "ce_loss_3": 4.73403651714325, "ce_loss_7": 4.005823755264283, "epoch": 0.054, "grad_norm": 868.0, "kl_loss_10": 358.3400619506836, "kl_loss_2": 3025.010693359375, "kl_loss_3": 2540.0701538085937, "kl_loss_7": 1117.8957305908202, "learning_rate": 0.0009951340343707852, "loss": 1783.3418, "step": 540 }, { "ce_loss_10": 3.693763518333435, "ce_loss_13": 3.5300124645233155, "ce_loss_2": 5.04529185295105, "ce_loss_3": 4.776751947402954, "ce_loss_7": 4.050560343265533, "epoch": 0.055, "grad_norm": 580.0, "kl_loss_10": 343.201188659668, "kl_loss_2": 2966.6511840820312, "kl_loss_3": 2491.584606933594, "kl_loss_7": 1070.658499145508, "learning_rate": 0.0009949107209404665, "loss": 1740.307, "step": 550 }, { "ce_loss_10": 3.618695652484894, "ce_loss_13": 3.4460346341133117, "ce_loss_2": 4.953143644332886, "ce_loss_3": 4.67682032585144, "ce_loss_7": 3.9601072311401366, "epoch": 0.056, "grad_norm": 972.0, "kl_loss_10": 355.8962005615234, "kl_loss_2": 2990.009143066406, "kl_loss_3": 2495.9183959960938, "kl_loss_7": 1092.0468170166016, "learning_rate": 0.0009946824237646824, "loss": 1737.0576, "step": 560 }, { "ce_loss_10": 3.5657299041748045, "ce_loss_13": 3.3921077370643617, "ce_loss_2": 4.9473305463790895, "ce_loss_3": 4.655314612388611, "ce_loss_7": 3.9485832929611204, "epoch": 0.057, "grad_norm": 1232.0, "kl_loss_10": 368.3774078369141, "kl_loss_2": 3077.5546997070314, "kl_loss_3": 2563.977990722656, "kl_loss_7": 1171.9384887695312, "learning_rate": 0.0009944491451423828, "loss": 1812.8215, "step": 570 }, { "ce_loss_10": 3.5597246408462526, "ce_loss_13": 3.38997106552124, "ce_loss_2": 4.957224941253662, "ce_loss_3": 4.669384074211121, "ce_loss_7": 3.9783090591430663, "epoch": 0.058, "grad_norm": 1048.0, "kl_loss_10": 352.9766845703125, "kl_loss_2": 3080.3538452148437, "kl_loss_3": 2573.69345703125, "kl_loss_7": 1221.7482543945312, "learning_rate": 0.0009942108874226813, "loss": 1775.8918, "step": 580 }, { "ce_loss_10": 3.667470908164978, "ce_loss_13": 3.5143657088279725, "ce_loss_2": 4.977405524253845, "ce_loss_3": 4.70070378780365, "ce_loss_7": 4.062089693546295, "epoch": 0.059, "grad_norm": 1160.0, "kl_loss_10": 326.54786376953126, "kl_loss_2": 2889.81787109375, "kl_loss_3": 2394.497277832031, "kl_loss_7": 1154.6502380371094, "learning_rate": 0.00099396765300483, "loss": 1684.8838, "step": 590 }, { "ce_loss_10": 3.65077520608902, "ce_loss_13": 3.4909046292304993, "ce_loss_2": 4.953103184700012, "ce_loss_3": 4.675009846687317, "ce_loss_7": 4.037787747383118, "epoch": 0.06, "grad_norm": 728.0, "kl_loss_10": 333.6824432373047, "kl_loss_2": 2888.043603515625, "kl_loss_3": 2401.467254638672, "kl_loss_7": 1146.9622497558594, "learning_rate": 0.0009937194443381972, "loss": 1692.9094, "step": 600 }, { "ce_loss_10": 3.6720112562179565, "ce_loss_13": 3.5144667506217955, "ce_loss_2": 4.945521140098572, "ce_loss_3": 4.670798707008362, "ce_loss_7": 4.003339779376984, "epoch": 0.061, "grad_norm": 728.0, "kl_loss_10": 340.24414978027346, "kl_loss_2": 2848.255480957031, "kl_loss_3": 2358.6506469726564, "kl_loss_7": 1042.5767547607422, "learning_rate": 0.0009934662639222412, "loss": 1695.9006, "step": 610 }, { "ce_loss_10": 3.6284273624420167, "ce_loss_13": 3.466042399406433, "ce_loss_2": 4.974099659919739, "ce_loss_3": 4.698220872879029, "ce_loss_7": 3.9703264474868774, "epoch": 0.062, "grad_norm": 708.0, "kl_loss_10": 346.28453369140624, "kl_loss_2": 2978.781689453125, "kl_loss_3": 2496.677685546875, "kl_loss_7": 1062.910955810547, "learning_rate": 0.000993208114306486, "loss": 1704.2672, "step": 620 }, { "ce_loss_10": 3.5462576508522035, "ce_loss_13": 3.380946898460388, "ce_loss_2": 4.922283387184143, "ce_loss_3": 4.633153581619263, "ce_loss_7": 3.890825295448303, "epoch": 0.063, "grad_norm": 924.0, "kl_loss_10": 358.3551940917969, "kl_loss_2": 3032.9190673828125, "kl_loss_3": 2531.955603027344, "kl_loss_7": 1071.5194458007813, "learning_rate": 0.0009929449980904952, "loss": 1693.2549, "step": 630 }, { "ce_loss_10": 3.6085665225982666, "ce_loss_13": 3.444735288619995, "ce_loss_2": 4.934487676620483, "ce_loss_3": 4.655823493003846, "ce_loss_7": 3.935356545448303, "epoch": 0.064, "grad_norm": 676.0, "kl_loss_10": 344.3735855102539, "kl_loss_2": 2962.493859863281, "kl_loss_3": 2465.4102416992187, "kl_loss_7": 1045.9244415283204, "learning_rate": 0.0009926769179238466, "loss": 1690.2553, "step": 640 }, { "ce_loss_10": 3.657176661491394, "ce_loss_13": 3.4894155979156496, "ce_loss_2": 4.984645247459412, "ce_loss_3": 4.697536993026733, "ce_loss_7": 3.984337937831879, "epoch": 0.065, "grad_norm": 812.0, "kl_loss_10": 351.49694671630857, "kl_loss_2": 2961.2925659179687, "kl_loss_3": 2455.3551025390625, "kl_loss_7": 1056.1930267333985, "learning_rate": 0.000992403876506104, "loss": 1699.9176, "step": 650 }, { "ce_loss_10": 3.5853109121322633, "ce_loss_13": 3.4265636444091796, "ce_loss_2": 4.949072217941284, "ce_loss_3": 4.657009506225586, "ce_loss_7": 3.9192400932312013, "epoch": 0.066, "grad_norm": 772.0, "kl_loss_10": 332.7637084960937, "kl_loss_2": 3005.3072998046873, "kl_loss_3": 2488.590020751953, "kl_loss_7": 1034.6812866210937, "learning_rate": 0.0009921258765867918, "loss": 1712.7359, "step": 660 }, { "ce_loss_10": 3.543238043785095, "ce_loss_13": 3.392865073680878, "ce_loss_2": 4.929511904716492, "ce_loss_3": 4.673877739906311, "ce_loss_7": 3.8979400753974915, "epoch": 0.067, "grad_norm": 1216.0, "kl_loss_10": 326.31287689208983, "kl_loss_2": 3073.3475952148438, "kl_loss_3": 2606.592980957031, "kl_loss_7": 1089.1828704833983, "learning_rate": 0.0009918429209653662, "loss": 1742.882, "step": 670 }, { "ce_loss_10": 3.60556218624115, "ce_loss_13": 3.451234769821167, "ce_loss_2": 4.9643912553787235, "ce_loss_3": 4.685172462463379, "ce_loss_7": 3.9489428043365478, "epoch": 0.068, "grad_norm": 700.0, "kl_loss_10": 326.2720092773437, "kl_loss_2": 2991.5349365234374, "kl_loss_3": 2499.9562133789063, "kl_loss_7": 1058.8525512695312, "learning_rate": 0.0009915550124911866, "loss": 1675.9207, "step": 680 }, { "ce_loss_10": 3.6152788639068603, "ce_loss_13": 3.463881015777588, "ce_loss_2": 4.9310142517089846, "ce_loss_3": 4.651708984375, "ce_loss_7": 3.939838695526123, "epoch": 0.069, "grad_norm": 716.0, "kl_loss_10": 321.7038208007813, "kl_loss_2": 2904.1300415039063, "kl_loss_3": 2416.9381103515625, "kl_loss_7": 1006.4677703857421, "learning_rate": 0.0009912621540634887, "loss": 1665.2684, "step": 690 }, { "ce_loss_10": 3.6430228471755983, "ce_loss_13": 3.4952101826667787, "ce_loss_2": 4.929437565803528, "ce_loss_3": 4.6475961923599245, "ce_loss_7": 3.9429776191711428, "epoch": 0.07, "grad_norm": 676.0, "kl_loss_10": 309.61268615722656, "kl_loss_2": 2848.814501953125, "kl_loss_3": 2359.46318359375, "kl_loss_7": 970.4827117919922, "learning_rate": 0.0009909643486313534, "loss": 1639.2395, "step": 700 }, { "ce_loss_10": 3.526335525512695, "ce_loss_13": 3.3703501343727114, "ce_loss_2": 4.889838075637817, "ce_loss_3": 4.6030642032623295, "ce_loss_7": 3.8521526575088503, "epoch": 0.071, "grad_norm": 744.0, "kl_loss_10": 340.5390853881836, "kl_loss_2": 3011.11240234375, "kl_loss_3": 2515.4811889648436, "kl_loss_7": 1017.710009765625, "learning_rate": 0.000990661599193678, "loss": 1737.2715, "step": 710 }, { "ce_loss_10": 3.6673552870750425, "ce_loss_13": 3.5033403754234316, "ce_loss_2": 4.93988311290741, "ce_loss_3": 4.67398898601532, "ce_loss_7": 3.9688475489616395, "epoch": 0.072, "grad_norm": 796.0, "kl_loss_10": 340.83275604248047, "kl_loss_2": 2865.5041381835936, "kl_loss_3": 2386.658837890625, "kl_loss_7": 996.4190338134765, "learning_rate": 0.0009903539087991462, "loss": 1651.048, "step": 720 }, { "ce_loss_10": 3.6324875712394715, "ce_loss_13": 3.4752389669418333, "ce_loss_2": 4.927369832992554, "ce_loss_3": 4.656826686859131, "ce_loss_7": 3.941362977027893, "epoch": 0.073, "grad_norm": 672.0, "kl_loss_10": 338.573225402832, "kl_loss_2": 2878.319189453125, "kl_loss_3": 2403.4671142578127, "kl_loss_7": 991.7197357177735, "learning_rate": 0.0009900412805461966, "loss": 1664.0748, "step": 730 }, { "ce_loss_10": 3.697860896587372, "ce_loss_13": 3.5502901554107664, "ce_loss_2": 4.959825038909912, "ce_loss_3": 4.680054187774658, "ce_loss_7": 4.008367860317231, "epoch": 0.074, "grad_norm": 796.0, "kl_loss_10": 322.8813171386719, "kl_loss_2": 2810.9089233398436, "kl_loss_3": 2318.1740844726564, "kl_loss_7": 980.3480072021484, "learning_rate": 0.0009897237175829927, "loss": 1630.2344, "step": 740 }, { "ce_loss_10": 3.5930413126945497, "ce_loss_13": 3.43618665933609, "ce_loss_2": 4.910944557189941, "ce_loss_3": 4.628472471237183, "ce_loss_7": 3.9170363903045655, "epoch": 0.075, "grad_norm": 720.0, "kl_loss_10": 332.21988067626955, "kl_loss_2": 2928.557727050781, "kl_loss_3": 2429.9159301757813, "kl_loss_7": 1037.6262634277343, "learning_rate": 0.0009894012231073895, "loss": 1665.4367, "step": 750 }, { "ce_loss_10": 3.6464996695518495, "ce_loss_13": 3.4838218331336974, "ce_loss_2": 4.924402260780335, "ce_loss_3": 4.645452523231507, "ce_loss_7": 3.9448330640792846, "epoch": 0.076, "grad_norm": 812.0, "kl_loss_10": 338.6822082519531, "kl_loss_2": 2855.4515014648437, "kl_loss_3": 2358.476416015625, "kl_loss_7": 978.1411010742188, "learning_rate": 0.0009890738003669028, "loss": 1654.1621, "step": 760 }, { "ce_loss_10": 3.617565965652466, "ce_loss_13": 3.455268681049347, "ce_loss_2": 4.933386254310608, "ce_loss_3": 4.651405668258667, "ce_loss_7": 3.9341206789016723, "epoch": 0.077, "grad_norm": 756.0, "kl_loss_10": 337.93136138916014, "kl_loss_2": 2949.602490234375, "kl_loss_3": 2451.218469238281, "kl_loss_7": 1020.4571960449218, "learning_rate": 0.0009887414526586764, "loss": 1640.4555, "step": 770 }, { "ce_loss_10": 3.6583608746528626, "ce_loss_13": 3.512969744205475, "ce_loss_2": 4.9441753149032595, "ce_loss_3": 4.656214547157288, "ce_loss_7": 3.964318811893463, "epoch": 0.078, "grad_norm": 720.0, "kl_loss_10": 313.43713836669923, "kl_loss_2": 2854.152880859375, "kl_loss_3": 2348.5727111816404, "kl_loss_7": 969.1142120361328, "learning_rate": 0.0009884041833294476, "loss": 1599.7842, "step": 780 }, { "ce_loss_10": 3.6560466647148133, "ce_loss_13": 3.508514332771301, "ce_loss_2": 4.940361285209656, "ce_loss_3": 4.645708775520324, "ce_loss_7": 3.958513784408569, "epoch": 0.079, "grad_norm": 832.0, "kl_loss_10": 319.23270416259766, "kl_loss_2": 2852.032861328125, "kl_loss_3": 2330.51533203125, "kl_loss_7": 969.9107818603516, "learning_rate": 0.000988061995775515, "loss": 1668.3449, "step": 790 }, { "ce_loss_10": 3.5980430364608766, "ce_loss_13": 3.440366840362549, "ce_loss_2": 4.8732929706573485, "ce_loss_3": 4.587121820449829, "ce_loss_7": 3.9043478846549986, "epoch": 0.08, "grad_norm": 752.0, "kl_loss_10": 323.7010192871094, "kl_loss_2": 2868.414514160156, "kl_loss_3": 2356.581396484375, "kl_loss_7": 987.0360656738281, "learning_rate": 0.0009877148934427035, "loss": 1633.2111, "step": 800 }, { "ce_loss_10": 3.633367455005646, "ce_loss_13": 3.4834325551986693, "ce_loss_2": 4.935962653160095, "ce_loss_3": 4.627329421043396, "ce_loss_7": 3.925890827178955, "epoch": 0.081, "grad_norm": 820.0, "kl_loss_10": 330.4556167602539, "kl_loss_2": 2885.1009033203127, "kl_loss_3": 2351.8309020996094, "kl_loss_7": 957.707730102539, "learning_rate": 0.0009873628798263297, "loss": 1611.097, "step": 810 }, { "ce_loss_10": 3.605324161052704, "ce_loss_13": 3.438004171848297, "ce_loss_2": 4.856884765625, "ce_loss_3": 4.56407413482666, "ce_loss_7": 3.8718148946762083, "epoch": 0.082, "grad_norm": 840.0, "kl_loss_10": 339.57317504882815, "kl_loss_2": 2826.0930053710936, "kl_loss_3": 2305.033795166016, "kl_loss_7": 931.82373046875, "learning_rate": 0.0009870059584711668, "loss": 1639.3607, "step": 820 }, { "ce_loss_10": 3.60188170671463, "ce_loss_13": 3.455841100215912, "ce_loss_2": 4.85420286655426, "ce_loss_3": 4.581924772262573, "ce_loss_7": 3.8951406598091127, "epoch": 0.083, "grad_norm": 720.0, "kl_loss_10": 317.57149810791014, "kl_loss_2": 2801.2140380859373, "kl_loss_3": 2316.871270751953, "kl_loss_7": 949.605337524414, "learning_rate": 0.000986644132971409, "loss": 1599.6842, "step": 830 }, { "ce_loss_10": 3.5939020037651064, "ce_loss_13": 3.4429898500442504, "ce_loss_2": 4.88135507106781, "ce_loss_3": 4.604088640213012, "ce_loss_7": 3.9158664107322694, "epoch": 0.084, "grad_norm": 932.0, "kl_loss_10": 322.8277191162109, "kl_loss_2": 2865.847692871094, "kl_loss_3": 2367.4215576171873, "kl_loss_7": 996.9576171875, "learning_rate": 0.0009862774069706345, "loss": 1629.1093, "step": 840 }, { "ce_loss_10": 3.710948944091797, "ce_loss_13": 3.5685924649238587, "ce_loss_2": 4.930621600151062, "ce_loss_3": 4.65263340473175, "ce_loss_7": 3.9990792274475098, "epoch": 0.085, "grad_norm": 684.0, "kl_loss_10": 304.0562255859375, "kl_loss_2": 2742.24169921875, "kl_loss_3": 2253.91962890625, "kl_loss_7": 950.4928100585937, "learning_rate": 0.000985905784161771, "loss": 1590.0119, "step": 850 }, { "ce_loss_10": 3.63605819940567, "ce_loss_13": 3.4998138546943665, "ce_loss_2": 4.900371265411377, "ce_loss_3": 4.62078812122345, "ce_loss_7": 3.934238409996033, "epoch": 0.086, "grad_norm": 748.0, "kl_loss_10": 294.4667907714844, "kl_loss_2": 2800.617395019531, "kl_loss_3": 2314.4944458007812, "kl_loss_7": 955.0795837402344, "learning_rate": 0.000985529268287055, "loss": 1585.186, "step": 860 }, { "ce_loss_10": 3.5651148438453673, "ce_loss_13": 3.4233306527137755, "ce_loss_2": 4.871410083770752, "ce_loss_3": 4.5925886869430546, "ce_loss_7": 3.877922761440277, "epoch": 0.087, "grad_norm": 796.0, "kl_loss_10": 301.2444900512695, "kl_loss_2": 2878.2498046875, "kl_loss_3": 2387.6543212890624, "kl_loss_7": 975.5103942871094, "learning_rate": 0.0009851478631379982, "loss": 1626.462, "step": 870 }, { "ce_loss_10": 3.6220229983329775, "ce_loss_13": 3.4835654973983763, "ce_loss_2": 4.903548383712769, "ce_loss_3": 4.61605658531189, "ce_loss_7": 3.9362378478050233, "epoch": 0.088, "grad_norm": 844.0, "kl_loss_10": 293.3538963317871, "kl_loss_2": 2833.7354125976562, "kl_loss_3": 2335.5184326171875, "kl_loss_7": 967.1238098144531, "learning_rate": 0.0009847615725553456, "loss": 1597.0803, "step": 880 }, { "ce_loss_10": 3.671082556247711, "ce_loss_13": 3.542756676673889, "ce_loss_2": 4.8840786695480345, "ce_loss_3": 4.608758640289307, "ce_loss_7": 3.9651415824890135, "epoch": 0.089, "grad_norm": 676.0, "kl_loss_10": 274.7398094177246, "kl_loss_2": 2672.2400390625, "kl_loss_3": 2185.940838623047, "kl_loss_7": 914.7755340576172, "learning_rate": 0.0009843704004290394, "loss": 1572.2007, "step": 890 }, { "ce_loss_10": 3.5845912218093874, "ce_loss_13": 3.4463690519332886, "ce_loss_2": 4.845745182037353, "ce_loss_3": 4.566518807411194, "ce_loss_7": 3.8977394104003906, "epoch": 0.09, "grad_norm": 800.0, "kl_loss_10": 293.04640731811526, "kl_loss_2": 2812.2204833984374, "kl_loss_3": 2313.156042480469, "kl_loss_7": 966.190869140625, "learning_rate": 0.0009839743506981783, "loss": 1597.2805, "step": 900 }, { "ce_loss_10": 3.5071211099624633, "ce_loss_13": 3.369294321537018, "ce_loss_2": 4.836311769485474, "ce_loss_3": 4.550878620147705, "ce_loss_7": 3.8309507608413695, "epoch": 0.091, "grad_norm": 716.0, "kl_loss_10": 298.81206665039065, "kl_loss_2": 2958.2573974609377, "kl_loss_3": 2443.9187561035155, "kl_loss_7": 1005.0242462158203, "learning_rate": 0.0009835734273509786, "loss": 1627.2797, "step": 910 }, { "ce_loss_10": 3.6050177574157716, "ce_loss_13": 3.4665517807006836, "ce_loss_2": 4.881958699226379, "ce_loss_3": 4.6013915777206424, "ce_loss_7": 3.9145362257957457, "epoch": 0.092, "grad_norm": 720.0, "kl_loss_10": 288.0885604858398, "kl_loss_2": 2799.756945800781, "kl_loss_3": 2307.6742553710938, "kl_loss_7": 959.5810729980469, "learning_rate": 0.0009831676344247342, "loss": 1585.5819, "step": 920 }, { "ce_loss_10": 3.615782046318054, "ce_loss_13": 3.484424388408661, "ce_loss_2": 4.840068244934082, "ce_loss_3": 4.566077804565429, "ce_loss_7": 3.905368459224701, "epoch": 0.093, "grad_norm": 592.0, "kl_loss_10": 284.13806304931643, "kl_loss_2": 2716.098291015625, "kl_loss_3": 2237.568524169922, "kl_loss_7": 925.932373046875, "learning_rate": 0.0009827569760057755, "loss": 1574.975, "step": 930 }, { "ce_loss_10": 3.5478480219841004, "ce_loss_13": 3.4008304595947267, "ce_loss_2": 4.878832292556763, "ce_loss_3": 4.597835183143616, "ce_loss_7": 3.860486149787903, "epoch": 0.094, "grad_norm": 812.0, "kl_loss_10": 311.2947525024414, "kl_loss_2": 2955.33916015625, "kl_loss_3": 2458.781884765625, "kl_loss_7": 985.0075500488281, "learning_rate": 0.000982341456229428, "loss": 1619.0104, "step": 940 }, { "ce_loss_10": 3.6401113510131835, "ce_loss_13": 3.4997127175331117, "ce_loss_2": 4.909311819076538, "ce_loss_3": 4.633120918273926, "ce_loss_7": 3.936661887168884, "epoch": 0.095, "grad_norm": 768.0, "kl_loss_10": 304.94605484008787, "kl_loss_2": 2847.3047485351562, "kl_loss_3": 2358.746990966797, "kl_loss_7": 958.3424041748046, "learning_rate": 0.000981921079279971, "loss": 1575.8767, "step": 950 }, { "ce_loss_10": 3.6493973970413207, "ce_loss_13": 3.5170445680618285, "ce_loss_2": 4.842743754386902, "ce_loss_3": 4.559553527832032, "ce_loss_7": 3.913013446331024, "epoch": 0.096, "grad_norm": 632.0, "kl_loss_10": 287.80171127319335, "kl_loss_2": 2681.031005859375, "kl_loss_3": 2186.1464904785157, "kl_loss_7": 891.4322113037109, "learning_rate": 0.0009814958493905962, "loss": 1541.8673, "step": 960 }, { "ce_loss_10": 3.6059035897254943, "ce_loss_13": 3.464053213596344, "ce_loss_2": 4.885409092903137, "ce_loss_3": 4.605575942993164, "ce_loss_7": 3.901495134830475, "epoch": 0.097, "grad_norm": 644.0, "kl_loss_10": 302.9938400268555, "kl_loss_2": 2842.060888671875, "kl_loss_3": 2348.8412109375, "kl_loss_7": 943.344677734375, "learning_rate": 0.0009810657708433637, "loss": 1620.3537, "step": 970 }, { "ce_loss_10": 3.6700100898742676, "ce_loss_13": 3.538521420955658, "ce_loss_2": 4.868229222297669, "ce_loss_3": 4.590689539909363, "ce_loss_7": 3.9474687933921815, "epoch": 0.098, "grad_norm": 808.0, "kl_loss_10": 283.2241409301758, "kl_loss_2": 2674.522265625, "kl_loss_3": 2192.326556396484, "kl_loss_7": 894.1458190917969, "learning_rate": 0.0009806308479691594, "loss": 1528.2636, "step": 980 }, { "ce_loss_10": 3.691223752498627, "ce_loss_13": 3.55548814535141, "ce_loss_2": 4.925488543510437, "ce_loss_3": 4.648779034614563, "ce_loss_7": 3.9924039959907534, "epoch": 0.099, "grad_norm": 740.0, "kl_loss_10": 294.3150146484375, "kl_loss_2": 2748.0041381835936, "kl_loss_3": 2268.979638671875, "kl_loss_7": 946.8526397705078, "learning_rate": 0.0009801910851476522, "loss": 1554.0744, "step": 990 }, { "ce_loss_10": 3.6008501768112184, "ce_loss_13": 3.465990114212036, "ce_loss_2": 4.890150642395019, "ce_loss_3": 4.609904193878174, "ce_loss_7": 3.9068346500396727, "epoch": 0.1, "grad_norm": 736.0, "kl_loss_10": 294.7660331726074, "kl_loss_2": 2875.2068603515627, "kl_loss_3": 2379.5891052246093, "kl_loss_7": 970.1351318359375, "learning_rate": 0.0009797464868072487, "loss": 1582.4648, "step": 1000 }, { "ce_loss_10": 3.5892885446548464, "ce_loss_13": 3.454503262042999, "ce_loss_2": 4.837452292442322, "ce_loss_3": 4.55982882976532, "ce_loss_7": 3.887318527698517, "epoch": 0.101, "grad_norm": 724.0, "kl_loss_10": 288.82502670288085, "kl_loss_2": 2762.65830078125, "kl_loss_3": 2282.756170654297, "kl_loss_7": 944.8302276611328, "learning_rate": 0.0009792970574250492, "loss": 1564.9662, "step": 1010 }, { "ce_loss_10": 3.6221608400344847, "ce_loss_13": 3.482994794845581, "ce_loss_2": 4.848793458938599, "ce_loss_3": 4.575083756446839, "ce_loss_7": 3.914657413959503, "epoch": 0.102, "grad_norm": 612.0, "kl_loss_10": 290.8812942504883, "kl_loss_2": 2743.8400146484373, "kl_loss_3": 2261.9089599609374, "kl_loss_7": 937.5250091552734, "learning_rate": 0.0009788428015268028, "loss": 1536.8119, "step": 1020 }, { "ce_loss_10": 3.6110181331634523, "ce_loss_13": 3.47798638343811, "ce_loss_2": 4.840990829467773, "ce_loss_3": 4.55189163684845, "ce_loss_7": 3.9010056853294373, "epoch": 0.103, "grad_norm": 616.0, "kl_loss_10": 281.37939529418946, "kl_loss_2": 2739.4623291015623, "kl_loss_3": 2238.093048095703, "kl_loss_7": 923.4858306884765, "learning_rate": 0.0009783837236868609, "loss": 1534.7721, "step": 1030 }, { "ce_loss_10": 3.5802615523338317, "ce_loss_13": 3.4459127306938173, "ce_loss_2": 4.818247056007385, "ce_loss_3": 4.546270060539245, "ce_loss_7": 3.8740112662315367, "epoch": 0.104, "grad_norm": 696.0, "kl_loss_10": 281.4418014526367, "kl_loss_2": 2719.910290527344, "kl_loss_3": 2248.530157470703, "kl_loss_7": 921.926953125, "learning_rate": 0.0009779198285281327, "loss": 1537.119, "step": 1040 }, { "ce_loss_10": 3.577412283420563, "ce_loss_13": 3.4400023460388183, "ce_loss_2": 4.825755000114441, "ce_loss_3": 4.554906344413757, "ce_loss_7": 3.8695693135261537, "epoch": 0.105, "grad_norm": 784.0, "kl_loss_10": 293.84764709472654, "kl_loss_2": 2770.2111328125, "kl_loss_3": 2280.982073974609, "kl_loss_7": 916.6518432617188, "learning_rate": 0.0009774511207220368, "loss": 1562.095, "step": 1050 }, { "ce_loss_10": 3.621231746673584, "ce_loss_13": 3.4823400259017943, "ce_loss_2": 4.867471241950989, "ce_loss_3": 4.584862947463989, "ce_loss_7": 3.895237350463867, "epoch": 0.106, "grad_norm": 588.0, "kl_loss_10": 306.07321014404295, "kl_loss_2": 2785.361218261719, "kl_loss_3": 2286.776574707031, "kl_loss_7": 918.4756744384765, "learning_rate": 0.0009769776049884564, "loss": 1554.5619, "step": 1060 }, { "ce_loss_10": 3.5330086588859557, "ce_loss_13": 3.387469935417175, "ce_loss_2": 4.804182314872742, "ce_loss_3": 4.539949297904968, "ce_loss_7": 3.8264609456062315, "epoch": 0.107, "grad_norm": 1184.0, "kl_loss_10": 307.66697082519534, "kl_loss_2": 2836.2517578125, "kl_loss_3": 2373.5376220703124, "kl_loss_7": 943.6192169189453, "learning_rate": 0.0009764992860956889, "loss": 1622.7785, "step": 1070 }, { "ce_loss_10": 3.677293050289154, "ce_loss_13": 3.5469510316848756, "ce_loss_2": 4.837077927589417, "ce_loss_3": 4.588571333885193, "ce_loss_7": 3.9465363740921022, "epoch": 0.108, "grad_norm": 816.0, "kl_loss_10": 286.8066802978516, "kl_loss_2": 2605.4248657226562, "kl_loss_3": 2175.9279296875, "kl_loss_7": 899.353060913086, "learning_rate": 0.0009760161688604008, "loss": 1520.9383, "step": 1080 }, { "ce_loss_10": 3.6768419981002807, "ce_loss_13": 3.54748477935791, "ce_loss_2": 4.881722617149353, "ce_loss_3": 4.620517659187317, "ce_loss_7": 3.9953080892562864, "epoch": 0.109, "grad_norm": 840.0, "kl_loss_10": 283.82303619384766, "kl_loss_2": 2660.0453125, "kl_loss_3": 2210.3756591796873, "kl_loss_7": 954.3282287597656, "learning_rate": 0.0009755282581475768, "loss": 1552.3523, "step": 1090 }, { "ce_loss_10": 3.742873156070709, "ce_loss_13": 3.60170716047287, "ce_loss_2": 4.9219562292099, "ce_loss_3": 4.631097722053528, "ce_loss_7": 4.016275346279144, "epoch": 0.11, "grad_norm": 792.0, "kl_loss_10": 295.95645599365236, "kl_loss_2": 2660.5046997070312, "kl_loss_3": 2150.144982910156, "kl_loss_7": 938.1217224121094, "learning_rate": 0.0009750355588704727, "loss": 1496.9391, "step": 1100 }, { "ce_loss_10": 3.5732216477394103, "ce_loss_13": 3.427997899055481, "ce_loss_2": 4.788290286064148, "ce_loss_3": 4.501250839233398, "ce_loss_7": 3.849083948135376, "epoch": 0.111, "grad_norm": 644.0, "kl_loss_10": 301.9219177246094, "kl_loss_2": 2692.5292846679686, "kl_loss_3": 2192.219659423828, "kl_loss_7": 902.1104858398437, "learning_rate": 0.0009745380759905647, "loss": 1547.9881, "step": 1110 }, { "ce_loss_10": 3.525436317920685, "ce_loss_13": 3.388839864730835, "ce_loss_2": 4.766349339485169, "ce_loss_3": 4.478921818733215, "ce_loss_7": 3.8117297768592833, "epoch": 0.112, "grad_norm": 636.0, "kl_loss_10": 288.7658378601074, "kl_loss_2": 2767.7005126953127, "kl_loss_3": 2266.3693115234373, "kl_loss_7": 916.3693817138671, "learning_rate": 0.0009740358145174998, "loss": 1582.2694, "step": 1120 }, { "ce_loss_10": 3.674707901477814, "ce_loss_13": 3.541641688346863, "ce_loss_2": 4.839509201049805, "ce_loss_3": 4.554335117340088, "ce_loss_7": 3.9309728384017943, "epoch": 0.113, "grad_norm": 740.0, "kl_loss_10": 293.9353363037109, "kl_loss_2": 2627.9318603515626, "kl_loss_3": 2118.4319458007812, "kl_loss_7": 883.5943176269532, "learning_rate": 0.0009735287795090455, "loss": 1505.1257, "step": 1130 }, { "ce_loss_10": 3.5646776437759398, "ce_loss_13": 3.4284933686256407, "ce_loss_2": 4.8010115146636965, "ce_loss_3": 4.510753107070923, "ce_loss_7": 3.839044988155365, "epoch": 0.114, "grad_norm": 692.0, "kl_loss_10": 289.08748931884764, "kl_loss_2": 2724.9734130859374, "kl_loss_3": 2216.0496459960937, "kl_loss_7": 891.9239013671875, "learning_rate": 0.0009730169760710386, "loss": 1526.1704, "step": 1140 }, { "ce_loss_10": 3.647395300865173, "ce_loss_13": 3.51713547706604, "ce_loss_2": 4.854923152923584, "ce_loss_3": 4.577234363555908, "ce_loss_7": 3.928617572784424, "epoch": 0.115, "grad_norm": 800.0, "kl_loss_10": 280.8671928405762, "kl_loss_2": 2669.748742675781, "kl_loss_3": 2182.849468994141, "kl_loss_7": 895.4142913818359, "learning_rate": 0.0009725004093573342, "loss": 1526.191, "step": 1150 }, { "ce_loss_10": 3.5862129092216493, "ce_loss_13": 3.4506229400634765, "ce_loss_2": 4.798703122138977, "ce_loss_3": 4.520225930213928, "ce_loss_7": 3.877876043319702, "epoch": 0.116, "grad_norm": 840.0, "kl_loss_10": 283.1919075012207, "kl_loss_2": 2672.7715087890624, "kl_loss_3": 2193.748876953125, "kl_loss_7": 903.5404602050781, "learning_rate": 0.0009719790845697534, "loss": 1504.2701, "step": 1160 }, { "ce_loss_10": 3.5309566259384155, "ce_loss_13": 3.4061906576156615, "ce_loss_2": 4.704360723495483, "ce_loss_3": 4.450148797035217, "ce_loss_7": 3.803053593635559, "epoch": 0.117, "grad_norm": 696.0, "kl_loss_10": 271.77204208374025, "kl_loss_2": 2620.2907592773436, "kl_loss_3": 2176.4858520507814, "kl_loss_7": 863.540869140625, "learning_rate": 0.0009714530069580309, "loss": 1485.2044, "step": 1170 }, { "ce_loss_10": 3.640796720981598, "ce_loss_13": 3.507249903678894, "ce_loss_2": 4.853176116943359, "ce_loss_3": 4.5715264797210695, "ce_loss_7": 3.914932680130005, "epoch": 0.118, "grad_norm": 716.0, "kl_loss_10": 285.63293685913084, "kl_loss_2": 2675.1877319335936, "kl_loss_3": 2189.7057678222654, "kl_loss_7": 884.98447265625, "learning_rate": 0.0009709221818197624, "loss": 1502.0164, "step": 1180 }, { "ce_loss_10": 3.6675962805747986, "ce_loss_13": 3.534939968585968, "ce_loss_2": 4.88215401172638, "ce_loss_3": 4.607950353622437, "ce_loss_7": 3.9379210352897642, "epoch": 0.119, "grad_norm": 596.0, "kl_loss_10": 288.61556854248045, "kl_loss_2": 2711.667822265625, "kl_loss_3": 2227.545977783203, "kl_loss_7": 887.7295013427735, "learning_rate": 0.0009703866145003512, "loss": 1525.4232, "step": 1190 }, { "ce_loss_10": 3.6349379420280457, "ce_loss_13": 3.5029913663864134, "ce_loss_2": 4.829423713684082, "ce_loss_3": 4.558488368988037, "ce_loss_7": 3.908590841293335, "epoch": 0.12, "grad_norm": 660.0, "kl_loss_10": 279.50138397216796, "kl_loss_2": 2676.350244140625, "kl_loss_3": 2191.2725830078125, "kl_loss_7": 882.7566497802734, "learning_rate": 0.0009698463103929542, "loss": 1529.4317, "step": 1200 }, { "ce_loss_10": 3.605515944957733, "ce_loss_13": 3.472998011112213, "ce_loss_2": 4.827000212669373, "ce_loss_3": 4.540698933601379, "ce_loss_7": 3.879436028003693, "epoch": 0.121, "grad_norm": 652.0, "kl_loss_10": 281.2242576599121, "kl_loss_2": 2695.142529296875, "kl_loss_3": 2191.8710388183595, "kl_loss_7": 882.5638031005859, "learning_rate": 0.0009693012749384279, "loss": 1527.1828, "step": 1210 }, { "ce_loss_10": 3.617890453338623, "ce_loss_13": 3.4903839349746706, "ce_loss_2": 4.823957228660584, "ce_loss_3": 4.546852803230285, "ce_loss_7": 3.8918931126594543, "epoch": 0.122, "grad_norm": 596.0, "kl_loss_10": 274.6055084228516, "kl_loss_2": 2677.51435546875, "kl_loss_3": 2182.2475463867186, "kl_loss_7": 884.2765747070313, "learning_rate": 0.0009687515136252732, "loss": 1502.8832, "step": 1220 }, { "ce_loss_10": 3.571158289909363, "ce_loss_13": 3.4428164839744566, "ce_loss_2": 4.832195687294006, "ce_loss_3": 4.558122348785401, "ce_loss_7": 3.866991031169891, "epoch": 0.123, "grad_norm": 656.0, "kl_loss_10": 285.63698654174806, "kl_loss_2": 2814.325549316406, "kl_loss_3": 2321.4359924316404, "kl_loss_7": 924.2180969238282, "learning_rate": 0.0009681970319895803, "loss": 1610.0467, "step": 1230 }, { "ce_loss_10": 3.6617783904075623, "ce_loss_13": 3.5239094376564024, "ce_loss_2": 4.840570569038391, "ce_loss_3": 4.5621686458587645, "ce_loss_7": 3.9261529445648193, "epoch": 0.124, "grad_norm": 660.0, "kl_loss_10": 282.9206481933594, "kl_loss_2": 2658.744873046875, "kl_loss_3": 2162.91650390625, "kl_loss_7": 894.8567260742187, "learning_rate": 0.0009676378356149733, "loss": 1510.0703, "step": 1240 }, { "ce_loss_10": 3.632222390174866, "ce_loss_13": 3.49722797870636, "ce_loss_2": 4.803181719779968, "ce_loss_3": 4.527125644683838, "ce_loss_7": 3.893145501613617, "epoch": 0.125, "grad_norm": 676.0, "kl_loss_10": 306.93408966064453, "kl_loss_2": 2618.3517578125, "kl_loss_3": 2135.0671936035155, "kl_loss_7": 870.7611785888672, "learning_rate": 0.0009670739301325534, "loss": 1495.915, "step": 1250 }, { "ce_loss_10": 3.5965846180915833, "ce_loss_13": 3.461331534385681, "ce_loss_2": 4.77229871749878, "ce_loss_3": 4.488967990875244, "ce_loss_7": 3.870732378959656, "epoch": 0.126, "grad_norm": 824.0, "kl_loss_10": 288.87402572631834, "kl_loss_2": 2631.6656005859377, "kl_loss_3": 2132.2338745117186, "kl_loss_7": 890.5492980957031, "learning_rate": 0.0009665053212208426, "loss": 1507.3391, "step": 1260 }, { "ce_loss_10": 3.6325414419174193, "ce_loss_13": 3.5006507635116577, "ce_loss_2": 4.82985291481018, "ce_loss_3": 4.53967547416687, "ce_loss_7": 3.907087206840515, "epoch": 0.127, "grad_norm": 824.0, "kl_loss_10": 289.66627197265626, "kl_loss_2": 2682.7635498046875, "kl_loss_3": 2171.759143066406, "kl_loss_7": 897.6279174804688, "learning_rate": 0.0009659320146057262, "loss": 1515.1299, "step": 1270 }, { "ce_loss_10": 3.6294240951538086, "ce_loss_13": 3.5012729167938232, "ce_loss_2": 4.802068519592285, "ce_loss_3": 4.516877055168152, "ce_loss_7": 3.912596344947815, "epoch": 0.128, "grad_norm": 1040.0, "kl_loss_10": 274.3899444580078, "kl_loss_2": 2616.2175048828126, "kl_loss_3": 2113.874139404297, "kl_loss_7": 894.8648956298828, "learning_rate": 0.0009653540160603955, "loss": 1485.5743, "step": 1280 }, { "ce_loss_10": 3.631951367855072, "ce_loss_13": 3.5082743883132936, "ce_loss_2": 4.7942791938781735, "ce_loss_3": 4.533441662788391, "ce_loss_7": 3.911020016670227, "epoch": 0.129, "grad_norm": 980.0, "kl_loss_10": 277.70714950561523, "kl_loss_2": 2607.6315795898436, "kl_loss_3": 2154.8384338378905, "kl_loss_7": 902.8254302978515, "learning_rate": 0.0009647713314052896, "loss": 1475.7309, "step": 1290 }, { "ce_loss_10": 3.5910762190818786, "ce_loss_13": 3.4583710193634034, "ce_loss_2": 4.806964182853699, "ce_loss_3": 4.536605000495911, "ce_loss_7": 3.892735993862152, "epoch": 0.13, "grad_norm": 1032.0, "kl_loss_10": 281.282731628418, "kl_loss_2": 2713.5140380859375, "kl_loss_3": 2245.954937744141, "kl_loss_7": 924.882958984375, "learning_rate": 0.0009641839665080363, "loss": 1529.1627, "step": 1300 }, { "ce_loss_10": 3.5369811177253725, "ce_loss_13": 3.4184723615646364, "ce_loss_2": 4.746987700462341, "ce_loss_3": 4.47196786403656, "ce_loss_7": 3.8142111539840697, "epoch": 0.131, "grad_norm": 708.0, "kl_loss_10": 267.0766883850098, "kl_loss_2": 2651.264123535156, "kl_loss_3": 2169.8530151367186, "kl_loss_7": 874.0395812988281, "learning_rate": 0.0009635919272833937, "loss": 1472.4912, "step": 1310 }, { "ce_loss_10": 3.582905340194702, "ce_loss_13": 3.4547547817230226, "ce_loss_2": 4.782030344009399, "ce_loss_3": 4.50511953830719, "ce_loss_7": 3.8575597286224363, "epoch": 0.132, "grad_norm": 640.0, "kl_loss_10": 274.49700088500975, "kl_loss_2": 2645.7089721679686, "kl_loss_3": 2148.3076110839843, "kl_loss_7": 865.2912628173829, "learning_rate": 0.0009629952196931902, "loss": 1461.5725, "step": 1320 }, { "ce_loss_10": 3.560918188095093, "ce_loss_13": 3.4357552766799926, "ce_loss_2": 4.777603983879089, "ce_loss_3": 4.497129726409912, "ce_loss_7": 3.8250754475593567, "epoch": 0.133, "grad_norm": 612.0, "kl_loss_10": 266.5273551940918, "kl_loss_2": 2692.33935546875, "kl_loss_3": 2197.4263916015625, "kl_loss_7": 846.3100128173828, "learning_rate": 0.0009623938497462645, "loss": 1482.4779, "step": 1330 }, { "ce_loss_10": 3.559932196140289, "ce_loss_13": 3.4353162169456484, "ce_loss_2": 4.754807543754578, "ce_loss_3": 4.478498530387879, "ce_loss_7": 3.8313623666763306, "epoch": 0.134, "grad_norm": 564.0, "kl_loss_10": 268.2800895690918, "kl_loss_2": 2653.6271240234373, "kl_loss_3": 2162.7194641113283, "kl_loss_7": 859.2419372558594, "learning_rate": 0.0009617878234984055, "loss": 1499.2066, "step": 1340 }, { "ce_loss_10": 3.651080513000488, "ce_loss_13": 3.533881187438965, "ce_loss_2": 4.8088576078414915, "ce_loss_3": 4.535065650939941, "ce_loss_7": 3.9042758703231812, "epoch": 0.135, "grad_norm": 712.0, "kl_loss_10": 256.59825744628904, "kl_loss_2": 2581.625207519531, "kl_loss_3": 2098.4682495117186, "kl_loss_7": 828.9938018798828, "learning_rate": 0.0009611771470522907, "loss": 1464.5767, "step": 1350 }, { "ce_loss_10": 3.5779558777809144, "ce_loss_13": 3.457493555545807, "ce_loss_2": 4.792022109031677, "ce_loss_3": 4.514930057525635, "ce_loss_7": 3.8448525190353395, "epoch": 0.136, "grad_norm": 616.0, "kl_loss_10": 259.41123428344724, "kl_loss_2": 2657.6331420898437, "kl_loss_3": 2171.1466857910154, "kl_loss_7": 847.0537750244141, "learning_rate": 0.0009605618265574251, "loss": 1459.6229, "step": 1360 }, { "ce_loss_10": 3.5429495334625245, "ce_loss_13": 3.4162652492523193, "ce_loss_2": 4.794952082633972, "ce_loss_3": 4.535301685333252, "ce_loss_7": 3.8165592908859254, "epoch": 0.137, "grad_norm": 620.0, "kl_loss_10": 271.0598449707031, "kl_loss_2": 2776.145849609375, "kl_loss_3": 2325.675885009766, "kl_loss_7": 881.587744140625, "learning_rate": 0.0009599418682100792, "loss": 1522.4414, "step": 1370 }, { "ce_loss_10": 3.58179566860199, "ce_loss_13": 3.459395945072174, "ce_loss_2": 4.792193937301636, "ce_loss_3": 4.521099305152893, "ce_loss_7": 3.84169602394104, "epoch": 0.138, "grad_norm": 724.0, "kl_loss_10": 257.83258666992185, "kl_loss_2": 2672.4068237304687, "kl_loss_3": 2198.559918212891, "kl_loss_7": 850.8091857910156, "learning_rate": 0.0009593172782532268, "loss": 1496.2724, "step": 1380 }, { "ce_loss_10": 3.622367191314697, "ce_loss_13": 3.506042146682739, "ce_loss_2": 4.801430583000183, "ce_loss_3": 4.530508184432984, "ce_loss_7": 3.888216722011566, "epoch": 0.139, "grad_norm": 672.0, "kl_loss_10": 260.9531532287598, "kl_loss_2": 2599.6354858398436, "kl_loss_3": 2121.937152099609, "kl_loss_7": 852.8548278808594, "learning_rate": 0.0009586880629764817, "loss": 1464.8023, "step": 1390 }, { "ce_loss_10": 3.546726655960083, "ce_loss_13": 3.428490459918976, "ce_loss_2": 4.748290467262268, "ce_loss_3": 4.471861267089844, "ce_loss_7": 3.824984240531921, "epoch": 0.14, "grad_norm": 620.0, "kl_loss_10": 260.18620986938475, "kl_loss_2": 2649.2240234375, "kl_loss_3": 2164.870428466797, "kl_loss_7": 870.0703582763672, "learning_rate": 0.0009580542287160348, "loss": 1462.9275, "step": 1400 }, { "ce_loss_10": 3.5134201645851135, "ce_loss_13": 3.396924638748169, "ce_loss_2": 4.727832221984864, "ce_loss_3": 4.457144689559937, "ce_loss_7": 3.781324291229248, "epoch": 0.141, "grad_norm": 724.0, "kl_loss_10": 257.8106407165527, "kl_loss_2": 2672.565283203125, "kl_loss_3": 2194.398052978516, "kl_loss_7": 841.9467041015625, "learning_rate": 0.0009574157818545901, "loss": 1469.0121, "step": 1410 }, { "ce_loss_10": 3.583372378349304, "ce_loss_13": 3.4670314311981203, "ce_loss_2": 4.753075981140137, "ce_loss_3": 4.488786149024963, "ce_loss_7": 3.8414045095443727, "epoch": 0.142, "grad_norm": 768.0, "kl_loss_10": 250.4652572631836, "kl_loss_2": 2575.260546875, "kl_loss_3": 2109.250030517578, "kl_loss_7": 815.4136535644532, "learning_rate": 0.0009567727288213005, "loss": 1470.4241, "step": 1420 }, { "ce_loss_10": 3.5615610837936402, "ce_loss_13": 3.4428680539131165, "ce_loss_2": 4.766120481491089, "ce_loss_3": 4.489290237426758, "ce_loss_7": 3.8387726664543154, "epoch": 0.143, "grad_norm": 680.0, "kl_loss_10": 259.5032684326172, "kl_loss_2": 2652.6231079101562, "kl_loss_3": 2168.8318054199217, "kl_loss_7": 872.5292297363281, "learning_rate": 0.0009561250760917027, "loss": 1465.2545, "step": 1430 }, { "ce_loss_10": 3.5825438022613527, "ce_loss_13": 3.4635141372680662, "ce_loss_2": 4.774414443969727, "ce_loss_3": 4.498082184791565, "ce_loss_7": 3.8522005438804627, "epoch": 0.144, "grad_norm": 656.0, "kl_loss_10": 263.3311599731445, "kl_loss_2": 2662.4484375, "kl_loss_3": 2176.186492919922, "kl_loss_7": 865.9247039794922, "learning_rate": 0.0009554728301876525, "loss": 1454.278, "step": 1440 }, { "ce_loss_10": 3.6376792669296263, "ce_loss_13": 3.515091061592102, "ce_loss_2": 4.810996460914612, "ce_loss_3": 4.536413979530335, "ce_loss_7": 3.9078781604766846, "epoch": 0.145, "grad_norm": 616.0, "kl_loss_10": 259.68054962158203, "kl_loss_2": 2600.8175415039063, "kl_loss_3": 2120.5454040527343, "kl_loss_7": 864.2900634765625, "learning_rate": 0.0009548159976772592, "loss": 1508.1567, "step": 1450 }, { "ce_loss_10": 3.5796504259109496, "ce_loss_13": 3.456580376625061, "ce_loss_2": 4.787333536148071, "ce_loss_3": 4.520044946670533, "ce_loss_7": 3.8587978959083555, "epoch": 0.146, "grad_norm": 624.0, "kl_loss_10": 265.1648490905762, "kl_loss_2": 2666.8885864257813, "kl_loss_3": 2195.818231201172, "kl_loss_7": 871.2362884521484, "learning_rate": 0.0009541545851748186, "loss": 1477.8201, "step": 1460 }, { "ce_loss_10": 3.4508144855499268, "ce_loss_13": 3.3300524830818174, "ce_loss_2": 4.699088287353516, "ce_loss_3": 4.421405148506165, "ce_loss_7": 3.735712671279907, "epoch": 0.147, "grad_norm": 844.0, "kl_loss_10": 262.5924041748047, "kl_loss_2": 2730.21630859375, "kl_loss_3": 2243.504345703125, "kl_loss_7": 878.0860382080078, "learning_rate": 0.0009534885993407473, "loss": 1496.8188, "step": 1470 }, { "ce_loss_10": 3.611809027194977, "ce_loss_13": 3.4930022716522218, "ce_loss_2": 4.806360912322998, "ce_loss_3": 4.5402860879898075, "ce_loss_7": 3.8858142852783204, "epoch": 0.148, "grad_norm": 740.0, "kl_loss_10": 256.4318244934082, "kl_loss_2": 2655.588269042969, "kl_loss_3": 2175.6559936523436, "kl_loss_7": 861.8565673828125, "learning_rate": 0.0009528180468815154, "loss": 1488.9336, "step": 1480 }, { "ce_loss_10": 3.6558565139770507, "ce_loss_13": 3.538043713569641, "ce_loss_2": 4.811466526985169, "ce_loss_3": 4.544855618476868, "ce_loss_7": 3.9390755891799927, "epoch": 0.149, "grad_norm": 844.0, "kl_loss_10": 264.13821868896486, "kl_loss_2": 2565.1232788085936, "kl_loss_3": 2095.556463623047, "kl_loss_7": 897.2646911621093, "learning_rate": 0.0009521429345495787, "loss": 1465.2869, "step": 1490 }, { "ce_loss_10": 3.646085023880005, "ce_loss_13": 3.5196659207344054, "ce_loss_2": 4.780038499832154, "ce_loss_3": 4.50464768409729, "ce_loss_7": 3.927055561542511, "epoch": 0.15, "grad_norm": 980.0, "kl_loss_10": 266.5307144165039, "kl_loss_2": 2540.4637084960937, "kl_loss_3": 2068.8232849121096, "kl_loss_7": 888.35068359375, "learning_rate": 0.0009514632691433108, "loss": 1455.9041, "step": 1500 }, { "ce_loss_10": 3.5988011956214905, "ce_loss_13": 3.482589673995972, "ce_loss_2": 4.76681923866272, "ce_loss_3": 4.485762524604797, "ce_loss_7": 3.8728180885314942, "epoch": 0.151, "grad_norm": 600.0, "kl_loss_10": 260.8206298828125, "kl_loss_2": 2587.971142578125, "kl_loss_3": 2094.4052795410157, "kl_loss_7": 863.3080963134765, "learning_rate": 0.0009507790575069346, "loss": 1457.9502, "step": 1510 }, { "ce_loss_10": 3.5764056205749513, "ce_loss_13": 3.453061044216156, "ce_loss_2": 4.775901889801025, "ce_loss_3": 4.500339031219482, "ce_loss_7": 3.849775242805481, "epoch": 0.152, "grad_norm": 672.0, "kl_loss_10": 258.1785354614258, "kl_loss_2": 2655.3977172851564, "kl_loss_3": 2164.0363708496093, "kl_loss_7": 857.1902496337891, "learning_rate": 0.0009500903065304539, "loss": 1495.6711, "step": 1520 }, { "ce_loss_10": 3.608113396167755, "ce_loss_13": 3.498811888694763, "ce_loss_2": 4.760950970649719, "ce_loss_3": 4.486514663696289, "ce_loss_7": 3.8602269887924194, "epoch": 0.153, "grad_norm": 664.0, "kl_loss_10": 245.0189353942871, "kl_loss_2": 2552.576379394531, "kl_loss_3": 2060.3522766113283, "kl_loss_7": 806.0807342529297, "learning_rate": 0.0009493970231495835, "loss": 1444.8406, "step": 1530 }, { "ce_loss_10": 3.547162938117981, "ce_loss_13": 3.44165985584259, "ce_loss_2": 4.701804065704346, "ce_loss_3": 4.424288666248321, "ce_loss_7": 3.8007919073104857, "epoch": 0.154, "grad_norm": 648.0, "kl_loss_10": 241.08162002563478, "kl_loss_2": 2573.9149780273438, "kl_loss_3": 2088.7893615722655, "kl_loss_7": 812.2064361572266, "learning_rate": 0.0009486992143456792, "loss": 1427.6314, "step": 1540 }, { "ce_loss_10": 3.5828447937965393, "ce_loss_13": 3.4581031084060667, "ce_loss_2": 4.834084248542785, "ce_loss_3": 4.553447818756103, "ce_loss_7": 3.8656217455863953, "epoch": 0.155, "grad_norm": 660.0, "kl_loss_10": 263.4285690307617, "kl_loss_2": 2765.8754150390623, "kl_loss_3": 2266.0636291503906, "kl_loss_7": 882.9553771972656, "learning_rate": 0.0009479968871456679, "loss": 1498.7352, "step": 1550 }, { "ce_loss_10": 3.547222447395325, "ce_loss_13": 3.4320276618003844, "ce_loss_2": 4.768963408470154, "ce_loss_3": 4.480878567695617, "ce_loss_7": 3.828988194465637, "epoch": 0.156, "grad_norm": 760.0, "kl_loss_10": 259.2473831176758, "kl_loss_2": 2697.095703125, "kl_loss_3": 2199.6418579101564, "kl_loss_7": 874.4399932861328, "learning_rate": 0.0009472900486219768, "loss": 1467.8941, "step": 1560 }, { "ce_loss_10": 3.54234699010849, "ce_loss_13": 3.416030561923981, "ce_loss_2": 4.702804708480835, "ce_loss_3": 4.435993790626526, "ce_loss_7": 3.811834490299225, "epoch": 0.157, "grad_norm": 996.0, "kl_loss_10": 266.18832244873045, "kl_loss_2": 2600.0114868164064, "kl_loss_3": 2127.867303466797, "kl_loss_7": 872.285043334961, "learning_rate": 0.000946578705892462, "loss": 1470.9803, "step": 1570 }, { "ce_loss_10": 3.5780028820037844, "ce_loss_13": 3.457060468196869, "ce_loss_2": 4.7225889444351195, "ce_loss_3": 4.4475972890853885, "ce_loss_7": 3.8437977194786073, "epoch": 0.158, "grad_norm": 804.0, "kl_loss_10": 277.74141387939454, "kl_loss_2": 2544.743029785156, "kl_loss_3": 2066.6833618164064, "kl_loss_7": 835.531689453125, "learning_rate": 0.0009458628661203367, "loss": 1460.073, "step": 1580 }, { "ce_loss_10": 3.5895689606666563, "ce_loss_13": 3.4543488025665283, "ce_loss_2": 4.777545094490051, "ce_loss_3": 4.501162362098694, "ce_loss_7": 3.846569240093231, "epoch": 0.159, "grad_norm": 748.0, "kl_loss_10": 280.14837341308595, "kl_loss_2": 2651.9770629882814, "kl_loss_3": 2168.784558105469, "kl_loss_7": 873.0370697021484, "learning_rate": 0.0009451425365140996, "loss": 1445.3969, "step": 1590 }, { "ce_loss_10": 3.6579004883766175, "ce_loss_13": 3.5379099249839783, "ce_loss_2": 4.773951435089112, "ce_loss_3": 4.50466411113739, "ce_loss_7": 3.922029638290405, "epoch": 0.16, "grad_norm": 728.0, "kl_loss_10": 273.34312896728517, "kl_loss_2": 2508.3281860351562, "kl_loss_3": 2021.1453735351563, "kl_loss_7": 841.1831604003906, "learning_rate": 0.0009444177243274617, "loss": 1408.8492, "step": 1600 }, { "ce_loss_10": 3.514503800868988, "ce_loss_13": 3.388037991523743, "ce_loss_2": 4.701039886474609, "ce_loss_3": 4.418904185295105, "ce_loss_7": 3.7763099312782287, "epoch": 0.161, "grad_norm": 704.0, "kl_loss_10": 268.4227348327637, "kl_loss_2": 2642.4529418945312, "kl_loss_3": 2156.727893066406, "kl_loss_7": 856.544287109375, "learning_rate": 0.0009436884368592739, "loss": 1462.7545, "step": 1610 }, { "ce_loss_10": 3.55902304649353, "ce_loss_13": 3.441978645324707, "ce_loss_2": 4.705282998085022, "ce_loss_3": 4.427343964576721, "ce_loss_7": 3.810055124759674, "epoch": 0.162, "grad_norm": 692.0, "kl_loss_10": 253.71325302124023, "kl_loss_2": 2545.9316528320314, "kl_loss_3": 2055.34326171875, "kl_loss_7": 814.7054443359375, "learning_rate": 0.0009429546814534529, "loss": 1452.6556, "step": 1620 }, { "ce_loss_10": 3.567894661426544, "ce_loss_13": 3.4576117157936097, "ce_loss_2": 4.725762176513672, "ce_loss_3": 4.453288149833679, "ce_loss_7": 3.8241241455078123, "epoch": 0.163, "grad_norm": 600.0, "kl_loss_10": 249.5528419494629, "kl_loss_2": 2561.039794921875, "kl_loss_3": 2084.6285034179687, "kl_loss_7": 811.9796569824218, "learning_rate": 0.0009422164654989072, "loss": 1405.3155, "step": 1630 }, { "ce_loss_10": 3.682257628440857, "ce_loss_13": 3.57005797624588, "ce_loss_2": 4.807721471786499, "ce_loss_3": 4.5406172513961796, "ce_loss_7": 3.9288353323936462, "epoch": 0.164, "grad_norm": 632.0, "kl_loss_10": 249.95079803466797, "kl_loss_2": 2525.5760131835937, "kl_loss_3": 2063.9668884277344, "kl_loss_7": 811.2918426513672, "learning_rate": 0.0009414737964294635, "loss": 1427.1312, "step": 1640 }, { "ce_loss_10": 3.6101224184036256, "ce_loss_13": 3.5010381817817686, "ce_loss_2": 4.720621514320373, "ce_loss_3": 4.4590880393981935, "ce_loss_7": 3.8465168356895445, "epoch": 0.165, "grad_norm": 592.0, "kl_loss_10": 244.4941291809082, "kl_loss_2": 2467.5499755859373, "kl_loss_3": 2009.9993835449218, "kl_loss_7": 785.2798095703125, "learning_rate": 0.000940726681723791, "loss": 1420.5047, "step": 1650 }, { "ce_loss_10": 3.4529421091079713, "ce_loss_13": 3.3362591743469237, "ce_loss_2": 4.67095410823822, "ce_loss_3": 4.395775043964386, "ce_loss_7": 3.7144131302833556, "epoch": 0.166, "grad_norm": 760.0, "kl_loss_10": 256.26583633422854, "kl_loss_2": 2688.0345825195313, "kl_loss_3": 2212.7634155273436, "kl_loss_7": 836.4253662109375, "learning_rate": 0.0009399751289053266, "loss": 1423.8466, "step": 1660 }, { "ce_loss_10": 3.667633831501007, "ce_loss_13": 3.557128643989563, "ce_loss_2": 4.805055928230286, "ce_loss_3": 4.532746481895447, "ce_loss_7": 3.911760663986206, "epoch": 0.167, "grad_norm": 700.0, "kl_loss_10": 250.78092575073242, "kl_loss_2": 2539.268176269531, "kl_loss_3": 2059.193713378906, "kl_loss_7": 797.4180786132813, "learning_rate": 0.0009392191455421988, "loss": 1439.8459, "step": 1670 }, { "ce_loss_10": 3.6344913125038145, "ce_loss_13": 3.5230419993400575, "ce_loss_2": 4.79052848815918, "ce_loss_3": 4.512491989135742, "ce_loss_7": 3.8782394886016847, "epoch": 0.168, "grad_norm": 672.0, "kl_loss_10": 262.5000991821289, "kl_loss_2": 2580.610949707031, "kl_loss_3": 2102.571502685547, "kl_loss_7": 817.8095031738281, "learning_rate": 0.0009384587392471515, "loss": 1409.023, "step": 1680 }, { "ce_loss_10": 3.6255006551742555, "ce_loss_13": 3.514340567588806, "ce_loss_2": 4.734428143501281, "ce_loss_3": 4.468456673622131, "ce_loss_7": 3.8644169330596925, "epoch": 0.169, "grad_norm": 628.0, "kl_loss_10": 242.1350540161133, "kl_loss_2": 2468.8160400390625, "kl_loss_3": 2004.5263061523438, "kl_loss_7": 785.5388061523438, "learning_rate": 0.0009376939176774678, "loss": 1384.7148, "step": 1690 }, { "ce_loss_10": 3.601682686805725, "ce_loss_13": 3.4858548164367678, "ce_loss_2": 4.752888894081115, "ce_loss_3": 4.474552822113037, "ce_loss_7": 3.842711091041565, "epoch": 0.17, "grad_norm": 636.0, "kl_loss_10": 245.69830017089845, "kl_loss_2": 2544.683557128906, "kl_loss_3": 2064.160784912109, "kl_loss_7": 792.8873626708985, "learning_rate": 0.0009369246885348925, "loss": 1434.5433, "step": 1700 }, { "ce_loss_10": 3.5952138662338258, "ce_loss_13": 3.4776424884796144, "ce_loss_2": 4.792232918739319, "ce_loss_3": 4.513515877723694, "ce_loss_7": 3.8616483092308043, "epoch": 0.171, "grad_norm": 644.0, "kl_loss_10": 250.0074020385742, "kl_loss_2": 2643.389611816406, "kl_loss_3": 2155.037109375, "kl_loss_7": 835.121694946289, "learning_rate": 0.0009361510595655545, "loss": 1446.8347, "step": 1710 }, { "ce_loss_10": 3.558023285865784, "ce_loss_13": 3.438031017780304, "ce_loss_2": 4.730398392677307, "ce_loss_3": 4.452085471153259, "ce_loss_7": 3.815502095222473, "epoch": 0.172, "grad_norm": 672.0, "kl_loss_10": 260.42660064697264, "kl_loss_2": 2622.6736572265627, "kl_loss_3": 2126.2764099121096, "kl_loss_7": 833.0841033935546, "learning_rate": 0.0009353730385598887, "loss": 1443.5211, "step": 1720 }, { "ce_loss_10": 3.4771748185157776, "ce_loss_13": 3.364219045639038, "ce_loss_2": 4.693475008010864, "ce_loss_3": 4.410137629508972, "ce_loss_7": 3.7466461181640627, "epoch": 0.173, "grad_norm": 576.0, "kl_loss_10": 244.67605361938476, "kl_loss_2": 2652.6466064453125, "kl_loss_3": 2161.871044921875, "kl_loss_7": 827.6846221923828, "learning_rate": 0.0009345906333525581, "loss": 1466.803, "step": 1730 }, { "ce_loss_10": 3.515894877910614, "ce_loss_13": 3.403614568710327, "ce_loss_2": 4.707322573661804, "ce_loss_3": 4.422236812114716, "ce_loss_7": 3.7741501927375793, "epoch": 0.174, "grad_norm": 608.0, "kl_loss_10": 250.64810333251953, "kl_loss_2": 2637.284143066406, "kl_loss_3": 2135.0481140136717, "kl_loss_7": 835.5378570556641, "learning_rate": 0.0009338038518223745, "loss": 1437.4744, "step": 1740 }, { "ce_loss_10": 3.5834938049316407, "ce_loss_13": 3.468910980224609, "ce_loss_2": 4.762004089355469, "ce_loss_3": 4.486204957962036, "ce_loss_7": 3.8505713820457457, "epoch": 0.175, "grad_norm": 652.0, "kl_loss_10": 254.51539306640626, "kl_loss_2": 2618.7681884765625, "kl_loss_3": 2135.773858642578, "kl_loss_7": 849.8012634277344, "learning_rate": 0.0009330127018922195, "loss": 1479.132, "step": 1750 }, { "ce_loss_10": 3.538338470458984, "ce_loss_13": 3.4237607955932616, "ce_loss_2": 4.7127416133880615, "ce_loss_3": 4.443085932731629, "ce_loss_7": 3.794516444206238, "epoch": 0.176, "grad_norm": 628.0, "kl_loss_10": 245.44887084960936, "kl_loss_2": 2605.848291015625, "kl_loss_3": 2119.558026123047, "kl_loss_7": 818.0517547607421, "learning_rate": 0.0009322171915289634, "loss": 1443.3754, "step": 1760 }, { "ce_loss_10": 3.5648101806640624, "ce_loss_13": 3.459370458126068, "ce_loss_2": 4.707282447814942, "ce_loss_3": 4.433714365959167, "ce_loss_7": 3.812247085571289, "epoch": 0.177, "grad_norm": 576.0, "kl_loss_10": 245.77867431640624, "kl_loss_2": 2558.2099365234376, "kl_loss_3": 2069.932727050781, "kl_loss_7": 809.8514526367187, "learning_rate": 0.0009314173287433873, "loss": 1402.6621, "step": 1770 }, { "ce_loss_10": 3.5681435227394105, "ce_loss_13": 3.4554543256759644, "ce_loss_2": 4.716624093055725, "ce_loss_3": 4.441683101654053, "ce_loss_7": 3.8178189396858215, "epoch": 0.178, "grad_norm": 704.0, "kl_loss_10": 250.21724319458008, "kl_loss_2": 2566.923547363281, "kl_loss_3": 2076.703576660156, "kl_loss_7": 808.6476196289062, "learning_rate": 0.0009306131215901003, "loss": 1403.6738, "step": 1780 }, { "ce_loss_10": 3.60051885843277, "ce_loss_13": 3.4851089835166933, "ce_loss_2": 4.74112594127655, "ce_loss_3": 4.468067002296448, "ce_loss_7": 3.8350677728652953, "epoch": 0.179, "grad_norm": 656.0, "kl_loss_10": 247.00397262573242, "kl_loss_2": 2550.1098876953124, "kl_loss_3": 2071.478955078125, "kl_loss_7": 797.3671112060547, "learning_rate": 0.0009298045781674596, "loss": 1386.7528, "step": 1790 }, { "ce_loss_10": 3.576521575450897, "ce_loss_13": 3.465667748451233, "ce_loss_2": 4.70545973777771, "ce_loss_3": 4.437681531906128, "ce_loss_7": 3.823224997520447, "epoch": 0.18, "grad_norm": 640.0, "kl_loss_10": 245.9371192932129, "kl_loss_2": 2516.936376953125, "kl_loss_3": 2031.5426147460937, "kl_loss_7": 793.7673767089843, "learning_rate": 0.0009289917066174886, "loss": 1415.4195, "step": 1800 }, { "ce_loss_10": 3.568215787410736, "ce_loss_13": 3.465099549293518, "ce_loss_2": 4.663200092315674, "ce_loss_3": 4.39816825389862, "ce_loss_7": 3.797432005405426, "epoch": 0.181, "grad_norm": 596.0, "kl_loss_10": 236.99261932373048, "kl_loss_2": 2444.4304931640627, "kl_loss_3": 1977.5733642578125, "kl_loss_7": 762.0940002441406, "learning_rate": 0.0009281745151257945, "loss": 1372.7959, "step": 1810 }, { "ce_loss_10": 3.589988100528717, "ce_loss_13": 3.4779568314552307, "ce_loss_2": 4.741122603416443, "ce_loss_3": 4.463086032867432, "ce_loss_7": 3.8317890048027037, "epoch": 0.182, "grad_norm": 576.0, "kl_loss_10": 245.52628021240236, "kl_loss_2": 2546.031115722656, "kl_loss_3": 2057.9885314941407, "kl_loss_7": 789.636849975586, "learning_rate": 0.0009273530119214868, "loss": 1414.9602, "step": 1820 }, { "ce_loss_10": 3.6874829173088073, "ce_loss_13": 3.5830198526382446, "ce_loss_2": 4.805440378189087, "ce_loss_3": 4.542007470130921, "ce_loss_7": 3.922217535972595, "epoch": 0.183, "grad_norm": 668.0, "kl_loss_10": 240.62074966430663, "kl_loss_2": 2477.285852050781, "kl_loss_3": 2025.298876953125, "kl_loss_7": 778.8258850097657, "learning_rate": 0.0009265272052770935, "loss": 1365.1876, "step": 1830 }, { "ce_loss_10": 3.5063879013061525, "ce_loss_13": 3.3919414281845093, "ce_loss_2": 4.691436982154846, "ce_loss_3": 4.40977828502655, "ce_loss_7": 3.7569626212120055, "epoch": 0.184, "grad_norm": 672.0, "kl_loss_10": 245.37701873779298, "kl_loss_2": 2600.8256103515623, "kl_loss_3": 2103.731726074219, "kl_loss_7": 796.2739471435547, "learning_rate": 0.0009256971035084784, "loss": 1423.7646, "step": 1840 }, { "ce_loss_10": 3.4534160137176513, "ce_loss_13": 3.337074172496796, "ce_loss_2": 4.650833582878112, "ce_loss_3": 4.375414204597473, "ce_loss_7": 3.7153374671936037, "epoch": 0.185, "grad_norm": 872.0, "kl_loss_10": 253.35809020996095, "kl_loss_2": 2636.6057983398437, "kl_loss_3": 2149.6516052246093, "kl_loss_7": 833.7322570800782, "learning_rate": 0.0009248627149747573, "loss": 1433.1182, "step": 1850 }, { "ce_loss_10": 3.6552318572998046, "ce_loss_13": 3.5436174392700197, "ce_loss_2": 4.771462321281433, "ce_loss_3": 4.505353546142578, "ce_loss_7": 3.8980504512786864, "epoch": 0.186, "grad_norm": 628.0, "kl_loss_10": 244.59689865112304, "kl_loss_2": 2502.3949462890623, "kl_loss_3": 2027.7750183105468, "kl_loss_7": 792.985708618164, "learning_rate": 0.0009240240480782129, "loss": 1402.8563, "step": 1860 }, { "ce_loss_10": 3.559772253036499, "ce_loss_13": 3.444066059589386, "ce_loss_2": 4.714746379852295, "ce_loss_3": 4.439750409126281, "ce_loss_7": 3.8083682656288147, "epoch": 0.187, "grad_norm": 696.0, "kl_loss_10": 248.7159553527832, "kl_loss_2": 2569.0499755859373, "kl_loss_3": 2081.2441528320314, "kl_loss_7": 799.65380859375, "learning_rate": 0.0009231811112642122, "loss": 1391.885, "step": 1870 }, { "ce_loss_10": 3.603023958206177, "ce_loss_13": 3.4911489486694336, "ce_loss_2": 4.7107093334198, "ce_loss_3": 4.44477071762085, "ce_loss_7": 3.8424261450767516, "epoch": 0.188, "grad_norm": 756.0, "kl_loss_10": 245.3149284362793, "kl_loss_2": 2484.4417358398437, "kl_loss_3": 2013.6257080078126, "kl_loss_7": 788.6129425048828, "learning_rate": 0.0009223339130211192, "loss": 1382.5715, "step": 1880 }, { "ce_loss_10": 3.451503300666809, "ce_loss_13": 3.3456833481788637, "ce_loss_2": 4.6354892492294315, "ce_loss_3": 4.368392133712769, "ce_loss_7": 3.7025105237960814, "epoch": 0.189, "grad_norm": 796.0, "kl_loss_10": 235.8703857421875, "kl_loss_2": 2606.5722534179686, "kl_loss_3": 2137.981573486328, "kl_loss_7": 795.582894897461, "learning_rate": 0.0009214824618802108, "loss": 1426.9203, "step": 1890 }, { "ce_loss_10": 3.633524978160858, "ce_loss_13": 3.5242482304573057, "ce_loss_2": 4.770471715927124, "ce_loss_3": 4.501252269744873, "ce_loss_7": 3.883507859706879, "epoch": 0.19, "grad_norm": 652.0, "kl_loss_10": 237.73654251098634, "kl_loss_2": 2486.365759277344, "kl_loss_3": 2019.5263671875, "kl_loss_7": 793.6973388671875, "learning_rate": 0.0009206267664155906, "loss": 1428.9256, "step": 1900 }, { "ce_loss_10": 3.5532122611999513, "ce_loss_13": 3.443064200878143, "ce_loss_2": 4.697825288772583, "ce_loss_3": 4.427114844322205, "ce_loss_7": 3.799003005027771, "epoch": 0.191, "grad_norm": 636.0, "kl_loss_10": 241.10890350341796, "kl_loss_2": 2548.555554199219, "kl_loss_3": 2061.9953002929688, "kl_loss_7": 794.9706726074219, "learning_rate": 0.0009197668352441024, "loss": 1417.5695, "step": 1910 }, { "ce_loss_10": 3.608381187915802, "ce_loss_13": 3.4997890830039977, "ce_loss_2": 4.748308372497559, "ce_loss_3": 4.471417784690857, "ce_loss_7": 3.851922130584717, "epoch": 0.192, "grad_norm": 636.0, "kl_loss_10": 242.21438293457032, "kl_loss_2": 2509.6267700195312, "kl_loss_3": 2027.6890441894532, "kl_loss_7": 779.7179229736328, "learning_rate": 0.0009189026770252437, "loss": 1396.1437, "step": 1920 }, { "ce_loss_10": 3.6384175658226012, "ce_loss_13": 3.5275412440299987, "ce_loss_2": 4.762041211128235, "ce_loss_3": 4.48351948261261, "ce_loss_7": 3.8741258502006533, "epoch": 0.193, "grad_norm": 688.0, "kl_loss_10": 250.4880401611328, "kl_loss_2": 2491.730749511719, "kl_loss_3": 2004.1307067871094, "kl_loss_7": 785.3524200439454, "learning_rate": 0.000918034300461078, "loss": 1438.3092, "step": 1930 }, { "ce_loss_10": 3.675648069381714, "ce_loss_13": 3.555274224281311, "ce_loss_2": 4.77588381767273, "ce_loss_3": 4.506980061531067, "ce_loss_7": 3.9165189027786256, "epoch": 0.194, "grad_norm": 1048.0, "kl_loss_10": 251.8736488342285, "kl_loss_2": 2458.3200805664064, "kl_loss_3": 1995.1934143066405, "kl_loss_7": 806.3202392578125, "learning_rate": 0.0009171617142961477, "loss": 1389.0176, "step": 1940 }, { "ce_loss_10": 3.623457467556, "ce_loss_13": 3.512966477870941, "ce_loss_2": 4.729074192047119, "ce_loss_3": 4.464083290100097, "ce_loss_7": 3.8867802739143373, "epoch": 0.195, "grad_norm": 688.0, "kl_loss_10": 255.58710021972655, "kl_loss_2": 2479.066796875, "kl_loss_3": 2001.9678588867187, "kl_loss_7": 833.6603210449218, "learning_rate": 0.0009162849273173857, "loss": 1403.0846, "step": 1950 }, { "ce_loss_10": 3.5657452821731566, "ce_loss_13": 3.457024359703064, "ce_loss_2": 4.679703283309936, "ce_loss_3": 4.409625816345215, "ce_loss_7": 3.8033991694450378, "epoch": 0.196, "grad_norm": 656.0, "kl_loss_10": 242.9659797668457, "kl_loss_2": 2473.7406372070313, "kl_loss_3": 2000.534735107422, "kl_loss_7": 783.4248046875, "learning_rate": 0.0009154039483540273, "loss": 1391.609, "step": 1960 }, { "ce_loss_10": 3.5444339156150817, "ce_loss_13": 3.433286416530609, "ce_loss_2": 4.677814674377442, "ce_loss_3": 4.395683622360229, "ce_loss_7": 3.784545695781708, "epoch": 0.197, "grad_norm": 608.0, "kl_loss_10": 239.23334732055665, "kl_loss_2": 2520.074182128906, "kl_loss_3": 2031.6637634277345, "kl_loss_7": 792.3442230224609, "learning_rate": 0.0009145187862775209, "loss": 1388.6972, "step": 1970 }, { "ce_loss_10": 3.572359085083008, "ce_loss_13": 3.466273844242096, "ce_loss_2": 4.692143273353577, "ce_loss_3": 4.418303954601288, "ce_loss_7": 3.8197664856910705, "epoch": 0.198, "grad_norm": 660.0, "kl_loss_10": 241.7268035888672, "kl_loss_2": 2492.987420654297, "kl_loss_3": 2004.3476135253907, "kl_loss_7": 794.6048614501954, "learning_rate": 0.0009136294500014386, "loss": 1377.9902, "step": 1980 }, { "ce_loss_10": 3.52831609249115, "ce_loss_13": 3.4167757987976075, "ce_loss_2": 4.705040359497071, "ce_loss_3": 4.434882926940918, "ce_loss_7": 3.7779016494750977, "epoch": 0.199, "grad_norm": 684.0, "kl_loss_10": 242.86552047729492, "kl_loss_2": 2578.6255493164062, "kl_loss_3": 2108.4060180664064, "kl_loss_7": 798.0517791748047, "learning_rate": 0.000912735948481387, "loss": 1426.8047, "step": 1990 }, { "ce_loss_10": 3.5601553082466126, "ce_loss_13": 3.449883997440338, "ce_loss_2": 4.691212105751037, "ce_loss_3": 4.414692604541779, "ce_loss_7": 3.8016102075576783, "epoch": 0.2, "grad_norm": 684.0, "kl_loss_10": 242.28478622436523, "kl_loss_2": 2530.514270019531, "kl_loss_3": 2040.9486206054687, "kl_loss_7": 800.2102844238282, "learning_rate": 0.0009118382907149164, "loss": 1370.7061, "step": 2000 }, { "ce_loss_10": 3.5833643674850464, "ce_loss_13": 3.4740814447402952, "ce_loss_2": 4.70447518825531, "ce_loss_3": 4.429442811012268, "ce_loss_7": 3.8237846970558165, "epoch": 0.201, "grad_norm": 612.0, "kl_loss_10": 244.51040420532226, "kl_loss_2": 2494.5580932617186, "kl_loss_3": 2005.631494140625, "kl_loss_7": 779.4999328613281, "learning_rate": 0.0009109364857414306, "loss": 1380.7336, "step": 2010 }, { "ce_loss_10": 3.5532099485397337, "ce_loss_13": 3.4470490336418154, "ce_loss_2": 4.681869411468506, "ce_loss_3": 4.40200264453888, "ce_loss_7": 3.790750026702881, "epoch": 0.202, "grad_norm": 608.0, "kl_loss_10": 240.87973327636718, "kl_loss_2": 2528.7482421875, "kl_loss_3": 2036.1677551269531, "kl_loss_7": 777.9466033935547, "learning_rate": 0.0009100305426420956, "loss": 1419.7547, "step": 2020 }, { "ce_loss_10": 3.5112710118293764, "ce_loss_13": 3.404540646076202, "ce_loss_2": 4.711292386054993, "ce_loss_3": 4.432630777359009, "ce_loss_7": 3.757065165042877, "epoch": 0.203, "grad_norm": 664.0, "kl_loss_10": 238.4617919921875, "kl_loss_2": 2652.4912963867187, "kl_loss_3": 2152.2258422851564, "kl_loss_7": 790.063916015625, "learning_rate": 0.0009091204705397484, "loss": 1413.6135, "step": 2030 }, { "ce_loss_10": 3.508105480670929, "ce_loss_13": 3.399987006187439, "ce_loss_2": 4.703747749328613, "ce_loss_3": 4.428358674049377, "ce_loss_7": 3.7540559649467466, "epoch": 0.204, "grad_norm": 700.0, "kl_loss_10": 242.5270248413086, "kl_loss_2": 2644.1144165039063, "kl_loss_3": 2155.070489501953, "kl_loss_7": 790.7262329101562, "learning_rate": 0.0009082062785988049, "loss": 1424.9719, "step": 2040 }, { "ce_loss_10": 3.638819897174835, "ce_loss_13": 3.5337455749511717, "ce_loss_2": 4.727799487113953, "ce_loss_3": 4.457953143119812, "ce_loss_7": 3.8601122856140138, "epoch": 0.205, "grad_norm": 668.0, "kl_loss_10": 235.8659812927246, "kl_loss_2": 2476.5026977539064, "kl_loss_3": 1996.3927185058594, "kl_loss_7": 769.8516876220704, "learning_rate": 0.0009072879760251679, "loss": 1387.9949, "step": 2050 }, { "ce_loss_10": 3.5858229279518126, "ce_loss_13": 3.475198233127594, "ce_loss_2": 4.739975643157959, "ce_loss_3": 4.475312519073486, "ce_loss_7": 3.834290158748627, "epoch": 0.206, "grad_norm": 700.0, "kl_loss_10": 239.9431396484375, "kl_loss_2": 2570.9485107421874, "kl_loss_3": 2100.634240722656, "kl_loss_7": 789.2198791503906, "learning_rate": 0.0009063655720661341, "loss": 1402.2605, "step": 2060 }, { "ce_loss_10": 3.6313581228256226, "ce_loss_13": 3.5262081384658814, "ce_loss_2": 4.7349327325820925, "ce_loss_3": 4.470538520812989, "ce_loss_7": 3.864632213115692, "epoch": 0.207, "grad_norm": 580.0, "kl_loss_10": 238.97062911987305, "kl_loss_2": 2454.8896240234376, "kl_loss_3": 1987.1748107910157, "kl_loss_7": 776.5097869873047, "learning_rate": 0.000905439076010301, "loss": 1376.7035, "step": 2070 }, { "ce_loss_10": 3.5894328594207763, "ce_loss_13": 3.4751851201057433, "ce_loss_2": 4.723314690589905, "ce_loss_3": 4.451727390289307, "ce_loss_7": 3.830363655090332, "epoch": 0.208, "grad_norm": 620.0, "kl_loss_10": 243.43872604370117, "kl_loss_2": 2525.0844848632814, "kl_loss_3": 2046.1018615722655, "kl_loss_7": 793.8133911132812, "learning_rate": 0.0009045084971874737, "loss": 1367.5893, "step": 2080 }, { "ce_loss_10": 3.5676583290100097, "ce_loss_13": 3.452693998813629, "ce_loss_2": 4.699956917762757, "ce_loss_3": 4.424242115020752, "ce_loss_7": 3.806007480621338, "epoch": 0.209, "grad_norm": 688.0, "kl_loss_10": 249.41274871826172, "kl_loss_2": 2529.7607299804686, "kl_loss_3": 2042.927227783203, "kl_loss_7": 789.6784515380859, "learning_rate": 0.0009035738449685707, "loss": 1418.6186, "step": 2090 }, { "ce_loss_10": 3.510753297805786, "ce_loss_13": 3.3990254640579223, "ce_loss_2": 4.691071200370788, "ce_loss_3": 4.41790235042572, "ce_loss_7": 3.7591400265693666, "epoch": 0.21, "grad_norm": 600.0, "kl_loss_10": 248.95298919677734, "kl_loss_2": 2601.993273925781, "kl_loss_3": 2124.828485107422, "kl_loss_7": 799.376498413086, "learning_rate": 0.0009026351287655293, "loss": 1399.0971, "step": 2100 }, { "ce_loss_10": 3.697406494617462, "ce_loss_13": 3.5970078110694885, "ce_loss_2": 4.7389151096344, "ce_loss_3": 4.481091260910034, "ce_loss_7": 3.9209010362625123, "epoch": 0.211, "grad_norm": 600.0, "kl_loss_10": 229.3176498413086, "kl_loss_2": 2353.4455688476564, "kl_loss_3": 1885.3362854003906, "kl_loss_7": 749.6781646728516, "learning_rate": 0.0009016923580312113, "loss": 1321.2097, "step": 2110 }, { "ce_loss_10": 3.565862810611725, "ce_loss_13": 3.4591265320777893, "ce_loss_2": 4.665999031066894, "ce_loss_3": 4.391572332382202, "ce_loss_7": 3.7967191696166993, "epoch": 0.212, "grad_norm": 732.0, "kl_loss_10": 243.075350189209, "kl_loss_2": 2458.2255859375, "kl_loss_3": 1975.4440124511718, "kl_loss_7": 771.7640777587891, "learning_rate": 0.0009007455422593077, "loss": 1392.0321, "step": 2120 }, { "ce_loss_10": 3.574350452423096, "ce_loss_13": 3.4604103803634643, "ce_loss_2": 4.7152410507202145, "ce_loss_3": 4.439797115325928, "ce_loss_7": 3.8057913303375246, "epoch": 0.213, "grad_norm": 652.0, "kl_loss_10": 251.99988250732423, "kl_loss_2": 2551.55615234375, "kl_loss_3": 2068.113671875, "kl_loss_7": 789.3677795410156, "learning_rate": 0.0008997946909842425, "loss": 1402.5921, "step": 2130 }, { "ce_loss_10": 3.592576038837433, "ce_loss_13": 3.476356315612793, "ce_loss_2": 4.7715356826782225, "ce_loss_3": 4.504428267478943, "ce_loss_7": 3.843649852275848, "epoch": 0.214, "grad_norm": 660.0, "kl_loss_10": 255.3404312133789, "kl_loss_2": 2625.214599609375, "kl_loss_3": 2155.4658203125, "kl_loss_7": 813.4428436279297, "learning_rate": 0.0008988398137810777, "loss": 1403.5207, "step": 2140 }, { "ce_loss_10": 3.620520067214966, "ce_loss_13": 3.513581109046936, "ce_loss_2": 4.717863583564759, "ce_loss_3": 4.442376029491425, "ce_loss_7": 3.8534181356430053, "epoch": 0.215, "grad_norm": 700.0, "kl_loss_10": 239.26677551269532, "kl_loss_2": 2448.3839477539063, "kl_loss_3": 1962.8316284179687, "kl_loss_7": 763.2356109619141, "learning_rate": 0.0008978809202654162, "loss": 1354.8944, "step": 2150 }, { "ce_loss_10": 3.593782067298889, "ce_loss_13": 3.4892191767692564, "ce_loss_2": 4.713660454750061, "ce_loss_3": 4.43155483007431, "ce_loss_7": 3.8341444969177245, "epoch": 0.216, "grad_norm": 640.0, "kl_loss_10": 237.50842971801757, "kl_loss_2": 2454.586071777344, "kl_loss_3": 1970.583270263672, "kl_loss_7": 773.5592163085937, "learning_rate": 0.0008969180200933046, "loss": 1383.4818, "step": 2160 }, { "ce_loss_10": 3.56014689207077, "ce_loss_13": 3.4516719341278077, "ce_loss_2": 4.715594840049744, "ce_loss_3": 4.431590890884399, "ce_loss_7": 3.8131863117218017, "epoch": 0.217, "grad_norm": 712.0, "kl_loss_10": 241.1098258972168, "kl_loss_2": 2533.49033203125, "kl_loss_3": 2041.2841003417968, "kl_loss_7": 799.241552734375, "learning_rate": 0.0008959511229611376, "loss": 1406.9449, "step": 2170 }, { "ce_loss_10": 3.634247362613678, "ce_loss_13": 3.529753494262695, "ce_loss_2": 4.747422552108764, "ce_loss_3": 4.480298018455505, "ce_loss_7": 3.8834722995758058, "epoch": 0.218, "grad_norm": 744.0, "kl_loss_10": 231.06951522827148, "kl_loss_2": 2480.240673828125, "kl_loss_3": 2003.8335388183593, "kl_loss_7": 794.5106719970703, "learning_rate": 0.0008949802386055581, "loss": 1379.2705, "step": 2180 }, { "ce_loss_10": 3.4931302070617676, "ce_loss_13": 3.3903717041015624, "ce_loss_2": 4.634695625305175, "ce_loss_3": 4.343023872375488, "ce_loss_7": 3.735668647289276, "epoch": 0.219, "grad_norm": 704.0, "kl_loss_10": 229.31054229736327, "kl_loss_2": 2487.9470336914064, "kl_loss_3": 1978.2749877929687, "kl_loss_7": 772.9935424804687, "learning_rate": 0.0008940053768033609, "loss": 1398.8061, "step": 2190 }, { "ce_loss_10": 3.579288733005524, "ce_loss_13": 3.476969850063324, "ce_loss_2": 4.679602265357971, "ce_loss_3": 4.408792352676391, "ce_loss_7": 3.818285346031189, "epoch": 0.22, "grad_norm": 648.0, "kl_loss_10": 225.21361923217773, "kl_loss_2": 2457.1766845703123, "kl_loss_3": 1985.3013549804687, "kl_loss_7": 762.693115234375, "learning_rate": 0.0008930265473713938, "loss": 1358.0689, "step": 2200 }, { "ce_loss_10": 3.5425936341285706, "ce_loss_13": 3.437610614299774, "ce_loss_2": 4.679268145561219, "ce_loss_3": 4.395039463043213, "ce_loss_7": 3.7786786198616027, "epoch": 0.221, "grad_norm": 624.0, "kl_loss_10": 227.02418670654296, "kl_loss_2": 2514.80498046875, "kl_loss_3": 2012.999462890625, "kl_loss_7": 766.7205108642578, "learning_rate": 0.0008920437601664579, "loss": 1344.9316, "step": 2210 }, { "ce_loss_10": 3.5330151677131654, "ce_loss_13": 3.4283509850502014, "ce_loss_2": 4.65971040725708, "ce_loss_3": 4.389861440658569, "ce_loss_7": 3.7775445342063905, "epoch": 0.222, "grad_norm": 728.0, "kl_loss_10": 231.53972396850585, "kl_loss_2": 2495.336804199219, "kl_loss_3": 2020.2352600097656, "kl_loss_7": 785.6470977783204, "learning_rate": 0.0008910570250852097, "loss": 1358.0102, "step": 2220 }, { "ce_loss_10": 3.6386430144309996, "ce_loss_13": 3.5394553184509276, "ce_loss_2": 4.721383547782898, "ce_loss_3": 4.441709399223328, "ce_loss_7": 3.8573225855827333, "epoch": 0.223, "grad_norm": 656.0, "kl_loss_10": 222.80670547485352, "kl_loss_2": 2415.298693847656, "kl_loss_3": 1914.3474975585937, "kl_loss_7": 735.9223663330079, "learning_rate": 0.0008900663520640604, "loss": 1330.9881, "step": 2230 }, { "ce_loss_10": 3.5963090658187866, "ce_loss_13": 3.4863692045211794, "ce_loss_2": 4.697564601898193, "ce_loss_3": 4.4291857242584225, "ce_loss_7": 3.8206969499588013, "epoch": 0.224, "grad_norm": 616.0, "kl_loss_10": 232.82473220825196, "kl_loss_2": 2436.1601440429686, "kl_loss_3": 1975.5118774414063, "kl_loss_7": 746.4637390136719, "learning_rate": 0.0008890717510790764, "loss": 1355.2247, "step": 2240 }, { "ce_loss_10": 3.550048661231995, "ce_loss_13": 3.444666588306427, "ce_loss_2": 4.6846558332443236, "ce_loss_3": 4.415020489692688, "ce_loss_7": 3.7784482836723328, "epoch": 0.225, "grad_norm": 748.0, "kl_loss_10": 234.0259765625, "kl_loss_2": 2511.7267456054688, "kl_loss_3": 2033.7661254882812, "kl_loss_7": 757.7471649169922, "learning_rate": 0.0008880732321458784, "loss": 1391.5023, "step": 2250 }, { "ce_loss_10": 3.5846696734428405, "ce_loss_13": 3.475912594795227, "ce_loss_2": 4.6859821557998655, "ce_loss_3": 4.403112530708313, "ce_loss_7": 3.8075541138648985, "epoch": 0.226, "grad_norm": 768.0, "kl_loss_10": 241.0058906555176, "kl_loss_2": 2434.0427978515627, "kl_loss_3": 1946.942852783203, "kl_loss_7": 750.951953125, "learning_rate": 0.0008870708053195413, "loss": 1371.0441, "step": 2260 }, { "ce_loss_10": 3.6066513299942016, "ce_loss_13": 3.5011353135108947, "ce_loss_2": 4.688438081741333, "ce_loss_3": 4.419037127494812, "ce_loss_7": 3.8243068933486937, "epoch": 0.227, "grad_norm": 612.0, "kl_loss_10": 236.37487716674804, "kl_loss_2": 2419.1595703125, "kl_loss_3": 1947.7892822265626, "kl_loss_7": 736.9884735107422, "learning_rate": 0.0008860644806944918, "loss": 1346.316, "step": 2270 }, { "ce_loss_10": 3.5470305681228638, "ce_loss_13": 3.4408384203910827, "ce_loss_2": 4.675415754318237, "ce_loss_3": 4.405515837669372, "ce_loss_7": 3.7811434388160707, "epoch": 0.228, "grad_norm": 712.0, "kl_loss_10": 236.5175895690918, "kl_loss_2": 2511.8415283203126, "kl_loss_3": 2041.5828552246094, "kl_loss_7": 773.2159851074218, "learning_rate": 0.0008850542684044079, "loss": 1347.2301, "step": 2280 }, { "ce_loss_10": 3.525200033187866, "ce_loss_13": 3.4121009707450867, "ce_loss_2": 4.704805684089661, "ce_loss_3": 4.428252863883972, "ce_loss_7": 3.7681017994880674, "epoch": 0.229, "grad_norm": 744.0, "kl_loss_10": 243.2204231262207, "kl_loss_2": 2609.259875488281, "kl_loss_3": 2137.3057250976562, "kl_loss_7": 781.1770416259766, "learning_rate": 0.0008840401786221159, "loss": 1392.1494, "step": 2290 }, { "ce_loss_10": 3.644639456272125, "ce_loss_13": 3.546596646308899, "ce_loss_2": 4.720036673545837, "ce_loss_3": 4.461656093597412, "ce_loss_7": 3.8639742493629456, "epoch": 0.23, "grad_norm": 736.0, "kl_loss_10": 221.5949806213379, "kl_loss_2": 2383.692004394531, "kl_loss_3": 1920.7404052734375, "kl_loss_7": 726.6697357177734, "learning_rate": 0.000883022221559489, "loss": 1309.8631, "step": 2300 }, { "ce_loss_10": 3.6106560468673705, "ce_loss_13": 3.5103928685188293, "ce_loss_2": 4.718800568580628, "ce_loss_3": 4.453631711006165, "ce_loss_7": 3.833037328720093, "epoch": 0.231, "grad_norm": 668.0, "kl_loss_10": 224.89765014648438, "kl_loss_2": 2469.4252197265623, "kl_loss_3": 2018.495166015625, "kl_loss_7": 748.8079467773438, "learning_rate": 0.0008820004074673434, "loss": 1405.4977, "step": 2310 }, { "ce_loss_10": 3.509887623786926, "ce_loss_13": 3.4120625376701357, "ce_loss_2": 4.630102276802063, "ce_loss_3": 4.358427214622497, "ce_loss_7": 3.748315227031708, "epoch": 0.232, "grad_norm": 604.0, "kl_loss_10": 223.46416931152345, "kl_loss_2": 2484.790771484375, "kl_loss_3": 2005.2869995117187, "kl_loss_7": 761.2884399414063, "learning_rate": 0.0008809747466353355, "loss": 1341.5085, "step": 2320 }, { "ce_loss_10": 3.522110950946808, "ce_loss_13": 3.4228403091430666, "ce_loss_2": 4.653229188919068, "ce_loss_3": 4.378945517539978, "ce_loss_7": 3.7502294540405274, "epoch": 0.233, "grad_norm": 744.0, "kl_loss_10": 224.23116912841797, "kl_loss_2": 2499.1381958007814, "kl_loss_3": 2020.5157836914063, "kl_loss_7": 752.2868743896485, "learning_rate": 0.0008799452493918585, "loss": 1366.2092, "step": 2330 }, { "ce_loss_10": 3.600525939464569, "ce_loss_13": 3.501133692264557, "ce_loss_2": 4.698138499259949, "ce_loss_3": 4.4309428334236145, "ce_loss_7": 3.8393119096755983, "epoch": 0.234, "grad_norm": 656.0, "kl_loss_10": 221.8571762084961, "kl_loss_2": 2452.500280761719, "kl_loss_3": 1976.1439636230468, "kl_loss_7": 759.1389068603515, "learning_rate": 0.0008789119261039385, "loss": 1400.5569, "step": 2340 }, { "ce_loss_10": 3.5126537322998046, "ce_loss_13": 3.412049424648285, "ce_loss_2": 4.627605974674225, "ce_loss_3": 4.359820437431336, "ce_loss_7": 3.747655713558197, "epoch": 0.235, "grad_norm": 584.0, "kl_loss_10": 220.69495086669923, "kl_loss_2": 2450.3417724609376, "kl_loss_3": 1979.037158203125, "kl_loss_7": 752.3414123535156, "learning_rate": 0.0008778747871771292, "loss": 1338.277, "step": 2350 }, { "ce_loss_10": 3.5650462746620177, "ce_loss_13": 3.4650426387786863, "ce_loss_2": 4.640904521942138, "ce_loss_3": 4.3729163646698, "ce_loss_7": 3.78610600233078, "epoch": 0.236, "grad_norm": 628.0, "kl_loss_10": 215.22831954956055, "kl_loss_2": 2399.6547119140623, "kl_loss_3": 1925.4503356933594, "kl_loss_7": 727.8779388427735, "learning_rate": 0.0008768338430554083, "loss": 1316.2055, "step": 2360 }, { "ce_loss_10": 3.572676420211792, "ce_loss_13": 3.4714962005615235, "ce_loss_2": 4.678735136985779, "ce_loss_3": 4.39429270029068, "ce_loss_7": 3.8077693939208985, "epoch": 0.237, "grad_norm": 688.0, "kl_loss_10": 226.92397766113282, "kl_loss_2": 2426.2300659179687, "kl_loss_3": 1939.4405090332032, "kl_loss_7": 752.637564086914, "learning_rate": 0.0008757891042210713, "loss": 1346.3338, "step": 2370 }, { "ce_loss_10": 3.592969560623169, "ce_loss_13": 3.493350553512573, "ce_loss_2": 4.688189601898193, "ce_loss_3": 4.413512086868286, "ce_loss_7": 3.821557307243347, "epoch": 0.238, "grad_norm": 656.0, "kl_loss_10": 225.66336822509766, "kl_loss_2": 2421.9510131835937, "kl_loss_3": 1946.20556640625, "kl_loss_7": 745.2722961425782, "learning_rate": 0.0008747405811946271, "loss": 1343.8345, "step": 2380 }, { "ce_loss_10": 3.49123694896698, "ce_loss_13": 3.389770042896271, "ce_loss_2": 4.654137110710144, "ce_loss_3": 4.386571860313415, "ce_loss_7": 3.731127667427063, "epoch": 0.239, "grad_norm": 616.0, "kl_loss_10": 230.47370223999025, "kl_loss_2": 2561.850231933594, "kl_loss_3": 2084.1000549316404, "kl_loss_7": 769.9209930419922, "learning_rate": 0.0008736882845346905, "loss": 1355.4398, "step": 2390 }, { "ce_loss_10": 3.5909661054611206, "ce_loss_13": 3.4839738249778747, "ce_loss_2": 4.705090403556824, "ce_loss_3": 4.426928949356079, "ce_loss_7": 3.8166149973869326, "epoch": 0.24, "grad_norm": 652.0, "kl_loss_10": 232.27595291137695, "kl_loss_2": 2463.9607543945312, "kl_loss_3": 1976.524102783203, "kl_loss_7": 748.5501831054687, "learning_rate": 0.0008726322248378774, "loss": 1350.1158, "step": 2400 }, { "ce_loss_10": 3.5857128262519837, "ce_loss_13": 3.485344707965851, "ce_loss_2": 4.720745325088501, "ce_loss_3": 4.446980690956115, "ce_loss_7": 3.815141475200653, "epoch": 0.241, "grad_norm": 620.0, "kl_loss_10": 225.08902893066406, "kl_loss_2": 2502.8332275390626, "kl_loss_3": 2020.9147888183593, "kl_loss_7": 748.0698608398437, "learning_rate": 0.0008715724127386971, "loss": 1388.577, "step": 2410 }, { "ce_loss_10": 3.656253182888031, "ce_loss_13": 3.5548863530159, "ce_loss_2": 4.740737318992615, "ce_loss_3": 4.4647379398345945, "ce_loss_7": 3.869425129890442, "epoch": 0.242, "grad_norm": 656.0, "kl_loss_10": 233.72190628051757, "kl_loss_2": 2420.5750244140627, "kl_loss_3": 1941.4000915527345, "kl_loss_7": 733.7942932128906, "learning_rate": 0.0008705088589094458, "loss": 1349.3883, "step": 2420 }, { "ce_loss_10": 3.6831162333488465, "ce_loss_13": 3.5650919318199157, "ce_loss_2": 4.759288740158081, "ce_loss_3": 4.490408134460449, "ce_loss_7": 3.8880489230155946, "epoch": 0.243, "grad_norm": 640.0, "kl_loss_10": 258.1027114868164, "kl_loss_2": 2453.8090209960938, "kl_loss_3": 1977.7547729492187, "kl_loss_7": 746.0192138671875, "learning_rate": 0.0008694415740600988, "loss": 1371.979, "step": 2430 }, { "ce_loss_10": 3.539147210121155, "ce_loss_13": 3.418752908706665, "ce_loss_2": 4.6640907526016235, "ce_loss_3": 4.396868014335633, "ce_loss_7": 3.753141713142395, "epoch": 0.244, "grad_norm": 720.0, "kl_loss_10": 272.4710403442383, "kl_loss_2": 2511.5777099609377, "kl_loss_3": 2045.4482543945312, "kl_loss_7": 744.3600494384766, "learning_rate": 0.0008683705689382025, "loss": 1374.2081, "step": 2440 }, { "ce_loss_10": 3.614233338832855, "ce_loss_13": 3.502686250209808, "ce_loss_2": 4.680193209648133, "ce_loss_3": 4.409785914421081, "ce_loss_7": 3.81562682390213, "epoch": 0.245, "grad_norm": 680.0, "kl_loss_10": 242.92661514282227, "kl_loss_2": 2418.696484375, "kl_loss_3": 1945.9917602539062, "kl_loss_7": 727.0407897949219, "learning_rate": 0.0008672958543287666, "loss": 1361.5771, "step": 2450 }, { "ce_loss_10": 3.6207616090774537, "ce_loss_13": 3.5146057486534117, "ce_loss_2": 4.6799437522888185, "ce_loss_3": 4.408400678634644, "ce_loss_7": 3.8393305063247682, "epoch": 0.246, "grad_norm": 640.0, "kl_loss_10": 233.26868438720703, "kl_loss_2": 2373.7197509765624, "kl_loss_3": 1900.9493347167968, "kl_loss_7": 737.9279724121094, "learning_rate": 0.0008662174410541554, "loss": 1323.3875, "step": 2460 }, { "ce_loss_10": 3.5795403718948364, "ce_loss_13": 3.4791687726974487, "ce_loss_2": 4.657073163986206, "ce_loss_3": 4.389124321937561, "ce_loss_7": 3.797624135017395, "epoch": 0.247, "grad_norm": 688.0, "kl_loss_10": 228.68382720947267, "kl_loss_2": 2405.7741943359374, "kl_loss_3": 1929.0893249511719, "kl_loss_7": 730.4046020507812, "learning_rate": 0.0008651353399739787, "loss": 1361.2713, "step": 2470 }, { "ce_loss_10": 3.6015311241149903, "ce_loss_13": 3.5007805585861207, "ce_loss_2": 4.693076491355896, "ce_loss_3": 4.420244932174683, "ce_loss_7": 3.8255343675613402, "epoch": 0.248, "grad_norm": 628.0, "kl_loss_10": 225.77268676757814, "kl_loss_2": 2413.6783447265625, "kl_loss_3": 1937.1076232910157, "kl_loss_7": 735.3206512451172, "learning_rate": 0.0008640495619849821, "loss": 1345.3404, "step": 2480 }, { "ce_loss_10": 3.5668503522872923, "ce_loss_13": 3.4637187004089354, "ce_loss_2": 4.644854807853699, "ce_loss_3": 4.374481606483459, "ce_loss_7": 3.791785490512848, "epoch": 0.249, "grad_norm": 616.0, "kl_loss_10": 223.47670059204103, "kl_loss_2": 2406.82578125, "kl_loss_3": 1930.5429321289062, "kl_loss_7": 738.2828582763672, "learning_rate": 0.0008629601180209381, "loss": 1326.733, "step": 2490 }, { "ce_loss_10": 3.5605925559997558, "ce_loss_13": 3.4623565435409547, "ce_loss_2": 4.648912143707276, "ce_loss_3": 4.37358832359314, "ce_loss_7": 3.7822588205337526, "epoch": 0.25, "grad_norm": 588.0, "kl_loss_10": 221.60515823364258, "kl_loss_2": 2408.634729003906, "kl_loss_3": 1918.1406311035157, "kl_loss_7": 733.2383361816406, "learning_rate": 0.000861867019052535, "loss": 1350.9314, "step": 2500 }, { "ce_loss_10": 3.4750850677490233, "ce_loss_13": 3.3757749915122988, "ce_loss_2": 4.618335509300232, "ce_loss_3": 4.344382691383362, "ce_loss_7": 3.7118528127670287, "epoch": 0.251, "grad_norm": 664.0, "kl_loss_10": 225.6886344909668, "kl_loss_2": 2520.0691040039064, "kl_loss_3": 2028.4780883789062, "kl_loss_7": 750.8930267333984, "learning_rate": 0.0008607702760872678, "loss": 1377.2211, "step": 2510 }, { "ce_loss_10": 3.5948320031166077, "ce_loss_13": 3.493862783908844, "ce_loss_2": 4.663220858573913, "ce_loss_3": 4.39898452758789, "ce_loss_7": 3.8143251180648803, "epoch": 0.252, "grad_norm": 736.0, "kl_loss_10": 220.9385528564453, "kl_loss_2": 2382.33095703125, "kl_loss_3": 1919.1317260742187, "kl_loss_7": 728.4733703613281, "learning_rate": 0.0008596699001693256, "loss": 1356.6151, "step": 2520 }, { "ce_loss_10": 3.6045937299728394, "ce_loss_13": 3.5089424014091493, "ce_loss_2": 4.674148344993592, "ce_loss_3": 4.401587581634521, "ce_loss_7": 3.8156301379203796, "epoch": 0.253, "grad_norm": 664.0, "kl_loss_10": 222.60222702026368, "kl_loss_2": 2399.647021484375, "kl_loss_3": 1923.217791748047, "kl_loss_7": 722.3135375976562, "learning_rate": 0.0008585659023794818, "loss": 1357.2354, "step": 2530 }, { "ce_loss_10": 3.5605056166648863, "ce_loss_13": 3.458607590198517, "ce_loss_2": 4.6924147605896, "ce_loss_3": 4.421391654014587, "ce_loss_7": 3.799249517917633, "epoch": 0.254, "grad_norm": 660.0, "kl_loss_10": 233.0737617492676, "kl_loss_2": 2499.324670410156, "kl_loss_3": 2030.4549194335937, "kl_loss_7": 761.279296875, "learning_rate": 0.0008574582938349817, "loss": 1364.7606, "step": 2540 }, { "ce_loss_10": 3.5620136737823485, "ce_loss_13": 3.450424087047577, "ce_loss_2": 4.679884123802185, "ce_loss_3": 4.403433465957642, "ce_loss_7": 3.8059414982795716, "epoch": 0.255, "grad_norm": 648.0, "kl_loss_10": 238.74318084716796, "kl_loss_2": 2486.331640625, "kl_loss_3": 1999.8115600585938, "kl_loss_7": 776.2368225097656, "learning_rate": 0.0008563470856894315, "loss": 1329.6849, "step": 2550 }, { "ce_loss_10": 3.540405642986298, "ce_loss_13": 3.4457826972007752, "ce_loss_2": 4.656697821617127, "ce_loss_3": 4.386443245410919, "ce_loss_7": 3.772416353225708, "epoch": 0.256, "grad_norm": 760.0, "kl_loss_10": 221.72702865600587, "kl_loss_2": 2443.3952514648436, "kl_loss_3": 1969.1475952148437, "kl_loss_7": 745.7592987060547, "learning_rate": 0.0008552322891326845, "loss": 1346.8541, "step": 2560 }, { "ce_loss_10": 3.5136868953704834, "ce_loss_13": 3.415074276924133, "ce_loss_2": 4.637244987487793, "ce_loss_3": 4.365770423412323, "ce_loss_7": 3.741610062122345, "epoch": 0.257, "grad_norm": 788.0, "kl_loss_10": 218.68516159057617, "kl_loss_2": 2477.789599609375, "kl_loss_3": 2001.3069702148437, "kl_loss_7": 743.3714080810547, "learning_rate": 0.0008541139153907296, "loss": 1329.1979, "step": 2570 }, { "ce_loss_10": 3.472187507152557, "ce_loss_13": 3.3729379415512084, "ce_loss_2": 4.581104445457458, "ce_loss_3": 4.308674609661102, "ce_loss_7": 3.69760080575943, "epoch": 0.258, "grad_norm": 636.0, "kl_loss_10": 213.4689498901367, "kl_loss_2": 2453.299768066406, "kl_loss_3": 1976.8992919921875, "kl_loss_7": 745.6326965332031, "learning_rate": 0.0008529919757255782, "loss": 1354.7893, "step": 2580 }, { "ce_loss_10": 3.500008797645569, "ce_loss_13": 3.408738708496094, "ce_loss_2": 4.560009336471557, "ce_loss_3": 4.2931175351142885, "ce_loss_7": 3.716734218597412, "epoch": 0.259, "grad_norm": 624.0, "kl_loss_10": 208.80025100708008, "kl_loss_2": 2371.1708251953123, "kl_loss_3": 1897.6802124023438, "kl_loss_7": 721.6227478027344, "learning_rate": 0.0008518664814351503, "loss": 1306.301, "step": 2590 }, { "ce_loss_10": 3.472637712955475, "ce_loss_13": 3.37472482919693, "ce_loss_2": 4.598471093177795, "ce_loss_3": 4.321799778938294, "ce_loss_7": 3.7131651520729063, "epoch": 0.26, "grad_norm": 644.0, "kl_loss_10": 222.20911254882813, "kl_loss_2": 2491.116162109375, "kl_loss_3": 2007.4335876464843, "kl_loss_7": 764.1704193115235, "learning_rate": 0.0008507374438531607, "loss": 1407.2535, "step": 2600 }, { "ce_loss_10": 3.447394275665283, "ce_loss_13": 3.3539512395858764, "ce_loss_2": 4.5548292875289915, "ce_loss_3": 4.286789774894714, "ce_loss_7": 3.6768516659736634, "epoch": 0.261, "grad_norm": 676.0, "kl_loss_10": 214.65092697143555, "kl_loss_2": 2437.03447265625, "kl_loss_3": 1973.9089477539062, "kl_loss_7": 738.8113952636719, "learning_rate": 0.0008496048743490053, "loss": 1332.7279, "step": 2610 }, { "ce_loss_10": 3.597834813594818, "ce_loss_13": 3.5061428785324096, "ce_loss_2": 4.655121803283691, "ce_loss_3": 4.391561770439148, "ce_loss_7": 3.814839816093445, "epoch": 0.262, "grad_norm": 564.0, "kl_loss_10": 212.99711074829102, "kl_loss_2": 2362.529577636719, "kl_loss_3": 1891.9757995605469, "kl_loss_7": 720.1662811279297, "learning_rate": 0.0008484687843276469, "loss": 1316.5832, "step": 2620 }, { "ce_loss_10": 3.533200740814209, "ce_loss_13": 3.4373727798461915, "ce_loss_2": 4.636826205253601, "ce_loss_3": 4.3528993129730225, "ce_loss_7": 3.7636064171791075, "epoch": 0.263, "grad_norm": 688.0, "kl_loss_10": 217.95888977050782, "kl_loss_2": 2432.091143798828, "kl_loss_3": 1936.0632568359374, "kl_loss_7": 738.968881225586, "learning_rate": 0.0008473291852294987, "loss": 1361.4943, "step": 2630 }, { "ce_loss_10": 3.5451728224754335, "ce_loss_13": 3.446604347229004, "ce_loss_2": 4.630346298217773, "ce_loss_3": 4.3619812488555905, "ce_loss_7": 3.7699208855628967, "epoch": 0.264, "grad_norm": 672.0, "kl_loss_10": 220.66769561767578, "kl_loss_2": 2436.2069458007813, "kl_loss_3": 1956.8639526367188, "kl_loss_7": 742.7248840332031, "learning_rate": 0.0008461860885303114, "loss": 1327.3721, "step": 2640 }, { "ce_loss_10": 3.5666414141654967, "ce_loss_13": 3.4715107679367065, "ce_loss_2": 4.639662265777588, "ce_loss_3": 4.371685028076172, "ce_loss_7": 3.788040292263031, "epoch": 0.265, "grad_norm": 656.0, "kl_loss_10": 216.69636611938478, "kl_loss_2": 2373.723107910156, "kl_loss_3": 1899.1220764160157, "kl_loss_7": 725.1952423095703, "learning_rate": 0.000845039505741056, "loss": 1327.8555, "step": 2650 }, { "ce_loss_10": 3.5541250467300416, "ce_loss_13": 3.4555353045463564, "ce_loss_2": 4.645513963699341, "ce_loss_3": 4.378093981742859, "ce_loss_7": 3.7833709001541136, "epoch": 0.266, "grad_norm": 668.0, "kl_loss_10": 224.05798721313477, "kl_loss_2": 2449.707385253906, "kl_loss_3": 1967.4787109375, "kl_loss_7": 750.5478302001953, "learning_rate": 0.0008438894484078086, "loss": 1378.657, "step": 2660 }, { "ce_loss_10": 3.557729125022888, "ce_loss_13": 3.4628395080566405, "ce_loss_2": 4.638984179496765, "ce_loss_3": 4.374520492553711, "ce_loss_7": 3.7801038026809692, "epoch": 0.267, "grad_norm": 796.0, "kl_loss_10": 218.22870254516602, "kl_loss_2": 2393.3899047851564, "kl_loss_3": 1931.0333312988282, "kl_loss_7": 732.3969909667969, "learning_rate": 0.0008427359281116334, "loss": 1329.4188, "step": 2670 }, { "ce_loss_10": 3.4619020819664, "ce_loss_13": 3.3649930715560914, "ce_loss_2": 4.586506628990174, "ce_loss_3": 4.3114288449287415, "ce_loss_7": 3.6977506399154665, "epoch": 0.268, "grad_norm": 560.0, "kl_loss_10": 218.7227699279785, "kl_loss_2": 2471.7220703125, "kl_loss_3": 1986.8973815917968, "kl_loss_7": 744.8811431884766, "learning_rate": 0.0008415789564684673, "loss": 1344.4947, "step": 2680 }, { "ce_loss_10": 3.7084735155105593, "ce_loss_13": 3.610187065601349, "ce_loss_2": 4.759761667251587, "ce_loss_3": 4.487373423576355, "ce_loss_7": 3.9243152022361754, "epoch": 0.269, "grad_norm": 756.0, "kl_loss_10": 223.18955688476564, "kl_loss_2": 2329.3449951171874, "kl_loss_3": 1847.8426208496094, "kl_loss_7": 721.1707153320312, "learning_rate": 0.0008404185451290017, "loss": 1296.1146, "step": 2690 }, { "ce_loss_10": 3.578732097148895, "ce_loss_13": 3.4770421504974367, "ce_loss_2": 4.659151983261109, "ce_loss_3": 4.38085663318634, "ce_loss_7": 3.7948765754699707, "epoch": 0.27, "grad_norm": 692.0, "kl_loss_10": 224.61487731933593, "kl_loss_2": 2417.559912109375, "kl_loss_3": 1939.3710815429688, "kl_loss_7": 727.4687561035156, "learning_rate": 0.0008392547057785661, "loss": 1317.3512, "step": 2700 }, { "ce_loss_10": 3.5002851486206055, "ce_loss_13": 3.396597516536713, "ce_loss_2": 4.633592844009399, "ce_loss_3": 4.365511727333069, "ce_loss_7": 3.738453209400177, "epoch": 0.271, "grad_norm": 732.0, "kl_loss_10": 231.73975296020507, "kl_loss_2": 2517.132354736328, "kl_loss_3": 2044.1573425292968, "kl_loss_7": 768.5197204589844, "learning_rate": 0.0008380874501370098, "loss": 1329.0642, "step": 2710 }, { "ce_loss_10": 3.5027819752693174, "ce_loss_13": 3.4010127544403077, "ce_loss_2": 4.628546047210693, "ce_loss_3": 4.359855842590332, "ce_loss_7": 3.7310682773590087, "epoch": 0.272, "grad_norm": 628.0, "kl_loss_10": 236.13679275512695, "kl_loss_2": 2503.883825683594, "kl_loss_3": 2020.1560424804688, "kl_loss_7": 758.8711700439453, "learning_rate": 0.0008369167899585841, "loss": 1363.7068, "step": 2720 }, { "ce_loss_10": 3.6181455850601196, "ce_loss_13": 3.521961879730225, "ce_loss_2": 4.664963984489441, "ce_loss_3": 4.396141123771668, "ce_loss_7": 3.839101779460907, "epoch": 0.273, "grad_norm": 636.0, "kl_loss_10": 223.16615371704103, "kl_loss_2": 2348.37099609375, "kl_loss_3": 1879.9346130371093, "kl_loss_7": 730.2560852050781, "learning_rate": 0.0008357427370318238, "loss": 1337.943, "step": 2730 }, { "ce_loss_10": 3.571904718875885, "ce_loss_13": 3.4762736320495606, "ce_loss_2": 4.677034759521485, "ce_loss_3": 4.40289398431778, "ce_loss_7": 3.7918145298957824, "epoch": 0.274, "grad_norm": 772.0, "kl_loss_10": 222.57760772705078, "kl_loss_2": 2451.346435546875, "kl_loss_3": 1973.4313354492188, "kl_loss_7": 730.7371429443359, "learning_rate": 0.0008345653031794292, "loss": 1347.6243, "step": 2740 }, { "ce_loss_10": 3.5737530469894407, "ce_loss_13": 3.4740692615509032, "ce_loss_2": 4.659031462669373, "ce_loss_3": 4.387771344184875, "ce_loss_7": 3.792672348022461, "epoch": 0.275, "grad_norm": 672.0, "kl_loss_10": 222.67840805053712, "kl_loss_2": 2406.277941894531, "kl_loss_3": 1924.3234985351562, "kl_loss_7": 730.7620574951172, "learning_rate": 0.0008333845002581458, "loss": 1320.2523, "step": 2750 }, { "ce_loss_10": 3.498860251903534, "ce_loss_13": 3.400104033946991, "ce_loss_2": 4.611243772506714, "ce_loss_3": 4.342458128929138, "ce_loss_7": 3.733369469642639, "epoch": 0.276, "grad_norm": 644.0, "kl_loss_10": 224.65963973999024, "kl_loss_2": 2495.7015869140623, "kl_loss_3": 2015.1633422851562, "kl_loss_7": 762.1438781738282, "learning_rate": 0.0008322003401586462, "loss": 1364.4495, "step": 2760 }, { "ce_loss_10": 3.532784569263458, "ce_loss_13": 3.440683197975159, "ce_loss_2": 4.59234881401062, "ce_loss_3": 4.320498394966125, "ce_loss_7": 3.7502055525779725, "epoch": 0.277, "grad_norm": 724.0, "kl_loss_10": 211.5718635559082, "kl_loss_2": 2343.010675048828, "kl_loss_3": 1873.985821533203, "kl_loss_7": 709.5305114746094, "learning_rate": 0.0008310128348054094, "loss": 1276.2701, "step": 2770 }, { "ce_loss_10": 3.5014058470726015, "ce_loss_13": 3.406921911239624, "ce_loss_2": 4.603280448913575, "ce_loss_3": 4.329492771625519, "ce_loss_7": 3.7248639822006226, "epoch": 0.278, "grad_norm": 652.0, "kl_loss_10": 214.84819107055665, "kl_loss_2": 2431.7943481445313, "kl_loss_3": 1951.13515625, "kl_loss_7": 731.5488677978516, "learning_rate": 0.0008298219961566008, "loss": 1329.707, "step": 2780 }, { "ce_loss_10": 3.4713513970375063, "ce_loss_13": 3.3771822571754457, "ce_loss_2": 4.587963104248047, "ce_loss_3": 4.32047404050827, "ce_loss_7": 3.711584746837616, "epoch": 0.279, "grad_norm": 644.0, "kl_loss_10": 217.99566726684571, "kl_loss_2": 2492.9334106445312, "kl_loss_3": 2016.429022216797, "kl_loss_7": 761.9394226074219, "learning_rate": 0.0008286278362039527, "loss": 1336.5162, "step": 2790 }, { "ce_loss_10": 3.496282184123993, "ce_loss_13": 3.3998995065689086, "ce_loss_2": 4.622646689414978, "ce_loss_3": 4.352741932868957, "ce_loss_7": 3.7300979018211367, "epoch": 0.28, "grad_norm": 592.0, "kl_loss_10": 216.96264114379883, "kl_loss_2": 2489.9998046875, "kl_loss_3": 2008.0425537109375, "kl_loss_7": 746.8909149169922, "learning_rate": 0.0008274303669726426, "loss": 1325.7328, "step": 2800 }, { "ce_loss_10": 3.4048958301544188, "ce_loss_13": 3.3045366764068604, "ce_loss_2": 4.5690556287765505, "ce_loss_3": 4.298348617553711, "ce_loss_7": 3.6378442645072937, "epoch": 0.281, "grad_norm": 684.0, "kl_loss_10": 218.18540115356444, "kl_loss_2": 2561.6716186523436, "kl_loss_3": 2080.7119262695314, "kl_loss_7": 743.8994750976562, "learning_rate": 0.0008262296005211721, "loss": 1337.6219, "step": 2810 }, { "ce_loss_10": 3.5260050296783447, "ce_loss_13": 3.428924763202667, "ce_loss_2": 4.642134022712708, "ce_loss_3": 4.368475294113159, "ce_loss_7": 3.7550152063369753, "epoch": 0.282, "grad_norm": 600.0, "kl_loss_10": 216.54320907592773, "kl_loss_2": 2444.2397338867186, "kl_loss_3": 1975.6794677734374, "kl_loss_7": 734.2523712158203, "learning_rate": 0.0008250255489412463, "loss": 1322.247, "step": 2820 }, { "ce_loss_10": 3.629942464828491, "ce_loss_13": 3.532360863685608, "ce_loss_2": 4.7163821935653685, "ce_loss_3": 4.444535660743713, "ce_loss_7": 3.846136474609375, "epoch": 0.283, "grad_norm": 628.0, "kl_loss_10": 214.22548904418946, "kl_loss_2": 2410.5466918945312, "kl_loss_3": 1930.2673034667969, "kl_loss_7": 714.048681640625, "learning_rate": 0.0008238182243576511, "loss": 1325.0883, "step": 2830 }, { "ce_loss_10": 3.5913167357444764, "ce_loss_13": 3.5031124353408813, "ce_loss_2": 4.611292886734009, "ce_loss_3": 4.339277529716492, "ce_loss_7": 3.796242094039917, "epoch": 0.284, "grad_norm": 620.0, "kl_loss_10": 208.4808135986328, "kl_loss_2": 2294.337286376953, "kl_loss_3": 1814.4247924804688, "kl_loss_7": 695.5996673583984, "learning_rate": 0.0008226076389281315, "loss": 1277.3086, "step": 2840 }, { "ce_loss_10": 3.632950210571289, "ce_loss_13": 3.542364180088043, "ce_loss_2": 4.697378945350647, "ce_loss_3": 4.428278470039368, "ce_loss_7": 3.8434852004051208, "epoch": 0.285, "grad_norm": 592.0, "kl_loss_10": 210.92243499755858, "kl_loss_2": 2375.7556274414064, "kl_loss_3": 1902.3470825195313, "kl_loss_7": 701.8125823974609, "learning_rate": 0.0008213938048432696, "loss": 1285.7082, "step": 2850 }, { "ce_loss_10": 3.561896014213562, "ce_loss_13": 3.4673075318336486, "ce_loss_2": 4.635823488235474, "ce_loss_3": 4.3728371381759645, "ce_loss_7": 3.780589020252228, "epoch": 0.286, "grad_norm": 616.0, "kl_loss_10": 216.6977653503418, "kl_loss_2": 2390.834924316406, "kl_loss_3": 1924.6818054199218, "kl_loss_7": 726.8750396728516, "learning_rate": 0.0008201767343263612, "loss": 1324.6124, "step": 2860 }, { "ce_loss_10": 3.4997401237487793, "ce_loss_13": 3.4044744968414307, "ce_loss_2": 4.604890465736389, "ce_loss_3": 4.338030159473419, "ce_loss_7": 3.7291186928749083, "epoch": 0.287, "grad_norm": 616.0, "kl_loss_10": 213.92771530151367, "kl_loss_2": 2444.1182250976562, "kl_loss_3": 1971.163818359375, "kl_loss_7": 731.3478240966797, "learning_rate": 0.0008189564396332927, "loss": 1291.9086, "step": 2870 }, { "ce_loss_10": 3.480617916584015, "ce_loss_13": 3.388473629951477, "ce_loss_2": 4.600887513160705, "ce_loss_3": 4.323178672790528, "ce_loss_7": 3.7104127168655396, "epoch": 0.288, "grad_norm": 668.0, "kl_loss_10": 212.88904190063477, "kl_loss_2": 2441.765899658203, "kl_loss_3": 1961.8893615722657, "kl_loss_7": 728.4373413085938, "learning_rate": 0.0008177329330524181, "loss": 1342.4608, "step": 2880 }, { "ce_loss_10": 3.5435534834861757, "ce_loss_13": 3.4502355217933656, "ce_loss_2": 4.6120285987854, "ce_loss_3": 4.346097040176391, "ce_loss_7": 3.762561321258545, "epoch": 0.289, "grad_norm": 648.0, "kl_loss_10": 212.22290649414063, "kl_loss_2": 2358.1793823242188, "kl_loss_3": 1890.9413208007813, "kl_loss_7": 714.5174743652344, "learning_rate": 0.0008165062269044352, "loss": 1305.3231, "step": 2890 }, { "ce_loss_10": 3.4996484994888304, "ce_loss_13": 3.401354455947876, "ce_loss_2": 4.609268927574158, "ce_loss_3": 4.3294067740440365, "ce_loss_7": 3.723408377170563, "epoch": 0.29, "grad_norm": 660.0, "kl_loss_10": 216.81241302490236, "kl_loss_2": 2451.4824340820314, "kl_loss_3": 1968.3146179199218, "kl_loss_7": 729.5468353271484, "learning_rate": 0.0008152763335422613, "loss": 1337.7896, "step": 2900 }, { "ce_loss_10": 3.4890666246414184, "ce_loss_13": 3.392501711845398, "ce_loss_2": 4.58982219696045, "ce_loss_3": 4.312074947357178, "ce_loss_7": 3.713588225841522, "epoch": 0.291, "grad_norm": 664.0, "kl_loss_10": 218.38675384521486, "kl_loss_2": 2445.5037841796875, "kl_loss_3": 1949.8568176269532, "kl_loss_7": 729.6879028320312, "learning_rate": 0.0008140432653509088, "loss": 1317.595, "step": 2910 }, { "ce_loss_10": 3.538894033432007, "ce_loss_13": 3.4391178250312806, "ce_loss_2": 4.60951418876648, "ce_loss_3": 4.337265026569367, "ce_loss_7": 3.7542282700538636, "epoch": 0.292, "grad_norm": 576.0, "kl_loss_10": 218.85857162475585, "kl_loss_2": 2397.1072692871094, "kl_loss_3": 1916.8259216308593, "kl_loss_7": 718.4374481201172, "learning_rate": 0.0008128070347473608, "loss": 1302.2107, "step": 2920 }, { "ce_loss_10": 3.5429399847984313, "ce_loss_13": 3.447796130180359, "ce_loss_2": 4.665868854522705, "ce_loss_3": 4.389448404312134, "ce_loss_7": 3.7667205929756165, "epoch": 0.293, "grad_norm": 664.0, "kl_loss_10": 216.54725646972656, "kl_loss_2": 2487.7160583496093, "kl_loss_3": 2004.9421325683593, "kl_loss_7": 736.1060913085937, "learning_rate": 0.0008115676541804455, "loss": 1333.5637, "step": 2930 }, { "ce_loss_10": 3.5453550815582275, "ce_loss_13": 3.4535977363586428, "ce_loss_2": 4.623500943183899, "ce_loss_3": 4.348728823661804, "ce_loss_7": 3.760838878154755, "epoch": 0.294, "grad_norm": 580.0, "kl_loss_10": 209.94191284179686, "kl_loss_2": 2400.48662109375, "kl_loss_3": 1909.5526062011718, "kl_loss_7": 710.1752807617188, "learning_rate": 0.0008103251361307119, "loss": 1325.5172, "step": 2940 }, { "ce_loss_10": 3.578377163410187, "ce_loss_13": 3.4808244347572326, "ce_loss_2": 4.6591003894805905, "ce_loss_3": 4.395820617675781, "ce_loss_7": 3.793817377090454, "epoch": 0.295, "grad_norm": 616.0, "kl_loss_10": 214.81473617553712, "kl_loss_2": 2396.3223205566405, "kl_loss_3": 1926.4922485351562, "kl_loss_7": 722.0272766113281, "learning_rate": 0.0008090794931103026, "loss": 1300.3234, "step": 2950 }, { "ce_loss_10": 3.566417765617371, "ce_loss_13": 3.475232172012329, "ce_loss_2": 4.628555154800415, "ce_loss_3": 4.358175444602966, "ce_loss_7": 3.7831589698791506, "epoch": 0.296, "grad_norm": 692.0, "kl_loss_10": 209.84390869140626, "kl_loss_2": 2350.2305419921877, "kl_loss_3": 1877.9652465820313, "kl_loss_7": 713.7039794921875, "learning_rate": 0.0008078307376628291, "loss": 1303.6331, "step": 2960 }, { "ce_loss_10": 3.6232991099357603, "ce_loss_13": 3.534627139568329, "ce_loss_2": 4.6475036382675174, "ce_loss_3": 4.389086437225342, "ce_loss_7": 3.83059047460556, "epoch": 0.297, "grad_norm": 644.0, "kl_loss_10": 205.1537940979004, "kl_loss_2": 2274.82734375, "kl_loss_3": 1823.2497436523438, "kl_loss_7": 686.9072265625, "learning_rate": 0.000806578882363245, "loss": 1259.2264, "step": 2970 }, { "ce_loss_10": 3.536562275886536, "ce_loss_13": 3.447048234939575, "ce_loss_2": 4.597748541831971, "ce_loss_3": 4.3311933994293215, "ce_loss_7": 3.7559500217437742, "epoch": 0.298, "grad_norm": 736.0, "kl_loss_10": 208.43729248046876, "kl_loss_2": 2344.390216064453, "kl_loss_3": 1878.6112243652344, "kl_loss_7": 714.4485260009766, "learning_rate": 0.0008053239398177191, "loss": 1329.3172, "step": 2980 }, { "ce_loss_10": 3.524178981781006, "ce_loss_13": 3.4312392354011534, "ce_loss_2": 4.604809284210205, "ce_loss_3": 4.337883043289184, "ce_loss_7": 3.7429209470748903, "epoch": 0.299, "grad_norm": 684.0, "kl_loss_10": 211.32650604248047, "kl_loss_2": 2394.308056640625, "kl_loss_3": 1917.52822265625, "kl_loss_7": 709.9231262207031, "learning_rate": 0.0008040659226635089, "loss": 1341.8297, "step": 2990 }, { "ce_loss_10": 3.65326806306839, "ce_loss_13": 3.555258011817932, "ce_loss_2": 4.710744786262512, "ce_loss_3": 4.444170761108398, "ce_loss_7": 3.8668533086776735, "epoch": 0.3, "grad_norm": 640.0, "kl_loss_10": 219.24570388793944, "kl_loss_2": 2376.9404907226562, "kl_loss_3": 1902.857159423828, "kl_loss_7": 725.9926879882812, "learning_rate": 0.0008028048435688333, "loss": 1298.4502, "step": 3000 }, { "ce_loss_10": 3.521394634246826, "ce_loss_13": 3.4270112991333006, "ce_loss_2": 4.624356460571289, "ce_loss_3": 4.355751609802246, "ce_loss_7": 3.7494575500488283, "epoch": 0.301, "grad_norm": 716.0, "kl_loss_10": 217.2972724914551, "kl_loss_2": 2452.999304199219, "kl_loss_3": 1985.1250549316405, "kl_loss_7": 732.1629119873047, "learning_rate": 0.0008015407152327448, "loss": 1335.19, "step": 3010 }, { "ce_loss_10": 3.5699279427528383, "ce_loss_13": 3.475005257129669, "ce_loss_2": 4.65969865322113, "ce_loss_3": 4.38304386138916, "ce_loss_7": 3.784406042098999, "epoch": 0.302, "grad_norm": 620.0, "kl_loss_10": 215.99359130859375, "kl_loss_2": 2432.162463378906, "kl_loss_3": 1951.8839721679688, "kl_loss_7": 718.2368713378906, "learning_rate": 0.0008002735503850016, "loss": 1332.6505, "step": 3020 }, { "ce_loss_10": 3.4684691429138184, "ce_loss_13": 3.367643666267395, "ce_loss_2": 4.5924430847167965, "ce_loss_3": 4.30932047367096, "ce_loss_7": 3.6915883660316466, "epoch": 0.303, "grad_norm": 636.0, "kl_loss_10": 224.01161422729493, "kl_loss_2": 2494.453234863281, "kl_loss_3": 2004.73359375, "kl_loss_7": 736.367529296875, "learning_rate": 0.0007990033617859396, "loss": 1348.4062, "step": 3030 }, { "ce_loss_10": 3.5133005499839784, "ce_loss_13": 3.417665791511536, "ce_loss_2": 4.581400918960571, "ce_loss_3": 4.318250679969788, "ce_loss_7": 3.734131360054016, "epoch": 0.304, "grad_norm": 692.0, "kl_loss_10": 218.55305099487305, "kl_loss_2": 2367.1648193359374, "kl_loss_3": 1894.6960754394531, "kl_loss_7": 712.4279693603515, "learning_rate": 0.000797730162226344, "loss": 1274.1975, "step": 3040 }, { "ce_loss_10": 3.540754234790802, "ce_loss_13": 3.4410573482513427, "ce_loss_2": 4.607666325569153, "ce_loss_3": 4.33906877040863, "ce_loss_7": 3.76459002494812, "epoch": 0.305, "grad_norm": 692.0, "kl_loss_10": 221.26933517456055, "kl_loss_2": 2377.095458984375, "kl_loss_3": 1910.9453735351562, "kl_loss_7": 729.3416778564454, "learning_rate": 0.0007964539645273203, "loss": 1293.3233, "step": 3050 }, { "ce_loss_10": 3.549929714202881, "ce_loss_13": 3.4547195076942443, "ce_loss_2": 4.595946025848389, "ce_loss_3": 4.332681286334991, "ce_loss_7": 3.7608805656433106, "epoch": 0.306, "grad_norm": 608.0, "kl_loss_10": 214.02068862915038, "kl_loss_2": 2324.1172485351562, "kl_loss_3": 1866.7198425292968, "kl_loss_7": 705.0489013671875, "learning_rate": 0.000795174781540165, "loss": 1301.7614, "step": 3060 }, { "ce_loss_10": 3.626460921764374, "ce_loss_13": 3.5295538663864137, "ce_loss_2": 4.639704465866089, "ce_loss_3": 4.383927941322327, "ce_loss_7": 3.8362658858299254, "epoch": 0.307, "grad_norm": 644.0, "kl_loss_10": 215.03676071166993, "kl_loss_2": 2264.9541259765624, "kl_loss_3": 1824.0037841796875, "kl_loss_7": 696.3071411132812, "learning_rate": 0.0007938926261462366, "loss": 1288.9521, "step": 3070 }, { "ce_loss_10": 3.5775561928749084, "ce_loss_13": 3.480459380149841, "ce_loss_2": 4.618080592155456, "ce_loss_3": 4.350315952301026, "ce_loss_7": 3.7854344248771667, "epoch": 0.308, "grad_norm": 648.0, "kl_loss_10": 216.656893157959, "kl_loss_2": 2357.475067138672, "kl_loss_3": 1888.0174133300782, "kl_loss_7": 712.7872009277344, "learning_rate": 0.0007926075112568258, "loss": 1316.9054, "step": 3080 }, { "ce_loss_10": 3.5692449688911436, "ce_loss_13": 3.4759126543998717, "ce_loss_2": 4.623606491088867, "ce_loss_3": 4.366301465034485, "ce_loss_7": 3.78162659406662, "epoch": 0.309, "grad_norm": 560.0, "kl_loss_10": 213.1074462890625, "kl_loss_2": 2357.0720764160155, "kl_loss_3": 1902.1767517089843, "kl_loss_7": 709.6952423095703, "learning_rate": 0.0007913194498130252, "loss": 1281.0172, "step": 3090 }, { "ce_loss_10": 3.494074010848999, "ce_loss_13": 3.400245749950409, "ce_loss_2": 4.5784650325775145, "ce_loss_3": 4.316486406326294, "ce_loss_7": 3.7143809318542482, "epoch": 0.31, "grad_norm": 736.0, "kl_loss_10": 216.9530891418457, "kl_loss_2": 2388.186309814453, "kl_loss_3": 1924.5094665527345, "kl_loss_7": 718.4751098632812, "learning_rate": 0.0007900284547855992, "loss": 1312.7211, "step": 3100 }, { "ce_loss_10": 3.5040755391120912, "ce_loss_13": 3.409269428253174, "ce_loss_2": 4.549410009384156, "ce_loss_3": 4.294600343704223, "ce_loss_7": 3.7213049054145815, "epoch": 0.311, "grad_norm": 800.0, "kl_loss_10": 210.81134338378905, "kl_loss_2": 2329.0393676757812, "kl_loss_3": 1876.9636657714843, "kl_loss_7": 708.2128143310547, "learning_rate": 0.0007887345391748532, "loss": 1312.8156, "step": 3110 }, { "ce_loss_10": 3.634432864189148, "ce_loss_13": 3.543325686454773, "ce_loss_2": 4.651146030426025, "ce_loss_3": 4.387193036079407, "ce_loss_7": 3.8459346532821654, "epoch": 0.312, "grad_norm": 1168.0, "kl_loss_10": 212.2933433532715, "kl_loss_2": 2284.2329711914062, "kl_loss_3": 1829.6757873535157, "kl_loss_7": 706.4437377929687, "learning_rate": 0.0007874377160105036, "loss": 1259.3671, "step": 3120 }, { "ce_loss_10": 3.530054819583893, "ce_loss_13": 3.4342761754989626, "ce_loss_2": 4.628887629508972, "ce_loss_3": 4.362399673461914, "ce_loss_7": 3.7490867018699645, "epoch": 0.313, "grad_norm": 608.0, "kl_loss_10": 212.55482711791993, "kl_loss_2": 2429.394366455078, "kl_loss_3": 1971.4875915527343, "kl_loss_7": 728.8083862304687, "learning_rate": 0.0007861379983515449, "loss": 1354.4891, "step": 3130 }, { "ce_loss_10": 3.6109140157699584, "ce_loss_13": 3.5200807809829713, "ce_loss_2": 4.655977535247803, "ce_loss_3": 4.39032473564148, "ce_loss_7": 3.831193280220032, "epoch": 0.314, "grad_norm": 592.0, "kl_loss_10": 209.2868881225586, "kl_loss_2": 2336.8374755859377, "kl_loss_3": 1868.733642578125, "kl_loss_7": 717.5943817138672, "learning_rate": 0.0007848353992861195, "loss": 1273.946, "step": 3140 }, { "ce_loss_10": 3.6957940101623534, "ce_loss_13": 3.595130515098572, "ce_loss_2": 4.7389120101928714, "ce_loss_3": 4.469233250617981, "ce_loss_7": 3.926785933971405, "epoch": 0.315, "grad_norm": 888.0, "kl_loss_10": 223.79472427368165, "kl_loss_2": 2334.7629638671874, "kl_loss_3": 1867.655010986328, "kl_loss_7": 743.88798828125, "learning_rate": 0.0007835299319313853, "loss": 1303.1903, "step": 3150 }, { "ce_loss_10": 3.5704684495925902, "ce_loss_13": 3.476880931854248, "ce_loss_2": 4.606448101997375, "ce_loss_3": 4.3400969982147215, "ce_loss_7": 3.7886768341064454, "epoch": 0.316, "grad_norm": 700.0, "kl_loss_10": 211.18966979980468, "kl_loss_2": 2323.449572753906, "kl_loss_3": 1851.584783935547, "kl_loss_7": 721.3533996582031, "learning_rate": 0.0007822216094333848, "loss": 1322.3376, "step": 3160 }, { "ce_loss_10": 3.5810484290122986, "ce_loss_13": 3.4873368740081787, "ce_loss_2": 4.65300440788269, "ce_loss_3": 4.387210464477539, "ce_loss_7": 3.807372546195984, "epoch": 0.317, "grad_norm": 752.0, "kl_loss_10": 212.44315567016602, "kl_loss_2": 2384.089465332031, "kl_loss_3": 1914.7229309082031, "kl_loss_7": 731.2730682373046, "learning_rate": 0.0007809104449669101, "loss": 1294.9703, "step": 3170 }, { "ce_loss_10": 3.532199835777283, "ce_loss_13": 3.4395654439926147, "ce_loss_2": 4.584282898902893, "ce_loss_3": 4.30876350402832, "ce_loss_7": 3.7615070223808287, "epoch": 0.318, "grad_norm": 916.0, "kl_loss_10": 207.75176467895508, "kl_loss_2": 2339.527239990234, "kl_loss_3": 1858.5612731933593, "kl_loss_7": 730.9345184326172, "learning_rate": 0.0007795964517353734, "loss": 1278.7686, "step": 3180 }, { "ce_loss_10": 3.518466317653656, "ce_loss_13": 3.426977741718292, "ce_loss_2": 4.596842670440674, "ce_loss_3": 4.325531184673309, "ce_loss_7": 3.750142526626587, "epoch": 0.319, "grad_norm": 648.0, "kl_loss_10": 211.74872894287108, "kl_loss_2": 2403.7151733398437, "kl_loss_3": 1931.1865478515624, "kl_loss_7": 753.2544128417969, "learning_rate": 0.000778279642970672, "loss": 1282.6858, "step": 3190 }, { "ce_loss_10": 3.5179845094680786, "ce_loss_13": 3.428935539722443, "ce_loss_2": 4.562283158302307, "ce_loss_3": 4.295236802101135, "ce_loss_7": 3.7340755701065063, "epoch": 0.32, "grad_norm": 904.0, "kl_loss_10": 205.840421295166, "kl_loss_2": 2345.048876953125, "kl_loss_3": 1866.877911376953, "kl_loss_7": 720.2934661865235, "learning_rate": 0.0007769600319330552, "loss": 1264.9217, "step": 3200 }, { "ce_loss_10": 3.554915177822113, "ce_loss_13": 3.466732156276703, "ce_loss_2": 4.653983449935913, "ce_loss_3": 4.384042191505432, "ce_loss_7": 3.7919634103775026, "epoch": 0.321, "grad_norm": 708.0, "kl_loss_10": 205.94034423828126, "kl_loss_2": 2414.151336669922, "kl_loss_3": 1938.7024047851562, "kl_loss_7": 735.4687530517579, "learning_rate": 0.0007756376319109917, "loss": 1299.3125, "step": 3210 }, { "ce_loss_10": 3.601811099052429, "ce_loss_13": 3.513204276561737, "ce_loss_2": 4.643903732299805, "ce_loss_3": 4.372084999084473, "ce_loss_7": 3.82387717962265, "epoch": 0.322, "grad_norm": 856.0, "kl_loss_10": 205.92393646240234, "kl_loss_2": 2310.6433349609374, "kl_loss_3": 1837.0882263183594, "kl_loss_7": 727.0832824707031, "learning_rate": 0.0007743124562210351, "loss": 1252.0768, "step": 3220 }, { "ce_loss_10": 3.613737678527832, "ce_loss_13": 3.5243070006370543, "ce_loss_2": 4.6432843685150145, "ce_loss_3": 4.373880839347839, "ce_loss_7": 3.835604417324066, "epoch": 0.323, "grad_norm": 804.0, "kl_loss_10": 206.8451774597168, "kl_loss_2": 2302.7849548339846, "kl_loss_3": 1831.265850830078, "kl_loss_7": 718.6182281494141, "learning_rate": 0.0007729845182076895, "loss": 1281.717, "step": 3230 }, { "ce_loss_10": 3.54460072517395, "ce_loss_13": 3.458022344112396, "ce_loss_2": 4.567643523216248, "ce_loss_3": 4.304308319091797, "ce_loss_7": 3.7557874441146852, "epoch": 0.324, "grad_norm": 780.0, "kl_loss_10": 202.53059158325195, "kl_loss_2": 2275.4922607421877, "kl_loss_3": 1814.183154296875, "kl_loss_7": 706.1778442382813, "learning_rate": 0.0007716538312432765, "loss": 1299.5142, "step": 3240 }, { "ce_loss_10": 3.5034128069877624, "ce_loss_13": 3.4109013199806215, "ce_loss_2": 4.59195454120636, "ce_loss_3": 4.316867542266846, "ce_loss_7": 3.7340264201164244, "epoch": 0.325, "grad_norm": 620.0, "kl_loss_10": 212.30709838867188, "kl_loss_2": 2399.0862243652346, "kl_loss_3": 1912.8944152832032, "kl_loss_7": 738.4792083740234, "learning_rate": 0.0007703204087277988, "loss": 1308.0572, "step": 3250 }, { "ce_loss_10": 3.60279586315155, "ce_loss_13": 3.5141580939292907, "ce_loss_2": 4.619040894508362, "ce_loss_3": 4.348745739459991, "ce_loss_7": 3.81355699300766, "epoch": 0.326, "grad_norm": 728.0, "kl_loss_10": 202.46756286621093, "kl_loss_2": 2248.315087890625, "kl_loss_3": 1773.8885681152344, "kl_loss_7": 686.2587005615235, "learning_rate": 0.0007689842640888063, "loss": 1245.8748, "step": 3260 }, { "ce_loss_10": 3.6051684260368346, "ce_loss_13": 3.5150891542434692, "ce_loss_2": 4.619964861869812, "ce_loss_3": 4.360685467720032, "ce_loss_7": 3.8154699206352234, "epoch": 0.327, "grad_norm": 684.0, "kl_loss_10": 208.96502685546875, "kl_loss_2": 2265.3287658691406, "kl_loss_3": 1811.821759033203, "kl_loss_7": 703.9406219482422, "learning_rate": 0.0007676454107812607, "loss": 1264.3093, "step": 3270 }, { "ce_loss_10": 3.537815499305725, "ce_loss_13": 3.444960331916809, "ce_loss_2": 4.608094549179077, "ce_loss_3": 4.3397119522094725, "ce_loss_7": 3.7506736159324645, "epoch": 0.328, "grad_norm": 616.0, "kl_loss_10": 211.32426528930665, "kl_loss_2": 2388.8077392578125, "kl_loss_3": 1915.7237182617187, "kl_loss_7": 707.672705078125, "learning_rate": 0.0007663038622873999, "loss": 1279.8335, "step": 3280 }, { "ce_loss_10": 3.574945878982544, "ce_loss_13": 3.4833101868629455, "ce_loss_2": 4.628302264213562, "ce_loss_3": 4.366952037811279, "ce_loss_7": 3.784594464302063, "epoch": 0.329, "grad_norm": 596.0, "kl_loss_10": 211.56422576904296, "kl_loss_2": 2351.5840576171877, "kl_loss_3": 1879.0997131347656, "kl_loss_7": 694.7927307128906, "learning_rate": 0.0007649596321166025, "loss": 1256.8023, "step": 3290 }, { "ce_loss_10": 3.4788912653923036, "ce_loss_13": 3.3914729714393617, "ce_loss_2": 4.513116896152496, "ce_loss_3": 4.253373873233795, "ce_loss_7": 3.6928447008132936, "epoch": 0.33, "grad_norm": 600.0, "kl_loss_10": 203.03155212402345, "kl_loss_2": 2285.9035522460936, "kl_loss_3": 1828.6068603515625, "kl_loss_7": 691.3944427490235, "learning_rate": 0.0007636127338052513, "loss": 1273.8033, "step": 3300 }, { "ce_loss_10": 3.5868964433670043, "ce_loss_13": 3.49528044462204, "ce_loss_2": 4.663711452484131, "ce_loss_3": 4.39831657409668, "ce_loss_7": 3.804142189025879, "epoch": 0.331, "grad_norm": 624.0, "kl_loss_10": 211.30522232055665, "kl_loss_2": 2397.5427856445312, "kl_loss_3": 1927.950506591797, "kl_loss_7": 706.7799133300781, "learning_rate": 0.0007622631809165971, "loss": 1277.9496, "step": 3310 }, { "ce_loss_10": 3.582921600341797, "ce_loss_13": 3.4965414881706236, "ce_loss_2": 4.58232958316803, "ce_loss_3": 4.323147928714752, "ce_loss_7": 3.783066177368164, "epoch": 0.332, "grad_norm": 688.0, "kl_loss_10": 197.47354049682616, "kl_loss_2": 2216.269598388672, "kl_loss_3": 1760.1223999023437, "kl_loss_7": 664.7514923095703, "learning_rate": 0.000760910987040623, "loss": 1245.9068, "step": 3320 }, { "ce_loss_10": 3.5663990497589113, "ce_loss_13": 3.474509632587433, "ce_loss_2": 4.641107606887817, "ce_loss_3": 4.369283008575439, "ce_loss_7": 3.78259996175766, "epoch": 0.333, "grad_norm": 616.0, "kl_loss_10": 210.06242904663085, "kl_loss_2": 2402.0831298828125, "kl_loss_3": 1926.8569946289062, "kl_loss_7": 714.4321014404297, "learning_rate": 0.000759556165793906, "loss": 1272.2351, "step": 3330 }, { "ce_loss_10": 3.5859936118125915, "ce_loss_13": 3.4947105884552, "ce_loss_2": 4.635438013076782, "ce_loss_3": 4.3713214635849, "ce_loss_7": 3.79847708940506, "epoch": 0.334, "grad_norm": 600.0, "kl_loss_10": 207.2735107421875, "kl_loss_2": 2336.860705566406, "kl_loss_3": 1864.5109497070312, "kl_loss_7": 698.3573028564454, "learning_rate": 0.000758198730819481, "loss": 1291.4691, "step": 3340 }, { "ce_loss_10": 3.530641829967499, "ce_loss_13": 3.44451619386673, "ce_loss_2": 4.589142799377441, "ce_loss_3": 4.3217404961586, "ce_loss_7": 3.7362788200378416, "epoch": 0.335, "grad_norm": 624.0, "kl_loss_10": 202.07082290649413, "kl_loss_2": 2360.776690673828, "kl_loss_3": 1886.3787536621094, "kl_loss_7": 695.4340698242188, "learning_rate": 0.0007568386957867032, "loss": 1283.006, "step": 3350 }, { "ce_loss_10": 3.6058520078659058, "ce_loss_13": 3.5129651188850404, "ce_loss_2": 4.643417167663574, "ce_loss_3": 4.37364354133606, "ce_loss_7": 3.813083219528198, "epoch": 0.336, "grad_norm": 784.0, "kl_loss_10": 207.97874145507814, "kl_loss_2": 2295.3684020996093, "kl_loss_3": 1825.3164672851562, "kl_loss_7": 687.7690948486328, "learning_rate": 0.0007554760743911103, "loss": 1276.5395, "step": 3360 }, { "ce_loss_10": 3.5018799662590028, "ce_loss_13": 3.4133763194084166, "ce_loss_2": 4.551569533348084, "ce_loss_3": 4.283941590785981, "ce_loss_7": 3.704894995689392, "epoch": 0.337, "grad_norm": 644.0, "kl_loss_10": 201.99492797851562, "kl_loss_2": 2352.8791320800783, "kl_loss_3": 1880.3864440917969, "kl_loss_7": 682.2205291748047, "learning_rate": 0.0007541108803542846, "loss": 1306.1851, "step": 3370 }, { "ce_loss_10": 3.5562949419021606, "ce_loss_13": 3.467681646347046, "ce_loss_2": 4.61066963672638, "ce_loss_3": 4.3377085566520694, "ce_loss_7": 3.7622047662734985, "epoch": 0.338, "grad_norm": 632.0, "kl_loss_10": 205.39780044555664, "kl_loss_2": 2363.417413330078, "kl_loss_3": 1877.7716186523437, "kl_loss_7": 681.8937408447266, "learning_rate": 0.0007527431274237149, "loss": 1343.544, "step": 3380 }, { "ce_loss_10": 3.5283817052841187, "ce_loss_13": 3.4397483229637147, "ce_loss_2": 4.570840525627136, "ce_loss_3": 4.304589962959289, "ce_loss_7": 3.7283903479576113, "epoch": 0.339, "grad_norm": 572.0, "kl_loss_10": 203.6351058959961, "kl_loss_2": 2336.9517456054687, "kl_loss_3": 1865.9661682128906, "kl_loss_7": 677.4322174072265, "learning_rate": 0.0007513728293726579, "loss": 1277.8105, "step": 3390 }, { "ce_loss_10": 3.644584619998932, "ce_loss_13": 3.556074547767639, "ce_loss_2": 4.664924669265747, "ce_loss_3": 4.399448752403259, "ce_loss_7": 3.8509042620658875, "epoch": 0.34, "grad_norm": 644.0, "kl_loss_10": 203.6712448120117, "kl_loss_2": 2293.6278381347656, "kl_loss_3": 1826.5482238769532, "kl_loss_7": 683.2333282470703, "learning_rate": 0.00075, "loss": 1246.8078, "step": 3400 }, { "ce_loss_10": 3.6313098788261415, "ce_loss_13": 3.5418556571006774, "ce_loss_2": 4.690052318572998, "ce_loss_3": 4.416153597831726, "ce_loss_7": 3.843977117538452, "epoch": 0.341, "grad_norm": 644.0, "kl_loss_10": 205.15843811035157, "kl_loss_2": 2335.1500244140625, "kl_loss_3": 1861.3693969726562, "kl_loss_7": 693.7598510742188, "learning_rate": 0.0007486246531301177, "loss": 1258.7575, "step": 3410 }, { "ce_loss_10": 3.443863534927368, "ce_loss_13": 3.3510751008987425, "ce_loss_2": 4.5022605657577515, "ce_loss_3": 4.230472648143769, "ce_loss_7": 3.6559174418449403, "epoch": 0.342, "grad_norm": 664.0, "kl_loss_10": 202.74062805175782, "kl_loss_2": 2345.9876220703127, "kl_loss_3": 1864.947119140625, "kl_loss_7": 688.1812042236328, "learning_rate": 0.0007472468026127384, "loss": 1260.6121, "step": 3420 }, { "ce_loss_10": 3.5721543431282043, "ce_loss_13": 3.4770930409431458, "ce_loss_2": 4.665278792381287, "ce_loss_3": 4.404071187973022, "ce_loss_7": 3.788026750087738, "epoch": 0.343, "grad_norm": 712.0, "kl_loss_10": 214.17584533691405, "kl_loss_2": 2439.352404785156, "kl_loss_3": 1972.9944885253906, "kl_loss_7": 720.6741455078125, "learning_rate": 0.000745866462322802, "loss": 1320.8363, "step": 3430 }, { "ce_loss_10": 3.560039293766022, "ce_loss_13": 3.4741207122802735, "ce_loss_2": 4.592786359786987, "ce_loss_3": 4.334179782867432, "ce_loss_7": 3.7690793752670286, "epoch": 0.344, "grad_norm": 700.0, "kl_loss_10": 200.53601684570313, "kl_loss_2": 2283.709338378906, "kl_loss_3": 1835.7159240722656, "kl_loss_7": 673.5906463623047, "learning_rate": 0.0007444836461603195, "loss": 1261.9196, "step": 3440 }, { "ce_loss_10": 3.6245110511779783, "ce_loss_13": 3.5312341451644897, "ce_loss_2": 4.6719811201095585, "ce_loss_3": 4.406914234161377, "ce_loss_7": 3.8266146540641786, "epoch": 0.345, "grad_norm": 648.0, "kl_loss_10": 214.16654891967772, "kl_loss_2": 2362.7242797851563, "kl_loss_3": 1898.2087890625, "kl_loss_7": 704.3192626953125, "learning_rate": 0.0007430983680502344, "loss": 1301.0338, "step": 3450 }, { "ce_loss_10": 3.4667457938194275, "ce_loss_13": 3.377876877784729, "ce_loss_2": 4.545617830753327, "ce_loss_3": 4.272857880592346, "ce_loss_7": 3.6741854548454285, "epoch": 0.346, "grad_norm": 608.0, "kl_loss_10": 206.17358779907227, "kl_loss_2": 2388.203955078125, "kl_loss_3": 1909.6425415039062, "kl_loss_7": 697.4611602783203, "learning_rate": 0.0007417106419422819, "loss": 1290.2338, "step": 3460 }, { "ce_loss_10": 3.571521496772766, "ce_loss_13": 3.4770392775535583, "ce_loss_2": 4.614993333816528, "ce_loss_3": 4.345528078079224, "ce_loss_7": 3.7798440217971803, "epoch": 0.347, "grad_norm": 656.0, "kl_loss_10": 204.45724334716797, "kl_loss_2": 2308.5895385742188, "kl_loss_3": 1833.5083312988281, "kl_loss_7": 683.0788604736329, "learning_rate": 0.0007403204818108486, "loss": 1275.3799, "step": 3470 }, { "ce_loss_10": 3.5445627093315126, "ce_loss_13": 3.4533625841140747, "ce_loss_2": 4.6031595230102536, "ce_loss_3": 4.338364768028259, "ce_loss_7": 3.746653878688812, "epoch": 0.348, "grad_norm": 576.0, "kl_loss_10": 208.29350357055665, "kl_loss_2": 2371.4946899414062, "kl_loss_3": 1909.6997863769532, "kl_loss_7": 686.1026062011719, "learning_rate": 0.0007389279016548316, "loss": 1247.1171, "step": 3480 }, { "ce_loss_10": 3.553533661365509, "ce_loss_13": 3.458456254005432, "ce_loss_2": 4.6566637516021725, "ce_loss_3": 4.37568781375885, "ce_loss_7": 3.7642048597335815, "epoch": 0.349, "grad_norm": 684.0, "kl_loss_10": 212.6401054382324, "kl_loss_2": 2451.175671386719, "kl_loss_3": 1951.6068969726562, "kl_loss_7": 702.7821899414063, "learning_rate": 0.0007375329154974975, "loss": 1307.9424, "step": 3490 }, { "ce_loss_10": 3.5084131717681886, "ce_loss_13": 3.4206284284591675, "ce_loss_2": 4.546359324455262, "ce_loss_3": 4.28377673625946, "ce_loss_7": 3.7158578753471376, "epoch": 0.35, "grad_norm": 676.0, "kl_loss_10": 208.94808502197264, "kl_loss_2": 2307.420263671875, "kl_loss_3": 1848.263897705078, "kl_loss_7": 683.7157653808594, "learning_rate": 0.0007361355373863414, "loss": 1294.9244, "step": 3500 }, { "ce_loss_10": 3.563704586029053, "ce_loss_13": 3.471196401119232, "ce_loss_2": 4.5938108444213865, "ce_loss_3": 4.332852721214294, "ce_loss_7": 3.7703267097473145, "epoch": 0.351, "grad_norm": 580.0, "kl_loss_10": 208.09024658203126, "kl_loss_2": 2287.9739318847655, "kl_loss_3": 1828.5689758300782, "kl_loss_7": 673.030307006836, "learning_rate": 0.0007347357813929454, "loss": 1287.6393, "step": 3510 }, { "ce_loss_10": 3.5099044919013975, "ce_loss_13": 3.419321870803833, "ce_loss_2": 4.543364262580871, "ce_loss_3": 4.27359983921051, "ce_loss_7": 3.7119598269462584, "epoch": 0.352, "grad_norm": 620.0, "kl_loss_10": 207.49536819458007, "kl_loss_2": 2274.5087951660157, "kl_loss_3": 1817.6334594726563, "kl_loss_7": 673.7456726074219, "learning_rate": 0.0007333336616128369, "loss": 1275.3445, "step": 3520 }, { "ce_loss_10": 3.4893477082252504, "ce_loss_13": 3.394596815109253, "ce_loss_2": 4.558405804634094, "ce_loss_3": 4.2951094031333925, "ce_loss_7": 3.69784619808197, "epoch": 0.353, "grad_norm": 636.0, "kl_loss_10": 211.34491577148438, "kl_loss_2": 2368.5742614746096, "kl_loss_3": 1904.588153076172, "kl_loss_7": 699.6206512451172, "learning_rate": 0.0007319291921653463, "loss": 1290.8219, "step": 3530 }, { "ce_loss_10": 3.5741103887557983, "ce_loss_13": 3.4787994265556335, "ce_loss_2": 4.633104467391968, "ce_loss_3": 4.363708543777466, "ce_loss_7": 3.7883455634117125, "epoch": 0.354, "grad_norm": 688.0, "kl_loss_10": 211.75357818603516, "kl_loss_2": 2353.7052368164063, "kl_loss_3": 1872.2497680664062, "kl_loss_7": 696.9233825683593, "learning_rate": 0.0007305223871934656, "loss": 1261.161, "step": 3540 }, { "ce_loss_10": 3.53648921251297, "ce_loss_13": 3.4479789614677427, "ce_loss_2": 4.58404312133789, "ce_loss_3": 4.315784668922424, "ce_loss_7": 3.7411927938461305, "epoch": 0.355, "grad_norm": 644.0, "kl_loss_10": 204.81159896850585, "kl_loss_2": 2318.000451660156, "kl_loss_3": 1841.0889282226562, "kl_loss_7": 674.2556213378906, "learning_rate": 0.0007291132608637052, "loss": 1261.7902, "step": 3550 }, { "ce_loss_10": 3.4981685996055605, "ce_loss_13": 3.4104817390441893, "ce_loss_2": 4.630524325370788, "ce_loss_3": 4.356097209453583, "ce_loss_7": 3.7044720530509947, "epoch": 0.356, "grad_norm": 596.0, "kl_loss_10": 201.21338653564453, "kl_loss_2": 2484.7927124023436, "kl_loss_3": 2010.760675048828, "kl_loss_7": 676.6305114746094, "learning_rate": 0.0007277018273659516, "loss": 1327.9727, "step": 3560 }, { "ce_loss_10": 3.625146007537842, "ce_loss_13": 3.531074047088623, "ce_loss_2": 4.6699333667755125, "ce_loss_3": 4.4035911679267885, "ce_loss_7": 3.836405646800995, "epoch": 0.357, "grad_norm": 620.0, "kl_loss_10": 209.5035614013672, "kl_loss_2": 2341.261669921875, "kl_loss_3": 1873.83955078125, "kl_loss_7": 701.3334136962891, "learning_rate": 0.0007262881009133242, "loss": 1275.0637, "step": 3570 }, { "ce_loss_10": 3.5401904344558717, "ce_loss_13": 3.4541720032691954, "ce_loss_2": 4.572478699684143, "ce_loss_3": 4.315898811817169, "ce_loss_7": 3.7435639023780825, "epoch": 0.358, "grad_norm": 616.0, "kl_loss_10": 201.09010009765626, "kl_loss_2": 2313.080432128906, "kl_loss_3": 1855.7528381347656, "kl_loss_7": 673.6273254394531, "learning_rate": 0.0007248720957420329, "loss": 1252.028, "step": 3580 }, { "ce_loss_10": 3.55083909034729, "ce_loss_13": 3.466167140007019, "ce_loss_2": 4.594191384315491, "ce_loss_3": 4.320683646202087, "ce_loss_7": 3.750420665740967, "epoch": 0.359, "grad_norm": 592.0, "kl_loss_10": 201.2973388671875, "kl_loss_2": 2304.7897521972654, "kl_loss_3": 1830.9858337402343, "kl_loss_7": 667.1753356933593, "learning_rate": 0.0007234538261112341, "loss": 1305.9864, "step": 3590 }, { "ce_loss_10": 3.5870068073272705, "ce_loss_13": 3.4963618993759153, "ce_loss_2": 4.64858865737915, "ce_loss_3": 4.383180546760559, "ce_loss_7": 3.7978907346725466, "epoch": 0.36, "grad_norm": 580.0, "kl_loss_10": 206.438224029541, "kl_loss_2": 2350.6687255859374, "kl_loss_3": 1879.8920471191407, "kl_loss_7": 691.6597686767578, "learning_rate": 0.0007220333063028871, "loss": 1262.9199, "step": 3600 }, { "ce_loss_10": 3.617115843296051, "ce_loss_13": 3.5226447105407717, "ce_loss_2": 4.7090448379516605, "ce_loss_3": 4.440777349472046, "ce_loss_7": 3.927542436122894, "epoch": 0.361, "grad_norm": 700.0, "kl_loss_10": 215.82505798339844, "kl_loss_2": 2451.148791503906, "kl_loss_3": 1982.2255798339843, "kl_loss_7": 911.9942932128906, "learning_rate": 0.0007206105506216106, "loss": 1351.6553, "step": 3610 }, { "ce_loss_10": 3.4991318702697756, "ce_loss_13": 3.4086315035820007, "ce_loss_2": 4.517360043525696, "ce_loss_3": 4.2627614617347716, "ce_loss_7": 3.7236221551895143, "epoch": 0.362, "grad_norm": 836.0, "kl_loss_10": 210.4427146911621, "kl_loss_2": 2266.5427673339846, "kl_loss_3": 1820.741632080078, "kl_loss_7": 721.7343872070312, "learning_rate": 0.0007191855733945387, "loss": 1249.7904, "step": 3620 }, { "ce_loss_10": 3.5907591581344604, "ce_loss_13": 3.5010143160820006, "ce_loss_2": 4.622481203079223, "ce_loss_3": 4.356601357460022, "ce_loss_7": 3.79650160074234, "epoch": 0.363, "grad_norm": 572.0, "kl_loss_10": 204.04148864746094, "kl_loss_2": 2289.9740478515623, "kl_loss_3": 1818.431005859375, "kl_loss_7": 692.2773986816406, "learning_rate": 0.0007177583889711762, "loss": 1250.2074, "step": 3630 }, { "ce_loss_10": 3.5057953119277956, "ce_loss_13": 3.411814069747925, "ce_loss_2": 4.563321113586426, "ce_loss_3": 4.290196192264557, "ce_loss_7": 3.7388221859931945, "epoch": 0.364, "grad_norm": 952.0, "kl_loss_10": 206.33124389648438, "kl_loss_2": 2346.8802978515623, "kl_loss_3": 1869.4090270996094, "kl_loss_7": 731.2307403564453, "learning_rate": 0.0007163290117232541, "loss": 1286.6971, "step": 3640 }, { "ce_loss_10": 3.625288701057434, "ce_loss_13": 3.5375028014183045, "ce_loss_2": 4.612985825538635, "ce_loss_3": 4.35078866481781, "ce_loss_7": 3.820189893245697, "epoch": 0.365, "grad_norm": 676.0, "kl_loss_10": 199.10545043945314, "kl_loss_2": 2225.872174072266, "kl_loss_3": 1769.0750122070312, "kl_loss_7": 679.4105773925781, "learning_rate": 0.0007148974560445859, "loss": 1248.8154, "step": 3650 }, { "ce_loss_10": 3.5454740643501284, "ce_loss_13": 3.458422267436981, "ce_loss_2": 4.550081968307495, "ce_loss_3": 4.287899553775787, "ce_loss_7": 3.7501559376716616, "epoch": 0.366, "grad_norm": 588.0, "kl_loss_10": 198.9521499633789, "kl_loss_2": 2238.7438537597654, "kl_loss_3": 1775.3712280273437, "kl_loss_7": 679.886441040039, "learning_rate": 0.0007134637363509209, "loss": 1224.5007, "step": 3660 }, { "ce_loss_10": 3.6524737238883973, "ce_loss_13": 3.566513454914093, "ce_loss_2": 4.64628803730011, "ce_loss_3": 4.383014440536499, "ce_loss_7": 3.85452960729599, "epoch": 0.367, "grad_norm": 804.0, "kl_loss_10": 195.8438636779785, "kl_loss_2": 2202.7541381835936, "kl_loss_3": 1744.0477172851563, "kl_loss_7": 669.0114837646485, "learning_rate": 0.0007120278670798009, "loss": 1241.0041, "step": 3670 }, { "ce_loss_10": 3.451411759853363, "ce_loss_13": 3.362742209434509, "ce_loss_2": 4.563682770729065, "ce_loss_3": 4.3000654697418215, "ce_loss_7": 3.6735877275466917, "epoch": 0.368, "grad_norm": 692.0, "kl_loss_10": 207.53860931396486, "kl_loss_2": 2452.550360107422, "kl_loss_3": 1973.4927856445313, "kl_loss_7": 716.8897491455078, "learning_rate": 0.0007105898626904133, "loss": 1338.2093, "step": 3680 }, { "ce_loss_10": 3.5554641008377077, "ce_loss_13": 3.4662737131118773, "ce_loss_2": 4.611227035522461, "ce_loss_3": 4.340918231010437, "ce_loss_7": 3.768160092830658, "epoch": 0.369, "grad_norm": 564.0, "kl_loss_10": 202.72654418945314, "kl_loss_2": 2339.7893005371093, "kl_loss_3": 1860.5950439453125, "kl_loss_7": 686.1352569580079, "learning_rate": 0.0007091497376634463, "loss": 1252.1551, "step": 3690 }, { "ce_loss_10": 3.5008182168006896, "ce_loss_13": 3.412228453159332, "ce_loss_2": 4.538051557540894, "ce_loss_3": 4.271669220924378, "ce_loss_7": 3.707795190811157, "epoch": 0.37, "grad_norm": 688.0, "kl_loss_10": 203.25593032836915, "kl_loss_2": 2313.7531005859373, "kl_loss_3": 1845.0465942382812, "kl_loss_7": 679.24736328125, "learning_rate": 0.0007077075065009433, "loss": 1276.0328, "step": 3700 }, { "ce_loss_10": 3.6071534514427186, "ce_loss_13": 3.5158491373062133, "ce_loss_2": 4.6654202222824095, "ce_loss_3": 4.392651915550232, "ce_loss_7": 3.819450116157532, "epoch": 0.371, "grad_norm": 752.0, "kl_loss_10": 208.91845016479493, "kl_loss_2": 2358.2886352539062, "kl_loss_3": 1879.4206665039062, "kl_loss_7": 699.9392761230469, "learning_rate": 0.0007062631837261557, "loss": 1268.6693, "step": 3710 }, { "ce_loss_10": 3.476445233821869, "ce_loss_13": 3.390165627002716, "ce_loss_2": 4.5276483535766605, "ce_loss_3": 4.25641827583313, "ce_loss_7": 3.683011364936829, "epoch": 0.372, "grad_norm": 640.0, "kl_loss_10": 202.1516098022461, "kl_loss_2": 2314.015344238281, "kl_loss_3": 1841.6598022460937, "kl_loss_7": 682.8849884033203, "learning_rate": 0.0007048167838833977, "loss": 1289.0859, "step": 3720 }, { "ce_loss_10": 3.5699679255485535, "ce_loss_13": 3.480803608894348, "ce_loss_2": 4.588373041152954, "ce_loss_3": 4.323996567726136, "ce_loss_7": 3.7677942156791686, "epoch": 0.373, "grad_norm": 744.0, "kl_loss_10": 203.9791275024414, "kl_loss_2": 2290.5110717773437, "kl_loss_3": 1815.935235595703, "kl_loss_7": 672.9903137207032, "learning_rate": 0.0007033683215379002, "loss": 1247.4349, "step": 3730 }, { "ce_loss_10": 3.556042289733887, "ce_loss_13": 3.4659453988075257, "ce_loss_2": 4.594552016258239, "ce_loss_3": 4.319927525520325, "ce_loss_7": 3.7590227007865904, "epoch": 0.374, "grad_norm": 608.0, "kl_loss_10": 199.66127700805663, "kl_loss_2": 2281.638720703125, "kl_loss_3": 1802.195782470703, "kl_loss_7": 667.0344909667969, "learning_rate": 0.0007019178112756625, "loss": 1258.8061, "step": 3740 }, { "ce_loss_10": 3.518285346031189, "ce_loss_13": 3.432125985622406, "ce_loss_2": 4.562115430831909, "ce_loss_3": 4.29533269405365, "ce_loss_7": 3.7262615442276, "epoch": 0.375, "grad_norm": 640.0, "kl_loss_10": 200.82229309082032, "kl_loss_2": 2292.846240234375, "kl_loss_3": 1823.6997009277343, "kl_loss_7": 673.9724395751953, "learning_rate": 0.0007004652677033068, "loss": 1263.2482, "step": 3750 }, { "ce_loss_10": 3.5903055548667906, "ce_loss_13": 3.506292223930359, "ce_loss_2": 4.5962906837463375, "ce_loss_3": 4.3357175350189205, "ce_loss_7": 3.786800575256348, "epoch": 0.376, "grad_norm": 656.0, "kl_loss_10": 200.60662612915038, "kl_loss_2": 2252.357458496094, "kl_loss_3": 1790.429217529297, "kl_loss_7": 660.8626007080078, "learning_rate": 0.0006990107054479312, "loss": 1245.262, "step": 3760 }, { "ce_loss_10": 3.5865476965904235, "ce_loss_13": 3.4931369185447694, "ce_loss_2": 4.599129343032837, "ce_loss_3": 4.335857176780701, "ce_loss_7": 3.77964334487915, "epoch": 0.377, "grad_norm": 700.0, "kl_loss_10": 209.85385513305664, "kl_loss_2": 2261.3722229003906, "kl_loss_3": 1806.916485595703, "kl_loss_7": 670.6618469238281, "learning_rate": 0.000697554139156961, "loss": 1247.3398, "step": 3770 }, { "ce_loss_10": 3.572757053375244, "ce_loss_13": 3.477368426322937, "ce_loss_2": 4.60923056602478, "ce_loss_3": 4.342034792900085, "ce_loss_7": 3.766658973693848, "epoch": 0.378, "grad_norm": 628.0, "kl_loss_10": 218.62799072265625, "kl_loss_2": 2331.5685607910154, "kl_loss_3": 1853.7978393554688, "kl_loss_7": 681.1488891601563, "learning_rate": 0.0006960955834980027, "loss": 1246.4775, "step": 3780 }, { "ce_loss_10": 3.5454328536987303, "ce_loss_13": 3.449174666404724, "ce_loss_2": 4.563591694831848, "ce_loss_3": 4.298812806606293, "ce_loss_7": 3.7385509848594665, "epoch": 0.379, "grad_norm": 740.0, "kl_loss_10": 215.9669273376465, "kl_loss_2": 2275.3556274414063, "kl_loss_3": 1807.9537841796875, "kl_loss_7": 673.0077819824219, "learning_rate": 0.0006946350531586958, "loss": 1251.4496, "step": 3790 }, { "ce_loss_10": 3.5613739252090455, "ce_loss_13": 3.4710352540016176, "ce_loss_2": 4.586881446838379, "ce_loss_3": 4.326097631454468, "ce_loss_7": 3.7597612500190736, "epoch": 0.38, "grad_norm": 636.0, "kl_loss_10": 210.37978897094726, "kl_loss_2": 2278.926678466797, "kl_loss_3": 1818.876287841797, "kl_loss_7": 669.4569915771484, "learning_rate": 0.0006931725628465643, "loss": 1275.2133, "step": 3800 }, { "ce_loss_10": 3.590466618537903, "ce_loss_13": 3.4937587857246397, "ce_loss_2": 4.623274827003479, "ce_loss_3": 4.352746081352234, "ce_loss_7": 3.7943554282188416, "epoch": 0.381, "grad_norm": 628.0, "kl_loss_10": 211.29082336425782, "kl_loss_2": 2296.4768188476564, "kl_loss_3": 1818.6126953125, "kl_loss_7": 678.5069641113281, "learning_rate": 0.0006917081272888696, "loss": 1259.3377, "step": 3810 }, { "ce_loss_10": 3.487190854549408, "ce_loss_13": 3.393886852264404, "ce_loss_2": 4.553677868843079, "ce_loss_3": 4.300019836425781, "ce_loss_7": 3.6916916847229, "epoch": 0.382, "grad_norm": 596.0, "kl_loss_10": 216.8355224609375, "kl_loss_2": 2365.8853271484377, "kl_loss_3": 1922.5055114746094, "kl_loss_7": 684.1588439941406, "learning_rate": 0.0006902417612324615, "loss": 1266.6071, "step": 3820 }, { "ce_loss_10": 3.6190301895141603, "ce_loss_13": 3.5245797634124756, "ce_loss_2": 4.67730553150177, "ce_loss_3": 4.405901682376862, "ce_loss_7": 3.8294657945632933, "epoch": 0.383, "grad_norm": 680.0, "kl_loss_10": 219.55593795776366, "kl_loss_2": 2360.2657958984373, "kl_loss_3": 1879.3323364257812, "kl_loss_7": 698.4299591064453, "learning_rate": 0.00068877347944363, "loss": 1281.5383, "step": 3830 }, { "ce_loss_10": 3.612284016609192, "ce_loss_13": 3.522170841693878, "ce_loss_2": 4.627012848854065, "ce_loss_3": 4.361323833465576, "ce_loss_7": 3.8073740243911742, "epoch": 0.384, "grad_norm": 852.0, "kl_loss_10": 211.08247299194335, "kl_loss_2": 2264.5133850097654, "kl_loss_3": 1800.5201843261718, "kl_loss_7": 672.6636840820313, "learning_rate": 0.0006873032967079561, "loss": 1258.6725, "step": 3840 }, { "ce_loss_10": 3.5931047439575194, "ce_loss_13": 3.5063655853271483, "ce_loss_2": 4.588197422027588, "ce_loss_3": 4.324539279937744, "ce_loss_7": 3.7907418251037597, "epoch": 0.385, "grad_norm": 664.0, "kl_loss_10": 203.42158584594728, "kl_loss_2": 2234.902947998047, "kl_loss_3": 1772.472021484375, "kl_loss_7": 662.3596588134766, "learning_rate": 0.0006858312278301637, "loss": 1226.7012, "step": 3850 }, { "ce_loss_10": 3.635795843601227, "ce_loss_13": 3.549472713470459, "ce_loss_2": 4.623842811584472, "ce_loss_3": 4.353901195526123, "ce_loss_7": 3.825099301338196, "epoch": 0.386, "grad_norm": 736.0, "kl_loss_10": 204.89519424438475, "kl_loss_2": 2217.719934082031, "kl_loss_3": 1747.3561584472657, "kl_loss_7": 659.4771606445313, "learning_rate": 0.0006843572876339704, "loss": 1225.6961, "step": 3860 }, { "ce_loss_10": 3.5519859790802, "ce_loss_13": 3.466093647480011, "ce_loss_2": 4.525204968452454, "ce_loss_3": 4.264074110984803, "ce_loss_7": 3.742415523529053, "epoch": 0.387, "grad_norm": 668.0, "kl_loss_10": 199.43942337036134, "kl_loss_2": 2183.9968811035155, "kl_loss_3": 1725.1913513183595, "kl_loss_7": 644.7796203613282, "learning_rate": 0.0006828814909619373, "loss": 1252.2885, "step": 3870 }, { "ce_loss_10": 3.674282944202423, "ce_loss_13": 3.5820161938667296, "ce_loss_2": 4.6895040512084964, "ce_loss_3": 4.4149659156799315, "ce_loss_7": 3.86486736536026, "epoch": 0.388, "grad_norm": 576.0, "kl_loss_10": 211.43887939453126, "kl_loss_2": 2266.184930419922, "kl_loss_3": 1785.1635192871095, "kl_loss_7": 661.8903228759766, "learning_rate": 0.0006814038526753205, "loss": 1223.6402, "step": 3880 }, { "ce_loss_10": 3.5698843955993653, "ce_loss_13": 3.479625034332275, "ce_loss_2": 4.587342977523804, "ce_loss_3": 4.3197312474250795, "ce_loss_7": 3.766351842880249, "epoch": 0.389, "grad_norm": 616.0, "kl_loss_10": 208.68895874023437, "kl_loss_2": 2258.2895751953124, "kl_loss_3": 1782.5825317382812, "kl_loss_7": 664.0186126708984, "learning_rate": 0.0006799243876539213, "loss": 1238.0235, "step": 3890 }, { "ce_loss_10": 3.500353288650513, "ce_loss_13": 3.408971738815308, "ce_loss_2": 4.572002196311951, "ce_loss_3": 4.29817762374878, "ce_loss_7": 3.699842798709869, "epoch": 0.39, "grad_norm": 836.0, "kl_loss_10": 206.20438537597656, "kl_loss_2": 2378.092236328125, "kl_loss_3": 1891.477294921875, "kl_loss_7": 671.2416046142578, "learning_rate": 0.0006784431107959359, "loss": 1281.9199, "step": 3900 }, { "ce_loss_10": 3.5523509979248047, "ce_loss_13": 3.460574519634247, "ce_loss_2": 4.626534819602966, "ce_loss_3": 4.3451399326324465, "ce_loss_7": 3.7586856484413147, "epoch": 0.391, "grad_norm": 668.0, "kl_loss_10": 206.77191925048828, "kl_loss_2": 2369.177069091797, "kl_loss_3": 1882.2445129394532, "kl_loss_7": 681.9191589355469, "learning_rate": 0.0006769600370178059, "loss": 1269.1178, "step": 3910 }, { "ce_loss_10": 3.5188350439071656, "ce_loss_13": 3.4273067116737366, "ce_loss_2": 4.554541206359863, "ce_loss_3": 4.289613115787506, "ce_loss_7": 3.724477529525757, "epoch": 0.392, "grad_norm": 560.0, "kl_loss_10": 199.67687606811523, "kl_loss_2": 2289.5328735351563, "kl_loss_3": 1825.1418823242188, "kl_loss_7": 674.4119354248047, "learning_rate": 0.0006754751812540679, "loss": 1229.9105, "step": 3920 }, { "ce_loss_10": 3.5683494329452516, "ce_loss_13": 3.482330596446991, "ce_loss_2": 4.621720671653748, "ce_loss_3": 4.348645758628845, "ce_loss_7": 3.7689119219779967, "epoch": 0.393, "grad_norm": 644.0, "kl_loss_10": 204.25539169311523, "kl_loss_2": 2339.84892578125, "kl_loss_3": 1860.5805419921876, "kl_loss_7": 678.0737976074219, "learning_rate": 0.0006739885584572025, "loss": 1265.1324, "step": 3930 }, { "ce_loss_10": 3.596517300605774, "ce_loss_13": 3.5048423767089845, "ce_loss_2": 4.657581090927124, "ce_loss_3": 4.393209981918335, "ce_loss_7": 3.8048507928848267, "epoch": 0.394, "grad_norm": 740.0, "kl_loss_10": 206.0053512573242, "kl_loss_2": 2374.587469482422, "kl_loss_3": 1914.2769775390625, "kl_loss_7": 691.7073638916015, "learning_rate": 0.0006725001835974853, "loss": 1262.3447, "step": 3940 }, { "ce_loss_10": 3.5837875604629517, "ce_loss_13": 3.49317661523819, "ce_loss_2": 4.625016355514527, "ce_loss_3": 4.360765337944031, "ce_loss_7": 3.795220899581909, "epoch": 0.395, "grad_norm": 624.0, "kl_loss_10": 202.6530014038086, "kl_loss_2": 2326.028674316406, "kl_loss_3": 1851.947198486328, "kl_loss_7": 690.550015258789, "learning_rate": 0.0006710100716628344, "loss": 1233.6354, "step": 3950 }, { "ce_loss_10": 3.566684401035309, "ce_loss_13": 3.4755138635635374, "ce_loss_2": 4.594186568260193, "ce_loss_3": 4.3349669694900514, "ce_loss_7": 3.7691023349761963, "epoch": 0.396, "grad_norm": 612.0, "kl_loss_10": 198.98139343261718, "kl_loss_2": 2292.616107177734, "kl_loss_3": 1833.538885498047, "kl_loss_7": 679.8143615722656, "learning_rate": 0.0006695182376586602, "loss": 1262.3783, "step": 3960 }, { "ce_loss_10": 3.596343123912811, "ce_loss_13": 3.512529468536377, "ce_loss_2": 4.574345445632934, "ce_loss_3": 4.312320637702942, "ce_loss_7": 3.790241527557373, "epoch": 0.397, "grad_norm": 708.0, "kl_loss_10": 191.98949813842773, "kl_loss_2": 2169.2684020996094, "kl_loss_3": 1705.870343017578, "kl_loss_7": 641.6268707275391, "learning_rate": 0.000668024696607715, "loss": 1235.5194, "step": 3970 }, { "ce_loss_10": 3.555418300628662, "ce_loss_13": 3.470967173576355, "ce_loss_2": 4.566342353820801, "ce_loss_3": 4.313628911972046, "ce_loss_7": 3.7538477182388306, "epoch": 0.398, "grad_norm": 636.0, "kl_loss_10": 198.03487548828124, "kl_loss_2": 2281.672326660156, "kl_loss_3": 1825.4962097167968, "kl_loss_7": 667.3125, "learning_rate": 0.0006665294635499404, "loss": 1243.9658, "step": 3980 }, { "ce_loss_10": 3.561533272266388, "ce_loss_13": 3.4714276075363157, "ce_loss_2": 4.635219573974609, "ce_loss_3": 4.372889280319214, "ce_loss_7": 3.771253454685211, "epoch": 0.399, "grad_norm": 876.0, "kl_loss_10": 208.96954040527345, "kl_loss_2": 2390.9282470703124, "kl_loss_3": 1928.2362426757813, "kl_loss_7": 697.694384765625, "learning_rate": 0.0006650325535423167, "loss": 1276.8535, "step": 3990 }, { "ce_loss_10": 3.5832207202911377, "ce_loss_13": 3.4972815036773683, "ce_loss_2": 4.57238998413086, "ce_loss_3": 4.3085246801376345, "ce_loss_7": 3.7773876547813416, "epoch": 0.4, "grad_norm": 680.0, "kl_loss_10": 194.11105422973634, "kl_loss_2": 2185.9527893066406, "kl_loss_3": 1735.314678955078, "kl_loss_7": 647.3017364501953, "learning_rate": 0.0006635339816587109, "loss": 1234.4078, "step": 4000 }, { "ce_loss_10": 3.524456286430359, "ce_loss_13": 3.4352723956108093, "ce_loss_2": 4.582833385467529, "ce_loss_3": 4.318940043449402, "ce_loss_7": 3.7211544632911684, "epoch": 0.401, "grad_norm": 624.0, "kl_loss_10": 200.03394927978516, "kl_loss_2": 2362.4151245117187, "kl_loss_3": 1889.6026794433594, "kl_loss_7": 668.6993530273437, "learning_rate": 0.0006620337629897252, "loss": 1251.5271, "step": 4010 }, { "ce_loss_10": 3.531862771511078, "ce_loss_13": 3.4413245558738708, "ce_loss_2": 4.564659547805786, "ce_loss_3": 4.295144200325012, "ce_loss_7": 3.728400182723999, "epoch": 0.402, "grad_norm": 556.0, "kl_loss_10": 199.88810348510742, "kl_loss_2": 2302.102014160156, "kl_loss_3": 1823.07431640625, "kl_loss_7": 668.4421752929687, "learning_rate": 0.0006605319126425454, "loss": 1275.6262, "step": 4020 }, { "ce_loss_10": 3.4339096665382387, "ce_loss_13": 3.350968396663666, "ce_loss_2": 4.514854836463928, "ce_loss_3": 4.238226985931396, "ce_loss_7": 3.644399344921112, "epoch": 0.403, "grad_norm": 644.0, "kl_loss_10": 199.70583419799806, "kl_loss_2": 2387.631066894531, "kl_loss_3": 1906.55048828125, "kl_loss_7": 681.1330749511719, "learning_rate": 0.0006590284457407876, "loss": 1275.1312, "step": 4030 }, { "ce_loss_10": 3.5380223751068116, "ce_loss_13": 3.448465049266815, "ce_loss_2": 4.57396821975708, "ce_loss_3": 4.313019490242004, "ce_loss_7": 3.7362441062927245, "epoch": 0.404, "grad_norm": 688.0, "kl_loss_10": 198.1963623046875, "kl_loss_2": 2292.718151855469, "kl_loss_3": 1830.845147705078, "kl_loss_7": 661.0607788085938, "learning_rate": 0.0006575233774243465, "loss": 1249.6318, "step": 4040 }, { "ce_loss_10": 3.528366136550903, "ce_loss_13": 3.4398876786231996, "ce_loss_2": 4.5835960626602175, "ce_loss_3": 4.318015563488006, "ce_loss_7": 3.734170937538147, "epoch": 0.405, "grad_norm": 744.0, "kl_loss_10": 199.81770248413085, "kl_loss_2": 2371.9314208984374, "kl_loss_3": 1896.1641540527344, "kl_loss_7": 680.1676055908204, "learning_rate": 0.0006560167228492435, "loss": 1274.3472, "step": 4050 }, { "ce_loss_10": 3.5713927507400514, "ce_loss_13": 3.4897979974746702, "ce_loss_2": 4.564716410636902, "ce_loss_3": 4.304848039150238, "ce_loss_7": 3.764770579338074, "epoch": 0.406, "grad_norm": 632.0, "kl_loss_10": 190.77299575805665, "kl_loss_2": 2213.3808837890624, "kl_loss_3": 1758.434912109375, "kl_loss_7": 651.0917877197265, "learning_rate": 0.0006545084971874737, "loss": 1244.4535, "step": 4060 }, { "ce_loss_10": 3.5361163854599, "ce_loss_13": 3.446142256259918, "ce_loss_2": 4.613076639175415, "ce_loss_3": 4.341050863265991, "ce_loss_7": 3.7489752054214476, "epoch": 0.407, "grad_norm": 724.0, "kl_loss_10": 204.6956573486328, "kl_loss_2": 2384.869763183594, "kl_loss_3": 1900.13232421875, "kl_loss_7": 693.6003051757813, "learning_rate": 0.0006529987156268526, "loss": 1264.1867, "step": 4070 }, { "ce_loss_10": 3.4604807257652284, "ce_loss_13": 3.3677136301994324, "ce_loss_2": 4.529002094268799, "ce_loss_3": 4.2508728981018065, "ce_loss_7": 3.671179461479187, "epoch": 0.408, "grad_norm": 652.0, "kl_loss_10": 200.91606369018555, "kl_loss_2": 2349.1596496582033, "kl_loss_3": 1858.6745910644531, "kl_loss_7": 678.4067779541016, "learning_rate": 0.0006514873933708637, "loss": 1288.6936, "step": 4080 }, { "ce_loss_10": 3.5669307708740234, "ce_loss_13": 3.4793569445610046, "ce_loss_2": 4.594972729682922, "ce_loss_3": 4.326959013938904, "ce_loss_7": 3.7646772980690004, "epoch": 0.409, "grad_norm": 624.0, "kl_loss_10": 195.33749084472657, "kl_loss_2": 2285.8158325195313, "kl_loss_3": 1812.939276123047, "kl_loss_7": 660.7087280273438, "learning_rate": 0.0006499745456385053, "loss": 1246.0525, "step": 4090 }, { "ce_loss_10": 3.532058572769165, "ce_loss_13": 3.446662437915802, "ce_loss_2": 4.566872882843017, "ce_loss_3": 4.295732653141021, "ce_loss_7": 3.7351000905036926, "epoch": 0.41, "grad_norm": 592.0, "kl_loss_10": 197.3603828430176, "kl_loss_2": 2284.4489013671873, "kl_loss_3": 1809.6247680664062, "kl_loss_7": 669.4995666503906, "learning_rate": 0.0006484601876641375, "loss": 1259.2045, "step": 4100 }, { "ce_loss_10": 3.523475456237793, "ce_loss_13": 3.4387876391410828, "ce_loss_2": 4.524111318588257, "ce_loss_3": 4.253742909431457, "ce_loss_7": 3.7196569561958315, "epoch": 0.411, "grad_norm": 608.0, "kl_loss_10": 194.7115333557129, "kl_loss_2": 2231.6944580078125, "kl_loss_3": 1757.1336669921875, "kl_loss_7": 654.9919372558594, "learning_rate": 0.000646944334697328, "loss": 1224.3209, "step": 4110 }, { "ce_loss_10": 3.6390475988388062, "ce_loss_13": 3.55245840549469, "ce_loss_2": 4.6237300157547, "ce_loss_3": 4.356840944290161, "ce_loss_7": 3.83481205701828, "epoch": 0.412, "grad_norm": 564.0, "kl_loss_10": 194.73427352905273, "kl_loss_2": 2191.641564941406, "kl_loss_3": 1726.6775146484374, "kl_loss_7": 650.4818603515625, "learning_rate": 0.0006454270020026995, "loss": 1203.4656, "step": 4120 }, { "ce_loss_10": 3.6017306566238405, "ce_loss_13": 3.520050418376923, "ce_loss_2": 4.577455329895019, "ce_loss_3": 4.320154881477356, "ce_loss_7": 3.792659568786621, "epoch": 0.413, "grad_norm": 576.0, "kl_loss_10": 189.64769592285157, "kl_loss_2": 2176.6477478027346, "kl_loss_3": 1722.2450378417968, "kl_loss_7": 643.5274627685546, "learning_rate": 0.0006439082048597755, "loss": 1192.4902, "step": 4130 }, { "ce_loss_10": 3.5903451800346375, "ce_loss_13": 3.507824885845184, "ce_loss_2": 4.608788180351257, "ce_loss_3": 4.346335411071777, "ce_loss_7": 3.7915576100349426, "epoch": 0.414, "grad_norm": 580.0, "kl_loss_10": 197.56676864624023, "kl_loss_2": 2267.8357421875, "kl_loss_3": 1807.2357055664063, "kl_loss_7": 666.154751586914, "learning_rate": 0.0006423879585628261, "loss": 1240.4789, "step": 4140 }, { "ce_loss_10": 3.55734179019928, "ce_loss_13": 3.467788887023926, "ce_loss_2": 4.6172443151474, "ce_loss_3": 4.339134466648102, "ce_loss_7": 3.7582768201828003, "epoch": 0.415, "grad_norm": 688.0, "kl_loss_10": 201.20330123901368, "kl_loss_2": 2351.881707763672, "kl_loss_3": 1863.0321411132813, "kl_loss_7": 675.1522674560547, "learning_rate": 0.0006408662784207149, "loss": 1267.4067, "step": 4150 }, { "ce_loss_10": 3.5083068370819093, "ce_loss_13": 3.421711838245392, "ce_loss_2": 4.537308859825134, "ce_loss_3": 4.2711180448532104, "ce_loss_7": 3.708827292919159, "epoch": 0.416, "grad_norm": 696.0, "kl_loss_10": 195.25809020996093, "kl_loss_2": 2288.984796142578, "kl_loss_3": 1822.1060485839844, "kl_loss_7": 665.5764404296875, "learning_rate": 0.0006393431797567439, "loss": 1250.1072, "step": 4160 }, { "ce_loss_10": 3.5913987278938295, "ce_loss_13": 3.509873795509338, "ce_loss_2": 4.573607659339904, "ce_loss_3": 4.318917143344879, "ce_loss_7": 3.7776596546173096, "epoch": 0.417, "grad_norm": 596.0, "kl_loss_10": 194.73566436767578, "kl_loss_2": 2221.0649841308596, "kl_loss_3": 1766.04775390625, "kl_loss_7": 648.3369201660156, "learning_rate": 0.0006378186779084996, "loss": 1190.9323, "step": 4170 }, { "ce_loss_10": 3.4334940314292908, "ce_loss_13": 3.3453728437423704, "ce_loss_2": 4.485861015319824, "ce_loss_3": 4.215418803691864, "ce_loss_7": 3.6394405364990234, "epoch": 0.418, "grad_norm": 676.0, "kl_loss_10": 196.98247756958008, "kl_loss_2": 2312.9372680664064, "kl_loss_3": 1836.5678649902343, "kl_loss_7": 670.5612365722657, "learning_rate": 0.0006362927882276989, "loss": 1261.1342, "step": 4180 }, { "ce_loss_10": 3.622100257873535, "ce_loss_13": 3.5377141356468202, "ce_loss_2": 4.620994114875794, "ce_loss_3": 4.349101042747497, "ce_loss_7": 3.81082307100296, "epoch": 0.419, "grad_norm": 620.0, "kl_loss_10": 192.66633071899415, "kl_loss_2": 2225.1201049804686, "kl_loss_3": 1751.446875, "kl_loss_7": 636.4902648925781, "learning_rate": 0.000634765526080034, "loss": 1194.2031, "step": 4190 }, { "ce_loss_10": 3.626552712917328, "ce_loss_13": 3.5395890951156614, "ce_loss_2": 4.6221943378448485, "ce_loss_3": 4.355586886405945, "ce_loss_7": 3.8178786516189573, "epoch": 0.42, "grad_norm": 612.0, "kl_loss_10": 198.29063568115234, "kl_loss_2": 2233.3197509765623, "kl_loss_3": 1764.3755920410156, "kl_loss_7": 656.7103454589844, "learning_rate": 0.0006332369068450174, "loss": 1207.3598, "step": 4200 }, { "ce_loss_10": 3.5582772374153135, "ce_loss_13": 3.4749309182167054, "ce_loss_2": 4.574929785728455, "ce_loss_3": 4.314627623558044, "ce_loss_7": 3.7553325653076173, "epoch": 0.421, "grad_norm": 588.0, "kl_loss_10": 195.14224548339843, "kl_loss_2": 2252.938916015625, "kl_loss_3": 1800.94208984375, "kl_loss_7": 656.2003112792969, "learning_rate": 0.0006317069459158283, "loss": 1220.2742, "step": 4210 }, { "ce_loss_10": 3.66640442609787, "ce_loss_13": 3.584319996833801, "ce_loss_2": 4.649453711509705, "ce_loss_3": 4.3819632768630985, "ce_loss_7": 3.8596243381500246, "epoch": 0.422, "grad_norm": 592.0, "kl_loss_10": 193.5803665161133, "kl_loss_2": 2195.1602294921877, "kl_loss_3": 1735.2393310546875, "kl_loss_7": 647.3203796386719, "learning_rate": 0.0006301756586991561, "loss": 1218.0184, "step": 4220 }, { "ce_loss_10": 3.452160143852234, "ce_loss_13": 3.3657590985298156, "ce_loss_2": 4.505249190330505, "ce_loss_3": 4.242624092102051, "ce_loss_7": 3.6535696148872376, "epoch": 0.423, "grad_norm": 764.0, "kl_loss_10": 198.62798614501952, "kl_loss_2": 2358.790142822266, "kl_loss_3": 1893.541082763672, "kl_loss_7": 677.0058868408203, "learning_rate": 0.0006286430606150459, "loss": 1264.3267, "step": 4230 }, { "ce_loss_10": 3.6465710401535034, "ce_loss_13": 3.563078057765961, "ce_loss_2": 4.641510224342346, "ce_loss_3": 4.377968907356262, "ce_loss_7": 3.843237745761871, "epoch": 0.424, "grad_norm": 752.0, "kl_loss_10": 199.9485771179199, "kl_loss_2": 2240.6987426757814, "kl_loss_3": 1778.9172729492188, "kl_loss_7": 666.6387664794922, "learning_rate": 0.0006271091670967436, "loss": 1223.7141, "step": 4240 }, { "ce_loss_10": 3.570220148563385, "ce_loss_13": 3.474740993976593, "ce_loss_2": 4.616126585006714, "ce_loss_3": 4.345292592048645, "ce_loss_7": 3.7782423973083494, "epoch": 0.425, "grad_norm": 604.0, "kl_loss_10": 206.9665100097656, "kl_loss_2": 2359.6162658691405, "kl_loss_3": 1878.0119995117188, "kl_loss_7": 699.4417297363282, "learning_rate": 0.0006255739935905395, "loss": 1260.2877, "step": 4250 }, { "ce_loss_10": 3.6002479434013366, "ce_loss_13": 3.5161559224128722, "ce_loss_2": 4.592222595214844, "ce_loss_3": 4.328677630424499, "ce_loss_7": 3.790011668205261, "epoch": 0.426, "grad_norm": 688.0, "kl_loss_10": 196.72698440551758, "kl_loss_2": 2221.3829833984373, "kl_loss_3": 1756.8525756835938, "kl_loss_7": 652.2526733398438, "learning_rate": 0.0006240375555556145, "loss": 1261.0352, "step": 4260 }, { "ce_loss_10": 3.6026462078094483, "ce_loss_13": 3.5168575167655947, "ce_loss_2": 4.6545480489730835, "ce_loss_3": 4.386264157295227, "ce_loss_7": 3.8051093459129333, "epoch": 0.427, "grad_norm": 580.0, "kl_loss_10": 197.9854705810547, "kl_loss_2": 2316.6946411132812, "kl_loss_3": 1838.7398193359375, "kl_loss_7": 667.8762664794922, "learning_rate": 0.000622499868463882, "loss": 1243.157, "step": 4270 }, { "ce_loss_10": 3.574436700344086, "ce_loss_13": 3.4921345114707947, "ce_loss_2": 4.568159365653992, "ce_loss_3": 4.298846364021301, "ce_loss_7": 3.765162992477417, "epoch": 0.428, "grad_norm": 620.0, "kl_loss_10": 194.46619186401367, "kl_loss_2": 2240.061688232422, "kl_loss_3": 1767.6112670898438, "kl_loss_7": 647.0378051757813, "learning_rate": 0.0006209609477998338, "loss": 1226.4191, "step": 4280 }, { "ce_loss_10": 3.6271798372268678, "ce_loss_13": 3.5434911131858824, "ce_loss_2": 4.628253221511841, "ce_loss_3": 4.367080307006836, "ce_loss_7": 3.823750925064087, "epoch": 0.429, "grad_norm": 596.0, "kl_loss_10": 199.08662948608398, "kl_loss_2": 2248.870013427734, "kl_loss_3": 1777.940362548828, "kl_loss_7": 666.140267944336, "learning_rate": 0.0006194208090603844, "loss": 1245.6613, "step": 4290 }, { "ce_loss_10": 3.550025999546051, "ce_loss_13": 3.4652276039123535, "ce_loss_2": 4.55794665813446, "ce_loss_3": 4.292889666557312, "ce_loss_7": 3.7512326836586, "epoch": 0.43, "grad_norm": 696.0, "kl_loss_10": 194.7114356994629, "kl_loss_2": 2238.0842895507812, "kl_loss_3": 1764.372296142578, "kl_loss_7": 659.5233337402344, "learning_rate": 0.0006178794677547138, "loss": 1204.7698, "step": 4300 }, { "ce_loss_10": 3.5732582211494446, "ce_loss_13": 3.4901079893112184, "ce_loss_2": 4.593334412574768, "ce_loss_3": 4.330301976203918, "ce_loss_7": 3.7785701513290406, "epoch": 0.431, "grad_norm": 716.0, "kl_loss_10": 204.0959487915039, "kl_loss_2": 2270.5096435546875, "kl_loss_3": 1810.6998352050782, "kl_loss_7": 680.2683868408203, "learning_rate": 0.0006163369394041111, "loss": 1234.0865, "step": 4310 }, { "ce_loss_10": 3.522435462474823, "ce_loss_13": 3.427918314933777, "ce_loss_2": 4.549873030185699, "ce_loss_3": 4.289526271820068, "ce_loss_7": 3.7253190755844114, "epoch": 0.432, "grad_norm": 800.0, "kl_loss_10": 208.77886199951172, "kl_loss_2": 2301.5377990722654, "kl_loss_3": 1836.5447998046875, "kl_loss_7": 679.7338165283203, "learning_rate": 0.0006147932395418205, "loss": 1277.3873, "step": 4320 }, { "ce_loss_10": 3.5494457960128782, "ce_loss_13": 3.4614178776741027, "ce_loss_2": 4.544875764846802, "ce_loss_3": 4.281032645702362, "ce_loss_7": 3.7431655168533324, "epoch": 0.433, "grad_norm": 576.0, "kl_loss_10": 207.28576126098633, "kl_loss_2": 2223.6774475097654, "kl_loss_3": 1762.4080688476563, "kl_loss_7": 660.7338745117188, "learning_rate": 0.0006132483837128823, "loss": 1209.1447, "step": 4330 }, { "ce_loss_10": 3.5334264755249025, "ce_loss_13": 3.4463666915893554, "ce_loss_2": 4.564454817771912, "ce_loss_3": 4.294960129261017, "ce_loss_7": 3.73456689119339, "epoch": 0.434, "grad_norm": 772.0, "kl_loss_10": 203.00026016235353, "kl_loss_2": 2313.798968505859, "kl_loss_3": 1837.2741760253907, "kl_loss_7": 664.0175872802735, "learning_rate": 0.0006117023874739772, "loss": 1240.8283, "step": 4340 }, { "ce_loss_10": 3.5215348839759826, "ce_loss_13": 3.4345417499542235, "ce_loss_2": 4.554822826385498, "ce_loss_3": 4.285668563842774, "ce_loss_7": 3.7274380683898927, "epoch": 0.435, "grad_norm": 600.0, "kl_loss_10": 199.90404205322267, "kl_loss_2": 2303.0773803710936, "kl_loss_3": 1827.208251953125, "kl_loss_7": 672.8900817871094, "learning_rate": 0.0006101552663932703, "loss": 1260.7756, "step": 4350 }, { "ce_loss_10": 3.554371106624603, "ce_loss_13": 3.4667278289794923, "ce_loss_2": 4.5602539539337155, "ce_loss_3": 4.297711455821991, "ce_loss_7": 3.74671790599823, "epoch": 0.436, "grad_norm": 664.0, "kl_loss_10": 201.51847763061522, "kl_loss_2": 2254.3937927246093, "kl_loss_3": 1790.253662109375, "kl_loss_7": 662.9543731689453, "learning_rate": 0.0006086070360502539, "loss": 1241.8814, "step": 4360 }, { "ce_loss_10": 3.5543460965156557, "ce_loss_13": 3.470974051952362, "ce_loss_2": 4.571776509284973, "ce_loss_3": 4.305854046344757, "ce_loss_7": 3.7547166466712953, "epoch": 0.437, "grad_norm": 608.0, "kl_loss_10": 196.51984329223632, "kl_loss_2": 2276.0406494140625, "kl_loss_3": 1801.6470031738281, "kl_loss_7": 659.9133270263671, "learning_rate": 0.0006070577120355903, "loss": 1236.9521, "step": 4370 }, { "ce_loss_10": 3.5628577947616575, "ce_loss_13": 3.47283878326416, "ce_loss_2": 4.547589898109436, "ce_loss_3": 4.279985129833221, "ce_loss_7": 3.7624007940292357, "epoch": 0.438, "grad_norm": 700.0, "kl_loss_10": 200.08970794677734, "kl_loss_2": 2194.6991455078123, "kl_loss_3": 1728.643865966797, "kl_loss_7": 657.7827362060547, "learning_rate": 0.0006055073099509549, "loss": 1218.3828, "step": 4380 }, { "ce_loss_10": 3.6181469678878786, "ce_loss_13": 3.531364715099335, "ce_loss_2": 4.607781720161438, "ce_loss_3": 4.3447977781295775, "ce_loss_7": 3.8072004199028013, "epoch": 0.439, "grad_norm": 616.0, "kl_loss_10": 200.97432174682618, "kl_loss_2": 2223.8163146972656, "kl_loss_3": 1756.1592224121093, "kl_loss_7": 652.6859497070312, "learning_rate": 0.0006039558454088796, "loss": 1239.9502, "step": 4390 }, { "ce_loss_10": 3.598993420600891, "ce_loss_13": 3.508393979072571, "ce_loss_2": 4.611153769493103, "ce_loss_3": 4.343827414512634, "ce_loss_7": 3.798681151866913, "epoch": 0.44, "grad_norm": 636.0, "kl_loss_10": 207.16089324951173, "kl_loss_2": 2267.215954589844, "kl_loss_3": 1798.3621337890625, "kl_loss_7": 665.1716247558594, "learning_rate": 0.0006024033340325954, "loss": 1210.7668, "step": 4400 }, { "ce_loss_10": 3.6592474579811096, "ce_loss_13": 3.575475811958313, "ce_loss_2": 4.615442514419556, "ce_loss_3": 4.356250524520874, "ce_loss_7": 3.841563415527344, "epoch": 0.441, "grad_norm": 564.0, "kl_loss_10": 192.91486740112305, "kl_loss_2": 2138.478411865234, "kl_loss_3": 1682.6779296875, "kl_loss_7": 628.4862640380859, "learning_rate": 0.0006008497914558743, "loss": 1188.8043, "step": 4410 }, { "ce_loss_10": 3.603752911090851, "ce_loss_13": 3.514535641670227, "ce_loss_2": 4.619881939888001, "ce_loss_3": 4.351648759841919, "ce_loss_7": 3.8029965996742248, "epoch": 0.442, "grad_norm": 680.0, "kl_loss_10": 203.31059799194335, "kl_loss_2": 2279.580682373047, "kl_loss_3": 1800.9931640625, "kl_loss_7": 667.106298828125, "learning_rate": 0.0005992952333228728, "loss": 1234.8536, "step": 4420 }, { "ce_loss_10": 3.5360623002052307, "ce_loss_13": 3.452274763584137, "ce_loss_2": 4.555125761032104, "ce_loss_3": 4.292830312252045, "ce_loss_7": 3.7339815139770507, "epoch": 0.443, "grad_norm": 660.0, "kl_loss_10": 193.53399200439452, "kl_loss_2": 2284.7425048828127, "kl_loss_3": 1820.065606689453, "kl_loss_7": 662.5294036865234, "learning_rate": 0.0005977396752879741, "loss": 1233.2003, "step": 4430 }, { "ce_loss_10": 3.4606794357299804, "ce_loss_13": 3.377534472942352, "ce_loss_2": 4.48543610572815, "ce_loss_3": 4.220411324501038, "ce_loss_7": 3.6664485812187193, "epoch": 0.444, "grad_norm": 580.0, "kl_loss_10": 191.26479797363282, "kl_loss_2": 2280.07265625, "kl_loss_3": 1810.239013671875, "kl_loss_7": 656.8585327148437, "learning_rate": 0.0005961831330156305, "loss": 1222.7674, "step": 4440 }, { "ce_loss_10": 3.603837263584137, "ce_loss_13": 3.5208237767219543, "ce_loss_2": 4.638635230064392, "ce_loss_3": 4.366818988323212, "ce_loss_7": 3.8011784672737123, "epoch": 0.445, "grad_norm": 652.0, "kl_loss_10": 193.8144203186035, "kl_loss_2": 2316.0056640625, "kl_loss_3": 1833.1619812011718, "kl_loss_7": 659.0817749023438, "learning_rate": 0.0005946256221802051, "loss": 1263.1171, "step": 4450 }, { "ce_loss_10": 3.5832170486450194, "ce_loss_13": 3.5048667788505554, "ce_loss_2": 4.5584005355834964, "ce_loss_3": 4.296856260299682, "ce_loss_7": 3.7672229290008543, "epoch": 0.446, "grad_norm": 700.0, "kl_loss_10": 189.50232849121093, "kl_loss_2": 2181.445458984375, "kl_loss_3": 1725.645037841797, "kl_loss_7": 639.0539123535157, "learning_rate": 0.0005930671584658151, "loss": 1259.6497, "step": 4460 }, { "ce_loss_10": 3.5820990085601805, "ce_loss_13": 3.4986414194107054, "ce_loss_2": 4.585966444015503, "ce_loss_3": 4.327683901786804, "ce_loss_7": 3.778271293640137, "epoch": 0.447, "grad_norm": 624.0, "kl_loss_10": 192.16329650878907, "kl_loss_2": 2241.6711364746093, "kl_loss_3": 1786.2928161621094, "kl_loss_7": 656.8107452392578, "learning_rate": 0.0005915077575661722, "loss": 1237.7033, "step": 4470 }, { "ce_loss_10": 3.601723861694336, "ce_loss_13": 3.5175135850906374, "ce_loss_2": 4.623058772087097, "ce_loss_3": 4.352630817890168, "ce_loss_7": 3.801686096191406, "epoch": 0.448, "grad_norm": 576.0, "kl_loss_10": 197.77509002685548, "kl_loss_2": 2287.8472229003905, "kl_loss_3": 1814.064471435547, "kl_loss_7": 669.7168884277344, "learning_rate": 0.000589947435184427, "loss": 1221.476, "step": 4480 }, { "ce_loss_10": 3.667625939846039, "ce_loss_13": 3.5879098773002625, "ce_loss_2": 4.623752212524414, "ce_loss_3": 4.3639614343643185, "ce_loss_7": 3.854480040073395, "epoch": 0.449, "grad_norm": 676.0, "kl_loss_10": 191.73966369628906, "kl_loss_2": 2169.5962097167967, "kl_loss_3": 1711.8840759277343, "kl_loss_7": 644.2419403076171, "learning_rate": 0.0005883862070330078, "loss": 1205.0104, "step": 4490 }, { "ce_loss_10": 3.5975982904434205, "ce_loss_13": 3.5138633131980894, "ce_loss_2": 4.596203637123108, "ce_loss_3": 4.342746245861053, "ce_loss_7": 3.7984990000724794, "epoch": 0.45, "grad_norm": 680.0, "kl_loss_10": 192.790771484375, "kl_loss_2": 2245.34130859375, "kl_loss_3": 1787.1853515625, "kl_loss_7": 655.1883148193359, "learning_rate": 0.0005868240888334653, "loss": 1211.5924, "step": 4500 }, { "ce_loss_10": 3.484956693649292, "ce_loss_13": 3.3994694352149963, "ce_loss_2": 4.541581082344055, "ce_loss_3": 4.265275609493256, "ce_loss_7": 3.685898816585541, "epoch": 0.451, "grad_norm": 664.0, "kl_loss_10": 197.9249183654785, "kl_loss_2": 2329.778839111328, "kl_loss_3": 1847.1219604492187, "kl_loss_7": 669.9408386230468, "learning_rate": 0.0005852610963163119, "loss": 1246.2838, "step": 4510 }, { "ce_loss_10": 3.506740427017212, "ce_loss_13": 3.425083673000336, "ce_loss_2": 4.510421705245972, "ce_loss_3": 4.246485769748688, "ce_loss_7": 3.6969696044921876, "epoch": 0.452, "grad_norm": 600.0, "kl_loss_10": 188.6581832885742, "kl_loss_2": 2238.5858459472656, "kl_loss_3": 1770.7893127441407, "kl_loss_7": 646.1408782958985, "learning_rate": 0.0005836972452208654, "loss": 1201.6553, "step": 4520 }, { "ce_loss_10": 3.505844843387604, "ce_loss_13": 3.4249507427215575, "ce_loss_2": 4.529585886001587, "ce_loss_3": 4.277110803127289, "ce_loss_7": 3.708256196975708, "epoch": 0.453, "grad_norm": 668.0, "kl_loss_10": 193.22399291992187, "kl_loss_2": 2265.8468383789063, "kl_loss_3": 1815.4268432617187, "kl_loss_7": 656.5519592285157, "learning_rate": 0.0005821325512950885, "loss": 1236.8736, "step": 4530 }, { "ce_loss_10": 3.5389772057533264, "ce_loss_13": 3.4585880279541015, "ce_loss_2": 4.540098547935486, "ce_loss_3": 4.2798211693763735, "ce_loss_7": 3.7288518071174623, "epoch": 0.454, "grad_norm": 592.0, "kl_loss_10": 187.7821243286133, "kl_loss_2": 2205.198779296875, "kl_loss_3": 1748.45341796875, "kl_loss_7": 639.7683776855469, "learning_rate": 0.0005805670302954321, "loss": 1221.9566, "step": 4540 }, { "ce_loss_10": 3.544492793083191, "ce_loss_13": 3.4652194142341615, "ce_loss_2": 4.541022229194641, "ce_loss_3": 4.279520082473755, "ce_loss_7": 3.7328044533729554, "epoch": 0.455, "grad_norm": 656.0, "kl_loss_10": 186.06844177246094, "kl_loss_2": 2226.980224609375, "kl_loss_3": 1765.562744140625, "kl_loss_7": 639.6060729980469, "learning_rate": 0.000579000697986675, "loss": 1199.4846, "step": 4550 }, { "ce_loss_10": 3.5037956118583677, "ce_loss_13": 3.4134857773780825, "ce_loss_2": 4.544147634506226, "ce_loss_3": 4.274082601070404, "ce_loss_7": 3.707910752296448, "epoch": 0.456, "grad_norm": 664.0, "kl_loss_10": 200.43186416625977, "kl_loss_2": 2315.5464111328124, "kl_loss_3": 1832.4145568847657, "kl_loss_7": 672.0404296875, "learning_rate": 0.0005774335701417662, "loss": 1229.2445, "step": 4560 }, { "ce_loss_10": 3.4942433714866636, "ce_loss_13": 3.4095874786376954, "ce_loss_2": 4.549353170394897, "ce_loss_3": 4.279058015346527, "ce_loss_7": 3.693026268482208, "epoch": 0.457, "grad_norm": 608.0, "kl_loss_10": 192.1516143798828, "kl_loss_2": 2342.186248779297, "kl_loss_3": 1864.820654296875, "kl_loss_7": 655.2123260498047, "learning_rate": 0.0005758656625416658, "loss": 1241.1571, "step": 4570 }, { "ce_loss_10": 3.5480048656463623, "ce_loss_13": 3.4622543811798097, "ce_loss_2": 4.561884355545044, "ce_loss_3": 4.293217277526855, "ce_loss_7": 3.743514323234558, "epoch": 0.458, "grad_norm": 616.0, "kl_loss_10": 194.93896102905273, "kl_loss_2": 2260.48984375, "kl_loss_3": 1786.8557556152343, "kl_loss_7": 654.7685607910156, "learning_rate": 0.0005742969909751859, "loss": 1199.7715, "step": 4580 }, { "ce_loss_10": 3.558157193660736, "ce_loss_13": 3.4740110039711, "ce_loss_2": 4.582901740074158, "ce_loss_3": 4.310809695720673, "ce_loss_7": 3.7480292677879334, "epoch": 0.459, "grad_norm": 636.0, "kl_loss_10": 193.16277923583985, "kl_loss_2": 2285.9891052246094, "kl_loss_3": 1800.92705078125, "kl_loss_7": 648.6197357177734, "learning_rate": 0.0005727275712388318, "loss": 1238.3732, "step": 4590 }, { "ce_loss_10": 3.5862102270126344, "ce_loss_13": 3.509055662155151, "ce_loss_2": 4.560896277427673, "ce_loss_3": 4.298801875114441, "ce_loss_7": 3.773897314071655, "epoch": 0.46, "grad_norm": 768.0, "kl_loss_10": 186.60687026977538, "kl_loss_2": 2190.591516113281, "kl_loss_3": 1728.75283203125, "kl_loss_7": 633.5937683105469, "learning_rate": 0.0005711574191366427, "loss": 1204.0141, "step": 4600 }, { "ce_loss_10": 3.537917101383209, "ce_loss_13": 3.456929898262024, "ce_loss_2": 4.532997250556946, "ce_loss_3": 4.271801400184631, "ce_loss_7": 3.7239314556121825, "epoch": 0.461, "grad_norm": 544.0, "kl_loss_10": 188.38971405029298, "kl_loss_2": 2244.5726928710938, "kl_loss_3": 1779.3487548828125, "kl_loss_7": 643.0867309570312, "learning_rate": 0.0005695865504800327, "loss": 1208.6229, "step": 4610 }, { "ce_loss_10": 3.475814175605774, "ce_loss_13": 3.3895989418029786, "ce_loss_2": 4.570864033699036, "ce_loss_3": 4.298530387878418, "ce_loss_7": 3.6918977737426757, "epoch": 0.462, "grad_norm": 688.0, "kl_loss_10": 199.44021301269532, "kl_loss_2": 2396.831396484375, "kl_loss_3": 1919.1037109375, "kl_loss_7": 685.7258270263671, "learning_rate": 0.0005680149810876322, "loss": 1259.1618, "step": 4620 }, { "ce_loss_10": 3.5307737231254577, "ce_loss_13": 3.448805606365204, "ce_loss_2": 4.553147649765014, "ce_loss_3": 4.283793473243714, "ce_loss_7": 3.720176661014557, "epoch": 0.463, "grad_norm": 632.0, "kl_loss_10": 191.36487274169923, "kl_loss_2": 2267.567822265625, "kl_loss_3": 1802.5030578613282, "kl_loss_7": 648.5958099365234, "learning_rate": 0.0005664427267851271, "loss": 1217.3594, "step": 4630 }, { "ce_loss_10": 3.4447478532791136, "ce_loss_13": 3.362277901172638, "ce_loss_2": 4.474937617778778, "ce_loss_3": 4.203511357307434, "ce_loss_7": 3.640981078147888, "epoch": 0.464, "grad_norm": 616.0, "kl_loss_10": 189.61345367431642, "kl_loss_2": 2284.305810546875, "kl_loss_3": 1801.5720520019531, "kl_loss_7": 647.2827972412109, "learning_rate": 0.0005648698034051009, "loss": 1216.2738, "step": 4640 }, { "ce_loss_10": 3.5612680554389953, "ce_loss_13": 3.479226899147034, "ce_loss_2": 4.606190347671509, "ce_loss_3": 4.343288254737854, "ce_loss_7": 3.7559192776679993, "epoch": 0.465, "grad_norm": 680.0, "kl_loss_10": 189.31488800048828, "kl_loss_2": 2300.2595642089846, "kl_loss_3": 1835.63125, "kl_loss_7": 647.0011413574218, "learning_rate": 0.0005632962267868747, "loss": 1204.3232, "step": 4650 }, { "ce_loss_10": 3.504312825202942, "ce_loss_13": 3.4246782064437866, "ce_loss_2": 4.501714015007019, "ce_loss_3": 4.243509244918823, "ce_loss_7": 3.6963974952697756, "epoch": 0.466, "grad_norm": 656.0, "kl_loss_10": 184.82376022338866, "kl_loss_2": 2221.081378173828, "kl_loss_3": 1770.11845703125, "kl_loss_7": 636.5809020996094, "learning_rate": 0.0005617220127763474, "loss": 1219.1382, "step": 4660 }, { "ce_loss_10": 3.578074049949646, "ce_loss_13": 3.497161865234375, "ce_loss_2": 4.561417579650879, "ce_loss_3": 4.303230881690979, "ce_loss_7": 3.7666924834251403, "epoch": 0.467, "grad_norm": 592.0, "kl_loss_10": 188.17724151611327, "kl_loss_2": 2198.6551513671875, "kl_loss_3": 1739.3878234863282, "kl_loss_7": 638.6716613769531, "learning_rate": 0.0005601471772258368, "loss": 1209.8152, "step": 4670 }, { "ce_loss_10": 3.5602858781814577, "ce_loss_13": 3.4812931418418884, "ce_loss_2": 4.544128322601319, "ce_loss_3": 4.283940744400025, "ce_loss_7": 3.750748324394226, "epoch": 0.468, "grad_norm": 684.0, "kl_loss_10": 186.29373779296876, "kl_loss_2": 2186.011083984375, "kl_loss_3": 1724.9905029296874, "kl_loss_7": 634.5341583251953, "learning_rate": 0.0005585717359939192, "loss": 1216.8666, "step": 4680 }, { "ce_loss_10": 3.47387490272522, "ce_loss_13": 3.3916377425193787, "ce_loss_2": 4.47896523475647, "ce_loss_3": 4.213813447952271, "ce_loss_7": 3.6638592004776003, "epoch": 0.469, "grad_norm": 736.0, "kl_loss_10": 187.3494743347168, "kl_loss_2": 2222.502734375, "kl_loss_3": 1755.6538635253905, "kl_loss_7": 638.4273468017578, "learning_rate": 0.0005569957049452703, "loss": 1235.6265, "step": 4690 }, { "ce_loss_10": 3.530002760887146, "ce_loss_13": 3.4474871516227723, "ce_loss_2": 4.558400893211365, "ce_loss_3": 4.2877805709838865, "ce_loss_7": 3.7245721340179445, "epoch": 0.47, "grad_norm": 704.0, "kl_loss_10": 192.37612838745116, "kl_loss_2": 2285.8403198242186, "kl_loss_3": 1808.7154968261718, "kl_loss_7": 653.8845581054687, "learning_rate": 0.0005554190999505056, "loss": 1234.8666, "step": 4700 }, { "ce_loss_10": 3.655286133289337, "ce_loss_13": 3.5717312812805178, "ce_loss_2": 4.666804194450378, "ce_loss_3": 4.405120444297791, "ce_loss_7": 3.852936863899231, "epoch": 0.471, "grad_norm": 612.0, "kl_loss_10": 194.36407165527345, "kl_loss_2": 2267.82900390625, "kl_loss_3": 1798.198681640625, "kl_loss_7": 661.5685516357422, "learning_rate": 0.0005538419368860196, "loss": 1183.023, "step": 4710 }, { "ce_loss_10": 3.5788578033447265, "ce_loss_13": 3.498483991622925, "ce_loss_2": 4.574405527114868, "ce_loss_3": 4.313237249851227, "ce_loss_7": 3.768782043457031, "epoch": 0.472, "grad_norm": 600.0, "kl_loss_10": 190.92964248657228, "kl_loss_2": 2231.130651855469, "kl_loss_3": 1765.2990478515626, "kl_loss_7": 643.7991912841796, "learning_rate": 0.0005522642316338268, "loss": 1233.693, "step": 4720 }, { "ce_loss_10": 3.581640887260437, "ce_loss_13": 3.5026119351387024, "ce_loss_2": 4.585021948814392, "ce_loss_3": 4.325532901287079, "ce_loss_7": 3.7722782731056212, "epoch": 0.473, "grad_norm": 608.0, "kl_loss_10": 190.94201431274413, "kl_loss_2": 2235.0365600585938, "kl_loss_3": 1776.091015625, "kl_loss_7": 644.9752258300781, "learning_rate": 0.0005506860000814017, "loss": 1245.2729, "step": 4730 }, { "ce_loss_10": 3.609380042552948, "ce_loss_13": 3.5285964608192444, "ce_loss_2": 4.5732040166854855, "ce_loss_3": 4.316051697731018, "ce_loss_7": 3.7950204849243163, "epoch": 0.474, "grad_norm": 624.0, "kl_loss_10": 185.59933853149414, "kl_loss_2": 2152.8808044433595, "kl_loss_3": 1698.98193359375, "kl_loss_7": 630.6379913330078, "learning_rate": 0.0005491072581215186, "loss": 1197.5367, "step": 4740 }, { "ce_loss_10": 3.6150610566139223, "ce_loss_13": 3.5275300979614257, "ce_loss_2": 4.606984066963196, "ce_loss_3": 4.331383717060089, "ce_loss_7": 3.8067922830581664, "epoch": 0.475, "grad_norm": 636.0, "kl_loss_10": 196.42518692016603, "kl_loss_2": 2246.8279663085937, "kl_loss_3": 1758.9309448242188, "kl_loss_7": 653.222998046875, "learning_rate": 0.0005475280216520913, "loss": 1187.7061, "step": 4750 }, { "ce_loss_10": 3.5246535778045653, "ce_loss_13": 3.4453783988952638, "ce_loss_2": 4.515398740768433, "ce_loss_3": 4.251828491687775, "ce_loss_7": 3.7125940799713133, "epoch": 0.476, "grad_norm": 660.0, "kl_loss_10": 186.9199966430664, "kl_loss_2": 2199.839562988281, "kl_loss_3": 1734.392041015625, "kl_loss_7": 632.1179809570312, "learning_rate": 0.0005459483065760138, "loss": 1229.7142, "step": 4760 }, { "ce_loss_10": 3.4620707392692567, "ce_loss_13": 3.379168164730072, "ce_loss_2": 4.535378384590149, "ce_loss_3": 4.269984316825867, "ce_loss_7": 3.66820707321167, "epoch": 0.477, "grad_norm": 668.0, "kl_loss_10": 189.84198379516602, "kl_loss_2": 2346.7093200683594, "kl_loss_3": 1881.5502380371095, "kl_loss_7": 655.4429504394532, "learning_rate": 0.0005443681288009991, "loss": 1231.516, "step": 4770 }, { "ce_loss_10": 3.5201017260551453, "ce_loss_13": 3.4394583463668824, "ce_loss_2": 4.551519656181336, "ce_loss_3": 4.275486898422241, "ce_loss_7": 3.712023985385895, "epoch": 0.478, "grad_norm": 560.0, "kl_loss_10": 188.47934265136718, "kl_loss_2": 2298.261828613281, "kl_loss_3": 1807.319403076172, "kl_loss_7": 646.2581420898438, "learning_rate": 0.0005427875042394199, "loss": 1231.2074, "step": 4780 }, { "ce_loss_10": 3.5546525955200194, "ce_loss_13": 3.4689871072769165, "ce_loss_2": 4.55171308517456, "ce_loss_3": 4.2830651044845585, "ce_loss_7": 3.7494895100593566, "epoch": 0.479, "grad_norm": 568.0, "kl_loss_10": 193.1684341430664, "kl_loss_2": 2223.558709716797, "kl_loss_3": 1744.958233642578, "kl_loss_7": 652.0952423095703, "learning_rate": 0.0005412064488081482, "loss": 1232.2334, "step": 4790 }, { "ce_loss_10": 3.560468685626984, "ce_loss_13": 3.4794244885444643, "ce_loss_2": 4.549170875549317, "ce_loss_3": 4.280433797836304, "ce_loss_7": 3.744898808002472, "epoch": 0.48, "grad_norm": 548.0, "kl_loss_10": 188.24676589965821, "kl_loss_2": 2217.6575561523437, "kl_loss_3": 1743.2784423828125, "kl_loss_7": 636.4865295410157, "learning_rate": 0.0005396249784283942, "loss": 1197.0651, "step": 4800 }, { "ce_loss_10": 3.575687527656555, "ce_loss_13": 3.4918730735778807, "ce_loss_2": 4.614717435836792, "ce_loss_3": 4.347899675369263, "ce_loss_7": 3.7766286730766296, "epoch": 0.481, "grad_norm": 592.0, "kl_loss_10": 195.0629508972168, "kl_loss_2": 2307.5621826171873, "kl_loss_3": 1836.4153686523437, "kl_loss_7": 665.144580078125, "learning_rate": 0.0005380431090255476, "loss": 1235.3045, "step": 4810 }, { "ce_loss_10": 3.565406787395477, "ce_loss_13": 3.487363612651825, "ce_loss_2": 4.546458888053894, "ce_loss_3": 4.2899955153465275, "ce_loss_7": 3.747445857524872, "epoch": 0.482, "grad_norm": 608.0, "kl_loss_10": 183.49071578979493, "kl_loss_2": 2200.6481811523436, "kl_loss_3": 1737.9393493652344, "kl_loss_7": 622.3307556152344, "learning_rate": 0.0005364608565290155, "loss": 1189.2841, "step": 4820 }, { "ce_loss_10": 3.5748016953468325, "ce_loss_13": 3.493953990936279, "ce_loss_2": 4.58906729221344, "ce_loss_3": 4.324404489994049, "ce_loss_7": 3.7643205761909484, "epoch": 0.483, "grad_norm": 640.0, "kl_loss_10": 190.96404800415038, "kl_loss_2": 2251.1127075195313, "kl_loss_3": 1785.181817626953, "kl_loss_7": 641.9654663085937, "learning_rate": 0.0005348782368720626, "loss": 1217.6031, "step": 4830 }, { "ce_loss_10": 3.5082598328590393, "ce_loss_13": 3.427862787246704, "ce_loss_2": 4.508589172363282, "ce_loss_3": 4.243466067314148, "ce_loss_7": 3.6949036836624147, "epoch": 0.484, "grad_norm": 560.0, "kl_loss_10": 186.74840545654297, "kl_loss_2": 2224.133184814453, "kl_loss_3": 1753.7203369140625, "kl_loss_7": 630.4135833740235, "learning_rate": 0.000533295265991652, "loss": 1216.8205, "step": 4840 }, { "ce_loss_10": 3.5815645456314087, "ce_loss_13": 3.4982495427131655, "ce_loss_2": 4.554982018470764, "ce_loss_3": 4.299691355228424, "ce_loss_7": 3.7735238790512087, "epoch": 0.485, "grad_norm": 584.0, "kl_loss_10": 187.77717666625978, "kl_loss_2": 2175.573095703125, "kl_loss_3": 1715.7588928222656, "kl_loss_7": 631.838656616211, "learning_rate": 0.0005317119598282822, "loss": 1183.9046, "step": 4850 }, { "ce_loss_10": 3.586243951320648, "ce_loss_13": 3.5034523725509645, "ce_loss_2": 4.583103036880493, "ce_loss_3": 4.312316799163819, "ce_loss_7": 3.777419722080231, "epoch": 0.486, "grad_norm": 648.0, "kl_loss_10": 189.01727676391602, "kl_loss_2": 2203.9095703125, "kl_loss_3": 1739.787420654297, "kl_loss_7": 638.8051147460938, "learning_rate": 0.0005301283343258293, "loss": 1199.0793, "step": 4860 }, { "ce_loss_10": 3.644785749912262, "ce_loss_13": 3.563555288314819, "ce_loss_2": 4.610913848876953, "ce_loss_3": 4.34997011423111, "ce_loss_7": 3.832633006572723, "epoch": 0.487, "grad_norm": 648.0, "kl_loss_10": 187.26018371582032, "kl_loss_2": 2164.9867431640623, "kl_loss_3": 1703.8299499511718, "kl_loss_7": 629.719970703125, "learning_rate": 0.000528544405431384, "loss": 1174.2795, "step": 4870 }, { "ce_loss_10": 3.5308486342430117, "ce_loss_13": 3.4465184569358827, "ce_loss_2": 4.54223735332489, "ce_loss_3": 4.275557327270508, "ce_loss_7": 3.728735053539276, "epoch": 0.488, "grad_norm": 692.0, "kl_loss_10": 194.1014518737793, "kl_loss_2": 2267.4865783691407, "kl_loss_3": 1794.8980224609375, "kl_loss_7": 653.2649841308594, "learning_rate": 0.000526960189095093, "loss": 1222.7201, "step": 4880 }, { "ce_loss_10": 3.5016911029815674, "ce_loss_13": 3.422479736804962, "ce_loss_2": 4.5065477132797245, "ce_loss_3": 4.244668066501617, "ce_loss_7": 3.6958303570747377, "epoch": 0.489, "grad_norm": 624.0, "kl_loss_10": 185.53594131469725, "kl_loss_2": 2219.627575683594, "kl_loss_3": 1760.8566650390626, "kl_loss_7": 633.8929748535156, "learning_rate": 0.0005253757012699972, "loss": 1199.7284, "step": 4890 }, { "ce_loss_10": 3.592365336418152, "ce_loss_13": 3.5133956909179687, "ce_loss_2": 4.582755160331726, "ce_loss_3": 4.310234916210175, "ce_loss_7": 3.779422330856323, "epoch": 0.49, "grad_norm": 608.0, "kl_loss_10": 188.63387451171874, "kl_loss_2": 2196.80634765625, "kl_loss_3": 1721.877880859375, "kl_loss_7": 628.9239440917969, "learning_rate": 0.0005237909579118712, "loss": 1209.9893, "step": 4900 }, { "ce_loss_10": 3.5542251110076903, "ce_loss_13": 3.470878171920776, "ce_loss_2": 4.575217127799988, "ce_loss_3": 4.311789894104004, "ce_loss_7": 3.748956894874573, "epoch": 0.491, "grad_norm": 688.0, "kl_loss_10": 192.452791595459, "kl_loss_2": 2289.5204833984376, "kl_loss_3": 1820.7321044921875, "kl_loss_7": 654.6072784423828, "learning_rate": 0.0005222059749790631, "loss": 1232.3309, "step": 4910 }, { "ce_loss_10": 3.6172361254692076, "ce_loss_13": 3.538671875, "ce_loss_2": 4.572561645507813, "ce_loss_3": 4.3095086216926575, "ce_loss_7": 3.7990201711654663, "epoch": 0.492, "grad_norm": 580.0, "kl_loss_10": 186.14958953857422, "kl_loss_2": 2152.0723571777344, "kl_loss_3": 1686.5931701660156, "kl_loss_7": 627.8851165771484, "learning_rate": 0.0005206207684323337, "loss": 1161.1154, "step": 4920 }, { "ce_loss_10": 3.597834038734436, "ce_loss_13": 3.5186328291893005, "ce_loss_2": 4.576415348052978, "ce_loss_3": 4.318524956703186, "ce_loss_7": 3.7833918571472167, "epoch": 0.493, "grad_norm": 680.0, "kl_loss_10": 190.28093643188475, "kl_loss_2": 2205.189178466797, "kl_loss_3": 1744.895166015625, "kl_loss_7": 637.9018249511719, "learning_rate": 0.000519035354234695, "loss": 1221.5055, "step": 4930 }, { "ce_loss_10": 3.5777213335037232, "ce_loss_13": 3.4926177620887757, "ce_loss_2": 4.569476509094239, "ce_loss_3": 4.300305211544037, "ce_loss_7": 3.7719446539878847, "epoch": 0.494, "grad_norm": 652.0, "kl_loss_10": 191.98795700073242, "kl_loss_2": 2217.8314697265623, "kl_loss_3": 1735.119366455078, "kl_loss_7": 648.0345703125, "learning_rate": 0.0005174497483512506, "loss": 1188.0275, "step": 4940 }, { "ce_loss_10": 3.617672252655029, "ce_loss_13": 3.5411610841751098, "ce_loss_2": 4.595355463027954, "ce_loss_3": 4.32707976102829, "ce_loss_7": 3.8022171378135683, "epoch": 0.495, "grad_norm": 704.0, "kl_loss_10": 185.97076797485352, "kl_loss_2": 2193.0468505859376, "kl_loss_3": 1726.8812622070313, "kl_loss_7": 638.2306701660157, "learning_rate": 0.0005158639667490339, "loss": 1220.6553, "step": 4950 }, { "ce_loss_10": 3.5151694416999817, "ce_loss_13": 3.4326966643333434, "ce_loss_2": 4.5227725267410275, "ce_loss_3": 4.255635476112365, "ce_loss_7": 3.710303211212158, "epoch": 0.496, "grad_norm": 632.0, "kl_loss_10": 189.1722068786621, "kl_loss_2": 2227.908306884766, "kl_loss_3": 1751.7201293945313, "kl_loss_7": 643.16630859375, "learning_rate": 0.0005142780253968481, "loss": 1203.2568, "step": 4960 }, { "ce_loss_10": 3.4694177746772765, "ce_loss_13": 3.3919921875, "ce_loss_2": 4.455039215087891, "ce_loss_3": 4.192563462257385, "ce_loss_7": 3.6608409881591797, "epoch": 0.497, "grad_norm": 672.0, "kl_loss_10": 182.45398559570313, "kl_loss_2": 2196.9568115234374, "kl_loss_3": 1734.4176452636718, "kl_loss_7": 624.9595611572265, "learning_rate": 0.0005126919402651053, "loss": 1165.1617, "step": 4970 }, { "ce_loss_10": 3.5411869525909423, "ce_loss_13": 3.4560129284858703, "ce_loss_2": 4.551884508132934, "ce_loss_3": 4.285728931427002, "ce_loss_7": 3.730460357666016, "epoch": 0.498, "grad_norm": 612.0, "kl_loss_10": 190.1128143310547, "kl_loss_2": 2234.540148925781, "kl_loss_3": 1768.0274841308594, "kl_loss_7": 642.9938751220703, "learning_rate": 0.0005111057273256647, "loss": 1218.0719, "step": 4980 }, { "ce_loss_10": 3.640482187271118, "ce_loss_13": 3.563759708404541, "ce_loss_2": 4.559627389907837, "ce_loss_3": 4.304729497432708, "ce_loss_7": 3.809755790233612, "epoch": 0.499, "grad_norm": 600.0, "kl_loss_10": 181.2877067565918, "kl_loss_2": 2076.4399169921876, "kl_loss_3": 1633.4311096191407, "kl_loss_7": 606.3493682861329, "learning_rate": 0.0005095194025516733, "loss": 1149.4782, "step": 4990 }, { "ce_loss_10": 3.561459171772003, "ce_loss_13": 3.485899102687836, "ce_loss_2": 4.532869434356689, "ce_loss_3": 4.273192000389099, "ce_loss_7": 3.7427910447120665, "epoch": 0.5, "grad_norm": 612.0, "kl_loss_10": 182.62270965576172, "kl_loss_2": 2161.449591064453, "kl_loss_3": 1706.75859375, "kl_loss_7": 617.4605316162109, "learning_rate": 0.000507932981917404, "loss": 1217.3309, "step": 5000 }, { "ce_loss_10": 3.518963348865509, "ce_loss_13": 3.4364115476608275, "ce_loss_2": 4.566620469093323, "ce_loss_3": 4.296507096290588, "ce_loss_7": 3.7167017698287963, "epoch": 0.501, "grad_norm": 604.0, "kl_loss_10": 191.43318862915038, "kl_loss_2": 2312.919299316406, "kl_loss_3": 1835.5362060546875, "kl_loss_7": 654.5825164794921, "learning_rate": 0.0005063464813980949, "loss": 1243.5809, "step": 5010 }, { "ce_loss_10": 3.503278911113739, "ce_loss_13": 3.423468828201294, "ce_loss_2": 4.508650994300842, "ce_loss_3": 4.244243478775024, "ce_loss_7": 3.6842400670051574, "epoch": 0.502, "grad_norm": 616.0, "kl_loss_10": 187.45429153442382, "kl_loss_2": 2242.6967956542967, "kl_loss_3": 1780.9238586425781, "kl_loss_7": 636.0354858398438, "learning_rate": 0.0005047599169697884, "loss": 1195.7843, "step": 5020 }, { "ce_loss_10": 3.4397648930549622, "ce_loss_13": 3.357620894908905, "ce_loss_2": 4.463168692588806, "ce_loss_3": 4.195060646533966, "ce_loss_7": 3.633397877216339, "epoch": 0.503, "grad_norm": 604.0, "kl_loss_10": 185.41551361083984, "kl_loss_2": 2258.051135253906, "kl_loss_3": 1778.3387390136718, "kl_loss_7": 635.239013671875, "learning_rate": 0.000503173304609171, "loss": 1183.8663, "step": 5030 }, { "ce_loss_10": 3.5603776931762696, "ce_loss_13": 3.4799546360969544, "ce_loss_2": 4.5456082105636595, "ce_loss_3": 4.285192847251892, "ce_loss_7": 3.7480576753616335, "epoch": 0.504, "grad_norm": 656.0, "kl_loss_10": 184.81720504760742, "kl_loss_2": 2170.377197265625, "kl_loss_3": 1713.3125, "kl_loss_7": 627.2051513671875, "learning_rate": 0.0005015866602934111, "loss": 1173.4953, "step": 5040 }, { "ce_loss_10": 3.5348097562789915, "ce_loss_13": 3.4481786727905273, "ce_loss_2": 4.561786007881165, "ce_loss_3": 4.291987287998199, "ce_loss_7": 3.732908022403717, "epoch": 0.505, "grad_norm": 584.0, "kl_loss_10": 195.19094161987306, "kl_loss_2": 2283.529962158203, "kl_loss_3": 1808.7241088867188, "kl_loss_7": 661.38330078125, "learning_rate": 0.0005, "loss": 1216.3971, "step": 5050 }, { "ce_loss_10": 3.5199029207229615, "ce_loss_13": 3.4400954723358153, "ce_loss_2": 4.522394108772278, "ce_loss_3": 4.258548331260681, "ce_loss_7": 3.7067763924598696, "epoch": 0.506, "grad_norm": 632.0, "kl_loss_10": 190.6364860534668, "kl_loss_2": 2246.9065795898437, "kl_loss_3": 1774.8018188476562, "kl_loss_7": 642.158203125, "learning_rate": 0.0004984133397065889, "loss": 1187.0591, "step": 5060 }, { "ce_loss_10": 3.529603970050812, "ce_loss_13": 3.448684501647949, "ce_loss_2": 4.540892434120178, "ce_loss_3": 4.281192362308502, "ce_loss_7": 3.727533829212189, "epoch": 0.507, "grad_norm": 572.0, "kl_loss_10": 189.2146110534668, "kl_loss_2": 2238.786248779297, "kl_loss_3": 1779.2802124023438, "kl_loss_7": 641.3925506591797, "learning_rate": 0.0004968266953908291, "loss": 1190.1465, "step": 5070 }, { "ce_loss_10": 3.5666260600090025, "ce_loss_13": 3.486749768257141, "ce_loss_2": 4.580948376655579, "ce_loss_3": 4.316030120849609, "ce_loss_7": 3.7590123891830443, "epoch": 0.508, "grad_norm": 608.0, "kl_loss_10": 183.170157623291, "kl_loss_2": 2245.8196716308594, "kl_loss_3": 1773.8548583984375, "kl_loss_7": 630.8032928466797, "learning_rate": 0.0004952400830302117, "loss": 1205.3312, "step": 5080 }, { "ce_loss_10": 3.4943687319755554, "ce_loss_13": 3.4131668329238893, "ce_loss_2": 4.523447823524475, "ce_loss_3": 4.255696547031403, "ce_loss_7": 3.686564898490906, "epoch": 0.509, "grad_norm": 624.0, "kl_loss_10": 190.01820449829103, "kl_loss_2": 2279.890344238281, "kl_loss_3": 1807.6574096679688, "kl_loss_7": 647.3827026367187, "learning_rate": 0.0004936535186019053, "loss": 1207.5289, "step": 5090 }, { "ce_loss_10": 3.5966561436653137, "ce_loss_13": 3.5205499291419984, "ce_loss_2": 4.557806515693665, "ce_loss_3": 4.297008419036866, "ce_loss_7": 3.777693784236908, "epoch": 0.51, "grad_norm": 572.0, "kl_loss_10": 181.29688186645507, "kl_loss_2": 2148.9375854492187, "kl_loss_3": 1687.6425170898438, "kl_loss_7": 609.7850677490235, "learning_rate": 0.000492067018082596, "loss": 1180.1517, "step": 5100 }, { "ce_loss_10": 3.5341065168380736, "ce_loss_13": 3.448958945274353, "ce_loss_2": 4.584142446517944, "ce_loss_3": 4.311909413337707, "ce_loss_7": 3.7378315210342405, "epoch": 0.511, "grad_norm": 580.0, "kl_loss_10": 191.75616531372071, "kl_loss_2": 2313.8392028808594, "kl_loss_3": 1838.1626281738281, "kl_loss_7": 657.9944549560547, "learning_rate": 0.0004904805974483267, "loss": 1252.0359, "step": 5110 }, { "ce_loss_10": 3.6478444814682005, "ce_loss_13": 3.5622426509857177, "ce_loss_2": 4.652135348320007, "ce_loss_3": 4.385519003868103, "ce_loss_7": 3.8461916565895082, "epoch": 0.512, "grad_norm": 620.0, "kl_loss_10": 196.4123405456543, "kl_loss_2": 2261.5567626953125, "kl_loss_3": 1794.3322509765626, "kl_loss_7": 663.2056915283204, "learning_rate": 0.0004888942726743353, "loss": 1254.773, "step": 5120 }, { "ce_loss_10": 3.5161622405052184, "ce_loss_13": 3.435041069984436, "ce_loss_2": 4.527030563354492, "ce_loss_3": 4.273452854156494, "ce_loss_7": 3.7132395029067995, "epoch": 0.513, "grad_norm": 612.0, "kl_loss_10": 189.22552337646485, "kl_loss_2": 2261.9498779296873, "kl_loss_3": 1801.7636291503907, "kl_loss_7": 649.0409454345703, "learning_rate": 0.0004873080597348947, "loss": 1220.4549, "step": 5130 }, { "ce_loss_10": 3.4059476256370544, "ce_loss_13": 3.325529730319977, "ce_loss_2": 4.472175240516663, "ce_loss_3": 4.212775444984436, "ce_loss_7": 3.6100828886032104, "epoch": 0.514, "grad_norm": 576.0, "kl_loss_10": 188.01781005859374, "kl_loss_2": 2348.922229003906, "kl_loss_3": 1884.4989135742187, "kl_loss_7": 653.7215118408203, "learning_rate": 0.0004857219746031519, "loss": 1228.3554, "step": 5140 }, { "ce_loss_10": 3.5706036925315856, "ce_loss_13": 3.4925912499427794, "ce_loss_2": 4.564787793159485, "ce_loss_3": 4.288926684856415, "ce_loss_7": 3.7588186025619508, "epoch": 0.515, "grad_norm": 564.0, "kl_loss_10": 187.27239913940429, "kl_loss_2": 2197.20703125, "kl_loss_3": 1721.3864501953126, "kl_loss_7": 633.3875030517578, "learning_rate": 0.0004841360332509663, "loss": 1198.5317, "step": 5150 }, { "ce_loss_10": 3.5291930079460143, "ce_loss_13": 3.451045203208923, "ce_loss_2": 4.509535562992096, "ce_loss_3": 4.244547712802887, "ce_loss_7": 3.7146947622299193, "epoch": 0.516, "grad_norm": 640.0, "kl_loss_10": 182.93116302490233, "kl_loss_2": 2188.2712890625, "kl_loss_3": 1720.3389709472656, "kl_loss_7": 621.1425567626953, "learning_rate": 0.0004825502516487497, "loss": 1155.4164, "step": 5160 }, { "ce_loss_10": 3.494162142276764, "ce_loss_13": 3.410848069190979, "ce_loss_2": 4.509466361999512, "ce_loss_3": 4.249596023559571, "ce_loss_7": 3.689158225059509, "epoch": 0.517, "grad_norm": 776.0, "kl_loss_10": 188.6508804321289, "kl_loss_2": 2267.020611572266, "kl_loss_3": 1803.9957458496094, "kl_loss_7": 643.8313232421875, "learning_rate": 0.00048096464576530507, "loss": 1222.8347, "step": 5170 }, { "ce_loss_10": 3.5969889640808104, "ce_loss_13": 3.5190072774887087, "ce_loss_2": 4.547854423522949, "ce_loss_3": 4.293277430534363, "ce_loss_7": 3.7762330770492554, "epoch": 0.518, "grad_norm": 620.0, "kl_loss_10": 184.8033348083496, "kl_loss_2": 2134.698907470703, "kl_loss_3": 1683.638848876953, "kl_loss_7": 620.0321014404296, "learning_rate": 0.00047937923156766646, "loss": 1168.0455, "step": 5180 }, { "ce_loss_10": 3.6420543789863586, "ce_loss_13": 3.5626509547233582, "ce_loss_2": 4.591393780708313, "ce_loss_3": 4.326744735240936, "ce_loss_7": 3.8211991429328918, "epoch": 0.519, "grad_norm": 560.0, "kl_loss_10": 186.71140975952147, "kl_loss_2": 2131.37041015625, "kl_loss_3": 1673.548291015625, "kl_loss_7": 620.3797760009766, "learning_rate": 0.00047779402502093696, "loss": 1176.4619, "step": 5190 }, { "ce_loss_10": 3.6047547817230225, "ce_loss_13": 3.5276923775672913, "ce_loss_2": 4.578773355484008, "ce_loss_3": 4.314329183101654, "ce_loss_7": 3.7894126772880554, "epoch": 0.52, "grad_norm": 572.0, "kl_loss_10": 184.64933090209962, "kl_loss_2": 2171.194439697266, "kl_loss_3": 1701.0345581054687, "kl_loss_7": 621.1681274414062, "learning_rate": 0.0004762090420881289, "loss": 1192.2752, "step": 5200 }, { "ce_loss_10": 3.524991714954376, "ce_loss_13": 3.449671447277069, "ce_loss_2": 4.498011994361877, "ce_loss_3": 4.23529599905014, "ce_loss_7": 3.705312669277191, "epoch": 0.521, "grad_norm": 608.0, "kl_loss_10": 186.35540390014648, "kl_loss_2": 2183.97861328125, "kl_loss_3": 1723.4856018066407, "kl_loss_7": 620.2787628173828, "learning_rate": 0.00047462429873000296, "loss": 1166.6783, "step": 5210 }, { "ce_loss_10": 3.610927963256836, "ce_loss_13": 3.5292730212211607, "ce_loss_2": 4.586357808113098, "ce_loss_3": 4.316179418563843, "ce_loss_7": 3.7876657485961913, "epoch": 0.522, "grad_norm": 572.0, "kl_loss_10": 187.58379135131835, "kl_loss_2": 2205.0898559570314, "kl_loss_3": 1728.4106079101562, "kl_loss_7": 624.2286590576172, "learning_rate": 0.0004730398109049071, "loss": 1181.2787, "step": 5220 }, { "ce_loss_10": 3.542900788784027, "ce_loss_13": 3.4592981576919555, "ce_loss_2": 4.5604215383529665, "ce_loss_3": 4.294589376449585, "ce_loss_7": 3.7346125721931456, "epoch": 0.523, "grad_norm": 632.0, "kl_loss_10": 192.22620544433593, "kl_loss_2": 2275.4386779785154, "kl_loss_3": 1804.1369689941407, "kl_loss_7": 648.4010925292969, "learning_rate": 0.000471455594568616, "loss": 1206.5586, "step": 5230 }, { "ce_loss_10": 3.612694036960602, "ce_loss_13": 3.5346154451370237, "ce_loss_2": 4.571756148338318, "ce_loss_3": 4.30354597568512, "ce_loss_7": 3.7924134016036986, "epoch": 0.524, "grad_norm": 584.0, "kl_loss_10": 184.57232360839845, "kl_loss_2": 2148.5399963378904, "kl_loss_3": 1679.7537841796875, "kl_loss_7": 619.590966796875, "learning_rate": 0.00046987166567417086, "loss": 1185.6557, "step": 5240 }, { "ce_loss_10": 3.5288819313049316, "ce_loss_13": 3.452391028404236, "ce_loss_2": 4.524345111846924, "ce_loss_3": 4.255153965950012, "ce_loss_7": 3.7151949644088744, "epoch": 0.525, "grad_norm": 640.0, "kl_loss_10": 184.01749114990236, "kl_loss_2": 2198.7858947753907, "kl_loss_3": 1730.872314453125, "kl_loss_7": 629.3970092773437, "learning_rate": 0.00046828804017171776, "loss": 1156.5996, "step": 5250 }, { "ce_loss_10": 3.5754063725471497, "ce_loss_13": 3.4882086515426636, "ce_loss_2": 4.589629459381103, "ce_loss_3": 4.328830146789551, "ce_loss_7": 3.7704445004463194, "epoch": 0.526, "grad_norm": 640.0, "kl_loss_10": 189.38601303100586, "kl_loss_2": 2242.6748046875, "kl_loss_3": 1771.9065246582031, "kl_loss_7": 637.7771270751953, "learning_rate": 0.00046670473400834805, "loss": 1218.8605, "step": 5260 }, { "ce_loss_10": 3.5049550890922547, "ce_loss_13": 3.428373408317566, "ce_loss_2": 4.489202237129211, "ce_loss_3": 4.228802132606506, "ce_loss_7": 3.686991608142853, "epoch": 0.527, "grad_norm": 580.0, "kl_loss_10": 181.47291641235353, "kl_loss_2": 2184.507427978516, "kl_loss_3": 1721.9762329101563, "kl_loss_7": 614.4228942871093, "learning_rate": 0.00046512176312793734, "loss": 1216.9304, "step": 5270 }, { "ce_loss_10": 3.497020888328552, "ce_loss_13": 3.415910315513611, "ce_loss_2": 4.500096344947815, "ce_loss_3": 4.221375334262848, "ce_loss_7": 3.6874555468559267, "epoch": 0.528, "grad_norm": 608.0, "kl_loss_10": 183.7262046813965, "kl_loss_2": 2223.9841369628907, "kl_loss_3": 1744.9040588378907, "kl_loss_7": 628.2290283203125, "learning_rate": 0.00046353914347098467, "loss": 1206.4577, "step": 5280 }, { "ce_loss_10": 3.5970619559288024, "ce_loss_13": 3.5186134576797485, "ce_loss_2": 4.588784885406494, "ce_loss_3": 4.328100037574768, "ce_loss_7": 3.7806106090545653, "epoch": 0.529, "grad_norm": 608.0, "kl_loss_10": 183.81845779418944, "kl_loss_2": 2204.89072265625, "kl_loss_3": 1738.6330078125, "kl_loss_7": 622.7282592773438, "learning_rate": 0.0004619568909744524, "loss": 1214.3289, "step": 5290 }, { "ce_loss_10": 3.5965808272361754, "ce_loss_13": 3.519477891921997, "ce_loss_2": 4.575191998481751, "ce_loss_3": 4.308115267753601, "ce_loss_7": 3.779456090927124, "epoch": 0.53, "grad_norm": 624.0, "kl_loss_10": 185.90534057617188, "kl_loss_2": 2166.9622314453127, "kl_loss_3": 1701.3540832519532, "kl_loss_7": 623.1412811279297, "learning_rate": 0.00046037502157160573, "loss": 1194.0631, "step": 5300 }, { "ce_loss_10": 3.475346398353577, "ce_loss_13": 3.3953770637512206, "ce_loss_2": 4.472927665710449, "ce_loss_3": 4.211131680011749, "ce_loss_7": 3.672848129272461, "epoch": 0.531, "grad_norm": 608.0, "kl_loss_10": 188.33962783813476, "kl_loss_2": 2232.2967163085937, "kl_loss_3": 1767.3839172363282, "kl_loss_7": 649.8835662841797, "learning_rate": 0.00045879355119185207, "loss": 1212.3993, "step": 5310 }, { "ce_loss_10": 3.555951988697052, "ce_loss_13": 3.474162495136261, "ce_loss_2": 4.560364985466004, "ce_loss_3": 4.293745231628418, "ce_loss_7": 3.751049613952637, "epoch": 0.532, "grad_norm": 672.0, "kl_loss_10": 190.96983184814454, "kl_loss_2": 2257.3020629882812, "kl_loss_3": 1780.9093383789063, "kl_loss_7": 650.6417663574218, "learning_rate": 0.0004572124957605803, "loss": 1223.1152, "step": 5320 }, { "ce_loss_10": 3.5723905324935914, "ce_loss_13": 3.492247462272644, "ce_loss_2": 4.554775309562683, "ce_loss_3": 4.289375352859497, "ce_loss_7": 3.7621920228004457, "epoch": 0.533, "grad_norm": 584.0, "kl_loss_10": 185.00704040527344, "kl_loss_2": 2210.7857055664062, "kl_loss_3": 1738.2788696289062, "kl_loss_7": 631.7678100585938, "learning_rate": 0.00045563187119900103, "loss": 1171.3742, "step": 5330 }, { "ce_loss_10": 3.4156481266021728, "ce_loss_13": 3.338289904594421, "ce_loss_2": 4.459405374526978, "ce_loss_3": 4.184291207790375, "ce_loss_7": 3.612143576145172, "epoch": 0.534, "grad_norm": 668.0, "kl_loss_10": 185.77383117675782, "kl_loss_2": 2280.9642578125, "kl_loss_3": 1801.5166015625, "kl_loss_7": 637.4973205566406, "learning_rate": 0.00045405169342398633, "loss": 1214.5622, "step": 5340 }, { "ce_loss_10": 3.5048020482063293, "ce_loss_13": 3.422155427932739, "ce_loss_2": 4.527113747596741, "ce_loss_3": 4.256860768795013, "ce_loss_7": 3.6958253622055053, "epoch": 0.535, "grad_norm": 580.0, "kl_loss_10": 188.63988189697267, "kl_loss_2": 2252.499432373047, "kl_loss_3": 1773.2778076171876, "kl_loss_7": 633.0957061767579, "learning_rate": 0.0004524719783479088, "loss": 1187.9953, "step": 5350 }, { "ce_loss_10": 3.460780155658722, "ce_loss_13": 3.378307545185089, "ce_loss_2": 4.497902464866638, "ce_loss_3": 4.2276026725769045, "ce_loss_7": 3.6559959650039673, "epoch": 0.536, "grad_norm": 580.0, "kl_loss_10": 189.0280532836914, "kl_loss_2": 2293.6262939453127, "kl_loss_3": 1820.3978698730468, "kl_loss_7": 642.6393402099609, "learning_rate": 0.00045089274187848144, "loss": 1197.8392, "step": 5360 }, { "ce_loss_10": 3.5799126744270326, "ce_loss_13": 3.501321530342102, "ce_loss_2": 4.5603124618530275, "ce_loss_3": 4.297963404655457, "ce_loss_7": 3.7619481921195983, "epoch": 0.537, "grad_norm": 672.0, "kl_loss_10": 183.09423599243163, "kl_loss_2": 2192.1404357910155, "kl_loss_3": 1730.206787109375, "kl_loss_7": 620.7777648925781, "learning_rate": 0.00044931399991859835, "loss": 1181.3807, "step": 5370 }, { "ce_loss_10": 3.4432420253753664, "ce_loss_13": 3.364873206615448, "ce_loss_2": 4.452599573135376, "ce_loss_3": 4.183995950222015, "ce_loss_7": 3.6285991072654724, "epoch": 0.538, "grad_norm": 600.0, "kl_loss_10": 182.95552597045898, "kl_loss_2": 2236.559704589844, "kl_loss_3": 1765.850408935547, "kl_loss_7": 629.1190887451172, "learning_rate": 0.00044773576836617336, "loss": 1181.7396, "step": 5380 }, { "ce_loss_10": 3.537210750579834, "ce_loss_13": 3.4561371922492983, "ce_loss_2": 4.546432638168335, "ce_loss_3": 4.281138265132904, "ce_loss_7": 3.7339015364646913, "epoch": 0.539, "grad_norm": 612.0, "kl_loss_10": 189.98071517944337, "kl_loss_2": 2253.4164794921876, "kl_loss_3": 1781.9518432617188, "kl_loss_7": 650.4307464599609, "learning_rate": 0.00044615806311398056, "loss": 1232.9109, "step": 5390 }, { "ce_loss_10": 3.6113093972206114, "ce_loss_13": 3.5354915499687194, "ce_loss_2": 4.540320181846619, "ce_loss_3": 4.277068996429444, "ce_loss_7": 3.787187647819519, "epoch": 0.54, "grad_norm": 580.0, "kl_loss_10": 181.3637908935547, "kl_loss_2": 2094.3728942871094, "kl_loss_3": 1633.9456298828125, "kl_loss_7": 605.8723022460938, "learning_rate": 0.00044458090004949454, "loss": 1175.0439, "step": 5400 }, { "ce_loss_10": 3.47382390499115, "ce_loss_13": 3.39083354473114, "ce_loss_2": 4.532833766937256, "ce_loss_3": 4.262583804130554, "ce_loss_7": 3.6737227201461793, "epoch": 0.541, "grad_norm": 620.0, "kl_loss_10": 194.1818962097168, "kl_loss_2": 2371.5374450683594, "kl_loss_3": 1877.3123657226563, "kl_loss_7": 665.8367095947266, "learning_rate": 0.0004430042950547297, "loss": 1218.705, "step": 5410 }, { "ce_loss_10": 3.5697335839271545, "ce_loss_13": 3.483165454864502, "ce_loss_2": 4.578557109832763, "ce_loss_3": 4.31482458114624, "ce_loss_7": 3.763143301010132, "epoch": 0.542, "grad_norm": 572.0, "kl_loss_10": 191.8735610961914, "kl_loss_2": 2253.524365234375, "kl_loss_3": 1779.8523681640625, "kl_loss_7": 645.0971527099609, "learning_rate": 0.0004414282640060809, "loss": 1200.7552, "step": 5420 }, { "ce_loss_10": 3.656325376033783, "ce_loss_13": 3.575901198387146, "ce_loss_2": 4.611030888557434, "ce_loss_3": 4.35529580116272, "ce_loss_7": 3.8402703166007996, "epoch": 0.543, "grad_norm": 672.0, "kl_loss_10": 186.09361267089844, "kl_loss_2": 2127.538677978516, "kl_loss_3": 1677.6395080566406, "kl_loss_7": 622.3258697509766, "learning_rate": 0.0004398528227741633, "loss": 1179.4629, "step": 5430 }, { "ce_loss_10": 3.5199654936790465, "ce_loss_13": 3.442525625228882, "ce_loss_2": 4.519460201263428, "ce_loss_3": 4.247548985481262, "ce_loss_7": 3.7133419036865236, "epoch": 0.544, "grad_norm": 656.0, "kl_loss_10": 186.4021957397461, "kl_loss_2": 2206.209338378906, "kl_loss_3": 1726.7237670898437, "kl_loss_7": 636.2167572021484, "learning_rate": 0.00043827798722365264, "loss": 1202.1797, "step": 5440 }, { "ce_loss_10": 3.6471530318260195, "ce_loss_13": 3.566143047809601, "ce_loss_2": 4.5952486276626585, "ce_loss_3": 4.333053851127625, "ce_loss_7": 3.8201894760131836, "epoch": 0.545, "grad_norm": 592.0, "kl_loss_10": 185.36949920654297, "kl_loss_2": 2143.129284667969, "kl_loss_3": 1675.2244567871094, "kl_loss_7": 617.6786651611328, "learning_rate": 0.00043670377321312535, "loss": 1164.6765, "step": 5450 }, { "ce_loss_10": 3.6508351445198057, "ce_loss_13": 3.574675273895264, "ce_loss_2": 4.5991229772567745, "ce_loss_3": 4.339683651924133, "ce_loss_7": 3.8303612232208253, "epoch": 0.546, "grad_norm": 700.0, "kl_loss_10": 183.1472366333008, "kl_loss_2": 2130.7037048339844, "kl_loss_3": 1667.0253173828125, "kl_loss_7": 613.02021484375, "learning_rate": 0.0004351301965948991, "loss": 1168.8242, "step": 5460 }, { "ce_loss_10": 3.559572923183441, "ce_loss_13": 3.478611421585083, "ce_loss_2": 4.511995816230774, "ce_loss_3": 4.249083304405213, "ce_loss_7": 3.7354837536811827, "epoch": 0.547, "grad_norm": 636.0, "kl_loss_10": 181.6176902770996, "kl_loss_2": 2130.2896118164062, "kl_loss_3": 1667.8863098144532, "kl_loss_7": 614.7661987304688, "learning_rate": 0.000433557273214873, "loss": 1176.8127, "step": 5470 }, { "ce_loss_10": 3.545152747631073, "ce_loss_13": 3.4662238121032716, "ce_loss_2": 4.518579649925232, "ce_loss_3": 4.245905971527099, "ce_loss_7": 3.7270439863204956, "epoch": 0.548, "grad_norm": 608.0, "kl_loss_10": 184.05833053588867, "kl_loss_2": 2168.807977294922, "kl_loss_3": 1696.2628234863282, "kl_loss_7": 616.8240295410156, "learning_rate": 0.000431985018912368, "loss": 1150.4518, "step": 5480 }, { "ce_loss_10": 3.514492917060852, "ce_loss_13": 3.4341874718666077, "ce_loss_2": 4.534255909919739, "ce_loss_3": 4.270003151893616, "ce_loss_7": 3.7031027913093566, "epoch": 0.549, "grad_norm": 600.0, "kl_loss_10": 189.14087448120117, "kl_loss_2": 2268.4575805664062, "kl_loss_3": 1800.3566040039063, "kl_loss_7": 639.125277709961, "learning_rate": 0.0004304134495199674, "loss": 1178.9426, "step": 5490 }, { "ce_loss_10": 3.538786220550537, "ce_loss_13": 3.4557671666145326, "ce_loss_2": 4.5282275676727295, "ce_loss_3": 4.265328872203827, "ce_loss_7": 3.731334662437439, "epoch": 0.55, "grad_norm": 604.0, "kl_loss_10": 188.5583984375, "kl_loss_2": 2236.901904296875, "kl_loss_3": 1761.930596923828, "kl_loss_7": 644.1045196533203, "learning_rate": 0.0004288425808633575, "loss": 1185.0572, "step": 5500 }, { "ce_loss_10": 3.514096534252167, "ce_loss_13": 3.435099017620087, "ce_loss_2": 4.509266877174378, "ce_loss_3": 4.252711880207062, "ce_loss_7": 3.6944369435310365, "epoch": 0.551, "grad_norm": 664.0, "kl_loss_10": 184.48614044189452, "kl_loss_2": 2223.2010803222656, "kl_loss_3": 1765.0783264160157, "kl_loss_7": 630.5097595214844, "learning_rate": 0.0004272724287611684, "loss": 1201.5842, "step": 5510 }, { "ce_loss_10": 3.490022134780884, "ce_loss_13": 3.4118714332580566, "ce_loss_2": 4.514768314361572, "ce_loss_3": 4.2408933401107785, "ce_loss_7": 3.680497145652771, "epoch": 0.552, "grad_norm": 652.0, "kl_loss_10": 185.8211784362793, "kl_loss_2": 2267.8012084960938, "kl_loss_3": 1790.0665649414063, "kl_loss_7": 633.0888458251953, "learning_rate": 0.00042570300902481425, "loss": 1202.0281, "step": 5520 }, { "ce_loss_10": 3.523720991611481, "ce_loss_13": 3.448110568523407, "ce_loss_2": 4.499163627624512, "ce_loss_3": 4.242252886295319, "ce_loss_7": 3.704312777519226, "epoch": 0.553, "grad_norm": 608.0, "kl_loss_10": 183.24146575927733, "kl_loss_2": 2193.4769287109375, "kl_loss_3": 1734.7890686035157, "kl_loss_7": 623.5370147705078, "learning_rate": 0.00042413433745833423, "loss": 1179.776, "step": 5530 }, { "ce_loss_10": 3.5270172238349913, "ce_loss_13": 3.448072147369385, "ce_loss_2": 4.536388492584228, "ce_loss_3": 4.260759913921357, "ce_loss_7": 3.715148115158081, "epoch": 0.554, "grad_norm": 556.0, "kl_loss_10": 183.68499755859375, "kl_loss_2": 2226.1301025390626, "kl_loss_3": 1743.2217163085938, "kl_loss_7": 626.4557403564453, "learning_rate": 0.0004225664298582339, "loss": 1157.0496, "step": 5540 }, { "ce_loss_10": 3.6083423376083372, "ce_loss_13": 3.530562436580658, "ce_loss_2": 4.568499255180359, "ce_loss_3": 4.307069134712219, "ce_loss_7": 3.7862043499946596, "epoch": 0.555, "grad_norm": 548.0, "kl_loss_10": 182.37268829345703, "kl_loss_2": 2137.707696533203, "kl_loss_3": 1673.7792663574219, "kl_loss_7": 611.6491577148438, "learning_rate": 0.000420999302013325, "loss": 1149.7553, "step": 5550 }, { "ce_loss_10": 3.5049922823905946, "ce_loss_13": 3.420680546760559, "ce_loss_2": 4.553832268714904, "ce_loss_3": 4.279029071331024, "ce_loss_7": 3.700891983509064, "epoch": 0.556, "grad_norm": 572.0, "kl_loss_10": 190.98652191162108, "kl_loss_2": 2305.641845703125, "kl_loss_3": 1822.5648254394532, "kl_loss_7": 641.2451202392579, "learning_rate": 0.000419432969704568, "loss": 1204.391, "step": 5560 }, { "ce_loss_10": 3.548888063430786, "ce_loss_13": 3.4704429507255554, "ce_loss_2": 4.518404316902161, "ce_loss_3": 4.257292962074279, "ce_loss_7": 3.735389542579651, "epoch": 0.557, "grad_norm": 564.0, "kl_loss_10": 182.6816421508789, "kl_loss_2": 2144.968542480469, "kl_loss_3": 1682.6951538085937, "kl_loss_7": 617.6556762695312, "learning_rate": 0.00041786744870491154, "loss": 1202.9963, "step": 5570 }, { "ce_loss_10": 3.491339087486267, "ce_loss_13": 3.412715029716492, "ce_loss_2": 4.4881198644638065, "ce_loss_3": 4.219742333889007, "ce_loss_7": 3.679445171356201, "epoch": 0.558, "grad_norm": 576.0, "kl_loss_10": 189.42200622558593, "kl_loss_2": 2234.280969238281, "kl_loss_3": 1757.53349609375, "kl_loss_7": 641.0087585449219, "learning_rate": 0.0004163027547791347, "loss": 1192.3963, "step": 5580 }, { "ce_loss_10": 3.4689704895019533, "ce_loss_13": 3.3872820258140566, "ce_loss_2": 4.518157267570496, "ce_loss_3": 4.244075846672058, "ce_loss_7": 3.6619726419448853, "epoch": 0.559, "grad_norm": 688.0, "kl_loss_10": 188.0017578125, "kl_loss_2": 2320.9525756835938, "kl_loss_3": 1834.1813659667969, "kl_loss_7": 642.0279479980469, "learning_rate": 0.0004147389036836881, "loss": 1210.1521, "step": 5590 }, { "ce_loss_10": 3.5183377385139467, "ce_loss_13": 3.4371410965919496, "ce_loss_2": 4.522028660774231, "ce_loss_3": 4.258878147602081, "ce_loss_7": 3.706261694431305, "epoch": 0.56, "grad_norm": 652.0, "kl_loss_10": 185.66660232543944, "kl_loss_2": 2233.013397216797, "kl_loss_3": 1764.0713806152344, "kl_loss_7": 637.6866302490234, "learning_rate": 0.00041317591116653486, "loss": 1219.6441, "step": 5600 }, { "ce_loss_10": 3.558071720600128, "ce_loss_13": 3.474745440483093, "ce_loss_2": 4.558679819107056, "ce_loss_3": 4.291901731491089, "ce_loss_7": 3.746951687335968, "epoch": 0.561, "grad_norm": 592.0, "kl_loss_10": 189.82635574340821, "kl_loss_2": 2230.9510803222656, "kl_loss_3": 1759.6529296875, "kl_loss_7": 636.9456726074219, "learning_rate": 0.0004116137929669921, "loss": 1188.2356, "step": 5610 }, { "ce_loss_10": 3.544596457481384, "ce_loss_13": 3.465434396266937, "ce_loss_2": 4.526343536376953, "ce_loss_3": 4.262159049510956, "ce_loss_7": 3.7297433972358705, "epoch": 0.562, "grad_norm": 700.0, "kl_loss_10": 184.16798706054686, "kl_loss_2": 2204.5443481445313, "kl_loss_3": 1738.5609375, "kl_loss_7": 629.1714752197265, "learning_rate": 0.00041005256481557305, "loss": 1174.8596, "step": 5620 }, { "ce_loss_10": 3.6428149700164796, "ce_loss_13": 3.568005383014679, "ce_loss_2": 4.574557089805603, "ce_loss_3": 4.320431900024414, "ce_loss_7": 3.8154868602752687, "epoch": 0.563, "grad_norm": 580.0, "kl_loss_10": 178.43261108398437, "kl_loss_2": 2081.9929809570312, "kl_loss_3": 1633.8047790527344, "kl_loss_7": 600.1017929077149, "learning_rate": 0.00040849224243382767, "loss": 1150.8125, "step": 5630 }, { "ce_loss_10": 3.4989004015922545, "ce_loss_13": 3.4218288540840147, "ce_loss_2": 4.497757744789124, "ce_loss_3": 4.228800570964813, "ce_loss_7": 3.6881244659423826, "epoch": 0.564, "grad_norm": 576.0, "kl_loss_10": 184.93341827392578, "kl_loss_2": 2224.632287597656, "kl_loss_3": 1749.1263427734375, "kl_loss_7": 632.0666015625, "learning_rate": 0.000406932841534185, "loss": 1173.0332, "step": 5640 }, { "ce_loss_10": 3.453734540939331, "ce_loss_13": 3.372727131843567, "ce_loss_2": 4.460113084316253, "ce_loss_3": 4.19973611831665, "ce_loss_7": 3.6455657839775086, "epoch": 0.565, "grad_norm": 708.0, "kl_loss_10": 186.30313568115236, "kl_loss_2": 2260.893664550781, "kl_loss_3": 1792.1846252441405, "kl_loss_7": 638.3344879150391, "learning_rate": 0.0004053743778197951, "loss": 1219.3186, "step": 5650 }, { "ce_loss_10": 3.565755784511566, "ce_loss_13": 3.481943702697754, "ce_loss_2": 4.545414447784424, "ce_loss_3": 4.281696927547455, "ce_loss_7": 3.7513938307762147, "epoch": 0.566, "grad_norm": 584.0, "kl_loss_10": 188.62994842529298, "kl_loss_2": 2184.7360778808593, "kl_loss_3": 1721.9289123535157, "kl_loss_7": 628.1358184814453, "learning_rate": 0.0004038168669843697, "loss": 1209.3523, "step": 5660 }, { "ce_loss_10": 3.532804882526398, "ce_loss_13": 3.4522215127944946, "ce_loss_2": 4.494965553283691, "ce_loss_3": 4.231216824054718, "ce_loss_7": 3.7118934392929077, "epoch": 0.567, "grad_norm": 620.0, "kl_loss_10": 183.03904342651367, "kl_loss_2": 2154.956463623047, "kl_loss_3": 1695.0998046875, "kl_loss_7": 613.3763107299804, "learning_rate": 0.000402260324712026, "loss": 1195.8986, "step": 5670 }, { "ce_loss_10": 3.5749718070030214, "ce_loss_13": 3.497403085231781, "ce_loss_2": 4.588955020904541, "ce_loss_3": 4.319999086856842, "ce_loss_7": 3.7625349521636964, "epoch": 0.568, "grad_norm": 616.0, "kl_loss_10": 184.26412506103514, "kl_loss_2": 2236.5206665039063, "kl_loss_3": 1760.365301513672, "kl_loss_7": 624.1568267822265, "learning_rate": 0.00040070476667712743, "loss": 1174.4818, "step": 5680 }, { "ce_loss_10": 3.595443320274353, "ce_loss_13": 3.5173869848251345, "ce_loss_2": 4.573628330230713, "ce_loss_3": 4.3121489644050595, "ce_loss_7": 3.7780985593795777, "epoch": 0.569, "grad_norm": 540.0, "kl_loss_10": 184.3900894165039, "kl_loss_2": 2190.797717285156, "kl_loss_3": 1726.8204223632813, "kl_loss_7": 618.142544555664, "learning_rate": 0.0003991502085441259, "loss": 1191.0875, "step": 5690 }, { "ce_loss_10": 3.6352679252624513, "ce_loss_13": 3.556475079059601, "ce_loss_2": 4.568906188011169, "ce_loss_3": 4.311613416671753, "ce_loss_7": 3.8102620005607606, "epoch": 0.57, "grad_norm": 616.0, "kl_loss_10": 180.942374420166, "kl_loss_2": 2084.3558349609375, "kl_loss_3": 1627.6179626464843, "kl_loss_7": 599.5358856201171, "learning_rate": 0.0003975966659674047, "loss": 1160.7822, "step": 5700 }, { "ce_loss_10": 3.5962194561958314, "ce_loss_13": 3.517608177661896, "ce_loss_2": 4.578224086761475, "ce_loss_3": 4.314012908935547, "ce_loss_7": 3.7789862513542176, "epoch": 0.571, "grad_norm": 644.0, "kl_loss_10": 182.5239112854004, "kl_loss_2": 2180.907177734375, "kl_loss_3": 1721.9898742675782, "kl_loss_7": 614.957388305664, "learning_rate": 0.0003960441545911204, "loss": 1160.7484, "step": 5710 }, { "ce_loss_10": 3.5932918190956116, "ce_loss_13": 3.5129475712776186, "ce_loss_2": 4.558131432533264, "ce_loss_3": 4.293534338474274, "ce_loss_7": 3.7742814660072326, "epoch": 0.572, "grad_norm": 604.0, "kl_loss_10": 183.15422897338868, "kl_loss_2": 2156.431115722656, "kl_loss_3": 1695.9377807617188, "kl_loss_7": 619.908480834961, "learning_rate": 0.0003944926900490452, "loss": 1164.068, "step": 5720 }, { "ce_loss_10": 3.5127488017082213, "ce_loss_13": 3.430432641506195, "ce_loss_2": 4.5248651027679445, "ce_loss_3": 4.258909869194031, "ce_loss_7": 3.709870958328247, "epoch": 0.573, "grad_norm": 564.0, "kl_loss_10": 186.0645439147949, "kl_loss_2": 2235.3706176757814, "kl_loss_3": 1765.8205688476562, "kl_loss_7": 637.5899017333984, "learning_rate": 0.0003929422879644099, "loss": 1176.3957, "step": 5730 }, { "ce_loss_10": 3.510514330863953, "ce_loss_13": 3.436869239807129, "ce_loss_2": 4.478006148338318, "ce_loss_3": 4.212211620807648, "ce_loss_7": 3.6878655314445496, "epoch": 0.574, "grad_norm": 608.0, "kl_loss_10": 179.26688079833986, "kl_loss_2": 2168.3131591796873, "kl_loss_3": 1699.4741943359375, "kl_loss_7": 606.771630859375, "learning_rate": 0.0003913929639497462, "loss": 1141.8648, "step": 5740 }, { "ce_loss_10": 3.468266797065735, "ce_loss_13": 3.3873007535934447, "ce_loss_2": 4.490426182746887, "ce_loss_3": 4.221552240848541, "ce_loss_7": 3.6532665491104126, "epoch": 0.575, "grad_norm": 600.0, "kl_loss_10": 182.11020889282227, "kl_loss_2": 2259.020544433594, "kl_loss_3": 1779.8084838867187, "kl_loss_7": 622.9515014648438, "learning_rate": 0.00038984473360672965, "loss": 1169.1125, "step": 5750 }, { "ce_loss_10": 3.4774887681007387, "ce_loss_13": 3.3949706315994264, "ce_loss_2": 4.497473883628845, "ce_loss_3": 4.2249194264411924, "ce_loss_7": 3.664697051048279, "epoch": 0.576, "grad_norm": 596.0, "kl_loss_10": 181.4011428833008, "kl_loss_2": 2244.824786376953, "kl_loss_3": 1764.2872131347656, "kl_loss_7": 621.9651702880859, "learning_rate": 0.0003882976125260229, "loss": 1170.2874, "step": 5760 }, { "ce_loss_10": 3.5439630150794983, "ce_loss_13": 3.4651756167411802, "ce_loss_2": 4.539518880844116, "ce_loss_3": 4.274804329872131, "ce_loss_7": 3.728801262378693, "epoch": 0.577, "grad_norm": 592.0, "kl_loss_10": 183.33100814819335, "kl_loss_2": 2204.0270751953126, "kl_loss_3": 1723.9171936035157, "kl_loss_7": 615.5777862548828, "learning_rate": 0.00038675161628711776, "loss": 1179.8893, "step": 5770 }, { "ce_loss_10": 3.5816867470741274, "ce_loss_13": 3.5046088337898254, "ce_loss_2": 4.544821619987488, "ce_loss_3": 4.285539746284485, "ce_loss_7": 3.761784756183624, "epoch": 0.578, "grad_norm": 620.0, "kl_loss_10": 181.6286849975586, "kl_loss_2": 2136.3880615234375, "kl_loss_3": 1677.4017333984375, "kl_loss_7": 610.2154174804688, "learning_rate": 0.0003852067604581794, "loss": 1194.1891, "step": 5780 }, { "ce_loss_10": 3.523706150054932, "ce_loss_13": 3.448537766933441, "ce_loss_2": 4.533637523651123, "ce_loss_3": 4.265281748771668, "ce_loss_7": 3.709212040901184, "epoch": 0.579, "grad_norm": 676.0, "kl_loss_10": 181.67257690429688, "kl_loss_2": 2230.821612548828, "kl_loss_3": 1755.830584716797, "kl_loss_7": 620.3396881103515, "learning_rate": 0.0003836630605958888, "loss": 1177.6782, "step": 5790 }, { "ce_loss_10": 3.583223593235016, "ce_loss_13": 3.503636026382446, "ce_loss_2": 4.566303539276123, "ce_loss_3": 4.3056800127029415, "ce_loss_7": 3.76351158618927, "epoch": 0.58, "grad_norm": 708.0, "kl_loss_10": 183.71082077026367, "kl_loss_2": 2228.4331115722657, "kl_loss_3": 1769.3681579589843, "kl_loss_7": 628.4158660888672, "learning_rate": 0.0003821205322452863, "loss": 1235.8768, "step": 5800 }, { "ce_loss_10": 3.563581478595734, "ce_loss_13": 3.488909196853638, "ce_loss_2": 4.543065023422241, "ce_loss_3": 4.286456656455994, "ce_loss_7": 3.7441120743751526, "epoch": 0.581, "grad_norm": 584.0, "kl_loss_10": 180.5809585571289, "kl_loss_2": 2191.5135986328123, "kl_loss_3": 1729.1048767089844, "kl_loss_7": 608.2429626464843, "learning_rate": 0.0003805791909396155, "loss": 1179.2295, "step": 5810 }, { "ce_loss_10": 3.5160235166549683, "ce_loss_13": 3.43984659910202, "ce_loss_2": 4.500444793701172, "ce_loss_3": 4.2373772144317625, "ce_loss_7": 3.6964723467826843, "epoch": 0.582, "grad_norm": 652.0, "kl_loss_10": 180.02818908691407, "kl_loss_2": 2186.5078369140624, "kl_loss_3": 1730.8345642089844, "kl_loss_7": 613.5680450439453, "learning_rate": 0.0003790390522001662, "loss": 1191.4708, "step": 5820 }, { "ce_loss_10": 3.447020876407623, "ce_loss_13": 3.3710612773895265, "ce_loss_2": 4.448183393478393, "ce_loss_3": 4.185709154605865, "ce_loss_7": 3.6283095359802244, "epoch": 0.583, "grad_norm": 620.0, "kl_loss_10": 180.24705505371094, "kl_loss_2": 2242.3388488769533, "kl_loss_3": 1776.8985290527344, "kl_loss_7": 620.0051086425781, "learning_rate": 0.0003775001315361183, "loss": 1173.2469, "step": 5830 }, { "ce_loss_10": 3.560646951198578, "ce_loss_13": 3.481656861305237, "ce_loss_2": 4.561934852600098, "ce_loss_3": 4.297064936161041, "ce_loss_7": 3.746256446838379, "epoch": 0.584, "grad_norm": 560.0, "kl_loss_10": 183.9656074523926, "kl_loss_2": 2215.3773864746095, "kl_loss_3": 1750.5085021972657, "kl_loss_7": 621.9390472412109, "learning_rate": 0.0003759624444443858, "loss": 1186.5547, "step": 5840 }, { "ce_loss_10": 3.592632758617401, "ce_loss_13": 3.520240008831024, "ce_loss_2": 4.567729663848877, "ce_loss_3": 4.300854158401489, "ce_loss_7": 3.769944798946381, "epoch": 0.585, "grad_norm": 568.0, "kl_loss_10": 180.2906066894531, "kl_loss_2": 2170.1985412597655, "kl_loss_3": 1706.62548828125, "kl_loss_7": 608.6328552246093, "learning_rate": 0.00037442600640946044, "loss": 1155.9348, "step": 5850 }, { "ce_loss_10": 3.550674855709076, "ce_loss_13": 3.475678253173828, "ce_loss_2": 4.5188051700592045, "ce_loss_3": 4.257573843002319, "ce_loss_7": 3.733881187438965, "epoch": 0.586, "grad_norm": 624.0, "kl_loss_10": 180.34449844360353, "kl_loss_2": 2161.917333984375, "kl_loss_3": 1700.5603820800782, "kl_loss_7": 615.4381408691406, "learning_rate": 0.00037289083290325663, "loss": 1151.5385, "step": 5860 }, { "ce_loss_10": 3.5404091477394104, "ce_loss_13": 3.4616484522819517, "ce_loss_2": 4.5070148229599, "ce_loss_3": 4.242105662822723, "ce_loss_7": 3.7187010407447816, "epoch": 0.587, "grad_norm": 592.0, "kl_loss_10": 183.17743911743165, "kl_loss_2": 2149.7393432617187, "kl_loss_3": 1683.0787292480468, "kl_loss_7": 610.0913803100586, "learning_rate": 0.0003713569393849543, "loss": 1154.5703, "step": 5870 }, { "ce_loss_10": 3.5839020013809204, "ce_loss_13": 3.5078009486198427, "ce_loss_2": 4.56416871547699, "ce_loss_3": 4.296731424331665, "ce_loss_7": 3.767895996570587, "epoch": 0.588, "grad_norm": 592.0, "kl_loss_10": 183.36542816162108, "kl_loss_2": 2186.738494873047, "kl_loss_3": 1717.5487915039062, "kl_loss_7": 612.2841430664063, "learning_rate": 0.00036982434130084397, "loss": 1179.8928, "step": 5880 }, { "ce_loss_10": 3.4997439622879027, "ce_loss_13": 3.4187664270401, "ce_loss_2": 4.478350329399109, "ce_loss_3": 4.210885548591614, "ce_loss_7": 3.6801365852355956, "epoch": 0.589, "grad_norm": 664.0, "kl_loss_10": 186.01408843994142, "kl_loss_2": 2192.050701904297, "kl_loss_3": 1713.8697509765625, "kl_loss_7": 622.2605224609375, "learning_rate": 0.00036829305408417166, "loss": 1185.5467, "step": 5890 }, { "ce_loss_10": 3.4883674502372743, "ce_loss_13": 3.4076414942741393, "ce_loss_2": 4.51081612110138, "ce_loss_3": 4.233860373497009, "ce_loss_7": 3.68140949010849, "epoch": 0.59, "grad_norm": 632.0, "kl_loss_10": 185.69306488037108, "kl_loss_2": 2265.8583251953123, "kl_loss_3": 1770.2322631835937, "kl_loss_7": 633.7182983398437, "learning_rate": 0.0003667630931549826, "loss": 1189.5502, "step": 5900 }, { "ce_loss_10": 3.454320323467255, "ce_loss_13": 3.376146912574768, "ce_loss_2": 4.510071706771851, "ce_loss_3": 4.2408855676651, "ce_loss_7": 3.649706947803497, "epoch": 0.591, "grad_norm": 728.0, "kl_loss_10": 185.1581298828125, "kl_loss_2": 2343.439013671875, "kl_loss_3": 1859.8356567382812, "kl_loss_7": 639.2615692138672, "learning_rate": 0.00036523447391996613, "loss": 1217.3514, "step": 5910 }, { "ce_loss_10": 3.549302911758423, "ce_loss_13": 3.4722840428352355, "ce_loss_2": 4.514612603187561, "ce_loss_3": 4.256480038166046, "ce_loss_7": 3.727895641326904, "epoch": 0.592, "grad_norm": 580.0, "kl_loss_10": 181.60699539184571, "kl_loss_2": 2162.6364685058593, "kl_loss_3": 1701.0076782226563, "kl_loss_7": 610.4459930419922, "learning_rate": 0.00036370721177230114, "loss": 1162.5948, "step": 5920 }, { "ce_loss_10": 3.543530595302582, "ce_loss_13": 3.4660569787025453, "ce_loss_2": 4.543927192687988, "ce_loss_3": 4.277280712127686, "ce_loss_7": 3.728453516960144, "epoch": 0.593, "grad_norm": 628.0, "kl_loss_10": 184.26243515014647, "kl_loss_2": 2218.4042541503904, "kl_loss_3": 1743.315625, "kl_loss_7": 620.730111694336, "learning_rate": 0.00036218132209150044, "loss": 1186.6707, "step": 5930 }, { "ce_loss_10": 3.497697722911835, "ce_loss_13": 3.4142557263374327, "ce_loss_2": 4.5388647556304935, "ce_loss_3": 4.264691114425659, "ce_loss_7": 3.6943756103515626, "epoch": 0.594, "grad_norm": 524.0, "kl_loss_10": 188.87873077392578, "kl_loss_2": 2304.517468261719, "kl_loss_3": 1814.6093872070312, "kl_loss_7": 639.0129974365234, "learning_rate": 0.0003606568202432562, "loss": 1197.9809, "step": 5940 }, { "ce_loss_10": 3.565451109409332, "ce_loss_13": 3.4856663823127745, "ce_loss_2": 4.5841080904006954, "ce_loss_3": 4.317169034481049, "ce_loss_7": 3.754825806617737, "epoch": 0.595, "grad_norm": 696.0, "kl_loss_10": 187.19320907592774, "kl_loss_2": 2274.0406982421873, "kl_loss_3": 1793.8463073730468, "kl_loss_7": 630.620458984375, "learning_rate": 0.0003591337215792851, "loss": 1177.4938, "step": 5950 }, { "ce_loss_10": 3.611758494377136, "ce_loss_13": 3.5361703038215637, "ce_loss_2": 4.54854645729065, "ce_loss_3": 4.2874367237091064, "ce_loss_7": 3.781387460231781, "epoch": 0.596, "grad_norm": 536.0, "kl_loss_10": 179.44385452270507, "kl_loss_2": 2134.903210449219, "kl_loss_3": 1672.3198852539062, "kl_loss_7": 603.0327301025391, "learning_rate": 0.00035761204143717383, "loss": 1174.0895, "step": 5960 }, { "ce_loss_10": 3.564636397361755, "ce_loss_13": 3.4857504963874817, "ce_loss_2": 4.562372779846191, "ce_loss_3": 4.294865238666534, "ce_loss_7": 3.747916042804718, "epoch": 0.597, "grad_norm": 616.0, "kl_loss_10": 181.63295822143556, "kl_loss_2": 2217.5136901855467, "kl_loss_3": 1751.903790283203, "kl_loss_7": 618.9376495361328, "learning_rate": 0.0003560917951402245, "loss": 1215.2734, "step": 5970 }, { "ce_loss_10": 3.5358213543891908, "ce_loss_13": 3.461250603199005, "ce_loss_2": 4.515460109710693, "ce_loss_3": 4.252109396457672, "ce_loss_7": 3.720645487308502, "epoch": 0.598, "grad_norm": 616.0, "kl_loss_10": 180.68030853271483, "kl_loss_2": 2199.883331298828, "kl_loss_3": 1727.857843017578, "kl_loss_7": 614.7005615234375, "learning_rate": 0.00035457299799730046, "loss": 1174.0783, "step": 5980 }, { "ce_loss_10": 3.6016149520874023, "ce_loss_13": 3.523995506763458, "ce_loss_2": 4.564206576347351, "ce_loss_3": 4.302748084068298, "ce_loss_7": 3.7862717866897584, "epoch": 0.599, "grad_norm": 600.0, "kl_loss_10": 181.36301651000977, "kl_loss_2": 2153.0896545410155, "kl_loss_3": 1694.4290161132812, "kl_loss_7": 614.9286560058594, "learning_rate": 0.0003530556653026721, "loss": 1181.7495, "step": 5990 }, { "ce_loss_10": 3.5210883378982545, "ce_loss_13": 3.4458776116371155, "ce_loss_2": 4.520641088485718, "ce_loss_3": 4.254351568222046, "ce_loss_7": 3.699181377887726, "epoch": 0.6, "grad_norm": 764.0, "kl_loss_10": 179.21529235839844, "kl_loss_2": 2227.2805419921874, "kl_loss_3": 1758.5499328613282, "kl_loss_7": 610.0478576660156, "learning_rate": 0.00035153981233586274, "loss": 1193.8637, "step": 6000 }, { "ce_loss_10": 3.499428999423981, "ce_loss_13": 3.422479748725891, "ce_loss_2": 4.4867565631866455, "ce_loss_3": 4.227682662010193, "ce_loss_7": 3.6805691361427306, "epoch": 0.601, "grad_norm": 584.0, "kl_loss_10": 179.26205139160157, "kl_loss_2": 2193.6119079589844, "kl_loss_3": 1731.6024475097656, "kl_loss_7": 612.7285736083984, "learning_rate": 0.00035002545436149473, "loss": 1214.442, "step": 6010 }, { "ce_loss_10": 3.507369041442871, "ce_loss_13": 3.427609443664551, "ce_loss_2": 4.515847969055176, "ce_loss_3": 4.248699688911438, "ce_loss_7": 3.6938512086868287, "epoch": 0.602, "grad_norm": 592.0, "kl_loss_10": 187.4394386291504, "kl_loss_2": 2240.724530029297, "kl_loss_3": 1766.6628112792969, "kl_loss_7": 629.6498748779297, "learning_rate": 0.0003485126066291364, "loss": 1169.8236, "step": 6020 }, { "ce_loss_10": 3.5554185032844545, "ce_loss_13": 3.4788596630096436, "ce_loss_2": 4.540017461776733, "ce_loss_3": 4.2838677883148195, "ce_loss_7": 3.736110508441925, "epoch": 0.603, "grad_norm": 520.0, "kl_loss_10": 179.3347900390625, "kl_loss_2": 2192.1767639160157, "kl_loss_3": 1731.594775390625, "kl_loss_7": 613.0785980224609, "learning_rate": 0.0003470012843731476, "loss": 1185.9094, "step": 6030 }, { "ce_loss_10": 3.494213032722473, "ce_loss_13": 3.41587815284729, "ce_loss_2": 4.493516874313355, "ce_loss_3": 4.230558323860168, "ce_loss_7": 3.6748696088790895, "epoch": 0.604, "grad_norm": 604.0, "kl_loss_10": 180.02317504882814, "kl_loss_2": 2220.4429626464844, "kl_loss_3": 1750.9172302246093, "kl_loss_7": 613.3353332519531, "learning_rate": 0.00034549150281252633, "loss": 1207.7186, "step": 6040 }, { "ce_loss_10": 3.4735769987106324, "ce_loss_13": 3.398567247390747, "ce_loss_2": 4.450454211235046, "ce_loss_3": 4.185557043552398, "ce_loss_7": 3.660480535030365, "epoch": 0.605, "grad_norm": 608.0, "kl_loss_10": 181.83876571655273, "kl_loss_2": 2163.210076904297, "kl_loss_3": 1694.9721801757812, "kl_loss_7": 612.4143432617187, "learning_rate": 0.0003439832771507565, "loss": 1157.9707, "step": 6050 }, { "ce_loss_10": 3.4816818594932557, "ce_loss_13": 3.4034390568733217, "ce_loss_2": 4.478318929672241, "ce_loss_3": 4.211991810798645, "ce_loss_7": 3.6656970381736755, "epoch": 0.606, "grad_norm": 560.0, "kl_loss_10": 181.10105361938477, "kl_loss_2": 2226.4850891113283, "kl_loss_3": 1757.37236328125, "kl_loss_7": 619.9399398803711, "learning_rate": 0.0003424766225756537, "loss": 1172.4078, "step": 6060 }, { "ce_loss_10": 3.5375612139701844, "ce_loss_13": 3.4606423020362853, "ce_loss_2": 4.53115668296814, "ce_loss_3": 4.261643159389496, "ce_loss_7": 3.7194941639900208, "epoch": 0.607, "grad_norm": 600.0, "kl_loss_10": 181.42390975952148, "kl_loss_2": 2202.2797973632814, "kl_loss_3": 1733.2598999023437, "kl_loss_7": 615.9942810058594, "learning_rate": 0.00034097155425921255, "loss": 1158.2284, "step": 6070 }, { "ce_loss_10": 3.433805537223816, "ce_loss_13": 3.354471778869629, "ce_loss_2": 4.449812698364258, "ce_loss_3": 4.179146933555603, "ce_loss_7": 3.6204983830451964, "epoch": 0.608, "grad_norm": 592.0, "kl_loss_10": 183.06991577148438, "kl_loss_2": 2273.455847167969, "kl_loss_3": 1787.318505859375, "kl_loss_7": 624.6353576660156, "learning_rate": 0.0003394680873574546, "loss": 1187.3987, "step": 6080 }, { "ce_loss_10": 3.54138503074646, "ce_loss_13": 3.4626068115234374, "ce_loss_2": 4.556825470924378, "ce_loss_3": 4.281811666488648, "ce_loss_7": 3.7267327547073363, "epoch": 0.609, "grad_norm": 620.0, "kl_loss_10": 183.72728881835937, "kl_loss_2": 2232.6337280273438, "kl_loss_3": 1752.9180786132813, "kl_loss_7": 617.8084594726563, "learning_rate": 0.0003379662370102747, "loss": 1176.7848, "step": 6090 }, { "ce_loss_10": 3.5495489597320558, "ce_loss_13": 3.4742938756942747, "ce_loss_2": 4.515744471549988, "ce_loss_3": 4.251201486587524, "ce_loss_7": 3.726244103908539, "epoch": 0.61, "grad_norm": 640.0, "kl_loss_10": 179.96657028198243, "kl_loss_2": 2182.172985839844, "kl_loss_3": 1717.0841491699218, "kl_loss_7": 617.4093353271485, "learning_rate": 0.0003364660183412892, "loss": 1176.2052, "step": 6100 }, { "ce_loss_10": 3.5306557536125185, "ce_loss_13": 3.4546700954437255, "ce_loss_2": 4.500067496299744, "ce_loss_3": 4.235128319263458, "ce_loss_7": 3.7075342297554017, "epoch": 0.611, "grad_norm": 592.0, "kl_loss_10": 182.79292755126954, "kl_loss_2": 2182.2781616210937, "kl_loss_3": 1714.5931213378906, "kl_loss_7": 613.8878936767578, "learning_rate": 0.0003349674464576834, "loss": 1190.8153, "step": 6110 }, { "ce_loss_10": 3.477449345588684, "ce_loss_13": 3.3995738983154298, "ce_loss_2": 4.485787630081177, "ce_loss_3": 4.219619536399842, "ce_loss_7": 3.6623815417289736, "epoch": 0.612, "grad_norm": 628.0, "kl_loss_10": 181.76175689697266, "kl_loss_2": 2235.75986328125, "kl_loss_3": 1763.9286254882813, "kl_loss_7": 619.9945251464844, "learning_rate": 0.00033347053645005966, "loss": 1163.8981, "step": 6120 }, { "ce_loss_10": 3.5906055331230164, "ce_loss_13": 3.514803075790405, "ce_loss_2": 4.5458073854446415, "ce_loss_3": 4.283458161354065, "ce_loss_7": 3.772923803329468, "epoch": 0.613, "grad_norm": 644.0, "kl_loss_10": 178.51968688964843, "kl_loss_2": 2116.6735778808593, "kl_loss_3": 1659.320733642578, "kl_loss_7": 606.7959167480469, "learning_rate": 0.00033197530339228485, "loss": 1170.5501, "step": 6130 }, { "ce_loss_10": 3.5471089243888856, "ce_loss_13": 3.468013954162598, "ce_loss_2": 4.5254878282546995, "ce_loss_3": 4.254842627048492, "ce_loss_7": 3.73079137802124, "epoch": 0.614, "grad_norm": 532.0, "kl_loss_10": 183.3593994140625, "kl_loss_2": 2176.643206787109, "kl_loss_3": 1701.0747802734375, "kl_loss_7": 619.2012481689453, "learning_rate": 0.00033048176234133967, "loss": 1166.8168, "step": 6140 }, { "ce_loss_10": 3.5306158542633055, "ce_loss_13": 3.453017568588257, "ce_loss_2": 4.494013047218322, "ce_loss_3": 4.233083915710449, "ce_loss_7": 3.7115015268325804, "epoch": 0.615, "grad_norm": 592.0, "kl_loss_10": 183.00715713500978, "kl_loss_2": 2175.1028686523437, "kl_loss_3": 1702.7456420898438, "kl_loss_7": 619.7471405029297, "learning_rate": 0.0003289899283371657, "loss": 1181.7955, "step": 6150 }, { "ce_loss_10": 3.5544473528862, "ce_loss_13": 3.4786699175834657, "ce_loss_2": 4.547568416595459, "ce_loss_3": 4.281970739364624, "ce_loss_7": 3.7363924741744996, "epoch": 0.616, "grad_norm": 600.0, "kl_loss_10": 178.17992782592773, "kl_loss_2": 2185.039025878906, "kl_loss_3": 1723.0535766601563, "kl_loss_7": 600.4897644042969, "learning_rate": 0.0003274998164025148, "loss": 1196.8087, "step": 6160 }, { "ce_loss_10": 3.586019229888916, "ce_loss_13": 3.509108769893646, "ce_loss_2": 4.5615111827850345, "ce_loss_3": 4.2898026466369625, "ce_loss_7": 3.76910115480423, "epoch": 0.617, "grad_norm": 596.0, "kl_loss_10": 183.4706718444824, "kl_loss_2": 2168.4442443847656, "kl_loss_3": 1695.5099731445312, "kl_loss_7": 616.1861694335937, "learning_rate": 0.0003260114415427975, "loss": 1190.7359, "step": 6170 }, { "ce_loss_10": 3.5073242664337156, "ce_loss_13": 3.4292925119400026, "ce_loss_2": 4.523944449424744, "ce_loss_3": 4.251231408119201, "ce_loss_7": 3.6900092363357544, "epoch": 0.618, "grad_norm": 612.0, "kl_loss_10": 180.3868850708008, "kl_loss_2": 2258.1235778808596, "kl_loss_3": 1773.0142578125, "kl_loss_7": 615.9339263916015, "learning_rate": 0.0003245248187459323, "loss": 1218.0189, "step": 6180 }, { "ce_loss_10": 3.4972055196762084, "ce_loss_13": 3.4217321276664734, "ce_loss_2": 4.4563206195831295, "ce_loss_3": 4.195159709453582, "ce_loss_7": 3.6716169476509095, "epoch": 0.619, "grad_norm": 596.0, "kl_loss_10": 176.01737060546876, "kl_loss_2": 2149.769183349609, "kl_loss_3": 1675.0436157226563, "kl_loss_7": 597.3845794677734, "learning_rate": 0.00032303996298219416, "loss": 1151.9841, "step": 6190 }, { "ce_loss_10": 3.5777448058128356, "ce_loss_13": 3.500323712825775, "ce_loss_2": 4.53541202545166, "ce_loss_3": 4.266927003860474, "ce_loss_7": 3.755027210712433, "epoch": 0.62, "grad_norm": 540.0, "kl_loss_10": 178.23485260009767, "kl_loss_2": 2112.595593261719, "kl_loss_3": 1646.6524230957032, "kl_loss_7": 602.416943359375, "learning_rate": 0.00032155688920406414, "loss": 1145.6068, "step": 6200 }, { "ce_loss_10": 3.489628565311432, "ce_loss_13": 3.408998668193817, "ce_loss_2": 4.5190582275390625, "ce_loss_3": 4.245685923099518, "ce_loss_7": 3.671571230888367, "epoch": 0.621, "grad_norm": 652.0, "kl_loss_10": 183.76829681396484, "kl_loss_2": 2272.4242309570313, "kl_loss_3": 1788.1328247070312, "kl_loss_7": 627.5300720214843, "learning_rate": 0.0003200756123460788, "loss": 1224.8912, "step": 6210 }, { "ce_loss_10": 3.5219372153282165, "ce_loss_13": 3.4430843591690063, "ce_loss_2": 4.530118870735168, "ce_loss_3": 4.26385805606842, "ce_loss_7": 3.708560848236084, "epoch": 0.622, "grad_norm": 684.0, "kl_loss_10": 185.2090690612793, "kl_loss_2": 2254.1378173828125, "kl_loss_3": 1774.3692199707032, "kl_loss_7": 633.037890625, "learning_rate": 0.00031859614732467957, "loss": 1207.0312, "step": 6220 }, { "ce_loss_10": 3.5700612902641295, "ce_loss_13": 3.4917181968688964, "ce_loss_2": 4.540509462356567, "ce_loss_3": 4.275044929981232, "ce_loss_7": 3.7488471269607544, "epoch": 0.623, "grad_norm": 564.0, "kl_loss_10": 178.5159034729004, "kl_loss_2": 2155.8026611328123, "kl_loss_3": 1685.940985107422, "kl_loss_7": 600.1484497070312, "learning_rate": 0.00031711850903806275, "loss": 1157.7447, "step": 6230 }, { "ce_loss_10": 3.479930281639099, "ce_loss_13": 3.39938303232193, "ce_loss_2": 4.482881689071656, "ce_loss_3": 4.214280414581299, "ce_loss_7": 3.666577732563019, "epoch": 0.624, "grad_norm": 528.0, "kl_loss_10": 185.9188034057617, "kl_loss_2": 2243.2543823242186, "kl_loss_3": 1758.6941833496094, "kl_loss_7": 628.0701446533203, "learning_rate": 0.0003156427123660297, "loss": 1172.3383, "step": 6240 }, { "ce_loss_10": 3.5643810868263244, "ce_loss_13": 3.4881609320640563, "ce_loss_2": 4.518170762062073, "ce_loss_3": 4.258860862255096, "ce_loss_7": 3.745578372478485, "epoch": 0.625, "grad_norm": 596.0, "kl_loss_10": 180.73046417236327, "kl_loss_2": 2135.2883361816407, "kl_loss_3": 1669.8626892089844, "kl_loss_7": 610.9410751342773, "learning_rate": 0.0003141687721698363, "loss": 1172.6947, "step": 6250 }, { "ce_loss_10": 3.536016345024109, "ce_loss_13": 3.4606189489364625, "ce_loss_2": 4.476572108268738, "ce_loss_3": 4.211447751522064, "ce_loss_7": 3.7014155983924866, "epoch": 0.626, "grad_norm": 616.0, "kl_loss_10": 175.8163749694824, "kl_loss_2": 2105.446813964844, "kl_loss_3": 1637.2767333984375, "kl_loss_7": 587.5009735107421, "learning_rate": 0.00031269670329204396, "loss": 1155.6384, "step": 6260 }, { "ce_loss_10": 3.5712973356246946, "ce_loss_13": 3.4947034239768984, "ce_loss_2": 4.515408515930176, "ce_loss_3": 4.251049220561981, "ce_loss_7": 3.7454182147979735, "epoch": 0.627, "grad_norm": 644.0, "kl_loss_10": 181.6370933532715, "kl_loss_2": 2120.7030395507813, "kl_loss_3": 1650.25341796875, "kl_loss_7": 607.5851348876953, "learning_rate": 0.00031122652055637015, "loss": 1169.2292, "step": 6270 }, { "ce_loss_10": 3.536707639694214, "ce_loss_13": 3.460920011997223, "ce_loss_2": 4.534442710876465, "ce_loss_3": 4.263644289970398, "ce_loss_7": 3.7196394085884092, "epoch": 0.628, "grad_norm": 556.0, "kl_loss_10": 181.97393569946288, "kl_loss_2": 2233.067547607422, "kl_loss_3": 1750.9132995605469, "kl_loss_7": 618.4156631469726, "learning_rate": 0.0003097582387675385, "loss": 1169.3315, "step": 6280 }, { "ce_loss_10": 3.5805759191513062, "ce_loss_13": 3.503600060939789, "ce_loss_2": 4.546688604354858, "ce_loss_3": 4.285043132305145, "ce_loss_7": 3.7596506476402283, "epoch": 0.629, "grad_norm": 536.0, "kl_loss_10": 181.50545425415038, "kl_loss_2": 2176.2076171875, "kl_loss_3": 1706.1532836914062, "kl_loss_7": 611.5946624755859, "learning_rate": 0.00030829187271113034, "loss": 1162.2808, "step": 6290 }, { "ce_loss_10": 3.5692893385887148, "ce_loss_13": 3.49398432970047, "ce_loss_2": 4.5324320793151855, "ce_loss_3": 4.271732580661774, "ce_loss_7": 3.738038659095764, "epoch": 0.63, "grad_norm": 660.0, "kl_loss_10": 176.80067443847656, "kl_loss_2": 2142.2837646484377, "kl_loss_3": 1672.4158203125, "kl_loss_7": 598.4024932861328, "learning_rate": 0.00030682743715343565, "loss": 1178.4112, "step": 6300 }, { "ce_loss_10": 3.5165117979049683, "ce_loss_13": 3.4367071866989134, "ce_loss_2": 4.5106003999710085, "ce_loss_3": 4.248035335540772, "ce_loss_7": 3.709369492530823, "epoch": 0.631, "grad_norm": 624.0, "kl_loss_10": 185.85676803588868, "kl_loss_2": 2187.9409912109377, "kl_loss_3": 1716.0930541992188, "kl_loss_7": 624.2622802734375, "learning_rate": 0.0003053649468413043, "loss": 1194.6155, "step": 6310 }, { "ce_loss_10": 3.6293103814125063, "ce_loss_13": 3.5522167325019836, "ce_loss_2": 4.589023590087891, "ce_loss_3": 4.323796653747559, "ce_loss_7": 3.8087464213371276, "epoch": 0.632, "grad_norm": 664.0, "kl_loss_10": 183.21706161499023, "kl_loss_2": 2147.7636474609376, "kl_loss_3": 1686.63291015625, "kl_loss_7": 615.6221435546875, "learning_rate": 0.00030390441650199725, "loss": 1158.5613, "step": 6320 }, { "ce_loss_10": 3.528099310398102, "ce_loss_13": 3.4539591908454894, "ce_loss_2": 4.50098488330841, "ce_loss_3": 4.2318372368812565, "ce_loss_7": 3.70650874376297, "epoch": 0.633, "grad_norm": 676.0, "kl_loss_10": 181.68777465820312, "kl_loss_2": 2164.3626098632812, "kl_loss_3": 1687.4978088378907, "kl_loss_7": 610.1166748046875, "learning_rate": 0.00030244586084303903, "loss": 1154.3, "step": 6330 }, { "ce_loss_10": 3.4934327363967896, "ce_loss_13": 3.416351318359375, "ce_loss_2": 4.505590105056763, "ce_loss_3": 4.235206222534179, "ce_loss_7": 3.6859657049179075, "epoch": 0.634, "grad_norm": 564.0, "kl_loss_10": 183.83423309326173, "kl_loss_2": 2252.3034423828126, "kl_loss_3": 1765.4126586914062, "kl_loss_7": 627.5603424072266, "learning_rate": 0.00030098929455206903, "loss": 1173.0053, "step": 6340 }, { "ce_loss_10": 3.5009153842926026, "ce_loss_13": 3.4256786108016968, "ce_loss_2": 4.492053604125976, "ce_loss_3": 4.224474251270294, "ce_loss_7": 3.6754886388778685, "epoch": 0.635, "grad_norm": 592.0, "kl_loss_10": 180.03737106323243, "kl_loss_2": 2236.1428771972655, "kl_loss_3": 1754.843768310547, "kl_loss_7": 615.7969268798828, "learning_rate": 0.00029953473229669324, "loss": 1215.3177, "step": 6350 }, { "ce_loss_10": 3.5320404410362243, "ce_loss_13": 3.4564929485321043, "ce_loss_2": 4.505661821365356, "ce_loss_3": 4.2487224817276, "ce_loss_7": 3.717836594581604, "epoch": 0.636, "grad_norm": 560.0, "kl_loss_10": 180.38322067260742, "kl_loss_2": 2164.6011291503905, "kl_loss_3": 1703.929705810547, "kl_loss_7": 616.6229248046875, "learning_rate": 0.00029808218872433767, "loss": 1152.0346, "step": 6360 }, { "ce_loss_10": 3.5955461502075194, "ce_loss_13": 3.521399176120758, "ce_loss_2": 4.553752660751343, "ce_loss_3": 4.287336015701294, "ce_loss_7": 3.7661701798439027, "epoch": 0.637, "grad_norm": 584.0, "kl_loss_10": 178.29160919189454, "kl_loss_2": 2154.0076416015627, "kl_loss_3": 1683.874932861328, "kl_loss_7": 604.1974517822266, "learning_rate": 0.0002966316784621, "loss": 1148.5613, "step": 6370 }, { "ce_loss_10": 3.509734773635864, "ce_loss_13": 3.4283226490020753, "ce_loss_2": 4.500501930713654, "ce_loss_3": 4.237296044826508, "ce_loss_7": 3.697099339962006, "epoch": 0.638, "grad_norm": 572.0, "kl_loss_10": 183.85193252563477, "kl_loss_2": 2219.202685546875, "kl_loss_3": 1744.0482971191407, "kl_loss_7": 628.3488616943359, "learning_rate": 0.0002951832161166024, "loss": 1161.3599, "step": 6380 }, { "ce_loss_10": 3.5833853006362917, "ce_loss_13": 3.5059871673583984, "ce_loss_2": 4.560363245010376, "ce_loss_3": 4.295464622974396, "ce_loss_7": 3.7690476536750794, "epoch": 0.639, "grad_norm": 524.0, "kl_loss_10": 182.52049560546874, "kl_loss_2": 2159.0023681640623, "kl_loss_3": 1692.8950073242188, "kl_loss_7": 613.0403747558594, "learning_rate": 0.0002937368162738445, "loss": 1138.2498, "step": 6390 }, { "ce_loss_10": 3.5200544476509092, "ce_loss_13": 3.4506627917289734, "ce_loss_2": 4.487471246719361, "ce_loss_3": 4.225354993343354, "ce_loss_7": 3.695161283016205, "epoch": 0.64, "grad_norm": 648.0, "kl_loss_10": 174.7782325744629, "kl_loss_2": 2168.6176025390623, "kl_loss_3": 1700.3829406738282, "kl_loss_7": 598.6395080566406, "learning_rate": 0.0002922924934990568, "loss": 1174.9205, "step": 6400 }, { "ce_loss_10": 3.460334539413452, "ce_loss_13": 3.3851306796073914, "ce_loss_2": 4.486385345458984, "ce_loss_3": 4.209315371513367, "ce_loss_7": 3.646379458904266, "epoch": 0.641, "grad_norm": 592.0, "kl_loss_10": 181.4815986633301, "kl_loss_2": 2269.7528442382813, "kl_loss_3": 1780.3827819824219, "kl_loss_7": 623.2667114257813, "learning_rate": 0.0002908502623365536, "loss": 1180.7166, "step": 6410 }, { "ce_loss_10": 3.400831735134125, "ce_loss_13": 3.323111522197723, "ce_loss_2": 4.43465530872345, "ce_loss_3": 4.168019390106201, "ce_loss_7": 3.5887860655784607, "epoch": 0.642, "grad_norm": 584.0, "kl_loss_10": 180.2845359802246, "kl_loss_2": 2285.9919677734374, "kl_loss_3": 1807.420263671875, "kl_loss_7": 623.3097045898437, "learning_rate": 0.0002894101373095867, "loss": 1196.7524, "step": 6420 }, { "ce_loss_10": 3.610305404663086, "ce_loss_13": 3.5335601687431337, "ce_loss_2": 4.569022560119629, "ce_loss_3": 4.3065975427627565, "ce_loss_7": 3.788155424594879, "epoch": 0.643, "grad_norm": 656.0, "kl_loss_10": 185.8272720336914, "kl_loss_2": 2151.861962890625, "kl_loss_3": 1684.6893188476563, "kl_loss_7": 614.9325622558594, "learning_rate": 0.00028797213292019926, "loss": 1162.4543, "step": 6430 }, { "ce_loss_10": 3.5838815212249755, "ce_loss_13": 3.5060059309005736, "ce_loss_2": 4.542932081222534, "ce_loss_3": 4.284253716468811, "ce_loss_7": 3.7631338119506834, "epoch": 0.644, "grad_norm": 536.0, "kl_loss_10": 182.34116134643554, "kl_loss_2": 2161.139373779297, "kl_loss_3": 1697.281298828125, "kl_loss_7": 612.268832397461, "learning_rate": 0.0002865362636490791, "loss": 1187.0314, "step": 6440 }, { "ce_loss_10": 3.598045587539673, "ce_loss_13": 3.524975371360779, "ce_loss_2": 4.552241158485413, "ce_loss_3": 4.294051146507263, "ce_loss_7": 3.7727458000183107, "epoch": 0.645, "grad_norm": 536.0, "kl_loss_10": 178.94673080444335, "kl_loss_2": 2142.5753845214845, "kl_loss_3": 1685.976092529297, "kl_loss_7": 604.8534698486328, "learning_rate": 0.0002851025439554142, "loss": 1148.6578, "step": 6450 }, { "ce_loss_10": 3.5864033341407775, "ce_loss_13": 3.5102365136146547, "ce_loss_2": 4.530459260940551, "ce_loss_3": 4.2697702050209045, "ce_loss_7": 3.77256600856781, "epoch": 0.646, "grad_norm": 552.0, "kl_loss_10": 180.96249084472657, "kl_loss_2": 2094.927349853516, "kl_loss_3": 1631.7639465332031, "kl_loss_7": 608.3878204345704, "learning_rate": 0.00028367098827674573, "loss": 1141.2359, "step": 6460 }, { "ce_loss_10": 3.5153084993362427, "ce_loss_13": 3.4397502303123475, "ce_loss_2": 4.504270768165588, "ce_loss_3": 4.232501339912415, "ce_loss_7": 3.69371120929718, "epoch": 0.647, "grad_norm": 588.0, "kl_loss_10": 178.14280624389647, "kl_loss_2": 2186.196148681641, "kl_loss_3": 1706.314013671875, "kl_loss_7": 600.8890106201172, "learning_rate": 0.00028224161102882397, "loss": 1170.0225, "step": 6470 }, { "ce_loss_10": 3.494782865047455, "ce_loss_13": 3.418469178676605, "ce_loss_2": 4.45595076084137, "ce_loss_3": 4.1893230199813845, "ce_loss_7": 3.6707924604415894, "epoch": 0.648, "grad_norm": 644.0, "kl_loss_10": 177.30072097778321, "kl_loss_2": 2145.1575622558594, "kl_loss_3": 1676.7467163085937, "kl_loss_7": 599.7084075927735, "learning_rate": 0.00028081442660546124, "loss": 1164.476, "step": 6480 }, { "ce_loss_10": 3.5571305990219115, "ce_loss_13": 3.4820198893547056, "ce_loss_2": 4.520304107666016, "ce_loss_3": 4.250013303756714, "ce_loss_7": 3.7307825326919555, "epoch": 0.649, "grad_norm": 708.0, "kl_loss_10": 180.5020294189453, "kl_loss_2": 2162.021893310547, "kl_loss_3": 1681.335223388672, "kl_loss_7": 604.5856201171875, "learning_rate": 0.0002793894493783892, "loss": 1161.7205, "step": 6490 }, { "ce_loss_10": 3.5730626702308657, "ce_loss_13": 3.4996850967407225, "ce_loss_2": 4.535577750205993, "ce_loss_3": 4.2806238532066345, "ce_loss_7": 3.746683120727539, "epoch": 0.65, "grad_norm": 532.0, "kl_loss_10": 175.4969383239746, "kl_loss_2": 2147.980224609375, "kl_loss_3": 1685.3772094726562, "kl_loss_7": 592.4264343261718, "learning_rate": 0.0002779666936971129, "loss": 1147.2826, "step": 6500 }, { "ce_loss_10": 3.579540717601776, "ce_loss_13": 3.503932845592499, "ce_loss_2": 4.570328307151795, "ce_loss_3": 4.304692578315735, "ce_loss_7": 3.760816919803619, "epoch": 0.651, "grad_norm": 560.0, "kl_loss_10": 180.5074890136719, "kl_loss_2": 2190.618371582031, "kl_loss_3": 1722.614599609375, "kl_loss_7": 613.0008575439454, "learning_rate": 0.00027654617388876614, "loss": 1176.0404, "step": 6510 }, { "ce_loss_10": 3.6101376891136168, "ce_loss_13": 3.5372079849243163, "ce_loss_2": 4.574031090736389, "ce_loss_3": 4.305119824409485, "ce_loss_7": 3.7854838371276855, "epoch": 0.652, "grad_norm": 506.0, "kl_loss_10": 179.79571838378905, "kl_loss_2": 2171.3123046875, "kl_loss_3": 1687.553973388672, "kl_loss_7": 603.6635681152344, "learning_rate": 0.0002751279042579672, "loss": 1161.0621, "step": 6520 }, { "ce_loss_10": 3.5500629782676696, "ce_loss_13": 3.475240981578827, "ce_loss_2": 4.515746712684631, "ce_loss_3": 4.248768877983093, "ce_loss_7": 3.726365828514099, "epoch": 0.653, "grad_norm": 520.0, "kl_loss_10": 175.72006454467774, "kl_loss_2": 2132.0258361816404, "kl_loss_3": 1663.1835388183595, "kl_loss_7": 593.7185913085938, "learning_rate": 0.00027371189908667604, "loss": 1173.0754, "step": 6530 }, { "ce_loss_10": 3.6066598892211914, "ce_loss_13": 3.5256664633750914, "ce_loss_2": 4.603748297691345, "ce_loss_3": 4.334264886379242, "ce_loss_7": 3.7890505313873293, "epoch": 0.654, "grad_norm": 556.0, "kl_loss_10": 184.21709976196288, "kl_loss_2": 2224.855090332031, "kl_loss_3": 1750.1156860351562, "kl_loss_7": 618.167140197754, "learning_rate": 0.00027229817263404863, "loss": 1200.1538, "step": 6540 }, { "ce_loss_10": 3.579574966430664, "ce_loss_13": 3.502782142162323, "ce_loss_2": 4.505041122436523, "ce_loss_3": 4.243296790122986, "ce_loss_7": 3.751440441608429, "epoch": 0.655, "grad_norm": 552.0, "kl_loss_10": 178.47067565917968, "kl_loss_2": 2091.8775146484377, "kl_loss_3": 1621.7340759277345, "kl_loss_7": 596.6207824707031, "learning_rate": 0.0002708867391362948, "loss": 1145.7798, "step": 6550 }, { "ce_loss_10": 3.5594303607940674, "ce_loss_13": 3.4848424673080443, "ce_loss_2": 4.510186004638672, "ce_loss_3": 4.239946413040161, "ce_loss_7": 3.729544758796692, "epoch": 0.656, "grad_norm": 600.0, "kl_loss_10": 174.02187423706056, "kl_loss_2": 2098.8442565917967, "kl_loss_3": 1625.1830017089844, "kl_loss_7": 579.9141540527344, "learning_rate": 0.0002694776128065345, "loss": 1152.9096, "step": 6560 }, { "ce_loss_10": 3.500006926059723, "ce_loss_13": 3.4239490151405336, "ce_loss_2": 4.463183629512787, "ce_loss_3": 4.198473536968232, "ce_loss_7": 3.6787616848945617, "epoch": 0.657, "grad_norm": 524.0, "kl_loss_10": 181.49803161621094, "kl_loss_2": 2175.4376220703125, "kl_loss_3": 1692.8995849609375, "kl_loss_7": 616.3900787353516, "learning_rate": 0.00026807080783465374, "loss": 1144.908, "step": 6570 }, { "ce_loss_10": 3.609026849269867, "ce_loss_13": 3.5301132678985594, "ce_loss_2": 4.5842578411102295, "ce_loss_3": 4.322691702842713, "ce_loss_7": 3.7907386422157288, "epoch": 0.658, "grad_norm": 540.0, "kl_loss_10": 181.2076759338379, "kl_loss_2": 2170.1943603515624, "kl_loss_3": 1703.6680847167968, "kl_loss_7": 614.0023223876954, "learning_rate": 0.00026666633838716316, "loss": 1180.9756, "step": 6580 }, { "ce_loss_10": 3.505808639526367, "ce_loss_13": 3.424725067615509, "ce_loss_2": 4.498762392997742, "ce_loss_3": 4.228746104240417, "ce_loss_7": 3.6866647005081177, "epoch": 0.659, "grad_norm": 660.0, "kl_loss_10": 183.16211471557617, "kl_loss_2": 2205.399041748047, "kl_loss_3": 1729.7577697753907, "kl_loss_7": 619.1098449707031, "learning_rate": 0.00026526421860705474, "loss": 1196.5574, "step": 6590 }, { "ce_loss_10": 3.5278443932533263, "ce_loss_13": 3.451081359386444, "ce_loss_2": 4.514556968212128, "ce_loss_3": 4.246424973011017, "ce_loss_7": 3.7130470991134645, "epoch": 0.66, "grad_norm": 604.0, "kl_loss_10": 183.11859054565429, "kl_loss_2": 2195.080352783203, "kl_loss_3": 1720.6291870117188, "kl_loss_7": 617.9165832519532, "learning_rate": 0.0002638644626136587, "loss": 1167.115, "step": 6600 }, { "ce_loss_10": 3.5388341546058655, "ce_loss_13": 3.4648394107818605, "ce_loss_2": 4.518757033348083, "ce_loss_3": 4.251232302188873, "ce_loss_7": 3.7166133403778074, "epoch": 0.661, "grad_norm": 600.0, "kl_loss_10": 177.9744987487793, "kl_loss_2": 2169.1557434082033, "kl_loss_3": 1695.717547607422, "kl_loss_7": 605.9922027587891, "learning_rate": 0.00026246708450250255, "loss": 1163.9504, "step": 6610 }, { "ce_loss_10": 3.536445343494415, "ce_loss_13": 3.4618695259094237, "ce_loss_2": 4.500265717506409, "ce_loss_3": 4.239484262466431, "ce_loss_7": 3.7084587097167967, "epoch": 0.662, "grad_norm": 624.0, "kl_loss_10": 177.32165069580077, "kl_loss_2": 2153.5470336914063, "kl_loss_3": 1682.642596435547, "kl_loss_7": 596.0139556884766, "learning_rate": 0.00026107209834516854, "loss": 1159.9879, "step": 6620 }, { "ce_loss_10": 3.4876843810081484, "ce_loss_13": 3.4082067966461183, "ce_loss_2": 4.498864269256591, "ce_loss_3": 4.235283279418946, "ce_loss_7": 3.666292154788971, "epoch": 0.663, "grad_norm": 620.0, "kl_loss_10": 180.81258544921874, "kl_loss_2": 2256.257257080078, "kl_loss_3": 1782.119805908203, "kl_loss_7": 615.3174774169922, "learning_rate": 0.0002596795181891514, "loss": 1197.8284, "step": 6630 }, { "ce_loss_10": 3.4956326842308045, "ce_loss_13": 3.414097845554352, "ce_loss_2": 4.498321509361267, "ce_loss_3": 4.228005886077881, "ce_loss_7": 3.6832273960113526, "epoch": 0.664, "grad_norm": 676.0, "kl_loss_10": 186.22876663208007, "kl_loss_2": 2223.420690917969, "kl_loss_3": 1743.7244079589843, "kl_loss_7": 627.1363647460937, "learning_rate": 0.000258289358057718, "loss": 1222.5622, "step": 6640 }, { "ce_loss_10": 3.5669368505477905, "ce_loss_13": 3.4856945157051085, "ce_loss_2": 4.551595258712768, "ce_loss_3": 4.286670958995819, "ce_loss_7": 3.751522934436798, "epoch": 0.665, "grad_norm": 556.0, "kl_loss_10": 185.22528228759765, "kl_loss_2": 2211.366003417969, "kl_loss_3": 1740.3289489746094, "kl_loss_7": 619.9797149658203, "learning_rate": 0.0002569016319497657, "loss": 1184.505, "step": 6650 }, { "ce_loss_10": 3.5523419260978697, "ce_loss_13": 3.4712039113044737, "ce_loss_2": 4.537931609153747, "ce_loss_3": 4.279768109321594, "ce_loss_7": 3.7336499214172365, "epoch": 0.666, "grad_norm": 544.0, "kl_loss_10": 186.748779296875, "kl_loss_2": 2205.344372558594, "kl_loss_3": 1734.7506713867188, "kl_loss_7": 622.0667907714844, "learning_rate": 0.00025551635383968066, "loss": 1198.5273, "step": 6660 }, { "ce_loss_10": 3.463807392120361, "ce_loss_13": 3.3866657257080077, "ce_loss_2": 4.469345259666443, "ce_loss_3": 4.193110883235931, "ce_loss_7": 3.6494885683059692, "epoch": 0.667, "grad_norm": 576.0, "kl_loss_10": 184.71422729492187, "kl_loss_2": 2248.0073059082033, "kl_loss_3": 1753.0323059082032, "kl_loss_7": 619.0755401611328, "learning_rate": 0.00025413353767719804, "loss": 1195.2947, "step": 6670 }, { "ce_loss_10": 3.5190200567245484, "ce_loss_13": 3.4452382922172546, "ce_loss_2": 4.497325706481933, "ce_loss_3": 4.232502174377442, "ce_loss_7": 3.694538187980652, "epoch": 0.668, "grad_norm": 568.0, "kl_loss_10": 177.03625259399413, "kl_loss_2": 2189.713330078125, "kl_loss_3": 1718.3252258300781, "kl_loss_7": 606.7687957763671, "learning_rate": 0.0002527531973872617, "loss": 1177.4366, "step": 6680 }, { "ce_loss_10": 3.538633036613464, "ce_loss_13": 3.4624911904335023, "ce_loss_2": 4.504513430595398, "ce_loss_3": 4.237690329551697, "ce_loss_7": 3.7170337319374083, "epoch": 0.669, "grad_norm": 592.0, "kl_loss_10": 178.7047462463379, "kl_loss_2": 2166.141973876953, "kl_loss_3": 1687.0463806152343, "kl_loss_7": 609.9779602050781, "learning_rate": 0.0002513753468698826, "loss": 1160.7738, "step": 6690 }, { "ce_loss_10": 3.510761630535126, "ce_loss_13": 3.4321574330329896, "ce_loss_2": 4.506561207771301, "ce_loss_3": 4.238252663612366, "ce_loss_7": 3.6953013062477114, "epoch": 0.67, "grad_norm": 544.0, "kl_loss_10": 182.07874755859376, "kl_loss_2": 2231.308416748047, "kl_loss_3": 1749.7796997070313, "kl_loss_7": 618.383251953125, "learning_rate": 0.0002500000000000001, "loss": 1185.6723, "step": 6700 }, { "ce_loss_10": 3.62176308631897, "ce_loss_13": 3.547257494926453, "ce_loss_2": 4.5295734882354735, "ce_loss_3": 4.277957272529602, "ce_loss_7": 3.788572609424591, "epoch": 0.671, "grad_norm": 548.0, "kl_loss_10": 173.84563446044922, "kl_loss_2": 2050.3306396484377, "kl_loss_3": 1604.508935546875, "kl_loss_7": 584.9092193603516, "learning_rate": 0.0002486271706273421, "loss": 1168.4034, "step": 6710 }, { "ce_loss_10": 3.557868146896362, "ce_loss_13": 3.485461747646332, "ce_loss_2": 4.488811063766479, "ce_loss_3": 4.22833331823349, "ce_loss_7": 3.7254873156547545, "epoch": 0.672, "grad_norm": 644.0, "kl_loss_10": 175.32781143188475, "kl_loss_2": 2091.0470642089845, "kl_loss_3": 1627.7723266601563, "kl_loss_7": 589.4509521484375, "learning_rate": 0.0002472568725762853, "loss": 1154.7741, "step": 6720 }, { "ce_loss_10": 3.5483877897262572, "ce_loss_13": 3.4755659341812133, "ce_loss_2": 4.477482891082763, "ce_loss_3": 4.2254945039749146, "ce_loss_7": 3.717711102962494, "epoch": 0.673, "grad_norm": 564.0, "kl_loss_10": 173.398193359375, "kl_loss_2": 2110.752795410156, "kl_loss_3": 1653.0817565917969, "kl_loss_7": 586.6092742919922, "learning_rate": 0.00024588911964571554, "loss": 1145.849, "step": 6730 }, { "ce_loss_10": 3.5657129168510435, "ce_loss_13": 3.4857802987098694, "ce_loss_2": 4.5576330661773685, "ce_loss_3": 4.288707995414734, "ce_loss_7": 3.753812789916992, "epoch": 0.674, "grad_norm": 524.0, "kl_loss_10": 187.46376113891603, "kl_loss_2": 2205.4671936035156, "kl_loss_3": 1727.960107421875, "kl_loss_7": 626.6214233398438, "learning_rate": 0.00024452392560888974, "loss": 1167.7188, "step": 6740 }, { "ce_loss_10": 3.455358147621155, "ce_loss_13": 3.378260016441345, "ce_loss_2": 4.419786167144776, "ce_loss_3": 4.155612635612488, "ce_loss_7": 3.6355133295059203, "epoch": 0.675, "grad_norm": 532.0, "kl_loss_10": 177.83211364746094, "kl_loss_2": 2172.1373962402345, "kl_loss_3": 1704.7519836425781, "kl_loss_7": 606.2507995605469, "learning_rate": 0.00024316130421329695, "loss": 1157.1621, "step": 6750 }, { "ce_loss_10": 3.535005438327789, "ce_loss_13": 3.4575978398323057, "ce_loss_2": 4.505799317359925, "ce_loss_3": 4.232890093326569, "ce_loss_7": 3.7101247310638428, "epoch": 0.676, "grad_norm": 564.0, "kl_loss_10": 177.2459358215332, "kl_loss_2": 2136.964288330078, "kl_loss_3": 1660.3365295410156, "kl_loss_7": 591.8486877441406, "learning_rate": 0.00024180126918051909, "loss": 1154.5281, "step": 6760 }, { "ce_loss_10": 3.577043890953064, "ce_loss_13": 3.5019183993339538, "ce_loss_2": 4.527614569664001, "ce_loss_3": 4.265857553482055, "ce_loss_7": 3.7534381628036497, "epoch": 0.677, "grad_norm": 604.0, "kl_loss_10": 178.1947784423828, "kl_loss_2": 2127.7522155761717, "kl_loss_3": 1659.7866516113281, "kl_loss_7": 597.4583068847656, "learning_rate": 0.00024044383420609406, "loss": 1141.1451, "step": 6770 }, { "ce_loss_10": 3.589032161235809, "ce_loss_13": 3.514654505252838, "ce_loss_2": 4.520573258399963, "ce_loss_3": 4.2588379859924315, "ce_loss_7": 3.7536001801490784, "epoch": 0.678, "grad_norm": 552.0, "kl_loss_10": 175.52578201293946, "kl_loss_2": 2107.4931701660157, "kl_loss_3": 1641.6564514160157, "kl_loss_7": 591.3939788818359, "learning_rate": 0.00023908901295937712, "loss": 1175.0256, "step": 6780 }, { "ce_loss_10": 3.5837427616119384, "ce_loss_13": 3.505910849571228, "ce_loss_2": 4.535600376129151, "ce_loss_3": 4.271885943412781, "ce_loss_7": 3.755298101902008, "epoch": 0.679, "grad_norm": 596.0, "kl_loss_10": 177.23758392333986, "kl_loss_2": 2111.0602905273436, "kl_loss_3": 1645.3706420898438, "kl_loss_7": 592.0553497314453, "learning_rate": 0.00023773681908340283, "loss": 1169.8496, "step": 6790 }, { "ce_loss_10": 3.5592074632644652, "ce_loss_13": 3.4772790670394897, "ce_loss_2": 4.548656535148621, "ce_loss_3": 4.2832125425338745, "ce_loss_7": 3.7424607038497926, "epoch": 0.68, "grad_norm": 600.0, "kl_loss_10": 187.7086395263672, "kl_loss_2": 2222.717413330078, "kl_loss_3": 1751.2139282226562, "kl_loss_7": 621.8694488525391, "learning_rate": 0.00023638726619474876, "loss": 1203.8379, "step": 6800 }, { "ce_loss_10": 3.5476158022880555, "ce_loss_13": 3.467449462413788, "ce_loss_2": 4.563005781173706, "ce_loss_3": 4.29602427482605, "ce_loss_7": 3.7365992784500124, "epoch": 0.681, "grad_norm": 580.0, "kl_loss_10": 183.3829345703125, "kl_loss_2": 2228.727575683594, "kl_loss_3": 1760.1514953613282, "kl_loss_7": 626.0395812988281, "learning_rate": 0.0002350403678833976, "loss": 1182.506, "step": 6810 }, { "ce_loss_10": 3.4750794649124144, "ce_loss_13": 3.39876846075058, "ce_loss_2": 4.465261030197143, "ce_loss_3": 4.1875766038894655, "ce_loss_7": 3.6560620784759523, "epoch": 0.682, "grad_norm": 490.0, "kl_loss_10": 178.67537307739258, "kl_loss_2": 2200.3225830078127, "kl_loss_3": 1714.0246276855469, "kl_loss_7": 608.0651733398438, "learning_rate": 0.00023369613771260007, "loss": 1160.444, "step": 6820 }, { "ce_loss_10": 3.5863471627235413, "ce_loss_13": 3.5096321582794188, "ce_loss_2": 4.573717498779297, "ce_loss_3": 4.300703597068787, "ce_loss_7": 3.767488884925842, "epoch": 0.683, "grad_norm": 608.0, "kl_loss_10": 181.20342712402345, "kl_loss_2": 2206.5099487304688, "kl_loss_3": 1721.635076904297, "kl_loss_7": 610.5273590087891, "learning_rate": 0.00023235458921873925, "loss": 1187.8242, "step": 6830 }, { "ce_loss_10": 3.5434704184532166, "ce_loss_13": 3.4598939180374146, "ce_loss_2": 4.561892867088318, "ce_loss_3": 4.300772976875305, "ce_loss_7": 3.73870370388031, "epoch": 0.684, "grad_norm": 640.0, "kl_loss_10": 187.79423599243165, "kl_loss_2": 2268.489392089844, "kl_loss_3": 1798.0396545410156, "kl_loss_7": 637.83154296875, "learning_rate": 0.0002310157359111938, "loss": 1215.3348, "step": 6840 }, { "ce_loss_10": 3.4310184836387636, "ce_loss_13": 3.3527446746826173, "ce_loss_2": 4.52064049243927, "ce_loss_3": 4.243770575523376, "ce_loss_7": 3.6270575404167174, "epoch": 0.685, "grad_norm": 656.0, "kl_loss_10": 183.79262008666993, "kl_loss_2": 2376.693957519531, "kl_loss_3": 1883.7591125488282, "kl_loss_7": 632.1836151123047, "learning_rate": 0.0002296795912722014, "loss": 1227.4164, "step": 6850 }, { "ce_loss_10": 3.570713925361633, "ce_loss_13": 3.494589388370514, "ce_loss_2": 4.519134759902954, "ce_loss_3": 4.253862988948822, "ce_loss_7": 3.7498515605926515, "epoch": 0.686, "grad_norm": 576.0, "kl_loss_10": 179.78029174804686, "kl_loss_2": 2128.572625732422, "kl_loss_3": 1654.8151062011718, "kl_loss_7": 601.8367279052734, "learning_rate": 0.0002283461687567236, "loss": 1133.3289, "step": 6860 }, { "ce_loss_10": 3.6324430108070374, "ce_loss_13": 3.5565361857414244, "ce_loss_2": 4.557056021690369, "ce_loss_3": 4.298560571670532, "ce_loss_7": 3.8067931652069094, "epoch": 0.687, "grad_norm": 506.0, "kl_loss_10": 176.90674362182617, "kl_loss_2": 2057.1298095703123, "kl_loss_3": 1601.75263671875, "kl_loss_7": 589.7931121826172, "learning_rate": 0.00022701548179231045, "loss": 1148.6605, "step": 6870 }, { "ce_loss_10": 3.582988679409027, "ce_loss_13": 3.5054625153541563, "ce_loss_2": 4.558988261222839, "ce_loss_3": 4.300906538963318, "ce_loss_7": 3.7617339849472047, "epoch": 0.688, "grad_norm": 628.0, "kl_loss_10": 181.7885940551758, "kl_loss_2": 2183.6723815917967, "kl_loss_3": 1725.755859375, "kl_loss_7": 613.1118804931641, "learning_rate": 0.00022568754377896516, "loss": 1157.5781, "step": 6880 }, { "ce_loss_10": 3.5760830521583555, "ce_loss_13": 3.499359941482544, "ce_loss_2": 4.526648283004761, "ce_loss_3": 4.2548288941383365, "ce_loss_7": 3.746409332752228, "epoch": 0.689, "grad_norm": 596.0, "kl_loss_10": 180.4591537475586, "kl_loss_2": 2140.339678955078, "kl_loss_3": 1666.426806640625, "kl_loss_7": 608.2543426513672, "learning_rate": 0.00022436236808900844, "loss": 1146.7832, "step": 6890 }, { "ce_loss_10": 3.4679219722747803, "ce_loss_13": 3.3943055748939512, "ce_loss_2": 4.462708353996277, "ce_loss_3": 4.191701900959015, "ce_loss_7": 3.6564658761024473, "epoch": 0.69, "grad_norm": 576.0, "kl_loss_10": 181.4543029785156, "kl_loss_2": 2231.3488586425783, "kl_loss_3": 1747.2512329101562, "kl_loss_7": 621.3712341308594, "learning_rate": 0.00022303996806694487, "loss": 1171.5013, "step": 6900 }, { "ce_loss_10": 3.5484726190567017, "ce_loss_13": 3.4742958664894106, "ce_loss_2": 4.519419646263122, "ce_loss_3": 4.2546670794487, "ce_loss_7": 3.7258636236190794, "epoch": 0.691, "grad_norm": 628.0, "kl_loss_10": 177.98818740844726, "kl_loss_2": 2169.697412109375, "kl_loss_3": 1700.1277648925782, "kl_loss_7": 608.3069030761719, "learning_rate": 0.00022172035702932823, "loss": 1158.7983, "step": 6910 }, { "ce_loss_10": 3.5924888372421266, "ce_loss_13": 3.52042818069458, "ce_loss_2": 4.517103600502014, "ce_loss_3": 4.2600155711174015, "ce_loss_7": 3.7615610361099243, "epoch": 0.692, "grad_norm": 644.0, "kl_loss_10": 178.67746124267578, "kl_loss_2": 2075.089074707031, "kl_loss_3": 1619.080419921875, "kl_loss_7": 597.07578125, "learning_rate": 0.00022040354826462666, "loss": 1140.3766, "step": 6920 }, { "ce_loss_10": 3.5235054731369018, "ce_loss_13": 3.4497315883636475, "ce_loss_2": 4.493763208389282, "ce_loss_3": 4.228443372249603, "ce_loss_7": 3.696590280532837, "epoch": 0.693, "grad_norm": 608.0, "kl_loss_10": 176.88443298339843, "kl_loss_2": 2155.86865234375, "kl_loss_3": 1688.133123779297, "kl_loss_7": 594.834016418457, "learning_rate": 0.0002190895550330899, "loss": 1170.6351, "step": 6930 }, { "ce_loss_10": 3.4576660275459288, "ce_loss_13": 3.3801838874816896, "ce_loss_2": 4.465155124664307, "ce_loss_3": 4.190037369728088, "ce_loss_7": 3.644961953163147, "epoch": 0.694, "grad_norm": 596.0, "kl_loss_10": 183.47678833007814, "kl_loss_2": 2243.730157470703, "kl_loss_3": 1750.4187255859374, "kl_loss_7": 620.829443359375, "learning_rate": 0.00021777839056661552, "loss": 1165.1125, "step": 6940 }, { "ce_loss_10": 3.5390109062194823, "ce_loss_13": 3.464726150035858, "ce_loss_2": 4.509364485740662, "ce_loss_3": 4.234912276268005, "ce_loss_7": 3.7123560190200804, "epoch": 0.695, "grad_norm": 544.0, "kl_loss_10": 176.9818588256836, "kl_loss_2": 2161.626544189453, "kl_loss_3": 1678.7994750976563, "kl_loss_7": 599.7094299316407, "learning_rate": 0.0002164700680686147, "loss": 1138.0607, "step": 6950 }, { "ce_loss_10": 3.584149193763733, "ce_loss_13": 3.509235203266144, "ce_loss_2": 4.522939825057984, "ce_loss_3": 4.255844712257385, "ce_loss_7": 3.757488739490509, "epoch": 0.696, "grad_norm": 520.0, "kl_loss_10": 178.28938369750978, "kl_loss_2": 2107.5391052246096, "kl_loss_3": 1637.0810913085938, "kl_loss_7": 596.8087493896485, "learning_rate": 0.0002151646007138806, "loss": 1144.8846, "step": 6960 }, { "ce_loss_10": 3.463143539428711, "ce_loss_13": 3.386814093589783, "ce_loss_2": 4.468677043914795, "ce_loss_3": 4.195722925662994, "ce_loss_7": 3.644878602027893, "epoch": 0.697, "grad_norm": 592.0, "kl_loss_10": 182.87069091796874, "kl_loss_2": 2238.085968017578, "kl_loss_3": 1753.8017456054688, "kl_loss_7": 618.0010162353516, "learning_rate": 0.00021386200164845526, "loss": 1174.7895, "step": 6970 }, { "ce_loss_10": 3.646360158920288, "ce_loss_13": 3.5726787090301513, "ce_loss_2": 4.5610116720199585, "ce_loss_3": 4.303556060791015, "ce_loss_7": 3.814376199245453, "epoch": 0.698, "grad_norm": 564.0, "kl_loss_10": 176.60812377929688, "kl_loss_2": 2073.2183532714844, "kl_loss_3": 1616.5314147949218, "kl_loss_7": 592.3170806884766, "learning_rate": 0.0002125622839894964, "loss": 1126.8248, "step": 6980 }, { "ce_loss_10": 3.5844451546669007, "ce_loss_13": 3.5105634808540342, "ce_loss_2": 4.530939984321594, "ce_loss_3": 4.263714623451233, "ce_loss_7": 3.7546409368515015, "epoch": 0.699, "grad_norm": 580.0, "kl_loss_10": 177.07121353149415, "kl_loss_2": 2114.079455566406, "kl_loss_3": 1646.3038818359375, "kl_loss_7": 590.21640625, "learning_rate": 0.00021126546082514663, "loss": 1144.4324, "step": 6990 }, { "ce_loss_10": 3.6092105984687803, "ce_loss_13": 3.533507966995239, "ce_loss_2": 4.539715147018432, "ce_loss_3": 4.274128103256226, "ce_loss_7": 3.7783223032951354, "epoch": 0.7, "grad_norm": 576.0, "kl_loss_10": 177.3388931274414, "kl_loss_2": 2107.8220703125, "kl_loss_3": 1636.2730224609375, "kl_loss_7": 594.1880798339844, "learning_rate": 0.00020997154521440098, "loss": 1131.7685, "step": 7000 }, { "ce_loss_10": 3.5483237147331237, "ce_loss_13": 3.476468551158905, "ce_loss_2": 4.5004148244857785, "ce_loss_3": 4.238211619853973, "ce_loss_7": 3.722394573688507, "epoch": 0.701, "grad_norm": 556.0, "kl_loss_10": 174.87986907958984, "kl_loss_2": 2127.186975097656, "kl_loss_3": 1661.8602966308595, "kl_loss_7": 600.6610717773438, "learning_rate": 0.0002086805501869749, "loss": 1133.7422, "step": 7010 }, { "ce_loss_10": 3.5188135743141173, "ce_loss_13": 3.441002869606018, "ce_loss_2": 4.517698335647583, "ce_loss_3": 4.247731244564056, "ce_loss_7": 3.704049062728882, "epoch": 0.702, "grad_norm": 616.0, "kl_loss_10": 182.97085342407226, "kl_loss_2": 2238.2483459472655, "kl_loss_3": 1746.861260986328, "kl_loss_7": 621.9453765869141, "learning_rate": 0.0002073924887431744, "loss": 1180.4881, "step": 7020 }, { "ce_loss_10": 3.5274356603622437, "ce_loss_13": 3.45092910528183, "ce_loss_2": 4.4901411771774296, "ce_loss_3": 4.230588483810425, "ce_loss_7": 3.706618547439575, "epoch": 0.703, "grad_norm": 568.0, "kl_loss_10": 179.11029281616212, "kl_loss_2": 2178.3450439453127, "kl_loss_3": 1711.4957885742188, "kl_loss_7": 605.4426422119141, "learning_rate": 0.00020610737385376348, "loss": 1200.9115, "step": 7030 }, { "ce_loss_10": 3.5887810468673704, "ce_loss_13": 3.5163929224014283, "ce_loss_2": 4.518351888656616, "ce_loss_3": 4.254893863201142, "ce_loss_7": 3.7612039923667906, "epoch": 0.704, "grad_norm": 628.0, "kl_loss_10": 176.6663619995117, "kl_loss_2": 2075.716662597656, "kl_loss_3": 1610.9020690917969, "kl_loss_7": 588.8746612548828, "learning_rate": 0.00020482521845983521, "loss": 1151.7219, "step": 7040 }, { "ce_loss_10": 3.5866637587547303, "ce_loss_13": 3.5072137475013734, "ce_loss_2": 4.558261132240295, "ce_loss_3": 4.291126704216003, "ce_loss_7": 3.7625884056091308, "epoch": 0.705, "grad_norm": 600.0, "kl_loss_10": 182.52303237915038, "kl_loss_2": 2193.1544799804688, "kl_loss_3": 1715.2766052246093, "kl_loss_7": 612.0993133544922, "learning_rate": 0.00020354603547267987, "loss": 1187.2512, "step": 7050 }, { "ce_loss_10": 3.56976774930954, "ce_loss_13": 3.488901746273041, "ce_loss_2": 4.5605854988098145, "ce_loss_3": 4.2862097263336185, "ce_loss_7": 3.7558568716049194, "epoch": 0.706, "grad_norm": 504.0, "kl_loss_10": 182.46872100830078, "kl_loss_2": 2185.692938232422, "kl_loss_3": 1703.4005493164063, "kl_loss_7": 615.3342132568359, "learning_rate": 0.00020226983777365604, "loss": 1201.599, "step": 7060 }, { "ce_loss_10": 3.46960107088089, "ce_loss_13": 3.394390141963959, "ce_loss_2": 4.4708491563797, "ce_loss_3": 4.21563994884491, "ce_loss_7": 3.6478799104690554, "epoch": 0.707, "grad_norm": 548.0, "kl_loss_10": 174.23039703369142, "kl_loss_2": 2219.3698486328126, "kl_loss_3": 1767.6679748535157, "kl_loss_7": 596.5048126220703, "learning_rate": 0.00020099663821406056, "loss": 1167.8441, "step": 7070 }, { "ce_loss_10": 3.573564553260803, "ce_loss_13": 3.4988652229309083, "ce_loss_2": 4.518075895309448, "ce_loss_3": 4.2526293873786924, "ce_loss_7": 3.74619642496109, "epoch": 0.708, "grad_norm": 688.0, "kl_loss_10": 173.7955307006836, "kl_loss_2": 2112.61328125, "kl_loss_3": 1644.760516357422, "kl_loss_7": 588.589468383789, "learning_rate": 0.00019972644961499853, "loss": 1168.0168, "step": 7080 }, { "ce_loss_10": 3.5425114035606384, "ce_loss_13": 3.4652504205703734, "ce_loss_2": 4.536031889915466, "ce_loss_3": 4.265958952903747, "ce_loss_7": 3.7277685403823853, "epoch": 0.709, "grad_norm": 544.0, "kl_loss_10": 181.94257354736328, "kl_loss_2": 2208.387451171875, "kl_loss_3": 1727.9980712890624, "kl_loss_7": 619.6463317871094, "learning_rate": 0.00019845928476725522, "loss": 1173.2897, "step": 7090 }, { "ce_loss_10": 3.6211097598075868, "ce_loss_13": 3.542751681804657, "ce_loss_2": 4.576697874069214, "ce_loss_3": 4.307754421234131, "ce_loss_7": 3.794824481010437, "epoch": 0.71, "grad_norm": 524.0, "kl_loss_10": 179.40447082519532, "kl_loss_2": 2133.6560546875, "kl_loss_3": 1661.1115417480469, "kl_loss_7": 603.4232849121094, "learning_rate": 0.00019719515643116677, "loss": 1187.0576, "step": 7100 }, { "ce_loss_10": 3.563658607006073, "ce_loss_13": 3.486394798755646, "ce_loss_2": 4.523072552680969, "ce_loss_3": 4.254948425292969, "ce_loss_7": 3.7338495373725893, "epoch": 0.711, "grad_norm": 560.0, "kl_loss_10": 177.84368362426758, "kl_loss_2": 2144.635882568359, "kl_loss_3": 1666.16875, "kl_loss_7": 594.3132598876953, "learning_rate": 0.0001959340773364911, "loss": 1165.8826, "step": 7110 }, { "ce_loss_10": 3.5770322680473328, "ce_loss_13": 3.5012174606323243, "ce_loss_2": 4.550109481811523, "ce_loss_3": 4.284217190742493, "ce_loss_7": 3.7552335023880006, "epoch": 0.712, "grad_norm": 482.0, "kl_loss_10": 179.49577865600585, "kl_loss_2": 2181.1701049804688, "kl_loss_3": 1700.0443542480468, "kl_loss_7": 603.1331329345703, "learning_rate": 0.0001946760601822809, "loss": 1144.9554, "step": 7120 }, { "ce_loss_10": 3.6210792899131774, "ce_loss_13": 3.549504554271698, "ce_loss_2": 4.563032126426696, "ce_loss_3": 4.2925217628479, "ce_loss_7": 3.7989898562431335, "epoch": 0.713, "grad_norm": 592.0, "kl_loss_10": 177.09535369873046, "kl_loss_2": 2104.0981018066404, "kl_loss_3": 1631.4184448242188, "kl_loss_7": 592.4103118896485, "learning_rate": 0.00019342111763675512, "loss": 1123.9035, "step": 7130 }, { "ce_loss_10": 3.624540627002716, "ce_loss_13": 3.5509743094444275, "ce_loss_2": 4.5522850275039675, "ce_loss_3": 4.289403009414673, "ce_loss_7": 3.7917919158935547, "epoch": 0.714, "grad_norm": 588.0, "kl_loss_10": 179.54557189941406, "kl_loss_2": 2098.2009887695312, "kl_loss_3": 1627.7805236816407, "kl_loss_7": 597.2573303222656, "learning_rate": 0.00019216926233717085, "loss": 1127.0122, "step": 7140 }, { "ce_loss_10": 3.5141358375549316, "ce_loss_13": 3.439559853076935, "ce_loss_2": 4.534635162353515, "ce_loss_3": 4.271865749359131, "ce_loss_7": 3.6872041702270506, "epoch": 0.715, "grad_norm": 660.0, "kl_loss_10": 176.31234970092774, "kl_loss_2": 2255.184912109375, "kl_loss_3": 1791.5307861328124, "kl_loss_7": 594.7268737792969, "learning_rate": 0.00019092050688969737, "loss": 1192.3771, "step": 7150 }, { "ce_loss_10": 3.586177408695221, "ce_loss_13": 3.5133618116378784, "ce_loss_2": 4.527247905731201, "ce_loss_3": 4.265925621986389, "ce_loss_7": 3.7605576038360597, "epoch": 0.716, "grad_norm": 644.0, "kl_loss_10": 177.39978713989257, "kl_loss_2": 2138.382684326172, "kl_loss_3": 1670.822119140625, "kl_loss_7": 599.2600921630859, "learning_rate": 0.00018967486386928817, "loss": 1143.1982, "step": 7160 }, { "ce_loss_10": 3.4582155346870422, "ce_loss_13": 3.3820405125617983, "ce_loss_2": 4.456401991844177, "ce_loss_3": 4.1904214262962345, "ce_loss_7": 3.640235483646393, "epoch": 0.717, "grad_norm": 644.0, "kl_loss_10": 181.15178756713868, "kl_loss_2": 2234.275775146484, "kl_loss_3": 1755.7729919433593, "kl_loss_7": 621.9208374023438, "learning_rate": 0.00018843234581955443, "loss": 1211.3026, "step": 7170 }, { "ce_loss_10": 3.4746442079544066, "ce_loss_13": 3.3969290494918822, "ce_loss_2": 4.4550795435905455, "ce_loss_3": 4.190334832668304, "ce_loss_7": 3.6564103603363036, "epoch": 0.718, "grad_norm": 552.0, "kl_loss_10": 182.11315155029297, "kl_loss_2": 2189.7255920410157, "kl_loss_3": 1717.2798217773438, "kl_loss_7": 618.1327026367187, "learning_rate": 0.00018719296525263924, "loss": 1174.7828, "step": 7180 }, { "ce_loss_10": 3.571851980686188, "ce_loss_13": 3.4972564935684205, "ce_loss_2": 4.505244612693787, "ce_loss_3": 4.243821203708649, "ce_loss_7": 3.744515597820282, "epoch": 0.719, "grad_norm": 616.0, "kl_loss_10": 176.35762710571288, "kl_loss_2": 2085.3956665039063, "kl_loss_3": 1620.6713073730468, "kl_loss_7": 587.7710571289062, "learning_rate": 0.0001859567346490913, "loss": 1127.6644, "step": 7190 }, { "ce_loss_10": 3.5473140597343447, "ce_loss_13": 3.469071900844574, "ce_loss_2": 4.532921981811524, "ce_loss_3": 4.260496711730957, "ce_loss_7": 3.727588391304016, "epoch": 0.72, "grad_norm": 576.0, "kl_loss_10": 181.04826431274415, "kl_loss_2": 2198.079150390625, "kl_loss_3": 1714.5421325683594, "kl_loss_7": 608.8879028320313, "learning_rate": 0.0001847236664577389, "loss": 1142.0284, "step": 7200 }, { "ce_loss_10": 3.5739798665046694, "ce_loss_13": 3.498915135860443, "ce_loss_2": 4.512744069099426, "ce_loss_3": 4.2453584432601925, "ce_loss_7": 3.7430235743522644, "epoch": 0.721, "grad_norm": 560.0, "kl_loss_10": 177.07028579711914, "kl_loss_2": 2100.2286865234373, "kl_loss_3": 1626.5753784179688, "kl_loss_7": 587.8365112304688, "learning_rate": 0.00018349377309556487, "loss": 1123.1494, "step": 7210 }, { "ce_loss_10": 3.5153507471084593, "ce_loss_13": 3.438252806663513, "ce_loss_2": 4.529551863670349, "ce_loss_3": 4.264591979980469, "ce_loss_7": 3.6999141216278075, "epoch": 0.722, "grad_norm": 576.0, "kl_loss_10": 181.94500274658202, "kl_loss_2": 2259.3618774414062, "kl_loss_3": 1782.3347534179688, "kl_loss_7": 618.6104766845704, "learning_rate": 0.00018226706694758193, "loss": 1192.0385, "step": 7220 }, { "ce_loss_10": 3.589731001853943, "ce_loss_13": 3.5162469148635864, "ce_loss_2": 4.535777926445007, "ce_loss_3": 4.275981712341308, "ce_loss_7": 3.758218777179718, "epoch": 0.723, "grad_norm": 536.0, "kl_loss_10": 176.7706611633301, "kl_loss_2": 2136.6498046875, "kl_loss_3": 1678.7979248046875, "kl_loss_7": 600.344839477539, "learning_rate": 0.0001810435603667075, "loss": 1186.8562, "step": 7230 }, { "ce_loss_10": 3.4363317847251893, "ce_loss_13": 3.3615066409111023, "ce_loss_2": 4.428185939788818, "ce_loss_3": 4.15840493440628, "ce_loss_7": 3.6154449939727784, "epoch": 0.724, "grad_norm": 568.0, "kl_loss_10": 175.6705749511719, "kl_loss_2": 2191.837860107422, "kl_loss_3": 1708.5897644042968, "kl_loss_7": 600.9333648681641, "learning_rate": 0.0001798232656736389, "loss": 1187.3889, "step": 7240 }, { "ce_loss_10": 3.6142520189285277, "ce_loss_13": 3.539129304885864, "ce_loss_2": 4.541441655158996, "ce_loss_3": 4.278818452358246, "ce_loss_7": 3.7878984928131105, "epoch": 0.725, "grad_norm": 548.0, "kl_loss_10": 176.2219985961914, "kl_loss_2": 2082.9966674804687, "kl_loss_3": 1618.1460876464844, "kl_loss_7": 589.8986907958985, "learning_rate": 0.0001786061951567303, "loss": 1139.4487, "step": 7250 }, { "ce_loss_10": 3.528095841407776, "ce_loss_13": 3.449831175804138, "ce_loss_2": 4.499278616905213, "ce_loss_3": 4.2353353023529055, "ce_loss_7": 3.7135850310325624, "epoch": 0.726, "grad_norm": 564.0, "kl_loss_10": 179.76034393310547, "kl_loss_2": 2139.9875549316407, "kl_loss_3": 1671.8400817871093, "kl_loss_7": 601.5716674804687, "learning_rate": 0.00017739236107186857, "loss": 1166.0127, "step": 7260 }, { "ce_loss_10": 3.6185179114341737, "ce_loss_13": 3.5442421674728393, "ce_loss_2": 4.529335474967956, "ce_loss_3": 4.268719971179962, "ce_loss_7": 3.782019078731537, "epoch": 0.727, "grad_norm": 506.0, "kl_loss_10": 174.4645896911621, "kl_loss_2": 2059.7219299316407, "kl_loss_3": 1594.1942993164062, "kl_loss_7": 584.5985778808594, "learning_rate": 0.00017618177564234904, "loss": 1131.8243, "step": 7270 }, { "ce_loss_10": 3.5931476950645447, "ce_loss_13": 3.5195810914039614, "ce_loss_2": 4.50758855342865, "ce_loss_3": 4.243484151363373, "ce_loss_7": 3.7607154488563537, "epoch": 0.728, "grad_norm": 560.0, "kl_loss_10": 172.751806640625, "kl_loss_2": 2033.7148681640624, "kl_loss_3": 1570.946112060547, "kl_loss_7": 576.0696563720703, "learning_rate": 0.00017497445105875377, "loss": 1116.918, "step": 7280 }, { "ce_loss_10": 3.5072262048721314, "ce_loss_13": 3.429379200935364, "ce_loss_2": 4.499281525611877, "ce_loss_3": 4.232627415657044, "ce_loss_7": 3.695177102088928, "epoch": 0.729, "grad_norm": 552.0, "kl_loss_10": 181.318611907959, "kl_loss_2": 2210.19443359375, "kl_loss_3": 1730.8836486816406, "kl_loss_7": 613.139291381836, "learning_rate": 0.000173770399478828, "loss": 1168.2677, "step": 7290 }, { "ce_loss_10": 3.422491526603699, "ce_loss_13": 3.347836971282959, "ce_loss_2": 4.407784128189087, "ce_loss_3": 4.131522953510284, "ce_loss_7": 3.6013160228729246, "epoch": 0.73, "grad_norm": 560.0, "kl_loss_10": 176.02440795898437, "kl_loss_2": 2191.964385986328, "kl_loss_3": 1698.5522827148438, "kl_loss_7": 599.1639129638672, "learning_rate": 0.0001725696330273575, "loss": 1197.3154, "step": 7300 }, { "ce_loss_10": 3.611281132698059, "ce_loss_13": 3.536842370033264, "ce_loss_2": 4.535079216957092, "ce_loss_3": 4.276083791255951, "ce_loss_7": 3.782840621471405, "epoch": 0.731, "grad_norm": 608.0, "kl_loss_10": 174.63313903808594, "kl_loss_2": 2067.6716491699217, "kl_loss_3": 1609.9422119140625, "kl_loss_7": 585.3752410888671, "learning_rate": 0.00017137216379604724, "loss": 1120.0867, "step": 7310 }, { "ce_loss_10": 3.491976761817932, "ce_loss_13": 3.4171910762786863, "ce_loss_2": 4.477530479431152, "ce_loss_3": 4.205724453926086, "ce_loss_7": 3.667802131175995, "epoch": 0.732, "grad_norm": 588.0, "kl_loss_10": 177.26437683105468, "kl_loss_2": 2177.8865478515627, "kl_loss_3": 1690.7372009277344, "kl_loss_7": 596.9918426513672, "learning_rate": 0.00017017800384339925, "loss": 1158.1862, "step": 7320 }, { "ce_loss_10": 3.446759831905365, "ce_loss_13": 3.3701040625572203, "ce_loss_2": 4.4656068086624146, "ce_loss_3": 4.189201056957245, "ce_loss_7": 3.63215457201004, "epoch": 0.733, "grad_norm": 608.0, "kl_loss_10": 179.6235023498535, "kl_loss_2": 2245.6789367675783, "kl_loss_3": 1758.8689514160155, "kl_loss_7": 611.1446807861328, "learning_rate": 0.00016898716519459073, "loss": 1147.9725, "step": 7330 }, { "ce_loss_10": 3.5716673254966738, "ce_loss_13": 3.4945391058921813, "ce_loss_2": 4.573297142982483, "ce_loss_3": 4.307599520683288, "ce_loss_7": 3.7545908093452454, "epoch": 0.734, "grad_norm": 564.0, "kl_loss_10": 182.98650054931642, "kl_loss_2": 2208.1832763671873, "kl_loss_3": 1733.5484619140625, "kl_loss_7": 619.9228546142579, "learning_rate": 0.00016779965984135375, "loss": 1166.6811, "step": 7340 }, { "ce_loss_10": 3.478439450263977, "ce_loss_13": 3.4015959978103636, "ce_loss_2": 4.458614790439606, "ce_loss_3": 4.194760942459107, "ce_loss_7": 3.6524015784263613, "epoch": 0.735, "grad_norm": 612.0, "kl_loss_10": 173.75391540527343, "kl_loss_2": 2180.093780517578, "kl_loss_3": 1698.9966857910156, "kl_loss_7": 593.231803894043, "learning_rate": 0.00016661549974185424, "loss": 1159.2525, "step": 7350 }, { "ce_loss_10": 3.51222710609436, "ce_loss_13": 3.4394211292266847, "ce_loss_2": 4.489507508277893, "ce_loss_3": 4.216231632232666, "ce_loss_7": 3.6876235485076903, "epoch": 0.736, "grad_norm": 604.0, "kl_loss_10": 179.0154716491699, "kl_loss_2": 2169.4521362304686, "kl_loss_3": 1690.815203857422, "kl_loss_7": 602.3167053222656, "learning_rate": 0.00016543469682057105, "loss": 1143.9477, "step": 7360 }, { "ce_loss_10": 3.5415560364723206, "ce_loss_13": 3.465597319602966, "ce_loss_2": 4.508477449417114, "ce_loss_3": 4.240069580078125, "ce_loss_7": 3.7229697704315186, "epoch": 0.737, "grad_norm": 564.0, "kl_loss_10": 181.52649993896483, "kl_loss_2": 2153.332647705078, "kl_loss_3": 1671.2495178222657, "kl_loss_7": 610.9849151611328, "learning_rate": 0.00016425726296817632, "loss": 1153.5225, "step": 7370 }, { "ce_loss_10": 3.5615882515907287, "ce_loss_13": 3.4901331782341005, "ce_loss_2": 4.51513102054596, "ce_loss_3": 4.248232364654541, "ce_loss_7": 3.7354116439819336, "epoch": 0.738, "grad_norm": 544.0, "kl_loss_10": 174.93305130004882, "kl_loss_2": 2115.262805175781, "kl_loss_3": 1640.2427490234375, "kl_loss_7": 589.8659851074219, "learning_rate": 0.00016308321004141607, "loss": 1140.3666, "step": 7380 }, { "ce_loss_10": 3.518048846721649, "ce_loss_13": 3.438374364376068, "ce_loss_2": 4.499938416481018, "ce_loss_3": 4.236223828792572, "ce_loss_7": 3.695832920074463, "epoch": 0.739, "grad_norm": 548.0, "kl_loss_10": 181.39317779541017, "kl_loss_2": 2175.456677246094, "kl_loss_3": 1701.57724609375, "kl_loss_7": 609.4653137207031, "learning_rate": 0.00016191254986299043, "loss": 1150.5328, "step": 7390 }, { "ce_loss_10": 3.5613037228584288, "ce_loss_13": 3.4887098908424377, "ce_loss_2": 4.503171324729919, "ce_loss_3": 4.245256781578064, "ce_loss_7": 3.7236536622047423, "epoch": 0.74, "grad_norm": 680.0, "kl_loss_10": 174.15130844116212, "kl_loss_2": 2131.4445068359373, "kl_loss_3": 1674.7688537597655, "kl_loss_7": 591.5977661132813, "learning_rate": 0.00016074529422143398, "loss": 1164.3935, "step": 7400 }, { "ce_loss_10": 3.5027013421058655, "ce_loss_13": 3.429375433921814, "ce_loss_2": 4.4999552249908445, "ce_loss_3": 4.231460630893707, "ce_loss_7": 3.6830108165740967, "epoch": 0.741, "grad_norm": 736.0, "kl_loss_10": 175.83671493530272, "kl_loss_2": 2196.9726989746096, "kl_loss_3": 1720.8603271484376, "kl_loss_7": 599.7947540283203, "learning_rate": 0.0001595814548709983, "loss": 1180.4217, "step": 7410 }, { "ce_loss_10": 3.576788854598999, "ce_loss_13": 3.498660683631897, "ce_loss_2": 4.549895691871643, "ce_loss_3": 4.287308168411255, "ce_loss_7": 3.7568582773208616, "epoch": 0.742, "grad_norm": 556.0, "kl_loss_10": 181.97546997070313, "kl_loss_2": 2178.1142333984376, "kl_loss_3": 1714.0122802734375, "kl_loss_7": 610.0974487304687, "learning_rate": 0.00015842104353153285, "loss": 1164.6248, "step": 7420 }, { "ce_loss_10": 3.5943754434585573, "ce_loss_13": 3.5180631637573243, "ce_loss_2": 4.549882531166077, "ce_loss_3": 4.288905811309815, "ce_loss_7": 3.7695993304252626, "epoch": 0.743, "grad_norm": 548.0, "kl_loss_10": 179.57282943725585, "kl_loss_2": 2154.6554992675783, "kl_loss_3": 1684.7554626464844, "kl_loss_7": 607.9901489257812, "learning_rate": 0.0001572640718883667, "loss": 1181.4139, "step": 7430 }, { "ce_loss_10": 3.5268728017807005, "ce_loss_13": 3.454422962665558, "ce_loss_2": 4.4702025055885315, "ce_loss_3": 4.211061191558838, "ce_loss_7": 3.699466872215271, "epoch": 0.744, "grad_norm": 544.0, "kl_loss_10": 173.9086715698242, "kl_loss_2": 2107.2433349609373, "kl_loss_3": 1643.049658203125, "kl_loss_7": 587.0272888183594, "learning_rate": 0.0001561105515921915, "loss": 1164.3465, "step": 7440 }, { "ce_loss_10": 3.376306939125061, "ce_loss_13": 3.3052693247795104, "ce_loss_2": 4.399094796180725, "ce_loss_3": 4.130729305744171, "ce_loss_7": 3.5687609910964966, "epoch": 0.745, "grad_norm": 540.0, "kl_loss_10": 174.5767349243164, "kl_loss_2": 2266.337322998047, "kl_loss_3": 1780.6517333984375, "kl_loss_7": 616.0360229492187, "learning_rate": 0.0001549604942589441, "loss": 1163.9994, "step": 7450 }, { "ce_loss_10": 3.5653053879737855, "ce_loss_13": 3.493623507022858, "ce_loss_2": 4.478042149543763, "ce_loss_3": 4.218795919418335, "ce_loss_7": 3.731026256084442, "epoch": 0.746, "grad_norm": 580.0, "kl_loss_10": 170.22484588623047, "kl_loss_2": 2039.5635498046875, "kl_loss_3": 1579.8051452636719, "kl_loss_7": 567.1303924560547, "learning_rate": 0.00015381391146968864, "loss": 1115.5928, "step": 7460 }, { "ce_loss_10": 3.5406330108642576, "ce_loss_13": 3.4665817737579347, "ce_loss_2": 4.507574367523193, "ce_loss_3": 4.2422141313552855, "ce_loss_7": 3.711947810649872, "epoch": 0.747, "grad_norm": 576.0, "kl_loss_10": 173.49912338256837, "kl_loss_2": 2137.4108154296873, "kl_loss_3": 1666.1288146972656, "kl_loss_7": 586.0349029541015, "learning_rate": 0.00015267081477050133, "loss": 1153.2315, "step": 7470 }, { "ce_loss_10": 3.6397757053375246, "ce_loss_13": 3.565910828113556, "ce_loss_2": 4.558345174789428, "ce_loss_3": 4.3014825820922855, "ce_loss_7": 3.813869845867157, "epoch": 0.748, "grad_norm": 524.0, "kl_loss_10": 179.69472961425782, "kl_loss_2": 2081.7364990234373, "kl_loss_3": 1619.6140869140625, "kl_loss_7": 597.4245025634766, "learning_rate": 0.00015153121567235335, "loss": 1120.7676, "step": 7480 }, { "ce_loss_10": 3.529373216629028, "ce_loss_13": 3.454616332054138, "ce_loss_2": 4.507159662246704, "ce_loss_3": 4.2369110703468325, "ce_loss_7": 3.7003498554229735, "epoch": 0.749, "grad_norm": 596.0, "kl_loss_10": 178.19662170410157, "kl_loss_2": 2201.308489990234, "kl_loss_3": 1718.668115234375, "kl_loss_7": 600.4444549560546, "learning_rate": 0.00015039512565099468, "loss": 1130.487, "step": 7490 }, { "ce_loss_10": 3.59435373544693, "ce_loss_13": 3.5217554926872254, "ce_loss_2": 4.542114019393921, "ce_loss_3": 4.2746872186660765, "ce_loss_7": 3.768599247932434, "epoch": 0.75, "grad_norm": 532.0, "kl_loss_10": 177.41806030273438, "kl_loss_2": 2130.0947509765624, "kl_loss_3": 1653.6429382324218, "kl_loss_7": 598.9670806884766, "learning_rate": 0.00014926255614683932, "loss": 1188.0775, "step": 7500 }, { "ce_loss_10": 3.5343728065490723, "ce_loss_13": 3.462270963191986, "ce_loss_2": 4.491153955459595, "ce_loss_3": 4.2244093179702755, "ce_loss_7": 3.70688259601593, "epoch": 0.751, "grad_norm": 584.0, "kl_loss_10": 175.2909957885742, "kl_loss_2": 2134.823455810547, "kl_loss_3": 1661.539990234375, "kl_loss_7": 592.2256774902344, "learning_rate": 0.0001481335185648498, "loss": 1152.3602, "step": 7510 }, { "ce_loss_10": 3.5509208917617796, "ce_loss_13": 3.4760316491127012, "ce_loss_2": 4.4910846710205075, "ce_loss_3": 4.236609256267547, "ce_loss_7": 3.7286911368370057, "epoch": 0.752, "grad_norm": 560.0, "kl_loss_10": 175.903653717041, "kl_loss_2": 2132.4962768554688, "kl_loss_3": 1669.2187805175781, "kl_loss_7": 598.009976196289, "learning_rate": 0.0001470080242744218, "loss": 1135.5451, "step": 7520 }, { "ce_loss_10": 3.5404749631881716, "ce_loss_13": 3.4668622732162477, "ce_loss_2": 4.505393123626709, "ce_loss_3": 4.248504590988159, "ce_loss_7": 3.7097239255905152, "epoch": 0.753, "grad_norm": 600.0, "kl_loss_10": 172.68473205566406, "kl_loss_2": 2143.0695861816407, "kl_loss_3": 1687.700439453125, "kl_loss_7": 591.5756866455079, "learning_rate": 0.0001458860846092705, "loss": 1151.0906, "step": 7530 }, { "ce_loss_10": 3.578909718990326, "ce_loss_13": 3.503495466709137, "ce_loss_2": 4.502352619171143, "ce_loss_3": 4.240141928195953, "ce_loss_7": 3.750500977039337, "epoch": 0.754, "grad_norm": 604.0, "kl_loss_10": 174.89483642578125, "kl_loss_2": 2075.617956542969, "kl_loss_3": 1612.4501708984376, "kl_loss_7": 588.5097457885743, "learning_rate": 0.00014476771086731566, "loss": 1116.6235, "step": 7540 }, { "ce_loss_10": 3.688204324245453, "ce_loss_13": 3.610430431365967, "ce_loss_2": 4.621451306343078, "ce_loss_3": 4.3530316829681395, "ce_loss_7": 3.8562689661979674, "epoch": 0.755, "grad_norm": 572.0, "kl_loss_10": 181.31634902954102, "kl_loss_2": 2096.732080078125, "kl_loss_3": 1625.2518310546875, "kl_loss_7": 592.3898040771485, "learning_rate": 0.00014365291431056872, "loss": 1170.6359, "step": 7550 }, { "ce_loss_10": 3.513639771938324, "ce_loss_13": 3.43876428604126, "ce_loss_2": 4.494768452644348, "ce_loss_3": 4.226865899562836, "ce_loss_7": 3.6938853025436402, "epoch": 0.756, "grad_norm": 648.0, "kl_loss_10": 182.01916885375977, "kl_loss_2": 2211.1534912109373, "kl_loss_3": 1723.5334899902343, "kl_loss_7": 617.1691345214844, "learning_rate": 0.00014254170616501827, "loss": 1163.1255, "step": 7560 }, { "ce_loss_10": 3.4477534770965574, "ce_loss_13": 3.3702123761177063, "ce_loss_2": 4.465814185142517, "ce_loss_3": 4.193384432792664, "ce_loss_7": 3.6376350045204164, "epoch": 0.757, "grad_norm": 652.0, "kl_loss_10": 181.91958312988282, "kl_loss_2": 2272.9578369140627, "kl_loss_3": 1780.6268127441406, "kl_loss_7": 631.3283477783203, "learning_rate": 0.0001414340976205183, "loss": 1210.6553, "step": 7570 }, { "ce_loss_10": 3.4623551964759827, "ce_loss_13": 3.386858320236206, "ce_loss_2": 4.47217173576355, "ce_loss_3": 4.196212124824524, "ce_loss_7": 3.6454687833786013, "epoch": 0.758, "grad_norm": 652.0, "kl_loss_10": 175.49118347167968, "kl_loss_2": 2225.9182312011717, "kl_loss_3": 1743.3149719238281, "kl_loss_7": 604.4145355224609, "learning_rate": 0.00014033009983067452, "loss": 1165.3377, "step": 7580 }, { "ce_loss_10": 3.625165855884552, "ce_loss_13": 3.553388500213623, "ce_loss_2": 4.5477535963058475, "ce_loss_3": 4.282978129386902, "ce_loss_7": 3.790937566757202, "epoch": 0.759, "grad_norm": 540.0, "kl_loss_10": 173.22186889648438, "kl_loss_2": 2076.229632568359, "kl_loss_3": 1605.808331298828, "kl_loss_7": 578.3152954101563, "learning_rate": 0.00013922972391273224, "loss": 1124.4209, "step": 7590 }, { "ce_loss_10": 3.624656689167023, "ce_loss_13": 3.5520288705825807, "ce_loss_2": 4.581440138816833, "ce_loss_3": 4.323860204219818, "ce_loss_7": 3.799424684047699, "epoch": 0.76, "grad_norm": 604.0, "kl_loss_10": 176.8631507873535, "kl_loss_2": 2111.316943359375, "kl_loss_3": 1657.235821533203, "kl_loss_7": 591.5200927734375, "learning_rate": 0.0001381329809474649, "loss": 1146.3586, "step": 7600 }, { "ce_loss_10": 3.532001996040344, "ce_loss_13": 3.4530585527420046, "ce_loss_2": 4.544336724281311, "ce_loss_3": 4.269702458381653, "ce_loss_7": 3.7165846705436705, "epoch": 0.761, "grad_norm": 632.0, "kl_loss_10": 181.38144760131837, "kl_loss_2": 2247.282580566406, "kl_loss_3": 1759.4795043945312, "kl_loss_7": 616.4709213256835, "learning_rate": 0.0001370398819790621, "loss": 1186.2754, "step": 7610 }, { "ce_loss_10": 3.6697842359542845, "ce_loss_13": 3.5929376244544984, "ce_loss_2": 4.604382491111755, "ce_loss_3": 4.336557102203369, "ce_loss_7": 3.8371459245681763, "epoch": 0.762, "grad_norm": 612.0, "kl_loss_10": 176.96341781616212, "kl_loss_2": 2080.2568908691405, "kl_loss_3": 1604.4097290039062, "kl_loss_7": 582.9634078979492, "learning_rate": 0.00013595043801501794, "loss": 1108.4416, "step": 7620 }, { "ce_loss_10": 3.4644748091697695, "ce_loss_13": 3.386727011203766, "ce_loss_2": 4.503179264068604, "ce_loss_3": 4.235963094234466, "ce_loss_7": 3.650843346118927, "epoch": 0.763, "grad_norm": 664.0, "kl_loss_10": 180.12555694580078, "kl_loss_2": 2289.173895263672, "kl_loss_3": 1815.476171875, "kl_loss_7": 622.1785308837891, "learning_rate": 0.00013486466002602133, "loss": 1194.0496, "step": 7630 }, { "ce_loss_10": 3.577344560623169, "ce_loss_13": 3.503310763835907, "ce_loss_2": 4.512240695953369, "ce_loss_3": 4.2521095991134645, "ce_loss_7": 3.7476378440856934, "epoch": 0.764, "grad_norm": 556.0, "kl_loss_10": 175.91430206298827, "kl_loss_2": 2097.193493652344, "kl_loss_3": 1632.0237731933594, "kl_loss_7": 587.4400573730469, "learning_rate": 0.00013378255894584462, "loss": 1166.6646, "step": 7640 }, { "ce_loss_10": 3.5123034000396727, "ce_loss_13": 3.433635425567627, "ce_loss_2": 4.500353503227234, "ce_loss_3": 4.2323464274406435, "ce_loss_7": 3.6943077445030212, "epoch": 0.765, "grad_norm": 560.0, "kl_loss_10": 181.23019485473634, "kl_loss_2": 2206.8176452636717, "kl_loss_3": 1726.0316467285156, "kl_loss_7": 608.9391540527344, "learning_rate": 0.0001327041456712334, "loss": 1171.7679, "step": 7650 }, { "ce_loss_10": 3.55541011095047, "ce_loss_13": 3.477762734889984, "ce_loss_2": 4.513465809822082, "ce_loss_3": 4.241680002212524, "ce_loss_7": 3.7298651814460753, "epoch": 0.766, "grad_norm": 544.0, "kl_loss_10": 180.71754302978516, "kl_loss_2": 2169.4558044433593, "kl_loss_3": 1686.6506469726562, "kl_loss_7": 611.8586975097656, "learning_rate": 0.00013162943106179747, "loss": 1171.1721, "step": 7660 }, { "ce_loss_10": 3.5293742179870606, "ce_loss_13": 3.456415057182312, "ce_loss_2": 4.477925181388855, "ce_loss_3": 4.21549437046051, "ce_loss_7": 3.7069293022155763, "epoch": 0.767, "grad_norm": 588.0, "kl_loss_10": 176.59339599609376, "kl_loss_2": 2121.027722167969, "kl_loss_3": 1652.0191345214844, "kl_loss_7": 595.7734832763672, "learning_rate": 0.00013055842593990132, "loss": 1142.6258, "step": 7670 }, { "ce_loss_10": 3.4710524678230286, "ce_loss_13": 3.399113714694977, "ce_loss_2": 4.434552907943726, "ce_loss_3": 4.163622748851776, "ce_loss_7": 3.6474674701690675, "epoch": 0.768, "grad_norm": 540.0, "kl_loss_10": 174.6668930053711, "kl_loss_2": 2126.9706176757813, "kl_loss_3": 1655.3524780273438, "kl_loss_7": 590.6725830078125, "learning_rate": 0.00012949114109055414, "loss": 1168.1568, "step": 7680 }, { "ce_loss_10": 3.519006776809692, "ce_loss_13": 3.4431997537612915, "ce_loss_2": 4.4897076964378355, "ce_loss_3": 4.226867043972016, "ce_loss_7": 3.6994680523872376, "epoch": 0.769, "grad_norm": 584.0, "kl_loss_10": 177.6523193359375, "kl_loss_2": 2161.8881958007814, "kl_loss_3": 1689.2801330566406, "kl_loss_7": 607.9943145751953, "learning_rate": 0.00012842758726130281, "loss": 1170.0649, "step": 7690 }, { "ce_loss_10": 3.5628538727760315, "ce_loss_13": 3.485966920852661, "ce_loss_2": 4.55888135433197, "ce_loss_3": 4.292485213279724, "ce_loss_7": 3.7444689750671385, "epoch": 0.77, "grad_norm": 580.0, "kl_loss_10": 179.38818130493163, "kl_loss_2": 2210.8425048828126, "kl_loss_3": 1733.9449951171875, "kl_loss_7": 610.3931243896484, "learning_rate": 0.00012736777516212267, "loss": 1160.5377, "step": 7700 }, { "ce_loss_10": 3.557508039474487, "ce_loss_13": 3.4799222111701966, "ce_loss_2": 4.522961139678955, "ce_loss_3": 4.253420984745025, "ce_loss_7": 3.736038076877594, "epoch": 0.771, "grad_norm": 548.0, "kl_loss_10": 181.55507125854493, "kl_loss_2": 2158.760382080078, "kl_loss_3": 1679.4241821289063, "kl_loss_7": 612.0054595947265, "learning_rate": 0.00012631171546530968, "loss": 1138.0437, "step": 7710 }, { "ce_loss_10": 3.573415291309357, "ce_loss_13": 3.4920427322387697, "ce_loss_2": 4.5341356039047245, "ce_loss_3": 4.2658631801605225, "ce_loss_7": 3.752500355243683, "epoch": 0.772, "grad_norm": 568.0, "kl_loss_10": 181.86062927246093, "kl_loss_2": 2149.480059814453, "kl_loss_3": 1673.6519470214844, "kl_loss_7": 603.6808334350586, "learning_rate": 0.00012525941880537307, "loss": 1168.6842, "step": 7720 }, { "ce_loss_10": 3.6038484454154966, "ce_loss_13": 3.528382158279419, "ce_loss_2": 4.546409988403321, "ce_loss_3": 4.28290638923645, "ce_loss_7": 3.774893271923065, "epoch": 0.773, "grad_norm": 648.0, "kl_loss_10": 176.13294677734376, "kl_loss_2": 2093.1892028808593, "kl_loss_3": 1628.6803955078126, "kl_loss_7": 588.4353439331055, "learning_rate": 0.00012421089577892869, "loss": 1139.2071, "step": 7730 }, { "ce_loss_10": 3.555491530895233, "ce_loss_13": 3.4761422514915465, "ce_loss_2": 4.545820116996765, "ce_loss_3": 4.266150867938995, "ce_loss_7": 3.7351402401924134, "epoch": 0.774, "grad_norm": 668.0, "kl_loss_10": 179.60176849365234, "kl_loss_2": 2216.0720031738283, "kl_loss_3": 1715.4457092285156, "kl_loss_7": 609.4783508300782, "learning_rate": 0.0001231661569445919, "loss": 1172.4699, "step": 7740 }, { "ce_loss_10": 3.410160577297211, "ce_loss_13": 3.3377888798713684, "ce_loss_2": 4.401880002021789, "ce_loss_3": 4.1333277225494385, "ce_loss_7": 3.589407229423523, "epoch": 0.775, "grad_norm": 560.0, "kl_loss_10": 176.03026962280273, "kl_loss_2": 2206.2500732421877, "kl_loss_3": 1718.6787414550781, "kl_loss_7": 601.0735870361328, "learning_rate": 0.00012212521282287093, "loss": 1191.8578, "step": 7750 }, { "ce_loss_10": 3.5700145840644835, "ce_loss_13": 3.493156003952026, "ce_loss_2": 4.517843317985535, "ce_loss_3": 4.254901158809662, "ce_loss_7": 3.748375141620636, "epoch": 0.776, "grad_norm": 536.0, "kl_loss_10": 180.4297233581543, "kl_loss_2": 2117.2120727539063, "kl_loss_3": 1651.861083984375, "kl_loss_7": 599.0601287841797, "learning_rate": 0.00012108807389606158, "loss": 1171.4985, "step": 7760 }, { "ce_loss_10": 3.5604520797729493, "ce_loss_13": 3.4879041433334352, "ce_loss_2": 4.51350736618042, "ce_loss_3": 4.255460405349732, "ce_loss_7": 3.737185871601105, "epoch": 0.777, "grad_norm": 624.0, "kl_loss_10": 173.82694396972656, "kl_loss_2": 2134.6752502441404, "kl_loss_3": 1670.9033142089843, "kl_loss_7": 592.1039428710938, "learning_rate": 0.00012005475060814159, "loss": 1139.6322, "step": 7770 }, { "ce_loss_10": 3.5012547731399537, "ce_loss_13": 3.4265154361724854, "ce_loss_2": 4.493270707130432, "ce_loss_3": 4.232757782936096, "ce_loss_7": 3.676969814300537, "epoch": 0.778, "grad_norm": 592.0, "kl_loss_10": 178.45665435791017, "kl_loss_2": 2218.041455078125, "kl_loss_3": 1749.4460815429688, "kl_loss_7": 609.0319793701171, "learning_rate": 0.00011902525336466464, "loss": 1173.4994, "step": 7780 }, { "ce_loss_10": 3.487755036354065, "ce_loss_13": 3.40771107673645, "ce_loss_2": 4.503837430477143, "ce_loss_3": 4.227473521232605, "ce_loss_7": 3.6715272665023804, "epoch": 0.779, "grad_norm": 556.0, "kl_loss_10": 182.97367095947266, "kl_loss_2": 2253.76220703125, "kl_loss_3": 1756.6077819824218, "kl_loss_7": 618.8783203125, "learning_rate": 0.00011799959253265668, "loss": 1168.3436, "step": 7790 }, { "ce_loss_10": 3.548134469985962, "ce_loss_13": 3.4717200636863708, "ce_loss_2": 4.531218719482422, "ce_loss_3": 4.259068071842194, "ce_loss_7": 3.725462853908539, "epoch": 0.78, "grad_norm": 588.0, "kl_loss_10": 179.5894790649414, "kl_loss_2": 2197.6992370605467, "kl_loss_3": 1714.0868286132813, "kl_loss_7": 606.9559936523438, "learning_rate": 0.00011697777844051105, "loss": 1168.1586, "step": 7800 }, { "ce_loss_10": 3.5325579047203064, "ce_loss_13": 3.4524773359298706, "ce_loss_2": 4.540277624130249, "ce_loss_3": 4.275496506690979, "ce_loss_7": 3.709948194026947, "epoch": 0.781, "grad_norm": 600.0, "kl_loss_10": 182.08444366455078, "kl_loss_2": 2253.191998291016, "kl_loss_3": 1783.90888671875, "kl_loss_7": 609.2252471923828, "learning_rate": 0.00011595982137788402, "loss": 1182.0791, "step": 7810 }, { "ce_loss_10": 3.507384693622589, "ce_loss_13": 3.433124232292175, "ce_loss_2": 4.452573490142822, "ce_loss_3": 4.191938650608063, "ce_loss_7": 3.6804782152175903, "epoch": 0.782, "grad_norm": 552.0, "kl_loss_10": 174.53733520507814, "kl_loss_2": 2103.1436462402344, "kl_loss_3": 1636.1021728515625, "kl_loss_7": 594.1032348632813, "learning_rate": 0.00011494573159559212, "loss": 1150.1953, "step": 7820 }, { "ce_loss_10": 3.495812237262726, "ce_loss_13": 3.4193639039993284, "ce_loss_2": 4.4669132947921755, "ce_loss_3": 4.2113652467727665, "ce_loss_7": 3.67316712141037, "epoch": 0.783, "grad_norm": 572.0, "kl_loss_10": 178.65593719482422, "kl_loss_2": 2173.239221191406, "kl_loss_3": 1708.3340942382813, "kl_loss_7": 603.083627319336, "learning_rate": 0.00011393551930550828, "loss": 1187.9246, "step": 7830 }, { "ce_loss_10": 3.6368354201316833, "ce_loss_13": 3.559674692153931, "ce_loss_2": 4.571843910217285, "ce_loss_3": 4.303619515895844, "ce_loss_7": 3.8069366455078124, "epoch": 0.784, "grad_norm": 588.0, "kl_loss_10": 179.06233749389648, "kl_loss_2": 2120.6896240234373, "kl_loss_3": 1638.9197570800782, "kl_loss_7": 595.7463287353515, "learning_rate": 0.00011292919468045875, "loss": 1145.6585, "step": 7840 }, { "ce_loss_10": 3.584019410610199, "ce_loss_13": 3.5086099743843078, "ce_loss_2": 4.53436963558197, "ce_loss_3": 4.271309959888458, "ce_loss_7": 3.7602915167808533, "epoch": 0.785, "grad_norm": 528.0, "kl_loss_10": 177.57644500732422, "kl_loss_2": 2126.9706481933595, "kl_loss_3": 1654.4735168457032, "kl_loss_7": 600.0492980957031, "learning_rate": 0.00011192676785412154, "loss": 1144.0532, "step": 7850 }, { "ce_loss_10": 3.522589087486267, "ce_loss_13": 3.4456050395965576, "ce_loss_2": 4.529689431190491, "ce_loss_3": 4.258461606502533, "ce_loss_7": 3.704596519470215, "epoch": 0.786, "grad_norm": 624.0, "kl_loss_10": 178.9210517883301, "kl_loss_2": 2216.430499267578, "kl_loss_3": 1733.1487976074218, "kl_loss_7": 602.0237121582031, "learning_rate": 0.00011092824892092374, "loss": 1161.7434, "step": 7860 }, { "ce_loss_10": 3.454429876804352, "ce_loss_13": 3.376889729499817, "ce_loss_2": 4.473304414749146, "ce_loss_3": 4.201044774055481, "ce_loss_7": 3.6403449535369874, "epoch": 0.787, "grad_norm": 544.0, "kl_loss_10": 178.50691452026368, "kl_loss_2": 2241.591131591797, "kl_loss_3": 1762.5004089355468, "kl_loss_7": 614.9870758056641, "learning_rate": 0.0001099336479359398, "loss": 1163.7643, "step": 7870 }, { "ce_loss_10": 3.5764689803123475, "ce_loss_13": 3.507636034488678, "ce_loss_2": 4.512799096107483, "ce_loss_3": 4.25046044588089, "ce_loss_7": 3.746009385585785, "epoch": 0.788, "grad_norm": 564.0, "kl_loss_10": 175.3071716308594, "kl_loss_2": 2102.777294921875, "kl_loss_3": 1634.8632263183595, "kl_loss_7": 592.2164337158204, "learning_rate": 0.00010894297491479043, "loss": 1142.6834, "step": 7880 }, { "ce_loss_10": 3.575552821159363, "ce_loss_13": 3.5023175954818724, "ce_loss_2": 4.539198517799377, "ce_loss_3": 4.279193782806397, "ce_loss_7": 3.750091075897217, "epoch": 0.789, "grad_norm": 576.0, "kl_loss_10": 176.76428680419923, "kl_loss_2": 2146.3808166503904, "kl_loss_3": 1681.0488159179688, "kl_loss_7": 595.56142578125, "learning_rate": 0.00010795623983354214, "loss": 1139.8293, "step": 7890 }, { "ce_loss_10": 3.4591768264770506, "ce_loss_13": 3.3825891733169557, "ce_loss_2": 4.4514943838119505, "ce_loss_3": 4.181638932228088, "ce_loss_7": 3.643087315559387, "epoch": 0.79, "grad_norm": 580.0, "kl_loss_10": 182.44262008666993, "kl_loss_2": 2230.9637817382813, "kl_loss_3": 1740.3924072265625, "kl_loss_7": 621.2922943115234, "learning_rate": 0.00010697345262860636, "loss": 1171.6089, "step": 7900 }, { "ce_loss_10": 3.600342130661011, "ce_loss_13": 3.5264546155929564, "ce_loss_2": 4.545495390892029, "ce_loss_3": 4.278535521030426, "ce_loss_7": 3.771434724330902, "epoch": 0.791, "grad_norm": 736.0, "kl_loss_10": 177.22287063598634, "kl_loss_2": 2132.298791503906, "kl_loss_3": 1650.1429077148437, "kl_loss_7": 593.3167419433594, "learning_rate": 0.00010599462319663906, "loss": 1136.3734, "step": 7910 }, { "ce_loss_10": 3.5746383547782896, "ce_loss_13": 3.4998196601867675, "ce_loss_2": 4.493499338626862, "ce_loss_3": 4.230222713947296, "ce_loss_7": 3.7425215244293213, "epoch": 0.792, "grad_norm": 520.0, "kl_loss_10": 174.49715042114258, "kl_loss_2": 2051.4084716796874, "kl_loss_3": 1592.7509643554688, "kl_loss_7": 582.2202606201172, "learning_rate": 0.00010501976139444191, "loss": 1118.4902, "step": 7920 }, { "ce_loss_10": 3.6047690868377686, "ce_loss_13": 3.5289911150932314, "ce_loss_2": 4.545255088806153, "ce_loss_3": 4.2847788572311405, "ce_loss_7": 3.7748185992240906, "epoch": 0.793, "grad_norm": 604.0, "kl_loss_10": 176.07794952392578, "kl_loss_2": 2104.0453186035156, "kl_loss_3": 1645.7491271972656, "kl_loss_7": 587.9952331542969, "learning_rate": 0.0001040488770388625, "loss": 1154.1295, "step": 7930 }, { "ce_loss_10": 3.548888790607452, "ce_loss_13": 3.4759244203567503, "ce_loss_2": 4.515872287750244, "ce_loss_3": 4.250580382347107, "ce_loss_7": 3.7205033540725707, "epoch": 0.794, "grad_norm": 680.0, "kl_loss_10": 177.23135299682616, "kl_loss_2": 2173.47548828125, "kl_loss_3": 1700.3454467773438, "kl_loss_7": 599.390249633789, "learning_rate": 0.00010308197990669538, "loss": 1149.7575, "step": 7940 }, { "ce_loss_10": 3.664888024330139, "ce_loss_13": 3.5850353479385375, "ce_loss_2": 4.610143923759461, "ce_loss_3": 4.346996653079986, "ce_loss_7": 3.83613098859787, "epoch": 0.795, "grad_norm": 540.0, "kl_loss_10": 179.47011337280273, "kl_loss_2": 2129.81064453125, "kl_loss_3": 1662.4896118164063, "kl_loss_7": 599.4926940917969, "learning_rate": 0.0001021190797345839, "loss": 1140.0146, "step": 7950 }, { "ce_loss_10": 3.3896429777145385, "ce_loss_13": 3.3098750829696657, "ce_loss_2": 4.413335740566254, "ce_loss_3": 4.134794509410858, "ce_loss_7": 3.57609201669693, "epoch": 0.796, "grad_norm": 580.0, "kl_loss_10": 185.28996887207032, "kl_loss_2": 2269.882763671875, "kl_loss_3": 1772.13056640625, "kl_loss_7": 628.5536560058594, "learning_rate": 0.00010116018621892236, "loss": 1175.2182, "step": 7960 }, { "ce_loss_10": 3.603187918663025, "ce_loss_13": 3.5232182860374452, "ce_loss_2": 4.567694234848022, "ce_loss_3": 4.309406304359436, "ce_loss_7": 3.779837393760681, "epoch": 0.797, "grad_norm": 608.0, "kl_loss_10": 186.1454734802246, "kl_loss_2": 2165.1848999023437, "kl_loss_3": 1705.360009765625, "kl_loss_7": 616.0535675048828, "learning_rate": 0.00010020530901575753, "loss": 1136.0533, "step": 7970 }, { "ce_loss_10": 3.625728499889374, "ce_loss_13": 3.5490816116333006, "ce_loss_2": 4.573475480079651, "ce_loss_3": 4.304804050922394, "ce_loss_7": 3.799483132362366, "epoch": 0.798, "grad_norm": 520.0, "kl_loss_10": 180.7791946411133, "kl_loss_2": 2134.7111938476564, "kl_loss_3": 1658.3630126953126, "kl_loss_7": 601.8699676513672, "learning_rate": 9.925445774069231e-05, "loss": 1126.8894, "step": 7980 }, { "ce_loss_10": 3.5760633826255797, "ce_loss_13": 3.500509262084961, "ce_loss_2": 4.527716112136841, "ce_loss_3": 4.2627903580665585, "ce_loss_7": 3.754010498523712, "epoch": 0.799, "grad_norm": 728.0, "kl_loss_10": 177.96156311035156, "kl_loss_2": 2117.894659423828, "kl_loss_3": 1646.0127319335938, "kl_loss_7": 595.1834564208984, "learning_rate": 9.830764196878872e-05, "loss": 1125.6953, "step": 7990 }, { "ce_loss_10": 3.5167272210121157, "ce_loss_13": 3.443312036991119, "ce_loss_2": 4.485463619232178, "ce_loss_3": 4.227659916877746, "ce_loss_7": 3.6942790031433104, "epoch": 0.8, "grad_norm": 480.0, "kl_loss_10": 175.60029678344728, "kl_loss_2": 2190.2330810546873, "kl_loss_3": 1721.7053161621093, "kl_loss_7": 603.3801788330078, "learning_rate": 9.736487123447069e-05, "loss": 1159.6166, "step": 8000 }, { "ce_loss_10": 3.4639697551727293, "ce_loss_13": 3.389075720310211, "ce_loss_2": 4.485336112976074, "ce_loss_3": 4.228267467021942, "ce_loss_7": 3.6402989268302917, "epoch": 0.801, "grad_norm": 600.0, "kl_loss_10": 179.87705307006837, "kl_loss_2": 2294.8387084960937, "kl_loss_3": 1823.8187927246095, "kl_loss_7": 608.9864349365234, "learning_rate": 9.642615503142926e-05, "loss": 1194.0703, "step": 8010 }, { "ce_loss_10": 3.5347692489624025, "ce_loss_13": 3.4572302103042603, "ce_loss_2": 4.520514702796936, "ce_loss_3": 4.254623317718506, "ce_loss_7": 3.7080873131752012, "epoch": 0.802, "grad_norm": 572.0, "kl_loss_10": 175.5455764770508, "kl_loss_2": 2196.378839111328, "kl_loss_3": 1715.636651611328, "kl_loss_7": 596.8724548339844, "learning_rate": 9.549150281252633e-05, "loss": 1151.6992, "step": 8020 }, { "ce_loss_10": 3.563262867927551, "ce_loss_13": 3.486225724220276, "ce_loss_2": 4.527439785003662, "ce_loss_3": 4.256748235225677, "ce_loss_7": 3.7390462875366213, "epoch": 0.803, "grad_norm": 520.0, "kl_loss_10": 179.0418388366699, "kl_loss_2": 2160.1055908203125, "kl_loss_3": 1681.3586303710938, "kl_loss_7": 596.7608947753906, "learning_rate": 9.4560923989699e-05, "loss": 1169.5601, "step": 8030 }, { "ce_loss_10": 3.549173581600189, "ce_loss_13": 3.4747613072395325, "ce_loss_2": 4.515510749816895, "ce_loss_3": 4.245619797706604, "ce_loss_7": 3.7281826019287108, "epoch": 0.804, "grad_norm": 552.0, "kl_loss_10": 177.9036865234375, "kl_loss_2": 2149.996447753906, "kl_loss_3": 1673.0265747070312, "kl_loss_7": 598.6687103271485, "learning_rate": 9.363442793386607e-05, "loss": 1174.7094, "step": 8040 }, { "ce_loss_10": 3.5321462750434875, "ce_loss_13": 3.453168177604675, "ce_loss_2": 4.5332019329071045, "ce_loss_3": 4.265519142150879, "ce_loss_7": 3.7162665724754333, "epoch": 0.805, "grad_norm": 592.0, "kl_loss_10": 181.04829177856445, "kl_loss_2": 2218.4782836914064, "kl_loss_3": 1732.6318908691405, "kl_loss_7": 617.0633758544922, "learning_rate": 9.271202397483213e-05, "loss": 1149.8916, "step": 8050 }, { "ce_loss_10": 3.547755253314972, "ce_loss_13": 3.474861478805542, "ce_loss_2": 4.498465514183044, "ce_loss_3": 4.235939025878906, "ce_loss_7": 3.7163867115974427, "epoch": 0.806, "grad_norm": 572.0, "kl_loss_10": 175.92396697998046, "kl_loss_2": 2136.2159729003906, "kl_loss_3": 1668.2355712890626, "kl_loss_7": 590.7317443847656, "learning_rate": 9.179372140119524e-05, "loss": 1168.4604, "step": 8060 }, { "ce_loss_10": 3.494523513317108, "ce_loss_13": 3.4206513285636904, "ce_loss_2": 4.459244108200073, "ce_loss_3": 4.188582479953766, "ce_loss_7": 3.6680249691009523, "epoch": 0.807, "grad_norm": 564.0, "kl_loss_10": 176.53147811889647, "kl_loss_2": 2154.551867675781, "kl_loss_3": 1677.4455200195312, "kl_loss_7": 596.7226654052735, "learning_rate": 9.087952946025175e-05, "loss": 1164.4886, "step": 8070 }, { "ce_loss_10": 3.6058158397674562, "ce_loss_13": 3.5339553594589233, "ce_loss_2": 4.533062171936035, "ce_loss_3": 4.26933354139328, "ce_loss_7": 3.768651068210602, "epoch": 0.808, "grad_norm": 592.0, "kl_loss_10": 173.78207092285157, "kl_loss_2": 2082.071905517578, "kl_loss_3": 1614.2910522460938, "kl_loss_7": 576.9482543945312, "learning_rate": 8.996945735790446e-05, "loss": 1146.8303, "step": 8080 }, { "ce_loss_10": 3.50276095867157, "ce_loss_13": 3.428582501411438, "ce_loss_2": 4.457551169395447, "ce_loss_3": 4.193507122993469, "ce_loss_7": 3.672742247581482, "epoch": 0.809, "grad_norm": 608.0, "kl_loss_10": 175.95007400512696, "kl_loss_2": 2152.8104553222656, "kl_loss_3": 1678.2301330566406, "kl_loss_7": 594.3645935058594, "learning_rate": 8.906351425856951e-05, "loss": 1158.1713, "step": 8090 }, { "ce_loss_10": 3.4856011509895324, "ce_loss_13": 3.412043738365173, "ce_loss_2": 4.477128624916077, "ce_loss_3": 4.2102068901062015, "ce_loss_7": 3.663009238243103, "epoch": 0.81, "grad_norm": 588.0, "kl_loss_10": 178.9893539428711, "kl_loss_2": 2217.222705078125, "kl_loss_3": 1734.5539611816407, "kl_loss_7": 606.3814819335937, "learning_rate": 8.816170928508365e-05, "loss": 1174.1137, "step": 8100 }, { "ce_loss_10": 3.456532561779022, "ce_loss_13": 3.380963850021362, "ce_loss_2": 4.470215916633606, "ce_loss_3": 4.203172373771667, "ce_loss_7": 3.637952506542206, "epoch": 0.811, "grad_norm": 564.0, "kl_loss_10": 181.76464233398437, "kl_loss_2": 2271.6646240234377, "kl_loss_3": 1789.1731323242188, "kl_loss_7": 618.0579010009766, "learning_rate": 8.7264051518613e-05, "loss": 1188.6756, "step": 8110 }, { "ce_loss_10": 3.5451604604721068, "ce_loss_13": 3.4716222047805787, "ce_loss_2": 4.482484936714172, "ce_loss_3": 4.218600440025329, "ce_loss_7": 3.7114962100982667, "epoch": 0.812, "grad_norm": 572.0, "kl_loss_10": 174.23039016723632, "kl_loss_2": 2104.2210205078127, "kl_loss_3": 1635.3122253417969, "kl_loss_7": 586.6013153076171, "learning_rate": 8.637054999856148e-05, "loss": 1140.2461, "step": 8120 }, { "ce_loss_10": 3.5334657073020934, "ce_loss_13": 3.4553168177604676, "ce_loss_2": 4.5001609325408936, "ce_loss_3": 4.233760714530945, "ce_loss_7": 3.71214896440506, "epoch": 0.813, "grad_norm": 572.0, "kl_loss_10": 179.94693908691406, "kl_loss_2": 2168.638543701172, "kl_loss_3": 1690.7197631835938, "kl_loss_7": 602.7170349121094, "learning_rate": 8.548121372247918e-05, "loss": 1176.2271, "step": 8130 }, { "ce_loss_10": 3.6031864166259764, "ce_loss_13": 3.5284059882164, "ce_loss_2": 4.540188145637512, "ce_loss_3": 4.276613438129425, "ce_loss_7": 3.7673808455467226, "epoch": 0.814, "grad_norm": 576.0, "kl_loss_10": 175.132576751709, "kl_loss_2": 2116.421954345703, "kl_loss_3": 1652.4540222167968, "kl_loss_7": 584.193586730957, "learning_rate": 8.459605164597267e-05, "loss": 1140.1102, "step": 8140 }, { "ce_loss_10": 3.4851497173309327, "ce_loss_13": 3.4121100902557373, "ce_loss_2": 4.4567595481872555, "ce_loss_3": 4.188820004463196, "ce_loss_7": 3.6609971284866334, "epoch": 0.815, "grad_norm": 516.0, "kl_loss_10": 176.46202392578124, "kl_loss_2": 2164.97646484375, "kl_loss_3": 1690.654833984375, "kl_loss_7": 595.82392578125, "learning_rate": 8.371507268261436e-05, "loss": 1160.1355, "step": 8150 }, { "ce_loss_10": 3.5612674951553345, "ce_loss_13": 3.486202526092529, "ce_loss_2": 4.5333171606063845, "ce_loss_3": 4.264222574234009, "ce_loss_7": 3.7375367999076845, "epoch": 0.816, "grad_norm": 536.0, "kl_loss_10": 178.54783096313477, "kl_loss_2": 2174.8032287597657, "kl_loss_3": 1693.2032836914063, "kl_loss_7": 601.639274597168, "learning_rate": 8.283828570385238e-05, "loss": 1135.0793, "step": 8160 }, { "ce_loss_10": 3.566178250312805, "ce_loss_13": 3.4907922625541685, "ce_loss_2": 4.535942006111145, "ce_loss_3": 4.269787204265595, "ce_loss_7": 3.745415151119232, "epoch": 0.817, "grad_norm": 596.0, "kl_loss_10": 175.95259857177734, "kl_loss_2": 2127.893908691406, "kl_loss_3": 1655.5139526367188, "kl_loss_7": 597.2055206298828, "learning_rate": 8.196569953892202e-05, "loss": 1147.5553, "step": 8170 }, { "ce_loss_10": 3.485050618648529, "ce_loss_13": 3.410007894039154, "ce_loss_2": 4.454114603996277, "ce_loss_3": 4.187376809120178, "ce_loss_7": 3.6601618766784667, "epoch": 0.818, "grad_norm": 640.0, "kl_loss_10": 177.51841354370117, "kl_loss_2": 2140.9811096191406, "kl_loss_3": 1670.5337829589844, "kl_loss_7": 598.4014099121093, "learning_rate": 8.109732297475635e-05, "loss": 1142.2542, "step": 8180 }, { "ce_loss_10": 3.4574038982391357, "ce_loss_13": 3.376466763019562, "ce_loss_2": 4.488081407546997, "ce_loss_3": 4.2140247344970705, "ce_loss_7": 3.65143164396286, "epoch": 0.819, "grad_norm": 588.0, "kl_loss_10": 184.73964157104493, "kl_loss_2": 2257.5185974121096, "kl_loss_3": 1764.1057739257812, "kl_loss_7": 626.5130950927735, "learning_rate": 8.023316475589754e-05, "loss": 1190.8261, "step": 8190 }, { "ce_loss_10": 3.4220961928367615, "ce_loss_13": 3.338273751735687, "ce_loss_2": 4.495982336997986, "ce_loss_3": 4.211030387878418, "ce_loss_7": 3.615649092197418, "epoch": 0.82, "grad_norm": 680.0, "kl_loss_10": 185.78453063964844, "kl_loss_2": 2349.4278076171877, "kl_loss_3": 1849.0640197753905, "kl_loss_7": 637.3896453857421, "learning_rate": 7.937323358440934e-05, "loss": 1214.0248, "step": 8200 }, { "ce_loss_10": 3.541324031352997, "ce_loss_13": 3.4684749960899355, "ce_loss_2": 4.468911576271057, "ce_loss_3": 4.20685533285141, "ce_loss_7": 3.709389495849609, "epoch": 0.821, "grad_norm": 584.0, "kl_loss_10": 174.97513656616212, "kl_loss_2": 2090.5933227539062, "kl_loss_3": 1628.1781616210938, "kl_loss_7": 589.6404022216797, "learning_rate": 7.851753811978923e-05, "loss": 1140.9928, "step": 8210 }, { "ce_loss_10": 3.5598355412483214, "ce_loss_13": 3.4843420505523683, "ce_loss_2": 4.541475534439087, "ce_loss_3": 4.275748348236084, "ce_loss_7": 3.735049307346344, "epoch": 0.822, "grad_norm": 604.0, "kl_loss_10": 177.00316925048827, "kl_loss_2": 2174.6779052734373, "kl_loss_3": 1702.5414611816407, "kl_loss_7": 595.3967559814453, "learning_rate": 7.766608697888095e-05, "loss": 1150.2977, "step": 8220 }, { "ce_loss_10": 3.57365106344223, "ce_loss_13": 3.498138427734375, "ce_loss_2": 4.5424954175949095, "ce_loss_3": 4.2809364080429075, "ce_loss_7": 3.7457818508148195, "epoch": 0.823, "grad_norm": 576.0, "kl_loss_10": 180.9578956604004, "kl_loss_2": 2174.856481933594, "kl_loss_3": 1712.8867797851562, "kl_loss_7": 606.9090911865235, "learning_rate": 7.681888873578785e-05, "loss": 1172.8941, "step": 8230 }, { "ce_loss_10": 3.5023999333381655, "ce_loss_13": 3.423751747608185, "ce_loss_2": 4.507507848739624, "ce_loss_3": 4.228441286087036, "ce_loss_7": 3.6896154999732973, "epoch": 0.824, "grad_norm": 556.0, "kl_loss_10": 182.18136978149414, "kl_loss_2": 2222.666143798828, "kl_loss_3": 1725.1198669433593, "kl_loss_7": 612.9842071533203, "learning_rate": 7.597595192178702e-05, "loss": 1157.2363, "step": 8240 }, { "ce_loss_10": 3.501276743412018, "ce_loss_13": 3.422858786582947, "ce_loss_2": 4.514269304275513, "ce_loss_3": 4.2400298595428465, "ce_loss_7": 3.6824575424194337, "epoch": 0.825, "grad_norm": 588.0, "kl_loss_10": 181.8477668762207, "kl_loss_2": 2277.1808898925783, "kl_loss_3": 1781.4539184570312, "kl_loss_7": 622.9125793457031, "learning_rate": 7.513728502524286e-05, "loss": 1187.7779, "step": 8250 }, { "ce_loss_10": 3.5026116013526916, "ce_loss_13": 3.428036153316498, "ce_loss_2": 4.455591607093811, "ce_loss_3": 4.192325818538666, "ce_loss_7": 3.6698171854019166, "epoch": 0.826, "grad_norm": 540.0, "kl_loss_10": 170.86422119140624, "kl_loss_2": 2124.475665283203, "kl_loss_3": 1660.3611083984374, "kl_loss_7": 580.019741821289, "learning_rate": 7.430289649152156e-05, "loss": 1161.5576, "step": 8260 }, { "ce_loss_10": 3.404016637802124, "ce_loss_13": 3.3291639566421507, "ce_loss_2": 4.41770989894867, "ce_loss_3": 4.153000998497009, "ce_loss_7": 3.591460871696472, "epoch": 0.827, "grad_norm": 564.0, "kl_loss_10": 179.06679763793946, "kl_loss_2": 2271.2553527832033, "kl_loss_3": 1785.926287841797, "kl_loss_7": 620.811245727539, "learning_rate": 7.347279472290646e-05, "loss": 1175.1479, "step": 8270 }, { "ce_loss_10": 3.5473936796188354, "ce_loss_13": 3.4728646278381348, "ce_loss_2": 4.529109454154968, "ce_loss_3": 4.2634922623634335, "ce_loss_7": 3.7243195176124573, "epoch": 0.828, "grad_norm": 516.0, "kl_loss_10": 176.9839729309082, "kl_loss_2": 2192.3595458984373, "kl_loss_3": 1719.4920288085937, "kl_loss_7": 599.4450622558594, "learning_rate": 7.264698807851328e-05, "loss": 1170.4515, "step": 8280 }, { "ce_loss_10": 3.511405515670776, "ce_loss_13": 3.4420324087142946, "ce_loss_2": 4.462396240234375, "ce_loss_3": 4.196757709980011, "ce_loss_7": 3.678698420524597, "epoch": 0.829, "grad_norm": 520.0, "kl_loss_10": 173.053133392334, "kl_loss_2": 2122.6127746582033, "kl_loss_3": 1647.0698120117188, "kl_loss_7": 586.3415740966797, "learning_rate": 7.182548487420554e-05, "loss": 1152.9492, "step": 8290 }, { "ce_loss_10": 3.56430242061615, "ce_loss_13": 3.4877224922180177, "ce_loss_2": 4.515660381317138, "ce_loss_3": 4.256710803508758, "ce_loss_7": 3.7377355813980104, "epoch": 0.83, "grad_norm": 552.0, "kl_loss_10": 178.47344284057618, "kl_loss_2": 2141.9302307128905, "kl_loss_3": 1673.7640502929687, "kl_loss_7": 594.9386505126953, "learning_rate": 7.100829338251146e-05, "loss": 1142.0348, "step": 8300 }, { "ce_loss_10": 3.500733995437622, "ce_loss_13": 3.420394682884216, "ce_loss_2": 4.495737314224243, "ce_loss_3": 4.226100885868073, "ce_loss_7": 3.6811896324157716, "epoch": 0.831, "grad_norm": 664.0, "kl_loss_10": 181.1860824584961, "kl_loss_2": 2206.595349121094, "kl_loss_3": 1725.6842834472657, "kl_loss_7": 613.2857147216797, "learning_rate": 7.019542183254046e-05, "loss": 1155.5816, "step": 8310 }, { "ce_loss_10": 3.542024350166321, "ce_loss_13": 3.46200088262558, "ce_loss_2": 4.494865345954895, "ce_loss_3": 4.2283999681472775, "ce_loss_7": 3.7202147483825683, "epoch": 0.832, "grad_norm": 700.0, "kl_loss_10": 184.21017608642578, "kl_loss_2": 2155.698864746094, "kl_loss_3": 1680.3289855957032, "kl_loss_7": 609.4942947387696, "learning_rate": 6.938687840989971e-05, "loss": 1152.3119, "step": 8320 }, { "ce_loss_10": 3.475117301940918, "ce_loss_13": 3.396902585029602, "ce_loss_2": 4.447841107845306, "ce_loss_3": 4.1811567902565, "ce_loss_7": 3.65566908121109, "epoch": 0.833, "grad_norm": 644.0, "kl_loss_10": 180.3916358947754, "kl_loss_2": 2151.0271850585937, "kl_loss_3": 1678.1678161621094, "kl_loss_7": 600.837434387207, "learning_rate": 6.858267125661271e-05, "loss": 1171.0359, "step": 8330 }, { "ce_loss_10": 3.5341761112213135, "ce_loss_13": 3.4623092293739317, "ce_loss_2": 4.504952430725098, "ce_loss_3": 4.235734903812409, "ce_loss_7": 3.7161438941955565, "epoch": 0.834, "grad_norm": 652.0, "kl_loss_10": 176.11358489990235, "kl_loss_2": 2152.3007080078123, "kl_loss_3": 1674.9229736328125, "kl_loss_7": 599.1006164550781, "learning_rate": 6.778280847103668e-05, "loss": 1187.2795, "step": 8340 }, { "ce_loss_10": 3.5474065065383913, "ce_loss_13": 3.4685073494911194, "ce_loss_2": 4.511071228981018, "ce_loss_3": 4.243249070644379, "ce_loss_7": 3.7250569343566893, "epoch": 0.835, "grad_norm": 544.0, "kl_loss_10": 179.0617935180664, "kl_loss_2": 2161.984759521484, "kl_loss_3": 1690.6224731445313, "kl_loss_7": 606.1362609863281, "learning_rate": 6.698729810778065e-05, "loss": 1153.2188, "step": 8350 }, { "ce_loss_10": 3.4568483591079713, "ce_loss_13": 3.3825206756591797, "ce_loss_2": 4.450219774246216, "ce_loss_3": 4.178645396232605, "ce_loss_7": 3.6368404626846313, "epoch": 0.836, "grad_norm": 628.0, "kl_loss_10": 176.9057861328125, "kl_loss_2": 2207.3519287109375, "kl_loss_3": 1716.1288146972656, "kl_loss_7": 600.357534790039, "learning_rate": 6.619614817762538e-05, "loss": 1175.9664, "step": 8360 }, { "ce_loss_10": 3.421834397315979, "ce_loss_13": 3.3451184391975404, "ce_loss_2": 4.458368134498596, "ce_loss_3": 4.186334764957428, "ce_loss_7": 3.611315131187439, "epoch": 0.837, "grad_norm": 524.0, "kl_loss_10": 179.41786346435546, "kl_loss_2": 2288.001843261719, "kl_loss_3": 1799.511328125, "kl_loss_7": 622.7222686767578, "learning_rate": 6.540936664744196e-05, "loss": 1185.6504, "step": 8370 }, { "ce_loss_10": 3.5705604910850526, "ce_loss_13": 3.495253837108612, "ce_loss_2": 4.550519323348999, "ce_loss_3": 4.287250196933746, "ce_loss_7": 3.7482882261276247, "epoch": 0.838, "grad_norm": 556.0, "kl_loss_10": 178.4987823486328, "kl_loss_2": 2165.8636779785156, "kl_loss_3": 1697.199853515625, "kl_loss_7": 600.5514739990234, "learning_rate": 6.462696144011149e-05, "loss": 1144.948, "step": 8380 }, { "ce_loss_10": 3.521126616001129, "ce_loss_13": 3.44657279253006, "ce_loss_2": 4.47376012802124, "ce_loss_3": 4.215767812728882, "ce_loss_7": 3.700752067565918, "epoch": 0.839, "grad_norm": 552.0, "kl_loss_10": 181.0975456237793, "kl_loss_2": 2145.9520629882813, "kl_loss_3": 1683.1618103027345, "kl_loss_7": 606.7530914306641, "learning_rate": 6.384894043444567e-05, "loss": 1140.7508, "step": 8390 }, { "ce_loss_10": 3.5482063770294188, "ce_loss_13": 3.4719661116600036, "ce_loss_2": 4.523540115356445, "ce_loss_3": 4.260622024536133, "ce_loss_7": 3.7237455368041994, "epoch": 0.84, "grad_norm": 540.0, "kl_loss_10": 178.300057220459, "kl_loss_2": 2181.8050231933594, "kl_loss_3": 1707.3205505371093, "kl_loss_7": 602.2170059204102, "learning_rate": 6.307531146510753e-05, "loss": 1150.0869, "step": 8400 }, { "ce_loss_10": 3.526041495800018, "ce_loss_13": 3.4509783387184143, "ce_loss_2": 4.471963119506836, "ce_loss_3": 4.206485414505005, "ce_loss_7": 3.701741063594818, "epoch": 0.841, "grad_norm": 560.0, "kl_loss_10": 177.0880439758301, "kl_loss_2": 2118.260693359375, "kl_loss_3": 1641.845037841797, "kl_loss_7": 595.3580291748046, "learning_rate": 6.230608232253226e-05, "loss": 1129.8508, "step": 8410 }, { "ce_loss_10": 3.4824550271034242, "ce_loss_13": 3.405357301235199, "ce_loss_2": 4.482615494728089, "ce_loss_3": 4.2257519364356995, "ce_loss_7": 3.6680721044540405, "epoch": 0.842, "grad_norm": 512.0, "kl_loss_10": 179.48506774902344, "kl_loss_2": 2227.8513061523436, "kl_loss_3": 1761.4332275390625, "kl_loss_7": 616.7242858886718, "learning_rate": 6.154126075284855e-05, "loss": 1155.2555, "step": 8420 }, { "ce_loss_10": 3.577245807647705, "ce_loss_13": 3.5012118101119993, "ce_loss_2": 4.512918734550476, "ce_loss_3": 4.249192714691162, "ce_loss_7": 3.7460012435913086, "epoch": 0.843, "grad_norm": 704.0, "kl_loss_10": 174.01815643310547, "kl_loss_2": 2066.7827331542967, "kl_loss_3": 1608.2534240722657, "kl_loss_7": 586.0584747314454, "learning_rate": 6.078085445780129e-05, "loss": 1117.5314, "step": 8430 }, { "ce_loss_10": 3.584468650817871, "ce_loss_13": 3.5081024169921875, "ce_loss_2": 4.569616174697876, "ce_loss_3": 4.300150573253632, "ce_loss_7": 3.7608886480331423, "epoch": 0.844, "grad_norm": 576.0, "kl_loss_10": 177.62249679565429, "kl_loss_2": 2185.7296508789063, "kl_loss_3": 1710.7079772949219, "kl_loss_7": 599.2171966552735, "learning_rate": 6.002487109467347e-05, "loss": 1141.6974, "step": 8440 }, { "ce_loss_10": 3.587876856327057, "ce_loss_13": 3.5108195781707763, "ce_loss_2": 4.524105596542358, "ce_loss_3": 4.2644576787948605, "ce_loss_7": 3.756032574176788, "epoch": 0.845, "grad_norm": 592.0, "kl_loss_10": 181.40281448364257, "kl_loss_2": 2131.273962402344, "kl_loss_3": 1667.7534301757812, "kl_loss_7": 605.4166229248046, "learning_rate": 5.927331827620902e-05, "loss": 1141.2443, "step": 8450 }, { "ce_loss_10": 3.573608911037445, "ce_loss_13": 3.499223828315735, "ce_loss_2": 4.488192296028137, "ce_loss_3": 4.230596256256104, "ce_loss_7": 3.7483445525169374, "epoch": 0.846, "grad_norm": 552.0, "kl_loss_10": 175.3866973876953, "kl_loss_2": 2041.3302062988282, "kl_loss_3": 1588.8981994628907, "kl_loss_7": 591.4093719482422, "learning_rate": 5.852620357053651e-05, "loss": 1132.2791, "step": 8460 }, { "ce_loss_10": 3.6111098527908325, "ce_loss_13": 3.536815571784973, "ce_loss_2": 4.544794130325317, "ce_loss_3": 4.2818133473396305, "ce_loss_7": 3.780075490474701, "epoch": 0.847, "grad_norm": 596.0, "kl_loss_10": 174.2255989074707, "kl_loss_2": 2095.0241271972654, "kl_loss_3": 1629.0387634277345, "kl_loss_7": 588.6748046875, "learning_rate": 5.778353450109286e-05, "loss": 1140.0846, "step": 8470 }, { "ce_loss_10": 3.648575019836426, "ce_loss_13": 3.5720423340797423, "ce_loss_2": 4.605719590187073, "ce_loss_3": 4.344667458534241, "ce_loss_7": 3.8233685731887816, "epoch": 0.848, "grad_norm": 486.0, "kl_loss_10": 179.81479415893554, "kl_loss_2": 2138.249481201172, "kl_loss_3": 1668.5974670410155, "kl_loss_7": 599.209912109375, "learning_rate": 5.7045318546547206e-05, "loss": 1146.8947, "step": 8480 }, { "ce_loss_10": 3.5448459148406983, "ce_loss_13": 3.468916821479797, "ce_loss_2": 4.523812007904053, "ce_loss_3": 4.25856339931488, "ce_loss_7": 3.7180402636528016, "epoch": 0.849, "grad_norm": 556.0, "kl_loss_10": 176.35201721191407, "kl_loss_2": 2188.633197021484, "kl_loss_3": 1710.0829528808595, "kl_loss_7": 595.4919921875, "learning_rate": 5.631156314072605e-05, "loss": 1145.8699, "step": 8490 }, { "ce_loss_10": 3.559221601486206, "ce_loss_13": 3.4834945678710936, "ce_loss_2": 4.495839285850525, "ce_loss_3": 4.22744711637497, "ce_loss_7": 3.7315674662590026, "epoch": 0.85, "grad_norm": 536.0, "kl_loss_10": 176.7962844848633, "kl_loss_2": 2110.8933044433593, "kl_loss_3": 1632.140673828125, "kl_loss_7": 588.6574279785157, "learning_rate": 5.5582275672538315e-05, "loss": 1128.7181, "step": 8500 }, { "ce_loss_10": 3.4811159491539003, "ce_loss_13": 3.403310573101044, "ce_loss_2": 4.505023097991943, "ce_loss_3": 4.238485896587372, "ce_loss_7": 3.6721346259117125, "epoch": 0.851, "grad_norm": 608.0, "kl_loss_10": 182.8627899169922, "kl_loss_2": 2282.4833068847656, "kl_loss_3": 1798.2781005859374, "kl_loss_7": 625.0588409423829, "learning_rate": 5.4857463485900484e-05, "loss": 1192.5725, "step": 8510 }, { "ce_loss_10": 3.5297972202301025, "ce_loss_13": 3.454351043701172, "ce_loss_2": 4.489051342010498, "ce_loss_3": 4.219796097278595, "ce_loss_7": 3.7097468852996824, "epoch": 0.852, "grad_norm": 592.0, "kl_loss_10": 178.4038864135742, "kl_loss_2": 2146.418908691406, "kl_loss_3": 1667.9574584960938, "kl_loss_7": 602.1778778076172, "learning_rate": 5.413713387966329e-05, "loss": 1150.9164, "step": 8520 }, { "ce_loss_10": 3.558277463912964, "ce_loss_13": 3.480745458602905, "ce_loss_2": 4.530928635597229, "ce_loss_3": 4.266195034980774, "ce_loss_7": 3.7276942253112795, "epoch": 0.853, "grad_norm": 620.0, "kl_loss_10": 178.86384658813478, "kl_loss_2": 2174.0791748046877, "kl_loss_3": 1703.99384765625, "kl_loss_7": 598.962451171875, "learning_rate": 5.34212941075381e-05, "loss": 1160.2438, "step": 8530 }, { "ce_loss_10": 3.559523808956146, "ce_loss_13": 3.4899546623229982, "ce_loss_2": 4.511786758899689, "ce_loss_3": 4.244810569286346, "ce_loss_7": 3.7282875180244446, "epoch": 0.854, "grad_norm": 544.0, "kl_loss_10": 173.61905364990236, "kl_loss_2": 2125.2814514160154, "kl_loss_3": 1651.0782043457032, "kl_loss_7": 580.7644989013672, "learning_rate": 5.270995137802315e-05, "loss": 1139.1208, "step": 8540 }, { "ce_loss_10": 3.4913312673568724, "ce_loss_13": 3.4199952483177185, "ce_loss_2": 4.46144163608551, "ce_loss_3": 4.1969265818595884, "ce_loss_7": 3.6703786849975586, "epoch": 0.855, "grad_norm": 596.0, "kl_loss_10": 176.40887756347655, "kl_loss_2": 2170.294287109375, "kl_loss_3": 1693.5492065429687, "kl_loss_7": 604.686279296875, "learning_rate": 5.2003112854332125e-05, "loss": 1161.1432, "step": 8550 }, { "ce_loss_10": 3.495318293571472, "ce_loss_13": 3.421377086639404, "ce_loss_2": 4.460399007797241, "ce_loss_3": 4.191486561298371, "ce_loss_7": 3.666605508327484, "epoch": 0.856, "grad_norm": 624.0, "kl_loss_10": 174.107186126709, "kl_loss_2": 2159.1057250976564, "kl_loss_3": 1686.5014953613281, "kl_loss_7": 595.0416564941406, "learning_rate": 5.130078565432089e-05, "loss": 1138.6503, "step": 8560 }, { "ce_loss_10": 3.5646494030952454, "ce_loss_13": 3.4924421072006226, "ce_loss_2": 4.498122811317444, "ce_loss_3": 4.236236476898194, "ce_loss_7": 3.731403958797455, "epoch": 0.857, "grad_norm": 548.0, "kl_loss_10": 173.7933578491211, "kl_loss_2": 2107.9238403320314, "kl_loss_3": 1647.1588623046875, "kl_loss_7": 586.2743255615235, "learning_rate": 5.060297685041659e-05, "loss": 1120.3278, "step": 8570 }, { "ce_loss_10": 3.498642110824585, "ce_loss_13": 3.423696291446686, "ce_loss_2": 4.494770348072052, "ce_loss_3": 4.218523621559143, "ce_loss_7": 3.6762722969055175, "epoch": 0.858, "grad_norm": 548.0, "kl_loss_10": 180.7342544555664, "kl_loss_2": 2221.7594360351563, "kl_loss_3": 1733.2135437011718, "kl_loss_7": 609.6224884033203, "learning_rate": 4.99096934695461e-05, "loss": 1183.0167, "step": 8580 }, { "ce_loss_10": 3.55733345746994, "ce_loss_13": 3.4829642295837404, "ce_loss_2": 4.523844695091247, "ce_loss_3": 4.2578066945075985, "ce_loss_7": 3.7367467999458315, "epoch": 0.859, "grad_norm": 544.0, "kl_loss_10": 175.85337829589844, "kl_loss_2": 2157.883190917969, "kl_loss_3": 1681.7412048339843, "kl_loss_7": 598.1185791015625, "learning_rate": 4.922094249306558e-05, "loss": 1131.2188, "step": 8590 }, { "ce_loss_10": 3.5841567873954774, "ce_loss_13": 3.509797990322113, "ce_loss_2": 4.5469663619995115, "ce_loss_3": 4.2871175646781925, "ce_loss_7": 3.7628376722335815, "epoch": 0.86, "grad_norm": 604.0, "kl_loss_10": 179.83917465209962, "kl_loss_2": 2154.5059204101562, "kl_loss_3": 1690.2081298828125, "kl_loss_7": 604.0245941162109, "learning_rate": 4.853673085668947e-05, "loss": 1135.9622, "step": 8600 }, { "ce_loss_10": 3.60320885181427, "ce_loss_13": 3.529503357410431, "ce_loss_2": 4.566924571990967, "ce_loss_3": 4.302417039871216, "ce_loss_7": 3.7819976687431334, "epoch": 0.861, "grad_norm": 596.0, "kl_loss_10": 177.36936798095704, "kl_loss_2": 2148.3923461914064, "kl_loss_3": 1680.8319580078125, "kl_loss_7": 597.4716522216797, "learning_rate": 4.78570654504214e-05, "loss": 1156.3883, "step": 8610 }, { "ce_loss_10": 3.5473016500473022, "ce_loss_13": 3.4740379452705383, "ce_loss_2": 4.5167618751525875, "ce_loss_3": 4.248826539516449, "ce_loss_7": 3.7283095955848693, "epoch": 0.862, "grad_norm": 512.0, "kl_loss_10": 175.62952041625977, "kl_loss_2": 2176.8274475097655, "kl_loss_3": 1698.4800903320313, "kl_loss_7": 603.3765747070313, "learning_rate": 4.7181953118484556e-05, "loss": 1157.7057, "step": 8620 }, { "ce_loss_10": 3.5743127822875977, "ce_loss_13": 3.49841423034668, "ce_loss_2": 4.522394800186158, "ce_loss_3": 4.2583330273628235, "ce_loss_7": 3.751324450969696, "epoch": 0.863, "grad_norm": 604.0, "kl_loss_10": 175.00742568969727, "kl_loss_2": 2093.126568603516, "kl_loss_3": 1630.9019409179687, "kl_loss_7": 592.01396484375, "learning_rate": 4.651140065925269e-05, "loss": 1159.3387, "step": 8630 }, { "ce_loss_10": 3.507640373706818, "ce_loss_13": 3.434223484992981, "ce_loss_2": 4.487589573860168, "ce_loss_3": 4.2153548240661625, "ce_loss_7": 3.6856843709945677, "epoch": 0.864, "grad_norm": 588.0, "kl_loss_10": 177.25660781860353, "kl_loss_2": 2189.234814453125, "kl_loss_3": 1705.8089416503906, "kl_loss_7": 594.7530731201172, "learning_rate": 4.58454148251814e-05, "loss": 1175.3236, "step": 8640 }, { "ce_loss_10": 3.5295264959335326, "ce_loss_13": 3.451808476448059, "ce_loss_2": 4.534165596961975, "ce_loss_3": 4.261823272705078, "ce_loss_7": 3.7138017773628236, "epoch": 0.865, "grad_norm": 568.0, "kl_loss_10": 177.66054000854493, "kl_loss_2": 2227.2971801757812, "kl_loss_3": 1739.9827575683594, "kl_loss_7": 610.7921539306641, "learning_rate": 4.518400232274078e-05, "loss": 1162.0056, "step": 8650 }, { "ce_loss_10": 3.5479356169700624, "ce_loss_13": 3.4702929258346558, "ce_loss_2": 4.501330161094666, "ce_loss_3": 4.237502670288086, "ce_loss_7": 3.723937380313873, "epoch": 0.866, "grad_norm": 524.0, "kl_loss_10": 179.25594482421874, "kl_loss_2": 2137.7675231933595, "kl_loss_3": 1671.597607421875, "kl_loss_7": 602.4135375976563, "learning_rate": 4.452716981234745e-05, "loss": 1122.9633, "step": 8660 }, { "ce_loss_10": 3.5203991651535036, "ce_loss_13": 3.447481095790863, "ce_loss_2": 4.47524061203003, "ce_loss_3": 4.205355083942413, "ce_loss_7": 3.695466148853302, "epoch": 0.867, "grad_norm": 568.0, "kl_loss_10": 174.45485000610353, "kl_loss_2": 2135.713586425781, "kl_loss_3": 1657.6833251953126, "kl_loss_7": 594.2672515869141, "learning_rate": 4.3874923908297335e-05, "loss": 1125.4834, "step": 8670 }, { "ce_loss_10": 3.575284111499786, "ce_loss_13": 3.498721444606781, "ce_loss_2": 4.54740161895752, "ce_loss_3": 4.281282663345337, "ce_loss_7": 3.7500776290893554, "epoch": 0.868, "grad_norm": 596.0, "kl_loss_10": 178.63047256469727, "kl_loss_2": 2175.332312011719, "kl_loss_3": 1702.8009948730469, "kl_loss_7": 597.87509765625, "learning_rate": 4.322727117869951e-05, "loss": 1149.3994, "step": 8680 }, { "ce_loss_10": 3.575519359111786, "ce_loss_13": 3.4998192310333254, "ce_loss_2": 4.55748233795166, "ce_loss_3": 4.284105372428894, "ce_loss_7": 3.752281701564789, "epoch": 0.869, "grad_norm": 584.0, "kl_loss_10": 179.47224349975585, "kl_loss_2": 2189.6538024902343, "kl_loss_3": 1698.4593017578125, "kl_loss_7": 604.034390258789, "learning_rate": 4.2584218145409916e-05, "loss": 1151.0721, "step": 8690 }, { "ce_loss_10": 3.6216215252876283, "ce_loss_13": 3.551214134693146, "ce_loss_2": 4.543716049194336, "ce_loss_3": 4.272314977645874, "ce_loss_7": 3.786858594417572, "epoch": 0.87, "grad_norm": 600.0, "kl_loss_10": 174.5644790649414, "kl_loss_2": 2072.967736816406, "kl_loss_3": 1598.7917419433593, "kl_loss_7": 582.653207397461, "learning_rate": 4.194577128396521e-05, "loss": 1108.3934, "step": 8700 }, { "ce_loss_10": 3.498377776145935, "ce_loss_13": 3.425851809978485, "ce_loss_2": 4.466418659687042, "ce_loss_3": 4.194790709018707, "ce_loss_7": 3.670235824584961, "epoch": 0.871, "grad_norm": 506.0, "kl_loss_10": 174.37066497802735, "kl_loss_2": 2168.9560546875, "kl_loss_3": 1689.01513671875, "kl_loss_7": 590.7626220703125, "learning_rate": 4.1311937023518264e-05, "loss": 1166.4488, "step": 8710 }, { "ce_loss_10": 3.5134344696998596, "ce_loss_13": 3.4397946119308473, "ce_loss_2": 4.529711484909058, "ce_loss_3": 4.263941979408264, "ce_loss_7": 3.682065725326538, "epoch": 0.872, "grad_norm": 460.0, "kl_loss_10": 171.77398529052735, "kl_loss_2": 2246.7038696289064, "kl_loss_3": 1774.6010986328124, "kl_loss_7": 576.3054107666015, "learning_rate": 4.0682721746773344e-05, "loss": 1163.771, "step": 8720 }, { "ce_loss_10": 3.3905357241630556, "ce_loss_13": 3.314958465099335, "ce_loss_2": 4.396602368354797, "ce_loss_3": 4.132218360900879, "ce_loss_7": 3.5750641107559202, "epoch": 0.873, "grad_norm": 552.0, "kl_loss_10": 175.90054779052736, "kl_loss_2": 2222.7579711914063, "kl_loss_3": 1749.4481201171875, "kl_loss_7": 613.7805450439453, "learning_rate": 4.0058131789920904e-05, "loss": 1143.7059, "step": 8730 }, { "ce_loss_10": 3.5397099256515503, "ce_loss_13": 3.4643173098564146, "ce_loss_2": 4.497500014305115, "ce_loss_3": 4.226579332351685, "ce_loss_7": 3.709259867668152, "epoch": 0.874, "grad_norm": 572.0, "kl_loss_10": 176.51957778930665, "kl_loss_2": 2162.519940185547, "kl_loss_3": 1680.782940673828, "kl_loss_7": 600.4374298095703, "learning_rate": 3.9438173442575e-05, "loss": 1188.067, "step": 8740 }, { "ce_loss_10": 3.5728036165237427, "ce_loss_13": 3.4973131656646728, "ce_loss_2": 4.514467573165893, "ce_loss_3": 4.250711810588837, "ce_loss_7": 3.740223217010498, "epoch": 0.875, "grad_norm": 524.0, "kl_loss_10": 175.66529846191406, "kl_loss_2": 2112.5622009277345, "kl_loss_3": 1651.0288818359375, "kl_loss_7": 594.9720977783203, "learning_rate": 3.882285294770937e-05, "loss": 1137.6895, "step": 8750 }, { "ce_loss_10": 3.5377378940582274, "ce_loss_13": 3.4619855165481566, "ce_loss_2": 4.4779297590255736, "ce_loss_3": 4.2127908825874325, "ce_loss_7": 3.7088746547698976, "epoch": 0.876, "grad_norm": 600.0, "kl_loss_10": 178.39902954101564, "kl_loss_2": 2127.570068359375, "kl_loss_3": 1648.2625427246094, "kl_loss_7": 594.3859985351562, "learning_rate": 3.821217650159453e-05, "loss": 1155.4234, "step": 8760 }, { "ce_loss_10": 3.4084259629249574, "ce_loss_13": 3.332693111896515, "ce_loss_2": 4.428424310684204, "ce_loss_3": 4.158231461048127, "ce_loss_7": 3.6029205918312073, "epoch": 0.877, "grad_norm": 548.0, "kl_loss_10": 180.0270034790039, "kl_loss_2": 2236.819873046875, "kl_loss_3": 1758.5952392578124, "kl_loss_7": 625.4339080810547, "learning_rate": 3.760615025373543e-05, "loss": 1171.5936, "step": 8770 }, { "ce_loss_10": 3.595931589603424, "ce_loss_13": 3.5179906845092774, "ce_loss_2": 4.587141966819763, "ce_loss_3": 4.309424257278442, "ce_loss_7": 3.7760006308555605, "epoch": 0.878, "grad_norm": 660.0, "kl_loss_10": 183.31888961791992, "kl_loss_2": 2207.746258544922, "kl_loss_3": 1714.543280029297, "kl_loss_7": 607.3442749023437, "learning_rate": 3.700478030680987e-05, "loss": 1181.1754, "step": 8780 }, { "ce_loss_10": 3.5762731194496156, "ce_loss_13": 3.5029969453811645, "ce_loss_2": 4.536650991439819, "ce_loss_3": 4.271095442771911, "ce_loss_7": 3.748454582691193, "epoch": 0.879, "grad_norm": 502.0, "kl_loss_10": 176.4349349975586, "kl_loss_2": 2141.4830688476563, "kl_loss_3": 1675.0247314453125, "kl_loss_7": 590.1913375854492, "learning_rate": 3.6408072716606344e-05, "loss": 1149.3131, "step": 8790 }, { "ce_loss_10": 3.50073447227478, "ce_loss_13": 3.4274021863937376, "ce_loss_2": 4.501238942146301, "ce_loss_3": 4.239216554164886, "ce_loss_7": 3.683646392822266, "epoch": 0.88, "grad_norm": 600.0, "kl_loss_10": 180.50948486328124, "kl_loss_2": 2229.279052734375, "kl_loss_3": 1758.1964477539063, "kl_loss_7": 612.8555847167969, "learning_rate": 3.5816033491963716e-05, "loss": 1204.1847, "step": 8800 }, { "ce_loss_10": 3.3653410911560058, "ce_loss_13": 3.289612293243408, "ce_loss_2": 4.399015557765961, "ce_loss_3": 4.129138934612274, "ce_loss_7": 3.5484530568122863, "epoch": 0.881, "grad_norm": 696.0, "kl_loss_10": 176.9532043457031, "kl_loss_2": 2282.183734130859, "kl_loss_3": 1791.626202392578, "kl_loss_7": 607.9001495361329, "learning_rate": 3.522866859471047e-05, "loss": 1184.3774, "step": 8810 }, { "ce_loss_10": 3.597711908817291, "ce_loss_13": 3.5282416582107543, "ce_loss_2": 4.506113409996033, "ce_loss_3": 4.251258683204651, "ce_loss_7": 3.7641051173210145, "epoch": 0.882, "grad_norm": 636.0, "kl_loss_10": 169.83995971679687, "kl_loss_2": 2038.54423828125, "kl_loss_3": 1585.0056030273438, "kl_loss_7": 568.1876190185546, "learning_rate": 3.46459839396045e-05, "loss": 1125.9656, "step": 8820 }, { "ce_loss_10": 3.529653000831604, "ce_loss_13": 3.449263334274292, "ce_loss_2": 4.503918576240539, "ce_loss_3": 4.2343867182731625, "ce_loss_7": 3.712590277194977, "epoch": 0.883, "grad_norm": 576.0, "kl_loss_10": 178.9514488220215, "kl_loss_2": 2152.299346923828, "kl_loss_3": 1671.0650329589844, "kl_loss_7": 603.799105834961, "learning_rate": 3.406798539427386e-05, "loss": 1176.0018, "step": 8830 }, { "ce_loss_10": 3.5842846632003784, "ce_loss_13": 3.510246682167053, "ce_loss_2": 4.541045117378235, "ce_loss_3": 4.27923276424408, "ce_loss_7": 3.7581582188606264, "epoch": 0.884, "grad_norm": 576.0, "kl_loss_10": 176.05337142944336, "kl_loss_2": 2155.0933471679687, "kl_loss_3": 1681.9300415039063, "kl_loss_7": 595.8100616455079, "learning_rate": 3.349467877915746e-05, "loss": 1155.9855, "step": 8840 }, { "ce_loss_10": 3.5404534935951233, "ce_loss_13": 3.4635657548904417, "ce_loss_2": 4.524973630905151, "ce_loss_3": 4.259057784080506, "ce_loss_7": 3.7238620042800905, "epoch": 0.885, "grad_norm": 604.0, "kl_loss_10": 178.34489822387695, "kl_loss_2": 2212.789306640625, "kl_loss_3": 1740.2167602539062, "kl_loss_7": 609.8538391113282, "learning_rate": 3.292606986744667e-05, "loss": 1199.5514, "step": 8850 }, { "ce_loss_10": 3.4956598401069643, "ce_loss_13": 3.4253405332565308, "ce_loss_2": 4.470083999633789, "ce_loss_3": 4.206580317020416, "ce_loss_7": 3.6703789830207825, "epoch": 0.886, "grad_norm": 580.0, "kl_loss_10": 174.30244827270508, "kl_loss_2": 2159.222479248047, "kl_loss_3": 1693.7933227539063, "kl_loss_7": 599.4409484863281, "learning_rate": 3.23621643850267e-05, "loss": 1154.7352, "step": 8860 }, { "ce_loss_10": 3.5727248191833496, "ce_loss_13": 3.496046614646912, "ce_loss_2": 4.52955162525177, "ce_loss_3": 4.25726010799408, "ce_loss_7": 3.7465644001960756, "epoch": 0.887, "grad_norm": 552.0, "kl_loss_10": 179.55787811279296, "kl_loss_2": 2159.213214111328, "kl_loss_3": 1673.28759765625, "kl_loss_7": 605.2644119262695, "learning_rate": 3.180296801041971e-05, "loss": 1139.4904, "step": 8870 }, { "ce_loss_10": 3.595055866241455, "ce_loss_13": 3.5219205260276794, "ce_loss_2": 4.565229892730713, "ce_loss_3": 4.302072286605835, "ce_loss_7": 3.7657612919807435, "epoch": 0.888, "grad_norm": 488.0, "kl_loss_10": 174.51052551269532, "kl_loss_2": 2168.450274658203, "kl_loss_3": 1696.4366394042968, "kl_loss_7": 592.0915069580078, "learning_rate": 3.124848637472688e-05, "loss": 1132.4514, "step": 8880 }, { "ce_loss_10": 3.420267331600189, "ce_loss_13": 3.346735382080078, "ce_loss_2": 4.411080622673035, "ce_loss_3": 4.143524849414826, "ce_loss_7": 3.6017141342163086, "epoch": 0.889, "grad_norm": 600.0, "kl_loss_10": 174.23086242675782, "kl_loss_2": 2199.3413024902343, "kl_loss_3": 1717.92958984375, "kl_loss_7": 596.7423614501953, "learning_rate": 3.069872506157212e-05, "loss": 1155.8682, "step": 8890 }, { "ce_loss_10": 3.5183400988578795, "ce_loss_13": 3.4446550846099853, "ce_loss_2": 4.4763764381408695, "ce_loss_3": 4.213694953918457, "ce_loss_7": 3.693737256526947, "epoch": 0.89, "grad_norm": 544.0, "kl_loss_10": 176.23825073242188, "kl_loss_2": 2152.81796875, "kl_loss_3": 1686.6082885742187, "kl_loss_7": 599.2687866210938, "learning_rate": 3.0153689607045842e-05, "loss": 1144.8437, "step": 8900 }, { "ce_loss_10": 3.4148733854293822, "ce_loss_13": 3.3367454648017882, "ce_loss_2": 4.46618926525116, "ce_loss_3": 4.190043389797211, "ce_loss_7": 3.606413686275482, "epoch": 0.891, "grad_norm": 556.0, "kl_loss_10": 181.56116943359376, "kl_loss_2": 2316.6614379882812, "kl_loss_3": 1823.7576782226563, "kl_loss_7": 624.2273193359375, "learning_rate": 2.9613385499648926e-05, "loss": 1174.4811, "step": 8910 }, { "ce_loss_10": 3.472314703464508, "ce_loss_13": 3.3953917384147645, "ce_loss_2": 4.439359056949615, "ce_loss_3": 4.1694392442703245, "ce_loss_7": 3.6529108047485352, "epoch": 0.892, "grad_norm": 632.0, "kl_loss_10": 176.6066520690918, "kl_loss_2": 2142.8492736816406, "kl_loss_3": 1665.7835510253906, "kl_loss_7": 596.6286926269531, "learning_rate": 2.9077818180237692e-05, "loss": 1160.4215, "step": 8920 }, { "ce_loss_10": 3.5216124176979067, "ce_loss_13": 3.444805955886841, "ce_loss_2": 4.504645991325378, "ce_loss_3": 4.23671303987503, "ce_loss_7": 3.703485441207886, "epoch": 0.893, "grad_norm": 584.0, "kl_loss_10": 176.44115447998047, "kl_loss_2": 2174.382470703125, "kl_loss_3": 1696.403564453125, "kl_loss_7": 600.8687164306641, "learning_rate": 2.8546993041969172e-05, "loss": 1152.6479, "step": 8930 }, { "ce_loss_10": 3.5529621839523315, "ce_loss_13": 3.4791373729705812, "ce_loss_2": 4.487787294387817, "ce_loss_3": 4.227098524570465, "ce_loss_7": 3.7245055556297304, "epoch": 0.894, "grad_norm": 506.0, "kl_loss_10": 174.32415542602538, "kl_loss_2": 2110.938995361328, "kl_loss_3": 1637.950372314453, "kl_loss_7": 590.5497222900391, "learning_rate": 2.802091543024671e-05, "loss": 1153.5114, "step": 8940 }, { "ce_loss_10": 3.5515737652778627, "ce_loss_13": 3.4770275354385376, "ce_loss_2": 4.526445126533508, "ce_loss_3": 4.267046928405762, "ce_loss_7": 3.728025937080383, "epoch": 0.895, "grad_norm": 612.0, "kl_loss_10": 177.65689697265626, "kl_loss_2": 2195.3055725097656, "kl_loss_3": 1727.4093139648437, "kl_loss_7": 604.4041717529296, "learning_rate": 2.7499590642665774e-05, "loss": 1190.9908, "step": 8950 }, { "ce_loss_10": 3.5625943899154664, "ce_loss_13": 3.4850521326065063, "ce_loss_2": 4.553565168380738, "ce_loss_3": 4.279750061035156, "ce_loss_7": 3.739047312736511, "epoch": 0.896, "grad_norm": 512.0, "kl_loss_10": 178.24014129638672, "kl_loss_2": 2193.0329833984374, "kl_loss_3": 1709.0227905273437, "kl_loss_7": 602.5221313476562, "learning_rate": 2.6983023928961405e-05, "loss": 1147.626, "step": 8960 }, { "ce_loss_10": 3.532795751094818, "ce_loss_13": 3.4568071961402893, "ce_loss_2": 4.499462056159973, "ce_loss_3": 4.242076885700226, "ce_loss_7": 3.709202516078949, "epoch": 0.897, "grad_norm": 616.0, "kl_loss_10": 177.7622848510742, "kl_loss_2": 2147.4717651367187, "kl_loss_3": 1687.235498046875, "kl_loss_7": 597.6563537597656, "learning_rate": 2.6471220490954628e-05, "loss": 1172.1214, "step": 8970 }, { "ce_loss_10": 3.5174603939056395, "ce_loss_13": 3.4463194727897646, "ce_loss_2": 4.477302503585816, "ce_loss_3": 4.214387357234955, "ce_loss_7": 3.6873306155204775, "epoch": 0.898, "grad_norm": 592.0, "kl_loss_10": 174.26932220458986, "kl_loss_2": 2152.279821777344, "kl_loss_3": 1683.4832763671875, "kl_loss_7": 590.3468292236328, "learning_rate": 2.596418548250029e-05, "loss": 1156.035, "step": 8980 }, { "ce_loss_10": 3.5602595686912535, "ce_loss_13": 3.485771131515503, "ce_loss_2": 4.5211225032806395, "ce_loss_3": 4.257622516155243, "ce_loss_7": 3.7368709683418273, "epoch": 0.899, "grad_norm": 524.0, "kl_loss_10": 179.2437530517578, "kl_loss_2": 2158.524066162109, "kl_loss_3": 1691.2223327636718, "kl_loss_7": 601.7962463378906, "learning_rate": 2.5461924009435368e-05, "loss": 1142.976, "step": 8990 }, { "ce_loss_10": 3.5547463297843933, "ce_loss_13": 3.479547905921936, "ce_loss_2": 4.515283250808716, "ce_loss_3": 4.250737547874451, "ce_loss_7": 3.732912743091583, "epoch": 0.9, "grad_norm": 572.0, "kl_loss_10": 177.4375427246094, "kl_loss_2": 2139.4888916015625, "kl_loss_3": 1664.3239318847657, "kl_loss_7": 599.9839569091797, "learning_rate": 2.4964441129527336e-05, "loss": 1166.3201, "step": 9000 }, { "ce_loss_10": 3.553958511352539, "ce_loss_13": 3.476763606071472, "ce_loss_2": 4.496755647659302, "ce_loss_3": 4.227215158939361, "ce_loss_7": 3.7206253528594972, "epoch": 0.901, "grad_norm": 540.0, "kl_loss_10": 174.2966407775879, "kl_loss_2": 2111.5487548828123, "kl_loss_3": 1639.27841796875, "kl_loss_7": 584.4350708007812, "learning_rate": 2.4471741852423235e-05, "loss": 1125.0274, "step": 9010 }, { "ce_loss_10": 3.600440430641174, "ce_loss_13": 3.522721529006958, "ce_loss_2": 4.542108774185181, "ce_loss_3": 4.282799339294433, "ce_loss_7": 3.7742578268051146, "epoch": 0.902, "grad_norm": 524.0, "kl_loss_10": 175.27555160522462, "kl_loss_2": 2098.4271484375, "kl_loss_3": 1637.176934814453, "kl_loss_7": 587.0385147094727, "learning_rate": 2.3983831139599287e-05, "loss": 1139.7687, "step": 9020 }, { "ce_loss_10": 3.519875633716583, "ce_loss_13": 3.446099603176117, "ce_loss_2": 4.47457070350647, "ce_loss_3": 4.212863862514496, "ce_loss_7": 3.68597651720047, "epoch": 0.903, "grad_norm": 508.0, "kl_loss_10": 174.20441055297852, "kl_loss_2": 2129.7295166015624, "kl_loss_3": 1661.6650817871093, "kl_loss_7": 579.8323806762695, "learning_rate": 2.3500713904311022e-05, "loss": 1116.7298, "step": 9030 }, { "ce_loss_10": 3.5635103940963746, "ce_loss_13": 3.4901776790618895, "ce_loss_2": 4.492858815193176, "ce_loss_3": 4.233377468585968, "ce_loss_7": 3.7266834378242493, "epoch": 0.904, "grad_norm": 568.0, "kl_loss_10": 172.6203300476074, "kl_loss_2": 2067.9869079589844, "kl_loss_3": 1612.4867309570313, "kl_loss_7": 575.5206619262696, "learning_rate": 2.3022395011543685e-05, "loss": 1119.9885, "step": 9040 }, { "ce_loss_10": 3.592438757419586, "ce_loss_13": 3.515104389190674, "ce_loss_2": 4.541802954673767, "ce_loss_3": 4.281132400035858, "ce_loss_7": 3.7722238898277283, "epoch": 0.905, "grad_norm": 572.0, "kl_loss_10": 180.47207794189453, "kl_loss_2": 2144.9797241210936, "kl_loss_3": 1672.9977722167969, "kl_loss_7": 611.6667938232422, "learning_rate": 2.2548879277963063e-05, "loss": 1176.2332, "step": 9050 }, { "ce_loss_10": 3.5052724361419676, "ce_loss_13": 3.43240772485733, "ce_loss_2": 4.459889388084411, "ce_loss_3": 4.187293374538422, "ce_loss_7": 3.677665722370148, "epoch": 0.906, "grad_norm": 536.0, "kl_loss_10": 175.61516647338868, "kl_loss_2": 2136.0928955078125, "kl_loss_3": 1652.1388549804688, "kl_loss_7": 587.7709274291992, "learning_rate": 2.208017147186736e-05, "loss": 1112.7982, "step": 9060 }, { "ce_loss_10": 3.5033626675605776, "ce_loss_13": 3.4270050883293153, "ce_loss_2": 4.460723853111267, "ce_loss_3": 4.200226056575775, "ce_loss_7": 3.675853359699249, "epoch": 0.907, "grad_norm": 532.0, "kl_loss_10": 175.80412521362305, "kl_loss_2": 2135.014111328125, "kl_loss_3": 1673.9324096679688, "kl_loss_7": 594.3165740966797, "learning_rate": 2.1616276313139227e-05, "loss": 1130.9125, "step": 9070 }, { "ce_loss_10": 3.540289318561554, "ce_loss_13": 3.4620949029922485, "ce_loss_2": 4.504480719566345, "ce_loss_3": 4.243532609939575, "ce_loss_7": 3.7148184418678283, "epoch": 0.908, "grad_norm": 564.0, "kl_loss_10": 176.60092849731444, "kl_loss_2": 2138.388262939453, "kl_loss_3": 1670.2328552246095, "kl_loss_7": 593.4554962158203, "learning_rate": 2.1157198473197415e-05, "loss": 1155.7779, "step": 9080 }, { "ce_loss_10": 3.608424699306488, "ce_loss_13": 3.5318522691726684, "ce_loss_2": 4.569310665130615, "ce_loss_3": 4.307307338714599, "ce_loss_7": 3.7887983441352846, "epoch": 0.909, "grad_norm": 532.0, "kl_loss_10": 179.34665451049804, "kl_loss_2": 2147.910009765625, "kl_loss_3": 1676.3675476074218, "kl_loss_7": 609.1422882080078, "learning_rate": 2.0702942574950812e-05, "loss": 1150.5193, "step": 9090 }, { "ce_loss_10": 3.531160354614258, "ce_loss_13": 3.4534537196159363, "ce_loss_2": 4.502471184730529, "ce_loss_3": 4.2366371870040895, "ce_loss_7": 3.7111218690872194, "epoch": 0.91, "grad_norm": 576.0, "kl_loss_10": 178.72406845092775, "kl_loss_2": 2166.001983642578, "kl_loss_3": 1694.5049133300781, "kl_loss_7": 603.0742904663086, "learning_rate": 2.025351319275137e-05, "loss": 1154.2008, "step": 9100 }, { "ce_loss_10": 3.657759261131287, "ce_loss_13": 3.5795228123664855, "ce_loss_2": 4.611237382888794, "ce_loss_3": 4.346826362609863, "ce_loss_7": 3.829502213001251, "epoch": 0.911, "grad_norm": 568.0, "kl_loss_10": 182.78317489624024, "kl_loss_2": 2152.8356689453126, "kl_loss_3": 1681.0592834472657, "kl_loss_7": 612.198373413086, "learning_rate": 1.9808914852347816e-05, "loss": 1183.935, "step": 9110 }, { "ce_loss_10": 3.5076343536376955, "ce_loss_13": 3.4301217675209044, "ce_loss_2": 4.475468993186951, "ce_loss_3": 4.200416827201844, "ce_loss_7": 3.690832221508026, "epoch": 0.912, "grad_norm": 520.0, "kl_loss_10": 177.99471740722657, "kl_loss_2": 2138.6880798339844, "kl_loss_3": 1648.8575317382813, "kl_loss_7": 603.4748657226562, "learning_rate": 1.9369152030840554e-05, "loss": 1133.9587, "step": 9120 }, { "ce_loss_10": 3.5838579297065736, "ce_loss_13": 3.5108367919921877, "ce_loss_2": 4.5474550247192385, "ce_loss_3": 4.282457900047302, "ce_loss_7": 3.752550458908081, "epoch": 0.913, "grad_norm": 592.0, "kl_loss_10": 175.99298171997071, "kl_loss_2": 2175.4973083496093, "kl_loss_3": 1708.1368530273437, "kl_loss_7": 595.3381591796875, "learning_rate": 1.893422915663645e-05, "loss": 1154.3063, "step": 9130 }, { "ce_loss_10": 3.4526755094528196, "ce_loss_13": 3.376223611831665, "ce_loss_2": 4.463929057121277, "ce_loss_3": 4.188671815395355, "ce_loss_7": 3.6436040878295897, "epoch": 0.914, "grad_norm": 528.0, "kl_loss_10": 178.96754608154296, "kl_loss_2": 2226.8522216796873, "kl_loss_3": 1741.089337158203, "kl_loss_7": 614.4321655273437, "learning_rate": 1.850415060940386e-05, "loss": 1177.2793, "step": 9140 }, { "ce_loss_10": 3.577260196208954, "ce_loss_13": 3.4996687054634092, "ce_loss_2": 4.506159293651581, "ce_loss_3": 4.247144281864166, "ce_loss_7": 3.7475706696510316, "epoch": 0.915, "grad_norm": 576.0, "kl_loss_10": 176.05650253295897, "kl_loss_2": 2092.3647644042967, "kl_loss_3": 1626.7418640136718, "kl_loss_7": 590.4974914550781, "learning_rate": 1.8078920720028978e-05, "loss": 1136.8029, "step": 9150 }, { "ce_loss_10": 3.5006513595581055, "ce_loss_13": 3.4293729782104494, "ce_loss_2": 4.446831393241882, "ce_loss_3": 4.180594873428345, "ce_loss_7": 3.670674538612366, "epoch": 0.916, "grad_norm": 584.0, "kl_loss_10": 173.59793243408203, "kl_loss_2": 2105.566021728516, "kl_loss_3": 1637.2901428222656, "kl_loss_7": 585.1143585205078, "learning_rate": 1.765854377057219e-05, "loss": 1156.8438, "step": 9160 }, { "ce_loss_10": 3.4831743359565737, "ce_loss_13": 3.410732936859131, "ce_loss_2": 4.439985752105713, "ce_loss_3": 4.173957622051239, "ce_loss_7": 3.652401328086853, "epoch": 0.917, "grad_norm": 552.0, "kl_loss_10": 172.13598022460937, "kl_loss_2": 2136.6128540039062, "kl_loss_3": 1663.3346557617188, "kl_loss_7": 585.5476837158203, "learning_rate": 1.724302399422456e-05, "loss": 1148.3008, "step": 9170 }, { "ce_loss_10": 3.4418306827545164, "ce_loss_13": 3.365187871456146, "ce_loss_2": 4.424201607704163, "ce_loss_3": 4.15188490152359, "ce_loss_7": 3.617412793636322, "epoch": 0.918, "grad_norm": 540.0, "kl_loss_10": 181.21381607055665, "kl_loss_2": 2192.6884948730467, "kl_loss_3": 1711.2826416015625, "kl_loss_7": 608.8610565185547, "learning_rate": 1.683236557526574e-05, "loss": 1171.7086, "step": 9180 }, { "ce_loss_10": 3.5525336861610413, "ce_loss_13": 3.479281461238861, "ce_loss_2": 4.4706899404525755, "ce_loss_3": 4.209463405609131, "ce_loss_7": 3.7199216723442077, "epoch": 0.919, "grad_norm": 552.0, "kl_loss_10": 172.36394195556642, "kl_loss_2": 2051.388214111328, "kl_loss_3": 1592.7936218261718, "kl_loss_7": 577.4165252685547, "learning_rate": 1.6426572649021475e-05, "loss": 1138.484, "step": 9190 }, { "ce_loss_10": 3.5873886704444886, "ce_loss_13": 3.515676808357239, "ce_loss_2": 4.504871940612793, "ce_loss_3": 4.245928645133972, "ce_loss_7": 3.7530444860458374, "epoch": 0.92, "grad_norm": 560.0, "kl_loss_10": 175.5730728149414, "kl_loss_2": 2067.4602783203127, "kl_loss_3": 1611.9698425292968, "kl_loss_7": 583.8149566650391, "learning_rate": 1.6025649301821876e-05, "loss": 1125.4826, "step": 9200 }, { "ce_loss_10": 3.579323208332062, "ce_loss_13": 3.50703387260437, "ce_loss_2": 4.493245768547058, "ce_loss_3": 4.232775616645813, "ce_loss_7": 3.7471976399421694, "epoch": 0.921, "grad_norm": 620.0, "kl_loss_10": 177.73348236083984, "kl_loss_2": 2084.493505859375, "kl_loss_3": 1628.828369140625, "kl_loss_7": 594.5904907226562, "learning_rate": 1.5629599570960716e-05, "loss": 1123.5703, "step": 9210 }, { "ce_loss_10": 3.482688879966736, "ce_loss_13": 3.4090826153755187, "ce_loss_2": 4.466846561431884, "ce_loss_3": 4.195373678207398, "ce_loss_7": 3.658596193790436, "epoch": 0.922, "grad_norm": 588.0, "kl_loss_10": 175.80986251831055, "kl_loss_2": 2196.6754943847654, "kl_loss_3": 1711.6648010253907, "kl_loss_7": 598.8134338378907, "learning_rate": 1.5238427444654367e-05, "loss": 1155.2326, "step": 9220 }, { "ce_loss_10": 3.543232810497284, "ce_loss_13": 3.467606770992279, "ce_loss_2": 4.491106653213501, "ce_loss_3": 4.219079720973968, "ce_loss_7": 3.710583233833313, "epoch": 0.923, "grad_norm": 548.0, "kl_loss_10": 174.20623474121095, "kl_loss_2": 2120.3849365234373, "kl_loss_3": 1639.1658142089843, "kl_loss_7": 584.3361358642578, "learning_rate": 1.4852136862001764e-05, "loss": 1130.8816, "step": 9230 }, { "ce_loss_10": 3.5088143348693848, "ce_loss_13": 3.435594344139099, "ce_loss_2": 4.446617817878723, "ce_loss_3": 4.185671401023865, "ce_loss_7": 3.681156051158905, "epoch": 0.924, "grad_norm": 584.0, "kl_loss_10": 172.72551879882812, "kl_loss_2": 2097.9775573730467, "kl_loss_3": 1637.147705078125, "kl_loss_7": 588.3473190307617, "learning_rate": 1.4470731712944884e-05, "loss": 1146.0963, "step": 9240 }, { "ce_loss_10": 3.5362769246101378, "ce_loss_13": 3.461189365386963, "ce_loss_2": 4.501055717468262, "ce_loss_3": 4.224046432971955, "ce_loss_7": 3.714902651309967, "epoch": 0.925, "grad_norm": 548.0, "kl_loss_10": 178.0212043762207, "kl_loss_2": 2145.880969238281, "kl_loss_3": 1660.7241577148438, "kl_loss_7": 597.9191680908203, "learning_rate": 1.4094215838229174e-05, "loss": 1173.0712, "step": 9250 }, { "ce_loss_10": 3.498918581008911, "ce_loss_13": 3.4251022219657896, "ce_loss_2": 4.4930708646774296, "ce_loss_3": 4.217134141921997, "ce_loss_7": 3.675217390060425, "epoch": 0.926, "grad_norm": 628.0, "kl_loss_10": 177.476513671875, "kl_loss_2": 2207.0631896972654, "kl_loss_3": 1717.4895629882812, "kl_loss_7": 603.3580856323242, "learning_rate": 1.372259302936546e-05, "loss": 1205.574, "step": 9260 }, { "ce_loss_10": 3.6163718700408936, "ce_loss_13": 3.537315881252289, "ce_loss_2": 4.571975326538086, "ce_loss_3": 4.302231848239899, "ce_loss_7": 3.783820962905884, "epoch": 0.927, "grad_norm": 576.0, "kl_loss_10": 181.85763092041014, "kl_loss_2": 2140.0186767578125, "kl_loss_3": 1662.9603210449218, "kl_loss_7": 600.1376983642579, "learning_rate": 1.3355867028591206e-05, "loss": 1136.2721, "step": 9270 }, { "ce_loss_10": 3.514796030521393, "ce_loss_13": 3.440743112564087, "ce_loss_2": 4.449939727783203, "ce_loss_3": 4.1810842752456665, "ce_loss_7": 3.683949875831604, "epoch": 0.928, "grad_norm": 564.0, "kl_loss_10": 175.13030853271485, "kl_loss_2": 2107.975048828125, "kl_loss_3": 1633.0731262207032, "kl_loss_7": 589.7343017578125, "learning_rate": 1.2994041528833267e-05, "loss": 1127.6617, "step": 9280 }, { "ce_loss_10": 3.5171499490737914, "ce_loss_13": 3.440678071975708, "ce_loss_2": 4.470655179023742, "ce_loss_3": 4.200739192962646, "ce_loss_7": 3.690466821193695, "epoch": 0.929, "grad_norm": 584.0, "kl_loss_10": 174.82103958129883, "kl_loss_2": 2150.8757751464846, "kl_loss_3": 1677.1400146484375, "kl_loss_7": 592.8601806640625, "learning_rate": 1.2637120173670358e-05, "loss": 1145.5388, "step": 9290 }, { "ce_loss_10": 3.5360547065734864, "ce_loss_13": 3.459415102005005, "ce_loss_2": 4.509140729904175, "ce_loss_3": 4.243091595172882, "ce_loss_7": 3.717022383213043, "epoch": 0.93, "grad_norm": 616.0, "kl_loss_10": 177.1988067626953, "kl_loss_2": 2160.6812377929687, "kl_loss_3": 1681.2325256347656, "kl_loss_7": 601.0645294189453, "learning_rate": 1.2285106557296478e-05, "loss": 1155.5311, "step": 9300 }, { "ce_loss_10": 3.4133058071136473, "ce_loss_13": 3.340100085735321, "ce_loss_2": 4.45502986907959, "ce_loss_3": 4.176646625995636, "ce_loss_7": 3.593162167072296, "epoch": 0.931, "grad_norm": 684.0, "kl_loss_10": 177.049959564209, "kl_loss_2": 2280.7558776855467, "kl_loss_3": 1784.5312927246093, "kl_loss_7": 606.032177734375, "learning_rate": 1.1938004224484989e-05, "loss": 1177.6771, "step": 9310 }, { "ce_loss_10": 3.6532346606254578, "ce_loss_13": 3.5762033224105836, "ce_loss_2": 4.59397509098053, "ce_loss_3": 4.32731124162674, "ce_loss_7": 3.8223699569702148, "epoch": 0.932, "grad_norm": 552.0, "kl_loss_10": 179.57099151611328, "kl_loss_2": 2131.067413330078, "kl_loss_3": 1656.3818481445312, "kl_loss_7": 601.6865631103516, "learning_rate": 1.1595816670552429e-05, "loss": 1167.2541, "step": 9320 }, { "ce_loss_10": 3.582180309295654, "ce_loss_13": 3.5061400294303895, "ce_loss_2": 4.524780786037445, "ce_loss_3": 4.259365129470825, "ce_loss_7": 3.747998225688934, "epoch": 0.933, "grad_norm": 568.0, "kl_loss_10": 175.25698852539062, "kl_loss_2": 2109.6074279785157, "kl_loss_3": 1638.4892211914062, "kl_loss_7": 583.7247375488281, "learning_rate": 1.1258547341323699e-05, "loss": 1126.3885, "step": 9330 }, { "ce_loss_10": 3.6068438053131104, "ce_loss_13": 3.5321076273918153, "ce_loss_2": 4.546216082572937, "ce_loss_3": 4.2822174549102785, "ce_loss_7": 3.7781530022621155, "epoch": 0.934, "grad_norm": 584.0, "kl_loss_10": 177.80956649780273, "kl_loss_2": 2142.086737060547, "kl_loss_3": 1668.28828125, "kl_loss_7": 595.8422210693359, "learning_rate": 1.0926199633097156e-05, "loss": 1139.4527, "step": 9340 }, { "ce_loss_10": 3.610630822181702, "ce_loss_13": 3.540286922454834, "ce_loss_2": 4.522486686706543, "ce_loss_3": 4.263027024269104, "ce_loss_7": 3.7751463413238526, "epoch": 0.935, "grad_norm": 568.0, "kl_loss_10": 172.0374610900879, "kl_loss_2": 2078.3720947265624, "kl_loss_3": 1611.571875, "kl_loss_7": 582.5087860107421, "learning_rate": 1.0598776892610684e-05, "loss": 1147.9141, "step": 9350 }, { "ce_loss_10": 3.4225520491600037, "ce_loss_13": 3.350025403499603, "ce_loss_2": 4.415711855888366, "ce_loss_3": 4.137125706672668, "ce_loss_7": 3.5997640252113343, "epoch": 0.936, "grad_norm": 552.0, "kl_loss_10": 173.3252960205078, "kl_loss_2": 2195.4003051757813, "kl_loss_3": 1704.4373046875, "kl_loss_7": 593.9061553955078, "learning_rate": 1.0276282417007399e-05, "loss": 1147.3932, "step": 9360 }, { "ce_loss_10": 3.581203269958496, "ce_loss_13": 3.5097854137420654, "ce_loss_2": 4.501006007194519, "ce_loss_3": 4.243209981918335, "ce_loss_7": 3.747337484359741, "epoch": 0.937, "grad_norm": 596.0, "kl_loss_10": 171.74332122802736, "kl_loss_2": 2068.851727294922, "kl_loss_3": 1615.05986328125, "kl_loss_7": 580.8333526611328, "learning_rate": 9.958719453803277e-06, "loss": 1127.7196, "step": 9370 }, { "ce_loss_10": 3.578359854221344, "ce_loss_13": 3.504493975639343, "ce_loss_2": 4.52911410331726, "ce_loss_3": 4.265734839439392, "ce_loss_7": 3.757120943069458, "epoch": 0.938, "grad_norm": 568.0, "kl_loss_10": 176.9942184448242, "kl_loss_2": 2134.3438110351562, "kl_loss_3": 1664.9087463378905, "kl_loss_7": 601.7982177734375, "learning_rate": 9.646091200853802e-06, "loss": 1132.6439, "step": 9380 }, { "ce_loss_10": 3.5366848587989805, "ce_loss_13": 3.4644816398620604, "ce_loss_2": 4.483027625083923, "ce_loss_3": 4.215219330787659, "ce_loss_7": 3.704088735580444, "epoch": 0.939, "grad_norm": 536.0, "kl_loss_10": 172.74244918823243, "kl_loss_2": 2099.378448486328, "kl_loss_3": 1622.1510986328126, "kl_loss_7": 583.5708526611328, "learning_rate": 9.338400806321978e-06, "loss": 1100.1545, "step": 9390 }, { "ce_loss_10": 3.571504032611847, "ce_loss_13": 3.493305134773254, "ce_loss_2": 4.510696125030518, "ce_loss_3": 4.24888288974762, "ce_loss_7": 3.742924678325653, "epoch": 0.94, "grad_norm": 516.0, "kl_loss_10": 177.7894515991211, "kl_loss_2": 2105.6502197265627, "kl_loss_3": 1641.3450012207031, "kl_loss_7": 590.5869018554688, "learning_rate": 9.035651368646646e-06, "loss": 1131.6762, "step": 9400 }, { "ce_loss_10": 3.572381889820099, "ce_loss_13": 3.5001948475837708, "ce_loss_2": 4.5045966625213625, "ce_loss_3": 4.241596531867981, "ce_loss_7": 3.742561626434326, "epoch": 0.941, "grad_norm": 612.0, "kl_loss_10": 173.46388778686523, "kl_loss_2": 2088.4161376953125, "kl_loss_3": 1622.1473693847656, "kl_loss_7": 583.22626953125, "learning_rate": 8.737845936511335e-06, "loss": 1133.2381, "step": 9410 }, { "ce_loss_10": 3.522591459751129, "ce_loss_13": 3.446700024604797, "ce_loss_2": 4.507854294776917, "ce_loss_3": 4.236737239360809, "ce_loss_7": 3.698591649532318, "epoch": 0.942, "grad_norm": 572.0, "kl_loss_10": 178.92413635253905, "kl_loss_2": 2186.715954589844, "kl_loss_3": 1705.135888671875, "kl_loss_7": 600.7923004150391, "learning_rate": 8.444987508813451e-06, "loss": 1149.7434, "step": 9420 }, { "ce_loss_10": 3.475817048549652, "ce_loss_13": 3.3989389657974245, "ce_loss_2": 4.473474383354187, "ce_loss_3": 4.199087584018708, "ce_loss_7": 3.6562391996383665, "epoch": 0.943, "grad_norm": 628.0, "kl_loss_10": 179.55375061035156, "kl_loss_2": 2263.5999450683594, "kl_loss_3": 1768.8511352539062, "kl_loss_7": 615.3503479003906, "learning_rate": 8.157079034633974e-06, "loss": 1178.9379, "step": 9430 }, { "ce_loss_10": 3.473416876792908, "ce_loss_13": 3.40051189661026, "ce_loss_2": 4.446794199943542, "ce_loss_3": 4.178017342090607, "ce_loss_7": 3.6506085276603697, "epoch": 0.944, "grad_norm": 552.0, "kl_loss_10": 174.8175079345703, "kl_loss_2": 2187.4151916503906, "kl_loss_3": 1711.5777709960937, "kl_loss_7": 600.643310546875, "learning_rate": 7.874123413208145e-06, "loss": 1147.171, "step": 9440 }, { "ce_loss_10": 3.445332610607147, "ce_loss_13": 3.369913935661316, "ce_loss_2": 4.434185910224914, "ce_loss_3": 4.161804282665253, "ce_loss_7": 3.6255853891372682, "epoch": 0.945, "grad_norm": 572.0, "kl_loss_10": 175.52419662475586, "kl_loss_2": 2184.3239013671873, "kl_loss_3": 1698.89501953125, "kl_loss_7": 598.6286346435547, "learning_rate": 7.59612349389599e-06, "loss": 1155.226, "step": 9450 }, { "ce_loss_10": 3.534553039073944, "ce_loss_13": 3.462403440475464, "ce_loss_2": 4.465369653701782, "ce_loss_3": 4.198646211624146, "ce_loss_7": 3.708206284046173, "epoch": 0.946, "grad_norm": 580.0, "kl_loss_10": 172.39571685791014, "kl_loss_2": 2074.147509765625, "kl_loss_3": 1600.0551879882812, "kl_loss_7": 581.5289535522461, "learning_rate": 7.323082076153509e-06, "loss": 1126.5964, "step": 9460 }, { "ce_loss_10": 3.576580452919006, "ce_loss_13": 3.503724229335785, "ce_loss_2": 4.510397839546203, "ce_loss_3": 4.244443106651306, "ce_loss_7": 3.7473479986190794, "epoch": 0.947, "grad_norm": 572.0, "kl_loss_10": 179.0567657470703, "kl_loss_2": 2087.8162841796875, "kl_loss_3": 1617.9851928710937, "kl_loss_7": 593.7520263671875, "learning_rate": 7.055001909504755e-06, "loss": 1153.5377, "step": 9470 }, { "ce_loss_10": 3.6078381657600405, "ce_loss_13": 3.5327057957649233, "ce_loss_2": 4.550836896896362, "ce_loss_3": 4.287693047523499, "ce_loss_7": 3.7842918038368225, "epoch": 0.948, "grad_norm": 616.0, "kl_loss_10": 177.3563217163086, "kl_loss_2": 2119.7028747558593, "kl_loss_3": 1649.5453063964844, "kl_loss_7": 593.8896911621093, "learning_rate": 6.791885693514133e-06, "loss": 1138.076, "step": 9480 }, { "ce_loss_10": 3.5228418946266173, "ce_loss_13": 3.4471074819564818, "ce_loss_2": 4.505965852737427, "ce_loss_3": 4.226520001888275, "ce_loss_7": 3.6976125478744506, "epoch": 0.949, "grad_norm": 544.0, "kl_loss_10": 179.4360237121582, "kl_loss_2": 2204.2863159179688, "kl_loss_3": 1709.0836547851563, "kl_loss_7": 603.0492492675781, "learning_rate": 6.533736077758867e-06, "loss": 1164.073, "step": 9490 }, { "ce_loss_10": 3.480616366863251, "ce_loss_13": 3.4073901891708376, "ce_loss_2": 4.4987491250038145, "ce_loss_3": 4.22996586561203, "ce_loss_7": 3.6651357769966126, "epoch": 0.95, "grad_norm": 596.0, "kl_loss_10": 179.7611167907715, "kl_loss_2": 2254.7767639160156, "kl_loss_3": 1772.1949829101563, "kl_loss_7": 613.6569396972657, "learning_rate": 6.2805556618028556e-06, "loss": 1174.4523, "step": 9500 }, { "ce_loss_10": 3.5753507733345034, "ce_loss_13": 3.5015788078308105, "ce_loss_2": 4.508717775344849, "ce_loss_3": 4.236187517642975, "ce_loss_7": 3.735841393470764, "epoch": 0.951, "grad_norm": 600.0, "kl_loss_10": 171.3624740600586, "kl_loss_2": 2070.820721435547, "kl_loss_3": 1594.0807983398438, "kl_loss_7": 569.2584503173828, "learning_rate": 6.032346995169968e-06, "loss": 1091.2504, "step": 9510 }, { "ce_loss_10": 3.5802130341529845, "ce_loss_13": 3.505545949935913, "ce_loss_2": 4.52126247882843, "ce_loss_3": 4.2531510353088375, "ce_loss_7": 3.7508664727211, "epoch": 0.952, "grad_norm": 572.0, "kl_loss_10": 175.58048248291016, "kl_loss_2": 2115.77294921875, "kl_loss_3": 1640.845928955078, "kl_loss_7": 590.4191436767578, "learning_rate": 5.789112577318789e-06, "loss": 1125.406, "step": 9520 }, { "ce_loss_10": 3.5525818467140198, "ce_loss_13": 3.474942719936371, "ce_loss_2": 4.529764556884766, "ce_loss_3": 4.262361979484558, "ce_loss_7": 3.725843298435211, "epoch": 0.953, "grad_norm": 560.0, "kl_loss_10": 178.63978881835936, "kl_loss_2": 2187.347344970703, "kl_loss_3": 1713.5655639648437, "kl_loss_7": 604.028662109375, "learning_rate": 5.550854857617194e-06, "loss": 1138.3246, "step": 9530 }, { "ce_loss_10": 3.5410927653312685, "ce_loss_13": 3.464462494850159, "ce_loss_2": 4.532546710968018, "ce_loss_3": 4.2611222743988035, "ce_loss_7": 3.717101490497589, "epoch": 0.954, "grad_norm": 596.0, "kl_loss_10": 179.71324462890624, "kl_loss_2": 2218.49970703125, "kl_loss_3": 1729.5723754882813, "kl_loss_7": 606.5572113037109, "learning_rate": 5.317576235317756e-06, "loss": 1164.7152, "step": 9540 }, { "ce_loss_10": 3.567497718334198, "ce_loss_13": 3.4959131717681884, "ce_loss_2": 4.49013991355896, "ce_loss_3": 4.226358330249786, "ce_loss_7": 3.7339030742645263, "epoch": 0.955, "grad_norm": 580.0, "kl_loss_10": 171.57061843872071, "kl_loss_2": 2045.03349609375, "kl_loss_3": 1581.6937194824218, "kl_loss_7": 573.2978149414063, "learning_rate": 5.089279059533658e-06, "loss": 1144.5578, "step": 9550 }, { "ce_loss_10": 3.6264307737350463, "ce_loss_13": 3.549366092681885, "ce_loss_2": 4.555849361419678, "ce_loss_3": 4.291936588287354, "ce_loss_7": 3.796077787876129, "epoch": 0.956, "grad_norm": 532.0, "kl_loss_10": 180.97408142089844, "kl_loss_2": 2101.3386962890627, "kl_loss_3": 1636.1171264648438, "kl_loss_7": 603.2738311767578, "learning_rate": 4.865965629214819e-06, "loss": 1128.9252, "step": 9560 }, { "ce_loss_10": 3.5740526914596558, "ce_loss_13": 3.4976862549781798, "ce_loss_2": 4.531564974784851, "ce_loss_3": 4.273219418525696, "ce_loss_7": 3.7471871614456176, "epoch": 0.957, "grad_norm": 496.0, "kl_loss_10": 178.81691131591796, "kl_loss_2": 2162.70166015625, "kl_loss_3": 1696.7497314453126, "kl_loss_7": 603.7053924560547, "learning_rate": 4.6476381931251366e-06, "loss": 1126.9263, "step": 9570 }, { "ce_loss_10": 3.5494153618812563, "ce_loss_13": 3.475415658950806, "ce_loss_2": 4.496997284889221, "ce_loss_3": 4.230398142337799, "ce_loss_7": 3.728412318229675, "epoch": 0.958, "grad_norm": 496.0, "kl_loss_10": 176.03445205688476, "kl_loss_2": 2117.532794189453, "kl_loss_3": 1643.0480895996093, "kl_loss_7": 594.0362182617188, "learning_rate": 4.434298949819449e-06, "loss": 1135.89, "step": 9580 }, { "ce_loss_10": 3.5075241327285767, "ce_loss_13": 3.4301467418670653, "ce_loss_2": 4.514026093482971, "ce_loss_3": 4.236743009090423, "ce_loss_7": 3.6883852958679197, "epoch": 0.959, "grad_norm": 584.0, "kl_loss_10": 182.3026496887207, "kl_loss_2": 2271.594354248047, "kl_loss_3": 1773.125311279297, "kl_loss_7": 624.3575988769531, "learning_rate": 4.2259500476214406e-06, "loss": 1183.4918, "step": 9590 }, { "ce_loss_10": 3.491668391227722, "ce_loss_13": 3.4157654523849486, "ce_loss_2": 4.465724205970764, "ce_loss_3": 4.201405656337738, "ce_loss_7": 3.665540862083435, "epoch": 0.96, "grad_norm": 556.0, "kl_loss_10": 177.03348541259766, "kl_loss_2": 2184.085583496094, "kl_loss_3": 1717.5306762695313, "kl_loss_7": 602.626220703125, "learning_rate": 4.02259358460233e-06, "loss": 1148.7472, "step": 9600 }, { "ce_loss_10": 3.5573193550109865, "ce_loss_13": 3.4829213500022886, "ce_loss_2": 4.506733560562134, "ce_loss_3": 4.238800776004791, "ce_loss_7": 3.7266565203666686, "epoch": 0.961, "grad_norm": 580.0, "kl_loss_10": 176.3166290283203, "kl_loss_2": 2114.0912475585938, "kl_loss_3": 1637.3499328613282, "kl_loss_7": 588.5086486816406, "learning_rate": 3.8242316085594916e-06, "loss": 1126.8451, "step": 9610 }, { "ce_loss_10": 3.447552573680878, "ce_loss_13": 3.369247031211853, "ce_loss_2": 4.473224306106568, "ce_loss_3": 4.197393763065338, "ce_loss_7": 3.6313952803611755, "epoch": 0.962, "grad_norm": 556.0, "kl_loss_10": 180.42641220092773, "kl_loss_2": 2280.7411865234376, "kl_loss_3": 1787.7846984863281, "kl_loss_7": 615.477572631836, "learning_rate": 3.630866116995757e-06, "loss": 1194.5547, "step": 9620 }, { "ce_loss_10": 3.5979113578796387, "ce_loss_13": 3.5257344841957092, "ce_loss_2": 4.537094449996948, "ce_loss_3": 4.265922880172729, "ce_loss_7": 3.7622151970863342, "epoch": 0.963, "grad_norm": 572.0, "kl_loss_10": 174.69513320922852, "kl_loss_2": 2105.5745544433594, "kl_loss_3": 1622.0366455078124, "kl_loss_7": 578.8991943359375, "learning_rate": 3.4424990570994797e-06, "loss": 1156.3669, "step": 9630 }, { "ce_loss_10": 3.585505282878876, "ce_loss_13": 3.5114797711372376, "ce_loss_2": 4.518644833564759, "ce_loss_3": 4.256495106220245, "ce_loss_7": 3.7576936960220335, "epoch": 0.964, "grad_norm": 482.0, "kl_loss_10": 175.59745712280272, "kl_loss_2": 2103.631524658203, "kl_loss_3": 1631.6069641113281, "kl_loss_7": 588.9240661621094, "learning_rate": 3.2591323257248896e-06, "loss": 1134.1978, "step": 9640 }, { "ce_loss_10": 3.437925660610199, "ce_loss_13": 3.3662607192993166, "ce_loss_2": 4.409651112556458, "ce_loss_3": 4.150672721862793, "ce_loss_7": 3.6122069478034975, "epoch": 0.965, "grad_norm": 556.0, "kl_loss_10": 174.7218978881836, "kl_loss_2": 2173.464489746094, "kl_loss_3": 1704.9824951171875, "kl_loss_7": 600.8370666503906, "learning_rate": 3.0807677693729385e-06, "loss": 1163.455, "step": 9650 }, { "ce_loss_10": 3.623323905467987, "ce_loss_13": 3.55154949426651, "ce_loss_2": 4.551669549942017, "ce_loss_3": 4.290165424346924, "ce_loss_7": 3.794328248500824, "epoch": 0.966, "grad_norm": 544.0, "kl_loss_10": 174.09824600219727, "kl_loss_2": 2080.361853027344, "kl_loss_3": 1626.3007873535157, "kl_loss_7": 584.5892837524414, "learning_rate": 2.9074071841727055e-06, "loss": 1115.8137, "step": 9660 }, { "ce_loss_10": 3.548972153663635, "ce_loss_13": 3.4729049801826477, "ce_loss_2": 4.494955968856812, "ce_loss_3": 4.230167889595032, "ce_loss_7": 3.730636739730835, "epoch": 0.967, "grad_norm": 632.0, "kl_loss_10": 177.06267852783202, "kl_loss_2": 2105.5685302734373, "kl_loss_3": 1641.2366760253906, "kl_loss_7": 599.4572601318359, "learning_rate": 2.739052315863355e-06, "loss": 1112.1609, "step": 9670 }, { "ce_loss_10": 3.5363902688026427, "ce_loss_13": 3.4610472440719606, "ce_loss_2": 4.502471828460694, "ce_loss_3": 4.230240440368652, "ce_loss_7": 3.7059998750686645, "epoch": 0.968, "grad_norm": 560.0, "kl_loss_10": 176.56764450073243, "kl_loss_2": 2152.0122924804687, "kl_loss_3": 1676.3801025390626, "kl_loss_7": 591.2061340332032, "learning_rate": 2.5757048597765396e-06, "loss": 1135.4543, "step": 9680 }, { "ce_loss_10": 3.5459084630012514, "ce_loss_13": 3.4721821188926696, "ce_loss_2": 4.505685806274414, "ce_loss_3": 4.235912537574768, "ce_loss_7": 3.722131609916687, "epoch": 0.969, "grad_norm": 560.0, "kl_loss_10": 176.31484680175782, "kl_loss_2": 2142.29345703125, "kl_loss_3": 1672.873828125, "kl_loss_7": 599.0281616210938, "learning_rate": 2.417366460819359e-06, "loss": 1141.189, "step": 9690 }, { "ce_loss_10": 3.5568428516387938, "ce_loss_13": 3.47944039106369, "ce_loss_2": 4.546383309364319, "ce_loss_3": 4.280533790588379, "ce_loss_7": 3.73818119764328, "epoch": 0.97, "grad_norm": 592.0, "kl_loss_10": 181.22289581298827, "kl_loss_2": 2223.5619262695313, "kl_loss_3": 1743.523046875, "kl_loss_7": 618.0255676269531, "learning_rate": 2.2640387134577057e-06, "loss": 1150.9949, "step": 9700 }, { "ce_loss_10": 3.4835644006729125, "ce_loss_13": 3.409128963947296, "ce_loss_2": 4.400413775444031, "ce_loss_3": 4.1408212065696715, "ce_loss_7": 3.6511133790016173, "epoch": 0.971, "grad_norm": 584.0, "kl_loss_10": 169.66612396240234, "kl_loss_2": 2037.37294921875, "kl_loss_3": 1580.9468200683593, "kl_loss_7": 575.112336730957, "learning_rate": 2.115723161700278e-06, "loss": 1111.2564, "step": 9710 }, { "ce_loss_10": 3.462701106071472, "ce_loss_13": 3.383505952358246, "ce_loss_2": 4.462756657600403, "ce_loss_3": 4.1902328610420225, "ce_loss_7": 3.6453136444091796, "epoch": 0.972, "grad_norm": 676.0, "kl_loss_10": 180.0776268005371, "kl_loss_2": 2223.634521484375, "kl_loss_3": 1740.0434143066407, "kl_loss_7": 612.3085083007812, "learning_rate": 1.9724212990830937e-06, "loss": 1170.462, "step": 9720 }, { "ce_loss_10": 3.6076322913169863, "ce_loss_13": 3.532732355594635, "ce_loss_2": 4.577161026000977, "ce_loss_3": 4.311069667339325, "ce_loss_7": 3.7834354996681214, "epoch": 0.973, "grad_norm": 488.0, "kl_loss_10": 178.08698196411132, "kl_loss_2": 2168.2475769042967, "kl_loss_3": 1699.6189208984374, "kl_loss_7": 598.9332580566406, "learning_rate": 1.8341345686543331e-06, "loss": 1146.8779, "step": 9730 }, { "ce_loss_10": 3.5909879326820375, "ce_loss_13": 3.5183821320533752, "ce_loss_2": 4.50426287651062, "ce_loss_3": 4.235173010826111, "ce_loss_7": 3.757961595058441, "epoch": 0.974, "grad_norm": 548.0, "kl_loss_10": 174.61135635375976, "kl_loss_2": 2063.9743225097654, "kl_loss_3": 1591.8313293457031, "kl_loss_7": 585.6029083251954, "learning_rate": 1.7008643629596864e-06, "loss": 1145.0081, "step": 9740 }, { "ce_loss_10": 3.5759197235107423, "ce_loss_13": 3.4986127734184267, "ce_loss_2": 4.5397637486457825, "ce_loss_3": 4.2685352802276615, "ce_loss_7": 3.7446988224983215, "epoch": 0.975, "grad_norm": 552.0, "kl_loss_10": 176.2814811706543, "kl_loss_2": 2161.20859375, "kl_loss_3": 1678.5180541992188, "kl_loss_7": 590.2671813964844, "learning_rate": 1.5726120240288633e-06, "loss": 1164.5706, "step": 9750 }, { "ce_loss_10": 3.4757342100143434, "ce_loss_13": 3.4012367367744445, "ce_loss_2": 4.433041834831238, "ce_loss_3": 4.165659952163696, "ce_loss_7": 3.6462602019309998, "epoch": 0.976, "grad_norm": 572.0, "kl_loss_10": 174.65177154541016, "kl_loss_2": 2138.293341064453, "kl_loss_3": 1655.4461547851563, "kl_loss_7": 589.6313079833984, "learning_rate": 1.4493788433612708e-06, "loss": 1134.1515, "step": 9760 }, { "ce_loss_10": 3.5877037525177, "ce_loss_13": 3.5132053971290587, "ce_loss_2": 4.55386061668396, "ce_loss_3": 4.287087714672088, "ce_loss_7": 3.7638731479644774, "epoch": 0.977, "grad_norm": 536.0, "kl_loss_10": 177.9455436706543, "kl_loss_2": 2173.966436767578, "kl_loss_3": 1692.082745361328, "kl_loss_7": 599.7038208007813, "learning_rate": 1.3311660619138578e-06, "loss": 1161.4269, "step": 9770 }, { "ce_loss_10": 3.584187960624695, "ce_loss_13": 3.510979926586151, "ce_loss_2": 4.489086222648621, "ce_loss_3": 4.228979337215423, "ce_loss_7": 3.748577618598938, "epoch": 0.978, "grad_norm": 516.0, "kl_loss_10": 176.20037689208985, "kl_loss_2": 2033.9857421875, "kl_loss_3": 1575.569403076172, "kl_loss_7": 583.5555999755859, "learning_rate": 1.2179748700879012e-06, "loss": 1135.4594, "step": 9780 }, { "ce_loss_10": 3.516654706001282, "ce_loss_13": 3.442041552066803, "ce_loss_2": 4.460341954231263, "ce_loss_3": 4.201344418525696, "ce_loss_7": 3.6880866169929503, "epoch": 0.979, "grad_norm": 648.0, "kl_loss_10": 175.97493591308594, "kl_loss_2": 2106.5852966308594, "kl_loss_3": 1640.5349182128907, "kl_loss_7": 589.2118927001953, "learning_rate": 1.1098064077174619e-06, "loss": 1139.4391, "step": 9790 }, { "ce_loss_10": 3.548008131980896, "ce_loss_13": 3.470580744743347, "ce_loss_2": 4.531388640403748, "ce_loss_3": 4.256609618663788, "ce_loss_7": 3.7258023023605347, "epoch": 0.98, "grad_norm": 660.0, "kl_loss_10": 175.85005264282228, "kl_loss_2": 2184.833563232422, "kl_loss_3": 1695.749658203125, "kl_loss_7": 597.7893035888671, "learning_rate": 1.006661764057837e-06, "loss": 1144.1424, "step": 9800 }, { "ce_loss_10": 3.5516860127449035, "ce_loss_13": 3.479186308383942, "ce_loss_2": 4.507886123657227, "ce_loss_3": 4.23874124288559, "ce_loss_7": 3.7239818572998047, "epoch": 0.981, "grad_norm": 548.0, "kl_loss_10": 174.7688331604004, "kl_loss_2": 2140.183038330078, "kl_loss_3": 1663.4087707519532, "kl_loss_7": 592.6127227783203, "learning_rate": 9.085419777743465e-07, "loss": 1136.217, "step": 9810 }, { "ce_loss_10": 3.4896764159202576, "ce_loss_13": 3.4188039541244506, "ce_loss_2": 4.450884318351745, "ce_loss_3": 4.184124147891998, "ce_loss_7": 3.6670993685722353, "epoch": 0.982, "grad_norm": 476.0, "kl_loss_10": 171.6952751159668, "kl_loss_2": 2127.2390258789064, "kl_loss_3": 1658.6653991699218, "kl_loss_7": 588.117578125, "learning_rate": 8.15448036932176e-07, "loss": 1121.8644, "step": 9820 }, { "ce_loss_10": 3.542994940280914, "ce_loss_13": 3.471325635910034, "ce_loss_2": 4.491614294052124, "ce_loss_3": 4.226279616355896, "ce_loss_7": 3.716835379600525, "epoch": 0.983, "grad_norm": 580.0, "kl_loss_10": 175.40776138305665, "kl_loss_2": 2138.5871826171874, "kl_loss_3": 1668.4995056152343, "kl_loss_7": 599.0077606201172, "learning_rate": 7.273808789862724e-07, "loss": 1157.4876, "step": 9830 }, { "ce_loss_10": 3.62471262216568, "ce_loss_13": 3.552128314971924, "ce_loss_2": 4.560764002799988, "ce_loss_3": 4.2987874269485475, "ce_loss_7": 3.7973197221755983, "epoch": 0.984, "grad_norm": 536.0, "kl_loss_10": 177.9404067993164, "kl_loss_2": 2121.9407958984375, "kl_loss_3": 1649.4432312011718, "kl_loss_7": 593.6075317382813, "learning_rate": 6.443413907720186e-07, "loss": 1128.3074, "step": 9840 }, { "ce_loss_10": 3.553659164905548, "ce_loss_13": 3.479843807220459, "ce_loss_2": 4.502509045600891, "ce_loss_3": 4.239866006374359, "ce_loss_7": 3.7261658310890198, "epoch": 0.985, "grad_norm": 612.0, "kl_loss_10": 175.90703582763672, "kl_loss_2": 2105.8654174804688, "kl_loss_3": 1643.8707397460937, "kl_loss_7": 589.3217529296875, "learning_rate": 5.663304084960185e-07, "loss": 1125.6893, "step": 9850 }, { "ce_loss_10": 3.4857439756393434, "ce_loss_13": 3.40972044467926, "ce_loss_2": 4.458719778060913, "ce_loss_3": 4.193828642368317, "ce_loss_7": 3.661728310585022, "epoch": 0.986, "grad_norm": 544.0, "kl_loss_10": 175.7668014526367, "kl_loss_2": 2168.083819580078, "kl_loss_3": 1695.883349609375, "kl_loss_7": 599.47685546875, "learning_rate": 4.933487177280482e-07, "loss": 1132.0084, "step": 9860 }, { "ce_loss_10": 3.577410614490509, "ce_loss_13": 3.50371458530426, "ce_loss_2": 4.517120695114135, "ce_loss_3": 4.256355273723602, "ce_loss_7": 3.745650053024292, "epoch": 0.987, "grad_norm": 580.0, "kl_loss_10": 172.69470291137696, "kl_loss_2": 2116.2484741210938, "kl_loss_3": 1646.7192932128905, "kl_loss_7": 586.3196258544922, "learning_rate": 4.2539705339295075e-07, "loss": 1129.2027, "step": 9870 }, { "ce_loss_10": 3.4351974010467528, "ce_loss_13": 3.359704864025116, "ce_loss_2": 4.414662563800812, "ce_loss_3": 4.1485153317451475, "ce_loss_7": 3.614791524410248, "epoch": 0.988, "grad_norm": 624.0, "kl_loss_10": 176.81834564208984, "kl_loss_2": 2189.1237670898436, "kl_loss_3": 1714.1568420410156, "kl_loss_7": 602.8686370849609, "learning_rate": 3.6247609976319816e-07, "loss": 1142.2324, "step": 9880 }, { "ce_loss_10": 3.5325068116188048, "ce_loss_13": 3.4560230016708373, "ce_loss_2": 4.515677762031555, "ce_loss_3": 4.241289448738098, "ce_loss_7": 3.7140289902687074, "epoch": 0.989, "grad_norm": 644.0, "kl_loss_10": 178.62700347900392, "kl_loss_2": 2181.043316650391, "kl_loss_3": 1701.3241455078125, "kl_loss_7": 601.8407318115235, "learning_rate": 3.0458649045211895e-07, "loss": 1177.6322, "step": 9890 }, { "ce_loss_10": 3.505313539505005, "ce_loss_13": 3.4275246262550354, "ce_loss_2": 4.470349764823913, "ce_loss_3": 4.199915885925293, "ce_loss_7": 3.687111556529999, "epoch": 0.99, "grad_norm": 628.0, "kl_loss_10": 179.9844207763672, "kl_loss_2": 2144.779681396484, "kl_loss_3": 1664.5868041992187, "kl_loss_7": 610.2965026855469, "learning_rate": 2.517288084074587e-07, "loss": 1173.5785, "step": 9900 }, { "ce_loss_10": 3.541435408592224, "ce_loss_13": 3.4641653418540956, "ce_loss_2": 4.540918755531311, "ce_loss_3": 4.268487918376922, "ce_loss_7": 3.728367364406586, "epoch": 0.991, "grad_norm": 544.0, "kl_loss_10": 181.58360061645507, "kl_loss_2": 2223.09423828125, "kl_loss_3": 1733.728173828125, "kl_loss_7": 618.8083801269531, "learning_rate": 2.0390358590538505e-07, "loss": 1164.2306, "step": 9910 }, { "ce_loss_10": 3.5465844750404356, "ce_loss_13": 3.4692795395851137, "ce_loss_2": 4.505300617218017, "ce_loss_3": 4.238151812553406, "ce_loss_7": 3.7215544462203978, "epoch": 0.992, "grad_norm": 516.0, "kl_loss_10": 178.79893417358397, "kl_loss_2": 2149.3738037109374, "kl_loss_3": 1683.4728881835938, "kl_loss_7": 602.8562408447266, "learning_rate": 1.61111304545436e-07, "loss": 1139.9141, "step": 9920 }, { "ce_loss_10": 3.5144612431526183, "ce_loss_13": 3.439807415008545, "ce_loss_2": 4.468925881385803, "ce_loss_3": 4.204157900810242, "ce_loss_7": 3.68552029132843, "epoch": 0.993, "grad_norm": 524.0, "kl_loss_10": 174.9011474609375, "kl_loss_2": 2131.701556396484, "kl_loss_3": 1667.3637084960938, "kl_loss_7": 591.8195831298829, "learning_rate": 1.2335239524541298e-07, "loss": 1123.1069, "step": 9930 }, { "ce_loss_10": 3.485284912586212, "ce_loss_13": 3.4107711553573608, "ce_loss_2": 4.4413145065307615, "ce_loss_3": 4.1761764764785765, "ce_loss_7": 3.658044862747192, "epoch": 0.994, "grad_norm": 552.0, "kl_loss_10": 174.74987030029297, "kl_loss_2": 2137.9515625, "kl_loss_3": 1659.8411071777343, "kl_loss_7": 590.9619750976562, "learning_rate": 9.06272382371065e-08, "loss": 1140.1338, "step": 9940 }, { "ce_loss_10": 3.5549147844314577, "ce_loss_13": 3.482628679275513, "ce_loss_2": 4.527088284492493, "ce_loss_3": 4.2653639078140255, "ce_loss_7": 3.7300615668296815, "epoch": 0.995, "grad_norm": 540.0, "kl_loss_10": 177.89019927978515, "kl_loss_2": 2179.192108154297, "kl_loss_3": 1710.0057861328125, "kl_loss_7": 601.8619506835937, "learning_rate": 6.293616306246586e-08, "loss": 1148.1468, "step": 9950 }, { "ce_loss_10": 3.5492191195487974, "ce_loss_13": 3.4784142851829527, "ce_loss_2": 4.47113618850708, "ce_loss_3": 4.207776916027069, "ce_loss_7": 3.7163458704948424, "epoch": 0.996, "grad_norm": 568.0, "kl_loss_10": 171.20833358764648, "kl_loss_2": 2067.156182861328, "kl_loss_3": 1607.7954223632812, "kl_loss_7": 575.8024002075196, "learning_rate": 4.027944857032395e-08, "loss": 1102.1236, "step": 9960 }, { "ce_loss_10": 3.5417333483695983, "ce_loss_13": 3.4737359166145323, "ce_loss_2": 4.454948210716248, "ce_loss_3": 4.189112281799316, "ce_loss_7": 3.7030033111572265, "epoch": 0.997, "grad_norm": 564.0, "kl_loss_10": 169.30588455200194, "kl_loss_2": 2030.570849609375, "kl_loss_3": 1568.8222778320312, "kl_loss_7": 562.5833770751954, "learning_rate": 2.265732291356626e-08, "loss": 1096.3691, "step": 9970 }, { "ce_loss_10": 3.5887541651725767, "ce_loss_13": 3.5155721068382264, "ce_loss_2": 4.518339204788208, "ce_loss_3": 4.2516262292861935, "ce_loss_7": 3.7596161723136903, "epoch": 0.998, "grad_norm": 516.0, "kl_loss_10": 174.8034523010254, "kl_loss_2": 2081.462506103516, "kl_loss_3": 1607.8740112304688, "kl_loss_7": 584.5911361694336, "learning_rate": 1.0069963546743833e-08, "loss": 1138.0035, "step": 9980 }, { "ce_loss_10": 3.567852771282196, "ce_loss_13": 3.4926111340522765, "ce_loss_2": 4.526482367515564, "ce_loss_3": 4.2642577409744264, "ce_loss_7": 3.741330122947693, "epoch": 0.999, "grad_norm": 504.0, "kl_loss_10": 177.2101951599121, "kl_loss_2": 2140.8559020996095, "kl_loss_3": 1666.8830322265626, "kl_loss_7": 597.1717834472656, "learning_rate": 2.517497224463483e-09, "loss": 1140.1191, "step": 9990 }, { "ce_loss_10": 3.5264371991157533, "ce_loss_13": 3.450861382484436, "ce_loss_2": 4.53892297744751, "ce_loss_3": 4.266285753250122, "ce_loss_7": 3.7066094994544985, "epoch": 1.0, "grad_norm": 580.0, "kl_loss_10": 180.3290283203125, "kl_loss_2": 2255.579718017578, "kl_loss_3": 1769.6880432128905, "kl_loss_7": 615.3499603271484, "learning_rate": 0.0, "loss": 1181.1314, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.177819035608023e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }