diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_10": 5.479339599609375, + "ce_loss_13": 3.4827667474746704, + "ce_loss_2": 13.979248523712158, + "ce_loss_3": 13.771953105926514, + "ce_loss_7": 7.430000305175781, + "epoch": 0.0001, + "grad_norm": 81408.0, + "kl_loss_10": 4489.56494140625, + "kl_loss_2": 22049.2119140625, + "kl_loss_3": 21566.693359375, + "kl_loss_7": 7499.004150390625, + "learning_rate": 1e-05, + "loss": 14123.4883, + "step": 1 + }, + { + "ce_loss_10": 5.119714260101318, + "ce_loss_13": 3.53999932607015, + "ce_loss_2": 11.240631209479439, + "ce_loss_3": 10.95585854848226, + "ce_loss_7": 6.906492206785414, + "epoch": 0.001, + "grad_norm": 37888.0, + "kl_loss_10": 3271.7383083767363, + "kl_loss_2": 15601.7939453125, + "kl_loss_3": 14746.902018229166, + "kl_loss_7": 6279.269124348958, + "learning_rate": 0.0001, + "loss": 9990.5972, + "step": 10 + }, + { + "ce_loss_10": 4.4336272239685055, + "ce_loss_13": 3.5471752166748045, + "ce_loss_2": 7.957242059707641, + "ce_loss_3": 7.463982796669006, + "ce_loss_7": 5.84502854347229, + "epoch": 0.002, + "grad_norm": 9216.0, + "kl_loss_10": 1664.693670654297, + "kl_loss_2": 8140.2906494140625, + "kl_loss_3": 7229.052661132812, + "kl_loss_7": 4268.229284667968, + "learning_rate": 0.0002, + "loss": 5408.8828, + "step": 20 + }, + { + "ce_loss_10": 3.938058543205261, + "ce_loss_13": 3.333306384086609, + "ce_loss_2": 6.773653674125671, + "ce_loss_3": 6.433317041397094, + "ce_loss_7": 5.134593844413757, + "epoch": 0.003, + "grad_norm": 3248.0, + "kl_loss_10": 1150.4377380371093, + "kl_loss_2": 6385.490258789063, + "kl_loss_3": 5771.109155273438, + "kl_loss_7": 3370.8108642578127, + "learning_rate": 0.0003, + "loss": 4090.45, + "step": 30 + }, + { + "ce_loss_10": 3.9883425116539, + "ce_loss_13": 3.5081888794898988, + "ce_loss_2": 6.318326234817505, + "ce_loss_3": 6.045081973075867, + "ce_loss_7": 4.969816541671753, + "epoch": 0.004, + "grad_norm": 6240.0, + "kl_loss_10": 955.3555145263672, + "kl_loss_2": 5198.101293945312, + "kl_loss_3": 4741.139855957032, + "kl_loss_7": 2762.6712036132812, + "learning_rate": 0.0004, + "loss": 3452.1297, + "step": 40 + }, + { + "ce_loss_10": 3.94427330493927, + "ce_loss_13": 3.4815958857536318, + "ce_loss_2": 6.088573956489563, + "ce_loss_3": 5.815527606010437, + "ce_loss_7": 4.780157661437988, + "epoch": 0.005, + "grad_norm": 4768.0, + "kl_loss_10": 882.2040557861328, + "kl_loss_2": 4868.76328125, + "kl_loss_3": 4416.1666259765625, + "kl_loss_7": 2467.264501953125, + "learning_rate": 0.0005, + "loss": 3156.5504, + "step": 50 + }, + { + "ce_loss_10": 3.877071762084961, + "ce_loss_13": 3.493886411190033, + "ce_loss_2": 5.86357319355011, + "ce_loss_3": 5.616779232025147, + "ce_loss_7": 4.6486598491668705, + "epoch": 0.006, + "grad_norm": 4768.0, + "kl_loss_10": 781.2706634521485, + "kl_loss_2": 4452.719311523438, + "kl_loss_3": 4034.9894165039063, + "kl_loss_7": 2237.3867736816405, + "learning_rate": 0.0006, + "loss": 2874.2984, + "step": 60 + }, + { + "ce_loss_10": 3.7734105229377746, + "ce_loss_13": 3.408212423324585, + "ce_loss_2": 5.7149782419204715, + "ce_loss_3": 5.489736318588257, + "ce_loss_7": 4.5027463555336, + "epoch": 0.007, + "grad_norm": 2896.0, + "kl_loss_10": 747.7415161132812, + "kl_loss_2": 4370.160473632813, + "kl_loss_3": 3989.598352050781, + "kl_loss_7": 2142.413153076172, + "learning_rate": 0.0007, + "loss": 2776.958, + "step": 70 + }, + { + "ce_loss_10": 3.7659215092658997, + "ce_loss_13": 3.409997522830963, + "ce_loss_2": 5.646710276603699, + "ce_loss_3": 5.404528284072876, + "ce_loss_7": 4.458205795288086, + "epoch": 0.008, + "grad_norm": 2256.0, + "kl_loss_10": 717.0317596435547, + "kl_loss_2": 4256.561865234375, + "kl_loss_3": 3846.635607910156, + "kl_loss_7": 2051.1767517089843, + "learning_rate": 0.0008, + "loss": 2710.7359, + "step": 80 + }, + { + "ce_loss_10": 3.6950827717781065, + "ce_loss_13": 3.3681079149246216, + "ce_loss_2": 5.57774977684021, + "ce_loss_3": 5.358799338340759, + "ce_loss_7": 4.384551775455475, + "epoch": 0.009, + "grad_norm": 2784.0, + "kl_loss_10": 674.085107421875, + "kl_loss_2": 4234.844958496094, + "kl_loss_3": 3849.5977783203125, + "kl_loss_7": 2047.007257080078, + "learning_rate": 0.0009000000000000001, + "loss": 2670.6148, + "step": 90 + }, + { + "ce_loss_10": 3.831665110588074, + "ce_loss_13": 3.493525803089142, + "ce_loss_2": 5.601197052001953, + "ce_loss_3": 5.398815608024597, + "ce_loss_7": 4.526931476593018, + "epoch": 0.01, + "grad_norm": 2752.0, + "kl_loss_10": 672.117709350586, + "kl_loss_2": 4021.284912109375, + "kl_loss_3": 3663.468957519531, + "kl_loss_7": 2026.185284423828, + "learning_rate": 0.001, + "loss": 2598.8504, + "step": 100 + }, + { + "ce_loss_10": 3.7488776206970216, + "ce_loss_13": 3.4458404183387756, + "ce_loss_2": 5.519205498695373, + "ce_loss_3": 5.296326518058777, + "ce_loss_7": 4.427326142787933, + "epoch": 0.011, + "grad_norm": 1728.0, + "kl_loss_10": 619.106655883789, + "kl_loss_2": 3960.856494140625, + "kl_loss_3": 3572.990026855469, + "kl_loss_7": 1934.6764282226563, + "learning_rate": 0.0009999974825027757, + "loss": 2513.0074, + "step": 110 + }, + { + "ce_loss_10": 3.805395770072937, + "ce_loss_13": 3.503445029258728, + "ce_loss_2": 5.498133254051209, + "ce_loss_3": 5.255028605461121, + "ce_loss_7": 4.415169513225555, + "epoch": 0.012, + "grad_norm": 2040.0, + "kl_loss_10": 603.263638305664, + "kl_loss_2": 3843.529895019531, + "kl_loss_3": 3407.914306640625, + "kl_loss_7": 1813.6988830566406, + "learning_rate": 0.0009999899300364532, + "loss": 2390.1301, + "step": 120 + }, + { + "ce_loss_10": 3.7707916855812074, + "ce_loss_13": 3.475922393798828, + "ce_loss_2": 5.500737500190735, + "ce_loss_3": 5.266239738464355, + "ce_loss_7": 4.383851003646851, + "epoch": 0.013, + "grad_norm": 2624.0, + "kl_loss_10": 587.465219116211, + "kl_loss_2": 3864.1528442382814, + "kl_loss_3": 3434.2073608398437, + "kl_loss_7": 1783.62373046875, + "learning_rate": 0.0009999773426770863, + "loss": 2449.8629, + "step": 130 + }, + { + "ce_loss_10": 3.867912781238556, + "ce_loss_13": 3.509277641773224, + "ce_loss_2": 5.469627714157104, + "ce_loss_3": 5.194933176040649, + "ce_loss_7": 4.379399788379669, + "epoch": 0.014, + "grad_norm": 1992.0, + "kl_loss_10": 725.8971160888672, + "kl_loss_2": 3778.5274291992187, + "kl_loss_3": 3296.354528808594, + "kl_loss_7": 1732.4250732421874, + "learning_rate": 0.0009999597205514296, + "loss": 2405.5832, + "step": 140 + }, + { + "ce_loss_10": 3.77968590259552, + "ce_loss_13": 3.471135640144348, + "ce_loss_2": 5.36739604473114, + "ce_loss_3": 5.106346774101257, + "ce_loss_7": 4.320439124107361, + "epoch": 0.015, + "grad_norm": 1360.0, + "kl_loss_10": 624.2580291748047, + "kl_loss_2": 3632.6698486328123, + "kl_loss_3": 3176.562145996094, + "kl_loss_7": 1695.959698486328, + "learning_rate": 0.0009999370638369377, + "loss": 2293.6836, + "step": 150 + }, + { + "ce_loss_10": 3.8028363585472107, + "ce_loss_13": 3.5099030256271364, + "ce_loss_2": 5.394668865203857, + "ce_loss_3": 5.231132960319519, + "ce_loss_7": 4.338041806221009, + "epoch": 0.016, + "grad_norm": 3296.0, + "kl_loss_10": 591.1725463867188, + "kl_loss_2": 3644.2818969726563, + "kl_loss_3": 3364.4876342773437, + "kl_loss_7": 1640.8858154296875, + "learning_rate": 0.000999909372761763, + "loss": 2313.8473, + "step": 160 + }, + { + "ce_loss_10": 3.715697240829468, + "ce_loss_13": 3.4447871685028075, + "ce_loss_2": 5.341588139533997, + "ce_loss_3": 5.263697862625122, + "ce_loss_7": 4.2574918389320375, + "epoch": 0.017, + "grad_norm": 3008.0, + "kl_loss_10": 555.6443099975586, + "kl_loss_2": 3670.3303466796874, + "kl_loss_3": 3553.7742431640627, + "kl_loss_7": 1629.572772216797, + "learning_rate": 0.0009998766476047546, + "loss": 2372.3059, + "step": 170 + }, + { + "ce_loss_10": 3.7622690200805664, + "ce_loss_13": 3.4889180302619933, + "ce_loss_2": 5.369840741157532, + "ce_loss_3": 5.276954698562622, + "ce_loss_7": 4.275516867637634, + "epoch": 0.018, + "grad_norm": 2040.0, + "kl_loss_10": 565.5127258300781, + "kl_loss_2": 3642.269982910156, + "kl_loss_3": 3495.3918212890626, + "kl_loss_7": 1571.4158569335937, + "learning_rate": 0.0009998388886954545, + "loss": 2349.4688, + "step": 180 + }, + { + "ce_loss_10": 3.712801456451416, + "ce_loss_13": 3.4555400371551515, + "ce_loss_2": 5.312930059432984, + "ce_loss_3": 5.148007488250732, + "ce_loss_7": 4.23377754688263, + "epoch": 0.019, + "grad_norm": 1328.0, + "kl_loss_10": 534.2748489379883, + "kl_loss_2": 3599.4434326171877, + "kl_loss_3": 3312.2544921875, + "kl_loss_7": 1559.4685668945312, + "learning_rate": 0.0009997960964140947, + "loss": 2241.091, + "step": 190 + }, + { + "ce_loss_10": 3.6890772104263307, + "ce_loss_13": 3.4474449634552, + "ce_loss_2": 5.328355288505554, + "ce_loss_3": 5.103678369522095, + "ce_loss_7": 4.223123550415039, + "epoch": 0.02, + "grad_norm": 1584.0, + "kl_loss_10": 498.3182800292969, + "kl_loss_2": 3626.2685302734376, + "kl_loss_3": 3230.7111083984373, + "kl_loss_7": 1544.03125, + "learning_rate": 0.0009997482711915926, + "loss": 2212.8523, + "step": 200 + }, + { + "ce_loss_10": 3.643280267715454, + "ce_loss_13": 3.4110453128814697, + "ce_loss_2": 5.262782073020935, + "ce_loss_3": 5.006648206710816, + "ce_loss_7": 4.161720204353332, + "epoch": 0.021, + "grad_norm": 1320.0, + "kl_loss_10": 468.05088500976564, + "kl_loss_2": 3600.3508911132812, + "kl_loss_3": 3146.037072753906, + "kl_loss_7": 1514.3593139648438, + "learning_rate": 0.0009996954135095479, + "loss": 2163.3328, + "step": 210 + }, + { + "ce_loss_10": 3.743840980529785, + "ce_loss_13": 3.495615518093109, + "ce_loss_2": 5.276893544197082, + "ce_loss_3": 5.026828193664551, + "ce_loss_7": 4.215770494937897, + "epoch": 0.022, + "grad_norm": 952.0, + "kl_loss_10": 494.9872482299805, + "kl_loss_2": 3434.308557128906, + "kl_loss_3": 2996.470593261719, + "kl_loss_7": 1447.3476196289062, + "learning_rate": 0.0009996375239002368, + "loss": 2094.248, + "step": 220 + }, + { + "ce_loss_10": 3.8117304921150206, + "ce_loss_13": 3.5717169761657717, + "ce_loss_2": 5.300674176216125, + "ce_loss_3": 5.045718550682068, + "ce_loss_7": 4.271833729743958, + "epoch": 0.023, + "grad_norm": 1064.0, + "kl_loss_10": 491.1131820678711, + "kl_loss_2": 3352.0796875, + "kl_loss_3": 2909.836950683594, + "kl_loss_7": 1405.5986450195312, + "learning_rate": 0.0009995746029466072, + "loss": 2050.6086, + "step": 230 + }, + { + "ce_loss_10": 3.6075421810150146, + "ce_loss_13": 3.3550766468048097, + "ce_loss_2": 5.39588577747345, + "ce_loss_3": 4.985904622077942, + "ce_loss_7": 4.14452086687088, + "epoch": 0.024, + "grad_norm": 1496.0, + "kl_loss_10": 521.684194946289, + "kl_loss_2": 3944.3539916992186, + "kl_loss_3": 3201.487194824219, + "kl_loss_7": 1583.0420288085938, + "learning_rate": 0.0009995066512822719, + "loss": 2234.7746, + "step": 240 + }, + { + "ce_loss_10": 3.6849907636642456, + "ce_loss_13": 3.461445081233978, + "ce_loss_2": 5.414009666442871, + "ce_loss_3": 5.085514402389526, + "ce_loss_7": 4.184376835823059, + "epoch": 0.025, + "grad_norm": 1800.0, + "kl_loss_10": 465.4432067871094, + "kl_loss_2": 3782.4762451171873, + "kl_loss_3": 3206.6178466796873, + "kl_loss_7": 1450.9975891113281, + "learning_rate": 0.000999433669591504, + "loss": 2142.3535, + "step": 250 + }, + { + "ce_loss_10": 3.6025625109672545, + "ce_loss_13": 3.360257649421692, + "ce_loss_2": 5.237245011329651, + "ce_loss_3": 4.9437507629394535, + "ce_loss_7": 4.080421531200409, + "epoch": 0.026, + "grad_norm": 1408.0, + "kl_loss_10": 503.2585876464844, + "kl_loss_2": 3655.8213134765624, + "kl_loss_3": 3140.7313842773438, + "kl_loss_7": 1460.5682739257813, + "learning_rate": 0.000999355658609228, + "loss": 2133.6004, + "step": 260 + }, + { + "ce_loss_10": 3.6813029885292052, + "ce_loss_13": 3.395027530193329, + "ce_loss_2": 5.295657467842102, + "ce_loss_3": 5.023426985740661, + "ce_loss_7": 4.133508098125458, + "epoch": 0.027, + "grad_norm": 1416.0, + "kl_loss_10": 572.3903137207031, + "kl_loss_2": 3669.314978027344, + "kl_loss_3": 3183.432019042969, + "kl_loss_7": 1464.0882202148437, + "learning_rate": 0.0009992726191210138, + "loss": 2179.2967, + "step": 270 + }, + { + "ce_loss_10": 3.696367251873016, + "ce_loss_13": 3.433962404727936, + "ce_loss_2": 5.21666829586029, + "ce_loss_3": 4.99695348739624, + "ce_loss_7": 4.169408094882965, + "epoch": 0.028, + "grad_norm": 1880.0, + "kl_loss_10": 529.3393615722656, + "kl_loss_2": 3457.8086547851562, + "kl_loss_3": 3089.980187988281, + "kl_loss_7": 1482.3380798339845, + "learning_rate": 0.0009991845519630679, + "loss": 2115.8172, + "step": 280 + }, + { + "ce_loss_10": 3.556672739982605, + "ce_loss_13": 3.3172685623168947, + "ce_loss_2": 5.112358474731446, + "ce_loss_3": 4.917420530319214, + "ce_loss_7": 4.036571848392486, + "epoch": 0.029, + "grad_norm": 2000.0, + "kl_loss_10": 477.5372833251953, + "kl_loss_2": 3475.2529418945314, + "kl_loss_3": 3146.291943359375, + "kl_loss_7": 1444.9558898925782, + "learning_rate": 0.0009990914580222257, + "loss": 2130.9104, + "step": 290 + }, + { + "ce_loss_10": 3.6650490760803223, + "ce_loss_13": 3.455529069900513, + "ce_loss_2": 5.149494194984436, + "ce_loss_3": 4.940038657188415, + "ce_loss_7": 4.130682170391083, + "epoch": 0.03, + "grad_norm": 1560.0, + "kl_loss_10": 441.9183044433594, + "kl_loss_2": 3299.1113159179686, + "kl_loss_3": 2933.9962768554688, + "kl_loss_7": 1369.8260986328125, + "learning_rate": 0.0009989933382359422, + "loss": 2069.7893, + "step": 300 + }, + { + "ce_loss_10": 3.6985942125320435, + "ce_loss_13": 3.465806806087494, + "ce_loss_2": 5.143499898910522, + "ce_loss_3": 4.909449362754822, + "ce_loss_7": 4.1382394433021545, + "epoch": 0.031, + "grad_norm": 1120.0, + "kl_loss_10": 486.39428558349607, + "kl_loss_2": 3301.6126342773437, + "kl_loss_3": 2880.7590454101564, + "kl_loss_7": 1392.4209594726562, + "learning_rate": 0.0009988901935922825, + "loss": 2022.2506, + "step": 310 + }, + { + "ce_loss_10": 3.544218695163727, + "ce_loss_13": 3.314011883735657, + "ce_loss_2": 5.10150101184845, + "ce_loss_3": 4.842008900642395, + "ce_loss_7": 4.024464392662049, + "epoch": 0.032, + "grad_norm": 1472.0, + "kl_loss_10": 469.5253311157227, + "kl_loss_2": 3487.786022949219, + "kl_loss_3": 3028.9456176757812, + "kl_loss_7": 1444.1999450683593, + "learning_rate": 0.0009987820251299122, + "loss": 2047.4186, + "step": 320 + }, + { + "ce_loss_10": 3.67177551984787, + "ce_loss_13": 3.4466672420501707, + "ce_loss_2": 5.135675239562988, + "ce_loss_3": 4.8595945835113525, + "ce_loss_7": 4.121568500995636, + "epoch": 0.033, + "grad_norm": 1012.0, + "kl_loss_10": 450.39354553222654, + "kl_loss_2": 3306.36376953125, + "kl_loss_3": 2827.274365234375, + "kl_loss_7": 1387.9264587402345, + "learning_rate": 0.0009986688339380862, + "loss": 1975.759, + "step": 330 + }, + { + "ce_loss_10": 3.6029638409614564, + "ce_loss_13": 3.397365379333496, + "ce_loss_2": 5.057221698760986, + "ce_loss_3": 4.7936498641967775, + "ce_loss_7": 4.015895903110504, + "epoch": 0.034, + "grad_norm": 964.0, + "kl_loss_10": 436.06551971435545, + "kl_loss_2": 3221.0221435546873, + "kl_loss_3": 2750.7028198242188, + "kl_loss_7": 1266.2399963378907, + "learning_rate": 0.0009985506211566387, + "loss": 1936.1705, + "step": 340 + }, + { + "ce_loss_10": 3.6370130658149717, + "ce_loss_13": 3.4315125226974486, + "ce_loss_2": 5.061046314239502, + "ce_loss_3": 4.7848950862884525, + "ce_loss_7": 4.02301949262619, + "epoch": 0.035, + "grad_norm": 908.0, + "kl_loss_10": 422.44232482910155, + "kl_loss_2": 3177.237683105469, + "kl_loss_3": 2690.365087890625, + "kl_loss_7": 1217.3221435546875, + "learning_rate": 0.0009984273879759713, + "loss": 1896.4475, + "step": 350 + }, + { + "ce_loss_10": 3.656745362281799, + "ce_loss_13": 3.4570682406425477, + "ce_loss_2": 5.137815022468567, + "ce_loss_3": 4.873619461059571, + "ce_loss_7": 4.083463799953461, + "epoch": 0.036, + "grad_norm": 860.0, + "kl_loss_10": 423.36531524658204, + "kl_loss_2": 3273.7410522460937, + "kl_loss_3": 2798.973498535156, + "kl_loss_7": 1268.806689453125, + "learning_rate": 0.0009982991356370402, + "loss": 1973.0059, + "step": 360 + }, + { + "ce_loss_10": 3.631060302257538, + "ce_loss_13": 3.4341819286346436, + "ce_loss_2": 5.11769585609436, + "ce_loss_3": 4.834220147132873, + "ce_loss_7": 4.047011601924896, + "epoch": 0.037, + "grad_norm": 908.0, + "kl_loss_10": 402.81113891601564, + "kl_loss_2": 3276.052880859375, + "kl_loss_3": 2791.4117065429687, + "kl_loss_7": 1245.3732849121093, + "learning_rate": 0.0009981658654313456, + "loss": 1941.0666, + "step": 370 + }, + { + "ce_loss_10": 3.7020971179008484, + "ce_loss_13": 3.5137467861175535, + "ce_loss_2": 5.156114864349365, + "ce_loss_3": 4.874492716789246, + "ce_loss_7": 4.092896187305451, + "epoch": 0.038, + "grad_norm": 744.0, + "kl_loss_10": 382.19567413330077, + "kl_loss_2": 3216.3584228515624, + "kl_loss_3": 2713.040576171875, + "kl_loss_7": 1200.0062133789063, + "learning_rate": 0.000998027578700917, + "loss": 1916.7457, + "step": 380 + }, + { + "ce_loss_10": 3.629340207576752, + "ce_loss_13": 3.4466560959815977, + "ce_loss_2": 5.104858756065369, + "ce_loss_3": 4.827202153205872, + "ce_loss_7": 4.051906526088715, + "epoch": 0.039, + "grad_norm": 768.0, + "kl_loss_10": 387.5618530273438, + "kl_loss_2": 3239.9035766601564, + "kl_loss_3": 2754.5089477539063, + "kl_loss_7": 1245.49443359375, + "learning_rate": 0.0009978842768382998, + "loss": 1919.6182, + "step": 390 + }, + { + "ce_loss_10": 3.6458646416664124, + "ce_loss_13": 3.4677427411079407, + "ce_loss_2": 5.076069569587707, + "ce_loss_3": 4.798897671699524, + "ce_loss_7": 4.036490082740784, + "epoch": 0.04, + "grad_norm": 820.0, + "kl_loss_10": 365.37960052490234, + "kl_loss_2": 3139.388073730469, + "kl_loss_3": 2645.03125, + "kl_loss_7": 1170.367169189453, + "learning_rate": 0.0009977359612865424, + "loss": 1848.2086, + "step": 400 + }, + { + "ce_loss_10": 3.6510703682899477, + "ce_loss_13": 3.472544801235199, + "ce_loss_2": 5.0927152872085575, + "ce_loss_3": 4.818557095527649, + "ce_loss_7": 4.048879408836365, + "epoch": 0.041, + "grad_norm": 752.0, + "kl_loss_10": 376.4607299804687, + "kl_loss_2": 3183.3623168945314, + "kl_loss_3": 2696.904638671875, + "kl_loss_7": 1198.10205078125, + "learning_rate": 0.0009975826335391806, + "loss": 1850.6066, + "step": 410 + }, + { + "ce_loss_10": 3.664944088459015, + "ce_loss_13": 3.4915570259094237, + "ce_loss_2": 5.092843031883239, + "ce_loss_3": 4.81715497970581, + "ce_loss_7": 4.06898148059845, + "epoch": 0.042, + "grad_norm": 1072.0, + "kl_loss_10": 367.5922546386719, + "kl_loss_2": 3121.2123901367186, + "kl_loss_3": 2637.7326782226564, + "kl_loss_7": 1178.5750122070312, + "learning_rate": 0.0009974242951402235, + "loss": 1847.4906, + "step": 420 + }, + { + "ce_loss_10": 3.6901652693748472, + "ce_loss_13": 3.5015287518501284, + "ce_loss_2": 5.113956260681152, + "ce_loss_3": 4.831623649597168, + "ce_loss_7": 4.073742997646332, + "epoch": 0.043, + "grad_norm": 932.0, + "kl_loss_10": 391.2719299316406, + "kl_loss_2": 3171.550817871094, + "kl_loss_3": 2672.8351318359373, + "kl_loss_7": 1198.8372039794922, + "learning_rate": 0.0009972609476841367, + "loss": 1839.4168, + "step": 430 + }, + { + "ce_loss_10": 3.592795264720917, + "ce_loss_13": 3.407615542411804, + "ce_loss_2": 5.048645877838135, + "ce_loss_3": 4.779121279716492, + "ce_loss_7": 3.982761597633362, + "epoch": 0.044, + "grad_norm": 928.0, + "kl_loss_10": 377.92359313964846, + "kl_loss_2": 3195.713342285156, + "kl_loss_3": 2713.7881591796877, + "kl_loss_7": 1185.5625, + "learning_rate": 0.0009970925928158272, + "loss": 1868.092, + "step": 440 + }, + { + "ce_loss_10": 3.542843294143677, + "ce_loss_13": 3.354374420642853, + "ce_loss_2": 5.013250637054443, + "ce_loss_3": 4.739123964309693, + "ce_loss_7": 3.935924601554871, + "epoch": 0.045, + "grad_norm": 740.0, + "kl_loss_10": 385.2865692138672, + "kl_loss_2": 3278.8071044921876, + "kl_loss_3": 2790.4721435546876, + "kl_loss_7": 1226.6742797851562, + "learning_rate": 0.000996919232230627, + "loss": 1885.8758, + "step": 450 + }, + { + "ce_loss_10": 3.609917199611664, + "ce_loss_13": 3.4386712551116942, + "ce_loss_2": 5.020998239517212, + "ce_loss_3": 4.756829810142517, + "ce_loss_7": 4.001234555244446, + "epoch": 0.046, + "grad_norm": 872.0, + "kl_loss_10": 358.4470748901367, + "kl_loss_2": 3100.1795166015627, + "kl_loss_3": 2620.8273803710936, + "kl_loss_7": 1157.8196044921874, + "learning_rate": 0.0009967408676742752, + "loss": 1772.8766, + "step": 460 + }, + { + "ce_loss_10": 3.7562451124191285, + "ce_loss_13": 3.5811493396759033, + "ce_loss_2": 5.11839497089386, + "ce_loss_3": 4.844864320755005, + "ce_loss_7": 4.1195793628692625, + "epoch": 0.047, + "grad_norm": 968.0, + "kl_loss_10": 364.69328155517576, + "kl_loss_2": 3032.6340087890626, + "kl_loss_3": 2548.6266967773436, + "kl_loss_7": 1130.8773040771484, + "learning_rate": 0.0009965575009429006, + "loss": 1825.8629, + "step": 470 + }, + { + "ce_loss_10": 3.542626643180847, + "ce_loss_13": 3.364771544933319, + "ce_loss_2": 4.9806403636932375, + "ce_loss_3": 4.703183531761169, + "ce_loss_7": 3.9297071576118467, + "epoch": 0.048, + "grad_norm": 772.0, + "kl_loss_10": 368.38177795410155, + "kl_loss_2": 3172.022900390625, + "kl_loss_3": 2678.579626464844, + "kl_loss_7": 1172.0243133544923, + "learning_rate": 0.0009963691338830043, + "loss": 1818.5924, + "step": 480 + }, + { + "ce_loss_10": 3.6282991647720335, + "ce_loss_13": 3.4611623764038084, + "ce_loss_2": 5.030923771858215, + "ce_loss_3": 4.765255475044251, + "ce_loss_7": 3.995365762710571, + "epoch": 0.049, + "grad_norm": 944.0, + "kl_loss_10": 346.68406372070314, + "kl_loss_2": 3111.6420288085938, + "kl_loss_3": 2633.9958740234374, + "kl_loss_7": 1125.4209197998048, + "learning_rate": 0.0009961757683914405, + "loss": 1782.6619, + "step": 490 + }, + { + "ce_loss_10": 3.6188631772994997, + "ce_loss_13": 3.450295829772949, + "ce_loss_2": 4.988259315490723, + "ce_loss_3": 4.726764726638794, + "ce_loss_7": 4.00168125629425, + "epoch": 0.05, + "grad_norm": 1184.0, + "kl_loss_10": 362.3049346923828, + "kl_loss_2": 3035.001806640625, + "kl_loss_3": 2588.0874145507814, + "kl_loss_7": 1166.9710693359375, + "learning_rate": 0.0009959774064153978, + "loss": 1805.0438, + "step": 500 + }, + { + "ce_loss_10": 3.623943197727203, + "ce_loss_13": 3.4620243430137636, + "ce_loss_2": 4.959137892723083, + "ce_loss_3": 4.687646722793579, + "ce_loss_7": 3.976128029823303, + "epoch": 0.051, + "grad_norm": 856.0, + "kl_loss_10": 343.2813385009766, + "kl_loss_2": 2963.6288208007813, + "kl_loss_3": 2485.8935424804686, + "kl_loss_7": 1086.964697265625, + "learning_rate": 0.0009957740499523787, + "loss": 1751.4643, + "step": 510 + }, + { + "ce_loss_10": 3.6490553617477417, + "ce_loss_13": 3.476555550098419, + "ce_loss_2": 4.994476556777954, + "ce_loss_3": 4.725382924079895, + "ce_loss_7": 4.001345467567444, + "epoch": 0.052, + "grad_norm": 808.0, + "kl_loss_10": 347.32325134277346, + "kl_loss_2": 2968.236572265625, + "kl_loss_3": 2495.7832275390624, + "kl_loss_7": 1099.3744354248047, + "learning_rate": 0.0009955657010501807, + "loss": 1740.4176, + "step": 520 + }, + { + "ce_loss_10": 3.6094146251678465, + "ce_loss_13": 3.4360305190086367, + "ce_loss_2": 4.987359571456909, + "ce_loss_3": 4.711909174919128, + "ce_loss_7": 3.96878160238266, + "epoch": 0.053, + "grad_norm": 732.0, + "kl_loss_10": 356.96947326660154, + "kl_loss_2": 3066.1166015625, + "kl_loss_3": 2574.2064819335938, + "kl_loss_7": 1113.183071899414, + "learning_rate": 0.000995352361806875, + "loss": 1757.3914, + "step": 530 + }, + { + "ce_loss_10": 3.6483884930610655, + "ce_loss_13": 3.4761168599128722, + "ce_loss_2": 5.01164448261261, + "ce_loss_3": 4.73403651714325, + "ce_loss_7": 4.005823755264283, + "epoch": 0.054, + "grad_norm": 868.0, + "kl_loss_10": 358.3400619506836, + "kl_loss_2": 3025.010693359375, + "kl_loss_3": 2540.0701538085937, + "kl_loss_7": 1117.8957305908202, + "learning_rate": 0.0009951340343707852, + "loss": 1783.3418, + "step": 540 + }, + { + "ce_loss_10": 3.693763518333435, + "ce_loss_13": 3.5300124645233155, + "ce_loss_2": 5.04529185295105, + "ce_loss_3": 4.776751947402954, + "ce_loss_7": 4.050560343265533, + "epoch": 0.055, + "grad_norm": 580.0, + "kl_loss_10": 343.201188659668, + "kl_loss_2": 2966.6511840820312, + "kl_loss_3": 2491.584606933594, + "kl_loss_7": 1070.658499145508, + "learning_rate": 0.0009949107209404665, + "loss": 1740.307, + "step": 550 + }, + { + "ce_loss_10": 3.618695652484894, + "ce_loss_13": 3.4460346341133117, + "ce_loss_2": 4.953143644332886, + "ce_loss_3": 4.67682032585144, + "ce_loss_7": 3.9601072311401366, + "epoch": 0.056, + "grad_norm": 972.0, + "kl_loss_10": 355.8962005615234, + "kl_loss_2": 2990.009143066406, + "kl_loss_3": 2495.9183959960938, + "kl_loss_7": 1092.0468170166016, + "learning_rate": 0.0009946824237646824, + "loss": 1737.0576, + "step": 560 + }, + { + "ce_loss_10": 3.5657299041748045, + "ce_loss_13": 3.3921077370643617, + "ce_loss_2": 4.9473305463790895, + "ce_loss_3": 4.655314612388611, + "ce_loss_7": 3.9485832929611204, + "epoch": 0.057, + "grad_norm": 1232.0, + "kl_loss_10": 368.3774078369141, + "kl_loss_2": 3077.5546997070314, + "kl_loss_3": 2563.977990722656, + "kl_loss_7": 1171.9384887695312, + "learning_rate": 0.0009944491451423828, + "loss": 1812.8215, + "step": 570 + }, + { + "ce_loss_10": 3.5597246408462526, + "ce_loss_13": 3.38997106552124, + "ce_loss_2": 4.957224941253662, + "ce_loss_3": 4.669384074211121, + "ce_loss_7": 3.9783090591430663, + "epoch": 0.058, + "grad_norm": 1048.0, + "kl_loss_10": 352.9766845703125, + "kl_loss_2": 3080.3538452148437, + "kl_loss_3": 2573.69345703125, + "kl_loss_7": 1221.7482543945312, + "learning_rate": 0.0009942108874226813, + "loss": 1775.8918, + "step": 580 + }, + { + "ce_loss_10": 3.667470908164978, + "ce_loss_13": 3.5143657088279725, + "ce_loss_2": 4.977405524253845, + "ce_loss_3": 4.70070378780365, + "ce_loss_7": 4.062089693546295, + "epoch": 0.059, + "grad_norm": 1160.0, + "kl_loss_10": 326.54786376953126, + "kl_loss_2": 2889.81787109375, + "kl_loss_3": 2394.497277832031, + "kl_loss_7": 1154.6502380371094, + "learning_rate": 0.00099396765300483, + "loss": 1684.8838, + "step": 590 + }, + { + "ce_loss_10": 3.65077520608902, + "ce_loss_13": 3.4909046292304993, + "ce_loss_2": 4.953103184700012, + "ce_loss_3": 4.675009846687317, + "ce_loss_7": 4.037787747383118, + "epoch": 0.06, + "grad_norm": 728.0, + "kl_loss_10": 333.6824432373047, + "kl_loss_2": 2888.043603515625, + "kl_loss_3": 2401.467254638672, + "kl_loss_7": 1146.9622497558594, + "learning_rate": 0.0009937194443381972, + "loss": 1692.9094, + "step": 600 + }, + { + "ce_loss_10": 3.6720112562179565, + "ce_loss_13": 3.5144667506217955, + "ce_loss_2": 4.945521140098572, + "ce_loss_3": 4.670798707008362, + "ce_loss_7": 4.003339779376984, + "epoch": 0.061, + "grad_norm": 728.0, + "kl_loss_10": 340.24414978027346, + "kl_loss_2": 2848.255480957031, + "kl_loss_3": 2358.6506469726564, + "kl_loss_7": 1042.5767547607422, + "learning_rate": 0.0009934662639222412, + "loss": 1695.9006, + "step": 610 + }, + { + "ce_loss_10": 3.6284273624420167, + "ce_loss_13": 3.466042399406433, + "ce_loss_2": 4.974099659919739, + "ce_loss_3": 4.698220872879029, + "ce_loss_7": 3.9703264474868774, + "epoch": 0.062, + "grad_norm": 708.0, + "kl_loss_10": 346.28453369140624, + "kl_loss_2": 2978.781689453125, + "kl_loss_3": 2496.677685546875, + "kl_loss_7": 1062.910955810547, + "learning_rate": 0.000993208114306486, + "loss": 1704.2672, + "step": 620 + }, + { + "ce_loss_10": 3.5462576508522035, + "ce_loss_13": 3.380946898460388, + "ce_loss_2": 4.922283387184143, + "ce_loss_3": 4.633153581619263, + "ce_loss_7": 3.890825295448303, + "epoch": 0.063, + "grad_norm": 924.0, + "kl_loss_10": 358.3551940917969, + "kl_loss_2": 3032.9190673828125, + "kl_loss_3": 2531.955603027344, + "kl_loss_7": 1071.5194458007813, + "learning_rate": 0.0009929449980904952, + "loss": 1693.2549, + "step": 630 + }, + { + "ce_loss_10": 3.6085665225982666, + "ce_loss_13": 3.444735288619995, + "ce_loss_2": 4.934487676620483, + "ce_loss_3": 4.655823493003846, + "ce_loss_7": 3.935356545448303, + "epoch": 0.064, + "grad_norm": 676.0, + "kl_loss_10": 344.3735855102539, + "kl_loss_2": 2962.493859863281, + "kl_loss_3": 2465.4102416992187, + "kl_loss_7": 1045.9244415283204, + "learning_rate": 0.0009926769179238466, + "loss": 1690.2553, + "step": 640 + }, + { + "ce_loss_10": 3.657176661491394, + "ce_loss_13": 3.4894155979156496, + "ce_loss_2": 4.984645247459412, + "ce_loss_3": 4.697536993026733, + "ce_loss_7": 3.984337937831879, + "epoch": 0.065, + "grad_norm": 812.0, + "kl_loss_10": 351.49694671630857, + "kl_loss_2": 2961.2925659179687, + "kl_loss_3": 2455.3551025390625, + "kl_loss_7": 1056.1930267333985, + "learning_rate": 0.000992403876506104, + "loss": 1699.9176, + "step": 650 + }, + { + "ce_loss_10": 3.5853109121322633, + "ce_loss_13": 3.4265636444091796, + "ce_loss_2": 4.949072217941284, + "ce_loss_3": 4.657009506225586, + "ce_loss_7": 3.9192400932312013, + "epoch": 0.066, + "grad_norm": 772.0, + "kl_loss_10": 332.7637084960937, + "kl_loss_2": 3005.3072998046873, + "kl_loss_3": 2488.590020751953, + "kl_loss_7": 1034.6812866210937, + "learning_rate": 0.0009921258765867918, + "loss": 1712.7359, + "step": 660 + }, + { + "ce_loss_10": 3.543238043785095, + "ce_loss_13": 3.392865073680878, + "ce_loss_2": 4.929511904716492, + "ce_loss_3": 4.673877739906311, + "ce_loss_7": 3.8979400753974915, + "epoch": 0.067, + "grad_norm": 1216.0, + "kl_loss_10": 326.31287689208983, + "kl_loss_2": 3073.3475952148438, + "kl_loss_3": 2606.592980957031, + "kl_loss_7": 1089.1828704833983, + "learning_rate": 0.0009918429209653662, + "loss": 1742.882, + "step": 670 + }, + { + "ce_loss_10": 3.60556218624115, + "ce_loss_13": 3.451234769821167, + "ce_loss_2": 4.9643912553787235, + "ce_loss_3": 4.685172462463379, + "ce_loss_7": 3.9489428043365478, + "epoch": 0.068, + "grad_norm": 700.0, + "kl_loss_10": 326.2720092773437, + "kl_loss_2": 2991.5349365234374, + "kl_loss_3": 2499.9562133789063, + "kl_loss_7": 1058.8525512695312, + "learning_rate": 0.0009915550124911866, + "loss": 1675.9207, + "step": 680 + }, + { + "ce_loss_10": 3.6152788639068603, + "ce_loss_13": 3.463881015777588, + "ce_loss_2": 4.9310142517089846, + "ce_loss_3": 4.651708984375, + "ce_loss_7": 3.939838695526123, + "epoch": 0.069, + "grad_norm": 716.0, + "kl_loss_10": 321.7038208007813, + "kl_loss_2": 2904.1300415039063, + "kl_loss_3": 2416.9381103515625, + "kl_loss_7": 1006.4677703857421, + "learning_rate": 0.0009912621540634887, + "loss": 1665.2684, + "step": 690 + }, + { + "ce_loss_10": 3.6430228471755983, + "ce_loss_13": 3.4952101826667787, + "ce_loss_2": 4.929437565803528, + "ce_loss_3": 4.6475961923599245, + "ce_loss_7": 3.9429776191711428, + "epoch": 0.07, + "grad_norm": 676.0, + "kl_loss_10": 309.61268615722656, + "kl_loss_2": 2848.814501953125, + "kl_loss_3": 2359.46318359375, + "kl_loss_7": 970.4827117919922, + "learning_rate": 0.0009909643486313534, + "loss": 1639.2395, + "step": 700 + }, + { + "ce_loss_10": 3.526335525512695, + "ce_loss_13": 3.3703501343727114, + "ce_loss_2": 4.889838075637817, + "ce_loss_3": 4.6030642032623295, + "ce_loss_7": 3.8521526575088503, + "epoch": 0.071, + "grad_norm": 744.0, + "kl_loss_10": 340.5390853881836, + "kl_loss_2": 3011.11240234375, + "kl_loss_3": 2515.4811889648436, + "kl_loss_7": 1017.710009765625, + "learning_rate": 0.000990661599193678, + "loss": 1737.2715, + "step": 710 + }, + { + "ce_loss_10": 3.6673552870750425, + "ce_loss_13": 3.5033403754234316, + "ce_loss_2": 4.93988311290741, + "ce_loss_3": 4.67398898601532, + "ce_loss_7": 3.9688475489616395, + "epoch": 0.072, + "grad_norm": 796.0, + "kl_loss_10": 340.83275604248047, + "kl_loss_2": 2865.5041381835936, + "kl_loss_3": 2386.658837890625, + "kl_loss_7": 996.4190338134765, + "learning_rate": 0.0009903539087991462, + "loss": 1651.048, + "step": 720 + }, + { + "ce_loss_10": 3.6324875712394715, + "ce_loss_13": 3.4752389669418333, + "ce_loss_2": 4.927369832992554, + "ce_loss_3": 4.656826686859131, + "ce_loss_7": 3.941362977027893, + "epoch": 0.073, + "grad_norm": 672.0, + "kl_loss_10": 338.573225402832, + "kl_loss_2": 2878.319189453125, + "kl_loss_3": 2403.4671142578127, + "kl_loss_7": 991.7197357177735, + "learning_rate": 0.0009900412805461966, + "loss": 1664.0748, + "step": 730 + }, + { + "ce_loss_10": 3.697860896587372, + "ce_loss_13": 3.5502901554107664, + "ce_loss_2": 4.959825038909912, + "ce_loss_3": 4.680054187774658, + "ce_loss_7": 4.008367860317231, + "epoch": 0.074, + "grad_norm": 796.0, + "kl_loss_10": 322.8813171386719, + "kl_loss_2": 2810.9089233398436, + "kl_loss_3": 2318.1740844726564, + "kl_loss_7": 980.3480072021484, + "learning_rate": 0.0009897237175829927, + "loss": 1630.2344, + "step": 740 + }, + { + "ce_loss_10": 3.5930413126945497, + "ce_loss_13": 3.43618665933609, + "ce_loss_2": 4.910944557189941, + "ce_loss_3": 4.628472471237183, + "ce_loss_7": 3.9170363903045655, + "epoch": 0.075, + "grad_norm": 720.0, + "kl_loss_10": 332.21988067626955, + "kl_loss_2": 2928.557727050781, + "kl_loss_3": 2429.9159301757813, + "kl_loss_7": 1037.6262634277343, + "learning_rate": 0.0009894012231073895, + "loss": 1665.4367, + "step": 750 + }, + { + "ce_loss_10": 3.6464996695518495, + "ce_loss_13": 3.4838218331336974, + "ce_loss_2": 4.924402260780335, + "ce_loss_3": 4.645452523231507, + "ce_loss_7": 3.9448330640792846, + "epoch": 0.076, + "grad_norm": 812.0, + "kl_loss_10": 338.6822082519531, + "kl_loss_2": 2855.4515014648437, + "kl_loss_3": 2358.476416015625, + "kl_loss_7": 978.1411010742188, + "learning_rate": 0.0009890738003669028, + "loss": 1654.1621, + "step": 760 + }, + { + "ce_loss_10": 3.617565965652466, + "ce_loss_13": 3.455268681049347, + "ce_loss_2": 4.933386254310608, + "ce_loss_3": 4.651405668258667, + "ce_loss_7": 3.9341206789016723, + "epoch": 0.077, + "grad_norm": 756.0, + "kl_loss_10": 337.93136138916014, + "kl_loss_2": 2949.602490234375, + "kl_loss_3": 2451.218469238281, + "kl_loss_7": 1020.4571960449218, + "learning_rate": 0.0009887414526586764, + "loss": 1640.4555, + "step": 770 + }, + { + "ce_loss_10": 3.6583608746528626, + "ce_loss_13": 3.512969744205475, + "ce_loss_2": 4.9441753149032595, + "ce_loss_3": 4.656214547157288, + "ce_loss_7": 3.964318811893463, + "epoch": 0.078, + "grad_norm": 720.0, + "kl_loss_10": 313.43713836669923, + "kl_loss_2": 2854.152880859375, + "kl_loss_3": 2348.5727111816404, + "kl_loss_7": 969.1142120361328, + "learning_rate": 0.0009884041833294476, + "loss": 1599.7842, + "step": 780 + }, + { + "ce_loss_10": 3.6560466647148133, + "ce_loss_13": 3.508514332771301, + "ce_loss_2": 4.940361285209656, + "ce_loss_3": 4.645708775520324, + "ce_loss_7": 3.958513784408569, + "epoch": 0.079, + "grad_norm": 832.0, + "kl_loss_10": 319.23270416259766, + "kl_loss_2": 2852.032861328125, + "kl_loss_3": 2330.51533203125, + "kl_loss_7": 969.9107818603516, + "learning_rate": 0.000988061995775515, + "loss": 1668.3449, + "step": 790 + }, + { + "ce_loss_10": 3.5980430364608766, + "ce_loss_13": 3.440366840362549, + "ce_loss_2": 4.8732929706573485, + "ce_loss_3": 4.587121820449829, + "ce_loss_7": 3.9043478846549986, + "epoch": 0.08, + "grad_norm": 752.0, + "kl_loss_10": 323.7010192871094, + "kl_loss_2": 2868.414514160156, + "kl_loss_3": 2356.581396484375, + "kl_loss_7": 987.0360656738281, + "learning_rate": 0.0009877148934427035, + "loss": 1633.2111, + "step": 800 + }, + { + "ce_loss_10": 3.633367455005646, + "ce_loss_13": 3.4834325551986693, + "ce_loss_2": 4.935962653160095, + "ce_loss_3": 4.627329421043396, + "ce_loss_7": 3.925890827178955, + "epoch": 0.081, + "grad_norm": 820.0, + "kl_loss_10": 330.4556167602539, + "kl_loss_2": 2885.1009033203127, + "kl_loss_3": 2351.8309020996094, + "kl_loss_7": 957.707730102539, + "learning_rate": 0.0009873628798263297, + "loss": 1611.097, + "step": 810 + }, + { + "ce_loss_10": 3.605324161052704, + "ce_loss_13": 3.438004171848297, + "ce_loss_2": 4.856884765625, + "ce_loss_3": 4.56407413482666, + "ce_loss_7": 3.8718148946762083, + "epoch": 0.082, + "grad_norm": 840.0, + "kl_loss_10": 339.57317504882815, + "kl_loss_2": 2826.0930053710936, + "kl_loss_3": 2305.033795166016, + "kl_loss_7": 931.82373046875, + "learning_rate": 0.0009870059584711668, + "loss": 1639.3607, + "step": 820 + }, + { + "ce_loss_10": 3.60188170671463, + "ce_loss_13": 3.455841100215912, + "ce_loss_2": 4.85420286655426, + "ce_loss_3": 4.581924772262573, + "ce_loss_7": 3.8951406598091127, + "epoch": 0.083, + "grad_norm": 720.0, + "kl_loss_10": 317.57149810791014, + "kl_loss_2": 2801.2140380859373, + "kl_loss_3": 2316.871270751953, + "kl_loss_7": 949.605337524414, + "learning_rate": 0.000986644132971409, + "loss": 1599.6842, + "step": 830 + }, + { + "ce_loss_10": 3.5939020037651064, + "ce_loss_13": 3.4429898500442504, + "ce_loss_2": 4.88135507106781, + "ce_loss_3": 4.604088640213012, + "ce_loss_7": 3.9158664107322694, + "epoch": 0.084, + "grad_norm": 932.0, + "kl_loss_10": 322.8277191162109, + "kl_loss_2": 2865.847692871094, + "kl_loss_3": 2367.4215576171873, + "kl_loss_7": 996.9576171875, + "learning_rate": 0.0009862774069706345, + "loss": 1629.1093, + "step": 840 + }, + { + "ce_loss_10": 3.710948944091797, + "ce_loss_13": 3.5685924649238587, + "ce_loss_2": 4.930621600151062, + "ce_loss_3": 4.65263340473175, + "ce_loss_7": 3.9990792274475098, + "epoch": 0.085, + "grad_norm": 684.0, + "kl_loss_10": 304.0562255859375, + "kl_loss_2": 2742.24169921875, + "kl_loss_3": 2253.91962890625, + "kl_loss_7": 950.4928100585937, + "learning_rate": 0.000985905784161771, + "loss": 1590.0119, + "step": 850 + }, + { + "ce_loss_10": 3.63605819940567, + "ce_loss_13": 3.4998138546943665, + "ce_loss_2": 4.900371265411377, + "ce_loss_3": 4.62078812122345, + "ce_loss_7": 3.934238409996033, + "epoch": 0.086, + "grad_norm": 748.0, + "kl_loss_10": 294.4667907714844, + "kl_loss_2": 2800.617395019531, + "kl_loss_3": 2314.4944458007812, + "kl_loss_7": 955.0795837402344, + "learning_rate": 0.000985529268287055, + "loss": 1585.186, + "step": 860 + }, + { + "ce_loss_10": 3.5651148438453673, + "ce_loss_13": 3.4233306527137755, + "ce_loss_2": 4.871410083770752, + "ce_loss_3": 4.5925886869430546, + "ce_loss_7": 3.877922761440277, + "epoch": 0.087, + "grad_norm": 796.0, + "kl_loss_10": 301.2444900512695, + "kl_loss_2": 2878.2498046875, + "kl_loss_3": 2387.6543212890624, + "kl_loss_7": 975.5103942871094, + "learning_rate": 0.0009851478631379982, + "loss": 1626.462, + "step": 870 + }, + { + "ce_loss_10": 3.6220229983329775, + "ce_loss_13": 3.4835654973983763, + "ce_loss_2": 4.903548383712769, + "ce_loss_3": 4.61605658531189, + "ce_loss_7": 3.9362378478050233, + "epoch": 0.088, + "grad_norm": 844.0, + "kl_loss_10": 293.3538963317871, + "kl_loss_2": 2833.7354125976562, + "kl_loss_3": 2335.5184326171875, + "kl_loss_7": 967.1238098144531, + "learning_rate": 0.0009847615725553456, + "loss": 1597.0803, + "step": 880 + }, + { + "ce_loss_10": 3.671082556247711, + "ce_loss_13": 3.542756676673889, + "ce_loss_2": 4.8840786695480345, + "ce_loss_3": 4.608758640289307, + "ce_loss_7": 3.9651415824890135, + "epoch": 0.089, + "grad_norm": 676.0, + "kl_loss_10": 274.7398094177246, + "kl_loss_2": 2672.2400390625, + "kl_loss_3": 2185.940838623047, + "kl_loss_7": 914.7755340576172, + "learning_rate": 0.0009843704004290394, + "loss": 1572.2007, + "step": 890 + }, + { + "ce_loss_10": 3.5845912218093874, + "ce_loss_13": 3.4463690519332886, + "ce_loss_2": 4.845745182037353, + "ce_loss_3": 4.566518807411194, + "ce_loss_7": 3.8977394104003906, + "epoch": 0.09, + "grad_norm": 800.0, + "kl_loss_10": 293.04640731811526, + "kl_loss_2": 2812.2204833984374, + "kl_loss_3": 2313.156042480469, + "kl_loss_7": 966.190869140625, + "learning_rate": 0.0009839743506981783, + "loss": 1597.2805, + "step": 900 + }, + { + "ce_loss_10": 3.5071211099624633, + "ce_loss_13": 3.369294321537018, + "ce_loss_2": 4.836311769485474, + "ce_loss_3": 4.550878620147705, + "ce_loss_7": 3.8309507608413695, + "epoch": 0.091, + "grad_norm": 716.0, + "kl_loss_10": 298.81206665039065, + "kl_loss_2": 2958.2573974609377, + "kl_loss_3": 2443.9187561035155, + "kl_loss_7": 1005.0242462158203, + "learning_rate": 0.0009835734273509786, + "loss": 1627.2797, + "step": 910 + }, + { + "ce_loss_10": 3.6050177574157716, + "ce_loss_13": 3.4665517807006836, + "ce_loss_2": 4.881958699226379, + "ce_loss_3": 4.6013915777206424, + "ce_loss_7": 3.9145362257957457, + "epoch": 0.092, + "grad_norm": 720.0, + "kl_loss_10": 288.0885604858398, + "kl_loss_2": 2799.756945800781, + "kl_loss_3": 2307.6742553710938, + "kl_loss_7": 959.5810729980469, + "learning_rate": 0.0009831676344247342, + "loss": 1585.5819, + "step": 920 + }, + { + "ce_loss_10": 3.615782046318054, + "ce_loss_13": 3.484424388408661, + "ce_loss_2": 4.840068244934082, + "ce_loss_3": 4.566077804565429, + "ce_loss_7": 3.905368459224701, + "epoch": 0.093, + "grad_norm": 592.0, + "kl_loss_10": 284.13806304931643, + "kl_loss_2": 2716.098291015625, + "kl_loss_3": 2237.568524169922, + "kl_loss_7": 925.932373046875, + "learning_rate": 0.0009827569760057755, + "loss": 1574.975, + "step": 930 + }, + { + "ce_loss_10": 3.5478480219841004, + "ce_loss_13": 3.4008304595947267, + "ce_loss_2": 4.878832292556763, + "ce_loss_3": 4.597835183143616, + "ce_loss_7": 3.860486149787903, + "epoch": 0.094, + "grad_norm": 812.0, + "kl_loss_10": 311.2947525024414, + "kl_loss_2": 2955.33916015625, + "kl_loss_3": 2458.781884765625, + "kl_loss_7": 985.0075500488281, + "learning_rate": 0.000982341456229428, + "loss": 1619.0104, + "step": 940 + }, + { + "ce_loss_10": 3.6401113510131835, + "ce_loss_13": 3.4997127175331117, + "ce_loss_2": 4.909311819076538, + "ce_loss_3": 4.633120918273926, + "ce_loss_7": 3.936661887168884, + "epoch": 0.095, + "grad_norm": 768.0, + "kl_loss_10": 304.94605484008787, + "kl_loss_2": 2847.3047485351562, + "kl_loss_3": 2358.746990966797, + "kl_loss_7": 958.3424041748046, + "learning_rate": 0.000981921079279971, + "loss": 1575.8767, + "step": 950 + }, + { + "ce_loss_10": 3.6493973970413207, + "ce_loss_13": 3.5170445680618285, + "ce_loss_2": 4.842743754386902, + "ce_loss_3": 4.559553527832032, + "ce_loss_7": 3.913013446331024, + "epoch": 0.096, + "grad_norm": 632.0, + "kl_loss_10": 287.80171127319335, + "kl_loss_2": 2681.031005859375, + "kl_loss_3": 2186.1464904785157, + "kl_loss_7": 891.4322113037109, + "learning_rate": 0.0009814958493905962, + "loss": 1541.8673, + "step": 960 + }, + { + "ce_loss_10": 3.6059035897254943, + "ce_loss_13": 3.464053213596344, + "ce_loss_2": 4.885409092903137, + "ce_loss_3": 4.605575942993164, + "ce_loss_7": 3.901495134830475, + "epoch": 0.097, + "grad_norm": 644.0, + "kl_loss_10": 302.9938400268555, + "kl_loss_2": 2842.060888671875, + "kl_loss_3": 2348.8412109375, + "kl_loss_7": 943.344677734375, + "learning_rate": 0.0009810657708433637, + "loss": 1620.3537, + "step": 970 + }, + { + "ce_loss_10": 3.6700100898742676, + "ce_loss_13": 3.538521420955658, + "ce_loss_2": 4.868229222297669, + "ce_loss_3": 4.590689539909363, + "ce_loss_7": 3.9474687933921815, + "epoch": 0.098, + "grad_norm": 808.0, + "kl_loss_10": 283.2241409301758, + "kl_loss_2": 2674.522265625, + "kl_loss_3": 2192.326556396484, + "kl_loss_7": 894.1458190917969, + "learning_rate": 0.0009806308479691594, + "loss": 1528.2636, + "step": 980 + }, + { + "ce_loss_10": 3.691223752498627, + "ce_loss_13": 3.55548814535141, + "ce_loss_2": 4.925488543510437, + "ce_loss_3": 4.648779034614563, + "ce_loss_7": 3.9924039959907534, + "epoch": 0.099, + "grad_norm": 740.0, + "kl_loss_10": 294.3150146484375, + "kl_loss_2": 2748.0041381835936, + "kl_loss_3": 2268.979638671875, + "kl_loss_7": 946.8526397705078, + "learning_rate": 0.0009801910851476522, + "loss": 1554.0744, + "step": 990 + }, + { + "ce_loss_10": 3.6008501768112184, + "ce_loss_13": 3.465990114212036, + "ce_loss_2": 4.890150642395019, + "ce_loss_3": 4.609904193878174, + "ce_loss_7": 3.9068346500396727, + "epoch": 0.1, + "grad_norm": 736.0, + "kl_loss_10": 294.7660331726074, + "kl_loss_2": 2875.2068603515627, + "kl_loss_3": 2379.5891052246093, + "kl_loss_7": 970.1351318359375, + "learning_rate": 0.0009797464868072487, + "loss": 1582.4648, + "step": 1000 + }, + { + "ce_loss_10": 3.5892885446548464, + "ce_loss_13": 3.454503262042999, + "ce_loss_2": 4.837452292442322, + "ce_loss_3": 4.55982882976532, + "ce_loss_7": 3.887318527698517, + "epoch": 0.101, + "grad_norm": 724.0, + "kl_loss_10": 288.82502670288085, + "kl_loss_2": 2762.65830078125, + "kl_loss_3": 2282.756170654297, + "kl_loss_7": 944.8302276611328, + "learning_rate": 0.0009792970574250492, + "loss": 1564.9662, + "step": 1010 + }, + { + "ce_loss_10": 3.6221608400344847, + "ce_loss_13": 3.482994794845581, + "ce_loss_2": 4.848793458938599, + "ce_loss_3": 4.575083756446839, + "ce_loss_7": 3.914657413959503, + "epoch": 0.102, + "grad_norm": 612.0, + "kl_loss_10": 290.8812942504883, + "kl_loss_2": 2743.8400146484373, + "kl_loss_3": 2261.9089599609374, + "kl_loss_7": 937.5250091552734, + "learning_rate": 0.0009788428015268028, + "loss": 1536.8119, + "step": 1020 + }, + { + "ce_loss_10": 3.6110181331634523, + "ce_loss_13": 3.47798638343811, + "ce_loss_2": 4.840990829467773, + "ce_loss_3": 4.55189163684845, + "ce_loss_7": 3.9010056853294373, + "epoch": 0.103, + "grad_norm": 616.0, + "kl_loss_10": 281.37939529418946, + "kl_loss_2": 2739.4623291015623, + "kl_loss_3": 2238.093048095703, + "kl_loss_7": 923.4858306884765, + "learning_rate": 0.0009783837236868609, + "loss": 1534.7721, + "step": 1030 + }, + { + "ce_loss_10": 3.5802615523338317, + "ce_loss_13": 3.4459127306938173, + "ce_loss_2": 4.818247056007385, + "ce_loss_3": 4.546270060539245, + "ce_loss_7": 3.8740112662315367, + "epoch": 0.104, + "grad_norm": 696.0, + "kl_loss_10": 281.4418014526367, + "kl_loss_2": 2719.910290527344, + "kl_loss_3": 2248.530157470703, + "kl_loss_7": 921.926953125, + "learning_rate": 0.0009779198285281327, + "loss": 1537.119, + "step": 1040 + }, + { + "ce_loss_10": 3.577412283420563, + "ce_loss_13": 3.4400023460388183, + "ce_loss_2": 4.825755000114441, + "ce_loss_3": 4.554906344413757, + "ce_loss_7": 3.8695693135261537, + "epoch": 0.105, + "grad_norm": 784.0, + "kl_loss_10": 293.84764709472654, + "kl_loss_2": 2770.2111328125, + "kl_loss_3": 2280.982073974609, + "kl_loss_7": 916.6518432617188, + "learning_rate": 0.0009774511207220368, + "loss": 1562.095, + "step": 1050 + }, + { + "ce_loss_10": 3.621231746673584, + "ce_loss_13": 3.4823400259017943, + "ce_loss_2": 4.867471241950989, + "ce_loss_3": 4.584862947463989, + "ce_loss_7": 3.895237350463867, + "epoch": 0.106, + "grad_norm": 588.0, + "kl_loss_10": 306.07321014404295, + "kl_loss_2": 2785.361218261719, + "kl_loss_3": 2286.776574707031, + "kl_loss_7": 918.4756744384765, + "learning_rate": 0.0009769776049884564, + "loss": 1554.5619, + "step": 1060 + }, + { + "ce_loss_10": 3.5330086588859557, + "ce_loss_13": 3.387469935417175, + "ce_loss_2": 4.804182314872742, + "ce_loss_3": 4.539949297904968, + "ce_loss_7": 3.8264609456062315, + "epoch": 0.107, + "grad_norm": 1184.0, + "kl_loss_10": 307.66697082519534, + "kl_loss_2": 2836.2517578125, + "kl_loss_3": 2373.5376220703124, + "kl_loss_7": 943.6192169189453, + "learning_rate": 0.0009764992860956889, + "loss": 1622.7785, + "step": 1070 + }, + { + "ce_loss_10": 3.677293050289154, + "ce_loss_13": 3.5469510316848756, + "ce_loss_2": 4.837077927589417, + "ce_loss_3": 4.588571333885193, + "ce_loss_7": 3.9465363740921022, + "epoch": 0.108, + "grad_norm": 816.0, + "kl_loss_10": 286.8066802978516, + "kl_loss_2": 2605.4248657226562, + "kl_loss_3": 2175.9279296875, + "kl_loss_7": 899.353060913086, + "learning_rate": 0.0009760161688604008, + "loss": 1520.9383, + "step": 1080 + }, + { + "ce_loss_10": 3.6768419981002807, + "ce_loss_13": 3.54748477935791, + "ce_loss_2": 4.881722617149353, + "ce_loss_3": 4.620517659187317, + "ce_loss_7": 3.9953080892562864, + "epoch": 0.109, + "grad_norm": 840.0, + "kl_loss_10": 283.82303619384766, + "kl_loss_2": 2660.0453125, + "kl_loss_3": 2210.3756591796873, + "kl_loss_7": 954.3282287597656, + "learning_rate": 0.0009755282581475768, + "loss": 1552.3523, + "step": 1090 + }, + { + "ce_loss_10": 3.742873156070709, + "ce_loss_13": 3.60170716047287, + "ce_loss_2": 4.9219562292099, + "ce_loss_3": 4.631097722053528, + "ce_loss_7": 4.016275346279144, + "epoch": 0.11, + "grad_norm": 792.0, + "kl_loss_10": 295.95645599365236, + "kl_loss_2": 2660.5046997070312, + "kl_loss_3": 2150.144982910156, + "kl_loss_7": 938.1217224121094, + "learning_rate": 0.0009750355588704727, + "loss": 1496.9391, + "step": 1100 + }, + { + "ce_loss_10": 3.5732216477394103, + "ce_loss_13": 3.427997899055481, + "ce_loss_2": 4.788290286064148, + "ce_loss_3": 4.501250839233398, + "ce_loss_7": 3.849083948135376, + "epoch": 0.111, + "grad_norm": 644.0, + "kl_loss_10": 301.9219177246094, + "kl_loss_2": 2692.5292846679686, + "kl_loss_3": 2192.219659423828, + "kl_loss_7": 902.1104858398437, + "learning_rate": 0.0009745380759905647, + "loss": 1547.9881, + "step": 1110 + }, + { + "ce_loss_10": 3.525436317920685, + "ce_loss_13": 3.388839864730835, + "ce_loss_2": 4.766349339485169, + "ce_loss_3": 4.478921818733215, + "ce_loss_7": 3.8117297768592833, + "epoch": 0.112, + "grad_norm": 636.0, + "kl_loss_10": 288.7658378601074, + "kl_loss_2": 2767.7005126953127, + "kl_loss_3": 2266.3693115234373, + "kl_loss_7": 916.3693817138671, + "learning_rate": 0.0009740358145174998, + "loss": 1582.2694, + "step": 1120 + }, + { + "ce_loss_10": 3.674707901477814, + "ce_loss_13": 3.541641688346863, + "ce_loss_2": 4.839509201049805, + "ce_loss_3": 4.554335117340088, + "ce_loss_7": 3.9309728384017943, + "epoch": 0.113, + "grad_norm": 740.0, + "kl_loss_10": 293.9353363037109, + "kl_loss_2": 2627.9318603515626, + "kl_loss_3": 2118.4319458007812, + "kl_loss_7": 883.5943176269532, + "learning_rate": 0.0009735287795090455, + "loss": 1505.1257, + "step": 1130 + }, + { + "ce_loss_10": 3.5646776437759398, + "ce_loss_13": 3.4284933686256407, + "ce_loss_2": 4.8010115146636965, + "ce_loss_3": 4.510753107070923, + "ce_loss_7": 3.839044988155365, + "epoch": 0.114, + "grad_norm": 692.0, + "kl_loss_10": 289.08748931884764, + "kl_loss_2": 2724.9734130859374, + "kl_loss_3": 2216.0496459960937, + "kl_loss_7": 891.9239013671875, + "learning_rate": 0.0009730169760710386, + "loss": 1526.1704, + "step": 1140 + }, + { + "ce_loss_10": 3.647395300865173, + "ce_loss_13": 3.51713547706604, + "ce_loss_2": 4.854923152923584, + "ce_loss_3": 4.577234363555908, + "ce_loss_7": 3.928617572784424, + "epoch": 0.115, + "grad_norm": 800.0, + "kl_loss_10": 280.8671928405762, + "kl_loss_2": 2669.748742675781, + "kl_loss_3": 2182.849468994141, + "kl_loss_7": 895.4142913818359, + "learning_rate": 0.0009725004093573342, + "loss": 1526.191, + "step": 1150 + }, + { + "ce_loss_10": 3.5862129092216493, + "ce_loss_13": 3.4506229400634765, + "ce_loss_2": 4.798703122138977, + "ce_loss_3": 4.520225930213928, + "ce_loss_7": 3.877876043319702, + "epoch": 0.116, + "grad_norm": 840.0, + "kl_loss_10": 283.1919075012207, + "kl_loss_2": 2672.7715087890624, + "kl_loss_3": 2193.748876953125, + "kl_loss_7": 903.5404602050781, + "learning_rate": 0.0009719790845697534, + "loss": 1504.2701, + "step": 1160 + }, + { + "ce_loss_10": 3.5309566259384155, + "ce_loss_13": 3.4061906576156615, + "ce_loss_2": 4.704360723495483, + "ce_loss_3": 4.450148797035217, + "ce_loss_7": 3.803053593635559, + "epoch": 0.117, + "grad_norm": 696.0, + "kl_loss_10": 271.77204208374025, + "kl_loss_2": 2620.2907592773436, + "kl_loss_3": 2176.4858520507814, + "kl_loss_7": 863.540869140625, + "learning_rate": 0.0009714530069580309, + "loss": 1485.2044, + "step": 1170 + }, + { + "ce_loss_10": 3.640796720981598, + "ce_loss_13": 3.507249903678894, + "ce_loss_2": 4.853176116943359, + "ce_loss_3": 4.5715264797210695, + "ce_loss_7": 3.914932680130005, + "epoch": 0.118, + "grad_norm": 716.0, + "kl_loss_10": 285.63293685913084, + "kl_loss_2": 2675.1877319335936, + "kl_loss_3": 2189.7057678222654, + "kl_loss_7": 884.98447265625, + "learning_rate": 0.0009709221818197624, + "loss": 1502.0164, + "step": 1180 + }, + { + "ce_loss_10": 3.6675962805747986, + "ce_loss_13": 3.534939968585968, + "ce_loss_2": 4.88215401172638, + "ce_loss_3": 4.607950353622437, + "ce_loss_7": 3.9379210352897642, + "epoch": 0.119, + "grad_norm": 596.0, + "kl_loss_10": 288.61556854248045, + "kl_loss_2": 2711.667822265625, + "kl_loss_3": 2227.545977783203, + "kl_loss_7": 887.7295013427735, + "learning_rate": 0.0009703866145003512, + "loss": 1525.4232, + "step": 1190 + }, + { + "ce_loss_10": 3.6349379420280457, + "ce_loss_13": 3.5029913663864134, + "ce_loss_2": 4.829423713684082, + "ce_loss_3": 4.558488368988037, + "ce_loss_7": 3.908590841293335, + "epoch": 0.12, + "grad_norm": 660.0, + "kl_loss_10": 279.50138397216796, + "kl_loss_2": 2676.350244140625, + "kl_loss_3": 2191.2725830078125, + "kl_loss_7": 882.7566497802734, + "learning_rate": 0.0009698463103929542, + "loss": 1529.4317, + "step": 1200 + }, + { + "ce_loss_10": 3.605515944957733, + "ce_loss_13": 3.472998011112213, + "ce_loss_2": 4.827000212669373, + "ce_loss_3": 4.540698933601379, + "ce_loss_7": 3.879436028003693, + "epoch": 0.121, + "grad_norm": 652.0, + "kl_loss_10": 281.2242576599121, + "kl_loss_2": 2695.142529296875, + "kl_loss_3": 2191.8710388183595, + "kl_loss_7": 882.5638031005859, + "learning_rate": 0.0009693012749384279, + "loss": 1527.1828, + "step": 1210 + }, + { + "ce_loss_10": 3.617890453338623, + "ce_loss_13": 3.4903839349746706, + "ce_loss_2": 4.823957228660584, + "ce_loss_3": 4.546852803230285, + "ce_loss_7": 3.8918931126594543, + "epoch": 0.122, + "grad_norm": 596.0, + "kl_loss_10": 274.6055084228516, + "kl_loss_2": 2677.51435546875, + "kl_loss_3": 2182.2475463867186, + "kl_loss_7": 884.2765747070313, + "learning_rate": 0.0009687515136252732, + "loss": 1502.8832, + "step": 1220 + }, + { + "ce_loss_10": 3.571158289909363, + "ce_loss_13": 3.4428164839744566, + "ce_loss_2": 4.832195687294006, + "ce_loss_3": 4.558122348785401, + "ce_loss_7": 3.866991031169891, + "epoch": 0.123, + "grad_norm": 656.0, + "kl_loss_10": 285.63698654174806, + "kl_loss_2": 2814.325549316406, + "kl_loss_3": 2321.4359924316404, + "kl_loss_7": 924.2180969238282, + "learning_rate": 0.0009681970319895803, + "loss": 1610.0467, + "step": 1230 + }, + { + "ce_loss_10": 3.6617783904075623, + "ce_loss_13": 3.5239094376564024, + "ce_loss_2": 4.840570569038391, + "ce_loss_3": 4.5621686458587645, + "ce_loss_7": 3.9261529445648193, + "epoch": 0.124, + "grad_norm": 660.0, + "kl_loss_10": 282.9206481933594, + "kl_loss_2": 2658.744873046875, + "kl_loss_3": 2162.91650390625, + "kl_loss_7": 894.8567260742187, + "learning_rate": 0.0009676378356149733, + "loss": 1510.0703, + "step": 1240 + }, + { + "ce_loss_10": 3.632222390174866, + "ce_loss_13": 3.49722797870636, + "ce_loss_2": 4.803181719779968, + "ce_loss_3": 4.527125644683838, + "ce_loss_7": 3.893145501613617, + "epoch": 0.125, + "grad_norm": 676.0, + "kl_loss_10": 306.93408966064453, + "kl_loss_2": 2618.3517578125, + "kl_loss_3": 2135.0671936035155, + "kl_loss_7": 870.7611785888672, + "learning_rate": 0.0009670739301325534, + "loss": 1495.915, + "step": 1250 + }, + { + "ce_loss_10": 3.5965846180915833, + "ce_loss_13": 3.461331534385681, + "ce_loss_2": 4.77229871749878, + "ce_loss_3": 4.488967990875244, + "ce_loss_7": 3.870732378959656, + "epoch": 0.126, + "grad_norm": 824.0, + "kl_loss_10": 288.87402572631834, + "kl_loss_2": 2631.6656005859377, + "kl_loss_3": 2132.2338745117186, + "kl_loss_7": 890.5492980957031, + "learning_rate": 0.0009665053212208426, + "loss": 1507.3391, + "step": 1260 + }, + { + "ce_loss_10": 3.6325414419174193, + "ce_loss_13": 3.5006507635116577, + "ce_loss_2": 4.82985291481018, + "ce_loss_3": 4.53967547416687, + "ce_loss_7": 3.907087206840515, + "epoch": 0.127, + "grad_norm": 824.0, + "kl_loss_10": 289.66627197265626, + "kl_loss_2": 2682.7635498046875, + "kl_loss_3": 2171.759143066406, + "kl_loss_7": 897.6279174804688, + "learning_rate": 0.0009659320146057262, + "loss": 1515.1299, + "step": 1270 + }, + { + "ce_loss_10": 3.6294240951538086, + "ce_loss_13": 3.5012729167938232, + "ce_loss_2": 4.802068519592285, + "ce_loss_3": 4.516877055168152, + "ce_loss_7": 3.912596344947815, + "epoch": 0.128, + "grad_norm": 1040.0, + "kl_loss_10": 274.3899444580078, + "kl_loss_2": 2616.2175048828126, + "kl_loss_3": 2113.874139404297, + "kl_loss_7": 894.8648956298828, + "learning_rate": 0.0009653540160603955, + "loss": 1485.5743, + "step": 1280 + }, + { + "ce_loss_10": 3.631951367855072, + "ce_loss_13": 3.5082743883132936, + "ce_loss_2": 4.7942791938781735, + "ce_loss_3": 4.533441662788391, + "ce_loss_7": 3.911020016670227, + "epoch": 0.129, + "grad_norm": 980.0, + "kl_loss_10": 277.70714950561523, + "kl_loss_2": 2607.6315795898436, + "kl_loss_3": 2154.8384338378905, + "kl_loss_7": 902.8254302978515, + "learning_rate": 0.0009647713314052896, + "loss": 1475.7309, + "step": 1290 + }, + { + "ce_loss_10": 3.5910762190818786, + "ce_loss_13": 3.4583710193634034, + "ce_loss_2": 4.806964182853699, + "ce_loss_3": 4.536605000495911, + "ce_loss_7": 3.892735993862152, + "epoch": 0.13, + "grad_norm": 1032.0, + "kl_loss_10": 281.282731628418, + "kl_loss_2": 2713.5140380859375, + "kl_loss_3": 2245.954937744141, + "kl_loss_7": 924.882958984375, + "learning_rate": 0.0009641839665080363, + "loss": 1529.1627, + "step": 1300 + }, + { + "ce_loss_10": 3.5369811177253725, + "ce_loss_13": 3.4184723615646364, + "ce_loss_2": 4.746987700462341, + "ce_loss_3": 4.47196786403656, + "ce_loss_7": 3.8142111539840697, + "epoch": 0.131, + "grad_norm": 708.0, + "kl_loss_10": 267.0766883850098, + "kl_loss_2": 2651.264123535156, + "kl_loss_3": 2169.8530151367186, + "kl_loss_7": 874.0395812988281, + "learning_rate": 0.0009635919272833937, + "loss": 1472.4912, + "step": 1310 + }, + { + "ce_loss_10": 3.582905340194702, + "ce_loss_13": 3.4547547817230226, + "ce_loss_2": 4.782030344009399, + "ce_loss_3": 4.50511953830719, + "ce_loss_7": 3.8575597286224363, + "epoch": 0.132, + "grad_norm": 640.0, + "kl_loss_10": 274.49700088500975, + "kl_loss_2": 2645.7089721679686, + "kl_loss_3": 2148.3076110839843, + "kl_loss_7": 865.2912628173829, + "learning_rate": 0.0009629952196931902, + "loss": 1461.5725, + "step": 1320 + }, + { + "ce_loss_10": 3.560918188095093, + "ce_loss_13": 3.4357552766799926, + "ce_loss_2": 4.777603983879089, + "ce_loss_3": 4.497129726409912, + "ce_loss_7": 3.8250754475593567, + "epoch": 0.133, + "grad_norm": 612.0, + "kl_loss_10": 266.5273551940918, + "kl_loss_2": 2692.33935546875, + "kl_loss_3": 2197.4263916015625, + "kl_loss_7": 846.3100128173828, + "learning_rate": 0.0009623938497462645, + "loss": 1482.4779, + "step": 1330 + }, + { + "ce_loss_10": 3.559932196140289, + "ce_loss_13": 3.4353162169456484, + "ce_loss_2": 4.754807543754578, + "ce_loss_3": 4.478498530387879, + "ce_loss_7": 3.8313623666763306, + "epoch": 0.134, + "grad_norm": 564.0, + "kl_loss_10": 268.2800895690918, + "kl_loss_2": 2653.6271240234373, + "kl_loss_3": 2162.7194641113283, + "kl_loss_7": 859.2419372558594, + "learning_rate": 0.0009617878234984055, + "loss": 1499.2066, + "step": 1340 + }, + { + "ce_loss_10": 3.651080513000488, + "ce_loss_13": 3.533881187438965, + "ce_loss_2": 4.8088576078414915, + "ce_loss_3": 4.535065650939941, + "ce_loss_7": 3.9042758703231812, + "epoch": 0.135, + "grad_norm": 712.0, + "kl_loss_10": 256.59825744628904, + "kl_loss_2": 2581.625207519531, + "kl_loss_3": 2098.4682495117186, + "kl_loss_7": 828.9938018798828, + "learning_rate": 0.0009611771470522907, + "loss": 1464.5767, + "step": 1350 + }, + { + "ce_loss_10": 3.5779558777809144, + "ce_loss_13": 3.457493555545807, + "ce_loss_2": 4.792022109031677, + "ce_loss_3": 4.514930057525635, + "ce_loss_7": 3.8448525190353395, + "epoch": 0.136, + "grad_norm": 616.0, + "kl_loss_10": 259.41123428344724, + "kl_loss_2": 2657.6331420898437, + "kl_loss_3": 2171.1466857910154, + "kl_loss_7": 847.0537750244141, + "learning_rate": 0.0009605618265574251, + "loss": 1459.6229, + "step": 1360 + }, + { + "ce_loss_10": 3.5429495334625245, + "ce_loss_13": 3.4162652492523193, + "ce_loss_2": 4.794952082633972, + "ce_loss_3": 4.535301685333252, + "ce_loss_7": 3.8165592908859254, + "epoch": 0.137, + "grad_norm": 620.0, + "kl_loss_10": 271.0598449707031, + "kl_loss_2": 2776.145849609375, + "kl_loss_3": 2325.675885009766, + "kl_loss_7": 881.587744140625, + "learning_rate": 0.0009599418682100792, + "loss": 1522.4414, + "step": 1370 + }, + { + "ce_loss_10": 3.58179566860199, + "ce_loss_13": 3.459395945072174, + "ce_loss_2": 4.792193937301636, + "ce_loss_3": 4.521099305152893, + "ce_loss_7": 3.84169602394104, + "epoch": 0.138, + "grad_norm": 724.0, + "kl_loss_10": 257.83258666992185, + "kl_loss_2": 2672.4068237304687, + "kl_loss_3": 2198.559918212891, + "kl_loss_7": 850.8091857910156, + "learning_rate": 0.0009593172782532268, + "loss": 1496.2724, + "step": 1380 + }, + { + "ce_loss_10": 3.622367191314697, + "ce_loss_13": 3.506042146682739, + "ce_loss_2": 4.801430583000183, + "ce_loss_3": 4.530508184432984, + "ce_loss_7": 3.888216722011566, + "epoch": 0.139, + "grad_norm": 672.0, + "kl_loss_10": 260.9531532287598, + "kl_loss_2": 2599.6354858398436, + "kl_loss_3": 2121.937152099609, + "kl_loss_7": 852.8548278808594, + "learning_rate": 0.0009586880629764817, + "loss": 1464.8023, + "step": 1390 + }, + { + "ce_loss_10": 3.546726655960083, + "ce_loss_13": 3.428490459918976, + "ce_loss_2": 4.748290467262268, + "ce_loss_3": 4.471861267089844, + "ce_loss_7": 3.824984240531921, + "epoch": 0.14, + "grad_norm": 620.0, + "kl_loss_10": 260.18620986938475, + "kl_loss_2": 2649.2240234375, + "kl_loss_3": 2164.870428466797, + "kl_loss_7": 870.0703582763672, + "learning_rate": 0.0009580542287160348, + "loss": 1462.9275, + "step": 1400 + }, + { + "ce_loss_10": 3.5134201645851135, + "ce_loss_13": 3.396924638748169, + "ce_loss_2": 4.727832221984864, + "ce_loss_3": 4.457144689559937, + "ce_loss_7": 3.781324291229248, + "epoch": 0.141, + "grad_norm": 724.0, + "kl_loss_10": 257.8106407165527, + "kl_loss_2": 2672.565283203125, + "kl_loss_3": 2194.398052978516, + "kl_loss_7": 841.9467041015625, + "learning_rate": 0.0009574157818545901, + "loss": 1469.0121, + "step": 1410 + }, + { + "ce_loss_10": 3.583372378349304, + "ce_loss_13": 3.4670314311981203, + "ce_loss_2": 4.753075981140137, + "ce_loss_3": 4.488786149024963, + "ce_loss_7": 3.8414045095443727, + "epoch": 0.142, + "grad_norm": 768.0, + "kl_loss_10": 250.4652572631836, + "kl_loss_2": 2575.260546875, + "kl_loss_3": 2109.250030517578, + "kl_loss_7": 815.4136535644532, + "learning_rate": 0.0009567727288213005, + "loss": 1470.4241, + "step": 1420 + }, + { + "ce_loss_10": 3.5615610837936402, + "ce_loss_13": 3.4428680539131165, + "ce_loss_2": 4.766120481491089, + "ce_loss_3": 4.489290237426758, + "ce_loss_7": 3.8387726664543154, + "epoch": 0.143, + "grad_norm": 680.0, + "kl_loss_10": 259.5032684326172, + "kl_loss_2": 2652.6231079101562, + "kl_loss_3": 2168.8318054199217, + "kl_loss_7": 872.5292297363281, + "learning_rate": 0.0009561250760917027, + "loss": 1465.2545, + "step": 1430 + }, + { + "ce_loss_10": 3.5825438022613527, + "ce_loss_13": 3.4635141372680662, + "ce_loss_2": 4.774414443969727, + "ce_loss_3": 4.498082184791565, + "ce_loss_7": 3.8522005438804627, + "epoch": 0.144, + "grad_norm": 656.0, + "kl_loss_10": 263.3311599731445, + "kl_loss_2": 2662.4484375, + "kl_loss_3": 2176.186492919922, + "kl_loss_7": 865.9247039794922, + "learning_rate": 0.0009554728301876525, + "loss": 1454.278, + "step": 1440 + }, + { + "ce_loss_10": 3.6376792669296263, + "ce_loss_13": 3.515091061592102, + "ce_loss_2": 4.810996460914612, + "ce_loss_3": 4.536413979530335, + "ce_loss_7": 3.9078781604766846, + "epoch": 0.145, + "grad_norm": 616.0, + "kl_loss_10": 259.68054962158203, + "kl_loss_2": 2600.8175415039063, + "kl_loss_3": 2120.5454040527343, + "kl_loss_7": 864.2900634765625, + "learning_rate": 0.0009548159976772592, + "loss": 1508.1567, + "step": 1450 + }, + { + "ce_loss_10": 3.5796504259109496, + "ce_loss_13": 3.456580376625061, + "ce_loss_2": 4.787333536148071, + "ce_loss_3": 4.520044946670533, + "ce_loss_7": 3.8587978959083555, + "epoch": 0.146, + "grad_norm": 624.0, + "kl_loss_10": 265.1648490905762, + "kl_loss_2": 2666.8885864257813, + "kl_loss_3": 2195.818231201172, + "kl_loss_7": 871.2362884521484, + "learning_rate": 0.0009541545851748186, + "loss": 1477.8201, + "step": 1460 + }, + { + "ce_loss_10": 3.4508144855499268, + "ce_loss_13": 3.3300524830818174, + "ce_loss_2": 4.699088287353516, + "ce_loss_3": 4.421405148506165, + "ce_loss_7": 3.735712671279907, + "epoch": 0.147, + "grad_norm": 844.0, + "kl_loss_10": 262.5924041748047, + "kl_loss_2": 2730.21630859375, + "kl_loss_3": 2243.504345703125, + "kl_loss_7": 878.0860382080078, + "learning_rate": 0.0009534885993407473, + "loss": 1496.8188, + "step": 1470 + }, + { + "ce_loss_10": 3.611809027194977, + "ce_loss_13": 3.4930022716522218, + "ce_loss_2": 4.806360912322998, + "ce_loss_3": 4.5402860879898075, + "ce_loss_7": 3.8858142852783204, + "epoch": 0.148, + "grad_norm": 740.0, + "kl_loss_10": 256.4318244934082, + "kl_loss_2": 2655.588269042969, + "kl_loss_3": 2175.6559936523436, + "kl_loss_7": 861.8565673828125, + "learning_rate": 0.0009528180468815154, + "loss": 1488.9336, + "step": 1480 + }, + { + "ce_loss_10": 3.6558565139770507, + "ce_loss_13": 3.538043713569641, + "ce_loss_2": 4.811466526985169, + "ce_loss_3": 4.544855618476868, + "ce_loss_7": 3.9390755891799927, + "epoch": 0.149, + "grad_norm": 844.0, + "kl_loss_10": 264.13821868896486, + "kl_loss_2": 2565.1232788085936, + "kl_loss_3": 2095.556463623047, + "kl_loss_7": 897.2646911621093, + "learning_rate": 0.0009521429345495787, + "loss": 1465.2869, + "step": 1490 + }, + { + "ce_loss_10": 3.646085023880005, + "ce_loss_13": 3.5196659207344054, + "ce_loss_2": 4.780038499832154, + "ce_loss_3": 4.50464768409729, + "ce_loss_7": 3.927055561542511, + "epoch": 0.15, + "grad_norm": 980.0, + "kl_loss_10": 266.5307144165039, + "kl_loss_2": 2540.4637084960937, + "kl_loss_3": 2068.8232849121096, + "kl_loss_7": 888.35068359375, + "learning_rate": 0.0009514632691433108, + "loss": 1455.9041, + "step": 1500 + }, + { + "ce_loss_10": 3.5988011956214905, + "ce_loss_13": 3.482589673995972, + "ce_loss_2": 4.76681923866272, + "ce_loss_3": 4.485762524604797, + "ce_loss_7": 3.8728180885314942, + "epoch": 0.151, + "grad_norm": 600.0, + "kl_loss_10": 260.8206298828125, + "kl_loss_2": 2587.971142578125, + "kl_loss_3": 2094.4052795410157, + "kl_loss_7": 863.3080963134765, + "learning_rate": 0.0009507790575069346, + "loss": 1457.9502, + "step": 1510 + }, + { + "ce_loss_10": 3.5764056205749513, + "ce_loss_13": 3.453061044216156, + "ce_loss_2": 4.775901889801025, + "ce_loss_3": 4.500339031219482, + "ce_loss_7": 3.849775242805481, + "epoch": 0.152, + "grad_norm": 672.0, + "kl_loss_10": 258.1785354614258, + "kl_loss_2": 2655.3977172851564, + "kl_loss_3": 2164.0363708496093, + "kl_loss_7": 857.1902496337891, + "learning_rate": 0.0009500903065304539, + "loss": 1495.6711, + "step": 1520 + }, + { + "ce_loss_10": 3.608113396167755, + "ce_loss_13": 3.498811888694763, + "ce_loss_2": 4.760950970649719, + "ce_loss_3": 4.486514663696289, + "ce_loss_7": 3.8602269887924194, + "epoch": 0.153, + "grad_norm": 664.0, + "kl_loss_10": 245.0189353942871, + "kl_loss_2": 2552.576379394531, + "kl_loss_3": 2060.3522766113283, + "kl_loss_7": 806.0807342529297, + "learning_rate": 0.0009493970231495835, + "loss": 1444.8406, + "step": 1530 + }, + { + "ce_loss_10": 3.547162938117981, + "ce_loss_13": 3.44165985584259, + "ce_loss_2": 4.701804065704346, + "ce_loss_3": 4.424288666248321, + "ce_loss_7": 3.8007919073104857, + "epoch": 0.154, + "grad_norm": 648.0, + "kl_loss_10": 241.08162002563478, + "kl_loss_2": 2573.9149780273438, + "kl_loss_3": 2088.7893615722655, + "kl_loss_7": 812.2064361572266, + "learning_rate": 0.0009486992143456792, + "loss": 1427.6314, + "step": 1540 + }, + { + "ce_loss_10": 3.5828447937965393, + "ce_loss_13": 3.4581031084060667, + "ce_loss_2": 4.834084248542785, + "ce_loss_3": 4.553447818756103, + "ce_loss_7": 3.8656217455863953, + "epoch": 0.155, + "grad_norm": 660.0, + "kl_loss_10": 263.4285690307617, + "kl_loss_2": 2765.8754150390623, + "kl_loss_3": 2266.0636291503906, + "kl_loss_7": 882.9553771972656, + "learning_rate": 0.0009479968871456679, + "loss": 1498.7352, + "step": 1550 + }, + { + "ce_loss_10": 3.547222447395325, + "ce_loss_13": 3.4320276618003844, + "ce_loss_2": 4.768963408470154, + "ce_loss_3": 4.480878567695617, + "ce_loss_7": 3.828988194465637, + "epoch": 0.156, + "grad_norm": 760.0, + "kl_loss_10": 259.2473831176758, + "kl_loss_2": 2697.095703125, + "kl_loss_3": 2199.6418579101564, + "kl_loss_7": 874.4399932861328, + "learning_rate": 0.0009472900486219768, + "loss": 1467.8941, + "step": 1560 + }, + { + "ce_loss_10": 3.54234699010849, + "ce_loss_13": 3.416030561923981, + "ce_loss_2": 4.702804708480835, + "ce_loss_3": 4.435993790626526, + "ce_loss_7": 3.811834490299225, + "epoch": 0.157, + "grad_norm": 996.0, + "kl_loss_10": 266.18832244873045, + "kl_loss_2": 2600.0114868164064, + "kl_loss_3": 2127.867303466797, + "kl_loss_7": 872.285043334961, + "learning_rate": 0.000946578705892462, + "loss": 1470.9803, + "step": 1570 + }, + { + "ce_loss_10": 3.5780028820037844, + "ce_loss_13": 3.457060468196869, + "ce_loss_2": 4.7225889444351195, + "ce_loss_3": 4.4475972890853885, + "ce_loss_7": 3.8437977194786073, + "epoch": 0.158, + "grad_norm": 804.0, + "kl_loss_10": 277.74141387939454, + "kl_loss_2": 2544.743029785156, + "kl_loss_3": 2066.6833618164064, + "kl_loss_7": 835.531689453125, + "learning_rate": 0.0009458628661203367, + "loss": 1460.073, + "step": 1580 + }, + { + "ce_loss_10": 3.5895689606666563, + "ce_loss_13": 3.4543488025665283, + "ce_loss_2": 4.777545094490051, + "ce_loss_3": 4.501162362098694, + "ce_loss_7": 3.846569240093231, + "epoch": 0.159, + "grad_norm": 748.0, + "kl_loss_10": 280.14837341308595, + "kl_loss_2": 2651.9770629882814, + "kl_loss_3": 2168.784558105469, + "kl_loss_7": 873.0370697021484, + "learning_rate": 0.0009451425365140996, + "loss": 1445.3969, + "step": 1590 + }, + { + "ce_loss_10": 3.6579004883766175, + "ce_loss_13": 3.5379099249839783, + "ce_loss_2": 4.773951435089112, + "ce_loss_3": 4.50466411113739, + "ce_loss_7": 3.922029638290405, + "epoch": 0.16, + "grad_norm": 728.0, + "kl_loss_10": 273.34312896728517, + "kl_loss_2": 2508.3281860351562, + "kl_loss_3": 2021.1453735351563, + "kl_loss_7": 841.1831604003906, + "learning_rate": 0.0009444177243274617, + "loss": 1408.8492, + "step": 1600 + }, + { + "ce_loss_10": 3.514503800868988, + "ce_loss_13": 3.388037991523743, + "ce_loss_2": 4.701039886474609, + "ce_loss_3": 4.418904185295105, + "ce_loss_7": 3.7763099312782287, + "epoch": 0.161, + "grad_norm": 704.0, + "kl_loss_10": 268.4227348327637, + "kl_loss_2": 2642.4529418945312, + "kl_loss_3": 2156.727893066406, + "kl_loss_7": 856.544287109375, + "learning_rate": 0.0009436884368592739, + "loss": 1462.7545, + "step": 1610 + }, + { + "ce_loss_10": 3.55902304649353, + "ce_loss_13": 3.441978645324707, + "ce_loss_2": 4.705282998085022, + "ce_loss_3": 4.427343964576721, + "ce_loss_7": 3.810055124759674, + "epoch": 0.162, + "grad_norm": 692.0, + "kl_loss_10": 253.71325302124023, + "kl_loss_2": 2545.9316528320314, + "kl_loss_3": 2055.34326171875, + "kl_loss_7": 814.7054443359375, + "learning_rate": 0.0009429546814534529, + "loss": 1452.6556, + "step": 1620 + }, + { + "ce_loss_10": 3.567894661426544, + "ce_loss_13": 3.4576117157936097, + "ce_loss_2": 4.725762176513672, + "ce_loss_3": 4.453288149833679, + "ce_loss_7": 3.8241241455078123, + "epoch": 0.163, + "grad_norm": 600.0, + "kl_loss_10": 249.5528419494629, + "kl_loss_2": 2561.039794921875, + "kl_loss_3": 2084.6285034179687, + "kl_loss_7": 811.9796569824218, + "learning_rate": 0.0009422164654989072, + "loss": 1405.3155, + "step": 1630 + }, + { + "ce_loss_10": 3.682257628440857, + "ce_loss_13": 3.57005797624588, + "ce_loss_2": 4.807721471786499, + "ce_loss_3": 4.5406172513961796, + "ce_loss_7": 3.9288353323936462, + "epoch": 0.164, + "grad_norm": 632.0, + "kl_loss_10": 249.95079803466797, + "kl_loss_2": 2525.5760131835937, + "kl_loss_3": 2063.9668884277344, + "kl_loss_7": 811.2918426513672, + "learning_rate": 0.0009414737964294635, + "loss": 1427.1312, + "step": 1640 + }, + { + "ce_loss_10": 3.6101224184036256, + "ce_loss_13": 3.5010381817817686, + "ce_loss_2": 4.720621514320373, + "ce_loss_3": 4.4590880393981935, + "ce_loss_7": 3.8465168356895445, + "epoch": 0.165, + "grad_norm": 592.0, + "kl_loss_10": 244.4941291809082, + "kl_loss_2": 2467.5499755859373, + "kl_loss_3": 2009.9993835449218, + "kl_loss_7": 785.2798095703125, + "learning_rate": 0.000940726681723791, + "loss": 1420.5047, + "step": 1650 + }, + { + "ce_loss_10": 3.4529421091079713, + "ce_loss_13": 3.3362591743469237, + "ce_loss_2": 4.67095410823822, + "ce_loss_3": 4.395775043964386, + "ce_loss_7": 3.7144131302833556, + "epoch": 0.166, + "grad_norm": 760.0, + "kl_loss_10": 256.26583633422854, + "kl_loss_2": 2688.0345825195313, + "kl_loss_3": 2212.7634155273436, + "kl_loss_7": 836.4253662109375, + "learning_rate": 0.0009399751289053266, + "loss": 1423.8466, + "step": 1660 + }, + { + "ce_loss_10": 3.667633831501007, + "ce_loss_13": 3.557128643989563, + "ce_loss_2": 4.805055928230286, + "ce_loss_3": 4.532746481895447, + "ce_loss_7": 3.911760663986206, + "epoch": 0.167, + "grad_norm": 700.0, + "kl_loss_10": 250.78092575073242, + "kl_loss_2": 2539.268176269531, + "kl_loss_3": 2059.193713378906, + "kl_loss_7": 797.4180786132813, + "learning_rate": 0.0009392191455421988, + "loss": 1439.8459, + "step": 1670 + }, + { + "ce_loss_10": 3.6344913125038145, + "ce_loss_13": 3.5230419993400575, + "ce_loss_2": 4.79052848815918, + "ce_loss_3": 4.512491989135742, + "ce_loss_7": 3.8782394886016847, + "epoch": 0.168, + "grad_norm": 672.0, + "kl_loss_10": 262.5000991821289, + "kl_loss_2": 2580.610949707031, + "kl_loss_3": 2102.571502685547, + "kl_loss_7": 817.8095031738281, + "learning_rate": 0.0009384587392471515, + "loss": 1409.023, + "step": 1680 + }, + { + "ce_loss_10": 3.6255006551742555, + "ce_loss_13": 3.514340567588806, + "ce_loss_2": 4.734428143501281, + "ce_loss_3": 4.468456673622131, + "ce_loss_7": 3.8644169330596925, + "epoch": 0.169, + "grad_norm": 628.0, + "kl_loss_10": 242.1350540161133, + "kl_loss_2": 2468.8160400390625, + "kl_loss_3": 2004.5263061523438, + "kl_loss_7": 785.5388061523438, + "learning_rate": 0.0009376939176774678, + "loss": 1384.7148, + "step": 1690 + }, + { + "ce_loss_10": 3.601682686805725, + "ce_loss_13": 3.4858548164367678, + "ce_loss_2": 4.752888894081115, + "ce_loss_3": 4.474552822113037, + "ce_loss_7": 3.842711091041565, + "epoch": 0.17, + "grad_norm": 636.0, + "kl_loss_10": 245.69830017089845, + "kl_loss_2": 2544.683557128906, + "kl_loss_3": 2064.160784912109, + "kl_loss_7": 792.8873626708985, + "learning_rate": 0.0009369246885348925, + "loss": 1434.5433, + "step": 1700 + }, + { + "ce_loss_10": 3.5952138662338258, + "ce_loss_13": 3.4776424884796144, + "ce_loss_2": 4.792232918739319, + "ce_loss_3": 4.513515877723694, + "ce_loss_7": 3.8616483092308043, + "epoch": 0.171, + "grad_norm": 644.0, + "kl_loss_10": 250.0074020385742, + "kl_loss_2": 2643.389611816406, + "kl_loss_3": 2155.037109375, + "kl_loss_7": 835.121694946289, + "learning_rate": 0.0009361510595655545, + "loss": 1446.8347, + "step": 1710 + }, + { + "ce_loss_10": 3.558023285865784, + "ce_loss_13": 3.438031017780304, + "ce_loss_2": 4.730398392677307, + "ce_loss_3": 4.452085471153259, + "ce_loss_7": 3.815502095222473, + "epoch": 0.172, + "grad_norm": 672.0, + "kl_loss_10": 260.42660064697264, + "kl_loss_2": 2622.6736572265627, + "kl_loss_3": 2126.2764099121096, + "kl_loss_7": 833.0841033935546, + "learning_rate": 0.0009353730385598887, + "loss": 1443.5211, + "step": 1720 + }, + { + "ce_loss_10": 3.4771748185157776, + "ce_loss_13": 3.364219045639038, + "ce_loss_2": 4.693475008010864, + "ce_loss_3": 4.410137629508972, + "ce_loss_7": 3.7466461181640627, + "epoch": 0.173, + "grad_norm": 576.0, + "kl_loss_10": 244.67605361938476, + "kl_loss_2": 2652.6466064453125, + "kl_loss_3": 2161.871044921875, + "kl_loss_7": 827.6846221923828, + "learning_rate": 0.0009345906333525581, + "loss": 1466.803, + "step": 1730 + }, + { + "ce_loss_10": 3.515894877910614, + "ce_loss_13": 3.403614568710327, + "ce_loss_2": 4.707322573661804, + "ce_loss_3": 4.422236812114716, + "ce_loss_7": 3.7741501927375793, + "epoch": 0.174, + "grad_norm": 608.0, + "kl_loss_10": 250.64810333251953, + "kl_loss_2": 2637.284143066406, + "kl_loss_3": 2135.0481140136717, + "kl_loss_7": 835.5378570556641, + "learning_rate": 0.0009338038518223745, + "loss": 1437.4744, + "step": 1740 + }, + { + "ce_loss_10": 3.5834938049316407, + "ce_loss_13": 3.468910980224609, + "ce_loss_2": 4.762004089355469, + "ce_loss_3": 4.486204957962036, + "ce_loss_7": 3.8505713820457457, + "epoch": 0.175, + "grad_norm": 652.0, + "kl_loss_10": 254.51539306640626, + "kl_loss_2": 2618.7681884765625, + "kl_loss_3": 2135.773858642578, + "kl_loss_7": 849.8012634277344, + "learning_rate": 0.0009330127018922195, + "loss": 1479.132, + "step": 1750 + }, + { + "ce_loss_10": 3.538338470458984, + "ce_loss_13": 3.4237607955932616, + "ce_loss_2": 4.7127416133880615, + "ce_loss_3": 4.443085932731629, + "ce_loss_7": 3.794516444206238, + "epoch": 0.176, + "grad_norm": 628.0, + "kl_loss_10": 245.44887084960936, + "kl_loss_2": 2605.848291015625, + "kl_loss_3": 2119.558026123047, + "kl_loss_7": 818.0517547607421, + "learning_rate": 0.0009322171915289634, + "loss": 1443.3754, + "step": 1760 + }, + { + "ce_loss_10": 3.5648101806640624, + "ce_loss_13": 3.459370458126068, + "ce_loss_2": 4.707282447814942, + "ce_loss_3": 4.433714365959167, + "ce_loss_7": 3.812247085571289, + "epoch": 0.177, + "grad_norm": 576.0, + "kl_loss_10": 245.77867431640624, + "kl_loss_2": 2558.2099365234376, + "kl_loss_3": 2069.932727050781, + "kl_loss_7": 809.8514526367187, + "learning_rate": 0.0009314173287433873, + "loss": 1402.6621, + "step": 1770 + }, + { + "ce_loss_10": 3.5681435227394105, + "ce_loss_13": 3.4554543256759644, + "ce_loss_2": 4.716624093055725, + "ce_loss_3": 4.441683101654053, + "ce_loss_7": 3.8178189396858215, + "epoch": 0.178, + "grad_norm": 704.0, + "kl_loss_10": 250.21724319458008, + "kl_loss_2": 2566.923547363281, + "kl_loss_3": 2076.703576660156, + "kl_loss_7": 808.6476196289062, + "learning_rate": 0.0009306131215901003, + "loss": 1403.6738, + "step": 1780 + }, + { + "ce_loss_10": 3.60051885843277, + "ce_loss_13": 3.4851089835166933, + "ce_loss_2": 4.74112594127655, + "ce_loss_3": 4.468067002296448, + "ce_loss_7": 3.8350677728652953, + "epoch": 0.179, + "grad_norm": 656.0, + "kl_loss_10": 247.00397262573242, + "kl_loss_2": 2550.1098876953124, + "kl_loss_3": 2071.478955078125, + "kl_loss_7": 797.3671112060547, + "learning_rate": 0.0009298045781674596, + "loss": 1386.7528, + "step": 1790 + }, + { + "ce_loss_10": 3.576521575450897, + "ce_loss_13": 3.465667748451233, + "ce_loss_2": 4.70545973777771, + "ce_loss_3": 4.437681531906128, + "ce_loss_7": 3.823224997520447, + "epoch": 0.18, + "grad_norm": 640.0, + "kl_loss_10": 245.9371192932129, + "kl_loss_2": 2516.936376953125, + "kl_loss_3": 2031.5426147460937, + "kl_loss_7": 793.7673767089843, + "learning_rate": 0.0009289917066174886, + "loss": 1415.4195, + "step": 1800 + }, + { + "ce_loss_10": 3.568215787410736, + "ce_loss_13": 3.465099549293518, + "ce_loss_2": 4.663200092315674, + "ce_loss_3": 4.39816825389862, + "ce_loss_7": 3.797432005405426, + "epoch": 0.181, + "grad_norm": 596.0, + "kl_loss_10": 236.99261932373048, + "kl_loss_2": 2444.4304931640627, + "kl_loss_3": 1977.5733642578125, + "kl_loss_7": 762.0940002441406, + "learning_rate": 0.0009281745151257945, + "loss": 1372.7959, + "step": 1810 + }, + { + "ce_loss_10": 3.589988100528717, + "ce_loss_13": 3.4779568314552307, + "ce_loss_2": 4.741122603416443, + "ce_loss_3": 4.463086032867432, + "ce_loss_7": 3.8317890048027037, + "epoch": 0.182, + "grad_norm": 576.0, + "kl_loss_10": 245.52628021240236, + "kl_loss_2": 2546.031115722656, + "kl_loss_3": 2057.9885314941407, + "kl_loss_7": 789.636849975586, + "learning_rate": 0.0009273530119214868, + "loss": 1414.9602, + "step": 1820 + }, + { + "ce_loss_10": 3.6874829173088073, + "ce_loss_13": 3.5830198526382446, + "ce_loss_2": 4.805440378189087, + "ce_loss_3": 4.542007470130921, + "ce_loss_7": 3.922217535972595, + "epoch": 0.183, + "grad_norm": 668.0, + "kl_loss_10": 240.62074966430663, + "kl_loss_2": 2477.285852050781, + "kl_loss_3": 2025.298876953125, + "kl_loss_7": 778.8258850097657, + "learning_rate": 0.0009265272052770935, + "loss": 1365.1876, + "step": 1830 + }, + { + "ce_loss_10": 3.5063879013061525, + "ce_loss_13": 3.3919414281845093, + "ce_loss_2": 4.691436982154846, + "ce_loss_3": 4.40977828502655, + "ce_loss_7": 3.7569626212120055, + "epoch": 0.184, + "grad_norm": 672.0, + "kl_loss_10": 245.37701873779298, + "kl_loss_2": 2600.8256103515623, + "kl_loss_3": 2103.731726074219, + "kl_loss_7": 796.2739471435547, + "learning_rate": 0.0009256971035084784, + "loss": 1423.7646, + "step": 1840 + }, + { + "ce_loss_10": 3.4534160137176513, + "ce_loss_13": 3.337074172496796, + "ce_loss_2": 4.650833582878112, + "ce_loss_3": 4.375414204597473, + "ce_loss_7": 3.7153374671936037, + "epoch": 0.185, + "grad_norm": 872.0, + "kl_loss_10": 253.35809020996095, + "kl_loss_2": 2636.6057983398437, + "kl_loss_3": 2149.6516052246093, + "kl_loss_7": 833.7322570800782, + "learning_rate": 0.0009248627149747573, + "loss": 1433.1182, + "step": 1850 + }, + { + "ce_loss_10": 3.6552318572998046, + "ce_loss_13": 3.5436174392700197, + "ce_loss_2": 4.771462321281433, + "ce_loss_3": 4.505353546142578, + "ce_loss_7": 3.8980504512786864, + "epoch": 0.186, + "grad_norm": 628.0, + "kl_loss_10": 244.59689865112304, + "kl_loss_2": 2502.3949462890623, + "kl_loss_3": 2027.7750183105468, + "kl_loss_7": 792.985708618164, + "learning_rate": 0.0009240240480782129, + "loss": 1402.8563, + "step": 1860 + }, + { + "ce_loss_10": 3.559772253036499, + "ce_loss_13": 3.444066059589386, + "ce_loss_2": 4.714746379852295, + "ce_loss_3": 4.439750409126281, + "ce_loss_7": 3.8083682656288147, + "epoch": 0.187, + "grad_norm": 696.0, + "kl_loss_10": 248.7159553527832, + "kl_loss_2": 2569.0499755859373, + "kl_loss_3": 2081.2441528320314, + "kl_loss_7": 799.65380859375, + "learning_rate": 0.0009231811112642122, + "loss": 1391.885, + "step": 1870 + }, + { + "ce_loss_10": 3.603023958206177, + "ce_loss_13": 3.4911489486694336, + "ce_loss_2": 4.7107093334198, + "ce_loss_3": 4.44477071762085, + "ce_loss_7": 3.8424261450767516, + "epoch": 0.188, + "grad_norm": 756.0, + "kl_loss_10": 245.3149284362793, + "kl_loss_2": 2484.4417358398437, + "kl_loss_3": 2013.6257080078126, + "kl_loss_7": 788.6129425048828, + "learning_rate": 0.0009223339130211192, + "loss": 1382.5715, + "step": 1880 + }, + { + "ce_loss_10": 3.451503300666809, + "ce_loss_13": 3.3456833481788637, + "ce_loss_2": 4.6354892492294315, + "ce_loss_3": 4.368392133712769, + "ce_loss_7": 3.7025105237960814, + "epoch": 0.189, + "grad_norm": 796.0, + "kl_loss_10": 235.8703857421875, + "kl_loss_2": 2606.5722534179686, + "kl_loss_3": 2137.981573486328, + "kl_loss_7": 795.582894897461, + "learning_rate": 0.0009214824618802108, + "loss": 1426.9203, + "step": 1890 + }, + { + "ce_loss_10": 3.633524978160858, + "ce_loss_13": 3.5242482304573057, + "ce_loss_2": 4.770471715927124, + "ce_loss_3": 4.501252269744873, + "ce_loss_7": 3.883507859706879, + "epoch": 0.19, + "grad_norm": 652.0, + "kl_loss_10": 237.73654251098634, + "kl_loss_2": 2486.365759277344, + "kl_loss_3": 2019.5263671875, + "kl_loss_7": 793.6973388671875, + "learning_rate": 0.0009206267664155906, + "loss": 1428.9256, + "step": 1900 + }, + { + "ce_loss_10": 3.5532122611999513, + "ce_loss_13": 3.443064200878143, + "ce_loss_2": 4.697825288772583, + "ce_loss_3": 4.427114844322205, + "ce_loss_7": 3.799003005027771, + "epoch": 0.191, + "grad_norm": 636.0, + "kl_loss_10": 241.10890350341796, + "kl_loss_2": 2548.555554199219, + "kl_loss_3": 2061.9953002929688, + "kl_loss_7": 794.9706726074219, + "learning_rate": 0.0009197668352441024, + "loss": 1417.5695, + "step": 1910 + }, + { + "ce_loss_10": 3.608381187915802, + "ce_loss_13": 3.4997890830039977, + "ce_loss_2": 4.748308372497559, + "ce_loss_3": 4.471417784690857, + "ce_loss_7": 3.851922130584717, + "epoch": 0.192, + "grad_norm": 636.0, + "kl_loss_10": 242.21438293457032, + "kl_loss_2": 2509.6267700195312, + "kl_loss_3": 2027.6890441894532, + "kl_loss_7": 779.7179229736328, + "learning_rate": 0.0009189026770252437, + "loss": 1396.1437, + "step": 1920 + }, + { + "ce_loss_10": 3.6384175658226012, + "ce_loss_13": 3.5275412440299987, + "ce_loss_2": 4.762041211128235, + "ce_loss_3": 4.48351948261261, + "ce_loss_7": 3.8741258502006533, + "epoch": 0.193, + "grad_norm": 688.0, + "kl_loss_10": 250.4880401611328, + "kl_loss_2": 2491.730749511719, + "kl_loss_3": 2004.1307067871094, + "kl_loss_7": 785.3524200439454, + "learning_rate": 0.000918034300461078, + "loss": 1438.3092, + "step": 1930 + }, + { + "ce_loss_10": 3.675648069381714, + "ce_loss_13": 3.555274224281311, + "ce_loss_2": 4.77588381767273, + "ce_loss_3": 4.506980061531067, + "ce_loss_7": 3.9165189027786256, + "epoch": 0.194, + "grad_norm": 1048.0, + "kl_loss_10": 251.8736488342285, + "kl_loss_2": 2458.3200805664064, + "kl_loss_3": 1995.1934143066405, + "kl_loss_7": 806.3202392578125, + "learning_rate": 0.0009171617142961477, + "loss": 1389.0176, + "step": 1940 + }, + { + "ce_loss_10": 3.623457467556, + "ce_loss_13": 3.512966477870941, + "ce_loss_2": 4.729074192047119, + "ce_loss_3": 4.464083290100097, + "ce_loss_7": 3.8867802739143373, + "epoch": 0.195, + "grad_norm": 688.0, + "kl_loss_10": 255.58710021972655, + "kl_loss_2": 2479.066796875, + "kl_loss_3": 2001.9678588867187, + "kl_loss_7": 833.6603210449218, + "learning_rate": 0.0009162849273173857, + "loss": 1403.0846, + "step": 1950 + }, + { + "ce_loss_10": 3.5657452821731566, + "ce_loss_13": 3.457024359703064, + "ce_loss_2": 4.679703283309936, + "ce_loss_3": 4.409625816345215, + "ce_loss_7": 3.8033991694450378, + "epoch": 0.196, + "grad_norm": 656.0, + "kl_loss_10": 242.9659797668457, + "kl_loss_2": 2473.7406372070313, + "kl_loss_3": 2000.534735107422, + "kl_loss_7": 783.4248046875, + "learning_rate": 0.0009154039483540273, + "loss": 1391.609, + "step": 1960 + }, + { + "ce_loss_10": 3.5444339156150817, + "ce_loss_13": 3.433286416530609, + "ce_loss_2": 4.677814674377442, + "ce_loss_3": 4.395683622360229, + "ce_loss_7": 3.784545695781708, + "epoch": 0.197, + "grad_norm": 608.0, + "kl_loss_10": 239.23334732055665, + "kl_loss_2": 2520.074182128906, + "kl_loss_3": 2031.6637634277345, + "kl_loss_7": 792.3442230224609, + "learning_rate": 0.0009145187862775209, + "loss": 1388.6972, + "step": 1970 + }, + { + "ce_loss_10": 3.572359085083008, + "ce_loss_13": 3.466273844242096, + "ce_loss_2": 4.692143273353577, + "ce_loss_3": 4.418303954601288, + "ce_loss_7": 3.8197664856910705, + "epoch": 0.198, + "grad_norm": 660.0, + "kl_loss_10": 241.7268035888672, + "kl_loss_2": 2492.987420654297, + "kl_loss_3": 2004.3476135253907, + "kl_loss_7": 794.6048614501954, + "learning_rate": 0.0009136294500014386, + "loss": 1377.9902, + "step": 1980 + }, + { + "ce_loss_10": 3.52831609249115, + "ce_loss_13": 3.4167757987976075, + "ce_loss_2": 4.705040359497071, + "ce_loss_3": 4.434882926940918, + "ce_loss_7": 3.7779016494750977, + "epoch": 0.199, + "grad_norm": 684.0, + "kl_loss_10": 242.86552047729492, + "kl_loss_2": 2578.6255493164062, + "kl_loss_3": 2108.4060180664064, + "kl_loss_7": 798.0517791748047, + "learning_rate": 0.000912735948481387, + "loss": 1426.8047, + "step": 1990 + }, + { + "ce_loss_10": 3.5601553082466126, + "ce_loss_13": 3.449883997440338, + "ce_loss_2": 4.691212105751037, + "ce_loss_3": 4.414692604541779, + "ce_loss_7": 3.8016102075576783, + "epoch": 0.2, + "grad_norm": 684.0, + "kl_loss_10": 242.28478622436523, + "kl_loss_2": 2530.514270019531, + "kl_loss_3": 2040.9486206054687, + "kl_loss_7": 800.2102844238282, + "learning_rate": 0.0009118382907149164, + "loss": 1370.7061, + "step": 2000 + }, + { + "ce_loss_10": 3.5833643674850464, + "ce_loss_13": 3.4740814447402952, + "ce_loss_2": 4.70447518825531, + "ce_loss_3": 4.429442811012268, + "ce_loss_7": 3.8237846970558165, + "epoch": 0.201, + "grad_norm": 612.0, + "kl_loss_10": 244.51040420532226, + "kl_loss_2": 2494.5580932617186, + "kl_loss_3": 2005.631494140625, + "kl_loss_7": 779.4999328613281, + "learning_rate": 0.0009109364857414306, + "loss": 1380.7336, + "step": 2010 + }, + { + "ce_loss_10": 3.5532099485397337, + "ce_loss_13": 3.4470490336418154, + "ce_loss_2": 4.681869411468506, + "ce_loss_3": 4.40200264453888, + "ce_loss_7": 3.790750026702881, + "epoch": 0.202, + "grad_norm": 608.0, + "kl_loss_10": 240.87973327636718, + "kl_loss_2": 2528.7482421875, + "kl_loss_3": 2036.1677551269531, + "kl_loss_7": 777.9466033935547, + "learning_rate": 0.0009100305426420956, + "loss": 1419.7547, + "step": 2020 + }, + { + "ce_loss_10": 3.5112710118293764, + "ce_loss_13": 3.404540646076202, + "ce_loss_2": 4.711292386054993, + "ce_loss_3": 4.432630777359009, + "ce_loss_7": 3.757065165042877, + "epoch": 0.203, + "grad_norm": 664.0, + "kl_loss_10": 238.4617919921875, + "kl_loss_2": 2652.4912963867187, + "kl_loss_3": 2152.2258422851564, + "kl_loss_7": 790.063916015625, + "learning_rate": 0.0009091204705397484, + "loss": 1413.6135, + "step": 2030 + }, + { + "ce_loss_10": 3.508105480670929, + "ce_loss_13": 3.399987006187439, + "ce_loss_2": 4.703747749328613, + "ce_loss_3": 4.428358674049377, + "ce_loss_7": 3.7540559649467466, + "epoch": 0.204, + "grad_norm": 700.0, + "kl_loss_10": 242.5270248413086, + "kl_loss_2": 2644.1144165039063, + "kl_loss_3": 2155.070489501953, + "kl_loss_7": 790.7262329101562, + "learning_rate": 0.0009082062785988049, + "loss": 1424.9719, + "step": 2040 + }, + { + "ce_loss_10": 3.638819897174835, + "ce_loss_13": 3.5337455749511717, + "ce_loss_2": 4.727799487113953, + "ce_loss_3": 4.457953143119812, + "ce_loss_7": 3.8601122856140138, + "epoch": 0.205, + "grad_norm": 668.0, + "kl_loss_10": 235.8659812927246, + "kl_loss_2": 2476.5026977539064, + "kl_loss_3": 1996.3927185058594, + "kl_loss_7": 769.8516876220704, + "learning_rate": 0.0009072879760251679, + "loss": 1387.9949, + "step": 2050 + }, + { + "ce_loss_10": 3.5858229279518126, + "ce_loss_13": 3.475198233127594, + "ce_loss_2": 4.739975643157959, + "ce_loss_3": 4.475312519073486, + "ce_loss_7": 3.834290158748627, + "epoch": 0.206, + "grad_norm": 700.0, + "kl_loss_10": 239.9431396484375, + "kl_loss_2": 2570.9485107421874, + "kl_loss_3": 2100.634240722656, + "kl_loss_7": 789.2198791503906, + "learning_rate": 0.0009063655720661341, + "loss": 1402.2605, + "step": 2060 + }, + { + "ce_loss_10": 3.6313581228256226, + "ce_loss_13": 3.5262081384658814, + "ce_loss_2": 4.7349327325820925, + "ce_loss_3": 4.470538520812989, + "ce_loss_7": 3.864632213115692, + "epoch": 0.207, + "grad_norm": 580.0, + "kl_loss_10": 238.97062911987305, + "kl_loss_2": 2454.8896240234376, + "kl_loss_3": 1987.1748107910157, + "kl_loss_7": 776.5097869873047, + "learning_rate": 0.000905439076010301, + "loss": 1376.7035, + "step": 2070 + }, + { + "ce_loss_10": 3.5894328594207763, + "ce_loss_13": 3.4751851201057433, + "ce_loss_2": 4.723314690589905, + "ce_loss_3": 4.451727390289307, + "ce_loss_7": 3.830363655090332, + "epoch": 0.208, + "grad_norm": 620.0, + "kl_loss_10": 243.43872604370117, + "kl_loss_2": 2525.0844848632814, + "kl_loss_3": 2046.1018615722655, + "kl_loss_7": 793.8133911132812, + "learning_rate": 0.0009045084971874737, + "loss": 1367.5893, + "step": 2080 + }, + { + "ce_loss_10": 3.5676583290100097, + "ce_loss_13": 3.452693998813629, + "ce_loss_2": 4.699956917762757, + "ce_loss_3": 4.424242115020752, + "ce_loss_7": 3.806007480621338, + "epoch": 0.209, + "grad_norm": 688.0, + "kl_loss_10": 249.41274871826172, + "kl_loss_2": 2529.7607299804686, + "kl_loss_3": 2042.927227783203, + "kl_loss_7": 789.6784515380859, + "learning_rate": 0.0009035738449685707, + "loss": 1418.6186, + "step": 2090 + }, + { + "ce_loss_10": 3.510753297805786, + "ce_loss_13": 3.3990254640579223, + "ce_loss_2": 4.691071200370788, + "ce_loss_3": 4.41790235042572, + "ce_loss_7": 3.7591400265693666, + "epoch": 0.21, + "grad_norm": 600.0, + "kl_loss_10": 248.95298919677734, + "kl_loss_2": 2601.993273925781, + "kl_loss_3": 2124.828485107422, + "kl_loss_7": 799.376498413086, + "learning_rate": 0.0009026351287655293, + "loss": 1399.0971, + "step": 2100 + }, + { + "ce_loss_10": 3.697406494617462, + "ce_loss_13": 3.5970078110694885, + "ce_loss_2": 4.7389151096344, + "ce_loss_3": 4.481091260910034, + "ce_loss_7": 3.9209010362625123, + "epoch": 0.211, + "grad_norm": 600.0, + "kl_loss_10": 229.3176498413086, + "kl_loss_2": 2353.4455688476564, + "kl_loss_3": 1885.3362854003906, + "kl_loss_7": 749.6781646728516, + "learning_rate": 0.0009016923580312113, + "loss": 1321.2097, + "step": 2110 + }, + { + "ce_loss_10": 3.565862810611725, + "ce_loss_13": 3.4591265320777893, + "ce_loss_2": 4.665999031066894, + "ce_loss_3": 4.391572332382202, + "ce_loss_7": 3.7967191696166993, + "epoch": 0.212, + "grad_norm": 732.0, + "kl_loss_10": 243.075350189209, + "kl_loss_2": 2458.2255859375, + "kl_loss_3": 1975.4440124511718, + "kl_loss_7": 771.7640777587891, + "learning_rate": 0.0009007455422593077, + "loss": 1392.0321, + "step": 2120 + }, + { + "ce_loss_10": 3.574350452423096, + "ce_loss_13": 3.4604103803634643, + "ce_loss_2": 4.7152410507202145, + "ce_loss_3": 4.439797115325928, + "ce_loss_7": 3.8057913303375246, + "epoch": 0.213, + "grad_norm": 652.0, + "kl_loss_10": 251.99988250732423, + "kl_loss_2": 2551.55615234375, + "kl_loss_3": 2068.113671875, + "kl_loss_7": 789.3677795410156, + "learning_rate": 0.0008997946909842425, + "loss": 1402.5921, + "step": 2130 + }, + { + "ce_loss_10": 3.592576038837433, + "ce_loss_13": 3.476356315612793, + "ce_loss_2": 4.7715356826782225, + "ce_loss_3": 4.504428267478943, + "ce_loss_7": 3.843649852275848, + "epoch": 0.214, + "grad_norm": 660.0, + "kl_loss_10": 255.3404312133789, + "kl_loss_2": 2625.214599609375, + "kl_loss_3": 2155.4658203125, + "kl_loss_7": 813.4428436279297, + "learning_rate": 0.0008988398137810777, + "loss": 1403.5207, + "step": 2140 + }, + { + "ce_loss_10": 3.620520067214966, + "ce_loss_13": 3.513581109046936, + "ce_loss_2": 4.717863583564759, + "ce_loss_3": 4.442376029491425, + "ce_loss_7": 3.8534181356430053, + "epoch": 0.215, + "grad_norm": 700.0, + "kl_loss_10": 239.26677551269532, + "kl_loss_2": 2448.3839477539063, + "kl_loss_3": 1962.8316284179687, + "kl_loss_7": 763.2356109619141, + "learning_rate": 0.0008978809202654162, + "loss": 1354.8944, + "step": 2150 + }, + { + "ce_loss_10": 3.593782067298889, + "ce_loss_13": 3.4892191767692564, + "ce_loss_2": 4.713660454750061, + "ce_loss_3": 4.43155483007431, + "ce_loss_7": 3.8341444969177245, + "epoch": 0.216, + "grad_norm": 640.0, + "kl_loss_10": 237.50842971801757, + "kl_loss_2": 2454.586071777344, + "kl_loss_3": 1970.583270263672, + "kl_loss_7": 773.5592163085937, + "learning_rate": 0.0008969180200933046, + "loss": 1383.4818, + "step": 2160 + }, + { + "ce_loss_10": 3.56014689207077, + "ce_loss_13": 3.4516719341278077, + "ce_loss_2": 4.715594840049744, + "ce_loss_3": 4.431590890884399, + "ce_loss_7": 3.8131863117218017, + "epoch": 0.217, + "grad_norm": 712.0, + "kl_loss_10": 241.1098258972168, + "kl_loss_2": 2533.49033203125, + "kl_loss_3": 2041.2841003417968, + "kl_loss_7": 799.241552734375, + "learning_rate": 0.0008959511229611376, + "loss": 1406.9449, + "step": 2170 + }, + { + "ce_loss_10": 3.634247362613678, + "ce_loss_13": 3.529753494262695, + "ce_loss_2": 4.747422552108764, + "ce_loss_3": 4.480298018455505, + "ce_loss_7": 3.8834722995758058, + "epoch": 0.218, + "grad_norm": 744.0, + "kl_loss_10": 231.06951522827148, + "kl_loss_2": 2480.240673828125, + "kl_loss_3": 2003.8335388183593, + "kl_loss_7": 794.5106719970703, + "learning_rate": 0.0008949802386055581, + "loss": 1379.2705, + "step": 2180 + }, + { + "ce_loss_10": 3.4931302070617676, + "ce_loss_13": 3.3903717041015624, + "ce_loss_2": 4.634695625305175, + "ce_loss_3": 4.343023872375488, + "ce_loss_7": 3.735668647289276, + "epoch": 0.219, + "grad_norm": 704.0, + "kl_loss_10": 229.31054229736327, + "kl_loss_2": 2487.9470336914064, + "kl_loss_3": 1978.2749877929687, + "kl_loss_7": 772.9935424804687, + "learning_rate": 0.0008940053768033609, + "loss": 1398.8061, + "step": 2190 + }, + { + "ce_loss_10": 3.579288733005524, + "ce_loss_13": 3.476969850063324, + "ce_loss_2": 4.679602265357971, + "ce_loss_3": 4.408792352676391, + "ce_loss_7": 3.818285346031189, + "epoch": 0.22, + "grad_norm": 648.0, + "kl_loss_10": 225.21361923217773, + "kl_loss_2": 2457.1766845703123, + "kl_loss_3": 1985.3013549804687, + "kl_loss_7": 762.693115234375, + "learning_rate": 0.0008930265473713938, + "loss": 1358.0689, + "step": 2200 + }, + { + "ce_loss_10": 3.5425936341285706, + "ce_loss_13": 3.437610614299774, + "ce_loss_2": 4.679268145561219, + "ce_loss_3": 4.395039463043213, + "ce_loss_7": 3.7786786198616027, + "epoch": 0.221, + "grad_norm": 624.0, + "kl_loss_10": 227.02418670654296, + "kl_loss_2": 2514.80498046875, + "kl_loss_3": 2012.999462890625, + "kl_loss_7": 766.7205108642578, + "learning_rate": 0.0008920437601664579, + "loss": 1344.9316, + "step": 2210 + }, + { + "ce_loss_10": 3.5330151677131654, + "ce_loss_13": 3.4283509850502014, + "ce_loss_2": 4.65971040725708, + "ce_loss_3": 4.389861440658569, + "ce_loss_7": 3.7775445342063905, + "epoch": 0.222, + "grad_norm": 728.0, + "kl_loss_10": 231.53972396850585, + "kl_loss_2": 2495.336804199219, + "kl_loss_3": 2020.2352600097656, + "kl_loss_7": 785.6470977783204, + "learning_rate": 0.0008910570250852097, + "loss": 1358.0102, + "step": 2220 + }, + { + "ce_loss_10": 3.6386430144309996, + "ce_loss_13": 3.5394553184509276, + "ce_loss_2": 4.721383547782898, + "ce_loss_3": 4.441709399223328, + "ce_loss_7": 3.8573225855827333, + "epoch": 0.223, + "grad_norm": 656.0, + "kl_loss_10": 222.80670547485352, + "kl_loss_2": 2415.298693847656, + "kl_loss_3": 1914.3474975585937, + "kl_loss_7": 735.9223663330079, + "learning_rate": 0.0008900663520640604, + "loss": 1330.9881, + "step": 2230 + }, + { + "ce_loss_10": 3.5963090658187866, + "ce_loss_13": 3.4863692045211794, + "ce_loss_2": 4.697564601898193, + "ce_loss_3": 4.4291857242584225, + "ce_loss_7": 3.8206969499588013, + "epoch": 0.224, + "grad_norm": 616.0, + "kl_loss_10": 232.82473220825196, + "kl_loss_2": 2436.1601440429686, + "kl_loss_3": 1975.5118774414063, + "kl_loss_7": 746.4637390136719, + "learning_rate": 0.0008890717510790764, + "loss": 1355.2247, + "step": 2240 + }, + { + "ce_loss_10": 3.550048661231995, + "ce_loss_13": 3.444666588306427, + "ce_loss_2": 4.6846558332443236, + "ce_loss_3": 4.415020489692688, + "ce_loss_7": 3.7784482836723328, + "epoch": 0.225, + "grad_norm": 748.0, + "kl_loss_10": 234.0259765625, + "kl_loss_2": 2511.7267456054688, + "kl_loss_3": 2033.7661254882812, + "kl_loss_7": 757.7471649169922, + "learning_rate": 0.0008880732321458784, + "loss": 1391.5023, + "step": 2250 + }, + { + "ce_loss_10": 3.5846696734428405, + "ce_loss_13": 3.475912594795227, + "ce_loss_2": 4.6859821557998655, + "ce_loss_3": 4.403112530708313, + "ce_loss_7": 3.8075541138648985, + "epoch": 0.226, + "grad_norm": 768.0, + "kl_loss_10": 241.0058906555176, + "kl_loss_2": 2434.0427978515627, + "kl_loss_3": 1946.942852783203, + "kl_loss_7": 750.951953125, + "learning_rate": 0.0008870708053195413, + "loss": 1371.0441, + "step": 2260 + }, + { + "ce_loss_10": 3.6066513299942016, + "ce_loss_13": 3.5011353135108947, + "ce_loss_2": 4.688438081741333, + "ce_loss_3": 4.419037127494812, + "ce_loss_7": 3.8243068933486937, + "epoch": 0.227, + "grad_norm": 612.0, + "kl_loss_10": 236.37487716674804, + "kl_loss_2": 2419.1595703125, + "kl_loss_3": 1947.7892822265626, + "kl_loss_7": 736.9884735107422, + "learning_rate": 0.0008860644806944918, + "loss": 1346.316, + "step": 2270 + }, + { + "ce_loss_10": 3.5470305681228638, + "ce_loss_13": 3.4408384203910827, + "ce_loss_2": 4.675415754318237, + "ce_loss_3": 4.405515837669372, + "ce_loss_7": 3.7811434388160707, + "epoch": 0.228, + "grad_norm": 712.0, + "kl_loss_10": 236.5175895690918, + "kl_loss_2": 2511.8415283203126, + "kl_loss_3": 2041.5828552246094, + "kl_loss_7": 773.2159851074218, + "learning_rate": 0.0008850542684044079, + "loss": 1347.2301, + "step": 2280 + }, + { + "ce_loss_10": 3.525200033187866, + "ce_loss_13": 3.4121009707450867, + "ce_loss_2": 4.704805684089661, + "ce_loss_3": 4.428252863883972, + "ce_loss_7": 3.7681017994880674, + "epoch": 0.229, + "grad_norm": 744.0, + "kl_loss_10": 243.2204231262207, + "kl_loss_2": 2609.259875488281, + "kl_loss_3": 2137.3057250976562, + "kl_loss_7": 781.1770416259766, + "learning_rate": 0.0008840401786221159, + "loss": 1392.1494, + "step": 2290 + }, + { + "ce_loss_10": 3.644639456272125, + "ce_loss_13": 3.546596646308899, + "ce_loss_2": 4.720036673545837, + "ce_loss_3": 4.461656093597412, + "ce_loss_7": 3.8639742493629456, + "epoch": 0.23, + "grad_norm": 736.0, + "kl_loss_10": 221.5949806213379, + "kl_loss_2": 2383.692004394531, + "kl_loss_3": 1920.7404052734375, + "kl_loss_7": 726.6697357177734, + "learning_rate": 0.000883022221559489, + "loss": 1309.8631, + "step": 2300 + }, + { + "ce_loss_10": 3.6106560468673705, + "ce_loss_13": 3.5103928685188293, + "ce_loss_2": 4.718800568580628, + "ce_loss_3": 4.453631711006165, + "ce_loss_7": 3.833037328720093, + "epoch": 0.231, + "grad_norm": 668.0, + "kl_loss_10": 224.89765014648438, + "kl_loss_2": 2469.4252197265623, + "kl_loss_3": 2018.495166015625, + "kl_loss_7": 748.8079467773438, + "learning_rate": 0.0008820004074673434, + "loss": 1405.4977, + "step": 2310 + }, + { + "ce_loss_10": 3.509887623786926, + "ce_loss_13": 3.4120625376701357, + "ce_loss_2": 4.630102276802063, + "ce_loss_3": 4.358427214622497, + "ce_loss_7": 3.748315227031708, + "epoch": 0.232, + "grad_norm": 604.0, + "kl_loss_10": 223.46416931152345, + "kl_loss_2": 2484.790771484375, + "kl_loss_3": 2005.2869995117187, + "kl_loss_7": 761.2884399414063, + "learning_rate": 0.0008809747466353355, + "loss": 1341.5085, + "step": 2320 + }, + { + "ce_loss_10": 3.522110950946808, + "ce_loss_13": 3.4228403091430666, + "ce_loss_2": 4.653229188919068, + "ce_loss_3": 4.378945517539978, + "ce_loss_7": 3.7502294540405274, + "epoch": 0.233, + "grad_norm": 744.0, + "kl_loss_10": 224.23116912841797, + "kl_loss_2": 2499.1381958007814, + "kl_loss_3": 2020.5157836914063, + "kl_loss_7": 752.2868743896485, + "learning_rate": 0.0008799452493918585, + "loss": 1366.2092, + "step": 2330 + }, + { + "ce_loss_10": 3.600525939464569, + "ce_loss_13": 3.501133692264557, + "ce_loss_2": 4.698138499259949, + "ce_loss_3": 4.4309428334236145, + "ce_loss_7": 3.8393119096755983, + "epoch": 0.234, + "grad_norm": 656.0, + "kl_loss_10": 221.8571762084961, + "kl_loss_2": 2452.500280761719, + "kl_loss_3": 1976.1439636230468, + "kl_loss_7": 759.1389068603515, + "learning_rate": 0.0008789119261039385, + "loss": 1400.5569, + "step": 2340 + }, + { + "ce_loss_10": 3.5126537322998046, + "ce_loss_13": 3.412049424648285, + "ce_loss_2": 4.627605974674225, + "ce_loss_3": 4.359820437431336, + "ce_loss_7": 3.747655713558197, + "epoch": 0.235, + "grad_norm": 584.0, + "kl_loss_10": 220.69495086669923, + "kl_loss_2": 2450.3417724609376, + "kl_loss_3": 1979.037158203125, + "kl_loss_7": 752.3414123535156, + "learning_rate": 0.0008778747871771292, + "loss": 1338.277, + "step": 2350 + }, + { + "ce_loss_10": 3.5650462746620177, + "ce_loss_13": 3.4650426387786863, + "ce_loss_2": 4.640904521942138, + "ce_loss_3": 4.3729163646698, + "ce_loss_7": 3.78610600233078, + "epoch": 0.236, + "grad_norm": 628.0, + "kl_loss_10": 215.22831954956055, + "kl_loss_2": 2399.6547119140623, + "kl_loss_3": 1925.4503356933594, + "kl_loss_7": 727.8779388427735, + "learning_rate": 0.0008768338430554083, + "loss": 1316.2055, + "step": 2360 + }, + { + "ce_loss_10": 3.572676420211792, + "ce_loss_13": 3.4714962005615235, + "ce_loss_2": 4.678735136985779, + "ce_loss_3": 4.39429270029068, + "ce_loss_7": 3.8077693939208985, + "epoch": 0.237, + "grad_norm": 688.0, + "kl_loss_10": 226.92397766113282, + "kl_loss_2": 2426.2300659179687, + "kl_loss_3": 1939.4405090332032, + "kl_loss_7": 752.637564086914, + "learning_rate": 0.0008757891042210713, + "loss": 1346.3338, + "step": 2370 + }, + { + "ce_loss_10": 3.592969560623169, + "ce_loss_13": 3.493350553512573, + "ce_loss_2": 4.688189601898193, + "ce_loss_3": 4.413512086868286, + "ce_loss_7": 3.821557307243347, + "epoch": 0.238, + "grad_norm": 656.0, + "kl_loss_10": 225.66336822509766, + "kl_loss_2": 2421.9510131835937, + "kl_loss_3": 1946.20556640625, + "kl_loss_7": 745.2722961425782, + "learning_rate": 0.0008747405811946271, + "loss": 1343.8345, + "step": 2380 + }, + { + "ce_loss_10": 3.49123694896698, + "ce_loss_13": 3.389770042896271, + "ce_loss_2": 4.654137110710144, + "ce_loss_3": 4.386571860313415, + "ce_loss_7": 3.731127667427063, + "epoch": 0.239, + "grad_norm": 616.0, + "kl_loss_10": 230.47370223999025, + "kl_loss_2": 2561.850231933594, + "kl_loss_3": 2084.1000549316404, + "kl_loss_7": 769.9209930419922, + "learning_rate": 0.0008736882845346905, + "loss": 1355.4398, + "step": 2390 + }, + { + "ce_loss_10": 3.5909661054611206, + "ce_loss_13": 3.4839738249778747, + "ce_loss_2": 4.705090403556824, + "ce_loss_3": 4.426928949356079, + "ce_loss_7": 3.8166149973869326, + "epoch": 0.24, + "grad_norm": 652.0, + "kl_loss_10": 232.27595291137695, + "kl_loss_2": 2463.9607543945312, + "kl_loss_3": 1976.524102783203, + "kl_loss_7": 748.5501831054687, + "learning_rate": 0.0008726322248378774, + "loss": 1350.1158, + "step": 2400 + }, + { + "ce_loss_10": 3.5857128262519837, + "ce_loss_13": 3.485344707965851, + "ce_loss_2": 4.720745325088501, + "ce_loss_3": 4.446980690956115, + "ce_loss_7": 3.815141475200653, + "epoch": 0.241, + "grad_norm": 620.0, + "kl_loss_10": 225.08902893066406, + "kl_loss_2": 2502.8332275390626, + "kl_loss_3": 2020.9147888183593, + "kl_loss_7": 748.0698608398437, + "learning_rate": 0.0008715724127386971, + "loss": 1388.577, + "step": 2410 + }, + { + "ce_loss_10": 3.656253182888031, + "ce_loss_13": 3.5548863530159, + "ce_loss_2": 4.740737318992615, + "ce_loss_3": 4.4647379398345945, + "ce_loss_7": 3.869425129890442, + "epoch": 0.242, + "grad_norm": 656.0, + "kl_loss_10": 233.72190628051757, + "kl_loss_2": 2420.5750244140627, + "kl_loss_3": 1941.4000915527345, + "kl_loss_7": 733.7942932128906, + "learning_rate": 0.0008705088589094458, + "loss": 1349.3883, + "step": 2420 + }, + { + "ce_loss_10": 3.6831162333488465, + "ce_loss_13": 3.5650919318199157, + "ce_loss_2": 4.759288740158081, + "ce_loss_3": 4.490408134460449, + "ce_loss_7": 3.8880489230155946, + "epoch": 0.243, + "grad_norm": 640.0, + "kl_loss_10": 258.1027114868164, + "kl_loss_2": 2453.8090209960938, + "kl_loss_3": 1977.7547729492187, + "kl_loss_7": 746.0192138671875, + "learning_rate": 0.0008694415740600988, + "loss": 1371.979, + "step": 2430 + }, + { + "ce_loss_10": 3.539147210121155, + "ce_loss_13": 3.418752908706665, + "ce_loss_2": 4.6640907526016235, + "ce_loss_3": 4.396868014335633, + "ce_loss_7": 3.753141713142395, + "epoch": 0.244, + "grad_norm": 720.0, + "kl_loss_10": 272.4710403442383, + "kl_loss_2": 2511.5777099609377, + "kl_loss_3": 2045.4482543945312, + "kl_loss_7": 744.3600494384766, + "learning_rate": 0.0008683705689382025, + "loss": 1374.2081, + "step": 2440 + }, + { + "ce_loss_10": 3.614233338832855, + "ce_loss_13": 3.502686250209808, + "ce_loss_2": 4.680193209648133, + "ce_loss_3": 4.409785914421081, + "ce_loss_7": 3.81562682390213, + "epoch": 0.245, + "grad_norm": 680.0, + "kl_loss_10": 242.92661514282227, + "kl_loss_2": 2418.696484375, + "kl_loss_3": 1945.9917602539062, + "kl_loss_7": 727.0407897949219, + "learning_rate": 0.0008672958543287666, + "loss": 1361.5771, + "step": 2450 + }, + { + "ce_loss_10": 3.6207616090774537, + "ce_loss_13": 3.5146057486534117, + "ce_loss_2": 4.6799437522888185, + "ce_loss_3": 4.408400678634644, + "ce_loss_7": 3.8393305063247682, + "epoch": 0.246, + "grad_norm": 640.0, + "kl_loss_10": 233.26868438720703, + "kl_loss_2": 2373.7197509765624, + "kl_loss_3": 1900.9493347167968, + "kl_loss_7": 737.9279724121094, + "learning_rate": 0.0008662174410541554, + "loss": 1323.3875, + "step": 2460 + }, + { + "ce_loss_10": 3.5795403718948364, + "ce_loss_13": 3.4791687726974487, + "ce_loss_2": 4.657073163986206, + "ce_loss_3": 4.389124321937561, + "ce_loss_7": 3.797624135017395, + "epoch": 0.247, + "grad_norm": 688.0, + "kl_loss_10": 228.68382720947267, + "kl_loss_2": 2405.7741943359374, + "kl_loss_3": 1929.0893249511719, + "kl_loss_7": 730.4046020507812, + "learning_rate": 0.0008651353399739787, + "loss": 1361.2713, + "step": 2470 + }, + { + "ce_loss_10": 3.6015311241149903, + "ce_loss_13": 3.5007805585861207, + "ce_loss_2": 4.693076491355896, + "ce_loss_3": 4.420244932174683, + "ce_loss_7": 3.8255343675613402, + "epoch": 0.248, + "grad_norm": 628.0, + "kl_loss_10": 225.77268676757814, + "kl_loss_2": 2413.6783447265625, + "kl_loss_3": 1937.1076232910157, + "kl_loss_7": 735.3206512451172, + "learning_rate": 0.0008640495619849821, + "loss": 1345.3404, + "step": 2480 + }, + { + "ce_loss_10": 3.5668503522872923, + "ce_loss_13": 3.4637187004089354, + "ce_loss_2": 4.644854807853699, + "ce_loss_3": 4.374481606483459, + "ce_loss_7": 3.791785490512848, + "epoch": 0.249, + "grad_norm": 616.0, + "kl_loss_10": 223.47670059204103, + "kl_loss_2": 2406.82578125, + "kl_loss_3": 1930.5429321289062, + "kl_loss_7": 738.2828582763672, + "learning_rate": 0.0008629601180209381, + "loss": 1326.733, + "step": 2490 + }, + { + "ce_loss_10": 3.5605925559997558, + "ce_loss_13": 3.4623565435409547, + "ce_loss_2": 4.648912143707276, + "ce_loss_3": 4.37358832359314, + "ce_loss_7": 3.7822588205337526, + "epoch": 0.25, + "grad_norm": 588.0, + "kl_loss_10": 221.60515823364258, + "kl_loss_2": 2408.634729003906, + "kl_loss_3": 1918.1406311035157, + "kl_loss_7": 733.2383361816406, + "learning_rate": 0.000861867019052535, + "loss": 1350.9314, + "step": 2500 + }, + { + "ce_loss_10": 3.4750850677490233, + "ce_loss_13": 3.3757749915122988, + "ce_loss_2": 4.618335509300232, + "ce_loss_3": 4.344382691383362, + "ce_loss_7": 3.7118528127670287, + "epoch": 0.251, + "grad_norm": 664.0, + "kl_loss_10": 225.6886344909668, + "kl_loss_2": 2520.0691040039064, + "kl_loss_3": 2028.4780883789062, + "kl_loss_7": 750.8930267333984, + "learning_rate": 0.0008607702760872678, + "loss": 1377.2211, + "step": 2510 + }, + { + "ce_loss_10": 3.5948320031166077, + "ce_loss_13": 3.493862783908844, + "ce_loss_2": 4.663220858573913, + "ce_loss_3": 4.39898452758789, + "ce_loss_7": 3.8143251180648803, + "epoch": 0.252, + "grad_norm": 736.0, + "kl_loss_10": 220.9385528564453, + "kl_loss_2": 2382.33095703125, + "kl_loss_3": 1919.1317260742187, + "kl_loss_7": 728.4733703613281, + "learning_rate": 0.0008596699001693256, + "loss": 1356.6151, + "step": 2520 + }, + { + "ce_loss_10": 3.6045937299728394, + "ce_loss_13": 3.5089424014091493, + "ce_loss_2": 4.674148344993592, + "ce_loss_3": 4.401587581634521, + "ce_loss_7": 3.8156301379203796, + "epoch": 0.253, + "grad_norm": 664.0, + "kl_loss_10": 222.60222702026368, + "kl_loss_2": 2399.647021484375, + "kl_loss_3": 1923.217791748047, + "kl_loss_7": 722.3135375976562, + "learning_rate": 0.0008585659023794818, + "loss": 1357.2354, + "step": 2530 + }, + { + "ce_loss_10": 3.5605056166648863, + "ce_loss_13": 3.458607590198517, + "ce_loss_2": 4.6924147605896, + "ce_loss_3": 4.421391654014587, + "ce_loss_7": 3.799249517917633, + "epoch": 0.254, + "grad_norm": 660.0, + "kl_loss_10": 233.0737617492676, + "kl_loss_2": 2499.324670410156, + "kl_loss_3": 2030.4549194335937, + "kl_loss_7": 761.279296875, + "learning_rate": 0.0008574582938349817, + "loss": 1364.7606, + "step": 2540 + }, + { + "ce_loss_10": 3.5620136737823485, + "ce_loss_13": 3.450424087047577, + "ce_loss_2": 4.679884123802185, + "ce_loss_3": 4.403433465957642, + "ce_loss_7": 3.8059414982795716, + "epoch": 0.255, + "grad_norm": 648.0, + "kl_loss_10": 238.74318084716796, + "kl_loss_2": 2486.331640625, + "kl_loss_3": 1999.8115600585938, + "kl_loss_7": 776.2368225097656, + "learning_rate": 0.0008563470856894315, + "loss": 1329.6849, + "step": 2550 + }, + { + "ce_loss_10": 3.540405642986298, + "ce_loss_13": 3.4457826972007752, + "ce_loss_2": 4.656697821617127, + "ce_loss_3": 4.386443245410919, + "ce_loss_7": 3.772416353225708, + "epoch": 0.256, + "grad_norm": 760.0, + "kl_loss_10": 221.72702865600587, + "kl_loss_2": 2443.3952514648436, + "kl_loss_3": 1969.1475952148437, + "kl_loss_7": 745.7592987060547, + "learning_rate": 0.0008552322891326845, + "loss": 1346.8541, + "step": 2560 + }, + { + "ce_loss_10": 3.5136868953704834, + "ce_loss_13": 3.415074276924133, + "ce_loss_2": 4.637244987487793, + "ce_loss_3": 4.365770423412323, + "ce_loss_7": 3.741610062122345, + "epoch": 0.257, + "grad_norm": 788.0, + "kl_loss_10": 218.68516159057617, + "kl_loss_2": 2477.789599609375, + "kl_loss_3": 2001.3069702148437, + "kl_loss_7": 743.3714080810547, + "learning_rate": 0.0008541139153907296, + "loss": 1329.1979, + "step": 2570 + }, + { + "ce_loss_10": 3.472187507152557, + "ce_loss_13": 3.3729379415512084, + "ce_loss_2": 4.581104445457458, + "ce_loss_3": 4.308674609661102, + "ce_loss_7": 3.69760080575943, + "epoch": 0.258, + "grad_norm": 636.0, + "kl_loss_10": 213.4689498901367, + "kl_loss_2": 2453.299768066406, + "kl_loss_3": 1976.8992919921875, + "kl_loss_7": 745.6326965332031, + "learning_rate": 0.0008529919757255782, + "loss": 1354.7893, + "step": 2580 + }, + { + "ce_loss_10": 3.500008797645569, + "ce_loss_13": 3.408738708496094, + "ce_loss_2": 4.560009336471557, + "ce_loss_3": 4.2931175351142885, + "ce_loss_7": 3.716734218597412, + "epoch": 0.259, + "grad_norm": 624.0, + "kl_loss_10": 208.80025100708008, + "kl_loss_2": 2371.1708251953123, + "kl_loss_3": 1897.6802124023438, + "kl_loss_7": 721.6227478027344, + "learning_rate": 0.0008518664814351503, + "loss": 1306.301, + "step": 2590 + }, + { + "ce_loss_10": 3.472637712955475, + "ce_loss_13": 3.37472482919693, + "ce_loss_2": 4.598471093177795, + "ce_loss_3": 4.321799778938294, + "ce_loss_7": 3.7131651520729063, + "epoch": 0.26, + "grad_norm": 644.0, + "kl_loss_10": 222.20911254882813, + "kl_loss_2": 2491.116162109375, + "kl_loss_3": 2007.4335876464843, + "kl_loss_7": 764.1704193115235, + "learning_rate": 0.0008507374438531607, + "loss": 1407.2535, + "step": 2600 + }, + { + "ce_loss_10": 3.447394275665283, + "ce_loss_13": 3.3539512395858764, + "ce_loss_2": 4.5548292875289915, + "ce_loss_3": 4.286789774894714, + "ce_loss_7": 3.6768516659736634, + "epoch": 0.261, + "grad_norm": 676.0, + "kl_loss_10": 214.65092697143555, + "kl_loss_2": 2437.03447265625, + "kl_loss_3": 1973.9089477539062, + "kl_loss_7": 738.8113952636719, + "learning_rate": 0.0008496048743490053, + "loss": 1332.7279, + "step": 2610 + }, + { + "ce_loss_10": 3.597834813594818, + "ce_loss_13": 3.5061428785324096, + "ce_loss_2": 4.655121803283691, + "ce_loss_3": 4.391561770439148, + "ce_loss_7": 3.814839816093445, + "epoch": 0.262, + "grad_norm": 564.0, + "kl_loss_10": 212.99711074829102, + "kl_loss_2": 2362.529577636719, + "kl_loss_3": 1891.9757995605469, + "kl_loss_7": 720.1662811279297, + "learning_rate": 0.0008484687843276469, + "loss": 1316.5832, + "step": 2620 + }, + { + "ce_loss_10": 3.533200740814209, + "ce_loss_13": 3.4373727798461915, + "ce_loss_2": 4.636826205253601, + "ce_loss_3": 4.3528993129730225, + "ce_loss_7": 3.7636064171791075, + "epoch": 0.263, + "grad_norm": 688.0, + "kl_loss_10": 217.95888977050782, + "kl_loss_2": 2432.091143798828, + "kl_loss_3": 1936.0632568359374, + "kl_loss_7": 738.968881225586, + "learning_rate": 0.0008473291852294987, + "loss": 1361.4943, + "step": 2630 + }, + { + "ce_loss_10": 3.5451728224754335, + "ce_loss_13": 3.446604347229004, + "ce_loss_2": 4.630346298217773, + "ce_loss_3": 4.3619812488555905, + "ce_loss_7": 3.7699208855628967, + "epoch": 0.264, + "grad_norm": 672.0, + "kl_loss_10": 220.66769561767578, + "kl_loss_2": 2436.2069458007813, + "kl_loss_3": 1956.8639526367188, + "kl_loss_7": 742.7248840332031, + "learning_rate": 0.0008461860885303114, + "loss": 1327.3721, + "step": 2640 + }, + { + "ce_loss_10": 3.5666414141654967, + "ce_loss_13": 3.4715107679367065, + "ce_loss_2": 4.639662265777588, + "ce_loss_3": 4.371685028076172, + "ce_loss_7": 3.788040292263031, + "epoch": 0.265, + "grad_norm": 656.0, + "kl_loss_10": 216.69636611938478, + "kl_loss_2": 2373.723107910156, + "kl_loss_3": 1899.1220764160157, + "kl_loss_7": 725.1952423095703, + "learning_rate": 0.000845039505741056, + "loss": 1327.8555, + "step": 2650 + }, + { + "ce_loss_10": 3.5541250467300416, + "ce_loss_13": 3.4555353045463564, + "ce_loss_2": 4.645513963699341, + "ce_loss_3": 4.378093981742859, + "ce_loss_7": 3.7833709001541136, + "epoch": 0.266, + "grad_norm": 668.0, + "kl_loss_10": 224.05798721313477, + "kl_loss_2": 2449.707385253906, + "kl_loss_3": 1967.4787109375, + "kl_loss_7": 750.5478302001953, + "learning_rate": 0.0008438894484078086, + "loss": 1378.657, + "step": 2660 + }, + { + "ce_loss_10": 3.557729125022888, + "ce_loss_13": 3.4628395080566405, + "ce_loss_2": 4.638984179496765, + "ce_loss_3": 4.374520492553711, + "ce_loss_7": 3.7801038026809692, + "epoch": 0.267, + "grad_norm": 796.0, + "kl_loss_10": 218.22870254516602, + "kl_loss_2": 2393.3899047851564, + "kl_loss_3": 1931.0333312988282, + "kl_loss_7": 732.3969909667969, + "learning_rate": 0.0008427359281116334, + "loss": 1329.4188, + "step": 2670 + }, + { + "ce_loss_10": 3.4619020819664, + "ce_loss_13": 3.3649930715560914, + "ce_loss_2": 4.586506628990174, + "ce_loss_3": 4.3114288449287415, + "ce_loss_7": 3.6977506399154665, + "epoch": 0.268, + "grad_norm": 560.0, + "kl_loss_10": 218.7227699279785, + "kl_loss_2": 2471.7220703125, + "kl_loss_3": 1986.8973815917968, + "kl_loss_7": 744.8811431884766, + "learning_rate": 0.0008415789564684673, + "loss": 1344.4947, + "step": 2680 + }, + { + "ce_loss_10": 3.7084735155105593, + "ce_loss_13": 3.610187065601349, + "ce_loss_2": 4.759761667251587, + "ce_loss_3": 4.487373423576355, + "ce_loss_7": 3.9243152022361754, + "epoch": 0.269, + "grad_norm": 756.0, + "kl_loss_10": 223.18955688476564, + "kl_loss_2": 2329.3449951171874, + "kl_loss_3": 1847.8426208496094, + "kl_loss_7": 721.1707153320312, + "learning_rate": 0.0008404185451290017, + "loss": 1296.1146, + "step": 2690 + }, + { + "ce_loss_10": 3.578732097148895, + "ce_loss_13": 3.4770421504974367, + "ce_loss_2": 4.659151983261109, + "ce_loss_3": 4.38085663318634, + "ce_loss_7": 3.7948765754699707, + "epoch": 0.27, + "grad_norm": 692.0, + "kl_loss_10": 224.61487731933593, + "kl_loss_2": 2417.559912109375, + "kl_loss_3": 1939.3710815429688, + "kl_loss_7": 727.4687561035156, + "learning_rate": 0.0008392547057785661, + "loss": 1317.3512, + "step": 2700 + }, + { + "ce_loss_10": 3.5002851486206055, + "ce_loss_13": 3.396597516536713, + "ce_loss_2": 4.633592844009399, + "ce_loss_3": 4.365511727333069, + "ce_loss_7": 3.738453209400177, + "epoch": 0.271, + "grad_norm": 732.0, + "kl_loss_10": 231.73975296020507, + "kl_loss_2": 2517.132354736328, + "kl_loss_3": 2044.1573425292968, + "kl_loss_7": 768.5197204589844, + "learning_rate": 0.0008380874501370098, + "loss": 1329.0642, + "step": 2710 + }, + { + "ce_loss_10": 3.5027819752693174, + "ce_loss_13": 3.4010127544403077, + "ce_loss_2": 4.628546047210693, + "ce_loss_3": 4.359855842590332, + "ce_loss_7": 3.7310682773590087, + "epoch": 0.272, + "grad_norm": 628.0, + "kl_loss_10": 236.13679275512695, + "kl_loss_2": 2503.883825683594, + "kl_loss_3": 2020.1560424804688, + "kl_loss_7": 758.8711700439453, + "learning_rate": 0.0008369167899585841, + "loss": 1363.7068, + "step": 2720 + }, + { + "ce_loss_10": 3.6181455850601196, + "ce_loss_13": 3.521961879730225, + "ce_loss_2": 4.664963984489441, + "ce_loss_3": 4.396141123771668, + "ce_loss_7": 3.839101779460907, + "epoch": 0.273, + "grad_norm": 636.0, + "kl_loss_10": 223.16615371704103, + "kl_loss_2": 2348.37099609375, + "kl_loss_3": 1879.9346130371093, + "kl_loss_7": 730.2560852050781, + "learning_rate": 0.0008357427370318238, + "loss": 1337.943, + "step": 2730 + }, + { + "ce_loss_10": 3.571904718875885, + "ce_loss_13": 3.4762736320495606, + "ce_loss_2": 4.677034759521485, + "ce_loss_3": 4.40289398431778, + "ce_loss_7": 3.7918145298957824, + "epoch": 0.274, + "grad_norm": 772.0, + "kl_loss_10": 222.57760772705078, + "kl_loss_2": 2451.346435546875, + "kl_loss_3": 1973.4313354492188, + "kl_loss_7": 730.7371429443359, + "learning_rate": 0.0008345653031794292, + "loss": 1347.6243, + "step": 2740 + }, + { + "ce_loss_10": 3.5737530469894407, + "ce_loss_13": 3.4740692615509032, + "ce_loss_2": 4.659031462669373, + "ce_loss_3": 4.387771344184875, + "ce_loss_7": 3.792672348022461, + "epoch": 0.275, + "grad_norm": 672.0, + "kl_loss_10": 222.67840805053712, + "kl_loss_2": 2406.277941894531, + "kl_loss_3": 1924.3234985351562, + "kl_loss_7": 730.7620574951172, + "learning_rate": 0.0008333845002581458, + "loss": 1320.2523, + "step": 2750 + }, + { + "ce_loss_10": 3.498860251903534, + "ce_loss_13": 3.400104033946991, + "ce_loss_2": 4.611243772506714, + "ce_loss_3": 4.342458128929138, + "ce_loss_7": 3.733369469642639, + "epoch": 0.276, + "grad_norm": 644.0, + "kl_loss_10": 224.65963973999024, + "kl_loss_2": 2495.7015869140623, + "kl_loss_3": 2015.1633422851562, + "kl_loss_7": 762.1438781738282, + "learning_rate": 0.0008322003401586462, + "loss": 1364.4495, + "step": 2760 + }, + { + "ce_loss_10": 3.532784569263458, + "ce_loss_13": 3.440683197975159, + "ce_loss_2": 4.59234881401062, + "ce_loss_3": 4.320498394966125, + "ce_loss_7": 3.7502055525779725, + "epoch": 0.277, + "grad_norm": 724.0, + "kl_loss_10": 211.5718635559082, + "kl_loss_2": 2343.010675048828, + "kl_loss_3": 1873.985821533203, + "kl_loss_7": 709.5305114746094, + "learning_rate": 0.0008310128348054094, + "loss": 1276.2701, + "step": 2770 + }, + { + "ce_loss_10": 3.5014058470726015, + "ce_loss_13": 3.406921911239624, + "ce_loss_2": 4.603280448913575, + "ce_loss_3": 4.329492771625519, + "ce_loss_7": 3.7248639822006226, + "epoch": 0.278, + "grad_norm": 652.0, + "kl_loss_10": 214.84819107055665, + "kl_loss_2": 2431.7943481445313, + "kl_loss_3": 1951.13515625, + "kl_loss_7": 731.5488677978516, + "learning_rate": 0.0008298219961566008, + "loss": 1329.707, + "step": 2780 + }, + { + "ce_loss_10": 3.4713513970375063, + "ce_loss_13": 3.3771822571754457, + "ce_loss_2": 4.587963104248047, + "ce_loss_3": 4.32047404050827, + "ce_loss_7": 3.711584746837616, + "epoch": 0.279, + "grad_norm": 644.0, + "kl_loss_10": 217.99566726684571, + "kl_loss_2": 2492.9334106445312, + "kl_loss_3": 2016.429022216797, + "kl_loss_7": 761.9394226074219, + "learning_rate": 0.0008286278362039527, + "loss": 1336.5162, + "step": 2790 + }, + { + "ce_loss_10": 3.496282184123993, + "ce_loss_13": 3.3998995065689086, + "ce_loss_2": 4.622646689414978, + "ce_loss_3": 4.352741932868957, + "ce_loss_7": 3.7300979018211367, + "epoch": 0.28, + "grad_norm": 592.0, + "kl_loss_10": 216.96264114379883, + "kl_loss_2": 2489.9998046875, + "kl_loss_3": 2008.0425537109375, + "kl_loss_7": 746.8909149169922, + "learning_rate": 0.0008274303669726426, + "loss": 1325.7328, + "step": 2800 + }, + { + "ce_loss_10": 3.4048958301544188, + "ce_loss_13": 3.3045366764068604, + "ce_loss_2": 4.5690556287765505, + "ce_loss_3": 4.298348617553711, + "ce_loss_7": 3.6378442645072937, + "epoch": 0.281, + "grad_norm": 684.0, + "kl_loss_10": 218.18540115356444, + "kl_loss_2": 2561.6716186523436, + "kl_loss_3": 2080.7119262695314, + "kl_loss_7": 743.8994750976562, + "learning_rate": 0.0008262296005211721, + "loss": 1337.6219, + "step": 2810 + }, + { + "ce_loss_10": 3.5260050296783447, + "ce_loss_13": 3.428924763202667, + "ce_loss_2": 4.642134022712708, + "ce_loss_3": 4.368475294113159, + "ce_loss_7": 3.7550152063369753, + "epoch": 0.282, + "grad_norm": 600.0, + "kl_loss_10": 216.54320907592773, + "kl_loss_2": 2444.2397338867186, + "kl_loss_3": 1975.6794677734374, + "kl_loss_7": 734.2523712158203, + "learning_rate": 0.0008250255489412463, + "loss": 1322.247, + "step": 2820 + }, + { + "ce_loss_10": 3.629942464828491, + "ce_loss_13": 3.532360863685608, + "ce_loss_2": 4.7163821935653685, + "ce_loss_3": 4.444535660743713, + "ce_loss_7": 3.846136474609375, + "epoch": 0.283, + "grad_norm": 628.0, + "kl_loss_10": 214.22548904418946, + "kl_loss_2": 2410.5466918945312, + "kl_loss_3": 1930.2673034667969, + "kl_loss_7": 714.048681640625, + "learning_rate": 0.0008238182243576511, + "loss": 1325.0883, + "step": 2830 + }, + { + "ce_loss_10": 3.5913167357444764, + "ce_loss_13": 3.5031124353408813, + "ce_loss_2": 4.611292886734009, + "ce_loss_3": 4.339277529716492, + "ce_loss_7": 3.796242094039917, + "epoch": 0.284, + "grad_norm": 620.0, + "kl_loss_10": 208.4808135986328, + "kl_loss_2": 2294.337286376953, + "kl_loss_3": 1814.4247924804688, + "kl_loss_7": 695.5996673583984, + "learning_rate": 0.0008226076389281315, + "loss": 1277.3086, + "step": 2840 + }, + { + "ce_loss_10": 3.632950210571289, + "ce_loss_13": 3.542364180088043, + "ce_loss_2": 4.697378945350647, + "ce_loss_3": 4.428278470039368, + "ce_loss_7": 3.8434852004051208, + "epoch": 0.285, + "grad_norm": 592.0, + "kl_loss_10": 210.92243499755858, + "kl_loss_2": 2375.7556274414064, + "kl_loss_3": 1902.3470825195313, + "kl_loss_7": 701.8125823974609, + "learning_rate": 0.0008213938048432696, + "loss": 1285.7082, + "step": 2850 + }, + { + "ce_loss_10": 3.561896014213562, + "ce_loss_13": 3.4673075318336486, + "ce_loss_2": 4.635823488235474, + "ce_loss_3": 4.3728371381759645, + "ce_loss_7": 3.780589020252228, + "epoch": 0.286, + "grad_norm": 616.0, + "kl_loss_10": 216.6977653503418, + "kl_loss_2": 2390.834924316406, + "kl_loss_3": 1924.6818054199218, + "kl_loss_7": 726.8750396728516, + "learning_rate": 0.0008201767343263612, + "loss": 1324.6124, + "step": 2860 + }, + { + "ce_loss_10": 3.4997401237487793, + "ce_loss_13": 3.4044744968414307, + "ce_loss_2": 4.604890465736389, + "ce_loss_3": 4.338030159473419, + "ce_loss_7": 3.7291186928749083, + "epoch": 0.287, + "grad_norm": 616.0, + "kl_loss_10": 213.92771530151367, + "kl_loss_2": 2444.1182250976562, + "kl_loss_3": 1971.163818359375, + "kl_loss_7": 731.3478240966797, + "learning_rate": 0.0008189564396332927, + "loss": 1291.9086, + "step": 2870 + }, + { + "ce_loss_10": 3.480617916584015, + "ce_loss_13": 3.388473629951477, + "ce_loss_2": 4.600887513160705, + "ce_loss_3": 4.323178672790528, + "ce_loss_7": 3.7104127168655396, + "epoch": 0.288, + "grad_norm": 668.0, + "kl_loss_10": 212.88904190063477, + "kl_loss_2": 2441.765899658203, + "kl_loss_3": 1961.8893615722657, + "kl_loss_7": 728.4373413085938, + "learning_rate": 0.0008177329330524181, + "loss": 1342.4608, + "step": 2880 + }, + { + "ce_loss_10": 3.5435534834861757, + "ce_loss_13": 3.4502355217933656, + "ce_loss_2": 4.6120285987854, + "ce_loss_3": 4.346097040176391, + "ce_loss_7": 3.762561321258545, + "epoch": 0.289, + "grad_norm": 648.0, + "kl_loss_10": 212.22290649414063, + "kl_loss_2": 2358.1793823242188, + "kl_loss_3": 1890.9413208007813, + "kl_loss_7": 714.5174743652344, + "learning_rate": 0.0008165062269044352, + "loss": 1305.3231, + "step": 2890 + }, + { + "ce_loss_10": 3.4996484994888304, + "ce_loss_13": 3.401354455947876, + "ce_loss_2": 4.609268927574158, + "ce_loss_3": 4.3294067740440365, + "ce_loss_7": 3.723408377170563, + "epoch": 0.29, + "grad_norm": 660.0, + "kl_loss_10": 216.81241302490236, + "kl_loss_2": 2451.4824340820314, + "kl_loss_3": 1968.3146179199218, + "kl_loss_7": 729.5468353271484, + "learning_rate": 0.0008152763335422613, + "loss": 1337.7896, + "step": 2900 + }, + { + "ce_loss_10": 3.4890666246414184, + "ce_loss_13": 3.392501711845398, + "ce_loss_2": 4.58982219696045, + "ce_loss_3": 4.312074947357178, + "ce_loss_7": 3.713588225841522, + "epoch": 0.291, + "grad_norm": 664.0, + "kl_loss_10": 218.38675384521486, + "kl_loss_2": 2445.5037841796875, + "kl_loss_3": 1949.8568176269532, + "kl_loss_7": 729.6879028320312, + "learning_rate": 0.0008140432653509088, + "loss": 1317.595, + "step": 2910 + }, + { + "ce_loss_10": 3.538894033432007, + "ce_loss_13": 3.4391178250312806, + "ce_loss_2": 4.60951418876648, + "ce_loss_3": 4.337265026569367, + "ce_loss_7": 3.7542282700538636, + "epoch": 0.292, + "grad_norm": 576.0, + "kl_loss_10": 218.85857162475585, + "kl_loss_2": 2397.1072692871094, + "kl_loss_3": 1916.8259216308593, + "kl_loss_7": 718.4374481201172, + "learning_rate": 0.0008128070347473608, + "loss": 1302.2107, + "step": 2920 + }, + { + "ce_loss_10": 3.5429399847984313, + "ce_loss_13": 3.447796130180359, + "ce_loss_2": 4.665868854522705, + "ce_loss_3": 4.389448404312134, + "ce_loss_7": 3.7667205929756165, + "epoch": 0.293, + "grad_norm": 664.0, + "kl_loss_10": 216.54725646972656, + "kl_loss_2": 2487.7160583496093, + "kl_loss_3": 2004.9421325683593, + "kl_loss_7": 736.1060913085937, + "learning_rate": 0.0008115676541804455, + "loss": 1333.5637, + "step": 2930 + }, + { + "ce_loss_10": 3.5453550815582275, + "ce_loss_13": 3.4535977363586428, + "ce_loss_2": 4.623500943183899, + "ce_loss_3": 4.348728823661804, + "ce_loss_7": 3.760838878154755, + "epoch": 0.294, + "grad_norm": 580.0, + "kl_loss_10": 209.94191284179686, + "kl_loss_2": 2400.48662109375, + "kl_loss_3": 1909.5526062011718, + "kl_loss_7": 710.1752807617188, + "learning_rate": 0.0008103251361307119, + "loss": 1325.5172, + "step": 2940 + }, + { + "ce_loss_10": 3.578377163410187, + "ce_loss_13": 3.4808244347572326, + "ce_loss_2": 4.6591003894805905, + "ce_loss_3": 4.395820617675781, + "ce_loss_7": 3.793817377090454, + "epoch": 0.295, + "grad_norm": 616.0, + "kl_loss_10": 214.81473617553712, + "kl_loss_2": 2396.3223205566405, + "kl_loss_3": 1926.4922485351562, + "kl_loss_7": 722.0272766113281, + "learning_rate": 0.0008090794931103026, + "loss": 1300.3234, + "step": 2950 + }, + { + "ce_loss_10": 3.566417765617371, + "ce_loss_13": 3.475232172012329, + "ce_loss_2": 4.628555154800415, + "ce_loss_3": 4.358175444602966, + "ce_loss_7": 3.7831589698791506, + "epoch": 0.296, + "grad_norm": 692.0, + "kl_loss_10": 209.84390869140626, + "kl_loss_2": 2350.2305419921877, + "kl_loss_3": 1877.9652465820313, + "kl_loss_7": 713.7039794921875, + "learning_rate": 0.0008078307376628291, + "loss": 1303.6331, + "step": 2960 + }, + { + "ce_loss_10": 3.6232991099357603, + "ce_loss_13": 3.534627139568329, + "ce_loss_2": 4.6475036382675174, + "ce_loss_3": 4.389086437225342, + "ce_loss_7": 3.83059047460556, + "epoch": 0.297, + "grad_norm": 644.0, + "kl_loss_10": 205.1537940979004, + "kl_loss_2": 2274.82734375, + "kl_loss_3": 1823.2497436523438, + "kl_loss_7": 686.9072265625, + "learning_rate": 0.000806578882363245, + "loss": 1259.2264, + "step": 2970 + }, + { + "ce_loss_10": 3.536562275886536, + "ce_loss_13": 3.447048234939575, + "ce_loss_2": 4.597748541831971, + "ce_loss_3": 4.3311933994293215, + "ce_loss_7": 3.7559500217437742, + "epoch": 0.298, + "grad_norm": 736.0, + "kl_loss_10": 208.43729248046876, + "kl_loss_2": 2344.390216064453, + "kl_loss_3": 1878.6112243652344, + "kl_loss_7": 714.4485260009766, + "learning_rate": 0.0008053239398177191, + "loss": 1329.3172, + "step": 2980 + }, + { + "ce_loss_10": 3.524178981781006, + "ce_loss_13": 3.4312392354011534, + "ce_loss_2": 4.604809284210205, + "ce_loss_3": 4.337883043289184, + "ce_loss_7": 3.7429209470748903, + "epoch": 0.299, + "grad_norm": 684.0, + "kl_loss_10": 211.32650604248047, + "kl_loss_2": 2394.308056640625, + "kl_loss_3": 1917.52822265625, + "kl_loss_7": 709.9231262207031, + "learning_rate": 0.0008040659226635089, + "loss": 1341.8297, + "step": 2990 + }, + { + "ce_loss_10": 3.65326806306839, + "ce_loss_13": 3.555258011817932, + "ce_loss_2": 4.710744786262512, + "ce_loss_3": 4.444170761108398, + "ce_loss_7": 3.8668533086776735, + "epoch": 0.3, + "grad_norm": 640.0, + "kl_loss_10": 219.24570388793944, + "kl_loss_2": 2376.9404907226562, + "kl_loss_3": 1902.857159423828, + "kl_loss_7": 725.9926879882812, + "learning_rate": 0.0008028048435688333, + "loss": 1298.4502, + "step": 3000 + }, + { + "ce_loss_10": 3.521394634246826, + "ce_loss_13": 3.4270112991333006, + "ce_loss_2": 4.624356460571289, + "ce_loss_3": 4.355751609802246, + "ce_loss_7": 3.7494575500488283, + "epoch": 0.301, + "grad_norm": 716.0, + "kl_loss_10": 217.2972724914551, + "kl_loss_2": 2452.999304199219, + "kl_loss_3": 1985.1250549316405, + "kl_loss_7": 732.1629119873047, + "learning_rate": 0.0008015407152327448, + "loss": 1335.19, + "step": 3010 + }, + { + "ce_loss_10": 3.5699279427528383, + "ce_loss_13": 3.475005257129669, + "ce_loss_2": 4.65969865322113, + "ce_loss_3": 4.38304386138916, + "ce_loss_7": 3.784406042098999, + "epoch": 0.302, + "grad_norm": 620.0, + "kl_loss_10": 215.99359130859375, + "kl_loss_2": 2432.162463378906, + "kl_loss_3": 1951.8839721679688, + "kl_loss_7": 718.2368713378906, + "learning_rate": 0.0008002735503850016, + "loss": 1332.6505, + "step": 3020 + }, + { + "ce_loss_10": 3.4684691429138184, + "ce_loss_13": 3.367643666267395, + "ce_loss_2": 4.5924430847167965, + "ce_loss_3": 4.30932047367096, + "ce_loss_7": 3.6915883660316466, + "epoch": 0.303, + "grad_norm": 636.0, + "kl_loss_10": 224.01161422729493, + "kl_loss_2": 2494.453234863281, + "kl_loss_3": 2004.73359375, + "kl_loss_7": 736.367529296875, + "learning_rate": 0.0007990033617859396, + "loss": 1348.4062, + "step": 3030 + }, + { + "ce_loss_10": 3.5133005499839784, + "ce_loss_13": 3.417665791511536, + "ce_loss_2": 4.581400918960571, + "ce_loss_3": 4.318250679969788, + "ce_loss_7": 3.734131360054016, + "epoch": 0.304, + "grad_norm": 692.0, + "kl_loss_10": 218.55305099487305, + "kl_loss_2": 2367.1648193359374, + "kl_loss_3": 1894.6960754394531, + "kl_loss_7": 712.4279693603515, + "learning_rate": 0.000797730162226344, + "loss": 1274.1975, + "step": 3040 + }, + { + "ce_loss_10": 3.540754234790802, + "ce_loss_13": 3.4410573482513427, + "ce_loss_2": 4.607666325569153, + "ce_loss_3": 4.33906877040863, + "ce_loss_7": 3.76459002494812, + "epoch": 0.305, + "grad_norm": 692.0, + "kl_loss_10": 221.26933517456055, + "kl_loss_2": 2377.095458984375, + "kl_loss_3": 1910.9453735351562, + "kl_loss_7": 729.3416778564454, + "learning_rate": 0.0007964539645273203, + "loss": 1293.3233, + "step": 3050 + }, + { + "ce_loss_10": 3.549929714202881, + "ce_loss_13": 3.4547195076942443, + "ce_loss_2": 4.595946025848389, + "ce_loss_3": 4.332681286334991, + "ce_loss_7": 3.7608805656433106, + "epoch": 0.306, + "grad_norm": 608.0, + "kl_loss_10": 214.02068862915038, + "kl_loss_2": 2324.1172485351562, + "kl_loss_3": 1866.7198425292968, + "kl_loss_7": 705.0489013671875, + "learning_rate": 0.000795174781540165, + "loss": 1301.7614, + "step": 3060 + }, + { + "ce_loss_10": 3.626460921764374, + "ce_loss_13": 3.5295538663864137, + "ce_loss_2": 4.639704465866089, + "ce_loss_3": 4.383927941322327, + "ce_loss_7": 3.8362658858299254, + "epoch": 0.307, + "grad_norm": 644.0, + "kl_loss_10": 215.03676071166993, + "kl_loss_2": 2264.9541259765624, + "kl_loss_3": 1824.0037841796875, + "kl_loss_7": 696.3071411132812, + "learning_rate": 0.0007938926261462366, + "loss": 1288.9521, + "step": 3070 + }, + { + "ce_loss_10": 3.5775561928749084, + "ce_loss_13": 3.480459380149841, + "ce_loss_2": 4.618080592155456, + "ce_loss_3": 4.350315952301026, + "ce_loss_7": 3.7854344248771667, + "epoch": 0.308, + "grad_norm": 648.0, + "kl_loss_10": 216.656893157959, + "kl_loss_2": 2357.475067138672, + "kl_loss_3": 1888.0174133300782, + "kl_loss_7": 712.7872009277344, + "learning_rate": 0.0007926075112568258, + "loss": 1316.9054, + "step": 3080 + }, + { + "ce_loss_10": 3.5692449688911436, + "ce_loss_13": 3.4759126543998717, + "ce_loss_2": 4.623606491088867, + "ce_loss_3": 4.366301465034485, + "ce_loss_7": 3.78162659406662, + "epoch": 0.309, + "grad_norm": 560.0, + "kl_loss_10": 213.1074462890625, + "kl_loss_2": 2357.0720764160155, + "kl_loss_3": 1902.1767517089843, + "kl_loss_7": 709.6952423095703, + "learning_rate": 0.0007913194498130252, + "loss": 1281.0172, + "step": 3090 + }, + { + "ce_loss_10": 3.494074010848999, + "ce_loss_13": 3.400245749950409, + "ce_loss_2": 4.5784650325775145, + "ce_loss_3": 4.316486406326294, + "ce_loss_7": 3.7143809318542482, + "epoch": 0.31, + "grad_norm": 736.0, + "kl_loss_10": 216.9530891418457, + "kl_loss_2": 2388.186309814453, + "kl_loss_3": 1924.5094665527345, + "kl_loss_7": 718.4751098632812, + "learning_rate": 0.0007900284547855992, + "loss": 1312.7211, + "step": 3100 + }, + { + "ce_loss_10": 3.5040755391120912, + "ce_loss_13": 3.409269428253174, + "ce_loss_2": 4.549410009384156, + "ce_loss_3": 4.294600343704223, + "ce_loss_7": 3.7213049054145815, + "epoch": 0.311, + "grad_norm": 800.0, + "kl_loss_10": 210.81134338378905, + "kl_loss_2": 2329.0393676757812, + "kl_loss_3": 1876.9636657714843, + "kl_loss_7": 708.2128143310547, + "learning_rate": 0.0007887345391748532, + "loss": 1312.8156, + "step": 3110 + }, + { + "ce_loss_10": 3.634432864189148, + "ce_loss_13": 3.543325686454773, + "ce_loss_2": 4.651146030426025, + "ce_loss_3": 4.387193036079407, + "ce_loss_7": 3.8459346532821654, + "epoch": 0.312, + "grad_norm": 1168.0, + "kl_loss_10": 212.2933433532715, + "kl_loss_2": 2284.2329711914062, + "kl_loss_3": 1829.6757873535157, + "kl_loss_7": 706.4437377929687, + "learning_rate": 0.0007874377160105036, + "loss": 1259.3671, + "step": 3120 + }, + { + "ce_loss_10": 3.530054819583893, + "ce_loss_13": 3.4342761754989626, + "ce_loss_2": 4.628887629508972, + "ce_loss_3": 4.362399673461914, + "ce_loss_7": 3.7490867018699645, + "epoch": 0.313, + "grad_norm": 608.0, + "kl_loss_10": 212.55482711791993, + "kl_loss_2": 2429.394366455078, + "kl_loss_3": 1971.4875915527343, + "kl_loss_7": 728.8083862304687, + "learning_rate": 0.0007861379983515449, + "loss": 1354.4891, + "step": 3130 + }, + { + "ce_loss_10": 3.6109140157699584, + "ce_loss_13": 3.5200807809829713, + "ce_loss_2": 4.655977535247803, + "ce_loss_3": 4.39032473564148, + "ce_loss_7": 3.831193280220032, + "epoch": 0.314, + "grad_norm": 592.0, + "kl_loss_10": 209.2868881225586, + "kl_loss_2": 2336.8374755859377, + "kl_loss_3": 1868.733642578125, + "kl_loss_7": 717.5943817138672, + "learning_rate": 0.0007848353992861195, + "loss": 1273.946, + "step": 3140 + }, + { + "ce_loss_10": 3.6957940101623534, + "ce_loss_13": 3.595130515098572, + "ce_loss_2": 4.7389120101928714, + "ce_loss_3": 4.469233250617981, + "ce_loss_7": 3.926785933971405, + "epoch": 0.315, + "grad_norm": 888.0, + "kl_loss_10": 223.79472427368165, + "kl_loss_2": 2334.7629638671874, + "kl_loss_3": 1867.655010986328, + "kl_loss_7": 743.88798828125, + "learning_rate": 0.0007835299319313853, + "loss": 1303.1903, + "step": 3150 + }, + { + "ce_loss_10": 3.5704684495925902, + "ce_loss_13": 3.476880931854248, + "ce_loss_2": 4.606448101997375, + "ce_loss_3": 4.3400969982147215, + "ce_loss_7": 3.7886768341064454, + "epoch": 0.316, + "grad_norm": 700.0, + "kl_loss_10": 211.18966979980468, + "kl_loss_2": 2323.449572753906, + "kl_loss_3": 1851.584783935547, + "kl_loss_7": 721.3533996582031, + "learning_rate": 0.0007822216094333848, + "loss": 1322.3376, + "step": 3160 + }, + { + "ce_loss_10": 3.5810484290122986, + "ce_loss_13": 3.4873368740081787, + "ce_loss_2": 4.65300440788269, + "ce_loss_3": 4.387210464477539, + "ce_loss_7": 3.807372546195984, + "epoch": 0.317, + "grad_norm": 752.0, + "kl_loss_10": 212.44315567016602, + "kl_loss_2": 2384.089465332031, + "kl_loss_3": 1914.7229309082031, + "kl_loss_7": 731.2730682373046, + "learning_rate": 0.0007809104449669101, + "loss": 1294.9703, + "step": 3170 + }, + { + "ce_loss_10": 3.532199835777283, + "ce_loss_13": 3.4395654439926147, + "ce_loss_2": 4.584282898902893, + "ce_loss_3": 4.30876350402832, + "ce_loss_7": 3.7615070223808287, + "epoch": 0.318, + "grad_norm": 916.0, + "kl_loss_10": 207.75176467895508, + "kl_loss_2": 2339.527239990234, + "kl_loss_3": 1858.5612731933593, + "kl_loss_7": 730.9345184326172, + "learning_rate": 0.0007795964517353734, + "loss": 1278.7686, + "step": 3180 + }, + { + "ce_loss_10": 3.518466317653656, + "ce_loss_13": 3.426977741718292, + "ce_loss_2": 4.596842670440674, + "ce_loss_3": 4.325531184673309, + "ce_loss_7": 3.750142526626587, + "epoch": 0.319, + "grad_norm": 648.0, + "kl_loss_10": 211.74872894287108, + "kl_loss_2": 2403.7151733398437, + "kl_loss_3": 1931.1865478515624, + "kl_loss_7": 753.2544128417969, + "learning_rate": 0.000778279642970672, + "loss": 1282.6858, + "step": 3190 + }, + { + "ce_loss_10": 3.5179845094680786, + "ce_loss_13": 3.428935539722443, + "ce_loss_2": 4.562283158302307, + "ce_loss_3": 4.295236802101135, + "ce_loss_7": 3.7340755701065063, + "epoch": 0.32, + "grad_norm": 904.0, + "kl_loss_10": 205.840421295166, + "kl_loss_2": 2345.048876953125, + "kl_loss_3": 1866.877911376953, + "kl_loss_7": 720.2934661865235, + "learning_rate": 0.0007769600319330552, + "loss": 1264.9217, + "step": 3200 + }, + { + "ce_loss_10": 3.554915177822113, + "ce_loss_13": 3.466732156276703, + "ce_loss_2": 4.653983449935913, + "ce_loss_3": 4.384042191505432, + "ce_loss_7": 3.7919634103775026, + "epoch": 0.321, + "grad_norm": 708.0, + "kl_loss_10": 205.94034423828126, + "kl_loss_2": 2414.151336669922, + "kl_loss_3": 1938.7024047851562, + "kl_loss_7": 735.4687530517579, + "learning_rate": 0.0007756376319109917, + "loss": 1299.3125, + "step": 3210 + }, + { + "ce_loss_10": 3.601811099052429, + "ce_loss_13": 3.513204276561737, + "ce_loss_2": 4.643903732299805, + "ce_loss_3": 4.372084999084473, + "ce_loss_7": 3.82387717962265, + "epoch": 0.322, + "grad_norm": 856.0, + "kl_loss_10": 205.92393646240234, + "kl_loss_2": 2310.6433349609374, + "kl_loss_3": 1837.0882263183594, + "kl_loss_7": 727.0832824707031, + "learning_rate": 0.0007743124562210351, + "loss": 1252.0768, + "step": 3220 + }, + { + "ce_loss_10": 3.613737678527832, + "ce_loss_13": 3.5243070006370543, + "ce_loss_2": 4.6432843685150145, + "ce_loss_3": 4.373880839347839, + "ce_loss_7": 3.835604417324066, + "epoch": 0.323, + "grad_norm": 804.0, + "kl_loss_10": 206.8451774597168, + "kl_loss_2": 2302.7849548339846, + "kl_loss_3": 1831.265850830078, + "kl_loss_7": 718.6182281494141, + "learning_rate": 0.0007729845182076895, + "loss": 1281.717, + "step": 3230 + }, + { + "ce_loss_10": 3.54460072517395, + "ce_loss_13": 3.458022344112396, + "ce_loss_2": 4.567643523216248, + "ce_loss_3": 4.304308319091797, + "ce_loss_7": 3.7557874441146852, + "epoch": 0.324, + "grad_norm": 780.0, + "kl_loss_10": 202.53059158325195, + "kl_loss_2": 2275.4922607421877, + "kl_loss_3": 1814.183154296875, + "kl_loss_7": 706.1778442382813, + "learning_rate": 0.0007716538312432765, + "loss": 1299.5142, + "step": 3240 + }, + { + "ce_loss_10": 3.5034128069877624, + "ce_loss_13": 3.4109013199806215, + "ce_loss_2": 4.59195454120636, + "ce_loss_3": 4.316867542266846, + "ce_loss_7": 3.7340264201164244, + "epoch": 0.325, + "grad_norm": 620.0, + "kl_loss_10": 212.30709838867188, + "kl_loss_2": 2399.0862243652346, + "kl_loss_3": 1912.8944152832032, + "kl_loss_7": 738.4792083740234, + "learning_rate": 0.0007703204087277988, + "loss": 1308.0572, + "step": 3250 + }, + { + "ce_loss_10": 3.60279586315155, + "ce_loss_13": 3.5141580939292907, + "ce_loss_2": 4.619040894508362, + "ce_loss_3": 4.348745739459991, + "ce_loss_7": 3.81355699300766, + "epoch": 0.326, + "grad_norm": 728.0, + "kl_loss_10": 202.46756286621093, + "kl_loss_2": 2248.315087890625, + "kl_loss_3": 1773.8885681152344, + "kl_loss_7": 686.2587005615235, + "learning_rate": 0.0007689842640888063, + "loss": 1245.8748, + "step": 3260 + }, + { + "ce_loss_10": 3.6051684260368346, + "ce_loss_13": 3.5150891542434692, + "ce_loss_2": 4.619964861869812, + "ce_loss_3": 4.360685467720032, + "ce_loss_7": 3.8154699206352234, + "epoch": 0.327, + "grad_norm": 684.0, + "kl_loss_10": 208.96502685546875, + "kl_loss_2": 2265.3287658691406, + "kl_loss_3": 1811.821759033203, + "kl_loss_7": 703.9406219482422, + "learning_rate": 0.0007676454107812607, + "loss": 1264.3093, + "step": 3270 + }, + { + "ce_loss_10": 3.537815499305725, + "ce_loss_13": 3.444960331916809, + "ce_loss_2": 4.608094549179077, + "ce_loss_3": 4.3397119522094725, + "ce_loss_7": 3.7506736159324645, + "epoch": 0.328, + "grad_norm": 616.0, + "kl_loss_10": 211.32426528930665, + "kl_loss_2": 2388.8077392578125, + "kl_loss_3": 1915.7237182617187, + "kl_loss_7": 707.672705078125, + "learning_rate": 0.0007663038622873999, + "loss": 1279.8335, + "step": 3280 + }, + { + "ce_loss_10": 3.574945878982544, + "ce_loss_13": 3.4833101868629455, + "ce_loss_2": 4.628302264213562, + "ce_loss_3": 4.366952037811279, + "ce_loss_7": 3.784594464302063, + "epoch": 0.329, + "grad_norm": 596.0, + "kl_loss_10": 211.56422576904296, + "kl_loss_2": 2351.5840576171877, + "kl_loss_3": 1879.0997131347656, + "kl_loss_7": 694.7927307128906, + "learning_rate": 0.0007649596321166025, + "loss": 1256.8023, + "step": 3290 + }, + { + "ce_loss_10": 3.4788912653923036, + "ce_loss_13": 3.3914729714393617, + "ce_loss_2": 4.513116896152496, + "ce_loss_3": 4.253373873233795, + "ce_loss_7": 3.6928447008132936, + "epoch": 0.33, + "grad_norm": 600.0, + "kl_loss_10": 203.03155212402345, + "kl_loss_2": 2285.9035522460936, + "kl_loss_3": 1828.6068603515625, + "kl_loss_7": 691.3944427490235, + "learning_rate": 0.0007636127338052513, + "loss": 1273.8033, + "step": 3300 + }, + { + "ce_loss_10": 3.5868964433670043, + "ce_loss_13": 3.49528044462204, + "ce_loss_2": 4.663711452484131, + "ce_loss_3": 4.39831657409668, + "ce_loss_7": 3.804142189025879, + "epoch": 0.331, + "grad_norm": 624.0, + "kl_loss_10": 211.30522232055665, + "kl_loss_2": 2397.5427856445312, + "kl_loss_3": 1927.950506591797, + "kl_loss_7": 706.7799133300781, + "learning_rate": 0.0007622631809165971, + "loss": 1277.9496, + "step": 3310 + }, + { + "ce_loss_10": 3.582921600341797, + "ce_loss_13": 3.4965414881706236, + "ce_loss_2": 4.58232958316803, + "ce_loss_3": 4.323147928714752, + "ce_loss_7": 3.783066177368164, + "epoch": 0.332, + "grad_norm": 688.0, + "kl_loss_10": 197.47354049682616, + "kl_loss_2": 2216.269598388672, + "kl_loss_3": 1760.1223999023437, + "kl_loss_7": 664.7514923095703, + "learning_rate": 0.000760910987040623, + "loss": 1245.9068, + "step": 3320 + }, + { + "ce_loss_10": 3.5663990497589113, + "ce_loss_13": 3.474509632587433, + "ce_loss_2": 4.641107606887817, + "ce_loss_3": 4.369283008575439, + "ce_loss_7": 3.78259996175766, + "epoch": 0.333, + "grad_norm": 616.0, + "kl_loss_10": 210.06242904663085, + "kl_loss_2": 2402.0831298828125, + "kl_loss_3": 1926.8569946289062, + "kl_loss_7": 714.4321014404297, + "learning_rate": 0.000759556165793906, + "loss": 1272.2351, + "step": 3330 + }, + { + "ce_loss_10": 3.5859936118125915, + "ce_loss_13": 3.4947105884552, + "ce_loss_2": 4.635438013076782, + "ce_loss_3": 4.3713214635849, + "ce_loss_7": 3.79847708940506, + "epoch": 0.334, + "grad_norm": 600.0, + "kl_loss_10": 207.2735107421875, + "kl_loss_2": 2336.860705566406, + "kl_loss_3": 1864.5109497070312, + "kl_loss_7": 698.3573028564454, + "learning_rate": 0.000758198730819481, + "loss": 1291.4691, + "step": 3340 + }, + { + "ce_loss_10": 3.530641829967499, + "ce_loss_13": 3.44451619386673, + "ce_loss_2": 4.589142799377441, + "ce_loss_3": 4.3217404961586, + "ce_loss_7": 3.7362788200378416, + "epoch": 0.335, + "grad_norm": 624.0, + "kl_loss_10": 202.07082290649413, + "kl_loss_2": 2360.776690673828, + "kl_loss_3": 1886.3787536621094, + "kl_loss_7": 695.4340698242188, + "learning_rate": 0.0007568386957867032, + "loss": 1283.006, + "step": 3350 + }, + { + "ce_loss_10": 3.6058520078659058, + "ce_loss_13": 3.5129651188850404, + "ce_loss_2": 4.643417167663574, + "ce_loss_3": 4.37364354133606, + "ce_loss_7": 3.813083219528198, + "epoch": 0.336, + "grad_norm": 784.0, + "kl_loss_10": 207.97874145507814, + "kl_loss_2": 2295.3684020996093, + "kl_loss_3": 1825.3164672851562, + "kl_loss_7": 687.7690948486328, + "learning_rate": 0.0007554760743911103, + "loss": 1276.5395, + "step": 3360 + }, + { + "ce_loss_10": 3.5018799662590028, + "ce_loss_13": 3.4133763194084166, + "ce_loss_2": 4.551569533348084, + "ce_loss_3": 4.283941590785981, + "ce_loss_7": 3.704894995689392, + "epoch": 0.337, + "grad_norm": 644.0, + "kl_loss_10": 201.99492797851562, + "kl_loss_2": 2352.8791320800783, + "kl_loss_3": 1880.3864440917969, + "kl_loss_7": 682.2205291748047, + "learning_rate": 0.0007541108803542846, + "loss": 1306.1851, + "step": 3370 + }, + { + "ce_loss_10": 3.5562949419021606, + "ce_loss_13": 3.467681646347046, + "ce_loss_2": 4.61066963672638, + "ce_loss_3": 4.3377085566520694, + "ce_loss_7": 3.7622047662734985, + "epoch": 0.338, + "grad_norm": 632.0, + "kl_loss_10": 205.39780044555664, + "kl_loss_2": 2363.417413330078, + "kl_loss_3": 1877.7716186523437, + "kl_loss_7": 681.8937408447266, + "learning_rate": 0.0007527431274237149, + "loss": 1343.544, + "step": 3380 + }, + { + "ce_loss_10": 3.5283817052841187, + "ce_loss_13": 3.4397483229637147, + "ce_loss_2": 4.570840525627136, + "ce_loss_3": 4.304589962959289, + "ce_loss_7": 3.7283903479576113, + "epoch": 0.339, + "grad_norm": 572.0, + "kl_loss_10": 203.6351058959961, + "kl_loss_2": 2336.9517456054687, + "kl_loss_3": 1865.9661682128906, + "kl_loss_7": 677.4322174072265, + "learning_rate": 0.0007513728293726579, + "loss": 1277.8105, + "step": 3390 + }, + { + "ce_loss_10": 3.644584619998932, + "ce_loss_13": 3.556074547767639, + "ce_loss_2": 4.664924669265747, + "ce_loss_3": 4.399448752403259, + "ce_loss_7": 3.8509042620658875, + "epoch": 0.34, + "grad_norm": 644.0, + "kl_loss_10": 203.6712448120117, + "kl_loss_2": 2293.6278381347656, + "kl_loss_3": 1826.5482238769532, + "kl_loss_7": 683.2333282470703, + "learning_rate": 0.00075, + "loss": 1246.8078, + "step": 3400 + }, + { + "ce_loss_10": 3.6313098788261415, + "ce_loss_13": 3.5418556571006774, + "ce_loss_2": 4.690052318572998, + "ce_loss_3": 4.416153597831726, + "ce_loss_7": 3.843977117538452, + "epoch": 0.341, + "grad_norm": 644.0, + "kl_loss_10": 205.15843811035157, + "kl_loss_2": 2335.1500244140625, + "kl_loss_3": 1861.3693969726562, + "kl_loss_7": 693.7598510742188, + "learning_rate": 0.0007486246531301177, + "loss": 1258.7575, + "step": 3410 + }, + { + "ce_loss_10": 3.443863534927368, + "ce_loss_13": 3.3510751008987425, + "ce_loss_2": 4.5022605657577515, + "ce_loss_3": 4.230472648143769, + "ce_loss_7": 3.6559174418449403, + "epoch": 0.342, + "grad_norm": 664.0, + "kl_loss_10": 202.74062805175782, + "kl_loss_2": 2345.9876220703127, + "kl_loss_3": 1864.947119140625, + "kl_loss_7": 688.1812042236328, + "learning_rate": 0.0007472468026127384, + "loss": 1260.6121, + "step": 3420 + }, + { + "ce_loss_10": 3.5721543431282043, + "ce_loss_13": 3.4770930409431458, + "ce_loss_2": 4.665278792381287, + "ce_loss_3": 4.404071187973022, + "ce_loss_7": 3.788026750087738, + "epoch": 0.343, + "grad_norm": 712.0, + "kl_loss_10": 214.17584533691405, + "kl_loss_2": 2439.352404785156, + "kl_loss_3": 1972.9944885253906, + "kl_loss_7": 720.6741455078125, + "learning_rate": 0.000745866462322802, + "loss": 1320.8363, + "step": 3430 + }, + { + "ce_loss_10": 3.560039293766022, + "ce_loss_13": 3.4741207122802735, + "ce_loss_2": 4.592786359786987, + "ce_loss_3": 4.334179782867432, + "ce_loss_7": 3.7690793752670286, + "epoch": 0.344, + "grad_norm": 700.0, + "kl_loss_10": 200.53601684570313, + "kl_loss_2": 2283.709338378906, + "kl_loss_3": 1835.7159240722656, + "kl_loss_7": 673.5906463623047, + "learning_rate": 0.0007444836461603195, + "loss": 1261.9196, + "step": 3440 + }, + { + "ce_loss_10": 3.6245110511779783, + "ce_loss_13": 3.5312341451644897, + "ce_loss_2": 4.6719811201095585, + "ce_loss_3": 4.406914234161377, + "ce_loss_7": 3.8266146540641786, + "epoch": 0.345, + "grad_norm": 648.0, + "kl_loss_10": 214.16654891967772, + "kl_loss_2": 2362.7242797851563, + "kl_loss_3": 1898.2087890625, + "kl_loss_7": 704.3192626953125, + "learning_rate": 0.0007430983680502344, + "loss": 1301.0338, + "step": 3450 + }, + { + "ce_loss_10": 3.4667457938194275, + "ce_loss_13": 3.377876877784729, + "ce_loss_2": 4.545617830753327, + "ce_loss_3": 4.272857880592346, + "ce_loss_7": 3.6741854548454285, + "epoch": 0.346, + "grad_norm": 608.0, + "kl_loss_10": 206.17358779907227, + "kl_loss_2": 2388.203955078125, + "kl_loss_3": 1909.6425415039062, + "kl_loss_7": 697.4611602783203, + "learning_rate": 0.0007417106419422819, + "loss": 1290.2338, + "step": 3460 + }, + { + "ce_loss_10": 3.571521496772766, + "ce_loss_13": 3.4770392775535583, + "ce_loss_2": 4.614993333816528, + "ce_loss_3": 4.345528078079224, + "ce_loss_7": 3.7798440217971803, + "epoch": 0.347, + "grad_norm": 656.0, + "kl_loss_10": 204.45724334716797, + "kl_loss_2": 2308.5895385742188, + "kl_loss_3": 1833.5083312988281, + "kl_loss_7": 683.0788604736329, + "learning_rate": 0.0007403204818108486, + "loss": 1275.3799, + "step": 3470 + }, + { + "ce_loss_10": 3.5445627093315126, + "ce_loss_13": 3.4533625841140747, + "ce_loss_2": 4.6031595230102536, + "ce_loss_3": 4.338364768028259, + "ce_loss_7": 3.746653878688812, + "epoch": 0.348, + "grad_norm": 576.0, + "kl_loss_10": 208.29350357055665, + "kl_loss_2": 2371.4946899414062, + "kl_loss_3": 1909.6997863769532, + "kl_loss_7": 686.1026062011719, + "learning_rate": 0.0007389279016548316, + "loss": 1247.1171, + "step": 3480 + }, + { + "ce_loss_10": 3.553533661365509, + "ce_loss_13": 3.458456254005432, + "ce_loss_2": 4.6566637516021725, + "ce_loss_3": 4.37568781375885, + "ce_loss_7": 3.7642048597335815, + "epoch": 0.349, + "grad_norm": 684.0, + "kl_loss_10": 212.6401054382324, + "kl_loss_2": 2451.175671386719, + "kl_loss_3": 1951.6068969726562, + "kl_loss_7": 702.7821899414063, + "learning_rate": 0.0007375329154974975, + "loss": 1307.9424, + "step": 3490 + }, + { + "ce_loss_10": 3.5084131717681886, + "ce_loss_13": 3.4206284284591675, + "ce_loss_2": 4.546359324455262, + "ce_loss_3": 4.28377673625946, + "ce_loss_7": 3.7158578753471376, + "epoch": 0.35, + "grad_norm": 676.0, + "kl_loss_10": 208.94808502197264, + "kl_loss_2": 2307.420263671875, + "kl_loss_3": 1848.263897705078, + "kl_loss_7": 683.7157653808594, + "learning_rate": 0.0007361355373863414, + "loss": 1294.9244, + "step": 3500 + }, + { + "ce_loss_10": 3.563704586029053, + "ce_loss_13": 3.471196401119232, + "ce_loss_2": 4.5938108444213865, + "ce_loss_3": 4.332852721214294, + "ce_loss_7": 3.7703267097473145, + "epoch": 0.351, + "grad_norm": 580.0, + "kl_loss_10": 208.09024658203126, + "kl_loss_2": 2287.9739318847655, + "kl_loss_3": 1828.5689758300782, + "kl_loss_7": 673.030307006836, + "learning_rate": 0.0007347357813929454, + "loss": 1287.6393, + "step": 3510 + }, + { + "ce_loss_10": 3.5099044919013975, + "ce_loss_13": 3.419321870803833, + "ce_loss_2": 4.543364262580871, + "ce_loss_3": 4.27359983921051, + "ce_loss_7": 3.7119598269462584, + "epoch": 0.352, + "grad_norm": 620.0, + "kl_loss_10": 207.49536819458007, + "kl_loss_2": 2274.5087951660157, + "kl_loss_3": 1817.6334594726563, + "kl_loss_7": 673.7456726074219, + "learning_rate": 0.0007333336616128369, + "loss": 1275.3445, + "step": 3520 + }, + { + "ce_loss_10": 3.4893477082252504, + "ce_loss_13": 3.394596815109253, + "ce_loss_2": 4.558405804634094, + "ce_loss_3": 4.2951094031333925, + "ce_loss_7": 3.69784619808197, + "epoch": 0.353, + "grad_norm": 636.0, + "kl_loss_10": 211.34491577148438, + "kl_loss_2": 2368.5742614746096, + "kl_loss_3": 1904.588153076172, + "kl_loss_7": 699.6206512451172, + "learning_rate": 0.0007319291921653463, + "loss": 1290.8219, + "step": 3530 + }, + { + "ce_loss_10": 3.5741103887557983, + "ce_loss_13": 3.4787994265556335, + "ce_loss_2": 4.633104467391968, + "ce_loss_3": 4.363708543777466, + "ce_loss_7": 3.7883455634117125, + "epoch": 0.354, + "grad_norm": 688.0, + "kl_loss_10": 211.75357818603516, + "kl_loss_2": 2353.7052368164063, + "kl_loss_3": 1872.2497680664062, + "kl_loss_7": 696.9233825683593, + "learning_rate": 0.0007305223871934656, + "loss": 1261.161, + "step": 3540 + }, + { + "ce_loss_10": 3.53648921251297, + "ce_loss_13": 3.4479789614677427, + "ce_loss_2": 4.58404312133789, + "ce_loss_3": 4.315784668922424, + "ce_loss_7": 3.7411927938461305, + "epoch": 0.355, + "grad_norm": 644.0, + "kl_loss_10": 204.81159896850585, + "kl_loss_2": 2318.000451660156, + "kl_loss_3": 1841.0889282226562, + "kl_loss_7": 674.2556213378906, + "learning_rate": 0.0007291132608637052, + "loss": 1261.7902, + "step": 3550 + }, + { + "ce_loss_10": 3.4981685996055605, + "ce_loss_13": 3.4104817390441893, + "ce_loss_2": 4.630524325370788, + "ce_loss_3": 4.356097209453583, + "ce_loss_7": 3.7044720530509947, + "epoch": 0.356, + "grad_norm": 596.0, + "kl_loss_10": 201.21338653564453, + "kl_loss_2": 2484.7927124023436, + "kl_loss_3": 2010.760675048828, + "kl_loss_7": 676.6305114746094, + "learning_rate": 0.0007277018273659516, + "loss": 1327.9727, + "step": 3560 + }, + { + "ce_loss_10": 3.625146007537842, + "ce_loss_13": 3.531074047088623, + "ce_loss_2": 4.6699333667755125, + "ce_loss_3": 4.4035911679267885, + "ce_loss_7": 3.836405646800995, + "epoch": 0.357, + "grad_norm": 620.0, + "kl_loss_10": 209.5035614013672, + "kl_loss_2": 2341.261669921875, + "kl_loss_3": 1873.83955078125, + "kl_loss_7": 701.3334136962891, + "learning_rate": 0.0007262881009133242, + "loss": 1275.0637, + "step": 3570 + }, + { + "ce_loss_10": 3.5401904344558717, + "ce_loss_13": 3.4541720032691954, + "ce_loss_2": 4.572478699684143, + "ce_loss_3": 4.315898811817169, + "ce_loss_7": 3.7435639023780825, + "epoch": 0.358, + "grad_norm": 616.0, + "kl_loss_10": 201.09010009765626, + "kl_loss_2": 2313.080432128906, + "kl_loss_3": 1855.7528381347656, + "kl_loss_7": 673.6273254394531, + "learning_rate": 0.0007248720957420329, + "loss": 1252.028, + "step": 3580 + }, + { + "ce_loss_10": 3.55083909034729, + "ce_loss_13": 3.466167140007019, + "ce_loss_2": 4.594191384315491, + "ce_loss_3": 4.320683646202087, + "ce_loss_7": 3.750420665740967, + "epoch": 0.359, + "grad_norm": 592.0, + "kl_loss_10": 201.2973388671875, + "kl_loss_2": 2304.7897521972654, + "kl_loss_3": 1830.9858337402343, + "kl_loss_7": 667.1753356933593, + "learning_rate": 0.0007234538261112341, + "loss": 1305.9864, + "step": 3590 + }, + { + "ce_loss_10": 3.5870068073272705, + "ce_loss_13": 3.4963618993759153, + "ce_loss_2": 4.64858865737915, + "ce_loss_3": 4.383180546760559, + "ce_loss_7": 3.7978907346725466, + "epoch": 0.36, + "grad_norm": 580.0, + "kl_loss_10": 206.438224029541, + "kl_loss_2": 2350.6687255859374, + "kl_loss_3": 1879.8920471191407, + "kl_loss_7": 691.6597686767578, + "learning_rate": 0.0007220333063028871, + "loss": 1262.9199, + "step": 3600 + }, + { + "ce_loss_10": 3.617115843296051, + "ce_loss_13": 3.5226447105407717, + "ce_loss_2": 4.7090448379516605, + "ce_loss_3": 4.440777349472046, + "ce_loss_7": 3.927542436122894, + "epoch": 0.361, + "grad_norm": 700.0, + "kl_loss_10": 215.82505798339844, + "kl_loss_2": 2451.148791503906, + "kl_loss_3": 1982.2255798339843, + "kl_loss_7": 911.9942932128906, + "learning_rate": 0.0007206105506216106, + "loss": 1351.6553, + "step": 3610 + }, + { + "ce_loss_10": 3.4991318702697756, + "ce_loss_13": 3.4086315035820007, + "ce_loss_2": 4.517360043525696, + "ce_loss_3": 4.2627614617347716, + "ce_loss_7": 3.7236221551895143, + "epoch": 0.362, + "grad_norm": 836.0, + "kl_loss_10": 210.4427146911621, + "kl_loss_2": 2266.5427673339846, + "kl_loss_3": 1820.741632080078, + "kl_loss_7": 721.7343872070312, + "learning_rate": 0.0007191855733945387, + "loss": 1249.7904, + "step": 3620 + }, + { + "ce_loss_10": 3.5907591581344604, + "ce_loss_13": 3.5010143160820006, + "ce_loss_2": 4.622481203079223, + "ce_loss_3": 4.356601357460022, + "ce_loss_7": 3.79650160074234, + "epoch": 0.363, + "grad_norm": 572.0, + "kl_loss_10": 204.04148864746094, + "kl_loss_2": 2289.9740478515623, + "kl_loss_3": 1818.431005859375, + "kl_loss_7": 692.2773986816406, + "learning_rate": 0.0007177583889711762, + "loss": 1250.2074, + "step": 3630 + }, + { + "ce_loss_10": 3.5057953119277956, + "ce_loss_13": 3.411814069747925, + "ce_loss_2": 4.563321113586426, + "ce_loss_3": 4.290196192264557, + "ce_loss_7": 3.7388221859931945, + "epoch": 0.364, + "grad_norm": 952.0, + "kl_loss_10": 206.33124389648438, + "kl_loss_2": 2346.8802978515623, + "kl_loss_3": 1869.4090270996094, + "kl_loss_7": 731.2307403564453, + "learning_rate": 0.0007163290117232541, + "loss": 1286.6971, + "step": 3640 + }, + { + "ce_loss_10": 3.625288701057434, + "ce_loss_13": 3.5375028014183045, + "ce_loss_2": 4.612985825538635, + "ce_loss_3": 4.35078866481781, + "ce_loss_7": 3.820189893245697, + "epoch": 0.365, + "grad_norm": 676.0, + "kl_loss_10": 199.10545043945314, + "kl_loss_2": 2225.872174072266, + "kl_loss_3": 1769.0750122070312, + "kl_loss_7": 679.4105773925781, + "learning_rate": 0.0007148974560445859, + "loss": 1248.8154, + "step": 3650 + }, + { + "ce_loss_10": 3.5454740643501284, + "ce_loss_13": 3.458422267436981, + "ce_loss_2": 4.550081968307495, + "ce_loss_3": 4.287899553775787, + "ce_loss_7": 3.7501559376716616, + "epoch": 0.366, + "grad_norm": 588.0, + "kl_loss_10": 198.9521499633789, + "kl_loss_2": 2238.7438537597654, + "kl_loss_3": 1775.3712280273437, + "kl_loss_7": 679.886441040039, + "learning_rate": 0.0007134637363509209, + "loss": 1224.5007, + "step": 3660 + }, + { + "ce_loss_10": 3.6524737238883973, + "ce_loss_13": 3.566513454914093, + "ce_loss_2": 4.64628803730011, + "ce_loss_3": 4.383014440536499, + "ce_loss_7": 3.85452960729599, + "epoch": 0.367, + "grad_norm": 804.0, + "kl_loss_10": 195.8438636779785, + "kl_loss_2": 2202.7541381835936, + "kl_loss_3": 1744.0477172851563, + "kl_loss_7": 669.0114837646485, + "learning_rate": 0.0007120278670798009, + "loss": 1241.0041, + "step": 3670 + }, + { + "ce_loss_10": 3.451411759853363, + "ce_loss_13": 3.362742209434509, + "ce_loss_2": 4.563682770729065, + "ce_loss_3": 4.3000654697418215, + "ce_loss_7": 3.6735877275466917, + "epoch": 0.368, + "grad_norm": 692.0, + "kl_loss_10": 207.53860931396486, + "kl_loss_2": 2452.550360107422, + "kl_loss_3": 1973.4927856445313, + "kl_loss_7": 716.8897491455078, + "learning_rate": 0.0007105898626904133, + "loss": 1338.2093, + "step": 3680 + }, + { + "ce_loss_10": 3.5554641008377077, + "ce_loss_13": 3.4662737131118773, + "ce_loss_2": 4.611227035522461, + "ce_loss_3": 4.340918231010437, + "ce_loss_7": 3.768160092830658, + "epoch": 0.369, + "grad_norm": 564.0, + "kl_loss_10": 202.72654418945314, + "kl_loss_2": 2339.7893005371093, + "kl_loss_3": 1860.5950439453125, + "kl_loss_7": 686.1352569580079, + "learning_rate": 0.0007091497376634463, + "loss": 1252.1551, + "step": 3690 + }, + { + "ce_loss_10": 3.5008182168006896, + "ce_loss_13": 3.412228453159332, + "ce_loss_2": 4.538051557540894, + "ce_loss_3": 4.271669220924378, + "ce_loss_7": 3.707795190811157, + "epoch": 0.37, + "grad_norm": 688.0, + "kl_loss_10": 203.25593032836915, + "kl_loss_2": 2313.7531005859373, + "kl_loss_3": 1845.0465942382812, + "kl_loss_7": 679.24736328125, + "learning_rate": 0.0007077075065009433, + "loss": 1276.0328, + "step": 3700 + }, + { + "ce_loss_10": 3.6071534514427186, + "ce_loss_13": 3.5158491373062133, + "ce_loss_2": 4.6654202222824095, + "ce_loss_3": 4.392651915550232, + "ce_loss_7": 3.819450116157532, + "epoch": 0.371, + "grad_norm": 752.0, + "kl_loss_10": 208.91845016479493, + "kl_loss_2": 2358.2886352539062, + "kl_loss_3": 1879.4206665039062, + "kl_loss_7": 699.9392761230469, + "learning_rate": 0.0007062631837261557, + "loss": 1268.6693, + "step": 3710 + }, + { + "ce_loss_10": 3.476445233821869, + "ce_loss_13": 3.390165627002716, + "ce_loss_2": 4.5276483535766605, + "ce_loss_3": 4.25641827583313, + "ce_loss_7": 3.683011364936829, + "epoch": 0.372, + "grad_norm": 640.0, + "kl_loss_10": 202.1516098022461, + "kl_loss_2": 2314.015344238281, + "kl_loss_3": 1841.6598022460937, + "kl_loss_7": 682.8849884033203, + "learning_rate": 0.0007048167838833977, + "loss": 1289.0859, + "step": 3720 + }, + { + "ce_loss_10": 3.5699679255485535, + "ce_loss_13": 3.480803608894348, + "ce_loss_2": 4.588373041152954, + "ce_loss_3": 4.323996567726136, + "ce_loss_7": 3.7677942156791686, + "epoch": 0.373, + "grad_norm": 744.0, + "kl_loss_10": 203.9791275024414, + "kl_loss_2": 2290.5110717773437, + "kl_loss_3": 1815.935235595703, + "kl_loss_7": 672.9903137207032, + "learning_rate": 0.0007033683215379002, + "loss": 1247.4349, + "step": 3730 + }, + { + "ce_loss_10": 3.556042289733887, + "ce_loss_13": 3.4659453988075257, + "ce_loss_2": 4.594552016258239, + "ce_loss_3": 4.319927525520325, + "ce_loss_7": 3.7590227007865904, + "epoch": 0.374, + "grad_norm": 608.0, + "kl_loss_10": 199.66127700805663, + "kl_loss_2": 2281.638720703125, + "kl_loss_3": 1802.195782470703, + "kl_loss_7": 667.0344909667969, + "learning_rate": 0.0007019178112756625, + "loss": 1258.8061, + "step": 3740 + }, + { + "ce_loss_10": 3.518285346031189, + "ce_loss_13": 3.432125985622406, + "ce_loss_2": 4.562115430831909, + "ce_loss_3": 4.29533269405365, + "ce_loss_7": 3.7262615442276, + "epoch": 0.375, + "grad_norm": 640.0, + "kl_loss_10": 200.82229309082032, + "kl_loss_2": 2292.846240234375, + "kl_loss_3": 1823.6997009277343, + "kl_loss_7": 673.9724395751953, + "learning_rate": 0.0007004652677033068, + "loss": 1263.2482, + "step": 3750 + }, + { + "ce_loss_10": 3.5903055548667906, + "ce_loss_13": 3.506292223930359, + "ce_loss_2": 4.5962906837463375, + "ce_loss_3": 4.3357175350189205, + "ce_loss_7": 3.786800575256348, + "epoch": 0.376, + "grad_norm": 656.0, + "kl_loss_10": 200.60662612915038, + "kl_loss_2": 2252.357458496094, + "kl_loss_3": 1790.429217529297, + "kl_loss_7": 660.8626007080078, + "learning_rate": 0.0006990107054479312, + "loss": 1245.262, + "step": 3760 + }, + { + "ce_loss_10": 3.5865476965904235, + "ce_loss_13": 3.4931369185447694, + "ce_loss_2": 4.599129343032837, + "ce_loss_3": 4.335857176780701, + "ce_loss_7": 3.77964334487915, + "epoch": 0.377, + "grad_norm": 700.0, + "kl_loss_10": 209.85385513305664, + "kl_loss_2": 2261.3722229003906, + "kl_loss_3": 1806.916485595703, + "kl_loss_7": 670.6618469238281, + "learning_rate": 0.000697554139156961, + "loss": 1247.3398, + "step": 3770 + }, + { + "ce_loss_10": 3.572757053375244, + "ce_loss_13": 3.477368426322937, + "ce_loss_2": 4.60923056602478, + "ce_loss_3": 4.342034792900085, + "ce_loss_7": 3.766658973693848, + "epoch": 0.378, + "grad_norm": 628.0, + "kl_loss_10": 218.62799072265625, + "kl_loss_2": 2331.5685607910154, + "kl_loss_3": 1853.7978393554688, + "kl_loss_7": 681.1488891601563, + "learning_rate": 0.0006960955834980027, + "loss": 1246.4775, + "step": 3780 + }, + { + "ce_loss_10": 3.5454328536987303, + "ce_loss_13": 3.449174666404724, + "ce_loss_2": 4.563591694831848, + "ce_loss_3": 4.298812806606293, + "ce_loss_7": 3.7385509848594665, + "epoch": 0.379, + "grad_norm": 740.0, + "kl_loss_10": 215.9669273376465, + "kl_loss_2": 2275.3556274414063, + "kl_loss_3": 1807.9537841796875, + "kl_loss_7": 673.0077819824219, + "learning_rate": 0.0006946350531586958, + "loss": 1251.4496, + "step": 3790 + }, + { + "ce_loss_10": 3.5613739252090455, + "ce_loss_13": 3.4710352540016176, + "ce_loss_2": 4.586881446838379, + "ce_loss_3": 4.326097631454468, + "ce_loss_7": 3.7597612500190736, + "epoch": 0.38, + "grad_norm": 636.0, + "kl_loss_10": 210.37978897094726, + "kl_loss_2": 2278.926678466797, + "kl_loss_3": 1818.876287841797, + "kl_loss_7": 669.4569915771484, + "learning_rate": 0.0006931725628465643, + "loss": 1275.2133, + "step": 3800 + }, + { + "ce_loss_10": 3.590466618537903, + "ce_loss_13": 3.4937587857246397, + "ce_loss_2": 4.623274827003479, + "ce_loss_3": 4.352746081352234, + "ce_loss_7": 3.7943554282188416, + "epoch": 0.381, + "grad_norm": 628.0, + "kl_loss_10": 211.29082336425782, + "kl_loss_2": 2296.4768188476564, + "kl_loss_3": 1818.6126953125, + "kl_loss_7": 678.5069641113281, + "learning_rate": 0.0006917081272888696, + "loss": 1259.3377, + "step": 3810 + }, + { + "ce_loss_10": 3.487190854549408, + "ce_loss_13": 3.393886852264404, + "ce_loss_2": 4.553677868843079, + "ce_loss_3": 4.300019836425781, + "ce_loss_7": 3.6916916847229, + "epoch": 0.382, + "grad_norm": 596.0, + "kl_loss_10": 216.8355224609375, + "kl_loss_2": 2365.8853271484377, + "kl_loss_3": 1922.5055114746094, + "kl_loss_7": 684.1588439941406, + "learning_rate": 0.0006902417612324615, + "loss": 1266.6071, + "step": 3820 + }, + { + "ce_loss_10": 3.6190301895141603, + "ce_loss_13": 3.5245797634124756, + "ce_loss_2": 4.67730553150177, + "ce_loss_3": 4.405901682376862, + "ce_loss_7": 3.8294657945632933, + "epoch": 0.383, + "grad_norm": 680.0, + "kl_loss_10": 219.55593795776366, + "kl_loss_2": 2360.2657958984373, + "kl_loss_3": 1879.3323364257812, + "kl_loss_7": 698.4299591064453, + "learning_rate": 0.00068877347944363, + "loss": 1281.5383, + "step": 3830 + }, + { + "ce_loss_10": 3.612284016609192, + "ce_loss_13": 3.522170841693878, + "ce_loss_2": 4.627012848854065, + "ce_loss_3": 4.361323833465576, + "ce_loss_7": 3.8073740243911742, + "epoch": 0.384, + "grad_norm": 852.0, + "kl_loss_10": 211.08247299194335, + "kl_loss_2": 2264.5133850097654, + "kl_loss_3": 1800.5201843261718, + "kl_loss_7": 672.6636840820313, + "learning_rate": 0.0006873032967079561, + "loss": 1258.6725, + "step": 3840 + }, + { + "ce_loss_10": 3.5931047439575194, + "ce_loss_13": 3.5063655853271483, + "ce_loss_2": 4.588197422027588, + "ce_loss_3": 4.324539279937744, + "ce_loss_7": 3.7907418251037597, + "epoch": 0.385, + "grad_norm": 664.0, + "kl_loss_10": 203.42158584594728, + "kl_loss_2": 2234.902947998047, + "kl_loss_3": 1772.472021484375, + "kl_loss_7": 662.3596588134766, + "learning_rate": 0.0006858312278301637, + "loss": 1226.7012, + "step": 3850 + }, + { + "ce_loss_10": 3.635795843601227, + "ce_loss_13": 3.549472713470459, + "ce_loss_2": 4.623842811584472, + "ce_loss_3": 4.353901195526123, + "ce_loss_7": 3.825099301338196, + "epoch": 0.386, + "grad_norm": 736.0, + "kl_loss_10": 204.89519424438475, + "kl_loss_2": 2217.719934082031, + "kl_loss_3": 1747.3561584472657, + "kl_loss_7": 659.4771606445313, + "learning_rate": 0.0006843572876339704, + "loss": 1225.6961, + "step": 3860 + }, + { + "ce_loss_10": 3.5519859790802, + "ce_loss_13": 3.466093647480011, + "ce_loss_2": 4.525204968452454, + "ce_loss_3": 4.264074110984803, + "ce_loss_7": 3.742415523529053, + "epoch": 0.387, + "grad_norm": 668.0, + "kl_loss_10": 199.43942337036134, + "kl_loss_2": 2183.9968811035155, + "kl_loss_3": 1725.1913513183595, + "kl_loss_7": 644.7796203613282, + "learning_rate": 0.0006828814909619373, + "loss": 1252.2885, + "step": 3870 + }, + { + "ce_loss_10": 3.674282944202423, + "ce_loss_13": 3.5820161938667296, + "ce_loss_2": 4.6895040512084964, + "ce_loss_3": 4.4149659156799315, + "ce_loss_7": 3.86486736536026, + "epoch": 0.388, + "grad_norm": 576.0, + "kl_loss_10": 211.43887939453126, + "kl_loss_2": 2266.184930419922, + "kl_loss_3": 1785.1635192871095, + "kl_loss_7": 661.8903228759766, + "learning_rate": 0.0006814038526753205, + "loss": 1223.6402, + "step": 3880 + }, + { + "ce_loss_10": 3.5698843955993653, + "ce_loss_13": 3.479625034332275, + "ce_loss_2": 4.587342977523804, + "ce_loss_3": 4.3197312474250795, + "ce_loss_7": 3.766351842880249, + "epoch": 0.389, + "grad_norm": 616.0, + "kl_loss_10": 208.68895874023437, + "kl_loss_2": 2258.2895751953124, + "kl_loss_3": 1782.5825317382812, + "kl_loss_7": 664.0186126708984, + "learning_rate": 0.0006799243876539213, + "loss": 1238.0235, + "step": 3890 + }, + { + "ce_loss_10": 3.500353288650513, + "ce_loss_13": 3.408971738815308, + "ce_loss_2": 4.572002196311951, + "ce_loss_3": 4.29817762374878, + "ce_loss_7": 3.699842798709869, + "epoch": 0.39, + "grad_norm": 836.0, + "kl_loss_10": 206.20438537597656, + "kl_loss_2": 2378.092236328125, + "kl_loss_3": 1891.477294921875, + "kl_loss_7": 671.2416046142578, + "learning_rate": 0.0006784431107959359, + "loss": 1281.9199, + "step": 3900 + }, + { + "ce_loss_10": 3.5523509979248047, + "ce_loss_13": 3.460574519634247, + "ce_loss_2": 4.626534819602966, + "ce_loss_3": 4.3451399326324465, + "ce_loss_7": 3.7586856484413147, + "epoch": 0.391, + "grad_norm": 668.0, + "kl_loss_10": 206.77191925048828, + "kl_loss_2": 2369.177069091797, + "kl_loss_3": 1882.2445129394532, + "kl_loss_7": 681.9191589355469, + "learning_rate": 0.0006769600370178059, + "loss": 1269.1178, + "step": 3910 + }, + { + "ce_loss_10": 3.5188350439071656, + "ce_loss_13": 3.4273067116737366, + "ce_loss_2": 4.554541206359863, + "ce_loss_3": 4.289613115787506, + "ce_loss_7": 3.724477529525757, + "epoch": 0.392, + "grad_norm": 560.0, + "kl_loss_10": 199.67687606811523, + "kl_loss_2": 2289.5328735351563, + "kl_loss_3": 1825.1418823242188, + "kl_loss_7": 674.4119354248047, + "learning_rate": 0.0006754751812540679, + "loss": 1229.9105, + "step": 3920 + }, + { + "ce_loss_10": 3.5683494329452516, + "ce_loss_13": 3.482330596446991, + "ce_loss_2": 4.621720671653748, + "ce_loss_3": 4.348645758628845, + "ce_loss_7": 3.7689119219779967, + "epoch": 0.393, + "grad_norm": 644.0, + "kl_loss_10": 204.25539169311523, + "kl_loss_2": 2339.84892578125, + "kl_loss_3": 1860.5805419921876, + "kl_loss_7": 678.0737976074219, + "learning_rate": 0.0006739885584572025, + "loss": 1265.1324, + "step": 3930 + }, + { + "ce_loss_10": 3.596517300605774, + "ce_loss_13": 3.5048423767089845, + "ce_loss_2": 4.657581090927124, + "ce_loss_3": 4.393209981918335, + "ce_loss_7": 3.8048507928848267, + "epoch": 0.394, + "grad_norm": 740.0, + "kl_loss_10": 206.0053512573242, + "kl_loss_2": 2374.587469482422, + "kl_loss_3": 1914.2769775390625, + "kl_loss_7": 691.7073638916015, + "learning_rate": 0.0006725001835974853, + "loss": 1262.3447, + "step": 3940 + }, + { + "ce_loss_10": 3.5837875604629517, + "ce_loss_13": 3.49317661523819, + "ce_loss_2": 4.625016355514527, + "ce_loss_3": 4.360765337944031, + "ce_loss_7": 3.795220899581909, + "epoch": 0.395, + "grad_norm": 624.0, + "kl_loss_10": 202.6530014038086, + "kl_loss_2": 2326.028674316406, + "kl_loss_3": 1851.947198486328, + "kl_loss_7": 690.550015258789, + "learning_rate": 0.0006710100716628344, + "loss": 1233.6354, + "step": 3950 + }, + { + "ce_loss_10": 3.566684401035309, + "ce_loss_13": 3.4755138635635374, + "ce_loss_2": 4.594186568260193, + "ce_loss_3": 4.3349669694900514, + "ce_loss_7": 3.7691023349761963, + "epoch": 0.396, + "grad_norm": 612.0, + "kl_loss_10": 198.98139343261718, + "kl_loss_2": 2292.616107177734, + "kl_loss_3": 1833.538885498047, + "kl_loss_7": 679.8143615722656, + "learning_rate": 0.0006695182376586602, + "loss": 1262.3783, + "step": 3960 + }, + { + "ce_loss_10": 3.596343123912811, + "ce_loss_13": 3.512529468536377, + "ce_loss_2": 4.574345445632934, + "ce_loss_3": 4.312320637702942, + "ce_loss_7": 3.790241527557373, + "epoch": 0.397, + "grad_norm": 708.0, + "kl_loss_10": 191.98949813842773, + "kl_loss_2": 2169.2684020996094, + "kl_loss_3": 1705.870343017578, + "kl_loss_7": 641.6268707275391, + "learning_rate": 0.000668024696607715, + "loss": 1235.5194, + "step": 3970 + }, + { + "ce_loss_10": 3.555418300628662, + "ce_loss_13": 3.470967173576355, + "ce_loss_2": 4.566342353820801, + "ce_loss_3": 4.313628911972046, + "ce_loss_7": 3.7538477182388306, + "epoch": 0.398, + "grad_norm": 636.0, + "kl_loss_10": 198.03487548828124, + "kl_loss_2": 2281.672326660156, + "kl_loss_3": 1825.4962097167968, + "kl_loss_7": 667.3125, + "learning_rate": 0.0006665294635499404, + "loss": 1243.9658, + "step": 3980 + }, + { + "ce_loss_10": 3.561533272266388, + "ce_loss_13": 3.4714276075363157, + "ce_loss_2": 4.635219573974609, + "ce_loss_3": 4.372889280319214, + "ce_loss_7": 3.771253454685211, + "epoch": 0.399, + "grad_norm": 876.0, + "kl_loss_10": 208.96954040527345, + "kl_loss_2": 2390.9282470703124, + "kl_loss_3": 1928.2362426757813, + "kl_loss_7": 697.694384765625, + "learning_rate": 0.0006650325535423167, + "loss": 1276.8535, + "step": 3990 + }, + { + "ce_loss_10": 3.5832207202911377, + "ce_loss_13": 3.4972815036773683, + "ce_loss_2": 4.57238998413086, + "ce_loss_3": 4.3085246801376345, + "ce_loss_7": 3.7773876547813416, + "epoch": 0.4, + "grad_norm": 680.0, + "kl_loss_10": 194.11105422973634, + "kl_loss_2": 2185.9527893066406, + "kl_loss_3": 1735.314678955078, + "kl_loss_7": 647.3017364501953, + "learning_rate": 0.0006635339816587109, + "loss": 1234.4078, + "step": 4000 + }, + { + "ce_loss_10": 3.524456286430359, + "ce_loss_13": 3.4352723956108093, + "ce_loss_2": 4.582833385467529, + "ce_loss_3": 4.318940043449402, + "ce_loss_7": 3.7211544632911684, + "epoch": 0.401, + "grad_norm": 624.0, + "kl_loss_10": 200.03394927978516, + "kl_loss_2": 2362.4151245117187, + "kl_loss_3": 1889.6026794433594, + "kl_loss_7": 668.6993530273437, + "learning_rate": 0.0006620337629897252, + "loss": 1251.5271, + "step": 4010 + }, + { + "ce_loss_10": 3.531862771511078, + "ce_loss_13": 3.4413245558738708, + "ce_loss_2": 4.564659547805786, + "ce_loss_3": 4.295144200325012, + "ce_loss_7": 3.728400182723999, + "epoch": 0.402, + "grad_norm": 556.0, + "kl_loss_10": 199.88810348510742, + "kl_loss_2": 2302.102014160156, + "kl_loss_3": 1823.07431640625, + "kl_loss_7": 668.4421752929687, + "learning_rate": 0.0006605319126425454, + "loss": 1275.6262, + "step": 4020 + }, + { + "ce_loss_10": 3.4339096665382387, + "ce_loss_13": 3.350968396663666, + "ce_loss_2": 4.514854836463928, + "ce_loss_3": 4.238226985931396, + "ce_loss_7": 3.644399344921112, + "epoch": 0.403, + "grad_norm": 644.0, + "kl_loss_10": 199.70583419799806, + "kl_loss_2": 2387.631066894531, + "kl_loss_3": 1906.55048828125, + "kl_loss_7": 681.1330749511719, + "learning_rate": 0.0006590284457407876, + "loss": 1275.1312, + "step": 4030 + }, + { + "ce_loss_10": 3.5380223751068116, + "ce_loss_13": 3.448465049266815, + "ce_loss_2": 4.57396821975708, + "ce_loss_3": 4.313019490242004, + "ce_loss_7": 3.7362441062927245, + "epoch": 0.404, + "grad_norm": 688.0, + "kl_loss_10": 198.1963623046875, + "kl_loss_2": 2292.718151855469, + "kl_loss_3": 1830.845147705078, + "kl_loss_7": 661.0607788085938, + "learning_rate": 0.0006575233774243465, + "loss": 1249.6318, + "step": 4040 + }, + { + "ce_loss_10": 3.528366136550903, + "ce_loss_13": 3.4398876786231996, + "ce_loss_2": 4.5835960626602175, + "ce_loss_3": 4.318015563488006, + "ce_loss_7": 3.734170937538147, + "epoch": 0.405, + "grad_norm": 744.0, + "kl_loss_10": 199.81770248413085, + "kl_loss_2": 2371.9314208984374, + "kl_loss_3": 1896.1641540527344, + "kl_loss_7": 680.1676055908204, + "learning_rate": 0.0006560167228492435, + "loss": 1274.3472, + "step": 4050 + }, + { + "ce_loss_10": 3.5713927507400514, + "ce_loss_13": 3.4897979974746702, + "ce_loss_2": 4.564716410636902, + "ce_loss_3": 4.304848039150238, + "ce_loss_7": 3.764770579338074, + "epoch": 0.406, + "grad_norm": 632.0, + "kl_loss_10": 190.77299575805665, + "kl_loss_2": 2213.3808837890624, + "kl_loss_3": 1758.434912109375, + "kl_loss_7": 651.0917877197265, + "learning_rate": 0.0006545084971874737, + "loss": 1244.4535, + "step": 4060 + }, + { + "ce_loss_10": 3.5361163854599, + "ce_loss_13": 3.446142256259918, + "ce_loss_2": 4.613076639175415, + "ce_loss_3": 4.341050863265991, + "ce_loss_7": 3.7489752054214476, + "epoch": 0.407, + "grad_norm": 724.0, + "kl_loss_10": 204.6956573486328, + "kl_loss_2": 2384.869763183594, + "kl_loss_3": 1900.13232421875, + "kl_loss_7": 693.6003051757813, + "learning_rate": 0.0006529987156268526, + "loss": 1264.1867, + "step": 4070 + }, + { + "ce_loss_10": 3.4604807257652284, + "ce_loss_13": 3.3677136301994324, + "ce_loss_2": 4.529002094268799, + "ce_loss_3": 4.2508728981018065, + "ce_loss_7": 3.671179461479187, + "epoch": 0.408, + "grad_norm": 652.0, + "kl_loss_10": 200.91606369018555, + "kl_loss_2": 2349.1596496582033, + "kl_loss_3": 1858.6745910644531, + "kl_loss_7": 678.4067779541016, + "learning_rate": 0.0006514873933708637, + "loss": 1288.6936, + "step": 4080 + }, + { + "ce_loss_10": 3.5669307708740234, + "ce_loss_13": 3.4793569445610046, + "ce_loss_2": 4.594972729682922, + "ce_loss_3": 4.326959013938904, + "ce_loss_7": 3.7646772980690004, + "epoch": 0.409, + "grad_norm": 624.0, + "kl_loss_10": 195.33749084472657, + "kl_loss_2": 2285.8158325195313, + "kl_loss_3": 1812.939276123047, + "kl_loss_7": 660.7087280273438, + "learning_rate": 0.0006499745456385053, + "loss": 1246.0525, + "step": 4090 + }, + { + "ce_loss_10": 3.532058572769165, + "ce_loss_13": 3.446662437915802, + "ce_loss_2": 4.566872882843017, + "ce_loss_3": 4.295732653141021, + "ce_loss_7": 3.7351000905036926, + "epoch": 0.41, + "grad_norm": 592.0, + "kl_loss_10": 197.3603828430176, + "kl_loss_2": 2284.4489013671873, + "kl_loss_3": 1809.6247680664062, + "kl_loss_7": 669.4995666503906, + "learning_rate": 0.0006484601876641375, + "loss": 1259.2045, + "step": 4100 + }, + { + "ce_loss_10": 3.523475456237793, + "ce_loss_13": 3.4387876391410828, + "ce_loss_2": 4.524111318588257, + "ce_loss_3": 4.253742909431457, + "ce_loss_7": 3.7196569561958315, + "epoch": 0.411, + "grad_norm": 608.0, + "kl_loss_10": 194.7115333557129, + "kl_loss_2": 2231.6944580078125, + "kl_loss_3": 1757.1336669921875, + "kl_loss_7": 654.9919372558594, + "learning_rate": 0.000646944334697328, + "loss": 1224.3209, + "step": 4110 + }, + { + "ce_loss_10": 3.6390475988388062, + "ce_loss_13": 3.55245840549469, + "ce_loss_2": 4.6237300157547, + "ce_loss_3": 4.356840944290161, + "ce_loss_7": 3.83481205701828, + "epoch": 0.412, + "grad_norm": 564.0, + "kl_loss_10": 194.73427352905273, + "kl_loss_2": 2191.641564941406, + "kl_loss_3": 1726.6775146484374, + "kl_loss_7": 650.4818603515625, + "learning_rate": 0.0006454270020026995, + "loss": 1203.4656, + "step": 4120 + }, + { + "ce_loss_10": 3.6017306566238405, + "ce_loss_13": 3.520050418376923, + "ce_loss_2": 4.577455329895019, + "ce_loss_3": 4.320154881477356, + "ce_loss_7": 3.792659568786621, + "epoch": 0.413, + "grad_norm": 576.0, + "kl_loss_10": 189.64769592285157, + "kl_loss_2": 2176.6477478027346, + "kl_loss_3": 1722.2450378417968, + "kl_loss_7": 643.5274627685546, + "learning_rate": 0.0006439082048597755, + "loss": 1192.4902, + "step": 4130 + }, + { + "ce_loss_10": 3.5903451800346375, + "ce_loss_13": 3.507824885845184, + "ce_loss_2": 4.608788180351257, + "ce_loss_3": 4.346335411071777, + "ce_loss_7": 3.7915576100349426, + "epoch": 0.414, + "grad_norm": 580.0, + "kl_loss_10": 197.56676864624023, + "kl_loss_2": 2267.8357421875, + "kl_loss_3": 1807.2357055664063, + "kl_loss_7": 666.154751586914, + "learning_rate": 0.0006423879585628261, + "loss": 1240.4789, + "step": 4140 + }, + { + "ce_loss_10": 3.55734179019928, + "ce_loss_13": 3.467788887023926, + "ce_loss_2": 4.6172443151474, + "ce_loss_3": 4.339134466648102, + "ce_loss_7": 3.7582768201828003, + "epoch": 0.415, + "grad_norm": 688.0, + "kl_loss_10": 201.20330123901368, + "kl_loss_2": 2351.881707763672, + "kl_loss_3": 1863.0321411132813, + "kl_loss_7": 675.1522674560547, + "learning_rate": 0.0006408662784207149, + "loss": 1267.4067, + "step": 4150 + }, + { + "ce_loss_10": 3.5083068370819093, + "ce_loss_13": 3.421711838245392, + "ce_loss_2": 4.537308859825134, + "ce_loss_3": 4.2711180448532104, + "ce_loss_7": 3.708827292919159, + "epoch": 0.416, + "grad_norm": 696.0, + "kl_loss_10": 195.25809020996093, + "kl_loss_2": 2288.984796142578, + "kl_loss_3": 1822.1060485839844, + "kl_loss_7": 665.5764404296875, + "learning_rate": 0.0006393431797567439, + "loss": 1250.1072, + "step": 4160 + }, + { + "ce_loss_10": 3.5913987278938295, + "ce_loss_13": 3.509873795509338, + "ce_loss_2": 4.573607659339904, + "ce_loss_3": 4.318917143344879, + "ce_loss_7": 3.7776596546173096, + "epoch": 0.417, + "grad_norm": 596.0, + "kl_loss_10": 194.73566436767578, + "kl_loss_2": 2221.0649841308596, + "kl_loss_3": 1766.04775390625, + "kl_loss_7": 648.3369201660156, + "learning_rate": 0.0006378186779084996, + "loss": 1190.9323, + "step": 4170 + }, + { + "ce_loss_10": 3.4334940314292908, + "ce_loss_13": 3.3453728437423704, + "ce_loss_2": 4.485861015319824, + "ce_loss_3": 4.215418803691864, + "ce_loss_7": 3.6394405364990234, + "epoch": 0.418, + "grad_norm": 676.0, + "kl_loss_10": 196.98247756958008, + "kl_loss_2": 2312.9372680664064, + "kl_loss_3": 1836.5678649902343, + "kl_loss_7": 670.5612365722657, + "learning_rate": 0.0006362927882276989, + "loss": 1261.1342, + "step": 4180 + }, + { + "ce_loss_10": 3.622100257873535, + "ce_loss_13": 3.5377141356468202, + "ce_loss_2": 4.620994114875794, + "ce_loss_3": 4.349101042747497, + "ce_loss_7": 3.81082307100296, + "epoch": 0.419, + "grad_norm": 620.0, + "kl_loss_10": 192.66633071899415, + "kl_loss_2": 2225.1201049804686, + "kl_loss_3": 1751.446875, + "kl_loss_7": 636.4902648925781, + "learning_rate": 0.000634765526080034, + "loss": 1194.2031, + "step": 4190 + }, + { + "ce_loss_10": 3.626552712917328, + "ce_loss_13": 3.5395890951156614, + "ce_loss_2": 4.6221943378448485, + "ce_loss_3": 4.355586886405945, + "ce_loss_7": 3.8178786516189573, + "epoch": 0.42, + "grad_norm": 612.0, + "kl_loss_10": 198.29063568115234, + "kl_loss_2": 2233.3197509765623, + "kl_loss_3": 1764.3755920410156, + "kl_loss_7": 656.7103454589844, + "learning_rate": 0.0006332369068450174, + "loss": 1207.3598, + "step": 4200 + }, + { + "ce_loss_10": 3.5582772374153135, + "ce_loss_13": 3.4749309182167054, + "ce_loss_2": 4.574929785728455, + "ce_loss_3": 4.314627623558044, + "ce_loss_7": 3.7553325653076173, + "epoch": 0.421, + "grad_norm": 588.0, + "kl_loss_10": 195.14224548339843, + "kl_loss_2": 2252.938916015625, + "kl_loss_3": 1800.94208984375, + "kl_loss_7": 656.2003112792969, + "learning_rate": 0.0006317069459158283, + "loss": 1220.2742, + "step": 4210 + }, + { + "ce_loss_10": 3.66640442609787, + "ce_loss_13": 3.584319996833801, + "ce_loss_2": 4.649453711509705, + "ce_loss_3": 4.3819632768630985, + "ce_loss_7": 3.8596243381500246, + "epoch": 0.422, + "grad_norm": 592.0, + "kl_loss_10": 193.5803665161133, + "kl_loss_2": 2195.1602294921877, + "kl_loss_3": 1735.2393310546875, + "kl_loss_7": 647.3203796386719, + "learning_rate": 0.0006301756586991561, + "loss": 1218.0184, + "step": 4220 + }, + { + "ce_loss_10": 3.452160143852234, + "ce_loss_13": 3.3657590985298156, + "ce_loss_2": 4.505249190330505, + "ce_loss_3": 4.242624092102051, + "ce_loss_7": 3.6535696148872376, + "epoch": 0.423, + "grad_norm": 764.0, + "kl_loss_10": 198.62798614501952, + "kl_loss_2": 2358.790142822266, + "kl_loss_3": 1893.541082763672, + "kl_loss_7": 677.0058868408203, + "learning_rate": 0.0006286430606150459, + "loss": 1264.3267, + "step": 4230 + }, + { + "ce_loss_10": 3.6465710401535034, + "ce_loss_13": 3.563078057765961, + "ce_loss_2": 4.641510224342346, + "ce_loss_3": 4.377968907356262, + "ce_loss_7": 3.843237745761871, + "epoch": 0.424, + "grad_norm": 752.0, + "kl_loss_10": 199.9485771179199, + "kl_loss_2": 2240.6987426757814, + "kl_loss_3": 1778.9172729492188, + "kl_loss_7": 666.6387664794922, + "learning_rate": 0.0006271091670967436, + "loss": 1223.7141, + "step": 4240 + }, + { + "ce_loss_10": 3.570220148563385, + "ce_loss_13": 3.474740993976593, + "ce_loss_2": 4.616126585006714, + "ce_loss_3": 4.345292592048645, + "ce_loss_7": 3.7782423973083494, + "epoch": 0.425, + "grad_norm": 604.0, + "kl_loss_10": 206.9665100097656, + "kl_loss_2": 2359.6162658691405, + "kl_loss_3": 1878.0119995117188, + "kl_loss_7": 699.4417297363282, + "learning_rate": 0.0006255739935905395, + "loss": 1260.2877, + "step": 4250 + }, + { + "ce_loss_10": 3.6002479434013366, + "ce_loss_13": 3.5161559224128722, + "ce_loss_2": 4.592222595214844, + "ce_loss_3": 4.328677630424499, + "ce_loss_7": 3.790011668205261, + "epoch": 0.426, + "grad_norm": 688.0, + "kl_loss_10": 196.72698440551758, + "kl_loss_2": 2221.3829833984373, + "kl_loss_3": 1756.8525756835938, + "kl_loss_7": 652.2526733398438, + "learning_rate": 0.0006240375555556145, + "loss": 1261.0352, + "step": 4260 + }, + { + "ce_loss_10": 3.6026462078094483, + "ce_loss_13": 3.5168575167655947, + "ce_loss_2": 4.6545480489730835, + "ce_loss_3": 4.386264157295227, + "ce_loss_7": 3.8051093459129333, + "epoch": 0.427, + "grad_norm": 580.0, + "kl_loss_10": 197.9854705810547, + "kl_loss_2": 2316.6946411132812, + "kl_loss_3": 1838.7398193359375, + "kl_loss_7": 667.8762664794922, + "learning_rate": 0.000622499868463882, + "loss": 1243.157, + "step": 4270 + }, + { + "ce_loss_10": 3.574436700344086, + "ce_loss_13": 3.4921345114707947, + "ce_loss_2": 4.568159365653992, + "ce_loss_3": 4.298846364021301, + "ce_loss_7": 3.765162992477417, + "epoch": 0.428, + "grad_norm": 620.0, + "kl_loss_10": 194.46619186401367, + "kl_loss_2": 2240.061688232422, + "kl_loss_3": 1767.6112670898438, + "kl_loss_7": 647.0378051757813, + "learning_rate": 0.0006209609477998338, + "loss": 1226.4191, + "step": 4280 + }, + { + "ce_loss_10": 3.6271798372268678, + "ce_loss_13": 3.5434911131858824, + "ce_loss_2": 4.628253221511841, + "ce_loss_3": 4.367080307006836, + "ce_loss_7": 3.823750925064087, + "epoch": 0.429, + "grad_norm": 596.0, + "kl_loss_10": 199.08662948608398, + "kl_loss_2": 2248.870013427734, + "kl_loss_3": 1777.940362548828, + "kl_loss_7": 666.140267944336, + "learning_rate": 0.0006194208090603844, + "loss": 1245.6613, + "step": 4290 + }, + { + "ce_loss_10": 3.550025999546051, + "ce_loss_13": 3.4652276039123535, + "ce_loss_2": 4.55794665813446, + "ce_loss_3": 4.292889666557312, + "ce_loss_7": 3.7512326836586, + "epoch": 0.43, + "grad_norm": 696.0, + "kl_loss_10": 194.7114356994629, + "kl_loss_2": 2238.0842895507812, + "kl_loss_3": 1764.372296142578, + "kl_loss_7": 659.5233337402344, + "learning_rate": 0.0006178794677547138, + "loss": 1204.7698, + "step": 4300 + }, + { + "ce_loss_10": 3.5732582211494446, + "ce_loss_13": 3.4901079893112184, + "ce_loss_2": 4.593334412574768, + "ce_loss_3": 4.330301976203918, + "ce_loss_7": 3.7785701513290406, + "epoch": 0.431, + "grad_norm": 716.0, + "kl_loss_10": 204.0959487915039, + "kl_loss_2": 2270.5096435546875, + "kl_loss_3": 1810.6998352050782, + "kl_loss_7": 680.2683868408203, + "learning_rate": 0.0006163369394041111, + "loss": 1234.0865, + "step": 4310 + }, + { + "ce_loss_10": 3.522435462474823, + "ce_loss_13": 3.427918314933777, + "ce_loss_2": 4.549873030185699, + "ce_loss_3": 4.289526271820068, + "ce_loss_7": 3.7253190755844114, + "epoch": 0.432, + "grad_norm": 800.0, + "kl_loss_10": 208.77886199951172, + "kl_loss_2": 2301.5377990722654, + "kl_loss_3": 1836.5447998046875, + "kl_loss_7": 679.7338165283203, + "learning_rate": 0.0006147932395418205, + "loss": 1277.3873, + "step": 4320 + }, + { + "ce_loss_10": 3.5494457960128782, + "ce_loss_13": 3.4614178776741027, + "ce_loss_2": 4.544875764846802, + "ce_loss_3": 4.281032645702362, + "ce_loss_7": 3.7431655168533324, + "epoch": 0.433, + "grad_norm": 576.0, + "kl_loss_10": 207.28576126098633, + "kl_loss_2": 2223.6774475097654, + "kl_loss_3": 1762.4080688476563, + "kl_loss_7": 660.7338745117188, + "learning_rate": 0.0006132483837128823, + "loss": 1209.1447, + "step": 4330 + }, + { + "ce_loss_10": 3.5334264755249025, + "ce_loss_13": 3.4463666915893554, + "ce_loss_2": 4.564454817771912, + "ce_loss_3": 4.294960129261017, + "ce_loss_7": 3.73456689119339, + "epoch": 0.434, + "grad_norm": 772.0, + "kl_loss_10": 203.00026016235353, + "kl_loss_2": 2313.798968505859, + "kl_loss_3": 1837.2741760253907, + "kl_loss_7": 664.0175872802735, + "learning_rate": 0.0006117023874739772, + "loss": 1240.8283, + "step": 4340 + }, + { + "ce_loss_10": 3.5215348839759826, + "ce_loss_13": 3.4345417499542235, + "ce_loss_2": 4.554822826385498, + "ce_loss_3": 4.285668563842774, + "ce_loss_7": 3.7274380683898927, + "epoch": 0.435, + "grad_norm": 600.0, + "kl_loss_10": 199.90404205322267, + "kl_loss_2": 2303.0773803710936, + "kl_loss_3": 1827.208251953125, + "kl_loss_7": 672.8900817871094, + "learning_rate": 0.0006101552663932703, + "loss": 1260.7756, + "step": 4350 + }, + { + "ce_loss_10": 3.554371106624603, + "ce_loss_13": 3.4667278289794923, + "ce_loss_2": 4.5602539539337155, + "ce_loss_3": 4.297711455821991, + "ce_loss_7": 3.74671790599823, + "epoch": 0.436, + "grad_norm": 664.0, + "kl_loss_10": 201.51847763061522, + "kl_loss_2": 2254.3937927246093, + "kl_loss_3": 1790.253662109375, + "kl_loss_7": 662.9543731689453, + "learning_rate": 0.0006086070360502539, + "loss": 1241.8814, + "step": 4360 + }, + { + "ce_loss_10": 3.5543460965156557, + "ce_loss_13": 3.470974051952362, + "ce_loss_2": 4.571776509284973, + "ce_loss_3": 4.305854046344757, + "ce_loss_7": 3.7547166466712953, + "epoch": 0.437, + "grad_norm": 608.0, + "kl_loss_10": 196.51984329223632, + "kl_loss_2": 2276.0406494140625, + "kl_loss_3": 1801.6470031738281, + "kl_loss_7": 659.9133270263671, + "learning_rate": 0.0006070577120355903, + "loss": 1236.9521, + "step": 4370 + }, + { + "ce_loss_10": 3.5628577947616575, + "ce_loss_13": 3.47283878326416, + "ce_loss_2": 4.547589898109436, + "ce_loss_3": 4.279985129833221, + "ce_loss_7": 3.7624007940292357, + "epoch": 0.438, + "grad_norm": 700.0, + "kl_loss_10": 200.08970794677734, + "kl_loss_2": 2194.6991455078123, + "kl_loss_3": 1728.643865966797, + "kl_loss_7": 657.7827362060547, + "learning_rate": 0.0006055073099509549, + "loss": 1218.3828, + "step": 4380 + }, + { + "ce_loss_10": 3.6181469678878786, + "ce_loss_13": 3.531364715099335, + "ce_loss_2": 4.607781720161438, + "ce_loss_3": 4.3447977781295775, + "ce_loss_7": 3.8072004199028013, + "epoch": 0.439, + "grad_norm": 616.0, + "kl_loss_10": 200.97432174682618, + "kl_loss_2": 2223.8163146972656, + "kl_loss_3": 1756.1592224121093, + "kl_loss_7": 652.6859497070312, + "learning_rate": 0.0006039558454088796, + "loss": 1239.9502, + "step": 4390 + }, + { + "ce_loss_10": 3.598993420600891, + "ce_loss_13": 3.508393979072571, + "ce_loss_2": 4.611153769493103, + "ce_loss_3": 4.343827414512634, + "ce_loss_7": 3.798681151866913, + "epoch": 0.44, + "grad_norm": 636.0, + "kl_loss_10": 207.16089324951173, + "kl_loss_2": 2267.215954589844, + "kl_loss_3": 1798.3621337890625, + "kl_loss_7": 665.1716247558594, + "learning_rate": 0.0006024033340325954, + "loss": 1210.7668, + "step": 4400 + }, + { + "ce_loss_10": 3.6592474579811096, + "ce_loss_13": 3.575475811958313, + "ce_loss_2": 4.615442514419556, + "ce_loss_3": 4.356250524520874, + "ce_loss_7": 3.841563415527344, + "epoch": 0.441, + "grad_norm": 564.0, + "kl_loss_10": 192.91486740112305, + "kl_loss_2": 2138.478411865234, + "kl_loss_3": 1682.6779296875, + "kl_loss_7": 628.4862640380859, + "learning_rate": 0.0006008497914558743, + "loss": 1188.8043, + "step": 4410 + }, + { + "ce_loss_10": 3.603752911090851, + "ce_loss_13": 3.514535641670227, + "ce_loss_2": 4.619881939888001, + "ce_loss_3": 4.351648759841919, + "ce_loss_7": 3.8029965996742248, + "epoch": 0.442, + "grad_norm": 680.0, + "kl_loss_10": 203.31059799194335, + "kl_loss_2": 2279.580682373047, + "kl_loss_3": 1800.9931640625, + "kl_loss_7": 667.106298828125, + "learning_rate": 0.0005992952333228728, + "loss": 1234.8536, + "step": 4420 + }, + { + "ce_loss_10": 3.5360623002052307, + "ce_loss_13": 3.452274763584137, + "ce_loss_2": 4.555125761032104, + "ce_loss_3": 4.292830312252045, + "ce_loss_7": 3.7339815139770507, + "epoch": 0.443, + "grad_norm": 660.0, + "kl_loss_10": 193.53399200439452, + "kl_loss_2": 2284.7425048828127, + "kl_loss_3": 1820.065606689453, + "kl_loss_7": 662.5294036865234, + "learning_rate": 0.0005977396752879741, + "loss": 1233.2003, + "step": 4430 + }, + { + "ce_loss_10": 3.4606794357299804, + "ce_loss_13": 3.377534472942352, + "ce_loss_2": 4.48543610572815, + "ce_loss_3": 4.220411324501038, + "ce_loss_7": 3.6664485812187193, + "epoch": 0.444, + "grad_norm": 580.0, + "kl_loss_10": 191.26479797363282, + "kl_loss_2": 2280.07265625, + "kl_loss_3": 1810.239013671875, + "kl_loss_7": 656.8585327148437, + "learning_rate": 0.0005961831330156305, + "loss": 1222.7674, + "step": 4440 + }, + { + "ce_loss_10": 3.603837263584137, + "ce_loss_13": 3.5208237767219543, + "ce_loss_2": 4.638635230064392, + "ce_loss_3": 4.366818988323212, + "ce_loss_7": 3.8011784672737123, + "epoch": 0.445, + "grad_norm": 652.0, + "kl_loss_10": 193.8144203186035, + "kl_loss_2": 2316.0056640625, + "kl_loss_3": 1833.1619812011718, + "kl_loss_7": 659.0817749023438, + "learning_rate": 0.0005946256221802051, + "loss": 1263.1171, + "step": 4450 + }, + { + "ce_loss_10": 3.5832170486450194, + "ce_loss_13": 3.5048667788505554, + "ce_loss_2": 4.5584005355834964, + "ce_loss_3": 4.296856260299682, + "ce_loss_7": 3.7672229290008543, + "epoch": 0.446, + "grad_norm": 700.0, + "kl_loss_10": 189.50232849121093, + "kl_loss_2": 2181.445458984375, + "kl_loss_3": 1725.645037841797, + "kl_loss_7": 639.0539123535157, + "learning_rate": 0.0005930671584658151, + "loss": 1259.6497, + "step": 4460 + }, + { + "ce_loss_10": 3.5820990085601805, + "ce_loss_13": 3.4986414194107054, + "ce_loss_2": 4.585966444015503, + "ce_loss_3": 4.327683901786804, + "ce_loss_7": 3.778271293640137, + "epoch": 0.447, + "grad_norm": 624.0, + "kl_loss_10": 192.16329650878907, + "kl_loss_2": 2241.6711364746093, + "kl_loss_3": 1786.2928161621094, + "kl_loss_7": 656.8107452392578, + "learning_rate": 0.0005915077575661722, + "loss": 1237.7033, + "step": 4470 + }, + { + "ce_loss_10": 3.601723861694336, + "ce_loss_13": 3.5175135850906374, + "ce_loss_2": 4.623058772087097, + "ce_loss_3": 4.352630817890168, + "ce_loss_7": 3.801686096191406, + "epoch": 0.448, + "grad_norm": 576.0, + "kl_loss_10": 197.77509002685548, + "kl_loss_2": 2287.8472229003905, + "kl_loss_3": 1814.064471435547, + "kl_loss_7": 669.7168884277344, + "learning_rate": 0.000589947435184427, + "loss": 1221.476, + "step": 4480 + }, + { + "ce_loss_10": 3.667625939846039, + "ce_loss_13": 3.5879098773002625, + "ce_loss_2": 4.623752212524414, + "ce_loss_3": 4.3639614343643185, + "ce_loss_7": 3.854480040073395, + "epoch": 0.449, + "grad_norm": 676.0, + "kl_loss_10": 191.73966369628906, + "kl_loss_2": 2169.5962097167967, + "kl_loss_3": 1711.8840759277343, + "kl_loss_7": 644.2419403076171, + "learning_rate": 0.0005883862070330078, + "loss": 1205.0104, + "step": 4490 + }, + { + "ce_loss_10": 3.5975982904434205, + "ce_loss_13": 3.5138633131980894, + "ce_loss_2": 4.596203637123108, + "ce_loss_3": 4.342746245861053, + "ce_loss_7": 3.7984990000724794, + "epoch": 0.45, + "grad_norm": 680.0, + "kl_loss_10": 192.790771484375, + "kl_loss_2": 2245.34130859375, + "kl_loss_3": 1787.1853515625, + "kl_loss_7": 655.1883148193359, + "learning_rate": 0.0005868240888334653, + "loss": 1211.5924, + "step": 4500 + }, + { + "ce_loss_10": 3.484956693649292, + "ce_loss_13": 3.3994694352149963, + "ce_loss_2": 4.541581082344055, + "ce_loss_3": 4.265275609493256, + "ce_loss_7": 3.685898816585541, + "epoch": 0.451, + "grad_norm": 664.0, + "kl_loss_10": 197.9249183654785, + "kl_loss_2": 2329.778839111328, + "kl_loss_3": 1847.1219604492187, + "kl_loss_7": 669.9408386230468, + "learning_rate": 0.0005852610963163119, + "loss": 1246.2838, + "step": 4510 + }, + { + "ce_loss_10": 3.506740427017212, + "ce_loss_13": 3.425083673000336, + "ce_loss_2": 4.510421705245972, + "ce_loss_3": 4.246485769748688, + "ce_loss_7": 3.6969696044921876, + "epoch": 0.452, + "grad_norm": 600.0, + "kl_loss_10": 188.6581832885742, + "kl_loss_2": 2238.5858459472656, + "kl_loss_3": 1770.7893127441407, + "kl_loss_7": 646.1408782958985, + "learning_rate": 0.0005836972452208654, + "loss": 1201.6553, + "step": 4520 + }, + { + "ce_loss_10": 3.505844843387604, + "ce_loss_13": 3.4249507427215575, + "ce_loss_2": 4.529585886001587, + "ce_loss_3": 4.277110803127289, + "ce_loss_7": 3.708256196975708, + "epoch": 0.453, + "grad_norm": 668.0, + "kl_loss_10": 193.22399291992187, + "kl_loss_2": 2265.8468383789063, + "kl_loss_3": 1815.4268432617187, + "kl_loss_7": 656.5519592285157, + "learning_rate": 0.0005821325512950885, + "loss": 1236.8736, + "step": 4530 + }, + { + "ce_loss_10": 3.5389772057533264, + "ce_loss_13": 3.4585880279541015, + "ce_loss_2": 4.540098547935486, + "ce_loss_3": 4.2798211693763735, + "ce_loss_7": 3.7288518071174623, + "epoch": 0.454, + "grad_norm": 592.0, + "kl_loss_10": 187.7821243286133, + "kl_loss_2": 2205.198779296875, + "kl_loss_3": 1748.45341796875, + "kl_loss_7": 639.7683776855469, + "learning_rate": 0.0005805670302954321, + "loss": 1221.9566, + "step": 4540 + }, + { + "ce_loss_10": 3.544492793083191, + "ce_loss_13": 3.4652194142341615, + "ce_loss_2": 4.541022229194641, + "ce_loss_3": 4.279520082473755, + "ce_loss_7": 3.7328044533729554, + "epoch": 0.455, + "grad_norm": 656.0, + "kl_loss_10": 186.06844177246094, + "kl_loss_2": 2226.980224609375, + "kl_loss_3": 1765.562744140625, + "kl_loss_7": 639.6060729980469, + "learning_rate": 0.000579000697986675, + "loss": 1199.4846, + "step": 4550 + }, + { + "ce_loss_10": 3.5037956118583677, + "ce_loss_13": 3.4134857773780825, + "ce_loss_2": 4.544147634506226, + "ce_loss_3": 4.274082601070404, + "ce_loss_7": 3.707910752296448, + "epoch": 0.456, + "grad_norm": 664.0, + "kl_loss_10": 200.43186416625977, + "kl_loss_2": 2315.5464111328124, + "kl_loss_3": 1832.4145568847657, + "kl_loss_7": 672.0404296875, + "learning_rate": 0.0005774335701417662, + "loss": 1229.2445, + "step": 4560 + }, + { + "ce_loss_10": 3.4942433714866636, + "ce_loss_13": 3.4095874786376954, + "ce_loss_2": 4.549353170394897, + "ce_loss_3": 4.279058015346527, + "ce_loss_7": 3.693026268482208, + "epoch": 0.457, + "grad_norm": 608.0, + "kl_loss_10": 192.1516143798828, + "kl_loss_2": 2342.186248779297, + "kl_loss_3": 1864.820654296875, + "kl_loss_7": 655.2123260498047, + "learning_rate": 0.0005758656625416658, + "loss": 1241.1571, + "step": 4570 + }, + { + "ce_loss_10": 3.5480048656463623, + "ce_loss_13": 3.4622543811798097, + "ce_loss_2": 4.561884355545044, + "ce_loss_3": 4.293217277526855, + "ce_loss_7": 3.743514323234558, + "epoch": 0.458, + "grad_norm": 616.0, + "kl_loss_10": 194.93896102905273, + "kl_loss_2": 2260.48984375, + "kl_loss_3": 1786.8557556152343, + "kl_loss_7": 654.7685607910156, + "learning_rate": 0.0005742969909751859, + "loss": 1199.7715, + "step": 4580 + }, + { + "ce_loss_10": 3.558157193660736, + "ce_loss_13": 3.4740110039711, + "ce_loss_2": 4.582901740074158, + "ce_loss_3": 4.310809695720673, + "ce_loss_7": 3.7480292677879334, + "epoch": 0.459, + "grad_norm": 636.0, + "kl_loss_10": 193.16277923583985, + "kl_loss_2": 2285.9891052246094, + "kl_loss_3": 1800.92705078125, + "kl_loss_7": 648.6197357177734, + "learning_rate": 0.0005727275712388318, + "loss": 1238.3732, + "step": 4590 + }, + { + "ce_loss_10": 3.5862102270126344, + "ce_loss_13": 3.509055662155151, + "ce_loss_2": 4.560896277427673, + "ce_loss_3": 4.298801875114441, + "ce_loss_7": 3.773897314071655, + "epoch": 0.46, + "grad_norm": 768.0, + "kl_loss_10": 186.60687026977538, + "kl_loss_2": 2190.591516113281, + "kl_loss_3": 1728.75283203125, + "kl_loss_7": 633.5937683105469, + "learning_rate": 0.0005711574191366427, + "loss": 1204.0141, + "step": 4600 + }, + { + "ce_loss_10": 3.537917101383209, + "ce_loss_13": 3.456929898262024, + "ce_loss_2": 4.532997250556946, + "ce_loss_3": 4.271801400184631, + "ce_loss_7": 3.7239314556121825, + "epoch": 0.461, + "grad_norm": 544.0, + "kl_loss_10": 188.38971405029298, + "kl_loss_2": 2244.5726928710938, + "kl_loss_3": 1779.3487548828125, + "kl_loss_7": 643.0867309570312, + "learning_rate": 0.0005695865504800327, + "loss": 1208.6229, + "step": 4610 + }, + { + "ce_loss_10": 3.475814175605774, + "ce_loss_13": 3.3895989418029786, + "ce_loss_2": 4.570864033699036, + "ce_loss_3": 4.298530387878418, + "ce_loss_7": 3.6918977737426757, + "epoch": 0.462, + "grad_norm": 688.0, + "kl_loss_10": 199.44021301269532, + "kl_loss_2": 2396.831396484375, + "kl_loss_3": 1919.1037109375, + "kl_loss_7": 685.7258270263671, + "learning_rate": 0.0005680149810876322, + "loss": 1259.1618, + "step": 4620 + }, + { + "ce_loss_10": 3.5307737231254577, + "ce_loss_13": 3.448805606365204, + "ce_loss_2": 4.553147649765014, + "ce_loss_3": 4.283793473243714, + "ce_loss_7": 3.720176661014557, + "epoch": 0.463, + "grad_norm": 632.0, + "kl_loss_10": 191.36487274169923, + "kl_loss_2": 2267.567822265625, + "kl_loss_3": 1802.5030578613282, + "kl_loss_7": 648.5958099365234, + "learning_rate": 0.0005664427267851271, + "loss": 1217.3594, + "step": 4630 + }, + { + "ce_loss_10": 3.4447478532791136, + "ce_loss_13": 3.362277901172638, + "ce_loss_2": 4.474937617778778, + "ce_loss_3": 4.203511357307434, + "ce_loss_7": 3.640981078147888, + "epoch": 0.464, + "grad_norm": 616.0, + "kl_loss_10": 189.61345367431642, + "kl_loss_2": 2284.305810546875, + "kl_loss_3": 1801.5720520019531, + "kl_loss_7": 647.2827972412109, + "learning_rate": 0.0005648698034051009, + "loss": 1216.2738, + "step": 4640 + }, + { + "ce_loss_10": 3.5612680554389953, + "ce_loss_13": 3.479226899147034, + "ce_loss_2": 4.606190347671509, + "ce_loss_3": 4.343288254737854, + "ce_loss_7": 3.7559192776679993, + "epoch": 0.465, + "grad_norm": 680.0, + "kl_loss_10": 189.31488800048828, + "kl_loss_2": 2300.2595642089846, + "kl_loss_3": 1835.63125, + "kl_loss_7": 647.0011413574218, + "learning_rate": 0.0005632962267868747, + "loss": 1204.3232, + "step": 4650 + }, + { + "ce_loss_10": 3.504312825202942, + "ce_loss_13": 3.4246782064437866, + "ce_loss_2": 4.501714015007019, + "ce_loss_3": 4.243509244918823, + "ce_loss_7": 3.6963974952697756, + "epoch": 0.466, + "grad_norm": 656.0, + "kl_loss_10": 184.82376022338866, + "kl_loss_2": 2221.081378173828, + "kl_loss_3": 1770.11845703125, + "kl_loss_7": 636.5809020996094, + "learning_rate": 0.0005617220127763474, + "loss": 1219.1382, + "step": 4660 + }, + { + "ce_loss_10": 3.578074049949646, + "ce_loss_13": 3.497161865234375, + "ce_loss_2": 4.561417579650879, + "ce_loss_3": 4.303230881690979, + "ce_loss_7": 3.7666924834251403, + "epoch": 0.467, + "grad_norm": 592.0, + "kl_loss_10": 188.17724151611327, + "kl_loss_2": 2198.6551513671875, + "kl_loss_3": 1739.3878234863282, + "kl_loss_7": 638.6716613769531, + "learning_rate": 0.0005601471772258368, + "loss": 1209.8152, + "step": 4670 + }, + { + "ce_loss_10": 3.5602858781814577, + "ce_loss_13": 3.4812931418418884, + "ce_loss_2": 4.544128322601319, + "ce_loss_3": 4.283940744400025, + "ce_loss_7": 3.750748324394226, + "epoch": 0.468, + "grad_norm": 684.0, + "kl_loss_10": 186.29373779296876, + "kl_loss_2": 2186.011083984375, + "kl_loss_3": 1724.9905029296874, + "kl_loss_7": 634.5341583251953, + "learning_rate": 0.0005585717359939192, + "loss": 1216.8666, + "step": 4680 + }, + { + "ce_loss_10": 3.47387490272522, + "ce_loss_13": 3.3916377425193787, + "ce_loss_2": 4.47896523475647, + "ce_loss_3": 4.213813447952271, + "ce_loss_7": 3.6638592004776003, + "epoch": 0.469, + "grad_norm": 736.0, + "kl_loss_10": 187.3494743347168, + "kl_loss_2": 2222.502734375, + "kl_loss_3": 1755.6538635253905, + "kl_loss_7": 638.4273468017578, + "learning_rate": 0.0005569957049452703, + "loss": 1235.6265, + "step": 4690 + }, + { + "ce_loss_10": 3.530002760887146, + "ce_loss_13": 3.4474871516227723, + "ce_loss_2": 4.558400893211365, + "ce_loss_3": 4.2877805709838865, + "ce_loss_7": 3.7245721340179445, + "epoch": 0.47, + "grad_norm": 704.0, + "kl_loss_10": 192.37612838745116, + "kl_loss_2": 2285.8403198242186, + "kl_loss_3": 1808.7154968261718, + "kl_loss_7": 653.8845581054687, + "learning_rate": 0.0005554190999505056, + "loss": 1234.8666, + "step": 4700 + }, + { + "ce_loss_10": 3.655286133289337, + "ce_loss_13": 3.5717312812805178, + "ce_loss_2": 4.666804194450378, + "ce_loss_3": 4.405120444297791, + "ce_loss_7": 3.852936863899231, + "epoch": 0.471, + "grad_norm": 612.0, + "kl_loss_10": 194.36407165527345, + "kl_loss_2": 2267.82900390625, + "kl_loss_3": 1798.198681640625, + "kl_loss_7": 661.5685516357422, + "learning_rate": 0.0005538419368860196, + "loss": 1183.023, + "step": 4710 + }, + { + "ce_loss_10": 3.5788578033447265, + "ce_loss_13": 3.498483991622925, + "ce_loss_2": 4.574405527114868, + "ce_loss_3": 4.313237249851227, + "ce_loss_7": 3.768782043457031, + "epoch": 0.472, + "grad_norm": 600.0, + "kl_loss_10": 190.92964248657228, + "kl_loss_2": 2231.130651855469, + "kl_loss_3": 1765.2990478515626, + "kl_loss_7": 643.7991912841796, + "learning_rate": 0.0005522642316338268, + "loss": 1233.693, + "step": 4720 + }, + { + "ce_loss_10": 3.581640887260437, + "ce_loss_13": 3.5026119351387024, + "ce_loss_2": 4.585021948814392, + "ce_loss_3": 4.325532901287079, + "ce_loss_7": 3.7722782731056212, + "epoch": 0.473, + "grad_norm": 608.0, + "kl_loss_10": 190.94201431274413, + "kl_loss_2": 2235.0365600585938, + "kl_loss_3": 1776.091015625, + "kl_loss_7": 644.9752258300781, + "learning_rate": 0.0005506860000814017, + "loss": 1245.2729, + "step": 4730 + }, + { + "ce_loss_10": 3.609380042552948, + "ce_loss_13": 3.5285964608192444, + "ce_loss_2": 4.5732040166854855, + "ce_loss_3": 4.316051697731018, + "ce_loss_7": 3.7950204849243163, + "epoch": 0.474, + "grad_norm": 624.0, + "kl_loss_10": 185.59933853149414, + "kl_loss_2": 2152.8808044433595, + "kl_loss_3": 1698.98193359375, + "kl_loss_7": 630.6379913330078, + "learning_rate": 0.0005491072581215186, + "loss": 1197.5367, + "step": 4740 + }, + { + "ce_loss_10": 3.6150610566139223, + "ce_loss_13": 3.5275300979614257, + "ce_loss_2": 4.606984066963196, + "ce_loss_3": 4.331383717060089, + "ce_loss_7": 3.8067922830581664, + "epoch": 0.475, + "grad_norm": 636.0, + "kl_loss_10": 196.42518692016603, + "kl_loss_2": 2246.8279663085937, + "kl_loss_3": 1758.9309448242188, + "kl_loss_7": 653.222998046875, + "learning_rate": 0.0005475280216520913, + "loss": 1187.7061, + "step": 4750 + }, + { + "ce_loss_10": 3.5246535778045653, + "ce_loss_13": 3.4453783988952638, + "ce_loss_2": 4.515398740768433, + "ce_loss_3": 4.251828491687775, + "ce_loss_7": 3.7125940799713133, + "epoch": 0.476, + "grad_norm": 660.0, + "kl_loss_10": 186.9199966430664, + "kl_loss_2": 2199.839562988281, + "kl_loss_3": 1734.392041015625, + "kl_loss_7": 632.1179809570312, + "learning_rate": 0.0005459483065760138, + "loss": 1229.7142, + "step": 4760 + }, + { + "ce_loss_10": 3.4620707392692567, + "ce_loss_13": 3.379168164730072, + "ce_loss_2": 4.535378384590149, + "ce_loss_3": 4.269984316825867, + "ce_loss_7": 3.66820707321167, + "epoch": 0.477, + "grad_norm": 668.0, + "kl_loss_10": 189.84198379516602, + "kl_loss_2": 2346.7093200683594, + "kl_loss_3": 1881.5502380371095, + "kl_loss_7": 655.4429504394532, + "learning_rate": 0.0005443681288009991, + "loss": 1231.516, + "step": 4770 + }, + { + "ce_loss_10": 3.5201017260551453, + "ce_loss_13": 3.4394583463668824, + "ce_loss_2": 4.551519656181336, + "ce_loss_3": 4.275486898422241, + "ce_loss_7": 3.712023985385895, + "epoch": 0.478, + "grad_norm": 560.0, + "kl_loss_10": 188.47934265136718, + "kl_loss_2": 2298.261828613281, + "kl_loss_3": 1807.319403076172, + "kl_loss_7": 646.2581420898438, + "learning_rate": 0.0005427875042394199, + "loss": 1231.2074, + "step": 4780 + }, + { + "ce_loss_10": 3.5546525955200194, + "ce_loss_13": 3.4689871072769165, + "ce_loss_2": 4.55171308517456, + "ce_loss_3": 4.2830651044845585, + "ce_loss_7": 3.7494895100593566, + "epoch": 0.479, + "grad_norm": 568.0, + "kl_loss_10": 193.1684341430664, + "kl_loss_2": 2223.558709716797, + "kl_loss_3": 1744.958233642578, + "kl_loss_7": 652.0952423095703, + "learning_rate": 0.0005412064488081482, + "loss": 1232.2334, + "step": 4790 + }, + { + "ce_loss_10": 3.560468685626984, + "ce_loss_13": 3.4794244885444643, + "ce_loss_2": 4.549170875549317, + "ce_loss_3": 4.280433797836304, + "ce_loss_7": 3.744898808002472, + "epoch": 0.48, + "grad_norm": 548.0, + "kl_loss_10": 188.24676589965821, + "kl_loss_2": 2217.6575561523437, + "kl_loss_3": 1743.2784423828125, + "kl_loss_7": 636.4865295410157, + "learning_rate": 0.0005396249784283942, + "loss": 1197.0651, + "step": 4800 + }, + { + "ce_loss_10": 3.575687527656555, + "ce_loss_13": 3.4918730735778807, + "ce_loss_2": 4.614717435836792, + "ce_loss_3": 4.347899675369263, + "ce_loss_7": 3.7766286730766296, + "epoch": 0.481, + "grad_norm": 592.0, + "kl_loss_10": 195.0629508972168, + "kl_loss_2": 2307.5621826171873, + "kl_loss_3": 1836.4153686523437, + "kl_loss_7": 665.144580078125, + "learning_rate": 0.0005380431090255476, + "loss": 1235.3045, + "step": 4810 + }, + { + "ce_loss_10": 3.565406787395477, + "ce_loss_13": 3.487363612651825, + "ce_loss_2": 4.546458888053894, + "ce_loss_3": 4.2899955153465275, + "ce_loss_7": 3.747445857524872, + "epoch": 0.482, + "grad_norm": 608.0, + "kl_loss_10": 183.49071578979493, + "kl_loss_2": 2200.6481811523436, + "kl_loss_3": 1737.9393493652344, + "kl_loss_7": 622.3307556152344, + "learning_rate": 0.0005364608565290155, + "loss": 1189.2841, + "step": 4820 + }, + { + "ce_loss_10": 3.5748016953468325, + "ce_loss_13": 3.493953990936279, + "ce_loss_2": 4.58906729221344, + "ce_loss_3": 4.324404489994049, + "ce_loss_7": 3.7643205761909484, + "epoch": 0.483, + "grad_norm": 640.0, + "kl_loss_10": 190.96404800415038, + "kl_loss_2": 2251.1127075195313, + "kl_loss_3": 1785.181817626953, + "kl_loss_7": 641.9654663085937, + "learning_rate": 0.0005348782368720626, + "loss": 1217.6031, + "step": 4830 + }, + { + "ce_loss_10": 3.5082598328590393, + "ce_loss_13": 3.427862787246704, + "ce_loss_2": 4.508589172363282, + "ce_loss_3": 4.243466067314148, + "ce_loss_7": 3.6949036836624147, + "epoch": 0.484, + "grad_norm": 560.0, + "kl_loss_10": 186.74840545654297, + "kl_loss_2": 2224.133184814453, + "kl_loss_3": 1753.7203369140625, + "kl_loss_7": 630.4135833740235, + "learning_rate": 0.000533295265991652, + "loss": 1216.8205, + "step": 4840 + }, + { + "ce_loss_10": 3.5815645456314087, + "ce_loss_13": 3.4982495427131655, + "ce_loss_2": 4.554982018470764, + "ce_loss_3": 4.299691355228424, + "ce_loss_7": 3.7735238790512087, + "epoch": 0.485, + "grad_norm": 584.0, + "kl_loss_10": 187.77717666625978, + "kl_loss_2": 2175.573095703125, + "kl_loss_3": 1715.7588928222656, + "kl_loss_7": 631.838656616211, + "learning_rate": 0.0005317119598282822, + "loss": 1183.9046, + "step": 4850 + }, + { + "ce_loss_10": 3.586243951320648, + "ce_loss_13": 3.5034523725509645, + "ce_loss_2": 4.583103036880493, + "ce_loss_3": 4.312316799163819, + "ce_loss_7": 3.777419722080231, + "epoch": 0.486, + "grad_norm": 648.0, + "kl_loss_10": 189.01727676391602, + "kl_loss_2": 2203.9095703125, + "kl_loss_3": 1739.787420654297, + "kl_loss_7": 638.8051147460938, + "learning_rate": 0.0005301283343258293, + "loss": 1199.0793, + "step": 4860 + }, + { + "ce_loss_10": 3.644785749912262, + "ce_loss_13": 3.563555288314819, + "ce_loss_2": 4.610913848876953, + "ce_loss_3": 4.34997011423111, + "ce_loss_7": 3.832633006572723, + "epoch": 0.487, + "grad_norm": 648.0, + "kl_loss_10": 187.26018371582032, + "kl_loss_2": 2164.9867431640623, + "kl_loss_3": 1703.8299499511718, + "kl_loss_7": 629.719970703125, + "learning_rate": 0.000528544405431384, + "loss": 1174.2795, + "step": 4870 + }, + { + "ce_loss_10": 3.5308486342430117, + "ce_loss_13": 3.4465184569358827, + "ce_loss_2": 4.54223735332489, + "ce_loss_3": 4.275557327270508, + "ce_loss_7": 3.728735053539276, + "epoch": 0.488, + "grad_norm": 692.0, + "kl_loss_10": 194.1014518737793, + "kl_loss_2": 2267.4865783691407, + "kl_loss_3": 1794.8980224609375, + "kl_loss_7": 653.2649841308594, + "learning_rate": 0.000526960189095093, + "loss": 1222.7201, + "step": 4880 + }, + { + "ce_loss_10": 3.5016911029815674, + "ce_loss_13": 3.422479736804962, + "ce_loss_2": 4.5065477132797245, + "ce_loss_3": 4.244668066501617, + "ce_loss_7": 3.6958303570747377, + "epoch": 0.489, + "grad_norm": 624.0, + "kl_loss_10": 185.53594131469725, + "kl_loss_2": 2219.627575683594, + "kl_loss_3": 1760.8566650390626, + "kl_loss_7": 633.8929748535156, + "learning_rate": 0.0005253757012699972, + "loss": 1199.7284, + "step": 4890 + }, + { + "ce_loss_10": 3.592365336418152, + "ce_loss_13": 3.5133956909179687, + "ce_loss_2": 4.582755160331726, + "ce_loss_3": 4.310234916210175, + "ce_loss_7": 3.779422330856323, + "epoch": 0.49, + "grad_norm": 608.0, + "kl_loss_10": 188.63387451171874, + "kl_loss_2": 2196.80634765625, + "kl_loss_3": 1721.877880859375, + "kl_loss_7": 628.9239440917969, + "learning_rate": 0.0005237909579118712, + "loss": 1209.9893, + "step": 4900 + }, + { + "ce_loss_10": 3.5542251110076903, + "ce_loss_13": 3.470878171920776, + "ce_loss_2": 4.575217127799988, + "ce_loss_3": 4.311789894104004, + "ce_loss_7": 3.748956894874573, + "epoch": 0.491, + "grad_norm": 688.0, + "kl_loss_10": 192.452791595459, + "kl_loss_2": 2289.5204833984376, + "kl_loss_3": 1820.7321044921875, + "kl_loss_7": 654.6072784423828, + "learning_rate": 0.0005222059749790631, + "loss": 1232.3309, + "step": 4910 + }, + { + "ce_loss_10": 3.6172361254692076, + "ce_loss_13": 3.538671875, + "ce_loss_2": 4.572561645507813, + "ce_loss_3": 4.3095086216926575, + "ce_loss_7": 3.7990201711654663, + "epoch": 0.492, + "grad_norm": 580.0, + "kl_loss_10": 186.14958953857422, + "kl_loss_2": 2152.0723571777344, + "kl_loss_3": 1686.5931701660156, + "kl_loss_7": 627.8851165771484, + "learning_rate": 0.0005206207684323337, + "loss": 1161.1154, + "step": 4920 + }, + { + "ce_loss_10": 3.597834038734436, + "ce_loss_13": 3.5186328291893005, + "ce_loss_2": 4.576415348052978, + "ce_loss_3": 4.318524956703186, + "ce_loss_7": 3.7833918571472167, + "epoch": 0.493, + "grad_norm": 680.0, + "kl_loss_10": 190.28093643188475, + "kl_loss_2": 2205.189178466797, + "kl_loss_3": 1744.895166015625, + "kl_loss_7": 637.9018249511719, + "learning_rate": 0.000519035354234695, + "loss": 1221.5055, + "step": 4930 + }, + { + "ce_loss_10": 3.5777213335037232, + "ce_loss_13": 3.4926177620887757, + "ce_loss_2": 4.569476509094239, + "ce_loss_3": 4.300305211544037, + "ce_loss_7": 3.7719446539878847, + "epoch": 0.494, + "grad_norm": 652.0, + "kl_loss_10": 191.98795700073242, + "kl_loss_2": 2217.8314697265623, + "kl_loss_3": 1735.119366455078, + "kl_loss_7": 648.0345703125, + "learning_rate": 0.0005174497483512506, + "loss": 1188.0275, + "step": 4940 + }, + { + "ce_loss_10": 3.617672252655029, + "ce_loss_13": 3.5411610841751098, + "ce_loss_2": 4.595355463027954, + "ce_loss_3": 4.32707976102829, + "ce_loss_7": 3.8022171378135683, + "epoch": 0.495, + "grad_norm": 704.0, + "kl_loss_10": 185.97076797485352, + "kl_loss_2": 2193.0468505859376, + "kl_loss_3": 1726.8812622070313, + "kl_loss_7": 638.2306701660157, + "learning_rate": 0.0005158639667490339, + "loss": 1220.6553, + "step": 4950 + }, + { + "ce_loss_10": 3.5151694416999817, + "ce_loss_13": 3.4326966643333434, + "ce_loss_2": 4.5227725267410275, + "ce_loss_3": 4.255635476112365, + "ce_loss_7": 3.710303211212158, + "epoch": 0.496, + "grad_norm": 632.0, + "kl_loss_10": 189.1722068786621, + "kl_loss_2": 2227.908306884766, + "kl_loss_3": 1751.7201293945313, + "kl_loss_7": 643.16630859375, + "learning_rate": 0.0005142780253968481, + "loss": 1203.2568, + "step": 4960 + }, + { + "ce_loss_10": 3.4694177746772765, + "ce_loss_13": 3.3919921875, + "ce_loss_2": 4.455039215087891, + "ce_loss_3": 4.192563462257385, + "ce_loss_7": 3.6608409881591797, + "epoch": 0.497, + "grad_norm": 672.0, + "kl_loss_10": 182.45398559570313, + "kl_loss_2": 2196.9568115234374, + "kl_loss_3": 1734.4176452636718, + "kl_loss_7": 624.9595611572265, + "learning_rate": 0.0005126919402651053, + "loss": 1165.1617, + "step": 4970 + }, + { + "ce_loss_10": 3.5411869525909423, + "ce_loss_13": 3.4560129284858703, + "ce_loss_2": 4.551884508132934, + "ce_loss_3": 4.285728931427002, + "ce_loss_7": 3.730460357666016, + "epoch": 0.498, + "grad_norm": 612.0, + "kl_loss_10": 190.1128143310547, + "kl_loss_2": 2234.540148925781, + "kl_loss_3": 1768.0274841308594, + "kl_loss_7": 642.9938751220703, + "learning_rate": 0.0005111057273256647, + "loss": 1218.0719, + "step": 4980 + }, + { + "ce_loss_10": 3.640482187271118, + "ce_loss_13": 3.563759708404541, + "ce_loss_2": 4.559627389907837, + "ce_loss_3": 4.304729497432708, + "ce_loss_7": 3.809755790233612, + "epoch": 0.499, + "grad_norm": 600.0, + "kl_loss_10": 181.2877067565918, + "kl_loss_2": 2076.4399169921876, + "kl_loss_3": 1633.4311096191407, + "kl_loss_7": 606.3493682861329, + "learning_rate": 0.0005095194025516733, + "loss": 1149.4782, + "step": 4990 + }, + { + "ce_loss_10": 3.561459171772003, + "ce_loss_13": 3.485899102687836, + "ce_loss_2": 4.532869434356689, + "ce_loss_3": 4.273192000389099, + "ce_loss_7": 3.7427910447120665, + "epoch": 0.5, + "grad_norm": 612.0, + "kl_loss_10": 182.62270965576172, + "kl_loss_2": 2161.449591064453, + "kl_loss_3": 1706.75859375, + "kl_loss_7": 617.4605316162109, + "learning_rate": 0.000507932981917404, + "loss": 1217.3309, + "step": 5000 + }, + { + "ce_loss_10": 3.518963348865509, + "ce_loss_13": 3.4364115476608275, + "ce_loss_2": 4.566620469093323, + "ce_loss_3": 4.296507096290588, + "ce_loss_7": 3.7167017698287963, + "epoch": 0.501, + "grad_norm": 604.0, + "kl_loss_10": 191.43318862915038, + "kl_loss_2": 2312.919299316406, + "kl_loss_3": 1835.5362060546875, + "kl_loss_7": 654.5825164794921, + "learning_rate": 0.0005063464813980949, + "loss": 1243.5809, + "step": 5010 + }, + { + "ce_loss_10": 3.503278911113739, + "ce_loss_13": 3.423468828201294, + "ce_loss_2": 4.508650994300842, + "ce_loss_3": 4.244243478775024, + "ce_loss_7": 3.6842400670051574, + "epoch": 0.502, + "grad_norm": 616.0, + "kl_loss_10": 187.45429153442382, + "kl_loss_2": 2242.6967956542967, + "kl_loss_3": 1780.9238586425781, + "kl_loss_7": 636.0354858398438, + "learning_rate": 0.0005047599169697884, + "loss": 1195.7843, + "step": 5020 + }, + { + "ce_loss_10": 3.4397648930549622, + "ce_loss_13": 3.357620894908905, + "ce_loss_2": 4.463168692588806, + "ce_loss_3": 4.195060646533966, + "ce_loss_7": 3.633397877216339, + "epoch": 0.503, + "grad_norm": 604.0, + "kl_loss_10": 185.41551361083984, + "kl_loss_2": 2258.051135253906, + "kl_loss_3": 1778.3387390136718, + "kl_loss_7": 635.239013671875, + "learning_rate": 0.000503173304609171, + "loss": 1183.8663, + "step": 5030 + }, + { + "ce_loss_10": 3.5603776931762696, + "ce_loss_13": 3.4799546360969544, + "ce_loss_2": 4.5456082105636595, + "ce_loss_3": 4.285192847251892, + "ce_loss_7": 3.7480576753616335, + "epoch": 0.504, + "grad_norm": 656.0, + "kl_loss_10": 184.81720504760742, + "kl_loss_2": 2170.377197265625, + "kl_loss_3": 1713.3125, + "kl_loss_7": 627.2051513671875, + "learning_rate": 0.0005015866602934111, + "loss": 1173.4953, + "step": 5040 + }, + { + "ce_loss_10": 3.5348097562789915, + "ce_loss_13": 3.4481786727905273, + "ce_loss_2": 4.561786007881165, + "ce_loss_3": 4.291987287998199, + "ce_loss_7": 3.732908022403717, + "epoch": 0.505, + "grad_norm": 584.0, + "kl_loss_10": 195.19094161987306, + "kl_loss_2": 2283.529962158203, + "kl_loss_3": 1808.7241088867188, + "kl_loss_7": 661.38330078125, + "learning_rate": 0.0005, + "loss": 1216.3971, + "step": 5050 + }, + { + "ce_loss_10": 3.5199029207229615, + "ce_loss_13": 3.4400954723358153, + "ce_loss_2": 4.522394108772278, + "ce_loss_3": 4.258548331260681, + "ce_loss_7": 3.7067763924598696, + "epoch": 0.506, + "grad_norm": 632.0, + "kl_loss_10": 190.6364860534668, + "kl_loss_2": 2246.9065795898437, + "kl_loss_3": 1774.8018188476562, + "kl_loss_7": 642.158203125, + "learning_rate": 0.0004984133397065889, + "loss": 1187.0591, + "step": 5060 + }, + { + "ce_loss_10": 3.529603970050812, + "ce_loss_13": 3.448684501647949, + "ce_loss_2": 4.540892434120178, + "ce_loss_3": 4.281192362308502, + "ce_loss_7": 3.727533829212189, + "epoch": 0.507, + "grad_norm": 572.0, + "kl_loss_10": 189.2146110534668, + "kl_loss_2": 2238.786248779297, + "kl_loss_3": 1779.2802124023438, + "kl_loss_7": 641.3925506591797, + "learning_rate": 0.0004968266953908291, + "loss": 1190.1465, + "step": 5070 + }, + { + "ce_loss_10": 3.5666260600090025, + "ce_loss_13": 3.486749768257141, + "ce_loss_2": 4.580948376655579, + "ce_loss_3": 4.316030120849609, + "ce_loss_7": 3.7590123891830443, + "epoch": 0.508, + "grad_norm": 608.0, + "kl_loss_10": 183.170157623291, + "kl_loss_2": 2245.8196716308594, + "kl_loss_3": 1773.8548583984375, + "kl_loss_7": 630.8032928466797, + "learning_rate": 0.0004952400830302117, + "loss": 1205.3312, + "step": 5080 + }, + { + "ce_loss_10": 3.4943687319755554, + "ce_loss_13": 3.4131668329238893, + "ce_loss_2": 4.523447823524475, + "ce_loss_3": 4.255696547031403, + "ce_loss_7": 3.686564898490906, + "epoch": 0.509, + "grad_norm": 624.0, + "kl_loss_10": 190.01820449829103, + "kl_loss_2": 2279.890344238281, + "kl_loss_3": 1807.6574096679688, + "kl_loss_7": 647.3827026367187, + "learning_rate": 0.0004936535186019053, + "loss": 1207.5289, + "step": 5090 + }, + { + "ce_loss_10": 3.5966561436653137, + "ce_loss_13": 3.5205499291419984, + "ce_loss_2": 4.557806515693665, + "ce_loss_3": 4.297008419036866, + "ce_loss_7": 3.777693784236908, + "epoch": 0.51, + "grad_norm": 572.0, + "kl_loss_10": 181.29688186645507, + "kl_loss_2": 2148.9375854492187, + "kl_loss_3": 1687.6425170898438, + "kl_loss_7": 609.7850677490235, + "learning_rate": 0.000492067018082596, + "loss": 1180.1517, + "step": 5100 + }, + { + "ce_loss_10": 3.5341065168380736, + "ce_loss_13": 3.448958945274353, + "ce_loss_2": 4.584142446517944, + "ce_loss_3": 4.311909413337707, + "ce_loss_7": 3.7378315210342405, + "epoch": 0.511, + "grad_norm": 580.0, + "kl_loss_10": 191.75616531372071, + "kl_loss_2": 2313.8392028808594, + "kl_loss_3": 1838.1626281738281, + "kl_loss_7": 657.9944549560547, + "learning_rate": 0.0004904805974483267, + "loss": 1252.0359, + "step": 5110 + }, + { + "ce_loss_10": 3.6478444814682005, + "ce_loss_13": 3.5622426509857177, + "ce_loss_2": 4.652135348320007, + "ce_loss_3": 4.385519003868103, + "ce_loss_7": 3.8461916565895082, + "epoch": 0.512, + "grad_norm": 620.0, + "kl_loss_10": 196.4123405456543, + "kl_loss_2": 2261.5567626953125, + "kl_loss_3": 1794.3322509765626, + "kl_loss_7": 663.2056915283204, + "learning_rate": 0.0004888942726743353, + "loss": 1254.773, + "step": 5120 + }, + { + "ce_loss_10": 3.5161622405052184, + "ce_loss_13": 3.435041069984436, + "ce_loss_2": 4.527030563354492, + "ce_loss_3": 4.273452854156494, + "ce_loss_7": 3.7132395029067995, + "epoch": 0.513, + "grad_norm": 612.0, + "kl_loss_10": 189.22552337646485, + "kl_loss_2": 2261.9498779296873, + "kl_loss_3": 1801.7636291503907, + "kl_loss_7": 649.0409454345703, + "learning_rate": 0.0004873080597348947, + "loss": 1220.4549, + "step": 5130 + }, + { + "ce_loss_10": 3.4059476256370544, + "ce_loss_13": 3.325529730319977, + "ce_loss_2": 4.472175240516663, + "ce_loss_3": 4.212775444984436, + "ce_loss_7": 3.6100828886032104, + "epoch": 0.514, + "grad_norm": 576.0, + "kl_loss_10": 188.01781005859374, + "kl_loss_2": 2348.922229003906, + "kl_loss_3": 1884.4989135742187, + "kl_loss_7": 653.7215118408203, + "learning_rate": 0.0004857219746031519, + "loss": 1228.3554, + "step": 5140 + }, + { + "ce_loss_10": 3.5706036925315856, + "ce_loss_13": 3.4925912499427794, + "ce_loss_2": 4.564787793159485, + "ce_loss_3": 4.288926684856415, + "ce_loss_7": 3.7588186025619508, + "epoch": 0.515, + "grad_norm": 564.0, + "kl_loss_10": 187.27239913940429, + "kl_loss_2": 2197.20703125, + "kl_loss_3": 1721.3864501953126, + "kl_loss_7": 633.3875030517578, + "learning_rate": 0.0004841360332509663, + "loss": 1198.5317, + "step": 5150 + }, + { + "ce_loss_10": 3.5291930079460143, + "ce_loss_13": 3.451045203208923, + "ce_loss_2": 4.509535562992096, + "ce_loss_3": 4.244547712802887, + "ce_loss_7": 3.7146947622299193, + "epoch": 0.516, + "grad_norm": 640.0, + "kl_loss_10": 182.93116302490233, + "kl_loss_2": 2188.2712890625, + "kl_loss_3": 1720.3389709472656, + "kl_loss_7": 621.1425567626953, + "learning_rate": 0.0004825502516487497, + "loss": 1155.4164, + "step": 5160 + }, + { + "ce_loss_10": 3.494162142276764, + "ce_loss_13": 3.410848069190979, + "ce_loss_2": 4.509466361999512, + "ce_loss_3": 4.249596023559571, + "ce_loss_7": 3.689158225059509, + "epoch": 0.517, + "grad_norm": 776.0, + "kl_loss_10": 188.6508804321289, + "kl_loss_2": 2267.020611572266, + "kl_loss_3": 1803.9957458496094, + "kl_loss_7": 643.8313232421875, + "learning_rate": 0.00048096464576530507, + "loss": 1222.8347, + "step": 5170 + }, + { + "ce_loss_10": 3.5969889640808104, + "ce_loss_13": 3.5190072774887087, + "ce_loss_2": 4.547854423522949, + "ce_loss_3": 4.293277430534363, + "ce_loss_7": 3.7762330770492554, + "epoch": 0.518, + "grad_norm": 620.0, + "kl_loss_10": 184.8033348083496, + "kl_loss_2": 2134.698907470703, + "kl_loss_3": 1683.638848876953, + "kl_loss_7": 620.0321014404296, + "learning_rate": 0.00047937923156766646, + "loss": 1168.0455, + "step": 5180 + }, + { + "ce_loss_10": 3.6420543789863586, + "ce_loss_13": 3.5626509547233582, + "ce_loss_2": 4.591393780708313, + "ce_loss_3": 4.326744735240936, + "ce_loss_7": 3.8211991429328918, + "epoch": 0.519, + "grad_norm": 560.0, + "kl_loss_10": 186.71140975952147, + "kl_loss_2": 2131.37041015625, + "kl_loss_3": 1673.548291015625, + "kl_loss_7": 620.3797760009766, + "learning_rate": 0.00047779402502093696, + "loss": 1176.4619, + "step": 5190 + }, + { + "ce_loss_10": 3.6047547817230225, + "ce_loss_13": 3.5276923775672913, + "ce_loss_2": 4.578773355484008, + "ce_loss_3": 4.314329183101654, + "ce_loss_7": 3.7894126772880554, + "epoch": 0.52, + "grad_norm": 572.0, + "kl_loss_10": 184.64933090209962, + "kl_loss_2": 2171.194439697266, + "kl_loss_3": 1701.0345581054687, + "kl_loss_7": 621.1681274414062, + "learning_rate": 0.0004762090420881289, + "loss": 1192.2752, + "step": 5200 + }, + { + "ce_loss_10": 3.524991714954376, + "ce_loss_13": 3.449671447277069, + "ce_loss_2": 4.498011994361877, + "ce_loss_3": 4.23529599905014, + "ce_loss_7": 3.705312669277191, + "epoch": 0.521, + "grad_norm": 608.0, + "kl_loss_10": 186.35540390014648, + "kl_loss_2": 2183.97861328125, + "kl_loss_3": 1723.4856018066407, + "kl_loss_7": 620.2787628173828, + "learning_rate": 0.00047462429873000296, + "loss": 1166.6783, + "step": 5210 + }, + { + "ce_loss_10": 3.610927963256836, + "ce_loss_13": 3.5292730212211607, + "ce_loss_2": 4.586357808113098, + "ce_loss_3": 4.316179418563843, + "ce_loss_7": 3.7876657485961913, + "epoch": 0.522, + "grad_norm": 572.0, + "kl_loss_10": 187.58379135131835, + "kl_loss_2": 2205.0898559570314, + "kl_loss_3": 1728.4106079101562, + "kl_loss_7": 624.2286590576172, + "learning_rate": 0.0004730398109049071, + "loss": 1181.2787, + "step": 5220 + }, + { + "ce_loss_10": 3.542900788784027, + "ce_loss_13": 3.4592981576919555, + "ce_loss_2": 4.5604215383529665, + "ce_loss_3": 4.294589376449585, + "ce_loss_7": 3.7346125721931456, + "epoch": 0.523, + "grad_norm": 632.0, + "kl_loss_10": 192.22620544433593, + "kl_loss_2": 2275.4386779785154, + "kl_loss_3": 1804.1369689941407, + "kl_loss_7": 648.4010925292969, + "learning_rate": 0.000471455594568616, + "loss": 1206.5586, + "step": 5230 + }, + { + "ce_loss_10": 3.612694036960602, + "ce_loss_13": 3.5346154451370237, + "ce_loss_2": 4.571756148338318, + "ce_loss_3": 4.30354597568512, + "ce_loss_7": 3.7924134016036986, + "epoch": 0.524, + "grad_norm": 584.0, + "kl_loss_10": 184.57232360839845, + "kl_loss_2": 2148.5399963378904, + "kl_loss_3": 1679.7537841796875, + "kl_loss_7": 619.590966796875, + "learning_rate": 0.00046987166567417086, + "loss": 1185.6557, + "step": 5240 + }, + { + "ce_loss_10": 3.5288819313049316, + "ce_loss_13": 3.452391028404236, + "ce_loss_2": 4.524345111846924, + "ce_loss_3": 4.255153965950012, + "ce_loss_7": 3.7151949644088744, + "epoch": 0.525, + "grad_norm": 640.0, + "kl_loss_10": 184.01749114990236, + "kl_loss_2": 2198.7858947753907, + "kl_loss_3": 1730.872314453125, + "kl_loss_7": 629.3970092773437, + "learning_rate": 0.00046828804017171776, + "loss": 1156.5996, + "step": 5250 + }, + { + "ce_loss_10": 3.5754063725471497, + "ce_loss_13": 3.4882086515426636, + "ce_loss_2": 4.589629459381103, + "ce_loss_3": 4.328830146789551, + "ce_loss_7": 3.7704445004463194, + "epoch": 0.526, + "grad_norm": 640.0, + "kl_loss_10": 189.38601303100586, + "kl_loss_2": 2242.6748046875, + "kl_loss_3": 1771.9065246582031, + "kl_loss_7": 637.7771270751953, + "learning_rate": 0.00046670473400834805, + "loss": 1218.8605, + "step": 5260 + }, + { + "ce_loss_10": 3.5049550890922547, + "ce_loss_13": 3.428373408317566, + "ce_loss_2": 4.489202237129211, + "ce_loss_3": 4.228802132606506, + "ce_loss_7": 3.686991608142853, + "epoch": 0.527, + "grad_norm": 580.0, + "kl_loss_10": 181.47291641235353, + "kl_loss_2": 2184.507427978516, + "kl_loss_3": 1721.9762329101563, + "kl_loss_7": 614.4228942871093, + "learning_rate": 0.00046512176312793734, + "loss": 1216.9304, + "step": 5270 + }, + { + "ce_loss_10": 3.497020888328552, + "ce_loss_13": 3.415910315513611, + "ce_loss_2": 4.500096344947815, + "ce_loss_3": 4.221375334262848, + "ce_loss_7": 3.6874555468559267, + "epoch": 0.528, + "grad_norm": 608.0, + "kl_loss_10": 183.7262046813965, + "kl_loss_2": 2223.9841369628907, + "kl_loss_3": 1744.9040588378907, + "kl_loss_7": 628.2290283203125, + "learning_rate": 0.00046353914347098467, + "loss": 1206.4577, + "step": 5280 + }, + { + "ce_loss_10": 3.5970619559288024, + "ce_loss_13": 3.5186134576797485, + "ce_loss_2": 4.588784885406494, + "ce_loss_3": 4.328100037574768, + "ce_loss_7": 3.7806106090545653, + "epoch": 0.529, + "grad_norm": 608.0, + "kl_loss_10": 183.81845779418944, + "kl_loss_2": 2204.89072265625, + "kl_loss_3": 1738.6330078125, + "kl_loss_7": 622.7282592773438, + "learning_rate": 0.0004619568909744524, + "loss": 1214.3289, + "step": 5290 + }, + { + "ce_loss_10": 3.5965808272361754, + "ce_loss_13": 3.519477891921997, + "ce_loss_2": 4.575191998481751, + "ce_loss_3": 4.308115267753601, + "ce_loss_7": 3.779456090927124, + "epoch": 0.53, + "grad_norm": 624.0, + "kl_loss_10": 185.90534057617188, + "kl_loss_2": 2166.9622314453127, + "kl_loss_3": 1701.3540832519532, + "kl_loss_7": 623.1412811279297, + "learning_rate": 0.00046037502157160573, + "loss": 1194.0631, + "step": 5300 + }, + { + "ce_loss_10": 3.475346398353577, + "ce_loss_13": 3.3953770637512206, + "ce_loss_2": 4.472927665710449, + "ce_loss_3": 4.211131680011749, + "ce_loss_7": 3.672848129272461, + "epoch": 0.531, + "grad_norm": 608.0, + "kl_loss_10": 188.33962783813476, + "kl_loss_2": 2232.2967163085937, + "kl_loss_3": 1767.3839172363282, + "kl_loss_7": 649.8835662841797, + "learning_rate": 0.00045879355119185207, + "loss": 1212.3993, + "step": 5310 + }, + { + "ce_loss_10": 3.555951988697052, + "ce_loss_13": 3.474162495136261, + "ce_loss_2": 4.560364985466004, + "ce_loss_3": 4.293745231628418, + "ce_loss_7": 3.751049613952637, + "epoch": 0.532, + "grad_norm": 672.0, + "kl_loss_10": 190.96983184814454, + "kl_loss_2": 2257.3020629882812, + "kl_loss_3": 1780.9093383789063, + "kl_loss_7": 650.6417663574218, + "learning_rate": 0.0004572124957605803, + "loss": 1223.1152, + "step": 5320 + }, + { + "ce_loss_10": 3.5723905324935914, + "ce_loss_13": 3.492247462272644, + "ce_loss_2": 4.554775309562683, + "ce_loss_3": 4.289375352859497, + "ce_loss_7": 3.7621920228004457, + "epoch": 0.533, + "grad_norm": 584.0, + "kl_loss_10": 185.00704040527344, + "kl_loss_2": 2210.7857055664062, + "kl_loss_3": 1738.2788696289062, + "kl_loss_7": 631.7678100585938, + "learning_rate": 0.00045563187119900103, + "loss": 1171.3742, + "step": 5330 + }, + { + "ce_loss_10": 3.4156481266021728, + "ce_loss_13": 3.338289904594421, + "ce_loss_2": 4.459405374526978, + "ce_loss_3": 4.184291207790375, + "ce_loss_7": 3.612143576145172, + "epoch": 0.534, + "grad_norm": 668.0, + "kl_loss_10": 185.77383117675782, + "kl_loss_2": 2280.9642578125, + "kl_loss_3": 1801.5166015625, + "kl_loss_7": 637.4973205566406, + "learning_rate": 0.00045405169342398633, + "loss": 1214.5622, + "step": 5340 + }, + { + "ce_loss_10": 3.5048020482063293, + "ce_loss_13": 3.422155427932739, + "ce_loss_2": 4.527113747596741, + "ce_loss_3": 4.256860768795013, + "ce_loss_7": 3.6958253622055053, + "epoch": 0.535, + "grad_norm": 580.0, + "kl_loss_10": 188.63988189697267, + "kl_loss_2": 2252.499432373047, + "kl_loss_3": 1773.2778076171876, + "kl_loss_7": 633.0957061767579, + "learning_rate": 0.0004524719783479088, + "loss": 1187.9953, + "step": 5350 + }, + { + "ce_loss_10": 3.460780155658722, + "ce_loss_13": 3.378307545185089, + "ce_loss_2": 4.497902464866638, + "ce_loss_3": 4.2276026725769045, + "ce_loss_7": 3.6559959650039673, + "epoch": 0.536, + "grad_norm": 580.0, + "kl_loss_10": 189.0280532836914, + "kl_loss_2": 2293.6262939453127, + "kl_loss_3": 1820.3978698730468, + "kl_loss_7": 642.6393402099609, + "learning_rate": 0.00045089274187848144, + "loss": 1197.8392, + "step": 5360 + }, + { + "ce_loss_10": 3.5799126744270326, + "ce_loss_13": 3.501321530342102, + "ce_loss_2": 4.5603124618530275, + "ce_loss_3": 4.297963404655457, + "ce_loss_7": 3.7619481921195983, + "epoch": 0.537, + "grad_norm": 672.0, + "kl_loss_10": 183.09423599243163, + "kl_loss_2": 2192.1404357910155, + "kl_loss_3": 1730.206787109375, + "kl_loss_7": 620.7777648925781, + "learning_rate": 0.00044931399991859835, + "loss": 1181.3807, + "step": 5370 + }, + { + "ce_loss_10": 3.4432420253753664, + "ce_loss_13": 3.364873206615448, + "ce_loss_2": 4.452599573135376, + "ce_loss_3": 4.183995950222015, + "ce_loss_7": 3.6285991072654724, + "epoch": 0.538, + "grad_norm": 600.0, + "kl_loss_10": 182.95552597045898, + "kl_loss_2": 2236.559704589844, + "kl_loss_3": 1765.850408935547, + "kl_loss_7": 629.1190887451172, + "learning_rate": 0.00044773576836617336, + "loss": 1181.7396, + "step": 5380 + }, + { + "ce_loss_10": 3.537210750579834, + "ce_loss_13": 3.4561371922492983, + "ce_loss_2": 4.546432638168335, + "ce_loss_3": 4.281138265132904, + "ce_loss_7": 3.7339015364646913, + "epoch": 0.539, + "grad_norm": 612.0, + "kl_loss_10": 189.98071517944337, + "kl_loss_2": 2253.4164794921876, + "kl_loss_3": 1781.9518432617188, + "kl_loss_7": 650.4307464599609, + "learning_rate": 0.00044615806311398056, + "loss": 1232.9109, + "step": 5390 + }, + { + "ce_loss_10": 3.6113093972206114, + "ce_loss_13": 3.5354915499687194, + "ce_loss_2": 4.540320181846619, + "ce_loss_3": 4.277068996429444, + "ce_loss_7": 3.787187647819519, + "epoch": 0.54, + "grad_norm": 580.0, + "kl_loss_10": 181.3637908935547, + "kl_loss_2": 2094.3728942871094, + "kl_loss_3": 1633.9456298828125, + "kl_loss_7": 605.8723022460938, + "learning_rate": 0.00044458090004949454, + "loss": 1175.0439, + "step": 5400 + }, + { + "ce_loss_10": 3.47382390499115, + "ce_loss_13": 3.39083354473114, + "ce_loss_2": 4.532833766937256, + "ce_loss_3": 4.262583804130554, + "ce_loss_7": 3.6737227201461793, + "epoch": 0.541, + "grad_norm": 620.0, + "kl_loss_10": 194.1818962097168, + "kl_loss_2": 2371.5374450683594, + "kl_loss_3": 1877.3123657226563, + "kl_loss_7": 665.8367095947266, + "learning_rate": 0.0004430042950547297, + "loss": 1218.705, + "step": 5410 + }, + { + "ce_loss_10": 3.5697335839271545, + "ce_loss_13": 3.483165454864502, + "ce_loss_2": 4.578557109832763, + "ce_loss_3": 4.31482458114624, + "ce_loss_7": 3.763143301010132, + "epoch": 0.542, + "grad_norm": 572.0, + "kl_loss_10": 191.8735610961914, + "kl_loss_2": 2253.524365234375, + "kl_loss_3": 1779.8523681640625, + "kl_loss_7": 645.0971527099609, + "learning_rate": 0.0004414282640060809, + "loss": 1200.7552, + "step": 5420 + }, + { + "ce_loss_10": 3.656325376033783, + "ce_loss_13": 3.575901198387146, + "ce_loss_2": 4.611030888557434, + "ce_loss_3": 4.35529580116272, + "ce_loss_7": 3.8402703166007996, + "epoch": 0.543, + "grad_norm": 672.0, + "kl_loss_10": 186.09361267089844, + "kl_loss_2": 2127.538677978516, + "kl_loss_3": 1677.6395080566406, + "kl_loss_7": 622.3258697509766, + "learning_rate": 0.0004398528227741633, + "loss": 1179.4629, + "step": 5430 + }, + { + "ce_loss_10": 3.5199654936790465, + "ce_loss_13": 3.442525625228882, + "ce_loss_2": 4.519460201263428, + "ce_loss_3": 4.247548985481262, + "ce_loss_7": 3.7133419036865236, + "epoch": 0.544, + "grad_norm": 656.0, + "kl_loss_10": 186.4021957397461, + "kl_loss_2": 2206.209338378906, + "kl_loss_3": 1726.7237670898437, + "kl_loss_7": 636.2167572021484, + "learning_rate": 0.00043827798722365264, + "loss": 1202.1797, + "step": 5440 + }, + { + "ce_loss_10": 3.6471530318260195, + "ce_loss_13": 3.566143047809601, + "ce_loss_2": 4.5952486276626585, + "ce_loss_3": 4.333053851127625, + "ce_loss_7": 3.8201894760131836, + "epoch": 0.545, + "grad_norm": 592.0, + "kl_loss_10": 185.36949920654297, + "kl_loss_2": 2143.129284667969, + "kl_loss_3": 1675.2244567871094, + "kl_loss_7": 617.6786651611328, + "learning_rate": 0.00043670377321312535, + "loss": 1164.6765, + "step": 5450 + }, + { + "ce_loss_10": 3.6508351445198057, + "ce_loss_13": 3.574675273895264, + "ce_loss_2": 4.5991229772567745, + "ce_loss_3": 4.339683651924133, + "ce_loss_7": 3.8303612232208253, + "epoch": 0.546, + "grad_norm": 700.0, + "kl_loss_10": 183.1472366333008, + "kl_loss_2": 2130.7037048339844, + "kl_loss_3": 1667.0253173828125, + "kl_loss_7": 613.02021484375, + "learning_rate": 0.0004351301965948991, + "loss": 1168.8242, + "step": 5460 + }, + { + "ce_loss_10": 3.559572923183441, + "ce_loss_13": 3.478611421585083, + "ce_loss_2": 4.511995816230774, + "ce_loss_3": 4.249083304405213, + "ce_loss_7": 3.7354837536811827, + "epoch": 0.547, + "grad_norm": 636.0, + "kl_loss_10": 181.6176902770996, + "kl_loss_2": 2130.2896118164062, + "kl_loss_3": 1667.8863098144532, + "kl_loss_7": 614.7661987304688, + "learning_rate": 0.000433557273214873, + "loss": 1176.8127, + "step": 5470 + }, + { + "ce_loss_10": 3.545152747631073, + "ce_loss_13": 3.4662238121032716, + "ce_loss_2": 4.518579649925232, + "ce_loss_3": 4.245905971527099, + "ce_loss_7": 3.7270439863204956, + "epoch": 0.548, + "grad_norm": 608.0, + "kl_loss_10": 184.05833053588867, + "kl_loss_2": 2168.807977294922, + "kl_loss_3": 1696.2628234863282, + "kl_loss_7": 616.8240295410156, + "learning_rate": 0.000431985018912368, + "loss": 1150.4518, + "step": 5480 + }, + { + "ce_loss_10": 3.514492917060852, + "ce_loss_13": 3.4341874718666077, + "ce_loss_2": 4.534255909919739, + "ce_loss_3": 4.270003151893616, + "ce_loss_7": 3.7031027913093566, + "epoch": 0.549, + "grad_norm": 600.0, + "kl_loss_10": 189.14087448120117, + "kl_loss_2": 2268.4575805664062, + "kl_loss_3": 1800.3566040039063, + "kl_loss_7": 639.125277709961, + "learning_rate": 0.0004304134495199674, + "loss": 1178.9426, + "step": 5490 + }, + { + "ce_loss_10": 3.538786220550537, + "ce_loss_13": 3.4557671666145326, + "ce_loss_2": 4.5282275676727295, + "ce_loss_3": 4.265328872203827, + "ce_loss_7": 3.731334662437439, + "epoch": 0.55, + "grad_norm": 604.0, + "kl_loss_10": 188.5583984375, + "kl_loss_2": 2236.901904296875, + "kl_loss_3": 1761.930596923828, + "kl_loss_7": 644.1045196533203, + "learning_rate": 0.0004288425808633575, + "loss": 1185.0572, + "step": 5500 + }, + { + "ce_loss_10": 3.514096534252167, + "ce_loss_13": 3.435099017620087, + "ce_loss_2": 4.509266877174378, + "ce_loss_3": 4.252711880207062, + "ce_loss_7": 3.6944369435310365, + "epoch": 0.551, + "grad_norm": 664.0, + "kl_loss_10": 184.48614044189452, + "kl_loss_2": 2223.2010803222656, + "kl_loss_3": 1765.0783264160157, + "kl_loss_7": 630.5097595214844, + "learning_rate": 0.0004272724287611684, + "loss": 1201.5842, + "step": 5510 + }, + { + "ce_loss_10": 3.490022134780884, + "ce_loss_13": 3.4118714332580566, + "ce_loss_2": 4.514768314361572, + "ce_loss_3": 4.2408933401107785, + "ce_loss_7": 3.680497145652771, + "epoch": 0.552, + "grad_norm": 652.0, + "kl_loss_10": 185.8211784362793, + "kl_loss_2": 2267.8012084960938, + "kl_loss_3": 1790.0665649414063, + "kl_loss_7": 633.0888458251953, + "learning_rate": 0.00042570300902481425, + "loss": 1202.0281, + "step": 5520 + }, + { + "ce_loss_10": 3.523720991611481, + "ce_loss_13": 3.448110568523407, + "ce_loss_2": 4.499163627624512, + "ce_loss_3": 4.242252886295319, + "ce_loss_7": 3.704312777519226, + "epoch": 0.553, + "grad_norm": 608.0, + "kl_loss_10": 183.24146575927733, + "kl_loss_2": 2193.4769287109375, + "kl_loss_3": 1734.7890686035157, + "kl_loss_7": 623.5370147705078, + "learning_rate": 0.00042413433745833423, + "loss": 1179.776, + "step": 5530 + }, + { + "ce_loss_10": 3.5270172238349913, + "ce_loss_13": 3.448072147369385, + "ce_loss_2": 4.536388492584228, + "ce_loss_3": 4.260759913921357, + "ce_loss_7": 3.715148115158081, + "epoch": 0.554, + "grad_norm": 556.0, + "kl_loss_10": 183.68499755859375, + "kl_loss_2": 2226.1301025390626, + "kl_loss_3": 1743.2217163085938, + "kl_loss_7": 626.4557403564453, + "learning_rate": 0.0004225664298582339, + "loss": 1157.0496, + "step": 5540 + }, + { + "ce_loss_10": 3.6083423376083372, + "ce_loss_13": 3.530562436580658, + "ce_loss_2": 4.568499255180359, + "ce_loss_3": 4.307069134712219, + "ce_loss_7": 3.7862043499946596, + "epoch": 0.555, + "grad_norm": 548.0, + "kl_loss_10": 182.37268829345703, + "kl_loss_2": 2137.707696533203, + "kl_loss_3": 1673.7792663574219, + "kl_loss_7": 611.6491577148438, + "learning_rate": 0.000420999302013325, + "loss": 1149.7553, + "step": 5550 + }, + { + "ce_loss_10": 3.5049922823905946, + "ce_loss_13": 3.420680546760559, + "ce_loss_2": 4.553832268714904, + "ce_loss_3": 4.279029071331024, + "ce_loss_7": 3.700891983509064, + "epoch": 0.556, + "grad_norm": 572.0, + "kl_loss_10": 190.98652191162108, + "kl_loss_2": 2305.641845703125, + "kl_loss_3": 1822.5648254394532, + "kl_loss_7": 641.2451202392579, + "learning_rate": 0.000419432969704568, + "loss": 1204.391, + "step": 5560 + }, + { + "ce_loss_10": 3.548888063430786, + "ce_loss_13": 3.4704429507255554, + "ce_loss_2": 4.518404316902161, + "ce_loss_3": 4.257292962074279, + "ce_loss_7": 3.735389542579651, + "epoch": 0.557, + "grad_norm": 564.0, + "kl_loss_10": 182.6816421508789, + "kl_loss_2": 2144.968542480469, + "kl_loss_3": 1682.6951538085937, + "kl_loss_7": 617.6556762695312, + "learning_rate": 0.00041786744870491154, + "loss": 1202.9963, + "step": 5570 + }, + { + "ce_loss_10": 3.491339087486267, + "ce_loss_13": 3.412715029716492, + "ce_loss_2": 4.4881198644638065, + "ce_loss_3": 4.219742333889007, + "ce_loss_7": 3.679445171356201, + "epoch": 0.558, + "grad_norm": 576.0, + "kl_loss_10": 189.42200622558593, + "kl_loss_2": 2234.280969238281, + "kl_loss_3": 1757.53349609375, + "kl_loss_7": 641.0087585449219, + "learning_rate": 0.0004163027547791347, + "loss": 1192.3963, + "step": 5580 + }, + { + "ce_loss_10": 3.4689704895019533, + "ce_loss_13": 3.3872820258140566, + "ce_loss_2": 4.518157267570496, + "ce_loss_3": 4.244075846672058, + "ce_loss_7": 3.6619726419448853, + "epoch": 0.559, + "grad_norm": 688.0, + "kl_loss_10": 188.0017578125, + "kl_loss_2": 2320.9525756835938, + "kl_loss_3": 1834.1813659667969, + "kl_loss_7": 642.0279479980469, + "learning_rate": 0.0004147389036836881, + "loss": 1210.1521, + "step": 5590 + }, + { + "ce_loss_10": 3.5183377385139467, + "ce_loss_13": 3.4371410965919496, + "ce_loss_2": 4.522028660774231, + "ce_loss_3": 4.258878147602081, + "ce_loss_7": 3.706261694431305, + "epoch": 0.56, + "grad_norm": 652.0, + "kl_loss_10": 185.66660232543944, + "kl_loss_2": 2233.013397216797, + "kl_loss_3": 1764.0713806152344, + "kl_loss_7": 637.6866302490234, + "learning_rate": 0.00041317591116653486, + "loss": 1219.6441, + "step": 5600 + }, + { + "ce_loss_10": 3.558071720600128, + "ce_loss_13": 3.474745440483093, + "ce_loss_2": 4.558679819107056, + "ce_loss_3": 4.291901731491089, + "ce_loss_7": 3.746951687335968, + "epoch": 0.561, + "grad_norm": 592.0, + "kl_loss_10": 189.82635574340821, + "kl_loss_2": 2230.9510803222656, + "kl_loss_3": 1759.6529296875, + "kl_loss_7": 636.9456726074219, + "learning_rate": 0.0004116137929669921, + "loss": 1188.2356, + "step": 5610 + }, + { + "ce_loss_10": 3.544596457481384, + "ce_loss_13": 3.465434396266937, + "ce_loss_2": 4.526343536376953, + "ce_loss_3": 4.262159049510956, + "ce_loss_7": 3.7297433972358705, + "epoch": 0.562, + "grad_norm": 700.0, + "kl_loss_10": 184.16798706054686, + "kl_loss_2": 2204.5443481445313, + "kl_loss_3": 1738.5609375, + "kl_loss_7": 629.1714752197265, + "learning_rate": 0.00041005256481557305, + "loss": 1174.8596, + "step": 5620 + }, + { + "ce_loss_10": 3.6428149700164796, + "ce_loss_13": 3.568005383014679, + "ce_loss_2": 4.574557089805603, + "ce_loss_3": 4.320431900024414, + "ce_loss_7": 3.8154868602752687, + "epoch": 0.563, + "grad_norm": 580.0, + "kl_loss_10": 178.43261108398437, + "kl_loss_2": 2081.9929809570312, + "kl_loss_3": 1633.8047790527344, + "kl_loss_7": 600.1017929077149, + "learning_rate": 0.00040849224243382767, + "loss": 1150.8125, + "step": 5630 + }, + { + "ce_loss_10": 3.4989004015922545, + "ce_loss_13": 3.4218288540840147, + "ce_loss_2": 4.497757744789124, + "ce_loss_3": 4.228800570964813, + "ce_loss_7": 3.6881244659423826, + "epoch": 0.564, + "grad_norm": 576.0, + "kl_loss_10": 184.93341827392578, + "kl_loss_2": 2224.632287597656, + "kl_loss_3": 1749.1263427734375, + "kl_loss_7": 632.0666015625, + "learning_rate": 0.000406932841534185, + "loss": 1173.0332, + "step": 5640 + }, + { + "ce_loss_10": 3.453734540939331, + "ce_loss_13": 3.372727131843567, + "ce_loss_2": 4.460113084316253, + "ce_loss_3": 4.19973611831665, + "ce_loss_7": 3.6455657839775086, + "epoch": 0.565, + "grad_norm": 708.0, + "kl_loss_10": 186.30313568115236, + "kl_loss_2": 2260.893664550781, + "kl_loss_3": 1792.1846252441405, + "kl_loss_7": 638.3344879150391, + "learning_rate": 0.0004053743778197951, + "loss": 1219.3186, + "step": 5650 + }, + { + "ce_loss_10": 3.565755784511566, + "ce_loss_13": 3.481943702697754, + "ce_loss_2": 4.545414447784424, + "ce_loss_3": 4.281696927547455, + "ce_loss_7": 3.7513938307762147, + "epoch": 0.566, + "grad_norm": 584.0, + "kl_loss_10": 188.62994842529298, + "kl_loss_2": 2184.7360778808593, + "kl_loss_3": 1721.9289123535157, + "kl_loss_7": 628.1358184814453, + "learning_rate": 0.0004038168669843697, + "loss": 1209.3523, + "step": 5660 + }, + { + "ce_loss_10": 3.532804882526398, + "ce_loss_13": 3.4522215127944946, + "ce_loss_2": 4.494965553283691, + "ce_loss_3": 4.231216824054718, + "ce_loss_7": 3.7118934392929077, + "epoch": 0.567, + "grad_norm": 620.0, + "kl_loss_10": 183.03904342651367, + "kl_loss_2": 2154.956463623047, + "kl_loss_3": 1695.0998046875, + "kl_loss_7": 613.3763107299804, + "learning_rate": 0.000402260324712026, + "loss": 1195.8986, + "step": 5670 + }, + { + "ce_loss_10": 3.5749718070030214, + "ce_loss_13": 3.497403085231781, + "ce_loss_2": 4.588955020904541, + "ce_loss_3": 4.319999086856842, + "ce_loss_7": 3.7625349521636964, + "epoch": 0.568, + "grad_norm": 616.0, + "kl_loss_10": 184.26412506103514, + "kl_loss_2": 2236.5206665039063, + "kl_loss_3": 1760.365301513672, + "kl_loss_7": 624.1568267822265, + "learning_rate": 0.00040070476667712743, + "loss": 1174.4818, + "step": 5680 + }, + { + "ce_loss_10": 3.595443320274353, + "ce_loss_13": 3.5173869848251345, + "ce_loss_2": 4.573628330230713, + "ce_loss_3": 4.3121489644050595, + "ce_loss_7": 3.7780985593795777, + "epoch": 0.569, + "grad_norm": 540.0, + "kl_loss_10": 184.3900894165039, + "kl_loss_2": 2190.797717285156, + "kl_loss_3": 1726.8204223632813, + "kl_loss_7": 618.142544555664, + "learning_rate": 0.0003991502085441259, + "loss": 1191.0875, + "step": 5690 + }, + { + "ce_loss_10": 3.6352679252624513, + "ce_loss_13": 3.556475079059601, + "ce_loss_2": 4.568906188011169, + "ce_loss_3": 4.311613416671753, + "ce_loss_7": 3.8102620005607606, + "epoch": 0.57, + "grad_norm": 616.0, + "kl_loss_10": 180.942374420166, + "kl_loss_2": 2084.3558349609375, + "kl_loss_3": 1627.6179626464843, + "kl_loss_7": 599.5358856201171, + "learning_rate": 0.0003975966659674047, + "loss": 1160.7822, + "step": 5700 + }, + { + "ce_loss_10": 3.5962194561958314, + "ce_loss_13": 3.517608177661896, + "ce_loss_2": 4.578224086761475, + "ce_loss_3": 4.314012908935547, + "ce_loss_7": 3.7789862513542176, + "epoch": 0.571, + "grad_norm": 644.0, + "kl_loss_10": 182.5239112854004, + "kl_loss_2": 2180.907177734375, + "kl_loss_3": 1721.9898742675782, + "kl_loss_7": 614.957388305664, + "learning_rate": 0.0003960441545911204, + "loss": 1160.7484, + "step": 5710 + }, + { + "ce_loss_10": 3.5932918190956116, + "ce_loss_13": 3.5129475712776186, + "ce_loss_2": 4.558131432533264, + "ce_loss_3": 4.293534338474274, + "ce_loss_7": 3.7742814660072326, + "epoch": 0.572, + "grad_norm": 604.0, + "kl_loss_10": 183.15422897338868, + "kl_loss_2": 2156.431115722656, + "kl_loss_3": 1695.9377807617188, + "kl_loss_7": 619.908480834961, + "learning_rate": 0.0003944926900490452, + "loss": 1164.068, + "step": 5720 + }, + { + "ce_loss_10": 3.5127488017082213, + "ce_loss_13": 3.430432641506195, + "ce_loss_2": 4.5248651027679445, + "ce_loss_3": 4.258909869194031, + "ce_loss_7": 3.709870958328247, + "epoch": 0.573, + "grad_norm": 564.0, + "kl_loss_10": 186.0645439147949, + "kl_loss_2": 2235.3706176757814, + "kl_loss_3": 1765.8205688476562, + "kl_loss_7": 637.5899017333984, + "learning_rate": 0.0003929422879644099, + "loss": 1176.3957, + "step": 5730 + }, + { + "ce_loss_10": 3.510514330863953, + "ce_loss_13": 3.436869239807129, + "ce_loss_2": 4.478006148338318, + "ce_loss_3": 4.212211620807648, + "ce_loss_7": 3.6878655314445496, + "epoch": 0.574, + "grad_norm": 608.0, + "kl_loss_10": 179.26688079833986, + "kl_loss_2": 2168.3131591796873, + "kl_loss_3": 1699.4741943359375, + "kl_loss_7": 606.771630859375, + "learning_rate": 0.0003913929639497462, + "loss": 1141.8648, + "step": 5740 + }, + { + "ce_loss_10": 3.468266797065735, + "ce_loss_13": 3.3873007535934447, + "ce_loss_2": 4.490426182746887, + "ce_loss_3": 4.221552240848541, + "ce_loss_7": 3.6532665491104126, + "epoch": 0.575, + "grad_norm": 600.0, + "kl_loss_10": 182.11020889282227, + "kl_loss_2": 2259.020544433594, + "kl_loss_3": 1779.8084838867187, + "kl_loss_7": 622.9515014648438, + "learning_rate": 0.00038984473360672965, + "loss": 1169.1125, + "step": 5750 + }, + { + "ce_loss_10": 3.4774887681007387, + "ce_loss_13": 3.3949706315994264, + "ce_loss_2": 4.497473883628845, + "ce_loss_3": 4.2249194264411924, + "ce_loss_7": 3.664697051048279, + "epoch": 0.576, + "grad_norm": 596.0, + "kl_loss_10": 181.4011428833008, + "kl_loss_2": 2244.824786376953, + "kl_loss_3": 1764.2872131347656, + "kl_loss_7": 621.9651702880859, + "learning_rate": 0.0003882976125260229, + "loss": 1170.2874, + "step": 5760 + }, + { + "ce_loss_10": 3.5439630150794983, + "ce_loss_13": 3.4651756167411802, + "ce_loss_2": 4.539518880844116, + "ce_loss_3": 4.274804329872131, + "ce_loss_7": 3.728801262378693, + "epoch": 0.577, + "grad_norm": 592.0, + "kl_loss_10": 183.33100814819335, + "kl_loss_2": 2204.0270751953126, + "kl_loss_3": 1723.9171936035157, + "kl_loss_7": 615.5777862548828, + "learning_rate": 0.00038675161628711776, + "loss": 1179.8893, + "step": 5770 + }, + { + "ce_loss_10": 3.5816867470741274, + "ce_loss_13": 3.5046088337898254, + "ce_loss_2": 4.544821619987488, + "ce_loss_3": 4.285539746284485, + "ce_loss_7": 3.761784756183624, + "epoch": 0.578, + "grad_norm": 620.0, + "kl_loss_10": 181.6286849975586, + "kl_loss_2": 2136.3880615234375, + "kl_loss_3": 1677.4017333984375, + "kl_loss_7": 610.2154174804688, + "learning_rate": 0.0003852067604581794, + "loss": 1194.1891, + "step": 5780 + }, + { + "ce_loss_10": 3.523706150054932, + "ce_loss_13": 3.448537766933441, + "ce_loss_2": 4.533637523651123, + "ce_loss_3": 4.265281748771668, + "ce_loss_7": 3.709212040901184, + "epoch": 0.579, + "grad_norm": 676.0, + "kl_loss_10": 181.67257690429688, + "kl_loss_2": 2230.821612548828, + "kl_loss_3": 1755.830584716797, + "kl_loss_7": 620.3396881103515, + "learning_rate": 0.0003836630605958888, + "loss": 1177.6782, + "step": 5790 + }, + { + "ce_loss_10": 3.583223593235016, + "ce_loss_13": 3.503636026382446, + "ce_loss_2": 4.566303539276123, + "ce_loss_3": 4.3056800127029415, + "ce_loss_7": 3.76351158618927, + "epoch": 0.58, + "grad_norm": 708.0, + "kl_loss_10": 183.71082077026367, + "kl_loss_2": 2228.4331115722657, + "kl_loss_3": 1769.3681579589843, + "kl_loss_7": 628.4158660888672, + "learning_rate": 0.0003821205322452863, + "loss": 1235.8768, + "step": 5800 + }, + { + "ce_loss_10": 3.563581478595734, + "ce_loss_13": 3.488909196853638, + "ce_loss_2": 4.543065023422241, + "ce_loss_3": 4.286456656455994, + "ce_loss_7": 3.7441120743751526, + "epoch": 0.581, + "grad_norm": 584.0, + "kl_loss_10": 180.5809585571289, + "kl_loss_2": 2191.5135986328123, + "kl_loss_3": 1729.1048767089844, + "kl_loss_7": 608.2429626464843, + "learning_rate": 0.0003805791909396155, + "loss": 1179.2295, + "step": 5810 + }, + { + "ce_loss_10": 3.5160235166549683, + "ce_loss_13": 3.43984659910202, + "ce_loss_2": 4.500444793701172, + "ce_loss_3": 4.2373772144317625, + "ce_loss_7": 3.6964723467826843, + "epoch": 0.582, + "grad_norm": 652.0, + "kl_loss_10": 180.02818908691407, + "kl_loss_2": 2186.5078369140624, + "kl_loss_3": 1730.8345642089844, + "kl_loss_7": 613.5680450439453, + "learning_rate": 0.0003790390522001662, + "loss": 1191.4708, + "step": 5820 + }, + { + "ce_loss_10": 3.447020876407623, + "ce_loss_13": 3.3710612773895265, + "ce_loss_2": 4.448183393478393, + "ce_loss_3": 4.185709154605865, + "ce_loss_7": 3.6283095359802244, + "epoch": 0.583, + "grad_norm": 620.0, + "kl_loss_10": 180.24705505371094, + "kl_loss_2": 2242.3388488769533, + "kl_loss_3": 1776.8985290527344, + "kl_loss_7": 620.0051086425781, + "learning_rate": 0.0003775001315361183, + "loss": 1173.2469, + "step": 5830 + }, + { + "ce_loss_10": 3.560646951198578, + "ce_loss_13": 3.481656861305237, + "ce_loss_2": 4.561934852600098, + "ce_loss_3": 4.297064936161041, + "ce_loss_7": 3.746256446838379, + "epoch": 0.584, + "grad_norm": 560.0, + "kl_loss_10": 183.9656074523926, + "kl_loss_2": 2215.3773864746095, + "kl_loss_3": 1750.5085021972657, + "kl_loss_7": 621.9390472412109, + "learning_rate": 0.0003759624444443858, + "loss": 1186.5547, + "step": 5840 + }, + { + "ce_loss_10": 3.592632758617401, + "ce_loss_13": 3.520240008831024, + "ce_loss_2": 4.567729663848877, + "ce_loss_3": 4.300854158401489, + "ce_loss_7": 3.769944798946381, + "epoch": 0.585, + "grad_norm": 568.0, + "kl_loss_10": 180.2906066894531, + "kl_loss_2": 2170.1985412597655, + "kl_loss_3": 1706.62548828125, + "kl_loss_7": 608.6328552246093, + "learning_rate": 0.00037442600640946044, + "loss": 1155.9348, + "step": 5850 + }, + { + "ce_loss_10": 3.550674855709076, + "ce_loss_13": 3.475678253173828, + "ce_loss_2": 4.5188051700592045, + "ce_loss_3": 4.257573843002319, + "ce_loss_7": 3.733881187438965, + "epoch": 0.586, + "grad_norm": 624.0, + "kl_loss_10": 180.34449844360353, + "kl_loss_2": 2161.917333984375, + "kl_loss_3": 1700.5603820800782, + "kl_loss_7": 615.4381408691406, + "learning_rate": 0.00037289083290325663, + "loss": 1151.5385, + "step": 5860 + }, + { + "ce_loss_10": 3.5404091477394104, + "ce_loss_13": 3.4616484522819517, + "ce_loss_2": 4.5070148229599, + "ce_loss_3": 4.242105662822723, + "ce_loss_7": 3.7187010407447816, + "epoch": 0.587, + "grad_norm": 592.0, + "kl_loss_10": 183.17743911743165, + "kl_loss_2": 2149.7393432617187, + "kl_loss_3": 1683.0787292480468, + "kl_loss_7": 610.0913803100586, + "learning_rate": 0.0003713569393849543, + "loss": 1154.5703, + "step": 5870 + }, + { + "ce_loss_10": 3.5839020013809204, + "ce_loss_13": 3.5078009486198427, + "ce_loss_2": 4.56416871547699, + "ce_loss_3": 4.296731424331665, + "ce_loss_7": 3.767895996570587, + "epoch": 0.588, + "grad_norm": 592.0, + "kl_loss_10": 183.36542816162108, + "kl_loss_2": 2186.738494873047, + "kl_loss_3": 1717.5487915039062, + "kl_loss_7": 612.2841430664063, + "learning_rate": 0.00036982434130084397, + "loss": 1179.8928, + "step": 5880 + }, + { + "ce_loss_10": 3.4997439622879027, + "ce_loss_13": 3.4187664270401, + "ce_loss_2": 4.478350329399109, + "ce_loss_3": 4.210885548591614, + "ce_loss_7": 3.6801365852355956, + "epoch": 0.589, + "grad_norm": 664.0, + "kl_loss_10": 186.01408843994142, + "kl_loss_2": 2192.050701904297, + "kl_loss_3": 1713.8697509765625, + "kl_loss_7": 622.2605224609375, + "learning_rate": 0.00036829305408417166, + "loss": 1185.5467, + "step": 5890 + }, + { + "ce_loss_10": 3.4883674502372743, + "ce_loss_13": 3.4076414942741393, + "ce_loss_2": 4.51081612110138, + "ce_loss_3": 4.233860373497009, + "ce_loss_7": 3.68140949010849, + "epoch": 0.59, + "grad_norm": 632.0, + "kl_loss_10": 185.69306488037108, + "kl_loss_2": 2265.8583251953123, + "kl_loss_3": 1770.2322631835937, + "kl_loss_7": 633.7182983398437, + "learning_rate": 0.0003667630931549826, + "loss": 1189.5502, + "step": 5900 + }, + { + "ce_loss_10": 3.454320323467255, + "ce_loss_13": 3.376146912574768, + "ce_loss_2": 4.510071706771851, + "ce_loss_3": 4.2408855676651, + "ce_loss_7": 3.649706947803497, + "epoch": 0.591, + "grad_norm": 728.0, + "kl_loss_10": 185.1581298828125, + "kl_loss_2": 2343.439013671875, + "kl_loss_3": 1859.8356567382812, + "kl_loss_7": 639.2615692138672, + "learning_rate": 0.00036523447391996613, + "loss": 1217.3514, + "step": 5910 + }, + { + "ce_loss_10": 3.549302911758423, + "ce_loss_13": 3.4722840428352355, + "ce_loss_2": 4.514612603187561, + "ce_loss_3": 4.256480038166046, + "ce_loss_7": 3.727895641326904, + "epoch": 0.592, + "grad_norm": 580.0, + "kl_loss_10": 181.60699539184571, + "kl_loss_2": 2162.6364685058593, + "kl_loss_3": 1701.0076782226563, + "kl_loss_7": 610.4459930419922, + "learning_rate": 0.00036370721177230114, + "loss": 1162.5948, + "step": 5920 + }, + { + "ce_loss_10": 3.543530595302582, + "ce_loss_13": 3.4660569787025453, + "ce_loss_2": 4.543927192687988, + "ce_loss_3": 4.277280712127686, + "ce_loss_7": 3.728453516960144, + "epoch": 0.593, + "grad_norm": 628.0, + "kl_loss_10": 184.26243515014647, + "kl_loss_2": 2218.4042541503904, + "kl_loss_3": 1743.315625, + "kl_loss_7": 620.730111694336, + "learning_rate": 0.00036218132209150044, + "loss": 1186.6707, + "step": 5930 + }, + { + "ce_loss_10": 3.497697722911835, + "ce_loss_13": 3.4142557263374327, + "ce_loss_2": 4.5388647556304935, + "ce_loss_3": 4.264691114425659, + "ce_loss_7": 3.6943756103515626, + "epoch": 0.594, + "grad_norm": 524.0, + "kl_loss_10": 188.87873077392578, + "kl_loss_2": 2304.517468261719, + "kl_loss_3": 1814.6093872070312, + "kl_loss_7": 639.0129974365234, + "learning_rate": 0.0003606568202432562, + "loss": 1197.9809, + "step": 5940 + }, + { + "ce_loss_10": 3.565451109409332, + "ce_loss_13": 3.4856663823127745, + "ce_loss_2": 4.5841080904006954, + "ce_loss_3": 4.317169034481049, + "ce_loss_7": 3.754825806617737, + "epoch": 0.595, + "grad_norm": 696.0, + "kl_loss_10": 187.19320907592774, + "kl_loss_2": 2274.0406982421873, + "kl_loss_3": 1793.8463073730468, + "kl_loss_7": 630.620458984375, + "learning_rate": 0.0003591337215792851, + "loss": 1177.4938, + "step": 5950 + }, + { + "ce_loss_10": 3.611758494377136, + "ce_loss_13": 3.5361703038215637, + "ce_loss_2": 4.54854645729065, + "ce_loss_3": 4.2874367237091064, + "ce_loss_7": 3.781387460231781, + "epoch": 0.596, + "grad_norm": 536.0, + "kl_loss_10": 179.44385452270507, + "kl_loss_2": 2134.903210449219, + "kl_loss_3": 1672.3198852539062, + "kl_loss_7": 603.0327301025391, + "learning_rate": 0.00035761204143717383, + "loss": 1174.0895, + "step": 5960 + }, + { + "ce_loss_10": 3.564636397361755, + "ce_loss_13": 3.4857504963874817, + "ce_loss_2": 4.562372779846191, + "ce_loss_3": 4.294865238666534, + "ce_loss_7": 3.747916042804718, + "epoch": 0.597, + "grad_norm": 616.0, + "kl_loss_10": 181.63295822143556, + "kl_loss_2": 2217.5136901855467, + "kl_loss_3": 1751.903790283203, + "kl_loss_7": 618.9376495361328, + "learning_rate": 0.0003560917951402245, + "loss": 1215.2734, + "step": 5970 + }, + { + "ce_loss_10": 3.5358213543891908, + "ce_loss_13": 3.461250603199005, + "ce_loss_2": 4.515460109710693, + "ce_loss_3": 4.252109396457672, + "ce_loss_7": 3.720645487308502, + "epoch": 0.598, + "grad_norm": 616.0, + "kl_loss_10": 180.68030853271483, + "kl_loss_2": 2199.883331298828, + "kl_loss_3": 1727.857843017578, + "kl_loss_7": 614.7005615234375, + "learning_rate": 0.00035457299799730046, + "loss": 1174.0783, + "step": 5980 + }, + { + "ce_loss_10": 3.6016149520874023, + "ce_loss_13": 3.523995506763458, + "ce_loss_2": 4.564206576347351, + "ce_loss_3": 4.302748084068298, + "ce_loss_7": 3.7862717866897584, + "epoch": 0.599, + "grad_norm": 600.0, + "kl_loss_10": 181.36301651000977, + "kl_loss_2": 2153.0896545410155, + "kl_loss_3": 1694.4290161132812, + "kl_loss_7": 614.9286560058594, + "learning_rate": 0.0003530556653026721, + "loss": 1181.7495, + "step": 5990 + }, + { + "ce_loss_10": 3.5210883378982545, + "ce_loss_13": 3.4458776116371155, + "ce_loss_2": 4.520641088485718, + "ce_loss_3": 4.254351568222046, + "ce_loss_7": 3.699181377887726, + "epoch": 0.6, + "grad_norm": 764.0, + "kl_loss_10": 179.21529235839844, + "kl_loss_2": 2227.2805419921874, + "kl_loss_3": 1758.5499328613282, + "kl_loss_7": 610.0478576660156, + "learning_rate": 0.00035153981233586274, + "loss": 1193.8637, + "step": 6000 + }, + { + "ce_loss_10": 3.499428999423981, + "ce_loss_13": 3.422479748725891, + "ce_loss_2": 4.4867565631866455, + "ce_loss_3": 4.227682662010193, + "ce_loss_7": 3.6805691361427306, + "epoch": 0.601, + "grad_norm": 584.0, + "kl_loss_10": 179.26205139160157, + "kl_loss_2": 2193.6119079589844, + "kl_loss_3": 1731.6024475097656, + "kl_loss_7": 612.7285736083984, + "learning_rate": 0.00035002545436149473, + "loss": 1214.442, + "step": 6010 + }, + { + "ce_loss_10": 3.507369041442871, + "ce_loss_13": 3.427609443664551, + "ce_loss_2": 4.515847969055176, + "ce_loss_3": 4.248699688911438, + "ce_loss_7": 3.6938512086868287, + "epoch": 0.602, + "grad_norm": 592.0, + "kl_loss_10": 187.4394386291504, + "kl_loss_2": 2240.724530029297, + "kl_loss_3": 1766.6628112792969, + "kl_loss_7": 629.6498748779297, + "learning_rate": 0.0003485126066291364, + "loss": 1169.8236, + "step": 6020 + }, + { + "ce_loss_10": 3.5554185032844545, + "ce_loss_13": 3.4788596630096436, + "ce_loss_2": 4.540017461776733, + "ce_loss_3": 4.2838677883148195, + "ce_loss_7": 3.736110508441925, + "epoch": 0.603, + "grad_norm": 520.0, + "kl_loss_10": 179.3347900390625, + "kl_loss_2": 2192.1767639160157, + "kl_loss_3": 1731.594775390625, + "kl_loss_7": 613.0785980224609, + "learning_rate": 0.0003470012843731476, + "loss": 1185.9094, + "step": 6030 + }, + { + "ce_loss_10": 3.494213032722473, + "ce_loss_13": 3.41587815284729, + "ce_loss_2": 4.493516874313355, + "ce_loss_3": 4.230558323860168, + "ce_loss_7": 3.6748696088790895, + "epoch": 0.604, + "grad_norm": 604.0, + "kl_loss_10": 180.02317504882814, + "kl_loss_2": 2220.4429626464844, + "kl_loss_3": 1750.9172302246093, + "kl_loss_7": 613.3353332519531, + "learning_rate": 0.00034549150281252633, + "loss": 1207.7186, + "step": 6040 + }, + { + "ce_loss_10": 3.4735769987106324, + "ce_loss_13": 3.398567247390747, + "ce_loss_2": 4.450454211235046, + "ce_loss_3": 4.185557043552398, + "ce_loss_7": 3.660480535030365, + "epoch": 0.605, + "grad_norm": 608.0, + "kl_loss_10": 181.83876571655273, + "kl_loss_2": 2163.210076904297, + "kl_loss_3": 1694.9721801757812, + "kl_loss_7": 612.4143432617187, + "learning_rate": 0.0003439832771507565, + "loss": 1157.9707, + "step": 6050 + }, + { + "ce_loss_10": 3.4816818594932557, + "ce_loss_13": 3.4034390568733217, + "ce_loss_2": 4.478318929672241, + "ce_loss_3": 4.211991810798645, + "ce_loss_7": 3.6656970381736755, + "epoch": 0.606, + "grad_norm": 560.0, + "kl_loss_10": 181.10105361938477, + "kl_loss_2": 2226.4850891113283, + "kl_loss_3": 1757.37236328125, + "kl_loss_7": 619.9399398803711, + "learning_rate": 0.0003424766225756537, + "loss": 1172.4078, + "step": 6060 + }, + { + "ce_loss_10": 3.5375612139701844, + "ce_loss_13": 3.4606423020362853, + "ce_loss_2": 4.53115668296814, + "ce_loss_3": 4.261643159389496, + "ce_loss_7": 3.7194941639900208, + "epoch": 0.607, + "grad_norm": 600.0, + "kl_loss_10": 181.42390975952148, + "kl_loss_2": 2202.2797973632814, + "kl_loss_3": 1733.2598999023437, + "kl_loss_7": 615.9942810058594, + "learning_rate": 0.00034097155425921255, + "loss": 1158.2284, + "step": 6070 + }, + { + "ce_loss_10": 3.433805537223816, + "ce_loss_13": 3.354471778869629, + "ce_loss_2": 4.449812698364258, + "ce_loss_3": 4.179146933555603, + "ce_loss_7": 3.6204983830451964, + "epoch": 0.608, + "grad_norm": 592.0, + "kl_loss_10": 183.06991577148438, + "kl_loss_2": 2273.455847167969, + "kl_loss_3": 1787.318505859375, + "kl_loss_7": 624.6353576660156, + "learning_rate": 0.0003394680873574546, + "loss": 1187.3987, + "step": 6080 + }, + { + "ce_loss_10": 3.54138503074646, + "ce_loss_13": 3.4626068115234374, + "ce_loss_2": 4.556825470924378, + "ce_loss_3": 4.281811666488648, + "ce_loss_7": 3.7267327547073363, + "epoch": 0.609, + "grad_norm": 620.0, + "kl_loss_10": 183.72728881835937, + "kl_loss_2": 2232.6337280273438, + "kl_loss_3": 1752.9180786132813, + "kl_loss_7": 617.8084594726563, + "learning_rate": 0.0003379662370102747, + "loss": 1176.7848, + "step": 6090 + }, + { + "ce_loss_10": 3.5495489597320558, + "ce_loss_13": 3.4742938756942747, + "ce_loss_2": 4.515744471549988, + "ce_loss_3": 4.251201486587524, + "ce_loss_7": 3.726244103908539, + "epoch": 0.61, + "grad_norm": 640.0, + "kl_loss_10": 179.96657028198243, + "kl_loss_2": 2182.172985839844, + "kl_loss_3": 1717.0841491699218, + "kl_loss_7": 617.4093353271485, + "learning_rate": 0.0003364660183412892, + "loss": 1176.2052, + "step": 6100 + }, + { + "ce_loss_10": 3.5306557536125185, + "ce_loss_13": 3.4546700954437255, + "ce_loss_2": 4.500067496299744, + "ce_loss_3": 4.235128319263458, + "ce_loss_7": 3.7075342297554017, + "epoch": 0.611, + "grad_norm": 592.0, + "kl_loss_10": 182.79292755126954, + "kl_loss_2": 2182.2781616210937, + "kl_loss_3": 1714.5931213378906, + "kl_loss_7": 613.8878936767578, + "learning_rate": 0.0003349674464576834, + "loss": 1190.8153, + "step": 6110 + }, + { + "ce_loss_10": 3.477449345588684, + "ce_loss_13": 3.3995738983154298, + "ce_loss_2": 4.485787630081177, + "ce_loss_3": 4.219619536399842, + "ce_loss_7": 3.6623815417289736, + "epoch": 0.612, + "grad_norm": 628.0, + "kl_loss_10": 181.76175689697266, + "kl_loss_2": 2235.75986328125, + "kl_loss_3": 1763.9286254882813, + "kl_loss_7": 619.9945251464844, + "learning_rate": 0.00033347053645005966, + "loss": 1163.8981, + "step": 6120 + }, + { + "ce_loss_10": 3.5906055331230164, + "ce_loss_13": 3.514803075790405, + "ce_loss_2": 4.5458073854446415, + "ce_loss_3": 4.283458161354065, + "ce_loss_7": 3.772923803329468, + "epoch": 0.613, + "grad_norm": 644.0, + "kl_loss_10": 178.51968688964843, + "kl_loss_2": 2116.6735778808593, + "kl_loss_3": 1659.320733642578, + "kl_loss_7": 606.7959167480469, + "learning_rate": 0.00033197530339228485, + "loss": 1170.5501, + "step": 6130 + }, + { + "ce_loss_10": 3.5471089243888856, + "ce_loss_13": 3.468013954162598, + "ce_loss_2": 4.5254878282546995, + "ce_loss_3": 4.254842627048492, + "ce_loss_7": 3.73079137802124, + "epoch": 0.614, + "grad_norm": 532.0, + "kl_loss_10": 183.3593994140625, + "kl_loss_2": 2176.643206787109, + "kl_loss_3": 1701.0747802734375, + "kl_loss_7": 619.2012481689453, + "learning_rate": 0.00033048176234133967, + "loss": 1166.8168, + "step": 6140 + }, + { + "ce_loss_10": 3.5306158542633055, + "ce_loss_13": 3.453017568588257, + "ce_loss_2": 4.494013047218322, + "ce_loss_3": 4.233083915710449, + "ce_loss_7": 3.7115015268325804, + "epoch": 0.615, + "grad_norm": 592.0, + "kl_loss_10": 183.00715713500978, + "kl_loss_2": 2175.1028686523437, + "kl_loss_3": 1702.7456420898438, + "kl_loss_7": 619.7471405029297, + "learning_rate": 0.0003289899283371657, + "loss": 1181.7955, + "step": 6150 + }, + { + "ce_loss_10": 3.5544473528862, + "ce_loss_13": 3.4786699175834657, + "ce_loss_2": 4.547568416595459, + "ce_loss_3": 4.281970739364624, + "ce_loss_7": 3.7363924741744996, + "epoch": 0.616, + "grad_norm": 600.0, + "kl_loss_10": 178.17992782592773, + "kl_loss_2": 2185.039025878906, + "kl_loss_3": 1723.0535766601563, + "kl_loss_7": 600.4897644042969, + "learning_rate": 0.0003274998164025148, + "loss": 1196.8087, + "step": 6160 + }, + { + "ce_loss_10": 3.586019229888916, + "ce_loss_13": 3.509108769893646, + "ce_loss_2": 4.5615111827850345, + "ce_loss_3": 4.2898026466369625, + "ce_loss_7": 3.76910115480423, + "epoch": 0.617, + "grad_norm": 596.0, + "kl_loss_10": 183.4706718444824, + "kl_loss_2": 2168.4442443847656, + "kl_loss_3": 1695.5099731445312, + "kl_loss_7": 616.1861694335937, + "learning_rate": 0.0003260114415427975, + "loss": 1190.7359, + "step": 6170 + }, + { + "ce_loss_10": 3.5073242664337156, + "ce_loss_13": 3.4292925119400026, + "ce_loss_2": 4.523944449424744, + "ce_loss_3": 4.251231408119201, + "ce_loss_7": 3.6900092363357544, + "epoch": 0.618, + "grad_norm": 612.0, + "kl_loss_10": 180.3868850708008, + "kl_loss_2": 2258.1235778808596, + "kl_loss_3": 1773.0142578125, + "kl_loss_7": 615.9339263916015, + "learning_rate": 0.0003245248187459323, + "loss": 1218.0189, + "step": 6180 + }, + { + "ce_loss_10": 3.4972055196762084, + "ce_loss_13": 3.4217321276664734, + "ce_loss_2": 4.4563206195831295, + "ce_loss_3": 4.195159709453582, + "ce_loss_7": 3.6716169476509095, + "epoch": 0.619, + "grad_norm": 596.0, + "kl_loss_10": 176.01737060546876, + "kl_loss_2": 2149.769183349609, + "kl_loss_3": 1675.0436157226563, + "kl_loss_7": 597.3845794677734, + "learning_rate": 0.00032303996298219416, + "loss": 1151.9841, + "step": 6190 + }, + { + "ce_loss_10": 3.5777448058128356, + "ce_loss_13": 3.500323712825775, + "ce_loss_2": 4.53541202545166, + "ce_loss_3": 4.266927003860474, + "ce_loss_7": 3.755027210712433, + "epoch": 0.62, + "grad_norm": 540.0, + "kl_loss_10": 178.23485260009767, + "kl_loss_2": 2112.595593261719, + "kl_loss_3": 1646.6524230957032, + "kl_loss_7": 602.416943359375, + "learning_rate": 0.00032155688920406414, + "loss": 1145.6068, + "step": 6200 + }, + { + "ce_loss_10": 3.489628565311432, + "ce_loss_13": 3.408998668193817, + "ce_loss_2": 4.5190582275390625, + "ce_loss_3": 4.245685923099518, + "ce_loss_7": 3.671571230888367, + "epoch": 0.621, + "grad_norm": 652.0, + "kl_loss_10": 183.76829681396484, + "kl_loss_2": 2272.4242309570313, + "kl_loss_3": 1788.1328247070312, + "kl_loss_7": 627.5300720214843, + "learning_rate": 0.0003200756123460788, + "loss": 1224.8912, + "step": 6210 + }, + { + "ce_loss_10": 3.5219372153282165, + "ce_loss_13": 3.4430843591690063, + "ce_loss_2": 4.530118870735168, + "ce_loss_3": 4.26385805606842, + "ce_loss_7": 3.708560848236084, + "epoch": 0.622, + "grad_norm": 684.0, + "kl_loss_10": 185.2090690612793, + "kl_loss_2": 2254.1378173828125, + "kl_loss_3": 1774.3692199707032, + "kl_loss_7": 633.037890625, + "learning_rate": 0.00031859614732467957, + "loss": 1207.0312, + "step": 6220 + }, + { + "ce_loss_10": 3.5700612902641295, + "ce_loss_13": 3.4917181968688964, + "ce_loss_2": 4.540509462356567, + "ce_loss_3": 4.275044929981232, + "ce_loss_7": 3.7488471269607544, + "epoch": 0.623, + "grad_norm": 564.0, + "kl_loss_10": 178.5159034729004, + "kl_loss_2": 2155.8026611328123, + "kl_loss_3": 1685.940985107422, + "kl_loss_7": 600.1484497070312, + "learning_rate": 0.00031711850903806275, + "loss": 1157.7447, + "step": 6230 + }, + { + "ce_loss_10": 3.479930281639099, + "ce_loss_13": 3.39938303232193, + "ce_loss_2": 4.482881689071656, + "ce_loss_3": 4.214280414581299, + "ce_loss_7": 3.666577732563019, + "epoch": 0.624, + "grad_norm": 528.0, + "kl_loss_10": 185.9188034057617, + "kl_loss_2": 2243.2543823242186, + "kl_loss_3": 1758.6941833496094, + "kl_loss_7": 628.0701446533203, + "learning_rate": 0.0003156427123660297, + "loss": 1172.3383, + "step": 6240 + }, + { + "ce_loss_10": 3.5643810868263244, + "ce_loss_13": 3.4881609320640563, + "ce_loss_2": 4.518170762062073, + "ce_loss_3": 4.258860862255096, + "ce_loss_7": 3.745578372478485, + "epoch": 0.625, + "grad_norm": 596.0, + "kl_loss_10": 180.73046417236327, + "kl_loss_2": 2135.2883361816407, + "kl_loss_3": 1669.8626892089844, + "kl_loss_7": 610.9410751342773, + "learning_rate": 0.0003141687721698363, + "loss": 1172.6947, + "step": 6250 + }, + { + "ce_loss_10": 3.536016345024109, + "ce_loss_13": 3.4606189489364625, + "ce_loss_2": 4.476572108268738, + "ce_loss_3": 4.211447751522064, + "ce_loss_7": 3.7014155983924866, + "epoch": 0.626, + "grad_norm": 616.0, + "kl_loss_10": 175.8163749694824, + "kl_loss_2": 2105.446813964844, + "kl_loss_3": 1637.2767333984375, + "kl_loss_7": 587.5009735107421, + "learning_rate": 0.00031269670329204396, + "loss": 1155.6384, + "step": 6260 + }, + { + "ce_loss_10": 3.5712973356246946, + "ce_loss_13": 3.4947034239768984, + "ce_loss_2": 4.515408515930176, + "ce_loss_3": 4.251049220561981, + "ce_loss_7": 3.7454182147979735, + "epoch": 0.627, + "grad_norm": 644.0, + "kl_loss_10": 181.6370933532715, + "kl_loss_2": 2120.7030395507813, + "kl_loss_3": 1650.25341796875, + "kl_loss_7": 607.5851348876953, + "learning_rate": 0.00031122652055637015, + "loss": 1169.2292, + "step": 6270 + }, + { + "ce_loss_10": 3.536707639694214, + "ce_loss_13": 3.460920011997223, + "ce_loss_2": 4.534442710876465, + "ce_loss_3": 4.263644289970398, + "ce_loss_7": 3.7196394085884092, + "epoch": 0.628, + "grad_norm": 556.0, + "kl_loss_10": 181.97393569946288, + "kl_loss_2": 2233.067547607422, + "kl_loss_3": 1750.9132995605469, + "kl_loss_7": 618.4156631469726, + "learning_rate": 0.0003097582387675385, + "loss": 1169.3315, + "step": 6280 + }, + { + "ce_loss_10": 3.5805759191513062, + "ce_loss_13": 3.503600060939789, + "ce_loss_2": 4.546688604354858, + "ce_loss_3": 4.285043132305145, + "ce_loss_7": 3.7596506476402283, + "epoch": 0.629, + "grad_norm": 536.0, + "kl_loss_10": 181.50545425415038, + "kl_loss_2": 2176.2076171875, + "kl_loss_3": 1706.1532836914062, + "kl_loss_7": 611.5946624755859, + "learning_rate": 0.00030829187271113034, + "loss": 1162.2808, + "step": 6290 + }, + { + "ce_loss_10": 3.5692893385887148, + "ce_loss_13": 3.49398432970047, + "ce_loss_2": 4.5324320793151855, + "ce_loss_3": 4.271732580661774, + "ce_loss_7": 3.738038659095764, + "epoch": 0.63, + "grad_norm": 660.0, + "kl_loss_10": 176.80067443847656, + "kl_loss_2": 2142.2837646484377, + "kl_loss_3": 1672.4158203125, + "kl_loss_7": 598.4024932861328, + "learning_rate": 0.00030682743715343565, + "loss": 1178.4112, + "step": 6300 + }, + { + "ce_loss_10": 3.5165117979049683, + "ce_loss_13": 3.4367071866989134, + "ce_loss_2": 4.5106003999710085, + "ce_loss_3": 4.248035335540772, + "ce_loss_7": 3.709369492530823, + "epoch": 0.631, + "grad_norm": 624.0, + "kl_loss_10": 185.85676803588868, + "kl_loss_2": 2187.9409912109377, + "kl_loss_3": 1716.0930541992188, + "kl_loss_7": 624.2622802734375, + "learning_rate": 0.0003053649468413043, + "loss": 1194.6155, + "step": 6310 + }, + { + "ce_loss_10": 3.6293103814125063, + "ce_loss_13": 3.5522167325019836, + "ce_loss_2": 4.589023590087891, + "ce_loss_3": 4.323796653747559, + "ce_loss_7": 3.8087464213371276, + "epoch": 0.632, + "grad_norm": 664.0, + "kl_loss_10": 183.21706161499023, + "kl_loss_2": 2147.7636474609376, + "kl_loss_3": 1686.63291015625, + "kl_loss_7": 615.6221435546875, + "learning_rate": 0.00030390441650199725, + "loss": 1158.5613, + "step": 6320 + }, + { + "ce_loss_10": 3.528099310398102, + "ce_loss_13": 3.4539591908454894, + "ce_loss_2": 4.50098488330841, + "ce_loss_3": 4.2318372368812565, + "ce_loss_7": 3.70650874376297, + "epoch": 0.633, + "grad_norm": 676.0, + "kl_loss_10": 181.68777465820312, + "kl_loss_2": 2164.3626098632812, + "kl_loss_3": 1687.4978088378907, + "kl_loss_7": 610.1166748046875, + "learning_rate": 0.00030244586084303903, + "loss": 1154.3, + "step": 6330 + }, + { + "ce_loss_10": 3.4934327363967896, + "ce_loss_13": 3.416351318359375, + "ce_loss_2": 4.505590105056763, + "ce_loss_3": 4.235206222534179, + "ce_loss_7": 3.6859657049179075, + "epoch": 0.634, + "grad_norm": 564.0, + "kl_loss_10": 183.83423309326173, + "kl_loss_2": 2252.3034423828126, + "kl_loss_3": 1765.4126586914062, + "kl_loss_7": 627.5603424072266, + "learning_rate": 0.00030098929455206903, + "loss": 1173.0053, + "step": 6340 + }, + { + "ce_loss_10": 3.5009153842926026, + "ce_loss_13": 3.4256786108016968, + "ce_loss_2": 4.492053604125976, + "ce_loss_3": 4.224474251270294, + "ce_loss_7": 3.6754886388778685, + "epoch": 0.635, + "grad_norm": 592.0, + "kl_loss_10": 180.03737106323243, + "kl_loss_2": 2236.1428771972655, + "kl_loss_3": 1754.843768310547, + "kl_loss_7": 615.7969268798828, + "learning_rate": 0.00029953473229669324, + "loss": 1215.3177, + "step": 6350 + }, + { + "ce_loss_10": 3.5320404410362243, + "ce_loss_13": 3.4564929485321043, + "ce_loss_2": 4.505661821365356, + "ce_loss_3": 4.2487224817276, + "ce_loss_7": 3.717836594581604, + "epoch": 0.636, + "grad_norm": 560.0, + "kl_loss_10": 180.38322067260742, + "kl_loss_2": 2164.6011291503905, + "kl_loss_3": 1703.929705810547, + "kl_loss_7": 616.6229248046875, + "learning_rate": 0.00029808218872433767, + "loss": 1152.0346, + "step": 6360 + }, + { + "ce_loss_10": 3.5955461502075194, + "ce_loss_13": 3.521399176120758, + "ce_loss_2": 4.553752660751343, + "ce_loss_3": 4.287336015701294, + "ce_loss_7": 3.7661701798439027, + "epoch": 0.637, + "grad_norm": 584.0, + "kl_loss_10": 178.29160919189454, + "kl_loss_2": 2154.0076416015627, + "kl_loss_3": 1683.874932861328, + "kl_loss_7": 604.1974517822266, + "learning_rate": 0.0002966316784621, + "loss": 1148.5613, + "step": 6370 + }, + { + "ce_loss_10": 3.509734773635864, + "ce_loss_13": 3.4283226490020753, + "ce_loss_2": 4.500501930713654, + "ce_loss_3": 4.237296044826508, + "ce_loss_7": 3.697099339962006, + "epoch": 0.638, + "grad_norm": 572.0, + "kl_loss_10": 183.85193252563477, + "kl_loss_2": 2219.202685546875, + "kl_loss_3": 1744.0482971191407, + "kl_loss_7": 628.3488616943359, + "learning_rate": 0.0002951832161166024, + "loss": 1161.3599, + "step": 6380 + }, + { + "ce_loss_10": 3.5833853006362917, + "ce_loss_13": 3.5059871673583984, + "ce_loss_2": 4.560363245010376, + "ce_loss_3": 4.295464622974396, + "ce_loss_7": 3.7690476536750794, + "epoch": 0.639, + "grad_norm": 524.0, + "kl_loss_10": 182.52049560546874, + "kl_loss_2": 2159.0023681640623, + "kl_loss_3": 1692.8950073242188, + "kl_loss_7": 613.0403747558594, + "learning_rate": 0.0002937368162738445, + "loss": 1138.2498, + "step": 6390 + }, + { + "ce_loss_10": 3.5200544476509092, + "ce_loss_13": 3.4506627917289734, + "ce_loss_2": 4.487471246719361, + "ce_loss_3": 4.225354993343354, + "ce_loss_7": 3.695161283016205, + "epoch": 0.64, + "grad_norm": 648.0, + "kl_loss_10": 174.7782325744629, + "kl_loss_2": 2168.6176025390623, + "kl_loss_3": 1700.3829406738282, + "kl_loss_7": 598.6395080566406, + "learning_rate": 0.0002922924934990568, + "loss": 1174.9205, + "step": 6400 + }, + { + "ce_loss_10": 3.460334539413452, + "ce_loss_13": 3.3851306796073914, + "ce_loss_2": 4.486385345458984, + "ce_loss_3": 4.209315371513367, + "ce_loss_7": 3.646379458904266, + "epoch": 0.641, + "grad_norm": 592.0, + "kl_loss_10": 181.4815986633301, + "kl_loss_2": 2269.7528442382813, + "kl_loss_3": 1780.3827819824219, + "kl_loss_7": 623.2667114257813, + "learning_rate": 0.0002908502623365536, + "loss": 1180.7166, + "step": 6410 + }, + { + "ce_loss_10": 3.400831735134125, + "ce_loss_13": 3.323111522197723, + "ce_loss_2": 4.43465530872345, + "ce_loss_3": 4.168019390106201, + "ce_loss_7": 3.5887860655784607, + "epoch": 0.642, + "grad_norm": 584.0, + "kl_loss_10": 180.2845359802246, + "kl_loss_2": 2285.9919677734374, + "kl_loss_3": 1807.420263671875, + "kl_loss_7": 623.3097045898437, + "learning_rate": 0.0002894101373095867, + "loss": 1196.7524, + "step": 6420 + }, + { + "ce_loss_10": 3.610305404663086, + "ce_loss_13": 3.5335601687431337, + "ce_loss_2": 4.569022560119629, + "ce_loss_3": 4.3065975427627565, + "ce_loss_7": 3.788155424594879, + "epoch": 0.643, + "grad_norm": 656.0, + "kl_loss_10": 185.8272720336914, + "kl_loss_2": 2151.861962890625, + "kl_loss_3": 1684.6893188476563, + "kl_loss_7": 614.9325622558594, + "learning_rate": 0.00028797213292019926, + "loss": 1162.4543, + "step": 6430 + }, + { + "ce_loss_10": 3.5838815212249755, + "ce_loss_13": 3.5060059309005736, + "ce_loss_2": 4.542932081222534, + "ce_loss_3": 4.284253716468811, + "ce_loss_7": 3.7631338119506834, + "epoch": 0.644, + "grad_norm": 536.0, + "kl_loss_10": 182.34116134643554, + "kl_loss_2": 2161.139373779297, + "kl_loss_3": 1697.281298828125, + "kl_loss_7": 612.268832397461, + "learning_rate": 0.0002865362636490791, + "loss": 1187.0314, + "step": 6440 + }, + { + "ce_loss_10": 3.598045587539673, + "ce_loss_13": 3.524975371360779, + "ce_loss_2": 4.552241158485413, + "ce_loss_3": 4.294051146507263, + "ce_loss_7": 3.7727458000183107, + "epoch": 0.645, + "grad_norm": 536.0, + "kl_loss_10": 178.94673080444335, + "kl_loss_2": 2142.5753845214845, + "kl_loss_3": 1685.976092529297, + "kl_loss_7": 604.8534698486328, + "learning_rate": 0.0002851025439554142, + "loss": 1148.6578, + "step": 6450 + }, + { + "ce_loss_10": 3.5864033341407775, + "ce_loss_13": 3.5102365136146547, + "ce_loss_2": 4.530459260940551, + "ce_loss_3": 4.2697702050209045, + "ce_loss_7": 3.77256600856781, + "epoch": 0.646, + "grad_norm": 552.0, + "kl_loss_10": 180.96249084472657, + "kl_loss_2": 2094.927349853516, + "kl_loss_3": 1631.7639465332031, + "kl_loss_7": 608.3878204345704, + "learning_rate": 0.00028367098827674573, + "loss": 1141.2359, + "step": 6460 + }, + { + "ce_loss_10": 3.5153084993362427, + "ce_loss_13": 3.4397502303123475, + "ce_loss_2": 4.504270768165588, + "ce_loss_3": 4.232501339912415, + "ce_loss_7": 3.69371120929718, + "epoch": 0.647, + "grad_norm": 588.0, + "kl_loss_10": 178.14280624389647, + "kl_loss_2": 2186.196148681641, + "kl_loss_3": 1706.314013671875, + "kl_loss_7": 600.8890106201172, + "learning_rate": 0.00028224161102882397, + "loss": 1170.0225, + "step": 6470 + }, + { + "ce_loss_10": 3.494782865047455, + "ce_loss_13": 3.418469178676605, + "ce_loss_2": 4.45595076084137, + "ce_loss_3": 4.1893230199813845, + "ce_loss_7": 3.6707924604415894, + "epoch": 0.648, + "grad_norm": 644.0, + "kl_loss_10": 177.30072097778321, + "kl_loss_2": 2145.1575622558594, + "kl_loss_3": 1676.7467163085937, + "kl_loss_7": 599.7084075927735, + "learning_rate": 0.00028081442660546124, + "loss": 1164.476, + "step": 6480 + }, + { + "ce_loss_10": 3.5571305990219115, + "ce_loss_13": 3.4820198893547056, + "ce_loss_2": 4.520304107666016, + "ce_loss_3": 4.250013303756714, + "ce_loss_7": 3.7307825326919555, + "epoch": 0.649, + "grad_norm": 708.0, + "kl_loss_10": 180.5020294189453, + "kl_loss_2": 2162.021893310547, + "kl_loss_3": 1681.335223388672, + "kl_loss_7": 604.5856201171875, + "learning_rate": 0.0002793894493783892, + "loss": 1161.7205, + "step": 6490 + }, + { + "ce_loss_10": 3.5730626702308657, + "ce_loss_13": 3.4996850967407225, + "ce_loss_2": 4.535577750205993, + "ce_loss_3": 4.2806238532066345, + "ce_loss_7": 3.746683120727539, + "epoch": 0.65, + "grad_norm": 532.0, + "kl_loss_10": 175.4969383239746, + "kl_loss_2": 2147.980224609375, + "kl_loss_3": 1685.3772094726562, + "kl_loss_7": 592.4264343261718, + "learning_rate": 0.0002779666936971129, + "loss": 1147.2826, + "step": 6500 + }, + { + "ce_loss_10": 3.579540717601776, + "ce_loss_13": 3.503932845592499, + "ce_loss_2": 4.570328307151795, + "ce_loss_3": 4.304692578315735, + "ce_loss_7": 3.760816919803619, + "epoch": 0.651, + "grad_norm": 560.0, + "kl_loss_10": 180.5074890136719, + "kl_loss_2": 2190.618371582031, + "kl_loss_3": 1722.614599609375, + "kl_loss_7": 613.0008575439454, + "learning_rate": 0.00027654617388876614, + "loss": 1176.0404, + "step": 6510 + }, + { + "ce_loss_10": 3.6101376891136168, + "ce_loss_13": 3.5372079849243163, + "ce_loss_2": 4.574031090736389, + "ce_loss_3": 4.305119824409485, + "ce_loss_7": 3.7854838371276855, + "epoch": 0.652, + "grad_norm": 506.0, + "kl_loss_10": 179.79571838378905, + "kl_loss_2": 2171.3123046875, + "kl_loss_3": 1687.553973388672, + "kl_loss_7": 603.6635681152344, + "learning_rate": 0.0002751279042579672, + "loss": 1161.0621, + "step": 6520 + }, + { + "ce_loss_10": 3.5500629782676696, + "ce_loss_13": 3.475240981578827, + "ce_loss_2": 4.515746712684631, + "ce_loss_3": 4.248768877983093, + "ce_loss_7": 3.726365828514099, + "epoch": 0.653, + "grad_norm": 520.0, + "kl_loss_10": 175.72006454467774, + "kl_loss_2": 2132.0258361816404, + "kl_loss_3": 1663.1835388183595, + "kl_loss_7": 593.7185913085938, + "learning_rate": 0.00027371189908667604, + "loss": 1173.0754, + "step": 6530 + }, + { + "ce_loss_10": 3.6066598892211914, + "ce_loss_13": 3.5256664633750914, + "ce_loss_2": 4.603748297691345, + "ce_loss_3": 4.334264886379242, + "ce_loss_7": 3.7890505313873293, + "epoch": 0.654, + "grad_norm": 556.0, + "kl_loss_10": 184.21709976196288, + "kl_loss_2": 2224.855090332031, + "kl_loss_3": 1750.1156860351562, + "kl_loss_7": 618.167140197754, + "learning_rate": 0.00027229817263404863, + "loss": 1200.1538, + "step": 6540 + }, + { + "ce_loss_10": 3.579574966430664, + "ce_loss_13": 3.502782142162323, + "ce_loss_2": 4.505041122436523, + "ce_loss_3": 4.243296790122986, + "ce_loss_7": 3.751440441608429, + "epoch": 0.655, + "grad_norm": 552.0, + "kl_loss_10": 178.47067565917968, + "kl_loss_2": 2091.8775146484377, + "kl_loss_3": 1621.7340759277345, + "kl_loss_7": 596.6207824707031, + "learning_rate": 0.0002708867391362948, + "loss": 1145.7798, + "step": 6550 + }, + { + "ce_loss_10": 3.5594303607940674, + "ce_loss_13": 3.4848424673080443, + "ce_loss_2": 4.510186004638672, + "ce_loss_3": 4.239946413040161, + "ce_loss_7": 3.729544758796692, + "epoch": 0.656, + "grad_norm": 600.0, + "kl_loss_10": 174.02187423706056, + "kl_loss_2": 2098.8442565917967, + "kl_loss_3": 1625.1830017089844, + "kl_loss_7": 579.9141540527344, + "learning_rate": 0.0002694776128065345, + "loss": 1152.9096, + "step": 6560 + }, + { + "ce_loss_10": 3.500006926059723, + "ce_loss_13": 3.4239490151405336, + "ce_loss_2": 4.463183629512787, + "ce_loss_3": 4.198473536968232, + "ce_loss_7": 3.6787616848945617, + "epoch": 0.657, + "grad_norm": 524.0, + "kl_loss_10": 181.49803161621094, + "kl_loss_2": 2175.4376220703125, + "kl_loss_3": 1692.8995849609375, + "kl_loss_7": 616.3900787353516, + "learning_rate": 0.00026807080783465374, + "loss": 1144.908, + "step": 6570 + }, + { + "ce_loss_10": 3.609026849269867, + "ce_loss_13": 3.5301132678985594, + "ce_loss_2": 4.5842578411102295, + "ce_loss_3": 4.322691702842713, + "ce_loss_7": 3.7907386422157288, + "epoch": 0.658, + "grad_norm": 540.0, + "kl_loss_10": 181.2076759338379, + "kl_loss_2": 2170.1943603515624, + "kl_loss_3": 1703.6680847167968, + "kl_loss_7": 614.0023223876954, + "learning_rate": 0.00026666633838716316, + "loss": 1180.9756, + "step": 6580 + }, + { + "ce_loss_10": 3.505808639526367, + "ce_loss_13": 3.424725067615509, + "ce_loss_2": 4.498762392997742, + "ce_loss_3": 4.228746104240417, + "ce_loss_7": 3.6866647005081177, + "epoch": 0.659, + "grad_norm": 660.0, + "kl_loss_10": 183.16211471557617, + "kl_loss_2": 2205.399041748047, + "kl_loss_3": 1729.7577697753907, + "kl_loss_7": 619.1098449707031, + "learning_rate": 0.00026526421860705474, + "loss": 1196.5574, + "step": 6590 + }, + { + "ce_loss_10": 3.5278443932533263, + "ce_loss_13": 3.451081359386444, + "ce_loss_2": 4.514556968212128, + "ce_loss_3": 4.246424973011017, + "ce_loss_7": 3.7130470991134645, + "epoch": 0.66, + "grad_norm": 604.0, + "kl_loss_10": 183.11859054565429, + "kl_loss_2": 2195.080352783203, + "kl_loss_3": 1720.6291870117188, + "kl_loss_7": 617.9165832519532, + "learning_rate": 0.0002638644626136587, + "loss": 1167.115, + "step": 6600 + }, + { + "ce_loss_10": 3.5388341546058655, + "ce_loss_13": 3.4648394107818605, + "ce_loss_2": 4.518757033348083, + "ce_loss_3": 4.251232302188873, + "ce_loss_7": 3.7166133403778074, + "epoch": 0.661, + "grad_norm": 600.0, + "kl_loss_10": 177.9744987487793, + "kl_loss_2": 2169.1557434082033, + "kl_loss_3": 1695.717547607422, + "kl_loss_7": 605.9922027587891, + "learning_rate": 0.00026246708450250255, + "loss": 1163.9504, + "step": 6610 + }, + { + "ce_loss_10": 3.536445343494415, + "ce_loss_13": 3.4618695259094237, + "ce_loss_2": 4.500265717506409, + "ce_loss_3": 4.239484262466431, + "ce_loss_7": 3.7084587097167967, + "epoch": 0.662, + "grad_norm": 624.0, + "kl_loss_10": 177.32165069580077, + "kl_loss_2": 2153.5470336914063, + "kl_loss_3": 1682.642596435547, + "kl_loss_7": 596.0139556884766, + "learning_rate": 0.00026107209834516854, + "loss": 1159.9879, + "step": 6620 + }, + { + "ce_loss_10": 3.4876843810081484, + "ce_loss_13": 3.4082067966461183, + "ce_loss_2": 4.498864269256591, + "ce_loss_3": 4.235283279418946, + "ce_loss_7": 3.666292154788971, + "epoch": 0.663, + "grad_norm": 620.0, + "kl_loss_10": 180.81258544921874, + "kl_loss_2": 2256.257257080078, + "kl_loss_3": 1782.119805908203, + "kl_loss_7": 615.3174774169922, + "learning_rate": 0.0002596795181891514, + "loss": 1197.8284, + "step": 6630 + }, + { + "ce_loss_10": 3.4956326842308045, + "ce_loss_13": 3.414097845554352, + "ce_loss_2": 4.498321509361267, + "ce_loss_3": 4.228005886077881, + "ce_loss_7": 3.6832273960113526, + "epoch": 0.664, + "grad_norm": 676.0, + "kl_loss_10": 186.22876663208007, + "kl_loss_2": 2223.420690917969, + "kl_loss_3": 1743.7244079589843, + "kl_loss_7": 627.1363647460937, + "learning_rate": 0.000258289358057718, + "loss": 1222.5622, + "step": 6640 + }, + { + "ce_loss_10": 3.5669368505477905, + "ce_loss_13": 3.4856945157051085, + "ce_loss_2": 4.551595258712768, + "ce_loss_3": 4.286670958995819, + "ce_loss_7": 3.751522934436798, + "epoch": 0.665, + "grad_norm": 556.0, + "kl_loss_10": 185.22528228759765, + "kl_loss_2": 2211.366003417969, + "kl_loss_3": 1740.3289489746094, + "kl_loss_7": 619.9797149658203, + "learning_rate": 0.0002569016319497657, + "loss": 1184.505, + "step": 6650 + }, + { + "ce_loss_10": 3.5523419260978697, + "ce_loss_13": 3.4712039113044737, + "ce_loss_2": 4.537931609153747, + "ce_loss_3": 4.279768109321594, + "ce_loss_7": 3.7336499214172365, + "epoch": 0.666, + "grad_norm": 544.0, + "kl_loss_10": 186.748779296875, + "kl_loss_2": 2205.344372558594, + "kl_loss_3": 1734.7506713867188, + "kl_loss_7": 622.0667907714844, + "learning_rate": 0.00025551635383968066, + "loss": 1198.5273, + "step": 6660 + }, + { + "ce_loss_10": 3.463807392120361, + "ce_loss_13": 3.3866657257080077, + "ce_loss_2": 4.469345259666443, + "ce_loss_3": 4.193110883235931, + "ce_loss_7": 3.6494885683059692, + "epoch": 0.667, + "grad_norm": 576.0, + "kl_loss_10": 184.71422729492187, + "kl_loss_2": 2248.0073059082033, + "kl_loss_3": 1753.0323059082032, + "kl_loss_7": 619.0755401611328, + "learning_rate": 0.00025413353767719804, + "loss": 1195.2947, + "step": 6670 + }, + { + "ce_loss_10": 3.5190200567245484, + "ce_loss_13": 3.4452382922172546, + "ce_loss_2": 4.497325706481933, + "ce_loss_3": 4.232502174377442, + "ce_loss_7": 3.694538187980652, + "epoch": 0.668, + "grad_norm": 568.0, + "kl_loss_10": 177.03625259399413, + "kl_loss_2": 2189.713330078125, + "kl_loss_3": 1718.3252258300781, + "kl_loss_7": 606.7687957763671, + "learning_rate": 0.0002527531973872617, + "loss": 1177.4366, + "step": 6680 + }, + { + "ce_loss_10": 3.538633036613464, + "ce_loss_13": 3.4624911904335023, + "ce_loss_2": 4.504513430595398, + "ce_loss_3": 4.237690329551697, + "ce_loss_7": 3.7170337319374083, + "epoch": 0.669, + "grad_norm": 592.0, + "kl_loss_10": 178.7047462463379, + "kl_loss_2": 2166.141973876953, + "kl_loss_3": 1687.0463806152343, + "kl_loss_7": 609.9779602050781, + "learning_rate": 0.0002513753468698826, + "loss": 1160.7738, + "step": 6690 + }, + { + "ce_loss_10": 3.510761630535126, + "ce_loss_13": 3.4321574330329896, + "ce_loss_2": 4.506561207771301, + "ce_loss_3": 4.238252663612366, + "ce_loss_7": 3.6953013062477114, + "epoch": 0.67, + "grad_norm": 544.0, + "kl_loss_10": 182.07874755859376, + "kl_loss_2": 2231.308416748047, + "kl_loss_3": 1749.7796997070313, + "kl_loss_7": 618.383251953125, + "learning_rate": 0.0002500000000000001, + "loss": 1185.6723, + "step": 6700 + }, + { + "ce_loss_10": 3.62176308631897, + "ce_loss_13": 3.547257494926453, + "ce_loss_2": 4.5295734882354735, + "ce_loss_3": 4.277957272529602, + "ce_loss_7": 3.788572609424591, + "epoch": 0.671, + "grad_norm": 548.0, + "kl_loss_10": 173.84563446044922, + "kl_loss_2": 2050.3306396484377, + "kl_loss_3": 1604.508935546875, + "kl_loss_7": 584.9092193603516, + "learning_rate": 0.0002486271706273421, + "loss": 1168.4034, + "step": 6710 + }, + { + "ce_loss_10": 3.557868146896362, + "ce_loss_13": 3.485461747646332, + "ce_loss_2": 4.488811063766479, + "ce_loss_3": 4.22833331823349, + "ce_loss_7": 3.7254873156547545, + "epoch": 0.672, + "grad_norm": 644.0, + "kl_loss_10": 175.32781143188475, + "kl_loss_2": 2091.0470642089845, + "kl_loss_3": 1627.7723266601563, + "kl_loss_7": 589.4509521484375, + "learning_rate": 0.0002472568725762853, + "loss": 1154.7741, + "step": 6720 + }, + { + "ce_loss_10": 3.5483877897262572, + "ce_loss_13": 3.4755659341812133, + "ce_loss_2": 4.477482891082763, + "ce_loss_3": 4.2254945039749146, + "ce_loss_7": 3.717711102962494, + "epoch": 0.673, + "grad_norm": 564.0, + "kl_loss_10": 173.398193359375, + "kl_loss_2": 2110.752795410156, + "kl_loss_3": 1653.0817565917969, + "kl_loss_7": 586.6092742919922, + "learning_rate": 0.00024588911964571554, + "loss": 1145.849, + "step": 6730 + }, + { + "ce_loss_10": 3.5657129168510435, + "ce_loss_13": 3.4857802987098694, + "ce_loss_2": 4.5576330661773685, + "ce_loss_3": 4.288707995414734, + "ce_loss_7": 3.753812789916992, + "epoch": 0.674, + "grad_norm": 524.0, + "kl_loss_10": 187.46376113891603, + "kl_loss_2": 2205.4671936035156, + "kl_loss_3": 1727.960107421875, + "kl_loss_7": 626.6214233398438, + "learning_rate": 0.00024452392560888974, + "loss": 1167.7188, + "step": 6740 + }, + { + "ce_loss_10": 3.455358147621155, + "ce_loss_13": 3.378260016441345, + "ce_loss_2": 4.419786167144776, + "ce_loss_3": 4.155612635612488, + "ce_loss_7": 3.6355133295059203, + "epoch": 0.675, + "grad_norm": 532.0, + "kl_loss_10": 177.83211364746094, + "kl_loss_2": 2172.1373962402345, + "kl_loss_3": 1704.7519836425781, + "kl_loss_7": 606.2507995605469, + "learning_rate": 0.00024316130421329695, + "loss": 1157.1621, + "step": 6750 + }, + { + "ce_loss_10": 3.535005438327789, + "ce_loss_13": 3.4575978398323057, + "ce_loss_2": 4.505799317359925, + "ce_loss_3": 4.232890093326569, + "ce_loss_7": 3.7101247310638428, + "epoch": 0.676, + "grad_norm": 564.0, + "kl_loss_10": 177.2459358215332, + "kl_loss_2": 2136.964288330078, + "kl_loss_3": 1660.3365295410156, + "kl_loss_7": 591.8486877441406, + "learning_rate": 0.00024180126918051909, + "loss": 1154.5281, + "step": 6760 + }, + { + "ce_loss_10": 3.577043890953064, + "ce_loss_13": 3.5019183993339538, + "ce_loss_2": 4.527614569664001, + "ce_loss_3": 4.265857553482055, + "ce_loss_7": 3.7534381628036497, + "epoch": 0.677, + "grad_norm": 604.0, + "kl_loss_10": 178.1947784423828, + "kl_loss_2": 2127.7522155761717, + "kl_loss_3": 1659.7866516113281, + "kl_loss_7": 597.4583068847656, + "learning_rate": 0.00024044383420609406, + "loss": 1141.1451, + "step": 6770 + }, + { + "ce_loss_10": 3.589032161235809, + "ce_loss_13": 3.514654505252838, + "ce_loss_2": 4.520573258399963, + "ce_loss_3": 4.2588379859924315, + "ce_loss_7": 3.7536001801490784, + "epoch": 0.678, + "grad_norm": 552.0, + "kl_loss_10": 175.52578201293946, + "kl_loss_2": 2107.4931701660157, + "kl_loss_3": 1641.6564514160157, + "kl_loss_7": 591.3939788818359, + "learning_rate": 0.00023908901295937712, + "loss": 1175.0256, + "step": 6780 + }, + { + "ce_loss_10": 3.5837427616119384, + "ce_loss_13": 3.505910849571228, + "ce_loss_2": 4.535600376129151, + "ce_loss_3": 4.271885943412781, + "ce_loss_7": 3.755298101902008, + "epoch": 0.679, + "grad_norm": 596.0, + "kl_loss_10": 177.23758392333986, + "kl_loss_2": 2111.0602905273436, + "kl_loss_3": 1645.3706420898438, + "kl_loss_7": 592.0553497314453, + "learning_rate": 0.00023773681908340283, + "loss": 1169.8496, + "step": 6790 + }, + { + "ce_loss_10": 3.5592074632644652, + "ce_loss_13": 3.4772790670394897, + "ce_loss_2": 4.548656535148621, + "ce_loss_3": 4.2832125425338745, + "ce_loss_7": 3.7424607038497926, + "epoch": 0.68, + "grad_norm": 600.0, + "kl_loss_10": 187.7086395263672, + "kl_loss_2": 2222.717413330078, + "kl_loss_3": 1751.2139282226562, + "kl_loss_7": 621.8694488525391, + "learning_rate": 0.00023638726619474876, + "loss": 1203.8379, + "step": 6800 + }, + { + "ce_loss_10": 3.5476158022880555, + "ce_loss_13": 3.467449462413788, + "ce_loss_2": 4.563005781173706, + "ce_loss_3": 4.29602427482605, + "ce_loss_7": 3.7365992784500124, + "epoch": 0.681, + "grad_norm": 580.0, + "kl_loss_10": 183.3829345703125, + "kl_loss_2": 2228.727575683594, + "kl_loss_3": 1760.1514953613282, + "kl_loss_7": 626.0395812988281, + "learning_rate": 0.0002350403678833976, + "loss": 1182.506, + "step": 6810 + }, + { + "ce_loss_10": 3.4750794649124144, + "ce_loss_13": 3.39876846075058, + "ce_loss_2": 4.465261030197143, + "ce_loss_3": 4.1875766038894655, + "ce_loss_7": 3.6560620784759523, + "epoch": 0.682, + "grad_norm": 490.0, + "kl_loss_10": 178.67537307739258, + "kl_loss_2": 2200.3225830078127, + "kl_loss_3": 1714.0246276855469, + "kl_loss_7": 608.0651733398438, + "learning_rate": 0.00023369613771260007, + "loss": 1160.444, + "step": 6820 + }, + { + "ce_loss_10": 3.5863471627235413, + "ce_loss_13": 3.5096321582794188, + "ce_loss_2": 4.573717498779297, + "ce_loss_3": 4.300703597068787, + "ce_loss_7": 3.767488884925842, + "epoch": 0.683, + "grad_norm": 608.0, + "kl_loss_10": 181.20342712402345, + "kl_loss_2": 2206.5099487304688, + "kl_loss_3": 1721.635076904297, + "kl_loss_7": 610.5273590087891, + "learning_rate": 0.00023235458921873925, + "loss": 1187.8242, + "step": 6830 + }, + { + "ce_loss_10": 3.5434704184532166, + "ce_loss_13": 3.4598939180374146, + "ce_loss_2": 4.561892867088318, + "ce_loss_3": 4.300772976875305, + "ce_loss_7": 3.73870370388031, + "epoch": 0.684, + "grad_norm": 640.0, + "kl_loss_10": 187.79423599243165, + "kl_loss_2": 2268.489392089844, + "kl_loss_3": 1798.0396545410156, + "kl_loss_7": 637.83154296875, + "learning_rate": 0.0002310157359111938, + "loss": 1215.3348, + "step": 6840 + }, + { + "ce_loss_10": 3.4310184836387636, + "ce_loss_13": 3.3527446746826173, + "ce_loss_2": 4.52064049243927, + "ce_loss_3": 4.243770575523376, + "ce_loss_7": 3.6270575404167174, + "epoch": 0.685, + "grad_norm": 656.0, + "kl_loss_10": 183.79262008666993, + "kl_loss_2": 2376.693957519531, + "kl_loss_3": 1883.7591125488282, + "kl_loss_7": 632.1836151123047, + "learning_rate": 0.0002296795912722014, + "loss": 1227.4164, + "step": 6850 + }, + { + "ce_loss_10": 3.570713925361633, + "ce_loss_13": 3.494589388370514, + "ce_loss_2": 4.519134759902954, + "ce_loss_3": 4.253862988948822, + "ce_loss_7": 3.7498515605926515, + "epoch": 0.686, + "grad_norm": 576.0, + "kl_loss_10": 179.78029174804686, + "kl_loss_2": 2128.572625732422, + "kl_loss_3": 1654.8151062011718, + "kl_loss_7": 601.8367279052734, + "learning_rate": 0.0002283461687567236, + "loss": 1133.3289, + "step": 6860 + }, + { + "ce_loss_10": 3.6324430108070374, + "ce_loss_13": 3.5565361857414244, + "ce_loss_2": 4.557056021690369, + "ce_loss_3": 4.298560571670532, + "ce_loss_7": 3.8067931652069094, + "epoch": 0.687, + "grad_norm": 506.0, + "kl_loss_10": 176.90674362182617, + "kl_loss_2": 2057.1298095703123, + "kl_loss_3": 1601.75263671875, + "kl_loss_7": 589.7931121826172, + "learning_rate": 0.00022701548179231045, + "loss": 1148.6605, + "step": 6870 + }, + { + "ce_loss_10": 3.582988679409027, + "ce_loss_13": 3.5054625153541563, + "ce_loss_2": 4.558988261222839, + "ce_loss_3": 4.300906538963318, + "ce_loss_7": 3.7617339849472047, + "epoch": 0.688, + "grad_norm": 628.0, + "kl_loss_10": 181.7885940551758, + "kl_loss_2": 2183.6723815917967, + "kl_loss_3": 1725.755859375, + "kl_loss_7": 613.1118804931641, + "learning_rate": 0.00022568754377896516, + "loss": 1157.5781, + "step": 6880 + }, + { + "ce_loss_10": 3.5760830521583555, + "ce_loss_13": 3.499359941482544, + "ce_loss_2": 4.526648283004761, + "ce_loss_3": 4.2548288941383365, + "ce_loss_7": 3.746409332752228, + "epoch": 0.689, + "grad_norm": 596.0, + "kl_loss_10": 180.4591537475586, + "kl_loss_2": 2140.339678955078, + "kl_loss_3": 1666.426806640625, + "kl_loss_7": 608.2543426513672, + "learning_rate": 0.00022436236808900844, + "loss": 1146.7832, + "step": 6890 + }, + { + "ce_loss_10": 3.4679219722747803, + "ce_loss_13": 3.3943055748939512, + "ce_loss_2": 4.462708353996277, + "ce_loss_3": 4.191701900959015, + "ce_loss_7": 3.6564658761024473, + "epoch": 0.69, + "grad_norm": 576.0, + "kl_loss_10": 181.4543029785156, + "kl_loss_2": 2231.3488586425783, + "kl_loss_3": 1747.2512329101562, + "kl_loss_7": 621.3712341308594, + "learning_rate": 0.00022303996806694487, + "loss": 1171.5013, + "step": 6900 + }, + { + "ce_loss_10": 3.5484726190567017, + "ce_loss_13": 3.4742958664894106, + "ce_loss_2": 4.519419646263122, + "ce_loss_3": 4.2546670794487, + "ce_loss_7": 3.7258636236190794, + "epoch": 0.691, + "grad_norm": 628.0, + "kl_loss_10": 177.98818740844726, + "kl_loss_2": 2169.697412109375, + "kl_loss_3": 1700.1277648925782, + "kl_loss_7": 608.3069030761719, + "learning_rate": 0.00022172035702932823, + "loss": 1158.7983, + "step": 6910 + }, + { + "ce_loss_10": 3.5924888372421266, + "ce_loss_13": 3.52042818069458, + "ce_loss_2": 4.517103600502014, + "ce_loss_3": 4.2600155711174015, + "ce_loss_7": 3.7615610361099243, + "epoch": 0.692, + "grad_norm": 644.0, + "kl_loss_10": 178.67746124267578, + "kl_loss_2": 2075.089074707031, + "kl_loss_3": 1619.080419921875, + "kl_loss_7": 597.07578125, + "learning_rate": 0.00022040354826462666, + "loss": 1140.3766, + "step": 6920 + }, + { + "ce_loss_10": 3.5235054731369018, + "ce_loss_13": 3.4497315883636475, + "ce_loss_2": 4.493763208389282, + "ce_loss_3": 4.228443372249603, + "ce_loss_7": 3.696590280532837, + "epoch": 0.693, + "grad_norm": 608.0, + "kl_loss_10": 176.88443298339843, + "kl_loss_2": 2155.86865234375, + "kl_loss_3": 1688.133123779297, + "kl_loss_7": 594.834016418457, + "learning_rate": 0.0002190895550330899, + "loss": 1170.6351, + "step": 6930 + }, + { + "ce_loss_10": 3.4576660275459288, + "ce_loss_13": 3.3801838874816896, + "ce_loss_2": 4.465155124664307, + "ce_loss_3": 4.190037369728088, + "ce_loss_7": 3.644961953163147, + "epoch": 0.694, + "grad_norm": 596.0, + "kl_loss_10": 183.47678833007814, + "kl_loss_2": 2243.730157470703, + "kl_loss_3": 1750.4187255859374, + "kl_loss_7": 620.829443359375, + "learning_rate": 0.00021777839056661552, + "loss": 1165.1125, + "step": 6940 + }, + { + "ce_loss_10": 3.5390109062194823, + "ce_loss_13": 3.464726150035858, + "ce_loss_2": 4.509364485740662, + "ce_loss_3": 4.234912276268005, + "ce_loss_7": 3.7123560190200804, + "epoch": 0.695, + "grad_norm": 544.0, + "kl_loss_10": 176.9818588256836, + "kl_loss_2": 2161.626544189453, + "kl_loss_3": 1678.7994750976563, + "kl_loss_7": 599.7094299316407, + "learning_rate": 0.0002164700680686147, + "loss": 1138.0607, + "step": 6950 + }, + { + "ce_loss_10": 3.584149193763733, + "ce_loss_13": 3.509235203266144, + "ce_loss_2": 4.522939825057984, + "ce_loss_3": 4.255844712257385, + "ce_loss_7": 3.757488739490509, + "epoch": 0.696, + "grad_norm": 520.0, + "kl_loss_10": 178.28938369750978, + "kl_loss_2": 2107.5391052246096, + "kl_loss_3": 1637.0810913085938, + "kl_loss_7": 596.8087493896485, + "learning_rate": 0.0002151646007138806, + "loss": 1144.8846, + "step": 6960 + }, + { + "ce_loss_10": 3.463143539428711, + "ce_loss_13": 3.386814093589783, + "ce_loss_2": 4.468677043914795, + "ce_loss_3": 4.195722925662994, + "ce_loss_7": 3.644878602027893, + "epoch": 0.697, + "grad_norm": 592.0, + "kl_loss_10": 182.87069091796874, + "kl_loss_2": 2238.085968017578, + "kl_loss_3": 1753.8017456054688, + "kl_loss_7": 618.0010162353516, + "learning_rate": 0.00021386200164845526, + "loss": 1174.7895, + "step": 6970 + }, + { + "ce_loss_10": 3.646360158920288, + "ce_loss_13": 3.5726787090301513, + "ce_loss_2": 4.5610116720199585, + "ce_loss_3": 4.303556060791015, + "ce_loss_7": 3.814376199245453, + "epoch": 0.698, + "grad_norm": 564.0, + "kl_loss_10": 176.60812377929688, + "kl_loss_2": 2073.2183532714844, + "kl_loss_3": 1616.5314147949218, + "kl_loss_7": 592.3170806884766, + "learning_rate": 0.0002125622839894964, + "loss": 1126.8248, + "step": 6980 + }, + { + "ce_loss_10": 3.5844451546669007, + "ce_loss_13": 3.5105634808540342, + "ce_loss_2": 4.530939984321594, + "ce_loss_3": 4.263714623451233, + "ce_loss_7": 3.7546409368515015, + "epoch": 0.699, + "grad_norm": 580.0, + "kl_loss_10": 177.07121353149415, + "kl_loss_2": 2114.079455566406, + "kl_loss_3": 1646.3038818359375, + "kl_loss_7": 590.21640625, + "learning_rate": 0.00021126546082514663, + "loss": 1144.4324, + "step": 6990 + }, + { + "ce_loss_10": 3.6092105984687803, + "ce_loss_13": 3.533507966995239, + "ce_loss_2": 4.539715147018432, + "ce_loss_3": 4.274128103256226, + "ce_loss_7": 3.7783223032951354, + "epoch": 0.7, + "grad_norm": 576.0, + "kl_loss_10": 177.3388931274414, + "kl_loss_2": 2107.8220703125, + "kl_loss_3": 1636.2730224609375, + "kl_loss_7": 594.1880798339844, + "learning_rate": 0.00020997154521440098, + "loss": 1131.7685, + "step": 7000 + }, + { + "ce_loss_10": 3.5483237147331237, + "ce_loss_13": 3.476468551158905, + "ce_loss_2": 4.5004148244857785, + "ce_loss_3": 4.238211619853973, + "ce_loss_7": 3.722394573688507, + "epoch": 0.701, + "grad_norm": 556.0, + "kl_loss_10": 174.87986907958984, + "kl_loss_2": 2127.186975097656, + "kl_loss_3": 1661.8602966308595, + "kl_loss_7": 600.6610717773438, + "learning_rate": 0.0002086805501869749, + "loss": 1133.7422, + "step": 7010 + }, + { + "ce_loss_10": 3.5188135743141173, + "ce_loss_13": 3.441002869606018, + "ce_loss_2": 4.517698335647583, + "ce_loss_3": 4.247731244564056, + "ce_loss_7": 3.704049062728882, + "epoch": 0.702, + "grad_norm": 616.0, + "kl_loss_10": 182.97085342407226, + "kl_loss_2": 2238.2483459472655, + "kl_loss_3": 1746.861260986328, + "kl_loss_7": 621.9453765869141, + "learning_rate": 0.0002073924887431744, + "loss": 1180.4881, + "step": 7020 + }, + { + "ce_loss_10": 3.5274356603622437, + "ce_loss_13": 3.45092910528183, + "ce_loss_2": 4.4901411771774296, + "ce_loss_3": 4.230588483810425, + "ce_loss_7": 3.706618547439575, + "epoch": 0.703, + "grad_norm": 568.0, + "kl_loss_10": 179.11029281616212, + "kl_loss_2": 2178.3450439453127, + "kl_loss_3": 1711.4957885742188, + "kl_loss_7": 605.4426422119141, + "learning_rate": 0.00020610737385376348, + "loss": 1200.9115, + "step": 7030 + }, + { + "ce_loss_10": 3.5887810468673704, + "ce_loss_13": 3.5163929224014283, + "ce_loss_2": 4.518351888656616, + "ce_loss_3": 4.254893863201142, + "ce_loss_7": 3.7612039923667906, + "epoch": 0.704, + "grad_norm": 628.0, + "kl_loss_10": 176.6663619995117, + "kl_loss_2": 2075.716662597656, + "kl_loss_3": 1610.9020690917969, + "kl_loss_7": 588.8746612548828, + "learning_rate": 0.00020482521845983521, + "loss": 1151.7219, + "step": 7040 + }, + { + "ce_loss_10": 3.5866637587547303, + "ce_loss_13": 3.5072137475013734, + "ce_loss_2": 4.558261132240295, + "ce_loss_3": 4.291126704216003, + "ce_loss_7": 3.7625884056091308, + "epoch": 0.705, + "grad_norm": 600.0, + "kl_loss_10": 182.52303237915038, + "kl_loss_2": 2193.1544799804688, + "kl_loss_3": 1715.2766052246093, + "kl_loss_7": 612.0993133544922, + "learning_rate": 0.00020354603547267987, + "loss": 1187.2512, + "step": 7050 + }, + { + "ce_loss_10": 3.56976774930954, + "ce_loss_13": 3.488901746273041, + "ce_loss_2": 4.5605854988098145, + "ce_loss_3": 4.2862097263336185, + "ce_loss_7": 3.7558568716049194, + "epoch": 0.706, + "grad_norm": 504.0, + "kl_loss_10": 182.46872100830078, + "kl_loss_2": 2185.692938232422, + "kl_loss_3": 1703.4005493164063, + "kl_loss_7": 615.3342132568359, + "learning_rate": 0.00020226983777365604, + "loss": 1201.599, + "step": 7060 + }, + { + "ce_loss_10": 3.46960107088089, + "ce_loss_13": 3.394390141963959, + "ce_loss_2": 4.4708491563797, + "ce_loss_3": 4.21563994884491, + "ce_loss_7": 3.6478799104690554, + "epoch": 0.707, + "grad_norm": 548.0, + "kl_loss_10": 174.23039703369142, + "kl_loss_2": 2219.3698486328126, + "kl_loss_3": 1767.6679748535157, + "kl_loss_7": 596.5048126220703, + "learning_rate": 0.00020099663821406056, + "loss": 1167.8441, + "step": 7070 + }, + { + "ce_loss_10": 3.573564553260803, + "ce_loss_13": 3.4988652229309083, + "ce_loss_2": 4.518075895309448, + "ce_loss_3": 4.2526293873786924, + "ce_loss_7": 3.74619642496109, + "epoch": 0.708, + "grad_norm": 688.0, + "kl_loss_10": 173.7955307006836, + "kl_loss_2": 2112.61328125, + "kl_loss_3": 1644.760516357422, + "kl_loss_7": 588.589468383789, + "learning_rate": 0.00019972644961499853, + "loss": 1168.0168, + "step": 7080 + }, + { + "ce_loss_10": 3.5425114035606384, + "ce_loss_13": 3.4652504205703734, + "ce_loss_2": 4.536031889915466, + "ce_loss_3": 4.265958952903747, + "ce_loss_7": 3.7277685403823853, + "epoch": 0.709, + "grad_norm": 544.0, + "kl_loss_10": 181.94257354736328, + "kl_loss_2": 2208.387451171875, + "kl_loss_3": 1727.9980712890624, + "kl_loss_7": 619.6463317871094, + "learning_rate": 0.00019845928476725522, + "loss": 1173.2897, + "step": 7090 + }, + { + "ce_loss_10": 3.6211097598075868, + "ce_loss_13": 3.542751681804657, + "ce_loss_2": 4.576697874069214, + "ce_loss_3": 4.307754421234131, + "ce_loss_7": 3.794824481010437, + "epoch": 0.71, + "grad_norm": 524.0, + "kl_loss_10": 179.40447082519532, + "kl_loss_2": 2133.6560546875, + "kl_loss_3": 1661.1115417480469, + "kl_loss_7": 603.4232849121094, + "learning_rate": 0.00019719515643116677, + "loss": 1187.0576, + "step": 7100 + }, + { + "ce_loss_10": 3.563658607006073, + "ce_loss_13": 3.486394798755646, + "ce_loss_2": 4.523072552680969, + "ce_loss_3": 4.254948425292969, + "ce_loss_7": 3.7338495373725893, + "epoch": 0.711, + "grad_norm": 560.0, + "kl_loss_10": 177.84368362426758, + "kl_loss_2": 2144.635882568359, + "kl_loss_3": 1666.16875, + "kl_loss_7": 594.3132598876953, + "learning_rate": 0.0001959340773364911, + "loss": 1165.8826, + "step": 7110 + }, + { + "ce_loss_10": 3.5770322680473328, + "ce_loss_13": 3.5012174606323243, + "ce_loss_2": 4.550109481811523, + "ce_loss_3": 4.284217190742493, + "ce_loss_7": 3.7552335023880006, + "epoch": 0.712, + "grad_norm": 482.0, + "kl_loss_10": 179.49577865600585, + "kl_loss_2": 2181.1701049804688, + "kl_loss_3": 1700.0443542480468, + "kl_loss_7": 603.1331329345703, + "learning_rate": 0.0001946760601822809, + "loss": 1144.9554, + "step": 7120 + }, + { + "ce_loss_10": 3.6210792899131774, + "ce_loss_13": 3.549504554271698, + "ce_loss_2": 4.563032126426696, + "ce_loss_3": 4.2925217628479, + "ce_loss_7": 3.7989898562431335, + "epoch": 0.713, + "grad_norm": 592.0, + "kl_loss_10": 177.09535369873046, + "kl_loss_2": 2104.0981018066404, + "kl_loss_3": 1631.4184448242188, + "kl_loss_7": 592.4103118896485, + "learning_rate": 0.00019342111763675512, + "loss": 1123.9035, + "step": 7130 + }, + { + "ce_loss_10": 3.624540627002716, + "ce_loss_13": 3.5509743094444275, + "ce_loss_2": 4.5522850275039675, + "ce_loss_3": 4.289403009414673, + "ce_loss_7": 3.7917919158935547, + "epoch": 0.714, + "grad_norm": 588.0, + "kl_loss_10": 179.54557189941406, + "kl_loss_2": 2098.2009887695312, + "kl_loss_3": 1627.7805236816407, + "kl_loss_7": 597.2573303222656, + "learning_rate": 0.00019216926233717085, + "loss": 1127.0122, + "step": 7140 + }, + { + "ce_loss_10": 3.5141358375549316, + "ce_loss_13": 3.439559853076935, + "ce_loss_2": 4.534635162353515, + "ce_loss_3": 4.271865749359131, + "ce_loss_7": 3.6872041702270506, + "epoch": 0.715, + "grad_norm": 660.0, + "kl_loss_10": 176.31234970092774, + "kl_loss_2": 2255.184912109375, + "kl_loss_3": 1791.5307861328124, + "kl_loss_7": 594.7268737792969, + "learning_rate": 0.00019092050688969737, + "loss": 1192.3771, + "step": 7150 + }, + { + "ce_loss_10": 3.586177408695221, + "ce_loss_13": 3.5133618116378784, + "ce_loss_2": 4.527247905731201, + "ce_loss_3": 4.265925621986389, + "ce_loss_7": 3.7605576038360597, + "epoch": 0.716, + "grad_norm": 644.0, + "kl_loss_10": 177.39978713989257, + "kl_loss_2": 2138.382684326172, + "kl_loss_3": 1670.822119140625, + "kl_loss_7": 599.2600921630859, + "learning_rate": 0.00018967486386928817, + "loss": 1143.1982, + "step": 7160 + }, + { + "ce_loss_10": 3.4582155346870422, + "ce_loss_13": 3.3820405125617983, + "ce_loss_2": 4.456401991844177, + "ce_loss_3": 4.1904214262962345, + "ce_loss_7": 3.640235483646393, + "epoch": 0.717, + "grad_norm": 644.0, + "kl_loss_10": 181.15178756713868, + "kl_loss_2": 2234.275775146484, + "kl_loss_3": 1755.7729919433593, + "kl_loss_7": 621.9208374023438, + "learning_rate": 0.00018843234581955443, + "loss": 1211.3026, + "step": 7170 + }, + { + "ce_loss_10": 3.4746442079544066, + "ce_loss_13": 3.3969290494918822, + "ce_loss_2": 4.4550795435905455, + "ce_loss_3": 4.190334832668304, + "ce_loss_7": 3.6564103603363036, + "epoch": 0.718, + "grad_norm": 552.0, + "kl_loss_10": 182.11315155029297, + "kl_loss_2": 2189.7255920410157, + "kl_loss_3": 1717.2798217773438, + "kl_loss_7": 618.1327026367187, + "learning_rate": 0.00018719296525263924, + "loss": 1174.7828, + "step": 7180 + }, + { + "ce_loss_10": 3.571851980686188, + "ce_loss_13": 3.4972564935684205, + "ce_loss_2": 4.505244612693787, + "ce_loss_3": 4.243821203708649, + "ce_loss_7": 3.744515597820282, + "epoch": 0.719, + "grad_norm": 616.0, + "kl_loss_10": 176.35762710571288, + "kl_loss_2": 2085.3956665039063, + "kl_loss_3": 1620.6713073730468, + "kl_loss_7": 587.7710571289062, + "learning_rate": 0.0001859567346490913, + "loss": 1127.6644, + "step": 7190 + }, + { + "ce_loss_10": 3.5473140597343447, + "ce_loss_13": 3.469071900844574, + "ce_loss_2": 4.532921981811524, + "ce_loss_3": 4.260496711730957, + "ce_loss_7": 3.727588391304016, + "epoch": 0.72, + "grad_norm": 576.0, + "kl_loss_10": 181.04826431274415, + "kl_loss_2": 2198.079150390625, + "kl_loss_3": 1714.5421325683594, + "kl_loss_7": 608.8879028320313, + "learning_rate": 0.0001847236664577389, + "loss": 1142.0284, + "step": 7200 + }, + { + "ce_loss_10": 3.5739798665046694, + "ce_loss_13": 3.498915135860443, + "ce_loss_2": 4.512744069099426, + "ce_loss_3": 4.2453584432601925, + "ce_loss_7": 3.7430235743522644, + "epoch": 0.721, + "grad_norm": 560.0, + "kl_loss_10": 177.07028579711914, + "kl_loss_2": 2100.2286865234373, + "kl_loss_3": 1626.5753784179688, + "kl_loss_7": 587.8365112304688, + "learning_rate": 0.00018349377309556487, + "loss": 1123.1494, + "step": 7210 + }, + { + "ce_loss_10": 3.5153507471084593, + "ce_loss_13": 3.438252806663513, + "ce_loss_2": 4.529551863670349, + "ce_loss_3": 4.264591979980469, + "ce_loss_7": 3.6999141216278075, + "epoch": 0.722, + "grad_norm": 576.0, + "kl_loss_10": 181.94500274658202, + "kl_loss_2": 2259.3618774414062, + "kl_loss_3": 1782.3347534179688, + "kl_loss_7": 618.6104766845704, + "learning_rate": 0.00018226706694758193, + "loss": 1192.0385, + "step": 7220 + }, + { + "ce_loss_10": 3.589731001853943, + "ce_loss_13": 3.5162469148635864, + "ce_loss_2": 4.535777926445007, + "ce_loss_3": 4.275981712341308, + "ce_loss_7": 3.758218777179718, + "epoch": 0.723, + "grad_norm": 536.0, + "kl_loss_10": 176.7706611633301, + "kl_loss_2": 2136.6498046875, + "kl_loss_3": 1678.7979248046875, + "kl_loss_7": 600.344839477539, + "learning_rate": 0.0001810435603667075, + "loss": 1186.8562, + "step": 7230 + }, + { + "ce_loss_10": 3.4363317847251893, + "ce_loss_13": 3.3615066409111023, + "ce_loss_2": 4.428185939788818, + "ce_loss_3": 4.15840493440628, + "ce_loss_7": 3.6154449939727784, + "epoch": 0.724, + "grad_norm": 568.0, + "kl_loss_10": 175.6705749511719, + "kl_loss_2": 2191.837860107422, + "kl_loss_3": 1708.5897644042968, + "kl_loss_7": 600.9333648681641, + "learning_rate": 0.0001798232656736389, + "loss": 1187.3889, + "step": 7240 + }, + { + "ce_loss_10": 3.6142520189285277, + "ce_loss_13": 3.539129304885864, + "ce_loss_2": 4.541441655158996, + "ce_loss_3": 4.278818452358246, + "ce_loss_7": 3.7878984928131105, + "epoch": 0.725, + "grad_norm": 548.0, + "kl_loss_10": 176.2219985961914, + "kl_loss_2": 2082.9966674804687, + "kl_loss_3": 1618.1460876464844, + "kl_loss_7": 589.8986907958985, + "learning_rate": 0.0001786061951567303, + "loss": 1139.4487, + "step": 7250 + }, + { + "ce_loss_10": 3.528095841407776, + "ce_loss_13": 3.449831175804138, + "ce_loss_2": 4.499278616905213, + "ce_loss_3": 4.2353353023529055, + "ce_loss_7": 3.7135850310325624, + "epoch": 0.726, + "grad_norm": 564.0, + "kl_loss_10": 179.76034393310547, + "kl_loss_2": 2139.9875549316407, + "kl_loss_3": 1671.8400817871093, + "kl_loss_7": 601.5716674804687, + "learning_rate": 0.00017739236107186857, + "loss": 1166.0127, + "step": 7260 + }, + { + "ce_loss_10": 3.6185179114341737, + "ce_loss_13": 3.5442421674728393, + "ce_loss_2": 4.529335474967956, + "ce_loss_3": 4.268719971179962, + "ce_loss_7": 3.782019078731537, + "epoch": 0.727, + "grad_norm": 506.0, + "kl_loss_10": 174.4645896911621, + "kl_loss_2": 2059.7219299316407, + "kl_loss_3": 1594.1942993164062, + "kl_loss_7": 584.5985778808594, + "learning_rate": 0.00017618177564234904, + "loss": 1131.8243, + "step": 7270 + }, + { + "ce_loss_10": 3.5931476950645447, + "ce_loss_13": 3.5195810914039614, + "ce_loss_2": 4.50758855342865, + "ce_loss_3": 4.243484151363373, + "ce_loss_7": 3.7607154488563537, + "epoch": 0.728, + "grad_norm": 560.0, + "kl_loss_10": 172.751806640625, + "kl_loss_2": 2033.7148681640624, + "kl_loss_3": 1570.946112060547, + "kl_loss_7": 576.0696563720703, + "learning_rate": 0.00017497445105875377, + "loss": 1116.918, + "step": 7280 + }, + { + "ce_loss_10": 3.5072262048721314, + "ce_loss_13": 3.429379200935364, + "ce_loss_2": 4.499281525611877, + "ce_loss_3": 4.232627415657044, + "ce_loss_7": 3.695177102088928, + "epoch": 0.729, + "grad_norm": 552.0, + "kl_loss_10": 181.318611907959, + "kl_loss_2": 2210.19443359375, + "kl_loss_3": 1730.8836486816406, + "kl_loss_7": 613.139291381836, + "learning_rate": 0.000173770399478828, + "loss": 1168.2677, + "step": 7290 + }, + { + "ce_loss_10": 3.422491526603699, + "ce_loss_13": 3.347836971282959, + "ce_loss_2": 4.407784128189087, + "ce_loss_3": 4.131522953510284, + "ce_loss_7": 3.6013160228729246, + "epoch": 0.73, + "grad_norm": 560.0, + "kl_loss_10": 176.02440795898437, + "kl_loss_2": 2191.964385986328, + "kl_loss_3": 1698.5522827148438, + "kl_loss_7": 599.1639129638672, + "learning_rate": 0.0001725696330273575, + "loss": 1197.3154, + "step": 7300 + }, + { + "ce_loss_10": 3.611281132698059, + "ce_loss_13": 3.536842370033264, + "ce_loss_2": 4.535079216957092, + "ce_loss_3": 4.276083791255951, + "ce_loss_7": 3.782840621471405, + "epoch": 0.731, + "grad_norm": 608.0, + "kl_loss_10": 174.63313903808594, + "kl_loss_2": 2067.6716491699217, + "kl_loss_3": 1609.9422119140625, + "kl_loss_7": 585.3752410888671, + "learning_rate": 0.00017137216379604724, + "loss": 1120.0867, + "step": 7310 + }, + { + "ce_loss_10": 3.491976761817932, + "ce_loss_13": 3.4171910762786863, + "ce_loss_2": 4.477530479431152, + "ce_loss_3": 4.205724453926086, + "ce_loss_7": 3.667802131175995, + "epoch": 0.732, + "grad_norm": 588.0, + "kl_loss_10": 177.26437683105468, + "kl_loss_2": 2177.8865478515627, + "kl_loss_3": 1690.7372009277344, + "kl_loss_7": 596.9918426513672, + "learning_rate": 0.00017017800384339925, + "loss": 1158.1862, + "step": 7320 + }, + { + "ce_loss_10": 3.446759831905365, + "ce_loss_13": 3.3701040625572203, + "ce_loss_2": 4.4656068086624146, + "ce_loss_3": 4.189201056957245, + "ce_loss_7": 3.63215457201004, + "epoch": 0.733, + "grad_norm": 608.0, + "kl_loss_10": 179.6235023498535, + "kl_loss_2": 2245.6789367675783, + "kl_loss_3": 1758.8689514160155, + "kl_loss_7": 611.1446807861328, + "learning_rate": 0.00016898716519459073, + "loss": 1147.9725, + "step": 7330 + }, + { + "ce_loss_10": 3.5716673254966738, + "ce_loss_13": 3.4945391058921813, + "ce_loss_2": 4.573297142982483, + "ce_loss_3": 4.307599520683288, + "ce_loss_7": 3.7545908093452454, + "epoch": 0.734, + "grad_norm": 564.0, + "kl_loss_10": 182.98650054931642, + "kl_loss_2": 2208.1832763671873, + "kl_loss_3": 1733.5484619140625, + "kl_loss_7": 619.9228546142579, + "learning_rate": 0.00016779965984135375, + "loss": 1166.6811, + "step": 7340 + }, + { + "ce_loss_10": 3.478439450263977, + "ce_loss_13": 3.4015959978103636, + "ce_loss_2": 4.458614790439606, + "ce_loss_3": 4.194760942459107, + "ce_loss_7": 3.6524015784263613, + "epoch": 0.735, + "grad_norm": 612.0, + "kl_loss_10": 173.75391540527343, + "kl_loss_2": 2180.093780517578, + "kl_loss_3": 1698.9966857910156, + "kl_loss_7": 593.231803894043, + "learning_rate": 0.00016661549974185424, + "loss": 1159.2525, + "step": 7350 + }, + { + "ce_loss_10": 3.51222710609436, + "ce_loss_13": 3.4394211292266847, + "ce_loss_2": 4.489507508277893, + "ce_loss_3": 4.216231632232666, + "ce_loss_7": 3.6876235485076903, + "epoch": 0.736, + "grad_norm": 604.0, + "kl_loss_10": 179.0154716491699, + "kl_loss_2": 2169.4521362304686, + "kl_loss_3": 1690.815203857422, + "kl_loss_7": 602.3167053222656, + "learning_rate": 0.00016543469682057105, + "loss": 1143.9477, + "step": 7360 + }, + { + "ce_loss_10": 3.5415560364723206, + "ce_loss_13": 3.465597319602966, + "ce_loss_2": 4.508477449417114, + "ce_loss_3": 4.240069580078125, + "ce_loss_7": 3.7229697704315186, + "epoch": 0.737, + "grad_norm": 564.0, + "kl_loss_10": 181.52649993896483, + "kl_loss_2": 2153.332647705078, + "kl_loss_3": 1671.2495178222657, + "kl_loss_7": 610.9849151611328, + "learning_rate": 0.00016425726296817632, + "loss": 1153.5225, + "step": 7370 + }, + { + "ce_loss_10": 3.5615882515907287, + "ce_loss_13": 3.4901331782341005, + "ce_loss_2": 4.51513102054596, + "ce_loss_3": 4.248232364654541, + "ce_loss_7": 3.7354116439819336, + "epoch": 0.738, + "grad_norm": 544.0, + "kl_loss_10": 174.93305130004882, + "kl_loss_2": 2115.262805175781, + "kl_loss_3": 1640.2427490234375, + "kl_loss_7": 589.8659851074219, + "learning_rate": 0.00016308321004141607, + "loss": 1140.3666, + "step": 7380 + }, + { + "ce_loss_10": 3.518048846721649, + "ce_loss_13": 3.438374364376068, + "ce_loss_2": 4.499938416481018, + "ce_loss_3": 4.236223828792572, + "ce_loss_7": 3.695832920074463, + "epoch": 0.739, + "grad_norm": 548.0, + "kl_loss_10": 181.39317779541017, + "kl_loss_2": 2175.456677246094, + "kl_loss_3": 1701.57724609375, + "kl_loss_7": 609.4653137207031, + "learning_rate": 0.00016191254986299043, + "loss": 1150.5328, + "step": 7390 + }, + { + "ce_loss_10": 3.5613037228584288, + "ce_loss_13": 3.4887098908424377, + "ce_loss_2": 4.503171324729919, + "ce_loss_3": 4.245256781578064, + "ce_loss_7": 3.7236536622047423, + "epoch": 0.74, + "grad_norm": 680.0, + "kl_loss_10": 174.15130844116212, + "kl_loss_2": 2131.4445068359373, + "kl_loss_3": 1674.7688537597655, + "kl_loss_7": 591.5977661132813, + "learning_rate": 0.00016074529422143398, + "loss": 1164.3935, + "step": 7400 + }, + { + "ce_loss_10": 3.5027013421058655, + "ce_loss_13": 3.429375433921814, + "ce_loss_2": 4.4999552249908445, + "ce_loss_3": 4.231460630893707, + "ce_loss_7": 3.6830108165740967, + "epoch": 0.741, + "grad_norm": 736.0, + "kl_loss_10": 175.83671493530272, + "kl_loss_2": 2196.9726989746096, + "kl_loss_3": 1720.8603271484376, + "kl_loss_7": 599.7947540283203, + "learning_rate": 0.0001595814548709983, + "loss": 1180.4217, + "step": 7410 + }, + { + "ce_loss_10": 3.576788854598999, + "ce_loss_13": 3.498660683631897, + "ce_loss_2": 4.549895691871643, + "ce_loss_3": 4.287308168411255, + "ce_loss_7": 3.7568582773208616, + "epoch": 0.742, + "grad_norm": 556.0, + "kl_loss_10": 181.97546997070313, + "kl_loss_2": 2178.1142333984376, + "kl_loss_3": 1714.0122802734375, + "kl_loss_7": 610.0974487304687, + "learning_rate": 0.00015842104353153285, + "loss": 1164.6248, + "step": 7420 + }, + { + "ce_loss_10": 3.5943754434585573, + "ce_loss_13": 3.5180631637573243, + "ce_loss_2": 4.549882531166077, + "ce_loss_3": 4.288905811309815, + "ce_loss_7": 3.7695993304252626, + "epoch": 0.743, + "grad_norm": 548.0, + "kl_loss_10": 179.57282943725585, + "kl_loss_2": 2154.6554992675783, + "kl_loss_3": 1684.7554626464844, + "kl_loss_7": 607.9901489257812, + "learning_rate": 0.0001572640718883667, + "loss": 1181.4139, + "step": 7430 + }, + { + "ce_loss_10": 3.5268728017807005, + "ce_loss_13": 3.454422962665558, + "ce_loss_2": 4.4702025055885315, + "ce_loss_3": 4.211061191558838, + "ce_loss_7": 3.699466872215271, + "epoch": 0.744, + "grad_norm": 544.0, + "kl_loss_10": 173.9086715698242, + "kl_loss_2": 2107.2433349609373, + "kl_loss_3": 1643.049658203125, + "kl_loss_7": 587.0272888183594, + "learning_rate": 0.0001561105515921915, + "loss": 1164.3465, + "step": 7440 + }, + { + "ce_loss_10": 3.376306939125061, + "ce_loss_13": 3.3052693247795104, + "ce_loss_2": 4.399094796180725, + "ce_loss_3": 4.130729305744171, + "ce_loss_7": 3.5687609910964966, + "epoch": 0.745, + "grad_norm": 540.0, + "kl_loss_10": 174.5767349243164, + "kl_loss_2": 2266.337322998047, + "kl_loss_3": 1780.6517333984375, + "kl_loss_7": 616.0360229492187, + "learning_rate": 0.0001549604942589441, + "loss": 1163.9994, + "step": 7450 + }, + { + "ce_loss_10": 3.5653053879737855, + "ce_loss_13": 3.493623507022858, + "ce_loss_2": 4.478042149543763, + "ce_loss_3": 4.218795919418335, + "ce_loss_7": 3.731026256084442, + "epoch": 0.746, + "grad_norm": 580.0, + "kl_loss_10": 170.22484588623047, + "kl_loss_2": 2039.5635498046875, + "kl_loss_3": 1579.8051452636719, + "kl_loss_7": 567.1303924560547, + "learning_rate": 0.00015381391146968864, + "loss": 1115.5928, + "step": 7460 + }, + { + "ce_loss_10": 3.5406330108642576, + "ce_loss_13": 3.4665817737579347, + "ce_loss_2": 4.507574367523193, + "ce_loss_3": 4.2422141313552855, + "ce_loss_7": 3.711947810649872, + "epoch": 0.747, + "grad_norm": 576.0, + "kl_loss_10": 173.49912338256837, + "kl_loss_2": 2137.4108154296873, + "kl_loss_3": 1666.1288146972656, + "kl_loss_7": 586.0349029541015, + "learning_rate": 0.00015267081477050133, + "loss": 1153.2315, + "step": 7470 + }, + { + "ce_loss_10": 3.6397757053375246, + "ce_loss_13": 3.565910828113556, + "ce_loss_2": 4.558345174789428, + "ce_loss_3": 4.3014825820922855, + "ce_loss_7": 3.813869845867157, + "epoch": 0.748, + "grad_norm": 524.0, + "kl_loss_10": 179.69472961425782, + "kl_loss_2": 2081.7364990234373, + "kl_loss_3": 1619.6140869140625, + "kl_loss_7": 597.4245025634766, + "learning_rate": 0.00015153121567235335, + "loss": 1120.7676, + "step": 7480 + }, + { + "ce_loss_10": 3.529373216629028, + "ce_loss_13": 3.454616332054138, + "ce_loss_2": 4.507159662246704, + "ce_loss_3": 4.2369110703468325, + "ce_loss_7": 3.7003498554229735, + "epoch": 0.749, + "grad_norm": 596.0, + "kl_loss_10": 178.19662170410157, + "kl_loss_2": 2201.308489990234, + "kl_loss_3": 1718.668115234375, + "kl_loss_7": 600.4444549560546, + "learning_rate": 0.00015039512565099468, + "loss": 1130.487, + "step": 7490 + }, + { + "ce_loss_10": 3.59435373544693, + "ce_loss_13": 3.5217554926872254, + "ce_loss_2": 4.542114019393921, + "ce_loss_3": 4.2746872186660765, + "ce_loss_7": 3.768599247932434, + "epoch": 0.75, + "grad_norm": 532.0, + "kl_loss_10": 177.41806030273438, + "kl_loss_2": 2130.0947509765624, + "kl_loss_3": 1653.6429382324218, + "kl_loss_7": 598.9670806884766, + "learning_rate": 0.00014926255614683932, + "loss": 1188.0775, + "step": 7500 + }, + { + "ce_loss_10": 3.5343728065490723, + "ce_loss_13": 3.462270963191986, + "ce_loss_2": 4.491153955459595, + "ce_loss_3": 4.2244093179702755, + "ce_loss_7": 3.70688259601593, + "epoch": 0.751, + "grad_norm": 584.0, + "kl_loss_10": 175.2909957885742, + "kl_loss_2": 2134.823455810547, + "kl_loss_3": 1661.539990234375, + "kl_loss_7": 592.2256774902344, + "learning_rate": 0.0001481335185648498, + "loss": 1152.3602, + "step": 7510 + }, + { + "ce_loss_10": 3.5509208917617796, + "ce_loss_13": 3.4760316491127012, + "ce_loss_2": 4.4910846710205075, + "ce_loss_3": 4.236609256267547, + "ce_loss_7": 3.7286911368370057, + "epoch": 0.752, + "grad_norm": 560.0, + "kl_loss_10": 175.903653717041, + "kl_loss_2": 2132.4962768554688, + "kl_loss_3": 1669.2187805175781, + "kl_loss_7": 598.009976196289, + "learning_rate": 0.0001470080242744218, + "loss": 1135.5451, + "step": 7520 + }, + { + "ce_loss_10": 3.5404749631881716, + "ce_loss_13": 3.4668622732162477, + "ce_loss_2": 4.505393123626709, + "ce_loss_3": 4.248504590988159, + "ce_loss_7": 3.7097239255905152, + "epoch": 0.753, + "grad_norm": 600.0, + "kl_loss_10": 172.68473205566406, + "kl_loss_2": 2143.0695861816407, + "kl_loss_3": 1687.700439453125, + "kl_loss_7": 591.5756866455079, + "learning_rate": 0.0001458860846092705, + "loss": 1151.0906, + "step": 7530 + }, + { + "ce_loss_10": 3.578909718990326, + "ce_loss_13": 3.503495466709137, + "ce_loss_2": 4.502352619171143, + "ce_loss_3": 4.240141928195953, + "ce_loss_7": 3.750500977039337, + "epoch": 0.754, + "grad_norm": 604.0, + "kl_loss_10": 174.89483642578125, + "kl_loss_2": 2075.617956542969, + "kl_loss_3": 1612.4501708984376, + "kl_loss_7": 588.5097457885743, + "learning_rate": 0.00014476771086731566, + "loss": 1116.6235, + "step": 7540 + }, + { + "ce_loss_10": 3.688204324245453, + "ce_loss_13": 3.610430431365967, + "ce_loss_2": 4.621451306343078, + "ce_loss_3": 4.3530316829681395, + "ce_loss_7": 3.8562689661979674, + "epoch": 0.755, + "grad_norm": 572.0, + "kl_loss_10": 181.31634902954102, + "kl_loss_2": 2096.732080078125, + "kl_loss_3": 1625.2518310546875, + "kl_loss_7": 592.3898040771485, + "learning_rate": 0.00014365291431056872, + "loss": 1170.6359, + "step": 7550 + }, + { + "ce_loss_10": 3.513639771938324, + "ce_loss_13": 3.43876428604126, + "ce_loss_2": 4.494768452644348, + "ce_loss_3": 4.226865899562836, + "ce_loss_7": 3.6938853025436402, + "epoch": 0.756, + "grad_norm": 648.0, + "kl_loss_10": 182.01916885375977, + "kl_loss_2": 2211.1534912109373, + "kl_loss_3": 1723.5334899902343, + "kl_loss_7": 617.1691345214844, + "learning_rate": 0.00014254170616501827, + "loss": 1163.1255, + "step": 7560 + }, + { + "ce_loss_10": 3.4477534770965574, + "ce_loss_13": 3.3702123761177063, + "ce_loss_2": 4.465814185142517, + "ce_loss_3": 4.193384432792664, + "ce_loss_7": 3.6376350045204164, + "epoch": 0.757, + "grad_norm": 652.0, + "kl_loss_10": 181.91958312988282, + "kl_loss_2": 2272.9578369140627, + "kl_loss_3": 1780.6268127441406, + "kl_loss_7": 631.3283477783203, + "learning_rate": 0.0001414340976205183, + "loss": 1210.6553, + "step": 7570 + }, + { + "ce_loss_10": 3.4623551964759827, + "ce_loss_13": 3.386858320236206, + "ce_loss_2": 4.47217173576355, + "ce_loss_3": 4.196212124824524, + "ce_loss_7": 3.6454687833786013, + "epoch": 0.758, + "grad_norm": 652.0, + "kl_loss_10": 175.49118347167968, + "kl_loss_2": 2225.9182312011717, + "kl_loss_3": 1743.3149719238281, + "kl_loss_7": 604.4145355224609, + "learning_rate": 0.00014033009983067452, + "loss": 1165.3377, + "step": 7580 + }, + { + "ce_loss_10": 3.625165855884552, + "ce_loss_13": 3.553388500213623, + "ce_loss_2": 4.5477535963058475, + "ce_loss_3": 4.282978129386902, + "ce_loss_7": 3.790937566757202, + "epoch": 0.759, + "grad_norm": 540.0, + "kl_loss_10": 173.22186889648438, + "kl_loss_2": 2076.229632568359, + "kl_loss_3": 1605.808331298828, + "kl_loss_7": 578.3152954101563, + "learning_rate": 0.00013922972391273224, + "loss": 1124.4209, + "step": 7590 + }, + { + "ce_loss_10": 3.624656689167023, + "ce_loss_13": 3.5520288705825807, + "ce_loss_2": 4.581440138816833, + "ce_loss_3": 4.323860204219818, + "ce_loss_7": 3.799424684047699, + "epoch": 0.76, + "grad_norm": 604.0, + "kl_loss_10": 176.8631507873535, + "kl_loss_2": 2111.316943359375, + "kl_loss_3": 1657.235821533203, + "kl_loss_7": 591.5200927734375, + "learning_rate": 0.0001381329809474649, + "loss": 1146.3586, + "step": 7600 + }, + { + "ce_loss_10": 3.532001996040344, + "ce_loss_13": 3.4530585527420046, + "ce_loss_2": 4.544336724281311, + "ce_loss_3": 4.269702458381653, + "ce_loss_7": 3.7165846705436705, + "epoch": 0.761, + "grad_norm": 632.0, + "kl_loss_10": 181.38144760131837, + "kl_loss_2": 2247.282580566406, + "kl_loss_3": 1759.4795043945312, + "kl_loss_7": 616.4709213256835, + "learning_rate": 0.0001370398819790621, + "loss": 1186.2754, + "step": 7610 + }, + { + "ce_loss_10": 3.6697842359542845, + "ce_loss_13": 3.5929376244544984, + "ce_loss_2": 4.604382491111755, + "ce_loss_3": 4.336557102203369, + "ce_loss_7": 3.8371459245681763, + "epoch": 0.762, + "grad_norm": 612.0, + "kl_loss_10": 176.96341781616212, + "kl_loss_2": 2080.2568908691405, + "kl_loss_3": 1604.4097290039062, + "kl_loss_7": 582.9634078979492, + "learning_rate": 0.00013595043801501794, + "loss": 1108.4416, + "step": 7620 + }, + { + "ce_loss_10": 3.4644748091697695, + "ce_loss_13": 3.386727011203766, + "ce_loss_2": 4.503179264068604, + "ce_loss_3": 4.235963094234466, + "ce_loss_7": 3.650843346118927, + "epoch": 0.763, + "grad_norm": 664.0, + "kl_loss_10": 180.12555694580078, + "kl_loss_2": 2289.173895263672, + "kl_loss_3": 1815.476171875, + "kl_loss_7": 622.1785308837891, + "learning_rate": 0.00013486466002602133, + "loss": 1194.0496, + "step": 7630 + }, + { + "ce_loss_10": 3.577344560623169, + "ce_loss_13": 3.503310763835907, + "ce_loss_2": 4.512240695953369, + "ce_loss_3": 4.2521095991134645, + "ce_loss_7": 3.7476378440856934, + "epoch": 0.764, + "grad_norm": 556.0, + "kl_loss_10": 175.91430206298827, + "kl_loss_2": 2097.193493652344, + "kl_loss_3": 1632.0237731933594, + "kl_loss_7": 587.4400573730469, + "learning_rate": 0.00013378255894584462, + "loss": 1166.6646, + "step": 7640 + }, + { + "ce_loss_10": 3.5123034000396727, + "ce_loss_13": 3.433635425567627, + "ce_loss_2": 4.500353503227234, + "ce_loss_3": 4.2323464274406435, + "ce_loss_7": 3.6943077445030212, + "epoch": 0.765, + "grad_norm": 560.0, + "kl_loss_10": 181.23019485473634, + "kl_loss_2": 2206.8176452636717, + "kl_loss_3": 1726.0316467285156, + "kl_loss_7": 608.9391540527344, + "learning_rate": 0.0001327041456712334, + "loss": 1171.7679, + "step": 7650 + }, + { + "ce_loss_10": 3.55541011095047, + "ce_loss_13": 3.477762734889984, + "ce_loss_2": 4.513465809822082, + "ce_loss_3": 4.241680002212524, + "ce_loss_7": 3.7298651814460753, + "epoch": 0.766, + "grad_norm": 544.0, + "kl_loss_10": 180.71754302978516, + "kl_loss_2": 2169.4558044433593, + "kl_loss_3": 1686.6506469726562, + "kl_loss_7": 611.8586975097656, + "learning_rate": 0.00013162943106179747, + "loss": 1171.1721, + "step": 7660 + }, + { + "ce_loss_10": 3.5293742179870606, + "ce_loss_13": 3.456415057182312, + "ce_loss_2": 4.477925181388855, + "ce_loss_3": 4.21549437046051, + "ce_loss_7": 3.7069293022155763, + "epoch": 0.767, + "grad_norm": 588.0, + "kl_loss_10": 176.59339599609376, + "kl_loss_2": 2121.027722167969, + "kl_loss_3": 1652.0191345214844, + "kl_loss_7": 595.7734832763672, + "learning_rate": 0.00013055842593990132, + "loss": 1142.6258, + "step": 7670 + }, + { + "ce_loss_10": 3.4710524678230286, + "ce_loss_13": 3.399113714694977, + "ce_loss_2": 4.434552907943726, + "ce_loss_3": 4.163622748851776, + "ce_loss_7": 3.6474674701690675, + "epoch": 0.768, + "grad_norm": 540.0, + "kl_loss_10": 174.6668930053711, + "kl_loss_2": 2126.9706176757813, + "kl_loss_3": 1655.3524780273438, + "kl_loss_7": 590.6725830078125, + "learning_rate": 0.00012949114109055414, + "loss": 1168.1568, + "step": 7680 + }, + { + "ce_loss_10": 3.519006776809692, + "ce_loss_13": 3.4431997537612915, + "ce_loss_2": 4.4897076964378355, + "ce_loss_3": 4.226867043972016, + "ce_loss_7": 3.6994680523872376, + "epoch": 0.769, + "grad_norm": 584.0, + "kl_loss_10": 177.6523193359375, + "kl_loss_2": 2161.8881958007814, + "kl_loss_3": 1689.2801330566406, + "kl_loss_7": 607.9943145751953, + "learning_rate": 0.00012842758726130281, + "loss": 1170.0649, + "step": 7690 + }, + { + "ce_loss_10": 3.5628538727760315, + "ce_loss_13": 3.485966920852661, + "ce_loss_2": 4.55888135433197, + "ce_loss_3": 4.292485213279724, + "ce_loss_7": 3.7444689750671385, + "epoch": 0.77, + "grad_norm": 580.0, + "kl_loss_10": 179.38818130493163, + "kl_loss_2": 2210.8425048828126, + "kl_loss_3": 1733.9449951171875, + "kl_loss_7": 610.3931243896484, + "learning_rate": 0.00012736777516212267, + "loss": 1160.5377, + "step": 7700 + }, + { + "ce_loss_10": 3.557508039474487, + "ce_loss_13": 3.4799222111701966, + "ce_loss_2": 4.522961139678955, + "ce_loss_3": 4.253420984745025, + "ce_loss_7": 3.736038076877594, + "epoch": 0.771, + "grad_norm": 548.0, + "kl_loss_10": 181.55507125854493, + "kl_loss_2": 2158.760382080078, + "kl_loss_3": 1679.4241821289063, + "kl_loss_7": 612.0054595947265, + "learning_rate": 0.00012631171546530968, + "loss": 1138.0437, + "step": 7710 + }, + { + "ce_loss_10": 3.573415291309357, + "ce_loss_13": 3.4920427322387697, + "ce_loss_2": 4.5341356039047245, + "ce_loss_3": 4.2658631801605225, + "ce_loss_7": 3.752500355243683, + "epoch": 0.772, + "grad_norm": 568.0, + "kl_loss_10": 181.86062927246093, + "kl_loss_2": 2149.480059814453, + "kl_loss_3": 1673.6519470214844, + "kl_loss_7": 603.6808334350586, + "learning_rate": 0.00012525941880537307, + "loss": 1168.6842, + "step": 7720 + }, + { + "ce_loss_10": 3.6038484454154966, + "ce_loss_13": 3.528382158279419, + "ce_loss_2": 4.546409988403321, + "ce_loss_3": 4.28290638923645, + "ce_loss_7": 3.774893271923065, + "epoch": 0.773, + "grad_norm": 648.0, + "kl_loss_10": 176.13294677734376, + "kl_loss_2": 2093.1892028808593, + "kl_loss_3": 1628.6803955078126, + "kl_loss_7": 588.4353439331055, + "learning_rate": 0.00012421089577892869, + "loss": 1139.2071, + "step": 7730 + }, + { + "ce_loss_10": 3.555491530895233, + "ce_loss_13": 3.4761422514915465, + "ce_loss_2": 4.545820116996765, + "ce_loss_3": 4.266150867938995, + "ce_loss_7": 3.7351402401924134, + "epoch": 0.774, + "grad_norm": 668.0, + "kl_loss_10": 179.60176849365234, + "kl_loss_2": 2216.0720031738283, + "kl_loss_3": 1715.4457092285156, + "kl_loss_7": 609.4783508300782, + "learning_rate": 0.0001231661569445919, + "loss": 1172.4699, + "step": 7740 + }, + { + "ce_loss_10": 3.410160577297211, + "ce_loss_13": 3.3377888798713684, + "ce_loss_2": 4.401880002021789, + "ce_loss_3": 4.1333277225494385, + "ce_loss_7": 3.589407229423523, + "epoch": 0.775, + "grad_norm": 560.0, + "kl_loss_10": 176.03026962280273, + "kl_loss_2": 2206.2500732421877, + "kl_loss_3": 1718.6787414550781, + "kl_loss_7": 601.0735870361328, + "learning_rate": 0.00012212521282287093, + "loss": 1191.8578, + "step": 7750 + }, + { + "ce_loss_10": 3.5700145840644835, + "ce_loss_13": 3.493156003952026, + "ce_loss_2": 4.517843317985535, + "ce_loss_3": 4.254901158809662, + "ce_loss_7": 3.748375141620636, + "epoch": 0.776, + "grad_norm": 536.0, + "kl_loss_10": 180.4297233581543, + "kl_loss_2": 2117.2120727539063, + "kl_loss_3": 1651.861083984375, + "kl_loss_7": 599.0601287841797, + "learning_rate": 0.00012108807389606158, + "loss": 1171.4985, + "step": 7760 + }, + { + "ce_loss_10": 3.5604520797729493, + "ce_loss_13": 3.4879041433334352, + "ce_loss_2": 4.51350736618042, + "ce_loss_3": 4.255460405349732, + "ce_loss_7": 3.737185871601105, + "epoch": 0.777, + "grad_norm": 624.0, + "kl_loss_10": 173.82694396972656, + "kl_loss_2": 2134.6752502441404, + "kl_loss_3": 1670.9033142089843, + "kl_loss_7": 592.1039428710938, + "learning_rate": 0.00012005475060814159, + "loss": 1139.6322, + "step": 7770 + }, + { + "ce_loss_10": 3.5012547731399537, + "ce_loss_13": 3.4265154361724854, + "ce_loss_2": 4.493270707130432, + "ce_loss_3": 4.232757782936096, + "ce_loss_7": 3.676969814300537, + "epoch": 0.778, + "grad_norm": 592.0, + "kl_loss_10": 178.45665435791017, + "kl_loss_2": 2218.041455078125, + "kl_loss_3": 1749.4460815429688, + "kl_loss_7": 609.0319793701171, + "learning_rate": 0.00011902525336466464, + "loss": 1173.4994, + "step": 7780 + }, + { + "ce_loss_10": 3.487755036354065, + "ce_loss_13": 3.40771107673645, + "ce_loss_2": 4.503837430477143, + "ce_loss_3": 4.227473521232605, + "ce_loss_7": 3.6715272665023804, + "epoch": 0.779, + "grad_norm": 556.0, + "kl_loss_10": 182.97367095947266, + "kl_loss_2": 2253.76220703125, + "kl_loss_3": 1756.6077819824218, + "kl_loss_7": 618.8783203125, + "learning_rate": 0.00011799959253265668, + "loss": 1168.3436, + "step": 7790 + }, + { + "ce_loss_10": 3.548134469985962, + "ce_loss_13": 3.4717200636863708, + "ce_loss_2": 4.531218719482422, + "ce_loss_3": 4.259068071842194, + "ce_loss_7": 3.725462853908539, + "epoch": 0.78, + "grad_norm": 588.0, + "kl_loss_10": 179.5894790649414, + "kl_loss_2": 2197.6992370605467, + "kl_loss_3": 1714.0868286132813, + "kl_loss_7": 606.9559936523438, + "learning_rate": 0.00011697777844051105, + "loss": 1168.1586, + "step": 7800 + }, + { + "ce_loss_10": 3.5325579047203064, + "ce_loss_13": 3.4524773359298706, + "ce_loss_2": 4.540277624130249, + "ce_loss_3": 4.275496506690979, + "ce_loss_7": 3.709948194026947, + "epoch": 0.781, + "grad_norm": 600.0, + "kl_loss_10": 182.08444366455078, + "kl_loss_2": 2253.191998291016, + "kl_loss_3": 1783.90888671875, + "kl_loss_7": 609.2252471923828, + "learning_rate": 0.00011595982137788402, + "loss": 1182.0791, + "step": 7810 + }, + { + "ce_loss_10": 3.507384693622589, + "ce_loss_13": 3.433124232292175, + "ce_loss_2": 4.452573490142822, + "ce_loss_3": 4.191938650608063, + "ce_loss_7": 3.6804782152175903, + "epoch": 0.782, + "grad_norm": 552.0, + "kl_loss_10": 174.53733520507814, + "kl_loss_2": 2103.1436462402344, + "kl_loss_3": 1636.1021728515625, + "kl_loss_7": 594.1032348632813, + "learning_rate": 0.00011494573159559212, + "loss": 1150.1953, + "step": 7820 + }, + { + "ce_loss_10": 3.495812237262726, + "ce_loss_13": 3.4193639039993284, + "ce_loss_2": 4.4669132947921755, + "ce_loss_3": 4.2113652467727665, + "ce_loss_7": 3.67316712141037, + "epoch": 0.783, + "grad_norm": 572.0, + "kl_loss_10": 178.65593719482422, + "kl_loss_2": 2173.239221191406, + "kl_loss_3": 1708.3340942382813, + "kl_loss_7": 603.083627319336, + "learning_rate": 0.00011393551930550828, + "loss": 1187.9246, + "step": 7830 + }, + { + "ce_loss_10": 3.6368354201316833, + "ce_loss_13": 3.559674692153931, + "ce_loss_2": 4.571843910217285, + "ce_loss_3": 4.303619515895844, + "ce_loss_7": 3.8069366455078124, + "epoch": 0.784, + "grad_norm": 588.0, + "kl_loss_10": 179.06233749389648, + "kl_loss_2": 2120.6896240234373, + "kl_loss_3": 1638.9197570800782, + "kl_loss_7": 595.7463287353515, + "learning_rate": 0.00011292919468045875, + "loss": 1145.6585, + "step": 7840 + }, + { + "ce_loss_10": 3.584019410610199, + "ce_loss_13": 3.5086099743843078, + "ce_loss_2": 4.53436963558197, + "ce_loss_3": 4.271309959888458, + "ce_loss_7": 3.7602915167808533, + "epoch": 0.785, + "grad_norm": 528.0, + "kl_loss_10": 177.57644500732422, + "kl_loss_2": 2126.9706481933595, + "kl_loss_3": 1654.4735168457032, + "kl_loss_7": 600.0492980957031, + "learning_rate": 0.00011192676785412154, + "loss": 1144.0532, + "step": 7850 + }, + { + "ce_loss_10": 3.522589087486267, + "ce_loss_13": 3.4456050395965576, + "ce_loss_2": 4.529689431190491, + "ce_loss_3": 4.258461606502533, + "ce_loss_7": 3.704596519470215, + "epoch": 0.786, + "grad_norm": 624.0, + "kl_loss_10": 178.9210517883301, + "kl_loss_2": 2216.430499267578, + "kl_loss_3": 1733.1487976074218, + "kl_loss_7": 602.0237121582031, + "learning_rate": 0.00011092824892092374, + "loss": 1161.7434, + "step": 7860 + }, + { + "ce_loss_10": 3.454429876804352, + "ce_loss_13": 3.376889729499817, + "ce_loss_2": 4.473304414749146, + "ce_loss_3": 4.201044774055481, + "ce_loss_7": 3.6403449535369874, + "epoch": 0.787, + "grad_norm": 544.0, + "kl_loss_10": 178.50691452026368, + "kl_loss_2": 2241.591131591797, + "kl_loss_3": 1762.5004089355468, + "kl_loss_7": 614.9870758056641, + "learning_rate": 0.0001099336479359398, + "loss": 1163.7643, + "step": 7870 + }, + { + "ce_loss_10": 3.5764689803123475, + "ce_loss_13": 3.507636034488678, + "ce_loss_2": 4.512799096107483, + "ce_loss_3": 4.25046044588089, + "ce_loss_7": 3.746009385585785, + "epoch": 0.788, + "grad_norm": 564.0, + "kl_loss_10": 175.3071716308594, + "kl_loss_2": 2102.777294921875, + "kl_loss_3": 1634.8632263183595, + "kl_loss_7": 592.2164337158204, + "learning_rate": 0.00010894297491479043, + "loss": 1142.6834, + "step": 7880 + }, + { + "ce_loss_10": 3.575552821159363, + "ce_loss_13": 3.5023175954818724, + "ce_loss_2": 4.539198517799377, + "ce_loss_3": 4.279193782806397, + "ce_loss_7": 3.750091075897217, + "epoch": 0.789, + "grad_norm": 576.0, + "kl_loss_10": 176.76428680419923, + "kl_loss_2": 2146.3808166503904, + "kl_loss_3": 1681.0488159179688, + "kl_loss_7": 595.56142578125, + "learning_rate": 0.00010795623983354214, + "loss": 1139.8293, + "step": 7890 + }, + { + "ce_loss_10": 3.4591768264770506, + "ce_loss_13": 3.3825891733169557, + "ce_loss_2": 4.4514943838119505, + "ce_loss_3": 4.181638932228088, + "ce_loss_7": 3.643087315559387, + "epoch": 0.79, + "grad_norm": 580.0, + "kl_loss_10": 182.44262008666993, + "kl_loss_2": 2230.9637817382813, + "kl_loss_3": 1740.3924072265625, + "kl_loss_7": 621.2922943115234, + "learning_rate": 0.00010697345262860636, + "loss": 1171.6089, + "step": 7900 + }, + { + "ce_loss_10": 3.600342130661011, + "ce_loss_13": 3.5264546155929564, + "ce_loss_2": 4.545495390892029, + "ce_loss_3": 4.278535521030426, + "ce_loss_7": 3.771434724330902, + "epoch": 0.791, + "grad_norm": 736.0, + "kl_loss_10": 177.22287063598634, + "kl_loss_2": 2132.298791503906, + "kl_loss_3": 1650.1429077148437, + "kl_loss_7": 593.3167419433594, + "learning_rate": 0.00010599462319663906, + "loss": 1136.3734, + "step": 7910 + }, + { + "ce_loss_10": 3.5746383547782896, + "ce_loss_13": 3.4998196601867675, + "ce_loss_2": 4.493499338626862, + "ce_loss_3": 4.230222713947296, + "ce_loss_7": 3.7425215244293213, + "epoch": 0.792, + "grad_norm": 520.0, + "kl_loss_10": 174.49715042114258, + "kl_loss_2": 2051.4084716796874, + "kl_loss_3": 1592.7509643554688, + "kl_loss_7": 582.2202606201172, + "learning_rate": 0.00010501976139444191, + "loss": 1118.4902, + "step": 7920 + }, + { + "ce_loss_10": 3.6047690868377686, + "ce_loss_13": 3.5289911150932314, + "ce_loss_2": 4.545255088806153, + "ce_loss_3": 4.2847788572311405, + "ce_loss_7": 3.7748185992240906, + "epoch": 0.793, + "grad_norm": 604.0, + "kl_loss_10": 176.07794952392578, + "kl_loss_2": 2104.0453186035156, + "kl_loss_3": 1645.7491271972656, + "kl_loss_7": 587.9952331542969, + "learning_rate": 0.0001040488770388625, + "loss": 1154.1295, + "step": 7930 + }, + { + "ce_loss_10": 3.548888790607452, + "ce_loss_13": 3.4759244203567503, + "ce_loss_2": 4.515872287750244, + "ce_loss_3": 4.250580382347107, + "ce_loss_7": 3.7205033540725707, + "epoch": 0.794, + "grad_norm": 680.0, + "kl_loss_10": 177.23135299682616, + "kl_loss_2": 2173.47548828125, + "kl_loss_3": 1700.3454467773438, + "kl_loss_7": 599.390249633789, + "learning_rate": 0.00010308197990669538, + "loss": 1149.7575, + "step": 7940 + }, + { + "ce_loss_10": 3.664888024330139, + "ce_loss_13": 3.5850353479385375, + "ce_loss_2": 4.610143923759461, + "ce_loss_3": 4.346996653079986, + "ce_loss_7": 3.83613098859787, + "epoch": 0.795, + "grad_norm": 540.0, + "kl_loss_10": 179.47011337280273, + "kl_loss_2": 2129.81064453125, + "kl_loss_3": 1662.4896118164063, + "kl_loss_7": 599.4926940917969, + "learning_rate": 0.0001021190797345839, + "loss": 1140.0146, + "step": 7950 + }, + { + "ce_loss_10": 3.3896429777145385, + "ce_loss_13": 3.3098750829696657, + "ce_loss_2": 4.413335740566254, + "ce_loss_3": 4.134794509410858, + "ce_loss_7": 3.57609201669693, + "epoch": 0.796, + "grad_norm": 580.0, + "kl_loss_10": 185.28996887207032, + "kl_loss_2": 2269.882763671875, + "kl_loss_3": 1772.13056640625, + "kl_loss_7": 628.5536560058594, + "learning_rate": 0.00010116018621892236, + "loss": 1175.2182, + "step": 7960 + }, + { + "ce_loss_10": 3.603187918663025, + "ce_loss_13": 3.5232182860374452, + "ce_loss_2": 4.567694234848022, + "ce_loss_3": 4.309406304359436, + "ce_loss_7": 3.779837393760681, + "epoch": 0.797, + "grad_norm": 608.0, + "kl_loss_10": 186.1454734802246, + "kl_loss_2": 2165.1848999023437, + "kl_loss_3": 1705.360009765625, + "kl_loss_7": 616.0535675048828, + "learning_rate": 0.00010020530901575753, + "loss": 1136.0533, + "step": 7970 + }, + { + "ce_loss_10": 3.625728499889374, + "ce_loss_13": 3.5490816116333006, + "ce_loss_2": 4.573475480079651, + "ce_loss_3": 4.304804050922394, + "ce_loss_7": 3.799483132362366, + "epoch": 0.798, + "grad_norm": 520.0, + "kl_loss_10": 180.7791946411133, + "kl_loss_2": 2134.7111938476564, + "kl_loss_3": 1658.3630126953126, + "kl_loss_7": 601.8699676513672, + "learning_rate": 9.925445774069231e-05, + "loss": 1126.8894, + "step": 7980 + }, + { + "ce_loss_10": 3.5760633826255797, + "ce_loss_13": 3.500509262084961, + "ce_loss_2": 4.527716112136841, + "ce_loss_3": 4.2627903580665585, + "ce_loss_7": 3.754010498523712, + "epoch": 0.799, + "grad_norm": 728.0, + "kl_loss_10": 177.96156311035156, + "kl_loss_2": 2117.894659423828, + "kl_loss_3": 1646.0127319335938, + "kl_loss_7": 595.1834564208984, + "learning_rate": 9.830764196878872e-05, + "loss": 1125.6953, + "step": 7990 + }, + { + "ce_loss_10": 3.5167272210121157, + "ce_loss_13": 3.443312036991119, + "ce_loss_2": 4.485463619232178, + "ce_loss_3": 4.227659916877746, + "ce_loss_7": 3.6942790031433104, + "epoch": 0.8, + "grad_norm": 480.0, + "kl_loss_10": 175.60029678344728, + "kl_loss_2": 2190.2330810546873, + "kl_loss_3": 1721.7053161621093, + "kl_loss_7": 603.3801788330078, + "learning_rate": 9.736487123447069e-05, + "loss": 1159.6166, + "step": 8000 + }, + { + "ce_loss_10": 3.4639697551727293, + "ce_loss_13": 3.389075720310211, + "ce_loss_2": 4.485336112976074, + "ce_loss_3": 4.228267467021942, + "ce_loss_7": 3.6402989268302917, + "epoch": 0.801, + "grad_norm": 600.0, + "kl_loss_10": 179.87705307006837, + "kl_loss_2": 2294.8387084960937, + "kl_loss_3": 1823.8187927246095, + "kl_loss_7": 608.9864349365234, + "learning_rate": 9.642615503142926e-05, + "loss": 1194.0703, + "step": 8010 + }, + { + "ce_loss_10": 3.5347692489624025, + "ce_loss_13": 3.4572302103042603, + "ce_loss_2": 4.520514702796936, + "ce_loss_3": 4.254623317718506, + "ce_loss_7": 3.7080873131752012, + "epoch": 0.802, + "grad_norm": 572.0, + "kl_loss_10": 175.5455764770508, + "kl_loss_2": 2196.378839111328, + "kl_loss_3": 1715.636651611328, + "kl_loss_7": 596.8724548339844, + "learning_rate": 9.549150281252633e-05, + "loss": 1151.6992, + "step": 8020 + }, + { + "ce_loss_10": 3.563262867927551, + "ce_loss_13": 3.486225724220276, + "ce_loss_2": 4.527439785003662, + "ce_loss_3": 4.256748235225677, + "ce_loss_7": 3.7390462875366213, + "epoch": 0.803, + "grad_norm": 520.0, + "kl_loss_10": 179.0418388366699, + "kl_loss_2": 2160.1055908203125, + "kl_loss_3": 1681.3586303710938, + "kl_loss_7": 596.7608947753906, + "learning_rate": 9.4560923989699e-05, + "loss": 1169.5601, + "step": 8030 + }, + { + "ce_loss_10": 3.549173581600189, + "ce_loss_13": 3.4747613072395325, + "ce_loss_2": 4.515510749816895, + "ce_loss_3": 4.245619797706604, + "ce_loss_7": 3.7281826019287108, + "epoch": 0.804, + "grad_norm": 552.0, + "kl_loss_10": 177.9036865234375, + "kl_loss_2": 2149.996447753906, + "kl_loss_3": 1673.0265747070312, + "kl_loss_7": 598.6687103271485, + "learning_rate": 9.363442793386607e-05, + "loss": 1174.7094, + "step": 8040 + }, + { + "ce_loss_10": 3.5321462750434875, + "ce_loss_13": 3.453168177604675, + "ce_loss_2": 4.5332019329071045, + "ce_loss_3": 4.265519142150879, + "ce_loss_7": 3.7162665724754333, + "epoch": 0.805, + "grad_norm": 592.0, + "kl_loss_10": 181.04829177856445, + "kl_loss_2": 2218.4782836914064, + "kl_loss_3": 1732.6318908691405, + "kl_loss_7": 617.0633758544922, + "learning_rate": 9.271202397483213e-05, + "loss": 1149.8916, + "step": 8050 + }, + { + "ce_loss_10": 3.547755253314972, + "ce_loss_13": 3.474861478805542, + "ce_loss_2": 4.498465514183044, + "ce_loss_3": 4.235939025878906, + "ce_loss_7": 3.7163867115974427, + "epoch": 0.806, + "grad_norm": 572.0, + "kl_loss_10": 175.92396697998046, + "kl_loss_2": 2136.2159729003906, + "kl_loss_3": 1668.2355712890626, + "kl_loss_7": 590.7317443847656, + "learning_rate": 9.179372140119524e-05, + "loss": 1168.4604, + "step": 8060 + }, + { + "ce_loss_10": 3.494523513317108, + "ce_loss_13": 3.4206513285636904, + "ce_loss_2": 4.459244108200073, + "ce_loss_3": 4.188582479953766, + "ce_loss_7": 3.6680249691009523, + "epoch": 0.807, + "grad_norm": 564.0, + "kl_loss_10": 176.53147811889647, + "kl_loss_2": 2154.551867675781, + "kl_loss_3": 1677.4455200195312, + "kl_loss_7": 596.7226654052735, + "learning_rate": 9.087952946025175e-05, + "loss": 1164.4886, + "step": 8070 + }, + { + "ce_loss_10": 3.6058158397674562, + "ce_loss_13": 3.5339553594589233, + "ce_loss_2": 4.533062171936035, + "ce_loss_3": 4.26933354139328, + "ce_loss_7": 3.768651068210602, + "epoch": 0.808, + "grad_norm": 592.0, + "kl_loss_10": 173.78207092285157, + "kl_loss_2": 2082.071905517578, + "kl_loss_3": 1614.2910522460938, + "kl_loss_7": 576.9482543945312, + "learning_rate": 8.996945735790446e-05, + "loss": 1146.8303, + "step": 8080 + }, + { + "ce_loss_10": 3.50276095867157, + "ce_loss_13": 3.428582501411438, + "ce_loss_2": 4.457551169395447, + "ce_loss_3": 4.193507122993469, + "ce_loss_7": 3.672742247581482, + "epoch": 0.809, + "grad_norm": 608.0, + "kl_loss_10": 175.95007400512696, + "kl_loss_2": 2152.8104553222656, + "kl_loss_3": 1678.2301330566406, + "kl_loss_7": 594.3645935058594, + "learning_rate": 8.906351425856951e-05, + "loss": 1158.1713, + "step": 8090 + }, + { + "ce_loss_10": 3.4856011509895324, + "ce_loss_13": 3.412043738365173, + "ce_loss_2": 4.477128624916077, + "ce_loss_3": 4.2102068901062015, + "ce_loss_7": 3.663009238243103, + "epoch": 0.81, + "grad_norm": 588.0, + "kl_loss_10": 178.9893539428711, + "kl_loss_2": 2217.222705078125, + "kl_loss_3": 1734.5539611816407, + "kl_loss_7": 606.3814819335937, + "learning_rate": 8.816170928508365e-05, + "loss": 1174.1137, + "step": 8100 + }, + { + "ce_loss_10": 3.456532561779022, + "ce_loss_13": 3.380963850021362, + "ce_loss_2": 4.470215916633606, + "ce_loss_3": 4.203172373771667, + "ce_loss_7": 3.637952506542206, + "epoch": 0.811, + "grad_norm": 564.0, + "kl_loss_10": 181.76464233398437, + "kl_loss_2": 2271.6646240234377, + "kl_loss_3": 1789.1731323242188, + "kl_loss_7": 618.0579010009766, + "learning_rate": 8.7264051518613e-05, + "loss": 1188.6756, + "step": 8110 + }, + { + "ce_loss_10": 3.5451604604721068, + "ce_loss_13": 3.4716222047805787, + "ce_loss_2": 4.482484936714172, + "ce_loss_3": 4.218600440025329, + "ce_loss_7": 3.7114962100982667, + "epoch": 0.812, + "grad_norm": 572.0, + "kl_loss_10": 174.23039016723632, + "kl_loss_2": 2104.2210205078127, + "kl_loss_3": 1635.3122253417969, + "kl_loss_7": 586.6013153076171, + "learning_rate": 8.637054999856148e-05, + "loss": 1140.2461, + "step": 8120 + }, + { + "ce_loss_10": 3.5334657073020934, + "ce_loss_13": 3.4553168177604676, + "ce_loss_2": 4.5001609325408936, + "ce_loss_3": 4.233760714530945, + "ce_loss_7": 3.71214896440506, + "epoch": 0.813, + "grad_norm": 572.0, + "kl_loss_10": 179.94693908691406, + "kl_loss_2": 2168.638543701172, + "kl_loss_3": 1690.7197631835938, + "kl_loss_7": 602.7170349121094, + "learning_rate": 8.548121372247918e-05, + "loss": 1176.2271, + "step": 8130 + }, + { + "ce_loss_10": 3.6031864166259764, + "ce_loss_13": 3.5284059882164, + "ce_loss_2": 4.540188145637512, + "ce_loss_3": 4.276613438129425, + "ce_loss_7": 3.7673808455467226, + "epoch": 0.814, + "grad_norm": 576.0, + "kl_loss_10": 175.132576751709, + "kl_loss_2": 2116.421954345703, + "kl_loss_3": 1652.4540222167968, + "kl_loss_7": 584.193586730957, + "learning_rate": 8.459605164597267e-05, + "loss": 1140.1102, + "step": 8140 + }, + { + "ce_loss_10": 3.4851497173309327, + "ce_loss_13": 3.4121100902557373, + "ce_loss_2": 4.4567595481872555, + "ce_loss_3": 4.188820004463196, + "ce_loss_7": 3.6609971284866334, + "epoch": 0.815, + "grad_norm": 516.0, + "kl_loss_10": 176.46202392578124, + "kl_loss_2": 2164.97646484375, + "kl_loss_3": 1690.654833984375, + "kl_loss_7": 595.82392578125, + "learning_rate": 8.371507268261436e-05, + "loss": 1160.1355, + "step": 8150 + }, + { + "ce_loss_10": 3.5612674951553345, + "ce_loss_13": 3.486202526092529, + "ce_loss_2": 4.5333171606063845, + "ce_loss_3": 4.264222574234009, + "ce_loss_7": 3.7375367999076845, + "epoch": 0.816, + "grad_norm": 536.0, + "kl_loss_10": 178.54783096313477, + "kl_loss_2": 2174.8032287597657, + "kl_loss_3": 1693.2032836914063, + "kl_loss_7": 601.639274597168, + "learning_rate": 8.283828570385238e-05, + "loss": 1135.0793, + "step": 8160 + }, + { + "ce_loss_10": 3.566178250312805, + "ce_loss_13": 3.4907922625541685, + "ce_loss_2": 4.535942006111145, + "ce_loss_3": 4.269787204265595, + "ce_loss_7": 3.745415151119232, + "epoch": 0.817, + "grad_norm": 596.0, + "kl_loss_10": 175.95259857177734, + "kl_loss_2": 2127.893908691406, + "kl_loss_3": 1655.5139526367188, + "kl_loss_7": 597.2055206298828, + "learning_rate": 8.196569953892202e-05, + "loss": 1147.5553, + "step": 8170 + }, + { + "ce_loss_10": 3.485050618648529, + "ce_loss_13": 3.410007894039154, + "ce_loss_2": 4.454114603996277, + "ce_loss_3": 4.187376809120178, + "ce_loss_7": 3.6601618766784667, + "epoch": 0.818, + "grad_norm": 640.0, + "kl_loss_10": 177.51841354370117, + "kl_loss_2": 2140.9811096191406, + "kl_loss_3": 1670.5337829589844, + "kl_loss_7": 598.4014099121093, + "learning_rate": 8.109732297475635e-05, + "loss": 1142.2542, + "step": 8180 + }, + { + "ce_loss_10": 3.4574038982391357, + "ce_loss_13": 3.376466763019562, + "ce_loss_2": 4.488081407546997, + "ce_loss_3": 4.2140247344970705, + "ce_loss_7": 3.65143164396286, + "epoch": 0.819, + "grad_norm": 588.0, + "kl_loss_10": 184.73964157104493, + "kl_loss_2": 2257.5185974121096, + "kl_loss_3": 1764.1057739257812, + "kl_loss_7": 626.5130950927735, + "learning_rate": 8.023316475589754e-05, + "loss": 1190.8261, + "step": 8190 + }, + { + "ce_loss_10": 3.4220961928367615, + "ce_loss_13": 3.338273751735687, + "ce_loss_2": 4.495982336997986, + "ce_loss_3": 4.211030387878418, + "ce_loss_7": 3.615649092197418, + "epoch": 0.82, + "grad_norm": 680.0, + "kl_loss_10": 185.78453063964844, + "kl_loss_2": 2349.4278076171877, + "kl_loss_3": 1849.0640197753905, + "kl_loss_7": 637.3896453857421, + "learning_rate": 7.937323358440934e-05, + "loss": 1214.0248, + "step": 8200 + }, + { + "ce_loss_10": 3.541324031352997, + "ce_loss_13": 3.4684749960899355, + "ce_loss_2": 4.468911576271057, + "ce_loss_3": 4.20685533285141, + "ce_loss_7": 3.709389495849609, + "epoch": 0.821, + "grad_norm": 584.0, + "kl_loss_10": 174.97513656616212, + "kl_loss_2": 2090.5933227539062, + "kl_loss_3": 1628.1781616210938, + "kl_loss_7": 589.6404022216797, + "learning_rate": 7.851753811978923e-05, + "loss": 1140.9928, + "step": 8210 + }, + { + "ce_loss_10": 3.5598355412483214, + "ce_loss_13": 3.4843420505523683, + "ce_loss_2": 4.541475534439087, + "ce_loss_3": 4.275748348236084, + "ce_loss_7": 3.735049307346344, + "epoch": 0.822, + "grad_norm": 604.0, + "kl_loss_10": 177.00316925048827, + "kl_loss_2": 2174.6779052734373, + "kl_loss_3": 1702.5414611816407, + "kl_loss_7": 595.3967559814453, + "learning_rate": 7.766608697888095e-05, + "loss": 1150.2977, + "step": 8220 + }, + { + "ce_loss_10": 3.57365106344223, + "ce_loss_13": 3.498138427734375, + "ce_loss_2": 4.5424954175949095, + "ce_loss_3": 4.2809364080429075, + "ce_loss_7": 3.7457818508148195, + "epoch": 0.823, + "grad_norm": 576.0, + "kl_loss_10": 180.9578956604004, + "kl_loss_2": 2174.856481933594, + "kl_loss_3": 1712.8867797851562, + "kl_loss_7": 606.9090911865235, + "learning_rate": 7.681888873578785e-05, + "loss": 1172.8941, + "step": 8230 + }, + { + "ce_loss_10": 3.5023999333381655, + "ce_loss_13": 3.423751747608185, + "ce_loss_2": 4.507507848739624, + "ce_loss_3": 4.228441286087036, + "ce_loss_7": 3.6896154999732973, + "epoch": 0.824, + "grad_norm": 556.0, + "kl_loss_10": 182.18136978149414, + "kl_loss_2": 2222.666143798828, + "kl_loss_3": 1725.1198669433593, + "kl_loss_7": 612.9842071533203, + "learning_rate": 7.597595192178702e-05, + "loss": 1157.2363, + "step": 8240 + }, + { + "ce_loss_10": 3.501276743412018, + "ce_loss_13": 3.422858786582947, + "ce_loss_2": 4.514269304275513, + "ce_loss_3": 4.2400298595428465, + "ce_loss_7": 3.6824575424194337, + "epoch": 0.825, + "grad_norm": 588.0, + "kl_loss_10": 181.8477668762207, + "kl_loss_2": 2277.1808898925783, + "kl_loss_3": 1781.4539184570312, + "kl_loss_7": 622.9125793457031, + "learning_rate": 7.513728502524286e-05, + "loss": 1187.7779, + "step": 8250 + }, + { + "ce_loss_10": 3.5026116013526916, + "ce_loss_13": 3.428036153316498, + "ce_loss_2": 4.455591607093811, + "ce_loss_3": 4.192325818538666, + "ce_loss_7": 3.6698171854019166, + "epoch": 0.826, + "grad_norm": 540.0, + "kl_loss_10": 170.86422119140624, + "kl_loss_2": 2124.475665283203, + "kl_loss_3": 1660.3611083984374, + "kl_loss_7": 580.019741821289, + "learning_rate": 7.430289649152156e-05, + "loss": 1161.5576, + "step": 8260 + }, + { + "ce_loss_10": 3.404016637802124, + "ce_loss_13": 3.3291639566421507, + "ce_loss_2": 4.41770989894867, + "ce_loss_3": 4.153000998497009, + "ce_loss_7": 3.591460871696472, + "epoch": 0.827, + "grad_norm": 564.0, + "kl_loss_10": 179.06679763793946, + "kl_loss_2": 2271.2553527832033, + "kl_loss_3": 1785.926287841797, + "kl_loss_7": 620.811245727539, + "learning_rate": 7.347279472290646e-05, + "loss": 1175.1479, + "step": 8270 + }, + { + "ce_loss_10": 3.5473936796188354, + "ce_loss_13": 3.4728646278381348, + "ce_loss_2": 4.529109454154968, + "ce_loss_3": 4.2634922623634335, + "ce_loss_7": 3.7243195176124573, + "epoch": 0.828, + "grad_norm": 516.0, + "kl_loss_10": 176.9839729309082, + "kl_loss_2": 2192.3595458984373, + "kl_loss_3": 1719.4920288085937, + "kl_loss_7": 599.4450622558594, + "learning_rate": 7.264698807851328e-05, + "loss": 1170.4515, + "step": 8280 + }, + { + "ce_loss_10": 3.511405515670776, + "ce_loss_13": 3.4420324087142946, + "ce_loss_2": 4.462396240234375, + "ce_loss_3": 4.196757709980011, + "ce_loss_7": 3.678698420524597, + "epoch": 0.829, + "grad_norm": 520.0, + "kl_loss_10": 173.053133392334, + "kl_loss_2": 2122.6127746582033, + "kl_loss_3": 1647.0698120117188, + "kl_loss_7": 586.3415740966797, + "learning_rate": 7.182548487420554e-05, + "loss": 1152.9492, + "step": 8290 + }, + { + "ce_loss_10": 3.56430242061615, + "ce_loss_13": 3.4877224922180177, + "ce_loss_2": 4.515660381317138, + "ce_loss_3": 4.256710803508758, + "ce_loss_7": 3.7377355813980104, + "epoch": 0.83, + "grad_norm": 552.0, + "kl_loss_10": 178.47344284057618, + "kl_loss_2": 2141.9302307128905, + "kl_loss_3": 1673.7640502929687, + "kl_loss_7": 594.9386505126953, + "learning_rate": 7.100829338251146e-05, + "loss": 1142.0348, + "step": 8300 + }, + { + "ce_loss_10": 3.500733995437622, + "ce_loss_13": 3.420394682884216, + "ce_loss_2": 4.495737314224243, + "ce_loss_3": 4.226100885868073, + "ce_loss_7": 3.6811896324157716, + "epoch": 0.831, + "grad_norm": 664.0, + "kl_loss_10": 181.1860824584961, + "kl_loss_2": 2206.595349121094, + "kl_loss_3": 1725.6842834472657, + "kl_loss_7": 613.2857147216797, + "learning_rate": 7.019542183254046e-05, + "loss": 1155.5816, + "step": 8310 + }, + { + "ce_loss_10": 3.542024350166321, + "ce_loss_13": 3.46200088262558, + "ce_loss_2": 4.494865345954895, + "ce_loss_3": 4.2283999681472775, + "ce_loss_7": 3.7202147483825683, + "epoch": 0.832, + "grad_norm": 700.0, + "kl_loss_10": 184.21017608642578, + "kl_loss_2": 2155.698864746094, + "kl_loss_3": 1680.3289855957032, + "kl_loss_7": 609.4942947387696, + "learning_rate": 6.938687840989971e-05, + "loss": 1152.3119, + "step": 8320 + }, + { + "ce_loss_10": 3.475117301940918, + "ce_loss_13": 3.396902585029602, + "ce_loss_2": 4.447841107845306, + "ce_loss_3": 4.1811567902565, + "ce_loss_7": 3.65566908121109, + "epoch": 0.833, + "grad_norm": 644.0, + "kl_loss_10": 180.3916358947754, + "kl_loss_2": 2151.0271850585937, + "kl_loss_3": 1678.1678161621094, + "kl_loss_7": 600.837434387207, + "learning_rate": 6.858267125661271e-05, + "loss": 1171.0359, + "step": 8330 + }, + { + "ce_loss_10": 3.5341761112213135, + "ce_loss_13": 3.4623092293739317, + "ce_loss_2": 4.504952430725098, + "ce_loss_3": 4.235734903812409, + "ce_loss_7": 3.7161438941955565, + "epoch": 0.834, + "grad_norm": 652.0, + "kl_loss_10": 176.11358489990235, + "kl_loss_2": 2152.3007080078123, + "kl_loss_3": 1674.9229736328125, + "kl_loss_7": 599.1006164550781, + "learning_rate": 6.778280847103668e-05, + "loss": 1187.2795, + "step": 8340 + }, + { + "ce_loss_10": 3.5474065065383913, + "ce_loss_13": 3.4685073494911194, + "ce_loss_2": 4.511071228981018, + "ce_loss_3": 4.243249070644379, + "ce_loss_7": 3.7250569343566893, + "epoch": 0.835, + "grad_norm": 544.0, + "kl_loss_10": 179.0617935180664, + "kl_loss_2": 2161.984759521484, + "kl_loss_3": 1690.6224731445313, + "kl_loss_7": 606.1362609863281, + "learning_rate": 6.698729810778065e-05, + "loss": 1153.2188, + "step": 8350 + }, + { + "ce_loss_10": 3.4568483591079713, + "ce_loss_13": 3.3825206756591797, + "ce_loss_2": 4.450219774246216, + "ce_loss_3": 4.178645396232605, + "ce_loss_7": 3.6368404626846313, + "epoch": 0.836, + "grad_norm": 628.0, + "kl_loss_10": 176.9057861328125, + "kl_loss_2": 2207.3519287109375, + "kl_loss_3": 1716.1288146972656, + "kl_loss_7": 600.357534790039, + "learning_rate": 6.619614817762538e-05, + "loss": 1175.9664, + "step": 8360 + }, + { + "ce_loss_10": 3.421834397315979, + "ce_loss_13": 3.3451184391975404, + "ce_loss_2": 4.458368134498596, + "ce_loss_3": 4.186334764957428, + "ce_loss_7": 3.611315131187439, + "epoch": 0.837, + "grad_norm": 524.0, + "kl_loss_10": 179.41786346435546, + "kl_loss_2": 2288.001843261719, + "kl_loss_3": 1799.511328125, + "kl_loss_7": 622.7222686767578, + "learning_rate": 6.540936664744196e-05, + "loss": 1185.6504, + "step": 8370 + }, + { + "ce_loss_10": 3.5705604910850526, + "ce_loss_13": 3.495253837108612, + "ce_loss_2": 4.550519323348999, + "ce_loss_3": 4.287250196933746, + "ce_loss_7": 3.7482882261276247, + "epoch": 0.838, + "grad_norm": 556.0, + "kl_loss_10": 178.4987823486328, + "kl_loss_2": 2165.8636779785156, + "kl_loss_3": 1697.199853515625, + "kl_loss_7": 600.5514739990234, + "learning_rate": 6.462696144011149e-05, + "loss": 1144.948, + "step": 8380 + }, + { + "ce_loss_10": 3.521126616001129, + "ce_loss_13": 3.44657279253006, + "ce_loss_2": 4.47376012802124, + "ce_loss_3": 4.215767812728882, + "ce_loss_7": 3.700752067565918, + "epoch": 0.839, + "grad_norm": 552.0, + "kl_loss_10": 181.0975456237793, + "kl_loss_2": 2145.9520629882813, + "kl_loss_3": 1683.1618103027345, + "kl_loss_7": 606.7530914306641, + "learning_rate": 6.384894043444567e-05, + "loss": 1140.7508, + "step": 8390 + }, + { + "ce_loss_10": 3.5482063770294188, + "ce_loss_13": 3.4719661116600036, + "ce_loss_2": 4.523540115356445, + "ce_loss_3": 4.260622024536133, + "ce_loss_7": 3.7237455368041994, + "epoch": 0.84, + "grad_norm": 540.0, + "kl_loss_10": 178.300057220459, + "kl_loss_2": 2181.8050231933594, + "kl_loss_3": 1707.3205505371093, + "kl_loss_7": 602.2170059204102, + "learning_rate": 6.307531146510753e-05, + "loss": 1150.0869, + "step": 8400 + }, + { + "ce_loss_10": 3.526041495800018, + "ce_loss_13": 3.4509783387184143, + "ce_loss_2": 4.471963119506836, + "ce_loss_3": 4.206485414505005, + "ce_loss_7": 3.701741063594818, + "epoch": 0.841, + "grad_norm": 560.0, + "kl_loss_10": 177.0880439758301, + "kl_loss_2": 2118.260693359375, + "kl_loss_3": 1641.845037841797, + "kl_loss_7": 595.3580291748046, + "learning_rate": 6.230608232253226e-05, + "loss": 1129.8508, + "step": 8410 + }, + { + "ce_loss_10": 3.4824550271034242, + "ce_loss_13": 3.405357301235199, + "ce_loss_2": 4.482615494728089, + "ce_loss_3": 4.2257519364356995, + "ce_loss_7": 3.6680721044540405, + "epoch": 0.842, + "grad_norm": 512.0, + "kl_loss_10": 179.48506774902344, + "kl_loss_2": 2227.8513061523436, + "kl_loss_3": 1761.4332275390625, + "kl_loss_7": 616.7242858886718, + "learning_rate": 6.154126075284855e-05, + "loss": 1155.2555, + "step": 8420 + }, + { + "ce_loss_10": 3.577245807647705, + "ce_loss_13": 3.5012118101119993, + "ce_loss_2": 4.512918734550476, + "ce_loss_3": 4.249192714691162, + "ce_loss_7": 3.7460012435913086, + "epoch": 0.843, + "grad_norm": 704.0, + "kl_loss_10": 174.01815643310547, + "kl_loss_2": 2066.7827331542967, + "kl_loss_3": 1608.2534240722657, + "kl_loss_7": 586.0584747314454, + "learning_rate": 6.078085445780129e-05, + "loss": 1117.5314, + "step": 8430 + }, + { + "ce_loss_10": 3.584468650817871, + "ce_loss_13": 3.5081024169921875, + "ce_loss_2": 4.569616174697876, + "ce_loss_3": 4.300150573253632, + "ce_loss_7": 3.7608886480331423, + "epoch": 0.844, + "grad_norm": 576.0, + "kl_loss_10": 177.62249679565429, + "kl_loss_2": 2185.7296508789063, + "kl_loss_3": 1710.7079772949219, + "kl_loss_7": 599.2171966552735, + "learning_rate": 6.002487109467347e-05, + "loss": 1141.6974, + "step": 8440 + }, + { + "ce_loss_10": 3.587876856327057, + "ce_loss_13": 3.5108195781707763, + "ce_loss_2": 4.524105596542358, + "ce_loss_3": 4.2644576787948605, + "ce_loss_7": 3.756032574176788, + "epoch": 0.845, + "grad_norm": 592.0, + "kl_loss_10": 181.40281448364257, + "kl_loss_2": 2131.273962402344, + "kl_loss_3": 1667.7534301757812, + "kl_loss_7": 605.4166229248046, + "learning_rate": 5.927331827620902e-05, + "loss": 1141.2443, + "step": 8450 + }, + { + "ce_loss_10": 3.573608911037445, + "ce_loss_13": 3.499223828315735, + "ce_loss_2": 4.488192296028137, + "ce_loss_3": 4.230596256256104, + "ce_loss_7": 3.7483445525169374, + "epoch": 0.846, + "grad_norm": 552.0, + "kl_loss_10": 175.3866973876953, + "kl_loss_2": 2041.3302062988282, + "kl_loss_3": 1588.8981994628907, + "kl_loss_7": 591.4093719482422, + "learning_rate": 5.852620357053651e-05, + "loss": 1132.2791, + "step": 8460 + }, + { + "ce_loss_10": 3.6111098527908325, + "ce_loss_13": 3.536815571784973, + "ce_loss_2": 4.544794130325317, + "ce_loss_3": 4.2818133473396305, + "ce_loss_7": 3.780075490474701, + "epoch": 0.847, + "grad_norm": 596.0, + "kl_loss_10": 174.2255989074707, + "kl_loss_2": 2095.0241271972654, + "kl_loss_3": 1629.0387634277345, + "kl_loss_7": 588.6748046875, + "learning_rate": 5.778353450109286e-05, + "loss": 1140.0846, + "step": 8470 + }, + { + "ce_loss_10": 3.648575019836426, + "ce_loss_13": 3.5720423340797423, + "ce_loss_2": 4.605719590187073, + "ce_loss_3": 4.344667458534241, + "ce_loss_7": 3.8233685731887816, + "epoch": 0.848, + "grad_norm": 486.0, + "kl_loss_10": 179.81479415893554, + "kl_loss_2": 2138.249481201172, + "kl_loss_3": 1668.5974670410155, + "kl_loss_7": 599.209912109375, + "learning_rate": 5.7045318546547206e-05, + "loss": 1146.8947, + "step": 8480 + }, + { + "ce_loss_10": 3.5448459148406983, + "ce_loss_13": 3.468916821479797, + "ce_loss_2": 4.523812007904053, + "ce_loss_3": 4.25856339931488, + "ce_loss_7": 3.7180402636528016, + "epoch": 0.849, + "grad_norm": 556.0, + "kl_loss_10": 176.35201721191407, + "kl_loss_2": 2188.633197021484, + "kl_loss_3": 1710.0829528808595, + "kl_loss_7": 595.4919921875, + "learning_rate": 5.631156314072605e-05, + "loss": 1145.8699, + "step": 8490 + }, + { + "ce_loss_10": 3.559221601486206, + "ce_loss_13": 3.4834945678710936, + "ce_loss_2": 4.495839285850525, + "ce_loss_3": 4.22744711637497, + "ce_loss_7": 3.7315674662590026, + "epoch": 0.85, + "grad_norm": 536.0, + "kl_loss_10": 176.7962844848633, + "kl_loss_2": 2110.8933044433593, + "kl_loss_3": 1632.140673828125, + "kl_loss_7": 588.6574279785157, + "learning_rate": 5.5582275672538315e-05, + "loss": 1128.7181, + "step": 8500 + }, + { + "ce_loss_10": 3.4811159491539003, + "ce_loss_13": 3.403310573101044, + "ce_loss_2": 4.505023097991943, + "ce_loss_3": 4.238485896587372, + "ce_loss_7": 3.6721346259117125, + "epoch": 0.851, + "grad_norm": 608.0, + "kl_loss_10": 182.8627899169922, + "kl_loss_2": 2282.4833068847656, + "kl_loss_3": 1798.2781005859374, + "kl_loss_7": 625.0588409423829, + "learning_rate": 5.4857463485900484e-05, + "loss": 1192.5725, + "step": 8510 + }, + { + "ce_loss_10": 3.5297972202301025, + "ce_loss_13": 3.454351043701172, + "ce_loss_2": 4.489051342010498, + "ce_loss_3": 4.219796097278595, + "ce_loss_7": 3.7097468852996824, + "epoch": 0.852, + "grad_norm": 592.0, + "kl_loss_10": 178.4038864135742, + "kl_loss_2": 2146.418908691406, + "kl_loss_3": 1667.9574584960938, + "kl_loss_7": 602.1778778076172, + "learning_rate": 5.413713387966329e-05, + "loss": 1150.9164, + "step": 8520 + }, + { + "ce_loss_10": 3.558277463912964, + "ce_loss_13": 3.480745458602905, + "ce_loss_2": 4.530928635597229, + "ce_loss_3": 4.266195034980774, + "ce_loss_7": 3.7276942253112795, + "epoch": 0.853, + "grad_norm": 620.0, + "kl_loss_10": 178.86384658813478, + "kl_loss_2": 2174.0791748046877, + "kl_loss_3": 1703.99384765625, + "kl_loss_7": 598.962451171875, + "learning_rate": 5.34212941075381e-05, + "loss": 1160.2438, + "step": 8530 + }, + { + "ce_loss_10": 3.559523808956146, + "ce_loss_13": 3.4899546623229982, + "ce_loss_2": 4.511786758899689, + "ce_loss_3": 4.244810569286346, + "ce_loss_7": 3.7282875180244446, + "epoch": 0.854, + "grad_norm": 544.0, + "kl_loss_10": 173.61905364990236, + "kl_loss_2": 2125.2814514160154, + "kl_loss_3": 1651.0782043457032, + "kl_loss_7": 580.7644989013672, + "learning_rate": 5.270995137802315e-05, + "loss": 1139.1208, + "step": 8540 + }, + { + "ce_loss_10": 3.4913312673568724, + "ce_loss_13": 3.4199952483177185, + "ce_loss_2": 4.46144163608551, + "ce_loss_3": 4.1969265818595884, + "ce_loss_7": 3.6703786849975586, + "epoch": 0.855, + "grad_norm": 596.0, + "kl_loss_10": 176.40887756347655, + "kl_loss_2": 2170.294287109375, + "kl_loss_3": 1693.5492065429687, + "kl_loss_7": 604.686279296875, + "learning_rate": 5.2003112854332125e-05, + "loss": 1161.1432, + "step": 8550 + }, + { + "ce_loss_10": 3.495318293571472, + "ce_loss_13": 3.421377086639404, + "ce_loss_2": 4.460399007797241, + "ce_loss_3": 4.191486561298371, + "ce_loss_7": 3.666605508327484, + "epoch": 0.856, + "grad_norm": 624.0, + "kl_loss_10": 174.107186126709, + "kl_loss_2": 2159.1057250976564, + "kl_loss_3": 1686.5014953613281, + "kl_loss_7": 595.0416564941406, + "learning_rate": 5.130078565432089e-05, + "loss": 1138.6503, + "step": 8560 + }, + { + "ce_loss_10": 3.5646494030952454, + "ce_loss_13": 3.4924421072006226, + "ce_loss_2": 4.498122811317444, + "ce_loss_3": 4.236236476898194, + "ce_loss_7": 3.731403958797455, + "epoch": 0.857, + "grad_norm": 548.0, + "kl_loss_10": 173.7933578491211, + "kl_loss_2": 2107.9238403320314, + "kl_loss_3": 1647.1588623046875, + "kl_loss_7": 586.2743255615235, + "learning_rate": 5.060297685041659e-05, + "loss": 1120.3278, + "step": 8570 + }, + { + "ce_loss_10": 3.498642110824585, + "ce_loss_13": 3.423696291446686, + "ce_loss_2": 4.494770348072052, + "ce_loss_3": 4.218523621559143, + "ce_loss_7": 3.6762722969055175, + "epoch": 0.858, + "grad_norm": 548.0, + "kl_loss_10": 180.7342544555664, + "kl_loss_2": 2221.7594360351563, + "kl_loss_3": 1733.2135437011718, + "kl_loss_7": 609.6224884033203, + "learning_rate": 4.99096934695461e-05, + "loss": 1183.0167, + "step": 8580 + }, + { + "ce_loss_10": 3.55733345746994, + "ce_loss_13": 3.4829642295837404, + "ce_loss_2": 4.523844695091247, + "ce_loss_3": 4.2578066945075985, + "ce_loss_7": 3.7367467999458315, + "epoch": 0.859, + "grad_norm": 544.0, + "kl_loss_10": 175.85337829589844, + "kl_loss_2": 2157.883190917969, + "kl_loss_3": 1681.7412048339843, + "kl_loss_7": 598.1185791015625, + "learning_rate": 4.922094249306558e-05, + "loss": 1131.2188, + "step": 8590 + }, + { + "ce_loss_10": 3.5841567873954774, + "ce_loss_13": 3.509797990322113, + "ce_loss_2": 4.5469663619995115, + "ce_loss_3": 4.2871175646781925, + "ce_loss_7": 3.7628376722335815, + "epoch": 0.86, + "grad_norm": 604.0, + "kl_loss_10": 179.83917465209962, + "kl_loss_2": 2154.5059204101562, + "kl_loss_3": 1690.2081298828125, + "kl_loss_7": 604.0245941162109, + "learning_rate": 4.853673085668947e-05, + "loss": 1135.9622, + "step": 8600 + }, + { + "ce_loss_10": 3.60320885181427, + "ce_loss_13": 3.529503357410431, + "ce_loss_2": 4.566924571990967, + "ce_loss_3": 4.302417039871216, + "ce_loss_7": 3.7819976687431334, + "epoch": 0.861, + "grad_norm": 596.0, + "kl_loss_10": 177.36936798095704, + "kl_loss_2": 2148.3923461914064, + "kl_loss_3": 1680.8319580078125, + "kl_loss_7": 597.4716522216797, + "learning_rate": 4.78570654504214e-05, + "loss": 1156.3883, + "step": 8610 + }, + { + "ce_loss_10": 3.5473016500473022, + "ce_loss_13": 3.4740379452705383, + "ce_loss_2": 4.5167618751525875, + "ce_loss_3": 4.248826539516449, + "ce_loss_7": 3.7283095955848693, + "epoch": 0.862, + "grad_norm": 512.0, + "kl_loss_10": 175.62952041625977, + "kl_loss_2": 2176.8274475097655, + "kl_loss_3": 1698.4800903320313, + "kl_loss_7": 603.3765747070313, + "learning_rate": 4.7181953118484556e-05, + "loss": 1157.7057, + "step": 8620 + }, + { + "ce_loss_10": 3.5743127822875977, + "ce_loss_13": 3.49841423034668, + "ce_loss_2": 4.522394800186158, + "ce_loss_3": 4.2583330273628235, + "ce_loss_7": 3.751324450969696, + "epoch": 0.863, + "grad_norm": 604.0, + "kl_loss_10": 175.00742568969727, + "kl_loss_2": 2093.126568603516, + "kl_loss_3": 1630.9019409179687, + "kl_loss_7": 592.01396484375, + "learning_rate": 4.651140065925269e-05, + "loss": 1159.3387, + "step": 8630 + }, + { + "ce_loss_10": 3.507640373706818, + "ce_loss_13": 3.434223484992981, + "ce_loss_2": 4.487589573860168, + "ce_loss_3": 4.2153548240661625, + "ce_loss_7": 3.6856843709945677, + "epoch": 0.864, + "grad_norm": 588.0, + "kl_loss_10": 177.25660781860353, + "kl_loss_2": 2189.234814453125, + "kl_loss_3": 1705.8089416503906, + "kl_loss_7": 594.7530731201172, + "learning_rate": 4.58454148251814e-05, + "loss": 1175.3236, + "step": 8640 + }, + { + "ce_loss_10": 3.5295264959335326, + "ce_loss_13": 3.451808476448059, + "ce_loss_2": 4.534165596961975, + "ce_loss_3": 4.261823272705078, + "ce_loss_7": 3.7138017773628236, + "epoch": 0.865, + "grad_norm": 568.0, + "kl_loss_10": 177.66054000854493, + "kl_loss_2": 2227.2971801757812, + "kl_loss_3": 1739.9827575683594, + "kl_loss_7": 610.7921539306641, + "learning_rate": 4.518400232274078e-05, + "loss": 1162.0056, + "step": 8650 + }, + { + "ce_loss_10": 3.5479356169700624, + "ce_loss_13": 3.4702929258346558, + "ce_loss_2": 4.501330161094666, + "ce_loss_3": 4.237502670288086, + "ce_loss_7": 3.723937380313873, + "epoch": 0.866, + "grad_norm": 524.0, + "kl_loss_10": 179.25594482421874, + "kl_loss_2": 2137.7675231933595, + "kl_loss_3": 1671.597607421875, + "kl_loss_7": 602.4135375976563, + "learning_rate": 4.452716981234745e-05, + "loss": 1122.9633, + "step": 8660 + }, + { + "ce_loss_10": 3.5203991651535036, + "ce_loss_13": 3.447481095790863, + "ce_loss_2": 4.47524061203003, + "ce_loss_3": 4.205355083942413, + "ce_loss_7": 3.695466148853302, + "epoch": 0.867, + "grad_norm": 568.0, + "kl_loss_10": 174.45485000610353, + "kl_loss_2": 2135.713586425781, + "kl_loss_3": 1657.6833251953126, + "kl_loss_7": 594.2672515869141, + "learning_rate": 4.3874923908297335e-05, + "loss": 1125.4834, + "step": 8670 + }, + { + "ce_loss_10": 3.575284111499786, + "ce_loss_13": 3.498721444606781, + "ce_loss_2": 4.54740161895752, + "ce_loss_3": 4.281282663345337, + "ce_loss_7": 3.7500776290893554, + "epoch": 0.868, + "grad_norm": 596.0, + "kl_loss_10": 178.63047256469727, + "kl_loss_2": 2175.332312011719, + "kl_loss_3": 1702.8009948730469, + "kl_loss_7": 597.87509765625, + "learning_rate": 4.322727117869951e-05, + "loss": 1149.3994, + "step": 8680 + }, + { + "ce_loss_10": 3.575519359111786, + "ce_loss_13": 3.4998192310333254, + "ce_loss_2": 4.55748233795166, + "ce_loss_3": 4.284105372428894, + "ce_loss_7": 3.752281701564789, + "epoch": 0.869, + "grad_norm": 584.0, + "kl_loss_10": 179.47224349975585, + "kl_loss_2": 2189.6538024902343, + "kl_loss_3": 1698.4593017578125, + "kl_loss_7": 604.034390258789, + "learning_rate": 4.2584218145409916e-05, + "loss": 1151.0721, + "step": 8690 + }, + { + "ce_loss_10": 3.6216215252876283, + "ce_loss_13": 3.551214134693146, + "ce_loss_2": 4.543716049194336, + "ce_loss_3": 4.272314977645874, + "ce_loss_7": 3.786858594417572, + "epoch": 0.87, + "grad_norm": 600.0, + "kl_loss_10": 174.5644790649414, + "kl_loss_2": 2072.967736816406, + "kl_loss_3": 1598.7917419433593, + "kl_loss_7": 582.653207397461, + "learning_rate": 4.194577128396521e-05, + "loss": 1108.3934, + "step": 8700 + }, + { + "ce_loss_10": 3.498377776145935, + "ce_loss_13": 3.425851809978485, + "ce_loss_2": 4.466418659687042, + "ce_loss_3": 4.194790709018707, + "ce_loss_7": 3.670235824584961, + "epoch": 0.871, + "grad_norm": 506.0, + "kl_loss_10": 174.37066497802735, + "kl_loss_2": 2168.9560546875, + "kl_loss_3": 1689.01513671875, + "kl_loss_7": 590.7626220703125, + "learning_rate": 4.1311937023518264e-05, + "loss": 1166.4488, + "step": 8710 + }, + { + "ce_loss_10": 3.5134344696998596, + "ce_loss_13": 3.4397946119308473, + "ce_loss_2": 4.529711484909058, + "ce_loss_3": 4.263941979408264, + "ce_loss_7": 3.682065725326538, + "epoch": 0.872, + "grad_norm": 460.0, + "kl_loss_10": 171.77398529052735, + "kl_loss_2": 2246.7038696289064, + "kl_loss_3": 1774.6010986328124, + "kl_loss_7": 576.3054107666015, + "learning_rate": 4.0682721746773344e-05, + "loss": 1163.771, + "step": 8720 + }, + { + "ce_loss_10": 3.3905357241630556, + "ce_loss_13": 3.314958465099335, + "ce_loss_2": 4.396602368354797, + "ce_loss_3": 4.132218360900879, + "ce_loss_7": 3.5750641107559202, + "epoch": 0.873, + "grad_norm": 552.0, + "kl_loss_10": 175.90054779052736, + "kl_loss_2": 2222.7579711914063, + "kl_loss_3": 1749.4481201171875, + "kl_loss_7": 613.7805450439453, + "learning_rate": 4.0058131789920904e-05, + "loss": 1143.7059, + "step": 8730 + }, + { + "ce_loss_10": 3.5397099256515503, + "ce_loss_13": 3.4643173098564146, + "ce_loss_2": 4.497500014305115, + "ce_loss_3": 4.226579332351685, + "ce_loss_7": 3.709259867668152, + "epoch": 0.874, + "grad_norm": 572.0, + "kl_loss_10": 176.51957778930665, + "kl_loss_2": 2162.519940185547, + "kl_loss_3": 1680.782940673828, + "kl_loss_7": 600.4374298095703, + "learning_rate": 3.9438173442575e-05, + "loss": 1188.067, + "step": 8740 + }, + { + "ce_loss_10": 3.5728036165237427, + "ce_loss_13": 3.4973131656646728, + "ce_loss_2": 4.514467573165893, + "ce_loss_3": 4.250711810588837, + "ce_loss_7": 3.740223217010498, + "epoch": 0.875, + "grad_norm": 524.0, + "kl_loss_10": 175.66529846191406, + "kl_loss_2": 2112.5622009277345, + "kl_loss_3": 1651.0288818359375, + "kl_loss_7": 594.9720977783203, + "learning_rate": 3.882285294770937e-05, + "loss": 1137.6895, + "step": 8750 + }, + { + "ce_loss_10": 3.5377378940582274, + "ce_loss_13": 3.4619855165481566, + "ce_loss_2": 4.4779297590255736, + "ce_loss_3": 4.2127908825874325, + "ce_loss_7": 3.7088746547698976, + "epoch": 0.876, + "grad_norm": 600.0, + "kl_loss_10": 178.39902954101564, + "kl_loss_2": 2127.570068359375, + "kl_loss_3": 1648.2625427246094, + "kl_loss_7": 594.3859985351562, + "learning_rate": 3.821217650159453e-05, + "loss": 1155.4234, + "step": 8760 + }, + { + "ce_loss_10": 3.4084259629249574, + "ce_loss_13": 3.332693111896515, + "ce_loss_2": 4.428424310684204, + "ce_loss_3": 4.158231461048127, + "ce_loss_7": 3.6029205918312073, + "epoch": 0.877, + "grad_norm": 548.0, + "kl_loss_10": 180.0270034790039, + "kl_loss_2": 2236.819873046875, + "kl_loss_3": 1758.5952392578124, + "kl_loss_7": 625.4339080810547, + "learning_rate": 3.760615025373543e-05, + "loss": 1171.5936, + "step": 8770 + }, + { + "ce_loss_10": 3.595931589603424, + "ce_loss_13": 3.5179906845092774, + "ce_loss_2": 4.587141966819763, + "ce_loss_3": 4.309424257278442, + "ce_loss_7": 3.7760006308555605, + "epoch": 0.878, + "grad_norm": 660.0, + "kl_loss_10": 183.31888961791992, + "kl_loss_2": 2207.746258544922, + "kl_loss_3": 1714.543280029297, + "kl_loss_7": 607.3442749023437, + "learning_rate": 3.700478030680987e-05, + "loss": 1181.1754, + "step": 8780 + }, + { + "ce_loss_10": 3.5762731194496156, + "ce_loss_13": 3.5029969453811645, + "ce_loss_2": 4.536650991439819, + "ce_loss_3": 4.271095442771911, + "ce_loss_7": 3.748454582691193, + "epoch": 0.879, + "grad_norm": 502.0, + "kl_loss_10": 176.4349349975586, + "kl_loss_2": 2141.4830688476563, + "kl_loss_3": 1675.0247314453125, + "kl_loss_7": 590.1913375854492, + "learning_rate": 3.6408072716606344e-05, + "loss": 1149.3131, + "step": 8790 + }, + { + "ce_loss_10": 3.50073447227478, + "ce_loss_13": 3.4274021863937376, + "ce_loss_2": 4.501238942146301, + "ce_loss_3": 4.239216554164886, + "ce_loss_7": 3.683646392822266, + "epoch": 0.88, + "grad_norm": 600.0, + "kl_loss_10": 180.50948486328124, + "kl_loss_2": 2229.279052734375, + "kl_loss_3": 1758.1964477539063, + "kl_loss_7": 612.8555847167969, + "learning_rate": 3.5816033491963716e-05, + "loss": 1204.1847, + "step": 8800 + }, + { + "ce_loss_10": 3.3653410911560058, + "ce_loss_13": 3.289612293243408, + "ce_loss_2": 4.399015557765961, + "ce_loss_3": 4.129138934612274, + "ce_loss_7": 3.5484530568122863, + "epoch": 0.881, + "grad_norm": 696.0, + "kl_loss_10": 176.9532043457031, + "kl_loss_2": 2282.183734130859, + "kl_loss_3": 1791.626202392578, + "kl_loss_7": 607.9001495361329, + "learning_rate": 3.522866859471047e-05, + "loss": 1184.3774, + "step": 8810 + }, + { + "ce_loss_10": 3.597711908817291, + "ce_loss_13": 3.5282416582107543, + "ce_loss_2": 4.506113409996033, + "ce_loss_3": 4.251258683204651, + "ce_loss_7": 3.7641051173210145, + "epoch": 0.882, + "grad_norm": 636.0, + "kl_loss_10": 169.83995971679687, + "kl_loss_2": 2038.54423828125, + "kl_loss_3": 1585.0056030273438, + "kl_loss_7": 568.1876190185546, + "learning_rate": 3.46459839396045e-05, + "loss": 1125.9656, + "step": 8820 + }, + { + "ce_loss_10": 3.529653000831604, + "ce_loss_13": 3.449263334274292, + "ce_loss_2": 4.503918576240539, + "ce_loss_3": 4.2343867182731625, + "ce_loss_7": 3.712590277194977, + "epoch": 0.883, + "grad_norm": 576.0, + "kl_loss_10": 178.9514488220215, + "kl_loss_2": 2152.299346923828, + "kl_loss_3": 1671.0650329589844, + "kl_loss_7": 603.799105834961, + "learning_rate": 3.406798539427386e-05, + "loss": 1176.0018, + "step": 8830 + }, + { + "ce_loss_10": 3.5842846632003784, + "ce_loss_13": 3.510246682167053, + "ce_loss_2": 4.541045117378235, + "ce_loss_3": 4.27923276424408, + "ce_loss_7": 3.7581582188606264, + "epoch": 0.884, + "grad_norm": 576.0, + "kl_loss_10": 176.05337142944336, + "kl_loss_2": 2155.0933471679687, + "kl_loss_3": 1681.9300415039063, + "kl_loss_7": 595.8100616455079, + "learning_rate": 3.349467877915746e-05, + "loss": 1155.9855, + "step": 8840 + }, + { + "ce_loss_10": 3.5404534935951233, + "ce_loss_13": 3.4635657548904417, + "ce_loss_2": 4.524973630905151, + "ce_loss_3": 4.259057784080506, + "ce_loss_7": 3.7238620042800905, + "epoch": 0.885, + "grad_norm": 604.0, + "kl_loss_10": 178.34489822387695, + "kl_loss_2": 2212.789306640625, + "kl_loss_3": 1740.2167602539062, + "kl_loss_7": 609.8538391113282, + "learning_rate": 3.292606986744667e-05, + "loss": 1199.5514, + "step": 8850 + }, + { + "ce_loss_10": 3.4956598401069643, + "ce_loss_13": 3.4253405332565308, + "ce_loss_2": 4.470083999633789, + "ce_loss_3": 4.206580317020416, + "ce_loss_7": 3.6703789830207825, + "epoch": 0.886, + "grad_norm": 580.0, + "kl_loss_10": 174.30244827270508, + "kl_loss_2": 2159.222479248047, + "kl_loss_3": 1693.7933227539063, + "kl_loss_7": 599.4409484863281, + "learning_rate": 3.23621643850267e-05, + "loss": 1154.7352, + "step": 8860 + }, + { + "ce_loss_10": 3.5727248191833496, + "ce_loss_13": 3.496046614646912, + "ce_loss_2": 4.52955162525177, + "ce_loss_3": 4.25726010799408, + "ce_loss_7": 3.7465644001960756, + "epoch": 0.887, + "grad_norm": 552.0, + "kl_loss_10": 179.55787811279296, + "kl_loss_2": 2159.213214111328, + "kl_loss_3": 1673.28759765625, + "kl_loss_7": 605.2644119262695, + "learning_rate": 3.180296801041971e-05, + "loss": 1139.4904, + "step": 8870 + }, + { + "ce_loss_10": 3.595055866241455, + "ce_loss_13": 3.5219205260276794, + "ce_loss_2": 4.565229892730713, + "ce_loss_3": 4.302072286605835, + "ce_loss_7": 3.7657612919807435, + "epoch": 0.888, + "grad_norm": 488.0, + "kl_loss_10": 174.51052551269532, + "kl_loss_2": 2168.450274658203, + "kl_loss_3": 1696.4366394042968, + "kl_loss_7": 592.0915069580078, + "learning_rate": 3.124848637472688e-05, + "loss": 1132.4514, + "step": 8880 + }, + { + "ce_loss_10": 3.420267331600189, + "ce_loss_13": 3.346735382080078, + "ce_loss_2": 4.411080622673035, + "ce_loss_3": 4.143524849414826, + "ce_loss_7": 3.6017141342163086, + "epoch": 0.889, + "grad_norm": 600.0, + "kl_loss_10": 174.23086242675782, + "kl_loss_2": 2199.3413024902343, + "kl_loss_3": 1717.92958984375, + "kl_loss_7": 596.7423614501953, + "learning_rate": 3.069872506157212e-05, + "loss": 1155.8682, + "step": 8890 + }, + { + "ce_loss_10": 3.5183400988578795, + "ce_loss_13": 3.4446550846099853, + "ce_loss_2": 4.4763764381408695, + "ce_loss_3": 4.213694953918457, + "ce_loss_7": 3.693737256526947, + "epoch": 0.89, + "grad_norm": 544.0, + "kl_loss_10": 176.23825073242188, + "kl_loss_2": 2152.81796875, + "kl_loss_3": 1686.6082885742187, + "kl_loss_7": 599.2687866210938, + "learning_rate": 3.0153689607045842e-05, + "loss": 1144.8437, + "step": 8900 + }, + { + "ce_loss_10": 3.4148733854293822, + "ce_loss_13": 3.3367454648017882, + "ce_loss_2": 4.46618926525116, + "ce_loss_3": 4.190043389797211, + "ce_loss_7": 3.606413686275482, + "epoch": 0.891, + "grad_norm": 556.0, + "kl_loss_10": 181.56116943359376, + "kl_loss_2": 2316.6614379882812, + "kl_loss_3": 1823.7576782226563, + "kl_loss_7": 624.2273193359375, + "learning_rate": 2.9613385499648926e-05, + "loss": 1174.4811, + "step": 8910 + }, + { + "ce_loss_10": 3.472314703464508, + "ce_loss_13": 3.3953917384147645, + "ce_loss_2": 4.439359056949615, + "ce_loss_3": 4.1694392442703245, + "ce_loss_7": 3.6529108047485352, + "epoch": 0.892, + "grad_norm": 632.0, + "kl_loss_10": 176.6066520690918, + "kl_loss_2": 2142.8492736816406, + "kl_loss_3": 1665.7835510253906, + "kl_loss_7": 596.6286926269531, + "learning_rate": 2.9077818180237692e-05, + "loss": 1160.4215, + "step": 8920 + }, + { + "ce_loss_10": 3.5216124176979067, + "ce_loss_13": 3.444805955886841, + "ce_loss_2": 4.504645991325378, + "ce_loss_3": 4.23671303987503, + "ce_loss_7": 3.703485441207886, + "epoch": 0.893, + "grad_norm": 584.0, + "kl_loss_10": 176.44115447998047, + "kl_loss_2": 2174.382470703125, + "kl_loss_3": 1696.403564453125, + "kl_loss_7": 600.8687164306641, + "learning_rate": 2.8546993041969172e-05, + "loss": 1152.6479, + "step": 8930 + }, + { + "ce_loss_10": 3.5529621839523315, + "ce_loss_13": 3.4791373729705812, + "ce_loss_2": 4.487787294387817, + "ce_loss_3": 4.227098524570465, + "ce_loss_7": 3.7245055556297304, + "epoch": 0.894, + "grad_norm": 506.0, + "kl_loss_10": 174.32415542602538, + "kl_loss_2": 2110.938995361328, + "kl_loss_3": 1637.950372314453, + "kl_loss_7": 590.5497222900391, + "learning_rate": 2.802091543024671e-05, + "loss": 1153.5114, + "step": 8940 + }, + { + "ce_loss_10": 3.5515737652778627, + "ce_loss_13": 3.4770275354385376, + "ce_loss_2": 4.526445126533508, + "ce_loss_3": 4.267046928405762, + "ce_loss_7": 3.728025937080383, + "epoch": 0.895, + "grad_norm": 612.0, + "kl_loss_10": 177.65689697265626, + "kl_loss_2": 2195.3055725097656, + "kl_loss_3": 1727.4093139648437, + "kl_loss_7": 604.4041717529296, + "learning_rate": 2.7499590642665774e-05, + "loss": 1190.9908, + "step": 8950 + }, + { + "ce_loss_10": 3.5625943899154664, + "ce_loss_13": 3.4850521326065063, + "ce_loss_2": 4.553565168380738, + "ce_loss_3": 4.279750061035156, + "ce_loss_7": 3.739047312736511, + "epoch": 0.896, + "grad_norm": 512.0, + "kl_loss_10": 178.24014129638672, + "kl_loss_2": 2193.0329833984374, + "kl_loss_3": 1709.0227905273437, + "kl_loss_7": 602.5221313476562, + "learning_rate": 2.6983023928961405e-05, + "loss": 1147.626, + "step": 8960 + }, + { + "ce_loss_10": 3.532795751094818, + "ce_loss_13": 3.4568071961402893, + "ce_loss_2": 4.499462056159973, + "ce_loss_3": 4.242076885700226, + "ce_loss_7": 3.709202516078949, + "epoch": 0.897, + "grad_norm": 616.0, + "kl_loss_10": 177.7622848510742, + "kl_loss_2": 2147.4717651367187, + "kl_loss_3": 1687.235498046875, + "kl_loss_7": 597.6563537597656, + "learning_rate": 2.6471220490954628e-05, + "loss": 1172.1214, + "step": 8970 + }, + { + "ce_loss_10": 3.5174603939056395, + "ce_loss_13": 3.4463194727897646, + "ce_loss_2": 4.477302503585816, + "ce_loss_3": 4.214387357234955, + "ce_loss_7": 3.6873306155204775, + "epoch": 0.898, + "grad_norm": 592.0, + "kl_loss_10": 174.26932220458986, + "kl_loss_2": 2152.279821777344, + "kl_loss_3": 1683.4832763671875, + "kl_loss_7": 590.3468292236328, + "learning_rate": 2.596418548250029e-05, + "loss": 1156.035, + "step": 8980 + }, + { + "ce_loss_10": 3.5602595686912535, + "ce_loss_13": 3.485771131515503, + "ce_loss_2": 4.5211225032806395, + "ce_loss_3": 4.257622516155243, + "ce_loss_7": 3.7368709683418273, + "epoch": 0.899, + "grad_norm": 524.0, + "kl_loss_10": 179.2437530517578, + "kl_loss_2": 2158.524066162109, + "kl_loss_3": 1691.2223327636718, + "kl_loss_7": 601.7962463378906, + "learning_rate": 2.5461924009435368e-05, + "loss": 1142.976, + "step": 8990 + }, + { + "ce_loss_10": 3.5547463297843933, + "ce_loss_13": 3.479547905921936, + "ce_loss_2": 4.515283250808716, + "ce_loss_3": 4.250737547874451, + "ce_loss_7": 3.732912743091583, + "epoch": 0.9, + "grad_norm": 572.0, + "kl_loss_10": 177.4375427246094, + "kl_loss_2": 2139.4888916015625, + "kl_loss_3": 1664.3239318847657, + "kl_loss_7": 599.9839569091797, + "learning_rate": 2.4964441129527336e-05, + "loss": 1166.3201, + "step": 9000 + }, + { + "ce_loss_10": 3.553958511352539, + "ce_loss_13": 3.476763606071472, + "ce_loss_2": 4.496755647659302, + "ce_loss_3": 4.227215158939361, + "ce_loss_7": 3.7206253528594972, + "epoch": 0.901, + "grad_norm": 540.0, + "kl_loss_10": 174.2966407775879, + "kl_loss_2": 2111.5487548828123, + "kl_loss_3": 1639.27841796875, + "kl_loss_7": 584.4350708007812, + "learning_rate": 2.4471741852423235e-05, + "loss": 1125.0274, + "step": 9010 + }, + { + "ce_loss_10": 3.600440430641174, + "ce_loss_13": 3.522721529006958, + "ce_loss_2": 4.542108774185181, + "ce_loss_3": 4.282799339294433, + "ce_loss_7": 3.7742578268051146, + "epoch": 0.902, + "grad_norm": 524.0, + "kl_loss_10": 175.27555160522462, + "kl_loss_2": 2098.4271484375, + "kl_loss_3": 1637.176934814453, + "kl_loss_7": 587.0385147094727, + "learning_rate": 2.3983831139599287e-05, + "loss": 1139.7687, + "step": 9020 + }, + { + "ce_loss_10": 3.519875633716583, + "ce_loss_13": 3.446099603176117, + "ce_loss_2": 4.47457070350647, + "ce_loss_3": 4.212863862514496, + "ce_loss_7": 3.68597651720047, + "epoch": 0.903, + "grad_norm": 508.0, + "kl_loss_10": 174.20441055297852, + "kl_loss_2": 2129.7295166015624, + "kl_loss_3": 1661.6650817871093, + "kl_loss_7": 579.8323806762695, + "learning_rate": 2.3500713904311022e-05, + "loss": 1116.7298, + "step": 9030 + }, + { + "ce_loss_10": 3.5635103940963746, + "ce_loss_13": 3.4901776790618895, + "ce_loss_2": 4.492858815193176, + "ce_loss_3": 4.233377468585968, + "ce_loss_7": 3.7266834378242493, + "epoch": 0.904, + "grad_norm": 568.0, + "kl_loss_10": 172.6203300476074, + "kl_loss_2": 2067.9869079589844, + "kl_loss_3": 1612.4867309570313, + "kl_loss_7": 575.5206619262696, + "learning_rate": 2.3022395011543685e-05, + "loss": 1119.9885, + "step": 9040 + }, + { + "ce_loss_10": 3.592438757419586, + "ce_loss_13": 3.515104389190674, + "ce_loss_2": 4.541802954673767, + "ce_loss_3": 4.281132400035858, + "ce_loss_7": 3.7722238898277283, + "epoch": 0.905, + "grad_norm": 572.0, + "kl_loss_10": 180.47207794189453, + "kl_loss_2": 2144.9797241210936, + "kl_loss_3": 1672.9977722167969, + "kl_loss_7": 611.6667938232422, + "learning_rate": 2.2548879277963063e-05, + "loss": 1176.2332, + "step": 9050 + }, + { + "ce_loss_10": 3.5052724361419676, + "ce_loss_13": 3.43240772485733, + "ce_loss_2": 4.459889388084411, + "ce_loss_3": 4.187293374538422, + "ce_loss_7": 3.677665722370148, + "epoch": 0.906, + "grad_norm": 536.0, + "kl_loss_10": 175.61516647338868, + "kl_loss_2": 2136.0928955078125, + "kl_loss_3": 1652.1388549804688, + "kl_loss_7": 587.7709274291992, + "learning_rate": 2.208017147186736e-05, + "loss": 1112.7982, + "step": 9060 + }, + { + "ce_loss_10": 3.5033626675605776, + "ce_loss_13": 3.4270050883293153, + "ce_loss_2": 4.460723853111267, + "ce_loss_3": 4.200226056575775, + "ce_loss_7": 3.675853359699249, + "epoch": 0.907, + "grad_norm": 532.0, + "kl_loss_10": 175.80412521362305, + "kl_loss_2": 2135.014111328125, + "kl_loss_3": 1673.9324096679688, + "kl_loss_7": 594.3165740966797, + "learning_rate": 2.1616276313139227e-05, + "loss": 1130.9125, + "step": 9070 + }, + { + "ce_loss_10": 3.540289318561554, + "ce_loss_13": 3.4620949029922485, + "ce_loss_2": 4.504480719566345, + "ce_loss_3": 4.243532609939575, + "ce_loss_7": 3.7148184418678283, + "epoch": 0.908, + "grad_norm": 564.0, + "kl_loss_10": 176.60092849731444, + "kl_loss_2": 2138.388262939453, + "kl_loss_3": 1670.2328552246095, + "kl_loss_7": 593.4554962158203, + "learning_rate": 2.1157198473197415e-05, + "loss": 1155.7779, + "step": 9080 + }, + { + "ce_loss_10": 3.608424699306488, + "ce_loss_13": 3.5318522691726684, + "ce_loss_2": 4.569310665130615, + "ce_loss_3": 4.307307338714599, + "ce_loss_7": 3.7887983441352846, + "epoch": 0.909, + "grad_norm": 532.0, + "kl_loss_10": 179.34665451049804, + "kl_loss_2": 2147.910009765625, + "kl_loss_3": 1676.3675476074218, + "kl_loss_7": 609.1422882080078, + "learning_rate": 2.0702942574950812e-05, + "loss": 1150.5193, + "step": 9090 + }, + { + "ce_loss_10": 3.531160354614258, + "ce_loss_13": 3.4534537196159363, + "ce_loss_2": 4.502471184730529, + "ce_loss_3": 4.2366371870040895, + "ce_loss_7": 3.7111218690872194, + "epoch": 0.91, + "grad_norm": 576.0, + "kl_loss_10": 178.72406845092775, + "kl_loss_2": 2166.001983642578, + "kl_loss_3": 1694.5049133300781, + "kl_loss_7": 603.0742904663086, + "learning_rate": 2.025351319275137e-05, + "loss": 1154.2008, + "step": 9100 + }, + { + "ce_loss_10": 3.657759261131287, + "ce_loss_13": 3.5795228123664855, + "ce_loss_2": 4.611237382888794, + "ce_loss_3": 4.346826362609863, + "ce_loss_7": 3.829502213001251, + "epoch": 0.911, + "grad_norm": 568.0, + "kl_loss_10": 182.78317489624024, + "kl_loss_2": 2152.8356689453126, + "kl_loss_3": 1681.0592834472657, + "kl_loss_7": 612.198373413086, + "learning_rate": 1.9808914852347816e-05, + "loss": 1183.935, + "step": 9110 + }, + { + "ce_loss_10": 3.5076343536376955, + "ce_loss_13": 3.4301217675209044, + "ce_loss_2": 4.475468993186951, + "ce_loss_3": 4.200416827201844, + "ce_loss_7": 3.690832221508026, + "epoch": 0.912, + "grad_norm": 520.0, + "kl_loss_10": 177.99471740722657, + "kl_loss_2": 2138.6880798339844, + "kl_loss_3": 1648.8575317382813, + "kl_loss_7": 603.4748657226562, + "learning_rate": 1.9369152030840554e-05, + "loss": 1133.9587, + "step": 9120 + }, + { + "ce_loss_10": 3.5838579297065736, + "ce_loss_13": 3.5108367919921877, + "ce_loss_2": 4.5474550247192385, + "ce_loss_3": 4.282457900047302, + "ce_loss_7": 3.752550458908081, + "epoch": 0.913, + "grad_norm": 592.0, + "kl_loss_10": 175.99298171997071, + "kl_loss_2": 2175.4973083496093, + "kl_loss_3": 1708.1368530273437, + "kl_loss_7": 595.3381591796875, + "learning_rate": 1.893422915663645e-05, + "loss": 1154.3063, + "step": 9130 + }, + { + "ce_loss_10": 3.4526755094528196, + "ce_loss_13": 3.376223611831665, + "ce_loss_2": 4.463929057121277, + "ce_loss_3": 4.188671815395355, + "ce_loss_7": 3.6436040878295897, + "epoch": 0.914, + "grad_norm": 528.0, + "kl_loss_10": 178.96754608154296, + "kl_loss_2": 2226.8522216796873, + "kl_loss_3": 1741.089337158203, + "kl_loss_7": 614.4321655273437, + "learning_rate": 1.850415060940386e-05, + "loss": 1177.2793, + "step": 9140 + }, + { + "ce_loss_10": 3.577260196208954, + "ce_loss_13": 3.4996687054634092, + "ce_loss_2": 4.506159293651581, + "ce_loss_3": 4.247144281864166, + "ce_loss_7": 3.7475706696510316, + "epoch": 0.915, + "grad_norm": 576.0, + "kl_loss_10": 176.05650253295897, + "kl_loss_2": 2092.3647644042967, + "kl_loss_3": 1626.7418640136718, + "kl_loss_7": 590.4974914550781, + "learning_rate": 1.8078920720028978e-05, + "loss": 1136.8029, + "step": 9150 + }, + { + "ce_loss_10": 3.5006513595581055, + "ce_loss_13": 3.4293729782104494, + "ce_loss_2": 4.446831393241882, + "ce_loss_3": 4.180594873428345, + "ce_loss_7": 3.670674538612366, + "epoch": 0.916, + "grad_norm": 584.0, + "kl_loss_10": 173.59793243408203, + "kl_loss_2": 2105.566021728516, + "kl_loss_3": 1637.2901428222656, + "kl_loss_7": 585.1143585205078, + "learning_rate": 1.765854377057219e-05, + "loss": 1156.8438, + "step": 9160 + }, + { + "ce_loss_10": 3.4831743359565737, + "ce_loss_13": 3.410732936859131, + "ce_loss_2": 4.439985752105713, + "ce_loss_3": 4.173957622051239, + "ce_loss_7": 3.652401328086853, + "epoch": 0.917, + "grad_norm": 552.0, + "kl_loss_10": 172.13598022460937, + "kl_loss_2": 2136.6128540039062, + "kl_loss_3": 1663.3346557617188, + "kl_loss_7": 585.5476837158203, + "learning_rate": 1.724302399422456e-05, + "loss": 1148.3008, + "step": 9170 + }, + { + "ce_loss_10": 3.4418306827545164, + "ce_loss_13": 3.365187871456146, + "ce_loss_2": 4.424201607704163, + "ce_loss_3": 4.15188490152359, + "ce_loss_7": 3.617412793636322, + "epoch": 0.918, + "grad_norm": 540.0, + "kl_loss_10": 181.21381607055665, + "kl_loss_2": 2192.6884948730467, + "kl_loss_3": 1711.2826416015625, + "kl_loss_7": 608.8610565185547, + "learning_rate": 1.683236557526574e-05, + "loss": 1171.7086, + "step": 9180 + }, + { + "ce_loss_10": 3.5525336861610413, + "ce_loss_13": 3.479281461238861, + "ce_loss_2": 4.4706899404525755, + "ce_loss_3": 4.209463405609131, + "ce_loss_7": 3.7199216723442077, + "epoch": 0.919, + "grad_norm": 552.0, + "kl_loss_10": 172.36394195556642, + "kl_loss_2": 2051.388214111328, + "kl_loss_3": 1592.7936218261718, + "kl_loss_7": 577.4165252685547, + "learning_rate": 1.6426572649021475e-05, + "loss": 1138.484, + "step": 9190 + }, + { + "ce_loss_10": 3.5873886704444886, + "ce_loss_13": 3.515676808357239, + "ce_loss_2": 4.504871940612793, + "ce_loss_3": 4.245928645133972, + "ce_loss_7": 3.7530444860458374, + "epoch": 0.92, + "grad_norm": 560.0, + "kl_loss_10": 175.5730728149414, + "kl_loss_2": 2067.4602783203127, + "kl_loss_3": 1611.9698425292968, + "kl_loss_7": 583.8149566650391, + "learning_rate": 1.6025649301821876e-05, + "loss": 1125.4826, + "step": 9200 + }, + { + "ce_loss_10": 3.579323208332062, + "ce_loss_13": 3.50703387260437, + "ce_loss_2": 4.493245768547058, + "ce_loss_3": 4.232775616645813, + "ce_loss_7": 3.7471976399421694, + "epoch": 0.921, + "grad_norm": 620.0, + "kl_loss_10": 177.73348236083984, + "kl_loss_2": 2084.493505859375, + "kl_loss_3": 1628.828369140625, + "kl_loss_7": 594.5904907226562, + "learning_rate": 1.5629599570960716e-05, + "loss": 1123.5703, + "step": 9210 + }, + { + "ce_loss_10": 3.482688879966736, + "ce_loss_13": 3.4090826153755187, + "ce_loss_2": 4.466846561431884, + "ce_loss_3": 4.195373678207398, + "ce_loss_7": 3.658596193790436, + "epoch": 0.922, + "grad_norm": 588.0, + "kl_loss_10": 175.80986251831055, + "kl_loss_2": 2196.6754943847654, + "kl_loss_3": 1711.6648010253907, + "kl_loss_7": 598.8134338378907, + "learning_rate": 1.5238427444654367e-05, + "loss": 1155.2326, + "step": 9220 + }, + { + "ce_loss_10": 3.543232810497284, + "ce_loss_13": 3.467606770992279, + "ce_loss_2": 4.491106653213501, + "ce_loss_3": 4.219079720973968, + "ce_loss_7": 3.710583233833313, + "epoch": 0.923, + "grad_norm": 548.0, + "kl_loss_10": 174.20623474121095, + "kl_loss_2": 2120.3849365234373, + "kl_loss_3": 1639.1658142089843, + "kl_loss_7": 584.3361358642578, + "learning_rate": 1.4852136862001764e-05, + "loss": 1130.8816, + "step": 9230 + }, + { + "ce_loss_10": 3.5088143348693848, + "ce_loss_13": 3.435594344139099, + "ce_loss_2": 4.446617817878723, + "ce_loss_3": 4.185671401023865, + "ce_loss_7": 3.681156051158905, + "epoch": 0.924, + "grad_norm": 584.0, + "kl_loss_10": 172.72551879882812, + "kl_loss_2": 2097.9775573730467, + "kl_loss_3": 1637.147705078125, + "kl_loss_7": 588.3473190307617, + "learning_rate": 1.4470731712944884e-05, + "loss": 1146.0963, + "step": 9240 + }, + { + "ce_loss_10": 3.5362769246101378, + "ce_loss_13": 3.461189365386963, + "ce_loss_2": 4.501055717468262, + "ce_loss_3": 4.224046432971955, + "ce_loss_7": 3.714902651309967, + "epoch": 0.925, + "grad_norm": 548.0, + "kl_loss_10": 178.0212043762207, + "kl_loss_2": 2145.880969238281, + "kl_loss_3": 1660.7241577148438, + "kl_loss_7": 597.9191680908203, + "learning_rate": 1.4094215838229174e-05, + "loss": 1173.0712, + "step": 9250 + }, + { + "ce_loss_10": 3.498918581008911, + "ce_loss_13": 3.4251022219657896, + "ce_loss_2": 4.4930708646774296, + "ce_loss_3": 4.217134141921997, + "ce_loss_7": 3.675217390060425, + "epoch": 0.926, + "grad_norm": 628.0, + "kl_loss_10": 177.476513671875, + "kl_loss_2": 2207.0631896972654, + "kl_loss_3": 1717.4895629882812, + "kl_loss_7": 603.3580856323242, + "learning_rate": 1.372259302936546e-05, + "loss": 1205.574, + "step": 9260 + }, + { + "ce_loss_10": 3.6163718700408936, + "ce_loss_13": 3.537315881252289, + "ce_loss_2": 4.571975326538086, + "ce_loss_3": 4.302231848239899, + "ce_loss_7": 3.783820962905884, + "epoch": 0.927, + "grad_norm": 576.0, + "kl_loss_10": 181.85763092041014, + "kl_loss_2": 2140.0186767578125, + "kl_loss_3": 1662.9603210449218, + "kl_loss_7": 600.1376983642579, + "learning_rate": 1.3355867028591206e-05, + "loss": 1136.2721, + "step": 9270 + }, + { + "ce_loss_10": 3.514796030521393, + "ce_loss_13": 3.440743112564087, + "ce_loss_2": 4.449939727783203, + "ce_loss_3": 4.1810842752456665, + "ce_loss_7": 3.683949875831604, + "epoch": 0.928, + "grad_norm": 564.0, + "kl_loss_10": 175.13030853271485, + "kl_loss_2": 2107.975048828125, + "kl_loss_3": 1633.0731262207032, + "kl_loss_7": 589.7343017578125, + "learning_rate": 1.2994041528833267e-05, + "loss": 1127.6617, + "step": 9280 + }, + { + "ce_loss_10": 3.5171499490737914, + "ce_loss_13": 3.440678071975708, + "ce_loss_2": 4.470655179023742, + "ce_loss_3": 4.200739192962646, + "ce_loss_7": 3.690466821193695, + "epoch": 0.929, + "grad_norm": 584.0, + "kl_loss_10": 174.82103958129883, + "kl_loss_2": 2150.8757751464846, + "kl_loss_3": 1677.1400146484375, + "kl_loss_7": 592.8601806640625, + "learning_rate": 1.2637120173670358e-05, + "loss": 1145.5388, + "step": 9290 + }, + { + "ce_loss_10": 3.5360547065734864, + "ce_loss_13": 3.459415102005005, + "ce_loss_2": 4.509140729904175, + "ce_loss_3": 4.243091595172882, + "ce_loss_7": 3.717022383213043, + "epoch": 0.93, + "grad_norm": 616.0, + "kl_loss_10": 177.1988067626953, + "kl_loss_2": 2160.6812377929687, + "kl_loss_3": 1681.2325256347656, + "kl_loss_7": 601.0645294189453, + "learning_rate": 1.2285106557296478e-05, + "loss": 1155.5311, + "step": 9300 + }, + { + "ce_loss_10": 3.4133058071136473, + "ce_loss_13": 3.340100085735321, + "ce_loss_2": 4.45502986907959, + "ce_loss_3": 4.176646625995636, + "ce_loss_7": 3.593162167072296, + "epoch": 0.931, + "grad_norm": 684.0, + "kl_loss_10": 177.049959564209, + "kl_loss_2": 2280.7558776855467, + "kl_loss_3": 1784.5312927246093, + "kl_loss_7": 606.032177734375, + "learning_rate": 1.1938004224484989e-05, + "loss": 1177.6771, + "step": 9310 + }, + { + "ce_loss_10": 3.6532346606254578, + "ce_loss_13": 3.5762033224105836, + "ce_loss_2": 4.59397509098053, + "ce_loss_3": 4.32731124162674, + "ce_loss_7": 3.8223699569702148, + "epoch": 0.932, + "grad_norm": 552.0, + "kl_loss_10": 179.57099151611328, + "kl_loss_2": 2131.067413330078, + "kl_loss_3": 1656.3818481445312, + "kl_loss_7": 601.6865631103516, + "learning_rate": 1.1595816670552429e-05, + "loss": 1167.2541, + "step": 9320 + }, + { + "ce_loss_10": 3.582180309295654, + "ce_loss_13": 3.5061400294303895, + "ce_loss_2": 4.524780786037445, + "ce_loss_3": 4.259365129470825, + "ce_loss_7": 3.747998225688934, + "epoch": 0.933, + "grad_norm": 568.0, + "kl_loss_10": 175.25698852539062, + "kl_loss_2": 2109.6074279785157, + "kl_loss_3": 1638.4892211914062, + "kl_loss_7": 583.7247375488281, + "learning_rate": 1.1258547341323699e-05, + "loss": 1126.3885, + "step": 9330 + }, + { + "ce_loss_10": 3.6068438053131104, + "ce_loss_13": 3.5321076273918153, + "ce_loss_2": 4.546216082572937, + "ce_loss_3": 4.2822174549102785, + "ce_loss_7": 3.7781530022621155, + "epoch": 0.934, + "grad_norm": 584.0, + "kl_loss_10": 177.80956649780273, + "kl_loss_2": 2142.086737060547, + "kl_loss_3": 1668.28828125, + "kl_loss_7": 595.8422210693359, + "learning_rate": 1.0926199633097156e-05, + "loss": 1139.4527, + "step": 9340 + }, + { + "ce_loss_10": 3.610630822181702, + "ce_loss_13": 3.540286922454834, + "ce_loss_2": 4.522486686706543, + "ce_loss_3": 4.263027024269104, + "ce_loss_7": 3.7751463413238526, + "epoch": 0.935, + "grad_norm": 568.0, + "kl_loss_10": 172.0374610900879, + "kl_loss_2": 2078.3720947265624, + "kl_loss_3": 1611.571875, + "kl_loss_7": 582.5087860107421, + "learning_rate": 1.0598776892610684e-05, + "loss": 1147.9141, + "step": 9350 + }, + { + "ce_loss_10": 3.4225520491600037, + "ce_loss_13": 3.350025403499603, + "ce_loss_2": 4.415711855888366, + "ce_loss_3": 4.137125706672668, + "ce_loss_7": 3.5997640252113343, + "epoch": 0.936, + "grad_norm": 552.0, + "kl_loss_10": 173.3252960205078, + "kl_loss_2": 2195.4003051757813, + "kl_loss_3": 1704.4373046875, + "kl_loss_7": 593.9061553955078, + "learning_rate": 1.0276282417007399e-05, + "loss": 1147.3932, + "step": 9360 + }, + { + "ce_loss_10": 3.581203269958496, + "ce_loss_13": 3.5097854137420654, + "ce_loss_2": 4.501006007194519, + "ce_loss_3": 4.243209981918335, + "ce_loss_7": 3.747337484359741, + "epoch": 0.937, + "grad_norm": 596.0, + "kl_loss_10": 171.74332122802736, + "kl_loss_2": 2068.851727294922, + "kl_loss_3": 1615.05986328125, + "kl_loss_7": 580.8333526611328, + "learning_rate": 9.958719453803277e-06, + "loss": 1127.7196, + "step": 9370 + }, + { + "ce_loss_10": 3.578359854221344, + "ce_loss_13": 3.504493975639343, + "ce_loss_2": 4.52911410331726, + "ce_loss_3": 4.265734839439392, + "ce_loss_7": 3.757120943069458, + "epoch": 0.938, + "grad_norm": 568.0, + "kl_loss_10": 176.9942184448242, + "kl_loss_2": 2134.3438110351562, + "kl_loss_3": 1664.9087463378905, + "kl_loss_7": 601.7982177734375, + "learning_rate": 9.646091200853802e-06, + "loss": 1132.6439, + "step": 9380 + }, + { + "ce_loss_10": 3.5366848587989805, + "ce_loss_13": 3.4644816398620604, + "ce_loss_2": 4.483027625083923, + "ce_loss_3": 4.215219330787659, + "ce_loss_7": 3.704088735580444, + "epoch": 0.939, + "grad_norm": 536.0, + "kl_loss_10": 172.74244918823243, + "kl_loss_2": 2099.378448486328, + "kl_loss_3": 1622.1510986328126, + "kl_loss_7": 583.5708526611328, + "learning_rate": 9.338400806321978e-06, + "loss": 1100.1545, + "step": 9390 + }, + { + "ce_loss_10": 3.571504032611847, + "ce_loss_13": 3.493305134773254, + "ce_loss_2": 4.510696125030518, + "ce_loss_3": 4.24888288974762, + "ce_loss_7": 3.742924678325653, + "epoch": 0.94, + "grad_norm": 516.0, + "kl_loss_10": 177.7894515991211, + "kl_loss_2": 2105.6502197265627, + "kl_loss_3": 1641.3450012207031, + "kl_loss_7": 590.5869018554688, + "learning_rate": 9.035651368646646e-06, + "loss": 1131.6762, + "step": 9400 + }, + { + "ce_loss_10": 3.572381889820099, + "ce_loss_13": 3.5001948475837708, + "ce_loss_2": 4.5045966625213625, + "ce_loss_3": 4.241596531867981, + "ce_loss_7": 3.742561626434326, + "epoch": 0.941, + "grad_norm": 612.0, + "kl_loss_10": 173.46388778686523, + "kl_loss_2": 2088.4161376953125, + "kl_loss_3": 1622.1473693847656, + "kl_loss_7": 583.22626953125, + "learning_rate": 8.737845936511335e-06, + "loss": 1133.2381, + "step": 9410 + }, + { + "ce_loss_10": 3.522591459751129, + "ce_loss_13": 3.446700024604797, + "ce_loss_2": 4.507854294776917, + "ce_loss_3": 4.236737239360809, + "ce_loss_7": 3.698591649532318, + "epoch": 0.942, + "grad_norm": 572.0, + "kl_loss_10": 178.92413635253905, + "kl_loss_2": 2186.715954589844, + "kl_loss_3": 1705.135888671875, + "kl_loss_7": 600.7923004150391, + "learning_rate": 8.444987508813451e-06, + "loss": 1149.7434, + "step": 9420 + }, + { + "ce_loss_10": 3.475817048549652, + "ce_loss_13": 3.3989389657974245, + "ce_loss_2": 4.473474383354187, + "ce_loss_3": 4.199087584018708, + "ce_loss_7": 3.6562391996383665, + "epoch": 0.943, + "grad_norm": 628.0, + "kl_loss_10": 179.55375061035156, + "kl_loss_2": 2263.5999450683594, + "kl_loss_3": 1768.8511352539062, + "kl_loss_7": 615.3503479003906, + "learning_rate": 8.157079034633974e-06, + "loss": 1178.9379, + "step": 9430 + }, + { + "ce_loss_10": 3.473416876792908, + "ce_loss_13": 3.40051189661026, + "ce_loss_2": 4.446794199943542, + "ce_loss_3": 4.178017342090607, + "ce_loss_7": 3.6506085276603697, + "epoch": 0.944, + "grad_norm": 552.0, + "kl_loss_10": 174.8175079345703, + "kl_loss_2": 2187.4151916503906, + "kl_loss_3": 1711.5777709960937, + "kl_loss_7": 600.643310546875, + "learning_rate": 7.874123413208145e-06, + "loss": 1147.171, + "step": 9440 + }, + { + "ce_loss_10": 3.445332610607147, + "ce_loss_13": 3.369913935661316, + "ce_loss_2": 4.434185910224914, + "ce_loss_3": 4.161804282665253, + "ce_loss_7": 3.6255853891372682, + "epoch": 0.945, + "grad_norm": 572.0, + "kl_loss_10": 175.52419662475586, + "kl_loss_2": 2184.3239013671873, + "kl_loss_3": 1698.89501953125, + "kl_loss_7": 598.6286346435547, + "learning_rate": 7.59612349389599e-06, + "loss": 1155.226, + "step": 9450 + }, + { + "ce_loss_10": 3.534553039073944, + "ce_loss_13": 3.462403440475464, + "ce_loss_2": 4.465369653701782, + "ce_loss_3": 4.198646211624146, + "ce_loss_7": 3.708206284046173, + "epoch": 0.946, + "grad_norm": 580.0, + "kl_loss_10": 172.39571685791014, + "kl_loss_2": 2074.147509765625, + "kl_loss_3": 1600.0551879882812, + "kl_loss_7": 581.5289535522461, + "learning_rate": 7.323082076153509e-06, + "loss": 1126.5964, + "step": 9460 + }, + { + "ce_loss_10": 3.576580452919006, + "ce_loss_13": 3.503724229335785, + "ce_loss_2": 4.510397839546203, + "ce_loss_3": 4.244443106651306, + "ce_loss_7": 3.7473479986190794, + "epoch": 0.947, + "grad_norm": 572.0, + "kl_loss_10": 179.0567657470703, + "kl_loss_2": 2087.8162841796875, + "kl_loss_3": 1617.9851928710937, + "kl_loss_7": 593.7520263671875, + "learning_rate": 7.055001909504755e-06, + "loss": 1153.5377, + "step": 9470 + }, + { + "ce_loss_10": 3.6078381657600405, + "ce_loss_13": 3.5327057957649233, + "ce_loss_2": 4.550836896896362, + "ce_loss_3": 4.287693047523499, + "ce_loss_7": 3.7842918038368225, + "epoch": 0.948, + "grad_norm": 616.0, + "kl_loss_10": 177.3563217163086, + "kl_loss_2": 2119.7028747558593, + "kl_loss_3": 1649.5453063964844, + "kl_loss_7": 593.8896911621093, + "learning_rate": 6.791885693514133e-06, + "loss": 1138.076, + "step": 9480 + }, + { + "ce_loss_10": 3.5228418946266173, + "ce_loss_13": 3.4471074819564818, + "ce_loss_2": 4.505965852737427, + "ce_loss_3": 4.226520001888275, + "ce_loss_7": 3.6976125478744506, + "epoch": 0.949, + "grad_norm": 544.0, + "kl_loss_10": 179.4360237121582, + "kl_loss_2": 2204.2863159179688, + "kl_loss_3": 1709.0836547851563, + "kl_loss_7": 603.0492492675781, + "learning_rate": 6.533736077758867e-06, + "loss": 1164.073, + "step": 9490 + }, + { + "ce_loss_10": 3.480616366863251, + "ce_loss_13": 3.4073901891708376, + "ce_loss_2": 4.4987491250038145, + "ce_loss_3": 4.22996586561203, + "ce_loss_7": 3.6651357769966126, + "epoch": 0.95, + "grad_norm": 596.0, + "kl_loss_10": 179.7611167907715, + "kl_loss_2": 2254.7767639160156, + "kl_loss_3": 1772.1949829101563, + "kl_loss_7": 613.6569396972657, + "learning_rate": 6.2805556618028556e-06, + "loss": 1174.4523, + "step": 9500 + }, + { + "ce_loss_10": 3.5753507733345034, + "ce_loss_13": 3.5015788078308105, + "ce_loss_2": 4.508717775344849, + "ce_loss_3": 4.236187517642975, + "ce_loss_7": 3.735841393470764, + "epoch": 0.951, + "grad_norm": 600.0, + "kl_loss_10": 171.3624740600586, + "kl_loss_2": 2070.820721435547, + "kl_loss_3": 1594.0807983398438, + "kl_loss_7": 569.2584503173828, + "learning_rate": 6.032346995169968e-06, + "loss": 1091.2504, + "step": 9510 + }, + { + "ce_loss_10": 3.5802130341529845, + "ce_loss_13": 3.505545949935913, + "ce_loss_2": 4.52126247882843, + "ce_loss_3": 4.2531510353088375, + "ce_loss_7": 3.7508664727211, + "epoch": 0.952, + "grad_norm": 572.0, + "kl_loss_10": 175.58048248291016, + "kl_loss_2": 2115.77294921875, + "kl_loss_3": 1640.845928955078, + "kl_loss_7": 590.4191436767578, + "learning_rate": 5.789112577318789e-06, + "loss": 1125.406, + "step": 9520 + }, + { + "ce_loss_10": 3.5525818467140198, + "ce_loss_13": 3.474942719936371, + "ce_loss_2": 4.529764556884766, + "ce_loss_3": 4.262361979484558, + "ce_loss_7": 3.725843298435211, + "epoch": 0.953, + "grad_norm": 560.0, + "kl_loss_10": 178.63978881835936, + "kl_loss_2": 2187.347344970703, + "kl_loss_3": 1713.5655639648437, + "kl_loss_7": 604.028662109375, + "learning_rate": 5.550854857617194e-06, + "loss": 1138.3246, + "step": 9530 + }, + { + "ce_loss_10": 3.5410927653312685, + "ce_loss_13": 3.464462494850159, + "ce_loss_2": 4.532546710968018, + "ce_loss_3": 4.2611222743988035, + "ce_loss_7": 3.717101490497589, + "epoch": 0.954, + "grad_norm": 596.0, + "kl_loss_10": 179.71324462890624, + "kl_loss_2": 2218.49970703125, + "kl_loss_3": 1729.5723754882813, + "kl_loss_7": 606.5572113037109, + "learning_rate": 5.317576235317756e-06, + "loss": 1164.7152, + "step": 9540 + }, + { + "ce_loss_10": 3.567497718334198, + "ce_loss_13": 3.4959131717681884, + "ce_loss_2": 4.49013991355896, + "ce_loss_3": 4.226358330249786, + "ce_loss_7": 3.7339030742645263, + "epoch": 0.955, + "grad_norm": 580.0, + "kl_loss_10": 171.57061843872071, + "kl_loss_2": 2045.03349609375, + "kl_loss_3": 1581.6937194824218, + "kl_loss_7": 573.2978149414063, + "learning_rate": 5.089279059533658e-06, + "loss": 1144.5578, + "step": 9550 + }, + { + "ce_loss_10": 3.6264307737350463, + "ce_loss_13": 3.549366092681885, + "ce_loss_2": 4.555849361419678, + "ce_loss_3": 4.291936588287354, + "ce_loss_7": 3.796077787876129, + "epoch": 0.956, + "grad_norm": 532.0, + "kl_loss_10": 180.97408142089844, + "kl_loss_2": 2101.3386962890627, + "kl_loss_3": 1636.1171264648438, + "kl_loss_7": 603.2738311767578, + "learning_rate": 4.865965629214819e-06, + "loss": 1128.9252, + "step": 9560 + }, + { + "ce_loss_10": 3.5740526914596558, + "ce_loss_13": 3.4976862549781798, + "ce_loss_2": 4.531564974784851, + "ce_loss_3": 4.273219418525696, + "ce_loss_7": 3.7471871614456176, + "epoch": 0.957, + "grad_norm": 496.0, + "kl_loss_10": 178.81691131591796, + "kl_loss_2": 2162.70166015625, + "kl_loss_3": 1696.7497314453126, + "kl_loss_7": 603.7053924560547, + "learning_rate": 4.6476381931251366e-06, + "loss": 1126.9263, + "step": 9570 + }, + { + "ce_loss_10": 3.5494153618812563, + "ce_loss_13": 3.475415658950806, + "ce_loss_2": 4.496997284889221, + "ce_loss_3": 4.230398142337799, + "ce_loss_7": 3.728412318229675, + "epoch": 0.958, + "grad_norm": 496.0, + "kl_loss_10": 176.03445205688476, + "kl_loss_2": 2117.532794189453, + "kl_loss_3": 1643.0480895996093, + "kl_loss_7": 594.0362182617188, + "learning_rate": 4.434298949819449e-06, + "loss": 1135.89, + "step": 9580 + }, + { + "ce_loss_10": 3.5075241327285767, + "ce_loss_13": 3.4301467418670653, + "ce_loss_2": 4.514026093482971, + "ce_loss_3": 4.236743009090423, + "ce_loss_7": 3.6883852958679197, + "epoch": 0.959, + "grad_norm": 584.0, + "kl_loss_10": 182.3026496887207, + "kl_loss_2": 2271.594354248047, + "kl_loss_3": 1773.125311279297, + "kl_loss_7": 624.3575988769531, + "learning_rate": 4.2259500476214406e-06, + "loss": 1183.4918, + "step": 9590 + }, + { + "ce_loss_10": 3.491668391227722, + "ce_loss_13": 3.4157654523849486, + "ce_loss_2": 4.465724205970764, + "ce_loss_3": 4.201405656337738, + "ce_loss_7": 3.665540862083435, + "epoch": 0.96, + "grad_norm": 556.0, + "kl_loss_10": 177.03348541259766, + "kl_loss_2": 2184.085583496094, + "kl_loss_3": 1717.5306762695313, + "kl_loss_7": 602.626220703125, + "learning_rate": 4.02259358460233e-06, + "loss": 1148.7472, + "step": 9600 + }, + { + "ce_loss_10": 3.5573193550109865, + "ce_loss_13": 3.4829213500022886, + "ce_loss_2": 4.506733560562134, + "ce_loss_3": 4.238800776004791, + "ce_loss_7": 3.7266565203666686, + "epoch": 0.961, + "grad_norm": 580.0, + "kl_loss_10": 176.3166290283203, + "kl_loss_2": 2114.0912475585938, + "kl_loss_3": 1637.3499328613282, + "kl_loss_7": 588.5086486816406, + "learning_rate": 3.8242316085594916e-06, + "loss": 1126.8451, + "step": 9610 + }, + { + "ce_loss_10": 3.447552573680878, + "ce_loss_13": 3.369247031211853, + "ce_loss_2": 4.473224306106568, + "ce_loss_3": 4.197393763065338, + "ce_loss_7": 3.6313952803611755, + "epoch": 0.962, + "grad_norm": 556.0, + "kl_loss_10": 180.42641220092773, + "kl_loss_2": 2280.7411865234376, + "kl_loss_3": 1787.7846984863281, + "kl_loss_7": 615.477572631836, + "learning_rate": 3.630866116995757e-06, + "loss": 1194.5547, + "step": 9620 + }, + { + "ce_loss_10": 3.5979113578796387, + "ce_loss_13": 3.5257344841957092, + "ce_loss_2": 4.537094449996948, + "ce_loss_3": 4.265922880172729, + "ce_loss_7": 3.7622151970863342, + "epoch": 0.963, + "grad_norm": 572.0, + "kl_loss_10": 174.69513320922852, + "kl_loss_2": 2105.5745544433594, + "kl_loss_3": 1622.0366455078124, + "kl_loss_7": 578.8991943359375, + "learning_rate": 3.4424990570994797e-06, + "loss": 1156.3669, + "step": 9630 + }, + { + "ce_loss_10": 3.585505282878876, + "ce_loss_13": 3.5114797711372376, + "ce_loss_2": 4.518644833564759, + "ce_loss_3": 4.256495106220245, + "ce_loss_7": 3.7576936960220335, + "epoch": 0.964, + "grad_norm": 482.0, + "kl_loss_10": 175.59745712280272, + "kl_loss_2": 2103.631524658203, + "kl_loss_3": 1631.6069641113281, + "kl_loss_7": 588.9240661621094, + "learning_rate": 3.2591323257248896e-06, + "loss": 1134.1978, + "step": 9640 + }, + { + "ce_loss_10": 3.437925660610199, + "ce_loss_13": 3.3662607192993166, + "ce_loss_2": 4.409651112556458, + "ce_loss_3": 4.150672721862793, + "ce_loss_7": 3.6122069478034975, + "epoch": 0.965, + "grad_norm": 556.0, + "kl_loss_10": 174.7218978881836, + "kl_loss_2": 2173.464489746094, + "kl_loss_3": 1704.9824951171875, + "kl_loss_7": 600.8370666503906, + "learning_rate": 3.0807677693729385e-06, + "loss": 1163.455, + "step": 9650 + }, + { + "ce_loss_10": 3.623323905467987, + "ce_loss_13": 3.55154949426651, + "ce_loss_2": 4.551669549942017, + "ce_loss_3": 4.290165424346924, + "ce_loss_7": 3.794328248500824, + "epoch": 0.966, + "grad_norm": 544.0, + "kl_loss_10": 174.09824600219727, + "kl_loss_2": 2080.361853027344, + "kl_loss_3": 1626.3007873535157, + "kl_loss_7": 584.5892837524414, + "learning_rate": 2.9074071841727055e-06, + "loss": 1115.8137, + "step": 9660 + }, + { + "ce_loss_10": 3.548972153663635, + "ce_loss_13": 3.4729049801826477, + "ce_loss_2": 4.494955968856812, + "ce_loss_3": 4.230167889595032, + "ce_loss_7": 3.730636739730835, + "epoch": 0.967, + "grad_norm": 632.0, + "kl_loss_10": 177.06267852783202, + "kl_loss_2": 2105.5685302734373, + "kl_loss_3": 1641.2366760253906, + "kl_loss_7": 599.4572601318359, + "learning_rate": 2.739052315863355e-06, + "loss": 1112.1609, + "step": 9670 + }, + { + "ce_loss_10": 3.5363902688026427, + "ce_loss_13": 3.4610472440719606, + "ce_loss_2": 4.502471828460694, + "ce_loss_3": 4.230240440368652, + "ce_loss_7": 3.7059998750686645, + "epoch": 0.968, + "grad_norm": 560.0, + "kl_loss_10": 176.56764450073243, + "kl_loss_2": 2152.0122924804687, + "kl_loss_3": 1676.3801025390626, + "kl_loss_7": 591.2061340332032, + "learning_rate": 2.5757048597765396e-06, + "loss": 1135.4543, + "step": 9680 + }, + { + "ce_loss_10": 3.5459084630012514, + "ce_loss_13": 3.4721821188926696, + "ce_loss_2": 4.505685806274414, + "ce_loss_3": 4.235912537574768, + "ce_loss_7": 3.722131609916687, + "epoch": 0.969, + "grad_norm": 560.0, + "kl_loss_10": 176.31484680175782, + "kl_loss_2": 2142.29345703125, + "kl_loss_3": 1672.873828125, + "kl_loss_7": 599.0281616210938, + "learning_rate": 2.417366460819359e-06, + "loss": 1141.189, + "step": 9690 + }, + { + "ce_loss_10": 3.5568428516387938, + "ce_loss_13": 3.47944039106369, + "ce_loss_2": 4.546383309364319, + "ce_loss_3": 4.280533790588379, + "ce_loss_7": 3.73818119764328, + "epoch": 0.97, + "grad_norm": 592.0, + "kl_loss_10": 181.22289581298827, + "kl_loss_2": 2223.5619262695313, + "kl_loss_3": 1743.523046875, + "kl_loss_7": 618.0255676269531, + "learning_rate": 2.2640387134577057e-06, + "loss": 1150.9949, + "step": 9700 + }, + { + "ce_loss_10": 3.4835644006729125, + "ce_loss_13": 3.409128963947296, + "ce_loss_2": 4.400413775444031, + "ce_loss_3": 4.1408212065696715, + "ce_loss_7": 3.6511133790016173, + "epoch": 0.971, + "grad_norm": 584.0, + "kl_loss_10": 169.66612396240234, + "kl_loss_2": 2037.37294921875, + "kl_loss_3": 1580.9468200683593, + "kl_loss_7": 575.112336730957, + "learning_rate": 2.115723161700278e-06, + "loss": 1111.2564, + "step": 9710 + }, + { + "ce_loss_10": 3.462701106071472, + "ce_loss_13": 3.383505952358246, + "ce_loss_2": 4.462756657600403, + "ce_loss_3": 4.1902328610420225, + "ce_loss_7": 3.6453136444091796, + "epoch": 0.972, + "grad_norm": 676.0, + "kl_loss_10": 180.0776268005371, + "kl_loss_2": 2223.634521484375, + "kl_loss_3": 1740.0434143066407, + "kl_loss_7": 612.3085083007812, + "learning_rate": 1.9724212990830937e-06, + "loss": 1170.462, + "step": 9720 + }, + { + "ce_loss_10": 3.6076322913169863, + "ce_loss_13": 3.532732355594635, + "ce_loss_2": 4.577161026000977, + "ce_loss_3": 4.311069667339325, + "ce_loss_7": 3.7834354996681214, + "epoch": 0.973, + "grad_norm": 488.0, + "kl_loss_10": 178.08698196411132, + "kl_loss_2": 2168.2475769042967, + "kl_loss_3": 1699.6189208984374, + "kl_loss_7": 598.9332580566406, + "learning_rate": 1.8341345686543331e-06, + "loss": 1146.8779, + "step": 9730 + }, + { + "ce_loss_10": 3.5909879326820375, + "ce_loss_13": 3.5183821320533752, + "ce_loss_2": 4.50426287651062, + "ce_loss_3": 4.235173010826111, + "ce_loss_7": 3.757961595058441, + "epoch": 0.974, + "grad_norm": 548.0, + "kl_loss_10": 174.61135635375976, + "kl_loss_2": 2063.9743225097654, + "kl_loss_3": 1591.8313293457031, + "kl_loss_7": 585.6029083251954, + "learning_rate": 1.7008643629596864e-06, + "loss": 1145.0081, + "step": 9740 + }, + { + "ce_loss_10": 3.5759197235107423, + "ce_loss_13": 3.4986127734184267, + "ce_loss_2": 4.5397637486457825, + "ce_loss_3": 4.2685352802276615, + "ce_loss_7": 3.7446988224983215, + "epoch": 0.975, + "grad_norm": 552.0, + "kl_loss_10": 176.2814811706543, + "kl_loss_2": 2161.20859375, + "kl_loss_3": 1678.5180541992188, + "kl_loss_7": 590.2671813964844, + "learning_rate": 1.5726120240288633e-06, + "loss": 1164.5706, + "step": 9750 + }, + { + "ce_loss_10": 3.4757342100143434, + "ce_loss_13": 3.4012367367744445, + "ce_loss_2": 4.433041834831238, + "ce_loss_3": 4.165659952163696, + "ce_loss_7": 3.6462602019309998, + "epoch": 0.976, + "grad_norm": 572.0, + "kl_loss_10": 174.65177154541016, + "kl_loss_2": 2138.293341064453, + "kl_loss_3": 1655.4461547851563, + "kl_loss_7": 589.6313079833984, + "learning_rate": 1.4493788433612708e-06, + "loss": 1134.1515, + "step": 9760 + }, + { + "ce_loss_10": 3.5877037525177, + "ce_loss_13": 3.5132053971290587, + "ce_loss_2": 4.55386061668396, + "ce_loss_3": 4.287087714672088, + "ce_loss_7": 3.7638731479644774, + "epoch": 0.977, + "grad_norm": 536.0, + "kl_loss_10": 177.9455436706543, + "kl_loss_2": 2173.966436767578, + "kl_loss_3": 1692.082745361328, + "kl_loss_7": 599.7038208007813, + "learning_rate": 1.3311660619138578e-06, + "loss": 1161.4269, + "step": 9770 + }, + { + "ce_loss_10": 3.584187960624695, + "ce_loss_13": 3.510979926586151, + "ce_loss_2": 4.489086222648621, + "ce_loss_3": 4.228979337215423, + "ce_loss_7": 3.748577618598938, + "epoch": 0.978, + "grad_norm": 516.0, + "kl_loss_10": 176.20037689208985, + "kl_loss_2": 2033.9857421875, + "kl_loss_3": 1575.569403076172, + "kl_loss_7": 583.5555999755859, + "learning_rate": 1.2179748700879012e-06, + "loss": 1135.4594, + "step": 9780 + }, + { + "ce_loss_10": 3.516654706001282, + "ce_loss_13": 3.442041552066803, + "ce_loss_2": 4.460341954231263, + "ce_loss_3": 4.201344418525696, + "ce_loss_7": 3.6880866169929503, + "epoch": 0.979, + "grad_norm": 648.0, + "kl_loss_10": 175.97493591308594, + "kl_loss_2": 2106.5852966308594, + "kl_loss_3": 1640.5349182128907, + "kl_loss_7": 589.2118927001953, + "learning_rate": 1.1098064077174619e-06, + "loss": 1139.4391, + "step": 9790 + }, + { + "ce_loss_10": 3.548008131980896, + "ce_loss_13": 3.470580744743347, + "ce_loss_2": 4.531388640403748, + "ce_loss_3": 4.256609618663788, + "ce_loss_7": 3.7258023023605347, + "epoch": 0.98, + "grad_norm": 660.0, + "kl_loss_10": 175.85005264282228, + "kl_loss_2": 2184.833563232422, + "kl_loss_3": 1695.749658203125, + "kl_loss_7": 597.7893035888671, + "learning_rate": 1.006661764057837e-06, + "loss": 1144.1424, + "step": 9800 + }, + { + "ce_loss_10": 3.5516860127449035, + "ce_loss_13": 3.479186308383942, + "ce_loss_2": 4.507886123657227, + "ce_loss_3": 4.23874124288559, + "ce_loss_7": 3.7239818572998047, + "epoch": 0.981, + "grad_norm": 548.0, + "kl_loss_10": 174.7688331604004, + "kl_loss_2": 2140.183038330078, + "kl_loss_3": 1663.4087707519532, + "kl_loss_7": 592.6127227783203, + "learning_rate": 9.085419777743465e-07, + "loss": 1136.217, + "step": 9810 + }, + { + "ce_loss_10": 3.4896764159202576, + "ce_loss_13": 3.4188039541244506, + "ce_loss_2": 4.450884318351745, + "ce_loss_3": 4.184124147891998, + "ce_loss_7": 3.6670993685722353, + "epoch": 0.982, + "grad_norm": 476.0, + "kl_loss_10": 171.6952751159668, + "kl_loss_2": 2127.2390258789064, + "kl_loss_3": 1658.6653991699218, + "kl_loss_7": 588.117578125, + "learning_rate": 8.15448036932176e-07, + "loss": 1121.8644, + "step": 9820 + }, + { + "ce_loss_10": 3.542994940280914, + "ce_loss_13": 3.471325635910034, + "ce_loss_2": 4.491614294052124, + "ce_loss_3": 4.226279616355896, + "ce_loss_7": 3.716835379600525, + "epoch": 0.983, + "grad_norm": 580.0, + "kl_loss_10": 175.40776138305665, + "kl_loss_2": 2138.5871826171874, + "kl_loss_3": 1668.4995056152343, + "kl_loss_7": 599.0077606201172, + "learning_rate": 7.273808789862724e-07, + "loss": 1157.4876, + "step": 9830 + }, + { + "ce_loss_10": 3.62471262216568, + "ce_loss_13": 3.552128314971924, + "ce_loss_2": 4.560764002799988, + "ce_loss_3": 4.2987874269485475, + "ce_loss_7": 3.7973197221755983, + "epoch": 0.984, + "grad_norm": 536.0, + "kl_loss_10": 177.9404067993164, + "kl_loss_2": 2121.9407958984375, + "kl_loss_3": 1649.4432312011718, + "kl_loss_7": 593.6075317382813, + "learning_rate": 6.443413907720186e-07, + "loss": 1128.3074, + "step": 9840 + }, + { + "ce_loss_10": 3.553659164905548, + "ce_loss_13": 3.479843807220459, + "ce_loss_2": 4.502509045600891, + "ce_loss_3": 4.239866006374359, + "ce_loss_7": 3.7261658310890198, + "epoch": 0.985, + "grad_norm": 612.0, + "kl_loss_10": 175.90703582763672, + "kl_loss_2": 2105.8654174804688, + "kl_loss_3": 1643.8707397460937, + "kl_loss_7": 589.3217529296875, + "learning_rate": 5.663304084960185e-07, + "loss": 1125.6893, + "step": 9850 + }, + { + "ce_loss_10": 3.4857439756393434, + "ce_loss_13": 3.40972044467926, + "ce_loss_2": 4.458719778060913, + "ce_loss_3": 4.193828642368317, + "ce_loss_7": 3.661728310585022, + "epoch": 0.986, + "grad_norm": 544.0, + "kl_loss_10": 175.7668014526367, + "kl_loss_2": 2168.083819580078, + "kl_loss_3": 1695.883349609375, + "kl_loss_7": 599.47685546875, + "learning_rate": 4.933487177280482e-07, + "loss": 1132.0084, + "step": 9860 + }, + { + "ce_loss_10": 3.577410614490509, + "ce_loss_13": 3.50371458530426, + "ce_loss_2": 4.517120695114135, + "ce_loss_3": 4.256355273723602, + "ce_loss_7": 3.745650053024292, + "epoch": 0.987, + "grad_norm": 580.0, + "kl_loss_10": 172.69470291137696, + "kl_loss_2": 2116.2484741210938, + "kl_loss_3": 1646.7192932128905, + "kl_loss_7": 586.3196258544922, + "learning_rate": 4.2539705339295075e-07, + "loss": 1129.2027, + "step": 9870 + }, + { + "ce_loss_10": 3.4351974010467528, + "ce_loss_13": 3.359704864025116, + "ce_loss_2": 4.414662563800812, + "ce_loss_3": 4.1485153317451475, + "ce_loss_7": 3.614791524410248, + "epoch": 0.988, + "grad_norm": 624.0, + "kl_loss_10": 176.81834564208984, + "kl_loss_2": 2189.1237670898436, + "kl_loss_3": 1714.1568420410156, + "kl_loss_7": 602.8686370849609, + "learning_rate": 3.6247609976319816e-07, + "loss": 1142.2324, + "step": 9880 + }, + { + "ce_loss_10": 3.5325068116188048, + "ce_loss_13": 3.4560230016708373, + "ce_loss_2": 4.515677762031555, + "ce_loss_3": 4.241289448738098, + "ce_loss_7": 3.7140289902687074, + "epoch": 0.989, + "grad_norm": 644.0, + "kl_loss_10": 178.62700347900392, + "kl_loss_2": 2181.043316650391, + "kl_loss_3": 1701.3241455078125, + "kl_loss_7": 601.8407318115235, + "learning_rate": 3.0458649045211895e-07, + "loss": 1177.6322, + "step": 9890 + }, + { + "ce_loss_10": 3.505313539505005, + "ce_loss_13": 3.4275246262550354, + "ce_loss_2": 4.470349764823913, + "ce_loss_3": 4.199915885925293, + "ce_loss_7": 3.687111556529999, + "epoch": 0.99, + "grad_norm": 628.0, + "kl_loss_10": 179.9844207763672, + "kl_loss_2": 2144.779681396484, + "kl_loss_3": 1664.5868041992187, + "kl_loss_7": 610.2965026855469, + "learning_rate": 2.517288084074587e-07, + "loss": 1173.5785, + "step": 9900 + }, + { + "ce_loss_10": 3.541435408592224, + "ce_loss_13": 3.4641653418540956, + "ce_loss_2": 4.540918755531311, + "ce_loss_3": 4.268487918376922, + "ce_loss_7": 3.728367364406586, + "epoch": 0.991, + "grad_norm": 544.0, + "kl_loss_10": 181.58360061645507, + "kl_loss_2": 2223.09423828125, + "kl_loss_3": 1733.728173828125, + "kl_loss_7": 618.8083801269531, + "learning_rate": 2.0390358590538505e-07, + "loss": 1164.2306, + "step": 9910 + }, + { + "ce_loss_10": 3.5465844750404356, + "ce_loss_13": 3.4692795395851137, + "ce_loss_2": 4.505300617218017, + "ce_loss_3": 4.238151812553406, + "ce_loss_7": 3.7215544462203978, + "epoch": 0.992, + "grad_norm": 516.0, + "kl_loss_10": 178.79893417358397, + "kl_loss_2": 2149.3738037109374, + "kl_loss_3": 1683.4728881835938, + "kl_loss_7": 602.8562408447266, + "learning_rate": 1.61111304545436e-07, + "loss": 1139.9141, + "step": 9920 + }, + { + "ce_loss_10": 3.5144612431526183, + "ce_loss_13": 3.439807415008545, + "ce_loss_2": 4.468925881385803, + "ce_loss_3": 4.204157900810242, + "ce_loss_7": 3.68552029132843, + "epoch": 0.993, + "grad_norm": 524.0, + "kl_loss_10": 174.9011474609375, + "kl_loss_2": 2131.701556396484, + "kl_loss_3": 1667.3637084960938, + "kl_loss_7": 591.8195831298829, + "learning_rate": 1.2335239524541298e-07, + "loss": 1123.1069, + "step": 9930 + }, + { + "ce_loss_10": 3.485284912586212, + "ce_loss_13": 3.4107711553573608, + "ce_loss_2": 4.4413145065307615, + "ce_loss_3": 4.1761764764785765, + "ce_loss_7": 3.658044862747192, + "epoch": 0.994, + "grad_norm": 552.0, + "kl_loss_10": 174.74987030029297, + "kl_loss_2": 2137.9515625, + "kl_loss_3": 1659.8411071777343, + "kl_loss_7": 590.9619750976562, + "learning_rate": 9.06272382371065e-08, + "loss": 1140.1338, + "step": 9940 + }, + { + "ce_loss_10": 3.5549147844314577, + "ce_loss_13": 3.482628679275513, + "ce_loss_2": 4.527088284492493, + "ce_loss_3": 4.2653639078140255, + "ce_loss_7": 3.7300615668296815, + "epoch": 0.995, + "grad_norm": 540.0, + "kl_loss_10": 177.89019927978515, + "kl_loss_2": 2179.192108154297, + "kl_loss_3": 1710.0057861328125, + "kl_loss_7": 601.8619506835937, + "learning_rate": 6.293616306246586e-08, + "loss": 1148.1468, + "step": 9950 + }, + { + "ce_loss_10": 3.5492191195487974, + "ce_loss_13": 3.4784142851829527, + "ce_loss_2": 4.47113618850708, + "ce_loss_3": 4.207776916027069, + "ce_loss_7": 3.7163458704948424, + "epoch": 0.996, + "grad_norm": 568.0, + "kl_loss_10": 171.20833358764648, + "kl_loss_2": 2067.156182861328, + "kl_loss_3": 1607.7954223632812, + "kl_loss_7": 575.8024002075196, + "learning_rate": 4.027944857032395e-08, + "loss": 1102.1236, + "step": 9960 + }, + { + "ce_loss_10": 3.5417333483695983, + "ce_loss_13": 3.4737359166145323, + "ce_loss_2": 4.454948210716248, + "ce_loss_3": 4.189112281799316, + "ce_loss_7": 3.7030033111572265, + "epoch": 0.997, + "grad_norm": 564.0, + "kl_loss_10": 169.30588455200194, + "kl_loss_2": 2030.570849609375, + "kl_loss_3": 1568.8222778320312, + "kl_loss_7": 562.5833770751954, + "learning_rate": 2.265732291356626e-08, + "loss": 1096.3691, + "step": 9970 + }, + { + "ce_loss_10": 3.5887541651725767, + "ce_loss_13": 3.5155721068382264, + "ce_loss_2": 4.518339204788208, + "ce_loss_3": 4.2516262292861935, + "ce_loss_7": 3.7596161723136903, + "epoch": 0.998, + "grad_norm": 516.0, + "kl_loss_10": 174.8034523010254, + "kl_loss_2": 2081.462506103516, + "kl_loss_3": 1607.8740112304688, + "kl_loss_7": 584.5911361694336, + "learning_rate": 1.0069963546743833e-08, + "loss": 1138.0035, + "step": 9980 + }, + { + "ce_loss_10": 3.567852771282196, + "ce_loss_13": 3.4926111340522765, + "ce_loss_2": 4.526482367515564, + "ce_loss_3": 4.2642577409744264, + "ce_loss_7": 3.741330122947693, + "epoch": 0.999, + "grad_norm": 504.0, + "kl_loss_10": 177.2101951599121, + "kl_loss_2": 2140.8559020996095, + "kl_loss_3": 1666.8830322265626, + "kl_loss_7": 597.1717834472656, + "learning_rate": 2.517497224463483e-09, + "loss": 1140.1191, + "step": 9990 + }, + { + "ce_loss_10": 3.5264371991157533, + "ce_loss_13": 3.450861382484436, + "ce_loss_2": 4.53892297744751, + "ce_loss_3": 4.266285753250122, + "ce_loss_7": 3.7066094994544985, + "epoch": 1.0, + "grad_norm": 580.0, + "kl_loss_10": 180.3290283203125, + "kl_loss_2": 2255.579718017578, + "kl_loss_3": 1769.6880432128905, + "kl_loss_7": 615.3499603271484, + "learning_rate": 0.0, + "loss": 1181.1314, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.177819035608023e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}