{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_10": 6.949970483779907, "ce_loss_13": 3.5991063117980957, "ce_loss_2": 20.74317169189453, "ce_loss_3": 26.111305236816406, "ce_loss_7": 10.075343608856201, "epoch": 0.0001, "grad_norm": 212992.0, "kl_loss_10": 7864.61865234375, "kl_loss_2": 35348.310546875, "kl_loss_3": 46478.765625, "kl_loss_7": 14199.76806640625, "learning_rate": 1e-05, "loss": 25853.3086, "step": 1 }, { "ce_loss_10": 6.125355773501926, "ce_loss_13": 3.6540163622962103, "ce_loss_2": 12.076997624503242, "ce_loss_3": 15.207524087693956, "ce_loss_7": 7.174011654324001, "epoch": 0.001, "grad_norm": 17792.0, "kl_loss_10": 5945.64690483941, "kl_loss_2": 17211.485812717016, "kl_loss_3": 23727.339274088543, "kl_loss_7": 7859.595960828993, "learning_rate": 0.0001, "loss": 13522.6684, "step": 10 }, { "ce_loss_10": 4.603781843185425, "ce_loss_13": 3.6583157896995546, "ce_loss_2": 6.582165956497192, "ce_loss_3": 6.530840277671814, "ce_loss_7": 4.767995834350586, "epoch": 0.002, "grad_norm": 2416.0, "kl_loss_10": 2069.933459472656, "kl_loss_2": 5383.958471679687, "kl_loss_3": 5293.375524902343, "kl_loss_7": 2034.765203857422, "learning_rate": 0.0002, "loss": 3721.8547, "step": 20 }, { "ce_loss_10": 3.8755152463912963, "ce_loss_13": 3.440992832183838, "ce_loss_2": 5.676479697227478, "ce_loss_3": 5.412591814994812, "ce_loss_7": 4.082889425754547, "epoch": 0.003, "grad_norm": 1440.0, "kl_loss_10": 764.9044219970704, "kl_loss_2": 4085.6137573242186, "kl_loss_3": 3584.689697265625, "kl_loss_7": 1103.4823669433595, "learning_rate": 0.0003, "loss": 2353.6012, "step": 30 }, { "ce_loss_10": 3.9151899099349974, "ce_loss_13": 3.612821674346924, "ce_loss_2": 5.4220003366470335, "ce_loss_3": 5.159578323364258, "ce_loss_7": 4.096081411838531, "epoch": 0.004, "grad_norm": 2240.0, "kl_loss_10": 524.7813415527344, "kl_loss_2": 3348.9226196289064, "kl_loss_3": 2841.234338378906, "kl_loss_7": 838.199349975586, "learning_rate": 0.0004, "loss": 1895.9125, "step": 40 }, { "ce_loss_10": 3.8530999541282656, "ce_loss_13": 3.5993613958358766, "ce_loss_2": 5.269976806640625, "ce_loss_3": 4.972214603424073, "ce_loss_7": 4.020862734317779, "epoch": 0.005, "grad_norm": 1856.0, "kl_loss_10": 415.9984130859375, "kl_loss_2": 3121.74814453125, "kl_loss_3": 2545.165393066406, "kl_loss_7": 715.3964080810547, "learning_rate": 0.0005, "loss": 1695.3973, "step": 50 }, { "ce_loss_10": 3.8142170667648316, "ce_loss_13": 3.6055872440338135, "ce_loss_2": 5.112091851234436, "ce_loss_3": 4.877537202835083, "ce_loss_7": 3.9615157723426817, "epoch": 0.006, "grad_norm": 1256.0, "kl_loss_10": 352.13475494384767, "kl_loss_2": 2834.7232299804687, "kl_loss_3": 2411.520703125, "kl_loss_7": 626.5961456298828, "learning_rate": 0.0006, "loss": 1559.8957, "step": 60 }, { "ce_loss_10": 3.7009406328201293, "ce_loss_13": 3.519935369491577, "ce_loss_2": 4.975889682769775, "ce_loss_3": 4.767486214637756, "ce_loss_7": 3.847541904449463, "epoch": 0.007, "grad_norm": 1208.0, "kl_loss_10": 301.96008453369143, "kl_loss_2": 2753.304443359375, "kl_loss_3": 2332.9209411621096, "kl_loss_7": 579.1212310791016, "learning_rate": 0.0007, "loss": 1482.1832, "step": 70 }, { "ce_loss_10": 3.691651237010956, "ce_loss_13": 3.5254875898361204, "ce_loss_2": 4.9323248863220215, "ce_loss_3": 4.6815266609191895, "ce_loss_7": 3.918946826457977, "epoch": 0.008, "grad_norm": 1920.0, "kl_loss_10": 277.7585922241211, "kl_loss_2": 2673.962158203125, "kl_loss_3": 2198.915148925781, "kl_loss_7": 679.1366607666016, "learning_rate": 0.0008, "loss": 1458.9709, "step": 80 }, { "ce_loss_10": 3.630067002773285, "ce_loss_13": 3.48206342458725, "ce_loss_2": 4.835212993621826, "ce_loss_3": 4.615828561782837, "ce_loss_7": 3.8199189424514772, "epoch": 0.009, "grad_norm": 2336.0, "kl_loss_10": 252.8492774963379, "kl_loss_2": 2587.6504516601562, "kl_loss_3": 2134.85634765625, "kl_loss_7": 622.173226928711, "learning_rate": 0.0009000000000000001, "loss": 1391.4618, "step": 90 }, { "ce_loss_10": 3.7532737612724305, "ce_loss_13": 3.6053535461425783, "ce_loss_2": 4.9651381254196165, "ce_loss_3": 4.679249548912049, "ce_loss_7": 3.9317663073539735, "epoch": 0.01, "grad_norm": 2128.0, "kl_loss_10": 251.45062484741212, "kl_loss_2": 2572.321484375, "kl_loss_3": 2009.2118774414062, "kl_loss_7": 549.0976821899415, "learning_rate": 0.001, "loss": 1348.2805, "step": 100 }, { "ce_loss_10": 3.7507344126701354, "ce_loss_13": 3.560739505290985, "ce_loss_2": 4.900289678573609, "ce_loss_3": 4.575748300552368, "ce_loss_7": 3.855139982700348, "epoch": 0.011, "grad_norm": 2096.0, "kl_loss_10": 336.0966377258301, "kl_loss_2": 2548.9544921875, "kl_loss_3": 1914.9787902832031, "kl_loss_7": 503.8795822143555, "learning_rate": 0.0009999974825027757, "loss": 1319.618, "step": 110 }, { "ce_loss_10": 3.820546102523804, "ce_loss_13": 3.6168412566184998, "ce_loss_2": 4.895331883430481, "ce_loss_3": 4.579937386512756, "ce_loss_7": 3.905752420425415, "epoch": 0.012, "grad_norm": 1216.0, "kl_loss_10": 360.8111114501953, "kl_loss_2": 2423.57294921875, "kl_loss_3": 1821.830096435547, "kl_loss_7": 497.84374084472654, "learning_rate": 0.0009999899300364532, "loss": 1256.7569, "step": 120 }, { "ce_loss_10": 3.765466582775116, "ce_loss_13": 3.588362789154053, "ce_loss_2": 4.861788105964661, "ce_loss_3": 4.571296620368957, "ce_loss_7": 3.876194405555725, "epoch": 0.013, "grad_norm": 1920.0, "kl_loss_10": 300.5776092529297, "kl_loss_2": 2421.4876953125, "kl_loss_3": 1850.192041015625, "kl_loss_7": 510.1988525390625, "learning_rate": 0.0009999773426770863, "loss": 1278.616, "step": 130 }, { "ce_loss_10": 3.812940168380737, "ce_loss_13": 3.6267056345939634, "ce_loss_2": 4.856600952148438, "ce_loss_3": 4.555375063419342, "ce_loss_7": 3.9197404980659485, "epoch": 0.014, "grad_norm": 1104.0, "kl_loss_10": 334.16673736572267, "kl_loss_2": 2343.682012939453, "kl_loss_3": 1750.5664672851562, "kl_loss_7": 514.5185562133789, "learning_rate": 0.0009999597205514296, "loss": 1248.4314, "step": 140 }, { "ce_loss_10": 3.7693222880363466, "ce_loss_13": 3.5812222719192506, "ce_loss_2": 4.7746042013168335, "ce_loss_3": 4.491594767570495, "ce_loss_7": 3.9013134360313417, "epoch": 0.015, "grad_norm": 2000.0, "kl_loss_10": 301.1280906677246, "kl_loss_2": 2261.218878173828, "kl_loss_3": 1705.700830078125, "kl_loss_7": 572.2008193969726, "learning_rate": 0.0009999370638369377, "loss": 1215.427, "step": 150 }, { "ce_loss_10": 3.771867072582245, "ce_loss_13": 3.623634135723114, "ce_loss_2": 4.793287062644959, "ce_loss_3": 4.509058833122253, "ce_loss_7": 3.964562237262726, "epoch": 0.016, "grad_norm": 1736.0, "kl_loss_10": 262.3149169921875, "kl_loss_2": 2258.9556396484377, "kl_loss_3": 1685.8056640625, "kl_loss_7": 604.1481872558594, "learning_rate": 0.000999909372761763, "loss": 1207.4248, "step": 160 }, { "ce_loss_10": 3.702605497837067, "ce_loss_13": 3.555959129333496, "ce_loss_2": 4.72133858203888, "ce_loss_3": 4.451281535625458, "ce_loss_7": 3.8406598806381225, "epoch": 0.017, "grad_norm": 1536.0, "kl_loss_10": 242.87019119262695, "kl_loss_2": 2254.203680419922, "kl_loss_3": 1717.980206298828, "kl_loss_7": 507.3527557373047, "learning_rate": 0.0009998766476047546, "loss": 1188.5746, "step": 170 }, { "ce_loss_10": 3.7377680063247682, "ce_loss_13": 3.600107181072235, "ce_loss_2": 4.7649291276931764, "ce_loss_3": 4.480887150764465, "ce_loss_7": 3.8928799867630004, "epoch": 0.018, "grad_norm": 1096.0, "kl_loss_10": 231.40118408203125, "kl_loss_2": 2247.180828857422, "kl_loss_3": 1668.1500427246094, "kl_loss_7": 502.5091278076172, "learning_rate": 0.0009998388886954545, "loss": 1181.5367, "step": 180 }, { "ce_loss_10": 3.693929398059845, "ce_loss_13": 3.5655275106430055, "ce_loss_2": 4.720048952102661, "ce_loss_3": 4.425967907905578, "ce_loss_7": 3.8348891854286196, "epoch": 0.019, "grad_norm": 1032.0, "kl_loss_10": 214.6924057006836, "kl_loss_2": 2225.93642578125, "kl_loss_3": 1640.7731079101563, "kl_loss_7": 467.8557983398438, "learning_rate": 0.0009997960964140947, "loss": 1132.1148, "step": 190 }, { "ce_loss_10": 3.682847249507904, "ce_loss_13": 3.5613570332527162, "ce_loss_2": 4.7296292066574095, "ce_loss_3": 4.422930908203125, "ce_loss_7": 3.8146942615509034, "epoch": 0.02, "grad_norm": 1360.0, "kl_loss_10": 204.76239395141602, "kl_loss_2": 2234.7769409179687, "kl_loss_3": 1619.6412719726563, "kl_loss_7": 443.86146240234376, "learning_rate": 0.0009997482711915926, "loss": 1118.6208, "step": 200 }, { "ce_loss_10": 3.6386141061782835, "ce_loss_13": 3.523626208305359, "ce_loss_2": 4.654034543037414, "ce_loss_3": 4.351287698745727, "ce_loss_7": 3.7648990035057066, "epoch": 0.021, "grad_norm": 844.0, "kl_loss_10": 191.48220748901366, "kl_loss_2": 2176.9761169433596, "kl_loss_3": 1592.8676696777343, "kl_loss_7": 425.4680374145508, "learning_rate": 0.0009996954135095479, "loss": 1087.035, "step": 210 }, { "ce_loss_10": 3.726301395893097, "ce_loss_13": 3.613930583000183, "ce_loss_2": 4.689849066734314, "ce_loss_3": 4.408529257774353, "ce_loss_7": 3.8633771181106566, "epoch": 0.022, "grad_norm": 968.0, "kl_loss_10": 185.19210891723634, "kl_loss_2": 2058.4550598144533, "kl_loss_3": 1519.1773864746094, "kl_loss_7": 434.87823486328125, "learning_rate": 0.0009996375239002368, "loss": 1051.0784, "step": 220 }, { "ce_loss_10": 3.7977551460266112, "ce_loss_13": 3.6818463683128355, "ce_loss_2": 4.714341163635254, "ce_loss_3": 4.444690012931824, "ce_loss_7": 3.9216720938682554, "epoch": 0.023, "grad_norm": 792.0, "kl_loss_10": 197.30275802612306, "kl_loss_2": 1985.7976623535155, "kl_loss_3": 1460.261212158203, "kl_loss_7": 417.5539093017578, "learning_rate": 0.0009995746029466072, "loss": 1021.8153, "step": 230 }, { "ce_loss_10": 3.5866660118103026, "ce_loss_13": 3.465270149707794, "ce_loss_2": 4.572040939331055, "ce_loss_3": 4.279095077514649, "ce_loss_7": 3.7078741550445558, "epoch": 0.024, "grad_norm": 908.0, "kl_loss_10": 207.95580520629883, "kl_loss_2": 2143.852404785156, "kl_loss_3": 1571.8129760742188, "kl_loss_7": 426.2864517211914, "learning_rate": 0.0009995066512822719, "loss": 1050.4631, "step": 240 }, { "ce_loss_10": 3.686765193939209, "ce_loss_13": 3.5706915736198424, "ce_loss_2": 4.679925036430359, "ce_loss_3": 4.384693300724029, "ce_loss_7": 3.8067264676094057, "epoch": 0.025, "grad_norm": 1032.0, "kl_loss_10": 199.02285385131836, "kl_loss_2": 2131.8410888671874, "kl_loss_3": 1544.0210571289062, "kl_loss_7": 413.1264175415039, "learning_rate": 0.000999433669591504, "loss": 1033.4686, "step": 250 }, { "ce_loss_10": 3.581657183170319, "ce_loss_13": 3.472314774990082, "ce_loss_2": 4.541779208183288, "ce_loss_3": 4.317579293251038, "ce_loss_7": 3.7064756393432616, "epoch": 0.026, "grad_norm": 932.0, "kl_loss_10": 189.23829040527343, "kl_loss_2": 2088.3945861816405, "kl_loss_3": 1637.8548217773437, "kl_loss_7": 412.1736145019531, "learning_rate": 0.000999355658609228, "loss": 1057.2906, "step": 260 }, { "ce_loss_10": 3.6219969391822815, "ce_loss_13": 3.5048365235328673, "ce_loss_2": 4.596334934234619, "ce_loss_3": 4.441597318649292, "ce_loss_7": 3.7431845664978027, "epoch": 0.027, "grad_norm": 900.0, "kl_loss_10": 188.88880004882813, "kl_loss_2": 2095.5673889160157, "kl_loss_3": 1750.0010925292968, "kl_loss_7": 405.53798522949216, "learning_rate": 0.0009992726191210138, "loss": 1093.5438, "step": 270 }, { "ce_loss_10": 3.652155375480652, "ce_loss_13": 3.5428276896476745, "ce_loss_2": 4.579152154922485, "ce_loss_3": 4.383497536182404, "ce_loss_7": 3.789737546443939, "epoch": 0.028, "grad_norm": 780.0, "kl_loss_10": 187.1881446838379, "kl_loss_2": 2006.3188415527343, "kl_loss_3": 1629.3374816894532, "kl_loss_7": 423.36602783203125, "learning_rate": 0.0009991845519630679, "loss": 1050.2449, "step": 280 }, { "ce_loss_10": 3.535972011089325, "ce_loss_13": 3.427106332778931, "ce_loss_2": 4.4744978785514835, "ce_loss_3": 4.262858963012695, "ce_loss_7": 3.6743877053260805, "epoch": 0.029, "grad_norm": 684.0, "kl_loss_10": 179.29404220581054, "kl_loss_2": 2016.5242065429688, "kl_loss_3": 1582.6825744628907, "kl_loss_7": 441.13164978027345, "learning_rate": 0.0009990914580222257, "loss": 1053.0684, "step": 290 }, { "ce_loss_10": 3.668388879299164, "ce_loss_13": 3.567758929729462, "ce_loss_2": 4.53744785785675, "ce_loss_3": 4.309589576721192, "ce_loss_7": 3.8119096040725706, "epoch": 0.03, "grad_norm": 1224.0, "kl_loss_10": 187.29344177246094, "kl_loss_2": 1896.2646728515624, "kl_loss_3": 1456.2583312988281, "kl_loss_7": 421.26478881835936, "learning_rate": 0.0009989933382359422, "loss": 1015.491, "step": 300 }, { "ce_loss_10": 3.6792996883392335, "ce_loss_13": 3.5740679264068604, "ce_loss_2": 4.557365846633911, "ce_loss_3": 4.323324573040009, "ce_loss_7": 3.7865865588188172, "epoch": 0.031, "grad_norm": 828.0, "kl_loss_10": 187.5126724243164, "kl_loss_2": 1923.0036499023438, "kl_loss_3": 1442.6469970703124, "kl_loss_7": 384.39212493896486, "learning_rate": 0.0009988901935922825, "loss": 997.117, "step": 310 }, { "ce_loss_10": 3.5293397903442383, "ce_loss_13": 3.4199066400527953, "ce_loss_2": 4.486744737625122, "ce_loss_3": 4.227087867259979, "ce_loss_7": 3.6469008684158326, "epoch": 0.032, "grad_norm": 976.0, "kl_loss_10": 183.28864593505858, "kl_loss_2": 2055.861083984375, "kl_loss_3": 1531.5701293945312, "kl_loss_7": 385.79076690673827, "learning_rate": 0.0009987820251299122, "loss": 1008.4045, "step": 320 }, { "ce_loss_10": 3.66086140871048, "ce_loss_13": 3.556379699707031, "ce_loss_2": 4.536041283607483, "ce_loss_3": 4.270140862464904, "ce_loss_7": 3.770153260231018, "epoch": 0.033, "grad_norm": 1144.0, "kl_loss_10": 168.59372940063477, "kl_loss_2": 1906.9965759277343, "kl_loss_3": 1385.0498474121093, "kl_loss_7": 372.18479614257814, "learning_rate": 0.0009986688339380862, "loss": 957.1518, "step": 330 }, { "ce_loss_10": 3.6052905559539794, "ce_loss_13": 3.504980742931366, "ce_loss_2": 4.501095390319824, "ce_loss_3": 4.218203604221344, "ce_loss_7": 3.727604556083679, "epoch": 0.034, "grad_norm": 1104.0, "kl_loss_10": 164.38146286010743, "kl_loss_2": 1931.4434020996093, "kl_loss_3": 1379.435321044922, "kl_loss_7": 389.75638275146486, "learning_rate": 0.0009985506211566387, "loss": 969.0948, "step": 340 }, { "ce_loss_10": 3.6377886295318604, "ce_loss_13": 3.541017484664917, "ce_loss_2": 4.482600402832031, "ce_loss_3": 4.22925614118576, "ce_loss_7": 3.7690312385559084, "epoch": 0.035, "grad_norm": 988.0, "kl_loss_10": 158.43596343994142, "kl_loss_2": 1829.8166870117188, "kl_loss_3": 1337.818865966797, "kl_loss_7": 388.05591278076173, "learning_rate": 0.0009984273879759713, "loss": 933.1328, "step": 350 }, { "ce_loss_10": 3.667439329624176, "ce_loss_13": 3.5666789412498474, "ce_loss_2": 4.5066794633865355, "ce_loss_3": 4.2926198720932005, "ce_loss_7": 3.7826303958892824, "epoch": 0.036, "grad_norm": 600.0, "kl_loss_10": 162.84700927734374, "kl_loss_2": 1826.3444274902345, "kl_loss_3": 1395.6122314453125, "kl_loss_7": 384.24933471679685, "learning_rate": 0.0009982991356370402, "loss": 957.8976, "step": 360 }, { "ce_loss_10": 3.643305718898773, "ce_loss_13": 3.545375657081604, "ce_loss_2": 4.487171721458435, "ce_loss_3": 4.280910170078277, "ce_loss_7": 3.767821896076202, "epoch": 0.037, "grad_norm": 596.0, "kl_loss_10": 164.2067985534668, "kl_loss_2": 1829.6034606933595, "kl_loss_3": 1399.7697387695312, "kl_loss_7": 389.38902282714844, "learning_rate": 0.0009981658654313456, "loss": 945.4266, "step": 370 }, { "ce_loss_10": 3.728627920150757, "ce_loss_13": 3.628399407863617, "ce_loss_2": 4.530555677413941, "ce_loss_3": 4.30595852136612, "ce_loss_7": 3.83515260219574, "epoch": 0.038, "grad_norm": 572.0, "kl_loss_10": 166.8636932373047, "kl_loss_2": 1769.3647521972657, "kl_loss_3": 1309.9726135253907, "kl_loss_7": 360.24556121826174, "learning_rate": 0.000998027578700917, "loss": 918.2047, "step": 380 }, { "ce_loss_10": 3.6558377385139464, "ce_loss_13": 3.5584804892539976, "ce_loss_2": 4.499538516998291, "ce_loss_3": 4.255290400981903, "ce_loss_7": 3.7707452058792112, "epoch": 0.039, "grad_norm": 684.0, "kl_loss_10": 164.3509963989258, "kl_loss_2": 1842.500555419922, "kl_loss_3": 1353.7652526855468, "kl_loss_7": 364.5350601196289, "learning_rate": 0.0009978842768382998, "loss": 935.4773, "step": 390 }, { "ce_loss_10": 3.6760897040367126, "ce_loss_13": 3.5800448179244997, "ce_loss_2": 4.493572664260864, "ce_loss_3": 4.2423638820648195, "ce_loss_7": 3.786301875114441, "epoch": 0.04, "grad_norm": 968.0, "kl_loss_10": 161.01671752929687, "kl_loss_2": 1790.2952514648437, "kl_loss_3": 1298.0454895019532, "kl_loss_7": 363.45732421875, "learning_rate": 0.0009977359612865424, "loss": 914.3111, "step": 400 }, { "ce_loss_10": 3.684686779975891, "ce_loss_13": 3.586086595058441, "ce_loss_2": 4.512642502784729, "ce_loss_3": 4.255100309848785, "ce_loss_7": 3.805712080001831, "epoch": 0.041, "grad_norm": 724.0, "kl_loss_10": 161.0974250793457, "kl_loss_2": 1807.8360168457032, "kl_loss_3": 1310.046209716797, "kl_loss_7": 391.8801742553711, "learning_rate": 0.0009975826335391806, "loss": 914.0043, "step": 410 }, { "ce_loss_10": 3.707440197467804, "ce_loss_13": 3.604601538181305, "ce_loss_2": 4.522381353378296, "ce_loss_3": 4.265636503696442, "ce_loss_7": 3.822117471694946, "epoch": 0.042, "grad_norm": 900.0, "kl_loss_10": 166.57249908447267, "kl_loss_2": 1773.633642578125, "kl_loss_3": 1273.706396484375, "kl_loss_7": 380.59193420410156, "learning_rate": 0.0009974242951402235, "loss": 906.3268, "step": 420 }, { "ce_loss_10": 3.7127435922622682, "ce_loss_13": 3.6068360447883605, "ce_loss_2": 4.534731841087341, "ce_loss_3": 4.272397923469543, "ce_loss_7": 3.8254016041755676, "epoch": 0.043, "grad_norm": 544.0, "kl_loss_10": 171.76721878051757, "kl_loss_2": 1813.8242553710938, "kl_loss_3": 1297.1632202148437, "kl_loss_7": 380.753857421875, "learning_rate": 0.0009972609476841367, "loss": 907.3121, "step": 430 }, { "ce_loss_10": 3.638201355934143, "ce_loss_13": 3.521967649459839, "ce_loss_2": 4.476631236076355, "ce_loss_3": 4.207662534713745, "ce_loss_7": 3.743925619125366, "epoch": 0.044, "grad_norm": 656.0, "kl_loss_10": 205.51385726928712, "kl_loss_2": 1862.7595336914062, "kl_loss_3": 1318.6298767089843, "kl_loss_7": 397.6446823120117, "learning_rate": 0.0009970925928158272, "loss": 947.2434, "step": 440 }, { "ce_loss_10": 3.5770766854286196, "ce_loss_13": 3.463445019721985, "ce_loss_2": 4.41340719461441, "ce_loss_3": 4.154228365421295, "ce_loss_7": 3.683333933353424, "epoch": 0.045, "grad_norm": 544.0, "kl_loss_10": 187.16454544067383, "kl_loss_2": 1860.5320922851563, "kl_loss_3": 1349.3171264648438, "kl_loss_7": 389.9994171142578, "learning_rate": 0.000996919232230627, "loss": 931.1581, "step": 450 }, { "ce_loss_10": 3.6615111470222472, "ce_loss_13": 3.5475740671157836, "ce_loss_2": 4.4471900224685665, "ce_loss_3": 4.206307077407837, "ce_loss_7": 3.769719123840332, "epoch": 0.046, "grad_norm": 792.0, "kl_loss_10": 189.3802345275879, "kl_loss_2": 1767.1443359375, "kl_loss_3": 1299.2234252929688, "kl_loss_7": 404.1760650634766, "learning_rate": 0.0009967408676742752, "loss": 896.3932, "step": 460 }, { "ce_loss_10": 3.815341627597809, "ce_loss_13": 3.6976951956748962, "ce_loss_2": 4.575603008270264, "ce_loss_3": 4.349165272712708, "ce_loss_7": 3.926807904243469, "epoch": 0.047, "grad_norm": 1020.0, "kl_loss_10": 193.17176513671876, "kl_loss_2": 1722.3591735839843, "kl_loss_3": 1269.6951721191406, "kl_loss_7": 399.5799560546875, "learning_rate": 0.0009965575009429006, "loss": 911.5342, "step": 470 }, { "ce_loss_10": 3.5749866485595705, "ce_loss_13": 3.471819591522217, "ce_loss_2": 4.3897274255752565, "ce_loss_3": 4.163278090953827, "ce_loss_7": 3.6932021975517273, "epoch": 0.048, "grad_norm": 832.0, "kl_loss_10": 173.3515739440918, "kl_loss_2": 1803.337139892578, "kl_loss_3": 1356.6680847167968, "kl_loss_7": 384.8886749267578, "learning_rate": 0.0009963691338830043, "loss": 913.6404, "step": 480 }, { "ce_loss_10": 3.6706506490707396, "ce_loss_13": 3.5724706411361695, "ce_loss_2": 4.442422878742218, "ce_loss_3": 4.223198866844177, "ce_loss_7": 3.7754740715026855, "epoch": 0.049, "grad_norm": 664.0, "kl_loss_10": 163.7422233581543, "kl_loss_2": 1726.0343017578125, "kl_loss_3": 1283.6178405761718, "kl_loss_7": 355.8869354248047, "learning_rate": 0.0009961757683914405, "loss": 866.413, "step": 490 }, { "ce_loss_10": 3.657481300830841, "ce_loss_13": 3.561222219467163, "ce_loss_2": 4.412674343585968, "ce_loss_3": 4.190000641345978, "ce_loss_7": 3.7463939428329467, "epoch": 0.05, "grad_norm": 552.0, "kl_loss_10": 171.74871139526368, "kl_loss_2": 1693.8871337890625, "kl_loss_3": 1238.5546569824219, "kl_loss_7": 333.96338348388673, "learning_rate": 0.0009959774064153978, "loss": 867.9215, "step": 500 }, { "ce_loss_10": 3.6671042442321777, "ce_loss_13": 3.5669935941696167, "ce_loss_2": 4.402782237529754, "ce_loss_3": 4.179040241241455, "ce_loss_7": 3.7529300928115843, "epoch": 0.051, "grad_norm": 548.0, "kl_loss_10": 165.0301971435547, "kl_loss_2": 1649.0840270996093, "kl_loss_3": 1201.8943420410155, "kl_loss_7": 327.53272857666013, "learning_rate": 0.0009957740499523787, "loss": 850.5875, "step": 510 }, { "ce_loss_10": 3.692741870880127, "ce_loss_13": 3.5905726313591004, "ce_loss_2": 4.450686037540436, "ce_loss_3": 4.220798969268799, "ce_loss_7": 3.785287392139435, "epoch": 0.052, "grad_norm": 560.0, "kl_loss_10": 160.49609146118163, "kl_loss_2": 1681.6054443359376, "kl_loss_3": 1234.1128479003905, "kl_loss_7": 330.61391296386716, "learning_rate": 0.0009955657010501807, "loss": 859.9023, "step": 520 }, { "ce_loss_10": 3.654950940608978, "ce_loss_13": 3.554408383369446, "ce_loss_2": 4.4271773338317875, "ce_loss_3": 4.2007159948348995, "ce_loss_7": 3.7458335757255554, "epoch": 0.053, "grad_norm": 560.0, "kl_loss_10": 160.82289505004883, "kl_loss_2": 1731.140557861328, "kl_loss_3": 1270.9948364257812, "kl_loss_7": 331.9795379638672, "learning_rate": 0.000995352361806875, "loss": 862.8967, "step": 530 }, { "ce_loss_10": 3.6911896467208862, "ce_loss_13": 3.5907997369766234, "ce_loss_2": 4.458630633354187, "ce_loss_3": 4.2227191686630245, "ce_loss_7": 3.7843895673751833, "epoch": 0.054, "grad_norm": 552.0, "kl_loss_10": 166.47220001220703, "kl_loss_2": 1722.0740966796875, "kl_loss_3": 1249.600811767578, "kl_loss_7": 335.7519927978516, "learning_rate": 0.0009951340343707852, "loss": 876.934, "step": 540 }, { "ce_loss_10": 3.7539408445358275, "ce_loss_13": 3.6503811120986938, "ce_loss_2": 4.52553424835205, "ce_loss_3": 4.282987451553344, "ce_loss_7": 3.839770758152008, "epoch": 0.055, "grad_norm": 512.0, "kl_loss_10": 162.00789489746094, "kl_loss_2": 1707.214862060547, "kl_loss_3": 1233.2592041015625, "kl_loss_7": 323.39465484619143, "learning_rate": 0.0009949107209404665, "loss": 863.0879, "step": 550 }, { "ce_loss_10": 3.6489940643310548, "ce_loss_13": 3.5539053201675417, "ce_loss_2": 4.41527898311615, "ce_loss_3": 4.180786430835724, "ce_loss_7": 3.750082802772522, "epoch": 0.056, "grad_norm": 540.0, "kl_loss_10": 157.5300537109375, "kl_loss_2": 1703.8317993164062, "kl_loss_3": 1234.3482482910156, "kl_loss_7": 346.96667022705077, "learning_rate": 0.0009946824237646824, "loss": 859.4348, "step": 560 }, { "ce_loss_10": 3.5962815046310426, "ce_loss_13": 3.501141941547394, "ce_loss_2": 4.377828812599182, "ce_loss_3": 4.148308992385864, "ce_loss_7": 3.7191163897514343, "epoch": 0.057, "grad_norm": 764.0, "kl_loss_10": 153.23575592041016, "kl_loss_2": 1739.7751159667969, "kl_loss_3": 1272.3390563964845, "kl_loss_7": 396.63781890869143, "learning_rate": 0.0009944491451423828, "loss": 901.9479, "step": 570 }, { "ce_loss_10": 3.594892370700836, "ce_loss_13": 3.500366282463074, "ce_loss_2": 4.390602493286133, "ce_loss_3": 4.148435056209564, "ce_loss_7": 3.710537350177765, "epoch": 0.058, "grad_norm": 804.0, "kl_loss_10": 153.60133438110353, "kl_loss_2": 1753.4079895019531, "kl_loss_3": 1272.2657775878906, "kl_loss_7": 368.931379699707, "learning_rate": 0.0009942108874226813, "loss": 870.9764, "step": 580 }, { "ce_loss_10": 3.7256513595581056, "ce_loss_13": 3.6301231741905213, "ce_loss_2": 4.468952918052674, "ce_loss_3": 4.23498455286026, "ce_loss_7": 3.8281203866004945, "epoch": 0.059, "grad_norm": 494.0, "kl_loss_10": 155.00701828002929, "kl_loss_2": 1650.434881591797, "kl_loss_3": 1188.881414794922, "kl_loss_7": 349.0711242675781, "learning_rate": 0.00099396765300483, "loss": 829.2725, "step": 590 }, { "ce_loss_10": 3.688658046722412, "ce_loss_13": 3.600668156147003, "ce_loss_2": 4.441829895973205, "ce_loss_3": 4.204685604572296, "ce_loss_7": 3.7927687644958494, "epoch": 0.06, "grad_norm": 700.0, "kl_loss_10": 147.73722648620605, "kl_loss_2": 1665.102276611328, "kl_loss_3": 1201.0594848632813, "kl_loss_7": 336.5929977416992, "learning_rate": 0.0009937194443381972, "loss": 836.3632, "step": 600 }, { "ce_loss_10": 3.7074933648109436, "ce_loss_13": 3.6225223660469057, "ce_loss_2": 4.444479322433471, "ce_loss_3": 4.212475669384003, "ce_loss_7": 3.806171452999115, "epoch": 0.061, "grad_norm": 490.0, "kl_loss_10": 145.92314338684082, "kl_loss_2": 1647.2934875488281, "kl_loss_3": 1192.1070617675782, "kl_loss_7": 330.35746612548826, "learning_rate": 0.0009934662639222412, "loss": 841.5062, "step": 610 }, { "ce_loss_10": 3.6668009042739866, "ce_loss_13": 3.5791383743286134, "ce_loss_2": 4.436111927032471, "ce_loss_3": 4.192030191421509, "ce_loss_7": 3.7709102272987365, "epoch": 0.062, "grad_norm": 548.0, "kl_loss_10": 142.56752128601073, "kl_loss_2": 1707.9285888671875, "kl_loss_3": 1224.2419647216798, "kl_loss_7": 333.47724609375, "learning_rate": 0.000993208114306486, "loss": 843.8041, "step": 620 }, { "ce_loss_10": 3.5789570450782775, "ce_loss_13": 3.4927441477775574, "ce_loss_2": 4.355255722999573, "ce_loss_3": 4.113215839862823, "ce_loss_7": 3.6797728538513184, "epoch": 0.063, "grad_norm": 684.0, "kl_loss_10": 142.47375717163087, "kl_loss_2": 1703.1561950683595, "kl_loss_3": 1224.8568115234375, "kl_loss_7": 327.08728790283203, "learning_rate": 0.0009929449980904952, "loss": 827.3757, "step": 630 }, { "ce_loss_10": 3.6368979692459105, "ce_loss_13": 3.552669334411621, "ce_loss_2": 4.39526858329773, "ce_loss_3": 4.161888694763183, "ce_loss_7": 3.7305431842803953, "epoch": 0.064, "grad_norm": 604.0, "kl_loss_10": 145.31115531921387, "kl_loss_2": 1675.4742797851563, "kl_loss_3": 1206.004461669922, "kl_loss_7": 311.66287689208986, "learning_rate": 0.0009926769179238466, "loss": 830.4232, "step": 640 }, { "ce_loss_10": 3.708518397808075, "ce_loss_13": 3.6032424330711366, "ce_loss_2": 4.449502897262573, "ce_loss_3": 4.213514125347137, "ce_loss_7": 3.7848907709121704, "epoch": 0.065, "grad_norm": 572.0, "kl_loss_10": 183.44921951293946, "kl_loss_2": 1690.6953979492187, "kl_loss_3": 1209.2367431640625, "kl_loss_7": 320.96158905029296, "learning_rate": 0.000992403876506104, "loss": 845.6277, "step": 650 }, { "ce_loss_10": 3.6422240853309633, "ce_loss_13": 3.5376295328140257, "ce_loss_2": 4.388966178894043, "ce_loss_3": 4.148260116577148, "ce_loss_7": 3.721568763256073, "epoch": 0.066, "grad_norm": 516.0, "kl_loss_10": 166.38197479248046, "kl_loss_2": 1675.3920837402343, "kl_loss_3": 1201.2981964111327, "kl_loss_7": 311.2148132324219, "learning_rate": 0.0009921258765867918, "loss": 834.6085, "step": 660 }, { "ce_loss_10": 3.593488574028015, "ce_loss_13": 3.5049474120140074, "ce_loss_2": 4.357883477210999, "ce_loss_3": 4.115947949886322, "ce_loss_7": 3.6764505982398985, "epoch": 0.067, "grad_norm": 600.0, "kl_loss_10": 148.19011993408202, "kl_loss_2": 1717.049383544922, "kl_loss_3": 1223.6348205566405, "kl_loss_7": 306.2616958618164, "learning_rate": 0.0009918429209653662, "loss": 833.8985, "step": 670 }, { "ce_loss_10": 3.648161160945892, "ce_loss_13": 3.559934389591217, "ce_loss_2": 4.409848690032959, "ce_loss_3": 4.171260499954224, "ce_loss_7": 3.7374308466911317, "epoch": 0.068, "grad_norm": 596.0, "kl_loss_10": 147.48591995239258, "kl_loss_2": 1679.9023315429688, "kl_loss_3": 1208.9112182617187, "kl_loss_7": 313.83782348632815, "learning_rate": 0.0009915550124911866, "loss": 822.998, "step": 680 }, { "ce_loss_10": 3.6632981300354004, "ce_loss_13": 3.573338711261749, "ce_loss_2": 4.395955181121826, "ce_loss_3": 4.164679610729218, "ce_loss_7": 3.7496419668197634, "epoch": 0.069, "grad_norm": 636.0, "kl_loss_10": 148.64911651611328, "kl_loss_2": 1629.2107971191406, "kl_loss_3": 1186.3095611572267, "kl_loss_7": 309.2399566650391, "learning_rate": 0.0009912621540634887, "loss": 816.0117, "step": 690 }, { "ce_loss_10": 3.6952749490737915, "ce_loss_13": 3.608550024032593, "ce_loss_2": 4.3951560974121096, "ce_loss_3": 4.167996168136597, "ce_loss_7": 3.778964614868164, "epoch": 0.07, "grad_norm": 524.0, "kl_loss_10": 140.3430618286133, "kl_loss_2": 1575.8819396972656, "kl_loss_3": 1123.8694213867188, "kl_loss_7": 294.651708984375, "learning_rate": 0.0009909643486313534, "loss": 794.9152, "step": 700 }, { "ce_loss_10": 3.5606731176376343, "ce_loss_13": 3.4771942019462587, "ce_loss_2": 4.3175184488296505, "ce_loss_3": 4.074072551727295, "ce_loss_7": 3.650731146335602, "epoch": 0.071, "grad_norm": 600.0, "kl_loss_10": 135.5239990234375, "kl_loss_2": 1676.284442138672, "kl_loss_3": 1193.2137634277344, "kl_loss_7": 307.0430740356445, "learning_rate": 0.000990661599193678, "loss": 839.2205, "step": 710 }, { "ce_loss_10": 3.7052354335784914, "ce_loss_13": 3.6190937519073487, "ce_loss_2": 4.42795637845993, "ce_loss_3": 4.203339767456055, "ce_loss_7": 3.7865342020988466, "epoch": 0.072, "grad_norm": 708.0, "kl_loss_10": 139.11955757141112, "kl_loss_2": 1630.3473266601563, "kl_loss_3": 1169.5729766845702, "kl_loss_7": 299.42345809936523, "learning_rate": 0.0009903539087991462, "loss": 803.8498, "step": 720 }, { "ce_loss_10": 3.6689595699310305, "ce_loss_13": 3.586829674243927, "ce_loss_2": 4.399398994445801, "ce_loss_3": 4.174283814430237, "ce_loss_7": 3.7554702758789062, "epoch": 0.073, "grad_norm": 860.0, "kl_loss_10": 133.20760345458984, "kl_loss_2": 1626.384521484375, "kl_loss_3": 1158.8964233398438, "kl_loss_7": 296.6233856201172, "learning_rate": 0.0009900412805461966, "loss": 810.3949, "step": 730 }, { "ce_loss_10": 3.7475465893745423, "ce_loss_13": 3.6637478709220885, "ce_loss_2": 4.477526593208313, "ce_loss_3": 4.232499527931213, "ce_loss_7": 3.834572732448578, "epoch": 0.074, "grad_norm": 756.0, "kl_loss_10": 136.13002281188966, "kl_loss_2": 1615.2621215820313, "kl_loss_3": 1135.3047760009765, "kl_loss_7": 302.71751708984374, "learning_rate": 0.0009897237175829927, "loss": 812.032, "step": 740 }, { "ce_loss_10": 3.633454430103302, "ce_loss_13": 3.546045184135437, "ce_loss_2": 4.386739385128021, "ce_loss_3": 4.157361710071564, "ce_loss_7": 3.7257861375808714, "epoch": 0.075, "grad_norm": 624.0, "kl_loss_10": 138.0735656738281, "kl_loss_2": 1664.8802978515625, "kl_loss_3": 1209.155780029297, "kl_loss_7": 314.29449157714845, "learning_rate": 0.0009894012231073895, "loss": 820.1248, "step": 750 }, { "ce_loss_10": 3.675256085395813, "ce_loss_13": 3.591258680820465, "ce_loss_2": 4.3781631827354435, "ce_loss_3": 4.169950652122497, "ce_loss_7": 3.7596523761749268, "epoch": 0.076, "grad_norm": 596.0, "kl_loss_10": 137.33892288208008, "kl_loss_2": 1570.5589111328125, "kl_loss_3": 1161.7032104492187, "kl_loss_7": 298.7381622314453, "learning_rate": 0.0009890738003669028, "loss": 801.2431, "step": 760 }, { "ce_loss_10": 3.64959534406662, "ce_loss_13": 3.5664158701896667, "ce_loss_2": 4.371342432498932, "ce_loss_3": 4.150311291217804, "ce_loss_7": 3.7354934453964233, "epoch": 0.077, "grad_norm": 540.0, "kl_loss_10": 136.36218070983887, "kl_loss_2": 1622.240057373047, "kl_loss_3": 1172.391793823242, "kl_loss_7": 304.76952667236327, "learning_rate": 0.0009887414526586764, "loss": 787.9819, "step": 770 }, { "ce_loss_10": 3.708216655254364, "ce_loss_13": 3.625141477584839, "ce_loss_2": 4.414664888381958, "ce_loss_3": 4.183781635761261, "ce_loss_7": 3.8081562399864195, "epoch": 0.078, "grad_norm": 596.0, "kl_loss_10": 133.56560096740722, "kl_loss_2": 1562.47041015625, "kl_loss_3": 1106.445620727539, "kl_loss_7": 312.0776168823242, "learning_rate": 0.0009884041833294476, "loss": 768.2491, "step": 780 }, { "ce_loss_10": 3.706817853450775, "ce_loss_13": 3.622973358631134, "ce_loss_2": 4.41116281747818, "ce_loss_3": 4.179266679286957, "ce_loss_7": 3.8186426639556883, "epoch": 0.079, "grad_norm": 632.0, "kl_loss_10": 132.2478443145752, "kl_loss_2": 1599.446923828125, "kl_loss_3": 1117.8709930419923, "kl_loss_7": 368.3747268676758, "learning_rate": 0.000988061995775515, "loss": 815.0693, "step": 790 }, { "ce_loss_10": 3.641828775405884, "ce_loss_13": 3.5547205209732056, "ce_loss_2": 4.335572981834412, "ce_loss_3": 4.108006286621094, "ce_loss_7": 3.7402275919914247, "epoch": 0.08, "grad_norm": 516.0, "kl_loss_10": 141.807564163208, "kl_loss_2": 1570.8703674316407, "kl_loss_3": 1110.252996826172, "kl_loss_7": 321.9771667480469, "learning_rate": 0.0009877148934427035, "loss": 786.1404, "step": 800 }, { "ce_loss_10": 3.681752073764801, "ce_loss_13": 3.596014940738678, "ce_loss_2": 4.380065774917602, "ce_loss_3": 4.151778030395508, "ce_loss_7": 3.7655294299125672, "epoch": 0.081, "grad_norm": 496.0, "kl_loss_10": 145.9334274291992, "kl_loss_2": 1572.8681091308595, "kl_loss_3": 1116.675845336914, "kl_loss_7": 297.05968246459963, "learning_rate": 0.0009873628798263297, "loss": 776.0455, "step": 810 }, { "ce_loss_10": 3.6424105167388916, "ce_loss_13": 3.5447566747665404, "ce_loss_2": 4.312346494197845, "ce_loss_3": 4.088560962677002, "ce_loss_7": 3.7104405045509337, "epoch": 0.082, "grad_norm": 478.0, "kl_loss_10": 152.06344909667968, "kl_loss_2": 1539.9718017578125, "kl_loss_3": 1091.6052520751953, "kl_loss_7": 286.7229400634766, "learning_rate": 0.0009870059584711668, "loss": 790.5065, "step": 820 }, { "ce_loss_10": 3.6575138568878174, "ce_loss_13": 3.5694735765457155, "ce_loss_2": 4.352469277381897, "ce_loss_3": 4.124953854084015, "ce_loss_7": 3.7358759164810182, "epoch": 0.083, "grad_norm": 516.0, "kl_loss_10": 158.90749130249023, "kl_loss_2": 1569.4235595703126, "kl_loss_3": 1125.5340545654296, "kl_loss_7": 290.7964630126953, "learning_rate": 0.000986644132971409, "loss": 786.8994, "step": 830 }, { "ce_loss_10": 3.6558743476867677, "ce_loss_13": 3.5544149518013, "ce_loss_2": 4.354631888866424, "ce_loss_3": 4.1281127572059635, "ce_loss_7": 3.727833020687103, "epoch": 0.084, "grad_norm": 576.0, "kl_loss_10": 158.36446990966797, "kl_loss_2": 1584.950128173828, "kl_loss_3": 1138.0484100341796, "kl_loss_7": 300.7915969848633, "learning_rate": 0.0009862774069706345, "loss": 786.4536, "step": 840 }, { "ce_loss_10": 3.7631431221961975, "ce_loss_13": 3.6783902406692506, "ce_loss_2": 4.423887753486634, "ce_loss_3": 4.210748863220215, "ce_loss_7": 3.8459392905235292, "epoch": 0.085, "grad_norm": 720.0, "kl_loss_10": 144.1476722717285, "kl_loss_2": 1526.45078125, "kl_loss_3": 1098.919091796875, "kl_loss_7": 305.10309143066405, "learning_rate": 0.000985905784161771, "loss": 773.6244, "step": 850 }, { "ce_loss_10": 3.693523097038269, "ce_loss_13": 3.6117894887924193, "ce_loss_2": 4.374859690666199, "ce_loss_3": 4.145905554294586, "ce_loss_7": 3.799845337867737, "epoch": 0.086, "grad_norm": 648.0, "kl_loss_10": 141.55279006958008, "kl_loss_2": 1548.1404724121094, "kl_loss_3": 1092.3538146972655, "kl_loss_7": 338.8992858886719, "learning_rate": 0.000985529268287055, "loss": 780.1624, "step": 860 }, { "ce_loss_10": 3.6179853677749634, "ce_loss_13": 3.532400143146515, "ce_loss_2": 4.3180185675621034, "ce_loss_3": 4.092539095878601, "ce_loss_7": 3.716568684577942, "epoch": 0.087, "grad_norm": 584.0, "kl_loss_10": 138.25293006896973, "kl_loss_2": 1583.606640625, "kl_loss_3": 1113.2994171142577, "kl_loss_7": 327.81214904785156, "learning_rate": 0.0009851478631379982, "loss": 787.4821, "step": 870 }, { "ce_loss_10": 3.6815198183059694, "ce_loss_13": 3.5956546545028685, "ce_loss_2": 4.367411196231842, "ce_loss_3": 4.13222428560257, "ce_loss_7": 3.7695237517356874, "epoch": 0.088, "grad_norm": 628.0, "kl_loss_10": 140.43244590759278, "kl_loss_2": 1545.1064147949219, "kl_loss_3": 1094.267755126953, "kl_loss_7": 312.3658508300781, "learning_rate": 0.0009847615725553456, "loss": 767.0908, "step": 880 }, { "ce_loss_10": 3.739601492881775, "ce_loss_13": 3.657086157798767, "ce_loss_2": 4.379729843139648, "ce_loss_3": 4.177917766571045, "ce_loss_7": 3.820488429069519, "epoch": 0.089, "grad_norm": 552.0, "kl_loss_10": 134.12742614746094, "kl_loss_2": 1464.5765686035156, "kl_loss_3": 1051.556851196289, "kl_loss_7": 283.62481689453125, "learning_rate": 0.0009843704004290394, "loss": 761.853, "step": 890 }, { "ce_loss_10": 3.6452771425247192, "ce_loss_13": 3.5613077044487, "ce_loss_2": 4.318511128425598, "ce_loss_3": 4.107461535930634, "ce_loss_7": 3.726675534248352, "epoch": 0.09, "grad_norm": 474.0, "kl_loss_10": 136.06297454833984, "kl_loss_2": 1542.6724487304687, "kl_loss_3": 1117.772933959961, "kl_loss_7": 292.2666213989258, "learning_rate": 0.0009839743506981783, "loss": 768.8108, "step": 900 }, { "ce_loss_10": 3.5574649572372437, "ce_loss_13": 3.4748517513275146, "ce_loss_2": 4.266572868824005, "ce_loss_3": 4.057099211215973, "ce_loss_7": 3.6422529578208924, "epoch": 0.091, "grad_norm": 516.0, "kl_loss_10": 139.13952560424804, "kl_loss_2": 1603.9869201660156, "kl_loss_3": 1170.3635620117188, "kl_loss_7": 298.2760665893555, "learning_rate": 0.0009835734273509786, "loss": 783.7168, "step": 910 }, { "ce_loss_10": 3.6700770974159242, "ce_loss_13": 3.5813122153282166, "ce_loss_2": 4.351845908164978, "ce_loss_3": 4.139319920539856, "ce_loss_7": 3.7498608589172364, "epoch": 0.092, "grad_norm": 516.0, "kl_loss_10": 139.36617164611818, "kl_loss_2": 1526.7721801757812, "kl_loss_3": 1107.183511352539, "kl_loss_7": 287.28514404296874, "learning_rate": 0.0009831676344247342, "loss": 768.4225, "step": 920 }, { "ce_loss_10": 3.684238874912262, "ce_loss_13": 3.6015963315963746, "ce_loss_2": 4.3427834749221805, "ce_loss_3": 4.138106441497802, "ce_loss_7": 3.75754714012146, "epoch": 0.093, "grad_norm": 490.0, "kl_loss_10": 135.07495460510253, "kl_loss_2": 1516.6379028320312, "kl_loss_3": 1094.0326538085938, "kl_loss_7": 277.64155731201174, "learning_rate": 0.0009827569760057755, "loss": 762.3584, "step": 930 }, { "ce_loss_10": 3.5946595072746277, "ce_loss_13": 3.512081265449524, "ce_loss_2": 4.322237813472748, "ce_loss_3": 4.095906281471253, "ce_loss_7": 3.6798322200775146, "epoch": 0.094, "grad_norm": 728.0, "kl_loss_10": 138.28199310302733, "kl_loss_2": 1619.1793823242188, "kl_loss_3": 1165.3315551757812, "kl_loss_7": 295.293204498291, "learning_rate": 0.000982341456229428, "loss": 780.917, "step": 940 }, { "ce_loss_10": 3.69069162607193, "ce_loss_13": 3.6100045323371885, "ce_loss_2": 4.376732325553894, "ce_loss_3": 4.16404242515564, "ce_loss_7": 3.7701812386512756, "epoch": 0.095, "grad_norm": 688.0, "kl_loss_10": 131.1420455932617, "kl_loss_2": 1575.732354736328, "kl_loss_3": 1138.4372924804688, "kl_loss_7": 285.67282180786134, "learning_rate": 0.000981921079279971, "loss": 765.979, "step": 950 }, { "ce_loss_10": 3.7074394822120667, "ce_loss_13": 3.62913464307785, "ce_loss_2": 4.366938805580139, "ce_loss_3": 4.150120985507965, "ce_loss_7": 3.7818633675575257, "epoch": 0.096, "grad_norm": 720.0, "kl_loss_10": 130.51903839111327, "kl_loss_2": 1507.3517028808594, "kl_loss_3": 1076.092593383789, "kl_loss_7": 272.2766448974609, "learning_rate": 0.0009814958493905962, "loss": 753.6946, "step": 960 }, { "ce_loss_10": 3.658416414260864, "ce_loss_13": 3.576970672607422, "ce_loss_2": 4.346470355987549, "ce_loss_3": 4.128688275814056, "ce_loss_7": 3.7415476202964784, "epoch": 0.097, "grad_norm": 512.0, "kl_loss_10": 128.56299629211426, "kl_loss_2": 1557.0646423339845, "kl_loss_3": 1112.28828125, "kl_loss_7": 279.6500648498535, "learning_rate": 0.0009810657708433637, "loss": 775.217, "step": 970 }, { "ce_loss_10": 3.7308164954185488, "ce_loss_13": 3.6533005952835085, "ce_loss_2": 4.3734122037887575, "ce_loss_3": 4.170846402645111, "ce_loss_7": 3.8050424695014953, "epoch": 0.098, "grad_norm": 716.0, "kl_loss_10": 124.60902214050293, "kl_loss_2": 1475.1663879394532, "kl_loss_3": 1054.8542236328126, "kl_loss_7": 269.9375114440918, "learning_rate": 0.0009806308479691594, "loss": 736.7519, "step": 980 }, { "ce_loss_10": 3.750465714931488, "ce_loss_13": 3.668341946601868, "ce_loss_2": 4.426263308525085, "ce_loss_3": 4.20258377790451, "ce_loss_7": 3.836391198635101, "epoch": 0.099, "grad_norm": 644.0, "kl_loss_10": 131.81643409729003, "kl_loss_2": 1535.673388671875, "kl_loss_3": 1090.963656616211, "kl_loss_7": 289.6497604370117, "learning_rate": 0.0009801910851476522, "loss": 754.2551, "step": 990 }, { "ce_loss_10": 3.653952169418335, "ce_loss_13": 3.577095854282379, "ce_loss_2": 4.349191665649414, "ce_loss_3": 4.125085318088532, "ce_loss_7": 3.7413162350654603, "epoch": 0.1, "grad_norm": 478.0, "kl_loss_10": 128.62464637756347, "kl_loss_2": 1573.8733642578125, "kl_loss_3": 1114.0891967773437, "kl_loss_7": 292.6377975463867, "learning_rate": 0.0009797464868072487, "loss": 758.6713, "step": 1000 }, { "ce_loss_10": 3.6456503033638, "ce_loss_13": 3.5667870163917543, "ce_loss_2": 4.3237790822982785, "ce_loss_3": 4.11080631017685, "ce_loss_7": 3.7275813579559327, "epoch": 0.101, "grad_norm": 432.0, "kl_loss_10": 128.03596534729004, "kl_loss_2": 1525.84384765625, "kl_loss_3": 1094.1039764404297, "kl_loss_7": 291.2514984130859, "learning_rate": 0.0009792970574250492, "loss": 758.2494, "step": 1010 }, { "ce_loss_10": 3.677238702774048, "ce_loss_13": 3.597763466835022, "ce_loss_2": 4.345702481269837, "ce_loss_3": 4.1323373198509215, "ce_loss_7": 3.757745099067688, "epoch": 0.102, "grad_norm": 480.0, "kl_loss_10": 126.84439620971679, "kl_loss_2": 1518.9351745605468, "kl_loss_3": 1090.8279510498046, "kl_loss_7": 289.6885223388672, "learning_rate": 0.0009788428015268028, "loss": 746.4768, "step": 1020 }, { "ce_loss_10": 3.670746088027954, "ce_loss_13": 3.5901795506477354, "ce_loss_2": 4.326242756843567, "ce_loss_3": 4.110077440738678, "ce_loss_7": 3.7697238445281984, "epoch": 0.103, "grad_norm": 520.0, "kl_loss_10": 147.23381576538085, "kl_loss_2": 1500.6592041015624, "kl_loss_3": 1064.419287109375, "kl_loss_7": 309.967301940918, "learning_rate": 0.0009783837236868609, "loss": 752.1227, "step": 1030 }, { "ce_loss_10": 3.665172076225281, "ce_loss_13": 3.559502327442169, "ce_loss_2": 4.309127068519592, "ce_loss_3": 4.0925112009048465, "ce_loss_7": 3.730722725391388, "epoch": 0.104, "grad_norm": 624.0, "kl_loss_10": 168.995276260376, "kl_loss_2": 1506.7355590820312, "kl_loss_3": 1077.4224884033204, "kl_loss_7": 306.44668807983396, "learning_rate": 0.0009779198285281327, "loss": 758.6978, "step": 1040 }, { "ce_loss_10": 3.6450916528701782, "ce_loss_13": 3.5567120909690857, "ce_loss_2": 4.307799768447876, "ce_loss_3": 4.096536159515381, "ce_loss_7": 3.7174035549163817, "epoch": 0.105, "grad_norm": 464.0, "kl_loss_10": 145.4011459350586, "kl_loss_2": 1511.9253051757812, "kl_loss_3": 1079.955551147461, "kl_loss_7": 290.32603912353517, "learning_rate": 0.0009774511207220368, "loss": 751.4335, "step": 1050 }, { "ce_loss_10": 3.6726208686828614, "ce_loss_13": 3.5870786190032957, "ce_loss_2": 4.340760517120361, "ce_loss_3": 4.122452509403229, "ce_loss_7": 3.7611562490463255, "epoch": 0.106, "grad_norm": 516.0, "kl_loss_10": 146.77743186950684, "kl_loss_2": 1523.6993774414063, "kl_loss_3": 1080.6748168945312, "kl_loss_7": 305.8709197998047, "learning_rate": 0.0009769776049884564, "loss": 759.1102, "step": 1060 }, { "ce_loss_10": 3.5789316415786745, "ce_loss_13": 3.4973001360893248, "ce_loss_2": 4.2655829906463625, "ce_loss_3": 4.0485687255859375, "ce_loss_7": 3.664482927322388, "epoch": 0.107, "grad_norm": 512.0, "kl_loss_10": 138.50279579162597, "kl_loss_2": 1555.6201232910157, "kl_loss_3": 1112.0815826416015, "kl_loss_7": 305.1489685058594, "learning_rate": 0.0009764992860956889, "loss": 779.55, "step": 1070 }, { "ce_loss_10": 3.7416428446769716, "ce_loss_13": 3.6618621706962586, "ce_loss_2": 4.364235496520996, "ce_loss_3": 4.161127758026123, "ce_loss_7": 3.8312565684318542, "epoch": 0.108, "grad_norm": 612.0, "kl_loss_10": 132.17280654907228, "kl_loss_2": 1434.9408752441407, "kl_loss_3": 1021.1740997314453, "kl_loss_7": 306.5512954711914, "learning_rate": 0.0009760161688604008, "loss": 729.6794, "step": 1080 }, { "ce_loss_10": 3.74602724313736, "ce_loss_13": 3.6610143184661865, "ce_loss_2": 4.390218591690063, "ce_loss_3": 4.1842693328857425, "ce_loss_7": 3.8411784768104553, "epoch": 0.109, "grad_norm": 576.0, "kl_loss_10": 133.29356536865234, "kl_loss_2": 1472.9822204589843, "kl_loss_3": 1051.5467559814454, "kl_loss_7": 310.2565521240234, "learning_rate": 0.0009755282581475768, "loss": 747.7812, "step": 1090 }, { "ce_loss_10": 3.801929402351379, "ce_loss_13": 3.715673303604126, "ce_loss_2": 4.428924131393432, "ce_loss_3": 4.217723715305328, "ce_loss_7": 3.886538052558899, "epoch": 0.11, "grad_norm": 552.0, "kl_loss_10": 141.27199668884276, "kl_loss_2": 1455.4054565429688, "kl_loss_3": 1032.206768798828, "kl_loss_7": 311.90191345214845, "learning_rate": 0.0009750355588704727, "loss": 730.8727, "step": 1100 }, { "ce_loss_10": 3.6245179295539858, "ce_loss_13": 3.5434940338134764, "ce_loss_2": 4.285040807723999, "ce_loss_3": 4.064236760139465, "ce_loss_7": 3.722900152206421, "epoch": 0.111, "grad_norm": 536.0, "kl_loss_10": 128.93951110839845, "kl_loss_2": 1479.008282470703, "kl_loss_3": 1042.542987060547, "kl_loss_7": 309.4350082397461, "learning_rate": 0.0009745380759905647, "loss": 755.6506, "step": 1110 }, { "ce_loss_10": 3.5733557820320128, "ce_loss_13": 3.497096002101898, "ce_loss_2": 4.2416357636451725, "ce_loss_3": 4.02953668832779, "ce_loss_7": 3.6843501210212706, "epoch": 0.112, "grad_norm": 584.0, "kl_loss_10": 128.8479259490967, "kl_loss_2": 1501.6354125976563, "kl_loss_3": 1078.8607055664063, "kl_loss_7": 309.57103271484374, "learning_rate": 0.0009740358145174998, "loss": 782.7103, "step": 1120 }, { "ce_loss_10": 3.7390747904777526, "ce_loss_13": 3.654650056362152, "ce_loss_2": 4.359592080116272, "ce_loss_3": 4.165994334220886, "ce_loss_7": 3.8400262117385866, "epoch": 0.113, "grad_norm": 434.0, "kl_loss_10": 134.35130157470704, "kl_loss_2": 1442.04345703125, "kl_loss_3": 1051.9576324462892, "kl_loss_7": 334.60899047851564, "learning_rate": 0.0009735287795090455, "loss": 747.7461, "step": 1130 }, { "ce_loss_10": 3.6206952929496765, "ce_loss_13": 3.5408340215682985, "ce_loss_2": 4.278374576568604, "ce_loss_3": 4.073013770580292, "ce_loss_7": 3.709249567985535, "epoch": 0.114, "grad_norm": 560.0, "kl_loss_10": 129.40065841674806, "kl_loss_2": 1489.2802490234376, "kl_loss_3": 1078.2224548339843, "kl_loss_7": 308.7394744873047, "learning_rate": 0.0009730169760710386, "loss": 743.8783, "step": 1140 }, { "ce_loss_10": 3.7078137516975405, "ce_loss_13": 3.6258071303367614, "ce_loss_2": 4.352467465400696, "ce_loss_3": 4.14322521686554, "ce_loss_7": 3.793818712234497, "epoch": 0.115, "grad_norm": 532.0, "kl_loss_10": 132.82089805603027, "kl_loss_2": 1462.5518798828125, "kl_loss_3": 1047.3691467285157, "kl_loss_7": 303.57424392700193, "learning_rate": 0.0009725004093573342, "loss": 741.0269, "step": 1150 }, { "ce_loss_10": 3.641883647441864, "ce_loss_13": 3.5618484139442446, "ce_loss_2": 4.298850560188294, "ce_loss_3": 4.0857291460037235, "ce_loss_7": 3.732511842250824, "epoch": 0.116, "grad_norm": 500.0, "kl_loss_10": 125.74126281738282, "kl_loss_2": 1472.2821716308595, "kl_loss_3": 1051.9892150878907, "kl_loss_7": 293.47923736572267, "learning_rate": 0.0009719790845697534, "loss": 730.1605, "step": 1160 }, { "ce_loss_10": 3.588691568374634, "ce_loss_13": 3.514021909236908, "ce_loss_2": 4.223182845115661, "ce_loss_3": 4.0243830442428585, "ce_loss_7": 3.668225383758545, "epoch": 0.117, "grad_norm": 544.0, "kl_loss_10": 118.70261917114257, "kl_loss_2": 1445.7062133789063, "kl_loss_3": 1032.7508636474608, "kl_loss_7": 274.055322265625, "learning_rate": 0.0009714530069580309, "loss": 718.2419, "step": 1170 }, { "ce_loss_10": 3.6957285404205322, "ce_loss_13": 3.618162250518799, "ce_loss_2": 4.352480411529541, "ce_loss_3": 4.145036590099335, "ce_loss_7": 3.7782084584236144, "epoch": 0.118, "grad_norm": 536.0, "kl_loss_10": 127.76230659484864, "kl_loss_2": 1480.515966796875, "kl_loss_3": 1059.5112030029297, "kl_loss_7": 282.41261138916013, "learning_rate": 0.0009709221818197624, "loss": 734.455, "step": 1180 }, { "ce_loss_10": 3.721509063243866, "ce_loss_13": 3.6461830377578734, "ce_loss_2": 4.384591698646545, "ce_loss_3": 4.175746941566468, "ce_loss_7": 3.804957926273346, "epoch": 0.119, "grad_norm": 454.0, "kl_loss_10": 121.90502281188965, "kl_loss_2": 1485.071533203125, "kl_loss_3": 1060.364013671875, "kl_loss_7": 273.83970947265624, "learning_rate": 0.0009703866145003512, "loss": 735.9141, "step": 1190 }, { "ce_loss_10": 3.6931097984313963, "ce_loss_13": 3.618978762626648, "ce_loss_2": 4.338696074485779, "ce_loss_3": 4.131911754608154, "ce_loss_7": 3.771903729438782, "epoch": 0.12, "grad_norm": 404.0, "kl_loss_10": 117.82418823242188, "kl_loss_2": 1472.2267517089845, "kl_loss_3": 1051.9043701171875, "kl_loss_7": 267.1518127441406, "learning_rate": 0.0009698463103929542, "loss": 740.661, "step": 1200 }, { "ce_loss_10": 3.658799970149994, "ce_loss_13": 3.5842487812042236, "ce_loss_2": 4.312227940559387, "ce_loss_3": 4.10740053653717, "ce_loss_7": 3.7385629415512085, "epoch": 0.121, "grad_norm": 412.0, "kl_loss_10": 122.52205390930176, "kl_loss_2": 1466.4009826660156, "kl_loss_3": 1056.7593078613281, "kl_loss_7": 272.06723709106444, "learning_rate": 0.0009693012749384279, "loss": 737.0117, "step": 1210 }, { "ce_loss_10": 3.679182291030884, "ce_loss_13": 3.6015621542930605, "ce_loss_2": 4.328339552879333, "ce_loss_3": 4.114730060100555, "ce_loss_7": 3.7581299543380737, "epoch": 0.122, "grad_norm": 500.0, "kl_loss_10": 124.13164978027343, "kl_loss_2": 1486.9410522460937, "kl_loss_3": 1053.6353607177734, "kl_loss_7": 279.761865234375, "learning_rate": 0.0009687515136252732, "loss": 728.5778, "step": 1220 }, { "ce_loss_10": 3.6272791743278505, "ce_loss_13": 3.55220046043396, "ce_loss_2": 4.301675605773926, "ce_loss_3": 4.08597983121872, "ce_loss_7": 3.70781672000885, "epoch": 0.123, "grad_norm": 568.0, "kl_loss_10": 121.01997566223145, "kl_loss_2": 1522.73505859375, "kl_loss_3": 1086.6077362060546, "kl_loss_7": 279.3069206237793, "learning_rate": 0.0009681970319895803, "loss": 759.7192, "step": 1230 }, { "ce_loss_10": 3.71327965259552, "ce_loss_13": 3.6385520815849306, "ce_loss_2": 4.354242825508118, "ce_loss_3": 4.150368654727936, "ce_loss_7": 3.7916497707366945, "epoch": 0.124, "grad_norm": 414.0, "kl_loss_10": 124.44540634155274, "kl_loss_2": 1443.4426513671874, "kl_loss_3": 1030.0522674560548, "kl_loss_7": 268.6724395751953, "learning_rate": 0.0009676378356149733, "loss": 722.6414, "step": 1240 }, { "ce_loss_10": 3.6944294214248656, "ce_loss_13": 3.6130942940711974, "ce_loss_2": 4.31483553647995, "ce_loss_3": 4.113047087192536, "ce_loss_7": 3.7628474831581116, "epoch": 0.125, "grad_norm": 572.0, "kl_loss_10": 133.05949897766112, "kl_loss_2": 1434.6165405273437, "kl_loss_3": 1024.405093383789, "kl_loss_7": 265.9820045471191, "learning_rate": 0.0009670739301325534, "loss": 721.2149, "step": 1250 }, { "ce_loss_10": 3.6491236448287965, "ce_loss_13": 3.5683398127555845, "ce_loss_2": 4.294895899295807, "ce_loss_3": 4.082043838500977, "ce_loss_7": 3.721854901313782, "epoch": 0.126, "grad_norm": 506.0, "kl_loss_10": 130.2590259552002, "kl_loss_2": 1460.9762817382812, "kl_loss_3": 1047.9487213134767, "kl_loss_7": 271.88867340087893, "learning_rate": 0.0009665053212208426, "loss": 732.2017, "step": 1260 }, { "ce_loss_10": 3.6933886647224425, "ce_loss_13": 3.6137840390205382, "ce_loss_2": 4.336398506164551, "ce_loss_3": 4.125988566875458, "ce_loss_7": 3.7641473054885863, "epoch": 0.127, "grad_norm": 470.0, "kl_loss_10": 131.11599006652833, "kl_loss_2": 1466.260760498047, "kl_loss_3": 1047.477099609375, "kl_loss_7": 271.31814041137693, "learning_rate": 0.0009659320146057262, "loss": 729.9061, "step": 1270 }, { "ce_loss_10": 3.6932409524917604, "ce_loss_13": 3.6162060022354128, "ce_loss_2": 4.326151037216187, "ce_loss_3": 4.118873739242554, "ce_loss_7": 3.7665857672691345, "epoch": 0.128, "grad_norm": 488.0, "kl_loss_10": 126.3920768737793, "kl_loss_2": 1439.7459106445312, "kl_loss_3": 1023.4634368896484, "kl_loss_7": 263.45100021362305, "learning_rate": 0.0009653540160603955, "loss": 714.3654, "step": 1280 }, { "ce_loss_10": 3.695625138282776, "ce_loss_13": 3.619100844860077, "ce_loss_2": 4.322635555267334, "ce_loss_3": 4.121702527999878, "ce_loss_7": 3.764703559875488, "epoch": 0.129, "grad_norm": 516.0, "kl_loss_10": 125.06153717041016, "kl_loss_2": 1449.9097961425782, "kl_loss_3": 1036.939111328125, "kl_loss_7": 261.42085418701174, "learning_rate": 0.0009647713314052896, "loss": 709.775, "step": 1290 }, { "ce_loss_10": 3.645713412761688, "ce_loss_13": 3.5693684458732604, "ce_loss_2": 4.318705654144287, "ce_loss_3": 4.105780220031738, "ce_loss_7": 3.721473240852356, "epoch": 0.13, "grad_norm": 504.0, "kl_loss_10": 125.77756729125977, "kl_loss_2": 1515.6605834960938, "kl_loss_3": 1082.791098022461, "kl_loss_7": 268.05779418945315, "learning_rate": 0.0009641839665080363, "loss": 739.9956, "step": 1300 }, { "ce_loss_10": 3.6060986638069155, "ce_loss_13": 3.532057249546051, "ce_loss_2": 4.258748412132263, "ce_loss_3": 4.044409060478211, "ce_loss_7": 3.6810790419578554, "epoch": 0.131, "grad_norm": 576.0, "kl_loss_10": 120.33845672607421, "kl_loss_2": 1464.5880249023437, "kl_loss_3": 1035.7476196289062, "kl_loss_7": 258.05460357666016, "learning_rate": 0.0009635919272833937, "loss": 712.5358, "step": 1310 }, { "ce_loss_10": 3.6437799096107484, "ce_loss_13": 3.567954385280609, "ce_loss_2": 4.29961267709732, "ce_loss_3": 4.091295349597931, "ce_loss_7": 3.7204660773277283, "epoch": 0.132, "grad_norm": 520.0, "kl_loss_10": 123.54525375366211, "kl_loss_2": 1460.5380920410157, "kl_loss_3": 1039.971875, "kl_loss_7": 264.77962188720704, "learning_rate": 0.0009629952196931902, "loss": 712.4777, "step": 1320 }, { "ce_loss_10": 3.63455046415329, "ce_loss_13": 3.557650101184845, "ce_loss_2": 4.270671212673188, "ce_loss_3": 4.062439024448395, "ce_loss_7": 3.702920150756836, "epoch": 0.133, "grad_norm": 434.0, "kl_loss_10": 123.04212112426758, "kl_loss_2": 1444.6365600585937, "kl_loss_3": 1028.4689849853517, "kl_loss_7": 258.92202911376955, "learning_rate": 0.0009623938497462645, "loss": 713.1292, "step": 1330 }, { "ce_loss_10": 3.6247922778129578, "ce_loss_13": 3.5494427919387816, "ce_loss_2": 4.2690158009529116, "ce_loss_3": 4.058642566204071, "ce_loss_7": 3.6972993493080137, "epoch": 0.134, "grad_norm": 478.0, "kl_loss_10": 120.75489349365235, "kl_loss_2": 1456.6053527832032, "kl_loss_3": 1037.5501190185546, "kl_loss_7": 266.06713485717773, "learning_rate": 0.0009617878234984055, "loss": 726.2297, "step": 1340 }, { "ce_loss_10": 3.717451739311218, "ce_loss_13": 3.642001247406006, "ce_loss_2": 4.3319720983505245, "ce_loss_3": 4.123037731647491, "ce_loss_7": 3.790008616447449, "epoch": 0.135, "grad_norm": 548.0, "kl_loss_10": 120.84955863952636, "kl_loss_2": 1400.965509033203, "kl_loss_3": 989.1194274902343, "kl_loss_7": 260.0563507080078, "learning_rate": 0.0009611771470522907, "loss": 704.2836, "step": 1350 }, { "ce_loss_10": 3.6397268891334535, "ce_loss_13": 3.565677487850189, "ce_loss_2": 4.285844933986664, "ce_loss_3": 4.075031089782715, "ce_loss_7": 3.7177743196487425, "epoch": 0.136, "grad_norm": 548.0, "kl_loss_10": 119.19244728088378, "kl_loss_2": 1430.9075134277343, "kl_loss_3": 1014.8560333251953, "kl_loss_7": 264.9954383850098, "learning_rate": 0.0009605618265574251, "loss": 706.0607, "step": 1360 }, { "ce_loss_10": 3.6019657135009764, "ce_loss_13": 3.5283800959587097, "ce_loss_2": 4.247595989704132, "ce_loss_3": 4.0493292808532715, "ce_loss_7": 3.682861661911011, "epoch": 0.137, "grad_norm": 544.0, "kl_loss_10": 120.26506729125977, "kl_loss_2": 1482.0683837890624, "kl_loss_3": 1078.6839630126954, "kl_loss_7": 272.45921630859374, "learning_rate": 0.0009599418682100792, "loss": 727.2132, "step": 1370 }, { "ce_loss_10": 3.645335590839386, "ce_loss_13": 3.570277786254883, "ce_loss_2": 4.2866430401802065, "ce_loss_3": 4.071622550487518, "ce_loss_7": 3.717624640464783, "epoch": 0.138, "grad_norm": 612.0, "kl_loss_10": 119.49271163940429, "kl_loss_2": 1442.7004455566407, "kl_loss_3": 1025.406283569336, "kl_loss_7": 261.58585968017576, "learning_rate": 0.0009593172782532268, "loss": 717.2026, "step": 1380 }, { "ce_loss_10": 3.6908539175987243, "ce_loss_13": 3.617784011363983, "ce_loss_2": 4.313388335704803, "ce_loss_3": 4.114008998870849, "ce_loss_7": 3.7639716506004333, "epoch": 0.139, "grad_norm": 476.0, "kl_loss_10": 120.62876358032227, "kl_loss_2": 1423.3504333496094, "kl_loss_3": 1012.8270812988281, "kl_loss_7": 261.68493881225584, "learning_rate": 0.0009586880629764817, "loss": 706.5565, "step": 1390 }, { "ce_loss_10": 3.61561758518219, "ce_loss_13": 3.5403517365455626, "ce_loss_2": 4.258929216861725, "ce_loss_3": 4.060744059085846, "ce_loss_7": 3.687643599510193, "epoch": 0.14, "grad_norm": 792.0, "kl_loss_10": 120.44653511047363, "kl_loss_2": 1437.7734375, "kl_loss_3": 1068.6304412841796, "kl_loss_7": 272.7257308959961, "learning_rate": 0.0009580542287160348, "loss": 716.5157, "step": 1400 }, { "ce_loss_10": 3.579881501197815, "ce_loss_13": 3.505436861515045, "ce_loss_2": 4.2163320779800415, "ce_loss_3": 4.016775751113892, "ce_loss_7": 3.6602004766464233, "epoch": 0.141, "grad_norm": 740.0, "kl_loss_10": 119.12874908447266, "kl_loss_2": 1437.7877868652345, "kl_loss_3": 1029.927035522461, "kl_loss_7": 274.3121276855469, "learning_rate": 0.0009574157818545901, "loss": 704.4754, "step": 1410 }, { "ce_loss_10": 3.654617667198181, "ce_loss_13": 3.581939327716827, "ce_loss_2": 4.266410648822784, "ce_loss_3": 4.076312291622162, "ce_loss_7": 3.7350067377090452, "epoch": 0.142, "grad_norm": 788.0, "kl_loss_10": 117.25881233215333, "kl_loss_2": 1402.042706298828, "kl_loss_3": 1008.1144165039062, "kl_loss_7": 268.0317886352539, "learning_rate": 0.0009567727288213005, "loss": 712.509, "step": 1420 }, { "ce_loss_10": 3.62344468832016, "ce_loss_13": 3.552217972278595, "ce_loss_2": 4.242154741287232, "ce_loss_3": 4.047549939155578, "ce_loss_7": 3.6985828638076783, "epoch": 0.143, "grad_norm": 466.0, "kl_loss_10": 115.47184524536132, "kl_loss_2": 1418.5062316894532, "kl_loss_3": 1018.4080627441406, "kl_loss_7": 270.1881278991699, "learning_rate": 0.0009561250760917027, "loss": 702.7143, "step": 1430 }, { "ce_loss_10": 3.6490369558334352, "ce_loss_13": 3.5760287642478943, "ce_loss_2": 4.275133848190308, "ce_loss_3": 4.07141832113266, "ce_loss_7": 3.7269250392913817, "epoch": 0.144, "grad_norm": 524.0, "kl_loss_10": 119.90954666137695, "kl_loss_2": 1441.103936767578, "kl_loss_3": 1028.5968627929688, "kl_loss_7": 267.66681442260744, "learning_rate": 0.0009554728301876525, "loss": 698.2885, "step": 1440 }, { "ce_loss_10": 3.7067930936813354, "ce_loss_13": 3.629922258853912, "ce_loss_2": 4.314408445358277, "ce_loss_3": 4.132321739196778, "ce_loss_7": 3.7809135794639586, "epoch": 0.145, "grad_norm": 632.0, "kl_loss_10": 122.64454803466796, "kl_loss_2": 1398.767156982422, "kl_loss_3": 1023.6964874267578, "kl_loss_7": 262.7124481201172, "learning_rate": 0.0009548159976772592, "loss": 721.9051, "step": 1450 }, { "ce_loss_10": 3.641107952594757, "ce_loss_13": 3.5679691076278686, "ce_loss_2": 4.273185658454895, "ce_loss_3": 4.074773287773132, "ce_loss_7": 3.715712809562683, "epoch": 0.146, "grad_norm": 472.0, "kl_loss_10": 119.45023498535156, "kl_loss_2": 1426.5163330078126, "kl_loss_3": 1022.3043518066406, "kl_loss_7": 264.85511245727537, "learning_rate": 0.0009541545851748186, "loss": 702.8599, "step": 1460 }, { "ce_loss_10": 3.5100984811782836, "ce_loss_13": 3.436799693107605, "ce_loss_2": 4.161883985996246, "ce_loss_3": 3.955858516693115, "ce_loss_7": 3.594360911846161, "epoch": 0.147, "grad_norm": 556.0, "kl_loss_10": 116.41493873596191, "kl_loss_2": 1467.1879943847657, "kl_loss_3": 1037.354574584961, "kl_loss_7": 266.5866645812988, "learning_rate": 0.0009534885993407473, "loss": 713.4948, "step": 1470 }, { "ce_loss_10": 3.6824231266975405, "ce_loss_13": 3.608121466636658, "ce_loss_2": 4.328725492954254, "ce_loss_3": 4.115788686275482, "ce_loss_7": 3.755172336101532, "epoch": 0.148, "grad_norm": 560.0, "kl_loss_10": 118.46903686523437, "kl_loss_2": 1448.8276794433593, "kl_loss_3": 1029.914727783203, "kl_loss_7": 263.37538986206056, "learning_rate": 0.0009528180468815154, "loss": 714.8544, "step": 1480 }, { "ce_loss_10": 3.7179338216781614, "ce_loss_13": 3.6484482169151304, "ce_loss_2": 4.323283433914185, "ce_loss_3": 4.127403116226196, "ce_loss_7": 3.7899964809417725, "epoch": 0.149, "grad_norm": 480.0, "kl_loss_10": 114.30357208251954, "kl_loss_2": 1395.1859313964844, "kl_loss_3": 989.8287170410156, "kl_loss_7": 257.14686279296876, "learning_rate": 0.0009521429345495787, "loss": 690.7114, "step": 1490 }, { "ce_loss_10": 3.7034213185310363, "ce_loss_13": 3.6311787962913513, "ce_loss_2": 4.309155285358429, "ce_loss_3": 4.092868828773499, "ce_loss_7": 3.7654823780059816, "epoch": 0.15, "grad_norm": 448.0, "kl_loss_10": 116.55960197448731, "kl_loss_2": 1382.6544982910157, "kl_loss_3": 969.2838195800781, "kl_loss_7": 249.21936950683593, "learning_rate": 0.0009514632691433108, "loss": 688.2995, "step": 1500 }, { "ce_loss_10": 3.6700626373291017, "ce_loss_13": 3.5945797085762026, "ce_loss_2": 4.289841759204864, "ce_loss_3": 4.08635276556015, "ce_loss_7": 3.738140869140625, "epoch": 0.151, "grad_norm": 448.0, "kl_loss_10": 129.3878589630127, "kl_loss_2": 1424.29853515625, "kl_loss_3": 1001.7422454833984, "kl_loss_7": 254.78705520629882, "learning_rate": 0.0009507790575069346, "loss": 706.6927, "step": 1510 }, { "ce_loss_10": 3.65303395986557, "ce_loss_13": 3.5714800715446473, "ce_loss_2": 4.28388956785202, "ce_loss_3": 4.07215541601181, "ce_loss_7": 3.7174383282661436, "epoch": 0.152, "grad_norm": 560.0, "kl_loss_10": 131.33350143432617, "kl_loss_2": 1434.660906982422, "kl_loss_3": 1017.4268585205078, "kl_loss_7": 260.6188400268555, "learning_rate": 0.0009500903065304539, "loss": 715.3042, "step": 1520 }, { "ce_loss_10": 3.683453822135925, "ce_loss_13": 3.60856169462204, "ce_loss_2": 4.287684428691864, "ce_loss_3": 4.0820488929748535, "ce_loss_7": 3.7489410638809204, "epoch": 0.153, "grad_norm": 592.0, "kl_loss_10": 120.57846107482911, "kl_loss_2": 1384.860614013672, "kl_loss_3": 975.0672027587891, "kl_loss_7": 247.4440773010254, "learning_rate": 0.0009493970231495835, "loss": 691.6448, "step": 1530 }, { "ce_loss_10": 3.6223431706428526, "ce_loss_13": 3.55173202753067, "ce_loss_2": 4.230155563354492, "ce_loss_3": 4.02554075717926, "ce_loss_7": 3.6863773345947264, "epoch": 0.154, "grad_norm": 490.0, "kl_loss_10": 119.43844871520996, "kl_loss_2": 1397.457257080078, "kl_loss_3": 991.8431060791015, "kl_loss_7": 243.42232818603514, "learning_rate": 0.0009486992143456792, "loss": 686.1227, "step": 1540 }, { "ce_loss_10": 3.6514541625976564, "ce_loss_13": 3.571796643733978, "ce_loss_2": 4.304909610748291, "ce_loss_3": 4.0922522187232975, "ce_loss_7": 3.7216905117034913, "epoch": 0.155, "grad_norm": 396.0, "kl_loss_10": 128.10715980529784, "kl_loss_2": 1491.158837890625, "kl_loss_3": 1056.2210266113282, "kl_loss_7": 262.1390213012695, "learning_rate": 0.0009479968871456679, "loss": 716.6379, "step": 1550 }, { "ce_loss_10": 3.6170923829078676, "ce_loss_13": 3.542399287223816, "ce_loss_2": 4.252330017089844, "ce_loss_3": 4.045702540874482, "ce_loss_7": 3.685628616809845, "epoch": 0.156, "grad_norm": 454.0, "kl_loss_10": 121.63033790588379, "kl_loss_2": 1463.288525390625, "kl_loss_3": 1026.1065032958984, "kl_loss_7": 254.98253784179687, "learning_rate": 0.0009472900486219768, "loss": 702.4742, "step": 1560 }, { "ce_loss_10": 3.6025954604148867, "ce_loss_13": 3.5303670883178713, "ce_loss_2": 4.232356917858124, "ce_loss_3": 4.022554993629456, "ce_loss_7": 3.6709616661071776, "epoch": 0.157, "grad_norm": 520.0, "kl_loss_10": 118.88864822387696, "kl_loss_2": 1434.4180419921875, "kl_loss_3": 1021.8371948242187, "kl_loss_7": 253.59476776123046, "learning_rate": 0.000946578705892462, "loss": 706.9224, "step": 1570 }, { "ce_loss_10": 3.6455034971237184, "ce_loss_13": 3.5725855112075804, "ce_loss_2": 4.251002633571625, "ce_loss_3": 4.075614416599274, "ce_loss_7": 3.712466835975647, "epoch": 0.158, "grad_norm": 520.0, "kl_loss_10": 115.86212844848633, "kl_loss_2": 1388.911444091797, "kl_loss_3": 1008.0096618652344, "kl_loss_7": 249.38579559326172, "learning_rate": 0.0009458628661203367, "loss": 702.1684, "step": 1580 }, { "ce_loss_10": 3.6394161105155947, "ce_loss_13": 3.571541059017181, "ce_loss_2": 4.284844183921814, "ce_loss_3": 4.076776087284088, "ce_loss_7": 3.7110044956207275, "epoch": 0.159, "grad_norm": 494.0, "kl_loss_10": 113.66581001281739, "kl_loss_2": 1444.869854736328, "kl_loss_3": 1032.262744140625, "kl_loss_7": 253.4554000854492, "learning_rate": 0.0009451425365140996, "loss": 688.5467, "step": 1590 }, { "ce_loss_10": 3.7211164236068726, "ce_loss_13": 3.649132215976715, "ce_loss_2": 4.325118780136108, "ce_loss_3": 4.128993570804596, "ce_loss_7": 3.7914722681045534, "epoch": 0.16, "grad_norm": 456.0, "kl_loss_10": 117.80431632995605, "kl_loss_2": 1373.1717468261718, "kl_loss_3": 981.5717620849609, "kl_loss_7": 253.6373489379883, "learning_rate": 0.0009444177243274617, "loss": 681.3762, "step": 1600 }, { "ce_loss_10": 3.574730896949768, "ce_loss_13": 3.498664665222168, "ce_loss_2": 4.200723135471344, "ce_loss_3": 4.009881269931793, "ce_loss_7": 3.6463570594787598, "epoch": 0.161, "grad_norm": 480.0, "kl_loss_10": 122.87367897033691, "kl_loss_2": 1430.2820068359374, "kl_loss_3": 1037.6371978759767, "kl_loss_7": 260.55384521484376, "learning_rate": 0.0009436884368592739, "loss": 706.6845, "step": 1610 }, { "ce_loss_10": 3.6286559462547303, "ce_loss_13": 3.555983376502991, "ce_loss_2": 4.232067906856537, "ce_loss_3": 4.041269278526306, "ce_loss_7": 3.6985298871994017, "epoch": 0.162, "grad_norm": 498.0, "kl_loss_10": 118.67424545288085, "kl_loss_2": 1385.5154724121094, "kl_loss_3": 999.56015625, "kl_loss_7": 250.75928268432617, "learning_rate": 0.0009429546814534529, "loss": 699.0302, "step": 1620 }, { "ce_loss_10": 3.639633226394653, "ce_loss_13": 3.5706356167793274, "ce_loss_2": 4.241236877441406, "ce_loss_3": 4.056409633159637, "ce_loss_7": 3.708655667304993, "epoch": 0.163, "grad_norm": 384.0, "kl_loss_10": 117.12662200927734, "kl_loss_2": 1374.89609375, "kl_loss_3": 989.9520751953125, "kl_loss_7": 248.8686378479004, "learning_rate": 0.0009422164654989072, "loss": 676.7936, "step": 1630 }, { "ce_loss_10": 3.7635043978691103, "ce_loss_13": 3.687360870838165, "ce_loss_2": 4.338383412361145, "ce_loss_3": 4.1611551403999325, "ce_loss_7": 3.828977358341217, "epoch": 0.164, "grad_norm": 424.0, "kl_loss_10": 119.46500015258789, "kl_loss_2": 1362.1967407226562, "kl_loss_3": 990.5942932128906, "kl_loss_7": 248.94699325561524, "learning_rate": 0.0009414737964294635, "loss": 685.8197, "step": 1640 }, { "ce_loss_10": 3.678090500831604, "ce_loss_13": 3.6101470470428465, "ce_loss_2": 4.259114742279053, "ce_loss_3": 4.078064382076263, "ce_loss_7": 3.7428590416908265, "epoch": 0.165, "grad_norm": 444.0, "kl_loss_10": 112.9244888305664, "kl_loss_2": 1333.3570129394532, "kl_loss_3": 969.1624145507812, "kl_loss_7": 238.38165054321288, "learning_rate": 0.000940726681723791, "loss": 682.7061, "step": 1650 }, { "ce_loss_10": 3.512197470664978, "ce_loss_13": 3.4408557653427123, "ce_loss_2": 4.148977339267731, "ce_loss_3": 3.9514773368835447, "ce_loss_7": 3.5815786600112913, "epoch": 0.166, "grad_norm": 488.0, "kl_loss_10": 117.70020294189453, "kl_loss_2": 1442.5229797363281, "kl_loss_3": 1035.8803924560548, "kl_loss_7": 256.4058250427246, "learning_rate": 0.0009399751289053266, "loss": 690.3188, "step": 1660 }, { "ce_loss_10": 3.742681550979614, "ce_loss_13": 3.671311604976654, "ce_loss_2": 4.328805279731751, "ce_loss_3": 4.135993778705597, "ce_loss_7": 3.809998023509979, "epoch": 0.167, "grad_norm": 478.0, "kl_loss_10": 116.78232650756836, "kl_loss_2": 1366.6806213378907, "kl_loss_3": 967.4927185058593, "kl_loss_7": 249.3471366882324, "learning_rate": 0.0009392191455421988, "loss": 682.1736, "step": 1670 }, { "ce_loss_10": 3.7067878365516664, "ce_loss_13": 3.6276530623435974, "ce_loss_2": 4.298012292385101, "ce_loss_3": 4.104024171829224, "ce_loss_7": 3.7697168350219727, "epoch": 0.168, "grad_norm": 490.0, "kl_loss_10": 123.8147029876709, "kl_loss_2": 1386.5616271972656, "kl_loss_3": 990.9451782226563, "kl_loss_7": 260.7920967102051, "learning_rate": 0.0009384587392471515, "loss": 679.4555, "step": 1680 }, { "ce_loss_10": 3.7010039329528808, "ce_loss_13": 3.629232919216156, "ce_loss_2": 4.288461661338806, "ce_loss_3": 4.104441356658936, "ce_loss_7": 3.773091959953308, "epoch": 0.169, "grad_norm": 494.0, "kl_loss_10": 117.7159465789795, "kl_loss_2": 1349.4280029296874, "kl_loss_3": 968.4034729003906, "kl_loss_7": 251.4811584472656, "learning_rate": 0.0009376939176774678, "loss": 675.85, "step": 1690 }, { "ce_loss_10": 3.678883969783783, "ce_loss_13": 3.602108871936798, "ce_loss_2": 4.274775016307831, "ce_loss_3": 4.074398016929626, "ce_loss_7": 3.7432108521461487, "epoch": 0.17, "grad_norm": 540.0, "kl_loss_10": 124.26388397216797, "kl_loss_2": 1371.6622314453125, "kl_loss_3": 974.3467742919922, "kl_loss_7": 252.39389877319337, "learning_rate": 0.0009369246885348925, "loss": 687.5515, "step": 1700 }, { "ce_loss_10": 3.6718587994575502, "ce_loss_13": 3.591440510749817, "ce_loss_2": 4.303124558925629, "ce_loss_3": 4.095115387439728, "ce_loss_7": 3.7370152711868285, "epoch": 0.171, "grad_norm": 548.0, "kl_loss_10": 130.6899742126465, "kl_loss_2": 1433.3515563964843, "kl_loss_3": 1016.4502960205078, "kl_loss_7": 255.15539627075196, "learning_rate": 0.0009361510595655545, "loss": 695.4526, "step": 1710 }, { "ce_loss_10": 3.6283922672271727, "ce_loss_13": 3.5495692014694216, "ce_loss_2": 4.237072479724884, "ce_loss_3": 4.041323733329773, "ce_loss_7": 3.6956202745437623, "epoch": 0.172, "grad_norm": 466.0, "kl_loss_10": 127.2772174835205, "kl_loss_2": 1409.4606872558593, "kl_loss_3": 1009.6889221191407, "kl_loss_7": 256.7372299194336, "learning_rate": 0.0009353730385598887, "loss": 691.3762, "step": 1720 }, { "ce_loss_10": 3.54926735162735, "ce_loss_13": 3.4769670009613036, "ce_loss_2": 4.178456115722656, "ce_loss_3": 3.9720041275024416, "ce_loss_7": 3.6176819682121275, "epoch": 0.173, "grad_norm": 436.0, "kl_loss_10": 118.23514060974121, "kl_loss_2": 1418.4845947265626, "kl_loss_3": 998.5092987060547, "kl_loss_7": 249.0585678100586, "learning_rate": 0.0009345906333525581, "loss": 697.0381, "step": 1730 }, { "ce_loss_10": 3.5872240900993346, "ce_loss_13": 3.5136430621147157, "ce_loss_2": 4.2012934923172, "ce_loss_3": 3.9967063546180723, "ce_loss_7": 3.65671169757843, "epoch": 0.174, "grad_norm": 408.0, "kl_loss_10": 122.21610031127929, "kl_loss_2": 1418.8342651367188, "kl_loss_3": 1007.5152191162109, "kl_loss_7": 254.60486221313477, "learning_rate": 0.0009338038518223745, "loss": 687.4246, "step": 1740 }, { "ce_loss_10": 3.657099163532257, "ce_loss_13": 3.5811222553253175, "ce_loss_2": 4.272738003730774, "ce_loss_3": 4.0657650351524355, "ce_loss_7": 3.7293712973594664, "epoch": 0.175, "grad_norm": 424.0, "kl_loss_10": 122.57909774780273, "kl_loss_2": 1418.8521423339844, "kl_loss_3": 1004.8310028076172, "kl_loss_7": 258.8813926696777, "learning_rate": 0.0009330127018922195, "loss": 709.7155, "step": 1750 }, { "ce_loss_10": 3.60728679895401, "ce_loss_13": 3.5332212805747987, "ce_loss_2": 4.2153865694999695, "ce_loss_3": 4.016399335861206, "ce_loss_7": 3.6759958028793336, "epoch": 0.176, "grad_norm": 446.0, "kl_loss_10": 117.00486183166504, "kl_loss_2": 1406.6310485839845, "kl_loss_3": 989.8223937988281, "kl_loss_7": 252.39801330566405, "learning_rate": 0.0009322171915289634, "loss": 689.0163, "step": 1760 }, { "ce_loss_10": 3.640791046619415, "ce_loss_13": 3.5716994404792786, "ce_loss_2": 4.240784847736359, "ce_loss_3": 4.040842926502227, "ce_loss_7": 3.7066658616065977, "epoch": 0.177, "grad_norm": 504.0, "kl_loss_10": 114.7268009185791, "kl_loss_2": 1384.9641479492188, "kl_loss_3": 983.0798065185547, "kl_loss_7": 249.6033966064453, "learning_rate": 0.0009314173287433873, "loss": 677.7067, "step": 1770 }, { "ce_loss_10": 3.6371870756149294, "ce_loss_13": 3.565370166301727, "ce_loss_2": 4.248478496074677, "ce_loss_3": 4.042388367652893, "ce_loss_7": 3.7081421256065368, "epoch": 0.178, "grad_norm": 544.0, "kl_loss_10": 117.58927421569824, "kl_loss_2": 1410.781103515625, "kl_loss_3": 995.2456298828125, "kl_loss_7": 252.9298988342285, "learning_rate": 0.0009306131215901003, "loss": 681.0704, "step": 1780 }, { "ce_loss_10": 3.6657212376594543, "ce_loss_13": 3.5942596793174744, "ce_loss_2": 4.267256224155426, "ce_loss_3": 4.070630991458893, "ce_loss_7": 3.736851954460144, "epoch": 0.179, "grad_norm": 608.0, "kl_loss_10": 117.64482841491699, "kl_loss_2": 1382.4147399902345, "kl_loss_3": 974.7222351074219, "kl_loss_7": 254.88860549926758, "learning_rate": 0.0009298045781674596, "loss": 674.4276, "step": 1790 }, { "ce_loss_10": 3.6482357382774353, "ce_loss_13": 3.577735936641693, "ce_loss_2": 4.238207507133484, "ce_loss_3": 4.047251141071319, "ce_loss_7": 3.7247403979301454, "epoch": 0.18, "grad_norm": 584.0, "kl_loss_10": 113.15109825134277, "kl_loss_2": 1356.1355346679688, "kl_loss_3": 966.8513641357422, "kl_loss_7": 260.6168983459473, "learning_rate": 0.0009289917066174886, "loss": 687.0212, "step": 1800 }, { "ce_loss_10": 3.6436230182647704, "ce_loss_13": 3.573524606227875, "ce_loss_2": 4.205891370773315, "ce_loss_3": 4.035960531234741, "ce_loss_7": 3.713192844390869, "epoch": 0.181, "grad_norm": 644.0, "kl_loss_10": 111.37209777832031, "kl_loss_2": 1312.0460144042968, "kl_loss_3": 951.6195190429687, "kl_loss_7": 248.27648315429687, "learning_rate": 0.0009281745151257945, "loss": 665.2686, "step": 1810 }, { "ce_loss_10": 3.6573068499565125, "ce_loss_13": 3.5899064898490907, "ce_loss_2": 4.263534939289093, "ce_loss_3": 4.073390209674836, "ce_loss_7": 3.725028562545776, "epoch": 0.182, "grad_norm": 496.0, "kl_loss_10": 112.47701683044434, "kl_loss_2": 1362.4901000976563, "kl_loss_3": 982.173095703125, "kl_loss_7": 248.95221557617188, "learning_rate": 0.0009273530119214868, "loss": 681.1132, "step": 1820 }, { "ce_loss_10": 3.7659960746765138, "ce_loss_13": 3.6931302428245543, "ce_loss_2": 4.335440850257873, "ce_loss_3": 4.146318483352661, "ce_loss_7": 3.831969678401947, "epoch": 0.183, "grad_norm": 460.0, "kl_loss_10": 115.37424812316894, "kl_loss_2": 1332.8465270996094, "kl_loss_3": 945.53359375, "kl_loss_7": 244.2625930786133, "learning_rate": 0.0009265272052770935, "loss": 653.1528, "step": 1830 }, { "ce_loss_10": 3.573833405971527, "ce_loss_13": 3.504916477203369, "ce_loss_2": 4.191505336761475, "ce_loss_3": 4.003697621822357, "ce_loss_7": 3.6443989157676695, "epoch": 0.184, "grad_norm": 524.0, "kl_loss_10": 110.15304069519043, "kl_loss_2": 1378.9180969238282, "kl_loss_3": 997.8934539794922, "kl_loss_7": 241.1827133178711, "learning_rate": 0.0009256971035084784, "loss": 679.6828, "step": 1840 }, { "ce_loss_10": 3.5141557097434997, "ce_loss_13": 3.4410739183425902, "ce_loss_2": 4.1375454545021055, "ce_loss_3": 3.934183955192566, "ce_loss_7": 3.588452696800232, "epoch": 0.185, "grad_norm": 528.0, "kl_loss_10": 114.25855445861816, "kl_loss_2": 1412.8881896972657, "kl_loss_3": 1019.4745208740235, "kl_loss_7": 253.67017517089843, "learning_rate": 0.0009248627149747573, "loss": 690.3363, "step": 1850 }, { "ce_loss_10": 3.7252640962600707, "ce_loss_13": 3.653822290897369, "ce_loss_2": 4.3040543556213375, "ce_loss_3": 4.132464408874512, "ce_loss_7": 3.793966567516327, "epoch": 0.186, "grad_norm": 564.0, "kl_loss_10": 115.14772300720215, "kl_loss_2": 1340.048565673828, "kl_loss_3": 980.2122955322266, "kl_loss_7": 244.52606735229492, "learning_rate": 0.0009240240480782129, "loss": 674.7569, "step": 1860 }, { "ce_loss_10": 3.635197627544403, "ce_loss_13": 3.561525750160217, "ce_loss_2": 4.234712994098663, "ce_loss_3": 4.036596286296844, "ce_loss_7": 3.7000155448913574, "epoch": 0.187, "grad_norm": 442.0, "kl_loss_10": 116.40899467468262, "kl_loss_2": 1366.7482482910157, "kl_loss_3": 985.8397155761719, "kl_loss_7": 245.4348571777344, "learning_rate": 0.0009231811112642122, "loss": 670.6495, "step": 1870 }, { "ce_loss_10": 3.680171477794647, "ce_loss_13": 3.607526624202728, "ce_loss_2": 4.242461228370667, "ce_loss_3": 4.0597851276397705, "ce_loss_7": 3.7417822241783143, "epoch": 0.188, "grad_norm": 462.0, "kl_loss_10": 115.97058601379395, "kl_loss_2": 1329.476806640625, "kl_loss_3": 944.8101593017578, "kl_loss_7": 240.48009033203124, "learning_rate": 0.0009223339130211192, "loss": 656.504, "step": 1880 }, { "ce_loss_10": 3.527233564853668, "ce_loss_13": 3.456979143619537, "ce_loss_2": 4.1365337610244755, "ce_loss_3": 3.9334477186203003, "ce_loss_7": 3.5922507286071776, "epoch": 0.189, "grad_norm": 492.0, "kl_loss_10": 120.61857757568359, "kl_loss_2": 1391.3780578613282, "kl_loss_3": 981.7018951416015, "kl_loss_7": 240.8455017089844, "learning_rate": 0.0009214824618802108, "loss": 678.3247, "step": 1890 }, { "ce_loss_10": 3.715823400020599, "ce_loss_13": 3.639923906326294, "ce_loss_2": 4.3134965896606445, "ce_loss_3": 4.111241257190704, "ce_loss_7": 3.779346799850464, "epoch": 0.19, "grad_norm": 456.0, "kl_loss_10": 127.23999710083008, "kl_loss_2": 1364.8941589355468, "kl_loss_3": 960.9680725097656, "kl_loss_7": 248.77009048461915, "learning_rate": 0.0009206267664155906, "loss": 685.2967, "step": 1900 }, { "ce_loss_10": 3.6354769825935365, "ce_loss_13": 3.556038224697113, "ce_loss_2": 4.224778318405152, "ce_loss_3": 4.022096812725067, "ce_loss_7": 3.690963363647461, "epoch": 0.191, "grad_norm": 524.0, "kl_loss_10": 125.37596015930175, "kl_loss_2": 1371.057391357422, "kl_loss_3": 969.1447021484375, "kl_loss_7": 243.19865951538085, "learning_rate": 0.0009197668352441024, "loss": 678.1597, "step": 1910 }, { "ce_loss_10": 3.6849255323410035, "ce_loss_13": 3.6070022225379943, "ce_loss_2": 4.272895455360413, "ce_loss_3": 4.0722639799118046, "ce_loss_7": 3.741350519657135, "epoch": 0.192, "grad_norm": 512.0, "kl_loss_10": 128.65237312316896, "kl_loss_2": 1349.729931640625, "kl_loss_3": 949.7346984863282, "kl_loss_7": 242.41346817016603, "learning_rate": 0.0009189026770252437, "loss": 671.3585, "step": 1920 }, { "ce_loss_10": 3.7201656699180603, "ce_loss_13": 3.6394999861717223, "ce_loss_2": 4.302338600158691, "ce_loss_3": 4.1032923579216005, "ce_loss_7": 3.7764319658279417, "epoch": 0.193, "grad_norm": 458.0, "kl_loss_10": 133.32494201660157, "kl_loss_2": 1342.450421142578, "kl_loss_3": 949.4765075683594, "kl_loss_7": 246.09827728271483, "learning_rate": 0.000918034300461078, "loss": 688.7368, "step": 1930 }, { "ce_loss_10": 3.747203004360199, "ce_loss_13": 3.6689361929893494, "ce_loss_2": 4.312567710876465, "ce_loss_3": 4.122261881828308, "ce_loss_7": 3.8043017029762267, "epoch": 0.194, "grad_norm": 446.0, "kl_loss_10": 129.00423164367675, "kl_loss_2": 1325.102197265625, "kl_loss_3": 931.5237396240234, "kl_loss_7": 241.53863983154298, "learning_rate": 0.0009171617142961477, "loss": 661.2737, "step": 1940 }, { "ce_loss_10": 3.699472951889038, "ce_loss_13": 3.6279419898986816, "ce_loss_2": 4.281847763061523, "ce_loss_3": 4.083541011810302, "ce_loss_7": 3.7648919463157653, "epoch": 0.195, "grad_norm": 434.0, "kl_loss_10": 121.35350723266602, "kl_loss_2": 1352.1436584472656, "kl_loss_3": 952.5272399902344, "kl_loss_7": 240.53460235595702, "learning_rate": 0.0009162849273173857, "loss": 665.7376, "step": 1950 }, { "ce_loss_10": 3.632410800457001, "ce_loss_13": 3.5614632248878477, "ce_loss_2": 4.223404765129089, "ce_loss_3": 4.023203945159912, "ce_loss_7": 3.700835573673248, "epoch": 0.196, "grad_norm": 470.0, "kl_loss_10": 118.8283805847168, "kl_loss_2": 1344.0367370605468, "kl_loss_3": 944.1380065917969, "kl_loss_7": 251.04139099121093, "learning_rate": 0.0009154039483540273, "loss": 672.422, "step": 1960 }, { "ce_loss_10": 3.6197654128074648, "ce_loss_13": 3.546481454372406, "ce_loss_2": 4.201360607147217, "ce_loss_3": 4.004413700103759, "ce_loss_7": 3.682723355293274, "epoch": 0.197, "grad_norm": 406.0, "kl_loss_10": 120.08837623596192, "kl_loss_2": 1349.5242309570312, "kl_loss_3": 942.7119018554688, "kl_loss_7": 243.98333435058595, "learning_rate": 0.0009145187862775209, "loss": 667.6594, "step": 1970 }, { "ce_loss_10": 3.6506257891654967, "ce_loss_13": 3.5804669737815855, "ce_loss_2": 4.243929970264435, "ce_loss_3": 4.034862732887268, "ce_loss_7": 3.7135458827018737, "epoch": 0.198, "grad_norm": 620.0, "kl_loss_10": 117.48477897644042, "kl_loss_2": 1377.6264770507812, "kl_loss_3": 958.5087188720703, "kl_loss_7": 243.4327537536621, "learning_rate": 0.0009136294500014386, "loss": 665.5496, "step": 1980 }, { "ce_loss_10": 3.599961686134338, "ce_loss_13": 3.528086531162262, "ce_loss_2": 4.217166924476624, "ce_loss_3": 4.008637738227844, "ce_loss_7": 3.6669308066368105, "epoch": 0.199, "grad_norm": 616.0, "kl_loss_10": 115.34629516601562, "kl_loss_2": 1399.4097900390625, "kl_loss_3": 983.5843353271484, "kl_loss_7": 244.90453720092773, "learning_rate": 0.000912735948481387, "loss": 681.3188, "step": 1990 }, { "ce_loss_10": 3.6347181677818297, "ce_loss_13": 3.560755395889282, "ce_loss_2": 4.230910205841065, "ce_loss_3": 4.03362866640091, "ce_loss_7": 3.700130546092987, "epoch": 0.2, "grad_norm": 492.0, "kl_loss_10": 115.55288009643554, "kl_loss_2": 1372.0876098632812, "kl_loss_3": 976.6782318115235, "kl_loss_7": 248.47412033081054, "learning_rate": 0.0009118382907149164, "loss": 666.3086, "step": 2000 }, { "ce_loss_10": 3.6592599511146546, "ce_loss_13": 3.5870088934898376, "ce_loss_2": 4.23843743801117, "ce_loss_3": 4.045619630813599, "ce_loss_7": 3.722980320453644, "epoch": 0.201, "grad_norm": 492.0, "kl_loss_10": 114.64969139099121, "kl_loss_2": 1351.183447265625, "kl_loss_3": 956.2785675048829, "kl_loss_7": 247.1648811340332, "learning_rate": 0.0009109364857414306, "loss": 658.4385, "step": 2010 }, { "ce_loss_10": 3.6247077345848084, "ce_loss_13": 3.5549973130226133, "ce_loss_2": 4.192802679538727, "ce_loss_3": 4.006897258758545, "ce_loss_7": 3.694356381893158, "epoch": 0.202, "grad_norm": 432.0, "kl_loss_10": 111.96462211608886, "kl_loss_2": 1332.7575988769531, "kl_loss_3": 943.8550109863281, "kl_loss_7": 248.51104660034179, "learning_rate": 0.0009100305426420956, "loss": 673.1317, "step": 2020 }, { "ce_loss_10": 3.5841406345367433, "ce_loss_13": 3.5164321780204775, "ce_loss_2": 4.202693927288055, "ce_loss_3": 3.9953248143196105, "ce_loss_7": 3.650038242340088, "epoch": 0.203, "grad_norm": 432.0, "kl_loss_10": 113.3315975189209, "kl_loss_2": 1413.3973693847656, "kl_loss_3": 984.508837890625, "kl_loss_7": 247.24474029541017, "learning_rate": 0.0009091204705397484, "loss": 669.0848, "step": 2030 }, { "ce_loss_10": 3.585637128353119, "ce_loss_13": 3.5091155648231505, "ce_loss_2": 4.185344040393829, "ce_loss_3": 3.992532753944397, "ce_loss_7": 3.6510769963264464, "epoch": 0.204, "grad_norm": 448.0, "kl_loss_10": 124.74103927612305, "kl_loss_2": 1400.9709167480469, "kl_loss_3": 992.3661834716797, "kl_loss_7": 250.7290901184082, "learning_rate": 0.0009082062785988049, "loss": 681.1268, "step": 2040 }, { "ce_loss_10": 3.721049964427948, "ce_loss_13": 3.6463207244873046, "ce_loss_2": 4.2808568477630615, "ce_loss_3": 4.093539321422577, "ce_loss_7": 3.7847790718078613, "epoch": 0.205, "grad_norm": 466.0, "kl_loss_10": 117.74674758911132, "kl_loss_2": 1322.8293701171874, "kl_loss_3": 931.9617309570312, "kl_loss_7": 242.782958984375, "learning_rate": 0.0009072879760251679, "loss": 667.7382, "step": 2050 }, { "ce_loss_10": 3.6576568126678466, "ce_loss_13": 3.5876036405563356, "ce_loss_2": 4.261196970939636, "ce_loss_3": 4.065271866321564, "ce_loss_7": 3.727122116088867, "epoch": 0.206, "grad_norm": 510.0, "kl_loss_10": 116.0688491821289, "kl_loss_2": 1368.6263488769532, "kl_loss_3": 974.0030578613281, "kl_loss_7": 247.75207290649413, "learning_rate": 0.0009063655720661341, "loss": 667.9454, "step": 2060 }, { "ce_loss_10": 3.7091850519180296, "ce_loss_13": 3.6359564065933228, "ce_loss_2": 4.2861632108688354, "ce_loss_3": 4.091731405258178, "ce_loss_7": 3.780376970767975, "epoch": 0.207, "grad_norm": 756.0, "kl_loss_10": 117.6738265991211, "kl_loss_2": 1338.1011291503905, "kl_loss_3": 948.2550628662109, "kl_loss_7": 265.19689025878904, "learning_rate": 0.000905439076010301, "loss": 666.8086, "step": 2070 }, { "ce_loss_10": 3.661241602897644, "ce_loss_13": 3.5897186398506165, "ce_loss_2": 4.2563663721084595, "ce_loss_3": 4.06014586687088, "ce_loss_7": 3.7477613568305967, "epoch": 0.208, "grad_norm": 502.0, "kl_loss_10": 114.75448532104492, "kl_loss_2": 1354.8022583007812, "kl_loss_3": 965.9129913330078, "kl_loss_7": 280.605126953125, "learning_rate": 0.0009045084971874737, "loss": 668.0169, "step": 2080 }, { "ce_loss_10": 3.6376566290855408, "ce_loss_13": 3.568475866317749, "ce_loss_2": 4.219207537174225, "ce_loss_3": 4.027460336685181, "ce_loss_7": 3.7098584175109863, "epoch": 0.209, "grad_norm": 476.0, "kl_loss_10": 112.78402214050293, "kl_loss_2": 1340.8613708496093, "kl_loss_3": 948.0414184570312, "kl_loss_7": 264.3713745117187, "learning_rate": 0.0009035738449685707, "loss": 673.0266, "step": 2090 }, { "ce_loss_10": 3.5796483874320986, "ce_loss_13": 3.507876431941986, "ce_loss_2": 4.179275572299957, "ce_loss_3": 3.9804170727729797, "ce_loss_7": 3.6553168416023256, "epoch": 0.21, "grad_norm": 576.0, "kl_loss_10": 116.00268287658692, "kl_loss_2": 1358.9301879882812, "kl_loss_3": 960.5301177978515, "kl_loss_7": 258.80527420043944, "learning_rate": 0.0009026351287655293, "loss": 660.1454, "step": 2100 }, { "ce_loss_10": 3.7848559260368346, "ce_loss_13": 3.7130728244781492, "ce_loss_2": 4.324071049690247, "ce_loss_3": 4.141572403907776, "ce_loss_7": 3.8475868105888367, "epoch": 0.211, "grad_norm": 410.0, "kl_loss_10": 115.61974067687989, "kl_loss_2": 1276.820086669922, "kl_loss_3": 901.7400726318359, "kl_loss_7": 240.78678359985352, "learning_rate": 0.0009016923580312113, "loss": 636.6335, "step": 2110 }, { "ce_loss_10": 3.6301342844963074, "ce_loss_13": 3.560654580593109, "ce_loss_2": 4.20012868642807, "ce_loss_3": 4.0125791192054745, "ce_loss_7": 3.6946483969688417, "epoch": 0.212, "grad_norm": 462.0, "kl_loss_10": 112.99716110229492, "kl_loss_2": 1313.61005859375, "kl_loss_3": 932.8424255371094, "kl_loss_7": 240.82636260986328, "learning_rate": 0.0009007455422593077, "loss": 661.1402, "step": 2120 }, { "ce_loss_10": 3.643904185295105, "ce_loss_13": 3.5732839465141297, "ce_loss_2": 4.229895269870758, "ce_loss_3": 4.039864921569825, "ce_loss_7": 3.7100953698158263, "epoch": 0.213, "grad_norm": 544.0, "kl_loss_10": 113.59500656127929, "kl_loss_2": 1376.1138916015625, "kl_loss_3": 982.583627319336, "kl_loss_7": 245.73143768310547, "learning_rate": 0.0008997946909842425, "loss": 673.5951, "step": 2130 }, { "ce_loss_10": 3.6592751264572145, "ce_loss_13": 3.5883899569511413, "ce_loss_2": 4.269070100784302, "ce_loss_3": 4.073972117900849, "ce_loss_7": 3.7271844029426573, "epoch": 0.214, "grad_norm": 486.0, "kl_loss_10": 115.54258766174317, "kl_loss_2": 1390.4590087890624, "kl_loss_3": 991.5129302978515, "kl_loss_7": 248.1077392578125, "learning_rate": 0.0008988398137810777, "loss": 666.5385, "step": 2140 }, { "ce_loss_10": 3.696693778038025, "ce_loss_13": 3.6274760484695436, "ce_loss_2": 4.272996628284455, "ce_loss_3": 4.077895438671112, "ce_loss_7": 3.759436583518982, "epoch": 0.215, "grad_norm": 410.0, "kl_loss_10": 109.15078239440918, "kl_loss_2": 1323.0664428710938, "kl_loss_3": 929.4178771972656, "kl_loss_7": 235.6734588623047, "learning_rate": 0.0008978809202654162, "loss": 648.7643, "step": 2150 }, { "ce_loss_10": 3.674948477745056, "ce_loss_13": 3.6074150681495665, "ce_loss_2": 4.254977214336395, "ce_loss_3": 4.054577016830445, "ce_loss_7": 3.7380695223808287, "epoch": 0.216, "grad_norm": 342.0, "kl_loss_10": 111.83034286499023, "kl_loss_2": 1326.1663391113282, "kl_loss_3": 930.476919555664, "kl_loss_7": 237.29275283813476, "learning_rate": 0.0008969180200933046, "loss": 659.8788, "step": 2160 }, { "ce_loss_10": 3.633396315574646, "ce_loss_13": 3.5632909893989564, "ce_loss_2": 4.235332405567169, "ce_loss_3": 4.043020272254944, "ce_loss_7": 3.700575852394104, "epoch": 0.217, "grad_norm": 426.0, "kl_loss_10": 113.71143455505371, "kl_loss_2": 1376.4859924316406, "kl_loss_3": 965.2266693115234, "kl_loss_7": 244.74252319335938, "learning_rate": 0.0008959511229611376, "loss": 671.5447, "step": 2170 }, { "ce_loss_10": 3.7160756826400756, "ce_loss_13": 3.6463282823562624, "ce_loss_2": 4.29187992811203, "ce_loss_3": 4.0930745005607605, "ce_loss_7": 3.77754408121109, "epoch": 0.218, "grad_norm": 494.0, "kl_loss_10": 112.80306968688964, "kl_loss_2": 1327.3540283203124, "kl_loss_3": 931.4955108642578, "kl_loss_7": 236.75977325439453, "learning_rate": 0.0008949802386055581, "loss": 652.6458, "step": 2180 }, { "ce_loss_10": 3.5766260862350463, "ce_loss_13": 3.5034859418869018, "ce_loss_2": 4.159404098987579, "ce_loss_3": 3.9599217534065247, "ce_loss_7": 3.6389345288276673, "epoch": 0.219, "grad_norm": 466.0, "kl_loss_10": 111.61733703613281, "kl_loss_2": 1335.0789672851563, "kl_loss_3": 936.6266204833985, "kl_loss_7": 234.00814971923828, "learning_rate": 0.0008940053768033609, "loss": 665.0317, "step": 2190 }, { "ce_loss_10": 3.65551677942276, "ce_loss_13": 3.58828284740448, "ce_loss_2": 4.223571491241455, "ce_loss_3": 4.04930864572525, "ce_loss_7": 3.7170740485191347, "epoch": 0.22, "grad_norm": 500.0, "kl_loss_10": 111.12692604064941, "kl_loss_2": 1315.6073425292968, "kl_loss_3": 947.5290161132813, "kl_loss_7": 230.48658447265626, "learning_rate": 0.0008930265473713938, "loss": 654.9715, "step": 2200 }, { "ce_loss_10": 3.6195818066596983, "ce_loss_13": 3.54861319065094, "ce_loss_2": 4.198423433303833, "ce_loss_3": 4.012469959259033, "ce_loss_7": 3.681425595283508, "epoch": 0.221, "grad_norm": 528.0, "kl_loss_10": 115.1269718170166, "kl_loss_2": 1324.1462280273438, "kl_loss_3": 954.0400299072265, "kl_loss_7": 233.1149475097656, "learning_rate": 0.0008920437601664579, "loss": 648.2547, "step": 2210 }, { "ce_loss_10": 3.6091471552848815, "ce_loss_13": 3.539783036708832, "ce_loss_2": 4.181177127361297, "ce_loss_3": 3.993445408344269, "ce_loss_7": 3.669792366027832, "epoch": 0.222, "grad_norm": 410.0, "kl_loss_10": 115.02236862182617, "kl_loss_2": 1333.0483093261719, "kl_loss_3": 948.5272521972656, "kl_loss_7": 236.40514373779297, "learning_rate": 0.0008910570250852097, "loss": 647.6241, "step": 2220 }, { "ce_loss_10": 3.7300463914871216, "ce_loss_13": 3.657940351963043, "ce_loss_2": 4.2689752101898195, "ce_loss_3": 4.08460431098938, "ce_loss_7": 3.786776435375214, "epoch": 0.223, "grad_norm": 396.0, "kl_loss_10": 119.05343589782714, "kl_loss_2": 1270.7809265136718, "kl_loss_3": 894.4331726074219, "kl_loss_7": 233.2676574707031, "learning_rate": 0.0008900663520640604, "loss": 634.1773, "step": 2230 }, { "ce_loss_10": 3.668592298030853, "ce_loss_13": 3.5982241868972777, "ce_loss_2": 4.234554326534271, "ce_loss_3": 4.045566046237946, "ce_loss_7": 3.7271997809410093, "epoch": 0.224, "grad_norm": 484.0, "kl_loss_10": 115.87549057006837, "kl_loss_2": 1305.9935668945313, "kl_loss_3": 922.4547302246094, "kl_loss_7": 233.83654251098633, "learning_rate": 0.0008890717510790764, "loss": 651.4086, "step": 2240 }, { "ce_loss_10": 3.624187970161438, "ce_loss_13": 3.553602933883667, "ce_loss_2": 4.209651458263397, "ce_loss_3": 4.014156377315521, "ce_loss_7": 3.6865435361862184, "epoch": 0.225, "grad_norm": 428.0, "kl_loss_10": 111.85544166564941, "kl_loss_2": 1344.305419921875, "kl_loss_3": 948.2704040527344, "kl_loss_7": 233.77221450805663, "learning_rate": 0.0008880732321458784, "loss": 659.9288, "step": 2250 }, { "ce_loss_10": 3.6572599172592164, "ce_loss_13": 3.589207625389099, "ce_loss_2": 4.229009163379669, "ce_loss_3": 4.035797142982483, "ce_loss_7": 3.720983362197876, "epoch": 0.226, "grad_norm": 450.0, "kl_loss_10": 112.32478141784668, "kl_loss_2": 1315.7465759277343, "kl_loss_3": 923.5756164550781, "kl_loss_7": 233.98969955444335, "learning_rate": 0.0008870708053195413, "loss": 656.0779, "step": 2260 }, { "ce_loss_10": 3.6822890281677245, "ce_loss_13": 3.613891136646271, "ce_loss_2": 4.243361723423004, "ce_loss_3": 4.054495620727539, "ce_loss_7": 3.7477814078330995, "epoch": 0.227, "grad_norm": 510.0, "kl_loss_10": 109.18305702209473, "kl_loss_2": 1298.216571044922, "kl_loss_3": 915.6232360839844, "kl_loss_7": 232.2860221862793, "learning_rate": 0.0008860644806944918, "loss": 640.8393, "step": 2270 }, { "ce_loss_10": 3.6228482365608214, "ce_loss_13": 3.55364191532135, "ce_loss_2": 4.206750881671906, "ce_loss_3": 4.00951054096222, "ce_loss_7": 3.692072665691376, "epoch": 0.228, "grad_norm": 516.0, "kl_loss_10": 112.56623115539551, "kl_loss_2": 1346.1175109863282, "kl_loss_3": 947.8353302001954, "kl_loss_7": 250.14039154052733, "learning_rate": 0.0008850542684044079, "loss": 646.6089, "step": 2280 }, { "ce_loss_10": 3.594875121116638, "ce_loss_13": 3.5233037948608397, "ce_loss_2": 4.204600942134857, "ce_loss_3": 3.998581278324127, "ce_loss_7": 3.665067207813263, "epoch": 0.229, "grad_norm": 450.0, "kl_loss_10": 112.58180122375488, "kl_loss_2": 1387.7576477050782, "kl_loss_3": 974.7930572509765, "kl_loss_7": 248.70092544555663, "learning_rate": 0.0008840401786221159, "loss": 661.1442, "step": 2290 }, { "ce_loss_10": 3.731163203716278, "ce_loss_13": 3.6657732129096985, "ce_loss_2": 4.296657812595368, "ce_loss_3": 4.099385142326355, "ce_loss_7": 3.791227328777313, "epoch": 0.23, "grad_norm": 480.0, "kl_loss_10": 108.29386520385742, "kl_loss_2": 1301.2722473144531, "kl_loss_3": 910.2853973388671, "kl_loss_7": 230.31064682006837, "learning_rate": 0.000883022221559489, "loss": 636.6557, "step": 2300 }, { "ce_loss_10": 3.68310546875, "ce_loss_13": 3.615295338630676, "ce_loss_2": 4.257085943222046, "ce_loss_3": 4.067182207107544, "ce_loss_7": 3.7437336564064028, "epoch": 0.231, "grad_norm": 446.0, "kl_loss_10": 109.58497161865235, "kl_loss_2": 1331.8181457519531, "kl_loss_3": 935.3311614990234, "kl_loss_7": 230.688321685791, "learning_rate": 0.0008820004074673434, "loss": 666.0036, "step": 2310 }, { "ce_loss_10": 3.588702178001404, "ce_loss_13": 3.524743127822876, "ce_loss_2": 4.165178096294403, "ce_loss_3": 3.97126362323761, "ce_loss_7": 3.6520536303520204, "epoch": 0.232, "grad_norm": 494.0, "kl_loss_10": 105.68032302856446, "kl_loss_2": 1342.763037109375, "kl_loss_3": 936.0263000488281, "kl_loss_7": 229.52542724609376, "learning_rate": 0.0008809747466353355, "loss": 641.3422, "step": 2320 }, { "ce_loss_10": 3.602530860900879, "ce_loss_13": 3.5331971526145933, "ce_loss_2": 4.167471373081208, "ce_loss_3": 3.977475893497467, "ce_loss_7": 3.6642409324645997, "epoch": 0.233, "grad_norm": 458.0, "kl_loss_10": 110.41828346252441, "kl_loss_2": 1312.3292907714845, "kl_loss_3": 919.6944396972656, "kl_loss_7": 231.60648040771486, "learning_rate": 0.0008799452493918585, "loss": 645.3, "step": 2330 }, { "ce_loss_10": 3.6880576372146607, "ce_loss_13": 3.6198028326034546, "ce_loss_2": 4.254074168205261, "ce_loss_3": 4.062888276576996, "ce_loss_7": 3.7507563471794128, "epoch": 0.234, "grad_norm": 474.0, "kl_loss_10": 110.42879600524903, "kl_loss_2": 1309.2276428222656, "kl_loss_3": 921.8907257080078, "kl_loss_7": 233.63690185546875, "learning_rate": 0.0008789119261039385, "loss": 662.3614, "step": 2340 }, { "ce_loss_10": 3.5944358229637148, "ce_loss_13": 3.5265544295310973, "ce_loss_2": 4.165751957893372, "ce_loss_3": 3.9788960099220274, "ce_loss_7": 3.656530773639679, "epoch": 0.235, "grad_norm": 390.0, "kl_loss_10": 106.77197875976563, "kl_loss_2": 1303.9624755859375, "kl_loss_3": 920.4357025146485, "kl_loss_7": 233.87101974487305, "learning_rate": 0.0008778747871771292, "loss": 636.2241, "step": 2350 }, { "ce_loss_10": 3.642832565307617, "ce_loss_13": 3.5764389514923094, "ce_loss_2": 4.1962644219398495, "ce_loss_3": 4.005896735191345, "ce_loss_7": 3.703742432594299, "epoch": 0.236, "grad_norm": 488.0, "kl_loss_10": 106.56211357116699, "kl_loss_2": 1280.7797607421876, "kl_loss_3": 899.6746459960938, "kl_loss_7": 226.2964195251465, "learning_rate": 0.0008768338430554083, "loss": 628.8105, "step": 2360 }, { "ce_loss_10": 3.6540219306945803, "ce_loss_13": 3.5862942576408385, "ce_loss_2": 4.217021405696869, "ce_loss_3": 4.0323525190353395, "ce_loss_7": 3.715273082256317, "epoch": 0.237, "grad_norm": 446.0, "kl_loss_10": 108.96415519714355, "kl_loss_2": 1303.7487731933593, "kl_loss_3": 917.9468658447265, "kl_loss_7": 231.81985321044922, "learning_rate": 0.0008757891042210713, "loss": 643.5909, "step": 2370 }, { "ce_loss_10": 3.6785866141319277, "ce_loss_13": 3.6081984996795655, "ce_loss_2": 4.2465239524841305, "ce_loss_3": 4.049813544750213, "ce_loss_7": 3.7435325622558593, "epoch": 0.238, "grad_norm": 504.0, "kl_loss_10": 111.76208610534668, "kl_loss_2": 1305.791943359375, "kl_loss_3": 916.3558563232422, "kl_loss_7": 238.5102066040039, "learning_rate": 0.0008747405811946271, "loss": 645.7604, "step": 2380 }, { "ce_loss_10": 3.5631368517875672, "ce_loss_13": 3.4966716051101683, "ce_loss_2": 4.149629712104797, "ce_loss_3": 3.960558259487152, "ce_loss_7": 3.629357707500458, "epoch": 0.239, "grad_norm": 466.0, "kl_loss_10": 110.7029800415039, "kl_loss_2": 1342.1500549316406, "kl_loss_3": 952.2329833984375, "kl_loss_7": 240.75519790649415, "learning_rate": 0.0008736882845346905, "loss": 640.1473, "step": 2390 }, { "ce_loss_10": 3.6702625513076783, "ce_loss_13": 3.598974347114563, "ce_loss_2": 4.235510897636414, "ce_loss_3": 4.041280698776245, "ce_loss_7": 3.734322738647461, "epoch": 0.24, "grad_norm": 504.0, "kl_loss_10": 116.23694610595703, "kl_loss_2": 1302.5589904785156, "kl_loss_3": 916.9543487548829, "kl_loss_7": 247.71970977783204, "learning_rate": 0.0008726322248378774, "loss": 637.9229, "step": 2400 }, { "ce_loss_10": 3.6652265906333925, "ce_loss_13": 3.595516872406006, "ce_loss_2": 4.246520745754242, "ce_loss_3": 4.043909120559692, "ce_loss_7": 3.7296547532081603, "epoch": 0.241, "grad_norm": 450.0, "kl_loss_10": 113.42751388549804, "kl_loss_2": 1334.7802490234376, "kl_loss_3": 932.5218017578125, "kl_loss_7": 240.9356887817383, "learning_rate": 0.0008715724127386971, "loss": 657.7121, "step": 2410 }, { "ce_loss_10": 3.7339873194694517, "ce_loss_13": 3.6643826484680178, "ce_loss_2": 4.278091061115265, "ce_loss_3": 4.095656621456146, "ce_loss_7": 3.791609489917755, "epoch": 0.242, "grad_norm": 462.0, "kl_loss_10": 112.56025924682618, "kl_loss_2": 1285.8053466796875, "kl_loss_3": 903.9169921875, "kl_loss_7": 234.85026092529296, "learning_rate": 0.0008705088589094458, "loss": 638.7832, "step": 2420 }, { "ce_loss_10": 3.745934009552002, "ce_loss_13": 3.677195417881012, "ce_loss_2": 4.306411802768707, "ce_loss_3": 4.115467298030853, "ce_loss_7": 3.8079170107841493, "epoch": 0.243, "grad_norm": 414.0, "kl_loss_10": 111.40414924621582, "kl_loss_2": 1291.0523193359375, "kl_loss_3": 906.0183563232422, "kl_loss_7": 231.1467155456543, "learning_rate": 0.0008694415740600988, "loss": 640.6179, "step": 2430 }, { "ce_loss_10": 3.595633792877197, "ce_loss_13": 3.5286824345588683, "ce_loss_2": 4.186046612262726, "ce_loss_3": 3.9901591181755065, "ce_loss_7": 3.6573471426963806, "epoch": 0.244, "grad_norm": 500.0, "kl_loss_10": 109.70008354187011, "kl_loss_2": 1348.1939697265625, "kl_loss_3": 959.4411895751953, "kl_loss_7": 231.17713012695313, "learning_rate": 0.0008683705689382025, "loss": 654.2107, "step": 2440 }, { "ce_loss_10": 3.684832978248596, "ce_loss_13": 3.616632854938507, "ce_loss_2": 4.231842195987701, "ce_loss_3": 4.048018515110016, "ce_loss_7": 3.740548253059387, "epoch": 0.245, "grad_norm": 450.0, "kl_loss_10": 108.41531753540039, "kl_loss_2": 1280.763720703125, "kl_loss_3": 904.3856048583984, "kl_loss_7": 224.61626434326172, "learning_rate": 0.0008672958543287666, "loss": 648.306, "step": 2450 }, { "ce_loss_10": 3.6943544864654543, "ce_loss_13": 3.625825345516205, "ce_loss_2": 4.246035170555115, "ce_loss_3": 4.063331556320191, "ce_loss_7": 3.75509090423584, "epoch": 0.246, "grad_norm": 428.0, "kl_loss_10": 109.8709056854248, "kl_loss_2": 1283.2248840332031, "kl_loss_3": 904.9634002685547, "kl_loss_7": 228.17876358032225, "learning_rate": 0.0008662174410541554, "loss": 632.1618, "step": 2460 }, { "ce_loss_10": 3.6546427369117738, "ce_loss_13": 3.587355947494507, "ce_loss_2": 4.204605233669281, "ce_loss_3": 4.022764265537262, "ce_loss_7": 3.717895233631134, "epoch": 0.247, "grad_norm": 394.0, "kl_loss_10": 107.19166374206543, "kl_loss_2": 1283.1351989746095, "kl_loss_3": 906.7954956054688, "kl_loss_7": 227.77373123168945, "learning_rate": 0.0008651353399739787, "loss": 642.6704, "step": 2470 }, { "ce_loss_10": 3.685038208961487, "ce_loss_13": 3.6166242718696595, "ce_loss_2": 4.247460579872131, "ce_loss_3": 4.056502640247345, "ce_loss_7": 3.7451204299926757, "epoch": 0.248, "grad_norm": 520.0, "kl_loss_10": 109.50839309692383, "kl_loss_2": 1297.0845886230468, "kl_loss_3": 908.0451263427734, "kl_loss_7": 229.69447708129883, "learning_rate": 0.0008640495619849821, "loss": 636.7805, "step": 2480 }, { "ce_loss_10": 3.6444509506225584, "ce_loss_13": 3.5771190404891966, "ce_loss_2": 4.202155363559723, "ce_loss_3": 4.00964595079422, "ce_loss_7": 3.709311318397522, "epoch": 0.249, "grad_norm": 492.0, "kl_loss_10": 107.21613540649415, "kl_loss_2": 1287.2485595703124, "kl_loss_3": 903.1389892578125, "kl_loss_7": 236.67683792114258, "learning_rate": 0.0008629601180209381, "loss": 632.6728, "step": 2490 }, { "ce_loss_10": 3.640513265132904, "ce_loss_13": 3.571250784397125, "ce_loss_2": 4.189491713047028, "ce_loss_3": 4.000437986850739, "ce_loss_7": 3.6998685002326965, "epoch": 0.25, "grad_norm": 388.0, "kl_loss_10": 109.32069320678711, "kl_loss_2": 1269.254248046875, "kl_loss_3": 889.8983123779296, "kl_loss_7": 235.86445999145508, "learning_rate": 0.000861867019052535, "loss": 634.9495, "step": 2500 }, { "ce_loss_10": 3.5532511711120605, "ce_loss_13": 3.4857767462730407, "ce_loss_2": 4.138734245300293, "ce_loss_3": 3.9396822333335875, "ce_loss_7": 3.6218135476112367, "epoch": 0.251, "grad_norm": 454.0, "kl_loss_10": 108.08290519714356, "kl_loss_2": 1328.9036437988282, "kl_loss_3": 931.4583953857422, "kl_loss_7": 239.18134002685548, "learning_rate": 0.0008607702760872678, "loss": 651.695, "step": 2510 }, { "ce_loss_10": 3.6758545279502868, "ce_loss_13": 3.6099536180496217, "ce_loss_2": 4.224261367321015, "ce_loss_3": 4.035415709018707, "ce_loss_7": 3.738192629814148, "epoch": 0.252, "grad_norm": 676.0, "kl_loss_10": 109.95955963134766, "kl_loss_2": 1264.877392578125, "kl_loss_3": 890.2998840332032, "kl_loss_7": 227.99790649414064, "learning_rate": 0.0008596699001693256, "loss": 638.9367, "step": 2520 }, { "ce_loss_10": 3.695500302314758, "ce_loss_13": 3.61992267370224, "ce_loss_2": 4.222428333759308, "ce_loss_3": 4.035504674911499, "ce_loss_7": 3.743453121185303, "epoch": 0.253, "grad_norm": 548.0, "kl_loss_10": 127.16191024780274, "kl_loss_2": 1273.7016052246095, "kl_loss_3": 884.7771850585938, "kl_loss_7": 228.30911254882812, "learning_rate": 0.0008585659023794818, "loss": 643.5041, "step": 2530 }, { "ce_loss_10": 3.637289047241211, "ce_loss_13": 3.5686028838157653, "ce_loss_2": 4.218254780769348, "ce_loss_3": 4.019161069393158, "ce_loss_7": 3.696817862987518, "epoch": 0.254, "grad_norm": 462.0, "kl_loss_10": 120.60056190490722, "kl_loss_2": 1330.6954162597656, "kl_loss_3": 938.699496459961, "kl_loss_7": 233.61075134277343, "learning_rate": 0.0008574582938349817, "loss": 644.681, "step": 2540 }, { "ce_loss_10": 3.6420682311058044, "ce_loss_13": 3.567863130569458, "ce_loss_2": 4.228188097476959, "ce_loss_3": 4.029180979728698, "ce_loss_7": 3.705214190483093, "epoch": 0.255, "grad_norm": 372.0, "kl_loss_10": 117.31230735778809, "kl_loss_2": 1354.2801513671875, "kl_loss_3": 952.2808837890625, "kl_loss_7": 241.81886978149413, "learning_rate": 0.0008563470856894315, "loss": 640.0965, "step": 2550 }, { "ce_loss_10": 3.6231508612632752, "ce_loss_13": 3.5556546688079833, "ce_loss_2": 4.189491558074951, "ce_loss_3": 3.9987146973609926, "ce_loss_7": 3.681329298019409, "epoch": 0.256, "grad_norm": 472.0, "kl_loss_10": 108.91358489990235, "kl_loss_2": 1298.7685607910157, "kl_loss_3": 917.6602386474609, "kl_loss_7": 230.57952499389648, "learning_rate": 0.0008552322891326845, "loss": 638.6699, "step": 2560 }, { "ce_loss_10": 3.5982382774353026, "ce_loss_13": 3.528933322429657, "ce_loss_2": 4.162684524059296, "ce_loss_3": 3.9698683977127076, "ce_loss_7": 3.656614398956299, "epoch": 0.257, "grad_norm": 434.0, "kl_loss_10": 109.61449127197265, "kl_loss_2": 1301.0992553710937, "kl_loss_3": 921.5931274414063, "kl_loss_7": 231.00868759155273, "learning_rate": 0.0008541139153907296, "loss": 634.7393, "step": 2570 }, { "ce_loss_10": 3.5526642203330994, "ce_loss_13": 3.485519516468048, "ce_loss_2": 4.114146530628204, "ce_loss_3": 3.924081325531006, "ce_loss_7": 3.614666759967804, "epoch": 0.258, "grad_norm": 548.0, "kl_loss_10": 107.18747673034667, "kl_loss_2": 1300.3511169433593, "kl_loss_3": 919.3837615966797, "kl_loss_7": 228.61345367431642, "learning_rate": 0.0008529919757255782, "loss": 640.0102, "step": 2580 }, { "ce_loss_10": 3.591010940074921, "ce_loss_13": 3.519867956638336, "ce_loss_2": 4.122671520709991, "ce_loss_3": 3.9352630972862244, "ce_loss_7": 3.642985260486603, "epoch": 0.259, "grad_norm": 462.0, "kl_loss_10": 115.52005577087402, "kl_loss_2": 1261.9588439941406, "kl_loss_3": 881.7059204101563, "kl_loss_7": 222.79493560791016, "learning_rate": 0.0008518664814351503, "loss": 624.9721, "step": 2590 }, { "ce_loss_10": 3.5559722065925596, "ce_loss_13": 3.4851076006889343, "ce_loss_2": 4.12727187871933, "ce_loss_3": 3.9312629222869875, "ce_loss_7": 3.6141554713249207, "epoch": 0.26, "grad_norm": 468.0, "kl_loss_10": 118.2235237121582, "kl_loss_2": 1329.9431396484374, "kl_loss_3": 938.9490844726563, "kl_loss_7": 229.97386627197267, "learning_rate": 0.0008507374438531607, "loss": 664.5543, "step": 2600 }, { "ce_loss_10": 3.52994726896286, "ce_loss_13": 3.460645878314972, "ce_loss_2": 4.086832702159882, "ce_loss_3": 3.8989439606666565, "ce_loss_7": 3.5843619465827943, "epoch": 0.261, "grad_norm": 454.0, "kl_loss_10": 111.8594409942627, "kl_loss_2": 1283.5894409179687, "kl_loss_3": 906.6501983642578, "kl_loss_7": 225.07547607421876, "learning_rate": 0.0008496048743490053, "loss": 631.2727, "step": 2610 }, { "ce_loss_10": 3.688689887523651, "ce_loss_13": 3.6171088218688965, "ce_loss_2": 4.232082653045654, "ce_loss_3": 4.045144772529602, "ce_loss_7": 3.7411328554153442, "epoch": 0.262, "grad_norm": 498.0, "kl_loss_10": 112.36735153198242, "kl_loss_2": 1262.6953674316405, "kl_loss_3": 890.0498321533203, "kl_loss_7": 224.3122886657715, "learning_rate": 0.0008484687843276469, "loss": 626.552, "step": 2620 }, { "ce_loss_10": 3.6142364263534548, "ce_loss_13": 3.54564208984375, "ce_loss_2": 4.1653601884841915, "ce_loss_3": 3.979761373996735, "ce_loss_7": 3.670048642158508, "epoch": 0.263, "grad_norm": 604.0, "kl_loss_10": 113.79613609313965, "kl_loss_2": 1291.2943481445313, "kl_loss_3": 909.2907562255859, "kl_loss_7": 229.63263626098632, "learning_rate": 0.0008473291852294987, "loss": 643.7754, "step": 2630 }, { "ce_loss_10": 3.624956822395325, "ce_loss_13": 3.5516101837158205, "ce_loss_2": 4.185651910305023, "ce_loss_3": 3.9955446600914, "ce_loss_7": 3.682393753528595, "epoch": 0.264, "grad_norm": 560.0, "kl_loss_10": 118.29873504638672, "kl_loss_2": 1318.540350341797, "kl_loss_3": 922.2017700195313, "kl_loss_7": 233.18995971679686, "learning_rate": 0.0008461860885303114, "loss": 639.2153, "step": 2640 }, { "ce_loss_10": 3.651511311531067, "ce_loss_13": 3.5835230231285093, "ce_loss_2": 4.192479598522186, "ce_loss_3": 4.010076713562012, "ce_loss_7": 3.705660367012024, "epoch": 0.265, "grad_norm": 532.0, "kl_loss_10": 120.45394592285156, "kl_loss_2": 1256.7761352539062, "kl_loss_3": 889.5764984130859, "kl_loss_7": 224.92701721191406, "learning_rate": 0.000845039505741056, "loss": 630.0308, "step": 2650 }, { "ce_loss_10": 3.6381911635398865, "ce_loss_13": 3.567105031013489, "ce_loss_2": 4.197239780426026, "ce_loss_3": 4.007714176177979, "ce_loss_7": 3.694069528579712, "epoch": 0.266, "grad_norm": 476.0, "kl_loss_10": 120.83754425048828, "kl_loss_2": 1304.788995361328, "kl_loss_3": 924.8183044433594, "kl_loss_7": 230.70355300903321, "learning_rate": 0.0008438894484078086, "loss": 659.7302, "step": 2660 }, { "ce_loss_10": 3.6475989818573, "ce_loss_13": 3.575591731071472, "ce_loss_2": 4.190777051448822, "ce_loss_3": 4.004483807086944, "ce_loss_7": 3.6995357990264894, "epoch": 0.267, "grad_norm": 486.0, "kl_loss_10": 115.3904426574707, "kl_loss_2": 1277.0895690917969, "kl_loss_3": 898.4886535644531, "kl_loss_7": 228.7105613708496, "learning_rate": 0.0008427359281116334, "loss": 634.3606, "step": 2670 }, { "ce_loss_10": 3.547777831554413, "ce_loss_13": 3.4795953035354614, "ce_loss_2": 4.113273656368255, "ce_loss_3": 3.9340083956718446, "ce_loss_7": 3.6087413907051085, "epoch": 0.268, "grad_norm": 572.0, "kl_loss_10": 111.49882125854492, "kl_loss_2": 1312.4553100585938, "kl_loss_3": 937.0226959228515, "kl_loss_7": 228.7356918334961, "learning_rate": 0.0008415789564684673, "loss": 643.8098, "step": 2680 }, { "ce_loss_10": 3.7991090655326842, "ce_loss_13": 3.7250254154205322, "ce_loss_2": 4.329492318630218, "ce_loss_3": 4.153719592094421, "ce_loss_7": 3.8572392106056212, "epoch": 0.269, "grad_norm": 536.0, "kl_loss_10": 119.75568199157715, "kl_loss_2": 1240.7658264160157, "kl_loss_3": 900.5773712158203, "kl_loss_7": 236.5658187866211, "learning_rate": 0.0008404185451290017, "loss": 621.7949, "step": 2690 }, { "ce_loss_10": 3.6571221590042113, "ce_loss_13": 3.5904589772224424, "ce_loss_2": 4.20020170211792, "ce_loss_3": 4.020016396045685, "ce_loss_7": 3.7206122040748597, "epoch": 0.27, "grad_norm": 612.0, "kl_loss_10": 109.39498329162598, "kl_loss_2": 1278.1862854003907, "kl_loss_3": 903.8941192626953, "kl_loss_7": 233.3107780456543, "learning_rate": 0.0008392547057785661, "loss": 629.3908, "step": 2700 }, { "ce_loss_10": 3.5812445521354674, "ce_loss_13": 3.511835253238678, "ce_loss_2": 4.144945275783539, "ce_loss_3": 3.964122700691223, "ce_loss_7": 3.65096001625061, "epoch": 0.271, "grad_norm": 536.0, "kl_loss_10": 110.46146507263184, "kl_loss_2": 1320.70322265625, "kl_loss_3": 951.1240264892579, "kl_loss_7": 251.59460906982423, "learning_rate": 0.0008380874501370098, "loss": 636.0247, "step": 2710 }, { "ce_loss_10": 3.576056122779846, "ce_loss_13": 3.5100281834602356, "ce_loss_2": 4.140222942829132, "ce_loss_3": 3.9531075954437256, "ce_loss_7": 3.6410070419311524, "epoch": 0.272, "grad_norm": 544.0, "kl_loss_10": 110.53367462158204, "kl_loss_2": 1306.970849609375, "kl_loss_3": 923.5474395751953, "kl_loss_7": 236.50457839965821, "learning_rate": 0.0008369167899585841, "loss": 640.9572, "step": 2720 }, { "ce_loss_10": 3.6997987627983093, "ce_loss_13": 3.635802137851715, "ce_loss_2": 4.225974369049072, "ce_loss_3": 4.0530330538749695, "ce_loss_7": 3.760606610774994, "epoch": 0.273, "grad_norm": 700.0, "kl_loss_10": 106.76014976501465, "kl_loss_2": 1235.3878662109375, "kl_loss_3": 873.6434448242187, "kl_loss_7": 224.53188552856446, "learning_rate": 0.0008357427370318238, "loss": 630.9094, "step": 2730 }, { "ce_loss_10": 3.6538386702537538, "ce_loss_13": 3.5854528903961183, "ce_loss_2": 4.205159163475036, "ce_loss_3": 4.013937699794769, "ce_loss_7": 3.712061953544617, "epoch": 0.274, "grad_norm": 448.0, "kl_loss_10": 110.17894134521484, "kl_loss_2": 1286.5609130859375, "kl_loss_3": 904.9044860839844, "kl_loss_7": 229.6865478515625, "learning_rate": 0.0008345653031794292, "loss": 635.8903, "step": 2740 }, { "ce_loss_10": 3.6535327553749086, "ce_loss_13": 3.5872610807418823, "ce_loss_2": 4.199507582187652, "ce_loss_3": 4.0229881525039675, "ce_loss_7": 3.712740111351013, "epoch": 0.275, "grad_norm": 494.0, "kl_loss_10": 108.84803733825683, "kl_loss_2": 1267.8911865234375, "kl_loss_3": 897.5597198486328, "kl_loss_7": 226.9661651611328, "learning_rate": 0.0008333845002581458, "loss": 628.4583, "step": 2750 }, { "ce_loss_10": 3.5756468772888184, "ce_loss_13": 3.5082659125328064, "ce_loss_2": 4.145406031608582, "ce_loss_3": 3.954765808582306, "ce_loss_7": 3.6372469902038573, "epoch": 0.276, "grad_norm": 442.0, "kl_loss_10": 107.86047019958497, "kl_loss_2": 1333.4900146484374, "kl_loss_3": 936.4077697753906, "kl_loss_7": 230.6979835510254, "learning_rate": 0.0008322003401586462, "loss": 647.3615, "step": 2760 }, { "ce_loss_10": 3.615983176231384, "ce_loss_13": 3.5494200587272644, "ce_loss_2": 4.153625726699829, "ce_loss_3": 3.9661497712135314, "ce_loss_7": 3.670648729801178, "epoch": 0.277, "grad_norm": 456.0, "kl_loss_10": 107.68133277893067, "kl_loss_2": 1252.6388610839845, "kl_loss_3": 875.4474029541016, "kl_loss_7": 220.59310073852538, "learning_rate": 0.0008310128348054094, "loss": 608.5761, "step": 2770 }, { "ce_loss_10": 3.581556737422943, "ce_loss_13": 3.518260824680328, "ce_loss_2": 4.13277485370636, "ce_loss_3": 3.9427122831344605, "ce_loss_7": 3.6427804708480833, "epoch": 0.278, "grad_norm": 556.0, "kl_loss_10": 107.17420654296875, "kl_loss_2": 1270.6333923339844, "kl_loss_3": 894.9070373535156, "kl_loss_7": 225.04671096801758, "learning_rate": 0.0008298219961566008, "loss": 624.6308, "step": 2780 }, { "ce_loss_10": 3.5525727391242983, "ce_loss_13": 3.485328257083893, "ce_loss_2": 4.1319693446159365, "ce_loss_3": 3.9388387560844422, "ce_loss_7": 3.6104352355003355, "epoch": 0.279, "grad_norm": 394.0, "kl_loss_10": 112.5637420654297, "kl_loss_2": 1333.4249206542968, "kl_loss_3": 937.83544921875, "kl_loss_7": 227.29133377075195, "learning_rate": 0.0008286278362039527, "loss": 635.7598, "step": 2790 }, { "ce_loss_10": 3.587259495258331, "ce_loss_13": 3.5128440499305724, "ce_loss_2": 4.159168899059296, "ce_loss_3": 3.9615533709526063, "ce_loss_7": 3.6405880570411684, "epoch": 0.28, "grad_norm": 402.0, "kl_loss_10": 114.4114917755127, "kl_loss_2": 1318.4106079101562, "kl_loss_3": 924.2900451660156, "kl_loss_7": 224.0070999145508, "learning_rate": 0.0008274303669726426, "loss": 628.2539, "step": 2800 }, { "ce_loss_10": 3.479981768131256, "ce_loss_13": 3.411652183532715, "ce_loss_2": 4.056045913696289, "ce_loss_3": 3.866449761390686, "ce_loss_7": 3.5390130996704103, "epoch": 0.281, "grad_norm": 484.0, "kl_loss_10": 111.04401168823242, "kl_loss_2": 1325.4802673339843, "kl_loss_3": 933.757958984375, "kl_loss_7": 223.9423583984375, "learning_rate": 0.0008262296005211721, "loss": 628.6442, "step": 2810 }, { "ce_loss_10": 3.6082133769989015, "ce_loss_13": 3.543479096889496, "ce_loss_2": 4.177137637138367, "ce_loss_3": 3.9879695653915403, "ce_loss_7": 3.667951965332031, "epoch": 0.282, "grad_norm": 436.0, "kl_loss_10": 106.70964050292969, "kl_loss_2": 1303.7048461914062, "kl_loss_3": 916.7790283203125, "kl_loss_7": 222.79779052734375, "learning_rate": 0.0008250255489412463, "loss": 627.094, "step": 2820 }, { "ce_loss_10": 3.716309094429016, "ce_loss_13": 3.642316293716431, "ce_loss_2": 4.261255824565888, "ce_loss_3": 4.0787659049034115, "ce_loss_7": 3.7748358964920046, "epoch": 0.283, "grad_norm": 604.0, "kl_loss_10": 114.64575080871582, "kl_loss_2": 1277.7698669433594, "kl_loss_3": 901.2324371337891, "kl_loss_7": 231.05035324096679, "learning_rate": 0.0008238182243576511, "loss": 633.2869, "step": 2830 }, { "ce_loss_10": 3.682005834579468, "ce_loss_13": 3.615007734298706, "ce_loss_2": 4.202854037284851, "ce_loss_3": 4.023157751560211, "ce_loss_7": 3.736177396774292, "epoch": 0.284, "grad_norm": 548.0, "kl_loss_10": 110.75144157409667, "kl_loss_2": 1221.345361328125, "kl_loss_3": 870.2192474365235, "kl_loss_7": 222.93451232910155, "learning_rate": 0.0008226076389281315, "loss": 611.4373, "step": 2840 }, { "ce_loss_10": 3.7233519554138184, "ce_loss_13": 3.6542891025543214, "ce_loss_2": 4.248092949390411, "ce_loss_3": 4.069663691520691, "ce_loss_7": 3.775057625770569, "epoch": 0.285, "grad_norm": 704.0, "kl_loss_10": 110.19483451843261, "kl_loss_2": 1255.0005493164062, "kl_loss_3": 882.4262268066407, "kl_loss_7": 222.1134048461914, "learning_rate": 0.0008213938048432696, "loss": 610.5205, "step": 2850 }, { "ce_loss_10": 3.64341379404068, "ce_loss_13": 3.5780718684196473, "ce_loss_2": 4.180006468296051, "ce_loss_3": 3.9996572732925415, "ce_loss_7": 3.701814925670624, "epoch": 0.286, "grad_norm": 442.0, "kl_loss_10": 108.8284294128418, "kl_loss_2": 1259.7149841308594, "kl_loss_3": 887.2555450439453, "kl_loss_7": 224.16595458984375, "learning_rate": 0.0008201767343263612, "loss": 623.8719, "step": 2860 }, { "ce_loss_10": 3.580712640285492, "ce_loss_13": 3.514770495891571, "ce_loss_2": 4.152428865432739, "ce_loss_3": 3.9586745381355284, "ce_loss_7": 3.6420669674873354, "epoch": 0.287, "grad_norm": 478.0, "kl_loss_10": 104.37866973876953, "kl_loss_2": 1290.6899536132812, "kl_loss_3": 906.0174041748047, "kl_loss_7": 219.9403289794922, "learning_rate": 0.0008189564396332927, "loss": 611.9311, "step": 2870 }, { "ce_loss_10": 3.560059654712677, "ce_loss_13": 3.4959851503372192, "ce_loss_2": 4.124033749103546, "ce_loss_3": 3.9340568661689757, "ce_loss_7": 3.6201700448989866, "epoch": 0.288, "grad_norm": 480.0, "kl_loss_10": 103.69261512756347, "kl_loss_2": 1290.621844482422, "kl_loss_3": 906.7965057373046, "kl_loss_7": 223.63958206176758, "learning_rate": 0.0008177329330524181, "loss": 627.1938, "step": 2880 }, { "ce_loss_10": 3.6303093075752257, "ce_loss_13": 3.5619912266731264, "ce_loss_2": 4.173935759067535, "ce_loss_3": 3.9890891432762148, "ce_loss_7": 3.6870488286018372, "epoch": 0.289, "grad_norm": 452.0, "kl_loss_10": 105.55148658752441, "kl_loss_2": 1245.1935241699218, "kl_loss_3": 874.3131744384766, "kl_loss_7": 225.99310531616212, "learning_rate": 0.0008165062269044352, "loss": 620.2292, "step": 2890 }, { "ce_loss_10": 3.574904942512512, "ce_loss_13": 3.5098610281944276, "ce_loss_2": 4.134270429611206, "ce_loss_3": 3.941369962692261, "ce_loss_7": 3.641903018951416, "epoch": 0.29, "grad_norm": 394.0, "kl_loss_10": 109.33544578552247, "kl_loss_2": 1282.6864868164062, "kl_loss_3": 899.0664276123047, "kl_loss_7": 231.66256561279297, "learning_rate": 0.0008152763335422613, "loss": 630.6565, "step": 2900 }, { "ce_loss_10": 3.5721667885780333, "ce_loss_13": 3.503155696392059, "ce_loss_2": 4.123925364017486, "ce_loss_3": 3.935485672950745, "ce_loss_7": 3.625267505645752, "epoch": 0.291, "grad_norm": 600.0, "kl_loss_10": 111.11225318908691, "kl_loss_2": 1285.6910400390625, "kl_loss_3": 903.9730163574219, "kl_loss_7": 227.13845748901366, "learning_rate": 0.0008140432653509088, "loss": 623.4421, "step": 2910 }, { "ce_loss_10": 3.617369520664215, "ce_loss_13": 3.5511601328849793, "ce_loss_2": 4.158329248428345, "ce_loss_3": 3.9670159101486204, "ce_loss_7": 3.6755479097366335, "epoch": 0.292, "grad_norm": 424.0, "kl_loss_10": 108.97641296386719, "kl_loss_2": 1271.4477172851562, "kl_loss_3": 887.2839324951171, "kl_loss_7": 224.6483947753906, "learning_rate": 0.0008128070347473608, "loss": 614.8932, "step": 2920 }, { "ce_loss_10": 3.6203887820243836, "ce_loss_13": 3.5571650743484495, "ce_loss_2": 4.184931480884552, "ce_loss_3": 3.988680112361908, "ce_loss_7": 3.6783168077468873, "epoch": 0.293, "grad_norm": 442.0, "kl_loss_10": 106.56389541625977, "kl_loss_2": 1309.3880310058594, "kl_loss_3": 912.7983032226563, "kl_loss_7": 223.91178283691406, "learning_rate": 0.0008115676541804455, "loss": 627.653, "step": 2930 }, { "ce_loss_10": 3.631400096416473, "ce_loss_13": 3.565066874027252, "ce_loss_2": 4.173557686805725, "ce_loss_3": 3.9938668727874758, "ce_loss_7": 3.6861080646514894, "epoch": 0.294, "grad_norm": 410.0, "kl_loss_10": 107.59184074401855, "kl_loss_2": 1258.5327880859375, "kl_loss_3": 893.1478424072266, "kl_loss_7": 221.63349151611328, "learning_rate": 0.0008103251361307119, "loss": 625.068, "step": 2940 }, { "ce_loss_10": 3.6633550047874452, "ce_loss_13": 3.5975454568862917, "ce_loss_2": 4.201860392093659, "ce_loss_3": 4.0187016248703005, "ce_loss_7": 3.7217815637588503, "epoch": 0.295, "grad_norm": 484.0, "kl_loss_10": 107.80433006286621, "kl_loss_2": 1263.7409545898438, "kl_loss_3": 899.7593811035156, "kl_loss_7": 224.86621551513673, "learning_rate": 0.0008090794931103026, "loss": 620.4886, "step": 2950 }, { "ce_loss_10": 3.6508840203285216, "ce_loss_13": 3.588442325592041, "ce_loss_2": 4.192799139022827, "ce_loss_3": 4.012849128246307, "ce_loss_7": 3.706376481056213, "epoch": 0.296, "grad_norm": 560.0, "kl_loss_10": 104.73687210083008, "kl_loss_2": 1249.5858154296875, "kl_loss_3": 882.2532653808594, "kl_loss_7": 217.49150695800782, "learning_rate": 0.0008078307376628291, "loss": 618.8026, "step": 2960 }, { "ce_loss_10": 3.714610981941223, "ce_loss_13": 3.648031437397003, "ce_loss_2": 4.232832741737366, "ce_loss_3": 4.05701197385788, "ce_loss_7": 3.7682809591293336, "epoch": 0.297, "grad_norm": 438.0, "kl_loss_10": 105.18131446838379, "kl_loss_2": 1206.7602905273438, "kl_loss_3": 851.9172241210938, "kl_loss_7": 215.51920394897462, "learning_rate": 0.000806578882363245, "loss": 597.2082, "step": 2970 }, { "ce_loss_10": 3.6252587914466856, "ce_loss_13": 3.5611796617507934, "ce_loss_2": 4.163775825500489, "ce_loss_3": 3.977950668334961, "ce_loss_7": 3.6833796977996824, "epoch": 0.298, "grad_norm": 648.0, "kl_loss_10": 103.29475135803223, "kl_loss_2": 1245.597314453125, "kl_loss_3": 877.1612213134765, "kl_loss_7": 219.75524444580077, "learning_rate": 0.0008053239398177191, "loss": 627.9783, "step": 2980 }, { "ce_loss_10": 3.602860856056213, "ce_loss_13": 3.538487696647644, "ce_loss_2": 4.142560148239136, "ce_loss_3": 3.9584405183792115, "ce_loss_7": 3.659018313884735, "epoch": 0.299, "grad_norm": 502.0, "kl_loss_10": 104.49293098449706, "kl_loss_2": 1247.2865417480468, "kl_loss_3": 873.8518829345703, "kl_loss_7": 218.60237731933594, "learning_rate": 0.0008040659226635089, "loss": 629.0394, "step": 2990 }, { "ce_loss_10": 3.737885308265686, "ce_loss_13": 3.670609879493713, "ce_loss_2": 4.267246758937835, "ce_loss_3": 4.084029448032379, "ce_loss_7": 3.801585829257965, "epoch": 0.3, "grad_norm": 474.0, "kl_loss_10": 109.13075065612793, "kl_loss_2": 1251.634942626953, "kl_loss_3": 874.3355682373046, "kl_loss_7": 234.4466766357422, "learning_rate": 0.0008028048435688333, "loss": 617.8753, "step": 3000 }, { "ce_loss_10": 3.608117866516113, "ce_loss_13": 3.5417439699172975, "ce_loss_2": 4.164859163761139, "ce_loss_3": 3.9728567838668822, "ce_loss_7": 3.666444170475006, "epoch": 0.301, "grad_norm": 458.0, "kl_loss_10": 104.67718696594238, "kl_loss_2": 1290.2385009765626, "kl_loss_3": 895.6051879882813, "kl_loss_7": 230.77984161376952, "learning_rate": 0.0008015407152327448, "loss": 624.6472, "step": 3010 }, { "ce_loss_10": 3.655743646621704, "ce_loss_13": 3.589059603214264, "ce_loss_2": 4.197956717014312, "ce_loss_3": 4.010993158817291, "ce_loss_7": 3.715561032295227, "epoch": 0.302, "grad_norm": 490.0, "kl_loss_10": 108.86725807189941, "kl_loss_2": 1260.063540649414, "kl_loss_3": 888.4078857421875, "kl_loss_7": 225.77315521240234, "learning_rate": 0.0008002735503850016, "loss": 621.0348, "step": 3020 }, { "ce_loss_10": 3.5483126521110533, "ce_loss_13": 3.477071487903595, "ce_loss_2": 4.1067805051803585, "ce_loss_3": 3.9177415490150453, "ce_loss_7": 3.613772678375244, "epoch": 0.303, "grad_norm": 442.0, "kl_loss_10": 113.85512008666993, "kl_loss_2": 1301.403955078125, "kl_loss_3": 923.0243713378907, "kl_loss_7": 243.21601486206055, "learning_rate": 0.0007990033617859396, "loss": 643.6124, "step": 3030 }, { "ce_loss_10": 3.596233379840851, "ce_loss_13": 3.527192997932434, "ce_loss_2": 4.134040641784668, "ce_loss_3": 3.9505585551261904, "ce_loss_7": 3.6559112668037415, "epoch": 0.304, "grad_norm": 576.0, "kl_loss_10": 111.53803482055665, "kl_loss_2": 1246.1058349609375, "kl_loss_3": 878.1594299316406, "kl_loss_7": 229.73634643554686, "learning_rate": 0.000797730162226344, "loss": 607.6155, "step": 3040 }, { "ce_loss_10": 3.6262240767478944, "ce_loss_13": 3.55741890668869, "ce_loss_2": 4.167547011375428, "ce_loss_3": 3.981596386432648, "ce_loss_7": 3.686588776111603, "epoch": 0.305, "grad_norm": 430.0, "kl_loss_10": 113.34700317382813, "kl_loss_2": 1258.705908203125, "kl_loss_3": 888.1617828369141, "kl_loss_7": 230.86616134643555, "learning_rate": 0.0007964539645273203, "loss": 613.4882, "step": 3050 }, { "ce_loss_10": 3.6409424901008607, "ce_loss_13": 3.574270474910736, "ce_loss_2": 4.167909657955169, "ce_loss_3": 3.9862082481384276, "ce_loss_7": 3.6940474629402162, "epoch": 0.306, "grad_norm": 486.0, "kl_loss_10": 106.54784317016602, "kl_loss_2": 1238.411444091797, "kl_loss_3": 866.3065307617187, "kl_loss_7": 220.125154876709, "learning_rate": 0.000795174781540165, "loss": 615.3713, "step": 3060 }, { "ce_loss_10": 3.721142077445984, "ce_loss_13": 3.643355393409729, "ce_loss_2": 4.226944315433502, "ce_loss_3": 4.049481880664826, "ce_loss_7": 3.771383452415466, "epoch": 0.307, "grad_norm": 418.0, "kl_loss_10": 122.25658149719239, "kl_loss_2": 1203.3186645507812, "kl_loss_3": 851.1368927001953, "kl_loss_7": 225.69219970703125, "learning_rate": 0.0007938926261462366, "loss": 615.3413, "step": 3070 }, { "ce_loss_10": 3.6604058384895324, "ce_loss_13": 3.5915605425834656, "ce_loss_2": 4.175356435775757, "ce_loss_3": 3.9981507778167726, "ce_loss_7": 3.7177716493606567, "epoch": 0.308, "grad_norm": 528.0, "kl_loss_10": 111.06630897521973, "kl_loss_2": 1238.821875, "kl_loss_3": 876.2146820068359, "kl_loss_7": 223.17951583862305, "learning_rate": 0.0007926075112568258, "loss": 625.9794, "step": 3080 }, { "ce_loss_10": 3.652525985240936, "ce_loss_13": 3.586830127239227, "ce_loss_2": 4.18239061832428, "ce_loss_3": 4.001185369491577, "ce_loss_7": 3.7117084741592405, "epoch": 0.309, "grad_norm": 408.0, "kl_loss_10": 105.23004531860352, "kl_loss_2": 1238.4955749511719, "kl_loss_3": 879.8196685791015, "kl_loss_7": 219.05570373535156, "learning_rate": 0.0007913194498130252, "loss": 606.0291, "step": 3090 }, { "ce_loss_10": 3.576726019382477, "ce_loss_13": 3.5116322517395018, "ce_loss_2": 4.1267429232597355, "ce_loss_3": 3.951638638973236, "ce_loss_7": 3.633882737159729, "epoch": 0.31, "grad_norm": 596.0, "kl_loss_10": 104.83585128784179, "kl_loss_2": 1271.4552978515626, "kl_loss_3": 899.7277404785157, "kl_loss_7": 221.13562393188477, "learning_rate": 0.0007900284547855992, "loss": 625.4858, "step": 3100 }, { "ce_loss_10": 3.585216796398163, "ce_loss_13": 3.5201086163520814, "ce_loss_2": 4.1151411652565, "ce_loss_3": 3.9415447235107424, "ce_loss_7": 3.6418359875679016, "epoch": 0.311, "grad_norm": 460.0, "kl_loss_10": 104.5475685119629, "kl_loss_2": 1231.1286071777345, "kl_loss_3": 880.9251007080078, "kl_loss_7": 215.18857421875, "learning_rate": 0.0007887345391748532, "loss": 620.3755, "step": 3110 }, { "ce_loss_10": 3.7296380400657654, "ce_loss_13": 3.6599961280822755, "ce_loss_2": 4.223298215866089, "ce_loss_3": 4.0600717782974245, "ce_loss_7": 3.7785526752471923, "epoch": 0.312, "grad_norm": 434.0, "kl_loss_10": 110.47460975646973, "kl_loss_2": 1200.8911010742188, "kl_loss_3": 857.2419494628906, "kl_loss_7": 215.84228134155273, "learning_rate": 0.0007874377160105036, "loss": 594.074, "step": 3120 }, { "ce_loss_10": 3.647064197063446, "ce_loss_13": 3.5629413604736326, "ce_loss_2": 4.20149587392807, "ce_loss_3": 4.026539087295532, "ce_loss_7": 3.7087369561195374, "epoch": 0.313, "grad_norm": 504.0, "kl_loss_10": 117.26640319824219, "kl_loss_2": 1253.9545349121095, "kl_loss_3": 905.2646728515625, "kl_loss_7": 235.96448516845703, "learning_rate": 0.0007861379983515449, "loss": 636.8147, "step": 3130 }, { "ce_loss_10": 3.7007260084152223, "ce_loss_13": 3.631552994251251, "ce_loss_2": 4.21818333864212, "ce_loss_3": 4.040616655349732, "ce_loss_7": 3.757350814342499, "epoch": 0.314, "grad_norm": 466.0, "kl_loss_10": 112.17713203430176, "kl_loss_2": 1239.388739013672, "kl_loss_3": 879.6389221191406, "kl_loss_7": 227.5748275756836, "learning_rate": 0.0007848353992861195, "loss": 608.3133, "step": 3140 }, { "ce_loss_10": 3.786464810371399, "ce_loss_13": 3.710281562805176, "ce_loss_2": 4.314648783206939, "ce_loss_3": 4.134762763977051, "ce_loss_7": 3.843652272224426, "epoch": 0.315, "grad_norm": 458.0, "kl_loss_10": 124.92040100097657, "kl_loss_2": 1240.0601806640625, "kl_loss_3": 878.5192687988281, "kl_loss_7": 241.41039581298827, "learning_rate": 0.0007835299319313853, "loss": 620.9426, "step": 3150 }, { "ce_loss_10": 3.663742733001709, "ce_loss_13": 3.5886364459991453, "ce_loss_2": 4.173808574676514, "ce_loss_3": 3.994606840610504, "ce_loss_7": 3.7157713413238525, "epoch": 0.316, "grad_norm": 478.0, "kl_loss_10": 119.08418045043945, "kl_loss_2": 1222.8418212890624, "kl_loss_3": 864.8908721923829, "kl_loss_7": 229.07857666015624, "learning_rate": 0.0007822216094333848, "loss": 627.9899, "step": 3160 }, { "ce_loss_10": 3.660254752635956, "ce_loss_13": 3.592539119720459, "ce_loss_2": 4.196765351295471, "ce_loss_3": 4.01435557603836, "ce_loss_7": 3.7205393433570864, "epoch": 0.317, "grad_norm": 402.0, "kl_loss_10": 115.94363555908203, "kl_loss_2": 1238.9798767089844, "kl_loss_3": 878.1770599365234, "kl_loss_7": 235.79936828613282, "learning_rate": 0.0007809104449669101, "loss": 611.1889, "step": 3170 }, { "ce_loss_10": 3.625234532356262, "ce_loss_13": 3.5487714052200316, "ce_loss_2": 4.128973770141601, "ce_loss_3": 3.9529131054878235, "ce_loss_7": 3.6731096148490905, "epoch": 0.318, "grad_norm": 524.0, "kl_loss_10": 118.72456016540528, "kl_loss_2": 1219.5467956542968, "kl_loss_3": 854.3440185546875, "kl_loss_7": 228.71927642822266, "learning_rate": 0.0007795964517353734, "loss": 608.5358, "step": 3180 }, { "ce_loss_10": 3.623099219799042, "ce_loss_13": 3.54120157957077, "ce_loss_2": 4.132599997520447, "ce_loss_3": 3.9547854542732237, "ce_loss_7": 3.670779359340668, "epoch": 0.319, "grad_norm": 438.0, "kl_loss_10": 145.3066722869873, "kl_loss_2": 1253.242724609375, "kl_loss_3": 888.5025939941406, "kl_loss_7": 249.7946647644043, "learning_rate": 0.000778279642970672, "loss": 614.9188, "step": 3190 }, { "ce_loss_10": 3.61579008102417, "ce_loss_13": 3.5464967608451845, "ce_loss_2": 4.135542809963226, "ce_loss_3": 3.949288582801819, "ce_loss_7": 3.6743045926094053, "epoch": 0.32, "grad_norm": 580.0, "kl_loss_10": 120.59939308166504, "kl_loss_2": 1232.2691650390625, "kl_loss_3": 859.8254913330078, "kl_loss_7": 236.18389816284179, "learning_rate": 0.0007769600319330552, "loss": 603.041, "step": 3200 }, { "ce_loss_10": 3.6462074518203735, "ce_loss_13": 3.5768035650253296, "ce_loss_2": 4.193181753158569, "ce_loss_3": 4.002738869190216, "ce_loss_7": 3.7033765077590943, "epoch": 0.321, "grad_norm": 536.0, "kl_loss_10": 113.30461883544922, "kl_loss_2": 1261.3274169921874, "kl_loss_3": 880.6385803222656, "kl_loss_7": 233.63602294921876, "learning_rate": 0.0007756376319109917, "loss": 615.0137, "step": 3210 }, { "ce_loss_10": 3.6983227729797363, "ce_loss_13": 3.628855359554291, "ce_loss_2": 4.215565764904023, "ce_loss_3": 4.036277508735656, "ce_loss_7": 3.7591845273971556, "epoch": 0.322, "grad_norm": 414.0, "kl_loss_10": 113.85302391052247, "kl_loss_2": 1215.7693420410155, "kl_loss_3": 852.6664520263672, "kl_loss_7": 233.59415435791016, "learning_rate": 0.0007743124562210351, "loss": 595.5453, "step": 3220 }, { "ce_loss_10": 3.7038231015205385, "ce_loss_13": 3.636870324611664, "ce_loss_2": 4.220840120315552, "ce_loss_3": 4.040867578983307, "ce_loss_7": 3.759516727924347, "epoch": 0.323, "grad_norm": 500.0, "kl_loss_10": 116.7942398071289, "kl_loss_2": 1231.380029296875, "kl_loss_3": 860.8811431884766, "kl_loss_7": 226.99792938232423, "learning_rate": 0.0007729845182076895, "loss": 609.7637, "step": 3230 }, { "ce_loss_10": 3.635891842842102, "ce_loss_13": 3.570459449291229, "ce_loss_2": 4.146735298633575, "ce_loss_3": 3.971414268016815, "ce_loss_7": 3.6916919469833376, "epoch": 0.324, "grad_norm": 544.0, "kl_loss_10": 107.84456443786621, "kl_loss_2": 1210.0861877441407, "kl_loss_3": 854.6661926269531, "kl_loss_7": 223.18558731079102, "learning_rate": 0.0007716538312432765, "loss": 613.749, "step": 3240 }, { "ce_loss_10": 3.5933796405792235, "ce_loss_13": 3.5238136887550353, "ce_loss_2": 4.137728452682495, "ce_loss_3": 3.9503737330436706, "ce_loss_7": 3.6502291560173035, "epoch": 0.325, "grad_norm": 532.0, "kl_loss_10": 110.89730453491211, "kl_loss_2": 1272.4953063964845, "kl_loss_3": 899.5693664550781, "kl_loss_7": 234.18169174194335, "learning_rate": 0.0007703204087277988, "loss": 621.0721, "step": 3250 }, { "ce_loss_10": 3.691065728664398, "ce_loss_13": 3.625254142284393, "ce_loss_2": 4.195106828212738, "ce_loss_3": 4.023638522624969, "ce_loss_7": 3.744424653053284, "epoch": 0.326, "grad_norm": 480.0, "kl_loss_10": 108.84702529907227, "kl_loss_2": 1187.3806762695312, "kl_loss_3": 834.2469482421875, "kl_loss_7": 219.25809020996093, "learning_rate": 0.0007689842640888063, "loss": 594.9809, "step": 3260 }, { "ce_loss_10": 3.6937523603439333, "ce_loss_13": 3.6257047772407534, "ce_loss_2": 4.207961022853851, "ce_loss_3": 4.029592931270599, "ce_loss_7": 3.7506144404411317, "epoch": 0.327, "grad_norm": 432.0, "kl_loss_10": 109.73489418029786, "kl_loss_2": 1197.2553649902343, "kl_loss_3": 845.9240936279297, "kl_loss_7": 224.3518325805664, "learning_rate": 0.0007676454107812607, "loss": 600.9104, "step": 3270 }, { "ce_loss_10": 3.6202093243598936, "ce_loss_13": 3.556860589981079, "ce_loss_2": 4.152219152450561, "ce_loss_3": 3.972454571723938, "ce_loss_7": 3.6772433161735534, "epoch": 0.328, "grad_norm": 552.0, "kl_loss_10": 107.7342628479004, "kl_loss_2": 1234.4693603515625, "kl_loss_3": 866.5982177734375, "kl_loss_7": 224.09054641723634, "learning_rate": 0.0007663038622873999, "loss": 600.4109, "step": 3280 }, { "ce_loss_10": 3.6624753713607787, "ce_loss_13": 3.5959082007408143, "ce_loss_2": 4.186812722682953, "ce_loss_3": 4.007313239574432, "ce_loss_7": 3.7183284163475037, "epoch": 0.329, "grad_norm": 416.0, "kl_loss_10": 107.99775848388671, "kl_loss_2": 1235.7617919921875, "kl_loss_3": 865.4350341796875, "kl_loss_7": 219.93520736694336, "learning_rate": 0.0007649596321166025, "loss": 596.3813, "step": 3290 }, { "ce_loss_10": 3.5629011154174806, "ce_loss_13": 3.500445473194122, "ce_loss_2": 4.090505909919739, "ce_loss_3": 3.9089764833450316, "ce_loss_7": 3.619310712814331, "epoch": 0.33, "grad_norm": 448.0, "kl_loss_10": 101.5875473022461, "kl_loss_2": 1220.246160888672, "kl_loss_3": 856.5614715576172, "kl_loss_7": 215.10712509155275, "learning_rate": 0.0007636127338052513, "loss": 603.8148, "step": 3300 }, { "ce_loss_10": 3.670552396774292, "ce_loss_13": 3.6016101121902464, "ce_loss_2": 4.213171231746673, "ce_loss_3": 4.018688130378723, "ce_loss_7": 3.727590525150299, "epoch": 0.331, "grad_norm": 374.0, "kl_loss_10": 108.33710594177246, "kl_loss_2": 1257.856024169922, "kl_loss_3": 874.112905883789, "kl_loss_7": 224.637939453125, "learning_rate": 0.0007622631809165971, "loss": 604.7203, "step": 3310 }, { "ce_loss_10": 3.671126115322113, "ce_loss_13": 3.6092859148979186, "ce_loss_2": 4.177222061157226, "ce_loss_3": 3.9993849992752075, "ce_loss_7": 3.722568082809448, "epoch": 0.332, "grad_norm": 414.0, "kl_loss_10": 101.74094352722167, "kl_loss_2": 1180.6327026367187, "kl_loss_3": 821.7367553710938, "kl_loss_7": 208.3566993713379, "learning_rate": 0.000760910987040623, "loss": 588.4586, "step": 3320 }, { "ce_loss_10": 3.64985990524292, "ce_loss_13": 3.585498571395874, "ce_loss_2": 4.191103303432465, "ce_loss_3": 4.004770576953888, "ce_loss_7": 3.7059614300727843, "epoch": 0.333, "grad_norm": 346.0, "kl_loss_10": 102.83302307128906, "kl_loss_2": 1259.7546875, "kl_loss_3": 881.3513031005859, "kl_loss_7": 217.63404388427733, "learning_rate": 0.000759556165793906, "loss": 599.8207, "step": 3330 }, { "ce_loss_10": 3.676869213581085, "ce_loss_13": 3.610471022129059, "ce_loss_2": 4.2084539294242855, "ce_loss_3": 4.019213974475861, "ce_loss_7": 3.7275768160820006, "epoch": 0.334, "grad_norm": 502.0, "kl_loss_10": 104.88800392150878, "kl_loss_2": 1223.232958984375, "kl_loss_3": 852.1926971435547, "kl_loss_7": 215.24551544189453, "learning_rate": 0.000758198730819481, "loss": 604.6092, "step": 3340 }, { "ce_loss_10": 3.616540086269379, "ce_loss_13": 3.553786301612854, "ce_loss_2": 4.152189195156097, "ce_loss_3": 3.9668321132659914, "ce_loss_7": 3.6709399580955506, "epoch": 0.335, "grad_norm": 488.0, "kl_loss_10": 102.31456336975097, "kl_loss_2": 1251.2591918945313, "kl_loss_3": 875.474462890625, "kl_loss_7": 214.77994079589843, "learning_rate": 0.0007568386957867032, "loss": 608.125, "step": 3350 }, { "ce_loss_10": 3.695429575443268, "ce_loss_13": 3.6276296377182007, "ce_loss_2": 4.209121763706207, "ce_loss_3": 4.032123720645904, "ce_loss_7": 3.749704658985138, "epoch": 0.336, "grad_norm": 664.0, "kl_loss_10": 107.0846736907959, "kl_loss_2": 1209.7884765625, "kl_loss_3": 853.7374877929688, "kl_loss_7": 220.54676055908203, "learning_rate": 0.0007554760743911103, "loss": 605.0996, "step": 3360 }, { "ce_loss_10": 3.5890319466590883, "ce_loss_13": 3.5283274173736574, "ce_loss_2": 4.114323127269745, "ce_loss_3": 3.932225775718689, "ce_loss_7": 3.644662916660309, "epoch": 0.337, "grad_norm": 398.0, "kl_loss_10": 101.10566368103028, "kl_loss_2": 1236.1671508789063, "kl_loss_3": 865.7673828125, "kl_loss_7": 212.85166015625, "learning_rate": 0.0007541108803542846, "loss": 613.867, "step": 3370 }, { "ce_loss_10": 3.6427289605140687, "ce_loss_13": 3.576077425479889, "ce_loss_2": 4.166507577896118, "ce_loss_3": 3.9814778923988343, "ce_loss_7": 3.6960788011550902, "epoch": 0.338, "grad_norm": 420.0, "kl_loss_10": 106.68134155273438, "kl_loss_2": 1229.0040222167968, "kl_loss_3": 856.9913909912109, "kl_loss_7": 213.85500411987306, "learning_rate": 0.0007527431274237149, "loss": 624.6923, "step": 3380 }, { "ce_loss_10": 3.611558997631073, "ce_loss_13": 3.549490749835968, "ce_loss_2": 4.114035534858703, "ce_loss_3": 3.942946660518646, "ce_loss_7": 3.662776732444763, "epoch": 0.339, "grad_norm": 406.0, "kl_loss_10": 102.27137718200683, "kl_loss_2": 1206.6684020996095, "kl_loss_3": 846.7297576904297, "kl_loss_7": 210.38721313476563, "learning_rate": 0.0007513728293726579, "loss": 594.8909, "step": 3390 }, { "ce_loss_10": 3.737028419971466, "ce_loss_13": 3.669820773601532, "ce_loss_2": 4.24596471786499, "ce_loss_3": 4.065989923477173, "ce_loss_7": 3.7901018500328063, "epoch": 0.34, "grad_norm": 456.0, "kl_loss_10": 106.7515941619873, "kl_loss_2": 1213.6457214355469, "kl_loss_3": 848.0824188232422, "kl_loss_7": 217.41063537597657, "learning_rate": 0.00075, "loss": 593.8513, "step": 3400 }, { "ce_loss_10": 3.719330894947052, "ce_loss_13": 3.6538206934928894, "ce_loss_2": 4.25202556848526, "ce_loss_3": 4.069514441490173, "ce_loss_7": 3.7754390835762024, "epoch": 0.341, "grad_norm": 442.0, "kl_loss_10": 105.26911506652831, "kl_loss_2": 1229.2578063964843, "kl_loss_3": 857.8241027832031, "kl_loss_7": 215.74853057861327, "learning_rate": 0.0007486246531301177, "loss": 595.3941, "step": 3410 }, { "ce_loss_10": 3.5200854897499085, "ce_loss_13": 3.457200789451599, "ce_loss_2": 4.057665538787842, "ce_loss_3": 3.8753583312034605, "ce_loss_7": 3.575985038280487, "epoch": 0.342, "grad_norm": 388.0, "kl_loss_10": 101.49059600830078, "kl_loss_2": 1229.5487548828125, "kl_loss_3": 867.5537567138672, "kl_loss_7": 212.1739074707031, "learning_rate": 0.0007472468026127384, "loss": 593.475, "step": 3420 }, { "ce_loss_10": 3.6591346502304076, "ce_loss_13": 3.5927812099456786, "ce_loss_2": 4.209147357940674, "ce_loss_3": 4.019513976573944, "ce_loss_7": 3.7172008395195006, "epoch": 0.343, "grad_norm": 442.0, "kl_loss_10": 106.34202499389649, "kl_loss_2": 1270.0667724609375, "kl_loss_3": 890.6144561767578, "kl_loss_7": 221.5020393371582, "learning_rate": 0.000745866462322802, "loss": 614.0497, "step": 3430 }, { "ce_loss_10": 3.647415816783905, "ce_loss_13": 3.5850081205368043, "ce_loss_2": 4.1631152629852295, "ce_loss_3": 3.980070149898529, "ce_loss_7": 3.7022210240364073, "epoch": 0.344, "grad_norm": 428.0, "kl_loss_10": 103.86195526123046, "kl_loss_2": 1198.3542846679688, "kl_loss_3": 835.6711212158203, "kl_loss_7": 208.45360870361327, "learning_rate": 0.0007444836461603195, "loss": 592.3941, "step": 3440 }, { "ce_loss_10": 3.7135616302490235, "ce_loss_13": 3.6434731125831603, "ce_loss_2": 4.233828973770142, "ce_loss_3": 4.05616340637207, "ce_loss_7": 3.762986993789673, "epoch": 0.345, "grad_norm": 548.0, "kl_loss_10": 110.37765045166016, "kl_loss_2": 1249.6877746582031, "kl_loss_3": 880.3564361572265, "kl_loss_7": 216.23881912231445, "learning_rate": 0.0007430983680502344, "loss": 610.9966, "step": 3450 }, { "ce_loss_10": 3.5541942715644836, "ce_loss_13": 3.4891390204429626, "ce_loss_2": 4.090934145450592, "ce_loss_3": 3.908629584312439, "ce_loss_7": 3.606754219532013, "epoch": 0.346, "grad_norm": 432.0, "kl_loss_10": 110.62757797241211, "kl_loss_2": 1245.3806091308593, "kl_loss_3": 869.5422088623047, "kl_loss_7": 211.6188102722168, "learning_rate": 0.0007417106419422819, "loss": 606.0509, "step": 3460 }, { "ce_loss_10": 3.6656701445579527, "ce_loss_13": 3.596804141998291, "ce_loss_2": 4.186310410499573, "ce_loss_3": 4.003709590435028, "ce_loss_7": 3.716957890987396, "epoch": 0.347, "grad_norm": 432.0, "kl_loss_10": 110.30144805908203, "kl_loss_2": 1208.0226745605469, "kl_loss_3": 843.9369232177735, "kl_loss_7": 210.9572967529297, "learning_rate": 0.0007403204818108486, "loss": 597.1857, "step": 3470 }, { "ce_loss_10": 3.6337965607643126, "ce_loss_13": 3.5606253027915953, "ce_loss_2": 4.153940236568451, "ce_loss_3": 3.970261514186859, "ce_loss_7": 3.680176484584808, "epoch": 0.348, "grad_norm": 380.0, "kl_loss_10": 122.88734741210938, "kl_loss_2": 1235.673895263672, "kl_loss_3": 863.5119903564453, "kl_loss_7": 214.55614318847657, "learning_rate": 0.0007389279016548316, "loss": 589.7067, "step": 3480 }, { "ce_loss_10": 3.6385215759277343, "ce_loss_13": 3.5720754146575926, "ce_loss_2": 4.187943410873413, "ce_loss_3": 3.9984039187431337, "ce_loss_7": 3.692442834377289, "epoch": 0.349, "grad_norm": 540.0, "kl_loss_10": 110.95368614196778, "kl_loss_2": 1266.4402160644531, "kl_loss_3": 881.5294525146485, "kl_loss_7": 217.94278945922852, "learning_rate": 0.0007375329154974975, "loss": 613.9418, "step": 3490 }, { "ce_loss_10": 3.5970895290374756, "ce_loss_13": 3.5335337281227113, "ce_loss_2": 4.117660129070282, "ce_loss_3": 3.938844621181488, "ce_loss_7": 3.6496007084846496, "epoch": 0.35, "grad_norm": 364.0, "kl_loss_10": 106.09449501037598, "kl_loss_2": 1217.6699768066405, "kl_loss_3": 855.84267578125, "kl_loss_7": 211.2824508666992, "learning_rate": 0.0007361355373863414, "loss": 604.2842, "step": 3500 }, { "ce_loss_10": 3.6508504867553713, "ce_loss_13": 3.5859110236167906, "ce_loss_2": 4.1644844770431515, "ce_loss_3": 3.989104926586151, "ce_loss_7": 3.7059740304946898, "epoch": 0.351, "grad_norm": 420.0, "kl_loss_10": 105.65600318908692, "kl_loss_2": 1192.6789306640626, "kl_loss_3": 837.2236511230469, "kl_loss_7": 210.62101364135742, "learning_rate": 0.0007347357813929454, "loss": 605.2478, "step": 3510 }, { "ce_loss_10": 3.5983325362205507, "ce_loss_13": 3.5318838000297545, "ce_loss_2": 4.108148908615112, "ce_loss_3": 3.935304307937622, "ce_loss_7": 3.6479654192924498, "epoch": 0.352, "grad_norm": 500.0, "kl_loss_10": 106.45629920959473, "kl_loss_2": 1190.6948181152343, "kl_loss_3": 837.8225341796875, "kl_loss_7": 210.1330581665039, "learning_rate": 0.0007333336616128369, "loss": 599.2653, "step": 3520 }, { "ce_loss_10": 3.570793068408966, "ce_loss_13": 3.507152056694031, "ce_loss_2": 4.106606543064117, "ce_loss_3": 3.9213356494903566, "ce_loss_7": 3.624741232395172, "epoch": 0.353, "grad_norm": 468.0, "kl_loss_10": 102.9274845123291, "kl_loss_2": 1231.522442626953, "kl_loss_3": 864.720751953125, "kl_loss_7": 214.17628860473633, "learning_rate": 0.0007319291921653463, "loss": 605.1452, "step": 3530 }, { "ce_loss_10": 3.6573350191116334, "ce_loss_13": 3.591005003452301, "ce_loss_2": 4.190282225608826, "ce_loss_3": 4.010705304145813, "ce_loss_7": 3.713829779624939, "epoch": 0.354, "grad_norm": 480.0, "kl_loss_10": 105.38732643127442, "kl_loss_2": 1246.1359802246093, "kl_loss_3": 875.5277282714844, "kl_loss_7": 217.63313064575195, "learning_rate": 0.0007305223871934656, "loss": 597.4614, "step": 3540 }, { "ce_loss_10": 3.6225136160850524, "ce_loss_13": 3.556077516078949, "ce_loss_2": 4.138617634773254, "ce_loss_3": 3.9633963227272035, "ce_loss_7": 3.678558957576752, "epoch": 0.355, "grad_norm": 502.0, "kl_loss_10": 104.04609298706055, "kl_loss_2": 1205.1107055664063, "kl_loss_3": 845.5688415527344, "kl_loss_7": 210.7905143737793, "learning_rate": 0.0007291132608637052, "loss": 595.3202, "step": 3550 }, { "ce_loss_10": 3.585705029964447, "ce_loss_13": 3.52364000082016, "ce_loss_2": 4.140194058418274, "ce_loss_3": 3.939319980144501, "ce_loss_7": 3.637845540046692, "epoch": 0.356, "grad_norm": 612.0, "kl_loss_10": 100.68717575073242, "kl_loss_2": 1272.5315246582031, "kl_loss_3": 866.628369140625, "kl_loss_7": 206.60951766967773, "learning_rate": 0.0007277018273659516, "loss": 612.2947, "step": 3560 }, { "ce_loss_10": 3.708829402923584, "ce_loss_13": 3.6439966320991517, "ce_loss_2": 4.2357800006866455, "ce_loss_3": 4.058422148227692, "ce_loss_7": 3.7655990600585936, "epoch": 0.357, "grad_norm": 400.0, "kl_loss_10": 105.25033149719238, "kl_loss_2": 1234.6828186035157, "kl_loss_3": 864.7261169433593, "kl_loss_7": 215.20211639404297, "learning_rate": 0.0007262881009133242, "loss": 605.0631, "step": 3570 }, { "ce_loss_10": 3.6265846729278564, "ce_loss_13": 3.5641749501228333, "ce_loss_2": 4.144611585140228, "ce_loss_3": 3.9691020011901856, "ce_loss_7": 3.6797274351119995, "epoch": 0.358, "grad_norm": 422.0, "kl_loss_10": 101.45686912536621, "kl_loss_2": 1216.0844970703124, "kl_loss_3": 849.7874267578125, "kl_loss_7": 208.09806137084962, "learning_rate": 0.0007248720957420329, "loss": 589.5256, "step": 3580 }, { "ce_loss_10": 3.6416075587272645, "ce_loss_13": 3.5768683552742004, "ce_loss_2": 4.156981098651886, "ce_loss_3": 3.9762784600257874, "ce_loss_7": 3.690297317504883, "epoch": 0.359, "grad_norm": 374.0, "kl_loss_10": 104.18233222961426, "kl_loss_2": 1196.5406433105468, "kl_loss_3": 831.4658630371093, "kl_loss_7": 209.4309959411621, "learning_rate": 0.0007234538261112341, "loss": 608.9998, "step": 3590 }, { "ce_loss_10": 3.6725340247154237, "ce_loss_13": 3.6092687249183655, "ce_loss_2": 4.202276730537415, "ce_loss_3": 4.014237463474274, "ce_loss_7": 3.7282424688339235, "epoch": 0.36, "grad_norm": 400.0, "kl_loss_10": 101.90313911437988, "kl_loss_2": 1228.7942749023437, "kl_loss_3": 851.1504791259765, "kl_loss_7": 214.15290603637695, "learning_rate": 0.0007220333063028871, "loss": 593.6457, "step": 3600 }, { "ce_loss_10": 3.7029056310653687, "ce_loss_13": 3.6388812899589538, "ce_loss_2": 4.263094091415406, "ce_loss_3": 4.055423867702484, "ce_loss_7": 3.7583480000495912, "epoch": 0.361, "grad_norm": 406.0, "kl_loss_10": 103.6033935546875, "kl_loss_2": 1316.5648254394532, "kl_loss_3": 896.4495971679687, "kl_loss_7": 217.90971908569335, "learning_rate": 0.0007206105506216106, "loss": 621.4246, "step": 3610 }, { "ce_loss_10": 3.582909846305847, "ce_loss_13": 3.5207375407218935, "ce_loss_2": 4.105194330215454, "ce_loss_3": 3.92072172164917, "ce_loss_7": 3.6367709159851076, "epoch": 0.362, "grad_norm": 488.0, "kl_loss_10": 100.51245307922363, "kl_loss_2": 1208.4382385253907, "kl_loss_3": 842.719369506836, "kl_loss_7": 209.43429107666014, "learning_rate": 0.0007191855733945387, "loss": 586.8207, "step": 3620 }, { "ce_loss_10": 3.6772588729858398, "ce_loss_13": 3.611865592002869, "ce_loss_2": 4.192759323120117, "ce_loss_3": 4.0132176041603085, "ce_loss_7": 3.7312068581581115, "epoch": 0.363, "grad_norm": 482.0, "kl_loss_10": 103.05736274719239, "kl_loss_2": 1206.339794921875, "kl_loss_3": 840.5841491699218, "kl_loss_7": 209.33160095214845, "learning_rate": 0.0007177583889711762, "loss": 590.5756, "step": 3630 }, { "ce_loss_10": 3.5943727612495424, "ce_loss_13": 3.5278201699256897, "ce_loss_2": 4.115126085281372, "ce_loss_3": 3.9359707951545717, "ce_loss_7": 3.64764518737793, "epoch": 0.364, "grad_norm": 474.0, "kl_loss_10": 104.63778533935547, "kl_loss_2": 1232.7115539550782, "kl_loss_3": 867.7350891113281, "kl_loss_7": 215.38798904418945, "learning_rate": 0.0007163290117232541, "loss": 602.1762, "step": 3640 }, { "ce_loss_10": 3.719394052028656, "ce_loss_13": 3.6543713212013245, "ce_loss_2": 4.207157838344574, "ce_loss_3": 4.033388280868531, "ce_loss_7": 3.766360378265381, "epoch": 0.365, "grad_norm": 516.0, "kl_loss_10": 106.55956001281739, "kl_loss_2": 1177.5490844726562, "kl_loss_3": 820.275503540039, "kl_loss_7": 210.7781494140625, "learning_rate": 0.0007148974560445859, "loss": 585.3312, "step": 3650 }, { "ce_loss_10": 3.63283451795578, "ce_loss_13": 3.569260811805725, "ce_loss_2": 4.140059876441955, "ce_loss_3": 3.9612114429473877, "ce_loss_7": 3.68426718711853, "epoch": 0.366, "grad_norm": 446.0, "kl_loss_10": 101.39652633666992, "kl_loss_2": 1181.2005432128906, "kl_loss_3": 826.3975830078125, "kl_loss_7": 208.74162216186522, "learning_rate": 0.0007134637363509209, "loss": 580.396, "step": 3660 }, { "ce_loss_10": 3.740837073326111, "ce_loss_13": 3.676628518104553, "ce_loss_2": 4.238210546970367, "ce_loss_3": 4.064305305480957, "ce_loss_7": 3.7917707443237303, "epoch": 0.367, "grad_norm": 374.0, "kl_loss_10": 102.68134994506836, "kl_loss_2": 1165.9671203613282, "kl_loss_3": 815.8925506591797, "kl_loss_7": 205.73183975219726, "learning_rate": 0.0007120278670798009, "loss": 586.6874, "step": 3670 }, { "ce_loss_10": 3.530075693130493, "ce_loss_13": 3.467638063430786, "ce_loss_2": 4.08873633146286, "ce_loss_3": 3.8983967661857606, "ce_loss_7": 3.590684974193573, "epoch": 0.368, "grad_norm": 504.0, "kl_loss_10": 102.20494270324707, "kl_loss_2": 1276.5897247314454, "kl_loss_3": 894.699105834961, "kl_loss_7": 217.834383392334, "learning_rate": 0.0007105898626904133, "loss": 620.3519, "step": 3680 }, { "ce_loss_10": 3.6397287964820864, "ce_loss_13": 3.576084387302399, "ce_loss_2": 4.165349864959717, "ce_loss_3": 3.9844519972801207, "ce_loss_7": 3.6932525277137755, "epoch": 0.369, "grad_norm": 548.0, "kl_loss_10": 103.31561088562012, "kl_loss_2": 1214.6401062011719, "kl_loss_3": 850.1350677490234, "kl_loss_7": 211.8514373779297, "learning_rate": 0.0007091497376634463, "loss": 587.3888, "step": 3690 }, { "ce_loss_10": 3.580397891998291, "ce_loss_13": 3.518483591079712, "ce_loss_2": 4.098948669433594, "ce_loss_3": 3.9198103308677674, "ce_loss_7": 3.633430314064026, "epoch": 0.37, "grad_norm": 462.0, "kl_loss_10": 102.7860034942627, "kl_loss_2": 1196.8778686523438, "kl_loss_3": 839.7853210449218, "kl_loss_7": 210.37151184082032, "learning_rate": 0.0007077075065009433, "loss": 599.0922, "step": 3700 }, { "ce_loss_10": 3.6922479033470155, "ce_loss_13": 3.6247249126434324, "ce_loss_2": 4.215528225898742, "ce_loss_3": 4.034583401679993, "ce_loss_7": 3.7439934253692626, "epoch": 0.371, "grad_norm": 436.0, "kl_loss_10": 107.0543056488037, "kl_loss_2": 1234.6434143066406, "kl_loss_3": 869.9170135498047, "kl_loss_7": 215.78035430908204, "learning_rate": 0.0007062631837261557, "loss": 601.1125, "step": 3710 }, { "ce_loss_10": 3.558840346336365, "ce_loss_13": 3.4976505637168884, "ce_loss_2": 4.082807242870331, "ce_loss_3": 3.90502552986145, "ce_loss_7": 3.611116898059845, "epoch": 0.372, "grad_norm": 418.0, "kl_loss_10": 102.55169563293457, "kl_loss_2": 1217.97548828125, "kl_loss_3": 855.1094757080078, "kl_loss_7": 209.0750946044922, "learning_rate": 0.0007048167838833977, "loss": 602.8635, "step": 3720 }, { "ce_loss_10": 3.6581831574440002, "ce_loss_13": 3.593174624443054, "ce_loss_2": 4.162305021286011, "ce_loss_3": 3.9847410321235657, "ce_loss_7": 3.7109787225723267, "epoch": 0.373, "grad_norm": 536.0, "kl_loss_10": 103.06450958251953, "kl_loss_2": 1197.146795654297, "kl_loss_3": 834.3573669433594, "kl_loss_7": 209.46187515258788, "learning_rate": 0.0007033683215379002, "loss": 588.3938, "step": 3730 }, { "ce_loss_10": 3.6515901923179626, "ce_loss_13": 3.586732280254364, "ce_loss_2": 4.166427576541901, "ce_loss_3": 3.9861610412597654, "ce_loss_7": 3.703124833106995, "epoch": 0.374, "grad_norm": 384.0, "kl_loss_10": 101.91668891906738, "kl_loss_2": 1196.090036010742, "kl_loss_3": 834.7775848388671, "kl_loss_7": 206.9270217895508, "learning_rate": 0.0007019178112756625, "loss": 596.7028, "step": 3740 }, { "ce_loss_10": 3.5998276591300966, "ce_loss_13": 3.539226603507996, "ce_loss_2": 4.120503497123718, "ce_loss_3": 3.938064229488373, "ce_loss_7": 3.6514668703079223, "epoch": 0.375, "grad_norm": 484.0, "kl_loss_10": 101.7071418762207, "kl_loss_2": 1206.4351013183593, "kl_loss_3": 842.5018493652344, "kl_loss_7": 207.55127868652343, "learning_rate": 0.0007004652677033068, "loss": 596.7216, "step": 3750 }, { "ce_loss_10": 3.6823023438453673, "ce_loss_13": 3.6218234419822695, "ce_loss_2": 4.1750637769699095, "ce_loss_3": 4.004729413986206, "ce_loss_7": 3.732503056526184, "epoch": 0.376, "grad_norm": 388.0, "kl_loss_10": 99.9868221282959, "kl_loss_2": 1168.4398498535156, "kl_loss_3": 816.7180572509766, "kl_loss_7": 201.70328750610352, "learning_rate": 0.0006990107054479312, "loss": 584.5167, "step": 3760 }, { "ce_loss_10": 3.667929840087891, "ce_loss_13": 3.6051357984542847, "ce_loss_2": 4.166206574440002, "ce_loss_3": 3.9985297203063963, "ce_loss_7": 3.719240057468414, "epoch": 0.377, "grad_norm": 496.0, "kl_loss_10": 102.5582088470459, "kl_loss_2": 1182.1695739746094, "kl_loss_3": 832.6118957519532, "kl_loss_7": 206.43120498657225, "learning_rate": 0.000697554139156961, "loss": 586.6759, "step": 3770 }, { "ce_loss_10": 3.648312306404114, "ce_loss_13": 3.5864667892456055, "ce_loss_2": 4.165168154239654, "ce_loss_3": 3.980992519855499, "ce_loss_7": 3.703998303413391, "epoch": 0.378, "grad_norm": 532.0, "kl_loss_10": 102.77268753051757, "kl_loss_2": 1217.0308044433593, "kl_loss_3": 845.4426635742187, "kl_loss_7": 211.65556106567382, "learning_rate": 0.0006960955834980027, "loss": 586.9333, "step": 3780 }, { "ce_loss_10": 3.624769401550293, "ce_loss_13": 3.559871160984039, "ce_loss_2": 4.141481828689575, "ce_loss_3": 3.9655247926712036, "ce_loss_7": 3.681060993671417, "epoch": 0.379, "grad_norm": 402.0, "kl_loss_10": 104.66882057189942, "kl_loss_2": 1194.9725402832032, "kl_loss_3": 840.5210388183593, "kl_loss_7": 214.32746124267578, "learning_rate": 0.0006946350531586958, "loss": 591.0428, "step": 3790 }, { "ce_loss_10": 3.6484233260154726, "ce_loss_13": 3.5856125354766846, "ce_loss_2": 4.168078374862671, "ce_loss_3": 3.984307587146759, "ce_loss_7": 3.7046299457550047, "epoch": 0.38, "grad_norm": 494.0, "kl_loss_10": 102.10320167541504, "kl_loss_2": 1202.4750549316407, "kl_loss_3": 836.4282287597656, "kl_loss_7": 215.46153411865234, "learning_rate": 0.0006931725628465643, "loss": 600.8652, "step": 3800 }, { "ce_loss_10": 3.669872498512268, "ce_loss_13": 3.606708490848541, "ce_loss_2": 4.190654408931732, "ce_loss_3": 4.012761104106903, "ce_loss_7": 3.725092887878418, "epoch": 0.381, "grad_norm": 462.0, "kl_loss_10": 105.94147644042968, "kl_loss_2": 1198.5632446289062, "kl_loss_3": 842.3563995361328, "kl_loss_7": 216.23879013061523, "learning_rate": 0.0006917081272888696, "loss": 594.3836, "step": 3810 }, { "ce_loss_10": 3.5702871322631835, "ce_loss_13": 3.503624665737152, "ce_loss_2": 4.083241939544678, "ce_loss_3": 3.9013825416564942, "ce_loss_7": 3.6281121611595153, "epoch": 0.382, "grad_norm": 430.0, "kl_loss_10": 104.559330368042, "kl_loss_2": 1205.9051391601563, "kl_loss_3": 846.9787689208985, "kl_loss_7": 214.2649803161621, "learning_rate": 0.0006902417612324615, "loss": 588.9565, "step": 3820 }, { "ce_loss_10": 3.705217492580414, "ce_loss_13": 3.6370500326156616, "ce_loss_2": 4.2347581624984745, "ce_loss_3": 4.056124079227447, "ce_loss_7": 3.7589930057525636, "epoch": 0.383, "grad_norm": 418.0, "kl_loss_10": 107.22665023803711, "kl_loss_2": 1242.482080078125, "kl_loss_3": 871.0590393066407, "kl_loss_7": 218.71700134277344, "learning_rate": 0.00068877347944363, "loss": 600.3775, "step": 3830 }, { "ce_loss_10": 3.6945597529411316, "ce_loss_13": 3.6302199006080627, "ce_loss_2": 4.190360188484192, "ce_loss_3": 4.017442071437836, "ce_loss_7": 3.74516099691391, "epoch": 0.384, "grad_norm": 460.0, "kl_loss_10": 105.2132453918457, "kl_loss_2": 1180.0169799804687, "kl_loss_3": 825.1839294433594, "kl_loss_7": 210.17990188598634, "learning_rate": 0.0006873032967079561, "loss": 592.1172, "step": 3840 }, { "ce_loss_10": 3.6860820412635804, "ce_loss_13": 3.622925412654877, "ce_loss_2": 4.173858499526977, "ce_loss_3": 4.0060118436813354, "ce_loss_7": 3.7361050128936766, "epoch": 0.385, "grad_norm": 444.0, "kl_loss_10": 102.31974792480469, "kl_loss_2": 1169.402410888672, "kl_loss_3": 819.6500732421875, "kl_loss_7": 207.8970947265625, "learning_rate": 0.0006858312278301637, "loss": 578.5368, "step": 3850 }, { "ce_loss_10": 3.724821174144745, "ce_loss_13": 3.6599106669425963, "ce_loss_2": 4.216867661476135, "ce_loss_3": 4.043015420436859, "ce_loss_7": 3.7741833090782166, "epoch": 0.386, "grad_norm": 628.0, "kl_loss_10": 105.45792541503906, "kl_loss_2": 1182.8445251464843, "kl_loss_3": 827.4248168945312, "kl_loss_7": 208.66201171875, "learning_rate": 0.0006843572876339704, "loss": 581.9299, "step": 3860 }, { "ce_loss_10": 3.639630389213562, "ce_loss_13": 3.578851103782654, "ce_loss_2": 4.1167685151100155, "ce_loss_3": 3.953296732902527, "ce_loss_7": 3.6866363167762755, "epoch": 0.387, "grad_norm": 402.0, "kl_loss_10": 101.30325736999512, "kl_loss_2": 1144.7853637695312, "kl_loss_3": 802.1904113769531, "kl_loss_7": 201.72076492309571, "learning_rate": 0.0006828814909619373, "loss": 586.7184, "step": 3870 }, { "ce_loss_10": 3.7647191643714906, "ce_loss_13": 3.697379672527313, "ce_loss_2": 4.260519480705261, "ce_loss_3": 4.083593368530273, "ce_loss_7": 3.813764202594757, "epoch": 0.388, "grad_norm": 350.0, "kl_loss_10": 106.36605720520019, "kl_loss_2": 1172.6269104003907, "kl_loss_3": 820.4572174072266, "kl_loss_7": 210.88503875732422, "learning_rate": 0.0006814038526753205, "loss": 576.9886, "step": 3880 }, { "ce_loss_10": 3.6557364583015444, "ce_loss_13": 3.5924967169761657, "ce_loss_2": 4.160025131702423, "ce_loss_3": 3.984356963634491, "ce_loss_7": 3.7067020535469055, "epoch": 0.389, "grad_norm": 330.0, "kl_loss_10": 102.68659782409668, "kl_loss_2": 1186.152001953125, "kl_loss_3": 826.8501800537109, "kl_loss_7": 206.71521759033203, "learning_rate": 0.0006799243876539213, "loss": 580.4666, "step": 3890 }, { "ce_loss_10": 3.5759631991386414, "ce_loss_13": 3.5127877712249758, "ce_loss_2": 4.105723321437836, "ce_loss_3": 3.9167493343353272, "ce_loss_7": 3.6288220643997193, "epoch": 0.39, "grad_norm": 536.0, "kl_loss_10": 103.75163269042969, "kl_loss_2": 1215.1460266113281, "kl_loss_3": 839.8725982666016, "kl_loss_7": 208.5065475463867, "learning_rate": 0.0006784431107959359, "loss": 592.4442, "step": 3900 }, { "ce_loss_10": 3.639443838596344, "ce_loss_13": 3.5752380013465883, "ce_loss_2": 4.170507109165191, "ce_loss_3": 3.9816882967948914, "ce_loss_7": 3.694754195213318, "epoch": 0.391, "grad_norm": 510.0, "kl_loss_10": 103.07575302124023, "kl_loss_2": 1237.5377136230468, "kl_loss_3": 858.0287719726563, "kl_loss_7": 214.26128845214845, "learning_rate": 0.0006769600370178059, "loss": 594.2272, "step": 3910 }, { "ce_loss_10": 3.607291209697723, "ce_loss_13": 3.5426042318344115, "ce_loss_2": 4.134967279434204, "ce_loss_3": 3.9495469093322755, "ce_loss_7": 3.6644778490066527, "epoch": 0.392, "grad_norm": 348.0, "kl_loss_10": 100.81994514465332, "kl_loss_2": 1201.7113891601562, "kl_loss_3": 841.3645660400391, "kl_loss_7": 207.30770874023438, "learning_rate": 0.0006754751812540679, "loss": 578.4809, "step": 3920 }, { "ce_loss_10": 3.6542662262916563, "ce_loss_13": 3.5899597883224486, "ce_loss_2": 4.172767472267151, "ce_loss_3": 3.9909741401672365, "ce_loss_7": 3.706152844429016, "epoch": 0.393, "grad_norm": 440.0, "kl_loss_10": 104.03220100402832, "kl_loss_2": 1209.6233947753906, "kl_loss_3": 843.8147003173829, "kl_loss_7": 210.7646583557129, "learning_rate": 0.0006739885584572025, "loss": 592.3653, "step": 3930 }, { "ce_loss_10": 3.685343015193939, "ce_loss_13": 3.619848680496216, "ce_loss_2": 4.199707639217377, "ce_loss_3": 4.017499768733979, "ce_loss_7": 3.734171211719513, "epoch": 0.394, "grad_norm": 564.0, "kl_loss_10": 107.80731964111328, "kl_loss_2": 1232.0240844726563, "kl_loss_3": 850.9272064208984, "kl_loss_7": 211.88618087768555, "learning_rate": 0.0006725001835974853, "loss": 590.3288, "step": 3940 }, { "ce_loss_10": 3.671092712879181, "ce_loss_13": 3.6061443567276, "ce_loss_2": 4.189756679534912, "ce_loss_3": 4.005955624580383, "ce_loss_7": 3.7217952370643617, "epoch": 0.395, "grad_norm": 472.0, "kl_loss_10": 105.94960823059083, "kl_loss_2": 1209.6172180175781, "kl_loss_3": 848.8837646484375, "kl_loss_7": 211.4744026184082, "learning_rate": 0.0006710100716628344, "loss": 581.9217, "step": 3950 }, { "ce_loss_10": 3.6513510942459106, "ce_loss_13": 3.586063766479492, "ce_loss_2": 4.175520932674408, "ce_loss_3": 3.992800068855286, "ce_loss_7": 3.7037784814834596, "epoch": 0.396, "grad_norm": 556.0, "kl_loss_10": 102.45261993408204, "kl_loss_2": 1202.025439453125, "kl_loss_3": 843.4705932617187, "kl_loss_7": 207.75647506713867, "learning_rate": 0.0006695182376586602, "loss": 594.7452, "step": 3960 }, { "ce_loss_10": 3.6946488857269286, "ce_loss_13": 3.6310433030128477, "ce_loss_2": 4.180384719371796, "ce_loss_3": 4.00883582830429, "ce_loss_7": 3.739116144180298, "epoch": 0.397, "grad_norm": 484.0, "kl_loss_10": 100.45674743652344, "kl_loss_2": 1141.924838256836, "kl_loss_3": 795.2099151611328, "kl_loss_7": 201.57386474609376, "learning_rate": 0.000668024696607715, "loss": 581.8865, "step": 3970 }, { "ce_loss_10": 3.63701788187027, "ce_loss_13": 3.5759130001068113, "ce_loss_2": 4.141798782348633, "ce_loss_3": 3.965423548221588, "ce_loss_7": 3.691797506809235, "epoch": 0.398, "grad_norm": 402.0, "kl_loss_10": 99.83709602355957, "kl_loss_2": 1189.6253723144532, "kl_loss_3": 836.8567596435547, "kl_loss_7": 210.05224533081054, "learning_rate": 0.0006665294635499404, "loss": 585.3059, "step": 3980 }, { "ce_loss_10": 3.645500433444977, "ce_loss_13": 3.5827003002166746, "ce_loss_2": 4.174324834346772, "ce_loss_3": 3.992855429649353, "ce_loss_7": 3.7015270590782166, "epoch": 0.399, "grad_norm": 438.0, "kl_loss_10": 103.66120948791504, "kl_loss_2": 1245.642510986328, "kl_loss_3": 869.6440063476563, "kl_loss_7": 216.26355361938477, "learning_rate": 0.0006650325535423167, "loss": 596.3225, "step": 3990 }, { "ce_loss_10": 3.6747123122215273, "ce_loss_13": 3.6138512253761292, "ce_loss_2": 4.168187916278839, "ce_loss_3": 3.993897998332977, "ce_loss_7": 3.725596582889557, "epoch": 0.4, "grad_norm": 520.0, "kl_loss_10": 96.3211498260498, "kl_loss_2": 1152.9211303710938, "kl_loss_3": 801.8546081542969, "kl_loss_7": 200.72928695678712, "learning_rate": 0.0006635339816587109, "loss": 575.9933, "step": 4000 }, { "ce_loss_10": 3.6128929018974305, "ce_loss_13": 3.548132801055908, "ce_loss_2": 4.128501725196839, "ce_loss_3": 3.945591115951538, "ce_loss_7": 3.6652005195617674, "epoch": 0.401, "grad_norm": 430.0, "kl_loss_10": 103.19527244567871, "kl_loss_2": 1214.8156677246093, "kl_loss_3": 840.3229400634766, "kl_loss_7": 210.74479904174805, "learning_rate": 0.0006620337629897252, "loss": 583.2822, "step": 4010 }, { "ce_loss_10": 3.619123613834381, "ce_loss_13": 3.5573631048202516, "ce_loss_2": 4.140160727500915, "ce_loss_3": 3.958257591724396, "ce_loss_7": 3.674074041843414, "epoch": 0.402, "grad_norm": 432.0, "kl_loss_10": 100.38173408508301, "kl_loss_2": 1207.5167907714845, "kl_loss_3": 837.2485626220703, "kl_loss_7": 208.48973083496094, "learning_rate": 0.0006605319126425454, "loss": 597.1898, "step": 4020 }, { "ce_loss_10": 3.5208260893821715, "ce_loss_13": 3.4589377880096435, "ce_loss_2": 4.050716698169708, "ce_loss_3": 3.8632638931274412, "ce_loss_7": 3.5759450912475588, "epoch": 0.403, "grad_norm": 420.0, "kl_loss_10": 100.48741989135742, "kl_loss_2": 1233.5194946289062, "kl_loss_3": 854.4578369140625, "kl_loss_7": 208.70274200439454, "learning_rate": 0.0006590284457407876, "loss": 593.5098, "step": 4030 }, { "ce_loss_10": 3.6270558714866636, "ce_loss_13": 3.5626144886016844, "ce_loss_2": 4.136511921882629, "ce_loss_3": 3.957785797119141, "ce_loss_7": 3.6768479347229004, "epoch": 0.404, "grad_norm": 392.0, "kl_loss_10": 101.69999923706055, "kl_loss_2": 1185.4601745605469, "kl_loss_3": 821.0296905517578, "kl_loss_7": 206.82139434814454, "learning_rate": 0.0006575233774243465, "loss": 582.2525, "step": 4040 }, { "ce_loss_10": 3.612906110286713, "ce_loss_13": 3.550376224517822, "ce_loss_2": 4.1283538222312925, "ce_loss_3": 3.951547086238861, "ce_loss_7": 3.667691433429718, "epoch": 0.405, "grad_norm": 464.0, "kl_loss_10": 100.57203559875488, "kl_loss_2": 1203.0161071777343, "kl_loss_3": 838.8151794433594, "kl_loss_7": 210.55067977905273, "learning_rate": 0.0006560167228492435, "loss": 587.686, "step": 4050 }, { "ce_loss_10": 3.6582042455673216, "ce_loss_13": 3.597072696685791, "ce_loss_2": 4.15371550321579, "ce_loss_3": 3.9819828867912292, "ce_loss_7": 3.7127379179000854, "epoch": 0.406, "grad_norm": 396.0, "kl_loss_10": 97.44431228637696, "kl_loss_2": 1157.4290466308594, "kl_loss_3": 807.0505889892578, "kl_loss_7": 202.94429702758788, "learning_rate": 0.0006545084971874737, "loss": 580.7177, "step": 4060 }, { "ce_loss_10": 3.6273567199707033, "ce_loss_13": 3.564158725738525, "ce_loss_2": 4.158101809024811, "ce_loss_3": 3.9733991026878357, "ce_loss_7": 3.685515010356903, "epoch": 0.407, "grad_norm": 372.0, "kl_loss_10": 103.08215293884277, "kl_loss_2": 1230.8001892089844, "kl_loss_3": 853.4359588623047, "kl_loss_7": 216.80452346801758, "learning_rate": 0.0006529987156268526, "loss": 583.8351, "step": 4070 }, { "ce_loss_10": 3.5464280128479, "ce_loss_13": 3.481638014316559, "ce_loss_2": 4.076263022422791, "ce_loss_3": 3.8974447727203367, "ce_loss_7": 3.6043801426887514, "epoch": 0.408, "grad_norm": 350.0, "kl_loss_10": 102.87330780029296, "kl_loss_2": 1214.2586059570312, "kl_loss_3": 851.9112091064453, "kl_loss_7": 211.73340759277343, "learning_rate": 0.0006514873933708637, "loss": 602.7298, "step": 4080 }, { "ce_loss_10": 3.6543262004852295, "ce_loss_13": 3.5908489346504213, "ce_loss_2": 4.153554606437683, "ce_loss_3": 3.9771866679191588, "ce_loss_7": 3.703446090221405, "epoch": 0.409, "grad_norm": 378.0, "kl_loss_10": 100.85495872497559, "kl_loss_2": 1179.416357421875, "kl_loss_3": 822.3047607421875, "kl_loss_7": 207.08517990112304, "learning_rate": 0.0006499745456385053, "loss": 579.5981, "step": 4090 }, { "ce_loss_10": 3.622114622592926, "ce_loss_13": 3.5604026079177857, "ce_loss_2": 4.138943600654602, "ce_loss_3": 3.9601905822753904, "ce_loss_7": 3.6786248087882996, "epoch": 0.41, "grad_norm": 460.0, "kl_loss_10": 101.49279441833497, "kl_loss_2": 1187.613018798828, "kl_loss_3": 832.265737915039, "kl_loss_7": 211.90668182373048, "learning_rate": 0.0006484601876641375, "loss": 591.7443, "step": 4100 }, { "ce_loss_10": 3.6106685280799864, "ce_loss_13": 3.5491909265518187, "ce_loss_2": 4.104636693000794, "ce_loss_3": 3.9329436659812926, "ce_loss_7": 3.6641584396362306, "epoch": 0.411, "grad_norm": 378.0, "kl_loss_10": 101.25703315734863, "kl_loss_2": 1168.0580017089844, "kl_loss_3": 813.8080810546875, "kl_loss_7": 212.12922592163085, "learning_rate": 0.000646944334697328, "loss": 577.3537, "step": 4110 }, { "ce_loss_10": 3.7338776111602785, "ce_loss_13": 3.665091943740845, "ce_loss_2": 4.2223006844520565, "ce_loss_3": 4.049113523960114, "ce_loss_7": 3.799789845943451, "epoch": 0.412, "grad_norm": 450.0, "kl_loss_10": 109.65744743347167, "kl_loss_2": 1151.4740142822266, "kl_loss_3": 801.2218536376953, "kl_loss_7": 236.72526626586915, "learning_rate": 0.0006454270020026995, "loss": 574.9525, "step": 4120 }, { "ce_loss_10": 3.69082772731781, "ce_loss_13": 3.6286051154136656, "ce_loss_2": 4.175914537906647, "ce_loss_3": 4.002845597267151, "ce_loss_7": 3.7393308877944946, "epoch": 0.413, "grad_norm": 580.0, "kl_loss_10": 104.95364952087402, "kl_loss_2": 1127.3133270263672, "kl_loss_3": 788.5207000732422, "kl_loss_7": 214.98480072021485, "learning_rate": 0.0006439082048597755, "loss": 564.7141, "step": 4130 }, { "ce_loss_10": 3.683094894886017, "ce_loss_13": 3.61643271446228, "ce_loss_2": 4.181109619140625, "ce_loss_3": 4.005432403087616, "ce_loss_7": 3.745869052410126, "epoch": 0.414, "grad_norm": 520.0, "kl_loss_10": 111.28029708862304, "kl_loss_2": 1178.55703125, "kl_loss_3": 823.4579254150391, "kl_loss_7": 238.62436599731444, "learning_rate": 0.0006423879585628261, "loss": 585.353, "step": 4140 }, { "ce_loss_10": 3.648063910007477, "ce_loss_13": 3.579416477680206, "ce_loss_2": 4.166888773441315, "ce_loss_3": 3.98115758895874, "ce_loss_7": 3.7089965462684633, "epoch": 0.415, "grad_norm": 402.0, "kl_loss_10": 109.57027854919434, "kl_loss_2": 1214.0814270019532, "kl_loss_3": 843.1505004882813, "kl_loss_7": 233.17276763916016, "learning_rate": 0.0006408662784207149, "loss": 596.7986, "step": 4150 }, { "ce_loss_10": 3.596502733230591, "ce_loss_13": 3.5327386379241945, "ce_loss_2": 4.09819370508194, "ce_loss_3": 3.9237332344055176, "ce_loss_7": 3.654523158073425, "epoch": 0.416, "grad_norm": 544.0, "kl_loss_10": 99.90503120422363, "kl_loss_2": 1189.1891540527345, "kl_loss_3": 823.6777069091797, "kl_loss_7": 211.67333221435547, "learning_rate": 0.0006393431797567439, "loss": 583.1826, "step": 4160 }, { "ce_loss_10": 3.6853842735290527, "ce_loss_13": 3.622405004501343, "ce_loss_2": 4.1561102867126465, "ce_loss_3": 3.9865566968917845, "ce_loss_7": 3.7344152450561525, "epoch": 0.417, "grad_norm": 384.0, "kl_loss_10": 103.1281753540039, "kl_loss_2": 1144.869805908203, "kl_loss_3": 800.3423767089844, "kl_loss_7": 211.40862579345702, "learning_rate": 0.0006378186779084996, "loss": 557.4173, "step": 4170 }, { "ce_loss_10": 3.5140963315963747, "ce_loss_13": 3.452511179447174, "ce_loss_2": 4.041843056678772, "ce_loss_3": 3.857197344303131, "ce_loss_7": 3.571711480617523, "epoch": 0.418, "grad_norm": 464.0, "kl_loss_10": 100.09027862548828, "kl_loss_2": 1203.0338989257812, "kl_loss_3": 838.9081939697265, "kl_loss_7": 213.11346130371095, "learning_rate": 0.0006362927882276989, "loss": 588.2966, "step": 4180 }, { "ce_loss_10": 3.7188942313194273, "ce_loss_13": 3.6518460750579833, "ce_loss_2": 4.204531168937683, "ce_loss_3": 4.025935411453247, "ce_loss_7": 3.7728618144989015, "epoch": 0.419, "grad_norm": 426.0, "kl_loss_10": 103.15027618408203, "kl_loss_2": 1156.1428161621093, "kl_loss_3": 794.2856292724609, "kl_loss_7": 211.89537048339844, "learning_rate": 0.000634765526080034, "loss": 562.2326, "step": 4190 }, { "ce_loss_10": 3.717780148983002, "ce_loss_13": 3.6511818051338194, "ce_loss_2": 4.210239946842194, "ce_loss_3": 4.0393988490104675, "ce_loss_7": 3.7724336862564085, "epoch": 0.42, "grad_norm": 456.0, "kl_loss_10": 104.51988563537597, "kl_loss_2": 1161.7059631347656, "kl_loss_3": 818.392855834961, "kl_loss_7": 219.07965316772462, "learning_rate": 0.0006332369068450174, "loss": 570.1012, "step": 4200 }, { "ce_loss_10": 3.648071753978729, "ce_loss_13": 3.5840353846549986, "ce_loss_2": 4.147714996337891, "ce_loss_3": 3.972030484676361, "ce_loss_7": 3.7039226770401, "epoch": 0.421, "grad_norm": 426.0, "kl_loss_10": 101.72255935668946, "kl_loss_2": 1175.2358459472657, "kl_loss_3": 821.6455657958984, "kl_loss_7": 216.67398834228516, "learning_rate": 0.0006317069459158283, "loss": 576.074, "step": 4210 }, { "ce_loss_10": 3.766611933708191, "ce_loss_13": 3.7019524574279785, "ce_loss_2": 4.238518404960632, "ce_loss_3": 4.070182096958161, "ce_loss_7": 3.818829393386841, "epoch": 0.422, "grad_norm": 404.0, "kl_loss_10": 102.42731742858886, "kl_loss_2": 1134.2777221679687, "kl_loss_3": 793.5420806884765, "kl_loss_7": 214.86822509765625, "learning_rate": 0.0006301756586991561, "loss": 572.4437, "step": 4220 }, { "ce_loss_10": 3.538297724723816, "ce_loss_13": 3.4769801259040833, "ce_loss_2": 4.051598787307739, "ce_loss_3": 3.8692006349563597, "ce_loss_7": 3.592081093788147, "epoch": 0.423, "grad_norm": 524.0, "kl_loss_10": 100.02308959960938, "kl_loss_2": 1219.534228515625, "kl_loss_3": 847.8958953857422, "kl_loss_7": 217.3907485961914, "learning_rate": 0.0006286430606150459, "loss": 590.4341, "step": 4230 }, { "ce_loss_10": 3.732722854614258, "ce_loss_13": 3.670178234577179, "ce_loss_2": 4.228793060779571, "ce_loss_3": 4.055911266803742, "ce_loss_7": 3.7854557275772094, "epoch": 0.424, "grad_norm": 440.0, "kl_loss_10": 101.63710746765136, "kl_loss_2": 1171.4819213867188, "kl_loss_3": 815.24853515625, "kl_loss_7": 212.84099502563475, "learning_rate": 0.0006271091670967436, "loss": 572.0026, "step": 4240 }, { "ce_loss_10": 3.64589341878891, "ce_loss_13": 3.579445707798004, "ce_loss_2": 4.168534743785858, "ce_loss_3": 3.9873276472091677, "ce_loss_7": 3.7041419625282286, "epoch": 0.425, "grad_norm": 436.0, "kl_loss_10": 105.33321189880371, "kl_loss_2": 1223.9686584472656, "kl_loss_3": 856.7900268554688, "kl_loss_7": 219.8565589904785, "learning_rate": 0.0006255739935905395, "loss": 587.2729, "step": 4250 }, { "ce_loss_10": 3.684093916416168, "ce_loss_13": 3.622530627250671, "ce_loss_2": 4.176068413257599, "ce_loss_3": 4.005461478233338, "ce_loss_7": 3.73612722158432, "epoch": 0.426, "grad_norm": 444.0, "kl_loss_10": 101.16957168579101, "kl_loss_2": 1151.114599609375, "kl_loss_3": 804.5711151123047, "kl_loss_7": 206.51019058227538, "learning_rate": 0.0006240375555556145, "loss": 584.5814, "step": 4260 }, { "ce_loss_10": 3.694865620136261, "ce_loss_13": 3.6328345060348513, "ce_loss_2": 4.216705179214477, "ce_loss_3": 4.035941934585571, "ce_loss_7": 3.7489806532859804, "epoch": 0.427, "grad_norm": 544.0, "kl_loss_10": 102.23134536743164, "kl_loss_2": 1200.0044555664062, "kl_loss_3": 832.4086944580079, "kl_loss_7": 208.58624954223632, "learning_rate": 0.000622499868463882, "loss": 581.1191, "step": 4270 }, { "ce_loss_10": 3.6664886713027953, "ce_loss_13": 3.6031296968460085, "ce_loss_2": 4.138775157928467, "ce_loss_3": 3.968552088737488, "ce_loss_7": 3.716127264499664, "epoch": 0.428, "grad_norm": 442.0, "kl_loss_10": 102.83601112365723, "kl_loss_2": 1148.9752075195313, "kl_loss_3": 798.4193389892578, "kl_loss_7": 204.8626609802246, "learning_rate": 0.0006209609477998338, "loss": 570.8694, "step": 4280 }, { "ce_loss_10": 3.7170133352279664, "ce_loss_13": 3.6512863278388976, "ce_loss_2": 4.214985513687134, "ce_loss_3": 4.041373360157013, "ce_loss_7": 3.76862713098526, "epoch": 0.429, "grad_norm": 492.0, "kl_loss_10": 105.98460693359375, "kl_loss_2": 1171.2547790527344, "kl_loss_3": 819.7431121826172, "kl_loss_7": 209.78300704956055, "learning_rate": 0.0006194208090603844, "loss": 582.6892, "step": 4290 }, { "ce_loss_10": 3.636822462081909, "ce_loss_13": 3.572554814815521, "ce_loss_2": 4.128273499011994, "ce_loss_3": 3.9540862798690797, "ce_loss_7": 3.6845338463783266, "epoch": 0.43, "grad_norm": 384.0, "kl_loss_10": 104.19713554382324, "kl_loss_2": 1158.2531616210938, "kl_loss_3": 808.0290679931641, "kl_loss_7": 201.06265716552736, "learning_rate": 0.0006178794677547138, "loss": 566.7275, "step": 4300 }, { "ce_loss_10": 3.669668412208557, "ce_loss_13": 3.6048370003700256, "ce_loss_2": 4.167822825908661, "ce_loss_3": 3.990470898151398, "ce_loss_7": 3.7204079270362853, "epoch": 0.431, "grad_norm": 462.0, "kl_loss_10": 105.12696495056153, "kl_loss_2": 1189.7153015136719, "kl_loss_3": 827.7414642333985, "kl_loss_7": 209.76073608398437, "learning_rate": 0.0006163369394041111, "loss": 578.5617, "step": 4310 }, { "ce_loss_10": 3.603849542140961, "ce_loss_13": 3.540567708015442, "ce_loss_2": 4.114995861053467, "ce_loss_3": 3.93278226852417, "ce_loss_7": 3.6533514499664306, "epoch": 0.432, "grad_norm": 524.0, "kl_loss_10": 103.23071632385253, "kl_loss_2": 1199.0398742675782, "kl_loss_3": 837.4948120117188, "kl_loss_7": 206.72886505126954, "learning_rate": 0.0006147932395418205, "loss": 593.6705, "step": 4320 }, { "ce_loss_10": 3.6318950057029724, "ce_loss_13": 3.5694007515907287, "ce_loss_2": 4.121479880809784, "ce_loss_3": 3.9539971709251405, "ce_loss_7": 3.6812774300575257, "epoch": 0.433, "grad_norm": 372.0, "kl_loss_10": 101.08283462524415, "kl_loss_2": 1163.6617614746094, "kl_loss_3": 814.8068634033203, "kl_loss_7": 204.31798858642577, "learning_rate": 0.0006132483837128823, "loss": 570.1899, "step": 4330 }, { "ce_loss_10": 3.6211368441581726, "ce_loss_13": 3.5578442931175234, "ce_loss_2": 4.120713996887207, "ce_loss_3": 3.9408787965774534, "ce_loss_7": 3.6715193152427674, "epoch": 0.434, "grad_norm": 380.0, "kl_loss_10": 102.18530006408692, "kl_loss_2": 1181.1154479980469, "kl_loss_3": 821.5291748046875, "kl_loss_7": 205.94673614501954, "learning_rate": 0.0006117023874739772, "loss": 579.966, "step": 4340 }, { "ce_loss_10": 3.606392514705658, "ce_loss_13": 3.542631506919861, "ce_loss_2": 4.1229788064956665, "ce_loss_3": 3.943661665916443, "ce_loss_7": 3.660093939304352, "epoch": 0.435, "grad_norm": 366.0, "kl_loss_10": 101.41253623962402, "kl_loss_2": 1198.5234008789062, "kl_loss_3": 836.8120849609375, "kl_loss_7": 206.9767189025879, "learning_rate": 0.0006101552663932703, "loss": 586.1095, "step": 4350 }, { "ce_loss_10": 3.6401270270347594, "ce_loss_13": 3.5747036576271056, "ce_loss_2": 4.133774304389954, "ce_loss_3": 3.9579702854156493, "ce_loss_7": 3.689171576499939, "epoch": 0.436, "grad_norm": 432.0, "kl_loss_10": 103.28445014953613, "kl_loss_2": 1170.830484008789, "kl_loss_3": 821.6876098632813, "kl_loss_7": 207.47048645019532, "learning_rate": 0.0006086070360502539, "loss": 578.1617, "step": 4360 }, { "ce_loss_10": 3.6478831648826597, "ce_loss_13": 3.5829063415527345, "ce_loss_2": 4.140194344520569, "ce_loss_3": 3.9674217224121096, "ce_loss_7": 3.6954386711120604, "epoch": 0.437, "grad_norm": 324.0, "kl_loss_10": 102.49744033813477, "kl_loss_2": 1182.2726196289063, "kl_loss_3": 820.302099609375, "kl_loss_7": 202.6822937011719, "learning_rate": 0.0006070577120355903, "loss": 585.725, "step": 4370 }, { "ce_loss_10": 3.6493834018707276, "ce_loss_13": 3.585710608959198, "ce_loss_2": 4.1475905418396, "ce_loss_3": 3.9780289769172668, "ce_loss_7": 3.6994438648223875, "epoch": 0.438, "grad_norm": 464.0, "kl_loss_10": 99.22572135925293, "kl_loss_2": 1158.4001525878907, "kl_loss_3": 817.9062316894531, "kl_loss_7": 200.7786117553711, "learning_rate": 0.0006055073099509549, "loss": 570.4337, "step": 4380 }, { "ce_loss_10": 3.7072151064872743, "ce_loss_13": 3.6444019198417665, "ce_loss_2": 4.1913762331008915, "ce_loss_3": 4.024674141407013, "ce_loss_7": 3.755181634426117, "epoch": 0.439, "grad_norm": 414.0, "kl_loss_10": 101.21295433044433, "kl_loss_2": 1155.983868408203, "kl_loss_3": 813.5707092285156, "kl_loss_7": 201.68513870239258, "learning_rate": 0.0006039558454088796, "loss": 578.4039, "step": 4390 }, { "ce_loss_10": 3.6866373896598814, "ce_loss_13": 3.6209323048591613, "ce_loss_2": 4.190221071243286, "ce_loss_3": 4.017517447471619, "ce_loss_7": 3.736443567276001, "epoch": 0.44, "grad_norm": 388.0, "kl_loss_10": 103.66101570129395, "kl_loss_2": 1179.7899597167968, "kl_loss_3": 831.9971649169922, "kl_loss_7": 206.1973434448242, "learning_rate": 0.0006024033340325954, "loss": 572.2276, "step": 4400 }, { "ce_loss_10": 3.7494076251983643, "ce_loss_13": 3.6860761404037476, "ce_loss_2": 4.22088440656662, "ce_loss_3": 4.061302840709686, "ce_loss_7": 3.7976527214050293, "epoch": 0.441, "grad_norm": 384.0, "kl_loss_10": 100.95717124938965, "kl_loss_2": 1117.0268005371095, "kl_loss_3": 788.523080444336, "kl_loss_7": 197.15192718505858, "learning_rate": 0.0006008497914558743, "loss": 559.696, "step": 4410 }, { "ce_loss_10": 3.689165186882019, "ce_loss_13": 3.6250773549079893, "ce_loss_2": 4.1833924651145935, "ce_loss_3": 4.016273534297943, "ce_loss_7": 3.738771951198578, "epoch": 0.442, "grad_norm": 476.0, "kl_loss_10": 105.19830055236817, "kl_loss_2": 1174.740167236328, "kl_loss_3": 830.987890625, "kl_loss_7": 209.00811996459962, "learning_rate": 0.0005992952333228728, "loss": 576.4588, "step": 4420 }, { "ce_loss_10": 3.620419418811798, "ce_loss_13": 3.5588944792747497, "ce_loss_2": 4.125707459449768, "ce_loss_3": 3.9479523420333864, "ce_loss_7": 3.6681005358695984, "epoch": 0.443, "grad_norm": 464.0, "kl_loss_10": 100.17966499328614, "kl_loss_2": 1181.0232360839843, "kl_loss_3": 829.0245361328125, "kl_loss_7": 201.25574188232423, "learning_rate": 0.0005977396752879741, "loss": 577.6452, "step": 4430 }, { "ce_loss_10": 3.5535963416099547, "ce_loss_13": 3.4911730885505676, "ce_loss_2": 4.057285642623901, "ce_loss_3": 3.882522702217102, "ce_loss_7": 3.603209447860718, "epoch": 0.444, "grad_norm": 450.0, "kl_loss_10": 96.56860618591308, "kl_loss_2": 1184.1321594238282, "kl_loss_3": 827.8955352783203, "kl_loss_7": 199.06893157958984, "learning_rate": 0.0005961831330156305, "loss": 569.2716, "step": 4440 }, { "ce_loss_10": 3.697277545928955, "ce_loss_13": 3.6338467955589295, "ce_loss_2": 4.1992070317268375, "ce_loss_3": 4.02395384311676, "ce_loss_7": 3.747213661670685, "epoch": 0.445, "grad_norm": 392.0, "kl_loss_10": 101.60056228637696, "kl_loss_2": 1189.420147705078, "kl_loss_3": 827.8122314453125, "kl_loss_7": 205.08227157592773, "learning_rate": 0.0005946256221802051, "loss": 584.411, "step": 4450 }, { "ce_loss_10": 3.679532468318939, "ce_loss_13": 3.6183473825454713, "ce_loss_2": 4.146489477157592, "ce_loss_3": 3.9755648136138917, "ce_loss_7": 3.7207812786102297, "epoch": 0.446, "grad_norm": 494.0, "kl_loss_10": 101.10317420959473, "kl_loss_2": 1119.8320098876952, "kl_loss_3": 779.770297241211, "kl_loss_7": 198.91878814697264, "learning_rate": 0.0005930671584658151, "loss": 578.7685, "step": 4460 }, { "ce_loss_10": 3.674864172935486, "ce_loss_13": 3.6118743062019347, "ce_loss_2": 4.166282546520233, "ce_loss_3": 3.9925308227539062, "ce_loss_7": 3.7198517322540283, "epoch": 0.447, "grad_norm": 364.0, "kl_loss_10": 100.75155410766601, "kl_loss_2": 1165.5830871582032, "kl_loss_3": 814.2670196533203, "kl_loss_7": 201.9087059020996, "learning_rate": 0.0005915077575661722, "loss": 579.8401, "step": 4470 }, { "ce_loss_10": 3.694182288646698, "ce_loss_13": 3.628465461730957, "ce_loss_2": 4.190526556968689, "ce_loss_3": 4.015213489532471, "ce_loss_7": 3.7417189121246337, "epoch": 0.448, "grad_norm": 520.0, "kl_loss_10": 105.40261840820312, "kl_loss_2": 1179.2632690429687, "kl_loss_3": 825.197119140625, "kl_loss_7": 209.67544021606446, "learning_rate": 0.000589947435184427, "loss": 569.8479, "step": 4480 }, { "ce_loss_10": 3.7602591633796694, "ce_loss_13": 3.6975467801094055, "ce_loss_2": 4.231885468959808, "ce_loss_3": 4.062859082221985, "ce_loss_7": 3.8065670251846315, "epoch": 0.449, "grad_norm": 406.0, "kl_loss_10": 104.7243579864502, "kl_loss_2": 1147.1027252197266, "kl_loss_3": 795.4058624267578, "kl_loss_7": 203.6425910949707, "learning_rate": 0.0005883862070330078, "loss": 568.9265, "step": 4490 }, { "ce_loss_10": 3.6874640941619874, "ce_loss_13": 3.6227025985717773, "ce_loss_2": 4.18091858625412, "ce_loss_3": 4.004498326778412, "ce_loss_7": 3.7389190554618836, "epoch": 0.45, "grad_norm": 342.0, "kl_loss_10": 102.03626098632813, "kl_loss_2": 1166.0193176269531, "kl_loss_3": 811.4805572509765, "kl_loss_7": 204.2785285949707, "learning_rate": 0.0005868240888334653, "loss": 567.3452, "step": 4500 }, { "ce_loss_10": 3.570815551280975, "ce_loss_13": 3.508398413658142, "ce_loss_2": 4.096131467819214, "ce_loss_3": 3.9093389391899107, "ce_loss_7": 3.625988078117371, "epoch": 0.451, "grad_norm": 616.0, "kl_loss_10": 100.9030990600586, "kl_loss_2": 1212.356463623047, "kl_loss_3": 839.7065948486328, "kl_loss_7": 207.68597564697265, "learning_rate": 0.0005852610963163119, "loss": 584.0681, "step": 4510 }, { "ce_loss_10": 3.5951132655143736, "ce_loss_13": 3.5340840578079225, "ce_loss_2": 4.088473439216614, "ce_loss_3": 3.9123128294944762, "ce_loss_7": 3.6418415188789366, "epoch": 0.452, "grad_norm": 440.0, "kl_loss_10": 97.94427604675293, "kl_loss_2": 1155.4515991210938, "kl_loss_3": 802.8143249511719, "kl_loss_7": 198.15041809082032, "learning_rate": 0.0005836972452208654, "loss": 560.779, "step": 4520 }, { "ce_loss_10": 3.6001816511154177, "ce_loss_13": 3.540806245803833, "ce_loss_2": 4.105304884910583, "ce_loss_3": 3.9283313751220703, "ce_loss_7": 3.6497029066085815, "epoch": 0.453, "grad_norm": 470.0, "kl_loss_10": 99.28575630187989, "kl_loss_2": 1176.1295288085937, "kl_loss_3": 817.2998046875, "kl_loss_7": 202.73690338134764, "learning_rate": 0.0005821325512950885, "loss": 572.314, "step": 4530 }, { "ce_loss_10": 3.629274320602417, "ce_loss_13": 3.5687419891357424, "ce_loss_2": 4.1162322640419005, "ce_loss_3": 3.9458845138549803, "ce_loss_7": 3.680540406703949, "epoch": 0.454, "grad_norm": 368.0, "kl_loss_10": 96.52360496520996, "kl_loss_2": 1136.2307861328125, "kl_loss_3": 790.6944702148437, "kl_loss_7": 197.31127700805663, "learning_rate": 0.0005805670302954321, "loss": 568.0196, "step": 4540 }, { "ce_loss_10": 3.6337098717689513, "ce_loss_13": 3.5753876209259032, "ce_loss_2": 4.115709042549133, "ce_loss_3": 3.9439353704452516, "ce_loss_7": 3.6809528470039368, "epoch": 0.455, "grad_norm": 434.0, "kl_loss_10": 95.89570465087891, "kl_loss_2": 1140.969873046875, "kl_loss_3": 792.410400390625, "kl_loss_7": 194.6849395751953, "learning_rate": 0.000579000697986675, "loss": 559.3398, "step": 4550 }, { "ce_loss_10": 3.5949880719184875, "ce_loss_13": 3.5312354803085326, "ce_loss_2": 4.110612523555756, "ce_loss_3": 3.9363887429237367, "ce_loss_7": 3.6481791853904726, "epoch": 0.456, "grad_norm": 398.0, "kl_loss_10": 102.14065132141113, "kl_loss_2": 1200.508935546875, "kl_loss_3": 844.4349182128906, "kl_loss_7": 207.93037872314454, "learning_rate": 0.0005774335701417662, "loss": 577.7247, "step": 4560 }, { "ce_loss_10": 3.578439974784851, "ce_loss_13": 3.5177830338478087, "ce_loss_2": 4.086728799343109, "ce_loss_3": 3.9092958092689516, "ce_loss_7": 3.628882908821106, "epoch": 0.457, "grad_norm": 438.0, "kl_loss_10": 98.15573539733887, "kl_loss_2": 1190.6679321289062, "kl_loss_3": 827.183969116211, "kl_loss_7": 201.49042510986328, "learning_rate": 0.0005758656625416658, "loss": 579.3393, "step": 4570 }, { "ce_loss_10": 3.6351425409317017, "ce_loss_13": 3.5740421295166014, "ce_loss_2": 4.13430563211441, "ce_loss_3": 3.9581828236579897, "ce_loss_7": 3.685711920261383, "epoch": 0.458, "grad_norm": 378.0, "kl_loss_10": 98.59328498840333, "kl_loss_2": 1165.538037109375, "kl_loss_3": 813.1740905761719, "kl_loss_7": 200.91252059936522, "learning_rate": 0.0005742969909751859, "loss": 562.4629, "step": 4580 }, { "ce_loss_10": 3.6438634276390074, "ce_loss_13": 3.5822227597236633, "ce_loss_2": 4.139957237243652, "ce_loss_3": 3.96221022605896, "ce_loss_7": 3.692858374118805, "epoch": 0.459, "grad_norm": 396.0, "kl_loss_10": 100.12554626464843, "kl_loss_2": 1167.3160705566406, "kl_loss_3": 805.8544036865235, "kl_loss_7": 201.26202087402345, "learning_rate": 0.0005727275712388318, "loss": 570.0833, "step": 4590 }, { "ce_loss_10": 3.681215536594391, "ce_loss_13": 3.620731198787689, "ce_loss_2": 4.155962944030762, "ce_loss_3": 3.984270441532135, "ce_loss_7": 3.7283701658248902, "epoch": 0.46, "grad_norm": 568.0, "kl_loss_10": 98.76027946472168, "kl_loss_2": 1132.1197998046875, "kl_loss_3": 792.0047241210938, "kl_loss_7": 197.17216033935546, "learning_rate": 0.0005711574191366427, "loss": 562.7997, "step": 4600 }, { "ce_loss_10": 3.6236431002616882, "ce_loss_13": 3.565703308582306, "ce_loss_2": 4.114531934261322, "ce_loss_3": 3.93969669342041, "ce_loss_7": 3.671102833747864, "epoch": 0.461, "grad_norm": 372.0, "kl_loss_10": 98.42190704345703, "kl_loss_2": 1170.4917938232422, "kl_loss_3": 808.7791198730469, "kl_loss_7": 199.0694892883301, "learning_rate": 0.0005695865504800327, "loss": 564.0159, "step": 4610 }, { "ce_loss_10": 3.562722647190094, "ce_loss_13": 3.500598740577698, "ce_loss_2": 4.109580218791962, "ce_loss_3": 3.9190361380577086, "ce_loss_7": 3.6191172361373902, "epoch": 0.462, "grad_norm": 480.0, "kl_loss_10": 100.51305274963379, "kl_loss_2": 1233.0393005371093, "kl_loss_3": 860.259619140625, "kl_loss_7": 208.89999542236328, "learning_rate": 0.0005680149810876322, "loss": 581.488, "step": 4620 }, { "ce_loss_10": 3.6198580145835875, "ce_loss_13": 3.5573437213897705, "ce_loss_2": 4.117598211765289, "ce_loss_3": 3.94056499004364, "ce_loss_7": 3.667776870727539, "epoch": 0.463, "grad_norm": 560.0, "kl_loss_10": 99.44257354736328, "kl_loss_2": 1160.7040802001952, "kl_loss_3": 809.362094116211, "kl_loss_7": 201.12859268188475, "learning_rate": 0.0005664427267851271, "loss": 565.3629, "step": 4630 }, { "ce_loss_10": 3.534971606731415, "ce_loss_13": 3.47266343832016, "ce_loss_2": 4.036073172092438, "ce_loss_3": 3.857685387134552, "ce_loss_7": 3.5870521306991576, "epoch": 0.464, "grad_norm": 498.0, "kl_loss_10": 97.52345237731933, "kl_loss_2": 1167.1843322753907, "kl_loss_3": 810.5214752197265, "kl_loss_7": 199.60354309082032, "learning_rate": 0.0005648698034051009, "loss": 562.6416, "step": 4640 }, { "ce_loss_10": 3.6570662021636964, "ce_loss_13": 3.594506525993347, "ce_loss_2": 4.158554673194885, "ce_loss_3": 3.980504941940308, "ce_loss_7": 3.7062572717666624, "epoch": 0.465, "grad_norm": 412.0, "kl_loss_10": 99.88166885375976, "kl_loss_2": 1173.9357055664063, "kl_loss_3": 818.5712066650391, "kl_loss_7": 200.30800857543946, "learning_rate": 0.0005632962267868747, "loss": 561.8186, "step": 4650 }, { "ce_loss_10": 3.5903021335601806, "ce_loss_13": 3.5294329643249513, "ce_loss_2": 4.08318532705307, "ce_loss_3": 3.9098427176475523, "ce_loss_7": 3.6388569593429567, "epoch": 0.466, "grad_norm": 464.0, "kl_loss_10": 95.17009468078614, "kl_loss_2": 1143.232162475586, "kl_loss_3": 798.761831665039, "kl_loss_7": 195.75977783203126, "learning_rate": 0.0005617220127763474, "loss": 567.0608, "step": 4660 }, { "ce_loss_10": 3.669221520423889, "ce_loss_13": 3.607930314540863, "ce_loss_2": 4.160642421245575, "ce_loss_3": 3.9847203373908995, "ce_loss_7": 3.717066395282745, "epoch": 0.467, "grad_norm": 412.0, "kl_loss_10": 98.76815719604492, "kl_loss_2": 1153.8832275390625, "kl_loss_3": 803.9543914794922, "kl_loss_7": 198.99397354125978, "learning_rate": 0.0005601471772258368, "loss": 567.3518, "step": 4670 }, { "ce_loss_10": 3.6542641162872314, "ce_loss_13": 3.593363094329834, "ce_loss_2": 4.133442676067352, "ce_loss_3": 3.96450389623642, "ce_loss_7": 3.7022117972373962, "epoch": 0.468, "grad_norm": 384.0, "kl_loss_10": 98.04742546081543, "kl_loss_2": 1118.5282470703125, "kl_loss_3": 784.399691772461, "kl_loss_7": 197.338858795166, "learning_rate": 0.0005585717359939192, "loss": 565.1176, "step": 4680 }, { "ce_loss_10": 3.56116144657135, "ce_loss_13": 3.4993683457374574, "ce_loss_2": 4.055442547798156, "ce_loss_3": 3.887247931957245, "ce_loss_7": 3.6099945425987245, "epoch": 0.469, "grad_norm": 490.0, "kl_loss_10": 97.45741577148438, "kl_loss_2": 1149.7481964111328, "kl_loss_3": 806.3391754150391, "kl_loss_7": 197.63161849975586, "learning_rate": 0.0005569957049452703, "loss": 571.714, "step": 4690 }, { "ce_loss_10": 3.6181132555007935, "ce_loss_13": 3.558199667930603, "ce_loss_2": 4.1229860305786135, "ce_loss_3": 3.9408149838447573, "ce_loss_7": 3.668530523777008, "epoch": 0.47, "grad_norm": 458.0, "kl_loss_10": 98.11741218566894, "kl_loss_2": 1179.65732421875, "kl_loss_3": 819.0914672851562, "kl_loss_7": 202.21502075195312, "learning_rate": 0.0005554190999505056, "loss": 572.5331, "step": 4700 }, { "ce_loss_10": 3.7477443337440492, "ce_loss_13": 3.6823888421058655, "ce_loss_2": 4.236353850364685, "ce_loss_3": 4.064246296882629, "ce_loss_7": 3.7983964323997497, "epoch": 0.471, "grad_norm": 376.0, "kl_loss_10": 101.09743614196778, "kl_loss_2": 1167.4985229492188, "kl_loss_3": 813.3948120117187, "kl_loss_7": 205.17110900878907, "learning_rate": 0.0005538419368860196, "loss": 552.1318, "step": 4710 }, { "ce_loss_10": 3.670793604850769, "ce_loss_13": 3.6081652998924256, "ce_loss_2": 4.154720652103424, "ce_loss_3": 3.986761474609375, "ce_loss_7": 3.7201395988464356, "epoch": 0.472, "grad_norm": 416.0, "kl_loss_10": 100.02058029174805, "kl_loss_2": 1152.6582946777344, "kl_loss_3": 806.7274566650391, "kl_loss_7": 202.40063400268554, "learning_rate": 0.0005522642316338268, "loss": 576.1212, "step": 4720 }, { "ce_loss_10": 3.673479509353638, "ce_loss_13": 3.613760471343994, "ce_loss_2": 4.150910186767578, "ce_loss_3": 3.981798696517944, "ce_loss_7": 3.721827840805054, "epoch": 0.473, "grad_norm": 478.0, "kl_loss_10": 99.9439712524414, "kl_loss_2": 1142.4451599121094, "kl_loss_3": 795.6325531005859, "kl_loss_7": 199.72487106323243, "learning_rate": 0.0005506860000814017, "loss": 573.0671, "step": 4730 }, { "ce_loss_10": 3.700618231296539, "ce_loss_13": 3.638905906677246, "ce_loss_2": 4.180734276771545, "ce_loss_3": 4.006302297115326, "ce_loss_7": 3.7447570085525514, "epoch": 0.474, "grad_norm": 372.0, "kl_loss_10": 99.73388938903808, "kl_loss_2": 1127.7213500976563, "kl_loss_3": 793.5628936767578, "kl_loss_7": 197.02488555908204, "learning_rate": 0.0005491072581215186, "loss": 565.0697, "step": 4740 }, { "ce_loss_10": 3.706625771522522, "ce_loss_13": 3.6401172399520876, "ce_loss_2": 4.184090709686279, "ce_loss_3": 4.019766807556152, "ce_loss_7": 3.754279363155365, "epoch": 0.475, "grad_norm": 516.0, "kl_loss_10": 103.58124504089355, "kl_loss_2": 1159.682275390625, "kl_loss_3": 813.5887573242187, "kl_loss_7": 204.05538330078124, "learning_rate": 0.0005475280216520913, "loss": 556.0086, "step": 4750 }, { "ce_loss_10": 3.617805337905884, "ce_loss_13": 3.5573843002319334, "ce_loss_2": 4.093091154098511, "ce_loss_3": 3.926499140262604, "ce_loss_7": 3.664002466201782, "epoch": 0.476, "grad_norm": 438.0, "kl_loss_10": 97.125687789917, "kl_loss_2": 1118.9559478759766, "kl_loss_3": 784.6352722167969, "kl_loss_7": 196.01404037475587, "learning_rate": 0.0005459483065760138, "loss": 565.9596, "step": 4760 }, { "ce_loss_10": 3.552186381816864, "ce_loss_13": 3.4902740478515626, "ce_loss_2": 4.07539484500885, "ce_loss_3": 3.891750192642212, "ce_loss_7": 3.601547920703888, "epoch": 0.477, "grad_norm": 584.0, "kl_loss_10": 97.89878273010254, "kl_loss_2": 1199.7971740722655, "kl_loss_3": 836.253662109375, "kl_loss_7": 197.98745880126953, "learning_rate": 0.0005443681288009991, "loss": 568.1693, "step": 4770 }, { "ce_loss_10": 3.6120885968208314, "ce_loss_13": 3.5525715351104736, "ce_loss_2": 4.106596338748932, "ce_loss_3": 3.932267451286316, "ce_loss_7": 3.6594039678573607, "epoch": 0.478, "grad_norm": 430.0, "kl_loss_10": 98.81552238464356, "kl_loss_2": 1169.4871887207032, "kl_loss_3": 816.0136047363281, "kl_loss_7": 198.91362609863282, "learning_rate": 0.0005427875042394199, "loss": 570.9199, "step": 4780 }, { "ce_loss_10": 3.6413972973823547, "ce_loss_13": 3.5771793842315676, "ce_loss_2": 4.133682417869568, "ce_loss_3": 3.9594278573989867, "ce_loss_7": 3.6885754466056824, "epoch": 0.479, "grad_norm": 396.0, "kl_loss_10": 102.98994331359863, "kl_loss_2": 1166.8763580322266, "kl_loss_3": 812.8268646240234, "kl_loss_7": 201.2046257019043, "learning_rate": 0.0005412064488081482, "loss": 576.3787, "step": 4790 }, { "ce_loss_10": 3.6483134269714355, "ce_loss_13": 3.5873068809509276, "ce_loss_2": 4.13967661857605, "ce_loss_3": 3.9646928787231444, "ce_loss_7": 3.697467315196991, "epoch": 0.48, "grad_norm": 370.0, "kl_loss_10": 99.1940761566162, "kl_loss_2": 1147.4876434326172, "kl_loss_3": 791.1785400390625, "kl_loss_7": 197.28219909667968, "learning_rate": 0.0005396249784283942, "loss": 558.8872, "step": 4800 }, { "ce_loss_10": 3.675038015842438, "ce_loss_13": 3.605392372608185, "ce_loss_2": 4.173114275932312, "ce_loss_3": 3.99793621301651, "ce_loss_7": 3.719290328025818, "epoch": 0.481, "grad_norm": 424.0, "kl_loss_10": 109.17574653625488, "kl_loss_2": 1186.1647857666017, "kl_loss_3": 827.1956359863282, "kl_loss_7": 205.77321548461913, "learning_rate": 0.0005380431090255476, "loss": 574.2385, "step": 4810 }, { "ce_loss_10": 3.6580063104629517, "ce_loss_13": 3.600323748588562, "ce_loss_2": 4.138371276855469, "ce_loss_3": 3.968156564235687, "ce_loss_7": 3.705365836620331, "epoch": 0.482, "grad_norm": 368.0, "kl_loss_10": 96.444384765625, "kl_loss_2": 1126.6424652099608, "kl_loss_3": 782.5597137451172, "kl_loss_7": 192.58789978027343, "learning_rate": 0.0005364608565290155, "loss": 556.892, "step": 4820 }, { "ce_loss_10": 3.66942412853241, "ce_loss_13": 3.6059840083122254, "ce_loss_2": 4.159801661968231, "ce_loss_3": 3.985584008693695, "ce_loss_7": 3.7178696751594544, "epoch": 0.483, "grad_norm": 528.0, "kl_loss_10": 101.1554500579834, "kl_loss_2": 1154.6878021240234, "kl_loss_3": 803.7564819335937, "kl_loss_7": 199.8929084777832, "learning_rate": 0.0005348782368720626, "loss": 563.005, "step": 4830 }, { "ce_loss_10": 3.596053886413574, "ce_loss_13": 3.5365728974342345, "ce_loss_2": 4.080682539939881, "ce_loss_3": 3.9060325980186463, "ce_loss_7": 3.6432487964630127, "epoch": 0.484, "grad_norm": 520.0, "kl_loss_10": 96.21514892578125, "kl_loss_2": 1134.5447143554688, "kl_loss_3": 787.3878204345704, "kl_loss_7": 194.21477661132812, "learning_rate": 0.000533295265991652, "loss": 564.2112, "step": 4840 }, { "ce_loss_10": 3.6783321022987367, "ce_loss_13": 3.6159629583358766, "ce_loss_2": 4.154437899589539, "ce_loss_3": 3.9877618312835694, "ce_loss_7": 3.727357840538025, "epoch": 0.485, "grad_norm": 434.0, "kl_loss_10": 97.2699405670166, "kl_loss_2": 1128.611801147461, "kl_loss_3": 786.6338958740234, "kl_loss_7": 195.64030685424805, "learning_rate": 0.0005317119598282822, "loss": 554.8634, "step": 4850 }, { "ce_loss_10": 3.6783334612846375, "ce_loss_13": 3.6158772826194765, "ce_loss_2": 4.161105620861053, "ce_loss_3": 3.9936763644218445, "ce_loss_7": 3.726669430732727, "epoch": 0.486, "grad_norm": 500.0, "kl_loss_10": 99.51188240051269, "kl_loss_2": 1139.204409790039, "kl_loss_3": 796.6284942626953, "kl_loss_7": 197.98922119140624, "learning_rate": 0.0005301283343258293, "loss": 559.5733, "step": 4860 }, { "ce_loss_10": 3.739852726459503, "ce_loss_13": 3.679302477836609, "ce_loss_2": 4.207214975357056, "ce_loss_3": 4.046137988567352, "ce_loss_7": 3.7877432465553285, "epoch": 0.487, "grad_norm": 434.0, "kl_loss_10": 98.4985725402832, "kl_loss_2": 1115.5814056396484, "kl_loss_3": 781.7784240722656, "kl_loss_7": 195.47981796264648, "learning_rate": 0.000528544405431384, "loss": 548.517, "step": 4870 }, { "ce_loss_10": 3.617240381240845, "ce_loss_13": 3.555454957485199, "ce_loss_2": 4.122074174880981, "ce_loss_3": 3.944024980068207, "ce_loss_7": 3.668628621101379, "epoch": 0.488, "grad_norm": 432.0, "kl_loss_10": 98.9582015991211, "kl_loss_2": 1175.8768676757813, "kl_loss_3": 814.3092010498046, "kl_loss_7": 202.09591979980468, "learning_rate": 0.000526960189095093, "loss": 569.4682, "step": 4880 }, { "ce_loss_10": 3.5905461430549623, "ce_loss_13": 3.5317755937576294, "ce_loss_2": 4.075044083595276, "ce_loss_3": 3.9047257542610168, "ce_loss_7": 3.637452006340027, "epoch": 0.489, "grad_norm": 406.0, "kl_loss_10": 95.30788230895996, "kl_loss_2": 1125.9373596191406, "kl_loss_3": 783.2284423828125, "kl_loss_7": 192.63981170654296, "learning_rate": 0.0005253757012699972, "loss": 553.6164, "step": 4890 }, { "ce_loss_10": 3.680708420276642, "ce_loss_13": 3.621255648136139, "ce_loss_2": 4.161669278144837, "ce_loss_3": 3.9898939728736877, "ce_loss_7": 3.726882266998291, "epoch": 0.49, "grad_norm": 436.0, "kl_loss_10": 98.59705772399903, "kl_loss_2": 1136.0980651855468, "kl_loss_3": 790.2571563720703, "kl_loss_7": 197.4443115234375, "learning_rate": 0.0005237909579118712, "loss": 568.0026, "step": 4900 }, { "ce_loss_10": 3.6435038447380066, "ce_loss_13": 3.581137490272522, "ce_loss_2": 4.134112453460693, "ce_loss_3": 3.9640262126922607, "ce_loss_7": 3.6911675453186037, "epoch": 0.491, "grad_norm": 520.0, "kl_loss_10": 99.66703796386719, "kl_loss_2": 1167.6467651367188, "kl_loss_3": 818.6171966552735, "kl_loss_7": 200.65354614257814, "learning_rate": 0.0005222059749790631, "loss": 568.3183, "step": 4910 }, { "ce_loss_10": 3.7152050852775576, "ce_loss_13": 3.652708613872528, "ce_loss_2": 4.176082861423493, "ce_loss_3": 4.013521981239319, "ce_loss_7": 3.759286069869995, "epoch": 0.492, "grad_norm": 394.0, "kl_loss_10": 100.0508934020996, "kl_loss_2": 1112.6296081542969, "kl_loss_3": 774.64658203125, "kl_loss_7": 196.3288688659668, "learning_rate": 0.0005206207684323337, "loss": 544.9011, "step": 4920 }, { "ce_loss_10": 3.689722108840942, "ce_loss_13": 3.6289564847946165, "ce_loss_2": 4.170908105373383, "ce_loss_3": 3.9987404584884643, "ce_loss_7": 3.7391751527786257, "epoch": 0.493, "grad_norm": 368.0, "kl_loss_10": 100.77400093078613, "kl_loss_2": 1140.2743774414062, "kl_loss_3": 795.368798828125, "kl_loss_7": 200.2589553833008, "learning_rate": 0.000519035354234695, "loss": 567.6383, "step": 4930 }, { "ce_loss_10": 3.666009783744812, "ce_loss_13": 3.603765845298767, "ce_loss_2": 4.156714332103729, "ce_loss_3": 3.9840614438056945, "ce_loss_7": 3.7159415602684023, "epoch": 0.494, "grad_norm": 516.0, "kl_loss_10": 99.73322830200195, "kl_loss_2": 1144.6152709960938, "kl_loss_3": 797.5058837890625, "kl_loss_7": 199.84856643676758, "learning_rate": 0.0005174497483512506, "loss": 551.5833, "step": 4940 }, { "ce_loss_10": 3.715251398086548, "ce_loss_13": 3.6532492995262147, "ce_loss_2": 4.190750586986542, "ce_loss_3": 4.017711067199707, "ce_loss_7": 3.760482394695282, "epoch": 0.495, "grad_norm": 404.0, "kl_loss_10": 99.74794273376465, "kl_loss_2": 1135.6743072509767, "kl_loss_3": 788.5320007324219, "kl_loss_7": 197.0201416015625, "learning_rate": 0.0005158639667490339, "loss": 559.5508, "step": 4950 }, { "ce_loss_10": 3.60677056312561, "ce_loss_13": 3.545226526260376, "ce_loss_2": 4.091673123836517, "ce_loss_3": 3.921009349822998, "ce_loss_7": 3.6560636878013613, "epoch": 0.496, "grad_norm": 380.0, "kl_loss_10": 97.61143035888672, "kl_loss_2": 1146.4500457763672, "kl_loss_3": 801.032958984375, "kl_loss_7": 198.76946029663085, "learning_rate": 0.0005142780253968481, "loss": 559.3498, "step": 4960 }, { "ce_loss_10": 3.558833396434784, "ce_loss_13": 3.498770594596863, "ce_loss_2": 4.029934275150299, "ce_loss_3": 3.8623911499977113, "ce_loss_7": 3.605703389644623, "epoch": 0.497, "grad_norm": 404.0, "kl_loss_10": 95.15658073425293, "kl_loss_2": 1120.565899658203, "kl_loss_3": 776.6140930175782, "kl_loss_7": 192.40693054199218, "learning_rate": 0.0005126919402651053, "loss": 541.1446, "step": 4970 }, { "ce_loss_10": 3.6243564009666445, "ce_loss_13": 3.562463808059692, "ce_loss_2": 4.122486090660095, "ce_loss_3": 3.9518114924430847, "ce_loss_7": 3.6740434527397157, "epoch": 0.498, "grad_norm": 500.0, "kl_loss_10": 98.81732482910157, "kl_loss_2": 1158.3788116455078, "kl_loss_3": 805.2687072753906, "kl_loss_7": 198.79998626708985, "learning_rate": 0.0005111057273256647, "loss": 562.34, "step": 4980 }, { "ce_loss_10": 3.736222839355469, "ce_loss_13": 3.676733374595642, "ce_loss_2": 4.189973556995392, "ce_loss_3": 4.022429513931274, "ce_loss_7": 3.7769731283187866, "epoch": 0.499, "grad_norm": 396.0, "kl_loss_10": 98.13356437683106, "kl_loss_2": 1078.4886474609375, "kl_loss_3": 748.3916412353516, "kl_loss_7": 191.23028793334962, "learning_rate": 0.0005095194025516733, "loss": 536.8887, "step": 4990 }, { "ce_loss_10": 3.6507428646087647, "ce_loss_13": 3.592644715309143, "ce_loss_2": 4.122073376178742, "ce_loss_3": 3.9521225333213805, "ce_loss_7": 3.697298324108124, "epoch": 0.5, "grad_norm": 378.0, "kl_loss_10": 95.96725730895996, "kl_loss_2": 1110.4840362548828, "kl_loss_3": 769.709603881836, "kl_loss_7": 192.08199310302734, "learning_rate": 0.000507932981917404, "loss": 562.5593, "step": 5000 }, { "ce_loss_10": 3.609897780418396, "ce_loss_13": 3.5468419432640075, "ce_loss_2": 4.115197873115539, "ce_loss_3": 3.9347579956054686, "ce_loss_7": 3.6594788432121277, "epoch": 0.501, "grad_norm": 496.0, "kl_loss_10": 102.02307662963867, "kl_loss_2": 1185.6702362060546, "kl_loss_3": 822.8478820800781, "kl_loss_7": 202.77078170776366, "learning_rate": 0.0005063464813980949, "loss": 576.005, "step": 5010 }, { "ce_loss_10": 3.595167326927185, "ce_loss_13": 3.534419858455658, "ce_loss_2": 4.08291003704071, "ce_loss_3": 3.910551607608795, "ce_loss_7": 3.6416044354438784, "epoch": 0.502, "grad_norm": 366.0, "kl_loss_10": 98.82206382751465, "kl_loss_2": 1157.9163513183594, "kl_loss_3": 802.2986022949219, "kl_loss_7": 196.4967498779297, "learning_rate": 0.0005047599169697884, "loss": 557.0335, "step": 5020 }, { "ce_loss_10": 3.5276883602142335, "ce_loss_13": 3.469167137145996, "ce_loss_2": 4.028472435474396, "ce_loss_3": 3.8497302412986754, "ce_loss_7": 3.5778237104415895, "epoch": 0.503, "grad_norm": 544.0, "kl_loss_10": 95.17037048339844, "kl_loss_2": 1142.5230926513673, "kl_loss_3": 789.8021270751954, "kl_loss_7": 195.37155456542968, "learning_rate": 0.000503173304609171, "loss": 545.4258, "step": 5030 }, { "ce_loss_10": 3.6576398611068726, "ce_loss_13": 3.5950983643531798, "ce_loss_2": 4.14467431306839, "ce_loss_3": 3.9757012486457826, "ce_loss_7": 3.7055052399635313, "epoch": 0.504, "grad_norm": 482.0, "kl_loss_10": 98.4008186340332, "kl_loss_2": 1135.7276794433594, "kl_loss_3": 789.9247985839844, "kl_loss_7": 196.23304824829103, "learning_rate": 0.0005015866602934111, "loss": 552.1605, "step": 5040 }, { "ce_loss_10": 3.621449387073517, "ce_loss_13": 3.5583016514778136, "ce_loss_2": 4.125820016860962, "ce_loss_3": 3.9470208525657653, "ce_loss_7": 3.6696552276611327, "epoch": 0.505, "grad_norm": 386.0, "kl_loss_10": 101.05188751220703, "kl_loss_2": 1170.8730712890624, "kl_loss_3": 822.174462890625, "kl_loss_7": 203.6134246826172, "learning_rate": 0.0005, "loss": 564.1666, "step": 5050 }, { "ce_loss_10": 3.608661472797394, "ce_loss_13": 3.549720525741577, "ce_loss_2": 4.094336903095245, "ce_loss_3": 3.921399199962616, "ce_loss_7": 3.6561817049980165, "epoch": 0.506, "grad_norm": 532.0, "kl_loss_10": 97.96763725280762, "kl_loss_2": 1147.6109741210937, "kl_loss_3": 799.4344543457031, "kl_loss_7": 197.70511016845703, "learning_rate": 0.0004984133397065889, "loss": 551.9219, "step": 5060 }, { "ce_loss_10": 3.619631803035736, "ce_loss_13": 3.5591482758522033, "ce_loss_2": 4.1191855549812315, "ce_loss_3": 3.947730815410614, "ce_loss_7": 3.671154284477234, "epoch": 0.507, "grad_norm": 420.0, "kl_loss_10": 98.14169616699219, "kl_loss_2": 1152.0039337158203, "kl_loss_3": 803.1968292236328, "kl_loss_7": 198.87692565917968, "learning_rate": 0.0004968266953908291, "loss": 554.0305, "step": 5070 }, { "ce_loss_10": 3.6628435134887694, "ce_loss_13": 3.6024859309196473, "ce_loss_2": 4.145783054828644, "ce_loss_3": 3.972540259361267, "ce_loss_7": 3.7080691695213317, "epoch": 0.508, "grad_norm": 532.0, "kl_loss_10": 98.82306175231933, "kl_loss_2": 1137.6268676757813, "kl_loss_3": 795.5397338867188, "kl_loss_7": 194.52870864868163, "learning_rate": 0.0004952400830302117, "loss": 554.9051, "step": 5080 }, { "ce_loss_10": 3.585409712791443, "ce_loss_13": 3.525643265247345, "ce_loss_2": 4.091677510738373, "ce_loss_3": 3.9131953358650207, "ce_loss_7": 3.6364392280578612, "epoch": 0.509, "grad_norm": 412.0, "kl_loss_10": 98.62568626403808, "kl_loss_2": 1168.942919921875, "kl_loss_3": 811.3192687988281, "kl_loss_7": 199.42913665771485, "learning_rate": 0.0004936535186019053, "loss": 559.6511, "step": 5090 }, { "ce_loss_10": 3.6907896161079408, "ce_loss_13": 3.62961208820343, "ce_loss_2": 4.153078198432922, "ce_loss_3": 3.9874324560165406, "ce_loss_7": 3.735322892665863, "epoch": 0.51, "grad_norm": 376.0, "kl_loss_10": 97.42878112792968, "kl_loss_2": 1101.246890258789, "kl_loss_3": 771.5801239013672, "kl_loss_7": 192.14101791381836, "learning_rate": 0.000492067018082596, "loss": 549.3435, "step": 5100 }, { "ce_loss_10": 3.6234113693237306, "ce_loss_13": 3.55826051235199, "ce_loss_2": 4.134788942337036, "ce_loss_3": 3.9512638211250306, "ce_loss_7": 3.673302376270294, "epoch": 0.511, "grad_norm": 358.0, "kl_loss_10": 100.71795692443848, "kl_loss_2": 1184.7957580566406, "kl_loss_3": 822.3129302978516, "kl_loss_7": 201.37216567993164, "learning_rate": 0.0004904805974483267, "loss": 578.112, "step": 5110 }, { "ce_loss_10": 3.73909273147583, "ce_loss_13": 3.6729060292243956, "ce_loss_2": 4.232535266876221, "ce_loss_3": 4.064092624187469, "ce_loss_7": 3.78980005979538, "epoch": 0.512, "grad_norm": 418.0, "kl_loss_10": 103.6674789428711, "kl_loss_2": 1170.0532684326172, "kl_loss_3": 824.2820373535156, "kl_loss_7": 206.52156982421874, "learning_rate": 0.0004888942726743353, "loss": 580.3403, "step": 5120 }, { "ce_loss_10": 3.6079283952713013, "ce_loss_13": 3.5456172823905945, "ce_loss_2": 4.103336191177368, "ce_loss_3": 3.9267752170562744, "ce_loss_7": 3.655103015899658, "epoch": 0.513, "grad_norm": 378.0, "kl_loss_10": 97.65564994812011, "kl_loss_2": 1156.2654846191406, "kl_loss_3": 800.4834381103516, "kl_loss_7": 198.76654281616212, "learning_rate": 0.0004873080597348947, "loss": 561.8108, "step": 5130 }, { "ce_loss_10": 3.492985022068024, "ce_loss_13": 3.433611583709717, "ce_loss_2": 4.009678089618683, "ce_loss_3": 3.82467257976532, "ce_loss_7": 3.543225371837616, "epoch": 0.514, "grad_norm": 440.0, "kl_loss_10": 96.83905181884765, "kl_loss_2": 1194.322329711914, "kl_loss_3": 828.9491943359375, "kl_loss_7": 198.22924575805663, "learning_rate": 0.0004857219746031519, "loss": 567.8251, "step": 5140 }, { "ce_loss_10": 3.6722797036170958, "ce_loss_13": 3.6109776854515077, "ce_loss_2": 4.149738478660583, "ce_loss_3": 3.975986909866333, "ce_loss_7": 3.7163102626800537, "epoch": 0.515, "grad_norm": 430.0, "kl_loss_10": 99.9472442626953, "kl_loss_2": 1140.7201843261719, "kl_loss_3": 787.3806091308594, "kl_loss_7": 197.54812469482422, "learning_rate": 0.0004841360332509663, "loss": 556.8349, "step": 5150 }, { "ce_loss_10": 3.6183668613433837, "ce_loss_13": 3.5591975688934325, "ce_loss_2": 4.100240254402161, "ce_loss_3": 3.9269237518310547, "ce_loss_7": 3.6642425417900086, "epoch": 0.516, "grad_norm": 366.0, "kl_loss_10": 93.92010688781738, "kl_loss_2": 1122.7465362548828, "kl_loss_3": 778.0984069824219, "kl_loss_7": 191.03939056396484, "learning_rate": 0.0004825502516487497, "loss": 537.9487, "step": 5160 }, { "ce_loss_10": 3.5835310339927675, "ce_loss_13": 3.523791456222534, "ce_loss_2": 4.082003366947174, "ce_loss_3": 3.908873450756073, "ce_loss_7": 3.634874391555786, "epoch": 0.517, "grad_norm": 608.0, "kl_loss_10": 99.05728721618652, "kl_loss_2": 1155.0127502441405, "kl_loss_3": 805.5277587890625, "kl_loss_7": 198.6641098022461, "learning_rate": 0.00048096464576530507, "loss": 561.8511, "step": 5170 }, { "ce_loss_10": 3.6886157989501953, "ce_loss_13": 3.628003740310669, "ce_loss_2": 4.146280741691589, "ce_loss_3": 3.9846285343170167, "ce_loss_7": 3.731534945964813, "epoch": 0.518, "grad_norm": 390.0, "kl_loss_10": 98.92878913879395, "kl_loss_2": 1103.851336669922, "kl_loss_3": 767.5214813232421, "kl_loss_7": 193.13973236083984, "learning_rate": 0.00047937923156766646, "loss": 544.8563, "step": 5180 }, { "ce_loss_10": 3.737223446369171, "ce_loss_13": 3.6758363366127016, "ce_loss_2": 4.200218558311462, "ce_loss_3": 4.037039196491241, "ce_loss_7": 3.7829922437667847, "epoch": 0.519, "grad_norm": 428.0, "kl_loss_10": 102.72743797302246, "kl_loss_2": 1108.4752288818358, "kl_loss_3": 772.8697265625, "kl_loss_7": 198.5632797241211, "learning_rate": 0.00047779402502093696, "loss": 549.91, "step": 5190 }, { "ce_loss_10": 3.703013610839844, "ce_loss_13": 3.640911114215851, "ce_loss_2": 4.174945414066315, "ce_loss_3": 4.009368169307709, "ce_loss_7": 3.7497113823890684, "epoch": 0.52, "grad_norm": 478.0, "kl_loss_10": 99.68995170593261, "kl_loss_2": 1110.2117858886718, "kl_loss_3": 777.3010894775391, "kl_loss_7": 196.47792434692383, "learning_rate": 0.0004762090420881289, "loss": 553.7422, "step": 5200 }, { "ce_loss_10": 3.6182032585144044, "ce_loss_13": 3.5570725202560425, "ce_loss_2": 4.098654413223267, "ce_loss_3": 3.916290044784546, "ce_loss_7": 3.665347421169281, "epoch": 0.521, "grad_norm": 426.0, "kl_loss_10": 98.28518867492676, "kl_loss_2": 1126.3521606445313, "kl_loss_3": 772.9946044921875, "kl_loss_7": 193.74108428955077, "learning_rate": 0.00047462429873000296, "loss": 544.104, "step": 5210 }, { "ce_loss_10": 3.7033097624778746, "ce_loss_13": 3.6430840730667113, "ce_loss_2": 4.168367850780487, "ce_loss_3": 3.9993362069129943, "ce_loss_7": 3.74978985786438, "epoch": 0.522, "grad_norm": 412.0, "kl_loss_10": 98.88156356811524, "kl_loss_2": 1115.6398986816407, "kl_loss_3": 774.026156616211, "kl_loss_7": 195.32233123779298, "learning_rate": 0.0004730398109049071, "loss": 547.7821, "step": 5220 }, { "ce_loss_10": 3.633508253097534, "ce_loss_13": 3.5716773152351378, "ce_loss_2": 4.128389453887939, "ce_loss_3": 3.9533074378967283, "ce_loss_7": 3.6823344349861147, "epoch": 0.523, "grad_norm": 396.0, "kl_loss_10": 98.93126792907715, "kl_loss_2": 1163.846746826172, "kl_loss_3": 810.2734771728516, "kl_loss_7": 200.85460052490234, "learning_rate": 0.000471455594568616, "loss": 558.1328, "step": 5230 }, { "ce_loss_10": 3.707250881195068, "ce_loss_13": 3.6447718501091004, "ce_loss_2": 4.174321246147156, "ce_loss_3": 4.004381275177002, "ce_loss_7": 3.753636956214905, "epoch": 0.524, "grad_norm": 394.0, "kl_loss_10": 100.72676544189453, "kl_loss_2": 1114.3457427978515, "kl_loss_3": 768.1556701660156, "kl_loss_7": 195.28340759277344, "learning_rate": 0.00046987166567417086, "loss": 552.4388, "step": 5240 }, { "ce_loss_10": 3.6187984108924867, "ce_loss_13": 3.5605034112930296, "ce_loss_2": 4.1001020789146425, "ce_loss_3": 3.9256922364234925, "ce_loss_7": 3.664110267162323, "epoch": 0.525, "grad_norm": 380.0, "kl_loss_10": 95.83710632324218, "kl_loss_2": 1120.3159301757812, "kl_loss_3": 775.0560852050781, "kl_loss_7": 192.1679656982422, "learning_rate": 0.00046828804017171776, "loss": 536.3316, "step": 5250 }, { "ce_loss_10": 3.6720359563827514, "ce_loss_13": 3.6088499784469605, "ce_loss_2": 4.162907612323761, "ce_loss_3": 3.9896105885505677, "ce_loss_7": 3.722712779045105, "epoch": 0.526, "grad_norm": 394.0, "kl_loss_10": 98.17714996337891, "kl_loss_2": 1138.502374267578, "kl_loss_3": 789.8116973876953, "kl_loss_7": 197.40582656860352, "learning_rate": 0.00046670473400834805, "loss": 559.8189, "step": 5260 }, { "ce_loss_10": 3.597737526893616, "ce_loss_13": 3.5393651485443116, "ce_loss_2": 4.074982023239135, "ce_loss_3": 3.9021154403686524, "ce_loss_7": 3.644618010520935, "epoch": 0.527, "grad_norm": 436.0, "kl_loss_10": 95.52880744934082, "kl_loss_2": 1111.367953491211, "kl_loss_3": 768.6636322021484, "kl_loss_7": 191.67658157348632, "learning_rate": 0.00046512176312793734, "loss": 559.1187, "step": 5270 }, { "ce_loss_10": 3.5923956394195558, "ce_loss_13": 3.5312567353248596, "ce_loss_2": 4.0659032464027405, "ce_loss_3": 3.9041757225990295, "ce_loss_7": 3.638344919681549, "epoch": 0.528, "grad_norm": 382.0, "kl_loss_10": 95.8816967010498, "kl_loss_2": 1131.7323181152344, "kl_loss_3": 788.8408813476562, "kl_loss_7": 193.95931167602538, "learning_rate": 0.00046353914347098467, "loss": 557.7083, "step": 5280 }, { "ce_loss_10": 3.688094747066498, "ce_loss_13": 3.626521134376526, "ce_loss_2": 4.17344571352005, "ce_loss_3": 3.9936492323875425, "ce_loss_7": 3.7344411969184876, "epoch": 0.529, "grad_norm": 438.0, "kl_loss_10": 99.97393112182617, "kl_loss_2": 1136.7248291015626, "kl_loss_3": 780.4773040771485, "kl_loss_7": 194.1311233520508, "learning_rate": 0.0004619568909744524, "loss": 554.6544, "step": 5290 }, { "ce_loss_10": 3.6992242336273193, "ce_loss_13": 3.6374841570854186, "ce_loss_2": 4.173903214931488, "ce_loss_3": 4.004681324958801, "ce_loss_7": 3.7441007494926453, "epoch": 0.53, "grad_norm": 496.0, "kl_loss_10": 100.66301612854004, "kl_loss_2": 1118.1583740234375, "kl_loss_3": 778.1599609375, "kl_loss_7": 195.17978057861328, "learning_rate": 0.00046037502157160573, "loss": 555.7068, "step": 5300 }, { "ce_loss_10": 3.5648537158966063, "ce_loss_13": 3.50801477432251, "ce_loss_2": 4.0505608201026915, "ce_loss_3": 3.885770845413208, "ce_loss_7": 3.614854156970978, "epoch": 0.531, "grad_norm": 392.0, "kl_loss_10": 95.29824142456054, "kl_loss_2": 1148.0569580078125, "kl_loss_3": 803.5360778808594, "kl_loss_7": 195.23088302612305, "learning_rate": 0.00045879355119185207, "loss": 559.6594, "step": 5310 }, { "ce_loss_10": 3.6439425349235535, "ce_loss_13": 3.583683359622955, "ce_loss_2": 4.135701584815979, "ce_loss_3": 3.9598298192024233, "ce_loss_7": 3.692049765586853, "epoch": 0.532, "grad_norm": 444.0, "kl_loss_10": 97.83190078735352, "kl_loss_2": 1160.7438171386718, "kl_loss_3": 807.647915649414, "kl_loss_7": 199.49599685668946, "learning_rate": 0.0004572124957605803, "loss": 565.4321, "step": 5320 }, { "ce_loss_10": 3.6681848645210264, "ce_loss_13": 3.607477676868439, "ce_loss_2": 4.14128270149231, "ce_loss_3": 3.9746485590934753, "ce_loss_7": 3.7138744235038756, "epoch": 0.533, "grad_norm": 340.0, "kl_loss_10": 95.41666564941406, "kl_loss_2": 1136.1244140625, "kl_loss_3": 793.3468963623047, "kl_loss_7": 195.33221740722655, "learning_rate": 0.00045563187119900103, "loss": 550.4382, "step": 5330 }, { "ce_loss_10": 3.5087064266204835, "ce_loss_13": 3.4494638442993164, "ce_loss_2": 4.00373204946518, "ce_loss_3": 3.8344790935516357, "ce_loss_7": 3.5566913962364195, "epoch": 0.534, "grad_norm": 456.0, "kl_loss_10": 96.30420112609863, "kl_loss_2": 1145.2862731933594, "kl_loss_3": 803.7556610107422, "kl_loss_7": 194.92612915039064, "learning_rate": 0.00045405169342398633, "loss": 560.8537, "step": 5340 }, { "ce_loss_10": 3.5990882992744444, "ce_loss_13": 3.535432243347168, "ce_loss_2": 4.08842386007309, "ce_loss_3": 3.912948155403137, "ce_loss_7": 3.6465937376022337, "epoch": 0.535, "grad_norm": 422.0, "kl_loss_10": 99.51773872375489, "kl_loss_2": 1142.4013549804688, "kl_loss_3": 795.5528442382813, "kl_loss_7": 196.72316284179686, "learning_rate": 0.0004524719783479088, "loss": 548.8232, "step": 5350 }, { "ce_loss_10": 3.552276241779327, "ce_loss_13": 3.492251825332642, "ce_loss_2": 4.056445682048798, "ce_loss_3": 3.8783608794212343, "ce_loss_7": 3.603780543804169, "epoch": 0.536, "grad_norm": 376.0, "kl_loss_10": 97.24302253723144, "kl_loss_2": 1164.848809814453, "kl_loss_3": 811.2062194824218, "kl_loss_7": 198.37730560302734, "learning_rate": 0.00045089274187848144, "loss": 554.2202, "step": 5360 }, { "ce_loss_10": 3.6724863052368164, "ce_loss_13": 3.6130531072616576, "ce_loss_2": 4.1379453301429745, "ce_loss_3": 3.968498194217682, "ce_loss_7": 3.717296040058136, "epoch": 0.537, "grad_norm": 536.0, "kl_loss_10": 96.28798866271973, "kl_loss_2": 1108.8939270019532, "kl_loss_3": 770.5279510498046, "kl_loss_7": 192.69188079833984, "learning_rate": 0.00044931399991859835, "loss": 545.4216, "step": 5370 }, { "ce_loss_10": 3.5360588788986207, "ce_loss_13": 3.474487328529358, "ce_loss_2": 4.018628227710724, "ce_loss_3": 3.8429470539093016, "ce_loss_7": 3.5856809496879576, "epoch": 0.538, "grad_norm": 446.0, "kl_loss_10": 97.58423805236816, "kl_loss_2": 1139.092123413086, "kl_loss_3": 788.7141876220703, "kl_loss_7": 196.66349868774415, "learning_rate": 0.00044773576836617336, "loss": 546.6951, "step": 5380 }, { "ce_loss_10": 3.6238678693771362, "ce_loss_13": 3.5626631021499633, "ce_loss_2": 4.120850419998169, "ce_loss_3": 3.943516790866852, "ce_loss_7": 3.6712807416915894, "epoch": 0.539, "grad_norm": 388.0, "kl_loss_10": 99.70593795776367, "kl_loss_2": 1163.2907775878907, "kl_loss_3": 810.1283508300781, "kl_loss_7": 199.7040023803711, "learning_rate": 0.00044615806311398056, "loss": 569.078, "step": 5390 }, { "ce_loss_10": 3.706363093852997, "ce_loss_13": 3.6457801342010496, "ce_loss_2": 4.146688032150268, "ce_loss_3": 3.9897242546081544, "ce_loss_7": 3.7506498098373413, "epoch": 0.54, "grad_norm": 318.0, "kl_loss_10": 98.56370239257812, "kl_loss_2": 1084.4558197021483, "kl_loss_3": 756.0719848632813, "kl_loss_7": 191.6246208190918, "learning_rate": 0.00044458090004949454, "loss": 551.6847, "step": 5400 }, { "ce_loss_10": 3.5594072341918945, "ce_loss_13": 3.4980836510658264, "ce_loss_2": 4.072906112670898, "ce_loss_3": 3.8963231086730956, "ce_loss_7": 3.6096426606178285, "epoch": 0.541, "grad_norm": 490.0, "kl_loss_10": 98.93370399475097, "kl_loss_2": 1204.406317138672, "kl_loss_3": 841.533901977539, "kl_loss_7": 202.28990631103517, "learning_rate": 0.0004430042950547297, "loss": 563.3182, "step": 5410 }, { "ce_loss_10": 3.656948244571686, "ce_loss_13": 3.5917163252830506, "ce_loss_2": 4.146977603435516, "ce_loss_3": 3.9775506377220156, "ce_loss_7": 3.7048157334327696, "epoch": 0.542, "grad_norm": 472.0, "kl_loss_10": 100.26595115661621, "kl_loss_2": 1150.8060424804687, "kl_loss_3": 803.8866760253907, "kl_loss_7": 200.08724365234374, "learning_rate": 0.0004414282640060809, "loss": 559.1381, "step": 5420 }, { "ce_loss_10": 3.7556936740875244, "ce_loss_13": 3.690820097923279, "ce_loss_2": 4.2162927985191345, "ce_loss_3": 4.059760391712189, "ce_loss_7": 3.7993207812309264, "epoch": 0.543, "grad_norm": 466.0, "kl_loss_10": 100.5603858947754, "kl_loss_2": 1102.3566284179688, "kl_loss_3": 774.5157104492188, "kl_loss_7": 196.8573425292969, "learning_rate": 0.0004398528227741633, "loss": 566.5525, "step": 5430 }, { "ce_loss_10": 3.6126871943473815, "ce_loss_13": 3.553126609325409, "ce_loss_2": 4.1005645275115965, "ce_loss_3": 3.9280160546302794, "ce_loss_7": 3.660943078994751, "epoch": 0.544, "grad_norm": 458.0, "kl_loss_10": 97.1538932800293, "kl_loss_2": 1131.997964477539, "kl_loss_3": 791.6496276855469, "kl_loss_7": 198.33607559204103, "learning_rate": 0.00043827798722365264, "loss": 560.7217, "step": 5440 }, { "ce_loss_10": 3.744398605823517, "ce_loss_13": 3.681015205383301, "ce_loss_2": 4.201505517959594, "ce_loss_3": 4.03549770116806, "ce_loss_7": 3.788591706752777, "epoch": 0.545, "grad_norm": 352.0, "kl_loss_10": 99.98037643432617, "kl_loss_2": 1095.4162628173829, "kl_loss_3": 762.5562530517578, "kl_loss_7": 196.50249557495118, "learning_rate": 0.00043670377321312535, "loss": 539.1079, "step": 5450 }, { "ce_loss_10": 3.7459957599639893, "ce_loss_13": 3.6846879959106444, "ce_loss_2": 4.2025530457496645, "ce_loss_3": 4.042814528942108, "ce_loss_7": 3.789257228374481, "epoch": 0.546, "grad_norm": 346.0, "kl_loss_10": 99.90774993896484, "kl_loss_2": 1095.3400299072266, "kl_loss_3": 761.9524017333985, "kl_loss_7": 193.25130310058594, "learning_rate": 0.0004351301965948991, "loss": 550.9912, "step": 5460 }, { "ce_loss_10": 3.6544747233390806, "ce_loss_13": 3.5925102829933167, "ce_loss_2": 4.1156612753868105, "ce_loss_3": 3.9492591619491577, "ce_loss_7": 3.700915348529816, "epoch": 0.547, "grad_norm": 446.0, "kl_loss_10": 99.69101219177246, "kl_loss_2": 1097.9489288330078, "kl_loss_3": 763.9795166015625, "kl_loss_7": 193.2705093383789, "learning_rate": 0.000433557273214873, "loss": 548.6603, "step": 5470 }, { "ce_loss_10": 3.6407829880714417, "ce_loss_13": 3.58055636882782, "ce_loss_2": 4.112010169029236, "ce_loss_3": 3.9410730838775634, "ce_loss_7": 3.6900732636451723, "epoch": 0.548, "grad_norm": 364.0, "kl_loss_10": 96.30272674560547, "kl_loss_2": 1104.9110717773438, "kl_loss_3": 764.0930358886719, "kl_loss_7": 193.28277206420898, "learning_rate": 0.000431985018912368, "loss": 539.9292, "step": 5480 }, { "ce_loss_10": 3.6089709639549254, "ce_loss_13": 3.5466750621795655, "ce_loss_2": 4.105600357055664, "ce_loss_3": 3.9258901715278625, "ce_loss_7": 3.658845567703247, "epoch": 0.549, "grad_norm": 428.0, "kl_loss_10": 98.85242919921875, "kl_loss_2": 1163.1305419921875, "kl_loss_3": 809.7076019287109, "kl_loss_7": 198.85261154174805, "learning_rate": 0.0004304134495199674, "loss": 550.7034, "step": 5490 }, { "ce_loss_10": 3.638536274433136, "ce_loss_13": 3.575793814659119, "ce_loss_2": 4.123488712310791, "ce_loss_3": 3.954343330860138, "ce_loss_7": 3.685023546218872, "epoch": 0.55, "grad_norm": 488.0, "kl_loss_10": 99.10371284484863, "kl_loss_2": 1163.9283081054687, "kl_loss_3": 806.7497436523438, "kl_loss_7": 200.15425338745118, "learning_rate": 0.0004288425808633575, "loss": 555.8719, "step": 5500 }, { "ce_loss_10": 3.6068961024284363, "ce_loss_13": 3.5489359140396117, "ce_loss_2": 4.091926336288452, "ce_loss_3": 3.914480412006378, "ce_loss_7": 3.653805840015411, "epoch": 0.551, "grad_norm": 482.0, "kl_loss_10": 95.30807762145996, "kl_loss_2": 1135.6305114746094, "kl_loss_3": 782.8162139892578, "kl_loss_7": 192.36727905273438, "learning_rate": 0.0004272724287611684, "loss": 551.1164, "step": 5510 }, { "ce_loss_10": 3.5843793511390687, "ce_loss_13": 3.5220483541488647, "ce_loss_2": 4.066782796382904, "ce_loss_3": 3.8880024194717406, "ce_loss_7": 3.628884470462799, "epoch": 0.552, "grad_norm": 472.0, "kl_loss_10": 98.19914245605469, "kl_loss_2": 1138.4930938720704, "kl_loss_3": 792.6924499511719, "kl_loss_7": 197.34004135131835, "learning_rate": 0.00042570300902481425, "loss": 550.9366, "step": 5520 }, { "ce_loss_10": 3.6187870144844054, "ce_loss_13": 3.559086096286774, "ce_loss_2": 4.0836735486984255, "ce_loss_3": 3.913509225845337, "ce_loss_7": 3.662268269062042, "epoch": 0.553, "grad_norm": 460.0, "kl_loss_10": 96.8458236694336, "kl_loss_2": 1113.29208984375, "kl_loss_3": 778.9167602539062, "kl_loss_7": 192.73130722045897, "learning_rate": 0.00042413433745833423, "loss": 545.5068, "step": 5530 }, { "ce_loss_10": 3.6217783451080323, "ce_loss_13": 3.5588382482528687, "ce_loss_2": 4.102611029148102, "ce_loss_3": 3.9288668751716616, "ce_loss_7": 3.667692792415619, "epoch": 0.554, "grad_norm": 394.0, "kl_loss_10": 99.64076881408691, "kl_loss_2": 1129.861962890625, "kl_loss_3": 781.159780883789, "kl_loss_7": 194.5426254272461, "learning_rate": 0.0004225664298582339, "loss": 538.3319, "step": 5540 }, { "ce_loss_10": 3.7008472084999084, "ce_loss_13": 3.6404882073402405, "ce_loss_2": 4.157876873016358, "ce_loss_3": 3.9944301009178163, "ce_loss_7": 3.7464569926261904, "epoch": 0.555, "grad_norm": 352.0, "kl_loss_10": 98.0084358215332, "kl_loss_2": 1092.2807312011719, "kl_loss_3": 758.9974426269531, "kl_loss_7": 191.41172409057617, "learning_rate": 0.000420999302013325, "loss": 539.2247, "step": 5550 }, { "ce_loss_10": 3.5973586678504943, "ce_loss_13": 3.534582734107971, "ce_loss_2": 4.09981359243393, "ce_loss_3": 3.9165178179740905, "ce_loss_7": 3.6474678754806518, "epoch": 0.556, "grad_norm": 454.0, "kl_loss_10": 99.95339088439941, "kl_loss_2": 1148.3679443359374, "kl_loss_3": 795.4782531738281, "kl_loss_7": 199.34042739868164, "learning_rate": 0.000419432969704568, "loss": 547.6515, "step": 5560 }, { "ce_loss_10": 3.6402106523513793, "ce_loss_13": 3.580482280254364, "ce_loss_2": 4.112204611301422, "ce_loss_3": 3.9463653802871703, "ce_loss_7": 3.6864510416984557, "epoch": 0.557, "grad_norm": 374.0, "kl_loss_10": 97.21049270629882, "kl_loss_2": 1103.2306396484375, "kl_loss_3": 765.6696472167969, "kl_loss_7": 192.21127700805664, "learning_rate": 0.00041786744870491154, "loss": 552.003, "step": 5570 }, { "ce_loss_10": 3.5763687014579775, "ce_loss_13": 3.513793337345123, "ce_loss_2": 4.059341847896576, "ce_loss_3": 3.8873541951179504, "ce_loss_7": 3.6242376923561097, "epoch": 0.558, "grad_norm": 496.0, "kl_loss_10": 99.6470874786377, "kl_loss_2": 1146.4394836425781, "kl_loss_3": 799.3714019775391, "kl_loss_7": 198.99811019897462, "learning_rate": 0.0004163027547791347, "loss": 555.3918, "step": 5580 }, { "ce_loss_10": 3.550457501411438, "ce_loss_13": 3.490234684944153, "ce_loss_2": 4.058210396766663, "ce_loss_3": 3.8777605056762696, "ce_loss_7": 3.5981253504753115, "epoch": 0.559, "grad_norm": 362.0, "kl_loss_10": 96.0154800415039, "kl_loss_2": 1166.6077453613282, "kl_loss_3": 807.5666870117187, "kl_loss_7": 196.15278396606445, "learning_rate": 0.0004147389036836881, "loss": 556.2604, "step": 5590 }, { "ce_loss_10": 3.606854057312012, "ce_loss_13": 3.545392167568207, "ce_loss_2": 4.097903311252594, "ce_loss_3": 3.924593436717987, "ce_loss_7": 3.652910280227661, "epoch": 0.56, "grad_norm": 580.0, "kl_loss_10": 99.4388584136963, "kl_loss_2": 1150.4553649902343, "kl_loss_3": 802.4499359130859, "kl_loss_7": 196.6334327697754, "learning_rate": 0.00041317591116653486, "loss": 563.6437, "step": 5600 }, { "ce_loss_10": 3.6449447154998778, "ce_loss_13": 3.5830300569534304, "ce_loss_2": 4.1296777606010435, "ce_loss_3": 3.9580175995826723, "ce_loss_7": 3.695024287700653, "epoch": 0.561, "grad_norm": 528.0, "kl_loss_10": 100.15715980529785, "kl_loss_2": 1137.3770324707032, "kl_loss_3": 786.0429443359375, "kl_loss_7": 199.2029815673828, "learning_rate": 0.0004116137929669921, "loss": 545.8336, "step": 5610 }, { "ce_loss_10": 3.6345237135887145, "ce_loss_13": 3.575796604156494, "ce_loss_2": 4.1131403088569645, "ce_loss_3": 3.940467345714569, "ce_loss_7": 3.6807628154754637, "epoch": 0.562, "grad_norm": 388.0, "kl_loss_10": 95.75808372497559, "kl_loss_2": 1128.9722564697265, "kl_loss_3": 784.2019927978515, "kl_loss_7": 193.04863052368165, "learning_rate": 0.00041005256481557305, "loss": 543.754, "step": 5620 }, { "ce_loss_10": 3.7401763558387757, "ce_loss_13": 3.6805962681770326, "ce_loss_2": 4.185898721218109, "ce_loss_3": 4.027551281452179, "ce_loss_7": 3.783228611946106, "epoch": 0.563, "grad_norm": 516.0, "kl_loss_10": 96.21339073181153, "kl_loss_2": 1061.5840301513672, "kl_loss_3": 738.3266693115235, "kl_loss_7": 187.24717712402344, "learning_rate": 0.00040849224243382767, "loss": 533.9922, "step": 5630 }, { "ce_loss_10": 3.5920221328735353, "ce_loss_13": 3.5324007272720337, "ce_loss_2": 4.072316908836365, "ce_loss_3": 3.8983843684196473, "ce_loss_7": 3.6374821186065676, "epoch": 0.564, "grad_norm": 338.0, "kl_loss_10": 95.43405532836914, "kl_loss_2": 1128.149676513672, "kl_loss_3": 783.0245666503906, "kl_loss_7": 193.40655746459962, "learning_rate": 0.000406932841534185, "loss": 541.5961, "step": 5640 }, { "ce_loss_10": 3.5484704256057737, "ce_loss_13": 3.486864137649536, "ce_loss_2": 4.036842632293701, "ce_loss_3": 3.8651990056037904, "ce_loss_7": 3.597266983985901, "epoch": 0.565, "grad_norm": 604.0, "kl_loss_10": 95.5288932800293, "kl_loss_2": 1141.9300598144532, "kl_loss_3": 797.4877136230468, "kl_loss_7": 194.9025909423828, "learning_rate": 0.0004053743778197951, "loss": 559.9006, "step": 5650 }, { "ce_loss_10": 3.6602503299713134, "ce_loss_13": 3.596804344654083, "ce_loss_2": 4.136269843578338, "ce_loss_3": 3.967122423648834, "ce_loss_7": 3.7048738479614256, "epoch": 0.566, "grad_norm": 418.0, "kl_loss_10": 101.36623306274414, "kl_loss_2": 1114.9331634521484, "kl_loss_3": 774.6735748291015, "kl_loss_7": 196.29364929199218, "learning_rate": 0.0004038168669843697, "loss": 553.1191, "step": 5660 }, { "ce_loss_10": 3.6255574107170103, "ce_loss_13": 3.5639535069465635, "ce_loss_2": 4.085369718074799, "ce_loss_3": 3.919695568084717, "ce_loss_7": 3.6704380750656127, "epoch": 0.567, "grad_norm": 736.0, "kl_loss_10": 98.19256973266602, "kl_loss_2": 1100.560809326172, "kl_loss_3": 765.2613342285156, "kl_loss_7": 192.50554656982422, "learning_rate": 0.000402260324712026, "loss": 547.8535, "step": 5670 }, { "ce_loss_10": 3.669494354724884, "ce_loss_13": 3.60741925239563, "ce_loss_2": 4.148016309738159, "ce_loss_3": 3.9792763590812683, "ce_loss_7": 3.7149499893188476, "epoch": 0.568, "grad_norm": 498.0, "kl_loss_10": 99.65063438415527, "kl_loss_2": 1126.1991058349608, "kl_loss_3": 783.0127746582032, "kl_loss_7": 194.12312698364258, "learning_rate": 0.00040070476667712743, "loss": 543.5942, "step": 5680 }, { "ce_loss_10": 3.7005011796951295, "ce_loss_13": 3.6357283353805543, "ce_loss_2": 4.166275656223297, "ce_loss_3": 4.000016844272613, "ce_loss_7": 3.745167064666748, "epoch": 0.569, "grad_norm": 356.0, "kl_loss_10": 100.85004692077636, "kl_loss_2": 1110.8271209716797, "kl_loss_3": 770.1773681640625, "kl_loss_7": 194.5637939453125, "learning_rate": 0.0003991502085441259, "loss": 548.6594, "step": 5690 }, { "ce_loss_10": 3.729709804058075, "ce_loss_13": 3.6688971519470215, "ce_loss_2": 4.18160834312439, "ce_loss_3": 4.015434455871582, "ce_loss_7": 3.7735623002052305, "epoch": 0.57, "grad_norm": 374.0, "kl_loss_10": 98.11349868774414, "kl_loss_2": 1070.740576171875, "kl_loss_3": 744.3529327392578, "kl_loss_7": 190.02555770874022, "learning_rate": 0.0003975966659674047, "loss": 541.8046, "step": 5700 }, { "ce_loss_10": 3.691783332824707, "ce_loss_13": 3.6318029403686523, "ce_loss_2": 4.161513650417328, "ce_loss_3": 3.986249303817749, "ce_loss_7": 3.73646023273468, "epoch": 0.571, "grad_norm": 536.0, "kl_loss_10": 98.58754501342773, "kl_loss_2": 1102.3078491210938, "kl_loss_3": 759.7488586425782, "kl_loss_7": 191.9359992980957, "learning_rate": 0.0003960441545911204, "loss": 538.4236, "step": 5710 }, { "ce_loss_10": 3.6897791981697083, "ce_loss_13": 3.6274471282958984, "ce_loss_2": 4.157203590869903, "ce_loss_3": 3.9884847044944762, "ce_loss_7": 3.736597418785095, "epoch": 0.572, "grad_norm": 600.0, "kl_loss_10": 97.47168769836426, "kl_loss_2": 1115.5811676025392, "kl_loss_3": 773.7587646484375, "kl_loss_7": 193.96655197143554, "learning_rate": 0.0003944926900490452, "loss": 541.7897, "step": 5720 }, { "ce_loss_10": 3.6022287607192993, "ce_loss_13": 3.541483187675476, "ce_loss_2": 4.094926071166992, "ce_loss_3": 3.9194396138191223, "ce_loss_7": 3.65008407831192, "epoch": 0.573, "grad_norm": 352.0, "kl_loss_10": 96.51857452392578, "kl_loss_2": 1147.3706939697265, "kl_loss_3": 794.91083984375, "kl_loss_7": 194.98720092773436, "learning_rate": 0.0003929422879644099, "loss": 544.8611, "step": 5730 }, { "ce_loss_10": 3.6093438267707825, "ce_loss_13": 3.5497053503990172, "ce_loss_2": 4.068271553516388, "ce_loss_3": 3.9011916518211365, "ce_loss_7": 3.6547187089920046, "epoch": 0.574, "grad_norm": 426.0, "kl_loss_10": 95.6807746887207, "kl_loss_2": 1107.6688201904296, "kl_loss_3": 763.7292449951171, "kl_loss_7": 189.65744247436524, "learning_rate": 0.0003913929639497462, "loss": 535.444, "step": 5740 }, { "ce_loss_10": 3.5539973855018614, "ce_loss_13": 3.4933292627334596, "ce_loss_2": 4.044394338130951, "ce_loss_3": 3.8677351474761963, "ce_loss_7": 3.6000022888183594, "epoch": 0.575, "grad_norm": 408.0, "kl_loss_10": 95.82653579711913, "kl_loss_2": 1130.1885803222656, "kl_loss_3": 778.0026184082031, "kl_loss_7": 190.79474563598632, "learning_rate": 0.00038984473360672965, "loss": 541.1631, "step": 5750 }, { "ce_loss_10": 3.5721747159957884, "ce_loss_13": 3.5100734710693358, "ce_loss_2": 4.053931272029876, "ce_loss_3": 3.883261811733246, "ce_loss_7": 3.6166505217552185, "epoch": 0.576, "grad_norm": 436.0, "kl_loss_10": 95.3091812133789, "kl_loss_2": 1128.7456329345703, "kl_loss_3": 780.4191925048829, "kl_loss_7": 190.4754554748535, "learning_rate": 0.0003882976125260229, "loss": 539.7566, "step": 5760 }, { "ce_loss_10": 3.638679492473602, "ce_loss_13": 3.5770092844963073, "ce_loss_2": 4.1140677571296695, "ce_loss_3": 3.9416022896766663, "ce_loss_7": 3.6866235971450805, "epoch": 0.577, "grad_norm": 366.0, "kl_loss_10": 98.93351516723632, "kl_loss_2": 1112.5931701660156, "kl_loss_3": 770.6242248535157, "kl_loss_7": 191.9038848876953, "learning_rate": 0.00038675161628711776, "loss": 545.2976, "step": 5770 }, { "ce_loss_10": 3.678569030761719, "ce_loss_13": 3.616915798187256, "ce_loss_2": 4.1388965249061584, "ce_loss_3": 3.9749330997467043, "ce_loss_7": 3.722931241989136, "epoch": 0.578, "grad_norm": 404.0, "kl_loss_10": 97.5284637451172, "kl_loss_2": 1093.5021606445312, "kl_loss_3": 761.3094451904296, "kl_loss_7": 191.26370391845703, "learning_rate": 0.0003852067604581794, "loss": 553.459, "step": 5780 }, { "ce_loss_10": 3.6174680829048156, "ce_loss_13": 3.5550846695899962, "ce_loss_2": 4.100849425792694, "ce_loss_3": 3.927929162979126, "ce_loss_7": 3.665549111366272, "epoch": 0.579, "grad_norm": 502.0, "kl_loss_10": 97.5420696258545, "kl_loss_2": 1125.1912048339843, "kl_loss_3": 782.9702056884765, "kl_loss_7": 193.16246643066407, "learning_rate": 0.0003836630605958888, "loss": 543.639, "step": 5790 }, { "ce_loss_10": 3.6780447602272033, "ce_loss_13": 3.616100025177002, "ce_loss_2": 4.136243522167206, "ce_loss_3": 3.9725910425186157, "ce_loss_7": 3.7234076499938964, "epoch": 0.58, "grad_norm": 506.0, "kl_loss_10": 99.15894927978516, "kl_loss_2": 1117.2952941894532, "kl_loss_3": 777.6545166015625, "kl_loss_7": 194.16991271972657, "learning_rate": 0.0003821205322452863, "loss": 560.4495, "step": 5800 }, { "ce_loss_10": 3.657036304473877, "ce_loss_13": 3.5961548686027527, "ce_loss_2": 4.118453872203827, "ce_loss_3": 3.948525774478912, "ce_loss_7": 3.7012171149253845, "epoch": 0.581, "grad_norm": 438.0, "kl_loss_10": 98.11412734985352, "kl_loss_2": 1098.6213439941407, "kl_loss_3": 759.3198364257812, "kl_loss_7": 189.98369064331055, "learning_rate": 0.0003805791909396155, "loss": 541.5742, "step": 5810 }, { "ce_loss_10": 3.6096495151519776, "ce_loss_13": 3.550210452079773, "ce_loss_2": 4.077665090560913, "ce_loss_3": 3.9094552993774414, "ce_loss_7": 3.654946839809418, "epoch": 0.582, "grad_norm": 428.0, "kl_loss_10": 95.98116798400879, "kl_loss_2": 1109.6123931884765, "kl_loss_3": 763.3366668701171, "kl_loss_7": 189.48765182495117, "learning_rate": 0.0003790390522001662, "loss": 547.1139, "step": 5820 }, { "ce_loss_10": 3.538465416431427, "ce_loss_13": 3.4795125126838684, "ce_loss_2": 4.019526553153992, "ce_loss_3": 3.8418781757354736, "ce_loss_7": 3.5831465244293215, "epoch": 0.583, "grad_norm": 354.0, "kl_loss_10": 94.34587249755859, "kl_loss_2": 1136.918035888672, "kl_loss_3": 784.7109252929688, "kl_loss_7": 191.27632827758788, "learning_rate": 0.0003775001315361183, "loss": 542.445, "step": 5830 }, { "ce_loss_10": 3.659132921695709, "ce_loss_13": 3.596101534366608, "ce_loss_2": 4.132727253437042, "ce_loss_3": 3.958163845539093, "ce_loss_7": 3.704639720916748, "epoch": 0.584, "grad_norm": 298.0, "kl_loss_10": 98.75731201171875, "kl_loss_2": 1122.0884033203124, "kl_loss_3": 776.4772644042969, "kl_loss_7": 193.22739944458007, "learning_rate": 0.0003759624444443858, "loss": 544.9992, "step": 5840 }, { "ce_loss_10": 3.6889251112937926, "ce_loss_13": 3.6282206773757935, "ce_loss_2": 4.151758980751038, "ce_loss_3": 3.9822983741760254, "ce_loss_7": 3.732993245124817, "epoch": 0.585, "grad_norm": 346.0, "kl_loss_10": 99.06045837402344, "kl_loss_2": 1097.8614471435546, "kl_loss_3": 758.9134582519531, "kl_loss_7": 191.27917098999023, "learning_rate": 0.00037442600640946044, "loss": 536.17, "step": 5850 }, { "ce_loss_10": 3.6461440443992617, "ce_loss_13": 3.5892478227615356, "ce_loss_2": 4.105236732959748, "ce_loss_3": 3.9375507473945617, "ce_loss_7": 3.692450475692749, "epoch": 0.586, "grad_norm": 408.0, "kl_loss_10": 94.86803092956544, "kl_loss_2": 1099.2377655029297, "kl_loss_3": 758.3301605224609, "kl_loss_7": 189.78098831176757, "learning_rate": 0.00037289083290325663, "loss": 531.0057, "step": 5860 }, { "ce_loss_10": 3.63515100479126, "ce_loss_13": 3.574202799797058, "ce_loss_2": 4.095511162281037, "ce_loss_3": 3.930715727806091, "ce_loss_7": 3.6794507265090943, "epoch": 0.587, "grad_norm": 540.0, "kl_loss_10": 97.98805313110351, "kl_loss_2": 1091.7025299072266, "kl_loss_3": 757.6223114013671, "kl_loss_7": 191.85128860473634, "learning_rate": 0.0003713569393849543, "loss": 533.4333, "step": 5870 }, { "ce_loss_10": 3.6827593207359315, "ce_loss_13": 3.6205956816673277, "ce_loss_2": 4.148468089103699, "ce_loss_3": 3.978341579437256, "ce_loss_7": 3.7273068189620973, "epoch": 0.588, "grad_norm": 398.0, "kl_loss_10": 98.60938911437988, "kl_loss_2": 1107.6281311035157, "kl_loss_3": 765.0102233886719, "kl_loss_7": 192.96542663574218, "learning_rate": 0.00036982434130084397, "loss": 541.5767, "step": 5880 }, { "ce_loss_10": 3.589915359020233, "ce_loss_13": 3.5286367654800417, "ce_loss_2": 4.061057722568512, "ce_loss_3": 3.8881011605262756, "ce_loss_7": 3.6373565912246706, "epoch": 0.589, "grad_norm": 506.0, "kl_loss_10": 97.51137619018554, "kl_loss_2": 1115.5977966308594, "kl_loss_3": 775.6395446777344, "kl_loss_7": 195.47111892700195, "learning_rate": 0.00036829305408417166, "loss": 546.8446, "step": 5890 }, { "ce_loss_10": 3.5797411799430847, "ce_loss_13": 3.5188158631324766, "ce_loss_2": 4.067822527885437, "ce_loss_3": 3.893584966659546, "ce_loss_7": 3.6291656494140625, "epoch": 0.59, "grad_norm": 364.0, "kl_loss_10": 96.57020835876465, "kl_loss_2": 1141.290579223633, "kl_loss_3": 789.6200988769531, "kl_loss_7": 196.76195220947267, "learning_rate": 0.0003667630931549826, "loss": 548.8211, "step": 5900 }, { "ce_loss_10": 3.547331213951111, "ce_loss_13": 3.4874081373214723, "ce_loss_2": 4.03765162229538, "ce_loss_3": 3.8655640482902527, "ce_loss_7": 3.5946906566619874, "epoch": 0.591, "grad_norm": 454.0, "kl_loss_10": 95.69526252746581, "kl_loss_2": 1154.8450256347655, "kl_loss_3": 798.5165588378907, "kl_loss_7": 194.5025749206543, "learning_rate": 0.00036523447391996613, "loss": 552.8163, "step": 5910 }, { "ce_loss_10": 3.6425758361816407, "ce_loss_13": 3.5853498816490172, "ce_loss_2": 4.10631023645401, "ce_loss_3": 3.9402198076248167, "ce_loss_7": 3.690027916431427, "epoch": 0.592, "grad_norm": 432.0, "kl_loss_10": 94.87303581237794, "kl_loss_2": 1090.1558319091796, "kl_loss_3": 756.7847717285156, "kl_loss_7": 189.84710311889648, "learning_rate": 0.00036370721177230114, "loss": 533.6673, "step": 5920 }, { "ce_loss_10": 3.635672652721405, "ce_loss_13": 3.577661764621735, "ce_loss_2": 4.114610862731934, "ce_loss_3": 3.9419226169586183, "ce_loss_7": 3.681511878967285, "epoch": 0.593, "grad_norm": 326.0, "kl_loss_10": 95.39519729614258, "kl_loss_2": 1127.0120056152343, "kl_loss_3": 780.4901336669922, "kl_loss_7": 194.04692993164062, "learning_rate": 0.00036218132209150044, "loss": 545.1962, "step": 5930 }, { "ce_loss_10": 3.593142592906952, "ce_loss_13": 3.530347979068756, "ce_loss_2": 4.095171976089477, "ce_loss_3": 3.920231354236603, "ce_loss_7": 3.645453596115112, "epoch": 0.594, "grad_norm": 378.0, "kl_loss_10": 99.63440895080566, "kl_loss_2": 1173.4297882080077, "kl_loss_3": 813.8213714599609, "kl_loss_7": 199.65494766235352, "learning_rate": 0.0003606568202432562, "loss": 557.0208, "step": 5940 }, { "ce_loss_10": 3.665185475349426, "ce_loss_13": 3.6032612800598143, "ce_loss_2": 4.14498724937439, "ce_loss_3": 3.9701961159706114, "ce_loss_7": 3.7108847856521607, "epoch": 0.595, "grad_norm": 528.0, "kl_loss_10": 99.43977394104004, "kl_loss_2": 1140.6280212402344, "kl_loss_3": 787.1899200439453, "kl_loss_7": 195.35167922973633, "learning_rate": 0.0003591337215792851, "loss": 544.2271, "step": 5950 }, { "ce_loss_10": 3.706349265575409, "ce_loss_13": 3.64465993642807, "ce_loss_2": 4.152172029018402, "ce_loss_3": 3.9943688988685606, "ce_loss_7": 3.7489245533943176, "epoch": 0.596, "grad_norm": 356.0, "kl_loss_10": 99.39506378173829, "kl_loss_2": 1087.233724975586, "kl_loss_3": 759.1374755859375, "kl_loss_7": 190.80716857910156, "learning_rate": 0.00035761204143717383, "loss": 544.3471, "step": 5960 }, { "ce_loss_10": 3.6578794836997988, "ce_loss_13": 3.5957969784736634, "ce_loss_2": 4.119996964931488, "ce_loss_3": 3.9552765846252442, "ce_loss_7": 3.7025834202766417, "epoch": 0.597, "grad_norm": 400.0, "kl_loss_10": 99.01246032714843, "kl_loss_2": 1115.1319488525392, "kl_loss_3": 774.3078552246094, "kl_loss_7": 193.01641845703125, "learning_rate": 0.0003560917951402245, "loss": 556.3752, "step": 5970 }, { "ce_loss_10": 3.632036602497101, "ce_loss_13": 3.5740628480911254, "ce_loss_2": 4.0921210765838625, "ce_loss_3": 3.9307610511779787, "ce_loss_7": 3.6746655702590942, "epoch": 0.598, "grad_norm": 412.0, "kl_loss_10": 95.97110137939453, "kl_loss_2": 1101.7569305419922, "kl_loss_3": 768.7692047119141, "kl_loss_7": 189.95830230712892, "learning_rate": 0.00035457299799730046, "loss": 538.1885, "step": 5980 }, { "ce_loss_10": 3.69617702960968, "ce_loss_13": 3.6354240775108337, "ce_loss_2": 4.163921213150024, "ce_loss_3": 3.993851900100708, "ce_loss_7": 3.7415480971336366, "epoch": 0.599, "grad_norm": 388.0, "kl_loss_10": 96.27426452636719, "kl_loss_2": 1105.9306549072267, "kl_loss_3": 762.228305053711, "kl_loss_7": 190.51752395629882, "learning_rate": 0.0003530556653026721, "loss": 545.8183, "step": 5990 }, { "ce_loss_10": 3.611501228809357, "ce_loss_13": 3.5530946016311646, "ce_loss_2": 4.07593857049942, "ce_loss_3": 3.9016834497451782, "ce_loss_7": 3.6570339798927307, "epoch": 0.6, "grad_norm": 1424.0, "kl_loss_10": 94.48569107055664, "kl_loss_2": 1108.4388488769532, "kl_loss_3": 760.983023071289, "kl_loss_7": 188.30435333251953, "learning_rate": 0.00035153981233586274, "loss": 543.2547, "step": 6000 }, { "ce_loss_10": 3.589734137058258, "ce_loss_13": 3.5291273951530457, "ce_loss_2": 4.066950809955597, "ce_loss_3": 3.8936201214790342, "ce_loss_7": 3.6356727838516236, "epoch": 0.601, "grad_norm": 478.0, "kl_loss_10": 95.43113746643067, "kl_loss_2": 1117.119808959961, "kl_loss_3": 769.7344940185546, "kl_loss_7": 188.8736831665039, "learning_rate": 0.00035002545436149473, "loss": 555.4068, "step": 6010 }, { "ce_loss_10": 3.603361654281616, "ce_loss_13": 3.5395719528198244, "ce_loss_2": 4.084376287460327, "ce_loss_3": 3.913386416435242, "ce_loss_7": 3.6495144724845887, "epoch": 0.602, "grad_norm": 414.0, "kl_loss_10": 99.58069725036621, "kl_loss_2": 1138.4922149658203, "kl_loss_3": 791.1285461425781, "kl_loss_7": 196.0400062561035, "learning_rate": 0.0003485126066291364, "loss": 543.3661, "step": 6020 }, { "ce_loss_10": 3.6472663640975953, "ce_loss_13": 3.586405074596405, "ce_loss_2": 4.12690646648407, "ce_loss_3": 3.9540088891983034, "ce_loss_7": 3.6910028219223023, "epoch": 0.603, "grad_norm": 426.0, "kl_loss_10": 97.50395317077637, "kl_loss_2": 1120.6384643554688, "kl_loss_3": 773.8977966308594, "kl_loss_7": 189.96464309692382, "learning_rate": 0.0003470012843731476, "loss": 547.4742, "step": 6030 }, { "ce_loss_10": 3.587485361099243, "ce_loss_13": 3.527864229679108, "ce_loss_2": 4.065750408172607, "ce_loss_3": 3.8930314064025877, "ce_loss_7": 3.6307687997817992, "epoch": 0.604, "grad_norm": 450.0, "kl_loss_10": 95.93178520202636, "kl_loss_2": 1125.8798370361328, "kl_loss_3": 778.0897277832031, "kl_loss_7": 190.32968826293944, "learning_rate": 0.00034549150281252633, "loss": 553.9461, "step": 6040 }, { "ce_loss_10": 3.567354416847229, "ce_loss_13": 3.5087788224220278, "ce_loss_2": 4.041226005554199, "ce_loss_3": 3.868573796749115, "ce_loss_7": 3.613230037689209, "epoch": 0.605, "grad_norm": 376.0, "kl_loss_10": 96.31193771362305, "kl_loss_2": 1101.1357208251952, "kl_loss_3": 760.5451019287109, "kl_loss_7": 190.99923782348634, "learning_rate": 0.0003439832771507565, "loss": 537.7418, "step": 6050 }, { "ce_loss_10": 3.569633936882019, "ce_loss_13": 3.5091484904289247, "ce_loss_2": 4.052746975421906, "ce_loss_3": 3.8793442845344543, "ce_loss_7": 3.6145769238471983, "epoch": 0.606, "grad_norm": 364.0, "kl_loss_10": 96.17846641540527, "kl_loss_2": 1126.9381469726563, "kl_loss_3": 780.4287139892579, "kl_loss_7": 191.24787139892578, "learning_rate": 0.0003424766225756537, "loss": 539.2611, "step": 6060 }, { "ce_loss_10": 3.6349270820617674, "ce_loss_13": 3.5724891662597655, "ce_loss_2": 4.110528755187988, "ce_loss_3": 3.9370043516159057, "ce_loss_7": 3.679009509086609, "epoch": 0.607, "grad_norm": 380.0, "kl_loss_10": 98.61342163085938, "kl_loss_2": 1107.0002716064453, "kl_loss_3": 763.0299987792969, "kl_loss_7": 192.68891830444335, "learning_rate": 0.00034097155425921255, "loss": 535.4806, "step": 6070 }, { "ce_loss_10": 3.5260583400726317, "ce_loss_13": 3.4644631028175352, "ce_loss_2": 4.0014289021492, "ce_loss_3": 3.829664409160614, "ce_loss_7": 3.571485424041748, "epoch": 0.608, "grad_norm": 422.0, "kl_loss_10": 95.72014465332032, "kl_loss_2": 1128.9732635498046, "kl_loss_3": 780.0001983642578, "kl_loss_7": 191.94852294921876, "learning_rate": 0.0003394680873574546, "loss": 542.5872, "step": 6080 }, { "ce_loss_10": 3.638583517074585, "ce_loss_13": 3.5754881620407106, "ce_loss_2": 4.1181090593338014, "ce_loss_3": 3.9476171731948853, "ce_loss_7": 3.6838363647460937, "epoch": 0.609, "grad_norm": 402.0, "kl_loss_10": 99.43503112792969, "kl_loss_2": 1131.3410400390626, "kl_loss_3": 782.6971099853515, "kl_loss_7": 192.93393096923828, "learning_rate": 0.0003379662370102747, "loss": 542.0118, "step": 6090 }, { "ce_loss_10": 3.6437841415405274, "ce_loss_13": 3.5835014939308167, "ce_loss_2": 4.107234466075897, "ce_loss_3": 3.9407611727714538, "ce_loss_7": 3.689082384109497, "epoch": 0.61, "grad_norm": 378.0, "kl_loss_10": 95.95064582824708, "kl_loss_2": 1116.5803283691407, "kl_loss_3": 769.8769500732421, "kl_loss_7": 190.42120208740235, "learning_rate": 0.0003364660183412892, "loss": 543.2468, "step": 6100 }, { "ce_loss_10": 3.6229702949523928, "ce_loss_13": 3.5642863631248476, "ce_loss_2": 4.082474946975708, "ce_loss_3": 3.920805549621582, "ce_loss_7": 3.6692759871482847, "epoch": 0.611, "grad_norm": 438.0, "kl_loss_10": 95.98471641540527, "kl_loss_2": 1107.3975402832032, "kl_loss_3": 770.6610443115235, "kl_loss_7": 191.18293151855468, "learning_rate": 0.0003349674464576834, "loss": 547.1137, "step": 6110 }, { "ce_loss_10": 3.572301459312439, "ce_loss_13": 3.5100274682044983, "ce_loss_2": 4.04880428314209, "ce_loss_3": 3.87799711227417, "ce_loss_7": 3.6172243118286134, "epoch": 0.612, "grad_norm": 400.0, "kl_loss_10": 97.55015258789062, "kl_loss_2": 1121.5612213134766, "kl_loss_3": 776.7356872558594, "kl_loss_7": 191.68118591308593, "learning_rate": 0.00033347053645005966, "loss": 533.933, "step": 6120 }, { "ce_loss_10": 3.6915227651596068, "ce_loss_13": 3.6307403206825257, "ce_loss_2": 4.149306988716125, "ce_loss_3": 3.986075186729431, "ce_loss_7": 3.7352558612823485, "epoch": 0.613, "grad_norm": 456.0, "kl_loss_10": 97.44704780578613, "kl_loss_2": 1082.3290954589843, "kl_loss_3": 751.4226776123047, "kl_loss_7": 188.24736099243165, "learning_rate": 0.00033197530339228485, "loss": 541.4641, "step": 6130 }, { "ce_loss_10": 3.6387569904327393, "ce_loss_13": 3.5774574756622313, "ce_loss_2": 4.1059521555900576, "ce_loss_3": 3.9463557958602906, "ce_loss_7": 3.686212944984436, "epoch": 0.614, "grad_norm": 320.0, "kl_loss_10": 97.79526100158691, "kl_loss_2": 1105.2626007080078, "kl_loss_3": 773.3958312988282, "kl_loss_7": 193.28426208496094, "learning_rate": 0.00033048176234133967, "loss": 539.6668, "step": 6140 }, { "ce_loss_10": 3.6235718965530395, "ce_loss_13": 3.563166308403015, "ce_loss_2": 4.0937678694725035, "ce_loss_3": 3.9205260276794434, "ce_loss_7": 3.6674267172813417, "epoch": 0.615, "grad_norm": 434.0, "kl_loss_10": 96.52788619995117, "kl_loss_2": 1108.4606842041017, "kl_loss_3": 766.9316375732421, "kl_loss_7": 191.76514892578126, "learning_rate": 0.0003289899283371657, "loss": 545.3005, "step": 6150 }, { "ce_loss_10": 3.6545772314071656, "ce_loss_13": 3.5920246958732607, "ce_loss_2": 4.122934722900391, "ce_loss_3": 3.954042661190033, "ce_loss_7": 3.7002484798431396, "epoch": 0.616, "grad_norm": 512.0, "kl_loss_10": 96.86014366149902, "kl_loss_2": 1110.978466796875, "kl_loss_3": 763.7065795898437, "kl_loss_7": 189.29309463500977, "learning_rate": 0.0003274998164025148, "loss": 546.4095, "step": 6160 }, { "ce_loss_10": 3.687037003040314, "ce_loss_13": 3.62383953332901, "ce_loss_2": 4.151339697837829, "ce_loss_3": 3.982528305053711, "ce_loss_7": 3.730109751224518, "epoch": 0.617, "grad_norm": 420.0, "kl_loss_10": 98.5214340209961, "kl_loss_2": 1105.55556640625, "kl_loss_3": 765.5938140869141, "kl_loss_7": 192.1310241699219, "learning_rate": 0.0003260114415427975, "loss": 551.3336, "step": 6170 }, { "ce_loss_10": 3.6019906878471373, "ce_loss_13": 3.543228101730347, "ce_loss_2": 4.074436497688294, "ce_loss_3": 3.910420286655426, "ce_loss_7": 3.650412619113922, "epoch": 0.618, "grad_norm": 326.0, "kl_loss_10": 96.38783836364746, "kl_loss_2": 1118.9971984863282, "kl_loss_3": 780.3896484375, "kl_loss_7": 191.8697937011719, "learning_rate": 0.0003245248187459323, "loss": 553.7879, "step": 6180 }, { "ce_loss_10": 3.5864107251167296, "ce_loss_13": 3.53016597032547, "ce_loss_2": 4.042503225803375, "ce_loss_3": 3.874970889091492, "ce_loss_7": 3.6281535744667055, "epoch": 0.619, "grad_norm": 418.0, "kl_loss_10": 92.61179161071777, "kl_loss_2": 1080.5412902832031, "kl_loss_3": 743.7303924560547, "kl_loss_7": 185.2653793334961, "learning_rate": 0.00032303996298219416, "loss": 531.9591, "step": 6190 }, { "ce_loss_10": 3.6777410745620727, "ce_loss_13": 3.6153058767318726, "ce_loss_2": 4.135652315616608, "ce_loss_3": 3.968917655944824, "ce_loss_7": 3.723153126239777, "epoch": 0.62, "grad_norm": 328.0, "kl_loss_10": 97.44341430664062, "kl_loss_2": 1081.0255004882813, "kl_loss_3": 750.4999420166016, "kl_loss_7": 189.73232498168946, "learning_rate": 0.00032155688920406414, "loss": 532.518, "step": 6200 }, { "ce_loss_10": 3.587769341468811, "ce_loss_13": 3.524891209602356, "ce_loss_2": 4.075685119628906, "ce_loss_3": 3.896172082424164, "ce_loss_7": 3.6351929187774656, "epoch": 0.621, "grad_norm": 376.0, "kl_loss_10": 100.48479537963867, "kl_loss_2": 1141.9940368652344, "kl_loss_3": 786.613900756836, "kl_loss_7": 195.81097640991212, "learning_rate": 0.0003200756123460788, "loss": 557.093, "step": 6210 }, { "ce_loss_10": 3.613728904724121, "ce_loss_13": 3.5514798045158384, "ce_loss_2": 4.097208368778229, "ce_loss_3": 3.922844612598419, "ce_loss_7": 3.6612853050231933, "epoch": 0.622, "grad_norm": 436.0, "kl_loss_10": 98.99568367004395, "kl_loss_2": 1137.439712524414, "kl_loss_3": 786.5532501220703, "kl_loss_7": 195.3180892944336, "learning_rate": 0.00031859614732467957, "loss": 552.2858, "step": 6220 }, { "ce_loss_10": 3.668611526489258, "ce_loss_13": 3.6079549193382263, "ce_loss_2": 4.123561811447144, "ce_loss_3": 3.957008695602417, "ce_loss_7": 3.7130979537963866, "epoch": 0.623, "grad_norm": 436.0, "kl_loss_10": 96.12700805664062, "kl_loss_2": 1085.7240417480468, "kl_loss_3": 750.371206665039, "kl_loss_7": 188.20330352783202, "learning_rate": 0.00031711850903806275, "loss": 532.2347, "step": 6230 }, { "ce_loss_10": 3.5722012281417848, "ce_loss_13": 3.5121172070503235, "ce_loss_2": 4.05529419183731, "ce_loss_3": 3.8803335189819337, "ce_loss_7": 3.6196384906768797, "epoch": 0.624, "grad_norm": 372.0, "kl_loss_10": 98.26438941955567, "kl_loss_2": 1135.1425506591797, "kl_loss_3": 784.2965637207031, "kl_loss_7": 195.50869674682616, "learning_rate": 0.0003156427123660297, "loss": 544.6104, "step": 6240 }, { "ce_loss_10": 3.663820195198059, "ce_loss_13": 3.6021278977394102, "ce_loss_2": 4.12639445066452, "ce_loss_3": 3.9577032327651978, "ce_loss_7": 3.709599566459656, "epoch": 0.625, "grad_norm": 376.0, "kl_loss_10": 96.6868911743164, "kl_loss_2": 1095.9533905029298, "kl_loss_3": 760.994189453125, "kl_loss_7": 189.45380859375, "learning_rate": 0.0003141687721698363, "loss": 542.975, "step": 6250 }, { "ce_loss_10": 3.6301703572273256, "ce_loss_13": 3.5708668351173403, "ce_loss_2": 4.076251423358917, "ce_loss_3": 3.9138960361480715, "ce_loss_7": 3.6724702954292296, "epoch": 0.626, "grad_norm": 424.0, "kl_loss_10": 94.79209213256836, "kl_loss_2": 1062.4459991455078, "kl_loss_3": 731.6935089111328, "kl_loss_7": 183.48223037719725, "learning_rate": 0.00031269670329204396, "loss": 531.0972, "step": 6260 }, { "ce_loss_10": 3.6652311086654663, "ce_loss_13": 3.6031481981277467, "ce_loss_2": 4.122557854652404, "ce_loss_3": 3.9541366934776305, "ce_loss_7": 3.707497763633728, "epoch": 0.627, "grad_norm": 404.0, "kl_loss_10": 97.36745681762696, "kl_loss_2": 1087.3731384277344, "kl_loss_3": 749.8712615966797, "kl_loss_7": 189.97913208007813, "learning_rate": 0.00031122652055637015, "loss": 536.5034, "step": 6270 }, { "ce_loss_10": 3.6263384938240053, "ce_loss_13": 3.5657132387161257, "ce_loss_2": 4.101957285404206, "ce_loss_3": 3.9301799178123473, "ce_loss_7": 3.671717309951782, "epoch": 0.628, "grad_norm": 320.0, "kl_loss_10": 97.96914176940918, "kl_loss_2": 1132.4724700927734, "kl_loss_3": 779.5158935546875, "kl_loss_7": 193.307218170166, "learning_rate": 0.0003097582387675385, "loss": 538.5988, "step": 6280 }, { "ce_loss_10": 3.6690368175506594, "ce_loss_13": 3.608207333087921, "ce_loss_2": 4.131593143939972, "ce_loss_3": 3.967103731632233, "ce_loss_7": 3.714122140407562, "epoch": 0.629, "grad_norm": 380.0, "kl_loss_10": 97.3248161315918, "kl_loss_2": 1100.8168243408204, "kl_loss_3": 758.5913757324219, "kl_loss_7": 190.2446075439453, "learning_rate": 0.00030829187271113034, "loss": 533.383, "step": 6290 }, { "ce_loss_10": 3.6720826983451844, "ce_loss_13": 3.6116329789161683, "ce_loss_2": 4.121181070804596, "ce_loss_3": 3.958890378475189, "ce_loss_7": 3.713034725189209, "epoch": 0.63, "grad_norm": 474.0, "kl_loss_10": 95.86663208007812, "kl_loss_2": 1078.529071044922, "kl_loss_3": 747.6958526611328, "kl_loss_7": 186.88264846801758, "learning_rate": 0.00030682743715343565, "loss": 538.6207, "step": 6300 }, { "ce_loss_10": 3.6168052315711976, "ce_loss_13": 3.5534343481063844, "ce_loss_2": 4.1001279830932615, "ce_loss_3": 3.926764929294586, "ce_loss_7": 3.6654592990875243, "epoch": 0.631, "grad_norm": 352.0, "kl_loss_10": 98.38105430603028, "kl_loss_2": 1116.2974884033204, "kl_loss_3": 769.4165740966797, "kl_loss_7": 194.41071319580078, "learning_rate": 0.0003053649468413043, "loss": 544.2852, "step": 6310 }, { "ce_loss_10": 3.728801727294922, "ce_loss_13": 3.6677038788795473, "ce_loss_2": 4.186562621593476, "ce_loss_3": 4.021135902404785, "ce_loss_7": 3.7726667642593386, "epoch": 0.632, "grad_norm": 548.0, "kl_loss_10": 98.36889610290527, "kl_loss_2": 1106.3314636230468, "kl_loss_3": 764.3384338378906, "kl_loss_7": 193.92676391601563, "learning_rate": 0.00030390441650199725, "loss": 534.6711, "step": 6320 }, { "ce_loss_10": 3.6225173473358154, "ce_loss_13": 3.564038324356079, "ce_loss_2": 4.088936626911163, "ce_loss_3": 3.9200194835662843, "ce_loss_7": 3.6701310753822325, "epoch": 0.633, "grad_norm": 390.0, "kl_loss_10": 93.89363708496094, "kl_loss_2": 1093.413995361328, "kl_loss_3": 755.4691772460938, "kl_loss_7": 188.9584762573242, "learning_rate": 0.00030244586084303903, "loss": 531.6465, "step": 6330 }, { "ce_loss_10": 3.5908933520317077, "ce_loss_13": 3.530228877067566, "ce_loss_2": 4.073009943962097, "ce_loss_3": 3.908068907260895, "ce_loss_7": 3.6380571484565736, "epoch": 0.634, "grad_norm": 362.0, "kl_loss_10": 96.08535652160644, "kl_loss_2": 1137.027798461914, "kl_loss_3": 794.3090057373047, "kl_loss_7": 193.36979446411132, "learning_rate": 0.00030098929455206903, "loss": 541.8852, "step": 6340 }, { "ce_loss_10": 3.5973508238792418, "ce_loss_13": 3.538694751262665, "ce_loss_2": 4.059111332893371, "ce_loss_3": 3.8917571187019346, "ce_loss_7": 3.6398496866226195, "epoch": 0.635, "grad_norm": 396.0, "kl_loss_10": 95.19868698120118, "kl_loss_2": 1117.9919860839843, "kl_loss_3": 769.856167602539, "kl_loss_7": 189.57870178222657, "learning_rate": 0.00029953473229669324, "loss": 545.9079, "step": 6350 }, { "ce_loss_10": 3.6316630482673644, "ce_loss_13": 3.5723133206367494, "ce_loss_2": 4.099796783924103, "ce_loss_3": 3.9292221426963807, "ce_loss_7": 3.6748278617858885, "epoch": 0.636, "grad_norm": 382.0, "kl_loss_10": 94.04772453308105, "kl_loss_2": 1105.0771392822267, "kl_loss_3": 767.0107574462891, "kl_loss_7": 189.39691848754882, "learning_rate": 0.00029808218872433767, "loss": 534.2105, "step": 6360 }, { "ce_loss_10": 3.6887783288955687, "ce_loss_13": 3.6287707686424255, "ce_loss_2": 4.1434108257293705, "ce_loss_3": 3.9780289769172668, "ce_loss_7": 3.7338571667671205, "epoch": 0.637, "grad_norm": 402.0, "kl_loss_10": 97.2003547668457, "kl_loss_2": 1086.371304321289, "kl_loss_3": 753.1467376708985, "kl_loss_7": 190.29918899536133, "learning_rate": 0.0002966316784621, "loss": 530.8481, "step": 6370 }, { "ce_loss_10": 3.5995650410652162, "ce_loss_13": 3.5394855737686157, "ce_loss_2": 4.081933212280274, "ce_loss_3": 3.905743455886841, "ce_loss_7": 3.6461820721626284, "epoch": 0.638, "grad_norm": 392.0, "kl_loss_10": 94.92418899536133, "kl_loss_2": 1131.0511108398437, "kl_loss_3": 782.9240203857422, "kl_loss_7": 192.17471160888672, "learning_rate": 0.0002951832161166024, "loss": 537.9302, "step": 6380 }, { "ce_loss_10": 3.6817028760910033, "ce_loss_13": 3.619114363193512, "ce_loss_2": 4.15013542175293, "ce_loss_3": 3.980035495758057, "ce_loss_7": 3.726088798046112, "epoch": 0.639, "grad_norm": 284.0, "kl_loss_10": 99.42742652893067, "kl_loss_2": 1089.2870971679688, "kl_loss_3": 758.1006713867188, "kl_loss_7": 192.03466110229493, "learning_rate": 0.0002937368162738445, "loss": 530.5328, "step": 6390 }, { "ce_loss_10": 3.6132258057594298, "ce_loss_13": 3.557306098937988, "ce_loss_2": 4.071500968933106, "ce_loss_3": 3.905410099029541, "ce_loss_7": 3.6560685634613037, "epoch": 0.64, "grad_norm": 580.0, "kl_loss_10": 93.17153434753418, "kl_loss_2": 1090.426809692383, "kl_loss_3": 756.628515625, "kl_loss_7": 185.41258697509767, "learning_rate": 0.0002922924934990568, "loss": 537.7791, "step": 6400 }, { "ce_loss_10": 3.553709554672241, "ce_loss_13": 3.495926034450531, "ce_loss_2": 4.037974917888642, "ce_loss_3": 3.862057626247406, "ce_loss_7": 3.5978724122047425, "epoch": 0.641, "grad_norm": 316.0, "kl_loss_10": 94.70829887390137, "kl_loss_2": 1132.230615234375, "kl_loss_3": 780.3255004882812, "kl_loss_7": 189.6028953552246, "learning_rate": 0.0002908502623365536, "loss": 541.2746, "step": 6410 }, { "ce_loss_10": 3.493143379688263, "ce_loss_13": 3.4340757846832277, "ce_loss_2": 3.982888162136078, "ce_loss_3": 3.8087966442108154, "ce_loss_7": 3.541613507270813, "epoch": 0.642, "grad_norm": 448.0, "kl_loss_10": 93.92830047607421, "kl_loss_2": 1141.5694763183594, "kl_loss_3": 791.2887268066406, "kl_loss_7": 189.8411407470703, "learning_rate": 0.0002894101373095867, "loss": 544.0511, "step": 6420 }, { "ce_loss_10": 3.7018409371376038, "ce_loss_13": 3.641219162940979, "ce_loss_2": 4.160841226577759, "ce_loss_3": 3.996344065666199, "ce_loss_7": 3.7449718475341798, "epoch": 0.643, "grad_norm": 444.0, "kl_loss_10": 98.50596771240234, "kl_loss_2": 1096.2253509521483, "kl_loss_3": 759.2389587402344, "kl_loss_7": 191.72063598632812, "learning_rate": 0.00028797213292019926, "loss": 535.7118, "step": 6430 }, { "ce_loss_10": 3.679163944721222, "ce_loss_13": 3.6178041219711305, "ce_loss_2": 4.137241208553315, "ce_loss_3": 3.9736143589019775, "ce_loss_7": 3.7223108887672423, "epoch": 0.644, "grad_norm": 316.0, "kl_loss_10": 96.37056579589844, "kl_loss_2": 1093.3028533935546, "kl_loss_3": 763.8056060791016, "kl_loss_7": 190.55449371337892, "learning_rate": 0.0002865362636490791, "loss": 543.9671, "step": 6440 }, { "ce_loss_10": 3.689470386505127, "ce_loss_13": 3.6325947284698485, "ce_loss_2": 4.151259076595307, "ce_loss_3": 3.9852967262268066, "ce_loss_7": 3.7347108364105224, "epoch": 0.645, "grad_norm": 422.0, "kl_loss_10": 95.76711997985839, "kl_loss_2": 1101.8473754882812, "kl_loss_3": 757.8740173339844, "kl_loss_7": 188.20162200927734, "learning_rate": 0.0002851025439554142, "loss": 532.7338, "step": 6450 }, { "ce_loss_10": 3.6879691004753115, "ce_loss_13": 3.6268020391464235, "ce_loss_2": 4.149470102787018, "ce_loss_3": 3.9827425360679625, "ce_loss_7": 3.732300865650177, "epoch": 0.646, "grad_norm": 432.0, "kl_loss_10": 96.89583930969238, "kl_loss_2": 1086.1058197021484, "kl_loss_3": 754.8961853027344, "kl_loss_7": 190.88655471801758, "learning_rate": 0.00028367098827674573, "loss": 531.1024, "step": 6460 }, { "ce_loss_10": 3.613504183292389, "ce_loss_13": 3.552918183803558, "ce_loss_2": 4.07694593667984, "ce_loss_3": 3.9072110176086428, "ce_loss_7": 3.656181883811951, "epoch": 0.647, "grad_norm": 382.0, "kl_loss_10": 95.70045394897461, "kl_loss_2": 1088.4426727294922, "kl_loss_3": 747.3143646240235, "kl_loss_7": 185.63362350463868, "learning_rate": 0.00028224161102882397, "loss": 534.1186, "step": 6470 }, { "ce_loss_10": 3.591862881183624, "ce_loss_13": 3.5325499296188356, "ce_loss_2": 4.047231125831604, "ce_loss_3": 3.8850304007530214, "ce_loss_7": 3.6327146530151366, "epoch": 0.648, "grad_norm": 398.0, "kl_loss_10": 97.32144050598144, "kl_loss_2": 1084.3862060546876, "kl_loss_3": 756.0506072998047, "kl_loss_7": 188.20642013549804, "learning_rate": 0.00028081442660546124, "loss": 534.4936, "step": 6480 }, { "ce_loss_10": 3.6528772950172423, "ce_loss_13": 3.593310809135437, "ce_loss_2": 4.104138958454132, "ce_loss_3": 3.940169370174408, "ce_loss_7": 3.6972940802574157, "epoch": 0.649, "grad_norm": 442.0, "kl_loss_10": 96.56869812011719, "kl_loss_2": 1082.232455444336, "kl_loss_3": 748.2576446533203, "kl_loss_7": 188.56612319946288, "learning_rate": 0.0002793894493783892, "loss": 535.3609, "step": 6490 }, { "ce_loss_10": 3.671093225479126, "ce_loss_13": 3.6125397443771363, "ce_loss_2": 4.120749580860138, "ce_loss_3": 3.957093584537506, "ce_loss_7": 3.715547430515289, "epoch": 0.65, "grad_norm": 340.0, "kl_loss_10": 95.52767143249511, "kl_loss_2": 1081.513833618164, "kl_loss_3": 750.0977233886719, "kl_loss_7": 185.41107177734375, "learning_rate": 0.0002779666936971129, "loss": 530.5015, "step": 6500 }, { "ce_loss_10": 3.6747244358062745, "ce_loss_13": 3.6157574892044066, "ce_loss_2": 4.147137761116028, "ce_loss_3": 3.9802316427230835, "ce_loss_7": 3.7200183868408203, "epoch": 0.651, "grad_norm": 388.0, "kl_loss_10": 96.378706741333, "kl_loss_2": 1104.2031311035157, "kl_loss_3": 768.3699279785156, "kl_loss_7": 190.13947677612305, "learning_rate": 0.00027654617388876614, "loss": 540.9622, "step": 6510 }, { "ce_loss_10": 3.7085010170936585, "ce_loss_13": 3.650082528591156, "ce_loss_2": 4.159732723236084, "ce_loss_3": 3.9939939975738525, "ce_loss_7": 3.752064514160156, "epoch": 0.652, "grad_norm": 372.0, "kl_loss_10": 98.8690299987793, "kl_loss_2": 1084.27646484375, "kl_loss_3": 749.1016296386719, "kl_loss_7": 189.19281463623048, "learning_rate": 0.0002751279042579672, "loss": 533.7532, "step": 6520 }, { "ce_loss_10": 3.6514885902404783, "ce_loss_13": 3.589630663394928, "ce_loss_2": 4.104155695438385, "ce_loss_3": 3.9368098855018614, "ce_loss_7": 3.696379566192627, "epoch": 0.653, "grad_norm": 388.0, "kl_loss_10": 98.10863304138184, "kl_loss_2": 1078.5175903320312, "kl_loss_3": 739.8918975830078, "kl_loss_7": 187.05665588378906, "learning_rate": 0.00027371189908667604, "loss": 535.8568, "step": 6530 }, { "ce_loss_10": 3.6950425028800966, "ce_loss_13": 3.6345377445220945, "ce_loss_2": 4.172570693492889, "ce_loss_3": 4.002642476558686, "ce_loss_7": 3.742088866233826, "epoch": 0.654, "grad_norm": 456.0, "kl_loss_10": 98.50621490478515, "kl_loss_2": 1120.8493408203126, "kl_loss_3": 772.4739196777343, "kl_loss_7": 194.52065811157226, "learning_rate": 0.00027229817263404863, "loss": 550.1683, "step": 6540 }, { "ce_loss_10": 3.678051483631134, "ce_loss_13": 3.6163152933120726, "ce_loss_2": 4.125236618518829, "ce_loss_3": 3.9632533311843874, "ce_loss_7": 3.717917835712433, "epoch": 0.655, "grad_norm": 354.0, "kl_loss_10": 97.52188301086426, "kl_loss_2": 1072.0729919433593, "kl_loss_3": 745.5059295654297, "kl_loss_7": 187.41375122070312, "learning_rate": 0.0002708867391362948, "loss": 530.4727, "step": 6550 }, { "ce_loss_10": 3.659157025814056, "ce_loss_13": 3.5987429141998293, "ce_loss_2": 4.098348212242127, "ce_loss_3": 3.9343943357467652, "ce_loss_7": 3.69932336807251, "epoch": 0.656, "grad_norm": 380.0, "kl_loss_10": 95.51490859985351, "kl_loss_2": 1048.09501953125, "kl_loss_3": 723.2193145751953, "kl_loss_7": 183.38801651000978, "learning_rate": 0.0002694776128065345, "loss": 526.4233, "step": 6560 }, { "ce_loss_10": 3.5926573395729067, "ce_loss_13": 3.5355629920959473, "ce_loss_2": 4.059596955776215, "ce_loss_3": 3.8947146415710447, "ce_loss_7": 3.63899849653244, "epoch": 0.657, "grad_norm": 302.0, "kl_loss_10": 94.25321388244629, "kl_loss_2": 1108.046826171875, "kl_loss_3": 769.1714508056641, "kl_loss_7": 190.54062194824218, "learning_rate": 0.00026807080783465374, "loss": 532.2117, "step": 6570 }, { "ce_loss_10": 3.7099499464035035, "ce_loss_13": 3.6470829010009767, "ce_loss_2": 4.173487448692322, "ce_loss_3": 4.007464277744293, "ce_loss_7": 3.753613090515137, "epoch": 0.658, "grad_norm": 336.0, "kl_loss_10": 98.83243751525879, "kl_loss_2": 1096.7148071289062, "kl_loss_3": 763.6604827880859, "kl_loss_7": 191.30890121459962, "learning_rate": 0.00026666633838716316, "loss": 542.1623, "step": 6580 }, { "ce_loss_10": 3.597714030742645, "ce_loss_13": 3.5341309905052185, "ce_loss_2": 4.0741772770881655, "ce_loss_3": 3.9031991958618164, "ce_loss_7": 3.64434130191803, "epoch": 0.659, "grad_norm": 418.0, "kl_loss_10": 98.79775390625, "kl_loss_2": 1119.104165649414, "kl_loss_3": 772.7665252685547, "kl_loss_7": 193.75399169921874, "learning_rate": 0.00026526421860705474, "loss": 546.4087, "step": 6590 }, { "ce_loss_10": 3.6211095809936524, "ce_loss_13": 3.56248060464859, "ce_loss_2": 4.090437388420105, "ce_loss_3": 3.9254501700401305, "ce_loss_7": 3.669628012180328, "epoch": 0.66, "grad_norm": 388.0, "kl_loss_10": 97.33003234863281, "kl_loss_2": 1100.579428100586, "kl_loss_3": 767.1163055419922, "kl_loss_7": 192.85016250610352, "learning_rate": 0.0002638644626136587, "loss": 535.0932, "step": 6600 }, { "ce_loss_10": 3.632294547557831, "ce_loss_13": 3.5736007690429688, "ce_loss_2": 4.098874115943909, "ce_loss_3": 3.928848695755005, "ce_loss_7": 3.6751357674598695, "epoch": 0.661, "grad_norm": 370.0, "kl_loss_10": 95.11613578796387, "kl_loss_2": 1096.4229095458984, "kl_loss_3": 759.0542449951172, "kl_loss_7": 188.92064208984374, "learning_rate": 0.00026246708450250255, "loss": 537.9207, "step": 6610 }, { "ce_loss_10": 3.6327243566513063, "ce_loss_13": 3.5709309697151186, "ce_loss_2": 4.086973357200622, "ce_loss_3": 3.9239420771598814, "ce_loss_7": 3.675078272819519, "epoch": 0.662, "grad_norm": 450.0, "kl_loss_10": 97.06436119079589, "kl_loss_2": 1079.41337890625, "kl_loss_3": 752.72802734375, "kl_loss_7": 187.51063842773436, "learning_rate": 0.00026107209834516854, "loss": 531.8906, "step": 6620 }, { "ce_loss_10": 3.5740899324417112, "ce_loss_13": 3.5152911067008974, "ce_loss_2": 4.057041144371032, "ce_loss_3": 3.8850310802459718, "ce_loss_7": 3.6205747365951537, "epoch": 0.663, "grad_norm": 326.0, "kl_loss_10": 95.74808731079102, "kl_loss_2": 1136.7873779296874, "kl_loss_3": 780.0463623046875, "kl_loss_7": 190.15955352783203, "learning_rate": 0.0002596795181891514, "loss": 547.2686, "step": 6630 }, { "ce_loss_10": 3.5901227831840514, "ce_loss_13": 3.527127909660339, "ce_loss_2": 4.062633895874024, "ce_loss_3": 3.8958073616027833, "ce_loss_7": 3.63388534784317, "epoch": 0.664, "grad_norm": 488.0, "kl_loss_10": 97.48413009643555, "kl_loss_2": 1119.4189453125, "kl_loss_3": 774.4207427978515, "kl_loss_7": 193.8588966369629, "learning_rate": 0.000258289358057718, "loss": 556.5954, "step": 6640 }, { "ce_loss_10": 3.6630045056343077, "ce_loss_13": 3.6010705709457396, "ce_loss_2": 4.126548099517822, "ce_loss_3": 3.960009717941284, "ce_loss_7": 3.70961674451828, "epoch": 0.665, "grad_norm": 368.0, "kl_loss_10": 97.2126693725586, "kl_loss_2": 1116.2655120849608, "kl_loss_3": 770.7855743408203, "kl_loss_7": 193.7609016418457, "learning_rate": 0.0002569016319497657, "loss": 544.2068, "step": 6650 }, { "ce_loss_10": 3.645352327823639, "ce_loss_13": 3.582920753955841, "ce_loss_2": 4.116545259952545, "ce_loss_3": 3.9502077460289002, "ce_loss_7": 3.6899593830108643, "epoch": 0.666, "grad_norm": 324.0, "kl_loss_10": 98.58149223327636, "kl_loss_2": 1127.1539520263673, "kl_loss_3": 778.5697784423828, "kl_loss_7": 194.4781005859375, "learning_rate": 0.00025551635383968066, "loss": 551.8321, "step": 6660 }, { "ce_loss_10": 3.5590095281600953, "ce_loss_13": 3.497633898258209, "ce_loss_2": 4.0256366491317745, "ce_loss_3": 3.8563454031944273, "ce_loss_7": 3.6033952236175537, "epoch": 0.667, "grad_norm": 386.0, "kl_loss_10": 96.00436630249024, "kl_loss_2": 1115.5439819335938, "kl_loss_3": 764.8407897949219, "kl_loss_7": 191.15278091430665, "learning_rate": 0.00025413353767719804, "loss": 541.5643, "step": 6670 }, { "ce_loss_10": 3.6135716080665587, "ce_loss_13": 3.556279420852661, "ce_loss_2": 4.074564230442047, "ce_loss_3": 3.9083084225654603, "ce_loss_7": 3.6589901089668273, "epoch": 0.668, "grad_norm": 404.0, "kl_loss_10": 95.40520133972169, "kl_loss_2": 1103.0668395996095, "kl_loss_3": 766.21494140625, "kl_loss_7": 187.07973251342773, "learning_rate": 0.0002527531973872617, "loss": 541.5821, "step": 6680 }, { "ce_loss_10": 3.630588722229004, "ce_loss_13": 3.5716015577316282, "ce_loss_2": 4.09862619638443, "ce_loss_3": 3.9337419509887694, "ce_loss_7": 3.6740004658699035, "epoch": 0.669, "grad_norm": 376.0, "kl_loss_10": 94.05056571960449, "kl_loss_2": 1104.580502319336, "kl_loss_3": 767.1347503662109, "kl_loss_7": 187.80085144042968, "learning_rate": 0.0002513753468698826, "loss": 536.7451, "step": 6690 }, { "ce_loss_10": 3.6005271077156067, "ce_loss_13": 3.538683819770813, "ce_loss_2": 4.075844824314117, "ce_loss_3": 3.901875948905945, "ce_loss_7": 3.6449614763259888, "epoch": 0.67, "grad_norm": 392.0, "kl_loss_10": 97.46344718933105, "kl_loss_2": 1117.6306915283203, "kl_loss_3": 769.393521118164, "kl_loss_7": 191.83680877685546, "learning_rate": 0.0002500000000000001, "loss": 543.8447, "step": 6700 }, { "ce_loss_10": 3.7194844245910645, "ce_loss_13": 3.6591498017311097, "ce_loss_2": 4.157877945899964, "ce_loss_3": 3.9965709686279296, "ce_loss_7": 3.7608611464500425, "epoch": 0.671, "grad_norm": 388.0, "kl_loss_10": 96.12382774353027, "kl_loss_2": 1059.211587524414, "kl_loss_3": 732.8135711669922, "kl_loss_7": 185.53207092285157, "learning_rate": 0.0002486271706273421, "loss": 540.9632, "step": 6710 }, { "ce_loss_10": 3.652998185157776, "ce_loss_13": 3.5960669040679933, "ce_loss_2": 4.096874964237213, "ce_loss_3": 3.930626368522644, "ce_loss_7": 3.694219136238098, "epoch": 0.672, "grad_norm": 370.0, "kl_loss_10": 96.1414752960205, "kl_loss_2": 1060.9839447021484, "kl_loss_3": 732.6356231689454, "kl_loss_7": 184.73310241699218, "learning_rate": 0.0002472568725762853, "loss": 531.8145, "step": 6720 }, { "ce_loss_10": 3.644508719444275, "ce_loss_13": 3.585316574573517, "ce_loss_2": 4.077662718296051, "ce_loss_3": 3.923126482963562, "ce_loss_7": 3.6880379915237427, "epoch": 0.673, "grad_norm": 536.0, "kl_loss_10": 95.44480400085449, "kl_loss_2": 1060.1810028076172, "kl_loss_3": 734.1040588378906, "kl_loss_7": 183.89718780517578, "learning_rate": 0.00024588911964571554, "loss": 524.9737, "step": 6730 }, { "ce_loss_10": 3.6595176219940186, "ce_loss_13": 3.5960793495178223, "ce_loss_2": 4.141416406631469, "ce_loss_3": 3.971626269817352, "ce_loss_7": 3.706479799747467, "epoch": 0.674, "grad_norm": 370.0, "kl_loss_10": 101.08820152282715, "kl_loss_2": 1123.6421142578124, "kl_loss_3": 779.8745697021484, "kl_loss_7": 196.79359664916993, "learning_rate": 0.00024452392560888974, "loss": 538.6094, "step": 6740 }, { "ce_loss_10": 3.5484472513198853, "ce_loss_13": 3.4903222799301146, "ce_loss_2": 4.00926810503006, "ce_loss_3": 3.837252104282379, "ce_loss_7": 3.5929391860961912, "epoch": 0.675, "grad_norm": 376.0, "kl_loss_10": 94.44077377319336, "kl_loss_2": 1104.6140991210937, "kl_loss_3": 759.8463775634766, "kl_loss_7": 187.49753799438477, "learning_rate": 0.00024316130421329695, "loss": 531.6798, "step": 6750 }, { "ce_loss_10": 3.63141074180603, "ce_loss_13": 3.5704286813735964, "ce_loss_2": 4.089890336990356, "ce_loss_3": 3.9222849130630495, "ce_loss_7": 3.6722644567489624, "epoch": 0.676, "grad_norm": 320.0, "kl_loss_10": 96.4859691619873, "kl_loss_2": 1072.7287811279298, "kl_loss_3": 740.4257781982421, "kl_loss_7": 185.37494659423828, "learning_rate": 0.00024180126918051909, "loss": 528.9844, "step": 6760 }, { "ce_loss_10": 3.6748690009117126, "ce_loss_13": 3.6154377579689028, "ce_loss_2": 4.126313555240631, "ce_loss_3": 3.959956741333008, "ce_loss_7": 3.719127857685089, "epoch": 0.677, "grad_norm": 494.0, "kl_loss_10": 95.71767883300781, "kl_loss_2": 1071.3604461669922, "kl_loss_3": 739.3463531494141, "kl_loss_7": 186.98586730957032, "learning_rate": 0.00024044383420609406, "loss": 526.4402, "step": 6770 }, { "ce_loss_10": 3.6849735140800477, "ce_loss_13": 3.6251555919647216, "ce_loss_2": 4.126254045963288, "ce_loss_3": 3.9655120730400086, "ce_loss_7": 3.7277087569236755, "epoch": 0.678, "grad_norm": 406.0, "kl_loss_10": 96.21127319335938, "kl_loss_2": 1065.4650268554688, "kl_loss_3": 737.1611053466797, "kl_loss_7": 186.31879425048828, "learning_rate": 0.00023908901295937712, "loss": 532.375, "step": 6780 }, { "ce_loss_10": 3.6866431832313538, "ce_loss_13": 3.621911180019379, "ce_loss_2": 4.138471448421479, "ce_loss_3": 3.9692311763763426, "ce_loss_7": 3.727970468997955, "epoch": 0.679, "grad_norm": 520.0, "kl_loss_10": 97.46222076416015, "kl_loss_2": 1075.2411163330078, "kl_loss_3": 742.8502899169922, "kl_loss_7": 187.16495361328126, "learning_rate": 0.00023773681908340283, "loss": 541.7315, "step": 6790 }, { "ce_loss_10": 3.6525588750839235, "ce_loss_13": 3.590035092830658, "ce_loss_2": 4.125091111660003, "ce_loss_3": 3.955258107185364, "ce_loss_7": 3.6996394038200378, "epoch": 0.68, "grad_norm": 448.0, "kl_loss_10": 100.11968383789062, "kl_loss_2": 1120.372329711914, "kl_loss_3": 775.7205535888672, "kl_loss_7": 195.07009201049806, "learning_rate": 0.00023638726619474876, "loss": 550.8879, "step": 6800 }, { "ce_loss_10": 3.6433764457702638, "ce_loss_13": 3.581800138950348, "ce_loss_2": 4.1252215027809145, "ce_loss_3": 3.95204918384552, "ce_loss_7": 3.68941251039505, "epoch": 0.681, "grad_norm": 380.0, "kl_loss_10": 94.89226531982422, "kl_loss_2": 1121.6464782714843, "kl_loss_3": 776.2536529541015, "kl_loss_7": 190.19580459594727, "learning_rate": 0.0002350403678833976, "loss": 540.7707, "step": 6810 }, { "ce_loss_10": 3.5702003121376036, "ce_loss_13": 3.509978950023651, "ce_loss_2": 4.041775238513947, "ce_loss_3": 3.871393322944641, "ce_loss_7": 3.6151094794273377, "epoch": 0.682, "grad_norm": 316.0, "kl_loss_10": 94.982954788208, "kl_loss_2": 1118.5872802734375, "kl_loss_3": 772.0714935302734, "kl_loss_7": 188.55085983276368, "learning_rate": 0.00023369613771260007, "loss": 536.8643, "step": 6820 }, { "ce_loss_10": 3.688840866088867, "ce_loss_13": 3.6270360946655273, "ce_loss_2": 4.156035900115967, "ce_loss_3": 3.9860677838325502, "ce_loss_7": 3.7326239466667177, "epoch": 0.683, "grad_norm": 410.0, "kl_loss_10": 97.82878112792969, "kl_loss_2": 1106.3897888183594, "kl_loss_3": 766.803921508789, "kl_loss_7": 191.37064056396486, "learning_rate": 0.00023235458921873925, "loss": 544.207, "step": 6830 }, { "ce_loss_10": 3.63765789270401, "ce_loss_13": 3.5765843272209166, "ce_loss_2": 4.12269172668457, "ce_loss_3": 3.953417754173279, "ce_loss_7": 3.6850703358650208, "epoch": 0.684, "grad_norm": 676.0, "kl_loss_10": 97.75669631958007, "kl_loss_2": 1147.8291046142579, "kl_loss_3": 799.1194305419922, "kl_loss_7": 195.58543319702147, "learning_rate": 0.0002310157359111938, "loss": 555.1555, "step": 6840 }, { "ce_loss_10": 3.526192367076874, "ce_loss_13": 3.4662320494651793, "ce_loss_2": 4.027907514572144, "ce_loss_3": 3.8482834458351136, "ce_loss_7": 3.574409317970276, "epoch": 0.685, "grad_norm": 660.0, "kl_loss_10": 96.51494178771972, "kl_loss_2": 1163.1898101806642, "kl_loss_3": 802.0491363525391, "kl_loss_7": 194.50169296264647, "learning_rate": 0.0002296795912722014, "loss": 551.9703, "step": 6850 }, { "ce_loss_10": 3.6707953572273255, "ce_loss_13": 3.6116589188575743, "ce_loss_2": 4.125709581375122, "ce_loss_3": 3.957431602478027, "ce_loss_7": 3.716504216194153, "epoch": 0.686, "grad_norm": 328.0, "kl_loss_10": 96.6977554321289, "kl_loss_2": 1086.6772430419921, "kl_loss_3": 747.0762786865234, "kl_loss_7": 188.86367645263672, "learning_rate": 0.0002283461687567236, "loss": 527.8294, "step": 6860 }, { "ce_loss_10": 3.727082335948944, "ce_loss_13": 3.664930725097656, "ce_loss_2": 4.172837960720062, "ce_loss_3": 4.010821652412415, "ce_loss_7": 3.7691392421722414, "epoch": 0.687, "grad_norm": 334.0, "kl_loss_10": 97.53575859069824, "kl_loss_2": 1058.4923736572266, "kl_loss_3": 731.6483947753907, "kl_loss_7": 186.02228698730468, "learning_rate": 0.00022701548179231045, "loss": 535.9072, "step": 6870 }, { "ce_loss_10": 3.6793978810310364, "ce_loss_13": 3.6168754935264587, "ce_loss_2": 4.133899199962616, "ce_loss_3": 3.9700045347213746, "ce_loss_7": 3.7239136338233947, "epoch": 0.688, "grad_norm": 382.0, "kl_loss_10": 98.03768157958984, "kl_loss_2": 1087.3397521972656, "kl_loss_3": 753.5451507568359, "kl_loss_7": 189.21656646728516, "learning_rate": 0.00022568754377896516, "loss": 530.6016, "step": 6880 }, { "ce_loss_10": 3.669530212879181, "ce_loss_13": 3.611078381538391, "ce_loss_2": 4.122839629650116, "ce_loss_3": 3.9565317392349244, "ce_loss_7": 3.7144492745399473, "epoch": 0.689, "grad_norm": 482.0, "kl_loss_10": 93.94465446472168, "kl_loss_2": 1092.5764556884765, "kl_loss_3": 757.9043579101562, "kl_loss_7": 189.06216201782226, "learning_rate": 0.00022436236808900844, "loss": 532.0287, "step": 6890 }, { "ce_loss_10": 3.563220775127411, "ce_loss_13": 3.505044734477997, "ce_loss_2": 4.028258430957794, "ce_loss_3": 3.860454273223877, "ce_loss_7": 3.6083375453948974, "epoch": 0.69, "grad_norm": 402.0, "kl_loss_10": 95.30224533081055, "kl_loss_2": 1114.9274475097657, "kl_loss_3": 768.1644836425781, "kl_loss_7": 189.04213485717773, "learning_rate": 0.00022303996806694487, "loss": 534.7889, "step": 6900 }, { "ce_loss_10": 3.646312749385834, "ce_loss_13": 3.5865816950798033, "ce_loss_2": 4.1086891174316404, "ce_loss_3": 3.9399857401847838, "ce_loss_7": 3.69192236661911, "epoch": 0.691, "grad_norm": 392.0, "kl_loss_10": 95.77762832641602, "kl_loss_2": 1094.2582000732423, "kl_loss_3": 756.6770172119141, "kl_loss_7": 187.92616500854493, "learning_rate": 0.00022172035702932823, "loss": 534.246, "step": 6910 }, { "ce_loss_10": 3.685254919528961, "ce_loss_13": 3.6261175990104677, "ce_loss_2": 4.142215931415558, "ce_loss_3": 3.9721115231513977, "ce_loss_7": 3.7271186470985413, "epoch": 0.692, "grad_norm": 430.0, "kl_loss_10": 94.89179420471191, "kl_loss_2": 1075.7997589111328, "kl_loss_3": 742.8703857421875, "kl_loss_7": 186.23582077026367, "learning_rate": 0.00022040354826462666, "loss": 530.2491, "step": 6920 }, { "ce_loss_10": 3.62452495098114, "ce_loss_13": 3.563087892532349, "ce_loss_2": 4.079807507991791, "ce_loss_3": 3.913197338581085, "ce_loss_7": 3.6697877049446106, "epoch": 0.693, "grad_norm": 410.0, "kl_loss_10": 96.51725845336914, "kl_loss_2": 1085.478707885742, "kl_loss_3": 750.6873352050782, "kl_loss_7": 187.0568748474121, "learning_rate": 0.0002190895550330899, "loss": 535.6979, "step": 6930 }, { "ce_loss_10": 3.547420835494995, "ce_loss_13": 3.488833248615265, "ce_loss_2": 4.036989772319794, "ce_loss_3": 3.85961799621582, "ce_loss_7": 3.598125493526459, "epoch": 0.694, "grad_norm": 406.0, "kl_loss_10": 96.3628433227539, "kl_loss_2": 1128.3786254882812, "kl_loss_3": 778.4836669921875, "kl_loss_7": 192.30058898925782, "learning_rate": 0.00021777839056661552, "loss": 534.9962, "step": 6940 }, { "ce_loss_10": 3.636169970035553, "ce_loss_13": 3.576909136772156, "ce_loss_2": 4.093018388748169, "ce_loss_3": 3.9319678425788878, "ce_loss_7": 3.682026994228363, "epoch": 0.695, "grad_norm": 380.0, "kl_loss_10": 95.15358619689941, "kl_loss_2": 1086.1379272460938, "kl_loss_3": 753.724154663086, "kl_loss_7": 185.95790100097656, "learning_rate": 0.0002164700680686147, "loss": 526.2859, "step": 6950 }, { "ce_loss_10": 3.6809890270233154, "ce_loss_13": 3.6225372910499574, "ce_loss_2": 4.135282206535339, "ce_loss_3": 3.9695199608802794, "ce_loss_7": 3.7249368906021116, "epoch": 0.696, "grad_norm": 400.0, "kl_loss_10": 96.4394718170166, "kl_loss_2": 1074.3540649414062, "kl_loss_3": 743.0920288085938, "kl_loss_7": 188.12129898071288, "learning_rate": 0.0002151646007138806, "loss": 527.0223, "step": 6960 }, { "ce_loss_10": 3.55483934879303, "ce_loss_13": 3.493414306640625, "ce_loss_2": 4.029416286945343, "ce_loss_3": 3.8593334913253785, "ce_loss_7": 3.5997050285339354, "epoch": 0.697, "grad_norm": 324.0, "kl_loss_10": 98.1744327545166, "kl_loss_2": 1119.6888793945313, "kl_loss_3": 776.6419464111328, "kl_loss_7": 191.90652236938476, "learning_rate": 0.00021386200164845526, "loss": 540.4315, "step": 6970 }, { "ce_loss_10": 3.7494669914245606, "ce_loss_13": 3.6868221879005434, "ce_loss_2": 4.180894982814789, "ce_loss_3": 4.02219043970108, "ce_loss_7": 3.790766155719757, "epoch": 0.698, "grad_norm": 386.0, "kl_loss_10": 98.89772605895996, "kl_loss_2": 1061.9671813964844, "kl_loss_3": 736.8194549560546, "kl_loss_7": 189.14059829711914, "learning_rate": 0.0002125622839894964, "loss": 526.3207, "step": 6980 }, { "ce_loss_10": 3.6859158158302305, "ce_loss_13": 3.626417076587677, "ce_loss_2": 4.136348474025726, "ce_loss_3": 3.974168133735657, "ce_loss_7": 3.7279628992080687, "epoch": 0.699, "grad_norm": 406.0, "kl_loss_10": 97.57818336486817, "kl_loss_2": 1081.921697998047, "kl_loss_3": 746.1339630126953, "kl_loss_7": 188.19551315307618, "learning_rate": 0.00021126546082514663, "loss": 529.5254, "step": 6990 }, { "ce_loss_10": 3.704355037212372, "ce_loss_13": 3.643582081794739, "ce_loss_2": 4.151243126392364, "ce_loss_3": 3.9851069808006288, "ce_loss_7": 3.747806203365326, "epoch": 0.7, "grad_norm": 394.0, "kl_loss_10": 97.80472221374512, "kl_loss_2": 1074.9452331542968, "kl_loss_3": 745.385775756836, "kl_loss_7": 188.936759185791, "learning_rate": 0.00020997154521440098, "loss": 526.4211, "step": 7000 }, { "ce_loss_10": 3.6455201506614685, "ce_loss_13": 3.586948239803314, "ce_loss_2": 4.104578590393066, "ce_loss_3": 3.9375877380371094, "ce_loss_7": 3.68754506111145, "epoch": 0.701, "grad_norm": 322.0, "kl_loss_10": 93.82002601623535, "kl_loss_2": 1085.8826141357422, "kl_loss_3": 746.0692993164063, "kl_loss_7": 184.4355583190918, "learning_rate": 0.0002086805501869749, "loss": 524.1356, "step": 7010 }, { "ce_loss_10": 3.6133246064186095, "ce_loss_13": 3.554938244819641, "ce_loss_2": 4.0853543996810915, "ce_loss_3": 3.918412721157074, "ce_loss_7": 3.6612335562705995, "epoch": 0.702, "grad_norm": 398.0, "kl_loss_10": 95.29999237060547, "kl_loss_2": 1131.5339111328126, "kl_loss_3": 781.2637298583984, "kl_loss_7": 192.70318984985352, "learning_rate": 0.0002073924887431744, "loss": 542.1648, "step": 7020 }, { "ce_loss_10": 3.619812881946564, "ce_loss_13": 3.561210036277771, "ce_loss_2": 4.088810133934021, "ce_loss_3": 3.9195892930030825, "ce_loss_7": 3.667060124874115, "epoch": 0.703, "grad_norm": 396.0, "kl_loss_10": 95.14918022155761, "kl_loss_2": 1112.4185638427734, "kl_loss_3": 769.590234375, "kl_loss_7": 188.17913894653321, "learning_rate": 0.00020610737385376348, "loss": 545.7339, "step": 7030 }, { "ce_loss_10": 3.689952182769775, "ce_loss_13": 3.629777657985687, "ce_loss_2": 4.127048587799072, "ce_loss_3": 3.9679968118667603, "ce_loss_7": 3.7309682607650756, "epoch": 0.704, "grad_norm": 480.0, "kl_loss_10": 96.72987632751465, "kl_loss_2": 1060.028268432617, "kl_loss_3": 736.1820068359375, "kl_loss_7": 185.3560775756836, "learning_rate": 0.00020482521845983521, "loss": 531.1421, "step": 7040 }, { "ce_loss_10": 3.681384038925171, "ce_loss_13": 3.6203475475311278, "ce_loss_2": 4.1394176363945006, "ce_loss_3": 3.9727881073951723, "ce_loss_7": 3.725051200389862, "epoch": 0.705, "grad_norm": 482.0, "kl_loss_10": 100.69121513366699, "kl_loss_2": 1089.9848724365233, "kl_loss_3": 754.3679351806641, "kl_loss_7": 192.38913803100587, "learning_rate": 0.00020354603547267987, "loss": 542.1912, "step": 7050 }, { "ce_loss_10": 3.667348313331604, "ce_loss_13": 3.605680251121521, "ce_loss_2": 4.1402019739151, "ce_loss_3": 3.971468675136566, "ce_loss_7": 3.712887394428253, "epoch": 0.706, "grad_norm": 364.0, "kl_loss_10": 97.05326614379882, "kl_loss_2": 1105.346597290039, "kl_loss_3": 773.4686828613281, "kl_loss_7": 191.13608169555664, "learning_rate": 0.00020226983777365604, "loss": 548.4642, "step": 7060 }, { "ce_loss_10": 3.563067603111267, "ce_loss_13": 3.504194128513336, "ce_loss_2": 4.040568280220032, "ce_loss_3": 3.8677730679512026, "ce_loss_7": 3.6067102789878844, "epoch": 0.707, "grad_norm": 338.0, "kl_loss_10": 92.14009590148926, "kl_loss_2": 1122.7782775878907, "kl_loss_3": 765.3169036865235, "kl_loss_7": 183.86895446777345, "learning_rate": 0.00020099663821406056, "loss": 534.7408, "step": 7070 }, { "ce_loss_10": 3.669863748550415, "ce_loss_13": 3.6097553610801696, "ce_loss_2": 4.117836952209473, "ce_loss_3": 3.955933165550232, "ce_loss_7": 3.7124067664146425, "epoch": 0.708, "grad_norm": 528.0, "kl_loss_10": 95.14625968933106, "kl_loss_2": 1064.9939758300782, "kl_loss_3": 737.6582244873047, "kl_loss_7": 184.71611633300782, "learning_rate": 0.00019972644961499853, "loss": 531.3339, "step": 7080 }, { "ce_loss_10": 3.635360848903656, "ce_loss_13": 3.575283741950989, "ce_loss_2": 4.107546412944794, "ce_loss_3": 3.9376320004463197, "ce_loss_7": 3.6813616275787355, "epoch": 0.709, "grad_norm": 454.0, "kl_loss_10": 95.76384582519532, "kl_loss_2": 1112.5157043457032, "kl_loss_3": 768.9019195556641, "kl_loss_7": 190.37624435424806, "learning_rate": 0.00019845928476725522, "loss": 537.9877, "step": 7090 }, { "ce_loss_10": 3.7167228937149046, "ce_loss_13": 3.654716455936432, "ce_loss_2": 4.171470665931702, "ce_loss_3": 4.006917369365692, "ce_loss_7": 3.763367462158203, "epoch": 0.71, "grad_norm": 402.0, "kl_loss_10": 97.96182098388672, "kl_loss_2": 1088.9804382324219, "kl_loss_3": 752.4143249511719, "kl_loss_7": 190.0522773742676, "learning_rate": 0.00019719515643116677, "loss": 545.6708, "step": 7100 }, { "ce_loss_10": 3.657674491405487, "ce_loss_13": 3.595584750175476, "ce_loss_2": 4.113815677165985, "ce_loss_3": 3.9436608791351317, "ce_loss_7": 3.700818693637848, "epoch": 0.711, "grad_norm": 354.0, "kl_loss_10": 97.26274185180664, "kl_loss_2": 1084.9519958496094, "kl_loss_3": 745.9836975097656, "kl_loss_7": 187.7238555908203, "learning_rate": 0.0001959340773364911, "loss": 535.516, "step": 7110 }, { "ce_loss_10": 3.6742369413375853, "ce_loss_13": 3.613626217842102, "ce_loss_2": 4.1353883981704715, "ce_loss_3": 3.9663340568542482, "ce_loss_7": 3.715994417667389, "epoch": 0.712, "grad_norm": 414.0, "kl_loss_10": 97.77620887756348, "kl_loss_2": 1094.1240295410157, "kl_loss_3": 755.1257873535156, "kl_loss_7": 188.97418975830078, "learning_rate": 0.0001946760601822809, "loss": 526.0803, "step": 7120 }, { "ce_loss_10": 3.724298870563507, "ce_loss_13": 3.6654844999313356, "ce_loss_2": 4.171249413490296, "ce_loss_3": 4.011460411548614, "ce_loss_7": 3.770449674129486, "epoch": 0.713, "grad_norm": 328.0, "kl_loss_10": 95.51984024047852, "kl_loss_2": 1076.5175323486328, "kl_loss_3": 741.9749359130859, "kl_loss_7": 187.3384910583496, "learning_rate": 0.00019342111763675512, "loss": 520.2061, "step": 7130 }, { "ce_loss_10": 3.730803680419922, "ce_loss_13": 3.6689053654670714, "ce_loss_2": 4.169312536716461, "ce_loss_3": 4.00542528629303, "ce_loss_7": 3.7727373957633974, "epoch": 0.714, "grad_norm": 418.0, "kl_loss_10": 99.5161979675293, "kl_loss_2": 1071.9742065429687, "kl_loss_3": 743.7749084472656, "kl_loss_7": 189.85234451293945, "learning_rate": 0.00019216926233717085, "loss": 525.6779, "step": 7140 }, { "ce_loss_10": 3.6117329597473145, "ce_loss_13": 3.5528572678565977, "ce_loss_2": 4.092064487934112, "ce_loss_3": 3.914566385746002, "ce_loss_7": 3.653049111366272, "epoch": 0.715, "grad_norm": 342.0, "kl_loss_10": 95.1452823638916, "kl_loss_2": 1135.5553619384766, "kl_loss_3": 779.0962982177734, "kl_loss_7": 185.5459442138672, "learning_rate": 0.00019092050688969737, "loss": 540.4428, "step": 7150 }, { "ce_loss_10": 3.6794282674789427, "ce_loss_13": 3.619647240638733, "ce_loss_2": 4.124801588058472, "ce_loss_3": 3.9644263625144958, "ce_loss_7": 3.7204025983810425, "epoch": 0.716, "grad_norm": 458.0, "kl_loss_10": 95.73797454833985, "kl_loss_2": 1075.4539825439454, "kl_loss_3": 743.3267883300781, "kl_loss_7": 186.0149787902832, "learning_rate": 0.00018967486386928817, "loss": 525.8811, "step": 7160 }, { "ce_loss_10": 3.5499155521392822, "ce_loss_13": 3.4892677664756775, "ce_loss_2": 4.026895833015442, "ce_loss_3": 3.8540278673171997, "ce_loss_7": 3.594646680355072, "epoch": 0.717, "grad_norm": 458.0, "kl_loss_10": 93.14333381652833, "kl_loss_2": 1122.9288635253906, "kl_loss_3": 776.9212982177735, "kl_loss_7": 188.66815719604492, "learning_rate": 0.00018843234581955443, "loss": 552.9929, "step": 7170 }, { "ce_loss_10": 3.574516201019287, "ce_loss_13": 3.512941229343414, "ce_loss_2": 4.049459004402161, "ce_loss_3": 3.871944236755371, "ce_loss_7": 3.6209982872009276, "epoch": 0.718, "grad_norm": 364.0, "kl_loss_10": 96.56784629821777, "kl_loss_2": 1129.7979248046875, "kl_loss_3": 775.4145477294921, "kl_loss_7": 190.88178558349608, "learning_rate": 0.00018719296525263924, "loss": 541.6241, "step": 7180 }, { "ce_loss_10": 3.6690776705741883, "ce_loss_13": 3.6084558844566343, "ce_loss_2": 4.109010553359985, "ce_loss_3": 3.944419741630554, "ce_loss_7": 3.7104645013809203, "epoch": 0.719, "grad_norm": 472.0, "kl_loss_10": 96.92717056274414, "kl_loss_2": 1058.7910217285157, "kl_loss_3": 728.63525390625, "kl_loss_7": 186.22266235351563, "learning_rate": 0.0001859567346490913, "loss": 525.3373, "step": 7190 }, { "ce_loss_10": 3.6438188314437867, "ce_loss_13": 3.5840962886810304, "ce_loss_2": 4.113380300998688, "ce_loss_3": 3.9464030385017397, "ce_loss_7": 3.690832197666168, "epoch": 0.72, "grad_norm": 372.0, "kl_loss_10": 96.38097648620605, "kl_loss_2": 1109.4217742919923, "kl_loss_3": 771.3218353271484, "kl_loss_7": 191.62188110351562, "learning_rate": 0.0001847236664577389, "loss": 531.0333, "step": 7200 }, { "ce_loss_10": 3.673705244064331, "ce_loss_13": 3.614805054664612, "ce_loss_2": 4.117141389846802, "ce_loss_3": 3.954344153404236, "ce_loss_7": 3.717172992229462, "epoch": 0.721, "grad_norm": 342.0, "kl_loss_10": 96.93136787414551, "kl_loss_2": 1071.9077087402343, "kl_loss_3": 737.0366821289062, "kl_loss_7": 186.3966079711914, "learning_rate": 0.00018349377309556487, "loss": 518.4113, "step": 7210 }, { "ce_loss_10": 3.609177756309509, "ce_loss_13": 3.5509839773178102, "ce_loss_2": 4.084410285949707, "ce_loss_3": 3.911909210681915, "ce_loss_7": 3.6546399116516115, "epoch": 0.722, "grad_norm": 436.0, "kl_loss_10": 94.82120094299316, "kl_loss_2": 1119.1944885253906, "kl_loss_3": 772.7051483154297, "kl_loss_7": 190.29311599731446, "learning_rate": 0.00018226706694758193, "loss": 539.7223, "step": 7220 }, { "ce_loss_10": 3.6862050175666807, "ce_loss_13": 3.6256973266601564, "ce_loss_2": 4.135941016674042, "ce_loss_3": 3.9752198338508604, "ce_loss_7": 3.7262799024581907, "epoch": 0.723, "grad_norm": 386.0, "kl_loss_10": 96.04033012390137, "kl_loss_2": 1079.298776245117, "kl_loss_3": 752.3606506347656, "kl_loss_7": 187.0266014099121, "learning_rate": 0.0001810435603667075, "loss": 540.3036, "step": 7230 }, { "ce_loss_10": 3.5322535395622254, "ce_loss_13": 3.4715150594711304, "ce_loss_2": 4.000997626781464, "ce_loss_3": 3.8283260583877565, "ce_loss_7": 3.5753297805786133, "epoch": 0.724, "grad_norm": 348.0, "kl_loss_10": 92.0587100982666, "kl_loss_2": 1101.032977294922, "kl_loss_3": 757.3754730224609, "kl_loss_7": 184.87646255493163, "learning_rate": 0.0001798232656736389, "loss": 539.9771, "step": 7240 }, { "ce_loss_10": 3.7180214405059813, "ce_loss_13": 3.6561784505844117, "ce_loss_2": 4.153665316104889, "ce_loss_3": 3.994912326335907, "ce_loss_7": 3.759207808971405, "epoch": 0.725, "grad_norm": 388.0, "kl_loss_10": 97.47655296325684, "kl_loss_2": 1060.039584350586, "kl_loss_3": 729.7286529541016, "kl_loss_7": 185.7909019470215, "learning_rate": 0.0001786061951567303, "loss": 527.9849, "step": 7250 }, { "ce_loss_10": 3.630312275886536, "ce_loss_13": 3.5694428086280823, "ce_loss_2": 4.091070818901062, "ce_loss_3": 3.92718985080719, "ce_loss_7": 3.675185751914978, "epoch": 0.726, "grad_norm": 382.0, "kl_loss_10": 97.81040573120117, "kl_loss_2": 1091.2934509277343, "kl_loss_3": 755.8922180175781, "kl_loss_7": 189.30439071655275, "learning_rate": 0.00017739236107186857, "loss": 537.2411, "step": 7260 }, { "ce_loss_10": 3.711188280582428, "ce_loss_13": 3.6525003552436828, "ce_loss_2": 4.142853522300721, "ce_loss_3": 3.981386995315552, "ce_loss_7": 3.7502527594566346, "epoch": 0.727, "grad_norm": 374.0, "kl_loss_10": 93.90410652160645, "kl_loss_2": 1048.1178436279297, "kl_loss_3": 725.0591918945313, "kl_loss_7": 182.22721328735352, "learning_rate": 0.00017618177564234904, "loss": 519.2631, "step": 7270 }, { "ce_loss_10": 3.693279492855072, "ce_loss_13": 3.6356263041496275, "ce_loss_2": 4.13202931880951, "ce_loss_3": 3.9750990748405455, "ce_loss_7": 3.7332441210746765, "epoch": 0.728, "grad_norm": 318.0, "kl_loss_10": 95.86821098327637, "kl_loss_2": 1048.5098999023437, "kl_loss_3": 724.8844573974609, "kl_loss_7": 182.79603576660156, "learning_rate": 0.00017497445105875377, "loss": 523.0468, "step": 7280 }, { "ce_loss_10": 3.595864677429199, "ce_loss_13": 3.5371819376945495, "ce_loss_2": 4.073407852649689, "ce_loss_3": 3.904122495651245, "ce_loss_7": 3.6426048040390016, "epoch": 0.729, "grad_norm": 442.0, "kl_loss_10": 95.08332710266113, "kl_loss_2": 1130.4158264160155, "kl_loss_3": 780.6070220947265, "kl_loss_7": 189.8589889526367, "learning_rate": 0.000173770399478828, "loss": 538.7581, "step": 7290 }, { "ce_loss_10": 3.5191142082214357, "ce_loss_13": 3.461543416976929, "ce_loss_2": 3.977211833000183, "ce_loss_3": 3.8103960871696474, "ce_loss_7": 3.564071011543274, "epoch": 0.73, "grad_norm": 438.0, "kl_loss_10": 93.54008331298829, "kl_loss_2": 1093.509115600586, "kl_loss_3": 757.6209930419922, "kl_loss_7": 186.89632568359374, "learning_rate": 0.0001725696330273575, "loss": 540.4559, "step": 7300 }, { "ce_loss_10": 3.714753472805023, "ce_loss_13": 3.6550832748413087, "ce_loss_2": 4.150299251079559, "ce_loss_3": 3.9939948439598085, "ce_loss_7": 3.757344377040863, "epoch": 0.731, "grad_norm": 486.0, "kl_loss_10": 93.61467895507812, "kl_loss_2": 1050.2083618164063, "kl_loss_3": 726.4699127197266, "kl_loss_7": 182.62665328979492, "learning_rate": 0.00017137216379604724, "loss": 517.0194, "step": 7310 }, { "ce_loss_10": 3.590583050251007, "ce_loss_13": 3.5311309576034544, "ce_loss_2": 4.051425302028656, "ce_loss_3": 3.8829818606376647, "ce_loss_7": 3.632352864742279, "epoch": 0.732, "grad_norm": 340.0, "kl_loss_10": 95.8599407196045, "kl_loss_2": 1085.3143981933595, "kl_loss_3": 747.0916809082031, "kl_loss_7": 186.49290466308594, "learning_rate": 0.00017017800384339925, "loss": 528.4002, "step": 7320 }, { "ce_loss_10": 3.540472662448883, "ce_loss_13": 3.4801993131637574, "ce_loss_2": 4.017971241474152, "ce_loss_3": 3.8469355702400208, "ce_loss_7": 3.586536169052124, "epoch": 0.733, "grad_norm": 316.0, "kl_loss_10": 95.24363555908204, "kl_loss_2": 1121.9350006103516, "kl_loss_3": 775.7258972167969, "kl_loss_7": 189.4253242492676, "learning_rate": 0.00016898716519459073, "loss": 528.2626, "step": 7330 }, { "ce_loss_10": 3.6674713015556337, "ce_loss_13": 3.608376145362854, "ce_loss_2": 4.144577407836914, "ce_loss_3": 3.9727422475814818, "ce_loss_7": 3.712773549556732, "epoch": 0.734, "grad_norm": 330.0, "kl_loss_10": 96.16988220214844, "kl_loss_2": 1116.4668975830077, "kl_loss_3": 767.9603485107422, "kl_loss_7": 191.9127670288086, "learning_rate": 0.00016779965984135375, "loss": 536.6205, "step": 7340 }, { "ce_loss_10": 3.5673499703407288, "ce_loss_13": 3.5097331523895265, "ce_loss_2": 4.023692965507507, "ce_loss_3": 3.8575591087341308, "ce_loss_7": 3.6114558935165406, "epoch": 0.735, "grad_norm": 478.0, "kl_loss_10": 92.66586227416992, "kl_loss_2": 1079.1628143310547, "kl_loss_3": 740.6051025390625, "kl_loss_7": 182.72610321044922, "learning_rate": 0.00016661549974185424, "loss": 528.04, "step": 7350 }, { "ce_loss_10": 3.612525475025177, "ce_loss_13": 3.5525230765342712, "ce_loss_2": 4.068535602092743, "ce_loss_3": 3.9024940848350527, "ce_loss_7": 3.6558452367782595, "epoch": 0.736, "grad_norm": 390.0, "kl_loss_10": 97.4712890625, "kl_loss_2": 1087.6514739990234, "kl_loss_3": 751.1584289550781, "kl_loss_7": 188.87088012695312, "learning_rate": 0.00016543469682057105, "loss": 524.4483, "step": 7360 }, { "ce_loss_10": 3.6394684672355653, "ce_loss_13": 3.579529356956482, "ce_loss_2": 4.096106541156769, "ce_loss_3": 3.930702245235443, "ce_loss_7": 3.6828288197517396, "epoch": 0.737, "grad_norm": 332.0, "kl_loss_10": 96.63297386169434, "kl_loss_2": 1092.361489868164, "kl_loss_3": 752.7328277587891, "kl_loss_7": 189.77932739257812, "learning_rate": 0.00016425726296817632, "loss": 533.2087, "step": 7370 }, { "ce_loss_10": 3.6602264523506163, "ce_loss_13": 3.6020756483078005, "ce_loss_2": 4.102893972396851, "ce_loss_3": 3.9389352679252623, "ce_loss_7": 3.702047073841095, "epoch": 0.738, "grad_norm": 604.0, "kl_loss_10": 95.1510066986084, "kl_loss_2": 1066.6962097167968, "kl_loss_3": 734.1273132324219, "kl_loss_7": 185.05731124877929, "learning_rate": 0.00016308321004141607, "loss": 524.9394, "step": 7380 }, { "ce_loss_10": 3.6052220940589903, "ce_loss_13": 3.544311022758484, "ce_loss_2": 4.074472200870514, "ce_loss_3": 3.905838668346405, "ce_loss_7": 3.6499088406562805, "epoch": 0.739, "grad_norm": 414.0, "kl_loss_10": 98.00579032897949, "kl_loss_2": 1091.213427734375, "kl_loss_3": 753.4314147949219, "kl_loss_7": 190.15870666503906, "learning_rate": 0.00016191254986299043, "loss": 528.1322, "step": 7390 }, { "ce_loss_10": 3.665621018409729, "ce_loss_13": 3.606168735027313, "ce_loss_2": 4.110114741325378, "ce_loss_3": 3.9419893980026246, "ce_loss_7": 3.7061524271965025, "epoch": 0.74, "grad_norm": 380.0, "kl_loss_10": 95.95707778930664, "kl_loss_2": 1084.0633728027344, "kl_loss_3": 743.1361236572266, "kl_loss_7": 184.58063583374025, "learning_rate": 0.00016074529422143398, "loss": 534.7291, "step": 7400 }, { "ce_loss_10": 3.5971511721611025, "ce_loss_13": 3.540179669857025, "ce_loss_2": 4.063095271587372, "ce_loss_3": 3.8901899337768553, "ce_loss_7": 3.6407782435417175, "epoch": 0.741, "grad_norm": 672.0, "kl_loss_10": 95.23762931823731, "kl_loss_2": 1107.8231140136718, "kl_loss_3": 756.5032379150391, "kl_loss_7": 187.1332000732422, "learning_rate": 0.0001595814548709983, "loss": 535.9396, "step": 7410 }, { "ce_loss_10": 3.6745630502700806, "ce_loss_13": 3.613684153556824, "ce_loss_2": 4.1425374269485475, "ce_loss_3": 3.9706888437271117, "ce_loss_7": 3.7216501116752623, "epoch": 0.742, "grad_norm": 372.0, "kl_loss_10": 97.69215469360351, "kl_loss_2": 1104.6529205322265, "kl_loss_3": 761.8667907714844, "kl_loss_7": 191.12793655395507, "learning_rate": 0.00015842104353153285, "loss": 536.9469, "step": 7420 }, { "ce_loss_10": 3.6906041502952576, "ce_loss_13": 3.6308916926383974, "ce_loss_2": 4.145760095119476, "ce_loss_3": 3.981899178028107, "ce_loss_7": 3.7335981249809267, "epoch": 0.743, "grad_norm": 418.0, "kl_loss_10": 97.18793029785157, "kl_loss_2": 1097.6078674316407, "kl_loss_3": 759.1226196289062, "kl_loss_7": 189.25363845825194, "learning_rate": 0.0001572640718883667, "loss": 543.1555, "step": 7430 }, { "ce_loss_10": 3.6231363296508787, "ce_loss_13": 3.564767360687256, "ce_loss_2": 4.071314561367035, "ce_loss_3": 3.91090784072876, "ce_loss_7": 3.664019286632538, "epoch": 0.744, "grad_norm": 320.0, "kl_loss_10": 94.90192832946778, "kl_loss_2": 1067.2735595703125, "kl_loss_3": 738.7796752929687, "kl_loss_7": 183.48248062133788, "learning_rate": 0.0001561105515921915, "loss": 533.524, "step": 7440 }, { "ce_loss_10": 3.463870346546173, "ce_loss_13": 3.4067335724830627, "ce_loss_2": 3.9477816224098206, "ce_loss_3": 3.7779128670692446, "ce_loss_7": 3.51077561378479, "epoch": 0.745, "grad_norm": 300.0, "kl_loss_10": 92.0508991241455, "kl_loss_2": 1123.8193664550781, "kl_loss_3": 770.2350646972657, "kl_loss_7": 184.63360900878905, "learning_rate": 0.0001549604942589441, "loss": 530.4723, "step": 7450 }, { "ce_loss_10": 3.6651261687278747, "ce_loss_13": 3.6062275648117064, "ce_loss_2": 4.092234718799591, "ce_loss_3": 3.9361136317253114, "ce_loss_7": 3.7055254936218263, "epoch": 0.746, "grad_norm": 366.0, "kl_loss_10": 93.61905822753906, "kl_loss_2": 1028.498812866211, "kl_loss_3": 711.0323303222656, "kl_loss_7": 180.76227340698242, "learning_rate": 0.00015381391146968864, "loss": 518.9042, "step": 7460 }, { "ce_loss_10": 3.6343637704849243, "ce_loss_13": 3.5772631406784057, "ce_loss_2": 4.097551655769348, "ce_loss_3": 3.9294708490371706, "ce_loss_7": 3.6792303323745728, "epoch": 0.747, "grad_norm": 348.0, "kl_loss_10": 93.67252769470215, "kl_loss_2": 1075.3313690185546, "kl_loss_3": 736.6811370849609, "kl_loss_7": 182.92275466918946, "learning_rate": 0.00015267081477050133, "loss": 529.2104, "step": 7470 }, { "ce_loss_10": 3.737002635002136, "ce_loss_13": 3.6760261058807373, "ce_loss_2": 4.184408628940583, "ce_loss_3": 4.020549094676971, "ce_loss_7": 3.779319405555725, "epoch": 0.748, "grad_norm": 314.0, "kl_loss_10": 97.9722526550293, "kl_loss_2": 1074.686865234375, "kl_loss_3": 738.4016967773438, "kl_loss_7": 188.9453094482422, "learning_rate": 0.00015153121567235335, "loss": 521.3269, "step": 7480 }, { "ce_loss_10": 3.627355396747589, "ce_loss_13": 3.566980814933777, "ce_loss_2": 4.087941682338714, "ce_loss_3": 3.9189056277275087, "ce_loss_7": 3.671427834033966, "epoch": 0.749, "grad_norm": 362.0, "kl_loss_10": 95.86229972839355, "kl_loss_2": 1099.5704315185546, "kl_loss_3": 757.0687835693359, "kl_loss_7": 188.21585922241212, "learning_rate": 0.00015039512565099468, "loss": 520.7597, "step": 7490 }, { "ce_loss_10": 3.6923457860946653, "ce_loss_13": 3.6337139129638674, "ce_loss_2": 4.142465770244598, "ce_loss_3": 3.9779353976249694, "ce_loss_7": 3.7360000610351562, "epoch": 0.75, "grad_norm": 400.0, "kl_loss_10": 96.83558921813965, "kl_loss_2": 1084.189535522461, "kl_loss_3": 748.4331817626953, "kl_loss_7": 188.31302337646486, "learning_rate": 0.00014926255614683932, "loss": 542.3, "step": 7500 }, { "ce_loss_10": 3.63236540555954, "ce_loss_13": 3.5743218302726745, "ce_loss_2": 4.084986460208893, "ce_loss_3": 3.9159162759780886, "ce_loss_7": 3.6776034474372863, "epoch": 0.751, "grad_norm": 356.0, "kl_loss_10": 95.49623985290528, "kl_loss_2": 1074.4479522705078, "kl_loss_3": 737.1827270507813, "kl_loss_7": 185.40160522460937, "learning_rate": 0.0001481335185648498, "loss": 533.0406, "step": 7510 }, { "ce_loss_10": 3.6419626474380493, "ce_loss_13": 3.583856701850891, "ce_loss_2": 4.0939129114151, "ce_loss_3": 3.9313414216041567, "ce_loss_7": 3.686499559879303, "epoch": 0.752, "grad_norm": 406.0, "kl_loss_10": 93.70109405517579, "kl_loss_2": 1078.2966064453126, "kl_loss_3": 747.9822265625, "kl_loss_7": 186.15133514404297, "learning_rate": 0.0001470080242744218, "loss": 523.242, "step": 7520 }, { "ce_loss_10": 3.638762640953064, "ce_loss_13": 3.5817859530448914, "ce_loss_2": 4.096928322315216, "ce_loss_3": 3.925868511199951, "ce_loss_7": 3.6821122765541077, "epoch": 0.753, "grad_norm": 304.0, "kl_loss_10": 92.91362495422364, "kl_loss_2": 1078.3225189208983, "kl_loss_3": 744.8880218505859, "kl_loss_7": 183.9945556640625, "learning_rate": 0.0001458860846092705, "loss": 532.4821, "step": 7530 }, { "ce_loss_10": 3.6720047116279604, "ce_loss_13": 3.6128148198127747, "ce_loss_2": 4.114215791225433, "ce_loss_3": 3.9525702714920046, "ce_loss_7": 3.7135818719863893, "epoch": 0.754, "grad_norm": 322.0, "kl_loss_10": 94.26252975463868, "kl_loss_2": 1064.2825866699218, "kl_loss_3": 735.7307800292969, "kl_loss_7": 183.32004623413087, "learning_rate": 0.00014476771086731566, "loss": 517.3908, "step": 7540 }, { "ce_loss_10": 3.7847033739089966, "ce_loss_13": 3.7219197750091553, "ce_loss_2": 4.230582165718078, "ce_loss_3": 4.065666139125824, "ce_loss_7": 3.827774000167847, "epoch": 0.755, "grad_norm": 430.0, "kl_loss_10": 99.63549118041992, "kl_loss_2": 1067.45849609375, "kl_loss_3": 732.3584259033203, "kl_loss_7": 187.05829620361328, "learning_rate": 0.00014365291431056872, "loss": 535.3279, "step": 7550 }, { "ce_loss_10": 3.6090814113616942, "ce_loss_13": 3.5493834733963014, "ce_loss_2": 4.077333819866181, "ce_loss_3": 3.906116855144501, "ce_loss_7": 3.652885007858276, "epoch": 0.756, "grad_norm": 460.0, "kl_loss_10": 97.59222984313965, "kl_loss_2": 1117.932635498047, "kl_loss_3": 769.9259338378906, "kl_loss_7": 192.52491149902343, "learning_rate": 0.00014254170616501827, "loss": 534.983, "step": 7560 }, { "ce_loss_10": 3.535455918312073, "ce_loss_13": 3.47601797580719, "ce_loss_2": 4.02073061466217, "ce_loss_3": 3.852824592590332, "ce_loss_7": 3.582290494441986, "epoch": 0.757, "grad_norm": 544.0, "kl_loss_10": 94.12142906188964, "kl_loss_2": 1137.807843017578, "kl_loss_3": 793.451205444336, "kl_loss_7": 191.15487823486328, "learning_rate": 0.0001414340976205183, "loss": 552.139, "step": 7570 }, { "ce_loss_10": 3.554329538345337, "ce_loss_13": 3.495155191421509, "ce_loss_2": 4.028338003158569, "ce_loss_3": 3.860016918182373, "ce_loss_7": 3.6010040402412415, "epoch": 0.758, "grad_norm": 392.0, "kl_loss_10": 94.82050590515136, "kl_loss_2": 1103.2837646484375, "kl_loss_3": 760.9699432373047, "kl_loss_7": 186.93385314941406, "learning_rate": 0.00014033009983067452, "loss": 536.1902, "step": 7580 }, { "ce_loss_10": 3.7230227828025817, "ce_loss_13": 3.663366961479187, "ce_loss_2": 4.157820415496826, "ce_loss_3": 3.9997562408447265, "ce_loss_7": 3.766176974773407, "epoch": 0.759, "grad_norm": 366.0, "kl_loss_10": 95.41510429382325, "kl_loss_2": 1045.2667877197266, "kl_loss_3": 721.8115600585937, "kl_loss_7": 183.35761260986328, "learning_rate": 0.00013922972391273224, "loss": 521.7405, "step": 7590 }, { "ce_loss_10": 3.726309287548065, "ce_loss_13": 3.666226303577423, "ce_loss_2": 4.173925065994263, "ce_loss_3": 4.007045650482178, "ce_loss_7": 3.7666892886161802, "epoch": 0.76, "grad_norm": 396.0, "kl_loss_10": 96.0021198272705, "kl_loss_2": 1064.7614837646483, "kl_loss_3": 734.0039642333984, "kl_loss_7": 185.6392059326172, "learning_rate": 0.0001381329809474649, "loss": 528.3375, "step": 7600 }, { "ce_loss_10": 3.621905469894409, "ce_loss_13": 3.561663830280304, "ce_loss_2": 4.098969185352326, "ce_loss_3": 3.925651717185974, "ce_loss_7": 3.6682042717933654, "epoch": 0.761, "grad_norm": 370.0, "kl_loss_10": 96.61415328979493, "kl_loss_2": 1119.761654663086, "kl_loss_3": 769.7445831298828, "kl_loss_7": 190.59917831420898, "learning_rate": 0.0001370398819790621, "loss": 540.338, "step": 7610 }, { "ce_loss_10": 3.7644327759742735, "ce_loss_13": 3.704228925704956, "ce_loss_2": 4.202455806732178, "ce_loss_3": 4.041428947448731, "ce_loss_7": 3.8075477123260497, "epoch": 0.762, "grad_norm": 424.0, "kl_loss_10": 97.06539382934571, "kl_loss_2": 1046.6303649902343, "kl_loss_3": 720.9399322509765, "kl_loss_7": 185.40132827758788, "learning_rate": 0.00013595043801501794, "loss": 512.6931, "step": 7620 }, { "ce_loss_10": 3.5539215803146362, "ce_loss_13": 3.4973302245140077, "ce_loss_2": 4.044493949413299, "ce_loss_3": 3.8687676310539247, "ce_loss_7": 3.602993667125702, "epoch": 0.763, "grad_norm": 468.0, "kl_loss_10": 92.99364700317383, "kl_loss_2": 1138.1223754882812, "kl_loss_3": 782.3796447753906, "kl_loss_7": 188.2281280517578, "learning_rate": 0.00013486466002602133, "loss": 539.5471, "step": 7630 }, { "ce_loss_10": 3.680443322658539, "ce_loss_13": 3.6184515833854674, "ce_loss_2": 4.119498157501221, "ce_loss_3": 3.9600594878196715, "ce_loss_7": 3.7241831541061403, "epoch": 0.764, "grad_norm": 376.0, "kl_loss_10": 97.24503707885742, "kl_loss_2": 1061.5150573730468, "kl_loss_3": 737.4153533935547, "kl_loss_7": 187.22406005859375, "learning_rate": 0.00013378255894584462, "loss": 537.8561, "step": 7640 }, { "ce_loss_10": 3.60829781293869, "ce_loss_13": 3.5466054916381835, "ce_loss_2": 4.072665071487426, "ce_loss_3": 3.9038659572601317, "ce_loss_7": 3.6548298835754394, "epoch": 0.765, "grad_norm": 380.0, "kl_loss_10": 95.1153465270996, "kl_loss_2": 1096.5119873046874, "kl_loss_3": 758.0185302734375, "kl_loss_7": 188.7285140991211, "learning_rate": 0.0001327041456712334, "loss": 535.4322, "step": 7650 }, { "ce_loss_10": 3.649807059764862, "ce_loss_13": 3.588579738140106, "ce_loss_2": 4.103657793998718, "ce_loss_3": 3.9434640645980834, "ce_loss_7": 3.6960434794425963, "epoch": 0.766, "grad_norm": 410.0, "kl_loss_10": 95.99581718444824, "kl_loss_2": 1095.5443603515625, "kl_loss_3": 758.2011474609375, "kl_loss_7": 189.6258804321289, "learning_rate": 0.00013162943106179747, "loss": 538.4857, "step": 7660 }, { "ce_loss_10": 3.627143681049347, "ce_loss_13": 3.5671829342842103, "ce_loss_2": 4.08168009519577, "ce_loss_3": 3.9202899813652037, "ce_loss_7": 3.6696593165397644, "epoch": 0.767, "grad_norm": 372.0, "kl_loss_10": 97.96165161132812, "kl_loss_2": 1080.916067504883, "kl_loss_3": 746.2386291503906, "kl_loss_7": 187.8828155517578, "learning_rate": 0.00013055842593990132, "loss": 529.1405, "step": 7670 }, { "ce_loss_10": 3.571021115779877, "ce_loss_13": 3.5149319171905518, "ce_loss_2": 4.027233076095581, "ce_loss_3": 3.864198935031891, "ce_loss_7": 3.6173386335372926, "epoch": 0.768, "grad_norm": 372.0, "kl_loss_10": 92.48302154541015, "kl_loss_2": 1072.3523834228515, "kl_loss_3": 740.25439453125, "kl_loss_7": 183.08441925048828, "learning_rate": 0.00012949114109055414, "loss": 533.8078, "step": 7680 }, { "ce_loss_10": 3.6176257848739626, "ce_loss_13": 3.5594166994094847, "ce_loss_2": 4.078605031967163, "ce_loss_3": 3.918487286567688, "ce_loss_7": 3.6636170506477357, "epoch": 0.769, "grad_norm": 422.0, "kl_loss_10": 94.60773849487305, "kl_loss_2": 1089.138235473633, "kl_loss_3": 757.3290557861328, "kl_loss_7": 187.67217483520508, "learning_rate": 0.00012842758726130281, "loss": 537.3952, "step": 7690 }, { "ce_loss_10": 3.655508840084076, "ce_loss_13": 3.5946714520454406, "ce_loss_2": 4.117365610599518, "ce_loss_3": 3.9561346530914308, "ce_loss_7": 3.7002153038978576, "epoch": 0.77, "grad_norm": 432.0, "kl_loss_10": 94.65040473937988, "kl_loss_2": 1092.9069885253907, "kl_loss_3": 757.1287445068359, "kl_loss_7": 189.29573440551758, "learning_rate": 0.00012736777516212267, "loss": 528.3388, "step": 7700 }, { "ce_loss_10": 3.65016793012619, "ce_loss_13": 3.5914124608039857, "ce_loss_2": 4.1151956677436825, "ce_loss_3": 3.947415459156036, "ce_loss_7": 3.6969300508499146, "epoch": 0.771, "grad_norm": 404.0, "kl_loss_10": 94.72591972351074, "kl_loss_2": 1095.3469024658202, "kl_loss_3": 757.773715209961, "kl_loss_7": 189.3510871887207, "learning_rate": 0.00012631171546530968, "loss": 527.5062, "step": 7710 }, { "ce_loss_10": 3.6695477604866027, "ce_loss_13": 3.6066803336143494, "ce_loss_2": 4.130255508422851, "ce_loss_3": 3.9629722952842714, "ce_loss_7": 3.7124558687210083, "epoch": 0.772, "grad_norm": 400.0, "kl_loss_10": 99.19231147766114, "kl_loss_2": 1089.8547271728517, "kl_loss_3": 754.8526977539062, "kl_loss_7": 189.7204719543457, "learning_rate": 0.00012525941880537307, "loss": 538.339, "step": 7720 }, { "ce_loss_10": 3.7045652866363525, "ce_loss_13": 3.6435051798820495, "ce_loss_2": 4.150338041782379, "ce_loss_3": 3.9872673988342284, "ce_loss_7": 3.7454243421554567, "epoch": 0.773, "grad_norm": 398.0, "kl_loss_10": 95.61402626037598, "kl_loss_2": 1061.4443786621093, "kl_loss_3": 733.3583831787109, "kl_loss_7": 185.768399810791, "learning_rate": 0.00012421089577892869, "loss": 524.5635, "step": 7730 }, { "ce_loss_10": 3.645431864261627, "ce_loss_13": 3.584313917160034, "ce_loss_2": 4.109975218772888, "ce_loss_3": 3.9383553504943847, "ce_loss_7": 3.6912776827812195, "epoch": 0.774, "grad_norm": 440.0, "kl_loss_10": 96.41397132873536, "kl_loss_2": 1098.874331665039, "kl_loss_3": 755.1620544433594, "kl_loss_7": 190.60089797973632, "learning_rate": 0.0001231661569445919, "loss": 536.2486, "step": 7740 }, { "ce_loss_10": 3.501088798046112, "ce_loss_13": 3.443252968788147, "ce_loss_2": 3.9620775461196898, "ce_loss_3": 3.795079970359802, "ce_loss_7": 3.5464309573173525, "epoch": 0.775, "grad_norm": 346.0, "kl_loss_10": 93.47399139404297, "kl_loss_2": 1090.8283447265626, "kl_loss_3": 754.4031158447266, "kl_loss_7": 186.22638092041015, "learning_rate": 0.00012212521282287093, "loss": 538.4937, "step": 7750 }, { "ce_loss_10": 3.6629942655563354, "ce_loss_13": 3.601106119155884, "ce_loss_2": 4.111618340015411, "ce_loss_3": 3.951659619808197, "ce_loss_7": 3.7078338623046876, "epoch": 0.776, "grad_norm": 364.0, "kl_loss_10": 98.37307014465333, "kl_loss_2": 1080.0280029296875, "kl_loss_3": 745.148388671875, "kl_loss_7": 190.13256072998047, "learning_rate": 0.00012108807389606158, "loss": 538.7029, "step": 7760 }, { "ce_loss_10": 3.659121203422546, "ce_loss_13": 3.6007887601852415, "ce_loss_2": 4.112268555164337, "ce_loss_3": 3.9502876162528993, "ce_loss_7": 3.7037811279296875, "epoch": 0.777, "grad_norm": 364.0, "kl_loss_10": 93.70635108947754, "kl_loss_2": 1072.3641204833984, "kl_loss_3": 740.7109130859375, "kl_loss_7": 182.99172821044922, "learning_rate": 0.00012005475060814159, "loss": 525.026, "step": 7770 }, { "ce_loss_10": 3.5951377630233763, "ce_loss_13": 3.5359464406967165, "ce_loss_2": 4.060847020149231, "ce_loss_3": 3.891322433948517, "ce_loss_7": 3.63891544342041, "epoch": 0.778, "grad_norm": 384.0, "kl_loss_10": 97.0392059326172, "kl_loss_2": 1106.707992553711, "kl_loss_3": 763.6160034179687, "kl_loss_7": 188.94908752441407, "learning_rate": 0.00011902525336466464, "loss": 535.4202, "step": 7780 }, { "ce_loss_10": 3.5829373240470885, "ce_loss_13": 3.5231135487556458, "ce_loss_2": 4.054291594028473, "ce_loss_3": 3.888161540031433, "ce_loss_7": 3.630410146713257, "epoch": 0.779, "grad_norm": 384.0, "kl_loss_10": 95.91268005371094, "kl_loss_2": 1108.9134979248047, "kl_loss_3": 768.8667724609375, "kl_loss_7": 190.86130905151367, "learning_rate": 0.00011799959253265668, "loss": 532.9367, "step": 7790 }, { "ce_loss_10": 3.646629250049591, "ce_loss_13": 3.584940028190613, "ce_loss_2": 4.100114536285401, "ce_loss_3": 3.9342658519744873, "ce_loss_7": 3.687722849845886, "epoch": 0.78, "grad_norm": 426.0, "kl_loss_10": 98.96642303466797, "kl_loss_2": 1093.9118621826171, "kl_loss_3": 757.5971832275391, "kl_loss_7": 190.95031204223633, "learning_rate": 0.00011697777844051105, "loss": 534.9413, "step": 7800 }, { "ce_loss_10": 3.6246392488479615, "ce_loss_13": 3.5636275887489317, "ce_loss_2": 4.0959463000297545, "ce_loss_3": 3.9209203004837034, "ce_loss_7": 3.668913960456848, "epoch": 0.781, "grad_norm": 394.0, "kl_loss_10": 96.37951927185058, "kl_loss_2": 1131.5390258789062, "kl_loss_3": 774.0704650878906, "kl_loss_7": 190.10399703979493, "learning_rate": 0.00011595982137788402, "loss": 539.5272, "step": 7810 }, { "ce_loss_10": 3.601748263835907, "ce_loss_13": 3.542947518825531, "ce_loss_2": 4.0462228655815125, "ce_loss_3": 3.884107196331024, "ce_loss_7": 3.6427837133407595, "epoch": 0.782, "grad_norm": 362.0, "kl_loss_10": 95.04786491394043, "kl_loss_2": 1064.3328094482422, "kl_loss_3": 734.7262878417969, "kl_loss_7": 183.74214706420898, "learning_rate": 0.00011494573159559212, "loss": 528.7992, "step": 7820 }, { "ce_loss_10": 3.587358093261719, "ce_loss_13": 3.5285757184028625, "ce_loss_2": 4.055095791816711, "ce_loss_3": 3.8850948452949523, "ce_loss_7": 3.6320362448692323, "epoch": 0.783, "grad_norm": 344.0, "kl_loss_10": 95.2613368988037, "kl_loss_2": 1092.221664428711, "kl_loss_3": 759.4220550537109, "kl_loss_7": 186.76042938232422, "learning_rate": 0.00011393551930550828, "loss": 541.8625, "step": 7830 }, { "ce_loss_10": 3.7354641199111938, "ce_loss_13": 3.6739312171936036, "ce_loss_2": 4.175600934028625, "ce_loss_3": 4.019279301166534, "ce_loss_7": 3.7783281922340395, "epoch": 0.784, "grad_norm": 390.0, "kl_loss_10": 99.59685325622559, "kl_loss_2": 1064.6414337158203, "kl_loss_3": 741.2587860107421, "kl_loss_7": 189.10858612060548, "learning_rate": 0.00011292919468045875, "loss": 527.9955, "step": 7840 }, { "ce_loss_10": 3.680347263813019, "ce_loss_13": 3.6196223735809325, "ce_loss_2": 4.128578865528107, "ce_loss_3": 3.9644781708717347, "ce_loss_7": 3.723640871047974, "epoch": 0.785, "grad_norm": 326.0, "kl_loss_10": 95.6224323272705, "kl_loss_2": 1072.9300354003906, "kl_loss_3": 746.2864379882812, "kl_loss_7": 187.85019607543944, "learning_rate": 0.00011192676785412154, "loss": 523.3404, "step": 7850 }, { "ce_loss_10": 3.622621536254883, "ce_loss_13": 3.560643196105957, "ce_loss_2": 4.089509451389313, "ce_loss_3": 3.9235698699951174, "ce_loss_7": 3.6674723744392397, "epoch": 0.786, "grad_norm": 458.0, "kl_loss_10": 96.80489120483398, "kl_loss_2": 1093.20048828125, "kl_loss_3": 754.2780883789062, "kl_loss_7": 187.94250411987304, "learning_rate": 0.00011092824892092374, "loss": 533.5229, "step": 7860 }, { "ce_loss_10": 3.547496974468231, "ce_loss_13": 3.4892043232917787, "ce_loss_2": 4.020435309410095, "ce_loss_3": 3.8508559226989747, "ce_loss_7": 3.5902876496315, "epoch": 0.787, "grad_norm": 322.0, "kl_loss_10": 94.49787139892578, "kl_loss_2": 1110.2376556396484, "kl_loss_3": 767.8008331298828, "kl_loss_7": 188.15859375, "learning_rate": 0.0001099336479359398, "loss": 532.4489, "step": 7870 }, { "ce_loss_10": 3.676584839820862, "ce_loss_13": 3.6199623942375183, "ce_loss_2": 4.124644804000854, "ce_loss_3": 3.9601522207260134, "ce_loss_7": 3.7184366583824158, "epoch": 0.788, "grad_norm": 414.0, "kl_loss_10": 92.98647613525391, "kl_loss_2": 1076.658267211914, "kl_loss_3": 737.3064331054687, "kl_loss_7": 183.75065536499022, "learning_rate": 0.00010894297491479043, "loss": 529.369, "step": 7880 }, { "ce_loss_10": 3.675907850265503, "ce_loss_13": 3.615241324901581, "ce_loss_2": 4.123448085784912, "ce_loss_3": 3.9602300405502318, "ce_loss_7": 3.715320038795471, "epoch": 0.789, "grad_norm": 370.0, "kl_loss_10": 97.27086067199707, "kl_loss_2": 1078.250909423828, "kl_loss_3": 741.1790222167969, "kl_loss_7": 186.16854553222657, "learning_rate": 0.00010795623983354214, "loss": 523.6978, "step": 7890 }, { "ce_loss_10": 3.549619424343109, "ce_loss_13": 3.492576813697815, "ce_loss_2": 4.021520948410034, "ce_loss_3": 3.8529414176940917, "ce_loss_7": 3.595447373390198, "epoch": 0.79, "grad_norm": 428.0, "kl_loss_10": 93.0215072631836, "kl_loss_2": 1113.914730834961, "kl_loss_3": 772.1699676513672, "kl_loss_7": 189.76142959594728, "learning_rate": 0.00010697345262860636, "loss": 533.2417, "step": 7900 }, { "ce_loss_10": 3.702609384059906, "ce_loss_13": 3.6431208491325378, "ce_loss_2": 4.14087233543396, "ce_loss_3": 3.9802441716194155, "ce_loss_7": 3.7457746505737304, "epoch": 0.791, "grad_norm": 368.0, "kl_loss_10": 97.61579055786133, "kl_loss_2": 1063.5964447021483, "kl_loss_3": 734.3944030761719, "kl_loss_7": 187.06654663085936, "learning_rate": 0.00010599462319663906, "loss": 520.0625, "step": 7910 }, { "ce_loss_10": 3.6748117208480835, "ce_loss_13": 3.614163410663605, "ce_loss_2": 4.111383318901062, "ce_loss_3": 3.951873278617859, "ce_loss_7": 3.715614116191864, "epoch": 0.792, "grad_norm": 382.0, "kl_loss_10": 94.54501228332519, "kl_loss_2": 1049.0091613769532, "kl_loss_3": 722.9781219482422, "kl_loss_7": 183.01754150390624, "learning_rate": 0.00010501976139444191, "loss": 518.3574, "step": 7920 }, { "ce_loss_10": 3.7049331426620484, "ce_loss_13": 3.6438170671463013, "ce_loss_2": 4.144390141963958, "ce_loss_3": 3.9876601815223696, "ce_loss_7": 3.745813262462616, "epoch": 0.793, "grad_norm": 370.0, "kl_loss_10": 97.8447940826416, "kl_loss_2": 1057.744808959961, "kl_loss_3": 730.5345703125, "kl_loss_7": 185.18996047973633, "learning_rate": 0.0001040488770388625, "loss": 527.8366, "step": 7930 }, { "ce_loss_10": 3.6446168065071105, "ce_loss_13": 3.5857683539390566, "ce_loss_2": 4.095709836483001, "ce_loss_3": 3.92866997718811, "ce_loss_7": 3.685992920398712, "epoch": 0.794, "grad_norm": 426.0, "kl_loss_10": 95.57501831054688, "kl_loss_2": 1080.6232208251954, "kl_loss_3": 746.1043212890625, "kl_loss_7": 186.66847763061523, "learning_rate": 0.00010308197990669538, "loss": 527.0882, "step": 7940 }, { "ce_loss_10": 3.7647696137428284, "ce_loss_13": 3.7019853234291076, "ce_loss_2": 4.21561850309372, "ce_loss_3": 4.0513708114624025, "ce_loss_7": 3.8064971685409548, "epoch": 0.795, "grad_norm": 356.0, "kl_loss_10": 100.9611873626709, "kl_loss_2": 1084.6148345947265, "kl_loss_3": 743.2166534423828, "kl_loss_7": 191.26584091186524, "learning_rate": 0.0001021190797345839, "loss": 525.7331, "step": 7950 }, { "ce_loss_10": 3.4792375445365904, "ce_loss_13": 3.4190258502960207, "ce_loss_2": 3.96710387468338, "ce_loss_3": 3.7957834005355835, "ce_loss_7": 3.528597414493561, "epoch": 0.796, "grad_norm": 386.0, "kl_loss_10": 95.0804401397705, "kl_loss_2": 1137.388375854492, "kl_loss_3": 792.2215118408203, "kl_loss_7": 192.50171508789063, "learning_rate": 0.00010116018621892236, "loss": 537.4441, "step": 7960 }, { "ce_loss_10": 3.6988709568977356, "ce_loss_13": 3.6362175583839416, "ce_loss_2": 4.151265692710877, "ce_loss_3": 3.9912821412086488, "ce_loss_7": 3.742702007293701, "epoch": 0.797, "grad_norm": 444.0, "kl_loss_10": 99.6129222869873, "kl_loss_2": 1100.607211303711, "kl_loss_3": 767.8290985107421, "kl_loss_7": 194.2897491455078, "learning_rate": 0.00010020530901575753, "loss": 526.4385, "step": 7970 }, { "ce_loss_10": 3.727276122570038, "ce_loss_13": 3.664809966087341, "ce_loss_2": 4.17646723985672, "ce_loss_3": 4.011640095710755, "ce_loss_7": 3.7683190941810607, "epoch": 0.798, "grad_norm": 334.0, "kl_loss_10": 98.68130950927734, "kl_loss_2": 1084.4167602539062, "kl_loss_3": 747.0828460693359, "kl_loss_7": 190.09516677856445, "learning_rate": 9.925445774069231e-05, "loss": 521.7054, "step": 7980 }, { "ce_loss_10": 3.677051067352295, "ce_loss_13": 3.6162899494171143, "ce_loss_2": 4.132367658615112, "ce_loss_3": 3.9699331760406493, "ce_loss_7": 3.723151159286499, "epoch": 0.799, "grad_norm": 340.0, "kl_loss_10": 97.4996379852295, "kl_loss_2": 1074.8818054199219, "kl_loss_3": 740.7804992675781, "kl_loss_7": 187.78277282714845, "learning_rate": 9.830764196878872e-05, "loss": 517.902, "step": 7990 }, { "ce_loss_10": 3.6140867948532103, "ce_loss_13": 3.556562864780426, "ce_loss_2": 4.0635038137435915, "ce_loss_3": 3.902656090259552, "ce_loss_7": 3.6608413100242614, "epoch": 0.8, "grad_norm": 410.0, "kl_loss_10": 94.1772445678711, "kl_loss_2": 1099.7673645019531, "kl_loss_3": 761.414794921875, "kl_loss_7": 186.34807205200195, "learning_rate": 9.736487123447069e-05, "loss": 531.4563, "step": 8000 }, { "ce_loss_10": 3.559322512149811, "ce_loss_13": 3.49820739030838, "ce_loss_2": 4.036343896389008, "ce_loss_3": 3.8618996500968934, "ce_loss_7": 3.6017415881156922, "epoch": 0.801, "grad_norm": 424.0, "kl_loss_10": 96.55318107604981, "kl_loss_2": 1136.456121826172, "kl_loss_3": 771.9989410400391, "kl_loss_7": 188.50249938964845, "learning_rate": 9.642615503142926e-05, "loss": 541.6381, "step": 8010 }, { "ce_loss_10": 3.630905735492706, "ce_loss_13": 3.5719484210014345, "ce_loss_2": 4.097460567951202, "ce_loss_3": 3.9188284277915955, "ce_loss_7": 3.673666751384735, "epoch": 0.802, "grad_norm": 370.0, "kl_loss_10": 94.45314712524414, "kl_loss_2": 1090.8831848144532, "kl_loss_3": 738.8009979248047, "kl_loss_7": 184.0514343261719, "learning_rate": 9.549150281252633e-05, "loss": 524.0769, "step": 8020 }, { "ce_loss_10": 3.658740258216858, "ce_loss_13": 3.598051357269287, "ce_loss_2": 4.112537753582001, "ce_loss_3": 3.9440460920333864, "ce_loss_7": 3.701529622077942, "epoch": 0.803, "grad_norm": 354.0, "kl_loss_10": 97.62285194396972, "kl_loss_2": 1076.1221923828125, "kl_loss_3": 742.6418304443359, "kl_loss_7": 187.46692276000977, "learning_rate": 9.4560923989699e-05, "loss": 531.6947, "step": 8030 }, { "ce_loss_10": 3.6491722106933593, "ce_loss_13": 3.5902853846549987, "ce_loss_2": 4.109341251850128, "ce_loss_3": 3.942945408821106, "ce_loss_7": 3.696093666553497, "epoch": 0.804, "grad_norm": 382.0, "kl_loss_10": 96.87751007080078, "kl_loss_2": 1089.1260498046875, "kl_loss_3": 751.9404052734375, "kl_loss_7": 188.3861946105957, "learning_rate": 9.363442793386607e-05, "loss": 538.5806, "step": 8040 }, { "ce_loss_10": 3.6259461641311646, "ce_loss_13": 3.5652650475502012, "ce_loss_2": 4.09434745311737, "ce_loss_3": 3.9288868069648744, "ce_loss_7": 3.670744836330414, "epoch": 0.805, "grad_norm": 436.0, "kl_loss_10": 96.23310775756836, "kl_loss_2": 1102.4481658935547, "kl_loss_3": 766.5739196777344, "kl_loss_7": 189.9322036743164, "learning_rate": 9.271202397483213e-05, "loss": 525.3384, "step": 8050 }, { "ce_loss_10": 3.64525443315506, "ce_loss_13": 3.587091565132141, "ce_loss_2": 4.088842356204987, "ce_loss_3": 3.926717495918274, "ce_loss_7": 3.6877028584480285, "epoch": 0.806, "grad_norm": 462.0, "kl_loss_10": 95.10493888854981, "kl_loss_2": 1064.438558959961, "kl_loss_3": 734.5970611572266, "kl_loss_7": 184.7579719543457, "learning_rate": 9.179372140119524e-05, "loss": 530.6901, "step": 8060 }, { "ce_loss_10": 3.59020277261734, "ce_loss_13": 3.531452512741089, "ce_loss_2": 4.036340653896332, "ce_loss_3": 3.8760047912597657, "ce_loss_7": 3.6337902188301086, "epoch": 0.807, "grad_norm": 432.0, "kl_loss_10": 94.00482330322265, "kl_loss_2": 1074.4489135742188, "kl_loss_3": 739.4833740234375, "kl_loss_7": 184.7809310913086, "learning_rate": 9.087952946025175e-05, "loss": 531.5049, "step": 8070 }, { "ce_loss_10": 3.7053560853004455, "ce_loss_13": 3.6452667355537414, "ce_loss_2": 4.136937665939331, "ce_loss_3": 3.9754079580307007, "ce_loss_7": 3.7457935094833372, "epoch": 0.808, "grad_norm": 368.0, "kl_loss_10": 96.12910385131836, "kl_loss_2": 1048.5191436767577, "kl_loss_3": 719.7487762451171, "kl_loss_7": 183.48829498291016, "learning_rate": 8.996945735790446e-05, "loss": 523.2327, "step": 8080 }, { "ce_loss_10": 3.602836012840271, "ce_loss_13": 3.542934799194336, "ce_loss_2": 4.055256414413452, "ce_loss_3": 3.8926199197769167, "ce_loss_7": 3.6462236762046816, "epoch": 0.809, "grad_norm": 414.0, "kl_loss_10": 95.67857933044434, "kl_loss_2": 1093.489208984375, "kl_loss_3": 759.0634765625, "kl_loss_7": 186.64484634399415, "learning_rate": 8.906351425856951e-05, "loss": 536.3948, "step": 8090 }, { "ce_loss_10": 3.586146354675293, "ce_loss_13": 3.5270805954933167, "ce_loss_2": 4.053403818607331, "ce_loss_3": 3.883652901649475, "ce_loss_7": 3.6302590370178223, "epoch": 0.81, "grad_norm": 328.0, "kl_loss_10": 96.12913436889649, "kl_loss_2": 1108.7147094726563, "kl_loss_3": 762.2885803222656, "kl_loss_7": 187.99051055908203, "learning_rate": 8.816170928508365e-05, "loss": 536.7299, "step": 8100 }, { "ce_loss_10": 3.5469899415969848, "ce_loss_13": 3.487591028213501, "ce_loss_2": 4.024684643745422, "ce_loss_3": 3.853050243854523, "ce_loss_7": 3.5918329834938048, "epoch": 0.811, "grad_norm": 424.0, "kl_loss_10": 95.16791305541992, "kl_loss_2": 1131.8392974853516, "kl_loss_3": 782.3692016601562, "kl_loss_7": 188.51590728759766, "learning_rate": 8.7264051518613e-05, "loss": 538.6139, "step": 8110 }, { "ce_loss_10": 3.639654815196991, "ce_loss_13": 3.583385097980499, "ce_loss_2": 4.081218779087067, "ce_loss_3": 3.9191540598869326, "ce_loss_7": 3.680349314212799, "epoch": 0.812, "grad_norm": 358.0, "kl_loss_10": 93.30685958862304, "kl_loss_2": 1057.4586822509766, "kl_loss_3": 735.9759002685547, "kl_loss_7": 182.97039413452148, "learning_rate": 8.637054999856148e-05, "loss": 526.1802, "step": 8120 }, { "ce_loss_10": 3.6243308544158936, "ce_loss_13": 3.5630579233169555, "ce_loss_2": 4.083577620983124, "ce_loss_3": 3.9160293340682983, "ce_loss_7": 3.6718581318855286, "epoch": 0.813, "grad_norm": 328.0, "kl_loss_10": 95.2622299194336, "kl_loss_2": 1086.6508239746095, "kl_loss_3": 748.3265411376954, "kl_loss_7": 187.44526748657228, "learning_rate": 8.548121372247918e-05, "loss": 536.2552, "step": 8130 }, { "ce_loss_10": 3.699293088912964, "ce_loss_13": 3.641393613815308, "ce_loss_2": 4.146343159675598, "ce_loss_3": 3.982176637649536, "ce_loss_7": 3.7424126982688906, "epoch": 0.814, "grad_norm": 420.0, "kl_loss_10": 97.64918098449706, "kl_loss_2": 1075.0233795166016, "kl_loss_3": 745.3918151855469, "kl_loss_7": 187.1306022644043, "learning_rate": 8.459605164597267e-05, "loss": 527.4509, "step": 8140 }, { "ce_loss_10": 3.5794180989265443, "ce_loss_13": 3.521663022041321, "ce_loss_2": 4.035482859611511, "ce_loss_3": 3.869397759437561, "ce_loss_7": 3.6230968952178957, "epoch": 0.815, "grad_norm": 322.0, "kl_loss_10": 93.84382820129395, "kl_loss_2": 1085.6336395263672, "kl_loss_3": 749.5215454101562, "kl_loss_7": 184.3967170715332, "learning_rate": 8.371507268261436e-05, "loss": 530.9717, "step": 8150 }, { "ce_loss_10": 3.6623859286308287, "ce_loss_13": 3.603581893444061, "ce_loss_2": 4.1160969018936155, "ce_loss_3": 3.9481249690055846, "ce_loss_7": 3.7034823894500732, "epoch": 0.816, "grad_norm": 410.0, "kl_loss_10": 96.0962978363037, "kl_loss_2": 1085.8551330566406, "kl_loss_3": 744.0009185791016, "kl_loss_7": 187.44638290405274, "learning_rate": 8.283828570385238e-05, "loss": 515.8468, "step": 8160 }, { "ce_loss_10": 3.6646664142608643, "ce_loss_13": 3.607030153274536, "ce_loss_2": 4.124508082866669, "ce_loss_3": 3.955708396434784, "ce_loss_7": 3.708679938316345, "epoch": 0.817, "grad_norm": 286.0, "kl_loss_10": 95.48198356628419, "kl_loss_2": 1068.3529357910156, "kl_loss_3": 737.6435119628907, "kl_loss_7": 186.3275260925293, "learning_rate": 8.196569953892202e-05, "loss": 525.6566, "step": 8170 }, { "ce_loss_10": 3.5752533435821534, "ce_loss_13": 3.5151426196098328, "ce_loss_2": 4.039277529716491, "ce_loss_3": 3.8700820326805117, "ce_loss_7": 3.6193170666694643, "epoch": 0.818, "grad_norm": 392.0, "kl_loss_10": 95.23657569885253, "kl_loss_2": 1087.7711944580078, "kl_loss_3": 748.5086303710938, "kl_loss_7": 185.79026489257814, "learning_rate": 8.109732297475635e-05, "loss": 529.4896, "step": 8180 }, { "ce_loss_10": 3.5442301869392394, "ce_loss_13": 3.48368262052536, "ce_loss_2": 4.041348910331726, "ce_loss_3": 3.8620414972305297, "ce_loss_7": 3.593292236328125, "epoch": 0.819, "grad_norm": 508.0, "kl_loss_10": 94.79218406677246, "kl_loss_2": 1140.4125610351562, "kl_loss_3": 788.5256622314453, "kl_loss_7": 192.41318969726564, "learning_rate": 8.023316475589754e-05, "loss": 543.2035, "step": 8190 }, { "ce_loss_10": 3.5104150652885435, "ce_loss_13": 3.44714834690094, "ce_loss_2": 4.0140674948692325, "ce_loss_3": 3.8308369159698485, "ce_loss_7": 3.5589245796203612, "epoch": 0.82, "grad_norm": 532.0, "kl_loss_10": 97.92351608276367, "kl_loss_2": 1158.8160186767577, "kl_loss_3": 797.9960662841797, "kl_loss_7": 195.1374740600586, "learning_rate": 7.937323358440934e-05, "loss": 549.9746, "step": 8200 }, { "ce_loss_10": 3.637300205230713, "ce_loss_13": 3.5789112567901613, "ce_loss_2": 4.087347877025604, "ce_loss_3": 3.923117625713348, "ce_loss_7": 3.679899263381958, "epoch": 0.821, "grad_norm": 404.0, "kl_loss_10": 95.01284561157226, "kl_loss_2": 1074.5766845703124, "kl_loss_3": 743.3249450683594, "kl_loss_7": 184.74501190185546, "learning_rate": 7.851753811978923e-05, "loss": 530.0879, "step": 8210 }, { "ce_loss_10": 3.661479341983795, "ce_loss_13": 3.6010610818862916, "ce_loss_2": 4.123578751087189, "ce_loss_3": 3.9517632484436036, "ce_loss_7": 3.7047110080718992, "epoch": 0.822, "grad_norm": 358.0, "kl_loss_10": 96.71367454528809, "kl_loss_2": 1091.6120025634766, "kl_loss_3": 744.3147399902343, "kl_loss_7": 186.59919815063478, "learning_rate": 7.766608697888095e-05, "loss": 527.9285, "step": 8220 }, { "ce_loss_10": 3.672685134410858, "ce_loss_13": 3.6110698223114013, "ce_loss_2": 4.123067581653595, "ce_loss_3": 3.9549397349357607, "ce_loss_7": 3.7160248041152952, "epoch": 0.823, "grad_norm": 428.0, "kl_loss_10": 99.5799617767334, "kl_loss_2": 1090.7132843017578, "kl_loss_3": 754.5721008300782, "kl_loss_7": 190.94887008666993, "learning_rate": 7.681888873578785e-05, "loss": 534.6821, "step": 8230 }, { "ce_loss_10": 3.599495697021484, "ce_loss_13": 3.5377328515052797, "ce_loss_2": 4.075003004074096, "ce_loss_3": 3.9027179360389708, "ce_loss_7": 3.6464317083358764, "epoch": 0.824, "grad_norm": 454.0, "kl_loss_10": 96.61878395080566, "kl_loss_2": 1113.7870971679688, "kl_loss_3": 766.2083129882812, "kl_loss_7": 191.40456848144532, "learning_rate": 7.597595192178702e-05, "loss": 531.8756, "step": 8240 }, { "ce_loss_10": 3.5937318563461305, "ce_loss_13": 3.5349967002868654, "ce_loss_2": 4.069689559936523, "ce_loss_3": 3.896022927761078, "ce_loss_7": 3.640897309780121, "epoch": 0.825, "grad_norm": 390.0, "kl_loss_10": 96.6520393371582, "kl_loss_2": 1123.3416778564454, "kl_loss_3": 772.9763427734375, "kl_loss_7": 191.78026428222657, "learning_rate": 7.513728502524286e-05, "loss": 540.9631, "step": 8250 }, { "ce_loss_10": 3.600663185119629, "ce_loss_13": 3.543354606628418, "ce_loss_2": 4.056336843967438, "ce_loss_3": 3.886526358127594, "ce_loss_7": 3.644607651233673, "epoch": 0.826, "grad_norm": 520.0, "kl_loss_10": 94.51933555603027, "kl_loss_2": 1071.4390838623046, "kl_loss_3": 737.6031066894532, "kl_loss_7": 182.459228515625, "learning_rate": 7.430289649152156e-05, "loss": 532.1943, "step": 8260 }, { "ce_loss_10": 3.4964008927345276, "ce_loss_13": 3.4386809706687926, "ce_loss_2": 3.979319155216217, "ce_loss_3": 3.806533432006836, "ce_loss_7": 3.5424695372581483, "epoch": 0.827, "grad_norm": 438.0, "kl_loss_10": 92.59819717407227, "kl_loss_2": 1138.140805053711, "kl_loss_3": 787.0873046875, "kl_loss_7": 188.89702301025392, "learning_rate": 7.347279472290646e-05, "loss": 536.0913, "step": 8270 }, { "ce_loss_10": 3.641860234737396, "ce_loss_13": 3.5819854736328125, "ce_loss_2": 4.100788974761963, "ce_loss_3": 3.9369661927223207, "ce_loss_7": 3.6862512946128847, "epoch": 0.828, "grad_norm": 404.0, "kl_loss_10": 96.73132438659668, "kl_loss_2": 1085.176287841797, "kl_loss_3": 756.2387023925781, "kl_loss_7": 187.64101333618163, "learning_rate": 7.264698807851328e-05, "loss": 532.8096, "step": 8280 }, { "ce_loss_10": 3.604352295398712, "ce_loss_13": 3.549103558063507, "ce_loss_2": 4.042815041542053, "ce_loss_3": 3.880488729476929, "ce_loss_7": 3.64413400888443, "epoch": 0.829, "grad_norm": 520.0, "kl_loss_10": 92.21053123474121, "kl_loss_2": 1057.8690124511718, "kl_loss_3": 728.9253723144532, "kl_loss_7": 181.22113647460938, "learning_rate": 7.182548487420554e-05, "loss": 524.6575, "step": 8290 }, { "ce_loss_10": 3.6577786207199097, "ce_loss_13": 3.597848916053772, "ce_loss_2": 4.107566392421722, "ce_loss_3": 3.947295570373535, "ce_loss_7": 3.703710603713989, "epoch": 0.83, "grad_norm": 286.0, "kl_loss_10": 96.30242042541504, "kl_loss_2": 1087.0319366455078, "kl_loss_3": 748.0092193603516, "kl_loss_7": 187.4295867919922, "learning_rate": 7.100829338251146e-05, "loss": 527.7667, "step": 8300 }, { "ce_loss_10": 3.5980669021606446, "ce_loss_13": 3.5371885776519774, "ce_loss_2": 4.070665979385376, "ce_loss_3": 3.8979653000831602, "ce_loss_7": 3.6431610107421877, "epoch": 0.831, "grad_norm": 394.0, "kl_loss_10": 95.44490776062011, "kl_loss_2": 1113.3842803955079, "kl_loss_3": 769.6158874511718, "kl_loss_7": 189.99929428100586, "learning_rate": 7.019542183254046e-05, "loss": 531.0445, "step": 8310 }, { "ce_loss_10": 3.6354474306106566, "ce_loss_13": 3.57179137468338, "ce_loss_2": 4.082340836524963, "ce_loss_3": 3.9207422971725463, "ce_loss_7": 3.6777117967605593, "epoch": 0.832, "grad_norm": 474.0, "kl_loss_10": 100.207564163208, "kl_loss_2": 1084.2285125732421, "kl_loss_3": 748.0254974365234, "kl_loss_7": 190.82402954101562, "learning_rate": 6.938687840989971e-05, "loss": 528.8804, "step": 8320 }, { "ce_loss_10": 3.5696911811828613, "ce_loss_13": 3.508439671993256, "ce_loss_2": 4.0291890621185305, "ce_loss_3": 3.8622559309005737, "ce_loss_7": 3.614106321334839, "epoch": 0.833, "grad_norm": 600.0, "kl_loss_10": 96.55842895507813, "kl_loss_2": 1082.4974243164063, "kl_loss_3": 748.5556121826172, "kl_loss_7": 188.75322189331055, "learning_rate": 6.858267125661271e-05, "loss": 531.4916, "step": 8330 }, { "ce_loss_10": 3.6338680744171143, "ce_loss_13": 3.575134778022766, "ce_loss_2": 4.0971689343452455, "ce_loss_3": 3.930681896209717, "ce_loss_7": 3.6769707798957825, "epoch": 0.834, "grad_norm": 418.0, "kl_loss_10": 93.3882438659668, "kl_loss_2": 1085.4937896728516, "kl_loss_3": 746.0253967285156, "kl_loss_7": 184.32117233276367, "learning_rate": 6.778280847103668e-05, "loss": 538.0241, "step": 8340 }, { "ce_loss_10": 3.6449947714805604, "ce_loss_13": 3.581918466091156, "ce_loss_2": 4.1008768558502195, "ce_loss_3": 3.937298035621643, "ce_loss_7": 3.686388850212097, "epoch": 0.835, "grad_norm": 290.0, "kl_loss_10": 98.43625144958496, "kl_loss_2": 1102.1855102539062, "kl_loss_3": 759.7929138183594, "kl_loss_7": 191.51789016723632, "learning_rate": 6.698729810778065e-05, "loss": 532.2951, "step": 8350 }, { "ce_loss_10": 3.5478424787521363, "ce_loss_13": 3.489585447311401, "ce_loss_2": 4.0140421986579895, "ce_loss_3": 3.8517470717430116, "ce_loss_7": 3.592922496795654, "epoch": 0.836, "grad_norm": 490.0, "kl_loss_10": 91.77609100341797, "kl_loss_2": 1092.1636932373046, "kl_loss_3": 756.2904968261719, "kl_loss_7": 183.14143447875978, "learning_rate": 6.619614817762538e-05, "loss": 531.3562, "step": 8360 }, { "ce_loss_10": 3.509856128692627, "ce_loss_13": 3.4520259737968444, "ce_loss_2": 4.005417215824127, "ce_loss_3": 3.8302616715431212, "ce_loss_7": 3.56083265542984, "epoch": 0.837, "grad_norm": 356.0, "kl_loss_10": 91.30384330749511, "kl_loss_2": 1146.0878509521485, "kl_loss_3": 788.8349487304688, "kl_loss_7": 189.73513488769532, "learning_rate": 6.540936664744196e-05, "loss": 543.0581, "step": 8370 }, { "ce_loss_10": 3.6644623279571533, "ce_loss_13": 3.6040658593177795, "ce_loss_2": 4.12789534330368, "ce_loss_3": 3.959988057613373, "ce_loss_7": 3.7062342405319213, "epoch": 0.838, "grad_norm": 366.0, "kl_loss_10": 97.38574295043945, "kl_loss_2": 1085.7984375, "kl_loss_3": 749.598519897461, "kl_loss_7": 188.30213012695313, "learning_rate": 6.462696144011149e-05, "loss": 525.3536, "step": 8380 }, { "ce_loss_10": 3.6138532400131225, "ce_loss_13": 3.5537376523017885, "ce_loss_2": 4.071477258205414, "ce_loss_3": 3.910947525501251, "ce_loss_7": 3.658327579498291, "epoch": 0.839, "grad_norm": 556.0, "kl_loss_10": 98.20170745849609, "kl_loss_2": 1090.382958984375, "kl_loss_3": 762.5471374511719, "kl_loss_7": 191.74814834594727, "learning_rate": 6.384894043444567e-05, "loss": 528.8093, "step": 8390 }, { "ce_loss_10": 3.644765245914459, "ce_loss_13": 3.585478734970093, "ce_loss_2": 4.109920060634613, "ce_loss_3": 3.9416786313056944, "ce_loss_7": 3.689965844154358, "epoch": 0.84, "grad_norm": 412.0, "kl_loss_10": 97.19089965820312, "kl_loss_2": 1101.7069030761718, "kl_loss_3": 757.5290496826171, "kl_loss_7": 188.98860778808594, "learning_rate": 6.307531146510753e-05, "loss": 529.2157, "step": 8400 }, { "ce_loss_10": 3.621027076244354, "ce_loss_13": 3.5618404507637025, "ce_loss_2": 4.0682983756065365, "ce_loss_3": 3.90874502658844, "ce_loss_7": 3.6661928296089172, "epoch": 0.841, "grad_norm": 384.0, "kl_loss_10": 95.90530738830566, "kl_loss_2": 1067.8680267333984, "kl_loss_3": 738.8968048095703, "kl_loss_7": 187.38672485351563, "learning_rate": 6.230608232253226e-05, "loss": 522.0211, "step": 8410 }, { "ce_loss_10": 3.5725093245506288, "ce_loss_13": 3.5133079648017884, "ce_loss_2": 4.052767491340637, "ce_loss_3": 3.8865469098091125, "ce_loss_7": 3.617572808265686, "epoch": 0.842, "grad_norm": 420.0, "kl_loss_10": 93.54998550415038, "kl_loss_2": 1118.0941436767578, "kl_loss_3": 779.2841003417968, "kl_loss_7": 188.06975250244142, "learning_rate": 6.154126075284855e-05, "loss": 530.6581, "step": 8420 }, { "ce_loss_10": 3.6709149718284606, "ce_loss_13": 3.610610318183899, "ce_loss_2": 4.11589070558548, "ce_loss_3": 3.958199071884155, "ce_loss_7": 3.7119770526885985, "epoch": 0.843, "grad_norm": 360.0, "kl_loss_10": 93.72929344177246, "kl_loss_2": 1052.0708984375, "kl_loss_3": 727.1819213867187, "kl_loss_7": 182.0021545410156, "learning_rate": 6.078085445780129e-05, "loss": 515.5865, "step": 8430 }, { "ce_loss_10": 3.678613018989563, "ce_loss_13": 3.6185575127601624, "ce_loss_2": 4.138859879970551, "ce_loss_3": 3.970304036140442, "ce_loss_7": 3.7233882188796996, "epoch": 0.844, "grad_norm": 708.0, "kl_loss_10": 96.56619453430176, "kl_loss_2": 1092.8436309814454, "kl_loss_3": 748.7821746826172, "kl_loss_7": 187.36514282226562, "learning_rate": 6.002487109467347e-05, "loss": 524.9962, "step": 8440 }, { "ce_loss_10": 3.681882548332214, "ce_loss_13": 3.623554539680481, "ce_loss_2": 4.131060492992401, "ce_loss_3": 3.969043660163879, "ce_loss_7": 3.7261468291282656, "epoch": 0.845, "grad_norm": 498.0, "kl_loss_10": 95.19795646667481, "kl_loss_2": 1083.3428985595704, "kl_loss_3": 748.7116729736329, "kl_loss_7": 188.84120330810546, "learning_rate": 5.927331827620902e-05, "loss": 524.2234, "step": 8450 }, { "ce_loss_10": 3.671555197238922, "ce_loss_13": 3.6144081234931944, "ce_loss_2": 4.109152019023895, "ce_loss_3": 3.957107722759247, "ce_loss_7": 3.7151288032531737, "epoch": 0.846, "grad_norm": 384.0, "kl_loss_10": 92.54770011901856, "kl_loss_2": 1047.1174011230469, "kl_loss_3": 728.4162536621094, "kl_loss_7": 183.04834442138673, "learning_rate": 5.852620357053651e-05, "loss": 522.9391, "step": 8460 }, { "ce_loss_10": 3.7129202485084534, "ce_loss_13": 3.65321398973465, "ce_loss_2": 4.155979669094085, "ce_loss_3": 3.9961599469184876, "ce_loss_7": 3.7558295488357545, "epoch": 0.847, "grad_norm": 432.0, "kl_loss_10": 94.81909484863282, "kl_loss_2": 1067.3740447998048, "kl_loss_3": 736.2771881103515, "kl_loss_7": 184.3846176147461, "learning_rate": 5.778353450109286e-05, "loss": 523.3945, "step": 8470 }, { "ce_loss_10": 3.7526662349700928, "ce_loss_13": 3.6899970173835754, "ce_loss_2": 4.2024567246437075, "ce_loss_3": 4.037352812290192, "ce_loss_7": 3.7961275696754457, "epoch": 0.848, "grad_norm": 420.0, "kl_loss_10": 98.8898868560791, "kl_loss_2": 1083.7428894042969, "kl_loss_3": 747.87919921875, "kl_loss_7": 190.12581558227538, "learning_rate": 5.7045318546547206e-05, "loss": 528.6064, "step": 8480 }, { "ce_loss_10": 3.6435152888298035, "ce_loss_13": 3.5820479154586793, "ce_loss_2": 4.10130136013031, "ce_loss_3": 3.9336646437644958, "ce_loss_7": 3.6865146279335024, "epoch": 0.849, "grad_norm": 476.0, "kl_loss_10": 97.09412269592285, "kl_loss_2": 1097.005093383789, "kl_loss_3": 757.3569030761719, "kl_loss_7": 187.13169021606444, "learning_rate": 5.631156314072605e-05, "loss": 526.7981, "step": 8490 }, { "ce_loss_10": 3.6548070907592773, "ce_loss_13": 3.5959606409072875, "ce_loss_2": 4.090519487857819, "ce_loss_3": 3.9302281975746154, "ce_loss_7": 3.6990505933761595, "epoch": 0.85, "grad_norm": 348.0, "kl_loss_10": 94.60167617797852, "kl_loss_2": 1058.567938232422, "kl_loss_3": 726.6986267089844, "kl_loss_7": 182.6941146850586, "learning_rate": 5.5582275672538315e-05, "loss": 518.2773, "step": 8500 }, { "ce_loss_10": 3.5718761324882506, "ce_loss_13": 3.510132133960724, "ce_loss_2": 4.058491265773773, "ce_loss_3": 3.8868750095367433, "ce_loss_7": 3.62018061876297, "epoch": 0.851, "grad_norm": 356.0, "kl_loss_10": 98.47408905029297, "kl_loss_2": 1129.9293365478516, "kl_loss_3": 782.1435455322265, "kl_loss_7": 191.806551361084, "learning_rate": 5.4857463485900484e-05, "loss": 540.5649, "step": 8510 }, { "ce_loss_10": 3.626720643043518, "ce_loss_13": 3.5688146710395814, "ce_loss_2": 4.081609988212586, "ce_loss_3": 3.9117035031318665, "ce_loss_7": 3.673699951171875, "epoch": 0.852, "grad_norm": 392.0, "kl_loss_10": 94.4161319732666, "kl_loss_2": 1082.976022338867, "kl_loss_3": 743.9283477783204, "kl_loss_7": 185.5842658996582, "learning_rate": 5.413713387966329e-05, "loss": 525.7675, "step": 8520 }, { "ce_loss_10": 3.6495197653770446, "ce_loss_13": 3.5870252728462217, "ce_loss_2": 4.1089702367782595, "ce_loss_3": 3.943737292289734, "ce_loss_7": 3.6925705909729003, "epoch": 0.853, "grad_norm": 560.0, "kl_loss_10": 99.9091007232666, "kl_loss_2": 1091.3887969970704, "kl_loss_3": 754.8269989013672, "kl_loss_7": 190.51073608398437, "learning_rate": 5.34212941075381e-05, "loss": 533.712, "step": 8530 }, { "ce_loss_10": 3.6638750314712523, "ce_loss_13": 3.603909599781036, "ce_loss_2": 4.105106854438782, "ce_loss_3": 3.939826285839081, "ce_loss_7": 3.703915464878082, "epoch": 0.854, "grad_norm": 324.0, "kl_loss_10": 94.93586730957031, "kl_loss_2": 1060.2898712158203, "kl_loss_3": 729.1602386474609, "kl_loss_7": 183.2039321899414, "learning_rate": 5.270995137802315e-05, "loss": 520.0254, "step": 8540 }, { "ce_loss_10": 3.586125075817108, "ce_loss_13": 3.530829107761383, "ce_loss_2": 4.0409599304199215, "ce_loss_3": 3.876398241519928, "ce_loss_7": 3.6288790106773376, "epoch": 0.855, "grad_norm": 390.0, "kl_loss_10": 92.31447868347168, "kl_loss_2": 1091.2599792480469, "kl_loss_3": 750.2804168701172, "kl_loss_7": 184.4141700744629, "learning_rate": 5.2003112854332125e-05, "loss": 530.1402, "step": 8550 }, { "ce_loss_10": 3.592084896564484, "ce_loss_13": 3.5318885922431944, "ce_loss_2": 4.045030009746552, "ce_loss_3": 3.8797095656394958, "ce_loss_7": 3.6342476487159727, "epoch": 0.856, "grad_norm": 410.0, "kl_loss_10": 95.16406364440918, "kl_loss_2": 1083.518502807617, "kl_loss_3": 746.9155914306641, "kl_loss_7": 184.60284118652345, "learning_rate": 5.130078565432089e-05, "loss": 519.0631, "step": 8560 }, { "ce_loss_10": 3.6698386430740357, "ce_loss_13": 3.611102557182312, "ce_loss_2": 4.1041951179504395, "ce_loss_3": 3.9457595467567446, "ce_loss_7": 3.714687442779541, "epoch": 0.857, "grad_norm": 330.0, "kl_loss_10": 94.41157264709473, "kl_loss_2": 1066.6546508789063, "kl_loss_3": 732.30849609375, "kl_loss_7": 183.59521484375, "learning_rate": 5.060297685041659e-05, "loss": 515.5307, "step": 8570 }, { "ce_loss_10": 3.594843864440918, "ce_loss_13": 3.535090386867523, "ce_loss_2": 4.058831119537354, "ce_loss_3": 3.8907560467720033, "ce_loss_7": 3.6390093684196474, "epoch": 0.858, "grad_norm": 396.0, "kl_loss_10": 97.14489707946777, "kl_loss_2": 1100.07861328125, "kl_loss_3": 757.8477020263672, "kl_loss_7": 190.17505111694337, "learning_rate": 4.99096934695461e-05, "loss": 537.0569, "step": 8580 }, { "ce_loss_10": 3.655477023124695, "ce_loss_13": 3.592752683162689, "ce_loss_2": 4.114116084575653, "ce_loss_3": 3.950313460826874, "ce_loss_7": 3.6980414509773256, "epoch": 0.859, "grad_norm": 370.0, "kl_loss_10": 96.66123657226562, "kl_loss_2": 1076.5634460449219, "kl_loss_3": 745.2082977294922, "kl_loss_7": 186.95159301757812, "learning_rate": 4.922094249306558e-05, "loss": 520.1718, "step": 8590 }, { "ce_loss_10": 3.677726352214813, "ce_loss_13": 3.6172829270362854, "ce_loss_2": 4.126979196071625, "ce_loss_3": 3.9645047903060915, "ce_loss_7": 3.7215185284614565, "epoch": 0.86, "grad_norm": 392.0, "kl_loss_10": 96.89525718688965, "kl_loss_2": 1065.1883819580078, "kl_loss_3": 740.1956573486328, "kl_loss_7": 187.83882064819335, "learning_rate": 4.853673085668947e-05, "loss": 516.6985, "step": 8600 }, { "ce_loss_10": 3.707137334346771, "ce_loss_13": 3.6448033452033997, "ce_loss_2": 4.162192296981812, "ce_loss_3": 3.993678319454193, "ce_loss_7": 3.7496466279029845, "epoch": 0.861, "grad_norm": 370.0, "kl_loss_10": 98.02176780700684, "kl_loss_2": 1078.1511993408203, "kl_loss_3": 739.8441162109375, "kl_loss_7": 186.5592399597168, "learning_rate": 4.78570654504214e-05, "loss": 529.6101, "step": 8610 }, { "ce_loss_10": 3.6458049774169923, "ce_loss_13": 3.5854872465133667, "ce_loss_2": 4.110537803173065, "ce_loss_3": 3.938798224925995, "ce_loss_7": 3.6893723726272585, "epoch": 0.862, "grad_norm": 414.0, "kl_loss_10": 94.25516128540039, "kl_loss_2": 1104.6271423339845, "kl_loss_3": 758.221337890625, "kl_loss_7": 185.93933029174804, "learning_rate": 4.7181953118484556e-05, "loss": 535.9025, "step": 8620 }, { "ce_loss_10": 3.6774216413497927, "ce_loss_13": 3.6180386185646056, "ce_loss_2": 4.12672735452652, "ce_loss_3": 3.962115204334259, "ce_loss_7": 3.720357131958008, "epoch": 0.863, "grad_norm": 356.0, "kl_loss_10": 95.34017066955566, "kl_loss_2": 1068.0610900878905, "kl_loss_3": 737.2169891357422, "kl_loss_7": 185.36345138549805, "learning_rate": 4.651140065925269e-05, "loss": 530.0095, "step": 8630 }, { "ce_loss_10": 3.609228265285492, "ce_loss_13": 3.5492658615112305, "ce_loss_2": 4.060226953029632, "ce_loss_3": 3.895670175552368, "ce_loss_7": 3.6542355179786683, "epoch": 0.864, "grad_norm": 360.0, "kl_loss_10": 96.95414390563965, "kl_loss_2": 1087.1394622802734, "kl_loss_3": 748.6742889404297, "kl_loss_7": 188.45738372802734, "learning_rate": 4.58454148251814e-05, "loss": 535.7555, "step": 8640 }, { "ce_loss_10": 3.6290027260780335, "ce_loss_13": 3.566804575920105, "ce_loss_2": 4.098408913612365, "ce_loss_3": 3.928418016433716, "ce_loss_7": 3.673435080051422, "epoch": 0.865, "grad_norm": 352.0, "kl_loss_10": 97.77750358581542, "kl_loss_2": 1105.780810546875, "kl_loss_3": 762.838412475586, "kl_loss_7": 187.93626327514647, "learning_rate": 4.518400232274078e-05, "loss": 530.3719, "step": 8650 }, { "ce_loss_10": 3.641969549655914, "ce_loss_13": 3.5785802602767944, "ce_loss_2": 4.092971992492676, "ce_loss_3": 3.932430160045624, "ce_loss_7": 3.6855560064315798, "epoch": 0.866, "grad_norm": 320.0, "kl_loss_10": 100.24152946472168, "kl_loss_2": 1078.2671875, "kl_loss_3": 746.3800415039062, "kl_loss_7": 188.71098556518555, "learning_rate": 4.452716981234745e-05, "loss": 518.2875, "step": 8660 }, { "ce_loss_10": 3.619352424144745, "ce_loss_13": 3.5634596943855286, "ce_loss_2": 4.0641814827919, "ce_loss_3": 3.9009178042411805, "ce_loss_7": 3.6601861000061033, "epoch": 0.867, "grad_norm": 334.0, "kl_loss_10": 92.77517395019531, "kl_loss_2": 1069.4530029296875, "kl_loss_3": 742.2820404052734, "kl_loss_7": 183.70159912109375, "learning_rate": 4.3874923908297335e-05, "loss": 518.2648, "step": 8670 }, { "ce_loss_10": 3.6679449677467346, "ce_loss_13": 3.605993056297302, "ce_loss_2": 4.122425937652588, "ce_loss_3": 3.955815386772156, "ce_loss_7": 3.710171031951904, "epoch": 0.868, "grad_norm": 372.0, "kl_loss_10": 98.51640739440919, "kl_loss_2": 1091.1497436523437, "kl_loss_3": 753.822543334961, "kl_loss_7": 189.5640121459961, "learning_rate": 4.322727117869951e-05, "loss": 527.5021, "step": 8680 }, { "ce_loss_10": 3.678618919849396, "ce_loss_13": 3.61755256652832, "ce_loss_2": 4.1355063915252686, "ce_loss_3": 3.9705930352211, "ce_loss_7": 3.7248330235481264, "epoch": 0.869, "grad_norm": 450.0, "kl_loss_10": 97.55352783203125, "kl_loss_2": 1094.9813720703125, "kl_loss_3": 756.694857788086, "kl_loss_7": 188.98089218139648, "learning_rate": 4.2584218145409916e-05, "loss": 526.9053, "step": 8690 }, { "ce_loss_10": 3.724055600166321, "ce_loss_13": 3.6645130157470702, "ce_loss_2": 4.164188587665558, "ce_loss_3": 4.006092858314514, "ce_loss_7": 3.766603982448578, "epoch": 0.87, "grad_norm": 368.0, "kl_loss_10": 97.79985809326172, "kl_loss_2": 1054.3090911865233, "kl_loss_3": 727.9592834472656, "kl_loss_7": 186.32457809448243, "learning_rate": 4.194577128396521e-05, "loss": 516.3896, "step": 8700 }, { "ce_loss_10": 3.59331738948822, "ce_loss_13": 3.5345770716667175, "ce_loss_2": 4.046900963783264, "ce_loss_3": 3.882276177406311, "ce_loss_7": 3.636314344406128, "epoch": 0.871, "grad_norm": 348.0, "kl_loss_10": 93.78037185668946, "kl_loss_2": 1077.3778259277344, "kl_loss_3": 740.198031616211, "kl_loss_7": 183.74533233642578, "learning_rate": 4.1311937023518264e-05, "loss": 527.0207, "step": 8710 }, { "ce_loss_10": 3.6144633054733277, "ce_loss_13": 3.5550664901733398, "ce_loss_2": 4.064953672885895, "ce_loss_3": 3.891311466693878, "ce_loss_7": 3.653948724269867, "epoch": 0.872, "grad_norm": 338.0, "kl_loss_10": 94.96177291870117, "kl_loss_2": 1085.5813049316407, "kl_loss_3": 729.3066223144531, "kl_loss_7": 181.0632652282715, "learning_rate": 4.0682721746773344e-05, "loss": 521.2992, "step": 8720 }, { "ce_loss_10": 3.4832905650138857, "ce_loss_13": 3.4249367475509644, "ce_loss_2": 3.961899662017822, "ce_loss_3": 3.788464534282684, "ce_loss_7": 3.527579641342163, "epoch": 0.873, "grad_norm": 370.0, "kl_loss_10": 91.51293182373047, "kl_loss_2": 1104.7394775390626, "kl_loss_3": 759.5037414550782, "kl_loss_7": 185.07400512695312, "learning_rate": 4.0058131789920904e-05, "loss": 521.9289, "step": 8730 }, { "ce_loss_10": 3.640140187740326, "ce_loss_13": 3.57983558177948, "ce_loss_2": 4.088211476802826, "ce_loss_3": 3.927894616127014, "ce_loss_7": 3.6845538139343263, "epoch": 0.874, "grad_norm": 438.0, "kl_loss_10": 95.66121215820313, "kl_loss_2": 1082.0109283447266, "kl_loss_3": 751.8433319091797, "kl_loss_7": 184.97217254638673, "learning_rate": 3.9438173442575e-05, "loss": 542.025, "step": 8740 }, { "ce_loss_10": 3.668476128578186, "ce_loss_13": 3.6084399461746215, "ce_loss_2": 4.114363825321197, "ce_loss_3": 3.948890733718872, "ce_loss_7": 3.712895894050598, "epoch": 0.875, "grad_norm": 360.0, "kl_loss_10": 95.13606338500976, "kl_loss_2": 1069.65205078125, "kl_loss_3": 736.1352905273437, "kl_loss_7": 185.31621551513672, "learning_rate": 3.882285294770937e-05, "loss": 524.7358, "step": 8750 }, { "ce_loss_10": 3.636470365524292, "ce_loss_13": 3.576250433921814, "ce_loss_2": 4.081735682487488, "ce_loss_3": 3.9194202423095703, "ce_loss_7": 3.6787103533744814, "epoch": 0.876, "grad_norm": 372.0, "kl_loss_10": 97.42237510681153, "kl_loss_2": 1070.8320678710938, "kl_loss_3": 736.4440826416015, "kl_loss_7": 186.42294464111328, "learning_rate": 3.821217650159453e-05, "loss": 528.159, "step": 8760 }, { "ce_loss_10": 3.501795244216919, "ce_loss_13": 3.445420837402344, "ce_loss_2": 3.993399131298065, "ce_loss_3": 3.819171416759491, "ce_loss_7": 3.5519042015075684, "epoch": 0.877, "grad_norm": 398.0, "kl_loss_10": 91.19635620117188, "kl_loss_2": 1126.038784790039, "kl_loss_3": 777.8552947998047, "kl_loss_7": 188.21297302246094, "learning_rate": 3.760615025373543e-05, "loss": 535.8912, "step": 8770 }, { "ce_loss_10": 3.687652599811554, "ce_loss_13": 3.6275517463684084, "ce_loss_2": 4.149944150447846, "ce_loss_3": 3.984694278240204, "ce_loss_7": 3.7361566066741942, "epoch": 0.878, "grad_norm": 426.0, "kl_loss_10": 98.53735313415527, "kl_loss_2": 1087.7767242431642, "kl_loss_3": 754.1841644287109, "kl_loss_7": 191.66405487060547, "learning_rate": 3.700478030680987e-05, "loss": 534.6525, "step": 8780 }, { "ce_loss_10": 3.672296917438507, "ce_loss_13": 3.6126784920692443, "ce_loss_2": 4.126206862926483, "ce_loss_3": 3.9555336833000183, "ce_loss_7": 3.7154035449028013, "epoch": 0.879, "grad_norm": 400.0, "kl_loss_10": 95.93194694519043, "kl_loss_2": 1067.7572967529297, "kl_loss_3": 734.3840759277343, "kl_loss_7": 185.99778594970704, "learning_rate": 3.6408072716606344e-05, "loss": 520.9604, "step": 8790 }, { "ce_loss_10": 3.5921829104423524, "ce_loss_13": 3.5314606547355654, "ce_loss_2": 4.064702832698822, "ce_loss_3": 3.897125017642975, "ce_loss_7": 3.639820373058319, "epoch": 0.88, "grad_norm": 424.0, "kl_loss_10": 96.45306243896485, "kl_loss_2": 1113.6997863769532, "kl_loss_3": 769.2831970214844, "kl_loss_7": 189.68171615600585, "learning_rate": 3.5816033491963716e-05, "loss": 546.457, "step": 8800 }, { "ce_loss_10": 3.4587510585784913, "ce_loss_13": 3.398640847206116, "ce_loss_2": 3.9295639514923097, "ce_loss_3": 3.755736696720123, "ce_loss_7": 3.502725625038147, "epoch": 0.881, "grad_norm": 374.0, "kl_loss_10": 94.41120719909668, "kl_loss_2": 1107.7318145751954, "kl_loss_3": 762.6848449707031, "kl_loss_7": 185.3354965209961, "learning_rate": 3.522866859471047e-05, "loss": 531.675, "step": 8810 }, { "ce_loss_10": 3.7003540635108947, "ce_loss_13": 3.6417059302330017, "ce_loss_2": 4.134489345550537, "ce_loss_3": 3.972803270816803, "ce_loss_7": 3.7418115973472594, "epoch": 0.882, "grad_norm": 620.0, "kl_loss_10": 93.44988250732422, "kl_loss_2": 1046.9635864257812, "kl_loss_3": 718.645751953125, "kl_loss_7": 180.43475570678712, "learning_rate": 3.46459839396045e-05, "loss": 519.2549, "step": 8820 }, { "ce_loss_10": 3.6235634326934814, "ce_loss_13": 3.5625478267669677, "ce_loss_2": 4.090062844753265, "ce_loss_3": 3.9221726655960083, "ce_loss_7": 3.6677647113800047, "epoch": 0.883, "grad_norm": 392.0, "kl_loss_10": 97.41650848388672, "kl_loss_2": 1090.359048461914, "kl_loss_3": 752.6492370605469, "kl_loss_7": 188.19114456176757, "learning_rate": 3.406798539427386e-05, "loss": 541.4702, "step": 8830 }, { "ce_loss_10": 3.6815385699272154, "ce_loss_13": 3.622318422794342, "ce_loss_2": 4.134820902347565, "ce_loss_3": 3.9722886800765993, "ce_loss_7": 3.7261940598487855, "epoch": 0.884, "grad_norm": 458.0, "kl_loss_10": 95.14997901916504, "kl_loss_2": 1087.6108123779297, "kl_loss_3": 753.6235443115235, "kl_loss_7": 186.09493026733398, "learning_rate": 3.349467877915746e-05, "loss": 532.4207, "step": 8840 }, { "ce_loss_10": 3.6383310556411743, "ce_loss_13": 3.578685259819031, "ce_loss_2": 4.10920352935791, "ce_loss_3": 3.9395066857337953, "ce_loss_7": 3.684439957141876, "epoch": 0.885, "grad_norm": 346.0, "kl_loss_10": 94.56938552856445, "kl_loss_2": 1107.4275299072265, "kl_loss_3": 766.7192443847656, "kl_loss_7": 187.05870895385743, "learning_rate": 3.292606986744667e-05, "loss": 544.0854, "step": 8850 }, { "ce_loss_10": 3.593039667606354, "ce_loss_13": 3.5363111972808836, "ce_loss_2": 4.061631453037262, "ce_loss_3": 3.888974642753601, "ce_loss_7": 3.6354947090148926, "epoch": 0.886, "grad_norm": 312.0, "kl_loss_10": 94.36025886535644, "kl_loss_2": 1094.437567138672, "kl_loss_3": 755.0413787841796, "kl_loss_7": 185.15854110717774, "learning_rate": 3.23621643850267e-05, "loss": 531.352, "step": 8860 }, { "ce_loss_10": 3.6675365686416628, "ce_loss_13": 3.608867907524109, "ce_loss_2": 4.1205101132392885, "ce_loss_3": 3.9526678919792175, "ce_loss_7": 3.71103777885437, "epoch": 0.887, "grad_norm": 398.0, "kl_loss_10": 95.91901359558105, "kl_loss_2": 1094.978707885742, "kl_loss_3": 758.2980133056641, "kl_loss_7": 187.99334793090821, "learning_rate": 3.180296801041971e-05, "loss": 525.304, "step": 8870 }, { "ce_loss_10": 3.6939959645271303, "ce_loss_13": 3.6341704607009886, "ce_loss_2": 4.136724853515625, "ce_loss_3": 3.976076662540436, "ce_loss_7": 3.7369011640548706, "epoch": 0.888, "grad_norm": 322.0, "kl_loss_10": 96.13762168884277, "kl_loss_2": 1061.462728881836, "kl_loss_3": 731.0939331054688, "kl_loss_7": 185.31768493652345, "learning_rate": 3.124848637472688e-05, "loss": 515.8721, "step": 8880 }, { "ce_loss_10": 3.5114728569984437, "ce_loss_13": 3.452458143234253, "ce_loss_2": 3.9819056034088134, "ce_loss_3": 3.8095321655273438, "ce_loss_7": 3.5549168229103087, "epoch": 0.889, "grad_norm": 430.0, "kl_loss_10": 92.77987136840821, "kl_loss_2": 1105.7576904296875, "kl_loss_3": 760.3018249511719, "kl_loss_7": 183.98031311035157, "learning_rate": 3.069872506157212e-05, "loss": 529.9256, "step": 8890 }, { "ce_loss_10": 3.6096359133720397, "ce_loss_13": 3.5530964136123657, "ce_loss_2": 4.066385662555694, "ce_loss_3": 3.9037466764450075, "ce_loss_7": 3.653862941265106, "epoch": 0.89, "grad_norm": 414.0, "kl_loss_10": 94.68969841003418, "kl_loss_2": 1082.7529907226562, "kl_loss_3": 748.9955108642578, "kl_loss_7": 186.7980583190918, "learning_rate": 3.0153689607045842e-05, "loss": 522.4292, "step": 8900 }, { "ce_loss_10": 3.5076727747917174, "ce_loss_13": 3.4481669664382935, "ce_loss_2": 3.998192644119263, "ce_loss_3": 3.8251919507980348, "ce_loss_7": 3.5543401718139647, "epoch": 0.891, "grad_norm": 462.0, "kl_loss_10": 96.1771800994873, "kl_loss_2": 1157.3876403808595, "kl_loss_3": 799.3413696289062, "kl_loss_7": 192.33385009765624, "learning_rate": 2.9613385499648926e-05, "loss": 537.2502, "step": 8910 }, { "ce_loss_10": 3.5617488503456114, "ce_loss_13": 3.5028850078582763, "ce_loss_2": 4.028625464439392, "ce_loss_3": 3.8606330037117003, "ce_loss_7": 3.60619056224823, "epoch": 0.892, "grad_norm": 364.0, "kl_loss_10": 92.3734031677246, "kl_loss_2": 1092.7289123535156, "kl_loss_3": 755.3269073486329, "kl_loss_7": 183.66201095581056, "learning_rate": 2.9077818180237692e-05, "loss": 529.899, "step": 8920 }, { "ce_loss_10": 3.611976993083954, "ce_loss_13": 3.5523295164108277, "ce_loss_2": 4.088427019119263, "ce_loss_3": 3.911720395088196, "ce_loss_7": 3.6568928718566895, "epoch": 0.893, "grad_norm": 604.0, "kl_loss_10": 95.37241554260254, "kl_loss_2": 1091.7466766357422, "kl_loss_3": 749.5647033691406, "kl_loss_7": 185.87219848632813, "learning_rate": 2.8546993041969172e-05, "loss": 528.8222, "step": 8930 }, { "ce_loss_10": 3.649553382396698, "ce_loss_13": 3.5936214447021486, "ce_loss_2": 4.095563900470734, "ce_loss_3": 3.9343939542770388, "ce_loss_7": 3.6919458627700807, "epoch": 0.894, "grad_norm": 356.0, "kl_loss_10": 92.16914100646973, "kl_loss_2": 1065.6531127929688, "kl_loss_3": 739.0178924560547, "kl_loss_7": 182.67144699096679, "learning_rate": 2.802091543024671e-05, "loss": 525.8132, "step": 8940 }, { "ce_loss_10": 3.6456188917160035, "ce_loss_13": 3.5855357170104982, "ce_loss_2": 4.1163407325744625, "ce_loss_3": 3.9452737092971804, "ce_loss_7": 3.690487289428711, "epoch": 0.895, "grad_norm": 376.0, "kl_loss_10": 94.99068603515624, "kl_loss_2": 1107.8523712158203, "kl_loss_3": 763.5164489746094, "kl_loss_7": 187.85556182861328, "learning_rate": 2.7499590642665774e-05, "loss": 543.5269, "step": 8950 }, { "ce_loss_10": 3.6521722793579103, "ce_loss_13": 3.5920722246170045, "ce_loss_2": 4.112611806392669, "ce_loss_3": 3.942758357524872, "ce_loss_7": 3.6924882411956785, "epoch": 0.896, "grad_norm": 434.0, "kl_loss_10": 97.21023635864258, "kl_loss_2": 1089.4108154296875, "kl_loss_3": 742.6543731689453, "kl_loss_7": 186.23975067138673, "learning_rate": 2.6983023928961405e-05, "loss": 523.9287, "step": 8960 }, { "ce_loss_10": 3.6287880539894104, "ce_loss_13": 3.569942307472229, "ce_loss_2": 4.086234021186828, "ce_loss_3": 3.919290018081665, "ce_loss_7": 3.6727704763412476, "epoch": 0.897, "grad_norm": 428.0, "kl_loss_10": 96.33384323120117, "kl_loss_2": 1081.610333251953, "kl_loss_3": 747.162060546875, "kl_loss_7": 187.28789825439452, "learning_rate": 2.6471220490954628e-05, "loss": 531.8677, "step": 8970 }, { "ce_loss_10": 3.6082414865493773, "ce_loss_13": 3.5503612399101256, "ce_loss_2": 4.054306983947754, "ce_loss_3": 3.8875715851783754, "ce_loss_7": 3.647981250286102, "epoch": 0.898, "grad_norm": 402.0, "kl_loss_10": 93.92480773925782, "kl_loss_2": 1068.1579833984374, "kl_loss_3": 736.318814086914, "kl_loss_7": 183.30384826660156, "learning_rate": 2.596418548250029e-05, "loss": 527.9295, "step": 8980 }, { "ce_loss_10": 3.6551415085792542, "ce_loss_13": 3.5952192187309264, "ce_loss_2": 4.1076843500137326, "ce_loss_3": 3.944980025291443, "ce_loss_7": 3.700137984752655, "epoch": 0.899, "grad_norm": 396.0, "kl_loss_10": 97.98623161315918, "kl_loss_2": 1081.954281616211, "kl_loss_3": 746.2776489257812, "kl_loss_7": 188.93777618408203, "learning_rate": 2.5461924009435368e-05, "loss": 524.2467, "step": 8990 }, { "ce_loss_10": 3.650333786010742, "ce_loss_13": 3.590772497653961, "ce_loss_2": 4.109632253646851, "ce_loss_3": 3.9412980914115905, "ce_loss_7": 3.6946743369102477, "epoch": 0.9, "grad_norm": 410.0, "kl_loss_10": 96.09890708923339, "kl_loss_2": 1079.7472290039063, "kl_loss_3": 745.8318054199219, "kl_loss_7": 186.11589736938475, "learning_rate": 2.4964441129527336e-05, "loss": 536.0899, "step": 9000 }, { "ce_loss_10": 3.6510029554367067, "ce_loss_13": 3.590871715545654, "ce_loss_2": 4.100390136241913, "ce_loss_3": 3.932853305339813, "ce_loss_7": 3.6917531371116636, "epoch": 0.901, "grad_norm": 418.0, "kl_loss_10": 95.55135993957519, "kl_loss_2": 1061.7380157470702, "kl_loss_3": 727.2771514892578, "kl_loss_7": 183.68069381713866, "learning_rate": 2.4471741852423235e-05, "loss": 518.1353, "step": 9010 }, { "ce_loss_10": 3.695908546447754, "ce_loss_13": 3.6349289417266846, "ce_loss_2": 4.151931369304657, "ce_loss_3": 3.98497998714447, "ce_loss_7": 3.739882934093475, "epoch": 0.902, "grad_norm": 392.0, "kl_loss_10": 95.51335906982422, "kl_loss_2": 1066.5906768798827, "kl_loss_3": 733.3630157470703, "kl_loss_7": 184.28593063354492, "learning_rate": 2.3983831139599287e-05, "loss": 522.8627, "step": 9020 }, { "ce_loss_10": 3.617437481880188, "ce_loss_13": 3.558865213394165, "ce_loss_2": 4.061969435214996, "ce_loss_3": 3.8991889357566833, "ce_loss_7": 3.660116195678711, "epoch": 0.903, "grad_norm": 456.0, "kl_loss_10": 93.39376106262208, "kl_loss_2": 1059.7717498779298, "kl_loss_3": 733.3598663330079, "kl_loss_7": 181.95840148925782, "learning_rate": 2.3500713904311022e-05, "loss": 512.7801, "step": 9030 }, { "ce_loss_10": 3.659070146083832, "ce_loss_13": 3.5992442965507507, "ce_loss_2": 4.08744889497757, "ce_loss_3": 3.9278596162796022, "ce_loss_7": 3.700530481338501, "epoch": 0.904, "grad_norm": 472.0, "kl_loss_10": 95.7885025024414, "kl_loss_2": 1036.0338073730468, "kl_loss_3": 713.1754333496094, "kl_loss_7": 181.65938034057618, "learning_rate": 2.3022395011543685e-05, "loss": 514.4845, "step": 9040 }, { "ce_loss_10": 3.6909992337226867, "ce_loss_13": 3.630416977405548, "ce_loss_2": 4.144919979572296, "ce_loss_3": 3.98409184217453, "ce_loss_7": 3.735574746131897, "epoch": 0.905, "grad_norm": 400.0, "kl_loss_10": 95.80096397399902, "kl_loss_2": 1091.1403015136718, "kl_loss_3": 758.9450408935547, "kl_loss_7": 188.74431228637695, "learning_rate": 2.2548879277963063e-05, "loss": 536.6219, "step": 9050 }, { "ce_loss_10": 3.6055094718933107, "ce_loss_13": 3.5453344702720644, "ce_loss_2": 4.055747485160827, "ce_loss_3": 3.8876903891563415, "ce_loss_7": 3.645590376853943, "epoch": 0.906, "grad_norm": 312.0, "kl_loss_10": 94.81256561279297, "kl_loss_2": 1081.8126281738282, "kl_loss_3": 743.9638031005859, "kl_loss_7": 185.8631164550781, "learning_rate": 2.208017147186736e-05, "loss": 517.0646, "step": 9060 }, { "ce_loss_10": 3.5984405398368837, "ce_loss_13": 3.5392195105552675, "ce_loss_2": 4.055430555343628, "ce_loss_3": 3.8891077756881716, "ce_loss_7": 3.643998312950134, "epoch": 0.907, "grad_norm": 424.0, "kl_loss_10": 95.52283592224121, "kl_loss_2": 1082.7356536865234, "kl_loss_3": 749.8307952880859, "kl_loss_7": 186.6390350341797, "learning_rate": 2.1616276313139227e-05, "loss": 522.272, "step": 9070 }, { "ce_loss_10": 3.6377461314201356, "ce_loss_13": 3.5757868885993958, "ce_loss_2": 4.087118625640869, "ce_loss_3": 3.9254656434059143, "ce_loss_7": 3.680292618274689, "epoch": 0.908, "grad_norm": 362.0, "kl_loss_10": 96.6335952758789, "kl_loss_2": 1071.57734375, "kl_loss_3": 743.0760345458984, "kl_loss_7": 186.97156448364257, "learning_rate": 2.1157198473197415e-05, "loss": 527.4616, "step": 9080 }, { "ce_loss_10": 3.7054911255836487, "ce_loss_13": 3.646452081203461, "ce_loss_2": 4.16020712852478, "ce_loss_3": 3.99694961309433, "ce_loss_7": 3.7527972936630247, "epoch": 0.909, "grad_norm": 428.0, "kl_loss_10": 95.60770835876465, "kl_loss_2": 1073.3848999023437, "kl_loss_3": 744.7516662597657, "kl_loss_7": 188.15945053100586, "learning_rate": 2.0702942574950812e-05, "loss": 526.0792, "step": 9090 }, { "ce_loss_10": 3.623731589317322, "ce_loss_13": 3.5640787363052366, "ce_loss_2": 4.083542311191559, "ce_loss_3": 3.9220656394958495, "ce_loss_7": 3.669620490074158, "epoch": 0.91, "grad_norm": 302.0, "kl_loss_10": 95.35622863769531, "kl_loss_2": 1087.3217651367188, "kl_loss_3": 752.284033203125, "kl_loss_7": 187.5697151184082, "learning_rate": 2.025351319275137e-05, "loss": 528.1311, "step": 9100 }, { "ce_loss_10": 3.761759030818939, "ce_loss_13": 3.6962865233421325, "ce_loss_2": 4.2175662279129025, "ce_loss_3": 4.051000607013703, "ce_loss_7": 3.8052276611328124, "epoch": 0.911, "grad_norm": 420.0, "kl_loss_10": 101.6547290802002, "kl_loss_2": 1108.3317321777345, "kl_loss_3": 765.9157867431641, "kl_loss_7": 194.34442520141602, "learning_rate": 1.9808914852347816e-05, "loss": 545.7752, "step": 9110 }, { "ce_loss_10": 3.599123954772949, "ce_loss_13": 3.539510524272919, "ce_loss_2": 4.069272911548614, "ce_loss_3": 3.9009834051132204, "ce_loss_7": 3.6455170154571532, "epoch": 0.912, "grad_norm": 416.0, "kl_loss_10": 95.14377288818359, "kl_loss_2": 1095.5253448486328, "kl_loss_3": 750.8630340576171, "kl_loss_7": 187.0247688293457, "learning_rate": 1.9369152030840554e-05, "loss": 527.6025, "step": 9120 }, { "ce_loss_10": 3.6806903958320616, "ce_loss_13": 3.620557761192322, "ce_loss_2": 4.135490739345551, "ce_loss_3": 3.9653069972991943, "ce_loss_7": 3.723483943939209, "epoch": 0.913, "grad_norm": 362.0, "kl_loss_10": 97.92795066833496, "kl_loss_2": 1089.1937438964844, "kl_loss_3": 747.6420379638672, "kl_loss_7": 187.34563446044922, "learning_rate": 1.893422915663645e-05, "loss": 529.2906, "step": 9130 }, { "ce_loss_10": 3.5489492774009705, "ce_loss_13": 3.488741672039032, "ce_loss_2": 4.032487225532532, "ce_loss_3": 3.862708866596222, "ce_loss_7": 3.594150650501251, "epoch": 0.914, "grad_norm": 460.0, "kl_loss_10": 95.81211128234864, "kl_loss_2": 1122.290625, "kl_loss_3": 780.3386810302734, "kl_loss_7": 190.92548141479492, "learning_rate": 1.850415060940386e-05, "loss": 539.4046, "step": 9140 }, { "ce_loss_10": 3.670183026790619, "ce_loss_13": 3.611021101474762, "ce_loss_2": 4.120828151702881, "ce_loss_3": 3.9584792375564577, "ce_loss_7": 3.712183046340942, "epoch": 0.915, "grad_norm": 418.0, "kl_loss_10": 95.88972358703613, "kl_loss_2": 1074.5135314941406, "kl_loss_3": 738.371826171875, "kl_loss_7": 185.7539405822754, "learning_rate": 1.8078920720028978e-05, "loss": 525.966, "step": 9150 }, { "ce_loss_10": 3.600800943374634, "ce_loss_13": 3.5434103488922117, "ce_loss_2": 4.046385419368744, "ce_loss_3": 3.8842490911483765, "ce_loss_7": 3.6435607194900514, "epoch": 0.916, "grad_norm": 468.0, "kl_loss_10": 94.49675407409669, "kl_loss_2": 1068.3072998046875, "kl_loss_3": 736.1623046875, "kl_loss_7": 182.35257797241212, "learning_rate": 1.765854377057219e-05, "loss": 533.5915, "step": 9160 }, { "ce_loss_10": 3.579929566383362, "ce_loss_13": 3.52090607881546, "ce_loss_2": 4.0303690195083615, "ce_loss_3": 3.863832104206085, "ce_loss_7": 3.621261489391327, "epoch": 0.917, "grad_norm": 344.0, "kl_loss_10": 93.69845123291016, "kl_loss_2": 1076.374838256836, "kl_loss_3": 739.5320068359375, "kl_loss_7": 182.73907394409179, "learning_rate": 1.724302399422456e-05, "loss": 525.9574, "step": 9170 }, { "ce_loss_10": 3.5273375153541564, "ce_loss_13": 3.469092321395874, "ce_loss_2": 3.98960462808609, "ce_loss_3": 3.8235998272895815, "ce_loss_7": 3.572177302837372, "epoch": 0.918, "grad_norm": 328.0, "kl_loss_10": 94.86108894348145, "kl_loss_2": 1092.3598358154297, "kl_loss_3": 757.3310150146484, "kl_loss_7": 188.48751983642578, "learning_rate": 1.683236557526574e-05, "loss": 533.8531, "step": 9180 }, { "ce_loss_10": 3.6514230132102967, "ce_loss_13": 3.59556097984314, "ce_loss_2": 4.083134496212006, "ce_loss_3": 3.926029086112976, "ce_loss_7": 3.693097734451294, "epoch": 0.919, "grad_norm": 276.0, "kl_loss_10": 94.37221069335938, "kl_loss_2": 1047.5379638671875, "kl_loss_3": 720.9200286865234, "kl_loss_7": 181.39565734863282, "learning_rate": 1.6426572649021475e-05, "loss": 520.5356, "step": 9190 }, { "ce_loss_10": 3.6877851486206055, "ce_loss_13": 3.6274981617927553, "ce_loss_2": 4.1144737839698795, "ce_loss_3": 3.9595839619636535, "ce_loss_7": 3.7264232993125916, "epoch": 0.92, "grad_norm": 430.0, "kl_loss_10": 99.18587074279785, "kl_loss_2": 1047.7421783447267, "kl_loss_3": 721.9292663574219, "kl_loss_7": 186.34831695556642, "learning_rate": 1.6025649301821876e-05, "loss": 520.097, "step": 9200 }, { "ce_loss_10": 3.6789560437202455, "ce_loss_13": 3.6199841260910035, "ce_loss_2": 4.116438376903534, "ce_loss_3": 3.95575532913208, "ce_loss_7": 3.720892333984375, "epoch": 0.921, "grad_norm": 430.0, "kl_loss_10": 95.03273735046386, "kl_loss_2": 1068.5045623779297, "kl_loss_3": 740.7460571289063, "kl_loss_7": 185.96430587768555, "learning_rate": 1.5629599570960716e-05, "loss": 522.4428, "step": 9210 }, { "ce_loss_10": 3.579318141937256, "ce_loss_13": 3.5199381947517394, "ce_loss_2": 4.029832947254181, "ce_loss_3": 3.865503740310669, "ce_loss_7": 3.6221681237220764, "epoch": 0.922, "grad_norm": 430.0, "kl_loss_10": 94.97879791259766, "kl_loss_2": 1084.768603515625, "kl_loss_3": 748.8800231933594, "kl_loss_7": 185.368741607666, "learning_rate": 1.5238427444654367e-05, "loss": 526.936, "step": 9220 }, { "ce_loss_10": 3.642410922050476, "ce_loss_13": 3.5841264009475706, "ce_loss_2": 4.090620064735413, "ce_loss_3": 3.929516541957855, "ce_loss_7": 3.68586403131485, "epoch": 0.923, "grad_norm": 340.0, "kl_loss_10": 95.43446731567383, "kl_loss_2": 1061.9394897460938, "kl_loss_3": 729.8539154052735, "kl_loss_7": 184.269775390625, "learning_rate": 1.4852136862001764e-05, "loss": 521.6809, "step": 9230 }, { "ce_loss_10": 3.6022266387939452, "ce_loss_13": 3.5460850477218626, "ce_loss_2": 4.056096696853638, "ce_loss_3": 3.894578981399536, "ce_loss_7": 3.6445172667503356, "epoch": 0.924, "grad_norm": 382.0, "kl_loss_10": 90.83601989746094, "kl_loss_2": 1070.5055114746094, "kl_loss_3": 735.5364959716796, "kl_loss_7": 180.06712493896484, "learning_rate": 1.4470731712944884e-05, "loss": 526.6606, "step": 9240 }, { "ce_loss_10": 3.632104980945587, "ce_loss_13": 3.573563551902771, "ce_loss_2": 4.086918556690216, "ce_loss_3": 3.921724486351013, "ce_loss_7": 3.676921808719635, "epoch": 0.925, "grad_norm": 404.0, "kl_loss_10": 93.8505702972412, "kl_loss_2": 1076.019464111328, "kl_loss_3": 742.9348846435547, "kl_loss_7": 185.7860206604004, "learning_rate": 1.4094215838229174e-05, "loss": 532.0963, "step": 9250 }, { "ce_loss_10": 3.5902254581451416, "ce_loss_13": 3.531176710128784, "ce_loss_2": 4.053838360309601, "ce_loss_3": 3.8887827515602114, "ce_loss_7": 3.634320020675659, "epoch": 0.926, "grad_norm": 440.0, "kl_loss_10": 95.00082511901856, "kl_loss_2": 1108.7564575195313, "kl_loss_3": 761.1957458496094, "kl_loss_7": 187.39419326782226, "learning_rate": 1.372259302936546e-05, "loss": 548.2919, "step": 9260 }, { "ce_loss_10": 3.7115341782569886, "ce_loss_13": 3.6472853660583495, "ce_loss_2": 4.159861445426941, "ce_loss_3": 3.998417854309082, "ce_loss_7": 3.7543888211250307, "epoch": 0.927, "grad_norm": 304.0, "kl_loss_10": 100.11175384521485, "kl_loss_2": 1075.1090118408204, "kl_loss_3": 744.2237152099609, "kl_loss_7": 190.9360038757324, "learning_rate": 1.3355867028591206e-05, "loss": 520.805, "step": 9270 }, { "ce_loss_10": 3.6113879919052123, "ce_loss_13": 3.5496174573898314, "ce_loss_2": 4.047625136375427, "ce_loss_3": 3.8916648983955384, "ce_loss_7": 3.653665769100189, "epoch": 0.928, "grad_norm": 334.0, "kl_loss_10": 94.99486846923828, "kl_loss_2": 1063.383090209961, "kl_loss_3": 737.3780670166016, "kl_loss_7": 184.87188415527345, "learning_rate": 1.2994041528833267e-05, "loss": 520.9468, "step": 9280 }, { "ce_loss_10": 3.612771439552307, "ce_loss_13": 3.5519652009010314, "ce_loss_2": 4.069023680686951, "ce_loss_3": 3.9033527731895448, "ce_loss_7": 3.653776025772095, "epoch": 0.929, "grad_norm": 394.0, "kl_loss_10": 94.48731269836426, "kl_loss_2": 1086.341064453125, "kl_loss_3": 747.7527069091797, "kl_loss_7": 184.27003555297853, "learning_rate": 1.2637120173670358e-05, "loss": 525.795, "step": 9290 }, { "ce_loss_10": 3.6342510104179384, "ce_loss_13": 3.574049484729767, "ce_loss_2": 4.097525525093078, "ce_loss_3": 3.9327287077903748, "ce_loss_7": 3.6803439974784853, "epoch": 0.93, "grad_norm": 492.0, "kl_loss_10": 94.73881340026855, "kl_loss_2": 1086.5091583251954, "kl_loss_3": 750.7861236572265, "kl_loss_7": 186.8117706298828, "learning_rate": 1.2285106557296478e-05, "loss": 526.7854, "step": 9300 }, { "ce_loss_10": 3.513438880443573, "ce_loss_13": 3.453951287269592, "ce_loss_2": 3.9955971360206606, "ce_loss_3": 3.8230167746543886, "ce_loss_7": 3.555509877204895, "epoch": 0.931, "grad_norm": 356.0, "kl_loss_10": 93.80283432006836, "kl_loss_2": 1116.4696807861328, "kl_loss_3": 771.7997375488281, "kl_loss_7": 186.52389373779297, "learning_rate": 1.1938004224484989e-05, "loss": 533.0822, "step": 9310 }, { "ce_loss_10": 3.7524689197540284, "ce_loss_13": 3.6876933336257935, "ce_loss_2": 4.20148618221283, "ce_loss_3": 4.035860347747803, "ce_loss_7": 3.7956905245780943, "epoch": 0.932, "grad_norm": 418.0, "kl_loss_10": 99.70074195861817, "kl_loss_2": 1085.114028930664, "kl_loss_3": 747.7518859863281, "kl_loss_7": 189.80009078979492, "learning_rate": 1.1595816670552429e-05, "loss": 536.128, "step": 9320 }, { "ce_loss_10": 3.6811413764953613, "ce_loss_13": 3.619305157661438, "ce_loss_2": 4.1267077088356015, "ce_loss_3": 3.9628111124038696, "ce_loss_7": 3.7232463002204894, "epoch": 0.933, "grad_norm": 430.0, "kl_loss_10": 98.55138320922852, "kl_loss_2": 1066.0611297607422, "kl_loss_3": 732.6245086669921, "kl_loss_7": 187.06882858276367, "learning_rate": 1.1258547341323699e-05, "loss": 518.9695, "step": 9330 }, { "ce_loss_10": 3.706856846809387, "ce_loss_13": 3.6450837016105653, "ce_loss_2": 4.152973532676697, "ce_loss_3": 3.9891764402389525, "ce_loss_7": 3.7481295585632326, "epoch": 0.934, "grad_norm": 394.0, "kl_loss_10": 96.45535087585449, "kl_loss_2": 1089.2688110351562, "kl_loss_3": 747.8073425292969, "kl_loss_7": 187.34025497436522, "learning_rate": 1.0926199633097156e-05, "loss": 527.061, "step": 9340 }, { "ce_loss_10": 3.7075893759727476, "ce_loss_13": 3.6489187121391295, "ce_loss_2": 4.135252356529236, "ce_loss_3": 3.976875376701355, "ce_loss_7": 3.747441065311432, "epoch": 0.935, "grad_norm": 428.0, "kl_loss_10": 94.83727493286133, "kl_loss_2": 1042.2317810058594, "kl_loss_3": 718.6920349121094, "kl_loss_7": 181.23108978271483, "learning_rate": 1.0598776892610684e-05, "loss": 526.2413, "step": 9350 }, { "ce_loss_10": 3.5169559955596923, "ce_loss_13": 3.4603365540504454, "ce_loss_2": 3.9802993655204775, "ce_loss_3": 3.8121800780296327, "ce_loss_7": 3.561786246299744, "epoch": 0.936, "grad_norm": 334.0, "kl_loss_10": 92.96564292907715, "kl_loss_2": 1091.1406646728515, "kl_loss_3": 747.6543731689453, "kl_loss_7": 183.7804039001465, "learning_rate": 1.0276282417007399e-05, "loss": 521.9861, "step": 9360 }, { "ce_loss_10": 3.6849416494369507, "ce_loss_13": 3.626581645011902, "ce_loss_2": 4.118964040279389, "ce_loss_3": 3.9585147976875303, "ce_loss_7": 3.7237794518470766, "epoch": 0.937, "grad_norm": 464.0, "kl_loss_10": 95.02116394042969, "kl_loss_2": 1044.2026397705079, "kl_loss_3": 719.8276824951172, "kl_loss_7": 182.06821365356444, "learning_rate": 9.958719453803277e-06, "loss": 518.1707, "step": 9370 }, { "ce_loss_10": 3.6774186968803404, "ce_loss_13": 3.6149828910827635, "ce_loss_2": 4.126804637908935, "ce_loss_3": 3.964286994934082, "ce_loss_7": 3.7206520080566405, "epoch": 0.938, "grad_norm": 364.0, "kl_loss_10": 96.62460212707519, "kl_loss_2": 1077.0972625732422, "kl_loss_3": 746.5920196533203, "kl_loss_7": 186.96116638183594, "learning_rate": 9.646091200853802e-06, "loss": 526.3039, "step": 9380 }, { "ce_loss_10": 3.633099365234375, "ce_loss_13": 3.5745465636253355, "ce_loss_2": 4.0883647203445435, "ce_loss_3": 3.9242159128189087, "ce_loss_7": 3.672658348083496, "epoch": 0.939, "grad_norm": 398.0, "kl_loss_10": 93.04219818115234, "kl_loss_2": 1075.2075500488281, "kl_loss_3": 738.6250030517579, "kl_loss_7": 181.5531784057617, "learning_rate": 9.338400806321978e-06, "loss": 512.8155, "step": 9390 }, { "ce_loss_10": 3.664756190776825, "ce_loss_13": 3.603893756866455, "ce_loss_2": 4.104370522499084, "ce_loss_3": 3.941502547264099, "ce_loss_7": 3.7107202291488646, "epoch": 0.94, "grad_norm": 330.0, "kl_loss_10": 96.52969932556152, "kl_loss_2": 1056.286117553711, "kl_loss_3": 729.6215881347656, "kl_loss_7": 186.73142929077147, "learning_rate": 9.035651368646646e-06, "loss": 517.5048, "step": 9400 }, { "ce_loss_10": 3.6749662160873413, "ce_loss_13": 3.6150254607200623, "ce_loss_2": 4.108079397678376, "ce_loss_3": 3.9502787351608277, "ce_loss_7": 3.71422598361969, "epoch": 0.941, "grad_norm": 368.0, "kl_loss_10": 95.4813446044922, "kl_loss_2": 1051.3231384277344, "kl_loss_3": 730.8897918701172, "kl_loss_7": 183.71395568847657, "learning_rate": 8.737845936511335e-06, "loss": 521.5386, "step": 9410 }, { "ce_loss_10": 3.621238374710083, "ce_loss_13": 3.560182070732117, "ce_loss_2": 4.075435829162598, "ce_loss_3": 3.906463932991028, "ce_loss_7": 3.6651031732559205, "epoch": 0.942, "grad_norm": 472.0, "kl_loss_10": 95.50933799743652, "kl_loss_2": 1087.418194580078, "kl_loss_3": 749.9641418457031, "kl_loss_7": 187.3939208984375, "learning_rate": 8.444987508813451e-06, "loss": 524.6778, "step": 9420 }, { "ce_loss_10": 3.567629599571228, "ce_loss_13": 3.5098708271980286, "ce_loss_2": 4.03240327835083, "ce_loss_3": 3.868740451335907, "ce_loss_7": 3.614664590358734, "epoch": 0.943, "grad_norm": 452.0, "kl_loss_10": 95.83200073242188, "kl_loss_2": 1111.0681640625, "kl_loss_3": 769.0793914794922, "kl_loss_7": 188.26431045532226, "learning_rate": 8.157079034633974e-06, "loss": 533.1891, "step": 9430 }, { "ce_loss_10": 3.5664173483848574, "ce_loss_13": 3.5061603307724, "ce_loss_2": 4.02851265668869, "ce_loss_3": 3.862307035923004, "ce_loss_7": 3.6107182621955873, "epoch": 0.944, "grad_norm": 426.0, "kl_loss_10": 94.98325424194336, "kl_loss_2": 1109.4172790527343, "kl_loss_3": 762.6424713134766, "kl_loss_7": 186.38191299438478, "learning_rate": 7.874123413208145e-06, "loss": 528.958, "step": 9440 }, { "ce_loss_10": 3.5382938742637635, "ce_loss_13": 3.481018900871277, "ce_loss_2": 4.006192743778229, "ce_loss_3": 3.8386752605438232, "ce_loss_7": 3.5831608533859254, "epoch": 0.945, "grad_norm": 338.0, "kl_loss_10": 92.47231903076172, "kl_loss_2": 1088.9563568115234, "kl_loss_3": 753.4448974609375, "kl_loss_7": 184.27166213989258, "learning_rate": 7.59612349389599e-06, "loss": 527.5225, "step": 9450 }, { "ce_loss_10": 3.633445167541504, "ce_loss_13": 3.5758827209472654, "ce_loss_2": 4.075440514087677, "ce_loss_3": 3.9124983310699464, "ce_loss_7": 3.6780736327171324, "epoch": 0.946, "grad_norm": 356.0, "kl_loss_10": 91.38598556518555, "kl_loss_2": 1046.8805053710937, "kl_loss_3": 718.2211791992188, "kl_loss_7": 180.72154998779297, "learning_rate": 7.323082076153509e-06, "loss": 519.5404, "step": 9460 }, { "ce_loss_10": 3.675933361053467, "ce_loss_13": 3.616945672035217, "ce_loss_2": 4.116010129451752, "ce_loss_3": 3.954231834411621, "ce_loss_7": 3.7195321679115296, "epoch": 0.947, "grad_norm": 376.0, "kl_loss_10": 96.42714042663575, "kl_loss_2": 1051.1879852294921, "kl_loss_3": 727.5513549804688, "kl_loss_7": 186.51647338867187, "learning_rate": 7.055001909504755e-06, "loss": 525.7655, "step": 9470 }, { "ce_loss_10": 3.7083083152770997, "ce_loss_13": 3.647673761844635, "ce_loss_2": 4.157342481613159, "ce_loss_3": 3.991931939125061, "ce_loss_7": 3.752028775215149, "epoch": 0.948, "grad_norm": 344.0, "kl_loss_10": 96.79825706481934, "kl_loss_2": 1084.5101806640625, "kl_loss_3": 742.6272155761719, "kl_loss_7": 187.0098518371582, "learning_rate": 6.791885693514133e-06, "loss": 528.4126, "step": 9480 }, { "ce_loss_10": 3.6131741404533386, "ce_loss_13": 3.554737401008606, "ce_loss_2": 4.069884133338928, "ce_loss_3": 3.910088050365448, "ce_loss_7": 3.657594072818756, "epoch": 0.949, "grad_norm": 444.0, "kl_loss_10": 95.54262161254883, "kl_loss_2": 1090.819403076172, "kl_loss_3": 755.8211273193359, "kl_loss_7": 187.30291366577148, "learning_rate": 6.533736077758867e-06, "loss": 532.407, "step": 9490 }, { "ce_loss_10": 3.5753329753875733, "ce_loss_13": 3.5157718658447266, "ce_loss_2": 4.050174379348755, "ce_loss_3": 3.878748118877411, "ce_loss_7": 3.621631395816803, "epoch": 0.95, "grad_norm": 454.0, "kl_loss_10": 95.78313636779785, "kl_loss_2": 1112.5021850585938, "kl_loss_3": 766.8859832763671, "kl_loss_7": 188.93851776123046, "learning_rate": 6.2805556618028556e-06, "loss": 531.8975, "step": 9500 }, { "ce_loss_10": 3.6739890694618227, "ce_loss_13": 3.614563775062561, "ce_loss_2": 4.105420649051666, "ce_loss_3": 3.946949827671051, "ce_loss_7": 3.713826298713684, "epoch": 0.951, "grad_norm": 428.0, "kl_loss_10": 95.29025764465332, "kl_loss_2": 1035.753839111328, "kl_loss_3": 718.9863189697265, "kl_loss_7": 182.34558639526367, "learning_rate": 6.032346995169968e-06, "loss": 506.1833, "step": 9510 }, { "ce_loss_10": 3.6744378566741944, "ce_loss_13": 3.6160669803619383, "ce_loss_2": 4.116178596019745, "ce_loss_3": 3.952050065994263, "ce_loss_7": 3.714146387577057, "epoch": 0.952, "grad_norm": 350.0, "kl_loss_10": 95.77439384460449, "kl_loss_2": 1065.6743865966796, "kl_loss_3": 734.3932067871094, "kl_loss_7": 184.87170867919923, "learning_rate": 5.789112577318789e-06, "loss": 520.2576, "step": 9520 }, { "ce_loss_10": 3.6489309549331663, "ce_loss_13": 3.5895671963691713, "ce_loss_2": 4.11376656293869, "ce_loss_3": 3.946073520183563, "ce_loss_7": 3.6925018429756165, "epoch": 0.953, "grad_norm": 460.0, "kl_loss_10": 96.73359451293945, "kl_loss_2": 1111.601629638672, "kl_loss_3": 771.5278289794921, "kl_loss_7": 187.8802848815918, "learning_rate": 5.550854857617194e-06, "loss": 527.3308, "step": 9530 }, { "ce_loss_10": 3.6415695905685426, "ce_loss_13": 3.579833471775055, "ce_loss_2": 4.102292227745056, "ce_loss_3": 3.9383127331733703, "ce_loss_7": 3.6863919377326964, "epoch": 0.954, "grad_norm": 398.0, "kl_loss_10": 98.16804580688476, "kl_loss_2": 1097.6046325683594, "kl_loss_3": 757.5784729003906, "kl_loss_7": 190.50857543945312, "learning_rate": 5.317576235317756e-06, "loss": 527.9396, "step": 9540 }, { "ce_loss_10": 3.6651427507400514, "ce_loss_13": 3.604920470714569, "ce_loss_2": 4.100248050689697, "ce_loss_3": 3.94064177274704, "ce_loss_7": 3.7060970425605775, "epoch": 0.955, "grad_norm": 386.0, "kl_loss_10": 96.45015525817871, "kl_loss_2": 1031.3038146972656, "kl_loss_3": 712.4996978759766, "kl_loss_7": 182.76630401611328, "learning_rate": 5.089279059533658e-06, "loss": 524.0002, "step": 9550 }, { "ce_loss_10": 3.7266568183898925, "ce_loss_13": 3.663935911655426, "ce_loss_2": 4.170814108848572, "ce_loss_3": 4.006054651737213, "ce_loss_7": 3.769794237613678, "epoch": 0.956, "grad_norm": 386.0, "kl_loss_10": 100.15878944396972, "kl_loss_2": 1068.9294799804688, "kl_loss_3": 738.0209930419921, "kl_loss_7": 192.08404541015625, "learning_rate": 4.865965629214819e-06, "loss": 520.8748, "step": 9560 }, { "ce_loss_10": 3.670477032661438, "ce_loss_13": 3.611146903038025, "ce_loss_2": 4.115479242801666, "ce_loss_3": 3.9537983894348145, "ce_loss_7": 3.7129539370536806, "epoch": 0.957, "grad_norm": 496.0, "kl_loss_10": 96.79973983764648, "kl_loss_2": 1085.6631072998048, "kl_loss_3": 749.8902404785156, "kl_loss_7": 188.74480895996095, "learning_rate": 4.6476381931251366e-06, "loss": 519.6521, "step": 9570 }, { "ce_loss_10": 3.646716892719269, "ce_loss_13": 3.5878213763237, "ce_loss_2": 4.089986479282379, "ce_loss_3": 3.9314276933670045, "ce_loss_7": 3.6911307334899903, "epoch": 0.958, "grad_norm": 318.0, "kl_loss_10": 94.01541290283203, "kl_loss_2": 1067.8105712890624, "kl_loss_3": 740.1676208496094, "kl_loss_7": 184.118741607666, "learning_rate": 4.434298949819449e-06, "loss": 523.6254, "step": 9580 }, { "ce_loss_10": 3.6008993268013, "ce_loss_13": 3.538570249080658, "ce_loss_2": 4.069638097286225, "ce_loss_3": 3.8975520372390746, "ce_loss_7": 3.6453381776809692, "epoch": 0.959, "grad_norm": 440.0, "kl_loss_10": 97.41343994140625, "kl_loss_2": 1125.892025756836, "kl_loss_3": 772.14267578125, "kl_loss_7": 189.9515396118164, "learning_rate": 4.2259500476214406e-06, "loss": 534.6609, "step": 9590 }, { "ce_loss_10": 3.58458696603775, "ce_loss_13": 3.52560031414032, "ce_loss_2": 4.040603399276733, "ce_loss_3": 3.8742735624313354, "ce_loss_7": 3.627805030345917, "epoch": 0.96, "grad_norm": 388.0, "kl_loss_10": 94.08248367309571, "kl_loss_2": 1083.009814453125, "kl_loss_3": 746.2331970214843, "kl_loss_7": 184.85717010498047, "learning_rate": 4.02259358460233e-06, "loss": 521.7564, "step": 9600 }, { "ce_loss_10": 3.6558929800987245, "ce_loss_13": 3.5954962849617003, "ce_loss_2": 4.101473760604859, "ce_loss_3": 3.9380804181098936, "ce_loss_7": 3.6987645506858824, "epoch": 0.961, "grad_norm": 544.0, "kl_loss_10": 95.69773292541504, "kl_loss_2": 1060.7937774658203, "kl_loss_3": 733.2102172851562, "kl_loss_7": 185.71547775268556, "learning_rate": 3.8242316085594916e-06, "loss": 516.8465, "step": 9610 }, { "ce_loss_10": 3.5343406558036805, "ce_loss_13": 3.4767986059188845, "ce_loss_2": 4.016193747520447, "ce_loss_3": 3.8443652629852294, "ce_loss_7": 3.580942380428314, "epoch": 0.962, "grad_norm": 366.0, "kl_loss_10": 93.89258918762206, "kl_loss_2": 1123.5916809082032, "kl_loss_3": 780.3413696289062, "kl_loss_7": 187.34277801513673, "learning_rate": 3.630866116995757e-06, "loss": 546.1011, "step": 9620 }, { "ce_loss_10": 3.6960983991622927, "ce_loss_13": 3.635801446437836, "ce_loss_2": 4.132487082481385, "ce_loss_3": 3.9690314412117003, "ce_loss_7": 3.737609100341797, "epoch": 0.963, "grad_norm": 312.0, "kl_loss_10": 96.57149925231934, "kl_loss_2": 1044.7675506591797, "kl_loss_3": 718.4659484863281, "kl_loss_7": 183.9634048461914, "learning_rate": 3.4424990570994797e-06, "loss": 523.2208, "step": 9630 }, { "ce_loss_10": 3.685701644420624, "ce_loss_13": 3.624559962749481, "ce_loss_2": 4.128798627853394, "ce_loss_3": 3.968520772457123, "ce_loss_7": 3.7257295846939087, "epoch": 0.964, "grad_norm": 280.0, "kl_loss_10": 95.63589668273926, "kl_loss_2": 1068.9191833496093, "kl_loss_3": 737.6691131591797, "kl_loss_7": 184.7596893310547, "learning_rate": 3.2591323257248896e-06, "loss": 522.5564, "step": 9640 }, { "ce_loss_10": 3.5315052390098574, "ce_loss_13": 3.4732569575309755, "ce_loss_2": 3.99234676361084, "ce_loss_3": 3.822614312171936, "ce_loss_7": 3.5727542638778687, "epoch": 0.965, "grad_norm": 338.0, "kl_loss_10": 93.59828681945801, "kl_loss_2": 1088.8541290283204, "kl_loss_3": 750.0034118652344, "kl_loss_7": 183.51124572753906, "learning_rate": 3.0807677693729385e-06, "loss": 528.9641, "step": 9650 }, { "ce_loss_10": 3.721923458576202, "ce_loss_13": 3.6635044693946837, "ce_loss_2": 4.157553017139435, "ce_loss_3": 3.9980939745903017, "ce_loss_7": 3.7649829506874086, "epoch": 0.966, "grad_norm": 328.0, "kl_loss_10": 95.77610893249512, "kl_loss_2": 1046.733694458008, "kl_loss_3": 723.9284912109375, "kl_loss_7": 183.63089752197266, "learning_rate": 2.9074071841727055e-06, "loss": 513.6759, "step": 9660 }, { "ce_loss_10": 3.6491685032844545, "ce_loss_13": 3.5898547172546387, "ce_loss_2": 4.10191251039505, "ce_loss_3": 3.9377865552902223, "ce_loss_7": 3.694057583808899, "epoch": 0.967, "grad_norm": 410.0, "kl_loss_10": 94.75908012390137, "kl_loss_2": 1074.1435485839843, "kl_loss_3": 739.0172424316406, "kl_loss_7": 185.9965072631836, "learning_rate": 2.739052315863355e-06, "loss": 514.4849, "step": 9670 }, { "ce_loss_10": 3.6381678700447084, "ce_loss_13": 3.5745797991752624, "ce_loss_2": 4.085923862457276, "ce_loss_3": 3.9223034262657164, "ce_loss_7": 3.679898130893707, "epoch": 0.968, "grad_norm": 400.0, "kl_loss_10": 98.94500389099122, "kl_loss_2": 1071.759048461914, "kl_loss_3": 742.1754821777344, "kl_loss_7": 186.4635383605957, "learning_rate": 2.5757048597765396e-06, "loss": 520.3133, "step": 9680 }, { "ce_loss_10": 3.6451838970184327, "ce_loss_13": 3.584810471534729, "ce_loss_2": 4.096628618240357, "ce_loss_3": 3.9352465867996216, "ce_loss_7": 3.6861096024513245, "epoch": 0.969, "grad_norm": 354.0, "kl_loss_10": 95.84736633300781, "kl_loss_2": 1089.9881774902344, "kl_loss_3": 753.175894165039, "kl_loss_7": 186.62908554077148, "learning_rate": 2.417366460819359e-06, "loss": 527.3621, "step": 9690 }, { "ce_loss_10": 3.6515438675880434, "ce_loss_13": 3.5902424931526182, "ce_loss_2": 4.121148645877838, "ce_loss_3": 3.9508870244026184, "ce_loss_7": 3.6973974823951723, "epoch": 0.97, "grad_norm": 378.0, "kl_loss_10": 97.83438453674316, "kl_loss_2": 1114.7044860839844, "kl_loss_3": 766.2058898925782, "kl_loss_7": 189.7753791809082, "learning_rate": 2.2640387134577057e-06, "loss": 528.5559, "step": 9700 }, { "ce_loss_10": 3.579375672340393, "ce_loss_13": 3.5232587337493895, "ce_loss_2": 4.008558976650238, "ce_loss_3": 3.853218126296997, "ce_loss_7": 3.621316111087799, "epoch": 0.971, "grad_norm": 346.0, "kl_loss_10": 89.91974563598633, "kl_loss_2": 1025.9577575683593, "kl_loss_3": 709.8189392089844, "kl_loss_7": 177.37805099487304, "learning_rate": 2.115723161700278e-06, "loss": 511.7921, "step": 9710 }, { "ce_loss_10": 3.5539788961410523, "ce_loss_13": 3.493563008308411, "ce_loss_2": 4.019102883338928, "ce_loss_3": 3.8513848066329954, "ce_loss_7": 3.6021719098091127, "epoch": 0.972, "grad_norm": 450.0, "kl_loss_10": 97.08839912414551, "kl_loss_2": 1102.8951354980468, "kl_loss_3": 763.9947265625, "kl_loss_7": 189.97327194213867, "learning_rate": 1.9724212990830937e-06, "loss": 534.7647, "step": 9720 }, { "ce_loss_10": 3.7055511236190797, "ce_loss_13": 3.645791494846344, "ce_loss_2": 4.164284873008728, "ce_loss_3": 3.998200333118439, "ce_loss_7": 3.748906970024109, "epoch": 0.973, "grad_norm": 306.0, "kl_loss_10": 97.2132583618164, "kl_loss_2": 1086.074758911133, "kl_loss_3": 748.4372924804687, "kl_loss_7": 187.37096481323243, "learning_rate": 1.8341345686543331e-06, "loss": 526.9427, "step": 9730 }, { "ce_loss_10": 3.688717949390411, "ce_loss_13": 3.6282296895980837, "ce_loss_2": 4.123041558265686, "ce_loss_3": 3.963315784931183, "ce_loss_7": 3.731441855430603, "epoch": 0.974, "grad_norm": 446.0, "kl_loss_10": 95.65226020812989, "kl_loss_2": 1053.469790649414, "kl_loss_3": 725.7204162597657, "kl_loss_7": 185.0056625366211, "learning_rate": 1.7008643629596864e-06, "loss": 524.4386, "step": 9740 }, { "ce_loss_10": 3.674058973789215, "ce_loss_13": 3.6143284678459167, "ce_loss_2": 4.119033622741699, "ce_loss_3": 3.954865837097168, "ce_loss_7": 3.7161202311515806, "epoch": 0.975, "grad_norm": 406.0, "kl_loss_10": 96.88497962951661, "kl_loss_2": 1081.9985229492188, "kl_loss_3": 741.1380645751954, "kl_loss_7": 186.00157089233397, "learning_rate": 1.5726120240288633e-06, "loss": 531.1466, "step": 9750 }, { "ce_loss_10": 3.569232928752899, "ce_loss_13": 3.511077570915222, "ce_loss_2": 4.014019024372101, "ce_loss_3": 3.854006791114807, "ce_loss_7": 3.6116040468215944, "epoch": 0.976, "grad_norm": 548.0, "kl_loss_10": 94.0260025024414, "kl_loss_2": 1069.765771484375, "kl_loss_3": 740.6184356689453, "kl_loss_7": 184.33246154785155, "learning_rate": 1.4493788433612708e-06, "loss": 520.0787, "step": 9760 }, { "ce_loss_10": 3.6905293107032775, "ce_loss_13": 3.630939745903015, "ce_loss_2": 4.1426611065864565, "ce_loss_3": 3.9766762137413023, "ce_loss_7": 3.7346643686294554, "epoch": 0.977, "grad_norm": 340.0, "kl_loss_10": 95.88525352478027, "kl_loss_2": 1083.8150268554687, "kl_loss_3": 744.1571472167968, "kl_loss_7": 186.4142593383789, "learning_rate": 1.3311660619138578e-06, "loss": 528.903, "step": 9770 }, { "ce_loss_10": 3.6836748480796815, "ce_loss_13": 3.6255483746528627, "ce_loss_2": 4.109962856769561, "ce_loss_3": 3.9559788703918457, "ce_loss_7": 3.7251157641410826, "epoch": 0.978, "grad_norm": 358.0, "kl_loss_10": 94.83126564025879, "kl_loss_2": 1041.5651123046875, "kl_loss_3": 719.0739654541015, "kl_loss_7": 183.43487930297852, "learning_rate": 1.2179748700879012e-06, "loss": 517.046, "step": 9780 }, { "ce_loss_10": 3.6114102602005005, "ce_loss_13": 3.553410363197327, "ce_loss_2": 4.060984289646148, "ce_loss_3": 3.9008304476737976, "ce_loss_7": 3.6546178460121155, "epoch": 0.979, "grad_norm": 448.0, "kl_loss_10": 94.51130638122558, "kl_loss_2": 1070.2796936035156, "kl_loss_3": 734.1844604492187, "kl_loss_7": 183.8596923828125, "learning_rate": 1.1098064077174619e-06, "loss": 522.2918, "step": 9790 }, { "ce_loss_10": 3.6468693137168886, "ce_loss_13": 3.5864970564842222, "ce_loss_2": 4.112797820568085, "ce_loss_3": 3.9430916547775268, "ce_loss_7": 3.6902678489685057, "epoch": 0.98, "grad_norm": 396.0, "kl_loss_10": 94.27075653076172, "kl_loss_2": 1089.721176147461, "kl_loss_3": 749.4085998535156, "kl_loss_7": 185.25458450317382, "learning_rate": 1.006661764057837e-06, "loss": 525.8869, "step": 9800 }, { "ce_loss_10": 3.6512062191963195, "ce_loss_13": 3.591748607158661, "ce_loss_2": 4.100599420070648, "ce_loss_3": 3.94116724729538, "ce_loss_7": 3.6929367065429686, "epoch": 0.981, "grad_norm": 370.0, "kl_loss_10": 95.22041091918945, "kl_loss_2": 1079.5743225097656, "kl_loss_3": 744.3837371826172, "kl_loss_7": 184.29767150878905, "learning_rate": 9.085419777743465e-07, "loss": 523.8814, "step": 9810 }, { "ce_loss_10": 3.5867176413536073, "ce_loss_13": 3.5297884702682496, "ce_loss_2": 4.040095067024231, "ce_loss_3": 3.876063418388367, "ce_loss_7": 3.6296881198883058, "epoch": 0.982, "grad_norm": 372.0, "kl_loss_10": 92.43338165283203, "kl_loss_2": 1068.0833831787108, "kl_loss_3": 736.9401550292969, "kl_loss_7": 179.86018447875978, "learning_rate": 8.15448036932176e-07, "loss": 515.9226, "step": 9820 }, { "ce_loss_10": 3.641873502731323, "ce_loss_13": 3.5830901145935057, "ce_loss_2": 4.088828957080841, "ce_loss_3": 3.9226441621780395, "ce_loss_7": 3.6845821142196655, "epoch": 0.983, "grad_norm": 450.0, "kl_loss_10": 93.74083061218262, "kl_loss_2": 1074.023745727539, "kl_loss_3": 742.9956268310547, "kl_loss_7": 184.04373016357422, "learning_rate": 7.273808789862724e-07, "loss": 527.5683, "step": 9830 }, { "ce_loss_10": 3.7283427119255066, "ce_loss_13": 3.6678077578544617, "ce_loss_2": 4.168006038665771, "ce_loss_3": 4.004901099205017, "ce_loss_7": 3.7688368439674376, "epoch": 0.984, "grad_norm": 302.0, "kl_loss_10": 97.79412803649902, "kl_loss_2": 1069.5015228271484, "kl_loss_3": 732.57353515625, "kl_loss_7": 187.13561325073243, "learning_rate": 6.443413907720186e-07, "loss": 519.9878, "step": 9840 }, { "ce_loss_10": 3.6514281272888183, "ce_loss_13": 3.5926932096481323, "ce_loss_2": 4.092191052436829, "ce_loss_3": 3.930639326572418, "ce_loss_7": 3.6929845571517945, "epoch": 0.985, "grad_norm": 370.0, "kl_loss_10": 94.32459564208985, "kl_loss_2": 1056.6608520507812, "kl_loss_3": 730.5611633300781, "kl_loss_7": 184.7391357421875, "learning_rate": 5.663304084960185e-07, "loss": 518.7671, "step": 9850 }, { "ce_loss_10": 3.5804072976112367, "ce_loss_13": 3.520645248889923, "ce_loss_2": 4.040905499458313, "ce_loss_3": 3.8740663886070252, "ce_loss_7": 3.6240461707115172, "epoch": 0.986, "grad_norm": 364.0, "kl_loss_10": 96.15405921936035, "kl_loss_2": 1090.2829711914062, "kl_loss_3": 753.8562805175782, "kl_loss_7": 186.33775329589844, "learning_rate": 4.933487177280482e-07, "loss": 518.7076, "step": 9860 }, { "ce_loss_10": 3.6774788737297057, "ce_loss_13": 3.6186564683914186, "ce_loss_2": 4.12188241481781, "ce_loss_3": 3.958666682243347, "ce_loss_7": 3.7196821093559267, "epoch": 0.987, "grad_norm": 408.0, "kl_loss_10": 94.4230339050293, "kl_loss_2": 1058.865460205078, "kl_loss_3": 734.6985687255859, "kl_loss_7": 181.19112243652344, "learning_rate": 4.2539705339295075e-07, "loss": 516.55, "step": 9870 }, { "ce_loss_10": 3.525623691082001, "ce_loss_13": 3.46755028963089, "ce_loss_2": 3.986714744567871, "ce_loss_3": 3.828349435329437, "ce_loss_7": 3.572091352939606, "epoch": 0.988, "grad_norm": 376.0, "kl_loss_10": 93.51777114868165, "kl_loss_2": 1095.3638244628905, "kl_loss_3": 760.6636901855469, "kl_loss_7": 187.32990493774415, "learning_rate": 3.6247609976319816e-07, "loss": 523.6327, "step": 9880 }, { "ce_loss_10": 3.6277820110321044, "ce_loss_13": 3.567537808418274, "ce_loss_2": 4.088609397411346, "ce_loss_3": 3.924705386161804, "ce_loss_7": 3.6743552684783936, "epoch": 0.989, "grad_norm": 476.0, "kl_loss_10": 96.44798164367675, "kl_loss_2": 1082.7717010498047, "kl_loss_3": 749.9800903320313, "kl_loss_7": 188.03020095825195, "learning_rate": 3.0458649045211895e-07, "loss": 536.5464, "step": 9890 }, { "ce_loss_10": 3.596536934375763, "ce_loss_13": 3.5354753971099853, "ce_loss_2": 4.064900302886963, "ce_loss_3": 3.895892357826233, "ce_loss_7": 3.6420591354370115, "epoch": 0.99, "grad_norm": 354.0, "kl_loss_10": 95.05449028015137, "kl_loss_2": 1090.392593383789, "kl_loss_3": 754.5375762939453, "kl_loss_7": 188.19856491088868, "learning_rate": 2.517288084074587e-07, "loss": 534.519, "step": 9900 }, { "ce_loss_10": 3.635116171836853, "ce_loss_13": 3.574588453769684, "ce_loss_2": 4.111752784252166, "ce_loss_3": 3.944272756576538, "ce_loss_7": 3.682980465888977, "epoch": 0.991, "grad_norm": 354.0, "kl_loss_10": 95.2107322692871, "kl_loss_2": 1113.8304351806642, "kl_loss_3": 770.2767669677735, "kl_loss_7": 189.45772018432618, "learning_rate": 2.0390358590538505e-07, "loss": 533.4635, "step": 9910 }, { "ce_loss_10": 3.644596242904663, "ce_loss_13": 3.5852715373039246, "ce_loss_2": 4.097472989559174, "ce_loss_3": 3.93781635761261, "ce_loss_7": 3.6912411212921143, "epoch": 0.992, "grad_norm": 360.0, "kl_loss_10": 95.30287055969238, "kl_loss_2": 1081.2569366455077, "kl_loss_3": 748.5147064208984, "kl_loss_7": 189.18702926635743, "learning_rate": 1.61111304545436e-07, "loss": 523.9828, "step": 9920 }, { "ce_loss_10": 3.6098355293273925, "ce_loss_13": 3.5515360593795777, "ce_loss_2": 4.0586741924285885, "ce_loss_3": 3.895809698104858, "ce_loss_7": 3.651991581916809, "epoch": 0.993, "grad_norm": 408.0, "kl_loss_10": 94.64987220764161, "kl_loss_2": 1077.9339660644532, "kl_loss_3": 744.9157043457031, "kl_loss_7": 185.0061477661133, "learning_rate": 1.2335239524541298e-07, "loss": 518.6489, "step": 9930 }, { "ce_loss_10": 3.5815568804740905, "ce_loss_13": 3.523807632923126, "ce_loss_2": 4.032287573814392, "ce_loss_3": 3.871028816699982, "ce_loss_7": 3.625322496891022, "epoch": 0.994, "grad_norm": 396.0, "kl_loss_10": 94.44433670043945, "kl_loss_2": 1070.128958129883, "kl_loss_3": 738.4679992675781, "kl_loss_7": 184.16616668701172, "learning_rate": 9.06272382371065e-08, "loss": 522.681, "step": 9940 }, { "ce_loss_10": 3.649178886413574, "ce_loss_13": 3.5898856997489927, "ce_loss_2": 4.107921350002289, "ce_loss_3": 3.9442641377449035, "ce_loss_7": 3.6927329182624815, "epoch": 0.995, "grad_norm": 366.0, "kl_loss_10": 97.43391189575195, "kl_loss_2": 1093.5839904785157, "kl_loss_3": 755.0986083984375, "kl_loss_7": 187.86676559448242, "learning_rate": 6.293616306246586e-08, "loss": 528.0195, "step": 9950 }, { "ce_loss_10": 3.6503029584884645, "ce_loss_13": 3.5912461996078493, "ce_loss_2": 4.083380007743836, "ce_loss_3": 3.9251137375831604, "ce_loss_7": 3.6914992809295653, "epoch": 0.996, "grad_norm": 386.0, "kl_loss_10": 92.51536598205567, "kl_loss_2": 1047.152279663086, "kl_loss_3": 724.7971832275391, "kl_loss_7": 180.47787857055664, "learning_rate": 4.027944857032395e-08, "loss": 508.9252, "step": 9960 }, { "ce_loss_10": 3.640582966804504, "ce_loss_13": 3.5819292664527893, "ce_loss_2": 4.071112728118896, "ce_loss_3": 3.9061211466789247, "ce_loss_7": 3.678541886806488, "epoch": 0.997, "grad_norm": 332.0, "kl_loss_10": 94.77192039489746, "kl_loss_2": 1031.1441497802734, "kl_loss_3": 710.4731018066407, "kl_loss_7": 178.55070953369142, "learning_rate": 2.265732291356626e-08, "loss": 508.5261, "step": 9970 }, { "ce_loss_10": 3.6857742786407472, "ce_loss_13": 3.6264986276626585, "ce_loss_2": 4.119817900657654, "ce_loss_3": 3.9565661191940307, "ce_loss_7": 3.7256898403167726, "epoch": 0.998, "grad_norm": 354.0, "kl_loss_10": 95.05257987976074, "kl_loss_2": 1045.786294555664, "kl_loss_3": 725.6510559082031, "kl_loss_7": 184.49017181396485, "learning_rate": 1.0069963546743833e-08, "loss": 527.3226, "step": 9980 }, { "ce_loss_10": 3.66132515668869, "ce_loss_13": 3.6029880166053774, "ce_loss_2": 4.108680582046508, "ce_loss_3": 3.9468571662902834, "ce_loss_7": 3.7058345794677736, "epoch": 0.999, "grad_norm": 358.0, "kl_loss_10": 95.48841247558593, "kl_loss_2": 1072.3943481445312, "kl_loss_3": 746.4591552734375, "kl_loss_7": 185.93264389038086, "learning_rate": 2.517497224463483e-09, "loss": 522.7165, "step": 9990 }, { "ce_loss_10": 3.6195040583610534, "ce_loss_13": 3.559644305706024, "ce_loss_2": 4.096893215179444, "ce_loss_3": 3.9238691568374633, "ce_loss_7": 3.6661298632621766, "epoch": 1.0, "grad_norm": 502.0, "kl_loss_10": 96.35170402526856, "kl_loss_2": 1110.4292907714844, "kl_loss_3": 763.8701965332032, "kl_loss_7": 189.7416961669922, "learning_rate": 0.0, "loss": 533.5189, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.177819035608023e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }