diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_10": 6.949970483779907, + "ce_loss_13": 3.5991063117980957, + "ce_loss_2": 20.74317169189453, + "ce_loss_3": 26.111305236816406, + "ce_loss_7": 10.075343608856201, + "epoch": 0.0001, + "grad_norm": 212992.0, + "kl_loss_10": 7864.61865234375, + "kl_loss_2": 35348.310546875, + "kl_loss_3": 46478.765625, + "kl_loss_7": 14199.76806640625, + "learning_rate": 1e-05, + "loss": 25853.3086, + "step": 1 + }, + { + "ce_loss_10": 6.125355773501926, + "ce_loss_13": 3.6540163622962103, + "ce_loss_2": 12.076997624503242, + "ce_loss_3": 15.207524087693956, + "ce_loss_7": 7.174011654324001, + "epoch": 0.001, + "grad_norm": 17792.0, + "kl_loss_10": 5945.64690483941, + "kl_loss_2": 17211.485812717016, + "kl_loss_3": 23727.339274088543, + "kl_loss_7": 7859.595960828993, + "learning_rate": 0.0001, + "loss": 13522.6684, + "step": 10 + }, + { + "ce_loss_10": 4.603781843185425, + "ce_loss_13": 3.6583157896995546, + "ce_loss_2": 6.582165956497192, + "ce_loss_3": 6.530840277671814, + "ce_loss_7": 4.767995834350586, + "epoch": 0.002, + "grad_norm": 2416.0, + "kl_loss_10": 2069.933459472656, + "kl_loss_2": 5383.958471679687, + "kl_loss_3": 5293.375524902343, + "kl_loss_7": 2034.765203857422, + "learning_rate": 0.0002, + "loss": 3721.8547, + "step": 20 + }, + { + "ce_loss_10": 3.8755152463912963, + "ce_loss_13": 3.440992832183838, + "ce_loss_2": 5.676479697227478, + "ce_loss_3": 5.412591814994812, + "ce_loss_7": 4.082889425754547, + "epoch": 0.003, + "grad_norm": 1440.0, + "kl_loss_10": 764.9044219970704, + "kl_loss_2": 4085.6137573242186, + "kl_loss_3": 3584.689697265625, + "kl_loss_7": 1103.4823669433595, + "learning_rate": 0.0003, + "loss": 2353.6012, + "step": 30 + }, + { + "ce_loss_10": 3.9151899099349974, + "ce_loss_13": 3.612821674346924, + "ce_loss_2": 5.4220003366470335, + "ce_loss_3": 5.159578323364258, + "ce_loss_7": 4.096081411838531, + "epoch": 0.004, + "grad_norm": 2240.0, + "kl_loss_10": 524.7813415527344, + "kl_loss_2": 3348.9226196289064, + "kl_loss_3": 2841.234338378906, + "kl_loss_7": 838.199349975586, + "learning_rate": 0.0004, + "loss": 1895.9125, + "step": 40 + }, + { + "ce_loss_10": 3.8530999541282656, + "ce_loss_13": 3.5993613958358766, + "ce_loss_2": 5.269976806640625, + "ce_loss_3": 4.972214603424073, + "ce_loss_7": 4.020862734317779, + "epoch": 0.005, + "grad_norm": 1856.0, + "kl_loss_10": 415.9984130859375, + "kl_loss_2": 3121.74814453125, + "kl_loss_3": 2545.165393066406, + "kl_loss_7": 715.3964080810547, + "learning_rate": 0.0005, + "loss": 1695.3973, + "step": 50 + }, + { + "ce_loss_10": 3.8142170667648316, + "ce_loss_13": 3.6055872440338135, + "ce_loss_2": 5.112091851234436, + "ce_loss_3": 4.877537202835083, + "ce_loss_7": 3.9615157723426817, + "epoch": 0.006, + "grad_norm": 1256.0, + "kl_loss_10": 352.13475494384767, + "kl_loss_2": 2834.7232299804687, + "kl_loss_3": 2411.520703125, + "kl_loss_7": 626.5961456298828, + "learning_rate": 0.0006, + "loss": 1559.8957, + "step": 60 + }, + { + "ce_loss_10": 3.7009406328201293, + "ce_loss_13": 3.519935369491577, + "ce_loss_2": 4.975889682769775, + "ce_loss_3": 4.767486214637756, + "ce_loss_7": 3.847541904449463, + "epoch": 0.007, + "grad_norm": 1208.0, + "kl_loss_10": 301.96008453369143, + "kl_loss_2": 2753.304443359375, + "kl_loss_3": 2332.9209411621096, + "kl_loss_7": 579.1212310791016, + "learning_rate": 0.0007, + "loss": 1482.1832, + "step": 70 + }, + { + "ce_loss_10": 3.691651237010956, + "ce_loss_13": 3.5254875898361204, + "ce_loss_2": 4.9323248863220215, + "ce_loss_3": 4.6815266609191895, + "ce_loss_7": 3.918946826457977, + "epoch": 0.008, + "grad_norm": 1920.0, + "kl_loss_10": 277.7585922241211, + "kl_loss_2": 2673.962158203125, + "kl_loss_3": 2198.915148925781, + "kl_loss_7": 679.1366607666016, + "learning_rate": 0.0008, + "loss": 1458.9709, + "step": 80 + }, + { + "ce_loss_10": 3.630067002773285, + "ce_loss_13": 3.48206342458725, + "ce_loss_2": 4.835212993621826, + "ce_loss_3": 4.615828561782837, + "ce_loss_7": 3.8199189424514772, + "epoch": 0.009, + "grad_norm": 2336.0, + "kl_loss_10": 252.8492774963379, + "kl_loss_2": 2587.6504516601562, + "kl_loss_3": 2134.85634765625, + "kl_loss_7": 622.173226928711, + "learning_rate": 0.0009000000000000001, + "loss": 1391.4618, + "step": 90 + }, + { + "ce_loss_10": 3.7532737612724305, + "ce_loss_13": 3.6053535461425783, + "ce_loss_2": 4.9651381254196165, + "ce_loss_3": 4.679249548912049, + "ce_loss_7": 3.9317663073539735, + "epoch": 0.01, + "grad_norm": 2128.0, + "kl_loss_10": 251.45062484741212, + "kl_loss_2": 2572.321484375, + "kl_loss_3": 2009.2118774414062, + "kl_loss_7": 549.0976821899415, + "learning_rate": 0.001, + "loss": 1348.2805, + "step": 100 + }, + { + "ce_loss_10": 3.7507344126701354, + "ce_loss_13": 3.560739505290985, + "ce_loss_2": 4.900289678573609, + "ce_loss_3": 4.575748300552368, + "ce_loss_7": 3.855139982700348, + "epoch": 0.011, + "grad_norm": 2096.0, + "kl_loss_10": 336.0966377258301, + "kl_loss_2": 2548.9544921875, + "kl_loss_3": 1914.9787902832031, + "kl_loss_7": 503.8795822143555, + "learning_rate": 0.0009999974825027757, + "loss": 1319.618, + "step": 110 + }, + { + "ce_loss_10": 3.820546102523804, + "ce_loss_13": 3.6168412566184998, + "ce_loss_2": 4.895331883430481, + "ce_loss_3": 4.579937386512756, + "ce_loss_7": 3.905752420425415, + "epoch": 0.012, + "grad_norm": 1216.0, + "kl_loss_10": 360.8111114501953, + "kl_loss_2": 2423.57294921875, + "kl_loss_3": 1821.830096435547, + "kl_loss_7": 497.84374084472654, + "learning_rate": 0.0009999899300364532, + "loss": 1256.7569, + "step": 120 + }, + { + "ce_loss_10": 3.765466582775116, + "ce_loss_13": 3.588362789154053, + "ce_loss_2": 4.861788105964661, + "ce_loss_3": 4.571296620368957, + "ce_loss_7": 3.876194405555725, + "epoch": 0.013, + "grad_norm": 1920.0, + "kl_loss_10": 300.5776092529297, + "kl_loss_2": 2421.4876953125, + "kl_loss_3": 1850.192041015625, + "kl_loss_7": 510.1988525390625, + "learning_rate": 0.0009999773426770863, + "loss": 1278.616, + "step": 130 + }, + { + "ce_loss_10": 3.812940168380737, + "ce_loss_13": 3.6267056345939634, + "ce_loss_2": 4.856600952148438, + "ce_loss_3": 4.555375063419342, + "ce_loss_7": 3.9197404980659485, + "epoch": 0.014, + "grad_norm": 1104.0, + "kl_loss_10": 334.16673736572267, + "kl_loss_2": 2343.682012939453, + "kl_loss_3": 1750.5664672851562, + "kl_loss_7": 514.5185562133789, + "learning_rate": 0.0009999597205514296, + "loss": 1248.4314, + "step": 140 + }, + { + "ce_loss_10": 3.7693222880363466, + "ce_loss_13": 3.5812222719192506, + "ce_loss_2": 4.7746042013168335, + "ce_loss_3": 4.491594767570495, + "ce_loss_7": 3.9013134360313417, + "epoch": 0.015, + "grad_norm": 2000.0, + "kl_loss_10": 301.1280906677246, + "kl_loss_2": 2261.218878173828, + "kl_loss_3": 1705.700830078125, + "kl_loss_7": 572.2008193969726, + "learning_rate": 0.0009999370638369377, + "loss": 1215.427, + "step": 150 + }, + { + "ce_loss_10": 3.771867072582245, + "ce_loss_13": 3.623634135723114, + "ce_loss_2": 4.793287062644959, + "ce_loss_3": 4.509058833122253, + "ce_loss_7": 3.964562237262726, + "epoch": 0.016, + "grad_norm": 1736.0, + "kl_loss_10": 262.3149169921875, + "kl_loss_2": 2258.9556396484377, + "kl_loss_3": 1685.8056640625, + "kl_loss_7": 604.1481872558594, + "learning_rate": 0.000999909372761763, + "loss": 1207.4248, + "step": 160 + }, + { + "ce_loss_10": 3.702605497837067, + "ce_loss_13": 3.555959129333496, + "ce_loss_2": 4.72133858203888, + "ce_loss_3": 4.451281535625458, + "ce_loss_7": 3.8406598806381225, + "epoch": 0.017, + "grad_norm": 1536.0, + "kl_loss_10": 242.87019119262695, + "kl_loss_2": 2254.203680419922, + "kl_loss_3": 1717.980206298828, + "kl_loss_7": 507.3527557373047, + "learning_rate": 0.0009998766476047546, + "loss": 1188.5746, + "step": 170 + }, + { + "ce_loss_10": 3.7377680063247682, + "ce_loss_13": 3.600107181072235, + "ce_loss_2": 4.7649291276931764, + "ce_loss_3": 4.480887150764465, + "ce_loss_7": 3.8928799867630004, + "epoch": 0.018, + "grad_norm": 1096.0, + "kl_loss_10": 231.40118408203125, + "kl_loss_2": 2247.180828857422, + "kl_loss_3": 1668.1500427246094, + "kl_loss_7": 502.5091278076172, + "learning_rate": 0.0009998388886954545, + "loss": 1181.5367, + "step": 180 + }, + { + "ce_loss_10": 3.693929398059845, + "ce_loss_13": 3.5655275106430055, + "ce_loss_2": 4.720048952102661, + "ce_loss_3": 4.425967907905578, + "ce_loss_7": 3.8348891854286196, + "epoch": 0.019, + "grad_norm": 1032.0, + "kl_loss_10": 214.6924057006836, + "kl_loss_2": 2225.93642578125, + "kl_loss_3": 1640.7731079101563, + "kl_loss_7": 467.8557983398438, + "learning_rate": 0.0009997960964140947, + "loss": 1132.1148, + "step": 190 + }, + { + "ce_loss_10": 3.682847249507904, + "ce_loss_13": 3.5613570332527162, + "ce_loss_2": 4.7296292066574095, + "ce_loss_3": 4.422930908203125, + "ce_loss_7": 3.8146942615509034, + "epoch": 0.02, + "grad_norm": 1360.0, + "kl_loss_10": 204.76239395141602, + "kl_loss_2": 2234.7769409179687, + "kl_loss_3": 1619.6412719726563, + "kl_loss_7": 443.86146240234376, + "learning_rate": 0.0009997482711915926, + "loss": 1118.6208, + "step": 200 + }, + { + "ce_loss_10": 3.6386141061782835, + "ce_loss_13": 3.523626208305359, + "ce_loss_2": 4.654034543037414, + "ce_loss_3": 4.351287698745727, + "ce_loss_7": 3.7648990035057066, + "epoch": 0.021, + "grad_norm": 844.0, + "kl_loss_10": 191.48220748901366, + "kl_loss_2": 2176.9761169433596, + "kl_loss_3": 1592.8676696777343, + "kl_loss_7": 425.4680374145508, + "learning_rate": 0.0009996954135095479, + "loss": 1087.035, + "step": 210 + }, + { + "ce_loss_10": 3.726301395893097, + "ce_loss_13": 3.613930583000183, + "ce_loss_2": 4.689849066734314, + "ce_loss_3": 4.408529257774353, + "ce_loss_7": 3.8633771181106566, + "epoch": 0.022, + "grad_norm": 968.0, + "kl_loss_10": 185.19210891723634, + "kl_loss_2": 2058.4550598144533, + "kl_loss_3": 1519.1773864746094, + "kl_loss_7": 434.87823486328125, + "learning_rate": 0.0009996375239002368, + "loss": 1051.0784, + "step": 220 + }, + { + "ce_loss_10": 3.7977551460266112, + "ce_loss_13": 3.6818463683128355, + "ce_loss_2": 4.714341163635254, + "ce_loss_3": 4.444690012931824, + "ce_loss_7": 3.9216720938682554, + "epoch": 0.023, + "grad_norm": 792.0, + "kl_loss_10": 197.30275802612306, + "kl_loss_2": 1985.7976623535155, + "kl_loss_3": 1460.261212158203, + "kl_loss_7": 417.5539093017578, + "learning_rate": 0.0009995746029466072, + "loss": 1021.8153, + "step": 230 + }, + { + "ce_loss_10": 3.5866660118103026, + "ce_loss_13": 3.465270149707794, + "ce_loss_2": 4.572040939331055, + "ce_loss_3": 4.279095077514649, + "ce_loss_7": 3.7078741550445558, + "epoch": 0.024, + "grad_norm": 908.0, + "kl_loss_10": 207.95580520629883, + "kl_loss_2": 2143.852404785156, + "kl_loss_3": 1571.8129760742188, + "kl_loss_7": 426.2864517211914, + "learning_rate": 0.0009995066512822719, + "loss": 1050.4631, + "step": 240 + }, + { + "ce_loss_10": 3.686765193939209, + "ce_loss_13": 3.5706915736198424, + "ce_loss_2": 4.679925036430359, + "ce_loss_3": 4.384693300724029, + "ce_loss_7": 3.8067264676094057, + "epoch": 0.025, + "grad_norm": 1032.0, + "kl_loss_10": 199.02285385131836, + "kl_loss_2": 2131.8410888671874, + "kl_loss_3": 1544.0210571289062, + "kl_loss_7": 413.1264175415039, + "learning_rate": 0.000999433669591504, + "loss": 1033.4686, + "step": 250 + }, + { + "ce_loss_10": 3.581657183170319, + "ce_loss_13": 3.472314774990082, + "ce_loss_2": 4.541779208183288, + "ce_loss_3": 4.317579293251038, + "ce_loss_7": 3.7064756393432616, + "epoch": 0.026, + "grad_norm": 932.0, + "kl_loss_10": 189.23829040527343, + "kl_loss_2": 2088.3945861816405, + "kl_loss_3": 1637.8548217773437, + "kl_loss_7": 412.1736145019531, + "learning_rate": 0.000999355658609228, + "loss": 1057.2906, + "step": 260 + }, + { + "ce_loss_10": 3.6219969391822815, + "ce_loss_13": 3.5048365235328673, + "ce_loss_2": 4.596334934234619, + "ce_loss_3": 4.441597318649292, + "ce_loss_7": 3.7431845664978027, + "epoch": 0.027, + "grad_norm": 900.0, + "kl_loss_10": 188.88880004882813, + "kl_loss_2": 2095.5673889160157, + "kl_loss_3": 1750.0010925292968, + "kl_loss_7": 405.53798522949216, + "learning_rate": 0.0009992726191210138, + "loss": 1093.5438, + "step": 270 + }, + { + "ce_loss_10": 3.652155375480652, + "ce_loss_13": 3.5428276896476745, + "ce_loss_2": 4.579152154922485, + "ce_loss_3": 4.383497536182404, + "ce_loss_7": 3.789737546443939, + "epoch": 0.028, + "grad_norm": 780.0, + "kl_loss_10": 187.1881446838379, + "kl_loss_2": 2006.3188415527343, + "kl_loss_3": 1629.3374816894532, + "kl_loss_7": 423.36602783203125, + "learning_rate": 0.0009991845519630679, + "loss": 1050.2449, + "step": 280 + }, + { + "ce_loss_10": 3.535972011089325, + "ce_loss_13": 3.427106332778931, + "ce_loss_2": 4.4744978785514835, + "ce_loss_3": 4.262858963012695, + "ce_loss_7": 3.6743877053260805, + "epoch": 0.029, + "grad_norm": 684.0, + "kl_loss_10": 179.29404220581054, + "kl_loss_2": 2016.5242065429688, + "kl_loss_3": 1582.6825744628907, + "kl_loss_7": 441.13164978027345, + "learning_rate": 0.0009990914580222257, + "loss": 1053.0684, + "step": 290 + }, + { + "ce_loss_10": 3.668388879299164, + "ce_loss_13": 3.567758929729462, + "ce_loss_2": 4.53744785785675, + "ce_loss_3": 4.309589576721192, + "ce_loss_7": 3.8119096040725706, + "epoch": 0.03, + "grad_norm": 1224.0, + "kl_loss_10": 187.29344177246094, + "kl_loss_2": 1896.2646728515624, + "kl_loss_3": 1456.2583312988281, + "kl_loss_7": 421.26478881835936, + "learning_rate": 0.0009989933382359422, + "loss": 1015.491, + "step": 300 + }, + { + "ce_loss_10": 3.6792996883392335, + "ce_loss_13": 3.5740679264068604, + "ce_loss_2": 4.557365846633911, + "ce_loss_3": 4.323324573040009, + "ce_loss_7": 3.7865865588188172, + "epoch": 0.031, + "grad_norm": 828.0, + "kl_loss_10": 187.5126724243164, + "kl_loss_2": 1923.0036499023438, + "kl_loss_3": 1442.6469970703124, + "kl_loss_7": 384.39212493896486, + "learning_rate": 0.0009988901935922825, + "loss": 997.117, + "step": 310 + }, + { + "ce_loss_10": 3.5293397903442383, + "ce_loss_13": 3.4199066400527953, + "ce_loss_2": 4.486744737625122, + "ce_loss_3": 4.227087867259979, + "ce_loss_7": 3.6469008684158326, + "epoch": 0.032, + "grad_norm": 976.0, + "kl_loss_10": 183.28864593505858, + "kl_loss_2": 2055.861083984375, + "kl_loss_3": 1531.5701293945312, + "kl_loss_7": 385.79076690673827, + "learning_rate": 0.0009987820251299122, + "loss": 1008.4045, + "step": 320 + }, + { + "ce_loss_10": 3.66086140871048, + "ce_loss_13": 3.556379699707031, + "ce_loss_2": 4.536041283607483, + "ce_loss_3": 4.270140862464904, + "ce_loss_7": 3.770153260231018, + "epoch": 0.033, + "grad_norm": 1144.0, + "kl_loss_10": 168.59372940063477, + "kl_loss_2": 1906.9965759277343, + "kl_loss_3": 1385.0498474121093, + "kl_loss_7": 372.18479614257814, + "learning_rate": 0.0009986688339380862, + "loss": 957.1518, + "step": 330 + }, + { + "ce_loss_10": 3.6052905559539794, + "ce_loss_13": 3.504980742931366, + "ce_loss_2": 4.501095390319824, + "ce_loss_3": 4.218203604221344, + "ce_loss_7": 3.727604556083679, + "epoch": 0.034, + "grad_norm": 1104.0, + "kl_loss_10": 164.38146286010743, + "kl_loss_2": 1931.4434020996093, + "kl_loss_3": 1379.435321044922, + "kl_loss_7": 389.75638275146486, + "learning_rate": 0.0009985506211566387, + "loss": 969.0948, + "step": 340 + }, + { + "ce_loss_10": 3.6377886295318604, + "ce_loss_13": 3.541017484664917, + "ce_loss_2": 4.482600402832031, + "ce_loss_3": 4.22925614118576, + "ce_loss_7": 3.7690312385559084, + "epoch": 0.035, + "grad_norm": 988.0, + "kl_loss_10": 158.43596343994142, + "kl_loss_2": 1829.8166870117188, + "kl_loss_3": 1337.818865966797, + "kl_loss_7": 388.05591278076173, + "learning_rate": 0.0009984273879759713, + "loss": 933.1328, + "step": 350 + }, + { + "ce_loss_10": 3.667439329624176, + "ce_loss_13": 3.5666789412498474, + "ce_loss_2": 4.5066794633865355, + "ce_loss_3": 4.2926198720932005, + "ce_loss_7": 3.7826303958892824, + "epoch": 0.036, + "grad_norm": 600.0, + "kl_loss_10": 162.84700927734374, + "kl_loss_2": 1826.3444274902345, + "kl_loss_3": 1395.6122314453125, + "kl_loss_7": 384.24933471679685, + "learning_rate": 0.0009982991356370402, + "loss": 957.8976, + "step": 360 + }, + { + "ce_loss_10": 3.643305718898773, + "ce_loss_13": 3.545375657081604, + "ce_loss_2": 4.487171721458435, + "ce_loss_3": 4.280910170078277, + "ce_loss_7": 3.767821896076202, + "epoch": 0.037, + "grad_norm": 596.0, + "kl_loss_10": 164.2067985534668, + "kl_loss_2": 1829.6034606933595, + "kl_loss_3": 1399.7697387695312, + "kl_loss_7": 389.38902282714844, + "learning_rate": 0.0009981658654313456, + "loss": 945.4266, + "step": 370 + }, + { + "ce_loss_10": 3.728627920150757, + "ce_loss_13": 3.628399407863617, + "ce_loss_2": 4.530555677413941, + "ce_loss_3": 4.30595852136612, + "ce_loss_7": 3.83515260219574, + "epoch": 0.038, + "grad_norm": 572.0, + "kl_loss_10": 166.8636932373047, + "kl_loss_2": 1769.3647521972657, + "kl_loss_3": 1309.9726135253907, + "kl_loss_7": 360.24556121826174, + "learning_rate": 0.000998027578700917, + "loss": 918.2047, + "step": 380 + }, + { + "ce_loss_10": 3.6558377385139464, + "ce_loss_13": 3.5584804892539976, + "ce_loss_2": 4.499538516998291, + "ce_loss_3": 4.255290400981903, + "ce_loss_7": 3.7707452058792112, + "epoch": 0.039, + "grad_norm": 684.0, + "kl_loss_10": 164.3509963989258, + "kl_loss_2": 1842.500555419922, + "kl_loss_3": 1353.7652526855468, + "kl_loss_7": 364.5350601196289, + "learning_rate": 0.0009978842768382998, + "loss": 935.4773, + "step": 390 + }, + { + "ce_loss_10": 3.6760897040367126, + "ce_loss_13": 3.5800448179244997, + "ce_loss_2": 4.493572664260864, + "ce_loss_3": 4.2423638820648195, + "ce_loss_7": 3.786301875114441, + "epoch": 0.04, + "grad_norm": 968.0, + "kl_loss_10": 161.01671752929687, + "kl_loss_2": 1790.2952514648437, + "kl_loss_3": 1298.0454895019532, + "kl_loss_7": 363.45732421875, + "learning_rate": 0.0009977359612865424, + "loss": 914.3111, + "step": 400 + }, + { + "ce_loss_10": 3.684686779975891, + "ce_loss_13": 3.586086595058441, + "ce_loss_2": 4.512642502784729, + "ce_loss_3": 4.255100309848785, + "ce_loss_7": 3.805712080001831, + "epoch": 0.041, + "grad_norm": 724.0, + "kl_loss_10": 161.0974250793457, + "kl_loss_2": 1807.8360168457032, + "kl_loss_3": 1310.046209716797, + "kl_loss_7": 391.8801742553711, + "learning_rate": 0.0009975826335391806, + "loss": 914.0043, + "step": 410 + }, + { + "ce_loss_10": 3.707440197467804, + "ce_loss_13": 3.604601538181305, + "ce_loss_2": 4.522381353378296, + "ce_loss_3": 4.265636503696442, + "ce_loss_7": 3.822117471694946, + "epoch": 0.042, + "grad_norm": 900.0, + "kl_loss_10": 166.57249908447267, + "kl_loss_2": 1773.633642578125, + "kl_loss_3": 1273.706396484375, + "kl_loss_7": 380.59193420410156, + "learning_rate": 0.0009974242951402235, + "loss": 906.3268, + "step": 420 + }, + { + "ce_loss_10": 3.7127435922622682, + "ce_loss_13": 3.6068360447883605, + "ce_loss_2": 4.534731841087341, + "ce_loss_3": 4.272397923469543, + "ce_loss_7": 3.8254016041755676, + "epoch": 0.043, + "grad_norm": 544.0, + "kl_loss_10": 171.76721878051757, + "kl_loss_2": 1813.8242553710938, + "kl_loss_3": 1297.1632202148437, + "kl_loss_7": 380.753857421875, + "learning_rate": 0.0009972609476841367, + "loss": 907.3121, + "step": 430 + }, + { + "ce_loss_10": 3.638201355934143, + "ce_loss_13": 3.521967649459839, + "ce_loss_2": 4.476631236076355, + "ce_loss_3": 4.207662534713745, + "ce_loss_7": 3.743925619125366, + "epoch": 0.044, + "grad_norm": 656.0, + "kl_loss_10": 205.51385726928712, + "kl_loss_2": 1862.7595336914062, + "kl_loss_3": 1318.6298767089843, + "kl_loss_7": 397.6446823120117, + "learning_rate": 0.0009970925928158272, + "loss": 947.2434, + "step": 440 + }, + { + "ce_loss_10": 3.5770766854286196, + "ce_loss_13": 3.463445019721985, + "ce_loss_2": 4.41340719461441, + "ce_loss_3": 4.154228365421295, + "ce_loss_7": 3.683333933353424, + "epoch": 0.045, + "grad_norm": 544.0, + "kl_loss_10": 187.16454544067383, + "kl_loss_2": 1860.5320922851563, + "kl_loss_3": 1349.3171264648438, + "kl_loss_7": 389.9994171142578, + "learning_rate": 0.000996919232230627, + "loss": 931.1581, + "step": 450 + }, + { + "ce_loss_10": 3.6615111470222472, + "ce_loss_13": 3.5475740671157836, + "ce_loss_2": 4.4471900224685665, + "ce_loss_3": 4.206307077407837, + "ce_loss_7": 3.769719123840332, + "epoch": 0.046, + "grad_norm": 792.0, + "kl_loss_10": 189.3802345275879, + "kl_loss_2": 1767.1443359375, + "kl_loss_3": 1299.2234252929688, + "kl_loss_7": 404.1760650634766, + "learning_rate": 0.0009967408676742752, + "loss": 896.3932, + "step": 460 + }, + { + "ce_loss_10": 3.815341627597809, + "ce_loss_13": 3.6976951956748962, + "ce_loss_2": 4.575603008270264, + "ce_loss_3": 4.349165272712708, + "ce_loss_7": 3.926807904243469, + "epoch": 0.047, + "grad_norm": 1020.0, + "kl_loss_10": 193.17176513671876, + "kl_loss_2": 1722.3591735839843, + "kl_loss_3": 1269.6951721191406, + "kl_loss_7": 399.5799560546875, + "learning_rate": 0.0009965575009429006, + "loss": 911.5342, + "step": 470 + }, + { + "ce_loss_10": 3.5749866485595705, + "ce_loss_13": 3.471819591522217, + "ce_loss_2": 4.3897274255752565, + "ce_loss_3": 4.163278090953827, + "ce_loss_7": 3.6932021975517273, + "epoch": 0.048, + "grad_norm": 832.0, + "kl_loss_10": 173.3515739440918, + "kl_loss_2": 1803.337139892578, + "kl_loss_3": 1356.6680847167968, + "kl_loss_7": 384.8886749267578, + "learning_rate": 0.0009963691338830043, + "loss": 913.6404, + "step": 480 + }, + { + "ce_loss_10": 3.6706506490707396, + "ce_loss_13": 3.5724706411361695, + "ce_loss_2": 4.442422878742218, + "ce_loss_3": 4.223198866844177, + "ce_loss_7": 3.7754740715026855, + "epoch": 0.049, + "grad_norm": 664.0, + "kl_loss_10": 163.7422233581543, + "kl_loss_2": 1726.0343017578125, + "kl_loss_3": 1283.6178405761718, + "kl_loss_7": 355.8869354248047, + "learning_rate": 0.0009961757683914405, + "loss": 866.413, + "step": 490 + }, + { + "ce_loss_10": 3.657481300830841, + "ce_loss_13": 3.561222219467163, + "ce_loss_2": 4.412674343585968, + "ce_loss_3": 4.190000641345978, + "ce_loss_7": 3.7463939428329467, + "epoch": 0.05, + "grad_norm": 552.0, + "kl_loss_10": 171.74871139526368, + "kl_loss_2": 1693.8871337890625, + "kl_loss_3": 1238.5546569824219, + "kl_loss_7": 333.96338348388673, + "learning_rate": 0.0009959774064153978, + "loss": 867.9215, + "step": 500 + }, + { + "ce_loss_10": 3.6671042442321777, + "ce_loss_13": 3.5669935941696167, + "ce_loss_2": 4.402782237529754, + "ce_loss_3": 4.179040241241455, + "ce_loss_7": 3.7529300928115843, + "epoch": 0.051, + "grad_norm": 548.0, + "kl_loss_10": 165.0301971435547, + "kl_loss_2": 1649.0840270996093, + "kl_loss_3": 1201.8943420410155, + "kl_loss_7": 327.53272857666013, + "learning_rate": 0.0009957740499523787, + "loss": 850.5875, + "step": 510 + }, + { + "ce_loss_10": 3.692741870880127, + "ce_loss_13": 3.5905726313591004, + "ce_loss_2": 4.450686037540436, + "ce_loss_3": 4.220798969268799, + "ce_loss_7": 3.785287392139435, + "epoch": 0.052, + "grad_norm": 560.0, + "kl_loss_10": 160.49609146118163, + "kl_loss_2": 1681.6054443359376, + "kl_loss_3": 1234.1128479003905, + "kl_loss_7": 330.61391296386716, + "learning_rate": 0.0009955657010501807, + "loss": 859.9023, + "step": 520 + }, + { + "ce_loss_10": 3.654950940608978, + "ce_loss_13": 3.554408383369446, + "ce_loss_2": 4.4271773338317875, + "ce_loss_3": 4.2007159948348995, + "ce_loss_7": 3.7458335757255554, + "epoch": 0.053, + "grad_norm": 560.0, + "kl_loss_10": 160.82289505004883, + "kl_loss_2": 1731.140557861328, + "kl_loss_3": 1270.9948364257812, + "kl_loss_7": 331.9795379638672, + "learning_rate": 0.000995352361806875, + "loss": 862.8967, + "step": 530 + }, + { + "ce_loss_10": 3.6911896467208862, + "ce_loss_13": 3.5907997369766234, + "ce_loss_2": 4.458630633354187, + "ce_loss_3": 4.2227191686630245, + "ce_loss_7": 3.7843895673751833, + "epoch": 0.054, + "grad_norm": 552.0, + "kl_loss_10": 166.47220001220703, + "kl_loss_2": 1722.0740966796875, + "kl_loss_3": 1249.600811767578, + "kl_loss_7": 335.7519927978516, + "learning_rate": 0.0009951340343707852, + "loss": 876.934, + "step": 540 + }, + { + "ce_loss_10": 3.7539408445358275, + "ce_loss_13": 3.6503811120986938, + "ce_loss_2": 4.52553424835205, + "ce_loss_3": 4.282987451553344, + "ce_loss_7": 3.839770758152008, + "epoch": 0.055, + "grad_norm": 512.0, + "kl_loss_10": 162.00789489746094, + "kl_loss_2": 1707.214862060547, + "kl_loss_3": 1233.2592041015625, + "kl_loss_7": 323.39465484619143, + "learning_rate": 0.0009949107209404665, + "loss": 863.0879, + "step": 550 + }, + { + "ce_loss_10": 3.6489940643310548, + "ce_loss_13": 3.5539053201675417, + "ce_loss_2": 4.41527898311615, + "ce_loss_3": 4.180786430835724, + "ce_loss_7": 3.750082802772522, + "epoch": 0.056, + "grad_norm": 540.0, + "kl_loss_10": 157.5300537109375, + "kl_loss_2": 1703.8317993164062, + "kl_loss_3": 1234.3482482910156, + "kl_loss_7": 346.96667022705077, + "learning_rate": 0.0009946824237646824, + "loss": 859.4348, + "step": 560 + }, + { + "ce_loss_10": 3.5962815046310426, + "ce_loss_13": 3.501141941547394, + "ce_loss_2": 4.377828812599182, + "ce_loss_3": 4.148308992385864, + "ce_loss_7": 3.7191163897514343, + "epoch": 0.057, + "grad_norm": 764.0, + "kl_loss_10": 153.23575592041016, + "kl_loss_2": 1739.7751159667969, + "kl_loss_3": 1272.3390563964845, + "kl_loss_7": 396.63781890869143, + "learning_rate": 0.0009944491451423828, + "loss": 901.9479, + "step": 570 + }, + { + "ce_loss_10": 3.594892370700836, + "ce_loss_13": 3.500366282463074, + "ce_loss_2": 4.390602493286133, + "ce_loss_3": 4.148435056209564, + "ce_loss_7": 3.710537350177765, + "epoch": 0.058, + "grad_norm": 804.0, + "kl_loss_10": 153.60133438110353, + "kl_loss_2": 1753.4079895019531, + "kl_loss_3": 1272.2657775878906, + "kl_loss_7": 368.931379699707, + "learning_rate": 0.0009942108874226813, + "loss": 870.9764, + "step": 580 + }, + { + "ce_loss_10": 3.7256513595581056, + "ce_loss_13": 3.6301231741905213, + "ce_loss_2": 4.468952918052674, + "ce_loss_3": 4.23498455286026, + "ce_loss_7": 3.8281203866004945, + "epoch": 0.059, + "grad_norm": 494.0, + "kl_loss_10": 155.00701828002929, + "kl_loss_2": 1650.434881591797, + "kl_loss_3": 1188.881414794922, + "kl_loss_7": 349.0711242675781, + "learning_rate": 0.00099396765300483, + "loss": 829.2725, + "step": 590 + }, + { + "ce_loss_10": 3.688658046722412, + "ce_loss_13": 3.600668156147003, + "ce_loss_2": 4.441829895973205, + "ce_loss_3": 4.204685604572296, + "ce_loss_7": 3.7927687644958494, + "epoch": 0.06, + "grad_norm": 700.0, + "kl_loss_10": 147.73722648620605, + "kl_loss_2": 1665.102276611328, + "kl_loss_3": 1201.0594848632813, + "kl_loss_7": 336.5929977416992, + "learning_rate": 0.0009937194443381972, + "loss": 836.3632, + "step": 600 + }, + { + "ce_loss_10": 3.7074933648109436, + "ce_loss_13": 3.6225223660469057, + "ce_loss_2": 4.444479322433471, + "ce_loss_3": 4.212475669384003, + "ce_loss_7": 3.806171452999115, + "epoch": 0.061, + "grad_norm": 490.0, + "kl_loss_10": 145.92314338684082, + "kl_loss_2": 1647.2934875488281, + "kl_loss_3": 1192.1070617675782, + "kl_loss_7": 330.35746612548826, + "learning_rate": 0.0009934662639222412, + "loss": 841.5062, + "step": 610 + }, + { + "ce_loss_10": 3.6668009042739866, + "ce_loss_13": 3.5791383743286134, + "ce_loss_2": 4.436111927032471, + "ce_loss_3": 4.192030191421509, + "ce_loss_7": 3.7709102272987365, + "epoch": 0.062, + "grad_norm": 548.0, + "kl_loss_10": 142.56752128601073, + "kl_loss_2": 1707.9285888671875, + "kl_loss_3": 1224.2419647216798, + "kl_loss_7": 333.47724609375, + "learning_rate": 0.000993208114306486, + "loss": 843.8041, + "step": 620 + }, + { + "ce_loss_10": 3.5789570450782775, + "ce_loss_13": 3.4927441477775574, + "ce_loss_2": 4.355255722999573, + "ce_loss_3": 4.113215839862823, + "ce_loss_7": 3.6797728538513184, + "epoch": 0.063, + "grad_norm": 684.0, + "kl_loss_10": 142.47375717163087, + "kl_loss_2": 1703.1561950683595, + "kl_loss_3": 1224.8568115234375, + "kl_loss_7": 327.08728790283203, + "learning_rate": 0.0009929449980904952, + "loss": 827.3757, + "step": 630 + }, + { + "ce_loss_10": 3.6368979692459105, + "ce_loss_13": 3.552669334411621, + "ce_loss_2": 4.39526858329773, + "ce_loss_3": 4.161888694763183, + "ce_loss_7": 3.7305431842803953, + "epoch": 0.064, + "grad_norm": 604.0, + "kl_loss_10": 145.31115531921387, + "kl_loss_2": 1675.4742797851563, + "kl_loss_3": 1206.004461669922, + "kl_loss_7": 311.66287689208986, + "learning_rate": 0.0009926769179238466, + "loss": 830.4232, + "step": 640 + }, + { + "ce_loss_10": 3.708518397808075, + "ce_loss_13": 3.6032424330711366, + "ce_loss_2": 4.449502897262573, + "ce_loss_3": 4.213514125347137, + "ce_loss_7": 3.7848907709121704, + "epoch": 0.065, + "grad_norm": 572.0, + "kl_loss_10": 183.44921951293946, + "kl_loss_2": 1690.6953979492187, + "kl_loss_3": 1209.2367431640625, + "kl_loss_7": 320.96158905029296, + "learning_rate": 0.000992403876506104, + "loss": 845.6277, + "step": 650 + }, + { + "ce_loss_10": 3.6422240853309633, + "ce_loss_13": 3.5376295328140257, + "ce_loss_2": 4.388966178894043, + "ce_loss_3": 4.148260116577148, + "ce_loss_7": 3.721568763256073, + "epoch": 0.066, + "grad_norm": 516.0, + "kl_loss_10": 166.38197479248046, + "kl_loss_2": 1675.3920837402343, + "kl_loss_3": 1201.2981964111327, + "kl_loss_7": 311.2148132324219, + "learning_rate": 0.0009921258765867918, + "loss": 834.6085, + "step": 660 + }, + { + "ce_loss_10": 3.593488574028015, + "ce_loss_13": 3.5049474120140074, + "ce_loss_2": 4.357883477210999, + "ce_loss_3": 4.115947949886322, + "ce_loss_7": 3.6764505982398985, + "epoch": 0.067, + "grad_norm": 600.0, + "kl_loss_10": 148.19011993408202, + "kl_loss_2": 1717.049383544922, + "kl_loss_3": 1223.6348205566405, + "kl_loss_7": 306.2616958618164, + "learning_rate": 0.0009918429209653662, + "loss": 833.8985, + "step": 670 + }, + { + "ce_loss_10": 3.648161160945892, + "ce_loss_13": 3.559934389591217, + "ce_loss_2": 4.409848690032959, + "ce_loss_3": 4.171260499954224, + "ce_loss_7": 3.7374308466911317, + "epoch": 0.068, + "grad_norm": 596.0, + "kl_loss_10": 147.48591995239258, + "kl_loss_2": 1679.9023315429688, + "kl_loss_3": 1208.9112182617187, + "kl_loss_7": 313.83782348632815, + "learning_rate": 0.0009915550124911866, + "loss": 822.998, + "step": 680 + }, + { + "ce_loss_10": 3.6632981300354004, + "ce_loss_13": 3.573338711261749, + "ce_loss_2": 4.395955181121826, + "ce_loss_3": 4.164679610729218, + "ce_loss_7": 3.7496419668197634, + "epoch": 0.069, + "grad_norm": 636.0, + "kl_loss_10": 148.64911651611328, + "kl_loss_2": 1629.2107971191406, + "kl_loss_3": 1186.3095611572267, + "kl_loss_7": 309.2399566650391, + "learning_rate": 0.0009912621540634887, + "loss": 816.0117, + "step": 690 + }, + { + "ce_loss_10": 3.6952749490737915, + "ce_loss_13": 3.608550024032593, + "ce_loss_2": 4.3951560974121096, + "ce_loss_3": 4.167996168136597, + "ce_loss_7": 3.778964614868164, + "epoch": 0.07, + "grad_norm": 524.0, + "kl_loss_10": 140.3430618286133, + "kl_loss_2": 1575.8819396972656, + "kl_loss_3": 1123.8694213867188, + "kl_loss_7": 294.651708984375, + "learning_rate": 0.0009909643486313534, + "loss": 794.9152, + "step": 700 + }, + { + "ce_loss_10": 3.5606731176376343, + "ce_loss_13": 3.4771942019462587, + "ce_loss_2": 4.3175184488296505, + "ce_loss_3": 4.074072551727295, + "ce_loss_7": 3.650731146335602, + "epoch": 0.071, + "grad_norm": 600.0, + "kl_loss_10": 135.5239990234375, + "kl_loss_2": 1676.284442138672, + "kl_loss_3": 1193.2137634277344, + "kl_loss_7": 307.0430740356445, + "learning_rate": 0.000990661599193678, + "loss": 839.2205, + "step": 710 + }, + { + "ce_loss_10": 3.7052354335784914, + "ce_loss_13": 3.6190937519073487, + "ce_loss_2": 4.42795637845993, + "ce_loss_3": 4.203339767456055, + "ce_loss_7": 3.7865342020988466, + "epoch": 0.072, + "grad_norm": 708.0, + "kl_loss_10": 139.11955757141112, + "kl_loss_2": 1630.3473266601563, + "kl_loss_3": 1169.5729766845702, + "kl_loss_7": 299.42345809936523, + "learning_rate": 0.0009903539087991462, + "loss": 803.8498, + "step": 720 + }, + { + "ce_loss_10": 3.6689595699310305, + "ce_loss_13": 3.586829674243927, + "ce_loss_2": 4.399398994445801, + "ce_loss_3": 4.174283814430237, + "ce_loss_7": 3.7554702758789062, + "epoch": 0.073, + "grad_norm": 860.0, + "kl_loss_10": 133.20760345458984, + "kl_loss_2": 1626.384521484375, + "kl_loss_3": 1158.8964233398438, + "kl_loss_7": 296.6233856201172, + "learning_rate": 0.0009900412805461966, + "loss": 810.3949, + "step": 730 + }, + { + "ce_loss_10": 3.7475465893745423, + "ce_loss_13": 3.6637478709220885, + "ce_loss_2": 4.477526593208313, + "ce_loss_3": 4.232499527931213, + "ce_loss_7": 3.834572732448578, + "epoch": 0.074, + "grad_norm": 756.0, + "kl_loss_10": 136.13002281188966, + "kl_loss_2": 1615.2621215820313, + "kl_loss_3": 1135.3047760009765, + "kl_loss_7": 302.71751708984374, + "learning_rate": 0.0009897237175829927, + "loss": 812.032, + "step": 740 + }, + { + "ce_loss_10": 3.633454430103302, + "ce_loss_13": 3.546045184135437, + "ce_loss_2": 4.386739385128021, + "ce_loss_3": 4.157361710071564, + "ce_loss_7": 3.7257861375808714, + "epoch": 0.075, + "grad_norm": 624.0, + "kl_loss_10": 138.0735656738281, + "kl_loss_2": 1664.8802978515625, + "kl_loss_3": 1209.155780029297, + "kl_loss_7": 314.29449157714845, + "learning_rate": 0.0009894012231073895, + "loss": 820.1248, + "step": 750 + }, + { + "ce_loss_10": 3.675256085395813, + "ce_loss_13": 3.591258680820465, + "ce_loss_2": 4.3781631827354435, + "ce_loss_3": 4.169950652122497, + "ce_loss_7": 3.7596523761749268, + "epoch": 0.076, + "grad_norm": 596.0, + "kl_loss_10": 137.33892288208008, + "kl_loss_2": 1570.5589111328125, + "kl_loss_3": 1161.7032104492187, + "kl_loss_7": 298.7381622314453, + "learning_rate": 0.0009890738003669028, + "loss": 801.2431, + "step": 760 + }, + { + "ce_loss_10": 3.64959534406662, + "ce_loss_13": 3.5664158701896667, + "ce_loss_2": 4.371342432498932, + "ce_loss_3": 4.150311291217804, + "ce_loss_7": 3.7354934453964233, + "epoch": 0.077, + "grad_norm": 540.0, + "kl_loss_10": 136.36218070983887, + "kl_loss_2": 1622.240057373047, + "kl_loss_3": 1172.391793823242, + "kl_loss_7": 304.76952667236327, + "learning_rate": 0.0009887414526586764, + "loss": 787.9819, + "step": 770 + }, + { + "ce_loss_10": 3.708216655254364, + "ce_loss_13": 3.625141477584839, + "ce_loss_2": 4.414664888381958, + "ce_loss_3": 4.183781635761261, + "ce_loss_7": 3.8081562399864195, + "epoch": 0.078, + "grad_norm": 596.0, + "kl_loss_10": 133.56560096740722, + "kl_loss_2": 1562.47041015625, + "kl_loss_3": 1106.445620727539, + "kl_loss_7": 312.0776168823242, + "learning_rate": 0.0009884041833294476, + "loss": 768.2491, + "step": 780 + }, + { + "ce_loss_10": 3.706817853450775, + "ce_loss_13": 3.622973358631134, + "ce_loss_2": 4.41116281747818, + "ce_loss_3": 4.179266679286957, + "ce_loss_7": 3.8186426639556883, + "epoch": 0.079, + "grad_norm": 632.0, + "kl_loss_10": 132.2478443145752, + "kl_loss_2": 1599.446923828125, + "kl_loss_3": 1117.8709930419923, + "kl_loss_7": 368.3747268676758, + "learning_rate": 0.000988061995775515, + "loss": 815.0693, + "step": 790 + }, + { + "ce_loss_10": 3.641828775405884, + "ce_loss_13": 3.5547205209732056, + "ce_loss_2": 4.335572981834412, + "ce_loss_3": 4.108006286621094, + "ce_loss_7": 3.7402275919914247, + "epoch": 0.08, + "grad_norm": 516.0, + "kl_loss_10": 141.807564163208, + "kl_loss_2": 1570.8703674316407, + "kl_loss_3": 1110.252996826172, + "kl_loss_7": 321.9771667480469, + "learning_rate": 0.0009877148934427035, + "loss": 786.1404, + "step": 800 + }, + { + "ce_loss_10": 3.681752073764801, + "ce_loss_13": 3.596014940738678, + "ce_loss_2": 4.380065774917602, + "ce_loss_3": 4.151778030395508, + "ce_loss_7": 3.7655294299125672, + "epoch": 0.081, + "grad_norm": 496.0, + "kl_loss_10": 145.9334274291992, + "kl_loss_2": 1572.8681091308595, + "kl_loss_3": 1116.675845336914, + "kl_loss_7": 297.05968246459963, + "learning_rate": 0.0009873628798263297, + "loss": 776.0455, + "step": 810 + }, + { + "ce_loss_10": 3.6424105167388916, + "ce_loss_13": 3.5447566747665404, + "ce_loss_2": 4.312346494197845, + "ce_loss_3": 4.088560962677002, + "ce_loss_7": 3.7104405045509337, + "epoch": 0.082, + "grad_norm": 478.0, + "kl_loss_10": 152.06344909667968, + "kl_loss_2": 1539.9718017578125, + "kl_loss_3": 1091.6052520751953, + "kl_loss_7": 286.7229400634766, + "learning_rate": 0.0009870059584711668, + "loss": 790.5065, + "step": 820 + }, + { + "ce_loss_10": 3.6575138568878174, + "ce_loss_13": 3.5694735765457155, + "ce_loss_2": 4.352469277381897, + "ce_loss_3": 4.124953854084015, + "ce_loss_7": 3.7358759164810182, + "epoch": 0.083, + "grad_norm": 516.0, + "kl_loss_10": 158.90749130249023, + "kl_loss_2": 1569.4235595703126, + "kl_loss_3": 1125.5340545654296, + "kl_loss_7": 290.7964630126953, + "learning_rate": 0.000986644132971409, + "loss": 786.8994, + "step": 830 + }, + { + "ce_loss_10": 3.6558743476867677, + "ce_loss_13": 3.5544149518013, + "ce_loss_2": 4.354631888866424, + "ce_loss_3": 4.1281127572059635, + "ce_loss_7": 3.727833020687103, + "epoch": 0.084, + "grad_norm": 576.0, + "kl_loss_10": 158.36446990966797, + "kl_loss_2": 1584.950128173828, + "kl_loss_3": 1138.0484100341796, + "kl_loss_7": 300.7915969848633, + "learning_rate": 0.0009862774069706345, + "loss": 786.4536, + "step": 840 + }, + { + "ce_loss_10": 3.7631431221961975, + "ce_loss_13": 3.6783902406692506, + "ce_loss_2": 4.423887753486634, + "ce_loss_3": 4.210748863220215, + "ce_loss_7": 3.8459392905235292, + "epoch": 0.085, + "grad_norm": 720.0, + "kl_loss_10": 144.1476722717285, + "kl_loss_2": 1526.45078125, + "kl_loss_3": 1098.919091796875, + "kl_loss_7": 305.10309143066405, + "learning_rate": 0.000985905784161771, + "loss": 773.6244, + "step": 850 + }, + { + "ce_loss_10": 3.693523097038269, + "ce_loss_13": 3.6117894887924193, + "ce_loss_2": 4.374859690666199, + "ce_loss_3": 4.145905554294586, + "ce_loss_7": 3.799845337867737, + "epoch": 0.086, + "grad_norm": 648.0, + "kl_loss_10": 141.55279006958008, + "kl_loss_2": 1548.1404724121094, + "kl_loss_3": 1092.3538146972655, + "kl_loss_7": 338.8992858886719, + "learning_rate": 0.000985529268287055, + "loss": 780.1624, + "step": 860 + }, + { + "ce_loss_10": 3.6179853677749634, + "ce_loss_13": 3.532400143146515, + "ce_loss_2": 4.3180185675621034, + "ce_loss_3": 4.092539095878601, + "ce_loss_7": 3.716568684577942, + "epoch": 0.087, + "grad_norm": 584.0, + "kl_loss_10": 138.25293006896973, + "kl_loss_2": 1583.606640625, + "kl_loss_3": 1113.2994171142577, + "kl_loss_7": 327.81214904785156, + "learning_rate": 0.0009851478631379982, + "loss": 787.4821, + "step": 870 + }, + { + "ce_loss_10": 3.6815198183059694, + "ce_loss_13": 3.5956546545028685, + "ce_loss_2": 4.367411196231842, + "ce_loss_3": 4.13222428560257, + "ce_loss_7": 3.7695237517356874, + "epoch": 0.088, + "grad_norm": 628.0, + "kl_loss_10": 140.43244590759278, + "kl_loss_2": 1545.1064147949219, + "kl_loss_3": 1094.267755126953, + "kl_loss_7": 312.3658508300781, + "learning_rate": 0.0009847615725553456, + "loss": 767.0908, + "step": 880 + }, + { + "ce_loss_10": 3.739601492881775, + "ce_loss_13": 3.657086157798767, + "ce_loss_2": 4.379729843139648, + "ce_loss_3": 4.177917766571045, + "ce_loss_7": 3.820488429069519, + "epoch": 0.089, + "grad_norm": 552.0, + "kl_loss_10": 134.12742614746094, + "kl_loss_2": 1464.5765686035156, + "kl_loss_3": 1051.556851196289, + "kl_loss_7": 283.62481689453125, + "learning_rate": 0.0009843704004290394, + "loss": 761.853, + "step": 890 + }, + { + "ce_loss_10": 3.6452771425247192, + "ce_loss_13": 3.5613077044487, + "ce_loss_2": 4.318511128425598, + "ce_loss_3": 4.107461535930634, + "ce_loss_7": 3.726675534248352, + "epoch": 0.09, + "grad_norm": 474.0, + "kl_loss_10": 136.06297454833984, + "kl_loss_2": 1542.6724487304687, + "kl_loss_3": 1117.772933959961, + "kl_loss_7": 292.2666213989258, + "learning_rate": 0.0009839743506981783, + "loss": 768.8108, + "step": 900 + }, + { + "ce_loss_10": 3.5574649572372437, + "ce_loss_13": 3.4748517513275146, + "ce_loss_2": 4.266572868824005, + "ce_loss_3": 4.057099211215973, + "ce_loss_7": 3.6422529578208924, + "epoch": 0.091, + "grad_norm": 516.0, + "kl_loss_10": 139.13952560424804, + "kl_loss_2": 1603.9869201660156, + "kl_loss_3": 1170.3635620117188, + "kl_loss_7": 298.2760665893555, + "learning_rate": 0.0009835734273509786, + "loss": 783.7168, + "step": 910 + }, + { + "ce_loss_10": 3.6700770974159242, + "ce_loss_13": 3.5813122153282166, + "ce_loss_2": 4.351845908164978, + "ce_loss_3": 4.139319920539856, + "ce_loss_7": 3.7498608589172364, + "epoch": 0.092, + "grad_norm": 516.0, + "kl_loss_10": 139.36617164611818, + "kl_loss_2": 1526.7721801757812, + "kl_loss_3": 1107.183511352539, + "kl_loss_7": 287.28514404296874, + "learning_rate": 0.0009831676344247342, + "loss": 768.4225, + "step": 920 + }, + { + "ce_loss_10": 3.684238874912262, + "ce_loss_13": 3.6015963315963746, + "ce_loss_2": 4.3427834749221805, + "ce_loss_3": 4.138106441497802, + "ce_loss_7": 3.75754714012146, + "epoch": 0.093, + "grad_norm": 490.0, + "kl_loss_10": 135.07495460510253, + "kl_loss_2": 1516.6379028320312, + "kl_loss_3": 1094.0326538085938, + "kl_loss_7": 277.64155731201174, + "learning_rate": 0.0009827569760057755, + "loss": 762.3584, + "step": 930 + }, + { + "ce_loss_10": 3.5946595072746277, + "ce_loss_13": 3.512081265449524, + "ce_loss_2": 4.322237813472748, + "ce_loss_3": 4.095906281471253, + "ce_loss_7": 3.6798322200775146, + "epoch": 0.094, + "grad_norm": 728.0, + "kl_loss_10": 138.28199310302733, + "kl_loss_2": 1619.1793823242188, + "kl_loss_3": 1165.3315551757812, + "kl_loss_7": 295.293204498291, + "learning_rate": 0.000982341456229428, + "loss": 780.917, + "step": 940 + }, + { + "ce_loss_10": 3.69069162607193, + "ce_loss_13": 3.6100045323371885, + "ce_loss_2": 4.376732325553894, + "ce_loss_3": 4.16404242515564, + "ce_loss_7": 3.7701812386512756, + "epoch": 0.095, + "grad_norm": 688.0, + "kl_loss_10": 131.1420455932617, + "kl_loss_2": 1575.732354736328, + "kl_loss_3": 1138.4372924804688, + "kl_loss_7": 285.67282180786134, + "learning_rate": 0.000981921079279971, + "loss": 765.979, + "step": 950 + }, + { + "ce_loss_10": 3.7074394822120667, + "ce_loss_13": 3.62913464307785, + "ce_loss_2": 4.366938805580139, + "ce_loss_3": 4.150120985507965, + "ce_loss_7": 3.7818633675575257, + "epoch": 0.096, + "grad_norm": 720.0, + "kl_loss_10": 130.51903839111327, + "kl_loss_2": 1507.3517028808594, + "kl_loss_3": 1076.092593383789, + "kl_loss_7": 272.2766448974609, + "learning_rate": 0.0009814958493905962, + "loss": 753.6946, + "step": 960 + }, + { + "ce_loss_10": 3.658416414260864, + "ce_loss_13": 3.576970672607422, + "ce_loss_2": 4.346470355987549, + "ce_loss_3": 4.128688275814056, + "ce_loss_7": 3.7415476202964784, + "epoch": 0.097, + "grad_norm": 512.0, + "kl_loss_10": 128.56299629211426, + "kl_loss_2": 1557.0646423339845, + "kl_loss_3": 1112.28828125, + "kl_loss_7": 279.6500648498535, + "learning_rate": 0.0009810657708433637, + "loss": 775.217, + "step": 970 + }, + { + "ce_loss_10": 3.7308164954185488, + "ce_loss_13": 3.6533005952835085, + "ce_loss_2": 4.3734122037887575, + "ce_loss_3": 4.170846402645111, + "ce_loss_7": 3.8050424695014953, + "epoch": 0.098, + "grad_norm": 716.0, + "kl_loss_10": 124.60902214050293, + "kl_loss_2": 1475.1663879394532, + "kl_loss_3": 1054.8542236328126, + "kl_loss_7": 269.9375114440918, + "learning_rate": 0.0009806308479691594, + "loss": 736.7519, + "step": 980 + }, + { + "ce_loss_10": 3.750465714931488, + "ce_loss_13": 3.668341946601868, + "ce_loss_2": 4.426263308525085, + "ce_loss_3": 4.20258377790451, + "ce_loss_7": 3.836391198635101, + "epoch": 0.099, + "grad_norm": 644.0, + "kl_loss_10": 131.81643409729003, + "kl_loss_2": 1535.673388671875, + "kl_loss_3": 1090.963656616211, + "kl_loss_7": 289.6497604370117, + "learning_rate": 0.0009801910851476522, + "loss": 754.2551, + "step": 990 + }, + { + "ce_loss_10": 3.653952169418335, + "ce_loss_13": 3.577095854282379, + "ce_loss_2": 4.349191665649414, + "ce_loss_3": 4.125085318088532, + "ce_loss_7": 3.7413162350654603, + "epoch": 0.1, + "grad_norm": 478.0, + "kl_loss_10": 128.62464637756347, + "kl_loss_2": 1573.8733642578125, + "kl_loss_3": 1114.0891967773437, + "kl_loss_7": 292.6377975463867, + "learning_rate": 0.0009797464868072487, + "loss": 758.6713, + "step": 1000 + }, + { + "ce_loss_10": 3.6456503033638, + "ce_loss_13": 3.5667870163917543, + "ce_loss_2": 4.3237790822982785, + "ce_loss_3": 4.11080631017685, + "ce_loss_7": 3.7275813579559327, + "epoch": 0.101, + "grad_norm": 432.0, + "kl_loss_10": 128.03596534729004, + "kl_loss_2": 1525.84384765625, + "kl_loss_3": 1094.1039764404297, + "kl_loss_7": 291.2514984130859, + "learning_rate": 0.0009792970574250492, + "loss": 758.2494, + "step": 1010 + }, + { + "ce_loss_10": 3.677238702774048, + "ce_loss_13": 3.597763466835022, + "ce_loss_2": 4.345702481269837, + "ce_loss_3": 4.1323373198509215, + "ce_loss_7": 3.757745099067688, + "epoch": 0.102, + "grad_norm": 480.0, + "kl_loss_10": 126.84439620971679, + "kl_loss_2": 1518.9351745605468, + "kl_loss_3": 1090.8279510498046, + "kl_loss_7": 289.6885223388672, + "learning_rate": 0.0009788428015268028, + "loss": 746.4768, + "step": 1020 + }, + { + "ce_loss_10": 3.670746088027954, + "ce_loss_13": 3.5901795506477354, + "ce_loss_2": 4.326242756843567, + "ce_loss_3": 4.110077440738678, + "ce_loss_7": 3.7697238445281984, + "epoch": 0.103, + "grad_norm": 520.0, + "kl_loss_10": 147.23381576538085, + "kl_loss_2": 1500.6592041015624, + "kl_loss_3": 1064.419287109375, + "kl_loss_7": 309.967301940918, + "learning_rate": 0.0009783837236868609, + "loss": 752.1227, + "step": 1030 + }, + { + "ce_loss_10": 3.665172076225281, + "ce_loss_13": 3.559502327442169, + "ce_loss_2": 4.309127068519592, + "ce_loss_3": 4.0925112009048465, + "ce_loss_7": 3.730722725391388, + "epoch": 0.104, + "grad_norm": 624.0, + "kl_loss_10": 168.995276260376, + "kl_loss_2": 1506.7355590820312, + "kl_loss_3": 1077.4224884033204, + "kl_loss_7": 306.44668807983396, + "learning_rate": 0.0009779198285281327, + "loss": 758.6978, + "step": 1040 + }, + { + "ce_loss_10": 3.6450916528701782, + "ce_loss_13": 3.5567120909690857, + "ce_loss_2": 4.307799768447876, + "ce_loss_3": 4.096536159515381, + "ce_loss_7": 3.7174035549163817, + "epoch": 0.105, + "grad_norm": 464.0, + "kl_loss_10": 145.4011459350586, + "kl_loss_2": 1511.9253051757812, + "kl_loss_3": 1079.955551147461, + "kl_loss_7": 290.32603912353517, + "learning_rate": 0.0009774511207220368, + "loss": 751.4335, + "step": 1050 + }, + { + "ce_loss_10": 3.6726208686828614, + "ce_loss_13": 3.5870786190032957, + "ce_loss_2": 4.340760517120361, + "ce_loss_3": 4.122452509403229, + "ce_loss_7": 3.7611562490463255, + "epoch": 0.106, + "grad_norm": 516.0, + "kl_loss_10": 146.77743186950684, + "kl_loss_2": 1523.6993774414063, + "kl_loss_3": 1080.6748168945312, + "kl_loss_7": 305.8709197998047, + "learning_rate": 0.0009769776049884564, + "loss": 759.1102, + "step": 1060 + }, + { + "ce_loss_10": 3.5789316415786745, + "ce_loss_13": 3.4973001360893248, + "ce_loss_2": 4.2655829906463625, + "ce_loss_3": 4.0485687255859375, + "ce_loss_7": 3.664482927322388, + "epoch": 0.107, + "grad_norm": 512.0, + "kl_loss_10": 138.50279579162597, + "kl_loss_2": 1555.6201232910157, + "kl_loss_3": 1112.0815826416015, + "kl_loss_7": 305.1489685058594, + "learning_rate": 0.0009764992860956889, + "loss": 779.55, + "step": 1070 + }, + { + "ce_loss_10": 3.7416428446769716, + "ce_loss_13": 3.6618621706962586, + "ce_loss_2": 4.364235496520996, + "ce_loss_3": 4.161127758026123, + "ce_loss_7": 3.8312565684318542, + "epoch": 0.108, + "grad_norm": 612.0, + "kl_loss_10": 132.17280654907228, + "kl_loss_2": 1434.9408752441407, + "kl_loss_3": 1021.1740997314453, + "kl_loss_7": 306.5512954711914, + "learning_rate": 0.0009760161688604008, + "loss": 729.6794, + "step": 1080 + }, + { + "ce_loss_10": 3.74602724313736, + "ce_loss_13": 3.6610143184661865, + "ce_loss_2": 4.390218591690063, + "ce_loss_3": 4.1842693328857425, + "ce_loss_7": 3.8411784768104553, + "epoch": 0.109, + "grad_norm": 576.0, + "kl_loss_10": 133.29356536865234, + "kl_loss_2": 1472.9822204589843, + "kl_loss_3": 1051.5467559814454, + "kl_loss_7": 310.2565521240234, + "learning_rate": 0.0009755282581475768, + "loss": 747.7812, + "step": 1090 + }, + { + "ce_loss_10": 3.801929402351379, + "ce_loss_13": 3.715673303604126, + "ce_loss_2": 4.428924131393432, + "ce_loss_3": 4.217723715305328, + "ce_loss_7": 3.886538052558899, + "epoch": 0.11, + "grad_norm": 552.0, + "kl_loss_10": 141.27199668884276, + "kl_loss_2": 1455.4054565429688, + "kl_loss_3": 1032.206768798828, + "kl_loss_7": 311.90191345214845, + "learning_rate": 0.0009750355588704727, + "loss": 730.8727, + "step": 1100 + }, + { + "ce_loss_10": 3.6245179295539858, + "ce_loss_13": 3.5434940338134764, + "ce_loss_2": 4.285040807723999, + "ce_loss_3": 4.064236760139465, + "ce_loss_7": 3.722900152206421, + "epoch": 0.111, + "grad_norm": 536.0, + "kl_loss_10": 128.93951110839845, + "kl_loss_2": 1479.008282470703, + "kl_loss_3": 1042.542987060547, + "kl_loss_7": 309.4350082397461, + "learning_rate": 0.0009745380759905647, + "loss": 755.6506, + "step": 1110 + }, + { + "ce_loss_10": 3.5733557820320128, + "ce_loss_13": 3.497096002101898, + "ce_loss_2": 4.2416357636451725, + "ce_loss_3": 4.02953668832779, + "ce_loss_7": 3.6843501210212706, + "epoch": 0.112, + "grad_norm": 584.0, + "kl_loss_10": 128.8479259490967, + "kl_loss_2": 1501.6354125976563, + "kl_loss_3": 1078.8607055664063, + "kl_loss_7": 309.57103271484374, + "learning_rate": 0.0009740358145174998, + "loss": 782.7103, + "step": 1120 + }, + { + "ce_loss_10": 3.7390747904777526, + "ce_loss_13": 3.654650056362152, + "ce_loss_2": 4.359592080116272, + "ce_loss_3": 4.165994334220886, + "ce_loss_7": 3.8400262117385866, + "epoch": 0.113, + "grad_norm": 434.0, + "kl_loss_10": 134.35130157470704, + "kl_loss_2": 1442.04345703125, + "kl_loss_3": 1051.9576324462892, + "kl_loss_7": 334.60899047851564, + "learning_rate": 0.0009735287795090455, + "loss": 747.7461, + "step": 1130 + }, + { + "ce_loss_10": 3.6206952929496765, + "ce_loss_13": 3.5408340215682985, + "ce_loss_2": 4.278374576568604, + "ce_loss_3": 4.073013770580292, + "ce_loss_7": 3.709249567985535, + "epoch": 0.114, + "grad_norm": 560.0, + "kl_loss_10": 129.40065841674806, + "kl_loss_2": 1489.2802490234376, + "kl_loss_3": 1078.2224548339843, + "kl_loss_7": 308.7394744873047, + "learning_rate": 0.0009730169760710386, + "loss": 743.8783, + "step": 1140 + }, + { + "ce_loss_10": 3.7078137516975405, + "ce_loss_13": 3.6258071303367614, + "ce_loss_2": 4.352467465400696, + "ce_loss_3": 4.14322521686554, + "ce_loss_7": 3.793818712234497, + "epoch": 0.115, + "grad_norm": 532.0, + "kl_loss_10": 132.82089805603027, + "kl_loss_2": 1462.5518798828125, + "kl_loss_3": 1047.3691467285157, + "kl_loss_7": 303.57424392700193, + "learning_rate": 0.0009725004093573342, + "loss": 741.0269, + "step": 1150 + }, + { + "ce_loss_10": 3.641883647441864, + "ce_loss_13": 3.5618484139442446, + "ce_loss_2": 4.298850560188294, + "ce_loss_3": 4.0857291460037235, + "ce_loss_7": 3.732511842250824, + "epoch": 0.116, + "grad_norm": 500.0, + "kl_loss_10": 125.74126281738282, + "kl_loss_2": 1472.2821716308595, + "kl_loss_3": 1051.9892150878907, + "kl_loss_7": 293.47923736572267, + "learning_rate": 0.0009719790845697534, + "loss": 730.1605, + "step": 1160 + }, + { + "ce_loss_10": 3.588691568374634, + "ce_loss_13": 3.514021909236908, + "ce_loss_2": 4.223182845115661, + "ce_loss_3": 4.0243830442428585, + "ce_loss_7": 3.668225383758545, + "epoch": 0.117, + "grad_norm": 544.0, + "kl_loss_10": 118.70261917114257, + "kl_loss_2": 1445.7062133789063, + "kl_loss_3": 1032.7508636474608, + "kl_loss_7": 274.055322265625, + "learning_rate": 0.0009714530069580309, + "loss": 718.2419, + "step": 1170 + }, + { + "ce_loss_10": 3.6957285404205322, + "ce_loss_13": 3.618162250518799, + "ce_loss_2": 4.352480411529541, + "ce_loss_3": 4.145036590099335, + "ce_loss_7": 3.7782084584236144, + "epoch": 0.118, + "grad_norm": 536.0, + "kl_loss_10": 127.76230659484864, + "kl_loss_2": 1480.515966796875, + "kl_loss_3": 1059.5112030029297, + "kl_loss_7": 282.41261138916013, + "learning_rate": 0.0009709221818197624, + "loss": 734.455, + "step": 1180 + }, + { + "ce_loss_10": 3.721509063243866, + "ce_loss_13": 3.6461830377578734, + "ce_loss_2": 4.384591698646545, + "ce_loss_3": 4.175746941566468, + "ce_loss_7": 3.804957926273346, + "epoch": 0.119, + "grad_norm": 454.0, + "kl_loss_10": 121.90502281188965, + "kl_loss_2": 1485.071533203125, + "kl_loss_3": 1060.364013671875, + "kl_loss_7": 273.83970947265624, + "learning_rate": 0.0009703866145003512, + "loss": 735.9141, + "step": 1190 + }, + { + "ce_loss_10": 3.6931097984313963, + "ce_loss_13": 3.618978762626648, + "ce_loss_2": 4.338696074485779, + "ce_loss_3": 4.131911754608154, + "ce_loss_7": 3.771903729438782, + "epoch": 0.12, + "grad_norm": 404.0, + "kl_loss_10": 117.82418823242188, + "kl_loss_2": 1472.2267517089845, + "kl_loss_3": 1051.9043701171875, + "kl_loss_7": 267.1518127441406, + "learning_rate": 0.0009698463103929542, + "loss": 740.661, + "step": 1200 + }, + { + "ce_loss_10": 3.658799970149994, + "ce_loss_13": 3.5842487812042236, + "ce_loss_2": 4.312227940559387, + "ce_loss_3": 4.10740053653717, + "ce_loss_7": 3.7385629415512085, + "epoch": 0.121, + "grad_norm": 412.0, + "kl_loss_10": 122.52205390930176, + "kl_loss_2": 1466.4009826660156, + "kl_loss_3": 1056.7593078613281, + "kl_loss_7": 272.06723709106444, + "learning_rate": 0.0009693012749384279, + "loss": 737.0117, + "step": 1210 + }, + { + "ce_loss_10": 3.679182291030884, + "ce_loss_13": 3.6015621542930605, + "ce_loss_2": 4.328339552879333, + "ce_loss_3": 4.114730060100555, + "ce_loss_7": 3.7581299543380737, + "epoch": 0.122, + "grad_norm": 500.0, + "kl_loss_10": 124.13164978027343, + "kl_loss_2": 1486.9410522460937, + "kl_loss_3": 1053.6353607177734, + "kl_loss_7": 279.761865234375, + "learning_rate": 0.0009687515136252732, + "loss": 728.5778, + "step": 1220 + }, + { + "ce_loss_10": 3.6272791743278505, + "ce_loss_13": 3.55220046043396, + "ce_loss_2": 4.301675605773926, + "ce_loss_3": 4.08597983121872, + "ce_loss_7": 3.70781672000885, + "epoch": 0.123, + "grad_norm": 568.0, + "kl_loss_10": 121.01997566223145, + "kl_loss_2": 1522.73505859375, + "kl_loss_3": 1086.6077362060546, + "kl_loss_7": 279.3069206237793, + "learning_rate": 0.0009681970319895803, + "loss": 759.7192, + "step": 1230 + }, + { + "ce_loss_10": 3.71327965259552, + "ce_loss_13": 3.6385520815849306, + "ce_loss_2": 4.354242825508118, + "ce_loss_3": 4.150368654727936, + "ce_loss_7": 3.7916497707366945, + "epoch": 0.124, + "grad_norm": 414.0, + "kl_loss_10": 124.44540634155274, + "kl_loss_2": 1443.4426513671874, + "kl_loss_3": 1030.0522674560548, + "kl_loss_7": 268.6724395751953, + "learning_rate": 0.0009676378356149733, + "loss": 722.6414, + "step": 1240 + }, + { + "ce_loss_10": 3.6944294214248656, + "ce_loss_13": 3.6130942940711974, + "ce_loss_2": 4.31483553647995, + "ce_loss_3": 4.113047087192536, + "ce_loss_7": 3.7628474831581116, + "epoch": 0.125, + "grad_norm": 572.0, + "kl_loss_10": 133.05949897766112, + "kl_loss_2": 1434.6165405273437, + "kl_loss_3": 1024.405093383789, + "kl_loss_7": 265.9820045471191, + "learning_rate": 0.0009670739301325534, + "loss": 721.2149, + "step": 1250 + }, + { + "ce_loss_10": 3.6491236448287965, + "ce_loss_13": 3.5683398127555845, + "ce_loss_2": 4.294895899295807, + "ce_loss_3": 4.082043838500977, + "ce_loss_7": 3.721854901313782, + "epoch": 0.126, + "grad_norm": 506.0, + "kl_loss_10": 130.2590259552002, + "kl_loss_2": 1460.9762817382812, + "kl_loss_3": 1047.9487213134767, + "kl_loss_7": 271.88867340087893, + "learning_rate": 0.0009665053212208426, + "loss": 732.2017, + "step": 1260 + }, + { + "ce_loss_10": 3.6933886647224425, + "ce_loss_13": 3.6137840390205382, + "ce_loss_2": 4.336398506164551, + "ce_loss_3": 4.125988566875458, + "ce_loss_7": 3.7641473054885863, + "epoch": 0.127, + "grad_norm": 470.0, + "kl_loss_10": 131.11599006652833, + "kl_loss_2": 1466.260760498047, + "kl_loss_3": 1047.477099609375, + "kl_loss_7": 271.31814041137693, + "learning_rate": 0.0009659320146057262, + "loss": 729.9061, + "step": 1270 + }, + { + "ce_loss_10": 3.6932409524917604, + "ce_loss_13": 3.6162060022354128, + "ce_loss_2": 4.326151037216187, + "ce_loss_3": 4.118873739242554, + "ce_loss_7": 3.7665857672691345, + "epoch": 0.128, + "grad_norm": 488.0, + "kl_loss_10": 126.3920768737793, + "kl_loss_2": 1439.7459106445312, + "kl_loss_3": 1023.4634368896484, + "kl_loss_7": 263.45100021362305, + "learning_rate": 0.0009653540160603955, + "loss": 714.3654, + "step": 1280 + }, + { + "ce_loss_10": 3.695625138282776, + "ce_loss_13": 3.619100844860077, + "ce_loss_2": 4.322635555267334, + "ce_loss_3": 4.121702527999878, + "ce_loss_7": 3.764703559875488, + "epoch": 0.129, + "grad_norm": 516.0, + "kl_loss_10": 125.06153717041016, + "kl_loss_2": 1449.9097961425782, + "kl_loss_3": 1036.939111328125, + "kl_loss_7": 261.42085418701174, + "learning_rate": 0.0009647713314052896, + "loss": 709.775, + "step": 1290 + }, + { + "ce_loss_10": 3.645713412761688, + "ce_loss_13": 3.5693684458732604, + "ce_loss_2": 4.318705654144287, + "ce_loss_3": 4.105780220031738, + "ce_loss_7": 3.721473240852356, + "epoch": 0.13, + "grad_norm": 504.0, + "kl_loss_10": 125.77756729125977, + "kl_loss_2": 1515.6605834960938, + "kl_loss_3": 1082.791098022461, + "kl_loss_7": 268.05779418945315, + "learning_rate": 0.0009641839665080363, + "loss": 739.9956, + "step": 1300 + }, + { + "ce_loss_10": 3.6060986638069155, + "ce_loss_13": 3.532057249546051, + "ce_loss_2": 4.258748412132263, + "ce_loss_3": 4.044409060478211, + "ce_loss_7": 3.6810790419578554, + "epoch": 0.131, + "grad_norm": 576.0, + "kl_loss_10": 120.33845672607421, + "kl_loss_2": 1464.5880249023437, + "kl_loss_3": 1035.7476196289062, + "kl_loss_7": 258.05460357666016, + "learning_rate": 0.0009635919272833937, + "loss": 712.5358, + "step": 1310 + }, + { + "ce_loss_10": 3.6437799096107484, + "ce_loss_13": 3.567954385280609, + "ce_loss_2": 4.29961267709732, + "ce_loss_3": 4.091295349597931, + "ce_loss_7": 3.7204660773277283, + "epoch": 0.132, + "grad_norm": 520.0, + "kl_loss_10": 123.54525375366211, + "kl_loss_2": 1460.5380920410157, + "kl_loss_3": 1039.971875, + "kl_loss_7": 264.77962188720704, + "learning_rate": 0.0009629952196931902, + "loss": 712.4777, + "step": 1320 + }, + { + "ce_loss_10": 3.63455046415329, + "ce_loss_13": 3.557650101184845, + "ce_loss_2": 4.270671212673188, + "ce_loss_3": 4.062439024448395, + "ce_loss_7": 3.702920150756836, + "epoch": 0.133, + "grad_norm": 434.0, + "kl_loss_10": 123.04212112426758, + "kl_loss_2": 1444.6365600585937, + "kl_loss_3": 1028.4689849853517, + "kl_loss_7": 258.92202911376955, + "learning_rate": 0.0009623938497462645, + "loss": 713.1292, + "step": 1330 + }, + { + "ce_loss_10": 3.6247922778129578, + "ce_loss_13": 3.5494427919387816, + "ce_loss_2": 4.2690158009529116, + "ce_loss_3": 4.058642566204071, + "ce_loss_7": 3.6972993493080137, + "epoch": 0.134, + "grad_norm": 478.0, + "kl_loss_10": 120.75489349365235, + "kl_loss_2": 1456.6053527832032, + "kl_loss_3": 1037.5501190185546, + "kl_loss_7": 266.06713485717773, + "learning_rate": 0.0009617878234984055, + "loss": 726.2297, + "step": 1340 + }, + { + "ce_loss_10": 3.717451739311218, + "ce_loss_13": 3.642001247406006, + "ce_loss_2": 4.3319720983505245, + "ce_loss_3": 4.123037731647491, + "ce_loss_7": 3.790008616447449, + "epoch": 0.135, + "grad_norm": 548.0, + "kl_loss_10": 120.84955863952636, + "kl_loss_2": 1400.965509033203, + "kl_loss_3": 989.1194274902343, + "kl_loss_7": 260.0563507080078, + "learning_rate": 0.0009611771470522907, + "loss": 704.2836, + "step": 1350 + }, + { + "ce_loss_10": 3.6397268891334535, + "ce_loss_13": 3.565677487850189, + "ce_loss_2": 4.285844933986664, + "ce_loss_3": 4.075031089782715, + "ce_loss_7": 3.7177743196487425, + "epoch": 0.136, + "grad_norm": 548.0, + "kl_loss_10": 119.19244728088378, + "kl_loss_2": 1430.9075134277343, + "kl_loss_3": 1014.8560333251953, + "kl_loss_7": 264.9954383850098, + "learning_rate": 0.0009605618265574251, + "loss": 706.0607, + "step": 1360 + }, + { + "ce_loss_10": 3.6019657135009764, + "ce_loss_13": 3.5283800959587097, + "ce_loss_2": 4.247595989704132, + "ce_loss_3": 4.0493292808532715, + "ce_loss_7": 3.682861661911011, + "epoch": 0.137, + "grad_norm": 544.0, + "kl_loss_10": 120.26506729125977, + "kl_loss_2": 1482.0683837890624, + "kl_loss_3": 1078.6839630126954, + "kl_loss_7": 272.45921630859374, + "learning_rate": 0.0009599418682100792, + "loss": 727.2132, + "step": 1370 + }, + { + "ce_loss_10": 3.645335590839386, + "ce_loss_13": 3.570277786254883, + "ce_loss_2": 4.2866430401802065, + "ce_loss_3": 4.071622550487518, + "ce_loss_7": 3.717624640464783, + "epoch": 0.138, + "grad_norm": 612.0, + "kl_loss_10": 119.49271163940429, + "kl_loss_2": 1442.7004455566407, + "kl_loss_3": 1025.406283569336, + "kl_loss_7": 261.58585968017576, + "learning_rate": 0.0009593172782532268, + "loss": 717.2026, + "step": 1380 + }, + { + "ce_loss_10": 3.6908539175987243, + "ce_loss_13": 3.617784011363983, + "ce_loss_2": 4.313388335704803, + "ce_loss_3": 4.114008998870849, + "ce_loss_7": 3.7639716506004333, + "epoch": 0.139, + "grad_norm": 476.0, + "kl_loss_10": 120.62876358032227, + "kl_loss_2": 1423.3504333496094, + "kl_loss_3": 1012.8270812988281, + "kl_loss_7": 261.68493881225584, + "learning_rate": 0.0009586880629764817, + "loss": 706.5565, + "step": 1390 + }, + { + "ce_loss_10": 3.61561758518219, + "ce_loss_13": 3.5403517365455626, + "ce_loss_2": 4.258929216861725, + "ce_loss_3": 4.060744059085846, + "ce_loss_7": 3.687643599510193, + "epoch": 0.14, + "grad_norm": 792.0, + "kl_loss_10": 120.44653511047363, + "kl_loss_2": 1437.7734375, + "kl_loss_3": 1068.6304412841796, + "kl_loss_7": 272.7257308959961, + "learning_rate": 0.0009580542287160348, + "loss": 716.5157, + "step": 1400 + }, + { + "ce_loss_10": 3.579881501197815, + "ce_loss_13": 3.505436861515045, + "ce_loss_2": 4.2163320779800415, + "ce_loss_3": 4.016775751113892, + "ce_loss_7": 3.6602004766464233, + "epoch": 0.141, + "grad_norm": 740.0, + "kl_loss_10": 119.12874908447266, + "kl_loss_2": 1437.7877868652345, + "kl_loss_3": 1029.927035522461, + "kl_loss_7": 274.3121276855469, + "learning_rate": 0.0009574157818545901, + "loss": 704.4754, + "step": 1410 + }, + { + "ce_loss_10": 3.654617667198181, + "ce_loss_13": 3.581939327716827, + "ce_loss_2": 4.266410648822784, + "ce_loss_3": 4.076312291622162, + "ce_loss_7": 3.7350067377090452, + "epoch": 0.142, + "grad_norm": 788.0, + "kl_loss_10": 117.25881233215333, + "kl_loss_2": 1402.042706298828, + "kl_loss_3": 1008.1144165039062, + "kl_loss_7": 268.0317886352539, + "learning_rate": 0.0009567727288213005, + "loss": 712.509, + "step": 1420 + }, + { + "ce_loss_10": 3.62344468832016, + "ce_loss_13": 3.552217972278595, + "ce_loss_2": 4.242154741287232, + "ce_loss_3": 4.047549939155578, + "ce_loss_7": 3.6985828638076783, + "epoch": 0.143, + "grad_norm": 466.0, + "kl_loss_10": 115.47184524536132, + "kl_loss_2": 1418.5062316894532, + "kl_loss_3": 1018.4080627441406, + "kl_loss_7": 270.1881278991699, + "learning_rate": 0.0009561250760917027, + "loss": 702.7143, + "step": 1430 + }, + { + "ce_loss_10": 3.6490369558334352, + "ce_loss_13": 3.5760287642478943, + "ce_loss_2": 4.275133848190308, + "ce_loss_3": 4.07141832113266, + "ce_loss_7": 3.7269250392913817, + "epoch": 0.144, + "grad_norm": 524.0, + "kl_loss_10": 119.90954666137695, + "kl_loss_2": 1441.103936767578, + "kl_loss_3": 1028.5968627929688, + "kl_loss_7": 267.66681442260744, + "learning_rate": 0.0009554728301876525, + "loss": 698.2885, + "step": 1440 + }, + { + "ce_loss_10": 3.7067930936813354, + "ce_loss_13": 3.629922258853912, + "ce_loss_2": 4.314408445358277, + "ce_loss_3": 4.132321739196778, + "ce_loss_7": 3.7809135794639586, + "epoch": 0.145, + "grad_norm": 632.0, + "kl_loss_10": 122.64454803466796, + "kl_loss_2": 1398.767156982422, + "kl_loss_3": 1023.6964874267578, + "kl_loss_7": 262.7124481201172, + "learning_rate": 0.0009548159976772592, + "loss": 721.9051, + "step": 1450 + }, + { + "ce_loss_10": 3.641107952594757, + "ce_loss_13": 3.5679691076278686, + "ce_loss_2": 4.273185658454895, + "ce_loss_3": 4.074773287773132, + "ce_loss_7": 3.715712809562683, + "epoch": 0.146, + "grad_norm": 472.0, + "kl_loss_10": 119.45023498535156, + "kl_loss_2": 1426.5163330078126, + "kl_loss_3": 1022.3043518066406, + "kl_loss_7": 264.85511245727537, + "learning_rate": 0.0009541545851748186, + "loss": 702.8599, + "step": 1460 + }, + { + "ce_loss_10": 3.5100984811782836, + "ce_loss_13": 3.436799693107605, + "ce_loss_2": 4.161883985996246, + "ce_loss_3": 3.955858516693115, + "ce_loss_7": 3.594360911846161, + "epoch": 0.147, + "grad_norm": 556.0, + "kl_loss_10": 116.41493873596191, + "kl_loss_2": 1467.1879943847657, + "kl_loss_3": 1037.354574584961, + "kl_loss_7": 266.5866645812988, + "learning_rate": 0.0009534885993407473, + "loss": 713.4948, + "step": 1470 + }, + { + "ce_loss_10": 3.6824231266975405, + "ce_loss_13": 3.608121466636658, + "ce_loss_2": 4.328725492954254, + "ce_loss_3": 4.115788686275482, + "ce_loss_7": 3.755172336101532, + "epoch": 0.148, + "grad_norm": 560.0, + "kl_loss_10": 118.46903686523437, + "kl_loss_2": 1448.8276794433593, + "kl_loss_3": 1029.914727783203, + "kl_loss_7": 263.37538986206056, + "learning_rate": 0.0009528180468815154, + "loss": 714.8544, + "step": 1480 + }, + { + "ce_loss_10": 3.7179338216781614, + "ce_loss_13": 3.6484482169151304, + "ce_loss_2": 4.323283433914185, + "ce_loss_3": 4.127403116226196, + "ce_loss_7": 3.7899964809417725, + "epoch": 0.149, + "grad_norm": 480.0, + "kl_loss_10": 114.30357208251954, + "kl_loss_2": 1395.1859313964844, + "kl_loss_3": 989.8287170410156, + "kl_loss_7": 257.14686279296876, + "learning_rate": 0.0009521429345495787, + "loss": 690.7114, + "step": 1490 + }, + { + "ce_loss_10": 3.7034213185310363, + "ce_loss_13": 3.6311787962913513, + "ce_loss_2": 4.309155285358429, + "ce_loss_3": 4.092868828773499, + "ce_loss_7": 3.7654823780059816, + "epoch": 0.15, + "grad_norm": 448.0, + "kl_loss_10": 116.55960197448731, + "kl_loss_2": 1382.6544982910157, + "kl_loss_3": 969.2838195800781, + "kl_loss_7": 249.21936950683593, + "learning_rate": 0.0009514632691433108, + "loss": 688.2995, + "step": 1500 + }, + { + "ce_loss_10": 3.6700626373291017, + "ce_loss_13": 3.5945797085762026, + "ce_loss_2": 4.289841759204864, + "ce_loss_3": 4.08635276556015, + "ce_loss_7": 3.738140869140625, + "epoch": 0.151, + "grad_norm": 448.0, + "kl_loss_10": 129.3878589630127, + "kl_loss_2": 1424.29853515625, + "kl_loss_3": 1001.7422454833984, + "kl_loss_7": 254.78705520629882, + "learning_rate": 0.0009507790575069346, + "loss": 706.6927, + "step": 1510 + }, + { + "ce_loss_10": 3.65303395986557, + "ce_loss_13": 3.5714800715446473, + "ce_loss_2": 4.28388956785202, + "ce_loss_3": 4.07215541601181, + "ce_loss_7": 3.7174383282661436, + "epoch": 0.152, + "grad_norm": 560.0, + "kl_loss_10": 131.33350143432617, + "kl_loss_2": 1434.660906982422, + "kl_loss_3": 1017.4268585205078, + "kl_loss_7": 260.6188400268555, + "learning_rate": 0.0009500903065304539, + "loss": 715.3042, + "step": 1520 + }, + { + "ce_loss_10": 3.683453822135925, + "ce_loss_13": 3.60856169462204, + "ce_loss_2": 4.287684428691864, + "ce_loss_3": 4.0820488929748535, + "ce_loss_7": 3.7489410638809204, + "epoch": 0.153, + "grad_norm": 592.0, + "kl_loss_10": 120.57846107482911, + "kl_loss_2": 1384.860614013672, + "kl_loss_3": 975.0672027587891, + "kl_loss_7": 247.4440773010254, + "learning_rate": 0.0009493970231495835, + "loss": 691.6448, + "step": 1530 + }, + { + "ce_loss_10": 3.6223431706428526, + "ce_loss_13": 3.55173202753067, + "ce_loss_2": 4.230155563354492, + "ce_loss_3": 4.02554075717926, + "ce_loss_7": 3.6863773345947264, + "epoch": 0.154, + "grad_norm": 490.0, + "kl_loss_10": 119.43844871520996, + "kl_loss_2": 1397.457257080078, + "kl_loss_3": 991.8431060791015, + "kl_loss_7": 243.42232818603514, + "learning_rate": 0.0009486992143456792, + "loss": 686.1227, + "step": 1540 + }, + { + "ce_loss_10": 3.6514541625976564, + "ce_loss_13": 3.571796643733978, + "ce_loss_2": 4.304909610748291, + "ce_loss_3": 4.0922522187232975, + "ce_loss_7": 3.7216905117034913, + "epoch": 0.155, + "grad_norm": 396.0, + "kl_loss_10": 128.10715980529784, + "kl_loss_2": 1491.158837890625, + "kl_loss_3": 1056.2210266113282, + "kl_loss_7": 262.1390213012695, + "learning_rate": 0.0009479968871456679, + "loss": 716.6379, + "step": 1550 + }, + { + "ce_loss_10": 3.6170923829078676, + "ce_loss_13": 3.542399287223816, + "ce_loss_2": 4.252330017089844, + "ce_loss_3": 4.045702540874482, + "ce_loss_7": 3.685628616809845, + "epoch": 0.156, + "grad_norm": 454.0, + "kl_loss_10": 121.63033790588379, + "kl_loss_2": 1463.288525390625, + "kl_loss_3": 1026.1065032958984, + "kl_loss_7": 254.98253784179687, + "learning_rate": 0.0009472900486219768, + "loss": 702.4742, + "step": 1560 + }, + { + "ce_loss_10": 3.6025954604148867, + "ce_loss_13": 3.5303670883178713, + "ce_loss_2": 4.232356917858124, + "ce_loss_3": 4.022554993629456, + "ce_loss_7": 3.6709616661071776, + "epoch": 0.157, + "grad_norm": 520.0, + "kl_loss_10": 118.88864822387696, + "kl_loss_2": 1434.4180419921875, + "kl_loss_3": 1021.8371948242187, + "kl_loss_7": 253.59476776123046, + "learning_rate": 0.000946578705892462, + "loss": 706.9224, + "step": 1570 + }, + { + "ce_loss_10": 3.6455034971237184, + "ce_loss_13": 3.5725855112075804, + "ce_loss_2": 4.251002633571625, + "ce_loss_3": 4.075614416599274, + "ce_loss_7": 3.712466835975647, + "epoch": 0.158, + "grad_norm": 520.0, + "kl_loss_10": 115.86212844848633, + "kl_loss_2": 1388.911444091797, + "kl_loss_3": 1008.0096618652344, + "kl_loss_7": 249.38579559326172, + "learning_rate": 0.0009458628661203367, + "loss": 702.1684, + "step": 1580 + }, + { + "ce_loss_10": 3.6394161105155947, + "ce_loss_13": 3.571541059017181, + "ce_loss_2": 4.284844183921814, + "ce_loss_3": 4.076776087284088, + "ce_loss_7": 3.7110044956207275, + "epoch": 0.159, + "grad_norm": 494.0, + "kl_loss_10": 113.66581001281739, + "kl_loss_2": 1444.869854736328, + "kl_loss_3": 1032.262744140625, + "kl_loss_7": 253.4554000854492, + "learning_rate": 0.0009451425365140996, + "loss": 688.5467, + "step": 1590 + }, + { + "ce_loss_10": 3.7211164236068726, + "ce_loss_13": 3.649132215976715, + "ce_loss_2": 4.325118780136108, + "ce_loss_3": 4.128993570804596, + "ce_loss_7": 3.7914722681045534, + "epoch": 0.16, + "grad_norm": 456.0, + "kl_loss_10": 117.80431632995605, + "kl_loss_2": 1373.1717468261718, + "kl_loss_3": 981.5717620849609, + "kl_loss_7": 253.6373489379883, + "learning_rate": 0.0009444177243274617, + "loss": 681.3762, + "step": 1600 + }, + { + "ce_loss_10": 3.574730896949768, + "ce_loss_13": 3.498664665222168, + "ce_loss_2": 4.200723135471344, + "ce_loss_3": 4.009881269931793, + "ce_loss_7": 3.6463570594787598, + "epoch": 0.161, + "grad_norm": 480.0, + "kl_loss_10": 122.87367897033691, + "kl_loss_2": 1430.2820068359374, + "kl_loss_3": 1037.6371978759767, + "kl_loss_7": 260.55384521484376, + "learning_rate": 0.0009436884368592739, + "loss": 706.6845, + "step": 1610 + }, + { + "ce_loss_10": 3.6286559462547303, + "ce_loss_13": 3.555983376502991, + "ce_loss_2": 4.232067906856537, + "ce_loss_3": 4.041269278526306, + "ce_loss_7": 3.6985298871994017, + "epoch": 0.162, + "grad_norm": 498.0, + "kl_loss_10": 118.67424545288085, + "kl_loss_2": 1385.5154724121094, + "kl_loss_3": 999.56015625, + "kl_loss_7": 250.75928268432617, + "learning_rate": 0.0009429546814534529, + "loss": 699.0302, + "step": 1620 + }, + { + "ce_loss_10": 3.639633226394653, + "ce_loss_13": 3.5706356167793274, + "ce_loss_2": 4.241236877441406, + "ce_loss_3": 4.056409633159637, + "ce_loss_7": 3.708655667304993, + "epoch": 0.163, + "grad_norm": 384.0, + "kl_loss_10": 117.12662200927734, + "kl_loss_2": 1374.89609375, + "kl_loss_3": 989.9520751953125, + "kl_loss_7": 248.8686378479004, + "learning_rate": 0.0009422164654989072, + "loss": 676.7936, + "step": 1630 + }, + { + "ce_loss_10": 3.7635043978691103, + "ce_loss_13": 3.687360870838165, + "ce_loss_2": 4.338383412361145, + "ce_loss_3": 4.1611551403999325, + "ce_loss_7": 3.828977358341217, + "epoch": 0.164, + "grad_norm": 424.0, + "kl_loss_10": 119.46500015258789, + "kl_loss_2": 1362.1967407226562, + "kl_loss_3": 990.5942932128906, + "kl_loss_7": 248.94699325561524, + "learning_rate": 0.0009414737964294635, + "loss": 685.8197, + "step": 1640 + }, + { + "ce_loss_10": 3.678090500831604, + "ce_loss_13": 3.6101470470428465, + "ce_loss_2": 4.259114742279053, + "ce_loss_3": 4.078064382076263, + "ce_loss_7": 3.7428590416908265, + "epoch": 0.165, + "grad_norm": 444.0, + "kl_loss_10": 112.9244888305664, + "kl_loss_2": 1333.3570129394532, + "kl_loss_3": 969.1624145507812, + "kl_loss_7": 238.38165054321288, + "learning_rate": 0.000940726681723791, + "loss": 682.7061, + "step": 1650 + }, + { + "ce_loss_10": 3.512197470664978, + "ce_loss_13": 3.4408557653427123, + "ce_loss_2": 4.148977339267731, + "ce_loss_3": 3.9514773368835447, + "ce_loss_7": 3.5815786600112913, + "epoch": 0.166, + "grad_norm": 488.0, + "kl_loss_10": 117.70020294189453, + "kl_loss_2": 1442.5229797363281, + "kl_loss_3": 1035.8803924560548, + "kl_loss_7": 256.4058250427246, + "learning_rate": 0.0009399751289053266, + "loss": 690.3188, + "step": 1660 + }, + { + "ce_loss_10": 3.742681550979614, + "ce_loss_13": 3.671311604976654, + "ce_loss_2": 4.328805279731751, + "ce_loss_3": 4.135993778705597, + "ce_loss_7": 3.809998023509979, + "epoch": 0.167, + "grad_norm": 478.0, + "kl_loss_10": 116.78232650756836, + "kl_loss_2": 1366.6806213378907, + "kl_loss_3": 967.4927185058593, + "kl_loss_7": 249.3471366882324, + "learning_rate": 0.0009392191455421988, + "loss": 682.1736, + "step": 1670 + }, + { + "ce_loss_10": 3.7067878365516664, + "ce_loss_13": 3.6276530623435974, + "ce_loss_2": 4.298012292385101, + "ce_loss_3": 4.104024171829224, + "ce_loss_7": 3.7697168350219727, + "epoch": 0.168, + "grad_norm": 490.0, + "kl_loss_10": 123.8147029876709, + "kl_loss_2": 1386.5616271972656, + "kl_loss_3": 990.9451782226563, + "kl_loss_7": 260.7920967102051, + "learning_rate": 0.0009384587392471515, + "loss": 679.4555, + "step": 1680 + }, + { + "ce_loss_10": 3.7010039329528808, + "ce_loss_13": 3.629232919216156, + "ce_loss_2": 4.288461661338806, + "ce_loss_3": 4.104441356658936, + "ce_loss_7": 3.773091959953308, + "epoch": 0.169, + "grad_norm": 494.0, + "kl_loss_10": 117.7159465789795, + "kl_loss_2": 1349.4280029296874, + "kl_loss_3": 968.4034729003906, + "kl_loss_7": 251.4811584472656, + "learning_rate": 0.0009376939176774678, + "loss": 675.85, + "step": 1690 + }, + { + "ce_loss_10": 3.678883969783783, + "ce_loss_13": 3.602108871936798, + "ce_loss_2": 4.274775016307831, + "ce_loss_3": 4.074398016929626, + "ce_loss_7": 3.7432108521461487, + "epoch": 0.17, + "grad_norm": 540.0, + "kl_loss_10": 124.26388397216797, + "kl_loss_2": 1371.6622314453125, + "kl_loss_3": 974.3467742919922, + "kl_loss_7": 252.39389877319337, + "learning_rate": 0.0009369246885348925, + "loss": 687.5515, + "step": 1700 + }, + { + "ce_loss_10": 3.6718587994575502, + "ce_loss_13": 3.591440510749817, + "ce_loss_2": 4.303124558925629, + "ce_loss_3": 4.095115387439728, + "ce_loss_7": 3.7370152711868285, + "epoch": 0.171, + "grad_norm": 548.0, + "kl_loss_10": 130.6899742126465, + "kl_loss_2": 1433.3515563964843, + "kl_loss_3": 1016.4502960205078, + "kl_loss_7": 255.15539627075196, + "learning_rate": 0.0009361510595655545, + "loss": 695.4526, + "step": 1710 + }, + { + "ce_loss_10": 3.6283922672271727, + "ce_loss_13": 3.5495692014694216, + "ce_loss_2": 4.237072479724884, + "ce_loss_3": 4.041323733329773, + "ce_loss_7": 3.6956202745437623, + "epoch": 0.172, + "grad_norm": 466.0, + "kl_loss_10": 127.2772174835205, + "kl_loss_2": 1409.4606872558593, + "kl_loss_3": 1009.6889221191407, + "kl_loss_7": 256.7372299194336, + "learning_rate": 0.0009353730385598887, + "loss": 691.3762, + "step": 1720 + }, + { + "ce_loss_10": 3.54926735162735, + "ce_loss_13": 3.4769670009613036, + "ce_loss_2": 4.178456115722656, + "ce_loss_3": 3.9720041275024416, + "ce_loss_7": 3.6176819682121275, + "epoch": 0.173, + "grad_norm": 436.0, + "kl_loss_10": 118.23514060974121, + "kl_loss_2": 1418.4845947265626, + "kl_loss_3": 998.5092987060547, + "kl_loss_7": 249.0585678100586, + "learning_rate": 0.0009345906333525581, + "loss": 697.0381, + "step": 1730 + }, + { + "ce_loss_10": 3.5872240900993346, + "ce_loss_13": 3.5136430621147157, + "ce_loss_2": 4.2012934923172, + "ce_loss_3": 3.9967063546180723, + "ce_loss_7": 3.65671169757843, + "epoch": 0.174, + "grad_norm": 408.0, + "kl_loss_10": 122.21610031127929, + "kl_loss_2": 1418.8342651367188, + "kl_loss_3": 1007.5152191162109, + "kl_loss_7": 254.60486221313477, + "learning_rate": 0.0009338038518223745, + "loss": 687.4246, + "step": 1740 + }, + { + "ce_loss_10": 3.657099163532257, + "ce_loss_13": 3.5811222553253175, + "ce_loss_2": 4.272738003730774, + "ce_loss_3": 4.0657650351524355, + "ce_loss_7": 3.7293712973594664, + "epoch": 0.175, + "grad_norm": 424.0, + "kl_loss_10": 122.57909774780273, + "kl_loss_2": 1418.8521423339844, + "kl_loss_3": 1004.8310028076172, + "kl_loss_7": 258.8813926696777, + "learning_rate": 0.0009330127018922195, + "loss": 709.7155, + "step": 1750 + }, + { + "ce_loss_10": 3.60728679895401, + "ce_loss_13": 3.5332212805747987, + "ce_loss_2": 4.2153865694999695, + "ce_loss_3": 4.016399335861206, + "ce_loss_7": 3.6759958028793336, + "epoch": 0.176, + "grad_norm": 446.0, + "kl_loss_10": 117.00486183166504, + "kl_loss_2": 1406.6310485839845, + "kl_loss_3": 989.8223937988281, + "kl_loss_7": 252.39801330566405, + "learning_rate": 0.0009322171915289634, + "loss": 689.0163, + "step": 1760 + }, + { + "ce_loss_10": 3.640791046619415, + "ce_loss_13": 3.5716994404792786, + "ce_loss_2": 4.240784847736359, + "ce_loss_3": 4.040842926502227, + "ce_loss_7": 3.7066658616065977, + "epoch": 0.177, + "grad_norm": 504.0, + "kl_loss_10": 114.7268009185791, + "kl_loss_2": 1384.9641479492188, + "kl_loss_3": 983.0798065185547, + "kl_loss_7": 249.6033966064453, + "learning_rate": 0.0009314173287433873, + "loss": 677.7067, + "step": 1770 + }, + { + "ce_loss_10": 3.6371870756149294, + "ce_loss_13": 3.565370166301727, + "ce_loss_2": 4.248478496074677, + "ce_loss_3": 4.042388367652893, + "ce_loss_7": 3.7081421256065368, + "epoch": 0.178, + "grad_norm": 544.0, + "kl_loss_10": 117.58927421569824, + "kl_loss_2": 1410.781103515625, + "kl_loss_3": 995.2456298828125, + "kl_loss_7": 252.9298988342285, + "learning_rate": 0.0009306131215901003, + "loss": 681.0704, + "step": 1780 + }, + { + "ce_loss_10": 3.6657212376594543, + "ce_loss_13": 3.5942596793174744, + "ce_loss_2": 4.267256224155426, + "ce_loss_3": 4.070630991458893, + "ce_loss_7": 3.736851954460144, + "epoch": 0.179, + "grad_norm": 608.0, + "kl_loss_10": 117.64482841491699, + "kl_loss_2": 1382.4147399902345, + "kl_loss_3": 974.7222351074219, + "kl_loss_7": 254.88860549926758, + "learning_rate": 0.0009298045781674596, + "loss": 674.4276, + "step": 1790 + }, + { + "ce_loss_10": 3.6482357382774353, + "ce_loss_13": 3.577735936641693, + "ce_loss_2": 4.238207507133484, + "ce_loss_3": 4.047251141071319, + "ce_loss_7": 3.7247403979301454, + "epoch": 0.18, + "grad_norm": 584.0, + "kl_loss_10": 113.15109825134277, + "kl_loss_2": 1356.1355346679688, + "kl_loss_3": 966.8513641357422, + "kl_loss_7": 260.6168983459473, + "learning_rate": 0.0009289917066174886, + "loss": 687.0212, + "step": 1800 + }, + { + "ce_loss_10": 3.6436230182647704, + "ce_loss_13": 3.573524606227875, + "ce_loss_2": 4.205891370773315, + "ce_loss_3": 4.035960531234741, + "ce_loss_7": 3.713192844390869, + "epoch": 0.181, + "grad_norm": 644.0, + "kl_loss_10": 111.37209777832031, + "kl_loss_2": 1312.0460144042968, + "kl_loss_3": 951.6195190429687, + "kl_loss_7": 248.27648315429687, + "learning_rate": 0.0009281745151257945, + "loss": 665.2686, + "step": 1810 + }, + { + "ce_loss_10": 3.6573068499565125, + "ce_loss_13": 3.5899064898490907, + "ce_loss_2": 4.263534939289093, + "ce_loss_3": 4.073390209674836, + "ce_loss_7": 3.725028562545776, + "epoch": 0.182, + "grad_norm": 496.0, + "kl_loss_10": 112.47701683044434, + "kl_loss_2": 1362.4901000976563, + "kl_loss_3": 982.173095703125, + "kl_loss_7": 248.95221557617188, + "learning_rate": 0.0009273530119214868, + "loss": 681.1132, + "step": 1820 + }, + { + "ce_loss_10": 3.7659960746765138, + "ce_loss_13": 3.6931302428245543, + "ce_loss_2": 4.335440850257873, + "ce_loss_3": 4.146318483352661, + "ce_loss_7": 3.831969678401947, + "epoch": 0.183, + "grad_norm": 460.0, + "kl_loss_10": 115.37424812316894, + "kl_loss_2": 1332.8465270996094, + "kl_loss_3": 945.53359375, + "kl_loss_7": 244.2625930786133, + "learning_rate": 0.0009265272052770935, + "loss": 653.1528, + "step": 1830 + }, + { + "ce_loss_10": 3.573833405971527, + "ce_loss_13": 3.504916477203369, + "ce_loss_2": 4.191505336761475, + "ce_loss_3": 4.003697621822357, + "ce_loss_7": 3.6443989157676695, + "epoch": 0.184, + "grad_norm": 524.0, + "kl_loss_10": 110.15304069519043, + "kl_loss_2": 1378.9180969238282, + "kl_loss_3": 997.8934539794922, + "kl_loss_7": 241.1827133178711, + "learning_rate": 0.0009256971035084784, + "loss": 679.6828, + "step": 1840 + }, + { + "ce_loss_10": 3.5141557097434997, + "ce_loss_13": 3.4410739183425902, + "ce_loss_2": 4.1375454545021055, + "ce_loss_3": 3.934183955192566, + "ce_loss_7": 3.588452696800232, + "epoch": 0.185, + "grad_norm": 528.0, + "kl_loss_10": 114.25855445861816, + "kl_loss_2": 1412.8881896972657, + "kl_loss_3": 1019.4745208740235, + "kl_loss_7": 253.67017517089843, + "learning_rate": 0.0009248627149747573, + "loss": 690.3363, + "step": 1850 + }, + { + "ce_loss_10": 3.7252640962600707, + "ce_loss_13": 3.653822290897369, + "ce_loss_2": 4.3040543556213375, + "ce_loss_3": 4.132464408874512, + "ce_loss_7": 3.793966567516327, + "epoch": 0.186, + "grad_norm": 564.0, + "kl_loss_10": 115.14772300720215, + "kl_loss_2": 1340.048565673828, + "kl_loss_3": 980.2122955322266, + "kl_loss_7": 244.52606735229492, + "learning_rate": 0.0009240240480782129, + "loss": 674.7569, + "step": 1860 + }, + { + "ce_loss_10": 3.635197627544403, + "ce_loss_13": 3.561525750160217, + "ce_loss_2": 4.234712994098663, + "ce_loss_3": 4.036596286296844, + "ce_loss_7": 3.7000155448913574, + "epoch": 0.187, + "grad_norm": 442.0, + "kl_loss_10": 116.40899467468262, + "kl_loss_2": 1366.7482482910157, + "kl_loss_3": 985.8397155761719, + "kl_loss_7": 245.4348571777344, + "learning_rate": 0.0009231811112642122, + "loss": 670.6495, + "step": 1870 + }, + { + "ce_loss_10": 3.680171477794647, + "ce_loss_13": 3.607526624202728, + "ce_loss_2": 4.242461228370667, + "ce_loss_3": 4.0597851276397705, + "ce_loss_7": 3.7417822241783143, + "epoch": 0.188, + "grad_norm": 462.0, + "kl_loss_10": 115.97058601379395, + "kl_loss_2": 1329.476806640625, + "kl_loss_3": 944.8101593017578, + "kl_loss_7": 240.48009033203124, + "learning_rate": 0.0009223339130211192, + "loss": 656.504, + "step": 1880 + }, + { + "ce_loss_10": 3.527233564853668, + "ce_loss_13": 3.456979143619537, + "ce_loss_2": 4.1365337610244755, + "ce_loss_3": 3.9334477186203003, + "ce_loss_7": 3.5922507286071776, + "epoch": 0.189, + "grad_norm": 492.0, + "kl_loss_10": 120.61857757568359, + "kl_loss_2": 1391.3780578613282, + "kl_loss_3": 981.7018951416015, + "kl_loss_7": 240.8455017089844, + "learning_rate": 0.0009214824618802108, + "loss": 678.3247, + "step": 1890 + }, + { + "ce_loss_10": 3.715823400020599, + "ce_loss_13": 3.639923906326294, + "ce_loss_2": 4.3134965896606445, + "ce_loss_3": 4.111241257190704, + "ce_loss_7": 3.779346799850464, + "epoch": 0.19, + "grad_norm": 456.0, + "kl_loss_10": 127.23999710083008, + "kl_loss_2": 1364.8941589355468, + "kl_loss_3": 960.9680725097656, + "kl_loss_7": 248.77009048461915, + "learning_rate": 0.0009206267664155906, + "loss": 685.2967, + "step": 1900 + }, + { + "ce_loss_10": 3.6354769825935365, + "ce_loss_13": 3.556038224697113, + "ce_loss_2": 4.224778318405152, + "ce_loss_3": 4.022096812725067, + "ce_loss_7": 3.690963363647461, + "epoch": 0.191, + "grad_norm": 524.0, + "kl_loss_10": 125.37596015930175, + "kl_loss_2": 1371.057391357422, + "kl_loss_3": 969.1447021484375, + "kl_loss_7": 243.19865951538085, + "learning_rate": 0.0009197668352441024, + "loss": 678.1597, + "step": 1910 + }, + { + "ce_loss_10": 3.6849255323410035, + "ce_loss_13": 3.6070022225379943, + "ce_loss_2": 4.272895455360413, + "ce_loss_3": 4.0722639799118046, + "ce_loss_7": 3.741350519657135, + "epoch": 0.192, + "grad_norm": 512.0, + "kl_loss_10": 128.65237312316896, + "kl_loss_2": 1349.729931640625, + "kl_loss_3": 949.7346984863282, + "kl_loss_7": 242.41346817016603, + "learning_rate": 0.0009189026770252437, + "loss": 671.3585, + "step": 1920 + }, + { + "ce_loss_10": 3.7201656699180603, + "ce_loss_13": 3.6394999861717223, + "ce_loss_2": 4.302338600158691, + "ce_loss_3": 4.1032923579216005, + "ce_loss_7": 3.7764319658279417, + "epoch": 0.193, + "grad_norm": 458.0, + "kl_loss_10": 133.32494201660157, + "kl_loss_2": 1342.450421142578, + "kl_loss_3": 949.4765075683594, + "kl_loss_7": 246.09827728271483, + "learning_rate": 0.000918034300461078, + "loss": 688.7368, + "step": 1930 + }, + { + "ce_loss_10": 3.747203004360199, + "ce_loss_13": 3.6689361929893494, + "ce_loss_2": 4.312567710876465, + "ce_loss_3": 4.122261881828308, + "ce_loss_7": 3.8043017029762267, + "epoch": 0.194, + "grad_norm": 446.0, + "kl_loss_10": 129.00423164367675, + "kl_loss_2": 1325.102197265625, + "kl_loss_3": 931.5237396240234, + "kl_loss_7": 241.53863983154298, + "learning_rate": 0.0009171617142961477, + "loss": 661.2737, + "step": 1940 + }, + { + "ce_loss_10": 3.699472951889038, + "ce_loss_13": 3.6279419898986816, + "ce_loss_2": 4.281847763061523, + "ce_loss_3": 4.083541011810302, + "ce_loss_7": 3.7648919463157653, + "epoch": 0.195, + "grad_norm": 434.0, + "kl_loss_10": 121.35350723266602, + "kl_loss_2": 1352.1436584472656, + "kl_loss_3": 952.5272399902344, + "kl_loss_7": 240.53460235595702, + "learning_rate": 0.0009162849273173857, + "loss": 665.7376, + "step": 1950 + }, + { + "ce_loss_10": 3.632410800457001, + "ce_loss_13": 3.5614632248878477, + "ce_loss_2": 4.223404765129089, + "ce_loss_3": 4.023203945159912, + "ce_loss_7": 3.700835573673248, + "epoch": 0.196, + "grad_norm": 470.0, + "kl_loss_10": 118.8283805847168, + "kl_loss_2": 1344.0367370605468, + "kl_loss_3": 944.1380065917969, + "kl_loss_7": 251.04139099121093, + "learning_rate": 0.0009154039483540273, + "loss": 672.422, + "step": 1960 + }, + { + "ce_loss_10": 3.6197654128074648, + "ce_loss_13": 3.546481454372406, + "ce_loss_2": 4.201360607147217, + "ce_loss_3": 4.004413700103759, + "ce_loss_7": 3.682723355293274, + "epoch": 0.197, + "grad_norm": 406.0, + "kl_loss_10": 120.08837623596192, + "kl_loss_2": 1349.5242309570312, + "kl_loss_3": 942.7119018554688, + "kl_loss_7": 243.98333435058595, + "learning_rate": 0.0009145187862775209, + "loss": 667.6594, + "step": 1970 + }, + { + "ce_loss_10": 3.6506257891654967, + "ce_loss_13": 3.5804669737815855, + "ce_loss_2": 4.243929970264435, + "ce_loss_3": 4.034862732887268, + "ce_loss_7": 3.7135458827018737, + "epoch": 0.198, + "grad_norm": 620.0, + "kl_loss_10": 117.48477897644042, + "kl_loss_2": 1377.6264770507812, + "kl_loss_3": 958.5087188720703, + "kl_loss_7": 243.4327537536621, + "learning_rate": 0.0009136294500014386, + "loss": 665.5496, + "step": 1980 + }, + { + "ce_loss_10": 3.599961686134338, + "ce_loss_13": 3.528086531162262, + "ce_loss_2": 4.217166924476624, + "ce_loss_3": 4.008637738227844, + "ce_loss_7": 3.6669308066368105, + "epoch": 0.199, + "grad_norm": 616.0, + "kl_loss_10": 115.34629516601562, + "kl_loss_2": 1399.4097900390625, + "kl_loss_3": 983.5843353271484, + "kl_loss_7": 244.90453720092773, + "learning_rate": 0.000912735948481387, + "loss": 681.3188, + "step": 1990 + }, + { + "ce_loss_10": 3.6347181677818297, + "ce_loss_13": 3.560755395889282, + "ce_loss_2": 4.230910205841065, + "ce_loss_3": 4.03362866640091, + "ce_loss_7": 3.700130546092987, + "epoch": 0.2, + "grad_norm": 492.0, + "kl_loss_10": 115.55288009643554, + "kl_loss_2": 1372.0876098632812, + "kl_loss_3": 976.6782318115235, + "kl_loss_7": 248.47412033081054, + "learning_rate": 0.0009118382907149164, + "loss": 666.3086, + "step": 2000 + }, + { + "ce_loss_10": 3.6592599511146546, + "ce_loss_13": 3.5870088934898376, + "ce_loss_2": 4.23843743801117, + "ce_loss_3": 4.045619630813599, + "ce_loss_7": 3.722980320453644, + "epoch": 0.201, + "grad_norm": 492.0, + "kl_loss_10": 114.64969139099121, + "kl_loss_2": 1351.183447265625, + "kl_loss_3": 956.2785675048829, + "kl_loss_7": 247.1648811340332, + "learning_rate": 0.0009109364857414306, + "loss": 658.4385, + "step": 2010 + }, + { + "ce_loss_10": 3.6247077345848084, + "ce_loss_13": 3.5549973130226133, + "ce_loss_2": 4.192802679538727, + "ce_loss_3": 4.006897258758545, + "ce_loss_7": 3.694356381893158, + "epoch": 0.202, + "grad_norm": 432.0, + "kl_loss_10": 111.96462211608886, + "kl_loss_2": 1332.7575988769531, + "kl_loss_3": 943.8550109863281, + "kl_loss_7": 248.51104660034179, + "learning_rate": 0.0009100305426420956, + "loss": 673.1317, + "step": 2020 + }, + { + "ce_loss_10": 3.5841406345367433, + "ce_loss_13": 3.5164321780204775, + "ce_loss_2": 4.202693927288055, + "ce_loss_3": 3.9953248143196105, + "ce_loss_7": 3.650038242340088, + "epoch": 0.203, + "grad_norm": 432.0, + "kl_loss_10": 113.3315975189209, + "kl_loss_2": 1413.3973693847656, + "kl_loss_3": 984.508837890625, + "kl_loss_7": 247.24474029541017, + "learning_rate": 0.0009091204705397484, + "loss": 669.0848, + "step": 2030 + }, + { + "ce_loss_10": 3.585637128353119, + "ce_loss_13": 3.5091155648231505, + "ce_loss_2": 4.185344040393829, + "ce_loss_3": 3.992532753944397, + "ce_loss_7": 3.6510769963264464, + "epoch": 0.204, + "grad_norm": 448.0, + "kl_loss_10": 124.74103927612305, + "kl_loss_2": 1400.9709167480469, + "kl_loss_3": 992.3661834716797, + "kl_loss_7": 250.7290901184082, + "learning_rate": 0.0009082062785988049, + "loss": 681.1268, + "step": 2040 + }, + { + "ce_loss_10": 3.721049964427948, + "ce_loss_13": 3.6463207244873046, + "ce_loss_2": 4.2808568477630615, + "ce_loss_3": 4.093539321422577, + "ce_loss_7": 3.7847790718078613, + "epoch": 0.205, + "grad_norm": 466.0, + "kl_loss_10": 117.74674758911132, + "kl_loss_2": 1322.8293701171874, + "kl_loss_3": 931.9617309570312, + "kl_loss_7": 242.782958984375, + "learning_rate": 0.0009072879760251679, + "loss": 667.7382, + "step": 2050 + }, + { + "ce_loss_10": 3.6576568126678466, + "ce_loss_13": 3.5876036405563356, + "ce_loss_2": 4.261196970939636, + "ce_loss_3": 4.065271866321564, + "ce_loss_7": 3.727122116088867, + "epoch": 0.206, + "grad_norm": 510.0, + "kl_loss_10": 116.0688491821289, + "kl_loss_2": 1368.6263488769532, + "kl_loss_3": 974.0030578613281, + "kl_loss_7": 247.75207290649413, + "learning_rate": 0.0009063655720661341, + "loss": 667.9454, + "step": 2060 + }, + { + "ce_loss_10": 3.7091850519180296, + "ce_loss_13": 3.6359564065933228, + "ce_loss_2": 4.2861632108688354, + "ce_loss_3": 4.091731405258178, + "ce_loss_7": 3.780376970767975, + "epoch": 0.207, + "grad_norm": 756.0, + "kl_loss_10": 117.6738265991211, + "kl_loss_2": 1338.1011291503905, + "kl_loss_3": 948.2550628662109, + "kl_loss_7": 265.19689025878904, + "learning_rate": 0.000905439076010301, + "loss": 666.8086, + "step": 2070 + }, + { + "ce_loss_10": 3.661241602897644, + "ce_loss_13": 3.5897186398506165, + "ce_loss_2": 4.2563663721084595, + "ce_loss_3": 4.06014586687088, + "ce_loss_7": 3.7477613568305967, + "epoch": 0.208, + "grad_norm": 502.0, + "kl_loss_10": 114.75448532104492, + "kl_loss_2": 1354.8022583007812, + "kl_loss_3": 965.9129913330078, + "kl_loss_7": 280.605126953125, + "learning_rate": 0.0009045084971874737, + "loss": 668.0169, + "step": 2080 + }, + { + "ce_loss_10": 3.6376566290855408, + "ce_loss_13": 3.568475866317749, + "ce_loss_2": 4.219207537174225, + "ce_loss_3": 4.027460336685181, + "ce_loss_7": 3.7098584175109863, + "epoch": 0.209, + "grad_norm": 476.0, + "kl_loss_10": 112.78402214050293, + "kl_loss_2": 1340.8613708496093, + "kl_loss_3": 948.0414184570312, + "kl_loss_7": 264.3713745117187, + "learning_rate": 0.0009035738449685707, + "loss": 673.0266, + "step": 2090 + }, + { + "ce_loss_10": 3.5796483874320986, + "ce_loss_13": 3.507876431941986, + "ce_loss_2": 4.179275572299957, + "ce_loss_3": 3.9804170727729797, + "ce_loss_7": 3.6553168416023256, + "epoch": 0.21, + "grad_norm": 576.0, + "kl_loss_10": 116.00268287658692, + "kl_loss_2": 1358.9301879882812, + "kl_loss_3": 960.5301177978515, + "kl_loss_7": 258.80527420043944, + "learning_rate": 0.0009026351287655293, + "loss": 660.1454, + "step": 2100 + }, + { + "ce_loss_10": 3.7848559260368346, + "ce_loss_13": 3.7130728244781492, + "ce_loss_2": 4.324071049690247, + "ce_loss_3": 4.141572403907776, + "ce_loss_7": 3.8475868105888367, + "epoch": 0.211, + "grad_norm": 410.0, + "kl_loss_10": 115.61974067687989, + "kl_loss_2": 1276.820086669922, + "kl_loss_3": 901.7400726318359, + "kl_loss_7": 240.78678359985352, + "learning_rate": 0.0009016923580312113, + "loss": 636.6335, + "step": 2110 + }, + { + "ce_loss_10": 3.6301342844963074, + "ce_loss_13": 3.560654580593109, + "ce_loss_2": 4.20012868642807, + "ce_loss_3": 4.0125791192054745, + "ce_loss_7": 3.6946483969688417, + "epoch": 0.212, + "grad_norm": 462.0, + "kl_loss_10": 112.99716110229492, + "kl_loss_2": 1313.61005859375, + "kl_loss_3": 932.8424255371094, + "kl_loss_7": 240.82636260986328, + "learning_rate": 0.0009007455422593077, + "loss": 661.1402, + "step": 2120 + }, + { + "ce_loss_10": 3.643904185295105, + "ce_loss_13": 3.5732839465141297, + "ce_loss_2": 4.229895269870758, + "ce_loss_3": 4.039864921569825, + "ce_loss_7": 3.7100953698158263, + "epoch": 0.213, + "grad_norm": 544.0, + "kl_loss_10": 113.59500656127929, + "kl_loss_2": 1376.1138916015625, + "kl_loss_3": 982.583627319336, + "kl_loss_7": 245.73143768310547, + "learning_rate": 0.0008997946909842425, + "loss": 673.5951, + "step": 2130 + }, + { + "ce_loss_10": 3.6592751264572145, + "ce_loss_13": 3.5883899569511413, + "ce_loss_2": 4.269070100784302, + "ce_loss_3": 4.073972117900849, + "ce_loss_7": 3.7271844029426573, + "epoch": 0.214, + "grad_norm": 486.0, + "kl_loss_10": 115.54258766174317, + "kl_loss_2": 1390.4590087890624, + "kl_loss_3": 991.5129302978515, + "kl_loss_7": 248.1077392578125, + "learning_rate": 0.0008988398137810777, + "loss": 666.5385, + "step": 2140 + }, + { + "ce_loss_10": 3.696693778038025, + "ce_loss_13": 3.6274760484695436, + "ce_loss_2": 4.272996628284455, + "ce_loss_3": 4.077895438671112, + "ce_loss_7": 3.759436583518982, + "epoch": 0.215, + "grad_norm": 410.0, + "kl_loss_10": 109.15078239440918, + "kl_loss_2": 1323.0664428710938, + "kl_loss_3": 929.4178771972656, + "kl_loss_7": 235.6734588623047, + "learning_rate": 0.0008978809202654162, + "loss": 648.7643, + "step": 2150 + }, + { + "ce_loss_10": 3.674948477745056, + "ce_loss_13": 3.6074150681495665, + "ce_loss_2": 4.254977214336395, + "ce_loss_3": 4.054577016830445, + "ce_loss_7": 3.7380695223808287, + "epoch": 0.216, + "grad_norm": 342.0, + "kl_loss_10": 111.83034286499023, + "kl_loss_2": 1326.1663391113282, + "kl_loss_3": 930.476919555664, + "kl_loss_7": 237.29275283813476, + "learning_rate": 0.0008969180200933046, + "loss": 659.8788, + "step": 2160 + }, + { + "ce_loss_10": 3.633396315574646, + "ce_loss_13": 3.5632909893989564, + "ce_loss_2": 4.235332405567169, + "ce_loss_3": 4.043020272254944, + "ce_loss_7": 3.700575852394104, + "epoch": 0.217, + "grad_norm": 426.0, + "kl_loss_10": 113.71143455505371, + "kl_loss_2": 1376.4859924316406, + "kl_loss_3": 965.2266693115234, + "kl_loss_7": 244.74252319335938, + "learning_rate": 0.0008959511229611376, + "loss": 671.5447, + "step": 2170 + }, + { + "ce_loss_10": 3.7160756826400756, + "ce_loss_13": 3.6463282823562624, + "ce_loss_2": 4.29187992811203, + "ce_loss_3": 4.0930745005607605, + "ce_loss_7": 3.77754408121109, + "epoch": 0.218, + "grad_norm": 494.0, + "kl_loss_10": 112.80306968688964, + "kl_loss_2": 1327.3540283203124, + "kl_loss_3": 931.4955108642578, + "kl_loss_7": 236.75977325439453, + "learning_rate": 0.0008949802386055581, + "loss": 652.6458, + "step": 2180 + }, + { + "ce_loss_10": 3.5766260862350463, + "ce_loss_13": 3.5034859418869018, + "ce_loss_2": 4.159404098987579, + "ce_loss_3": 3.9599217534065247, + "ce_loss_7": 3.6389345288276673, + "epoch": 0.219, + "grad_norm": 466.0, + "kl_loss_10": 111.61733703613281, + "kl_loss_2": 1335.0789672851563, + "kl_loss_3": 936.6266204833985, + "kl_loss_7": 234.00814971923828, + "learning_rate": 0.0008940053768033609, + "loss": 665.0317, + "step": 2190 + }, + { + "ce_loss_10": 3.65551677942276, + "ce_loss_13": 3.58828284740448, + "ce_loss_2": 4.223571491241455, + "ce_loss_3": 4.04930864572525, + "ce_loss_7": 3.7170740485191347, + "epoch": 0.22, + "grad_norm": 500.0, + "kl_loss_10": 111.12692604064941, + "kl_loss_2": 1315.6073425292968, + "kl_loss_3": 947.5290161132813, + "kl_loss_7": 230.48658447265626, + "learning_rate": 0.0008930265473713938, + "loss": 654.9715, + "step": 2200 + }, + { + "ce_loss_10": 3.6195818066596983, + "ce_loss_13": 3.54861319065094, + "ce_loss_2": 4.198423433303833, + "ce_loss_3": 4.012469959259033, + "ce_loss_7": 3.681425595283508, + "epoch": 0.221, + "grad_norm": 528.0, + "kl_loss_10": 115.1269718170166, + "kl_loss_2": 1324.1462280273438, + "kl_loss_3": 954.0400299072265, + "kl_loss_7": 233.1149475097656, + "learning_rate": 0.0008920437601664579, + "loss": 648.2547, + "step": 2210 + }, + { + "ce_loss_10": 3.6091471552848815, + "ce_loss_13": 3.539783036708832, + "ce_loss_2": 4.181177127361297, + "ce_loss_3": 3.993445408344269, + "ce_loss_7": 3.669792366027832, + "epoch": 0.222, + "grad_norm": 410.0, + "kl_loss_10": 115.02236862182617, + "kl_loss_2": 1333.0483093261719, + "kl_loss_3": 948.5272521972656, + "kl_loss_7": 236.40514373779297, + "learning_rate": 0.0008910570250852097, + "loss": 647.6241, + "step": 2220 + }, + { + "ce_loss_10": 3.7300463914871216, + "ce_loss_13": 3.657940351963043, + "ce_loss_2": 4.2689752101898195, + "ce_loss_3": 4.08460431098938, + "ce_loss_7": 3.786776435375214, + "epoch": 0.223, + "grad_norm": 396.0, + "kl_loss_10": 119.05343589782714, + "kl_loss_2": 1270.7809265136718, + "kl_loss_3": 894.4331726074219, + "kl_loss_7": 233.2676574707031, + "learning_rate": 0.0008900663520640604, + "loss": 634.1773, + "step": 2230 + }, + { + "ce_loss_10": 3.668592298030853, + "ce_loss_13": 3.5982241868972777, + "ce_loss_2": 4.234554326534271, + "ce_loss_3": 4.045566046237946, + "ce_loss_7": 3.7271997809410093, + "epoch": 0.224, + "grad_norm": 484.0, + "kl_loss_10": 115.87549057006837, + "kl_loss_2": 1305.9935668945313, + "kl_loss_3": 922.4547302246094, + "kl_loss_7": 233.83654251098633, + "learning_rate": 0.0008890717510790764, + "loss": 651.4086, + "step": 2240 + }, + { + "ce_loss_10": 3.624187970161438, + "ce_loss_13": 3.553602933883667, + "ce_loss_2": 4.209651458263397, + "ce_loss_3": 4.014156377315521, + "ce_loss_7": 3.6865435361862184, + "epoch": 0.225, + "grad_norm": 428.0, + "kl_loss_10": 111.85544166564941, + "kl_loss_2": 1344.305419921875, + "kl_loss_3": 948.2704040527344, + "kl_loss_7": 233.77221450805663, + "learning_rate": 0.0008880732321458784, + "loss": 659.9288, + "step": 2250 + }, + { + "ce_loss_10": 3.6572599172592164, + "ce_loss_13": 3.589207625389099, + "ce_loss_2": 4.229009163379669, + "ce_loss_3": 4.035797142982483, + "ce_loss_7": 3.720983362197876, + "epoch": 0.226, + "grad_norm": 450.0, + "kl_loss_10": 112.32478141784668, + "kl_loss_2": 1315.7465759277343, + "kl_loss_3": 923.5756164550781, + "kl_loss_7": 233.98969955444335, + "learning_rate": 0.0008870708053195413, + "loss": 656.0779, + "step": 2260 + }, + { + "ce_loss_10": 3.6822890281677245, + "ce_loss_13": 3.613891136646271, + "ce_loss_2": 4.243361723423004, + "ce_loss_3": 4.054495620727539, + "ce_loss_7": 3.7477814078330995, + "epoch": 0.227, + "grad_norm": 510.0, + "kl_loss_10": 109.18305702209473, + "kl_loss_2": 1298.216571044922, + "kl_loss_3": 915.6232360839844, + "kl_loss_7": 232.2860221862793, + "learning_rate": 0.0008860644806944918, + "loss": 640.8393, + "step": 2270 + }, + { + "ce_loss_10": 3.6228482365608214, + "ce_loss_13": 3.55364191532135, + "ce_loss_2": 4.206750881671906, + "ce_loss_3": 4.00951054096222, + "ce_loss_7": 3.692072665691376, + "epoch": 0.228, + "grad_norm": 516.0, + "kl_loss_10": 112.56623115539551, + "kl_loss_2": 1346.1175109863282, + "kl_loss_3": 947.8353302001954, + "kl_loss_7": 250.14039154052733, + "learning_rate": 0.0008850542684044079, + "loss": 646.6089, + "step": 2280 + }, + { + "ce_loss_10": 3.594875121116638, + "ce_loss_13": 3.5233037948608397, + "ce_loss_2": 4.204600942134857, + "ce_loss_3": 3.998581278324127, + "ce_loss_7": 3.665067207813263, + "epoch": 0.229, + "grad_norm": 450.0, + "kl_loss_10": 112.58180122375488, + "kl_loss_2": 1387.7576477050782, + "kl_loss_3": 974.7930572509765, + "kl_loss_7": 248.70092544555663, + "learning_rate": 0.0008840401786221159, + "loss": 661.1442, + "step": 2290 + }, + { + "ce_loss_10": 3.731163203716278, + "ce_loss_13": 3.6657732129096985, + "ce_loss_2": 4.296657812595368, + "ce_loss_3": 4.099385142326355, + "ce_loss_7": 3.791227328777313, + "epoch": 0.23, + "grad_norm": 480.0, + "kl_loss_10": 108.29386520385742, + "kl_loss_2": 1301.2722473144531, + "kl_loss_3": 910.2853973388671, + "kl_loss_7": 230.31064682006837, + "learning_rate": 0.000883022221559489, + "loss": 636.6557, + "step": 2300 + }, + { + "ce_loss_10": 3.68310546875, + "ce_loss_13": 3.615295338630676, + "ce_loss_2": 4.257085943222046, + "ce_loss_3": 4.067182207107544, + "ce_loss_7": 3.7437336564064028, + "epoch": 0.231, + "grad_norm": 446.0, + "kl_loss_10": 109.58497161865235, + "kl_loss_2": 1331.8181457519531, + "kl_loss_3": 935.3311614990234, + "kl_loss_7": 230.688321685791, + "learning_rate": 0.0008820004074673434, + "loss": 666.0036, + "step": 2310 + }, + { + "ce_loss_10": 3.588702178001404, + "ce_loss_13": 3.524743127822876, + "ce_loss_2": 4.165178096294403, + "ce_loss_3": 3.97126362323761, + "ce_loss_7": 3.6520536303520204, + "epoch": 0.232, + "grad_norm": 494.0, + "kl_loss_10": 105.68032302856446, + "kl_loss_2": 1342.763037109375, + "kl_loss_3": 936.0263000488281, + "kl_loss_7": 229.52542724609376, + "learning_rate": 0.0008809747466353355, + "loss": 641.3422, + "step": 2320 + }, + { + "ce_loss_10": 3.602530860900879, + "ce_loss_13": 3.5331971526145933, + "ce_loss_2": 4.167471373081208, + "ce_loss_3": 3.977475893497467, + "ce_loss_7": 3.6642409324645997, + "epoch": 0.233, + "grad_norm": 458.0, + "kl_loss_10": 110.41828346252441, + "kl_loss_2": 1312.3292907714845, + "kl_loss_3": 919.6944396972656, + "kl_loss_7": 231.60648040771486, + "learning_rate": 0.0008799452493918585, + "loss": 645.3, + "step": 2330 + }, + { + "ce_loss_10": 3.6880576372146607, + "ce_loss_13": 3.6198028326034546, + "ce_loss_2": 4.254074168205261, + "ce_loss_3": 4.062888276576996, + "ce_loss_7": 3.7507563471794128, + "epoch": 0.234, + "grad_norm": 474.0, + "kl_loss_10": 110.42879600524903, + "kl_loss_2": 1309.2276428222656, + "kl_loss_3": 921.8907257080078, + "kl_loss_7": 233.63690185546875, + "learning_rate": 0.0008789119261039385, + "loss": 662.3614, + "step": 2340 + }, + { + "ce_loss_10": 3.5944358229637148, + "ce_loss_13": 3.5265544295310973, + "ce_loss_2": 4.165751957893372, + "ce_loss_3": 3.9788960099220274, + "ce_loss_7": 3.656530773639679, + "epoch": 0.235, + "grad_norm": 390.0, + "kl_loss_10": 106.77197875976563, + "kl_loss_2": 1303.9624755859375, + "kl_loss_3": 920.4357025146485, + "kl_loss_7": 233.87101974487305, + "learning_rate": 0.0008778747871771292, + "loss": 636.2241, + "step": 2350 + }, + { + "ce_loss_10": 3.642832565307617, + "ce_loss_13": 3.5764389514923094, + "ce_loss_2": 4.1962644219398495, + "ce_loss_3": 4.005896735191345, + "ce_loss_7": 3.703742432594299, + "epoch": 0.236, + "grad_norm": 488.0, + "kl_loss_10": 106.56211357116699, + "kl_loss_2": 1280.7797607421876, + "kl_loss_3": 899.6746459960938, + "kl_loss_7": 226.2964195251465, + "learning_rate": 0.0008768338430554083, + "loss": 628.8105, + "step": 2360 + }, + { + "ce_loss_10": 3.6540219306945803, + "ce_loss_13": 3.5862942576408385, + "ce_loss_2": 4.217021405696869, + "ce_loss_3": 4.0323525190353395, + "ce_loss_7": 3.715273082256317, + "epoch": 0.237, + "grad_norm": 446.0, + "kl_loss_10": 108.96415519714355, + "kl_loss_2": 1303.7487731933593, + "kl_loss_3": 917.9468658447265, + "kl_loss_7": 231.81985321044922, + "learning_rate": 0.0008757891042210713, + "loss": 643.5909, + "step": 2370 + }, + { + "ce_loss_10": 3.6785866141319277, + "ce_loss_13": 3.6081984996795655, + "ce_loss_2": 4.2465239524841305, + "ce_loss_3": 4.049813544750213, + "ce_loss_7": 3.7435325622558593, + "epoch": 0.238, + "grad_norm": 504.0, + "kl_loss_10": 111.76208610534668, + "kl_loss_2": 1305.791943359375, + "kl_loss_3": 916.3558563232422, + "kl_loss_7": 238.5102066040039, + "learning_rate": 0.0008747405811946271, + "loss": 645.7604, + "step": 2380 + }, + { + "ce_loss_10": 3.5631368517875672, + "ce_loss_13": 3.4966716051101683, + "ce_loss_2": 4.149629712104797, + "ce_loss_3": 3.960558259487152, + "ce_loss_7": 3.629357707500458, + "epoch": 0.239, + "grad_norm": 466.0, + "kl_loss_10": 110.7029800415039, + "kl_loss_2": 1342.1500549316406, + "kl_loss_3": 952.2329833984375, + "kl_loss_7": 240.75519790649415, + "learning_rate": 0.0008736882845346905, + "loss": 640.1473, + "step": 2390 + }, + { + "ce_loss_10": 3.6702625513076783, + "ce_loss_13": 3.598974347114563, + "ce_loss_2": 4.235510897636414, + "ce_loss_3": 4.041280698776245, + "ce_loss_7": 3.734322738647461, + "epoch": 0.24, + "grad_norm": 504.0, + "kl_loss_10": 116.23694610595703, + "kl_loss_2": 1302.5589904785156, + "kl_loss_3": 916.9543487548829, + "kl_loss_7": 247.71970977783204, + "learning_rate": 0.0008726322248378774, + "loss": 637.9229, + "step": 2400 + }, + { + "ce_loss_10": 3.6652265906333925, + "ce_loss_13": 3.595516872406006, + "ce_loss_2": 4.246520745754242, + "ce_loss_3": 4.043909120559692, + "ce_loss_7": 3.7296547532081603, + "epoch": 0.241, + "grad_norm": 450.0, + "kl_loss_10": 113.42751388549804, + "kl_loss_2": 1334.7802490234376, + "kl_loss_3": 932.5218017578125, + "kl_loss_7": 240.9356887817383, + "learning_rate": 0.0008715724127386971, + "loss": 657.7121, + "step": 2410 + }, + { + "ce_loss_10": 3.7339873194694517, + "ce_loss_13": 3.6643826484680178, + "ce_loss_2": 4.278091061115265, + "ce_loss_3": 4.095656621456146, + "ce_loss_7": 3.791609489917755, + "epoch": 0.242, + "grad_norm": 462.0, + "kl_loss_10": 112.56025924682618, + "kl_loss_2": 1285.8053466796875, + "kl_loss_3": 903.9169921875, + "kl_loss_7": 234.85026092529296, + "learning_rate": 0.0008705088589094458, + "loss": 638.7832, + "step": 2420 + }, + { + "ce_loss_10": 3.745934009552002, + "ce_loss_13": 3.677195417881012, + "ce_loss_2": 4.306411802768707, + "ce_loss_3": 4.115467298030853, + "ce_loss_7": 3.8079170107841493, + "epoch": 0.243, + "grad_norm": 414.0, + "kl_loss_10": 111.40414924621582, + "kl_loss_2": 1291.0523193359375, + "kl_loss_3": 906.0183563232422, + "kl_loss_7": 231.1467155456543, + "learning_rate": 0.0008694415740600988, + "loss": 640.6179, + "step": 2430 + }, + { + "ce_loss_10": 3.595633792877197, + "ce_loss_13": 3.5286824345588683, + "ce_loss_2": 4.186046612262726, + "ce_loss_3": 3.9901591181755065, + "ce_loss_7": 3.6573471426963806, + "epoch": 0.244, + "grad_norm": 500.0, + "kl_loss_10": 109.70008354187011, + "kl_loss_2": 1348.1939697265625, + "kl_loss_3": 959.4411895751953, + "kl_loss_7": 231.17713012695313, + "learning_rate": 0.0008683705689382025, + "loss": 654.2107, + "step": 2440 + }, + { + "ce_loss_10": 3.684832978248596, + "ce_loss_13": 3.616632854938507, + "ce_loss_2": 4.231842195987701, + "ce_loss_3": 4.048018515110016, + "ce_loss_7": 3.740548253059387, + "epoch": 0.245, + "grad_norm": 450.0, + "kl_loss_10": 108.41531753540039, + "kl_loss_2": 1280.763720703125, + "kl_loss_3": 904.3856048583984, + "kl_loss_7": 224.61626434326172, + "learning_rate": 0.0008672958543287666, + "loss": 648.306, + "step": 2450 + }, + { + "ce_loss_10": 3.6943544864654543, + "ce_loss_13": 3.625825345516205, + "ce_loss_2": 4.246035170555115, + "ce_loss_3": 4.063331556320191, + "ce_loss_7": 3.75509090423584, + "epoch": 0.246, + "grad_norm": 428.0, + "kl_loss_10": 109.8709056854248, + "kl_loss_2": 1283.2248840332031, + "kl_loss_3": 904.9634002685547, + "kl_loss_7": 228.17876358032225, + "learning_rate": 0.0008662174410541554, + "loss": 632.1618, + "step": 2460 + }, + { + "ce_loss_10": 3.6546427369117738, + "ce_loss_13": 3.587355947494507, + "ce_loss_2": 4.204605233669281, + "ce_loss_3": 4.022764265537262, + "ce_loss_7": 3.717895233631134, + "epoch": 0.247, + "grad_norm": 394.0, + "kl_loss_10": 107.19166374206543, + "kl_loss_2": 1283.1351989746095, + "kl_loss_3": 906.7954956054688, + "kl_loss_7": 227.77373123168945, + "learning_rate": 0.0008651353399739787, + "loss": 642.6704, + "step": 2470 + }, + { + "ce_loss_10": 3.685038208961487, + "ce_loss_13": 3.6166242718696595, + "ce_loss_2": 4.247460579872131, + "ce_loss_3": 4.056502640247345, + "ce_loss_7": 3.7451204299926757, + "epoch": 0.248, + "grad_norm": 520.0, + "kl_loss_10": 109.50839309692383, + "kl_loss_2": 1297.0845886230468, + "kl_loss_3": 908.0451263427734, + "kl_loss_7": 229.69447708129883, + "learning_rate": 0.0008640495619849821, + "loss": 636.7805, + "step": 2480 + }, + { + "ce_loss_10": 3.6444509506225584, + "ce_loss_13": 3.5771190404891966, + "ce_loss_2": 4.202155363559723, + "ce_loss_3": 4.00964595079422, + "ce_loss_7": 3.709311318397522, + "epoch": 0.249, + "grad_norm": 492.0, + "kl_loss_10": 107.21613540649415, + "kl_loss_2": 1287.2485595703124, + "kl_loss_3": 903.1389892578125, + "kl_loss_7": 236.67683792114258, + "learning_rate": 0.0008629601180209381, + "loss": 632.6728, + "step": 2490 + }, + { + "ce_loss_10": 3.640513265132904, + "ce_loss_13": 3.571250784397125, + "ce_loss_2": 4.189491713047028, + "ce_loss_3": 4.000437986850739, + "ce_loss_7": 3.6998685002326965, + "epoch": 0.25, + "grad_norm": 388.0, + "kl_loss_10": 109.32069320678711, + "kl_loss_2": 1269.254248046875, + "kl_loss_3": 889.8983123779296, + "kl_loss_7": 235.86445999145508, + "learning_rate": 0.000861867019052535, + "loss": 634.9495, + "step": 2500 + }, + { + "ce_loss_10": 3.5532511711120605, + "ce_loss_13": 3.4857767462730407, + "ce_loss_2": 4.138734245300293, + "ce_loss_3": 3.9396822333335875, + "ce_loss_7": 3.6218135476112367, + "epoch": 0.251, + "grad_norm": 454.0, + "kl_loss_10": 108.08290519714356, + "kl_loss_2": 1328.9036437988282, + "kl_loss_3": 931.4583953857422, + "kl_loss_7": 239.18134002685548, + "learning_rate": 0.0008607702760872678, + "loss": 651.695, + "step": 2510 + }, + { + "ce_loss_10": 3.6758545279502868, + "ce_loss_13": 3.6099536180496217, + "ce_loss_2": 4.224261367321015, + "ce_loss_3": 4.035415709018707, + "ce_loss_7": 3.738192629814148, + "epoch": 0.252, + "grad_norm": 676.0, + "kl_loss_10": 109.95955963134766, + "kl_loss_2": 1264.877392578125, + "kl_loss_3": 890.2998840332032, + "kl_loss_7": 227.99790649414064, + "learning_rate": 0.0008596699001693256, + "loss": 638.9367, + "step": 2520 + }, + { + "ce_loss_10": 3.695500302314758, + "ce_loss_13": 3.61992267370224, + "ce_loss_2": 4.222428333759308, + "ce_loss_3": 4.035504674911499, + "ce_loss_7": 3.743453121185303, + "epoch": 0.253, + "grad_norm": 548.0, + "kl_loss_10": 127.16191024780274, + "kl_loss_2": 1273.7016052246095, + "kl_loss_3": 884.7771850585938, + "kl_loss_7": 228.30911254882812, + "learning_rate": 0.0008585659023794818, + "loss": 643.5041, + "step": 2530 + }, + { + "ce_loss_10": 3.637289047241211, + "ce_loss_13": 3.5686028838157653, + "ce_loss_2": 4.218254780769348, + "ce_loss_3": 4.019161069393158, + "ce_loss_7": 3.696817862987518, + "epoch": 0.254, + "grad_norm": 462.0, + "kl_loss_10": 120.60056190490722, + "kl_loss_2": 1330.6954162597656, + "kl_loss_3": 938.699496459961, + "kl_loss_7": 233.61075134277343, + "learning_rate": 0.0008574582938349817, + "loss": 644.681, + "step": 2540 + }, + { + "ce_loss_10": 3.6420682311058044, + "ce_loss_13": 3.567863130569458, + "ce_loss_2": 4.228188097476959, + "ce_loss_3": 4.029180979728698, + "ce_loss_7": 3.705214190483093, + "epoch": 0.255, + "grad_norm": 372.0, + "kl_loss_10": 117.31230735778809, + "kl_loss_2": 1354.2801513671875, + "kl_loss_3": 952.2808837890625, + "kl_loss_7": 241.81886978149413, + "learning_rate": 0.0008563470856894315, + "loss": 640.0965, + "step": 2550 + }, + { + "ce_loss_10": 3.6231508612632752, + "ce_loss_13": 3.5556546688079833, + "ce_loss_2": 4.189491558074951, + "ce_loss_3": 3.9987146973609926, + "ce_loss_7": 3.681329298019409, + "epoch": 0.256, + "grad_norm": 472.0, + "kl_loss_10": 108.91358489990235, + "kl_loss_2": 1298.7685607910157, + "kl_loss_3": 917.6602386474609, + "kl_loss_7": 230.57952499389648, + "learning_rate": 0.0008552322891326845, + "loss": 638.6699, + "step": 2560 + }, + { + "ce_loss_10": 3.5982382774353026, + "ce_loss_13": 3.528933322429657, + "ce_loss_2": 4.162684524059296, + "ce_loss_3": 3.9698683977127076, + "ce_loss_7": 3.656614398956299, + "epoch": 0.257, + "grad_norm": 434.0, + "kl_loss_10": 109.61449127197265, + "kl_loss_2": 1301.0992553710937, + "kl_loss_3": 921.5931274414063, + "kl_loss_7": 231.00868759155273, + "learning_rate": 0.0008541139153907296, + "loss": 634.7393, + "step": 2570 + }, + { + "ce_loss_10": 3.5526642203330994, + "ce_loss_13": 3.485519516468048, + "ce_loss_2": 4.114146530628204, + "ce_loss_3": 3.924081325531006, + "ce_loss_7": 3.614666759967804, + "epoch": 0.258, + "grad_norm": 548.0, + "kl_loss_10": 107.18747673034667, + "kl_loss_2": 1300.3511169433593, + "kl_loss_3": 919.3837615966797, + "kl_loss_7": 228.61345367431642, + "learning_rate": 0.0008529919757255782, + "loss": 640.0102, + "step": 2580 + }, + { + "ce_loss_10": 3.591010940074921, + "ce_loss_13": 3.519867956638336, + "ce_loss_2": 4.122671520709991, + "ce_loss_3": 3.9352630972862244, + "ce_loss_7": 3.642985260486603, + "epoch": 0.259, + "grad_norm": 462.0, + "kl_loss_10": 115.52005577087402, + "kl_loss_2": 1261.9588439941406, + "kl_loss_3": 881.7059204101563, + "kl_loss_7": 222.79493560791016, + "learning_rate": 0.0008518664814351503, + "loss": 624.9721, + "step": 2590 + }, + { + "ce_loss_10": 3.5559722065925596, + "ce_loss_13": 3.4851076006889343, + "ce_loss_2": 4.12727187871933, + "ce_loss_3": 3.9312629222869875, + "ce_loss_7": 3.6141554713249207, + "epoch": 0.26, + "grad_norm": 468.0, + "kl_loss_10": 118.2235237121582, + "kl_loss_2": 1329.9431396484374, + "kl_loss_3": 938.9490844726563, + "kl_loss_7": 229.97386627197267, + "learning_rate": 0.0008507374438531607, + "loss": 664.5543, + "step": 2600 + }, + { + "ce_loss_10": 3.52994726896286, + "ce_loss_13": 3.460645878314972, + "ce_loss_2": 4.086832702159882, + "ce_loss_3": 3.8989439606666565, + "ce_loss_7": 3.5843619465827943, + "epoch": 0.261, + "grad_norm": 454.0, + "kl_loss_10": 111.8594409942627, + "kl_loss_2": 1283.5894409179687, + "kl_loss_3": 906.6501983642578, + "kl_loss_7": 225.07547607421876, + "learning_rate": 0.0008496048743490053, + "loss": 631.2727, + "step": 2610 + }, + { + "ce_loss_10": 3.688689887523651, + "ce_loss_13": 3.6171088218688965, + "ce_loss_2": 4.232082653045654, + "ce_loss_3": 4.045144772529602, + "ce_loss_7": 3.7411328554153442, + "epoch": 0.262, + "grad_norm": 498.0, + "kl_loss_10": 112.36735153198242, + "kl_loss_2": 1262.6953674316405, + "kl_loss_3": 890.0498321533203, + "kl_loss_7": 224.3122886657715, + "learning_rate": 0.0008484687843276469, + "loss": 626.552, + "step": 2620 + }, + { + "ce_loss_10": 3.6142364263534548, + "ce_loss_13": 3.54564208984375, + "ce_loss_2": 4.1653601884841915, + "ce_loss_3": 3.979761373996735, + "ce_loss_7": 3.670048642158508, + "epoch": 0.263, + "grad_norm": 604.0, + "kl_loss_10": 113.79613609313965, + "kl_loss_2": 1291.2943481445313, + "kl_loss_3": 909.2907562255859, + "kl_loss_7": 229.63263626098632, + "learning_rate": 0.0008473291852294987, + "loss": 643.7754, + "step": 2630 + }, + { + "ce_loss_10": 3.624956822395325, + "ce_loss_13": 3.5516101837158205, + "ce_loss_2": 4.185651910305023, + "ce_loss_3": 3.9955446600914, + "ce_loss_7": 3.682393753528595, + "epoch": 0.264, + "grad_norm": 560.0, + "kl_loss_10": 118.29873504638672, + "kl_loss_2": 1318.540350341797, + "kl_loss_3": 922.2017700195313, + "kl_loss_7": 233.18995971679686, + "learning_rate": 0.0008461860885303114, + "loss": 639.2153, + "step": 2640 + }, + { + "ce_loss_10": 3.651511311531067, + "ce_loss_13": 3.5835230231285093, + "ce_loss_2": 4.192479598522186, + "ce_loss_3": 4.010076713562012, + "ce_loss_7": 3.705660367012024, + "epoch": 0.265, + "grad_norm": 532.0, + "kl_loss_10": 120.45394592285156, + "kl_loss_2": 1256.7761352539062, + "kl_loss_3": 889.5764984130859, + "kl_loss_7": 224.92701721191406, + "learning_rate": 0.000845039505741056, + "loss": 630.0308, + "step": 2650 + }, + { + "ce_loss_10": 3.6381911635398865, + "ce_loss_13": 3.567105031013489, + "ce_loss_2": 4.197239780426026, + "ce_loss_3": 4.007714176177979, + "ce_loss_7": 3.694069528579712, + "epoch": 0.266, + "grad_norm": 476.0, + "kl_loss_10": 120.83754425048828, + "kl_loss_2": 1304.788995361328, + "kl_loss_3": 924.8183044433594, + "kl_loss_7": 230.70355300903321, + "learning_rate": 0.0008438894484078086, + "loss": 659.7302, + "step": 2660 + }, + { + "ce_loss_10": 3.6475989818573, + "ce_loss_13": 3.575591731071472, + "ce_loss_2": 4.190777051448822, + "ce_loss_3": 4.004483807086944, + "ce_loss_7": 3.6995357990264894, + "epoch": 0.267, + "grad_norm": 486.0, + "kl_loss_10": 115.3904426574707, + "kl_loss_2": 1277.0895690917969, + "kl_loss_3": 898.4886535644531, + "kl_loss_7": 228.7105613708496, + "learning_rate": 0.0008427359281116334, + "loss": 634.3606, + "step": 2670 + }, + { + "ce_loss_10": 3.547777831554413, + "ce_loss_13": 3.4795953035354614, + "ce_loss_2": 4.113273656368255, + "ce_loss_3": 3.9340083956718446, + "ce_loss_7": 3.6087413907051085, + "epoch": 0.268, + "grad_norm": 572.0, + "kl_loss_10": 111.49882125854492, + "kl_loss_2": 1312.4553100585938, + "kl_loss_3": 937.0226959228515, + "kl_loss_7": 228.7356918334961, + "learning_rate": 0.0008415789564684673, + "loss": 643.8098, + "step": 2680 + }, + { + "ce_loss_10": 3.7991090655326842, + "ce_loss_13": 3.7250254154205322, + "ce_loss_2": 4.329492318630218, + "ce_loss_3": 4.153719592094421, + "ce_loss_7": 3.8572392106056212, + "epoch": 0.269, + "grad_norm": 536.0, + "kl_loss_10": 119.75568199157715, + "kl_loss_2": 1240.7658264160157, + "kl_loss_3": 900.5773712158203, + "kl_loss_7": 236.5658187866211, + "learning_rate": 0.0008404185451290017, + "loss": 621.7949, + "step": 2690 + }, + { + "ce_loss_10": 3.6571221590042113, + "ce_loss_13": 3.5904589772224424, + "ce_loss_2": 4.20020170211792, + "ce_loss_3": 4.020016396045685, + "ce_loss_7": 3.7206122040748597, + "epoch": 0.27, + "grad_norm": 612.0, + "kl_loss_10": 109.39498329162598, + "kl_loss_2": 1278.1862854003907, + "kl_loss_3": 903.8941192626953, + "kl_loss_7": 233.3107780456543, + "learning_rate": 0.0008392547057785661, + "loss": 629.3908, + "step": 2700 + }, + { + "ce_loss_10": 3.5812445521354674, + "ce_loss_13": 3.511835253238678, + "ce_loss_2": 4.144945275783539, + "ce_loss_3": 3.964122700691223, + "ce_loss_7": 3.65096001625061, + "epoch": 0.271, + "grad_norm": 536.0, + "kl_loss_10": 110.46146507263184, + "kl_loss_2": 1320.70322265625, + "kl_loss_3": 951.1240264892579, + "kl_loss_7": 251.59460906982423, + "learning_rate": 0.0008380874501370098, + "loss": 636.0247, + "step": 2710 + }, + { + "ce_loss_10": 3.576056122779846, + "ce_loss_13": 3.5100281834602356, + "ce_loss_2": 4.140222942829132, + "ce_loss_3": 3.9531075954437256, + "ce_loss_7": 3.6410070419311524, + "epoch": 0.272, + "grad_norm": 544.0, + "kl_loss_10": 110.53367462158204, + "kl_loss_2": 1306.970849609375, + "kl_loss_3": 923.5474395751953, + "kl_loss_7": 236.50457839965821, + "learning_rate": 0.0008369167899585841, + "loss": 640.9572, + "step": 2720 + }, + { + "ce_loss_10": 3.6997987627983093, + "ce_loss_13": 3.635802137851715, + "ce_loss_2": 4.225974369049072, + "ce_loss_3": 4.0530330538749695, + "ce_loss_7": 3.760606610774994, + "epoch": 0.273, + "grad_norm": 700.0, + "kl_loss_10": 106.76014976501465, + "kl_loss_2": 1235.3878662109375, + "kl_loss_3": 873.6434448242187, + "kl_loss_7": 224.53188552856446, + "learning_rate": 0.0008357427370318238, + "loss": 630.9094, + "step": 2730 + }, + { + "ce_loss_10": 3.6538386702537538, + "ce_loss_13": 3.5854528903961183, + "ce_loss_2": 4.205159163475036, + "ce_loss_3": 4.013937699794769, + "ce_loss_7": 3.712061953544617, + "epoch": 0.274, + "grad_norm": 448.0, + "kl_loss_10": 110.17894134521484, + "kl_loss_2": 1286.5609130859375, + "kl_loss_3": 904.9044860839844, + "kl_loss_7": 229.6865478515625, + "learning_rate": 0.0008345653031794292, + "loss": 635.8903, + "step": 2740 + }, + { + "ce_loss_10": 3.6535327553749086, + "ce_loss_13": 3.5872610807418823, + "ce_loss_2": 4.199507582187652, + "ce_loss_3": 4.0229881525039675, + "ce_loss_7": 3.712740111351013, + "epoch": 0.275, + "grad_norm": 494.0, + "kl_loss_10": 108.84803733825683, + "kl_loss_2": 1267.8911865234375, + "kl_loss_3": 897.5597198486328, + "kl_loss_7": 226.9661651611328, + "learning_rate": 0.0008333845002581458, + "loss": 628.4583, + "step": 2750 + }, + { + "ce_loss_10": 3.5756468772888184, + "ce_loss_13": 3.5082659125328064, + "ce_loss_2": 4.145406031608582, + "ce_loss_3": 3.954765808582306, + "ce_loss_7": 3.6372469902038573, + "epoch": 0.276, + "grad_norm": 442.0, + "kl_loss_10": 107.86047019958497, + "kl_loss_2": 1333.4900146484374, + "kl_loss_3": 936.4077697753906, + "kl_loss_7": 230.6979835510254, + "learning_rate": 0.0008322003401586462, + "loss": 647.3615, + "step": 2760 + }, + { + "ce_loss_10": 3.615983176231384, + "ce_loss_13": 3.5494200587272644, + "ce_loss_2": 4.153625726699829, + "ce_loss_3": 3.9661497712135314, + "ce_loss_7": 3.670648729801178, + "epoch": 0.277, + "grad_norm": 456.0, + "kl_loss_10": 107.68133277893067, + "kl_loss_2": 1252.6388610839845, + "kl_loss_3": 875.4474029541016, + "kl_loss_7": 220.59310073852538, + "learning_rate": 0.0008310128348054094, + "loss": 608.5761, + "step": 2770 + }, + { + "ce_loss_10": 3.581556737422943, + "ce_loss_13": 3.518260824680328, + "ce_loss_2": 4.13277485370636, + "ce_loss_3": 3.9427122831344605, + "ce_loss_7": 3.6427804708480833, + "epoch": 0.278, + "grad_norm": 556.0, + "kl_loss_10": 107.17420654296875, + "kl_loss_2": 1270.6333923339844, + "kl_loss_3": 894.9070373535156, + "kl_loss_7": 225.04671096801758, + "learning_rate": 0.0008298219961566008, + "loss": 624.6308, + "step": 2780 + }, + { + "ce_loss_10": 3.5525727391242983, + "ce_loss_13": 3.485328257083893, + "ce_loss_2": 4.1319693446159365, + "ce_loss_3": 3.9388387560844422, + "ce_loss_7": 3.6104352355003355, + "epoch": 0.279, + "grad_norm": 394.0, + "kl_loss_10": 112.5637420654297, + "kl_loss_2": 1333.4249206542968, + "kl_loss_3": 937.83544921875, + "kl_loss_7": 227.29133377075195, + "learning_rate": 0.0008286278362039527, + "loss": 635.7598, + "step": 2790 + }, + { + "ce_loss_10": 3.587259495258331, + "ce_loss_13": 3.5128440499305724, + "ce_loss_2": 4.159168899059296, + "ce_loss_3": 3.9615533709526063, + "ce_loss_7": 3.6405880570411684, + "epoch": 0.28, + "grad_norm": 402.0, + "kl_loss_10": 114.4114917755127, + "kl_loss_2": 1318.4106079101562, + "kl_loss_3": 924.2900451660156, + "kl_loss_7": 224.0070999145508, + "learning_rate": 0.0008274303669726426, + "loss": 628.2539, + "step": 2800 + }, + { + "ce_loss_10": 3.479981768131256, + "ce_loss_13": 3.411652183532715, + "ce_loss_2": 4.056045913696289, + "ce_loss_3": 3.866449761390686, + "ce_loss_7": 3.5390130996704103, + "epoch": 0.281, + "grad_norm": 484.0, + "kl_loss_10": 111.04401168823242, + "kl_loss_2": 1325.4802673339843, + "kl_loss_3": 933.757958984375, + "kl_loss_7": 223.9423583984375, + "learning_rate": 0.0008262296005211721, + "loss": 628.6442, + "step": 2810 + }, + { + "ce_loss_10": 3.6082133769989015, + "ce_loss_13": 3.543479096889496, + "ce_loss_2": 4.177137637138367, + "ce_loss_3": 3.9879695653915403, + "ce_loss_7": 3.667951965332031, + "epoch": 0.282, + "grad_norm": 436.0, + "kl_loss_10": 106.70964050292969, + "kl_loss_2": 1303.7048461914062, + "kl_loss_3": 916.7790283203125, + "kl_loss_7": 222.79779052734375, + "learning_rate": 0.0008250255489412463, + "loss": 627.094, + "step": 2820 + }, + { + "ce_loss_10": 3.716309094429016, + "ce_loss_13": 3.642316293716431, + "ce_loss_2": 4.261255824565888, + "ce_loss_3": 4.0787659049034115, + "ce_loss_7": 3.7748358964920046, + "epoch": 0.283, + "grad_norm": 604.0, + "kl_loss_10": 114.64575080871582, + "kl_loss_2": 1277.7698669433594, + "kl_loss_3": 901.2324371337891, + "kl_loss_7": 231.05035324096679, + "learning_rate": 0.0008238182243576511, + "loss": 633.2869, + "step": 2830 + }, + { + "ce_loss_10": 3.682005834579468, + "ce_loss_13": 3.615007734298706, + "ce_loss_2": 4.202854037284851, + "ce_loss_3": 4.023157751560211, + "ce_loss_7": 3.736177396774292, + "epoch": 0.284, + "grad_norm": 548.0, + "kl_loss_10": 110.75144157409667, + "kl_loss_2": 1221.345361328125, + "kl_loss_3": 870.2192474365235, + "kl_loss_7": 222.93451232910155, + "learning_rate": 0.0008226076389281315, + "loss": 611.4373, + "step": 2840 + }, + { + "ce_loss_10": 3.7233519554138184, + "ce_loss_13": 3.6542891025543214, + "ce_loss_2": 4.248092949390411, + "ce_loss_3": 4.069663691520691, + "ce_loss_7": 3.775057625770569, + "epoch": 0.285, + "grad_norm": 704.0, + "kl_loss_10": 110.19483451843261, + "kl_loss_2": 1255.0005493164062, + "kl_loss_3": 882.4262268066407, + "kl_loss_7": 222.1134048461914, + "learning_rate": 0.0008213938048432696, + "loss": 610.5205, + "step": 2850 + }, + { + "ce_loss_10": 3.64341379404068, + "ce_loss_13": 3.5780718684196473, + "ce_loss_2": 4.180006468296051, + "ce_loss_3": 3.9996572732925415, + "ce_loss_7": 3.701814925670624, + "epoch": 0.286, + "grad_norm": 442.0, + "kl_loss_10": 108.8284294128418, + "kl_loss_2": 1259.7149841308594, + "kl_loss_3": 887.2555450439453, + "kl_loss_7": 224.16595458984375, + "learning_rate": 0.0008201767343263612, + "loss": 623.8719, + "step": 2860 + }, + { + "ce_loss_10": 3.580712640285492, + "ce_loss_13": 3.514770495891571, + "ce_loss_2": 4.152428865432739, + "ce_loss_3": 3.9586745381355284, + "ce_loss_7": 3.6420669674873354, + "epoch": 0.287, + "grad_norm": 478.0, + "kl_loss_10": 104.37866973876953, + "kl_loss_2": 1290.6899536132812, + "kl_loss_3": 906.0174041748047, + "kl_loss_7": 219.9403289794922, + "learning_rate": 0.0008189564396332927, + "loss": 611.9311, + "step": 2870 + }, + { + "ce_loss_10": 3.560059654712677, + "ce_loss_13": 3.4959851503372192, + "ce_loss_2": 4.124033749103546, + "ce_loss_3": 3.9340568661689757, + "ce_loss_7": 3.6201700448989866, + "epoch": 0.288, + "grad_norm": 480.0, + "kl_loss_10": 103.69261512756347, + "kl_loss_2": 1290.621844482422, + "kl_loss_3": 906.7965057373046, + "kl_loss_7": 223.63958206176758, + "learning_rate": 0.0008177329330524181, + "loss": 627.1938, + "step": 2880 + }, + { + "ce_loss_10": 3.6303093075752257, + "ce_loss_13": 3.5619912266731264, + "ce_loss_2": 4.173935759067535, + "ce_loss_3": 3.9890891432762148, + "ce_loss_7": 3.6870488286018372, + "epoch": 0.289, + "grad_norm": 452.0, + "kl_loss_10": 105.55148658752441, + "kl_loss_2": 1245.1935241699218, + "kl_loss_3": 874.3131744384766, + "kl_loss_7": 225.99310531616212, + "learning_rate": 0.0008165062269044352, + "loss": 620.2292, + "step": 2890 + }, + { + "ce_loss_10": 3.574904942512512, + "ce_loss_13": 3.5098610281944276, + "ce_loss_2": 4.134270429611206, + "ce_loss_3": 3.941369962692261, + "ce_loss_7": 3.641903018951416, + "epoch": 0.29, + "grad_norm": 394.0, + "kl_loss_10": 109.33544578552247, + "kl_loss_2": 1282.6864868164062, + "kl_loss_3": 899.0664276123047, + "kl_loss_7": 231.66256561279297, + "learning_rate": 0.0008152763335422613, + "loss": 630.6565, + "step": 2900 + }, + { + "ce_loss_10": 3.5721667885780333, + "ce_loss_13": 3.503155696392059, + "ce_loss_2": 4.123925364017486, + "ce_loss_3": 3.935485672950745, + "ce_loss_7": 3.625267505645752, + "epoch": 0.291, + "grad_norm": 600.0, + "kl_loss_10": 111.11225318908691, + "kl_loss_2": 1285.6910400390625, + "kl_loss_3": 903.9730163574219, + "kl_loss_7": 227.13845748901366, + "learning_rate": 0.0008140432653509088, + "loss": 623.4421, + "step": 2910 + }, + { + "ce_loss_10": 3.617369520664215, + "ce_loss_13": 3.5511601328849793, + "ce_loss_2": 4.158329248428345, + "ce_loss_3": 3.9670159101486204, + "ce_loss_7": 3.6755479097366335, + "epoch": 0.292, + "grad_norm": 424.0, + "kl_loss_10": 108.97641296386719, + "kl_loss_2": 1271.4477172851562, + "kl_loss_3": 887.2839324951171, + "kl_loss_7": 224.6483947753906, + "learning_rate": 0.0008128070347473608, + "loss": 614.8932, + "step": 2920 + }, + { + "ce_loss_10": 3.6203887820243836, + "ce_loss_13": 3.5571650743484495, + "ce_loss_2": 4.184931480884552, + "ce_loss_3": 3.988680112361908, + "ce_loss_7": 3.6783168077468873, + "epoch": 0.293, + "grad_norm": 442.0, + "kl_loss_10": 106.56389541625977, + "kl_loss_2": 1309.3880310058594, + "kl_loss_3": 912.7983032226563, + "kl_loss_7": 223.91178283691406, + "learning_rate": 0.0008115676541804455, + "loss": 627.653, + "step": 2930 + }, + { + "ce_loss_10": 3.631400096416473, + "ce_loss_13": 3.565066874027252, + "ce_loss_2": 4.173557686805725, + "ce_loss_3": 3.9938668727874758, + "ce_loss_7": 3.6861080646514894, + "epoch": 0.294, + "grad_norm": 410.0, + "kl_loss_10": 107.59184074401855, + "kl_loss_2": 1258.5327880859375, + "kl_loss_3": 893.1478424072266, + "kl_loss_7": 221.63349151611328, + "learning_rate": 0.0008103251361307119, + "loss": 625.068, + "step": 2940 + }, + { + "ce_loss_10": 3.6633550047874452, + "ce_loss_13": 3.5975454568862917, + "ce_loss_2": 4.201860392093659, + "ce_loss_3": 4.0187016248703005, + "ce_loss_7": 3.7217815637588503, + "epoch": 0.295, + "grad_norm": 484.0, + "kl_loss_10": 107.80433006286621, + "kl_loss_2": 1263.7409545898438, + "kl_loss_3": 899.7593811035156, + "kl_loss_7": 224.86621551513673, + "learning_rate": 0.0008090794931103026, + "loss": 620.4886, + "step": 2950 + }, + { + "ce_loss_10": 3.6508840203285216, + "ce_loss_13": 3.588442325592041, + "ce_loss_2": 4.192799139022827, + "ce_loss_3": 4.012849128246307, + "ce_loss_7": 3.706376481056213, + "epoch": 0.296, + "grad_norm": 560.0, + "kl_loss_10": 104.73687210083008, + "kl_loss_2": 1249.5858154296875, + "kl_loss_3": 882.2532653808594, + "kl_loss_7": 217.49150695800782, + "learning_rate": 0.0008078307376628291, + "loss": 618.8026, + "step": 2960 + }, + { + "ce_loss_10": 3.714610981941223, + "ce_loss_13": 3.648031437397003, + "ce_loss_2": 4.232832741737366, + "ce_loss_3": 4.05701197385788, + "ce_loss_7": 3.7682809591293336, + "epoch": 0.297, + "grad_norm": 438.0, + "kl_loss_10": 105.18131446838379, + "kl_loss_2": 1206.7602905273438, + "kl_loss_3": 851.9172241210938, + "kl_loss_7": 215.51920394897462, + "learning_rate": 0.000806578882363245, + "loss": 597.2082, + "step": 2970 + }, + { + "ce_loss_10": 3.6252587914466856, + "ce_loss_13": 3.5611796617507934, + "ce_loss_2": 4.163775825500489, + "ce_loss_3": 3.977950668334961, + "ce_loss_7": 3.6833796977996824, + "epoch": 0.298, + "grad_norm": 648.0, + "kl_loss_10": 103.29475135803223, + "kl_loss_2": 1245.597314453125, + "kl_loss_3": 877.1612213134765, + "kl_loss_7": 219.75524444580077, + "learning_rate": 0.0008053239398177191, + "loss": 627.9783, + "step": 2980 + }, + { + "ce_loss_10": 3.602860856056213, + "ce_loss_13": 3.538487696647644, + "ce_loss_2": 4.142560148239136, + "ce_loss_3": 3.9584405183792115, + "ce_loss_7": 3.659018313884735, + "epoch": 0.299, + "grad_norm": 502.0, + "kl_loss_10": 104.49293098449706, + "kl_loss_2": 1247.2865417480468, + "kl_loss_3": 873.8518829345703, + "kl_loss_7": 218.60237731933594, + "learning_rate": 0.0008040659226635089, + "loss": 629.0394, + "step": 2990 + }, + { + "ce_loss_10": 3.737885308265686, + "ce_loss_13": 3.670609879493713, + "ce_loss_2": 4.267246758937835, + "ce_loss_3": 4.084029448032379, + "ce_loss_7": 3.801585829257965, + "epoch": 0.3, + "grad_norm": 474.0, + "kl_loss_10": 109.13075065612793, + "kl_loss_2": 1251.634942626953, + "kl_loss_3": 874.3355682373046, + "kl_loss_7": 234.4466766357422, + "learning_rate": 0.0008028048435688333, + "loss": 617.8753, + "step": 3000 + }, + { + "ce_loss_10": 3.608117866516113, + "ce_loss_13": 3.5417439699172975, + "ce_loss_2": 4.164859163761139, + "ce_loss_3": 3.9728567838668822, + "ce_loss_7": 3.666444170475006, + "epoch": 0.301, + "grad_norm": 458.0, + "kl_loss_10": 104.67718696594238, + "kl_loss_2": 1290.2385009765626, + "kl_loss_3": 895.6051879882813, + "kl_loss_7": 230.77984161376952, + "learning_rate": 0.0008015407152327448, + "loss": 624.6472, + "step": 3010 + }, + { + "ce_loss_10": 3.655743646621704, + "ce_loss_13": 3.589059603214264, + "ce_loss_2": 4.197956717014312, + "ce_loss_3": 4.010993158817291, + "ce_loss_7": 3.715561032295227, + "epoch": 0.302, + "grad_norm": 490.0, + "kl_loss_10": 108.86725807189941, + "kl_loss_2": 1260.063540649414, + "kl_loss_3": 888.4078857421875, + "kl_loss_7": 225.77315521240234, + "learning_rate": 0.0008002735503850016, + "loss": 621.0348, + "step": 3020 + }, + { + "ce_loss_10": 3.5483126521110533, + "ce_loss_13": 3.477071487903595, + "ce_loss_2": 4.1067805051803585, + "ce_loss_3": 3.9177415490150453, + "ce_loss_7": 3.613772678375244, + "epoch": 0.303, + "grad_norm": 442.0, + "kl_loss_10": 113.85512008666993, + "kl_loss_2": 1301.403955078125, + "kl_loss_3": 923.0243713378907, + "kl_loss_7": 243.21601486206055, + "learning_rate": 0.0007990033617859396, + "loss": 643.6124, + "step": 3030 + }, + { + "ce_loss_10": 3.596233379840851, + "ce_loss_13": 3.527192997932434, + "ce_loss_2": 4.134040641784668, + "ce_loss_3": 3.9505585551261904, + "ce_loss_7": 3.6559112668037415, + "epoch": 0.304, + "grad_norm": 576.0, + "kl_loss_10": 111.53803482055665, + "kl_loss_2": 1246.1058349609375, + "kl_loss_3": 878.1594299316406, + "kl_loss_7": 229.73634643554686, + "learning_rate": 0.000797730162226344, + "loss": 607.6155, + "step": 3040 + }, + { + "ce_loss_10": 3.6262240767478944, + "ce_loss_13": 3.55741890668869, + "ce_loss_2": 4.167547011375428, + "ce_loss_3": 3.981596386432648, + "ce_loss_7": 3.686588776111603, + "epoch": 0.305, + "grad_norm": 430.0, + "kl_loss_10": 113.34700317382813, + "kl_loss_2": 1258.705908203125, + "kl_loss_3": 888.1617828369141, + "kl_loss_7": 230.86616134643555, + "learning_rate": 0.0007964539645273203, + "loss": 613.4882, + "step": 3050 + }, + { + "ce_loss_10": 3.6409424901008607, + "ce_loss_13": 3.574270474910736, + "ce_loss_2": 4.167909657955169, + "ce_loss_3": 3.9862082481384276, + "ce_loss_7": 3.6940474629402162, + "epoch": 0.306, + "grad_norm": 486.0, + "kl_loss_10": 106.54784317016602, + "kl_loss_2": 1238.411444091797, + "kl_loss_3": 866.3065307617187, + "kl_loss_7": 220.125154876709, + "learning_rate": 0.000795174781540165, + "loss": 615.3713, + "step": 3060 + }, + { + "ce_loss_10": 3.721142077445984, + "ce_loss_13": 3.643355393409729, + "ce_loss_2": 4.226944315433502, + "ce_loss_3": 4.049481880664826, + "ce_loss_7": 3.771383452415466, + "epoch": 0.307, + "grad_norm": 418.0, + "kl_loss_10": 122.25658149719239, + "kl_loss_2": 1203.3186645507812, + "kl_loss_3": 851.1368927001953, + "kl_loss_7": 225.69219970703125, + "learning_rate": 0.0007938926261462366, + "loss": 615.3413, + "step": 3070 + }, + { + "ce_loss_10": 3.6604058384895324, + "ce_loss_13": 3.5915605425834656, + "ce_loss_2": 4.175356435775757, + "ce_loss_3": 3.9981507778167726, + "ce_loss_7": 3.7177716493606567, + "epoch": 0.308, + "grad_norm": 528.0, + "kl_loss_10": 111.06630897521973, + "kl_loss_2": 1238.821875, + "kl_loss_3": 876.2146820068359, + "kl_loss_7": 223.17951583862305, + "learning_rate": 0.0007926075112568258, + "loss": 625.9794, + "step": 3080 + }, + { + "ce_loss_10": 3.652525985240936, + "ce_loss_13": 3.586830127239227, + "ce_loss_2": 4.18239061832428, + "ce_loss_3": 4.001185369491577, + "ce_loss_7": 3.7117084741592405, + "epoch": 0.309, + "grad_norm": 408.0, + "kl_loss_10": 105.23004531860352, + "kl_loss_2": 1238.4955749511719, + "kl_loss_3": 879.8196685791015, + "kl_loss_7": 219.05570373535156, + "learning_rate": 0.0007913194498130252, + "loss": 606.0291, + "step": 3090 + }, + { + "ce_loss_10": 3.576726019382477, + "ce_loss_13": 3.5116322517395018, + "ce_loss_2": 4.1267429232597355, + "ce_loss_3": 3.951638638973236, + "ce_loss_7": 3.633882737159729, + "epoch": 0.31, + "grad_norm": 596.0, + "kl_loss_10": 104.83585128784179, + "kl_loss_2": 1271.4552978515626, + "kl_loss_3": 899.7277404785157, + "kl_loss_7": 221.13562393188477, + "learning_rate": 0.0007900284547855992, + "loss": 625.4858, + "step": 3100 + }, + { + "ce_loss_10": 3.585216796398163, + "ce_loss_13": 3.5201086163520814, + "ce_loss_2": 4.1151411652565, + "ce_loss_3": 3.9415447235107424, + "ce_loss_7": 3.6418359875679016, + "epoch": 0.311, + "grad_norm": 460.0, + "kl_loss_10": 104.5475685119629, + "kl_loss_2": 1231.1286071777345, + "kl_loss_3": 880.9251007080078, + "kl_loss_7": 215.18857421875, + "learning_rate": 0.0007887345391748532, + "loss": 620.3755, + "step": 3110 + }, + { + "ce_loss_10": 3.7296380400657654, + "ce_loss_13": 3.6599961280822755, + "ce_loss_2": 4.223298215866089, + "ce_loss_3": 4.0600717782974245, + "ce_loss_7": 3.7785526752471923, + "epoch": 0.312, + "grad_norm": 434.0, + "kl_loss_10": 110.47460975646973, + "kl_loss_2": 1200.8911010742188, + "kl_loss_3": 857.2419494628906, + "kl_loss_7": 215.84228134155273, + "learning_rate": 0.0007874377160105036, + "loss": 594.074, + "step": 3120 + }, + { + "ce_loss_10": 3.647064197063446, + "ce_loss_13": 3.5629413604736326, + "ce_loss_2": 4.20149587392807, + "ce_loss_3": 4.026539087295532, + "ce_loss_7": 3.7087369561195374, + "epoch": 0.313, + "grad_norm": 504.0, + "kl_loss_10": 117.26640319824219, + "kl_loss_2": 1253.9545349121095, + "kl_loss_3": 905.2646728515625, + "kl_loss_7": 235.96448516845703, + "learning_rate": 0.0007861379983515449, + "loss": 636.8147, + "step": 3130 + }, + { + "ce_loss_10": 3.7007260084152223, + "ce_loss_13": 3.631552994251251, + "ce_loss_2": 4.21818333864212, + "ce_loss_3": 4.040616655349732, + "ce_loss_7": 3.757350814342499, + "epoch": 0.314, + "grad_norm": 466.0, + "kl_loss_10": 112.17713203430176, + "kl_loss_2": 1239.388739013672, + "kl_loss_3": 879.6389221191406, + "kl_loss_7": 227.5748275756836, + "learning_rate": 0.0007848353992861195, + "loss": 608.3133, + "step": 3140 + }, + { + "ce_loss_10": 3.786464810371399, + "ce_loss_13": 3.710281562805176, + "ce_loss_2": 4.314648783206939, + "ce_loss_3": 4.134762763977051, + "ce_loss_7": 3.843652272224426, + "epoch": 0.315, + "grad_norm": 458.0, + "kl_loss_10": 124.92040100097657, + "kl_loss_2": 1240.0601806640625, + "kl_loss_3": 878.5192687988281, + "kl_loss_7": 241.41039581298827, + "learning_rate": 0.0007835299319313853, + "loss": 620.9426, + "step": 3150 + }, + { + "ce_loss_10": 3.663742733001709, + "ce_loss_13": 3.5886364459991453, + "ce_loss_2": 4.173808574676514, + "ce_loss_3": 3.994606840610504, + "ce_loss_7": 3.7157713413238525, + "epoch": 0.316, + "grad_norm": 478.0, + "kl_loss_10": 119.08418045043945, + "kl_loss_2": 1222.8418212890624, + "kl_loss_3": 864.8908721923829, + "kl_loss_7": 229.07857666015624, + "learning_rate": 0.0007822216094333848, + "loss": 627.9899, + "step": 3160 + }, + { + "ce_loss_10": 3.660254752635956, + "ce_loss_13": 3.592539119720459, + "ce_loss_2": 4.196765351295471, + "ce_loss_3": 4.01435557603836, + "ce_loss_7": 3.7205393433570864, + "epoch": 0.317, + "grad_norm": 402.0, + "kl_loss_10": 115.94363555908203, + "kl_loss_2": 1238.9798767089844, + "kl_loss_3": 878.1770599365234, + "kl_loss_7": 235.79936828613282, + "learning_rate": 0.0007809104449669101, + "loss": 611.1889, + "step": 3170 + }, + { + "ce_loss_10": 3.625234532356262, + "ce_loss_13": 3.5487714052200316, + "ce_loss_2": 4.128973770141601, + "ce_loss_3": 3.9529131054878235, + "ce_loss_7": 3.6731096148490905, + "epoch": 0.318, + "grad_norm": 524.0, + "kl_loss_10": 118.72456016540528, + "kl_loss_2": 1219.5467956542968, + "kl_loss_3": 854.3440185546875, + "kl_loss_7": 228.71927642822266, + "learning_rate": 0.0007795964517353734, + "loss": 608.5358, + "step": 3180 + }, + { + "ce_loss_10": 3.623099219799042, + "ce_loss_13": 3.54120157957077, + "ce_loss_2": 4.132599997520447, + "ce_loss_3": 3.9547854542732237, + "ce_loss_7": 3.670779359340668, + "epoch": 0.319, + "grad_norm": 438.0, + "kl_loss_10": 145.3066722869873, + "kl_loss_2": 1253.242724609375, + "kl_loss_3": 888.5025939941406, + "kl_loss_7": 249.7946647644043, + "learning_rate": 0.000778279642970672, + "loss": 614.9188, + "step": 3190 + }, + { + "ce_loss_10": 3.61579008102417, + "ce_loss_13": 3.5464967608451845, + "ce_loss_2": 4.135542809963226, + "ce_loss_3": 3.949288582801819, + "ce_loss_7": 3.6743045926094053, + "epoch": 0.32, + "grad_norm": 580.0, + "kl_loss_10": 120.59939308166504, + "kl_loss_2": 1232.2691650390625, + "kl_loss_3": 859.8254913330078, + "kl_loss_7": 236.18389816284179, + "learning_rate": 0.0007769600319330552, + "loss": 603.041, + "step": 3200 + }, + { + "ce_loss_10": 3.6462074518203735, + "ce_loss_13": 3.5768035650253296, + "ce_loss_2": 4.193181753158569, + "ce_loss_3": 4.002738869190216, + "ce_loss_7": 3.7033765077590943, + "epoch": 0.321, + "grad_norm": 536.0, + "kl_loss_10": 113.30461883544922, + "kl_loss_2": 1261.3274169921874, + "kl_loss_3": 880.6385803222656, + "kl_loss_7": 233.63602294921876, + "learning_rate": 0.0007756376319109917, + "loss": 615.0137, + "step": 3210 + }, + { + "ce_loss_10": 3.6983227729797363, + "ce_loss_13": 3.628855359554291, + "ce_loss_2": 4.215565764904023, + "ce_loss_3": 4.036277508735656, + "ce_loss_7": 3.7591845273971556, + "epoch": 0.322, + "grad_norm": 414.0, + "kl_loss_10": 113.85302391052247, + "kl_loss_2": 1215.7693420410155, + "kl_loss_3": 852.6664520263672, + "kl_loss_7": 233.59415435791016, + "learning_rate": 0.0007743124562210351, + "loss": 595.5453, + "step": 3220 + }, + { + "ce_loss_10": 3.7038231015205385, + "ce_loss_13": 3.636870324611664, + "ce_loss_2": 4.220840120315552, + "ce_loss_3": 4.040867578983307, + "ce_loss_7": 3.759516727924347, + "epoch": 0.323, + "grad_norm": 500.0, + "kl_loss_10": 116.7942398071289, + "kl_loss_2": 1231.380029296875, + "kl_loss_3": 860.8811431884766, + "kl_loss_7": 226.99792938232423, + "learning_rate": 0.0007729845182076895, + "loss": 609.7637, + "step": 3230 + }, + { + "ce_loss_10": 3.635891842842102, + "ce_loss_13": 3.570459449291229, + "ce_loss_2": 4.146735298633575, + "ce_loss_3": 3.971414268016815, + "ce_loss_7": 3.6916919469833376, + "epoch": 0.324, + "grad_norm": 544.0, + "kl_loss_10": 107.84456443786621, + "kl_loss_2": 1210.0861877441407, + "kl_loss_3": 854.6661926269531, + "kl_loss_7": 223.18558731079102, + "learning_rate": 0.0007716538312432765, + "loss": 613.749, + "step": 3240 + }, + { + "ce_loss_10": 3.5933796405792235, + "ce_loss_13": 3.5238136887550353, + "ce_loss_2": 4.137728452682495, + "ce_loss_3": 3.9503737330436706, + "ce_loss_7": 3.6502291560173035, + "epoch": 0.325, + "grad_norm": 532.0, + "kl_loss_10": 110.89730453491211, + "kl_loss_2": 1272.4953063964845, + "kl_loss_3": 899.5693664550781, + "kl_loss_7": 234.18169174194335, + "learning_rate": 0.0007703204087277988, + "loss": 621.0721, + "step": 3250 + }, + { + "ce_loss_10": 3.691065728664398, + "ce_loss_13": 3.625254142284393, + "ce_loss_2": 4.195106828212738, + "ce_loss_3": 4.023638522624969, + "ce_loss_7": 3.744424653053284, + "epoch": 0.326, + "grad_norm": 480.0, + "kl_loss_10": 108.84702529907227, + "kl_loss_2": 1187.3806762695312, + "kl_loss_3": 834.2469482421875, + "kl_loss_7": 219.25809020996093, + "learning_rate": 0.0007689842640888063, + "loss": 594.9809, + "step": 3260 + }, + { + "ce_loss_10": 3.6937523603439333, + "ce_loss_13": 3.6257047772407534, + "ce_loss_2": 4.207961022853851, + "ce_loss_3": 4.029592931270599, + "ce_loss_7": 3.7506144404411317, + "epoch": 0.327, + "grad_norm": 432.0, + "kl_loss_10": 109.73489418029786, + "kl_loss_2": 1197.2553649902343, + "kl_loss_3": 845.9240936279297, + "kl_loss_7": 224.3518325805664, + "learning_rate": 0.0007676454107812607, + "loss": 600.9104, + "step": 3270 + }, + { + "ce_loss_10": 3.6202093243598936, + "ce_loss_13": 3.556860589981079, + "ce_loss_2": 4.152219152450561, + "ce_loss_3": 3.972454571723938, + "ce_loss_7": 3.6772433161735534, + "epoch": 0.328, + "grad_norm": 552.0, + "kl_loss_10": 107.7342628479004, + "kl_loss_2": 1234.4693603515625, + "kl_loss_3": 866.5982177734375, + "kl_loss_7": 224.09054641723634, + "learning_rate": 0.0007663038622873999, + "loss": 600.4109, + "step": 3280 + }, + { + "ce_loss_10": 3.6624753713607787, + "ce_loss_13": 3.5959082007408143, + "ce_loss_2": 4.186812722682953, + "ce_loss_3": 4.007313239574432, + "ce_loss_7": 3.7183284163475037, + "epoch": 0.329, + "grad_norm": 416.0, + "kl_loss_10": 107.99775848388671, + "kl_loss_2": 1235.7617919921875, + "kl_loss_3": 865.4350341796875, + "kl_loss_7": 219.93520736694336, + "learning_rate": 0.0007649596321166025, + "loss": 596.3813, + "step": 3290 + }, + { + "ce_loss_10": 3.5629011154174806, + "ce_loss_13": 3.500445473194122, + "ce_loss_2": 4.090505909919739, + "ce_loss_3": 3.9089764833450316, + "ce_loss_7": 3.619310712814331, + "epoch": 0.33, + "grad_norm": 448.0, + "kl_loss_10": 101.5875473022461, + "kl_loss_2": 1220.246160888672, + "kl_loss_3": 856.5614715576172, + "kl_loss_7": 215.10712509155275, + "learning_rate": 0.0007636127338052513, + "loss": 603.8148, + "step": 3300 + }, + { + "ce_loss_10": 3.670552396774292, + "ce_loss_13": 3.6016101121902464, + "ce_loss_2": 4.213171231746673, + "ce_loss_3": 4.018688130378723, + "ce_loss_7": 3.727590525150299, + "epoch": 0.331, + "grad_norm": 374.0, + "kl_loss_10": 108.33710594177246, + "kl_loss_2": 1257.856024169922, + "kl_loss_3": 874.112905883789, + "kl_loss_7": 224.637939453125, + "learning_rate": 0.0007622631809165971, + "loss": 604.7203, + "step": 3310 + }, + { + "ce_loss_10": 3.671126115322113, + "ce_loss_13": 3.6092859148979186, + "ce_loss_2": 4.177222061157226, + "ce_loss_3": 3.9993849992752075, + "ce_loss_7": 3.722568082809448, + "epoch": 0.332, + "grad_norm": 414.0, + "kl_loss_10": 101.74094352722167, + "kl_loss_2": 1180.6327026367187, + "kl_loss_3": 821.7367553710938, + "kl_loss_7": 208.3566993713379, + "learning_rate": 0.000760910987040623, + "loss": 588.4586, + "step": 3320 + }, + { + "ce_loss_10": 3.64985990524292, + "ce_loss_13": 3.585498571395874, + "ce_loss_2": 4.191103303432465, + "ce_loss_3": 4.004770576953888, + "ce_loss_7": 3.7059614300727843, + "epoch": 0.333, + "grad_norm": 346.0, + "kl_loss_10": 102.83302307128906, + "kl_loss_2": 1259.7546875, + "kl_loss_3": 881.3513031005859, + "kl_loss_7": 217.63404388427733, + "learning_rate": 0.000759556165793906, + "loss": 599.8207, + "step": 3330 + }, + { + "ce_loss_10": 3.676869213581085, + "ce_loss_13": 3.610471022129059, + "ce_loss_2": 4.2084539294242855, + "ce_loss_3": 4.019213974475861, + "ce_loss_7": 3.7275768160820006, + "epoch": 0.334, + "grad_norm": 502.0, + "kl_loss_10": 104.88800392150878, + "kl_loss_2": 1223.232958984375, + "kl_loss_3": 852.1926971435547, + "kl_loss_7": 215.24551544189453, + "learning_rate": 0.000758198730819481, + "loss": 604.6092, + "step": 3340 + }, + { + "ce_loss_10": 3.616540086269379, + "ce_loss_13": 3.553786301612854, + "ce_loss_2": 4.152189195156097, + "ce_loss_3": 3.9668321132659914, + "ce_loss_7": 3.6709399580955506, + "epoch": 0.335, + "grad_norm": 488.0, + "kl_loss_10": 102.31456336975097, + "kl_loss_2": 1251.2591918945313, + "kl_loss_3": 875.474462890625, + "kl_loss_7": 214.77994079589843, + "learning_rate": 0.0007568386957867032, + "loss": 608.125, + "step": 3350 + }, + { + "ce_loss_10": 3.695429575443268, + "ce_loss_13": 3.6276296377182007, + "ce_loss_2": 4.209121763706207, + "ce_loss_3": 4.032123720645904, + "ce_loss_7": 3.749704658985138, + "epoch": 0.336, + "grad_norm": 664.0, + "kl_loss_10": 107.0846736907959, + "kl_loss_2": 1209.7884765625, + "kl_loss_3": 853.7374877929688, + "kl_loss_7": 220.54676055908203, + "learning_rate": 0.0007554760743911103, + "loss": 605.0996, + "step": 3360 + }, + { + "ce_loss_10": 3.5890319466590883, + "ce_loss_13": 3.5283274173736574, + "ce_loss_2": 4.114323127269745, + "ce_loss_3": 3.932225775718689, + "ce_loss_7": 3.644662916660309, + "epoch": 0.337, + "grad_norm": 398.0, + "kl_loss_10": 101.10566368103028, + "kl_loss_2": 1236.1671508789063, + "kl_loss_3": 865.7673828125, + "kl_loss_7": 212.85166015625, + "learning_rate": 0.0007541108803542846, + "loss": 613.867, + "step": 3370 + }, + { + "ce_loss_10": 3.6427289605140687, + "ce_loss_13": 3.576077425479889, + "ce_loss_2": 4.166507577896118, + "ce_loss_3": 3.9814778923988343, + "ce_loss_7": 3.6960788011550902, + "epoch": 0.338, + "grad_norm": 420.0, + "kl_loss_10": 106.68134155273438, + "kl_loss_2": 1229.0040222167968, + "kl_loss_3": 856.9913909912109, + "kl_loss_7": 213.85500411987306, + "learning_rate": 0.0007527431274237149, + "loss": 624.6923, + "step": 3380 + }, + { + "ce_loss_10": 3.611558997631073, + "ce_loss_13": 3.549490749835968, + "ce_loss_2": 4.114035534858703, + "ce_loss_3": 3.942946660518646, + "ce_loss_7": 3.662776732444763, + "epoch": 0.339, + "grad_norm": 406.0, + "kl_loss_10": 102.27137718200683, + "kl_loss_2": 1206.6684020996095, + "kl_loss_3": 846.7297576904297, + "kl_loss_7": 210.38721313476563, + "learning_rate": 0.0007513728293726579, + "loss": 594.8909, + "step": 3390 + }, + { + "ce_loss_10": 3.737028419971466, + "ce_loss_13": 3.669820773601532, + "ce_loss_2": 4.24596471786499, + "ce_loss_3": 4.065989923477173, + "ce_loss_7": 3.7901018500328063, + "epoch": 0.34, + "grad_norm": 456.0, + "kl_loss_10": 106.7515941619873, + "kl_loss_2": 1213.6457214355469, + "kl_loss_3": 848.0824188232422, + "kl_loss_7": 217.41063537597657, + "learning_rate": 0.00075, + "loss": 593.8513, + "step": 3400 + }, + { + "ce_loss_10": 3.719330894947052, + "ce_loss_13": 3.6538206934928894, + "ce_loss_2": 4.25202556848526, + "ce_loss_3": 4.069514441490173, + "ce_loss_7": 3.7754390835762024, + "epoch": 0.341, + "grad_norm": 442.0, + "kl_loss_10": 105.26911506652831, + "kl_loss_2": 1229.2578063964843, + "kl_loss_3": 857.8241027832031, + "kl_loss_7": 215.74853057861327, + "learning_rate": 0.0007486246531301177, + "loss": 595.3941, + "step": 3410 + }, + { + "ce_loss_10": 3.5200854897499085, + "ce_loss_13": 3.457200789451599, + "ce_loss_2": 4.057665538787842, + "ce_loss_3": 3.8753583312034605, + "ce_loss_7": 3.575985038280487, + "epoch": 0.342, + "grad_norm": 388.0, + "kl_loss_10": 101.49059600830078, + "kl_loss_2": 1229.5487548828125, + "kl_loss_3": 867.5537567138672, + "kl_loss_7": 212.1739074707031, + "learning_rate": 0.0007472468026127384, + "loss": 593.475, + "step": 3420 + }, + { + "ce_loss_10": 3.6591346502304076, + "ce_loss_13": 3.5927812099456786, + "ce_loss_2": 4.209147357940674, + "ce_loss_3": 4.019513976573944, + "ce_loss_7": 3.7172008395195006, + "epoch": 0.343, + "grad_norm": 442.0, + "kl_loss_10": 106.34202499389649, + "kl_loss_2": 1270.0667724609375, + "kl_loss_3": 890.6144561767578, + "kl_loss_7": 221.5020393371582, + "learning_rate": 0.000745866462322802, + "loss": 614.0497, + "step": 3430 + }, + { + "ce_loss_10": 3.647415816783905, + "ce_loss_13": 3.5850081205368043, + "ce_loss_2": 4.1631152629852295, + "ce_loss_3": 3.980070149898529, + "ce_loss_7": 3.7022210240364073, + "epoch": 0.344, + "grad_norm": 428.0, + "kl_loss_10": 103.86195526123046, + "kl_loss_2": 1198.3542846679688, + "kl_loss_3": 835.6711212158203, + "kl_loss_7": 208.45360870361327, + "learning_rate": 0.0007444836461603195, + "loss": 592.3941, + "step": 3440 + }, + { + "ce_loss_10": 3.7135616302490235, + "ce_loss_13": 3.6434731125831603, + "ce_loss_2": 4.233828973770142, + "ce_loss_3": 4.05616340637207, + "ce_loss_7": 3.762986993789673, + "epoch": 0.345, + "grad_norm": 548.0, + "kl_loss_10": 110.37765045166016, + "kl_loss_2": 1249.6877746582031, + "kl_loss_3": 880.3564361572265, + "kl_loss_7": 216.23881912231445, + "learning_rate": 0.0007430983680502344, + "loss": 610.9966, + "step": 3450 + }, + { + "ce_loss_10": 3.5541942715644836, + "ce_loss_13": 3.4891390204429626, + "ce_loss_2": 4.090934145450592, + "ce_loss_3": 3.908629584312439, + "ce_loss_7": 3.606754219532013, + "epoch": 0.346, + "grad_norm": 432.0, + "kl_loss_10": 110.62757797241211, + "kl_loss_2": 1245.3806091308593, + "kl_loss_3": 869.5422088623047, + "kl_loss_7": 211.6188102722168, + "learning_rate": 0.0007417106419422819, + "loss": 606.0509, + "step": 3460 + }, + { + "ce_loss_10": 3.6656701445579527, + "ce_loss_13": 3.596804141998291, + "ce_loss_2": 4.186310410499573, + "ce_loss_3": 4.003709590435028, + "ce_loss_7": 3.716957890987396, + "epoch": 0.347, + "grad_norm": 432.0, + "kl_loss_10": 110.30144805908203, + "kl_loss_2": 1208.0226745605469, + "kl_loss_3": 843.9369232177735, + "kl_loss_7": 210.9572967529297, + "learning_rate": 0.0007403204818108486, + "loss": 597.1857, + "step": 3470 + }, + { + "ce_loss_10": 3.6337965607643126, + "ce_loss_13": 3.5606253027915953, + "ce_loss_2": 4.153940236568451, + "ce_loss_3": 3.970261514186859, + "ce_loss_7": 3.680176484584808, + "epoch": 0.348, + "grad_norm": 380.0, + "kl_loss_10": 122.88734741210938, + "kl_loss_2": 1235.673895263672, + "kl_loss_3": 863.5119903564453, + "kl_loss_7": 214.55614318847657, + "learning_rate": 0.0007389279016548316, + "loss": 589.7067, + "step": 3480 + }, + { + "ce_loss_10": 3.6385215759277343, + "ce_loss_13": 3.5720754146575926, + "ce_loss_2": 4.187943410873413, + "ce_loss_3": 3.9984039187431337, + "ce_loss_7": 3.692442834377289, + "epoch": 0.349, + "grad_norm": 540.0, + "kl_loss_10": 110.95368614196778, + "kl_loss_2": 1266.4402160644531, + "kl_loss_3": 881.5294525146485, + "kl_loss_7": 217.94278945922852, + "learning_rate": 0.0007375329154974975, + "loss": 613.9418, + "step": 3490 + }, + { + "ce_loss_10": 3.5970895290374756, + "ce_loss_13": 3.5335337281227113, + "ce_loss_2": 4.117660129070282, + "ce_loss_3": 3.938844621181488, + "ce_loss_7": 3.6496007084846496, + "epoch": 0.35, + "grad_norm": 364.0, + "kl_loss_10": 106.09449501037598, + "kl_loss_2": 1217.6699768066405, + "kl_loss_3": 855.84267578125, + "kl_loss_7": 211.2824508666992, + "learning_rate": 0.0007361355373863414, + "loss": 604.2842, + "step": 3500 + }, + { + "ce_loss_10": 3.6508504867553713, + "ce_loss_13": 3.5859110236167906, + "ce_loss_2": 4.1644844770431515, + "ce_loss_3": 3.989104926586151, + "ce_loss_7": 3.7059740304946898, + "epoch": 0.351, + "grad_norm": 420.0, + "kl_loss_10": 105.65600318908692, + "kl_loss_2": 1192.6789306640626, + "kl_loss_3": 837.2236511230469, + "kl_loss_7": 210.62101364135742, + "learning_rate": 0.0007347357813929454, + "loss": 605.2478, + "step": 3510 + }, + { + "ce_loss_10": 3.5983325362205507, + "ce_loss_13": 3.5318838000297545, + "ce_loss_2": 4.108148908615112, + "ce_loss_3": 3.935304307937622, + "ce_loss_7": 3.6479654192924498, + "epoch": 0.352, + "grad_norm": 500.0, + "kl_loss_10": 106.45629920959473, + "kl_loss_2": 1190.6948181152343, + "kl_loss_3": 837.8225341796875, + "kl_loss_7": 210.1330581665039, + "learning_rate": 0.0007333336616128369, + "loss": 599.2653, + "step": 3520 + }, + { + "ce_loss_10": 3.570793068408966, + "ce_loss_13": 3.507152056694031, + "ce_loss_2": 4.106606543064117, + "ce_loss_3": 3.9213356494903566, + "ce_loss_7": 3.624741232395172, + "epoch": 0.353, + "grad_norm": 468.0, + "kl_loss_10": 102.9274845123291, + "kl_loss_2": 1231.522442626953, + "kl_loss_3": 864.720751953125, + "kl_loss_7": 214.17628860473633, + "learning_rate": 0.0007319291921653463, + "loss": 605.1452, + "step": 3530 + }, + { + "ce_loss_10": 3.6573350191116334, + "ce_loss_13": 3.591005003452301, + "ce_loss_2": 4.190282225608826, + "ce_loss_3": 4.010705304145813, + "ce_loss_7": 3.713829779624939, + "epoch": 0.354, + "grad_norm": 480.0, + "kl_loss_10": 105.38732643127442, + "kl_loss_2": 1246.1359802246093, + "kl_loss_3": 875.5277282714844, + "kl_loss_7": 217.63313064575195, + "learning_rate": 0.0007305223871934656, + "loss": 597.4614, + "step": 3540 + }, + { + "ce_loss_10": 3.6225136160850524, + "ce_loss_13": 3.556077516078949, + "ce_loss_2": 4.138617634773254, + "ce_loss_3": 3.9633963227272035, + "ce_loss_7": 3.678558957576752, + "epoch": 0.355, + "grad_norm": 502.0, + "kl_loss_10": 104.04609298706055, + "kl_loss_2": 1205.1107055664063, + "kl_loss_3": 845.5688415527344, + "kl_loss_7": 210.7905143737793, + "learning_rate": 0.0007291132608637052, + "loss": 595.3202, + "step": 3550 + }, + { + "ce_loss_10": 3.585705029964447, + "ce_loss_13": 3.52364000082016, + "ce_loss_2": 4.140194058418274, + "ce_loss_3": 3.939319980144501, + "ce_loss_7": 3.637845540046692, + "epoch": 0.356, + "grad_norm": 612.0, + "kl_loss_10": 100.68717575073242, + "kl_loss_2": 1272.5315246582031, + "kl_loss_3": 866.628369140625, + "kl_loss_7": 206.60951766967773, + "learning_rate": 0.0007277018273659516, + "loss": 612.2947, + "step": 3560 + }, + { + "ce_loss_10": 3.708829402923584, + "ce_loss_13": 3.6439966320991517, + "ce_loss_2": 4.2357800006866455, + "ce_loss_3": 4.058422148227692, + "ce_loss_7": 3.7655990600585936, + "epoch": 0.357, + "grad_norm": 400.0, + "kl_loss_10": 105.25033149719238, + "kl_loss_2": 1234.6828186035157, + "kl_loss_3": 864.7261169433593, + "kl_loss_7": 215.20211639404297, + "learning_rate": 0.0007262881009133242, + "loss": 605.0631, + "step": 3570 + }, + { + "ce_loss_10": 3.6265846729278564, + "ce_loss_13": 3.5641749501228333, + "ce_loss_2": 4.144611585140228, + "ce_loss_3": 3.9691020011901856, + "ce_loss_7": 3.6797274351119995, + "epoch": 0.358, + "grad_norm": 422.0, + "kl_loss_10": 101.45686912536621, + "kl_loss_2": 1216.0844970703124, + "kl_loss_3": 849.7874267578125, + "kl_loss_7": 208.09806137084962, + "learning_rate": 0.0007248720957420329, + "loss": 589.5256, + "step": 3580 + }, + { + "ce_loss_10": 3.6416075587272645, + "ce_loss_13": 3.5768683552742004, + "ce_loss_2": 4.156981098651886, + "ce_loss_3": 3.9762784600257874, + "ce_loss_7": 3.690297317504883, + "epoch": 0.359, + "grad_norm": 374.0, + "kl_loss_10": 104.18233222961426, + "kl_loss_2": 1196.5406433105468, + "kl_loss_3": 831.4658630371093, + "kl_loss_7": 209.4309959411621, + "learning_rate": 0.0007234538261112341, + "loss": 608.9998, + "step": 3590 + }, + { + "ce_loss_10": 3.6725340247154237, + "ce_loss_13": 3.6092687249183655, + "ce_loss_2": 4.202276730537415, + "ce_loss_3": 4.014237463474274, + "ce_loss_7": 3.7282424688339235, + "epoch": 0.36, + "grad_norm": 400.0, + "kl_loss_10": 101.90313911437988, + "kl_loss_2": 1228.7942749023437, + "kl_loss_3": 851.1504791259765, + "kl_loss_7": 214.15290603637695, + "learning_rate": 0.0007220333063028871, + "loss": 593.6457, + "step": 3600 + }, + { + "ce_loss_10": 3.7029056310653687, + "ce_loss_13": 3.6388812899589538, + "ce_loss_2": 4.263094091415406, + "ce_loss_3": 4.055423867702484, + "ce_loss_7": 3.7583480000495912, + "epoch": 0.361, + "grad_norm": 406.0, + "kl_loss_10": 103.6033935546875, + "kl_loss_2": 1316.5648254394532, + "kl_loss_3": 896.4495971679687, + "kl_loss_7": 217.90971908569335, + "learning_rate": 0.0007206105506216106, + "loss": 621.4246, + "step": 3610 + }, + { + "ce_loss_10": 3.582909846305847, + "ce_loss_13": 3.5207375407218935, + "ce_loss_2": 4.105194330215454, + "ce_loss_3": 3.92072172164917, + "ce_loss_7": 3.6367709159851076, + "epoch": 0.362, + "grad_norm": 488.0, + "kl_loss_10": 100.51245307922363, + "kl_loss_2": 1208.4382385253907, + "kl_loss_3": 842.719369506836, + "kl_loss_7": 209.43429107666014, + "learning_rate": 0.0007191855733945387, + "loss": 586.8207, + "step": 3620 + }, + { + "ce_loss_10": 3.6772588729858398, + "ce_loss_13": 3.611865592002869, + "ce_loss_2": 4.192759323120117, + "ce_loss_3": 4.0132176041603085, + "ce_loss_7": 3.7312068581581115, + "epoch": 0.363, + "grad_norm": 482.0, + "kl_loss_10": 103.05736274719239, + "kl_loss_2": 1206.339794921875, + "kl_loss_3": 840.5841491699218, + "kl_loss_7": 209.33160095214845, + "learning_rate": 0.0007177583889711762, + "loss": 590.5756, + "step": 3630 + }, + { + "ce_loss_10": 3.5943727612495424, + "ce_loss_13": 3.5278201699256897, + "ce_loss_2": 4.115126085281372, + "ce_loss_3": 3.9359707951545717, + "ce_loss_7": 3.64764518737793, + "epoch": 0.364, + "grad_norm": 474.0, + "kl_loss_10": 104.63778533935547, + "kl_loss_2": 1232.7115539550782, + "kl_loss_3": 867.7350891113281, + "kl_loss_7": 215.38798904418945, + "learning_rate": 0.0007163290117232541, + "loss": 602.1762, + "step": 3640 + }, + { + "ce_loss_10": 3.719394052028656, + "ce_loss_13": 3.6543713212013245, + "ce_loss_2": 4.207157838344574, + "ce_loss_3": 4.033388280868531, + "ce_loss_7": 3.766360378265381, + "epoch": 0.365, + "grad_norm": 516.0, + "kl_loss_10": 106.55956001281739, + "kl_loss_2": 1177.5490844726562, + "kl_loss_3": 820.275503540039, + "kl_loss_7": 210.7781494140625, + "learning_rate": 0.0007148974560445859, + "loss": 585.3312, + "step": 3650 + }, + { + "ce_loss_10": 3.63283451795578, + "ce_loss_13": 3.569260811805725, + "ce_loss_2": 4.140059876441955, + "ce_loss_3": 3.9612114429473877, + "ce_loss_7": 3.68426718711853, + "epoch": 0.366, + "grad_norm": 446.0, + "kl_loss_10": 101.39652633666992, + "kl_loss_2": 1181.2005432128906, + "kl_loss_3": 826.3975830078125, + "kl_loss_7": 208.74162216186522, + "learning_rate": 0.0007134637363509209, + "loss": 580.396, + "step": 3660 + }, + { + "ce_loss_10": 3.740837073326111, + "ce_loss_13": 3.676628518104553, + "ce_loss_2": 4.238210546970367, + "ce_loss_3": 4.064305305480957, + "ce_loss_7": 3.7917707443237303, + "epoch": 0.367, + "grad_norm": 374.0, + "kl_loss_10": 102.68134994506836, + "kl_loss_2": 1165.9671203613282, + "kl_loss_3": 815.8925506591797, + "kl_loss_7": 205.73183975219726, + "learning_rate": 0.0007120278670798009, + "loss": 586.6874, + "step": 3670 + }, + { + "ce_loss_10": 3.530075693130493, + "ce_loss_13": 3.467638063430786, + "ce_loss_2": 4.08873633146286, + "ce_loss_3": 3.8983967661857606, + "ce_loss_7": 3.590684974193573, + "epoch": 0.368, + "grad_norm": 504.0, + "kl_loss_10": 102.20494270324707, + "kl_loss_2": 1276.5897247314454, + "kl_loss_3": 894.699105834961, + "kl_loss_7": 217.834383392334, + "learning_rate": 0.0007105898626904133, + "loss": 620.3519, + "step": 3680 + }, + { + "ce_loss_10": 3.6397287964820864, + "ce_loss_13": 3.576084387302399, + "ce_loss_2": 4.165349864959717, + "ce_loss_3": 3.9844519972801207, + "ce_loss_7": 3.6932525277137755, + "epoch": 0.369, + "grad_norm": 548.0, + "kl_loss_10": 103.31561088562012, + "kl_loss_2": 1214.6401062011719, + "kl_loss_3": 850.1350677490234, + "kl_loss_7": 211.8514373779297, + "learning_rate": 0.0007091497376634463, + "loss": 587.3888, + "step": 3690 + }, + { + "ce_loss_10": 3.580397891998291, + "ce_loss_13": 3.518483591079712, + "ce_loss_2": 4.098948669433594, + "ce_loss_3": 3.9198103308677674, + "ce_loss_7": 3.633430314064026, + "epoch": 0.37, + "grad_norm": 462.0, + "kl_loss_10": 102.7860034942627, + "kl_loss_2": 1196.8778686523438, + "kl_loss_3": 839.7853210449218, + "kl_loss_7": 210.37151184082032, + "learning_rate": 0.0007077075065009433, + "loss": 599.0922, + "step": 3700 + }, + { + "ce_loss_10": 3.6922479033470155, + "ce_loss_13": 3.6247249126434324, + "ce_loss_2": 4.215528225898742, + "ce_loss_3": 4.034583401679993, + "ce_loss_7": 3.7439934253692626, + "epoch": 0.371, + "grad_norm": 436.0, + "kl_loss_10": 107.0543056488037, + "kl_loss_2": 1234.6434143066406, + "kl_loss_3": 869.9170135498047, + "kl_loss_7": 215.78035430908204, + "learning_rate": 0.0007062631837261557, + "loss": 601.1125, + "step": 3710 + }, + { + "ce_loss_10": 3.558840346336365, + "ce_loss_13": 3.4976505637168884, + "ce_loss_2": 4.082807242870331, + "ce_loss_3": 3.90502552986145, + "ce_loss_7": 3.611116898059845, + "epoch": 0.372, + "grad_norm": 418.0, + "kl_loss_10": 102.55169563293457, + "kl_loss_2": 1217.97548828125, + "kl_loss_3": 855.1094757080078, + "kl_loss_7": 209.0750946044922, + "learning_rate": 0.0007048167838833977, + "loss": 602.8635, + "step": 3720 + }, + { + "ce_loss_10": 3.6581831574440002, + "ce_loss_13": 3.593174624443054, + "ce_loss_2": 4.162305021286011, + "ce_loss_3": 3.9847410321235657, + "ce_loss_7": 3.7109787225723267, + "epoch": 0.373, + "grad_norm": 536.0, + "kl_loss_10": 103.06450958251953, + "kl_loss_2": 1197.146795654297, + "kl_loss_3": 834.3573669433594, + "kl_loss_7": 209.46187515258788, + "learning_rate": 0.0007033683215379002, + "loss": 588.3938, + "step": 3730 + }, + { + "ce_loss_10": 3.6515901923179626, + "ce_loss_13": 3.586732280254364, + "ce_loss_2": 4.166427576541901, + "ce_loss_3": 3.9861610412597654, + "ce_loss_7": 3.703124833106995, + "epoch": 0.374, + "grad_norm": 384.0, + "kl_loss_10": 101.91668891906738, + "kl_loss_2": 1196.090036010742, + "kl_loss_3": 834.7775848388671, + "kl_loss_7": 206.9270217895508, + "learning_rate": 0.0007019178112756625, + "loss": 596.7028, + "step": 3740 + }, + { + "ce_loss_10": 3.5998276591300966, + "ce_loss_13": 3.539226603507996, + "ce_loss_2": 4.120503497123718, + "ce_loss_3": 3.938064229488373, + "ce_loss_7": 3.6514668703079223, + "epoch": 0.375, + "grad_norm": 484.0, + "kl_loss_10": 101.7071418762207, + "kl_loss_2": 1206.4351013183593, + "kl_loss_3": 842.5018493652344, + "kl_loss_7": 207.55127868652343, + "learning_rate": 0.0007004652677033068, + "loss": 596.7216, + "step": 3750 + }, + { + "ce_loss_10": 3.6823023438453673, + "ce_loss_13": 3.6218234419822695, + "ce_loss_2": 4.1750637769699095, + "ce_loss_3": 4.004729413986206, + "ce_loss_7": 3.732503056526184, + "epoch": 0.376, + "grad_norm": 388.0, + "kl_loss_10": 99.9868221282959, + "kl_loss_2": 1168.4398498535156, + "kl_loss_3": 816.7180572509766, + "kl_loss_7": 201.70328750610352, + "learning_rate": 0.0006990107054479312, + "loss": 584.5167, + "step": 3760 + }, + { + "ce_loss_10": 3.667929840087891, + "ce_loss_13": 3.6051357984542847, + "ce_loss_2": 4.166206574440002, + "ce_loss_3": 3.9985297203063963, + "ce_loss_7": 3.719240057468414, + "epoch": 0.377, + "grad_norm": 496.0, + "kl_loss_10": 102.5582088470459, + "kl_loss_2": 1182.1695739746094, + "kl_loss_3": 832.6118957519532, + "kl_loss_7": 206.43120498657225, + "learning_rate": 0.000697554139156961, + "loss": 586.6759, + "step": 3770 + }, + { + "ce_loss_10": 3.648312306404114, + "ce_loss_13": 3.5864667892456055, + "ce_loss_2": 4.165168154239654, + "ce_loss_3": 3.980992519855499, + "ce_loss_7": 3.703998303413391, + "epoch": 0.378, + "grad_norm": 532.0, + "kl_loss_10": 102.77268753051757, + "kl_loss_2": 1217.0308044433593, + "kl_loss_3": 845.4426635742187, + "kl_loss_7": 211.65556106567382, + "learning_rate": 0.0006960955834980027, + "loss": 586.9333, + "step": 3780 + }, + { + "ce_loss_10": 3.624769401550293, + "ce_loss_13": 3.559871160984039, + "ce_loss_2": 4.141481828689575, + "ce_loss_3": 3.9655247926712036, + "ce_loss_7": 3.681060993671417, + "epoch": 0.379, + "grad_norm": 402.0, + "kl_loss_10": 104.66882057189942, + "kl_loss_2": 1194.9725402832032, + "kl_loss_3": 840.5210388183593, + "kl_loss_7": 214.32746124267578, + "learning_rate": 0.0006946350531586958, + "loss": 591.0428, + "step": 3790 + }, + { + "ce_loss_10": 3.6484233260154726, + "ce_loss_13": 3.5856125354766846, + "ce_loss_2": 4.168078374862671, + "ce_loss_3": 3.984307587146759, + "ce_loss_7": 3.7046299457550047, + "epoch": 0.38, + "grad_norm": 494.0, + "kl_loss_10": 102.10320167541504, + "kl_loss_2": 1202.4750549316407, + "kl_loss_3": 836.4282287597656, + "kl_loss_7": 215.46153411865234, + "learning_rate": 0.0006931725628465643, + "loss": 600.8652, + "step": 3800 + }, + { + "ce_loss_10": 3.669872498512268, + "ce_loss_13": 3.606708490848541, + "ce_loss_2": 4.190654408931732, + "ce_loss_3": 4.012761104106903, + "ce_loss_7": 3.725092887878418, + "epoch": 0.381, + "grad_norm": 462.0, + "kl_loss_10": 105.94147644042968, + "kl_loss_2": 1198.5632446289062, + "kl_loss_3": 842.3563995361328, + "kl_loss_7": 216.23879013061523, + "learning_rate": 0.0006917081272888696, + "loss": 594.3836, + "step": 3810 + }, + { + "ce_loss_10": 3.5702871322631835, + "ce_loss_13": 3.503624665737152, + "ce_loss_2": 4.083241939544678, + "ce_loss_3": 3.9013825416564942, + "ce_loss_7": 3.6281121611595153, + "epoch": 0.382, + "grad_norm": 430.0, + "kl_loss_10": 104.559330368042, + "kl_loss_2": 1205.9051391601563, + "kl_loss_3": 846.9787689208985, + "kl_loss_7": 214.2649803161621, + "learning_rate": 0.0006902417612324615, + "loss": 588.9565, + "step": 3820 + }, + { + "ce_loss_10": 3.705217492580414, + "ce_loss_13": 3.6370500326156616, + "ce_loss_2": 4.2347581624984745, + "ce_loss_3": 4.056124079227447, + "ce_loss_7": 3.7589930057525636, + "epoch": 0.383, + "grad_norm": 418.0, + "kl_loss_10": 107.22665023803711, + "kl_loss_2": 1242.482080078125, + "kl_loss_3": 871.0590393066407, + "kl_loss_7": 218.71700134277344, + "learning_rate": 0.00068877347944363, + "loss": 600.3775, + "step": 3830 + }, + { + "ce_loss_10": 3.6945597529411316, + "ce_loss_13": 3.6302199006080627, + "ce_loss_2": 4.190360188484192, + "ce_loss_3": 4.017442071437836, + "ce_loss_7": 3.74516099691391, + "epoch": 0.384, + "grad_norm": 460.0, + "kl_loss_10": 105.2132453918457, + "kl_loss_2": 1180.0169799804687, + "kl_loss_3": 825.1839294433594, + "kl_loss_7": 210.17990188598634, + "learning_rate": 0.0006873032967079561, + "loss": 592.1172, + "step": 3840 + }, + { + "ce_loss_10": 3.6860820412635804, + "ce_loss_13": 3.622925412654877, + "ce_loss_2": 4.173858499526977, + "ce_loss_3": 4.0060118436813354, + "ce_loss_7": 3.7361050128936766, + "epoch": 0.385, + "grad_norm": 444.0, + "kl_loss_10": 102.31974792480469, + "kl_loss_2": 1169.402410888672, + "kl_loss_3": 819.6500732421875, + "kl_loss_7": 207.8970947265625, + "learning_rate": 0.0006858312278301637, + "loss": 578.5368, + "step": 3850 + }, + { + "ce_loss_10": 3.724821174144745, + "ce_loss_13": 3.6599106669425963, + "ce_loss_2": 4.216867661476135, + "ce_loss_3": 4.043015420436859, + "ce_loss_7": 3.7741833090782166, + "epoch": 0.386, + "grad_norm": 628.0, + "kl_loss_10": 105.45792541503906, + "kl_loss_2": 1182.8445251464843, + "kl_loss_3": 827.4248168945312, + "kl_loss_7": 208.66201171875, + "learning_rate": 0.0006843572876339704, + "loss": 581.9299, + "step": 3860 + }, + { + "ce_loss_10": 3.639630389213562, + "ce_loss_13": 3.578851103782654, + "ce_loss_2": 4.1167685151100155, + "ce_loss_3": 3.953296732902527, + "ce_loss_7": 3.6866363167762755, + "epoch": 0.387, + "grad_norm": 402.0, + "kl_loss_10": 101.30325736999512, + "kl_loss_2": 1144.7853637695312, + "kl_loss_3": 802.1904113769531, + "kl_loss_7": 201.72076492309571, + "learning_rate": 0.0006828814909619373, + "loss": 586.7184, + "step": 3870 + }, + { + "ce_loss_10": 3.7647191643714906, + "ce_loss_13": 3.697379672527313, + "ce_loss_2": 4.260519480705261, + "ce_loss_3": 4.083593368530273, + "ce_loss_7": 3.813764202594757, + "epoch": 0.388, + "grad_norm": 350.0, + "kl_loss_10": 106.36605720520019, + "kl_loss_2": 1172.6269104003907, + "kl_loss_3": 820.4572174072266, + "kl_loss_7": 210.88503875732422, + "learning_rate": 0.0006814038526753205, + "loss": 576.9886, + "step": 3880 + }, + { + "ce_loss_10": 3.6557364583015444, + "ce_loss_13": 3.5924967169761657, + "ce_loss_2": 4.160025131702423, + "ce_loss_3": 3.984356963634491, + "ce_loss_7": 3.7067020535469055, + "epoch": 0.389, + "grad_norm": 330.0, + "kl_loss_10": 102.68659782409668, + "kl_loss_2": 1186.152001953125, + "kl_loss_3": 826.8501800537109, + "kl_loss_7": 206.71521759033203, + "learning_rate": 0.0006799243876539213, + "loss": 580.4666, + "step": 3890 + }, + { + "ce_loss_10": 3.5759631991386414, + "ce_loss_13": 3.5127877712249758, + "ce_loss_2": 4.105723321437836, + "ce_loss_3": 3.9167493343353272, + "ce_loss_7": 3.6288220643997193, + "epoch": 0.39, + "grad_norm": 536.0, + "kl_loss_10": 103.75163269042969, + "kl_loss_2": 1215.1460266113281, + "kl_loss_3": 839.8725982666016, + "kl_loss_7": 208.5065475463867, + "learning_rate": 0.0006784431107959359, + "loss": 592.4442, + "step": 3900 + }, + { + "ce_loss_10": 3.639443838596344, + "ce_loss_13": 3.5752380013465883, + "ce_loss_2": 4.170507109165191, + "ce_loss_3": 3.9816882967948914, + "ce_loss_7": 3.694754195213318, + "epoch": 0.391, + "grad_norm": 510.0, + "kl_loss_10": 103.07575302124023, + "kl_loss_2": 1237.5377136230468, + "kl_loss_3": 858.0287719726563, + "kl_loss_7": 214.26128845214845, + "learning_rate": 0.0006769600370178059, + "loss": 594.2272, + "step": 3910 + }, + { + "ce_loss_10": 3.607291209697723, + "ce_loss_13": 3.5426042318344115, + "ce_loss_2": 4.134967279434204, + "ce_loss_3": 3.9495469093322755, + "ce_loss_7": 3.6644778490066527, + "epoch": 0.392, + "grad_norm": 348.0, + "kl_loss_10": 100.81994514465332, + "kl_loss_2": 1201.7113891601562, + "kl_loss_3": 841.3645660400391, + "kl_loss_7": 207.30770874023438, + "learning_rate": 0.0006754751812540679, + "loss": 578.4809, + "step": 3920 + }, + { + "ce_loss_10": 3.6542662262916563, + "ce_loss_13": 3.5899597883224486, + "ce_loss_2": 4.172767472267151, + "ce_loss_3": 3.9909741401672365, + "ce_loss_7": 3.706152844429016, + "epoch": 0.393, + "grad_norm": 440.0, + "kl_loss_10": 104.03220100402832, + "kl_loss_2": 1209.6233947753906, + "kl_loss_3": 843.8147003173829, + "kl_loss_7": 210.7646583557129, + "learning_rate": 0.0006739885584572025, + "loss": 592.3653, + "step": 3930 + }, + { + "ce_loss_10": 3.685343015193939, + "ce_loss_13": 3.619848680496216, + "ce_loss_2": 4.199707639217377, + "ce_loss_3": 4.017499768733979, + "ce_loss_7": 3.734171211719513, + "epoch": 0.394, + "grad_norm": 564.0, + "kl_loss_10": 107.80731964111328, + "kl_loss_2": 1232.0240844726563, + "kl_loss_3": 850.9272064208984, + "kl_loss_7": 211.88618087768555, + "learning_rate": 0.0006725001835974853, + "loss": 590.3288, + "step": 3940 + }, + { + "ce_loss_10": 3.671092712879181, + "ce_loss_13": 3.6061443567276, + "ce_loss_2": 4.189756679534912, + "ce_loss_3": 4.005955624580383, + "ce_loss_7": 3.7217952370643617, + "epoch": 0.395, + "grad_norm": 472.0, + "kl_loss_10": 105.94960823059083, + "kl_loss_2": 1209.6172180175781, + "kl_loss_3": 848.8837646484375, + "kl_loss_7": 211.4744026184082, + "learning_rate": 0.0006710100716628344, + "loss": 581.9217, + "step": 3950 + }, + { + "ce_loss_10": 3.6513510942459106, + "ce_loss_13": 3.586063766479492, + "ce_loss_2": 4.175520932674408, + "ce_loss_3": 3.992800068855286, + "ce_loss_7": 3.7037784814834596, + "epoch": 0.396, + "grad_norm": 556.0, + "kl_loss_10": 102.45261993408204, + "kl_loss_2": 1202.025439453125, + "kl_loss_3": 843.4705932617187, + "kl_loss_7": 207.75647506713867, + "learning_rate": 0.0006695182376586602, + "loss": 594.7452, + "step": 3960 + }, + { + "ce_loss_10": 3.6946488857269286, + "ce_loss_13": 3.6310433030128477, + "ce_loss_2": 4.180384719371796, + "ce_loss_3": 4.00883582830429, + "ce_loss_7": 3.739116144180298, + "epoch": 0.397, + "grad_norm": 484.0, + "kl_loss_10": 100.45674743652344, + "kl_loss_2": 1141.924838256836, + "kl_loss_3": 795.2099151611328, + "kl_loss_7": 201.57386474609376, + "learning_rate": 0.000668024696607715, + "loss": 581.8865, + "step": 3970 + }, + { + "ce_loss_10": 3.63701788187027, + "ce_loss_13": 3.5759130001068113, + "ce_loss_2": 4.141798782348633, + "ce_loss_3": 3.965423548221588, + "ce_loss_7": 3.691797506809235, + "epoch": 0.398, + "grad_norm": 402.0, + "kl_loss_10": 99.83709602355957, + "kl_loss_2": 1189.6253723144532, + "kl_loss_3": 836.8567596435547, + "kl_loss_7": 210.05224533081054, + "learning_rate": 0.0006665294635499404, + "loss": 585.3059, + "step": 3980 + }, + { + "ce_loss_10": 3.645500433444977, + "ce_loss_13": 3.5827003002166746, + "ce_loss_2": 4.174324834346772, + "ce_loss_3": 3.992855429649353, + "ce_loss_7": 3.7015270590782166, + "epoch": 0.399, + "grad_norm": 438.0, + "kl_loss_10": 103.66120948791504, + "kl_loss_2": 1245.642510986328, + "kl_loss_3": 869.6440063476563, + "kl_loss_7": 216.26355361938477, + "learning_rate": 0.0006650325535423167, + "loss": 596.3225, + "step": 3990 + }, + { + "ce_loss_10": 3.6747123122215273, + "ce_loss_13": 3.6138512253761292, + "ce_loss_2": 4.168187916278839, + "ce_loss_3": 3.993897998332977, + "ce_loss_7": 3.725596582889557, + "epoch": 0.4, + "grad_norm": 520.0, + "kl_loss_10": 96.3211498260498, + "kl_loss_2": 1152.9211303710938, + "kl_loss_3": 801.8546081542969, + "kl_loss_7": 200.72928695678712, + "learning_rate": 0.0006635339816587109, + "loss": 575.9933, + "step": 4000 + }, + { + "ce_loss_10": 3.6128929018974305, + "ce_loss_13": 3.548132801055908, + "ce_loss_2": 4.128501725196839, + "ce_loss_3": 3.945591115951538, + "ce_loss_7": 3.6652005195617674, + "epoch": 0.401, + "grad_norm": 430.0, + "kl_loss_10": 103.19527244567871, + "kl_loss_2": 1214.8156677246093, + "kl_loss_3": 840.3229400634766, + "kl_loss_7": 210.74479904174805, + "learning_rate": 0.0006620337629897252, + "loss": 583.2822, + "step": 4010 + }, + { + "ce_loss_10": 3.619123613834381, + "ce_loss_13": 3.5573631048202516, + "ce_loss_2": 4.140160727500915, + "ce_loss_3": 3.958257591724396, + "ce_loss_7": 3.674074041843414, + "epoch": 0.402, + "grad_norm": 432.0, + "kl_loss_10": 100.38173408508301, + "kl_loss_2": 1207.5167907714845, + "kl_loss_3": 837.2485626220703, + "kl_loss_7": 208.48973083496094, + "learning_rate": 0.0006605319126425454, + "loss": 597.1898, + "step": 4020 + }, + { + "ce_loss_10": 3.5208260893821715, + "ce_loss_13": 3.4589377880096435, + "ce_loss_2": 4.050716698169708, + "ce_loss_3": 3.8632638931274412, + "ce_loss_7": 3.5759450912475588, + "epoch": 0.403, + "grad_norm": 420.0, + "kl_loss_10": 100.48741989135742, + "kl_loss_2": 1233.5194946289062, + "kl_loss_3": 854.4578369140625, + "kl_loss_7": 208.70274200439454, + "learning_rate": 0.0006590284457407876, + "loss": 593.5098, + "step": 4030 + }, + { + "ce_loss_10": 3.6270558714866636, + "ce_loss_13": 3.5626144886016844, + "ce_loss_2": 4.136511921882629, + "ce_loss_3": 3.957785797119141, + "ce_loss_7": 3.6768479347229004, + "epoch": 0.404, + "grad_norm": 392.0, + "kl_loss_10": 101.69999923706055, + "kl_loss_2": 1185.4601745605469, + "kl_loss_3": 821.0296905517578, + "kl_loss_7": 206.82139434814454, + "learning_rate": 0.0006575233774243465, + "loss": 582.2525, + "step": 4040 + }, + { + "ce_loss_10": 3.612906110286713, + "ce_loss_13": 3.550376224517822, + "ce_loss_2": 4.1283538222312925, + "ce_loss_3": 3.951547086238861, + "ce_loss_7": 3.667691433429718, + "epoch": 0.405, + "grad_norm": 464.0, + "kl_loss_10": 100.57203559875488, + "kl_loss_2": 1203.0161071777343, + "kl_loss_3": 838.8151794433594, + "kl_loss_7": 210.55067977905273, + "learning_rate": 0.0006560167228492435, + "loss": 587.686, + "step": 4050 + }, + { + "ce_loss_10": 3.6582042455673216, + "ce_loss_13": 3.597072696685791, + "ce_loss_2": 4.15371550321579, + "ce_loss_3": 3.9819828867912292, + "ce_loss_7": 3.7127379179000854, + "epoch": 0.406, + "grad_norm": 396.0, + "kl_loss_10": 97.44431228637696, + "kl_loss_2": 1157.4290466308594, + "kl_loss_3": 807.0505889892578, + "kl_loss_7": 202.94429702758788, + "learning_rate": 0.0006545084971874737, + "loss": 580.7177, + "step": 4060 + }, + { + "ce_loss_10": 3.6273567199707033, + "ce_loss_13": 3.564158725738525, + "ce_loss_2": 4.158101809024811, + "ce_loss_3": 3.9733991026878357, + "ce_loss_7": 3.685515010356903, + "epoch": 0.407, + "grad_norm": 372.0, + "kl_loss_10": 103.08215293884277, + "kl_loss_2": 1230.8001892089844, + "kl_loss_3": 853.4359588623047, + "kl_loss_7": 216.80452346801758, + "learning_rate": 0.0006529987156268526, + "loss": 583.8351, + "step": 4070 + }, + { + "ce_loss_10": 3.5464280128479, + "ce_loss_13": 3.481638014316559, + "ce_loss_2": 4.076263022422791, + "ce_loss_3": 3.8974447727203367, + "ce_loss_7": 3.6043801426887514, + "epoch": 0.408, + "grad_norm": 350.0, + "kl_loss_10": 102.87330780029296, + "kl_loss_2": 1214.2586059570312, + "kl_loss_3": 851.9112091064453, + "kl_loss_7": 211.73340759277343, + "learning_rate": 0.0006514873933708637, + "loss": 602.7298, + "step": 4080 + }, + { + "ce_loss_10": 3.6543262004852295, + "ce_loss_13": 3.5908489346504213, + "ce_loss_2": 4.153554606437683, + "ce_loss_3": 3.9771866679191588, + "ce_loss_7": 3.703446090221405, + "epoch": 0.409, + "grad_norm": 378.0, + "kl_loss_10": 100.85495872497559, + "kl_loss_2": 1179.416357421875, + "kl_loss_3": 822.3047607421875, + "kl_loss_7": 207.08517990112304, + "learning_rate": 0.0006499745456385053, + "loss": 579.5981, + "step": 4090 + }, + { + "ce_loss_10": 3.622114622592926, + "ce_loss_13": 3.5604026079177857, + "ce_loss_2": 4.138943600654602, + "ce_loss_3": 3.9601905822753904, + "ce_loss_7": 3.6786248087882996, + "epoch": 0.41, + "grad_norm": 460.0, + "kl_loss_10": 101.49279441833497, + "kl_loss_2": 1187.613018798828, + "kl_loss_3": 832.265737915039, + "kl_loss_7": 211.90668182373048, + "learning_rate": 0.0006484601876641375, + "loss": 591.7443, + "step": 4100 + }, + { + "ce_loss_10": 3.6106685280799864, + "ce_loss_13": 3.5491909265518187, + "ce_loss_2": 4.104636693000794, + "ce_loss_3": 3.9329436659812926, + "ce_loss_7": 3.6641584396362306, + "epoch": 0.411, + "grad_norm": 378.0, + "kl_loss_10": 101.25703315734863, + "kl_loss_2": 1168.0580017089844, + "kl_loss_3": 813.8080810546875, + "kl_loss_7": 212.12922592163085, + "learning_rate": 0.000646944334697328, + "loss": 577.3537, + "step": 4110 + }, + { + "ce_loss_10": 3.7338776111602785, + "ce_loss_13": 3.665091943740845, + "ce_loss_2": 4.2223006844520565, + "ce_loss_3": 4.049113523960114, + "ce_loss_7": 3.799789845943451, + "epoch": 0.412, + "grad_norm": 450.0, + "kl_loss_10": 109.65744743347167, + "kl_loss_2": 1151.4740142822266, + "kl_loss_3": 801.2218536376953, + "kl_loss_7": 236.72526626586915, + "learning_rate": 0.0006454270020026995, + "loss": 574.9525, + "step": 4120 + }, + { + "ce_loss_10": 3.69082772731781, + "ce_loss_13": 3.6286051154136656, + "ce_loss_2": 4.175914537906647, + "ce_loss_3": 4.002845597267151, + "ce_loss_7": 3.7393308877944946, + "epoch": 0.413, + "grad_norm": 580.0, + "kl_loss_10": 104.95364952087402, + "kl_loss_2": 1127.3133270263672, + "kl_loss_3": 788.5207000732422, + "kl_loss_7": 214.98480072021485, + "learning_rate": 0.0006439082048597755, + "loss": 564.7141, + "step": 4130 + }, + { + "ce_loss_10": 3.683094894886017, + "ce_loss_13": 3.61643271446228, + "ce_loss_2": 4.181109619140625, + "ce_loss_3": 4.005432403087616, + "ce_loss_7": 3.745869052410126, + "epoch": 0.414, + "grad_norm": 520.0, + "kl_loss_10": 111.28029708862304, + "kl_loss_2": 1178.55703125, + "kl_loss_3": 823.4579254150391, + "kl_loss_7": 238.62436599731444, + "learning_rate": 0.0006423879585628261, + "loss": 585.353, + "step": 4140 + }, + { + "ce_loss_10": 3.648063910007477, + "ce_loss_13": 3.579416477680206, + "ce_loss_2": 4.166888773441315, + "ce_loss_3": 3.98115758895874, + "ce_loss_7": 3.7089965462684633, + "epoch": 0.415, + "grad_norm": 402.0, + "kl_loss_10": 109.57027854919434, + "kl_loss_2": 1214.0814270019532, + "kl_loss_3": 843.1505004882813, + "kl_loss_7": 233.17276763916016, + "learning_rate": 0.0006408662784207149, + "loss": 596.7986, + "step": 4150 + }, + { + "ce_loss_10": 3.596502733230591, + "ce_loss_13": 3.5327386379241945, + "ce_loss_2": 4.09819370508194, + "ce_loss_3": 3.9237332344055176, + "ce_loss_7": 3.654523158073425, + "epoch": 0.416, + "grad_norm": 544.0, + "kl_loss_10": 99.90503120422363, + "kl_loss_2": 1189.1891540527345, + "kl_loss_3": 823.6777069091797, + "kl_loss_7": 211.67333221435547, + "learning_rate": 0.0006393431797567439, + "loss": 583.1826, + "step": 4160 + }, + { + "ce_loss_10": 3.6853842735290527, + "ce_loss_13": 3.622405004501343, + "ce_loss_2": 4.1561102867126465, + "ce_loss_3": 3.9865566968917845, + "ce_loss_7": 3.7344152450561525, + "epoch": 0.417, + "grad_norm": 384.0, + "kl_loss_10": 103.1281753540039, + "kl_loss_2": 1144.869805908203, + "kl_loss_3": 800.3423767089844, + "kl_loss_7": 211.40862579345702, + "learning_rate": 0.0006378186779084996, + "loss": 557.4173, + "step": 4170 + }, + { + "ce_loss_10": 3.5140963315963747, + "ce_loss_13": 3.452511179447174, + "ce_loss_2": 4.041843056678772, + "ce_loss_3": 3.857197344303131, + "ce_loss_7": 3.571711480617523, + "epoch": 0.418, + "grad_norm": 464.0, + "kl_loss_10": 100.09027862548828, + "kl_loss_2": 1203.0338989257812, + "kl_loss_3": 838.9081939697265, + "kl_loss_7": 213.11346130371095, + "learning_rate": 0.0006362927882276989, + "loss": 588.2966, + "step": 4180 + }, + { + "ce_loss_10": 3.7188942313194273, + "ce_loss_13": 3.6518460750579833, + "ce_loss_2": 4.204531168937683, + "ce_loss_3": 4.025935411453247, + "ce_loss_7": 3.7728618144989015, + "epoch": 0.419, + "grad_norm": 426.0, + "kl_loss_10": 103.15027618408203, + "kl_loss_2": 1156.1428161621093, + "kl_loss_3": 794.2856292724609, + "kl_loss_7": 211.89537048339844, + "learning_rate": 0.000634765526080034, + "loss": 562.2326, + "step": 4190 + }, + { + "ce_loss_10": 3.717780148983002, + "ce_loss_13": 3.6511818051338194, + "ce_loss_2": 4.210239946842194, + "ce_loss_3": 4.0393988490104675, + "ce_loss_7": 3.7724336862564085, + "epoch": 0.42, + "grad_norm": 456.0, + "kl_loss_10": 104.51988563537597, + "kl_loss_2": 1161.7059631347656, + "kl_loss_3": 818.392855834961, + "kl_loss_7": 219.07965316772462, + "learning_rate": 0.0006332369068450174, + "loss": 570.1012, + "step": 4200 + }, + { + "ce_loss_10": 3.648071753978729, + "ce_loss_13": 3.5840353846549986, + "ce_loss_2": 4.147714996337891, + "ce_loss_3": 3.972030484676361, + "ce_loss_7": 3.7039226770401, + "epoch": 0.421, + "grad_norm": 426.0, + "kl_loss_10": 101.72255935668946, + "kl_loss_2": 1175.2358459472657, + "kl_loss_3": 821.6455657958984, + "kl_loss_7": 216.67398834228516, + "learning_rate": 0.0006317069459158283, + "loss": 576.074, + "step": 4210 + }, + { + "ce_loss_10": 3.766611933708191, + "ce_loss_13": 3.7019524574279785, + "ce_loss_2": 4.238518404960632, + "ce_loss_3": 4.070182096958161, + "ce_loss_7": 3.818829393386841, + "epoch": 0.422, + "grad_norm": 404.0, + "kl_loss_10": 102.42731742858886, + "kl_loss_2": 1134.2777221679687, + "kl_loss_3": 793.5420806884765, + "kl_loss_7": 214.86822509765625, + "learning_rate": 0.0006301756586991561, + "loss": 572.4437, + "step": 4220 + }, + { + "ce_loss_10": 3.538297724723816, + "ce_loss_13": 3.4769801259040833, + "ce_loss_2": 4.051598787307739, + "ce_loss_3": 3.8692006349563597, + "ce_loss_7": 3.592081093788147, + "epoch": 0.423, + "grad_norm": 524.0, + "kl_loss_10": 100.02308959960938, + "kl_loss_2": 1219.534228515625, + "kl_loss_3": 847.8958953857422, + "kl_loss_7": 217.3907485961914, + "learning_rate": 0.0006286430606150459, + "loss": 590.4341, + "step": 4230 + }, + { + "ce_loss_10": 3.732722854614258, + "ce_loss_13": 3.670178234577179, + "ce_loss_2": 4.228793060779571, + "ce_loss_3": 4.055911266803742, + "ce_loss_7": 3.7854557275772094, + "epoch": 0.424, + "grad_norm": 440.0, + "kl_loss_10": 101.63710746765136, + "kl_loss_2": 1171.4819213867188, + "kl_loss_3": 815.24853515625, + "kl_loss_7": 212.84099502563475, + "learning_rate": 0.0006271091670967436, + "loss": 572.0026, + "step": 4240 + }, + { + "ce_loss_10": 3.64589341878891, + "ce_loss_13": 3.579445707798004, + "ce_loss_2": 4.168534743785858, + "ce_loss_3": 3.9873276472091677, + "ce_loss_7": 3.7041419625282286, + "epoch": 0.425, + "grad_norm": 436.0, + "kl_loss_10": 105.33321189880371, + "kl_loss_2": 1223.9686584472656, + "kl_loss_3": 856.7900268554688, + "kl_loss_7": 219.8565589904785, + "learning_rate": 0.0006255739935905395, + "loss": 587.2729, + "step": 4250 + }, + { + "ce_loss_10": 3.684093916416168, + "ce_loss_13": 3.622530627250671, + "ce_loss_2": 4.176068413257599, + "ce_loss_3": 4.005461478233338, + "ce_loss_7": 3.73612722158432, + "epoch": 0.426, + "grad_norm": 444.0, + "kl_loss_10": 101.16957168579101, + "kl_loss_2": 1151.114599609375, + "kl_loss_3": 804.5711151123047, + "kl_loss_7": 206.51019058227538, + "learning_rate": 0.0006240375555556145, + "loss": 584.5814, + "step": 4260 + }, + { + "ce_loss_10": 3.694865620136261, + "ce_loss_13": 3.6328345060348513, + "ce_loss_2": 4.216705179214477, + "ce_loss_3": 4.035941934585571, + "ce_loss_7": 3.7489806532859804, + "epoch": 0.427, + "grad_norm": 544.0, + "kl_loss_10": 102.23134536743164, + "kl_loss_2": 1200.0044555664062, + "kl_loss_3": 832.4086944580079, + "kl_loss_7": 208.58624954223632, + "learning_rate": 0.000622499868463882, + "loss": 581.1191, + "step": 4270 + }, + { + "ce_loss_10": 3.6664886713027953, + "ce_loss_13": 3.6031296968460085, + "ce_loss_2": 4.138775157928467, + "ce_loss_3": 3.968552088737488, + "ce_loss_7": 3.716127264499664, + "epoch": 0.428, + "grad_norm": 442.0, + "kl_loss_10": 102.83601112365723, + "kl_loss_2": 1148.9752075195313, + "kl_loss_3": 798.4193389892578, + "kl_loss_7": 204.8626609802246, + "learning_rate": 0.0006209609477998338, + "loss": 570.8694, + "step": 4280 + }, + { + "ce_loss_10": 3.7170133352279664, + "ce_loss_13": 3.6512863278388976, + "ce_loss_2": 4.214985513687134, + "ce_loss_3": 4.041373360157013, + "ce_loss_7": 3.76862713098526, + "epoch": 0.429, + "grad_norm": 492.0, + "kl_loss_10": 105.98460693359375, + "kl_loss_2": 1171.2547790527344, + "kl_loss_3": 819.7431121826172, + "kl_loss_7": 209.78300704956055, + "learning_rate": 0.0006194208090603844, + "loss": 582.6892, + "step": 4290 + }, + { + "ce_loss_10": 3.636822462081909, + "ce_loss_13": 3.572554814815521, + "ce_loss_2": 4.128273499011994, + "ce_loss_3": 3.9540862798690797, + "ce_loss_7": 3.6845338463783266, + "epoch": 0.43, + "grad_norm": 384.0, + "kl_loss_10": 104.19713554382324, + "kl_loss_2": 1158.2531616210938, + "kl_loss_3": 808.0290679931641, + "kl_loss_7": 201.06265716552736, + "learning_rate": 0.0006178794677547138, + "loss": 566.7275, + "step": 4300 + }, + { + "ce_loss_10": 3.669668412208557, + "ce_loss_13": 3.6048370003700256, + "ce_loss_2": 4.167822825908661, + "ce_loss_3": 3.990470898151398, + "ce_loss_7": 3.7204079270362853, + "epoch": 0.431, + "grad_norm": 462.0, + "kl_loss_10": 105.12696495056153, + "kl_loss_2": 1189.7153015136719, + "kl_loss_3": 827.7414642333985, + "kl_loss_7": 209.76073608398437, + "learning_rate": 0.0006163369394041111, + "loss": 578.5617, + "step": 4310 + }, + { + "ce_loss_10": 3.603849542140961, + "ce_loss_13": 3.540567708015442, + "ce_loss_2": 4.114995861053467, + "ce_loss_3": 3.93278226852417, + "ce_loss_7": 3.6533514499664306, + "epoch": 0.432, + "grad_norm": 524.0, + "kl_loss_10": 103.23071632385253, + "kl_loss_2": 1199.0398742675782, + "kl_loss_3": 837.4948120117188, + "kl_loss_7": 206.72886505126954, + "learning_rate": 0.0006147932395418205, + "loss": 593.6705, + "step": 4320 + }, + { + "ce_loss_10": 3.6318950057029724, + "ce_loss_13": 3.5694007515907287, + "ce_loss_2": 4.121479880809784, + "ce_loss_3": 3.9539971709251405, + "ce_loss_7": 3.6812774300575257, + "epoch": 0.433, + "grad_norm": 372.0, + "kl_loss_10": 101.08283462524415, + "kl_loss_2": 1163.6617614746094, + "kl_loss_3": 814.8068634033203, + "kl_loss_7": 204.31798858642577, + "learning_rate": 0.0006132483837128823, + "loss": 570.1899, + "step": 4330 + }, + { + "ce_loss_10": 3.6211368441581726, + "ce_loss_13": 3.5578442931175234, + "ce_loss_2": 4.120713996887207, + "ce_loss_3": 3.9408787965774534, + "ce_loss_7": 3.6715193152427674, + "epoch": 0.434, + "grad_norm": 380.0, + "kl_loss_10": 102.18530006408692, + "kl_loss_2": 1181.1154479980469, + "kl_loss_3": 821.5291748046875, + "kl_loss_7": 205.94673614501954, + "learning_rate": 0.0006117023874739772, + "loss": 579.966, + "step": 4340 + }, + { + "ce_loss_10": 3.606392514705658, + "ce_loss_13": 3.542631506919861, + "ce_loss_2": 4.1229788064956665, + "ce_loss_3": 3.943661665916443, + "ce_loss_7": 3.660093939304352, + "epoch": 0.435, + "grad_norm": 366.0, + "kl_loss_10": 101.41253623962402, + "kl_loss_2": 1198.5234008789062, + "kl_loss_3": 836.8120849609375, + "kl_loss_7": 206.9767189025879, + "learning_rate": 0.0006101552663932703, + "loss": 586.1095, + "step": 4350 + }, + { + "ce_loss_10": 3.6401270270347594, + "ce_loss_13": 3.5747036576271056, + "ce_loss_2": 4.133774304389954, + "ce_loss_3": 3.9579702854156493, + "ce_loss_7": 3.689171576499939, + "epoch": 0.436, + "grad_norm": 432.0, + "kl_loss_10": 103.28445014953613, + "kl_loss_2": 1170.830484008789, + "kl_loss_3": 821.6876098632813, + "kl_loss_7": 207.47048645019532, + "learning_rate": 0.0006086070360502539, + "loss": 578.1617, + "step": 4360 + }, + { + "ce_loss_10": 3.6478831648826597, + "ce_loss_13": 3.5829063415527345, + "ce_loss_2": 4.140194344520569, + "ce_loss_3": 3.9674217224121096, + "ce_loss_7": 3.6954386711120604, + "epoch": 0.437, + "grad_norm": 324.0, + "kl_loss_10": 102.49744033813477, + "kl_loss_2": 1182.2726196289063, + "kl_loss_3": 820.302099609375, + "kl_loss_7": 202.6822937011719, + "learning_rate": 0.0006070577120355903, + "loss": 585.725, + "step": 4370 + }, + { + "ce_loss_10": 3.6493834018707276, + "ce_loss_13": 3.585710608959198, + "ce_loss_2": 4.1475905418396, + "ce_loss_3": 3.9780289769172668, + "ce_loss_7": 3.6994438648223875, + "epoch": 0.438, + "grad_norm": 464.0, + "kl_loss_10": 99.22572135925293, + "kl_loss_2": 1158.4001525878907, + "kl_loss_3": 817.9062316894531, + "kl_loss_7": 200.7786117553711, + "learning_rate": 0.0006055073099509549, + "loss": 570.4337, + "step": 4380 + }, + { + "ce_loss_10": 3.7072151064872743, + "ce_loss_13": 3.6444019198417665, + "ce_loss_2": 4.1913762331008915, + "ce_loss_3": 4.024674141407013, + "ce_loss_7": 3.755181634426117, + "epoch": 0.439, + "grad_norm": 414.0, + "kl_loss_10": 101.21295433044433, + "kl_loss_2": 1155.983868408203, + "kl_loss_3": 813.5707092285156, + "kl_loss_7": 201.68513870239258, + "learning_rate": 0.0006039558454088796, + "loss": 578.4039, + "step": 4390 + }, + { + "ce_loss_10": 3.6866373896598814, + "ce_loss_13": 3.6209323048591613, + "ce_loss_2": 4.190221071243286, + "ce_loss_3": 4.017517447471619, + "ce_loss_7": 3.736443567276001, + "epoch": 0.44, + "grad_norm": 388.0, + "kl_loss_10": 103.66101570129395, + "kl_loss_2": 1179.7899597167968, + "kl_loss_3": 831.9971649169922, + "kl_loss_7": 206.1973434448242, + "learning_rate": 0.0006024033340325954, + "loss": 572.2276, + "step": 4400 + }, + { + "ce_loss_10": 3.7494076251983643, + "ce_loss_13": 3.6860761404037476, + "ce_loss_2": 4.22088440656662, + "ce_loss_3": 4.061302840709686, + "ce_loss_7": 3.7976527214050293, + "epoch": 0.441, + "grad_norm": 384.0, + "kl_loss_10": 100.95717124938965, + "kl_loss_2": 1117.0268005371095, + "kl_loss_3": 788.523080444336, + "kl_loss_7": 197.15192718505858, + "learning_rate": 0.0006008497914558743, + "loss": 559.696, + "step": 4410 + }, + { + "ce_loss_10": 3.689165186882019, + "ce_loss_13": 3.6250773549079893, + "ce_loss_2": 4.1833924651145935, + "ce_loss_3": 4.016273534297943, + "ce_loss_7": 3.738771951198578, + "epoch": 0.442, + "grad_norm": 476.0, + "kl_loss_10": 105.19830055236817, + "kl_loss_2": 1174.740167236328, + "kl_loss_3": 830.987890625, + "kl_loss_7": 209.00811996459962, + "learning_rate": 0.0005992952333228728, + "loss": 576.4588, + "step": 4420 + }, + { + "ce_loss_10": 3.620419418811798, + "ce_loss_13": 3.5588944792747497, + "ce_loss_2": 4.125707459449768, + "ce_loss_3": 3.9479523420333864, + "ce_loss_7": 3.6681005358695984, + "epoch": 0.443, + "grad_norm": 464.0, + "kl_loss_10": 100.17966499328614, + "kl_loss_2": 1181.0232360839843, + "kl_loss_3": 829.0245361328125, + "kl_loss_7": 201.25574188232423, + "learning_rate": 0.0005977396752879741, + "loss": 577.6452, + "step": 4430 + }, + { + "ce_loss_10": 3.5535963416099547, + "ce_loss_13": 3.4911730885505676, + "ce_loss_2": 4.057285642623901, + "ce_loss_3": 3.882522702217102, + "ce_loss_7": 3.603209447860718, + "epoch": 0.444, + "grad_norm": 450.0, + "kl_loss_10": 96.56860618591308, + "kl_loss_2": 1184.1321594238282, + "kl_loss_3": 827.8955352783203, + "kl_loss_7": 199.06893157958984, + "learning_rate": 0.0005961831330156305, + "loss": 569.2716, + "step": 4440 + }, + { + "ce_loss_10": 3.697277545928955, + "ce_loss_13": 3.6338467955589295, + "ce_loss_2": 4.1992070317268375, + "ce_loss_3": 4.02395384311676, + "ce_loss_7": 3.747213661670685, + "epoch": 0.445, + "grad_norm": 392.0, + "kl_loss_10": 101.60056228637696, + "kl_loss_2": 1189.420147705078, + "kl_loss_3": 827.8122314453125, + "kl_loss_7": 205.08227157592773, + "learning_rate": 0.0005946256221802051, + "loss": 584.411, + "step": 4450 + }, + { + "ce_loss_10": 3.679532468318939, + "ce_loss_13": 3.6183473825454713, + "ce_loss_2": 4.146489477157592, + "ce_loss_3": 3.9755648136138917, + "ce_loss_7": 3.7207812786102297, + "epoch": 0.446, + "grad_norm": 494.0, + "kl_loss_10": 101.10317420959473, + "kl_loss_2": 1119.8320098876952, + "kl_loss_3": 779.770297241211, + "kl_loss_7": 198.91878814697264, + "learning_rate": 0.0005930671584658151, + "loss": 578.7685, + "step": 4460 + }, + { + "ce_loss_10": 3.674864172935486, + "ce_loss_13": 3.6118743062019347, + "ce_loss_2": 4.166282546520233, + "ce_loss_3": 3.9925308227539062, + "ce_loss_7": 3.7198517322540283, + "epoch": 0.447, + "grad_norm": 364.0, + "kl_loss_10": 100.75155410766601, + "kl_loss_2": 1165.5830871582032, + "kl_loss_3": 814.2670196533203, + "kl_loss_7": 201.9087059020996, + "learning_rate": 0.0005915077575661722, + "loss": 579.8401, + "step": 4470 + }, + { + "ce_loss_10": 3.694182288646698, + "ce_loss_13": 3.628465461730957, + "ce_loss_2": 4.190526556968689, + "ce_loss_3": 4.015213489532471, + "ce_loss_7": 3.7417189121246337, + "epoch": 0.448, + "grad_norm": 520.0, + "kl_loss_10": 105.40261840820312, + "kl_loss_2": 1179.2632690429687, + "kl_loss_3": 825.197119140625, + "kl_loss_7": 209.67544021606446, + "learning_rate": 0.000589947435184427, + "loss": 569.8479, + "step": 4480 + }, + { + "ce_loss_10": 3.7602591633796694, + "ce_loss_13": 3.6975467801094055, + "ce_loss_2": 4.231885468959808, + "ce_loss_3": 4.062859082221985, + "ce_loss_7": 3.8065670251846315, + "epoch": 0.449, + "grad_norm": 406.0, + "kl_loss_10": 104.7243579864502, + "kl_loss_2": 1147.1027252197266, + "kl_loss_3": 795.4058624267578, + "kl_loss_7": 203.6425910949707, + "learning_rate": 0.0005883862070330078, + "loss": 568.9265, + "step": 4490 + }, + { + "ce_loss_10": 3.6874640941619874, + "ce_loss_13": 3.6227025985717773, + "ce_loss_2": 4.18091858625412, + "ce_loss_3": 4.004498326778412, + "ce_loss_7": 3.7389190554618836, + "epoch": 0.45, + "grad_norm": 342.0, + "kl_loss_10": 102.03626098632813, + "kl_loss_2": 1166.0193176269531, + "kl_loss_3": 811.4805572509765, + "kl_loss_7": 204.2785285949707, + "learning_rate": 0.0005868240888334653, + "loss": 567.3452, + "step": 4500 + }, + { + "ce_loss_10": 3.570815551280975, + "ce_loss_13": 3.508398413658142, + "ce_loss_2": 4.096131467819214, + "ce_loss_3": 3.9093389391899107, + "ce_loss_7": 3.625988078117371, + "epoch": 0.451, + "grad_norm": 616.0, + "kl_loss_10": 100.9030990600586, + "kl_loss_2": 1212.356463623047, + "kl_loss_3": 839.7065948486328, + "kl_loss_7": 207.68597564697265, + "learning_rate": 0.0005852610963163119, + "loss": 584.0681, + "step": 4510 + }, + { + "ce_loss_10": 3.5951132655143736, + "ce_loss_13": 3.5340840578079225, + "ce_loss_2": 4.088473439216614, + "ce_loss_3": 3.9123128294944762, + "ce_loss_7": 3.6418415188789366, + "epoch": 0.452, + "grad_norm": 440.0, + "kl_loss_10": 97.94427604675293, + "kl_loss_2": 1155.4515991210938, + "kl_loss_3": 802.8143249511719, + "kl_loss_7": 198.15041809082032, + "learning_rate": 0.0005836972452208654, + "loss": 560.779, + "step": 4520 + }, + { + "ce_loss_10": 3.6001816511154177, + "ce_loss_13": 3.540806245803833, + "ce_loss_2": 4.105304884910583, + "ce_loss_3": 3.9283313751220703, + "ce_loss_7": 3.6497029066085815, + "epoch": 0.453, + "grad_norm": 470.0, + "kl_loss_10": 99.28575630187989, + "kl_loss_2": 1176.1295288085937, + "kl_loss_3": 817.2998046875, + "kl_loss_7": 202.73690338134764, + "learning_rate": 0.0005821325512950885, + "loss": 572.314, + "step": 4530 + }, + { + "ce_loss_10": 3.629274320602417, + "ce_loss_13": 3.5687419891357424, + "ce_loss_2": 4.1162322640419005, + "ce_loss_3": 3.9458845138549803, + "ce_loss_7": 3.680540406703949, + "epoch": 0.454, + "grad_norm": 368.0, + "kl_loss_10": 96.52360496520996, + "kl_loss_2": 1136.2307861328125, + "kl_loss_3": 790.6944702148437, + "kl_loss_7": 197.31127700805663, + "learning_rate": 0.0005805670302954321, + "loss": 568.0196, + "step": 4540 + }, + { + "ce_loss_10": 3.6337098717689513, + "ce_loss_13": 3.5753876209259032, + "ce_loss_2": 4.115709042549133, + "ce_loss_3": 3.9439353704452516, + "ce_loss_7": 3.6809528470039368, + "epoch": 0.455, + "grad_norm": 434.0, + "kl_loss_10": 95.89570465087891, + "kl_loss_2": 1140.969873046875, + "kl_loss_3": 792.410400390625, + "kl_loss_7": 194.6849395751953, + "learning_rate": 0.000579000697986675, + "loss": 559.3398, + "step": 4550 + }, + { + "ce_loss_10": 3.5949880719184875, + "ce_loss_13": 3.5312354803085326, + "ce_loss_2": 4.110612523555756, + "ce_loss_3": 3.9363887429237367, + "ce_loss_7": 3.6481791853904726, + "epoch": 0.456, + "grad_norm": 398.0, + "kl_loss_10": 102.14065132141113, + "kl_loss_2": 1200.508935546875, + "kl_loss_3": 844.4349182128906, + "kl_loss_7": 207.93037872314454, + "learning_rate": 0.0005774335701417662, + "loss": 577.7247, + "step": 4560 + }, + { + "ce_loss_10": 3.578439974784851, + "ce_loss_13": 3.5177830338478087, + "ce_loss_2": 4.086728799343109, + "ce_loss_3": 3.9092958092689516, + "ce_loss_7": 3.628882908821106, + "epoch": 0.457, + "grad_norm": 438.0, + "kl_loss_10": 98.15573539733887, + "kl_loss_2": 1190.6679321289062, + "kl_loss_3": 827.183969116211, + "kl_loss_7": 201.49042510986328, + "learning_rate": 0.0005758656625416658, + "loss": 579.3393, + "step": 4570 + }, + { + "ce_loss_10": 3.6351425409317017, + "ce_loss_13": 3.5740421295166014, + "ce_loss_2": 4.13430563211441, + "ce_loss_3": 3.9581828236579897, + "ce_loss_7": 3.685711920261383, + "epoch": 0.458, + "grad_norm": 378.0, + "kl_loss_10": 98.59328498840333, + "kl_loss_2": 1165.538037109375, + "kl_loss_3": 813.1740905761719, + "kl_loss_7": 200.91252059936522, + "learning_rate": 0.0005742969909751859, + "loss": 562.4629, + "step": 4580 + }, + { + "ce_loss_10": 3.6438634276390074, + "ce_loss_13": 3.5822227597236633, + "ce_loss_2": 4.139957237243652, + "ce_loss_3": 3.96221022605896, + "ce_loss_7": 3.692858374118805, + "epoch": 0.459, + "grad_norm": 396.0, + "kl_loss_10": 100.12554626464843, + "kl_loss_2": 1167.3160705566406, + "kl_loss_3": 805.8544036865235, + "kl_loss_7": 201.26202087402345, + "learning_rate": 0.0005727275712388318, + "loss": 570.0833, + "step": 4590 + }, + { + "ce_loss_10": 3.681215536594391, + "ce_loss_13": 3.620731198787689, + "ce_loss_2": 4.155962944030762, + "ce_loss_3": 3.984270441532135, + "ce_loss_7": 3.7283701658248902, + "epoch": 0.46, + "grad_norm": 568.0, + "kl_loss_10": 98.76027946472168, + "kl_loss_2": 1132.1197998046875, + "kl_loss_3": 792.0047241210938, + "kl_loss_7": 197.17216033935546, + "learning_rate": 0.0005711574191366427, + "loss": 562.7997, + "step": 4600 + }, + { + "ce_loss_10": 3.6236431002616882, + "ce_loss_13": 3.565703308582306, + "ce_loss_2": 4.114531934261322, + "ce_loss_3": 3.93969669342041, + "ce_loss_7": 3.671102833747864, + "epoch": 0.461, + "grad_norm": 372.0, + "kl_loss_10": 98.42190704345703, + "kl_loss_2": 1170.4917938232422, + "kl_loss_3": 808.7791198730469, + "kl_loss_7": 199.0694892883301, + "learning_rate": 0.0005695865504800327, + "loss": 564.0159, + "step": 4610 + }, + { + "ce_loss_10": 3.562722647190094, + "ce_loss_13": 3.500598740577698, + "ce_loss_2": 4.109580218791962, + "ce_loss_3": 3.9190361380577086, + "ce_loss_7": 3.6191172361373902, + "epoch": 0.462, + "grad_norm": 480.0, + "kl_loss_10": 100.51305274963379, + "kl_loss_2": 1233.0393005371093, + "kl_loss_3": 860.259619140625, + "kl_loss_7": 208.89999542236328, + "learning_rate": 0.0005680149810876322, + "loss": 581.488, + "step": 4620 + }, + { + "ce_loss_10": 3.6198580145835875, + "ce_loss_13": 3.5573437213897705, + "ce_loss_2": 4.117598211765289, + "ce_loss_3": 3.94056499004364, + "ce_loss_7": 3.667776870727539, + "epoch": 0.463, + "grad_norm": 560.0, + "kl_loss_10": 99.44257354736328, + "kl_loss_2": 1160.7040802001952, + "kl_loss_3": 809.362094116211, + "kl_loss_7": 201.12859268188475, + "learning_rate": 0.0005664427267851271, + "loss": 565.3629, + "step": 4630 + }, + { + "ce_loss_10": 3.534971606731415, + "ce_loss_13": 3.47266343832016, + "ce_loss_2": 4.036073172092438, + "ce_loss_3": 3.857685387134552, + "ce_loss_7": 3.5870521306991576, + "epoch": 0.464, + "grad_norm": 498.0, + "kl_loss_10": 97.52345237731933, + "kl_loss_2": 1167.1843322753907, + "kl_loss_3": 810.5214752197265, + "kl_loss_7": 199.60354309082032, + "learning_rate": 0.0005648698034051009, + "loss": 562.6416, + "step": 4640 + }, + { + "ce_loss_10": 3.6570662021636964, + "ce_loss_13": 3.594506525993347, + "ce_loss_2": 4.158554673194885, + "ce_loss_3": 3.980504941940308, + "ce_loss_7": 3.7062572717666624, + "epoch": 0.465, + "grad_norm": 412.0, + "kl_loss_10": 99.88166885375976, + "kl_loss_2": 1173.9357055664063, + "kl_loss_3": 818.5712066650391, + "kl_loss_7": 200.30800857543946, + "learning_rate": 0.0005632962267868747, + "loss": 561.8186, + "step": 4650 + }, + { + "ce_loss_10": 3.5903021335601806, + "ce_loss_13": 3.5294329643249513, + "ce_loss_2": 4.08318532705307, + "ce_loss_3": 3.9098427176475523, + "ce_loss_7": 3.6388569593429567, + "epoch": 0.466, + "grad_norm": 464.0, + "kl_loss_10": 95.17009468078614, + "kl_loss_2": 1143.232162475586, + "kl_loss_3": 798.761831665039, + "kl_loss_7": 195.75977783203126, + "learning_rate": 0.0005617220127763474, + "loss": 567.0608, + "step": 4660 + }, + { + "ce_loss_10": 3.669221520423889, + "ce_loss_13": 3.607930314540863, + "ce_loss_2": 4.160642421245575, + "ce_loss_3": 3.9847203373908995, + "ce_loss_7": 3.717066395282745, + "epoch": 0.467, + "grad_norm": 412.0, + "kl_loss_10": 98.76815719604492, + "kl_loss_2": 1153.8832275390625, + "kl_loss_3": 803.9543914794922, + "kl_loss_7": 198.99397354125978, + "learning_rate": 0.0005601471772258368, + "loss": 567.3518, + "step": 4670 + }, + { + "ce_loss_10": 3.6542641162872314, + "ce_loss_13": 3.593363094329834, + "ce_loss_2": 4.133442676067352, + "ce_loss_3": 3.96450389623642, + "ce_loss_7": 3.7022117972373962, + "epoch": 0.468, + "grad_norm": 384.0, + "kl_loss_10": 98.04742546081543, + "kl_loss_2": 1118.5282470703125, + "kl_loss_3": 784.399691772461, + "kl_loss_7": 197.338858795166, + "learning_rate": 0.0005585717359939192, + "loss": 565.1176, + "step": 4680 + }, + { + "ce_loss_10": 3.56116144657135, + "ce_loss_13": 3.4993683457374574, + "ce_loss_2": 4.055442547798156, + "ce_loss_3": 3.887247931957245, + "ce_loss_7": 3.6099945425987245, + "epoch": 0.469, + "grad_norm": 490.0, + "kl_loss_10": 97.45741577148438, + "kl_loss_2": 1149.7481964111328, + "kl_loss_3": 806.3391754150391, + "kl_loss_7": 197.63161849975586, + "learning_rate": 0.0005569957049452703, + "loss": 571.714, + "step": 4690 + }, + { + "ce_loss_10": 3.6181132555007935, + "ce_loss_13": 3.558199667930603, + "ce_loss_2": 4.1229860305786135, + "ce_loss_3": 3.9408149838447573, + "ce_loss_7": 3.668530523777008, + "epoch": 0.47, + "grad_norm": 458.0, + "kl_loss_10": 98.11741218566894, + "kl_loss_2": 1179.65732421875, + "kl_loss_3": 819.0914672851562, + "kl_loss_7": 202.21502075195312, + "learning_rate": 0.0005554190999505056, + "loss": 572.5331, + "step": 4700 + }, + { + "ce_loss_10": 3.7477443337440492, + "ce_loss_13": 3.6823888421058655, + "ce_loss_2": 4.236353850364685, + "ce_loss_3": 4.064246296882629, + "ce_loss_7": 3.7983964323997497, + "epoch": 0.471, + "grad_norm": 376.0, + "kl_loss_10": 101.09743614196778, + "kl_loss_2": 1167.4985229492188, + "kl_loss_3": 813.3948120117187, + "kl_loss_7": 205.17110900878907, + "learning_rate": 0.0005538419368860196, + "loss": 552.1318, + "step": 4710 + }, + { + "ce_loss_10": 3.670793604850769, + "ce_loss_13": 3.6081652998924256, + "ce_loss_2": 4.154720652103424, + "ce_loss_3": 3.986761474609375, + "ce_loss_7": 3.7201395988464356, + "epoch": 0.472, + "grad_norm": 416.0, + "kl_loss_10": 100.02058029174805, + "kl_loss_2": 1152.6582946777344, + "kl_loss_3": 806.7274566650391, + "kl_loss_7": 202.40063400268554, + "learning_rate": 0.0005522642316338268, + "loss": 576.1212, + "step": 4720 + }, + { + "ce_loss_10": 3.673479509353638, + "ce_loss_13": 3.613760471343994, + "ce_loss_2": 4.150910186767578, + "ce_loss_3": 3.981798696517944, + "ce_loss_7": 3.721827840805054, + "epoch": 0.473, + "grad_norm": 478.0, + "kl_loss_10": 99.9439712524414, + "kl_loss_2": 1142.4451599121094, + "kl_loss_3": 795.6325531005859, + "kl_loss_7": 199.72487106323243, + "learning_rate": 0.0005506860000814017, + "loss": 573.0671, + "step": 4730 + }, + { + "ce_loss_10": 3.700618231296539, + "ce_loss_13": 3.638905906677246, + "ce_loss_2": 4.180734276771545, + "ce_loss_3": 4.006302297115326, + "ce_loss_7": 3.7447570085525514, + "epoch": 0.474, + "grad_norm": 372.0, + "kl_loss_10": 99.73388938903808, + "kl_loss_2": 1127.7213500976563, + "kl_loss_3": 793.5628936767578, + "kl_loss_7": 197.02488555908204, + "learning_rate": 0.0005491072581215186, + "loss": 565.0697, + "step": 4740 + }, + { + "ce_loss_10": 3.706625771522522, + "ce_loss_13": 3.6401172399520876, + "ce_loss_2": 4.184090709686279, + "ce_loss_3": 4.019766807556152, + "ce_loss_7": 3.754279363155365, + "epoch": 0.475, + "grad_norm": 516.0, + "kl_loss_10": 103.58124504089355, + "kl_loss_2": 1159.682275390625, + "kl_loss_3": 813.5887573242187, + "kl_loss_7": 204.05538330078124, + "learning_rate": 0.0005475280216520913, + "loss": 556.0086, + "step": 4750 + }, + { + "ce_loss_10": 3.617805337905884, + "ce_loss_13": 3.5573843002319334, + "ce_loss_2": 4.093091154098511, + "ce_loss_3": 3.926499140262604, + "ce_loss_7": 3.664002466201782, + "epoch": 0.476, + "grad_norm": 438.0, + "kl_loss_10": 97.125687789917, + "kl_loss_2": 1118.9559478759766, + "kl_loss_3": 784.6352722167969, + "kl_loss_7": 196.01404037475587, + "learning_rate": 0.0005459483065760138, + "loss": 565.9596, + "step": 4760 + }, + { + "ce_loss_10": 3.552186381816864, + "ce_loss_13": 3.4902740478515626, + "ce_loss_2": 4.07539484500885, + "ce_loss_3": 3.891750192642212, + "ce_loss_7": 3.601547920703888, + "epoch": 0.477, + "grad_norm": 584.0, + "kl_loss_10": 97.89878273010254, + "kl_loss_2": 1199.7971740722655, + "kl_loss_3": 836.253662109375, + "kl_loss_7": 197.98745880126953, + "learning_rate": 0.0005443681288009991, + "loss": 568.1693, + "step": 4770 + }, + { + "ce_loss_10": 3.6120885968208314, + "ce_loss_13": 3.5525715351104736, + "ce_loss_2": 4.106596338748932, + "ce_loss_3": 3.932267451286316, + "ce_loss_7": 3.6594039678573607, + "epoch": 0.478, + "grad_norm": 430.0, + "kl_loss_10": 98.81552238464356, + "kl_loss_2": 1169.4871887207032, + "kl_loss_3": 816.0136047363281, + "kl_loss_7": 198.91362609863282, + "learning_rate": 0.0005427875042394199, + "loss": 570.9199, + "step": 4780 + }, + { + "ce_loss_10": 3.6413972973823547, + "ce_loss_13": 3.5771793842315676, + "ce_loss_2": 4.133682417869568, + "ce_loss_3": 3.9594278573989867, + "ce_loss_7": 3.6885754466056824, + "epoch": 0.479, + "grad_norm": 396.0, + "kl_loss_10": 102.98994331359863, + "kl_loss_2": 1166.8763580322266, + "kl_loss_3": 812.8268646240234, + "kl_loss_7": 201.2046257019043, + "learning_rate": 0.0005412064488081482, + "loss": 576.3787, + "step": 4790 + }, + { + "ce_loss_10": 3.6483134269714355, + "ce_loss_13": 3.5873068809509276, + "ce_loss_2": 4.13967661857605, + "ce_loss_3": 3.9646928787231444, + "ce_loss_7": 3.697467315196991, + "epoch": 0.48, + "grad_norm": 370.0, + "kl_loss_10": 99.1940761566162, + "kl_loss_2": 1147.4876434326172, + "kl_loss_3": 791.1785400390625, + "kl_loss_7": 197.28219909667968, + "learning_rate": 0.0005396249784283942, + "loss": 558.8872, + "step": 4800 + }, + { + "ce_loss_10": 3.675038015842438, + "ce_loss_13": 3.605392372608185, + "ce_loss_2": 4.173114275932312, + "ce_loss_3": 3.99793621301651, + "ce_loss_7": 3.719290328025818, + "epoch": 0.481, + "grad_norm": 424.0, + "kl_loss_10": 109.17574653625488, + "kl_loss_2": 1186.1647857666017, + "kl_loss_3": 827.1956359863282, + "kl_loss_7": 205.77321548461913, + "learning_rate": 0.0005380431090255476, + "loss": 574.2385, + "step": 4810 + }, + { + "ce_loss_10": 3.6580063104629517, + "ce_loss_13": 3.600323748588562, + "ce_loss_2": 4.138371276855469, + "ce_loss_3": 3.968156564235687, + "ce_loss_7": 3.705365836620331, + "epoch": 0.482, + "grad_norm": 368.0, + "kl_loss_10": 96.444384765625, + "kl_loss_2": 1126.6424652099608, + "kl_loss_3": 782.5597137451172, + "kl_loss_7": 192.58789978027343, + "learning_rate": 0.0005364608565290155, + "loss": 556.892, + "step": 4820 + }, + { + "ce_loss_10": 3.66942412853241, + "ce_loss_13": 3.6059840083122254, + "ce_loss_2": 4.159801661968231, + "ce_loss_3": 3.985584008693695, + "ce_loss_7": 3.7178696751594544, + "epoch": 0.483, + "grad_norm": 528.0, + "kl_loss_10": 101.1554500579834, + "kl_loss_2": 1154.6878021240234, + "kl_loss_3": 803.7564819335937, + "kl_loss_7": 199.8929084777832, + "learning_rate": 0.0005348782368720626, + "loss": 563.005, + "step": 4830 + }, + { + "ce_loss_10": 3.596053886413574, + "ce_loss_13": 3.5365728974342345, + "ce_loss_2": 4.080682539939881, + "ce_loss_3": 3.9060325980186463, + "ce_loss_7": 3.6432487964630127, + "epoch": 0.484, + "grad_norm": 520.0, + "kl_loss_10": 96.21514892578125, + "kl_loss_2": 1134.5447143554688, + "kl_loss_3": 787.3878204345704, + "kl_loss_7": 194.21477661132812, + "learning_rate": 0.000533295265991652, + "loss": 564.2112, + "step": 4840 + }, + { + "ce_loss_10": 3.6783321022987367, + "ce_loss_13": 3.6159629583358766, + "ce_loss_2": 4.154437899589539, + "ce_loss_3": 3.9877618312835694, + "ce_loss_7": 3.727357840538025, + "epoch": 0.485, + "grad_norm": 434.0, + "kl_loss_10": 97.2699405670166, + "kl_loss_2": 1128.611801147461, + "kl_loss_3": 786.6338958740234, + "kl_loss_7": 195.64030685424805, + "learning_rate": 0.0005317119598282822, + "loss": 554.8634, + "step": 4850 + }, + { + "ce_loss_10": 3.6783334612846375, + "ce_loss_13": 3.6158772826194765, + "ce_loss_2": 4.161105620861053, + "ce_loss_3": 3.9936763644218445, + "ce_loss_7": 3.726669430732727, + "epoch": 0.486, + "grad_norm": 500.0, + "kl_loss_10": 99.51188240051269, + "kl_loss_2": 1139.204409790039, + "kl_loss_3": 796.6284942626953, + "kl_loss_7": 197.98922119140624, + "learning_rate": 0.0005301283343258293, + "loss": 559.5733, + "step": 4860 + }, + { + "ce_loss_10": 3.739852726459503, + "ce_loss_13": 3.679302477836609, + "ce_loss_2": 4.207214975357056, + "ce_loss_3": 4.046137988567352, + "ce_loss_7": 3.7877432465553285, + "epoch": 0.487, + "grad_norm": 434.0, + "kl_loss_10": 98.4985725402832, + "kl_loss_2": 1115.5814056396484, + "kl_loss_3": 781.7784240722656, + "kl_loss_7": 195.47981796264648, + "learning_rate": 0.000528544405431384, + "loss": 548.517, + "step": 4870 + }, + { + "ce_loss_10": 3.617240381240845, + "ce_loss_13": 3.555454957485199, + "ce_loss_2": 4.122074174880981, + "ce_loss_3": 3.944024980068207, + "ce_loss_7": 3.668628621101379, + "epoch": 0.488, + "grad_norm": 432.0, + "kl_loss_10": 98.9582015991211, + "kl_loss_2": 1175.8768676757813, + "kl_loss_3": 814.3092010498046, + "kl_loss_7": 202.09591979980468, + "learning_rate": 0.000526960189095093, + "loss": 569.4682, + "step": 4880 + }, + { + "ce_loss_10": 3.5905461430549623, + "ce_loss_13": 3.5317755937576294, + "ce_loss_2": 4.075044083595276, + "ce_loss_3": 3.9047257542610168, + "ce_loss_7": 3.637452006340027, + "epoch": 0.489, + "grad_norm": 406.0, + "kl_loss_10": 95.30788230895996, + "kl_loss_2": 1125.9373596191406, + "kl_loss_3": 783.2284423828125, + "kl_loss_7": 192.63981170654296, + "learning_rate": 0.0005253757012699972, + "loss": 553.6164, + "step": 4890 + }, + { + "ce_loss_10": 3.680708420276642, + "ce_loss_13": 3.621255648136139, + "ce_loss_2": 4.161669278144837, + "ce_loss_3": 3.9898939728736877, + "ce_loss_7": 3.726882266998291, + "epoch": 0.49, + "grad_norm": 436.0, + "kl_loss_10": 98.59705772399903, + "kl_loss_2": 1136.0980651855468, + "kl_loss_3": 790.2571563720703, + "kl_loss_7": 197.4443115234375, + "learning_rate": 0.0005237909579118712, + "loss": 568.0026, + "step": 4900 + }, + { + "ce_loss_10": 3.6435038447380066, + "ce_loss_13": 3.581137490272522, + "ce_loss_2": 4.134112453460693, + "ce_loss_3": 3.9640262126922607, + "ce_loss_7": 3.6911675453186037, + "epoch": 0.491, + "grad_norm": 520.0, + "kl_loss_10": 99.66703796386719, + "kl_loss_2": 1167.6467651367188, + "kl_loss_3": 818.6171966552735, + "kl_loss_7": 200.65354614257814, + "learning_rate": 0.0005222059749790631, + "loss": 568.3183, + "step": 4910 + }, + { + "ce_loss_10": 3.7152050852775576, + "ce_loss_13": 3.652708613872528, + "ce_loss_2": 4.176082861423493, + "ce_loss_3": 4.013521981239319, + "ce_loss_7": 3.759286069869995, + "epoch": 0.492, + "grad_norm": 394.0, + "kl_loss_10": 100.0508934020996, + "kl_loss_2": 1112.6296081542969, + "kl_loss_3": 774.64658203125, + "kl_loss_7": 196.3288688659668, + "learning_rate": 0.0005206207684323337, + "loss": 544.9011, + "step": 4920 + }, + { + "ce_loss_10": 3.689722108840942, + "ce_loss_13": 3.6289564847946165, + "ce_loss_2": 4.170908105373383, + "ce_loss_3": 3.9987404584884643, + "ce_loss_7": 3.7391751527786257, + "epoch": 0.493, + "grad_norm": 368.0, + "kl_loss_10": 100.77400093078613, + "kl_loss_2": 1140.2743774414062, + "kl_loss_3": 795.368798828125, + "kl_loss_7": 200.2589553833008, + "learning_rate": 0.000519035354234695, + "loss": 567.6383, + "step": 4930 + }, + { + "ce_loss_10": 3.666009783744812, + "ce_loss_13": 3.603765845298767, + "ce_loss_2": 4.156714332103729, + "ce_loss_3": 3.9840614438056945, + "ce_loss_7": 3.7159415602684023, + "epoch": 0.494, + "grad_norm": 516.0, + "kl_loss_10": 99.73322830200195, + "kl_loss_2": 1144.6152709960938, + "kl_loss_3": 797.5058837890625, + "kl_loss_7": 199.84856643676758, + "learning_rate": 0.0005174497483512506, + "loss": 551.5833, + "step": 4940 + }, + { + "ce_loss_10": 3.715251398086548, + "ce_loss_13": 3.6532492995262147, + "ce_loss_2": 4.190750586986542, + "ce_loss_3": 4.017711067199707, + "ce_loss_7": 3.760482394695282, + "epoch": 0.495, + "grad_norm": 404.0, + "kl_loss_10": 99.74794273376465, + "kl_loss_2": 1135.6743072509767, + "kl_loss_3": 788.5320007324219, + "kl_loss_7": 197.0201416015625, + "learning_rate": 0.0005158639667490339, + "loss": 559.5508, + "step": 4950 + }, + { + "ce_loss_10": 3.60677056312561, + "ce_loss_13": 3.545226526260376, + "ce_loss_2": 4.091673123836517, + "ce_loss_3": 3.921009349822998, + "ce_loss_7": 3.6560636878013613, + "epoch": 0.496, + "grad_norm": 380.0, + "kl_loss_10": 97.61143035888672, + "kl_loss_2": 1146.4500457763672, + "kl_loss_3": 801.032958984375, + "kl_loss_7": 198.76946029663085, + "learning_rate": 0.0005142780253968481, + "loss": 559.3498, + "step": 4960 + }, + { + "ce_loss_10": 3.558833396434784, + "ce_loss_13": 3.498770594596863, + "ce_loss_2": 4.029934275150299, + "ce_loss_3": 3.8623911499977113, + "ce_loss_7": 3.605703389644623, + "epoch": 0.497, + "grad_norm": 404.0, + "kl_loss_10": 95.15658073425293, + "kl_loss_2": 1120.565899658203, + "kl_loss_3": 776.6140930175782, + "kl_loss_7": 192.40693054199218, + "learning_rate": 0.0005126919402651053, + "loss": 541.1446, + "step": 4970 + }, + { + "ce_loss_10": 3.6243564009666445, + "ce_loss_13": 3.562463808059692, + "ce_loss_2": 4.122486090660095, + "ce_loss_3": 3.9518114924430847, + "ce_loss_7": 3.6740434527397157, + "epoch": 0.498, + "grad_norm": 500.0, + "kl_loss_10": 98.81732482910157, + "kl_loss_2": 1158.3788116455078, + "kl_loss_3": 805.2687072753906, + "kl_loss_7": 198.79998626708985, + "learning_rate": 0.0005111057273256647, + "loss": 562.34, + "step": 4980 + }, + { + "ce_loss_10": 3.736222839355469, + "ce_loss_13": 3.676733374595642, + "ce_loss_2": 4.189973556995392, + "ce_loss_3": 4.022429513931274, + "ce_loss_7": 3.7769731283187866, + "epoch": 0.499, + "grad_norm": 396.0, + "kl_loss_10": 98.13356437683106, + "kl_loss_2": 1078.4886474609375, + "kl_loss_3": 748.3916412353516, + "kl_loss_7": 191.23028793334962, + "learning_rate": 0.0005095194025516733, + "loss": 536.8887, + "step": 4990 + }, + { + "ce_loss_10": 3.6507428646087647, + "ce_loss_13": 3.592644715309143, + "ce_loss_2": 4.122073376178742, + "ce_loss_3": 3.9521225333213805, + "ce_loss_7": 3.697298324108124, + "epoch": 0.5, + "grad_norm": 378.0, + "kl_loss_10": 95.96725730895996, + "kl_loss_2": 1110.4840362548828, + "kl_loss_3": 769.709603881836, + "kl_loss_7": 192.08199310302734, + "learning_rate": 0.000507932981917404, + "loss": 562.5593, + "step": 5000 + }, + { + "ce_loss_10": 3.609897780418396, + "ce_loss_13": 3.5468419432640075, + "ce_loss_2": 4.115197873115539, + "ce_loss_3": 3.9347579956054686, + "ce_loss_7": 3.6594788432121277, + "epoch": 0.501, + "grad_norm": 496.0, + "kl_loss_10": 102.02307662963867, + "kl_loss_2": 1185.6702362060546, + "kl_loss_3": 822.8478820800781, + "kl_loss_7": 202.77078170776366, + "learning_rate": 0.0005063464813980949, + "loss": 576.005, + "step": 5010 + }, + { + "ce_loss_10": 3.595167326927185, + "ce_loss_13": 3.534419858455658, + "ce_loss_2": 4.08291003704071, + "ce_loss_3": 3.910551607608795, + "ce_loss_7": 3.6416044354438784, + "epoch": 0.502, + "grad_norm": 366.0, + "kl_loss_10": 98.82206382751465, + "kl_loss_2": 1157.9163513183594, + "kl_loss_3": 802.2986022949219, + "kl_loss_7": 196.4967498779297, + "learning_rate": 0.0005047599169697884, + "loss": 557.0335, + "step": 5020 + }, + { + "ce_loss_10": 3.5276883602142335, + "ce_loss_13": 3.469167137145996, + "ce_loss_2": 4.028472435474396, + "ce_loss_3": 3.8497302412986754, + "ce_loss_7": 3.5778237104415895, + "epoch": 0.503, + "grad_norm": 544.0, + "kl_loss_10": 95.17037048339844, + "kl_loss_2": 1142.5230926513673, + "kl_loss_3": 789.8021270751954, + "kl_loss_7": 195.37155456542968, + "learning_rate": 0.000503173304609171, + "loss": 545.4258, + "step": 5030 + }, + { + "ce_loss_10": 3.6576398611068726, + "ce_loss_13": 3.5950983643531798, + "ce_loss_2": 4.14467431306839, + "ce_loss_3": 3.9757012486457826, + "ce_loss_7": 3.7055052399635313, + "epoch": 0.504, + "grad_norm": 482.0, + "kl_loss_10": 98.4008186340332, + "kl_loss_2": 1135.7276794433594, + "kl_loss_3": 789.9247985839844, + "kl_loss_7": 196.23304824829103, + "learning_rate": 0.0005015866602934111, + "loss": 552.1605, + "step": 5040 + }, + { + "ce_loss_10": 3.621449387073517, + "ce_loss_13": 3.5583016514778136, + "ce_loss_2": 4.125820016860962, + "ce_loss_3": 3.9470208525657653, + "ce_loss_7": 3.6696552276611327, + "epoch": 0.505, + "grad_norm": 386.0, + "kl_loss_10": 101.05188751220703, + "kl_loss_2": 1170.8730712890624, + "kl_loss_3": 822.174462890625, + "kl_loss_7": 203.6134246826172, + "learning_rate": 0.0005, + "loss": 564.1666, + "step": 5050 + }, + { + "ce_loss_10": 3.608661472797394, + "ce_loss_13": 3.549720525741577, + "ce_loss_2": 4.094336903095245, + "ce_loss_3": 3.921399199962616, + "ce_loss_7": 3.6561817049980165, + "epoch": 0.506, + "grad_norm": 532.0, + "kl_loss_10": 97.96763725280762, + "kl_loss_2": 1147.6109741210937, + "kl_loss_3": 799.4344543457031, + "kl_loss_7": 197.70511016845703, + "learning_rate": 0.0004984133397065889, + "loss": 551.9219, + "step": 5060 + }, + { + "ce_loss_10": 3.619631803035736, + "ce_loss_13": 3.5591482758522033, + "ce_loss_2": 4.1191855549812315, + "ce_loss_3": 3.947730815410614, + "ce_loss_7": 3.671154284477234, + "epoch": 0.507, + "grad_norm": 420.0, + "kl_loss_10": 98.14169616699219, + "kl_loss_2": 1152.0039337158203, + "kl_loss_3": 803.1968292236328, + "kl_loss_7": 198.87692565917968, + "learning_rate": 0.0004968266953908291, + "loss": 554.0305, + "step": 5070 + }, + { + "ce_loss_10": 3.6628435134887694, + "ce_loss_13": 3.6024859309196473, + "ce_loss_2": 4.145783054828644, + "ce_loss_3": 3.972540259361267, + "ce_loss_7": 3.7080691695213317, + "epoch": 0.508, + "grad_norm": 532.0, + "kl_loss_10": 98.82306175231933, + "kl_loss_2": 1137.6268676757813, + "kl_loss_3": 795.5397338867188, + "kl_loss_7": 194.52870864868163, + "learning_rate": 0.0004952400830302117, + "loss": 554.9051, + "step": 5080 + }, + { + "ce_loss_10": 3.585409712791443, + "ce_loss_13": 3.525643265247345, + "ce_loss_2": 4.091677510738373, + "ce_loss_3": 3.9131953358650207, + "ce_loss_7": 3.6364392280578612, + "epoch": 0.509, + "grad_norm": 412.0, + "kl_loss_10": 98.62568626403808, + "kl_loss_2": 1168.942919921875, + "kl_loss_3": 811.3192687988281, + "kl_loss_7": 199.42913665771485, + "learning_rate": 0.0004936535186019053, + "loss": 559.6511, + "step": 5090 + }, + { + "ce_loss_10": 3.6907896161079408, + "ce_loss_13": 3.62961208820343, + "ce_loss_2": 4.153078198432922, + "ce_loss_3": 3.9874324560165406, + "ce_loss_7": 3.735322892665863, + "epoch": 0.51, + "grad_norm": 376.0, + "kl_loss_10": 97.42878112792968, + "kl_loss_2": 1101.246890258789, + "kl_loss_3": 771.5801239013672, + "kl_loss_7": 192.14101791381836, + "learning_rate": 0.000492067018082596, + "loss": 549.3435, + "step": 5100 + }, + { + "ce_loss_10": 3.6234113693237306, + "ce_loss_13": 3.55826051235199, + "ce_loss_2": 4.134788942337036, + "ce_loss_3": 3.9512638211250306, + "ce_loss_7": 3.673302376270294, + "epoch": 0.511, + "grad_norm": 358.0, + "kl_loss_10": 100.71795692443848, + "kl_loss_2": 1184.7957580566406, + "kl_loss_3": 822.3129302978516, + "kl_loss_7": 201.37216567993164, + "learning_rate": 0.0004904805974483267, + "loss": 578.112, + "step": 5110 + }, + { + "ce_loss_10": 3.73909273147583, + "ce_loss_13": 3.6729060292243956, + "ce_loss_2": 4.232535266876221, + "ce_loss_3": 4.064092624187469, + "ce_loss_7": 3.78980005979538, + "epoch": 0.512, + "grad_norm": 418.0, + "kl_loss_10": 103.6674789428711, + "kl_loss_2": 1170.0532684326172, + "kl_loss_3": 824.2820373535156, + "kl_loss_7": 206.52156982421874, + "learning_rate": 0.0004888942726743353, + "loss": 580.3403, + "step": 5120 + }, + { + "ce_loss_10": 3.6079283952713013, + "ce_loss_13": 3.5456172823905945, + "ce_loss_2": 4.103336191177368, + "ce_loss_3": 3.9267752170562744, + "ce_loss_7": 3.655103015899658, + "epoch": 0.513, + "grad_norm": 378.0, + "kl_loss_10": 97.65564994812011, + "kl_loss_2": 1156.2654846191406, + "kl_loss_3": 800.4834381103516, + "kl_loss_7": 198.76654281616212, + "learning_rate": 0.0004873080597348947, + "loss": 561.8108, + "step": 5130 + }, + { + "ce_loss_10": 3.492985022068024, + "ce_loss_13": 3.433611583709717, + "ce_loss_2": 4.009678089618683, + "ce_loss_3": 3.82467257976532, + "ce_loss_7": 3.543225371837616, + "epoch": 0.514, + "grad_norm": 440.0, + "kl_loss_10": 96.83905181884765, + "kl_loss_2": 1194.322329711914, + "kl_loss_3": 828.9491943359375, + "kl_loss_7": 198.22924575805663, + "learning_rate": 0.0004857219746031519, + "loss": 567.8251, + "step": 5140 + }, + { + "ce_loss_10": 3.6722797036170958, + "ce_loss_13": 3.6109776854515077, + "ce_loss_2": 4.149738478660583, + "ce_loss_3": 3.975986909866333, + "ce_loss_7": 3.7163102626800537, + "epoch": 0.515, + "grad_norm": 430.0, + "kl_loss_10": 99.9472442626953, + "kl_loss_2": 1140.7201843261719, + "kl_loss_3": 787.3806091308594, + "kl_loss_7": 197.54812469482422, + "learning_rate": 0.0004841360332509663, + "loss": 556.8349, + "step": 5150 + }, + { + "ce_loss_10": 3.6183668613433837, + "ce_loss_13": 3.5591975688934325, + "ce_loss_2": 4.100240254402161, + "ce_loss_3": 3.9269237518310547, + "ce_loss_7": 3.6642425417900086, + "epoch": 0.516, + "grad_norm": 366.0, + "kl_loss_10": 93.92010688781738, + "kl_loss_2": 1122.7465362548828, + "kl_loss_3": 778.0984069824219, + "kl_loss_7": 191.03939056396484, + "learning_rate": 0.0004825502516487497, + "loss": 537.9487, + "step": 5160 + }, + { + "ce_loss_10": 3.5835310339927675, + "ce_loss_13": 3.523791456222534, + "ce_loss_2": 4.082003366947174, + "ce_loss_3": 3.908873450756073, + "ce_loss_7": 3.634874391555786, + "epoch": 0.517, + "grad_norm": 608.0, + "kl_loss_10": 99.05728721618652, + "kl_loss_2": 1155.0127502441405, + "kl_loss_3": 805.5277587890625, + "kl_loss_7": 198.6641098022461, + "learning_rate": 0.00048096464576530507, + "loss": 561.8511, + "step": 5170 + }, + { + "ce_loss_10": 3.6886157989501953, + "ce_loss_13": 3.628003740310669, + "ce_loss_2": 4.146280741691589, + "ce_loss_3": 3.9846285343170167, + "ce_loss_7": 3.731534945964813, + "epoch": 0.518, + "grad_norm": 390.0, + "kl_loss_10": 98.92878913879395, + "kl_loss_2": 1103.851336669922, + "kl_loss_3": 767.5214813232421, + "kl_loss_7": 193.13973236083984, + "learning_rate": 0.00047937923156766646, + "loss": 544.8563, + "step": 5180 + }, + { + "ce_loss_10": 3.737223446369171, + "ce_loss_13": 3.6758363366127016, + "ce_loss_2": 4.200218558311462, + "ce_loss_3": 4.037039196491241, + "ce_loss_7": 3.7829922437667847, + "epoch": 0.519, + "grad_norm": 428.0, + "kl_loss_10": 102.72743797302246, + "kl_loss_2": 1108.4752288818358, + "kl_loss_3": 772.8697265625, + "kl_loss_7": 198.5632797241211, + "learning_rate": 0.00047779402502093696, + "loss": 549.91, + "step": 5190 + }, + { + "ce_loss_10": 3.703013610839844, + "ce_loss_13": 3.640911114215851, + "ce_loss_2": 4.174945414066315, + "ce_loss_3": 4.009368169307709, + "ce_loss_7": 3.7497113823890684, + "epoch": 0.52, + "grad_norm": 478.0, + "kl_loss_10": 99.68995170593261, + "kl_loss_2": 1110.2117858886718, + "kl_loss_3": 777.3010894775391, + "kl_loss_7": 196.47792434692383, + "learning_rate": 0.0004762090420881289, + "loss": 553.7422, + "step": 5200 + }, + { + "ce_loss_10": 3.6182032585144044, + "ce_loss_13": 3.5570725202560425, + "ce_loss_2": 4.098654413223267, + "ce_loss_3": 3.916290044784546, + "ce_loss_7": 3.665347421169281, + "epoch": 0.521, + "grad_norm": 426.0, + "kl_loss_10": 98.28518867492676, + "kl_loss_2": 1126.3521606445313, + "kl_loss_3": 772.9946044921875, + "kl_loss_7": 193.74108428955077, + "learning_rate": 0.00047462429873000296, + "loss": 544.104, + "step": 5210 + }, + { + "ce_loss_10": 3.7033097624778746, + "ce_loss_13": 3.6430840730667113, + "ce_loss_2": 4.168367850780487, + "ce_loss_3": 3.9993362069129943, + "ce_loss_7": 3.74978985786438, + "epoch": 0.522, + "grad_norm": 412.0, + "kl_loss_10": 98.88156356811524, + "kl_loss_2": 1115.6398986816407, + "kl_loss_3": 774.026156616211, + "kl_loss_7": 195.32233123779298, + "learning_rate": 0.0004730398109049071, + "loss": 547.7821, + "step": 5220 + }, + { + "ce_loss_10": 3.633508253097534, + "ce_loss_13": 3.5716773152351378, + "ce_loss_2": 4.128389453887939, + "ce_loss_3": 3.9533074378967283, + "ce_loss_7": 3.6823344349861147, + "epoch": 0.523, + "grad_norm": 396.0, + "kl_loss_10": 98.93126792907715, + "kl_loss_2": 1163.846746826172, + "kl_loss_3": 810.2734771728516, + "kl_loss_7": 200.85460052490234, + "learning_rate": 0.000471455594568616, + "loss": 558.1328, + "step": 5230 + }, + { + "ce_loss_10": 3.707250881195068, + "ce_loss_13": 3.6447718501091004, + "ce_loss_2": 4.174321246147156, + "ce_loss_3": 4.004381275177002, + "ce_loss_7": 3.753636956214905, + "epoch": 0.524, + "grad_norm": 394.0, + "kl_loss_10": 100.72676544189453, + "kl_loss_2": 1114.3457427978515, + "kl_loss_3": 768.1556701660156, + "kl_loss_7": 195.28340759277344, + "learning_rate": 0.00046987166567417086, + "loss": 552.4388, + "step": 5240 + }, + { + "ce_loss_10": 3.6187984108924867, + "ce_loss_13": 3.5605034112930296, + "ce_loss_2": 4.1001020789146425, + "ce_loss_3": 3.9256922364234925, + "ce_loss_7": 3.664110267162323, + "epoch": 0.525, + "grad_norm": 380.0, + "kl_loss_10": 95.83710632324218, + "kl_loss_2": 1120.3159301757812, + "kl_loss_3": 775.0560852050781, + "kl_loss_7": 192.1679656982422, + "learning_rate": 0.00046828804017171776, + "loss": 536.3316, + "step": 5250 + }, + { + "ce_loss_10": 3.6720359563827514, + "ce_loss_13": 3.6088499784469605, + "ce_loss_2": 4.162907612323761, + "ce_loss_3": 3.9896105885505677, + "ce_loss_7": 3.722712779045105, + "epoch": 0.526, + "grad_norm": 394.0, + "kl_loss_10": 98.17714996337891, + "kl_loss_2": 1138.502374267578, + "kl_loss_3": 789.8116973876953, + "kl_loss_7": 197.40582656860352, + "learning_rate": 0.00046670473400834805, + "loss": 559.8189, + "step": 5260 + }, + { + "ce_loss_10": 3.597737526893616, + "ce_loss_13": 3.5393651485443116, + "ce_loss_2": 4.074982023239135, + "ce_loss_3": 3.9021154403686524, + "ce_loss_7": 3.644618010520935, + "epoch": 0.527, + "grad_norm": 436.0, + "kl_loss_10": 95.52880744934082, + "kl_loss_2": 1111.367953491211, + "kl_loss_3": 768.6636322021484, + "kl_loss_7": 191.67658157348632, + "learning_rate": 0.00046512176312793734, + "loss": 559.1187, + "step": 5270 + }, + { + "ce_loss_10": 3.5923956394195558, + "ce_loss_13": 3.5312567353248596, + "ce_loss_2": 4.0659032464027405, + "ce_loss_3": 3.9041757225990295, + "ce_loss_7": 3.638344919681549, + "epoch": 0.528, + "grad_norm": 382.0, + "kl_loss_10": 95.8816967010498, + "kl_loss_2": 1131.7323181152344, + "kl_loss_3": 788.8408813476562, + "kl_loss_7": 193.95931167602538, + "learning_rate": 0.00046353914347098467, + "loss": 557.7083, + "step": 5280 + }, + { + "ce_loss_10": 3.688094747066498, + "ce_loss_13": 3.626521134376526, + "ce_loss_2": 4.17344571352005, + "ce_loss_3": 3.9936492323875425, + "ce_loss_7": 3.7344411969184876, + "epoch": 0.529, + "grad_norm": 438.0, + "kl_loss_10": 99.97393112182617, + "kl_loss_2": 1136.7248291015626, + "kl_loss_3": 780.4773040771485, + "kl_loss_7": 194.1311233520508, + "learning_rate": 0.0004619568909744524, + "loss": 554.6544, + "step": 5290 + }, + { + "ce_loss_10": 3.6992242336273193, + "ce_loss_13": 3.6374841570854186, + "ce_loss_2": 4.173903214931488, + "ce_loss_3": 4.004681324958801, + "ce_loss_7": 3.7441007494926453, + "epoch": 0.53, + "grad_norm": 496.0, + "kl_loss_10": 100.66301612854004, + "kl_loss_2": 1118.1583740234375, + "kl_loss_3": 778.1599609375, + "kl_loss_7": 195.17978057861328, + "learning_rate": 0.00046037502157160573, + "loss": 555.7068, + "step": 5300 + }, + { + "ce_loss_10": 3.5648537158966063, + "ce_loss_13": 3.50801477432251, + "ce_loss_2": 4.0505608201026915, + "ce_loss_3": 3.885770845413208, + "ce_loss_7": 3.614854156970978, + "epoch": 0.531, + "grad_norm": 392.0, + "kl_loss_10": 95.29824142456054, + "kl_loss_2": 1148.0569580078125, + "kl_loss_3": 803.5360778808594, + "kl_loss_7": 195.23088302612305, + "learning_rate": 0.00045879355119185207, + "loss": 559.6594, + "step": 5310 + }, + { + "ce_loss_10": 3.6439425349235535, + "ce_loss_13": 3.583683359622955, + "ce_loss_2": 4.135701584815979, + "ce_loss_3": 3.9598298192024233, + "ce_loss_7": 3.692049765586853, + "epoch": 0.532, + "grad_norm": 444.0, + "kl_loss_10": 97.83190078735352, + "kl_loss_2": 1160.7438171386718, + "kl_loss_3": 807.647915649414, + "kl_loss_7": 199.49599685668946, + "learning_rate": 0.0004572124957605803, + "loss": 565.4321, + "step": 5320 + }, + { + "ce_loss_10": 3.6681848645210264, + "ce_loss_13": 3.607477676868439, + "ce_loss_2": 4.14128270149231, + "ce_loss_3": 3.9746485590934753, + "ce_loss_7": 3.7138744235038756, + "epoch": 0.533, + "grad_norm": 340.0, + "kl_loss_10": 95.41666564941406, + "kl_loss_2": 1136.1244140625, + "kl_loss_3": 793.3468963623047, + "kl_loss_7": 195.33221740722655, + "learning_rate": 0.00045563187119900103, + "loss": 550.4382, + "step": 5330 + }, + { + "ce_loss_10": 3.5087064266204835, + "ce_loss_13": 3.4494638442993164, + "ce_loss_2": 4.00373204946518, + "ce_loss_3": 3.8344790935516357, + "ce_loss_7": 3.5566913962364195, + "epoch": 0.534, + "grad_norm": 456.0, + "kl_loss_10": 96.30420112609863, + "kl_loss_2": 1145.2862731933594, + "kl_loss_3": 803.7556610107422, + "kl_loss_7": 194.92612915039064, + "learning_rate": 0.00045405169342398633, + "loss": 560.8537, + "step": 5340 + }, + { + "ce_loss_10": 3.5990882992744444, + "ce_loss_13": 3.535432243347168, + "ce_loss_2": 4.08842386007309, + "ce_loss_3": 3.912948155403137, + "ce_loss_7": 3.6465937376022337, + "epoch": 0.535, + "grad_norm": 422.0, + "kl_loss_10": 99.51773872375489, + "kl_loss_2": 1142.4013549804688, + "kl_loss_3": 795.5528442382813, + "kl_loss_7": 196.72316284179686, + "learning_rate": 0.0004524719783479088, + "loss": 548.8232, + "step": 5350 + }, + { + "ce_loss_10": 3.552276241779327, + "ce_loss_13": 3.492251825332642, + "ce_loss_2": 4.056445682048798, + "ce_loss_3": 3.8783608794212343, + "ce_loss_7": 3.603780543804169, + "epoch": 0.536, + "grad_norm": 376.0, + "kl_loss_10": 97.24302253723144, + "kl_loss_2": 1164.848809814453, + "kl_loss_3": 811.2062194824218, + "kl_loss_7": 198.37730560302734, + "learning_rate": 0.00045089274187848144, + "loss": 554.2202, + "step": 5360 + }, + { + "ce_loss_10": 3.6724863052368164, + "ce_loss_13": 3.6130531072616576, + "ce_loss_2": 4.1379453301429745, + "ce_loss_3": 3.968498194217682, + "ce_loss_7": 3.717296040058136, + "epoch": 0.537, + "grad_norm": 536.0, + "kl_loss_10": 96.28798866271973, + "kl_loss_2": 1108.8939270019532, + "kl_loss_3": 770.5279510498046, + "kl_loss_7": 192.69188079833984, + "learning_rate": 0.00044931399991859835, + "loss": 545.4216, + "step": 5370 + }, + { + "ce_loss_10": 3.5360588788986207, + "ce_loss_13": 3.474487328529358, + "ce_loss_2": 4.018628227710724, + "ce_loss_3": 3.8429470539093016, + "ce_loss_7": 3.5856809496879576, + "epoch": 0.538, + "grad_norm": 446.0, + "kl_loss_10": 97.58423805236816, + "kl_loss_2": 1139.092123413086, + "kl_loss_3": 788.7141876220703, + "kl_loss_7": 196.66349868774415, + "learning_rate": 0.00044773576836617336, + "loss": 546.6951, + "step": 5380 + }, + { + "ce_loss_10": 3.6238678693771362, + "ce_loss_13": 3.5626631021499633, + "ce_loss_2": 4.120850419998169, + "ce_loss_3": 3.943516790866852, + "ce_loss_7": 3.6712807416915894, + "epoch": 0.539, + "grad_norm": 388.0, + "kl_loss_10": 99.70593795776367, + "kl_loss_2": 1163.2907775878907, + "kl_loss_3": 810.1283508300781, + "kl_loss_7": 199.7040023803711, + "learning_rate": 0.00044615806311398056, + "loss": 569.078, + "step": 5390 + }, + { + "ce_loss_10": 3.706363093852997, + "ce_loss_13": 3.6457801342010496, + "ce_loss_2": 4.146688032150268, + "ce_loss_3": 3.9897242546081544, + "ce_loss_7": 3.7506498098373413, + "epoch": 0.54, + "grad_norm": 318.0, + "kl_loss_10": 98.56370239257812, + "kl_loss_2": 1084.4558197021483, + "kl_loss_3": 756.0719848632813, + "kl_loss_7": 191.6246208190918, + "learning_rate": 0.00044458090004949454, + "loss": 551.6847, + "step": 5400 + }, + { + "ce_loss_10": 3.5594072341918945, + "ce_loss_13": 3.4980836510658264, + "ce_loss_2": 4.072906112670898, + "ce_loss_3": 3.8963231086730956, + "ce_loss_7": 3.6096426606178285, + "epoch": 0.541, + "grad_norm": 490.0, + "kl_loss_10": 98.93370399475097, + "kl_loss_2": 1204.406317138672, + "kl_loss_3": 841.533901977539, + "kl_loss_7": 202.28990631103517, + "learning_rate": 0.0004430042950547297, + "loss": 563.3182, + "step": 5410 + }, + { + "ce_loss_10": 3.656948244571686, + "ce_loss_13": 3.5917163252830506, + "ce_loss_2": 4.146977603435516, + "ce_loss_3": 3.9775506377220156, + "ce_loss_7": 3.7048157334327696, + "epoch": 0.542, + "grad_norm": 472.0, + "kl_loss_10": 100.26595115661621, + "kl_loss_2": 1150.8060424804687, + "kl_loss_3": 803.8866760253907, + "kl_loss_7": 200.08724365234374, + "learning_rate": 0.0004414282640060809, + "loss": 559.1381, + "step": 5420 + }, + { + "ce_loss_10": 3.7556936740875244, + "ce_loss_13": 3.690820097923279, + "ce_loss_2": 4.2162927985191345, + "ce_loss_3": 4.059760391712189, + "ce_loss_7": 3.7993207812309264, + "epoch": 0.543, + "grad_norm": 466.0, + "kl_loss_10": 100.5603858947754, + "kl_loss_2": 1102.3566284179688, + "kl_loss_3": 774.5157104492188, + "kl_loss_7": 196.8573425292969, + "learning_rate": 0.0004398528227741633, + "loss": 566.5525, + "step": 5430 + }, + { + "ce_loss_10": 3.6126871943473815, + "ce_loss_13": 3.553126609325409, + "ce_loss_2": 4.1005645275115965, + "ce_loss_3": 3.9280160546302794, + "ce_loss_7": 3.660943078994751, + "epoch": 0.544, + "grad_norm": 458.0, + "kl_loss_10": 97.1538932800293, + "kl_loss_2": 1131.997964477539, + "kl_loss_3": 791.6496276855469, + "kl_loss_7": 198.33607559204103, + "learning_rate": 0.00043827798722365264, + "loss": 560.7217, + "step": 5440 + }, + { + "ce_loss_10": 3.744398605823517, + "ce_loss_13": 3.681015205383301, + "ce_loss_2": 4.201505517959594, + "ce_loss_3": 4.03549770116806, + "ce_loss_7": 3.788591706752777, + "epoch": 0.545, + "grad_norm": 352.0, + "kl_loss_10": 99.98037643432617, + "kl_loss_2": 1095.4162628173829, + "kl_loss_3": 762.5562530517578, + "kl_loss_7": 196.50249557495118, + "learning_rate": 0.00043670377321312535, + "loss": 539.1079, + "step": 5450 + }, + { + "ce_loss_10": 3.7459957599639893, + "ce_loss_13": 3.6846879959106444, + "ce_loss_2": 4.2025530457496645, + "ce_loss_3": 4.042814528942108, + "ce_loss_7": 3.789257228374481, + "epoch": 0.546, + "grad_norm": 346.0, + "kl_loss_10": 99.90774993896484, + "kl_loss_2": 1095.3400299072266, + "kl_loss_3": 761.9524017333985, + "kl_loss_7": 193.25130310058594, + "learning_rate": 0.0004351301965948991, + "loss": 550.9912, + "step": 5460 + }, + { + "ce_loss_10": 3.6544747233390806, + "ce_loss_13": 3.5925102829933167, + "ce_loss_2": 4.1156612753868105, + "ce_loss_3": 3.9492591619491577, + "ce_loss_7": 3.700915348529816, + "epoch": 0.547, + "grad_norm": 446.0, + "kl_loss_10": 99.69101219177246, + "kl_loss_2": 1097.9489288330078, + "kl_loss_3": 763.9795166015625, + "kl_loss_7": 193.2705093383789, + "learning_rate": 0.000433557273214873, + "loss": 548.6603, + "step": 5470 + }, + { + "ce_loss_10": 3.6407829880714417, + "ce_loss_13": 3.58055636882782, + "ce_loss_2": 4.112010169029236, + "ce_loss_3": 3.9410730838775634, + "ce_loss_7": 3.6900732636451723, + "epoch": 0.548, + "grad_norm": 364.0, + "kl_loss_10": 96.30272674560547, + "kl_loss_2": 1104.9110717773438, + "kl_loss_3": 764.0930358886719, + "kl_loss_7": 193.28277206420898, + "learning_rate": 0.000431985018912368, + "loss": 539.9292, + "step": 5480 + }, + { + "ce_loss_10": 3.6089709639549254, + "ce_loss_13": 3.5466750621795655, + "ce_loss_2": 4.105600357055664, + "ce_loss_3": 3.9258901715278625, + "ce_loss_7": 3.658845567703247, + "epoch": 0.549, + "grad_norm": 428.0, + "kl_loss_10": 98.85242919921875, + "kl_loss_2": 1163.1305419921875, + "kl_loss_3": 809.7076019287109, + "kl_loss_7": 198.85261154174805, + "learning_rate": 0.0004304134495199674, + "loss": 550.7034, + "step": 5490 + }, + { + "ce_loss_10": 3.638536274433136, + "ce_loss_13": 3.575793814659119, + "ce_loss_2": 4.123488712310791, + "ce_loss_3": 3.954343330860138, + "ce_loss_7": 3.685023546218872, + "epoch": 0.55, + "grad_norm": 488.0, + "kl_loss_10": 99.10371284484863, + "kl_loss_2": 1163.9283081054687, + "kl_loss_3": 806.7497436523438, + "kl_loss_7": 200.15425338745118, + "learning_rate": 0.0004288425808633575, + "loss": 555.8719, + "step": 5500 + }, + { + "ce_loss_10": 3.6068961024284363, + "ce_loss_13": 3.5489359140396117, + "ce_loss_2": 4.091926336288452, + "ce_loss_3": 3.914480412006378, + "ce_loss_7": 3.653805840015411, + "epoch": 0.551, + "grad_norm": 482.0, + "kl_loss_10": 95.30807762145996, + "kl_loss_2": 1135.6305114746094, + "kl_loss_3": 782.8162139892578, + "kl_loss_7": 192.36727905273438, + "learning_rate": 0.0004272724287611684, + "loss": 551.1164, + "step": 5510 + }, + { + "ce_loss_10": 3.5843793511390687, + "ce_loss_13": 3.5220483541488647, + "ce_loss_2": 4.066782796382904, + "ce_loss_3": 3.8880024194717406, + "ce_loss_7": 3.628884470462799, + "epoch": 0.552, + "grad_norm": 472.0, + "kl_loss_10": 98.19914245605469, + "kl_loss_2": 1138.4930938720704, + "kl_loss_3": 792.6924499511719, + "kl_loss_7": 197.34004135131835, + "learning_rate": 0.00042570300902481425, + "loss": 550.9366, + "step": 5520 + }, + { + "ce_loss_10": 3.6187870144844054, + "ce_loss_13": 3.559086096286774, + "ce_loss_2": 4.0836735486984255, + "ce_loss_3": 3.913509225845337, + "ce_loss_7": 3.662268269062042, + "epoch": 0.553, + "grad_norm": 460.0, + "kl_loss_10": 96.8458236694336, + "kl_loss_2": 1113.29208984375, + "kl_loss_3": 778.9167602539062, + "kl_loss_7": 192.73130722045897, + "learning_rate": 0.00042413433745833423, + "loss": 545.5068, + "step": 5530 + }, + { + "ce_loss_10": 3.6217783451080323, + "ce_loss_13": 3.5588382482528687, + "ce_loss_2": 4.102611029148102, + "ce_loss_3": 3.9288668751716616, + "ce_loss_7": 3.667692792415619, + "epoch": 0.554, + "grad_norm": 394.0, + "kl_loss_10": 99.64076881408691, + "kl_loss_2": 1129.861962890625, + "kl_loss_3": 781.159780883789, + "kl_loss_7": 194.5426254272461, + "learning_rate": 0.0004225664298582339, + "loss": 538.3319, + "step": 5540 + }, + { + "ce_loss_10": 3.7008472084999084, + "ce_loss_13": 3.6404882073402405, + "ce_loss_2": 4.157876873016358, + "ce_loss_3": 3.9944301009178163, + "ce_loss_7": 3.7464569926261904, + "epoch": 0.555, + "grad_norm": 352.0, + "kl_loss_10": 98.0084358215332, + "kl_loss_2": 1092.2807312011719, + "kl_loss_3": 758.9974426269531, + "kl_loss_7": 191.41172409057617, + "learning_rate": 0.000420999302013325, + "loss": 539.2247, + "step": 5550 + }, + { + "ce_loss_10": 3.5973586678504943, + "ce_loss_13": 3.534582734107971, + "ce_loss_2": 4.09981359243393, + "ce_loss_3": 3.9165178179740905, + "ce_loss_7": 3.6474678754806518, + "epoch": 0.556, + "grad_norm": 454.0, + "kl_loss_10": 99.95339088439941, + "kl_loss_2": 1148.3679443359374, + "kl_loss_3": 795.4782531738281, + "kl_loss_7": 199.34042739868164, + "learning_rate": 0.000419432969704568, + "loss": 547.6515, + "step": 5560 + }, + { + "ce_loss_10": 3.6402106523513793, + "ce_loss_13": 3.580482280254364, + "ce_loss_2": 4.112204611301422, + "ce_loss_3": 3.9463653802871703, + "ce_loss_7": 3.6864510416984557, + "epoch": 0.557, + "grad_norm": 374.0, + "kl_loss_10": 97.21049270629882, + "kl_loss_2": 1103.2306396484375, + "kl_loss_3": 765.6696472167969, + "kl_loss_7": 192.21127700805664, + "learning_rate": 0.00041786744870491154, + "loss": 552.003, + "step": 5570 + }, + { + "ce_loss_10": 3.5763687014579775, + "ce_loss_13": 3.513793337345123, + "ce_loss_2": 4.059341847896576, + "ce_loss_3": 3.8873541951179504, + "ce_loss_7": 3.6242376923561097, + "epoch": 0.558, + "grad_norm": 496.0, + "kl_loss_10": 99.6470874786377, + "kl_loss_2": 1146.4394836425781, + "kl_loss_3": 799.3714019775391, + "kl_loss_7": 198.99811019897462, + "learning_rate": 0.0004163027547791347, + "loss": 555.3918, + "step": 5580 + }, + { + "ce_loss_10": 3.550457501411438, + "ce_loss_13": 3.490234684944153, + "ce_loss_2": 4.058210396766663, + "ce_loss_3": 3.8777605056762696, + "ce_loss_7": 3.5981253504753115, + "epoch": 0.559, + "grad_norm": 362.0, + "kl_loss_10": 96.0154800415039, + "kl_loss_2": 1166.6077453613282, + "kl_loss_3": 807.5666870117187, + "kl_loss_7": 196.15278396606445, + "learning_rate": 0.0004147389036836881, + "loss": 556.2604, + "step": 5590 + }, + { + "ce_loss_10": 3.606854057312012, + "ce_loss_13": 3.545392167568207, + "ce_loss_2": 4.097903311252594, + "ce_loss_3": 3.924593436717987, + "ce_loss_7": 3.652910280227661, + "epoch": 0.56, + "grad_norm": 580.0, + "kl_loss_10": 99.4388584136963, + "kl_loss_2": 1150.4553649902343, + "kl_loss_3": 802.4499359130859, + "kl_loss_7": 196.6334327697754, + "learning_rate": 0.00041317591116653486, + "loss": 563.6437, + "step": 5600 + }, + { + "ce_loss_10": 3.6449447154998778, + "ce_loss_13": 3.5830300569534304, + "ce_loss_2": 4.1296777606010435, + "ce_loss_3": 3.9580175995826723, + "ce_loss_7": 3.695024287700653, + "epoch": 0.561, + "grad_norm": 528.0, + "kl_loss_10": 100.15715980529785, + "kl_loss_2": 1137.3770324707032, + "kl_loss_3": 786.0429443359375, + "kl_loss_7": 199.2029815673828, + "learning_rate": 0.0004116137929669921, + "loss": 545.8336, + "step": 5610 + }, + { + "ce_loss_10": 3.6345237135887145, + "ce_loss_13": 3.575796604156494, + "ce_loss_2": 4.1131403088569645, + "ce_loss_3": 3.940467345714569, + "ce_loss_7": 3.6807628154754637, + "epoch": 0.562, + "grad_norm": 388.0, + "kl_loss_10": 95.75808372497559, + "kl_loss_2": 1128.9722564697265, + "kl_loss_3": 784.2019927978515, + "kl_loss_7": 193.04863052368165, + "learning_rate": 0.00041005256481557305, + "loss": 543.754, + "step": 5620 + }, + { + "ce_loss_10": 3.7401763558387757, + "ce_loss_13": 3.6805962681770326, + "ce_loss_2": 4.185898721218109, + "ce_loss_3": 4.027551281452179, + "ce_loss_7": 3.783228611946106, + "epoch": 0.563, + "grad_norm": 516.0, + "kl_loss_10": 96.21339073181153, + "kl_loss_2": 1061.5840301513672, + "kl_loss_3": 738.3266693115235, + "kl_loss_7": 187.24717712402344, + "learning_rate": 0.00040849224243382767, + "loss": 533.9922, + "step": 5630 + }, + { + "ce_loss_10": 3.5920221328735353, + "ce_loss_13": 3.5324007272720337, + "ce_loss_2": 4.072316908836365, + "ce_loss_3": 3.8983843684196473, + "ce_loss_7": 3.6374821186065676, + "epoch": 0.564, + "grad_norm": 338.0, + "kl_loss_10": 95.43405532836914, + "kl_loss_2": 1128.149676513672, + "kl_loss_3": 783.0245666503906, + "kl_loss_7": 193.40655746459962, + "learning_rate": 0.000406932841534185, + "loss": 541.5961, + "step": 5640 + }, + { + "ce_loss_10": 3.5484704256057737, + "ce_loss_13": 3.486864137649536, + "ce_loss_2": 4.036842632293701, + "ce_loss_3": 3.8651990056037904, + "ce_loss_7": 3.597266983985901, + "epoch": 0.565, + "grad_norm": 604.0, + "kl_loss_10": 95.5288932800293, + "kl_loss_2": 1141.9300598144532, + "kl_loss_3": 797.4877136230468, + "kl_loss_7": 194.9025909423828, + "learning_rate": 0.0004053743778197951, + "loss": 559.9006, + "step": 5650 + }, + { + "ce_loss_10": 3.6602503299713134, + "ce_loss_13": 3.596804344654083, + "ce_loss_2": 4.136269843578338, + "ce_loss_3": 3.967122423648834, + "ce_loss_7": 3.7048738479614256, + "epoch": 0.566, + "grad_norm": 418.0, + "kl_loss_10": 101.36623306274414, + "kl_loss_2": 1114.9331634521484, + "kl_loss_3": 774.6735748291015, + "kl_loss_7": 196.29364929199218, + "learning_rate": 0.0004038168669843697, + "loss": 553.1191, + "step": 5660 + }, + { + "ce_loss_10": 3.6255574107170103, + "ce_loss_13": 3.5639535069465635, + "ce_loss_2": 4.085369718074799, + "ce_loss_3": 3.919695568084717, + "ce_loss_7": 3.6704380750656127, + "epoch": 0.567, + "grad_norm": 736.0, + "kl_loss_10": 98.19256973266602, + "kl_loss_2": 1100.560809326172, + "kl_loss_3": 765.2613342285156, + "kl_loss_7": 192.50554656982422, + "learning_rate": 0.000402260324712026, + "loss": 547.8535, + "step": 5670 + }, + { + "ce_loss_10": 3.669494354724884, + "ce_loss_13": 3.60741925239563, + "ce_loss_2": 4.148016309738159, + "ce_loss_3": 3.9792763590812683, + "ce_loss_7": 3.7149499893188476, + "epoch": 0.568, + "grad_norm": 498.0, + "kl_loss_10": 99.65063438415527, + "kl_loss_2": 1126.1991058349608, + "kl_loss_3": 783.0127746582032, + "kl_loss_7": 194.12312698364258, + "learning_rate": 0.00040070476667712743, + "loss": 543.5942, + "step": 5680 + }, + { + "ce_loss_10": 3.7005011796951295, + "ce_loss_13": 3.6357283353805543, + "ce_loss_2": 4.166275656223297, + "ce_loss_3": 4.000016844272613, + "ce_loss_7": 3.745167064666748, + "epoch": 0.569, + "grad_norm": 356.0, + "kl_loss_10": 100.85004692077636, + "kl_loss_2": 1110.8271209716797, + "kl_loss_3": 770.1773681640625, + "kl_loss_7": 194.5637939453125, + "learning_rate": 0.0003991502085441259, + "loss": 548.6594, + "step": 5690 + }, + { + "ce_loss_10": 3.729709804058075, + "ce_loss_13": 3.6688971519470215, + "ce_loss_2": 4.18160834312439, + "ce_loss_3": 4.015434455871582, + "ce_loss_7": 3.7735623002052305, + "epoch": 0.57, + "grad_norm": 374.0, + "kl_loss_10": 98.11349868774414, + "kl_loss_2": 1070.740576171875, + "kl_loss_3": 744.3529327392578, + "kl_loss_7": 190.02555770874022, + "learning_rate": 0.0003975966659674047, + "loss": 541.8046, + "step": 5700 + }, + { + "ce_loss_10": 3.691783332824707, + "ce_loss_13": 3.6318029403686523, + "ce_loss_2": 4.161513650417328, + "ce_loss_3": 3.986249303817749, + "ce_loss_7": 3.73646023273468, + "epoch": 0.571, + "grad_norm": 536.0, + "kl_loss_10": 98.58754501342773, + "kl_loss_2": 1102.3078491210938, + "kl_loss_3": 759.7488586425782, + "kl_loss_7": 191.9359992980957, + "learning_rate": 0.0003960441545911204, + "loss": 538.4236, + "step": 5710 + }, + { + "ce_loss_10": 3.6897791981697083, + "ce_loss_13": 3.6274471282958984, + "ce_loss_2": 4.157203590869903, + "ce_loss_3": 3.9884847044944762, + "ce_loss_7": 3.736597418785095, + "epoch": 0.572, + "grad_norm": 600.0, + "kl_loss_10": 97.47168769836426, + "kl_loss_2": 1115.5811676025392, + "kl_loss_3": 773.7587646484375, + "kl_loss_7": 193.96655197143554, + "learning_rate": 0.0003944926900490452, + "loss": 541.7897, + "step": 5720 + }, + { + "ce_loss_10": 3.6022287607192993, + "ce_loss_13": 3.541483187675476, + "ce_loss_2": 4.094926071166992, + "ce_loss_3": 3.9194396138191223, + "ce_loss_7": 3.65008407831192, + "epoch": 0.573, + "grad_norm": 352.0, + "kl_loss_10": 96.51857452392578, + "kl_loss_2": 1147.3706939697265, + "kl_loss_3": 794.91083984375, + "kl_loss_7": 194.98720092773436, + "learning_rate": 0.0003929422879644099, + "loss": 544.8611, + "step": 5730 + }, + { + "ce_loss_10": 3.6093438267707825, + "ce_loss_13": 3.5497053503990172, + "ce_loss_2": 4.068271553516388, + "ce_loss_3": 3.9011916518211365, + "ce_loss_7": 3.6547187089920046, + "epoch": 0.574, + "grad_norm": 426.0, + "kl_loss_10": 95.6807746887207, + "kl_loss_2": 1107.6688201904296, + "kl_loss_3": 763.7292449951171, + "kl_loss_7": 189.65744247436524, + "learning_rate": 0.0003913929639497462, + "loss": 535.444, + "step": 5740 + }, + { + "ce_loss_10": 3.5539973855018614, + "ce_loss_13": 3.4933292627334596, + "ce_loss_2": 4.044394338130951, + "ce_loss_3": 3.8677351474761963, + "ce_loss_7": 3.6000022888183594, + "epoch": 0.575, + "grad_norm": 408.0, + "kl_loss_10": 95.82653579711913, + "kl_loss_2": 1130.1885803222656, + "kl_loss_3": 778.0026184082031, + "kl_loss_7": 190.79474563598632, + "learning_rate": 0.00038984473360672965, + "loss": 541.1631, + "step": 5750 + }, + { + "ce_loss_10": 3.5721747159957884, + "ce_loss_13": 3.5100734710693358, + "ce_loss_2": 4.053931272029876, + "ce_loss_3": 3.883261811733246, + "ce_loss_7": 3.6166505217552185, + "epoch": 0.576, + "grad_norm": 436.0, + "kl_loss_10": 95.3091812133789, + "kl_loss_2": 1128.7456329345703, + "kl_loss_3": 780.4191925048829, + "kl_loss_7": 190.4754554748535, + "learning_rate": 0.0003882976125260229, + "loss": 539.7566, + "step": 5760 + }, + { + "ce_loss_10": 3.638679492473602, + "ce_loss_13": 3.5770092844963073, + "ce_loss_2": 4.1140677571296695, + "ce_loss_3": 3.9416022896766663, + "ce_loss_7": 3.6866235971450805, + "epoch": 0.577, + "grad_norm": 366.0, + "kl_loss_10": 98.93351516723632, + "kl_loss_2": 1112.5931701660156, + "kl_loss_3": 770.6242248535157, + "kl_loss_7": 191.9038848876953, + "learning_rate": 0.00038675161628711776, + "loss": 545.2976, + "step": 5770 + }, + { + "ce_loss_10": 3.678569030761719, + "ce_loss_13": 3.616915798187256, + "ce_loss_2": 4.1388965249061584, + "ce_loss_3": 3.9749330997467043, + "ce_loss_7": 3.722931241989136, + "epoch": 0.578, + "grad_norm": 404.0, + "kl_loss_10": 97.5284637451172, + "kl_loss_2": 1093.5021606445312, + "kl_loss_3": 761.3094451904296, + "kl_loss_7": 191.26370391845703, + "learning_rate": 0.0003852067604581794, + "loss": 553.459, + "step": 5780 + }, + { + "ce_loss_10": 3.6174680829048156, + "ce_loss_13": 3.5550846695899962, + "ce_loss_2": 4.100849425792694, + "ce_loss_3": 3.927929162979126, + "ce_loss_7": 3.665549111366272, + "epoch": 0.579, + "grad_norm": 502.0, + "kl_loss_10": 97.5420696258545, + "kl_loss_2": 1125.1912048339843, + "kl_loss_3": 782.9702056884765, + "kl_loss_7": 193.16246643066407, + "learning_rate": 0.0003836630605958888, + "loss": 543.639, + "step": 5790 + }, + { + "ce_loss_10": 3.6780447602272033, + "ce_loss_13": 3.616100025177002, + "ce_loss_2": 4.136243522167206, + "ce_loss_3": 3.9725910425186157, + "ce_loss_7": 3.7234076499938964, + "epoch": 0.58, + "grad_norm": 506.0, + "kl_loss_10": 99.15894927978516, + "kl_loss_2": 1117.2952941894532, + "kl_loss_3": 777.6545166015625, + "kl_loss_7": 194.16991271972657, + "learning_rate": 0.0003821205322452863, + "loss": 560.4495, + "step": 5800 + }, + { + "ce_loss_10": 3.657036304473877, + "ce_loss_13": 3.5961548686027527, + "ce_loss_2": 4.118453872203827, + "ce_loss_3": 3.948525774478912, + "ce_loss_7": 3.7012171149253845, + "epoch": 0.581, + "grad_norm": 438.0, + "kl_loss_10": 98.11412734985352, + "kl_loss_2": 1098.6213439941407, + "kl_loss_3": 759.3198364257812, + "kl_loss_7": 189.98369064331055, + "learning_rate": 0.0003805791909396155, + "loss": 541.5742, + "step": 5810 + }, + { + "ce_loss_10": 3.6096495151519776, + "ce_loss_13": 3.550210452079773, + "ce_loss_2": 4.077665090560913, + "ce_loss_3": 3.9094552993774414, + "ce_loss_7": 3.654946839809418, + "epoch": 0.582, + "grad_norm": 428.0, + "kl_loss_10": 95.98116798400879, + "kl_loss_2": 1109.6123931884765, + "kl_loss_3": 763.3366668701171, + "kl_loss_7": 189.48765182495117, + "learning_rate": 0.0003790390522001662, + "loss": 547.1139, + "step": 5820 + }, + { + "ce_loss_10": 3.538465416431427, + "ce_loss_13": 3.4795125126838684, + "ce_loss_2": 4.019526553153992, + "ce_loss_3": 3.8418781757354736, + "ce_loss_7": 3.5831465244293215, + "epoch": 0.583, + "grad_norm": 354.0, + "kl_loss_10": 94.34587249755859, + "kl_loss_2": 1136.918035888672, + "kl_loss_3": 784.7109252929688, + "kl_loss_7": 191.27632827758788, + "learning_rate": 0.0003775001315361183, + "loss": 542.445, + "step": 5830 + }, + { + "ce_loss_10": 3.659132921695709, + "ce_loss_13": 3.596101534366608, + "ce_loss_2": 4.132727253437042, + "ce_loss_3": 3.958163845539093, + "ce_loss_7": 3.704639720916748, + "epoch": 0.584, + "grad_norm": 298.0, + "kl_loss_10": 98.75731201171875, + "kl_loss_2": 1122.0884033203124, + "kl_loss_3": 776.4772644042969, + "kl_loss_7": 193.22739944458007, + "learning_rate": 0.0003759624444443858, + "loss": 544.9992, + "step": 5840 + }, + { + "ce_loss_10": 3.6889251112937926, + "ce_loss_13": 3.6282206773757935, + "ce_loss_2": 4.151758980751038, + "ce_loss_3": 3.9822983741760254, + "ce_loss_7": 3.732993245124817, + "epoch": 0.585, + "grad_norm": 346.0, + "kl_loss_10": 99.06045837402344, + "kl_loss_2": 1097.8614471435546, + "kl_loss_3": 758.9134582519531, + "kl_loss_7": 191.27917098999023, + "learning_rate": 0.00037442600640946044, + "loss": 536.17, + "step": 5850 + }, + { + "ce_loss_10": 3.6461440443992617, + "ce_loss_13": 3.5892478227615356, + "ce_loss_2": 4.105236732959748, + "ce_loss_3": 3.9375507473945617, + "ce_loss_7": 3.692450475692749, + "epoch": 0.586, + "grad_norm": 408.0, + "kl_loss_10": 94.86803092956544, + "kl_loss_2": 1099.2377655029297, + "kl_loss_3": 758.3301605224609, + "kl_loss_7": 189.78098831176757, + "learning_rate": 0.00037289083290325663, + "loss": 531.0057, + "step": 5860 + }, + { + "ce_loss_10": 3.63515100479126, + "ce_loss_13": 3.574202799797058, + "ce_loss_2": 4.095511162281037, + "ce_loss_3": 3.930715727806091, + "ce_loss_7": 3.6794507265090943, + "epoch": 0.587, + "grad_norm": 540.0, + "kl_loss_10": 97.98805313110351, + "kl_loss_2": 1091.7025299072266, + "kl_loss_3": 757.6223114013671, + "kl_loss_7": 191.85128860473634, + "learning_rate": 0.0003713569393849543, + "loss": 533.4333, + "step": 5870 + }, + { + "ce_loss_10": 3.6827593207359315, + "ce_loss_13": 3.6205956816673277, + "ce_loss_2": 4.148468089103699, + "ce_loss_3": 3.978341579437256, + "ce_loss_7": 3.7273068189620973, + "epoch": 0.588, + "grad_norm": 398.0, + "kl_loss_10": 98.60938911437988, + "kl_loss_2": 1107.6281311035157, + "kl_loss_3": 765.0102233886719, + "kl_loss_7": 192.96542663574218, + "learning_rate": 0.00036982434130084397, + "loss": 541.5767, + "step": 5880 + }, + { + "ce_loss_10": 3.589915359020233, + "ce_loss_13": 3.5286367654800417, + "ce_loss_2": 4.061057722568512, + "ce_loss_3": 3.8881011605262756, + "ce_loss_7": 3.6373565912246706, + "epoch": 0.589, + "grad_norm": 506.0, + "kl_loss_10": 97.51137619018554, + "kl_loss_2": 1115.5977966308594, + "kl_loss_3": 775.6395446777344, + "kl_loss_7": 195.47111892700195, + "learning_rate": 0.00036829305408417166, + "loss": 546.8446, + "step": 5890 + }, + { + "ce_loss_10": 3.5797411799430847, + "ce_loss_13": 3.5188158631324766, + "ce_loss_2": 4.067822527885437, + "ce_loss_3": 3.893584966659546, + "ce_loss_7": 3.6291656494140625, + "epoch": 0.59, + "grad_norm": 364.0, + "kl_loss_10": 96.57020835876465, + "kl_loss_2": 1141.290579223633, + "kl_loss_3": 789.6200988769531, + "kl_loss_7": 196.76195220947267, + "learning_rate": 0.0003667630931549826, + "loss": 548.8211, + "step": 5900 + }, + { + "ce_loss_10": 3.547331213951111, + "ce_loss_13": 3.4874081373214723, + "ce_loss_2": 4.03765162229538, + "ce_loss_3": 3.8655640482902527, + "ce_loss_7": 3.5946906566619874, + "epoch": 0.591, + "grad_norm": 454.0, + "kl_loss_10": 95.69526252746581, + "kl_loss_2": 1154.8450256347655, + "kl_loss_3": 798.5165588378907, + "kl_loss_7": 194.5025749206543, + "learning_rate": 0.00036523447391996613, + "loss": 552.8163, + "step": 5910 + }, + { + "ce_loss_10": 3.6425758361816407, + "ce_loss_13": 3.5853498816490172, + "ce_loss_2": 4.10631023645401, + "ce_loss_3": 3.9402198076248167, + "ce_loss_7": 3.690027916431427, + "epoch": 0.592, + "grad_norm": 432.0, + "kl_loss_10": 94.87303581237794, + "kl_loss_2": 1090.1558319091796, + "kl_loss_3": 756.7847717285156, + "kl_loss_7": 189.84710311889648, + "learning_rate": 0.00036370721177230114, + "loss": 533.6673, + "step": 5920 + }, + { + "ce_loss_10": 3.635672652721405, + "ce_loss_13": 3.577661764621735, + "ce_loss_2": 4.114610862731934, + "ce_loss_3": 3.9419226169586183, + "ce_loss_7": 3.681511878967285, + "epoch": 0.593, + "grad_norm": 326.0, + "kl_loss_10": 95.39519729614258, + "kl_loss_2": 1127.0120056152343, + "kl_loss_3": 780.4901336669922, + "kl_loss_7": 194.04692993164062, + "learning_rate": 0.00036218132209150044, + "loss": 545.1962, + "step": 5930 + }, + { + "ce_loss_10": 3.593142592906952, + "ce_loss_13": 3.530347979068756, + "ce_loss_2": 4.095171976089477, + "ce_loss_3": 3.920231354236603, + "ce_loss_7": 3.645453596115112, + "epoch": 0.594, + "grad_norm": 378.0, + "kl_loss_10": 99.63440895080566, + "kl_loss_2": 1173.4297882080077, + "kl_loss_3": 813.8213714599609, + "kl_loss_7": 199.65494766235352, + "learning_rate": 0.0003606568202432562, + "loss": 557.0208, + "step": 5940 + }, + { + "ce_loss_10": 3.665185475349426, + "ce_loss_13": 3.6032612800598143, + "ce_loss_2": 4.14498724937439, + "ce_loss_3": 3.9701961159706114, + "ce_loss_7": 3.7108847856521607, + "epoch": 0.595, + "grad_norm": 528.0, + "kl_loss_10": 99.43977394104004, + "kl_loss_2": 1140.6280212402344, + "kl_loss_3": 787.1899200439453, + "kl_loss_7": 195.35167922973633, + "learning_rate": 0.0003591337215792851, + "loss": 544.2271, + "step": 5950 + }, + { + "ce_loss_10": 3.706349265575409, + "ce_loss_13": 3.64465993642807, + "ce_loss_2": 4.152172029018402, + "ce_loss_3": 3.9943688988685606, + "ce_loss_7": 3.7489245533943176, + "epoch": 0.596, + "grad_norm": 356.0, + "kl_loss_10": 99.39506378173829, + "kl_loss_2": 1087.233724975586, + "kl_loss_3": 759.1374755859375, + "kl_loss_7": 190.80716857910156, + "learning_rate": 0.00035761204143717383, + "loss": 544.3471, + "step": 5960 + }, + { + "ce_loss_10": 3.6578794836997988, + "ce_loss_13": 3.5957969784736634, + "ce_loss_2": 4.119996964931488, + "ce_loss_3": 3.9552765846252442, + "ce_loss_7": 3.7025834202766417, + "epoch": 0.597, + "grad_norm": 400.0, + "kl_loss_10": 99.01246032714843, + "kl_loss_2": 1115.1319488525392, + "kl_loss_3": 774.3078552246094, + "kl_loss_7": 193.01641845703125, + "learning_rate": 0.0003560917951402245, + "loss": 556.3752, + "step": 5970 + }, + { + "ce_loss_10": 3.632036602497101, + "ce_loss_13": 3.5740628480911254, + "ce_loss_2": 4.0921210765838625, + "ce_loss_3": 3.9307610511779787, + "ce_loss_7": 3.6746655702590942, + "epoch": 0.598, + "grad_norm": 412.0, + "kl_loss_10": 95.97110137939453, + "kl_loss_2": 1101.7569305419922, + "kl_loss_3": 768.7692047119141, + "kl_loss_7": 189.95830230712892, + "learning_rate": 0.00035457299799730046, + "loss": 538.1885, + "step": 5980 + }, + { + "ce_loss_10": 3.69617702960968, + "ce_loss_13": 3.6354240775108337, + "ce_loss_2": 4.163921213150024, + "ce_loss_3": 3.993851900100708, + "ce_loss_7": 3.7415480971336366, + "epoch": 0.599, + "grad_norm": 388.0, + "kl_loss_10": 96.27426452636719, + "kl_loss_2": 1105.9306549072267, + "kl_loss_3": 762.228305053711, + "kl_loss_7": 190.51752395629882, + "learning_rate": 0.0003530556653026721, + "loss": 545.8183, + "step": 5990 + }, + { + "ce_loss_10": 3.611501228809357, + "ce_loss_13": 3.5530946016311646, + "ce_loss_2": 4.07593857049942, + "ce_loss_3": 3.9016834497451782, + "ce_loss_7": 3.6570339798927307, + "epoch": 0.6, + "grad_norm": 1424.0, + "kl_loss_10": 94.48569107055664, + "kl_loss_2": 1108.4388488769532, + "kl_loss_3": 760.983023071289, + "kl_loss_7": 188.30435333251953, + "learning_rate": 0.00035153981233586274, + "loss": 543.2547, + "step": 6000 + }, + { + "ce_loss_10": 3.589734137058258, + "ce_loss_13": 3.5291273951530457, + "ce_loss_2": 4.066950809955597, + "ce_loss_3": 3.8936201214790342, + "ce_loss_7": 3.6356727838516236, + "epoch": 0.601, + "grad_norm": 478.0, + "kl_loss_10": 95.43113746643067, + "kl_loss_2": 1117.119808959961, + "kl_loss_3": 769.7344940185546, + "kl_loss_7": 188.8736831665039, + "learning_rate": 0.00035002545436149473, + "loss": 555.4068, + "step": 6010 + }, + { + "ce_loss_10": 3.603361654281616, + "ce_loss_13": 3.5395719528198244, + "ce_loss_2": 4.084376287460327, + "ce_loss_3": 3.913386416435242, + "ce_loss_7": 3.6495144724845887, + "epoch": 0.602, + "grad_norm": 414.0, + "kl_loss_10": 99.58069725036621, + "kl_loss_2": 1138.4922149658203, + "kl_loss_3": 791.1285461425781, + "kl_loss_7": 196.0400062561035, + "learning_rate": 0.0003485126066291364, + "loss": 543.3661, + "step": 6020 + }, + { + "ce_loss_10": 3.6472663640975953, + "ce_loss_13": 3.586405074596405, + "ce_loss_2": 4.12690646648407, + "ce_loss_3": 3.9540088891983034, + "ce_loss_7": 3.6910028219223023, + "epoch": 0.603, + "grad_norm": 426.0, + "kl_loss_10": 97.50395317077637, + "kl_loss_2": 1120.6384643554688, + "kl_loss_3": 773.8977966308594, + "kl_loss_7": 189.96464309692382, + "learning_rate": 0.0003470012843731476, + "loss": 547.4742, + "step": 6030 + }, + { + "ce_loss_10": 3.587485361099243, + "ce_loss_13": 3.527864229679108, + "ce_loss_2": 4.065750408172607, + "ce_loss_3": 3.8930314064025877, + "ce_loss_7": 3.6307687997817992, + "epoch": 0.604, + "grad_norm": 450.0, + "kl_loss_10": 95.93178520202636, + "kl_loss_2": 1125.8798370361328, + "kl_loss_3": 778.0897277832031, + "kl_loss_7": 190.32968826293944, + "learning_rate": 0.00034549150281252633, + "loss": 553.9461, + "step": 6040 + }, + { + "ce_loss_10": 3.567354416847229, + "ce_loss_13": 3.5087788224220278, + "ce_loss_2": 4.041226005554199, + "ce_loss_3": 3.868573796749115, + "ce_loss_7": 3.613230037689209, + "epoch": 0.605, + "grad_norm": 376.0, + "kl_loss_10": 96.31193771362305, + "kl_loss_2": 1101.1357208251952, + "kl_loss_3": 760.5451019287109, + "kl_loss_7": 190.99923782348634, + "learning_rate": 0.0003439832771507565, + "loss": 537.7418, + "step": 6050 + }, + { + "ce_loss_10": 3.569633936882019, + "ce_loss_13": 3.5091484904289247, + "ce_loss_2": 4.052746975421906, + "ce_loss_3": 3.8793442845344543, + "ce_loss_7": 3.6145769238471983, + "epoch": 0.606, + "grad_norm": 364.0, + "kl_loss_10": 96.17846641540527, + "kl_loss_2": 1126.9381469726563, + "kl_loss_3": 780.4287139892579, + "kl_loss_7": 191.24787139892578, + "learning_rate": 0.0003424766225756537, + "loss": 539.2611, + "step": 6060 + }, + { + "ce_loss_10": 3.6349270820617674, + "ce_loss_13": 3.5724891662597655, + "ce_loss_2": 4.110528755187988, + "ce_loss_3": 3.9370043516159057, + "ce_loss_7": 3.679009509086609, + "epoch": 0.607, + "grad_norm": 380.0, + "kl_loss_10": 98.61342163085938, + "kl_loss_2": 1107.0002716064453, + "kl_loss_3": 763.0299987792969, + "kl_loss_7": 192.68891830444335, + "learning_rate": 0.00034097155425921255, + "loss": 535.4806, + "step": 6070 + }, + { + "ce_loss_10": 3.5260583400726317, + "ce_loss_13": 3.4644631028175352, + "ce_loss_2": 4.0014289021492, + "ce_loss_3": 3.829664409160614, + "ce_loss_7": 3.571485424041748, + "epoch": 0.608, + "grad_norm": 422.0, + "kl_loss_10": 95.72014465332032, + "kl_loss_2": 1128.9732635498046, + "kl_loss_3": 780.0001983642578, + "kl_loss_7": 191.94852294921876, + "learning_rate": 0.0003394680873574546, + "loss": 542.5872, + "step": 6080 + }, + { + "ce_loss_10": 3.638583517074585, + "ce_loss_13": 3.5754881620407106, + "ce_loss_2": 4.1181090593338014, + "ce_loss_3": 3.9476171731948853, + "ce_loss_7": 3.6838363647460937, + "epoch": 0.609, + "grad_norm": 402.0, + "kl_loss_10": 99.43503112792969, + "kl_loss_2": 1131.3410400390626, + "kl_loss_3": 782.6971099853515, + "kl_loss_7": 192.93393096923828, + "learning_rate": 0.0003379662370102747, + "loss": 542.0118, + "step": 6090 + }, + { + "ce_loss_10": 3.6437841415405274, + "ce_loss_13": 3.5835014939308167, + "ce_loss_2": 4.107234466075897, + "ce_loss_3": 3.9407611727714538, + "ce_loss_7": 3.689082384109497, + "epoch": 0.61, + "grad_norm": 378.0, + "kl_loss_10": 95.95064582824708, + "kl_loss_2": 1116.5803283691407, + "kl_loss_3": 769.8769500732421, + "kl_loss_7": 190.42120208740235, + "learning_rate": 0.0003364660183412892, + "loss": 543.2468, + "step": 6100 + }, + { + "ce_loss_10": 3.6229702949523928, + "ce_loss_13": 3.5642863631248476, + "ce_loss_2": 4.082474946975708, + "ce_loss_3": 3.920805549621582, + "ce_loss_7": 3.6692759871482847, + "epoch": 0.611, + "grad_norm": 438.0, + "kl_loss_10": 95.98471641540527, + "kl_loss_2": 1107.3975402832032, + "kl_loss_3": 770.6610443115235, + "kl_loss_7": 191.18293151855468, + "learning_rate": 0.0003349674464576834, + "loss": 547.1137, + "step": 6110 + }, + { + "ce_loss_10": 3.572301459312439, + "ce_loss_13": 3.5100274682044983, + "ce_loss_2": 4.04880428314209, + "ce_loss_3": 3.87799711227417, + "ce_loss_7": 3.6172243118286134, + "epoch": 0.612, + "grad_norm": 400.0, + "kl_loss_10": 97.55015258789062, + "kl_loss_2": 1121.5612213134766, + "kl_loss_3": 776.7356872558594, + "kl_loss_7": 191.68118591308593, + "learning_rate": 0.00033347053645005966, + "loss": 533.933, + "step": 6120 + }, + { + "ce_loss_10": 3.6915227651596068, + "ce_loss_13": 3.6307403206825257, + "ce_loss_2": 4.149306988716125, + "ce_loss_3": 3.986075186729431, + "ce_loss_7": 3.7352558612823485, + "epoch": 0.613, + "grad_norm": 456.0, + "kl_loss_10": 97.44704780578613, + "kl_loss_2": 1082.3290954589843, + "kl_loss_3": 751.4226776123047, + "kl_loss_7": 188.24736099243165, + "learning_rate": 0.00033197530339228485, + "loss": 541.4641, + "step": 6130 + }, + { + "ce_loss_10": 3.6387569904327393, + "ce_loss_13": 3.5774574756622313, + "ce_loss_2": 4.1059521555900576, + "ce_loss_3": 3.9463557958602906, + "ce_loss_7": 3.686212944984436, + "epoch": 0.614, + "grad_norm": 320.0, + "kl_loss_10": 97.79526100158691, + "kl_loss_2": 1105.2626007080078, + "kl_loss_3": 773.3958312988282, + "kl_loss_7": 193.28426208496094, + "learning_rate": 0.00033048176234133967, + "loss": 539.6668, + "step": 6140 + }, + { + "ce_loss_10": 3.6235718965530395, + "ce_loss_13": 3.563166308403015, + "ce_loss_2": 4.0937678694725035, + "ce_loss_3": 3.9205260276794434, + "ce_loss_7": 3.6674267172813417, + "epoch": 0.615, + "grad_norm": 434.0, + "kl_loss_10": 96.52788619995117, + "kl_loss_2": 1108.4606842041017, + "kl_loss_3": 766.9316375732421, + "kl_loss_7": 191.76514892578126, + "learning_rate": 0.0003289899283371657, + "loss": 545.3005, + "step": 6150 + }, + { + "ce_loss_10": 3.6545772314071656, + "ce_loss_13": 3.5920246958732607, + "ce_loss_2": 4.122934722900391, + "ce_loss_3": 3.954042661190033, + "ce_loss_7": 3.7002484798431396, + "epoch": 0.616, + "grad_norm": 512.0, + "kl_loss_10": 96.86014366149902, + "kl_loss_2": 1110.978466796875, + "kl_loss_3": 763.7065795898437, + "kl_loss_7": 189.29309463500977, + "learning_rate": 0.0003274998164025148, + "loss": 546.4095, + "step": 6160 + }, + { + "ce_loss_10": 3.687037003040314, + "ce_loss_13": 3.62383953332901, + "ce_loss_2": 4.151339697837829, + "ce_loss_3": 3.982528305053711, + "ce_loss_7": 3.730109751224518, + "epoch": 0.617, + "grad_norm": 420.0, + "kl_loss_10": 98.5214340209961, + "kl_loss_2": 1105.55556640625, + "kl_loss_3": 765.5938140869141, + "kl_loss_7": 192.1310241699219, + "learning_rate": 0.0003260114415427975, + "loss": 551.3336, + "step": 6170 + }, + { + "ce_loss_10": 3.6019906878471373, + "ce_loss_13": 3.543228101730347, + "ce_loss_2": 4.074436497688294, + "ce_loss_3": 3.910420286655426, + "ce_loss_7": 3.650412619113922, + "epoch": 0.618, + "grad_norm": 326.0, + "kl_loss_10": 96.38783836364746, + "kl_loss_2": 1118.9971984863282, + "kl_loss_3": 780.3896484375, + "kl_loss_7": 191.8697937011719, + "learning_rate": 0.0003245248187459323, + "loss": 553.7879, + "step": 6180 + }, + { + "ce_loss_10": 3.5864107251167296, + "ce_loss_13": 3.53016597032547, + "ce_loss_2": 4.042503225803375, + "ce_loss_3": 3.874970889091492, + "ce_loss_7": 3.6281535744667055, + "epoch": 0.619, + "grad_norm": 418.0, + "kl_loss_10": 92.61179161071777, + "kl_loss_2": 1080.5412902832031, + "kl_loss_3": 743.7303924560547, + "kl_loss_7": 185.2653793334961, + "learning_rate": 0.00032303996298219416, + "loss": 531.9591, + "step": 6190 + }, + { + "ce_loss_10": 3.6777410745620727, + "ce_loss_13": 3.6153058767318726, + "ce_loss_2": 4.135652315616608, + "ce_loss_3": 3.968917655944824, + "ce_loss_7": 3.723153126239777, + "epoch": 0.62, + "grad_norm": 328.0, + "kl_loss_10": 97.44341430664062, + "kl_loss_2": 1081.0255004882813, + "kl_loss_3": 750.4999420166016, + "kl_loss_7": 189.73232498168946, + "learning_rate": 0.00032155688920406414, + "loss": 532.518, + "step": 6200 + }, + { + "ce_loss_10": 3.587769341468811, + "ce_loss_13": 3.524891209602356, + "ce_loss_2": 4.075685119628906, + "ce_loss_3": 3.896172082424164, + "ce_loss_7": 3.6351929187774656, + "epoch": 0.621, + "grad_norm": 376.0, + "kl_loss_10": 100.48479537963867, + "kl_loss_2": 1141.9940368652344, + "kl_loss_3": 786.613900756836, + "kl_loss_7": 195.81097640991212, + "learning_rate": 0.0003200756123460788, + "loss": 557.093, + "step": 6210 + }, + { + "ce_loss_10": 3.613728904724121, + "ce_loss_13": 3.5514798045158384, + "ce_loss_2": 4.097208368778229, + "ce_loss_3": 3.922844612598419, + "ce_loss_7": 3.6612853050231933, + "epoch": 0.622, + "grad_norm": 436.0, + "kl_loss_10": 98.99568367004395, + "kl_loss_2": 1137.439712524414, + "kl_loss_3": 786.5532501220703, + "kl_loss_7": 195.3180892944336, + "learning_rate": 0.00031859614732467957, + "loss": 552.2858, + "step": 6220 + }, + { + "ce_loss_10": 3.668611526489258, + "ce_loss_13": 3.6079549193382263, + "ce_loss_2": 4.123561811447144, + "ce_loss_3": 3.957008695602417, + "ce_loss_7": 3.7130979537963866, + "epoch": 0.623, + "grad_norm": 436.0, + "kl_loss_10": 96.12700805664062, + "kl_loss_2": 1085.7240417480468, + "kl_loss_3": 750.371206665039, + "kl_loss_7": 188.20330352783202, + "learning_rate": 0.00031711850903806275, + "loss": 532.2347, + "step": 6230 + }, + { + "ce_loss_10": 3.5722012281417848, + "ce_loss_13": 3.5121172070503235, + "ce_loss_2": 4.05529419183731, + "ce_loss_3": 3.8803335189819337, + "ce_loss_7": 3.6196384906768797, + "epoch": 0.624, + "grad_norm": 372.0, + "kl_loss_10": 98.26438941955567, + "kl_loss_2": 1135.1425506591797, + "kl_loss_3": 784.2965637207031, + "kl_loss_7": 195.50869674682616, + "learning_rate": 0.0003156427123660297, + "loss": 544.6104, + "step": 6240 + }, + { + "ce_loss_10": 3.663820195198059, + "ce_loss_13": 3.6021278977394102, + "ce_loss_2": 4.12639445066452, + "ce_loss_3": 3.9577032327651978, + "ce_loss_7": 3.709599566459656, + "epoch": 0.625, + "grad_norm": 376.0, + "kl_loss_10": 96.6868911743164, + "kl_loss_2": 1095.9533905029298, + "kl_loss_3": 760.994189453125, + "kl_loss_7": 189.45380859375, + "learning_rate": 0.0003141687721698363, + "loss": 542.975, + "step": 6250 + }, + { + "ce_loss_10": 3.6301703572273256, + "ce_loss_13": 3.5708668351173403, + "ce_loss_2": 4.076251423358917, + "ce_loss_3": 3.9138960361480715, + "ce_loss_7": 3.6724702954292296, + "epoch": 0.626, + "grad_norm": 424.0, + "kl_loss_10": 94.79209213256836, + "kl_loss_2": 1062.4459991455078, + "kl_loss_3": 731.6935089111328, + "kl_loss_7": 183.48223037719725, + "learning_rate": 0.00031269670329204396, + "loss": 531.0972, + "step": 6260 + }, + { + "ce_loss_10": 3.6652311086654663, + "ce_loss_13": 3.6031481981277467, + "ce_loss_2": 4.122557854652404, + "ce_loss_3": 3.9541366934776305, + "ce_loss_7": 3.707497763633728, + "epoch": 0.627, + "grad_norm": 404.0, + "kl_loss_10": 97.36745681762696, + "kl_loss_2": 1087.3731384277344, + "kl_loss_3": 749.8712615966797, + "kl_loss_7": 189.97913208007813, + "learning_rate": 0.00031122652055637015, + "loss": 536.5034, + "step": 6270 + }, + { + "ce_loss_10": 3.6263384938240053, + "ce_loss_13": 3.5657132387161257, + "ce_loss_2": 4.101957285404206, + "ce_loss_3": 3.9301799178123473, + "ce_loss_7": 3.671717309951782, + "epoch": 0.628, + "grad_norm": 320.0, + "kl_loss_10": 97.96914176940918, + "kl_loss_2": 1132.4724700927734, + "kl_loss_3": 779.5158935546875, + "kl_loss_7": 193.307218170166, + "learning_rate": 0.0003097582387675385, + "loss": 538.5988, + "step": 6280 + }, + { + "ce_loss_10": 3.6690368175506594, + "ce_loss_13": 3.608207333087921, + "ce_loss_2": 4.131593143939972, + "ce_loss_3": 3.967103731632233, + "ce_loss_7": 3.714122140407562, + "epoch": 0.629, + "grad_norm": 380.0, + "kl_loss_10": 97.3248161315918, + "kl_loss_2": 1100.8168243408204, + "kl_loss_3": 758.5913757324219, + "kl_loss_7": 190.2446075439453, + "learning_rate": 0.00030829187271113034, + "loss": 533.383, + "step": 6290 + }, + { + "ce_loss_10": 3.6720826983451844, + "ce_loss_13": 3.6116329789161683, + "ce_loss_2": 4.121181070804596, + "ce_loss_3": 3.958890378475189, + "ce_loss_7": 3.713034725189209, + "epoch": 0.63, + "grad_norm": 474.0, + "kl_loss_10": 95.86663208007812, + "kl_loss_2": 1078.529071044922, + "kl_loss_3": 747.6958526611328, + "kl_loss_7": 186.88264846801758, + "learning_rate": 0.00030682743715343565, + "loss": 538.6207, + "step": 6300 + }, + { + "ce_loss_10": 3.6168052315711976, + "ce_loss_13": 3.5534343481063844, + "ce_loss_2": 4.1001279830932615, + "ce_loss_3": 3.926764929294586, + "ce_loss_7": 3.6654592990875243, + "epoch": 0.631, + "grad_norm": 352.0, + "kl_loss_10": 98.38105430603028, + "kl_loss_2": 1116.2974884033204, + "kl_loss_3": 769.4165740966797, + "kl_loss_7": 194.41071319580078, + "learning_rate": 0.0003053649468413043, + "loss": 544.2852, + "step": 6310 + }, + { + "ce_loss_10": 3.728801727294922, + "ce_loss_13": 3.6677038788795473, + "ce_loss_2": 4.186562621593476, + "ce_loss_3": 4.021135902404785, + "ce_loss_7": 3.7726667642593386, + "epoch": 0.632, + "grad_norm": 548.0, + "kl_loss_10": 98.36889610290527, + "kl_loss_2": 1106.3314636230468, + "kl_loss_3": 764.3384338378906, + "kl_loss_7": 193.92676391601563, + "learning_rate": 0.00030390441650199725, + "loss": 534.6711, + "step": 6320 + }, + { + "ce_loss_10": 3.6225173473358154, + "ce_loss_13": 3.564038324356079, + "ce_loss_2": 4.088936626911163, + "ce_loss_3": 3.9200194835662843, + "ce_loss_7": 3.6701310753822325, + "epoch": 0.633, + "grad_norm": 390.0, + "kl_loss_10": 93.89363708496094, + "kl_loss_2": 1093.413995361328, + "kl_loss_3": 755.4691772460938, + "kl_loss_7": 188.9584762573242, + "learning_rate": 0.00030244586084303903, + "loss": 531.6465, + "step": 6330 + }, + { + "ce_loss_10": 3.5908933520317077, + "ce_loss_13": 3.530228877067566, + "ce_loss_2": 4.073009943962097, + "ce_loss_3": 3.908068907260895, + "ce_loss_7": 3.6380571484565736, + "epoch": 0.634, + "grad_norm": 362.0, + "kl_loss_10": 96.08535652160644, + "kl_loss_2": 1137.027798461914, + "kl_loss_3": 794.3090057373047, + "kl_loss_7": 193.36979446411132, + "learning_rate": 0.00030098929455206903, + "loss": 541.8852, + "step": 6340 + }, + { + "ce_loss_10": 3.5973508238792418, + "ce_loss_13": 3.538694751262665, + "ce_loss_2": 4.059111332893371, + "ce_loss_3": 3.8917571187019346, + "ce_loss_7": 3.6398496866226195, + "epoch": 0.635, + "grad_norm": 396.0, + "kl_loss_10": 95.19868698120118, + "kl_loss_2": 1117.9919860839843, + "kl_loss_3": 769.856167602539, + "kl_loss_7": 189.57870178222657, + "learning_rate": 0.00029953473229669324, + "loss": 545.9079, + "step": 6350 + }, + { + "ce_loss_10": 3.6316630482673644, + "ce_loss_13": 3.5723133206367494, + "ce_loss_2": 4.099796783924103, + "ce_loss_3": 3.9292221426963807, + "ce_loss_7": 3.6748278617858885, + "epoch": 0.636, + "grad_norm": 382.0, + "kl_loss_10": 94.04772453308105, + "kl_loss_2": 1105.0771392822267, + "kl_loss_3": 767.0107574462891, + "kl_loss_7": 189.39691848754882, + "learning_rate": 0.00029808218872433767, + "loss": 534.2105, + "step": 6360 + }, + { + "ce_loss_10": 3.6887783288955687, + "ce_loss_13": 3.6287707686424255, + "ce_loss_2": 4.1434108257293705, + "ce_loss_3": 3.9780289769172668, + "ce_loss_7": 3.7338571667671205, + "epoch": 0.637, + "grad_norm": 402.0, + "kl_loss_10": 97.2003547668457, + "kl_loss_2": 1086.371304321289, + "kl_loss_3": 753.1467376708985, + "kl_loss_7": 190.29918899536133, + "learning_rate": 0.0002966316784621, + "loss": 530.8481, + "step": 6370 + }, + { + "ce_loss_10": 3.5995650410652162, + "ce_loss_13": 3.5394855737686157, + "ce_loss_2": 4.081933212280274, + "ce_loss_3": 3.905743455886841, + "ce_loss_7": 3.6461820721626284, + "epoch": 0.638, + "grad_norm": 392.0, + "kl_loss_10": 94.92418899536133, + "kl_loss_2": 1131.0511108398437, + "kl_loss_3": 782.9240203857422, + "kl_loss_7": 192.17471160888672, + "learning_rate": 0.0002951832161166024, + "loss": 537.9302, + "step": 6380 + }, + { + "ce_loss_10": 3.6817028760910033, + "ce_loss_13": 3.619114363193512, + "ce_loss_2": 4.15013542175293, + "ce_loss_3": 3.980035495758057, + "ce_loss_7": 3.726088798046112, + "epoch": 0.639, + "grad_norm": 284.0, + "kl_loss_10": 99.42742652893067, + "kl_loss_2": 1089.2870971679688, + "kl_loss_3": 758.1006713867188, + "kl_loss_7": 192.03466110229493, + "learning_rate": 0.0002937368162738445, + "loss": 530.5328, + "step": 6390 + }, + { + "ce_loss_10": 3.6132258057594298, + "ce_loss_13": 3.557306098937988, + "ce_loss_2": 4.071500968933106, + "ce_loss_3": 3.905410099029541, + "ce_loss_7": 3.6560685634613037, + "epoch": 0.64, + "grad_norm": 580.0, + "kl_loss_10": 93.17153434753418, + "kl_loss_2": 1090.426809692383, + "kl_loss_3": 756.628515625, + "kl_loss_7": 185.41258697509767, + "learning_rate": 0.0002922924934990568, + "loss": 537.7791, + "step": 6400 + }, + { + "ce_loss_10": 3.553709554672241, + "ce_loss_13": 3.495926034450531, + "ce_loss_2": 4.037974917888642, + "ce_loss_3": 3.862057626247406, + "ce_loss_7": 3.5978724122047425, + "epoch": 0.641, + "grad_norm": 316.0, + "kl_loss_10": 94.70829887390137, + "kl_loss_2": 1132.230615234375, + "kl_loss_3": 780.3255004882812, + "kl_loss_7": 189.6028953552246, + "learning_rate": 0.0002908502623365536, + "loss": 541.2746, + "step": 6410 + }, + { + "ce_loss_10": 3.493143379688263, + "ce_loss_13": 3.4340757846832277, + "ce_loss_2": 3.982888162136078, + "ce_loss_3": 3.8087966442108154, + "ce_loss_7": 3.541613507270813, + "epoch": 0.642, + "grad_norm": 448.0, + "kl_loss_10": 93.92830047607421, + "kl_loss_2": 1141.5694763183594, + "kl_loss_3": 791.2887268066406, + "kl_loss_7": 189.8411407470703, + "learning_rate": 0.0002894101373095867, + "loss": 544.0511, + "step": 6420 + }, + { + "ce_loss_10": 3.7018409371376038, + "ce_loss_13": 3.641219162940979, + "ce_loss_2": 4.160841226577759, + "ce_loss_3": 3.996344065666199, + "ce_loss_7": 3.7449718475341798, + "epoch": 0.643, + "grad_norm": 444.0, + "kl_loss_10": 98.50596771240234, + "kl_loss_2": 1096.2253509521483, + "kl_loss_3": 759.2389587402344, + "kl_loss_7": 191.72063598632812, + "learning_rate": 0.00028797213292019926, + "loss": 535.7118, + "step": 6430 + }, + { + "ce_loss_10": 3.679163944721222, + "ce_loss_13": 3.6178041219711305, + "ce_loss_2": 4.137241208553315, + "ce_loss_3": 3.9736143589019775, + "ce_loss_7": 3.7223108887672423, + "epoch": 0.644, + "grad_norm": 316.0, + "kl_loss_10": 96.37056579589844, + "kl_loss_2": 1093.3028533935546, + "kl_loss_3": 763.8056060791016, + "kl_loss_7": 190.55449371337892, + "learning_rate": 0.0002865362636490791, + "loss": 543.9671, + "step": 6440 + }, + { + "ce_loss_10": 3.689470386505127, + "ce_loss_13": 3.6325947284698485, + "ce_loss_2": 4.151259076595307, + "ce_loss_3": 3.9852967262268066, + "ce_loss_7": 3.7347108364105224, + "epoch": 0.645, + "grad_norm": 422.0, + "kl_loss_10": 95.76711997985839, + "kl_loss_2": 1101.8473754882812, + "kl_loss_3": 757.8740173339844, + "kl_loss_7": 188.20162200927734, + "learning_rate": 0.0002851025439554142, + "loss": 532.7338, + "step": 6450 + }, + { + "ce_loss_10": 3.6879691004753115, + "ce_loss_13": 3.6268020391464235, + "ce_loss_2": 4.149470102787018, + "ce_loss_3": 3.9827425360679625, + "ce_loss_7": 3.732300865650177, + "epoch": 0.646, + "grad_norm": 432.0, + "kl_loss_10": 96.89583930969238, + "kl_loss_2": 1086.1058197021484, + "kl_loss_3": 754.8961853027344, + "kl_loss_7": 190.88655471801758, + "learning_rate": 0.00028367098827674573, + "loss": 531.1024, + "step": 6460 + }, + { + "ce_loss_10": 3.613504183292389, + "ce_loss_13": 3.552918183803558, + "ce_loss_2": 4.07694593667984, + "ce_loss_3": 3.9072110176086428, + "ce_loss_7": 3.656181883811951, + "epoch": 0.647, + "grad_norm": 382.0, + "kl_loss_10": 95.70045394897461, + "kl_loss_2": 1088.4426727294922, + "kl_loss_3": 747.3143646240235, + "kl_loss_7": 185.63362350463868, + "learning_rate": 0.00028224161102882397, + "loss": 534.1186, + "step": 6470 + }, + { + "ce_loss_10": 3.591862881183624, + "ce_loss_13": 3.5325499296188356, + "ce_loss_2": 4.047231125831604, + "ce_loss_3": 3.8850304007530214, + "ce_loss_7": 3.6327146530151366, + "epoch": 0.648, + "grad_norm": 398.0, + "kl_loss_10": 97.32144050598144, + "kl_loss_2": 1084.3862060546876, + "kl_loss_3": 756.0506072998047, + "kl_loss_7": 188.20642013549804, + "learning_rate": 0.00028081442660546124, + "loss": 534.4936, + "step": 6480 + }, + { + "ce_loss_10": 3.6528772950172423, + "ce_loss_13": 3.593310809135437, + "ce_loss_2": 4.104138958454132, + "ce_loss_3": 3.940169370174408, + "ce_loss_7": 3.6972940802574157, + "epoch": 0.649, + "grad_norm": 442.0, + "kl_loss_10": 96.56869812011719, + "kl_loss_2": 1082.232455444336, + "kl_loss_3": 748.2576446533203, + "kl_loss_7": 188.56612319946288, + "learning_rate": 0.0002793894493783892, + "loss": 535.3609, + "step": 6490 + }, + { + "ce_loss_10": 3.671093225479126, + "ce_loss_13": 3.6125397443771363, + "ce_loss_2": 4.120749580860138, + "ce_loss_3": 3.957093584537506, + "ce_loss_7": 3.715547430515289, + "epoch": 0.65, + "grad_norm": 340.0, + "kl_loss_10": 95.52767143249511, + "kl_loss_2": 1081.513833618164, + "kl_loss_3": 750.0977233886719, + "kl_loss_7": 185.41107177734375, + "learning_rate": 0.0002779666936971129, + "loss": 530.5015, + "step": 6500 + }, + { + "ce_loss_10": 3.6747244358062745, + "ce_loss_13": 3.6157574892044066, + "ce_loss_2": 4.147137761116028, + "ce_loss_3": 3.9802316427230835, + "ce_loss_7": 3.7200183868408203, + "epoch": 0.651, + "grad_norm": 388.0, + "kl_loss_10": 96.378706741333, + "kl_loss_2": 1104.2031311035157, + "kl_loss_3": 768.3699279785156, + "kl_loss_7": 190.13947677612305, + "learning_rate": 0.00027654617388876614, + "loss": 540.9622, + "step": 6510 + }, + { + "ce_loss_10": 3.7085010170936585, + "ce_loss_13": 3.650082528591156, + "ce_loss_2": 4.159732723236084, + "ce_loss_3": 3.9939939975738525, + "ce_loss_7": 3.752064514160156, + "epoch": 0.652, + "grad_norm": 372.0, + "kl_loss_10": 98.8690299987793, + "kl_loss_2": 1084.27646484375, + "kl_loss_3": 749.1016296386719, + "kl_loss_7": 189.19281463623048, + "learning_rate": 0.0002751279042579672, + "loss": 533.7532, + "step": 6520 + }, + { + "ce_loss_10": 3.6514885902404783, + "ce_loss_13": 3.589630663394928, + "ce_loss_2": 4.104155695438385, + "ce_loss_3": 3.9368098855018614, + "ce_loss_7": 3.696379566192627, + "epoch": 0.653, + "grad_norm": 388.0, + "kl_loss_10": 98.10863304138184, + "kl_loss_2": 1078.5175903320312, + "kl_loss_3": 739.8918975830078, + "kl_loss_7": 187.05665588378906, + "learning_rate": 0.00027371189908667604, + "loss": 535.8568, + "step": 6530 + }, + { + "ce_loss_10": 3.6950425028800966, + "ce_loss_13": 3.6345377445220945, + "ce_loss_2": 4.172570693492889, + "ce_loss_3": 4.002642476558686, + "ce_loss_7": 3.742088866233826, + "epoch": 0.654, + "grad_norm": 456.0, + "kl_loss_10": 98.50621490478515, + "kl_loss_2": 1120.8493408203126, + "kl_loss_3": 772.4739196777343, + "kl_loss_7": 194.52065811157226, + "learning_rate": 0.00027229817263404863, + "loss": 550.1683, + "step": 6540 + }, + { + "ce_loss_10": 3.678051483631134, + "ce_loss_13": 3.6163152933120726, + "ce_loss_2": 4.125236618518829, + "ce_loss_3": 3.9632533311843874, + "ce_loss_7": 3.717917835712433, + "epoch": 0.655, + "grad_norm": 354.0, + "kl_loss_10": 97.52188301086426, + "kl_loss_2": 1072.0729919433593, + "kl_loss_3": 745.5059295654297, + "kl_loss_7": 187.41375122070312, + "learning_rate": 0.0002708867391362948, + "loss": 530.4727, + "step": 6550 + }, + { + "ce_loss_10": 3.659157025814056, + "ce_loss_13": 3.5987429141998293, + "ce_loss_2": 4.098348212242127, + "ce_loss_3": 3.9343943357467652, + "ce_loss_7": 3.69932336807251, + "epoch": 0.656, + "grad_norm": 380.0, + "kl_loss_10": 95.51490859985351, + "kl_loss_2": 1048.09501953125, + "kl_loss_3": 723.2193145751953, + "kl_loss_7": 183.38801651000978, + "learning_rate": 0.0002694776128065345, + "loss": 526.4233, + "step": 6560 + }, + { + "ce_loss_10": 3.5926573395729067, + "ce_loss_13": 3.5355629920959473, + "ce_loss_2": 4.059596955776215, + "ce_loss_3": 3.8947146415710447, + "ce_loss_7": 3.63899849653244, + "epoch": 0.657, + "grad_norm": 302.0, + "kl_loss_10": 94.25321388244629, + "kl_loss_2": 1108.046826171875, + "kl_loss_3": 769.1714508056641, + "kl_loss_7": 190.54062194824218, + "learning_rate": 0.00026807080783465374, + "loss": 532.2117, + "step": 6570 + }, + { + "ce_loss_10": 3.7099499464035035, + "ce_loss_13": 3.6470829010009767, + "ce_loss_2": 4.173487448692322, + "ce_loss_3": 4.007464277744293, + "ce_loss_7": 3.753613090515137, + "epoch": 0.658, + "grad_norm": 336.0, + "kl_loss_10": 98.83243751525879, + "kl_loss_2": 1096.7148071289062, + "kl_loss_3": 763.6604827880859, + "kl_loss_7": 191.30890121459962, + "learning_rate": 0.00026666633838716316, + "loss": 542.1623, + "step": 6580 + }, + { + "ce_loss_10": 3.597714030742645, + "ce_loss_13": 3.5341309905052185, + "ce_loss_2": 4.0741772770881655, + "ce_loss_3": 3.9031991958618164, + "ce_loss_7": 3.64434130191803, + "epoch": 0.659, + "grad_norm": 418.0, + "kl_loss_10": 98.79775390625, + "kl_loss_2": 1119.104165649414, + "kl_loss_3": 772.7665252685547, + "kl_loss_7": 193.75399169921874, + "learning_rate": 0.00026526421860705474, + "loss": 546.4087, + "step": 6590 + }, + { + "ce_loss_10": 3.6211095809936524, + "ce_loss_13": 3.56248060464859, + "ce_loss_2": 4.090437388420105, + "ce_loss_3": 3.9254501700401305, + "ce_loss_7": 3.669628012180328, + "epoch": 0.66, + "grad_norm": 388.0, + "kl_loss_10": 97.33003234863281, + "kl_loss_2": 1100.579428100586, + "kl_loss_3": 767.1163055419922, + "kl_loss_7": 192.85016250610352, + "learning_rate": 0.0002638644626136587, + "loss": 535.0932, + "step": 6600 + }, + { + "ce_loss_10": 3.632294547557831, + "ce_loss_13": 3.5736007690429688, + "ce_loss_2": 4.098874115943909, + "ce_loss_3": 3.928848695755005, + "ce_loss_7": 3.6751357674598695, + "epoch": 0.661, + "grad_norm": 370.0, + "kl_loss_10": 95.11613578796387, + "kl_loss_2": 1096.4229095458984, + "kl_loss_3": 759.0542449951172, + "kl_loss_7": 188.92064208984374, + "learning_rate": 0.00026246708450250255, + "loss": 537.9207, + "step": 6610 + }, + { + "ce_loss_10": 3.6327243566513063, + "ce_loss_13": 3.5709309697151186, + "ce_loss_2": 4.086973357200622, + "ce_loss_3": 3.9239420771598814, + "ce_loss_7": 3.675078272819519, + "epoch": 0.662, + "grad_norm": 450.0, + "kl_loss_10": 97.06436119079589, + "kl_loss_2": 1079.41337890625, + "kl_loss_3": 752.72802734375, + "kl_loss_7": 187.51063842773436, + "learning_rate": 0.00026107209834516854, + "loss": 531.8906, + "step": 6620 + }, + { + "ce_loss_10": 3.5740899324417112, + "ce_loss_13": 3.5152911067008974, + "ce_loss_2": 4.057041144371032, + "ce_loss_3": 3.8850310802459718, + "ce_loss_7": 3.6205747365951537, + "epoch": 0.663, + "grad_norm": 326.0, + "kl_loss_10": 95.74808731079102, + "kl_loss_2": 1136.7873779296874, + "kl_loss_3": 780.0463623046875, + "kl_loss_7": 190.15955352783203, + "learning_rate": 0.0002596795181891514, + "loss": 547.2686, + "step": 6630 + }, + { + "ce_loss_10": 3.5901227831840514, + "ce_loss_13": 3.527127909660339, + "ce_loss_2": 4.062633895874024, + "ce_loss_3": 3.8958073616027833, + "ce_loss_7": 3.63388534784317, + "epoch": 0.664, + "grad_norm": 488.0, + "kl_loss_10": 97.48413009643555, + "kl_loss_2": 1119.4189453125, + "kl_loss_3": 774.4207427978515, + "kl_loss_7": 193.8588966369629, + "learning_rate": 0.000258289358057718, + "loss": 556.5954, + "step": 6640 + }, + { + "ce_loss_10": 3.6630045056343077, + "ce_loss_13": 3.6010705709457396, + "ce_loss_2": 4.126548099517822, + "ce_loss_3": 3.960009717941284, + "ce_loss_7": 3.70961674451828, + "epoch": 0.665, + "grad_norm": 368.0, + "kl_loss_10": 97.2126693725586, + "kl_loss_2": 1116.2655120849608, + "kl_loss_3": 770.7855743408203, + "kl_loss_7": 193.7609016418457, + "learning_rate": 0.0002569016319497657, + "loss": 544.2068, + "step": 6650 + }, + { + "ce_loss_10": 3.645352327823639, + "ce_loss_13": 3.582920753955841, + "ce_loss_2": 4.116545259952545, + "ce_loss_3": 3.9502077460289002, + "ce_loss_7": 3.6899593830108643, + "epoch": 0.666, + "grad_norm": 324.0, + "kl_loss_10": 98.58149223327636, + "kl_loss_2": 1127.1539520263673, + "kl_loss_3": 778.5697784423828, + "kl_loss_7": 194.4781005859375, + "learning_rate": 0.00025551635383968066, + "loss": 551.8321, + "step": 6660 + }, + { + "ce_loss_10": 3.5590095281600953, + "ce_loss_13": 3.497633898258209, + "ce_loss_2": 4.0256366491317745, + "ce_loss_3": 3.8563454031944273, + "ce_loss_7": 3.6033952236175537, + "epoch": 0.667, + "grad_norm": 386.0, + "kl_loss_10": 96.00436630249024, + "kl_loss_2": 1115.5439819335938, + "kl_loss_3": 764.8407897949219, + "kl_loss_7": 191.15278091430665, + "learning_rate": 0.00025413353767719804, + "loss": 541.5643, + "step": 6670 + }, + { + "ce_loss_10": 3.6135716080665587, + "ce_loss_13": 3.556279420852661, + "ce_loss_2": 4.074564230442047, + "ce_loss_3": 3.9083084225654603, + "ce_loss_7": 3.6589901089668273, + "epoch": 0.668, + "grad_norm": 404.0, + "kl_loss_10": 95.40520133972169, + "kl_loss_2": 1103.0668395996095, + "kl_loss_3": 766.21494140625, + "kl_loss_7": 187.07973251342773, + "learning_rate": 0.0002527531973872617, + "loss": 541.5821, + "step": 6680 + }, + { + "ce_loss_10": 3.630588722229004, + "ce_loss_13": 3.5716015577316282, + "ce_loss_2": 4.09862619638443, + "ce_loss_3": 3.9337419509887694, + "ce_loss_7": 3.6740004658699035, + "epoch": 0.669, + "grad_norm": 376.0, + "kl_loss_10": 94.05056571960449, + "kl_loss_2": 1104.580502319336, + "kl_loss_3": 767.1347503662109, + "kl_loss_7": 187.80085144042968, + "learning_rate": 0.0002513753468698826, + "loss": 536.7451, + "step": 6690 + }, + { + "ce_loss_10": 3.6005271077156067, + "ce_loss_13": 3.538683819770813, + "ce_loss_2": 4.075844824314117, + "ce_loss_3": 3.901875948905945, + "ce_loss_7": 3.6449614763259888, + "epoch": 0.67, + "grad_norm": 392.0, + "kl_loss_10": 97.46344718933105, + "kl_loss_2": 1117.6306915283203, + "kl_loss_3": 769.393521118164, + "kl_loss_7": 191.83680877685546, + "learning_rate": 0.0002500000000000001, + "loss": 543.8447, + "step": 6700 + }, + { + "ce_loss_10": 3.7194844245910645, + "ce_loss_13": 3.6591498017311097, + "ce_loss_2": 4.157877945899964, + "ce_loss_3": 3.9965709686279296, + "ce_loss_7": 3.7608611464500425, + "epoch": 0.671, + "grad_norm": 388.0, + "kl_loss_10": 96.12382774353027, + "kl_loss_2": 1059.211587524414, + "kl_loss_3": 732.8135711669922, + "kl_loss_7": 185.53207092285157, + "learning_rate": 0.0002486271706273421, + "loss": 540.9632, + "step": 6710 + }, + { + "ce_loss_10": 3.652998185157776, + "ce_loss_13": 3.5960669040679933, + "ce_loss_2": 4.096874964237213, + "ce_loss_3": 3.930626368522644, + "ce_loss_7": 3.694219136238098, + "epoch": 0.672, + "grad_norm": 370.0, + "kl_loss_10": 96.1414752960205, + "kl_loss_2": 1060.9839447021484, + "kl_loss_3": 732.6356231689454, + "kl_loss_7": 184.73310241699218, + "learning_rate": 0.0002472568725762853, + "loss": 531.8145, + "step": 6720 + }, + { + "ce_loss_10": 3.644508719444275, + "ce_loss_13": 3.585316574573517, + "ce_loss_2": 4.077662718296051, + "ce_loss_3": 3.923126482963562, + "ce_loss_7": 3.6880379915237427, + "epoch": 0.673, + "grad_norm": 536.0, + "kl_loss_10": 95.44480400085449, + "kl_loss_2": 1060.1810028076172, + "kl_loss_3": 734.1040588378906, + "kl_loss_7": 183.89718780517578, + "learning_rate": 0.00024588911964571554, + "loss": 524.9737, + "step": 6730 + }, + { + "ce_loss_10": 3.6595176219940186, + "ce_loss_13": 3.5960793495178223, + "ce_loss_2": 4.141416406631469, + "ce_loss_3": 3.971626269817352, + "ce_loss_7": 3.706479799747467, + "epoch": 0.674, + "grad_norm": 370.0, + "kl_loss_10": 101.08820152282715, + "kl_loss_2": 1123.6421142578124, + "kl_loss_3": 779.8745697021484, + "kl_loss_7": 196.79359664916993, + "learning_rate": 0.00024452392560888974, + "loss": 538.6094, + "step": 6740 + }, + { + "ce_loss_10": 3.5484472513198853, + "ce_loss_13": 3.4903222799301146, + "ce_loss_2": 4.00926810503006, + "ce_loss_3": 3.837252104282379, + "ce_loss_7": 3.5929391860961912, + "epoch": 0.675, + "grad_norm": 376.0, + "kl_loss_10": 94.44077377319336, + "kl_loss_2": 1104.6140991210937, + "kl_loss_3": 759.8463775634766, + "kl_loss_7": 187.49753799438477, + "learning_rate": 0.00024316130421329695, + "loss": 531.6798, + "step": 6750 + }, + { + "ce_loss_10": 3.63141074180603, + "ce_loss_13": 3.5704286813735964, + "ce_loss_2": 4.089890336990356, + "ce_loss_3": 3.9222849130630495, + "ce_loss_7": 3.6722644567489624, + "epoch": 0.676, + "grad_norm": 320.0, + "kl_loss_10": 96.4859691619873, + "kl_loss_2": 1072.7287811279298, + "kl_loss_3": 740.4257781982421, + "kl_loss_7": 185.37494659423828, + "learning_rate": 0.00024180126918051909, + "loss": 528.9844, + "step": 6760 + }, + { + "ce_loss_10": 3.6748690009117126, + "ce_loss_13": 3.6154377579689028, + "ce_loss_2": 4.126313555240631, + "ce_loss_3": 3.959956741333008, + "ce_loss_7": 3.719127857685089, + "epoch": 0.677, + "grad_norm": 494.0, + "kl_loss_10": 95.71767883300781, + "kl_loss_2": 1071.3604461669922, + "kl_loss_3": 739.3463531494141, + "kl_loss_7": 186.98586730957032, + "learning_rate": 0.00024044383420609406, + "loss": 526.4402, + "step": 6770 + }, + { + "ce_loss_10": 3.6849735140800477, + "ce_loss_13": 3.6251555919647216, + "ce_loss_2": 4.126254045963288, + "ce_loss_3": 3.9655120730400086, + "ce_loss_7": 3.7277087569236755, + "epoch": 0.678, + "grad_norm": 406.0, + "kl_loss_10": 96.21127319335938, + "kl_loss_2": 1065.4650268554688, + "kl_loss_3": 737.1611053466797, + "kl_loss_7": 186.31879425048828, + "learning_rate": 0.00023908901295937712, + "loss": 532.375, + "step": 6780 + }, + { + "ce_loss_10": 3.6866431832313538, + "ce_loss_13": 3.621911180019379, + "ce_loss_2": 4.138471448421479, + "ce_loss_3": 3.9692311763763426, + "ce_loss_7": 3.727970468997955, + "epoch": 0.679, + "grad_norm": 520.0, + "kl_loss_10": 97.46222076416015, + "kl_loss_2": 1075.2411163330078, + "kl_loss_3": 742.8502899169922, + "kl_loss_7": 187.16495361328126, + "learning_rate": 0.00023773681908340283, + "loss": 541.7315, + "step": 6790 + }, + { + "ce_loss_10": 3.6525588750839235, + "ce_loss_13": 3.590035092830658, + "ce_loss_2": 4.125091111660003, + "ce_loss_3": 3.955258107185364, + "ce_loss_7": 3.6996394038200378, + "epoch": 0.68, + "grad_norm": 448.0, + "kl_loss_10": 100.11968383789062, + "kl_loss_2": 1120.372329711914, + "kl_loss_3": 775.7205535888672, + "kl_loss_7": 195.07009201049806, + "learning_rate": 0.00023638726619474876, + "loss": 550.8879, + "step": 6800 + }, + { + "ce_loss_10": 3.6433764457702638, + "ce_loss_13": 3.581800138950348, + "ce_loss_2": 4.1252215027809145, + "ce_loss_3": 3.95204918384552, + "ce_loss_7": 3.68941251039505, + "epoch": 0.681, + "grad_norm": 380.0, + "kl_loss_10": 94.89226531982422, + "kl_loss_2": 1121.6464782714843, + "kl_loss_3": 776.2536529541015, + "kl_loss_7": 190.19580459594727, + "learning_rate": 0.0002350403678833976, + "loss": 540.7707, + "step": 6810 + }, + { + "ce_loss_10": 3.5702003121376036, + "ce_loss_13": 3.509978950023651, + "ce_loss_2": 4.041775238513947, + "ce_loss_3": 3.871393322944641, + "ce_loss_7": 3.6151094794273377, + "epoch": 0.682, + "grad_norm": 316.0, + "kl_loss_10": 94.982954788208, + "kl_loss_2": 1118.5872802734375, + "kl_loss_3": 772.0714935302734, + "kl_loss_7": 188.55085983276368, + "learning_rate": 0.00023369613771260007, + "loss": 536.8643, + "step": 6820 + }, + { + "ce_loss_10": 3.688840866088867, + "ce_loss_13": 3.6270360946655273, + "ce_loss_2": 4.156035900115967, + "ce_loss_3": 3.9860677838325502, + "ce_loss_7": 3.7326239466667177, + "epoch": 0.683, + "grad_norm": 410.0, + "kl_loss_10": 97.82878112792969, + "kl_loss_2": 1106.3897888183594, + "kl_loss_3": 766.803921508789, + "kl_loss_7": 191.37064056396486, + "learning_rate": 0.00023235458921873925, + "loss": 544.207, + "step": 6830 + }, + { + "ce_loss_10": 3.63765789270401, + "ce_loss_13": 3.5765843272209166, + "ce_loss_2": 4.12269172668457, + "ce_loss_3": 3.953417754173279, + "ce_loss_7": 3.6850703358650208, + "epoch": 0.684, + "grad_norm": 676.0, + "kl_loss_10": 97.75669631958007, + "kl_loss_2": 1147.8291046142579, + "kl_loss_3": 799.1194305419922, + "kl_loss_7": 195.58543319702147, + "learning_rate": 0.0002310157359111938, + "loss": 555.1555, + "step": 6840 + }, + { + "ce_loss_10": 3.526192367076874, + "ce_loss_13": 3.4662320494651793, + "ce_loss_2": 4.027907514572144, + "ce_loss_3": 3.8482834458351136, + "ce_loss_7": 3.574409317970276, + "epoch": 0.685, + "grad_norm": 660.0, + "kl_loss_10": 96.51494178771972, + "kl_loss_2": 1163.1898101806642, + "kl_loss_3": 802.0491363525391, + "kl_loss_7": 194.50169296264647, + "learning_rate": 0.0002296795912722014, + "loss": 551.9703, + "step": 6850 + }, + { + "ce_loss_10": 3.6707953572273255, + "ce_loss_13": 3.6116589188575743, + "ce_loss_2": 4.125709581375122, + "ce_loss_3": 3.957431602478027, + "ce_loss_7": 3.716504216194153, + "epoch": 0.686, + "grad_norm": 328.0, + "kl_loss_10": 96.6977554321289, + "kl_loss_2": 1086.6772430419921, + "kl_loss_3": 747.0762786865234, + "kl_loss_7": 188.86367645263672, + "learning_rate": 0.0002283461687567236, + "loss": 527.8294, + "step": 6860 + }, + { + "ce_loss_10": 3.727082335948944, + "ce_loss_13": 3.664930725097656, + "ce_loss_2": 4.172837960720062, + "ce_loss_3": 4.010821652412415, + "ce_loss_7": 3.7691392421722414, + "epoch": 0.687, + "grad_norm": 334.0, + "kl_loss_10": 97.53575859069824, + "kl_loss_2": 1058.4923736572266, + "kl_loss_3": 731.6483947753907, + "kl_loss_7": 186.02228698730468, + "learning_rate": 0.00022701548179231045, + "loss": 535.9072, + "step": 6870 + }, + { + "ce_loss_10": 3.6793978810310364, + "ce_loss_13": 3.6168754935264587, + "ce_loss_2": 4.133899199962616, + "ce_loss_3": 3.9700045347213746, + "ce_loss_7": 3.7239136338233947, + "epoch": 0.688, + "grad_norm": 382.0, + "kl_loss_10": 98.03768157958984, + "kl_loss_2": 1087.3397521972656, + "kl_loss_3": 753.5451507568359, + "kl_loss_7": 189.21656646728516, + "learning_rate": 0.00022568754377896516, + "loss": 530.6016, + "step": 6880 + }, + { + "ce_loss_10": 3.669530212879181, + "ce_loss_13": 3.611078381538391, + "ce_loss_2": 4.122839629650116, + "ce_loss_3": 3.9565317392349244, + "ce_loss_7": 3.7144492745399473, + "epoch": 0.689, + "grad_norm": 482.0, + "kl_loss_10": 93.94465446472168, + "kl_loss_2": 1092.5764556884765, + "kl_loss_3": 757.9043579101562, + "kl_loss_7": 189.06216201782226, + "learning_rate": 0.00022436236808900844, + "loss": 532.0287, + "step": 6890 + }, + { + "ce_loss_10": 3.563220775127411, + "ce_loss_13": 3.505044734477997, + "ce_loss_2": 4.028258430957794, + "ce_loss_3": 3.860454273223877, + "ce_loss_7": 3.6083375453948974, + "epoch": 0.69, + "grad_norm": 402.0, + "kl_loss_10": 95.30224533081055, + "kl_loss_2": 1114.9274475097657, + "kl_loss_3": 768.1644836425781, + "kl_loss_7": 189.04213485717773, + "learning_rate": 0.00022303996806694487, + "loss": 534.7889, + "step": 6900 + }, + { + "ce_loss_10": 3.646312749385834, + "ce_loss_13": 3.5865816950798033, + "ce_loss_2": 4.1086891174316404, + "ce_loss_3": 3.9399857401847838, + "ce_loss_7": 3.69192236661911, + "epoch": 0.691, + "grad_norm": 392.0, + "kl_loss_10": 95.77762832641602, + "kl_loss_2": 1094.2582000732423, + "kl_loss_3": 756.6770172119141, + "kl_loss_7": 187.92616500854493, + "learning_rate": 0.00022172035702932823, + "loss": 534.246, + "step": 6910 + }, + { + "ce_loss_10": 3.685254919528961, + "ce_loss_13": 3.6261175990104677, + "ce_loss_2": 4.142215931415558, + "ce_loss_3": 3.9721115231513977, + "ce_loss_7": 3.7271186470985413, + "epoch": 0.692, + "grad_norm": 430.0, + "kl_loss_10": 94.89179420471191, + "kl_loss_2": 1075.7997589111328, + "kl_loss_3": 742.8703857421875, + "kl_loss_7": 186.23582077026367, + "learning_rate": 0.00022040354826462666, + "loss": 530.2491, + "step": 6920 + }, + { + "ce_loss_10": 3.62452495098114, + "ce_loss_13": 3.563087892532349, + "ce_loss_2": 4.079807507991791, + "ce_loss_3": 3.913197338581085, + "ce_loss_7": 3.6697877049446106, + "epoch": 0.693, + "grad_norm": 410.0, + "kl_loss_10": 96.51725845336914, + "kl_loss_2": 1085.478707885742, + "kl_loss_3": 750.6873352050782, + "kl_loss_7": 187.0568748474121, + "learning_rate": 0.0002190895550330899, + "loss": 535.6979, + "step": 6930 + }, + { + "ce_loss_10": 3.547420835494995, + "ce_loss_13": 3.488833248615265, + "ce_loss_2": 4.036989772319794, + "ce_loss_3": 3.85961799621582, + "ce_loss_7": 3.598125493526459, + "epoch": 0.694, + "grad_norm": 406.0, + "kl_loss_10": 96.3628433227539, + "kl_loss_2": 1128.3786254882812, + "kl_loss_3": 778.4836669921875, + "kl_loss_7": 192.30058898925782, + "learning_rate": 0.00021777839056661552, + "loss": 534.9962, + "step": 6940 + }, + { + "ce_loss_10": 3.636169970035553, + "ce_loss_13": 3.576909136772156, + "ce_loss_2": 4.093018388748169, + "ce_loss_3": 3.9319678425788878, + "ce_loss_7": 3.682026994228363, + "epoch": 0.695, + "grad_norm": 380.0, + "kl_loss_10": 95.15358619689941, + "kl_loss_2": 1086.1379272460938, + "kl_loss_3": 753.724154663086, + "kl_loss_7": 185.95790100097656, + "learning_rate": 0.0002164700680686147, + "loss": 526.2859, + "step": 6950 + }, + { + "ce_loss_10": 3.6809890270233154, + "ce_loss_13": 3.6225372910499574, + "ce_loss_2": 4.135282206535339, + "ce_loss_3": 3.9695199608802794, + "ce_loss_7": 3.7249368906021116, + "epoch": 0.696, + "grad_norm": 400.0, + "kl_loss_10": 96.4394718170166, + "kl_loss_2": 1074.3540649414062, + "kl_loss_3": 743.0920288085938, + "kl_loss_7": 188.12129898071288, + "learning_rate": 0.0002151646007138806, + "loss": 527.0223, + "step": 6960 + }, + { + "ce_loss_10": 3.55483934879303, + "ce_loss_13": 3.493414306640625, + "ce_loss_2": 4.029416286945343, + "ce_loss_3": 3.8593334913253785, + "ce_loss_7": 3.5997050285339354, + "epoch": 0.697, + "grad_norm": 324.0, + "kl_loss_10": 98.1744327545166, + "kl_loss_2": 1119.6888793945313, + "kl_loss_3": 776.6419464111328, + "kl_loss_7": 191.90652236938476, + "learning_rate": 0.00021386200164845526, + "loss": 540.4315, + "step": 6970 + }, + { + "ce_loss_10": 3.7494669914245606, + "ce_loss_13": 3.6868221879005434, + "ce_loss_2": 4.180894982814789, + "ce_loss_3": 4.02219043970108, + "ce_loss_7": 3.790766155719757, + "epoch": 0.698, + "grad_norm": 386.0, + "kl_loss_10": 98.89772605895996, + "kl_loss_2": 1061.9671813964844, + "kl_loss_3": 736.8194549560546, + "kl_loss_7": 189.14059829711914, + "learning_rate": 0.0002125622839894964, + "loss": 526.3207, + "step": 6980 + }, + { + "ce_loss_10": 3.6859158158302305, + "ce_loss_13": 3.626417076587677, + "ce_loss_2": 4.136348474025726, + "ce_loss_3": 3.974168133735657, + "ce_loss_7": 3.7279628992080687, + "epoch": 0.699, + "grad_norm": 406.0, + "kl_loss_10": 97.57818336486817, + "kl_loss_2": 1081.921697998047, + "kl_loss_3": 746.1339630126953, + "kl_loss_7": 188.19551315307618, + "learning_rate": 0.00021126546082514663, + "loss": 529.5254, + "step": 6990 + }, + { + "ce_loss_10": 3.704355037212372, + "ce_loss_13": 3.643582081794739, + "ce_loss_2": 4.151243126392364, + "ce_loss_3": 3.9851069808006288, + "ce_loss_7": 3.747806203365326, + "epoch": 0.7, + "grad_norm": 394.0, + "kl_loss_10": 97.80472221374512, + "kl_loss_2": 1074.9452331542968, + "kl_loss_3": 745.385775756836, + "kl_loss_7": 188.936759185791, + "learning_rate": 0.00020997154521440098, + "loss": 526.4211, + "step": 7000 + }, + { + "ce_loss_10": 3.6455201506614685, + "ce_loss_13": 3.586948239803314, + "ce_loss_2": 4.104578590393066, + "ce_loss_3": 3.9375877380371094, + "ce_loss_7": 3.68754506111145, + "epoch": 0.701, + "grad_norm": 322.0, + "kl_loss_10": 93.82002601623535, + "kl_loss_2": 1085.8826141357422, + "kl_loss_3": 746.0692993164063, + "kl_loss_7": 184.4355583190918, + "learning_rate": 0.0002086805501869749, + "loss": 524.1356, + "step": 7010 + }, + { + "ce_loss_10": 3.6133246064186095, + "ce_loss_13": 3.554938244819641, + "ce_loss_2": 4.0853543996810915, + "ce_loss_3": 3.918412721157074, + "ce_loss_7": 3.6612335562705995, + "epoch": 0.702, + "grad_norm": 398.0, + "kl_loss_10": 95.29999237060547, + "kl_loss_2": 1131.5339111328126, + "kl_loss_3": 781.2637298583984, + "kl_loss_7": 192.70318984985352, + "learning_rate": 0.0002073924887431744, + "loss": 542.1648, + "step": 7020 + }, + { + "ce_loss_10": 3.619812881946564, + "ce_loss_13": 3.561210036277771, + "ce_loss_2": 4.088810133934021, + "ce_loss_3": 3.9195892930030825, + "ce_loss_7": 3.667060124874115, + "epoch": 0.703, + "grad_norm": 396.0, + "kl_loss_10": 95.14918022155761, + "kl_loss_2": 1112.4185638427734, + "kl_loss_3": 769.590234375, + "kl_loss_7": 188.17913894653321, + "learning_rate": 0.00020610737385376348, + "loss": 545.7339, + "step": 7030 + }, + { + "ce_loss_10": 3.689952182769775, + "ce_loss_13": 3.629777657985687, + "ce_loss_2": 4.127048587799072, + "ce_loss_3": 3.9679968118667603, + "ce_loss_7": 3.7309682607650756, + "epoch": 0.704, + "grad_norm": 480.0, + "kl_loss_10": 96.72987632751465, + "kl_loss_2": 1060.028268432617, + "kl_loss_3": 736.1820068359375, + "kl_loss_7": 185.3560775756836, + "learning_rate": 0.00020482521845983521, + "loss": 531.1421, + "step": 7040 + }, + { + "ce_loss_10": 3.681384038925171, + "ce_loss_13": 3.6203475475311278, + "ce_loss_2": 4.1394176363945006, + "ce_loss_3": 3.9727881073951723, + "ce_loss_7": 3.725051200389862, + "epoch": 0.705, + "grad_norm": 482.0, + "kl_loss_10": 100.69121513366699, + "kl_loss_2": 1089.9848724365233, + "kl_loss_3": 754.3679351806641, + "kl_loss_7": 192.38913803100587, + "learning_rate": 0.00020354603547267987, + "loss": 542.1912, + "step": 7050 + }, + { + "ce_loss_10": 3.667348313331604, + "ce_loss_13": 3.605680251121521, + "ce_loss_2": 4.1402019739151, + "ce_loss_3": 3.971468675136566, + "ce_loss_7": 3.712887394428253, + "epoch": 0.706, + "grad_norm": 364.0, + "kl_loss_10": 97.05326614379882, + "kl_loss_2": 1105.346597290039, + "kl_loss_3": 773.4686828613281, + "kl_loss_7": 191.13608169555664, + "learning_rate": 0.00020226983777365604, + "loss": 548.4642, + "step": 7060 + }, + { + "ce_loss_10": 3.563067603111267, + "ce_loss_13": 3.504194128513336, + "ce_loss_2": 4.040568280220032, + "ce_loss_3": 3.8677730679512026, + "ce_loss_7": 3.6067102789878844, + "epoch": 0.707, + "grad_norm": 338.0, + "kl_loss_10": 92.14009590148926, + "kl_loss_2": 1122.7782775878907, + "kl_loss_3": 765.3169036865235, + "kl_loss_7": 183.86895446777345, + "learning_rate": 0.00020099663821406056, + "loss": 534.7408, + "step": 7070 + }, + { + "ce_loss_10": 3.669863748550415, + "ce_loss_13": 3.6097553610801696, + "ce_loss_2": 4.117836952209473, + "ce_loss_3": 3.955933165550232, + "ce_loss_7": 3.7124067664146425, + "epoch": 0.708, + "grad_norm": 528.0, + "kl_loss_10": 95.14625968933106, + "kl_loss_2": 1064.9939758300782, + "kl_loss_3": 737.6582244873047, + "kl_loss_7": 184.71611633300782, + "learning_rate": 0.00019972644961499853, + "loss": 531.3339, + "step": 7080 + }, + { + "ce_loss_10": 3.635360848903656, + "ce_loss_13": 3.575283741950989, + "ce_loss_2": 4.107546412944794, + "ce_loss_3": 3.9376320004463197, + "ce_loss_7": 3.6813616275787355, + "epoch": 0.709, + "grad_norm": 454.0, + "kl_loss_10": 95.76384582519532, + "kl_loss_2": 1112.5157043457032, + "kl_loss_3": 768.9019195556641, + "kl_loss_7": 190.37624435424806, + "learning_rate": 0.00019845928476725522, + "loss": 537.9877, + "step": 7090 + }, + { + "ce_loss_10": 3.7167228937149046, + "ce_loss_13": 3.654716455936432, + "ce_loss_2": 4.171470665931702, + "ce_loss_3": 4.006917369365692, + "ce_loss_7": 3.763367462158203, + "epoch": 0.71, + "grad_norm": 402.0, + "kl_loss_10": 97.96182098388672, + "kl_loss_2": 1088.9804382324219, + "kl_loss_3": 752.4143249511719, + "kl_loss_7": 190.0522773742676, + "learning_rate": 0.00019719515643116677, + "loss": 545.6708, + "step": 7100 + }, + { + "ce_loss_10": 3.657674491405487, + "ce_loss_13": 3.595584750175476, + "ce_loss_2": 4.113815677165985, + "ce_loss_3": 3.9436608791351317, + "ce_loss_7": 3.700818693637848, + "epoch": 0.711, + "grad_norm": 354.0, + "kl_loss_10": 97.26274185180664, + "kl_loss_2": 1084.9519958496094, + "kl_loss_3": 745.9836975097656, + "kl_loss_7": 187.7238555908203, + "learning_rate": 0.0001959340773364911, + "loss": 535.516, + "step": 7110 + }, + { + "ce_loss_10": 3.6742369413375853, + "ce_loss_13": 3.613626217842102, + "ce_loss_2": 4.1353883981704715, + "ce_loss_3": 3.9663340568542482, + "ce_loss_7": 3.715994417667389, + "epoch": 0.712, + "grad_norm": 414.0, + "kl_loss_10": 97.77620887756348, + "kl_loss_2": 1094.1240295410157, + "kl_loss_3": 755.1257873535156, + "kl_loss_7": 188.97418975830078, + "learning_rate": 0.0001946760601822809, + "loss": 526.0803, + "step": 7120 + }, + { + "ce_loss_10": 3.724298870563507, + "ce_loss_13": 3.6654844999313356, + "ce_loss_2": 4.171249413490296, + "ce_loss_3": 4.011460411548614, + "ce_loss_7": 3.770449674129486, + "epoch": 0.713, + "grad_norm": 328.0, + "kl_loss_10": 95.51984024047852, + "kl_loss_2": 1076.5175323486328, + "kl_loss_3": 741.9749359130859, + "kl_loss_7": 187.3384910583496, + "learning_rate": 0.00019342111763675512, + "loss": 520.2061, + "step": 7130 + }, + { + "ce_loss_10": 3.730803680419922, + "ce_loss_13": 3.6689053654670714, + "ce_loss_2": 4.169312536716461, + "ce_loss_3": 4.00542528629303, + "ce_loss_7": 3.7727373957633974, + "epoch": 0.714, + "grad_norm": 418.0, + "kl_loss_10": 99.5161979675293, + "kl_loss_2": 1071.9742065429687, + "kl_loss_3": 743.7749084472656, + "kl_loss_7": 189.85234451293945, + "learning_rate": 0.00019216926233717085, + "loss": 525.6779, + "step": 7140 + }, + { + "ce_loss_10": 3.6117329597473145, + "ce_loss_13": 3.5528572678565977, + "ce_loss_2": 4.092064487934112, + "ce_loss_3": 3.914566385746002, + "ce_loss_7": 3.653049111366272, + "epoch": 0.715, + "grad_norm": 342.0, + "kl_loss_10": 95.1452823638916, + "kl_loss_2": 1135.5553619384766, + "kl_loss_3": 779.0962982177734, + "kl_loss_7": 185.5459442138672, + "learning_rate": 0.00019092050688969737, + "loss": 540.4428, + "step": 7150 + }, + { + "ce_loss_10": 3.6794282674789427, + "ce_loss_13": 3.619647240638733, + "ce_loss_2": 4.124801588058472, + "ce_loss_3": 3.9644263625144958, + "ce_loss_7": 3.7204025983810425, + "epoch": 0.716, + "grad_norm": 458.0, + "kl_loss_10": 95.73797454833985, + "kl_loss_2": 1075.4539825439454, + "kl_loss_3": 743.3267883300781, + "kl_loss_7": 186.0149787902832, + "learning_rate": 0.00018967486386928817, + "loss": 525.8811, + "step": 7160 + }, + { + "ce_loss_10": 3.5499155521392822, + "ce_loss_13": 3.4892677664756775, + "ce_loss_2": 4.026895833015442, + "ce_loss_3": 3.8540278673171997, + "ce_loss_7": 3.594646680355072, + "epoch": 0.717, + "grad_norm": 458.0, + "kl_loss_10": 93.14333381652833, + "kl_loss_2": 1122.9288635253906, + "kl_loss_3": 776.9212982177735, + "kl_loss_7": 188.66815719604492, + "learning_rate": 0.00018843234581955443, + "loss": 552.9929, + "step": 7170 + }, + { + "ce_loss_10": 3.574516201019287, + "ce_loss_13": 3.512941229343414, + "ce_loss_2": 4.049459004402161, + "ce_loss_3": 3.871944236755371, + "ce_loss_7": 3.6209982872009276, + "epoch": 0.718, + "grad_norm": 364.0, + "kl_loss_10": 96.56784629821777, + "kl_loss_2": 1129.7979248046875, + "kl_loss_3": 775.4145477294921, + "kl_loss_7": 190.88178558349608, + "learning_rate": 0.00018719296525263924, + "loss": 541.6241, + "step": 7180 + }, + { + "ce_loss_10": 3.6690776705741883, + "ce_loss_13": 3.6084558844566343, + "ce_loss_2": 4.109010553359985, + "ce_loss_3": 3.944419741630554, + "ce_loss_7": 3.7104645013809203, + "epoch": 0.719, + "grad_norm": 472.0, + "kl_loss_10": 96.92717056274414, + "kl_loss_2": 1058.7910217285157, + "kl_loss_3": 728.63525390625, + "kl_loss_7": 186.22266235351563, + "learning_rate": 0.0001859567346490913, + "loss": 525.3373, + "step": 7190 + }, + { + "ce_loss_10": 3.6438188314437867, + "ce_loss_13": 3.5840962886810304, + "ce_loss_2": 4.113380300998688, + "ce_loss_3": 3.9464030385017397, + "ce_loss_7": 3.690832197666168, + "epoch": 0.72, + "grad_norm": 372.0, + "kl_loss_10": 96.38097648620605, + "kl_loss_2": 1109.4217742919923, + "kl_loss_3": 771.3218353271484, + "kl_loss_7": 191.62188110351562, + "learning_rate": 0.0001847236664577389, + "loss": 531.0333, + "step": 7200 + }, + { + "ce_loss_10": 3.673705244064331, + "ce_loss_13": 3.614805054664612, + "ce_loss_2": 4.117141389846802, + "ce_loss_3": 3.954344153404236, + "ce_loss_7": 3.717172992229462, + "epoch": 0.721, + "grad_norm": 342.0, + "kl_loss_10": 96.93136787414551, + "kl_loss_2": 1071.9077087402343, + "kl_loss_3": 737.0366821289062, + "kl_loss_7": 186.3966079711914, + "learning_rate": 0.00018349377309556487, + "loss": 518.4113, + "step": 7210 + }, + { + "ce_loss_10": 3.609177756309509, + "ce_loss_13": 3.5509839773178102, + "ce_loss_2": 4.084410285949707, + "ce_loss_3": 3.911909210681915, + "ce_loss_7": 3.6546399116516115, + "epoch": 0.722, + "grad_norm": 436.0, + "kl_loss_10": 94.82120094299316, + "kl_loss_2": 1119.1944885253906, + "kl_loss_3": 772.7051483154297, + "kl_loss_7": 190.29311599731446, + "learning_rate": 0.00018226706694758193, + "loss": 539.7223, + "step": 7220 + }, + { + "ce_loss_10": 3.6862050175666807, + "ce_loss_13": 3.6256973266601564, + "ce_loss_2": 4.135941016674042, + "ce_loss_3": 3.9752198338508604, + "ce_loss_7": 3.7262799024581907, + "epoch": 0.723, + "grad_norm": 386.0, + "kl_loss_10": 96.04033012390137, + "kl_loss_2": 1079.298776245117, + "kl_loss_3": 752.3606506347656, + "kl_loss_7": 187.0266014099121, + "learning_rate": 0.0001810435603667075, + "loss": 540.3036, + "step": 7230 + }, + { + "ce_loss_10": 3.5322535395622254, + "ce_loss_13": 3.4715150594711304, + "ce_loss_2": 4.000997626781464, + "ce_loss_3": 3.8283260583877565, + "ce_loss_7": 3.5753297805786133, + "epoch": 0.724, + "grad_norm": 348.0, + "kl_loss_10": 92.0587100982666, + "kl_loss_2": 1101.032977294922, + "kl_loss_3": 757.3754730224609, + "kl_loss_7": 184.87646255493163, + "learning_rate": 0.0001798232656736389, + "loss": 539.9771, + "step": 7240 + }, + { + "ce_loss_10": 3.7180214405059813, + "ce_loss_13": 3.6561784505844117, + "ce_loss_2": 4.153665316104889, + "ce_loss_3": 3.994912326335907, + "ce_loss_7": 3.759207808971405, + "epoch": 0.725, + "grad_norm": 388.0, + "kl_loss_10": 97.47655296325684, + "kl_loss_2": 1060.039584350586, + "kl_loss_3": 729.7286529541016, + "kl_loss_7": 185.7909019470215, + "learning_rate": 0.0001786061951567303, + "loss": 527.9849, + "step": 7250 + }, + { + "ce_loss_10": 3.630312275886536, + "ce_loss_13": 3.5694428086280823, + "ce_loss_2": 4.091070818901062, + "ce_loss_3": 3.92718985080719, + "ce_loss_7": 3.675185751914978, + "epoch": 0.726, + "grad_norm": 382.0, + "kl_loss_10": 97.81040573120117, + "kl_loss_2": 1091.2934509277343, + "kl_loss_3": 755.8922180175781, + "kl_loss_7": 189.30439071655275, + "learning_rate": 0.00017739236107186857, + "loss": 537.2411, + "step": 7260 + }, + { + "ce_loss_10": 3.711188280582428, + "ce_loss_13": 3.6525003552436828, + "ce_loss_2": 4.142853522300721, + "ce_loss_3": 3.981386995315552, + "ce_loss_7": 3.7502527594566346, + "epoch": 0.727, + "grad_norm": 374.0, + "kl_loss_10": 93.90410652160645, + "kl_loss_2": 1048.1178436279297, + "kl_loss_3": 725.0591918945313, + "kl_loss_7": 182.22721328735352, + "learning_rate": 0.00017618177564234904, + "loss": 519.2631, + "step": 7270 + }, + { + "ce_loss_10": 3.693279492855072, + "ce_loss_13": 3.6356263041496275, + "ce_loss_2": 4.13202931880951, + "ce_loss_3": 3.9750990748405455, + "ce_loss_7": 3.7332441210746765, + "epoch": 0.728, + "grad_norm": 318.0, + "kl_loss_10": 95.86821098327637, + "kl_loss_2": 1048.5098999023437, + "kl_loss_3": 724.8844573974609, + "kl_loss_7": 182.79603576660156, + "learning_rate": 0.00017497445105875377, + "loss": 523.0468, + "step": 7280 + }, + { + "ce_loss_10": 3.595864677429199, + "ce_loss_13": 3.5371819376945495, + "ce_loss_2": 4.073407852649689, + "ce_loss_3": 3.904122495651245, + "ce_loss_7": 3.6426048040390016, + "epoch": 0.729, + "grad_norm": 442.0, + "kl_loss_10": 95.08332710266113, + "kl_loss_2": 1130.4158264160155, + "kl_loss_3": 780.6070220947265, + "kl_loss_7": 189.8589889526367, + "learning_rate": 0.000173770399478828, + "loss": 538.7581, + "step": 7290 + }, + { + "ce_loss_10": 3.5191142082214357, + "ce_loss_13": 3.461543416976929, + "ce_loss_2": 3.977211833000183, + "ce_loss_3": 3.8103960871696474, + "ce_loss_7": 3.564071011543274, + "epoch": 0.73, + "grad_norm": 438.0, + "kl_loss_10": 93.54008331298829, + "kl_loss_2": 1093.509115600586, + "kl_loss_3": 757.6209930419922, + "kl_loss_7": 186.89632568359374, + "learning_rate": 0.0001725696330273575, + "loss": 540.4559, + "step": 7300 + }, + { + "ce_loss_10": 3.714753472805023, + "ce_loss_13": 3.6550832748413087, + "ce_loss_2": 4.150299251079559, + "ce_loss_3": 3.9939948439598085, + "ce_loss_7": 3.757344377040863, + "epoch": 0.731, + "grad_norm": 486.0, + "kl_loss_10": 93.61467895507812, + "kl_loss_2": 1050.2083618164063, + "kl_loss_3": 726.4699127197266, + "kl_loss_7": 182.62665328979492, + "learning_rate": 0.00017137216379604724, + "loss": 517.0194, + "step": 7310 + }, + { + "ce_loss_10": 3.590583050251007, + "ce_loss_13": 3.5311309576034544, + "ce_loss_2": 4.051425302028656, + "ce_loss_3": 3.8829818606376647, + "ce_loss_7": 3.632352864742279, + "epoch": 0.732, + "grad_norm": 340.0, + "kl_loss_10": 95.8599407196045, + "kl_loss_2": 1085.3143981933595, + "kl_loss_3": 747.0916809082031, + "kl_loss_7": 186.49290466308594, + "learning_rate": 0.00017017800384339925, + "loss": 528.4002, + "step": 7320 + }, + { + "ce_loss_10": 3.540472662448883, + "ce_loss_13": 3.4801993131637574, + "ce_loss_2": 4.017971241474152, + "ce_loss_3": 3.8469355702400208, + "ce_loss_7": 3.586536169052124, + "epoch": 0.733, + "grad_norm": 316.0, + "kl_loss_10": 95.24363555908204, + "kl_loss_2": 1121.9350006103516, + "kl_loss_3": 775.7258972167969, + "kl_loss_7": 189.4253242492676, + "learning_rate": 0.00016898716519459073, + "loss": 528.2626, + "step": 7330 + }, + { + "ce_loss_10": 3.6674713015556337, + "ce_loss_13": 3.608376145362854, + "ce_loss_2": 4.144577407836914, + "ce_loss_3": 3.9727422475814818, + "ce_loss_7": 3.712773549556732, + "epoch": 0.734, + "grad_norm": 330.0, + "kl_loss_10": 96.16988220214844, + "kl_loss_2": 1116.4668975830077, + "kl_loss_3": 767.9603485107422, + "kl_loss_7": 191.9127670288086, + "learning_rate": 0.00016779965984135375, + "loss": 536.6205, + "step": 7340 + }, + { + "ce_loss_10": 3.5673499703407288, + "ce_loss_13": 3.5097331523895265, + "ce_loss_2": 4.023692965507507, + "ce_loss_3": 3.8575591087341308, + "ce_loss_7": 3.6114558935165406, + "epoch": 0.735, + "grad_norm": 478.0, + "kl_loss_10": 92.66586227416992, + "kl_loss_2": 1079.1628143310547, + "kl_loss_3": 740.6051025390625, + "kl_loss_7": 182.72610321044922, + "learning_rate": 0.00016661549974185424, + "loss": 528.04, + "step": 7350 + }, + { + "ce_loss_10": 3.612525475025177, + "ce_loss_13": 3.5525230765342712, + "ce_loss_2": 4.068535602092743, + "ce_loss_3": 3.9024940848350527, + "ce_loss_7": 3.6558452367782595, + "epoch": 0.736, + "grad_norm": 390.0, + "kl_loss_10": 97.4712890625, + "kl_loss_2": 1087.6514739990234, + "kl_loss_3": 751.1584289550781, + "kl_loss_7": 188.87088012695312, + "learning_rate": 0.00016543469682057105, + "loss": 524.4483, + "step": 7360 + }, + { + "ce_loss_10": 3.6394684672355653, + "ce_loss_13": 3.579529356956482, + "ce_loss_2": 4.096106541156769, + "ce_loss_3": 3.930702245235443, + "ce_loss_7": 3.6828288197517396, + "epoch": 0.737, + "grad_norm": 332.0, + "kl_loss_10": 96.63297386169434, + "kl_loss_2": 1092.361489868164, + "kl_loss_3": 752.7328277587891, + "kl_loss_7": 189.77932739257812, + "learning_rate": 0.00016425726296817632, + "loss": 533.2087, + "step": 7370 + }, + { + "ce_loss_10": 3.6602264523506163, + "ce_loss_13": 3.6020756483078005, + "ce_loss_2": 4.102893972396851, + "ce_loss_3": 3.9389352679252623, + "ce_loss_7": 3.702047073841095, + "epoch": 0.738, + "grad_norm": 604.0, + "kl_loss_10": 95.1510066986084, + "kl_loss_2": 1066.6962097167968, + "kl_loss_3": 734.1273132324219, + "kl_loss_7": 185.05731124877929, + "learning_rate": 0.00016308321004141607, + "loss": 524.9394, + "step": 7380 + }, + { + "ce_loss_10": 3.6052220940589903, + "ce_loss_13": 3.544311022758484, + "ce_loss_2": 4.074472200870514, + "ce_loss_3": 3.905838668346405, + "ce_loss_7": 3.6499088406562805, + "epoch": 0.739, + "grad_norm": 414.0, + "kl_loss_10": 98.00579032897949, + "kl_loss_2": 1091.213427734375, + "kl_loss_3": 753.4314147949219, + "kl_loss_7": 190.15870666503906, + "learning_rate": 0.00016191254986299043, + "loss": 528.1322, + "step": 7390 + }, + { + "ce_loss_10": 3.665621018409729, + "ce_loss_13": 3.606168735027313, + "ce_loss_2": 4.110114741325378, + "ce_loss_3": 3.9419893980026246, + "ce_loss_7": 3.7061524271965025, + "epoch": 0.74, + "grad_norm": 380.0, + "kl_loss_10": 95.95707778930664, + "kl_loss_2": 1084.0633728027344, + "kl_loss_3": 743.1361236572266, + "kl_loss_7": 184.58063583374025, + "learning_rate": 0.00016074529422143398, + "loss": 534.7291, + "step": 7400 + }, + { + "ce_loss_10": 3.5971511721611025, + "ce_loss_13": 3.540179669857025, + "ce_loss_2": 4.063095271587372, + "ce_loss_3": 3.8901899337768553, + "ce_loss_7": 3.6407782435417175, + "epoch": 0.741, + "grad_norm": 672.0, + "kl_loss_10": 95.23762931823731, + "kl_loss_2": 1107.8231140136718, + "kl_loss_3": 756.5032379150391, + "kl_loss_7": 187.1332000732422, + "learning_rate": 0.0001595814548709983, + "loss": 535.9396, + "step": 7410 + }, + { + "ce_loss_10": 3.6745630502700806, + "ce_loss_13": 3.613684153556824, + "ce_loss_2": 4.1425374269485475, + "ce_loss_3": 3.9706888437271117, + "ce_loss_7": 3.7216501116752623, + "epoch": 0.742, + "grad_norm": 372.0, + "kl_loss_10": 97.69215469360351, + "kl_loss_2": 1104.6529205322265, + "kl_loss_3": 761.8667907714844, + "kl_loss_7": 191.12793655395507, + "learning_rate": 0.00015842104353153285, + "loss": 536.9469, + "step": 7420 + }, + { + "ce_loss_10": 3.6906041502952576, + "ce_loss_13": 3.6308916926383974, + "ce_loss_2": 4.145760095119476, + "ce_loss_3": 3.981899178028107, + "ce_loss_7": 3.7335981249809267, + "epoch": 0.743, + "grad_norm": 418.0, + "kl_loss_10": 97.18793029785157, + "kl_loss_2": 1097.6078674316407, + "kl_loss_3": 759.1226196289062, + "kl_loss_7": 189.25363845825194, + "learning_rate": 0.0001572640718883667, + "loss": 543.1555, + "step": 7430 + }, + { + "ce_loss_10": 3.6231363296508787, + "ce_loss_13": 3.564767360687256, + "ce_loss_2": 4.071314561367035, + "ce_loss_3": 3.91090784072876, + "ce_loss_7": 3.664019286632538, + "epoch": 0.744, + "grad_norm": 320.0, + "kl_loss_10": 94.90192832946778, + "kl_loss_2": 1067.2735595703125, + "kl_loss_3": 738.7796752929687, + "kl_loss_7": 183.48248062133788, + "learning_rate": 0.0001561105515921915, + "loss": 533.524, + "step": 7440 + }, + { + "ce_loss_10": 3.463870346546173, + "ce_loss_13": 3.4067335724830627, + "ce_loss_2": 3.9477816224098206, + "ce_loss_3": 3.7779128670692446, + "ce_loss_7": 3.51077561378479, + "epoch": 0.745, + "grad_norm": 300.0, + "kl_loss_10": 92.0508991241455, + "kl_loss_2": 1123.8193664550781, + "kl_loss_3": 770.2350646972657, + "kl_loss_7": 184.63360900878905, + "learning_rate": 0.0001549604942589441, + "loss": 530.4723, + "step": 7450 + }, + { + "ce_loss_10": 3.6651261687278747, + "ce_loss_13": 3.6062275648117064, + "ce_loss_2": 4.092234718799591, + "ce_loss_3": 3.9361136317253114, + "ce_loss_7": 3.7055254936218263, + "epoch": 0.746, + "grad_norm": 366.0, + "kl_loss_10": 93.61905822753906, + "kl_loss_2": 1028.498812866211, + "kl_loss_3": 711.0323303222656, + "kl_loss_7": 180.76227340698242, + "learning_rate": 0.00015381391146968864, + "loss": 518.9042, + "step": 7460 + }, + { + "ce_loss_10": 3.6343637704849243, + "ce_loss_13": 3.5772631406784057, + "ce_loss_2": 4.097551655769348, + "ce_loss_3": 3.9294708490371706, + "ce_loss_7": 3.6792303323745728, + "epoch": 0.747, + "grad_norm": 348.0, + "kl_loss_10": 93.67252769470215, + "kl_loss_2": 1075.3313690185546, + "kl_loss_3": 736.6811370849609, + "kl_loss_7": 182.92275466918946, + "learning_rate": 0.00015267081477050133, + "loss": 529.2104, + "step": 7470 + }, + { + "ce_loss_10": 3.737002635002136, + "ce_loss_13": 3.6760261058807373, + "ce_loss_2": 4.184408628940583, + "ce_loss_3": 4.020549094676971, + "ce_loss_7": 3.779319405555725, + "epoch": 0.748, + "grad_norm": 314.0, + "kl_loss_10": 97.9722526550293, + "kl_loss_2": 1074.686865234375, + "kl_loss_3": 738.4016967773438, + "kl_loss_7": 188.9453094482422, + "learning_rate": 0.00015153121567235335, + "loss": 521.3269, + "step": 7480 + }, + { + "ce_loss_10": 3.627355396747589, + "ce_loss_13": 3.566980814933777, + "ce_loss_2": 4.087941682338714, + "ce_loss_3": 3.9189056277275087, + "ce_loss_7": 3.671427834033966, + "epoch": 0.749, + "grad_norm": 362.0, + "kl_loss_10": 95.86229972839355, + "kl_loss_2": 1099.5704315185546, + "kl_loss_3": 757.0687835693359, + "kl_loss_7": 188.21585922241212, + "learning_rate": 0.00015039512565099468, + "loss": 520.7597, + "step": 7490 + }, + { + "ce_loss_10": 3.6923457860946653, + "ce_loss_13": 3.6337139129638674, + "ce_loss_2": 4.142465770244598, + "ce_loss_3": 3.9779353976249694, + "ce_loss_7": 3.7360000610351562, + "epoch": 0.75, + "grad_norm": 400.0, + "kl_loss_10": 96.83558921813965, + "kl_loss_2": 1084.189535522461, + "kl_loss_3": 748.4331817626953, + "kl_loss_7": 188.31302337646486, + "learning_rate": 0.00014926255614683932, + "loss": 542.3, + "step": 7500 + }, + { + "ce_loss_10": 3.63236540555954, + "ce_loss_13": 3.5743218302726745, + "ce_loss_2": 4.084986460208893, + "ce_loss_3": 3.9159162759780886, + "ce_loss_7": 3.6776034474372863, + "epoch": 0.751, + "grad_norm": 356.0, + "kl_loss_10": 95.49623985290528, + "kl_loss_2": 1074.4479522705078, + "kl_loss_3": 737.1827270507813, + "kl_loss_7": 185.40160522460937, + "learning_rate": 0.0001481335185648498, + "loss": 533.0406, + "step": 7510 + }, + { + "ce_loss_10": 3.6419626474380493, + "ce_loss_13": 3.583856701850891, + "ce_loss_2": 4.0939129114151, + "ce_loss_3": 3.9313414216041567, + "ce_loss_7": 3.686499559879303, + "epoch": 0.752, + "grad_norm": 406.0, + "kl_loss_10": 93.70109405517579, + "kl_loss_2": 1078.2966064453126, + "kl_loss_3": 747.9822265625, + "kl_loss_7": 186.15133514404297, + "learning_rate": 0.0001470080242744218, + "loss": 523.242, + "step": 7520 + }, + { + "ce_loss_10": 3.638762640953064, + "ce_loss_13": 3.5817859530448914, + "ce_loss_2": 4.096928322315216, + "ce_loss_3": 3.925868511199951, + "ce_loss_7": 3.6821122765541077, + "epoch": 0.753, + "grad_norm": 304.0, + "kl_loss_10": 92.91362495422364, + "kl_loss_2": 1078.3225189208983, + "kl_loss_3": 744.8880218505859, + "kl_loss_7": 183.9945556640625, + "learning_rate": 0.0001458860846092705, + "loss": 532.4821, + "step": 7530 + }, + { + "ce_loss_10": 3.6720047116279604, + "ce_loss_13": 3.6128148198127747, + "ce_loss_2": 4.114215791225433, + "ce_loss_3": 3.9525702714920046, + "ce_loss_7": 3.7135818719863893, + "epoch": 0.754, + "grad_norm": 322.0, + "kl_loss_10": 94.26252975463868, + "kl_loss_2": 1064.2825866699218, + "kl_loss_3": 735.7307800292969, + "kl_loss_7": 183.32004623413087, + "learning_rate": 0.00014476771086731566, + "loss": 517.3908, + "step": 7540 + }, + { + "ce_loss_10": 3.7847033739089966, + "ce_loss_13": 3.7219197750091553, + "ce_loss_2": 4.230582165718078, + "ce_loss_3": 4.065666139125824, + "ce_loss_7": 3.827774000167847, + "epoch": 0.755, + "grad_norm": 430.0, + "kl_loss_10": 99.63549118041992, + "kl_loss_2": 1067.45849609375, + "kl_loss_3": 732.3584259033203, + "kl_loss_7": 187.05829620361328, + "learning_rate": 0.00014365291431056872, + "loss": 535.3279, + "step": 7550 + }, + { + "ce_loss_10": 3.6090814113616942, + "ce_loss_13": 3.5493834733963014, + "ce_loss_2": 4.077333819866181, + "ce_loss_3": 3.906116855144501, + "ce_loss_7": 3.652885007858276, + "epoch": 0.756, + "grad_norm": 460.0, + "kl_loss_10": 97.59222984313965, + "kl_loss_2": 1117.932635498047, + "kl_loss_3": 769.9259338378906, + "kl_loss_7": 192.52491149902343, + "learning_rate": 0.00014254170616501827, + "loss": 534.983, + "step": 7560 + }, + { + "ce_loss_10": 3.535455918312073, + "ce_loss_13": 3.47601797580719, + "ce_loss_2": 4.02073061466217, + "ce_loss_3": 3.852824592590332, + "ce_loss_7": 3.582290494441986, + "epoch": 0.757, + "grad_norm": 544.0, + "kl_loss_10": 94.12142906188964, + "kl_loss_2": 1137.807843017578, + "kl_loss_3": 793.451205444336, + "kl_loss_7": 191.15487823486328, + "learning_rate": 0.0001414340976205183, + "loss": 552.139, + "step": 7570 + }, + { + "ce_loss_10": 3.554329538345337, + "ce_loss_13": 3.495155191421509, + "ce_loss_2": 4.028338003158569, + "ce_loss_3": 3.860016918182373, + "ce_loss_7": 3.6010040402412415, + "epoch": 0.758, + "grad_norm": 392.0, + "kl_loss_10": 94.82050590515136, + "kl_loss_2": 1103.2837646484375, + "kl_loss_3": 760.9699432373047, + "kl_loss_7": 186.93385314941406, + "learning_rate": 0.00014033009983067452, + "loss": 536.1902, + "step": 7580 + }, + { + "ce_loss_10": 3.7230227828025817, + "ce_loss_13": 3.663366961479187, + "ce_loss_2": 4.157820415496826, + "ce_loss_3": 3.9997562408447265, + "ce_loss_7": 3.766176974773407, + "epoch": 0.759, + "grad_norm": 366.0, + "kl_loss_10": 95.41510429382325, + "kl_loss_2": 1045.2667877197266, + "kl_loss_3": 721.8115600585937, + "kl_loss_7": 183.35761260986328, + "learning_rate": 0.00013922972391273224, + "loss": 521.7405, + "step": 7590 + }, + { + "ce_loss_10": 3.726309287548065, + "ce_loss_13": 3.666226303577423, + "ce_loss_2": 4.173925065994263, + "ce_loss_3": 4.007045650482178, + "ce_loss_7": 3.7666892886161802, + "epoch": 0.76, + "grad_norm": 396.0, + "kl_loss_10": 96.0021198272705, + "kl_loss_2": 1064.7614837646483, + "kl_loss_3": 734.0039642333984, + "kl_loss_7": 185.6392059326172, + "learning_rate": 0.0001381329809474649, + "loss": 528.3375, + "step": 7600 + }, + { + "ce_loss_10": 3.621905469894409, + "ce_loss_13": 3.561663830280304, + "ce_loss_2": 4.098969185352326, + "ce_loss_3": 3.925651717185974, + "ce_loss_7": 3.6682042717933654, + "epoch": 0.761, + "grad_norm": 370.0, + "kl_loss_10": 96.61415328979493, + "kl_loss_2": 1119.761654663086, + "kl_loss_3": 769.7445831298828, + "kl_loss_7": 190.59917831420898, + "learning_rate": 0.0001370398819790621, + "loss": 540.338, + "step": 7610 + }, + { + "ce_loss_10": 3.7644327759742735, + "ce_loss_13": 3.704228925704956, + "ce_loss_2": 4.202455806732178, + "ce_loss_3": 4.041428947448731, + "ce_loss_7": 3.8075477123260497, + "epoch": 0.762, + "grad_norm": 424.0, + "kl_loss_10": 97.06539382934571, + "kl_loss_2": 1046.6303649902343, + "kl_loss_3": 720.9399322509765, + "kl_loss_7": 185.40132827758788, + "learning_rate": 0.00013595043801501794, + "loss": 512.6931, + "step": 7620 + }, + { + "ce_loss_10": 3.5539215803146362, + "ce_loss_13": 3.4973302245140077, + "ce_loss_2": 4.044493949413299, + "ce_loss_3": 3.8687676310539247, + "ce_loss_7": 3.602993667125702, + "epoch": 0.763, + "grad_norm": 468.0, + "kl_loss_10": 92.99364700317383, + "kl_loss_2": 1138.1223754882812, + "kl_loss_3": 782.3796447753906, + "kl_loss_7": 188.2281280517578, + "learning_rate": 0.00013486466002602133, + "loss": 539.5471, + "step": 7630 + }, + { + "ce_loss_10": 3.680443322658539, + "ce_loss_13": 3.6184515833854674, + "ce_loss_2": 4.119498157501221, + "ce_loss_3": 3.9600594878196715, + "ce_loss_7": 3.7241831541061403, + "epoch": 0.764, + "grad_norm": 376.0, + "kl_loss_10": 97.24503707885742, + "kl_loss_2": 1061.5150573730468, + "kl_loss_3": 737.4153533935547, + "kl_loss_7": 187.22406005859375, + "learning_rate": 0.00013378255894584462, + "loss": 537.8561, + "step": 7640 + }, + { + "ce_loss_10": 3.60829781293869, + "ce_loss_13": 3.5466054916381835, + "ce_loss_2": 4.072665071487426, + "ce_loss_3": 3.9038659572601317, + "ce_loss_7": 3.6548298835754394, + "epoch": 0.765, + "grad_norm": 380.0, + "kl_loss_10": 95.1153465270996, + "kl_loss_2": 1096.5119873046874, + "kl_loss_3": 758.0185302734375, + "kl_loss_7": 188.7285140991211, + "learning_rate": 0.0001327041456712334, + "loss": 535.4322, + "step": 7650 + }, + { + "ce_loss_10": 3.649807059764862, + "ce_loss_13": 3.588579738140106, + "ce_loss_2": 4.103657793998718, + "ce_loss_3": 3.9434640645980834, + "ce_loss_7": 3.6960434794425963, + "epoch": 0.766, + "grad_norm": 410.0, + "kl_loss_10": 95.99581718444824, + "kl_loss_2": 1095.5443603515625, + "kl_loss_3": 758.2011474609375, + "kl_loss_7": 189.6258804321289, + "learning_rate": 0.00013162943106179747, + "loss": 538.4857, + "step": 7660 + }, + { + "ce_loss_10": 3.627143681049347, + "ce_loss_13": 3.5671829342842103, + "ce_loss_2": 4.08168009519577, + "ce_loss_3": 3.9202899813652037, + "ce_loss_7": 3.6696593165397644, + "epoch": 0.767, + "grad_norm": 372.0, + "kl_loss_10": 97.96165161132812, + "kl_loss_2": 1080.916067504883, + "kl_loss_3": 746.2386291503906, + "kl_loss_7": 187.8828155517578, + "learning_rate": 0.00013055842593990132, + "loss": 529.1405, + "step": 7670 + }, + { + "ce_loss_10": 3.571021115779877, + "ce_loss_13": 3.5149319171905518, + "ce_loss_2": 4.027233076095581, + "ce_loss_3": 3.864198935031891, + "ce_loss_7": 3.6173386335372926, + "epoch": 0.768, + "grad_norm": 372.0, + "kl_loss_10": 92.48302154541015, + "kl_loss_2": 1072.3523834228515, + "kl_loss_3": 740.25439453125, + "kl_loss_7": 183.08441925048828, + "learning_rate": 0.00012949114109055414, + "loss": 533.8078, + "step": 7680 + }, + { + "ce_loss_10": 3.6176257848739626, + "ce_loss_13": 3.5594166994094847, + "ce_loss_2": 4.078605031967163, + "ce_loss_3": 3.918487286567688, + "ce_loss_7": 3.6636170506477357, + "epoch": 0.769, + "grad_norm": 422.0, + "kl_loss_10": 94.60773849487305, + "kl_loss_2": 1089.138235473633, + "kl_loss_3": 757.3290557861328, + "kl_loss_7": 187.67217483520508, + "learning_rate": 0.00012842758726130281, + "loss": 537.3952, + "step": 7690 + }, + { + "ce_loss_10": 3.655508840084076, + "ce_loss_13": 3.5946714520454406, + "ce_loss_2": 4.117365610599518, + "ce_loss_3": 3.9561346530914308, + "ce_loss_7": 3.7002153038978576, + "epoch": 0.77, + "grad_norm": 432.0, + "kl_loss_10": 94.65040473937988, + "kl_loss_2": 1092.9069885253907, + "kl_loss_3": 757.1287445068359, + "kl_loss_7": 189.29573440551758, + "learning_rate": 0.00012736777516212267, + "loss": 528.3388, + "step": 7700 + }, + { + "ce_loss_10": 3.65016793012619, + "ce_loss_13": 3.5914124608039857, + "ce_loss_2": 4.1151956677436825, + "ce_loss_3": 3.947415459156036, + "ce_loss_7": 3.6969300508499146, + "epoch": 0.771, + "grad_norm": 404.0, + "kl_loss_10": 94.72591972351074, + "kl_loss_2": 1095.3469024658202, + "kl_loss_3": 757.773715209961, + "kl_loss_7": 189.3510871887207, + "learning_rate": 0.00012631171546530968, + "loss": 527.5062, + "step": 7710 + }, + { + "ce_loss_10": 3.6695477604866027, + "ce_loss_13": 3.6066803336143494, + "ce_loss_2": 4.130255508422851, + "ce_loss_3": 3.9629722952842714, + "ce_loss_7": 3.7124558687210083, + "epoch": 0.772, + "grad_norm": 400.0, + "kl_loss_10": 99.19231147766114, + "kl_loss_2": 1089.8547271728517, + "kl_loss_3": 754.8526977539062, + "kl_loss_7": 189.7204719543457, + "learning_rate": 0.00012525941880537307, + "loss": 538.339, + "step": 7720 + }, + { + "ce_loss_10": 3.7045652866363525, + "ce_loss_13": 3.6435051798820495, + "ce_loss_2": 4.150338041782379, + "ce_loss_3": 3.9872673988342284, + "ce_loss_7": 3.7454243421554567, + "epoch": 0.773, + "grad_norm": 398.0, + "kl_loss_10": 95.61402626037598, + "kl_loss_2": 1061.4443786621093, + "kl_loss_3": 733.3583831787109, + "kl_loss_7": 185.768399810791, + "learning_rate": 0.00012421089577892869, + "loss": 524.5635, + "step": 7730 + }, + { + "ce_loss_10": 3.645431864261627, + "ce_loss_13": 3.584313917160034, + "ce_loss_2": 4.109975218772888, + "ce_loss_3": 3.9383553504943847, + "ce_loss_7": 3.6912776827812195, + "epoch": 0.774, + "grad_norm": 440.0, + "kl_loss_10": 96.41397132873536, + "kl_loss_2": 1098.874331665039, + "kl_loss_3": 755.1620544433594, + "kl_loss_7": 190.60089797973632, + "learning_rate": 0.0001231661569445919, + "loss": 536.2486, + "step": 7740 + }, + { + "ce_loss_10": 3.501088798046112, + "ce_loss_13": 3.443252968788147, + "ce_loss_2": 3.9620775461196898, + "ce_loss_3": 3.795079970359802, + "ce_loss_7": 3.5464309573173525, + "epoch": 0.775, + "grad_norm": 346.0, + "kl_loss_10": 93.47399139404297, + "kl_loss_2": 1090.8283447265626, + "kl_loss_3": 754.4031158447266, + "kl_loss_7": 186.22638092041015, + "learning_rate": 0.00012212521282287093, + "loss": 538.4937, + "step": 7750 + }, + { + "ce_loss_10": 3.6629942655563354, + "ce_loss_13": 3.601106119155884, + "ce_loss_2": 4.111618340015411, + "ce_loss_3": 3.951659619808197, + "ce_loss_7": 3.7078338623046876, + "epoch": 0.776, + "grad_norm": 364.0, + "kl_loss_10": 98.37307014465333, + "kl_loss_2": 1080.0280029296875, + "kl_loss_3": 745.148388671875, + "kl_loss_7": 190.13256072998047, + "learning_rate": 0.00012108807389606158, + "loss": 538.7029, + "step": 7760 + }, + { + "ce_loss_10": 3.659121203422546, + "ce_loss_13": 3.6007887601852415, + "ce_loss_2": 4.112268555164337, + "ce_loss_3": 3.9502876162528993, + "ce_loss_7": 3.7037811279296875, + "epoch": 0.777, + "grad_norm": 364.0, + "kl_loss_10": 93.70635108947754, + "kl_loss_2": 1072.3641204833984, + "kl_loss_3": 740.7109130859375, + "kl_loss_7": 182.99172821044922, + "learning_rate": 0.00012005475060814159, + "loss": 525.026, + "step": 7770 + }, + { + "ce_loss_10": 3.5951377630233763, + "ce_loss_13": 3.5359464406967165, + "ce_loss_2": 4.060847020149231, + "ce_loss_3": 3.891322433948517, + "ce_loss_7": 3.63891544342041, + "epoch": 0.778, + "grad_norm": 384.0, + "kl_loss_10": 97.0392059326172, + "kl_loss_2": 1106.707992553711, + "kl_loss_3": 763.6160034179687, + "kl_loss_7": 188.94908752441407, + "learning_rate": 0.00011902525336466464, + "loss": 535.4202, + "step": 7780 + }, + { + "ce_loss_10": 3.5829373240470885, + "ce_loss_13": 3.5231135487556458, + "ce_loss_2": 4.054291594028473, + "ce_loss_3": 3.888161540031433, + "ce_loss_7": 3.630410146713257, + "epoch": 0.779, + "grad_norm": 384.0, + "kl_loss_10": 95.91268005371094, + "kl_loss_2": 1108.9134979248047, + "kl_loss_3": 768.8667724609375, + "kl_loss_7": 190.86130905151367, + "learning_rate": 0.00011799959253265668, + "loss": 532.9367, + "step": 7790 + }, + { + "ce_loss_10": 3.646629250049591, + "ce_loss_13": 3.584940028190613, + "ce_loss_2": 4.100114536285401, + "ce_loss_3": 3.9342658519744873, + "ce_loss_7": 3.687722849845886, + "epoch": 0.78, + "grad_norm": 426.0, + "kl_loss_10": 98.96642303466797, + "kl_loss_2": 1093.9118621826171, + "kl_loss_3": 757.5971832275391, + "kl_loss_7": 190.95031204223633, + "learning_rate": 0.00011697777844051105, + "loss": 534.9413, + "step": 7800 + }, + { + "ce_loss_10": 3.6246392488479615, + "ce_loss_13": 3.5636275887489317, + "ce_loss_2": 4.0959463000297545, + "ce_loss_3": 3.9209203004837034, + "ce_loss_7": 3.668913960456848, + "epoch": 0.781, + "grad_norm": 394.0, + "kl_loss_10": 96.37951927185058, + "kl_loss_2": 1131.5390258789062, + "kl_loss_3": 774.0704650878906, + "kl_loss_7": 190.10399703979493, + "learning_rate": 0.00011595982137788402, + "loss": 539.5272, + "step": 7810 + }, + { + "ce_loss_10": 3.601748263835907, + "ce_loss_13": 3.542947518825531, + "ce_loss_2": 4.0462228655815125, + "ce_loss_3": 3.884107196331024, + "ce_loss_7": 3.6427837133407595, + "epoch": 0.782, + "grad_norm": 362.0, + "kl_loss_10": 95.04786491394043, + "kl_loss_2": 1064.3328094482422, + "kl_loss_3": 734.7262878417969, + "kl_loss_7": 183.74214706420898, + "learning_rate": 0.00011494573159559212, + "loss": 528.7992, + "step": 7820 + }, + { + "ce_loss_10": 3.587358093261719, + "ce_loss_13": 3.5285757184028625, + "ce_loss_2": 4.055095791816711, + "ce_loss_3": 3.8850948452949523, + "ce_loss_7": 3.6320362448692323, + "epoch": 0.783, + "grad_norm": 344.0, + "kl_loss_10": 95.2613368988037, + "kl_loss_2": 1092.221664428711, + "kl_loss_3": 759.4220550537109, + "kl_loss_7": 186.76042938232422, + "learning_rate": 0.00011393551930550828, + "loss": 541.8625, + "step": 7830 + }, + { + "ce_loss_10": 3.7354641199111938, + "ce_loss_13": 3.6739312171936036, + "ce_loss_2": 4.175600934028625, + "ce_loss_3": 4.019279301166534, + "ce_loss_7": 3.7783281922340395, + "epoch": 0.784, + "grad_norm": 390.0, + "kl_loss_10": 99.59685325622559, + "kl_loss_2": 1064.6414337158203, + "kl_loss_3": 741.2587860107421, + "kl_loss_7": 189.10858612060548, + "learning_rate": 0.00011292919468045875, + "loss": 527.9955, + "step": 7840 + }, + { + "ce_loss_10": 3.680347263813019, + "ce_loss_13": 3.6196223735809325, + "ce_loss_2": 4.128578865528107, + "ce_loss_3": 3.9644781708717347, + "ce_loss_7": 3.723640871047974, + "epoch": 0.785, + "grad_norm": 326.0, + "kl_loss_10": 95.6224323272705, + "kl_loss_2": 1072.9300354003906, + "kl_loss_3": 746.2864379882812, + "kl_loss_7": 187.85019607543944, + "learning_rate": 0.00011192676785412154, + "loss": 523.3404, + "step": 7850 + }, + { + "ce_loss_10": 3.622621536254883, + "ce_loss_13": 3.560643196105957, + "ce_loss_2": 4.089509451389313, + "ce_loss_3": 3.9235698699951174, + "ce_loss_7": 3.6674723744392397, + "epoch": 0.786, + "grad_norm": 458.0, + "kl_loss_10": 96.80489120483398, + "kl_loss_2": 1093.20048828125, + "kl_loss_3": 754.2780883789062, + "kl_loss_7": 187.94250411987304, + "learning_rate": 0.00011092824892092374, + "loss": 533.5229, + "step": 7860 + }, + { + "ce_loss_10": 3.547496974468231, + "ce_loss_13": 3.4892043232917787, + "ce_loss_2": 4.020435309410095, + "ce_loss_3": 3.8508559226989747, + "ce_loss_7": 3.5902876496315, + "epoch": 0.787, + "grad_norm": 322.0, + "kl_loss_10": 94.49787139892578, + "kl_loss_2": 1110.2376556396484, + "kl_loss_3": 767.8008331298828, + "kl_loss_7": 188.15859375, + "learning_rate": 0.0001099336479359398, + "loss": 532.4489, + "step": 7870 + }, + { + "ce_loss_10": 3.676584839820862, + "ce_loss_13": 3.6199623942375183, + "ce_loss_2": 4.124644804000854, + "ce_loss_3": 3.9601522207260134, + "ce_loss_7": 3.7184366583824158, + "epoch": 0.788, + "grad_norm": 414.0, + "kl_loss_10": 92.98647613525391, + "kl_loss_2": 1076.658267211914, + "kl_loss_3": 737.3064331054687, + "kl_loss_7": 183.75065536499022, + "learning_rate": 0.00010894297491479043, + "loss": 529.369, + "step": 7880 + }, + { + "ce_loss_10": 3.675907850265503, + "ce_loss_13": 3.615241324901581, + "ce_loss_2": 4.123448085784912, + "ce_loss_3": 3.9602300405502318, + "ce_loss_7": 3.715320038795471, + "epoch": 0.789, + "grad_norm": 370.0, + "kl_loss_10": 97.27086067199707, + "kl_loss_2": 1078.250909423828, + "kl_loss_3": 741.1790222167969, + "kl_loss_7": 186.16854553222657, + "learning_rate": 0.00010795623983354214, + "loss": 523.6978, + "step": 7890 + }, + { + "ce_loss_10": 3.549619424343109, + "ce_loss_13": 3.492576813697815, + "ce_loss_2": 4.021520948410034, + "ce_loss_3": 3.8529414176940917, + "ce_loss_7": 3.595447373390198, + "epoch": 0.79, + "grad_norm": 428.0, + "kl_loss_10": 93.0215072631836, + "kl_loss_2": 1113.914730834961, + "kl_loss_3": 772.1699676513672, + "kl_loss_7": 189.76142959594728, + "learning_rate": 0.00010697345262860636, + "loss": 533.2417, + "step": 7900 + }, + { + "ce_loss_10": 3.702609384059906, + "ce_loss_13": 3.6431208491325378, + "ce_loss_2": 4.14087233543396, + "ce_loss_3": 3.9802441716194155, + "ce_loss_7": 3.7457746505737304, + "epoch": 0.791, + "grad_norm": 368.0, + "kl_loss_10": 97.61579055786133, + "kl_loss_2": 1063.5964447021483, + "kl_loss_3": 734.3944030761719, + "kl_loss_7": 187.06654663085936, + "learning_rate": 0.00010599462319663906, + "loss": 520.0625, + "step": 7910 + }, + { + "ce_loss_10": 3.6748117208480835, + "ce_loss_13": 3.614163410663605, + "ce_loss_2": 4.111383318901062, + "ce_loss_3": 3.951873278617859, + "ce_loss_7": 3.715614116191864, + "epoch": 0.792, + "grad_norm": 382.0, + "kl_loss_10": 94.54501228332519, + "kl_loss_2": 1049.0091613769532, + "kl_loss_3": 722.9781219482422, + "kl_loss_7": 183.01754150390624, + "learning_rate": 0.00010501976139444191, + "loss": 518.3574, + "step": 7920 + }, + { + "ce_loss_10": 3.7049331426620484, + "ce_loss_13": 3.6438170671463013, + "ce_loss_2": 4.144390141963958, + "ce_loss_3": 3.9876601815223696, + "ce_loss_7": 3.745813262462616, + "epoch": 0.793, + "grad_norm": 370.0, + "kl_loss_10": 97.8447940826416, + "kl_loss_2": 1057.744808959961, + "kl_loss_3": 730.5345703125, + "kl_loss_7": 185.18996047973633, + "learning_rate": 0.0001040488770388625, + "loss": 527.8366, + "step": 7930 + }, + { + "ce_loss_10": 3.6446168065071105, + "ce_loss_13": 3.5857683539390566, + "ce_loss_2": 4.095709836483001, + "ce_loss_3": 3.92866997718811, + "ce_loss_7": 3.685992920398712, + "epoch": 0.794, + "grad_norm": 426.0, + "kl_loss_10": 95.57501831054688, + "kl_loss_2": 1080.6232208251954, + "kl_loss_3": 746.1043212890625, + "kl_loss_7": 186.66847763061523, + "learning_rate": 0.00010308197990669538, + "loss": 527.0882, + "step": 7940 + }, + { + "ce_loss_10": 3.7647696137428284, + "ce_loss_13": 3.7019853234291076, + "ce_loss_2": 4.21561850309372, + "ce_loss_3": 4.0513708114624025, + "ce_loss_7": 3.8064971685409548, + "epoch": 0.795, + "grad_norm": 356.0, + "kl_loss_10": 100.9611873626709, + "kl_loss_2": 1084.6148345947265, + "kl_loss_3": 743.2166534423828, + "kl_loss_7": 191.26584091186524, + "learning_rate": 0.0001021190797345839, + "loss": 525.7331, + "step": 7950 + }, + { + "ce_loss_10": 3.4792375445365904, + "ce_loss_13": 3.4190258502960207, + "ce_loss_2": 3.96710387468338, + "ce_loss_3": 3.7957834005355835, + "ce_loss_7": 3.528597414493561, + "epoch": 0.796, + "grad_norm": 386.0, + "kl_loss_10": 95.0804401397705, + "kl_loss_2": 1137.388375854492, + "kl_loss_3": 792.2215118408203, + "kl_loss_7": 192.50171508789063, + "learning_rate": 0.00010116018621892236, + "loss": 537.4441, + "step": 7960 + }, + { + "ce_loss_10": 3.6988709568977356, + "ce_loss_13": 3.6362175583839416, + "ce_loss_2": 4.151265692710877, + "ce_loss_3": 3.9912821412086488, + "ce_loss_7": 3.742702007293701, + "epoch": 0.797, + "grad_norm": 444.0, + "kl_loss_10": 99.6129222869873, + "kl_loss_2": 1100.607211303711, + "kl_loss_3": 767.8290985107421, + "kl_loss_7": 194.2897491455078, + "learning_rate": 0.00010020530901575753, + "loss": 526.4385, + "step": 7970 + }, + { + "ce_loss_10": 3.727276122570038, + "ce_loss_13": 3.664809966087341, + "ce_loss_2": 4.17646723985672, + "ce_loss_3": 4.011640095710755, + "ce_loss_7": 3.7683190941810607, + "epoch": 0.798, + "grad_norm": 334.0, + "kl_loss_10": 98.68130950927734, + "kl_loss_2": 1084.4167602539062, + "kl_loss_3": 747.0828460693359, + "kl_loss_7": 190.09516677856445, + "learning_rate": 9.925445774069231e-05, + "loss": 521.7054, + "step": 7980 + }, + { + "ce_loss_10": 3.677051067352295, + "ce_loss_13": 3.6162899494171143, + "ce_loss_2": 4.132367658615112, + "ce_loss_3": 3.9699331760406493, + "ce_loss_7": 3.723151159286499, + "epoch": 0.799, + "grad_norm": 340.0, + "kl_loss_10": 97.4996379852295, + "kl_loss_2": 1074.8818054199219, + "kl_loss_3": 740.7804992675781, + "kl_loss_7": 187.78277282714845, + "learning_rate": 9.830764196878872e-05, + "loss": 517.902, + "step": 7990 + }, + { + "ce_loss_10": 3.6140867948532103, + "ce_loss_13": 3.556562864780426, + "ce_loss_2": 4.0635038137435915, + "ce_loss_3": 3.902656090259552, + "ce_loss_7": 3.6608413100242614, + "epoch": 0.8, + "grad_norm": 410.0, + "kl_loss_10": 94.1772445678711, + "kl_loss_2": 1099.7673645019531, + "kl_loss_3": 761.414794921875, + "kl_loss_7": 186.34807205200195, + "learning_rate": 9.736487123447069e-05, + "loss": 531.4563, + "step": 8000 + }, + { + "ce_loss_10": 3.559322512149811, + "ce_loss_13": 3.49820739030838, + "ce_loss_2": 4.036343896389008, + "ce_loss_3": 3.8618996500968934, + "ce_loss_7": 3.6017415881156922, + "epoch": 0.801, + "grad_norm": 424.0, + "kl_loss_10": 96.55318107604981, + "kl_loss_2": 1136.456121826172, + "kl_loss_3": 771.9989410400391, + "kl_loss_7": 188.50249938964845, + "learning_rate": 9.642615503142926e-05, + "loss": 541.6381, + "step": 8010 + }, + { + "ce_loss_10": 3.630905735492706, + "ce_loss_13": 3.5719484210014345, + "ce_loss_2": 4.097460567951202, + "ce_loss_3": 3.9188284277915955, + "ce_loss_7": 3.673666751384735, + "epoch": 0.802, + "grad_norm": 370.0, + "kl_loss_10": 94.45314712524414, + "kl_loss_2": 1090.8831848144532, + "kl_loss_3": 738.8009979248047, + "kl_loss_7": 184.0514343261719, + "learning_rate": 9.549150281252633e-05, + "loss": 524.0769, + "step": 8020 + }, + { + "ce_loss_10": 3.658740258216858, + "ce_loss_13": 3.598051357269287, + "ce_loss_2": 4.112537753582001, + "ce_loss_3": 3.9440460920333864, + "ce_loss_7": 3.701529622077942, + "epoch": 0.803, + "grad_norm": 354.0, + "kl_loss_10": 97.62285194396972, + "kl_loss_2": 1076.1221923828125, + "kl_loss_3": 742.6418304443359, + "kl_loss_7": 187.46692276000977, + "learning_rate": 9.4560923989699e-05, + "loss": 531.6947, + "step": 8030 + }, + { + "ce_loss_10": 3.6491722106933593, + "ce_loss_13": 3.5902853846549987, + "ce_loss_2": 4.109341251850128, + "ce_loss_3": 3.942945408821106, + "ce_loss_7": 3.696093666553497, + "epoch": 0.804, + "grad_norm": 382.0, + "kl_loss_10": 96.87751007080078, + "kl_loss_2": 1089.1260498046875, + "kl_loss_3": 751.9404052734375, + "kl_loss_7": 188.3861946105957, + "learning_rate": 9.363442793386607e-05, + "loss": 538.5806, + "step": 8040 + }, + { + "ce_loss_10": 3.6259461641311646, + "ce_loss_13": 3.5652650475502012, + "ce_loss_2": 4.09434745311737, + "ce_loss_3": 3.9288868069648744, + "ce_loss_7": 3.670744836330414, + "epoch": 0.805, + "grad_norm": 436.0, + "kl_loss_10": 96.23310775756836, + "kl_loss_2": 1102.4481658935547, + "kl_loss_3": 766.5739196777344, + "kl_loss_7": 189.9322036743164, + "learning_rate": 9.271202397483213e-05, + "loss": 525.3384, + "step": 8050 + }, + { + "ce_loss_10": 3.64525443315506, + "ce_loss_13": 3.587091565132141, + "ce_loss_2": 4.088842356204987, + "ce_loss_3": 3.926717495918274, + "ce_loss_7": 3.6877028584480285, + "epoch": 0.806, + "grad_norm": 462.0, + "kl_loss_10": 95.10493888854981, + "kl_loss_2": 1064.438558959961, + "kl_loss_3": 734.5970611572266, + "kl_loss_7": 184.7579719543457, + "learning_rate": 9.179372140119524e-05, + "loss": 530.6901, + "step": 8060 + }, + { + "ce_loss_10": 3.59020277261734, + "ce_loss_13": 3.531452512741089, + "ce_loss_2": 4.036340653896332, + "ce_loss_3": 3.8760047912597657, + "ce_loss_7": 3.6337902188301086, + "epoch": 0.807, + "grad_norm": 432.0, + "kl_loss_10": 94.00482330322265, + "kl_loss_2": 1074.4489135742188, + "kl_loss_3": 739.4833740234375, + "kl_loss_7": 184.7809310913086, + "learning_rate": 9.087952946025175e-05, + "loss": 531.5049, + "step": 8070 + }, + { + "ce_loss_10": 3.7053560853004455, + "ce_loss_13": 3.6452667355537414, + "ce_loss_2": 4.136937665939331, + "ce_loss_3": 3.9754079580307007, + "ce_loss_7": 3.7457935094833372, + "epoch": 0.808, + "grad_norm": 368.0, + "kl_loss_10": 96.12910385131836, + "kl_loss_2": 1048.5191436767577, + "kl_loss_3": 719.7487762451171, + "kl_loss_7": 183.48829498291016, + "learning_rate": 8.996945735790446e-05, + "loss": 523.2327, + "step": 8080 + }, + { + "ce_loss_10": 3.602836012840271, + "ce_loss_13": 3.542934799194336, + "ce_loss_2": 4.055256414413452, + "ce_loss_3": 3.8926199197769167, + "ce_loss_7": 3.6462236762046816, + "epoch": 0.809, + "grad_norm": 414.0, + "kl_loss_10": 95.67857933044434, + "kl_loss_2": 1093.489208984375, + "kl_loss_3": 759.0634765625, + "kl_loss_7": 186.64484634399415, + "learning_rate": 8.906351425856951e-05, + "loss": 536.3948, + "step": 8090 + }, + { + "ce_loss_10": 3.586146354675293, + "ce_loss_13": 3.5270805954933167, + "ce_loss_2": 4.053403818607331, + "ce_loss_3": 3.883652901649475, + "ce_loss_7": 3.6302590370178223, + "epoch": 0.81, + "grad_norm": 328.0, + "kl_loss_10": 96.12913436889649, + "kl_loss_2": 1108.7147094726563, + "kl_loss_3": 762.2885803222656, + "kl_loss_7": 187.99051055908203, + "learning_rate": 8.816170928508365e-05, + "loss": 536.7299, + "step": 8100 + }, + { + "ce_loss_10": 3.5469899415969848, + "ce_loss_13": 3.487591028213501, + "ce_loss_2": 4.024684643745422, + "ce_loss_3": 3.853050243854523, + "ce_loss_7": 3.5918329834938048, + "epoch": 0.811, + "grad_norm": 424.0, + "kl_loss_10": 95.16791305541992, + "kl_loss_2": 1131.8392974853516, + "kl_loss_3": 782.3692016601562, + "kl_loss_7": 188.51590728759766, + "learning_rate": 8.7264051518613e-05, + "loss": 538.6139, + "step": 8110 + }, + { + "ce_loss_10": 3.639654815196991, + "ce_loss_13": 3.583385097980499, + "ce_loss_2": 4.081218779087067, + "ce_loss_3": 3.9191540598869326, + "ce_loss_7": 3.680349314212799, + "epoch": 0.812, + "grad_norm": 358.0, + "kl_loss_10": 93.30685958862304, + "kl_loss_2": 1057.4586822509766, + "kl_loss_3": 735.9759002685547, + "kl_loss_7": 182.97039413452148, + "learning_rate": 8.637054999856148e-05, + "loss": 526.1802, + "step": 8120 + }, + { + "ce_loss_10": 3.6243308544158936, + "ce_loss_13": 3.5630579233169555, + "ce_loss_2": 4.083577620983124, + "ce_loss_3": 3.9160293340682983, + "ce_loss_7": 3.6718581318855286, + "epoch": 0.813, + "grad_norm": 328.0, + "kl_loss_10": 95.2622299194336, + "kl_loss_2": 1086.6508239746095, + "kl_loss_3": 748.3265411376954, + "kl_loss_7": 187.44526748657228, + "learning_rate": 8.548121372247918e-05, + "loss": 536.2552, + "step": 8130 + }, + { + "ce_loss_10": 3.699293088912964, + "ce_loss_13": 3.641393613815308, + "ce_loss_2": 4.146343159675598, + "ce_loss_3": 3.982176637649536, + "ce_loss_7": 3.7424126982688906, + "epoch": 0.814, + "grad_norm": 420.0, + "kl_loss_10": 97.64918098449706, + "kl_loss_2": 1075.0233795166016, + "kl_loss_3": 745.3918151855469, + "kl_loss_7": 187.1306022644043, + "learning_rate": 8.459605164597267e-05, + "loss": 527.4509, + "step": 8140 + }, + { + "ce_loss_10": 3.5794180989265443, + "ce_loss_13": 3.521663022041321, + "ce_loss_2": 4.035482859611511, + "ce_loss_3": 3.869397759437561, + "ce_loss_7": 3.6230968952178957, + "epoch": 0.815, + "grad_norm": 322.0, + "kl_loss_10": 93.84382820129395, + "kl_loss_2": 1085.6336395263672, + "kl_loss_3": 749.5215454101562, + "kl_loss_7": 184.3967170715332, + "learning_rate": 8.371507268261436e-05, + "loss": 530.9717, + "step": 8150 + }, + { + "ce_loss_10": 3.6623859286308287, + "ce_loss_13": 3.603581893444061, + "ce_loss_2": 4.1160969018936155, + "ce_loss_3": 3.9481249690055846, + "ce_loss_7": 3.7034823894500732, + "epoch": 0.816, + "grad_norm": 410.0, + "kl_loss_10": 96.0962978363037, + "kl_loss_2": 1085.8551330566406, + "kl_loss_3": 744.0009185791016, + "kl_loss_7": 187.44638290405274, + "learning_rate": 8.283828570385238e-05, + "loss": 515.8468, + "step": 8160 + }, + { + "ce_loss_10": 3.6646664142608643, + "ce_loss_13": 3.607030153274536, + "ce_loss_2": 4.124508082866669, + "ce_loss_3": 3.955708396434784, + "ce_loss_7": 3.708679938316345, + "epoch": 0.817, + "grad_norm": 286.0, + "kl_loss_10": 95.48198356628419, + "kl_loss_2": 1068.3529357910156, + "kl_loss_3": 737.6435119628907, + "kl_loss_7": 186.3275260925293, + "learning_rate": 8.196569953892202e-05, + "loss": 525.6566, + "step": 8170 + }, + { + "ce_loss_10": 3.5752533435821534, + "ce_loss_13": 3.5151426196098328, + "ce_loss_2": 4.039277529716491, + "ce_loss_3": 3.8700820326805117, + "ce_loss_7": 3.6193170666694643, + "epoch": 0.818, + "grad_norm": 392.0, + "kl_loss_10": 95.23657569885253, + "kl_loss_2": 1087.7711944580078, + "kl_loss_3": 748.5086303710938, + "kl_loss_7": 185.79026489257814, + "learning_rate": 8.109732297475635e-05, + "loss": 529.4896, + "step": 8180 + }, + { + "ce_loss_10": 3.5442301869392394, + "ce_loss_13": 3.48368262052536, + "ce_loss_2": 4.041348910331726, + "ce_loss_3": 3.8620414972305297, + "ce_loss_7": 3.593292236328125, + "epoch": 0.819, + "grad_norm": 508.0, + "kl_loss_10": 94.79218406677246, + "kl_loss_2": 1140.4125610351562, + "kl_loss_3": 788.5256622314453, + "kl_loss_7": 192.41318969726564, + "learning_rate": 8.023316475589754e-05, + "loss": 543.2035, + "step": 8190 + }, + { + "ce_loss_10": 3.5104150652885435, + "ce_loss_13": 3.44714834690094, + "ce_loss_2": 4.0140674948692325, + "ce_loss_3": 3.8308369159698485, + "ce_loss_7": 3.5589245796203612, + "epoch": 0.82, + "grad_norm": 532.0, + "kl_loss_10": 97.92351608276367, + "kl_loss_2": 1158.8160186767577, + "kl_loss_3": 797.9960662841797, + "kl_loss_7": 195.1374740600586, + "learning_rate": 7.937323358440934e-05, + "loss": 549.9746, + "step": 8200 + }, + { + "ce_loss_10": 3.637300205230713, + "ce_loss_13": 3.5789112567901613, + "ce_loss_2": 4.087347877025604, + "ce_loss_3": 3.923117625713348, + "ce_loss_7": 3.679899263381958, + "epoch": 0.821, + "grad_norm": 404.0, + "kl_loss_10": 95.01284561157226, + "kl_loss_2": 1074.5766845703124, + "kl_loss_3": 743.3249450683594, + "kl_loss_7": 184.74501190185546, + "learning_rate": 7.851753811978923e-05, + "loss": 530.0879, + "step": 8210 + }, + { + "ce_loss_10": 3.661479341983795, + "ce_loss_13": 3.6010610818862916, + "ce_loss_2": 4.123578751087189, + "ce_loss_3": 3.9517632484436036, + "ce_loss_7": 3.7047110080718992, + "epoch": 0.822, + "grad_norm": 358.0, + "kl_loss_10": 96.71367454528809, + "kl_loss_2": 1091.6120025634766, + "kl_loss_3": 744.3147399902343, + "kl_loss_7": 186.59919815063478, + "learning_rate": 7.766608697888095e-05, + "loss": 527.9285, + "step": 8220 + }, + { + "ce_loss_10": 3.672685134410858, + "ce_loss_13": 3.6110698223114013, + "ce_loss_2": 4.123067581653595, + "ce_loss_3": 3.9549397349357607, + "ce_loss_7": 3.7160248041152952, + "epoch": 0.823, + "grad_norm": 428.0, + "kl_loss_10": 99.5799617767334, + "kl_loss_2": 1090.7132843017578, + "kl_loss_3": 754.5721008300782, + "kl_loss_7": 190.94887008666993, + "learning_rate": 7.681888873578785e-05, + "loss": 534.6821, + "step": 8230 + }, + { + "ce_loss_10": 3.599495697021484, + "ce_loss_13": 3.5377328515052797, + "ce_loss_2": 4.075003004074096, + "ce_loss_3": 3.9027179360389708, + "ce_loss_7": 3.6464317083358764, + "epoch": 0.824, + "grad_norm": 454.0, + "kl_loss_10": 96.61878395080566, + "kl_loss_2": 1113.7870971679688, + "kl_loss_3": 766.2083129882812, + "kl_loss_7": 191.40456848144532, + "learning_rate": 7.597595192178702e-05, + "loss": 531.8756, + "step": 8240 + }, + { + "ce_loss_10": 3.5937318563461305, + "ce_loss_13": 3.5349967002868654, + "ce_loss_2": 4.069689559936523, + "ce_loss_3": 3.896022927761078, + "ce_loss_7": 3.640897309780121, + "epoch": 0.825, + "grad_norm": 390.0, + "kl_loss_10": 96.6520393371582, + "kl_loss_2": 1123.3416778564454, + "kl_loss_3": 772.9763427734375, + "kl_loss_7": 191.78026428222657, + "learning_rate": 7.513728502524286e-05, + "loss": 540.9631, + "step": 8250 + }, + { + "ce_loss_10": 3.600663185119629, + "ce_loss_13": 3.543354606628418, + "ce_loss_2": 4.056336843967438, + "ce_loss_3": 3.886526358127594, + "ce_loss_7": 3.644607651233673, + "epoch": 0.826, + "grad_norm": 520.0, + "kl_loss_10": 94.51933555603027, + "kl_loss_2": 1071.4390838623046, + "kl_loss_3": 737.6031066894532, + "kl_loss_7": 182.459228515625, + "learning_rate": 7.430289649152156e-05, + "loss": 532.1943, + "step": 8260 + }, + { + "ce_loss_10": 3.4964008927345276, + "ce_loss_13": 3.4386809706687926, + "ce_loss_2": 3.979319155216217, + "ce_loss_3": 3.806533432006836, + "ce_loss_7": 3.5424695372581483, + "epoch": 0.827, + "grad_norm": 438.0, + "kl_loss_10": 92.59819717407227, + "kl_loss_2": 1138.140805053711, + "kl_loss_3": 787.0873046875, + "kl_loss_7": 188.89702301025392, + "learning_rate": 7.347279472290646e-05, + "loss": 536.0913, + "step": 8270 + }, + { + "ce_loss_10": 3.641860234737396, + "ce_loss_13": 3.5819854736328125, + "ce_loss_2": 4.100788974761963, + "ce_loss_3": 3.9369661927223207, + "ce_loss_7": 3.6862512946128847, + "epoch": 0.828, + "grad_norm": 404.0, + "kl_loss_10": 96.73132438659668, + "kl_loss_2": 1085.176287841797, + "kl_loss_3": 756.2387023925781, + "kl_loss_7": 187.64101333618163, + "learning_rate": 7.264698807851328e-05, + "loss": 532.8096, + "step": 8280 + }, + { + "ce_loss_10": 3.604352295398712, + "ce_loss_13": 3.549103558063507, + "ce_loss_2": 4.042815041542053, + "ce_loss_3": 3.880488729476929, + "ce_loss_7": 3.64413400888443, + "epoch": 0.829, + "grad_norm": 520.0, + "kl_loss_10": 92.21053123474121, + "kl_loss_2": 1057.8690124511718, + "kl_loss_3": 728.9253723144532, + "kl_loss_7": 181.22113647460938, + "learning_rate": 7.182548487420554e-05, + "loss": 524.6575, + "step": 8290 + }, + { + "ce_loss_10": 3.6577786207199097, + "ce_loss_13": 3.597848916053772, + "ce_loss_2": 4.107566392421722, + "ce_loss_3": 3.947295570373535, + "ce_loss_7": 3.703710603713989, + "epoch": 0.83, + "grad_norm": 286.0, + "kl_loss_10": 96.30242042541504, + "kl_loss_2": 1087.0319366455078, + "kl_loss_3": 748.0092193603516, + "kl_loss_7": 187.4295867919922, + "learning_rate": 7.100829338251146e-05, + "loss": 527.7667, + "step": 8300 + }, + { + "ce_loss_10": 3.5980669021606446, + "ce_loss_13": 3.5371885776519774, + "ce_loss_2": 4.070665979385376, + "ce_loss_3": 3.8979653000831602, + "ce_loss_7": 3.6431610107421877, + "epoch": 0.831, + "grad_norm": 394.0, + "kl_loss_10": 95.44490776062011, + "kl_loss_2": 1113.3842803955079, + "kl_loss_3": 769.6158874511718, + "kl_loss_7": 189.99929428100586, + "learning_rate": 7.019542183254046e-05, + "loss": 531.0445, + "step": 8310 + }, + { + "ce_loss_10": 3.6354474306106566, + "ce_loss_13": 3.57179137468338, + "ce_loss_2": 4.082340836524963, + "ce_loss_3": 3.9207422971725463, + "ce_loss_7": 3.6777117967605593, + "epoch": 0.832, + "grad_norm": 474.0, + "kl_loss_10": 100.207564163208, + "kl_loss_2": 1084.2285125732421, + "kl_loss_3": 748.0254974365234, + "kl_loss_7": 190.82402954101562, + "learning_rate": 6.938687840989971e-05, + "loss": 528.8804, + "step": 8320 + }, + { + "ce_loss_10": 3.5696911811828613, + "ce_loss_13": 3.508439671993256, + "ce_loss_2": 4.0291890621185305, + "ce_loss_3": 3.8622559309005737, + "ce_loss_7": 3.614106321334839, + "epoch": 0.833, + "grad_norm": 600.0, + "kl_loss_10": 96.55842895507813, + "kl_loss_2": 1082.4974243164063, + "kl_loss_3": 748.5556121826172, + "kl_loss_7": 188.75322189331055, + "learning_rate": 6.858267125661271e-05, + "loss": 531.4916, + "step": 8330 + }, + { + "ce_loss_10": 3.6338680744171143, + "ce_loss_13": 3.575134778022766, + "ce_loss_2": 4.0971689343452455, + "ce_loss_3": 3.930681896209717, + "ce_loss_7": 3.6769707798957825, + "epoch": 0.834, + "grad_norm": 418.0, + "kl_loss_10": 93.3882438659668, + "kl_loss_2": 1085.4937896728516, + "kl_loss_3": 746.0253967285156, + "kl_loss_7": 184.32117233276367, + "learning_rate": 6.778280847103668e-05, + "loss": 538.0241, + "step": 8340 + }, + { + "ce_loss_10": 3.6449947714805604, + "ce_loss_13": 3.581918466091156, + "ce_loss_2": 4.1008768558502195, + "ce_loss_3": 3.937298035621643, + "ce_loss_7": 3.686388850212097, + "epoch": 0.835, + "grad_norm": 290.0, + "kl_loss_10": 98.43625144958496, + "kl_loss_2": 1102.1855102539062, + "kl_loss_3": 759.7929138183594, + "kl_loss_7": 191.51789016723632, + "learning_rate": 6.698729810778065e-05, + "loss": 532.2951, + "step": 8350 + }, + { + "ce_loss_10": 3.5478424787521363, + "ce_loss_13": 3.489585447311401, + "ce_loss_2": 4.0140421986579895, + "ce_loss_3": 3.8517470717430116, + "ce_loss_7": 3.592922496795654, + "epoch": 0.836, + "grad_norm": 490.0, + "kl_loss_10": 91.77609100341797, + "kl_loss_2": 1092.1636932373046, + "kl_loss_3": 756.2904968261719, + "kl_loss_7": 183.14143447875978, + "learning_rate": 6.619614817762538e-05, + "loss": 531.3562, + "step": 8360 + }, + { + "ce_loss_10": 3.509856128692627, + "ce_loss_13": 3.4520259737968444, + "ce_loss_2": 4.005417215824127, + "ce_loss_3": 3.8302616715431212, + "ce_loss_7": 3.56083265542984, + "epoch": 0.837, + "grad_norm": 356.0, + "kl_loss_10": 91.30384330749511, + "kl_loss_2": 1146.0878509521485, + "kl_loss_3": 788.8349487304688, + "kl_loss_7": 189.73513488769532, + "learning_rate": 6.540936664744196e-05, + "loss": 543.0581, + "step": 8370 + }, + { + "ce_loss_10": 3.6644623279571533, + "ce_loss_13": 3.6040658593177795, + "ce_loss_2": 4.12789534330368, + "ce_loss_3": 3.959988057613373, + "ce_loss_7": 3.7062342405319213, + "epoch": 0.838, + "grad_norm": 366.0, + "kl_loss_10": 97.38574295043945, + "kl_loss_2": 1085.7984375, + "kl_loss_3": 749.598519897461, + "kl_loss_7": 188.30213012695313, + "learning_rate": 6.462696144011149e-05, + "loss": 525.3536, + "step": 8380 + }, + { + "ce_loss_10": 3.6138532400131225, + "ce_loss_13": 3.5537376523017885, + "ce_loss_2": 4.071477258205414, + "ce_loss_3": 3.910947525501251, + "ce_loss_7": 3.658327579498291, + "epoch": 0.839, + "grad_norm": 556.0, + "kl_loss_10": 98.20170745849609, + "kl_loss_2": 1090.382958984375, + "kl_loss_3": 762.5471374511719, + "kl_loss_7": 191.74814834594727, + "learning_rate": 6.384894043444567e-05, + "loss": 528.8093, + "step": 8390 + }, + { + "ce_loss_10": 3.644765245914459, + "ce_loss_13": 3.585478734970093, + "ce_loss_2": 4.109920060634613, + "ce_loss_3": 3.9416786313056944, + "ce_loss_7": 3.689965844154358, + "epoch": 0.84, + "grad_norm": 412.0, + "kl_loss_10": 97.19089965820312, + "kl_loss_2": 1101.7069030761718, + "kl_loss_3": 757.5290496826171, + "kl_loss_7": 188.98860778808594, + "learning_rate": 6.307531146510753e-05, + "loss": 529.2157, + "step": 8400 + }, + { + "ce_loss_10": 3.621027076244354, + "ce_loss_13": 3.5618404507637025, + "ce_loss_2": 4.0682983756065365, + "ce_loss_3": 3.90874502658844, + "ce_loss_7": 3.6661928296089172, + "epoch": 0.841, + "grad_norm": 384.0, + "kl_loss_10": 95.90530738830566, + "kl_loss_2": 1067.8680267333984, + "kl_loss_3": 738.8968048095703, + "kl_loss_7": 187.38672485351563, + "learning_rate": 6.230608232253226e-05, + "loss": 522.0211, + "step": 8410 + }, + { + "ce_loss_10": 3.5725093245506288, + "ce_loss_13": 3.5133079648017884, + "ce_loss_2": 4.052767491340637, + "ce_loss_3": 3.8865469098091125, + "ce_loss_7": 3.617572808265686, + "epoch": 0.842, + "grad_norm": 420.0, + "kl_loss_10": 93.54998550415038, + "kl_loss_2": 1118.0941436767578, + "kl_loss_3": 779.2841003417968, + "kl_loss_7": 188.06975250244142, + "learning_rate": 6.154126075284855e-05, + "loss": 530.6581, + "step": 8420 + }, + { + "ce_loss_10": 3.6709149718284606, + "ce_loss_13": 3.610610318183899, + "ce_loss_2": 4.11589070558548, + "ce_loss_3": 3.958199071884155, + "ce_loss_7": 3.7119770526885985, + "epoch": 0.843, + "grad_norm": 360.0, + "kl_loss_10": 93.72929344177246, + "kl_loss_2": 1052.0708984375, + "kl_loss_3": 727.1819213867187, + "kl_loss_7": 182.0021545410156, + "learning_rate": 6.078085445780129e-05, + "loss": 515.5865, + "step": 8430 + }, + { + "ce_loss_10": 3.678613018989563, + "ce_loss_13": 3.6185575127601624, + "ce_loss_2": 4.138859879970551, + "ce_loss_3": 3.970304036140442, + "ce_loss_7": 3.7233882188796996, + "epoch": 0.844, + "grad_norm": 708.0, + "kl_loss_10": 96.56619453430176, + "kl_loss_2": 1092.8436309814454, + "kl_loss_3": 748.7821746826172, + "kl_loss_7": 187.36514282226562, + "learning_rate": 6.002487109467347e-05, + "loss": 524.9962, + "step": 8440 + }, + { + "ce_loss_10": 3.681882548332214, + "ce_loss_13": 3.623554539680481, + "ce_loss_2": 4.131060492992401, + "ce_loss_3": 3.969043660163879, + "ce_loss_7": 3.7261468291282656, + "epoch": 0.845, + "grad_norm": 498.0, + "kl_loss_10": 95.19795646667481, + "kl_loss_2": 1083.3428985595704, + "kl_loss_3": 748.7116729736329, + "kl_loss_7": 188.84120330810546, + "learning_rate": 5.927331827620902e-05, + "loss": 524.2234, + "step": 8450 + }, + { + "ce_loss_10": 3.671555197238922, + "ce_loss_13": 3.6144081234931944, + "ce_loss_2": 4.109152019023895, + "ce_loss_3": 3.957107722759247, + "ce_loss_7": 3.7151288032531737, + "epoch": 0.846, + "grad_norm": 384.0, + "kl_loss_10": 92.54770011901856, + "kl_loss_2": 1047.1174011230469, + "kl_loss_3": 728.4162536621094, + "kl_loss_7": 183.04834442138673, + "learning_rate": 5.852620357053651e-05, + "loss": 522.9391, + "step": 8460 + }, + { + "ce_loss_10": 3.7129202485084534, + "ce_loss_13": 3.65321398973465, + "ce_loss_2": 4.155979669094085, + "ce_loss_3": 3.9961599469184876, + "ce_loss_7": 3.7558295488357545, + "epoch": 0.847, + "grad_norm": 432.0, + "kl_loss_10": 94.81909484863282, + "kl_loss_2": 1067.3740447998048, + "kl_loss_3": 736.2771881103515, + "kl_loss_7": 184.3846176147461, + "learning_rate": 5.778353450109286e-05, + "loss": 523.3945, + "step": 8470 + }, + { + "ce_loss_10": 3.7526662349700928, + "ce_loss_13": 3.6899970173835754, + "ce_loss_2": 4.2024567246437075, + "ce_loss_3": 4.037352812290192, + "ce_loss_7": 3.7961275696754457, + "epoch": 0.848, + "grad_norm": 420.0, + "kl_loss_10": 98.8898868560791, + "kl_loss_2": 1083.7428894042969, + "kl_loss_3": 747.87919921875, + "kl_loss_7": 190.12581558227538, + "learning_rate": 5.7045318546547206e-05, + "loss": 528.6064, + "step": 8480 + }, + { + "ce_loss_10": 3.6435152888298035, + "ce_loss_13": 3.5820479154586793, + "ce_loss_2": 4.10130136013031, + "ce_loss_3": 3.9336646437644958, + "ce_loss_7": 3.6865146279335024, + "epoch": 0.849, + "grad_norm": 476.0, + "kl_loss_10": 97.09412269592285, + "kl_loss_2": 1097.005093383789, + "kl_loss_3": 757.3569030761719, + "kl_loss_7": 187.13169021606444, + "learning_rate": 5.631156314072605e-05, + "loss": 526.7981, + "step": 8490 + }, + { + "ce_loss_10": 3.6548070907592773, + "ce_loss_13": 3.5959606409072875, + "ce_loss_2": 4.090519487857819, + "ce_loss_3": 3.9302281975746154, + "ce_loss_7": 3.6990505933761595, + "epoch": 0.85, + "grad_norm": 348.0, + "kl_loss_10": 94.60167617797852, + "kl_loss_2": 1058.567938232422, + "kl_loss_3": 726.6986267089844, + "kl_loss_7": 182.6941146850586, + "learning_rate": 5.5582275672538315e-05, + "loss": 518.2773, + "step": 8500 + }, + { + "ce_loss_10": 3.5718761324882506, + "ce_loss_13": 3.510132133960724, + "ce_loss_2": 4.058491265773773, + "ce_loss_3": 3.8868750095367433, + "ce_loss_7": 3.62018061876297, + "epoch": 0.851, + "grad_norm": 356.0, + "kl_loss_10": 98.47408905029297, + "kl_loss_2": 1129.9293365478516, + "kl_loss_3": 782.1435455322265, + "kl_loss_7": 191.806551361084, + "learning_rate": 5.4857463485900484e-05, + "loss": 540.5649, + "step": 8510 + }, + { + "ce_loss_10": 3.626720643043518, + "ce_loss_13": 3.5688146710395814, + "ce_loss_2": 4.081609988212586, + "ce_loss_3": 3.9117035031318665, + "ce_loss_7": 3.673699951171875, + "epoch": 0.852, + "grad_norm": 392.0, + "kl_loss_10": 94.4161319732666, + "kl_loss_2": 1082.976022338867, + "kl_loss_3": 743.9283477783204, + "kl_loss_7": 185.5842658996582, + "learning_rate": 5.413713387966329e-05, + "loss": 525.7675, + "step": 8520 + }, + { + "ce_loss_10": 3.6495197653770446, + "ce_loss_13": 3.5870252728462217, + "ce_loss_2": 4.1089702367782595, + "ce_loss_3": 3.943737292289734, + "ce_loss_7": 3.6925705909729003, + "epoch": 0.853, + "grad_norm": 560.0, + "kl_loss_10": 99.9091007232666, + "kl_loss_2": 1091.3887969970704, + "kl_loss_3": 754.8269989013672, + "kl_loss_7": 190.51073608398437, + "learning_rate": 5.34212941075381e-05, + "loss": 533.712, + "step": 8530 + }, + { + "ce_loss_10": 3.6638750314712523, + "ce_loss_13": 3.603909599781036, + "ce_loss_2": 4.105106854438782, + "ce_loss_3": 3.939826285839081, + "ce_loss_7": 3.703915464878082, + "epoch": 0.854, + "grad_norm": 324.0, + "kl_loss_10": 94.93586730957031, + "kl_loss_2": 1060.2898712158203, + "kl_loss_3": 729.1602386474609, + "kl_loss_7": 183.2039321899414, + "learning_rate": 5.270995137802315e-05, + "loss": 520.0254, + "step": 8540 + }, + { + "ce_loss_10": 3.586125075817108, + "ce_loss_13": 3.530829107761383, + "ce_loss_2": 4.0409599304199215, + "ce_loss_3": 3.876398241519928, + "ce_loss_7": 3.6288790106773376, + "epoch": 0.855, + "grad_norm": 390.0, + "kl_loss_10": 92.31447868347168, + "kl_loss_2": 1091.2599792480469, + "kl_loss_3": 750.2804168701172, + "kl_loss_7": 184.4141700744629, + "learning_rate": 5.2003112854332125e-05, + "loss": 530.1402, + "step": 8550 + }, + { + "ce_loss_10": 3.592084896564484, + "ce_loss_13": 3.5318885922431944, + "ce_loss_2": 4.045030009746552, + "ce_loss_3": 3.8797095656394958, + "ce_loss_7": 3.6342476487159727, + "epoch": 0.856, + "grad_norm": 410.0, + "kl_loss_10": 95.16406364440918, + "kl_loss_2": 1083.518502807617, + "kl_loss_3": 746.9155914306641, + "kl_loss_7": 184.60284118652345, + "learning_rate": 5.130078565432089e-05, + "loss": 519.0631, + "step": 8560 + }, + { + "ce_loss_10": 3.6698386430740357, + "ce_loss_13": 3.611102557182312, + "ce_loss_2": 4.1041951179504395, + "ce_loss_3": 3.9457595467567446, + "ce_loss_7": 3.714687442779541, + "epoch": 0.857, + "grad_norm": 330.0, + "kl_loss_10": 94.41157264709473, + "kl_loss_2": 1066.6546508789063, + "kl_loss_3": 732.30849609375, + "kl_loss_7": 183.59521484375, + "learning_rate": 5.060297685041659e-05, + "loss": 515.5307, + "step": 8570 + }, + { + "ce_loss_10": 3.594843864440918, + "ce_loss_13": 3.535090386867523, + "ce_loss_2": 4.058831119537354, + "ce_loss_3": 3.8907560467720033, + "ce_loss_7": 3.6390093684196474, + "epoch": 0.858, + "grad_norm": 396.0, + "kl_loss_10": 97.14489707946777, + "kl_loss_2": 1100.07861328125, + "kl_loss_3": 757.8477020263672, + "kl_loss_7": 190.17505111694337, + "learning_rate": 4.99096934695461e-05, + "loss": 537.0569, + "step": 8580 + }, + { + "ce_loss_10": 3.655477023124695, + "ce_loss_13": 3.592752683162689, + "ce_loss_2": 4.114116084575653, + "ce_loss_3": 3.950313460826874, + "ce_loss_7": 3.6980414509773256, + "epoch": 0.859, + "grad_norm": 370.0, + "kl_loss_10": 96.66123657226562, + "kl_loss_2": 1076.5634460449219, + "kl_loss_3": 745.2082977294922, + "kl_loss_7": 186.95159301757812, + "learning_rate": 4.922094249306558e-05, + "loss": 520.1718, + "step": 8590 + }, + { + "ce_loss_10": 3.677726352214813, + "ce_loss_13": 3.6172829270362854, + "ce_loss_2": 4.126979196071625, + "ce_loss_3": 3.9645047903060915, + "ce_loss_7": 3.7215185284614565, + "epoch": 0.86, + "grad_norm": 392.0, + "kl_loss_10": 96.89525718688965, + "kl_loss_2": 1065.1883819580078, + "kl_loss_3": 740.1956573486328, + "kl_loss_7": 187.83882064819335, + "learning_rate": 4.853673085668947e-05, + "loss": 516.6985, + "step": 8600 + }, + { + "ce_loss_10": 3.707137334346771, + "ce_loss_13": 3.6448033452033997, + "ce_loss_2": 4.162192296981812, + "ce_loss_3": 3.993678319454193, + "ce_loss_7": 3.7496466279029845, + "epoch": 0.861, + "grad_norm": 370.0, + "kl_loss_10": 98.02176780700684, + "kl_loss_2": 1078.1511993408203, + "kl_loss_3": 739.8441162109375, + "kl_loss_7": 186.5592399597168, + "learning_rate": 4.78570654504214e-05, + "loss": 529.6101, + "step": 8610 + }, + { + "ce_loss_10": 3.6458049774169923, + "ce_loss_13": 3.5854872465133667, + "ce_loss_2": 4.110537803173065, + "ce_loss_3": 3.938798224925995, + "ce_loss_7": 3.6893723726272585, + "epoch": 0.862, + "grad_norm": 414.0, + "kl_loss_10": 94.25516128540039, + "kl_loss_2": 1104.6271423339845, + "kl_loss_3": 758.221337890625, + "kl_loss_7": 185.93933029174804, + "learning_rate": 4.7181953118484556e-05, + "loss": 535.9025, + "step": 8620 + }, + { + "ce_loss_10": 3.6774216413497927, + "ce_loss_13": 3.6180386185646056, + "ce_loss_2": 4.12672735452652, + "ce_loss_3": 3.962115204334259, + "ce_loss_7": 3.720357131958008, + "epoch": 0.863, + "grad_norm": 356.0, + "kl_loss_10": 95.34017066955566, + "kl_loss_2": 1068.0610900878905, + "kl_loss_3": 737.2169891357422, + "kl_loss_7": 185.36345138549805, + "learning_rate": 4.651140065925269e-05, + "loss": 530.0095, + "step": 8630 + }, + { + "ce_loss_10": 3.609228265285492, + "ce_loss_13": 3.5492658615112305, + "ce_loss_2": 4.060226953029632, + "ce_loss_3": 3.895670175552368, + "ce_loss_7": 3.6542355179786683, + "epoch": 0.864, + "grad_norm": 360.0, + "kl_loss_10": 96.95414390563965, + "kl_loss_2": 1087.1394622802734, + "kl_loss_3": 748.6742889404297, + "kl_loss_7": 188.45738372802734, + "learning_rate": 4.58454148251814e-05, + "loss": 535.7555, + "step": 8640 + }, + { + "ce_loss_10": 3.6290027260780335, + "ce_loss_13": 3.566804575920105, + "ce_loss_2": 4.098408913612365, + "ce_loss_3": 3.928418016433716, + "ce_loss_7": 3.673435080051422, + "epoch": 0.865, + "grad_norm": 352.0, + "kl_loss_10": 97.77750358581542, + "kl_loss_2": 1105.780810546875, + "kl_loss_3": 762.838412475586, + "kl_loss_7": 187.93626327514647, + "learning_rate": 4.518400232274078e-05, + "loss": 530.3719, + "step": 8650 + }, + { + "ce_loss_10": 3.641969549655914, + "ce_loss_13": 3.5785802602767944, + "ce_loss_2": 4.092971992492676, + "ce_loss_3": 3.932430160045624, + "ce_loss_7": 3.6855560064315798, + "epoch": 0.866, + "grad_norm": 320.0, + "kl_loss_10": 100.24152946472168, + "kl_loss_2": 1078.2671875, + "kl_loss_3": 746.3800415039062, + "kl_loss_7": 188.71098556518555, + "learning_rate": 4.452716981234745e-05, + "loss": 518.2875, + "step": 8660 + }, + { + "ce_loss_10": 3.619352424144745, + "ce_loss_13": 3.5634596943855286, + "ce_loss_2": 4.0641814827919, + "ce_loss_3": 3.9009178042411805, + "ce_loss_7": 3.6601861000061033, + "epoch": 0.867, + "grad_norm": 334.0, + "kl_loss_10": 92.77517395019531, + "kl_loss_2": 1069.4530029296875, + "kl_loss_3": 742.2820404052734, + "kl_loss_7": 183.70159912109375, + "learning_rate": 4.3874923908297335e-05, + "loss": 518.2648, + "step": 8670 + }, + { + "ce_loss_10": 3.6679449677467346, + "ce_loss_13": 3.605993056297302, + "ce_loss_2": 4.122425937652588, + "ce_loss_3": 3.955815386772156, + "ce_loss_7": 3.710171031951904, + "epoch": 0.868, + "grad_norm": 372.0, + "kl_loss_10": 98.51640739440919, + "kl_loss_2": 1091.1497436523437, + "kl_loss_3": 753.822543334961, + "kl_loss_7": 189.5640121459961, + "learning_rate": 4.322727117869951e-05, + "loss": 527.5021, + "step": 8680 + }, + { + "ce_loss_10": 3.678618919849396, + "ce_loss_13": 3.61755256652832, + "ce_loss_2": 4.1355063915252686, + "ce_loss_3": 3.9705930352211, + "ce_loss_7": 3.7248330235481264, + "epoch": 0.869, + "grad_norm": 450.0, + "kl_loss_10": 97.55352783203125, + "kl_loss_2": 1094.9813720703125, + "kl_loss_3": 756.694857788086, + "kl_loss_7": 188.98089218139648, + "learning_rate": 4.2584218145409916e-05, + "loss": 526.9053, + "step": 8690 + }, + { + "ce_loss_10": 3.724055600166321, + "ce_loss_13": 3.6645130157470702, + "ce_loss_2": 4.164188587665558, + "ce_loss_3": 4.006092858314514, + "ce_loss_7": 3.766603982448578, + "epoch": 0.87, + "grad_norm": 368.0, + "kl_loss_10": 97.79985809326172, + "kl_loss_2": 1054.3090911865233, + "kl_loss_3": 727.9592834472656, + "kl_loss_7": 186.32457809448243, + "learning_rate": 4.194577128396521e-05, + "loss": 516.3896, + "step": 8700 + }, + { + "ce_loss_10": 3.59331738948822, + "ce_loss_13": 3.5345770716667175, + "ce_loss_2": 4.046900963783264, + "ce_loss_3": 3.882276177406311, + "ce_loss_7": 3.636314344406128, + "epoch": 0.871, + "grad_norm": 348.0, + "kl_loss_10": 93.78037185668946, + "kl_loss_2": 1077.3778259277344, + "kl_loss_3": 740.198031616211, + "kl_loss_7": 183.74533233642578, + "learning_rate": 4.1311937023518264e-05, + "loss": 527.0207, + "step": 8710 + }, + { + "ce_loss_10": 3.6144633054733277, + "ce_loss_13": 3.5550664901733398, + "ce_loss_2": 4.064953672885895, + "ce_loss_3": 3.891311466693878, + "ce_loss_7": 3.653948724269867, + "epoch": 0.872, + "grad_norm": 338.0, + "kl_loss_10": 94.96177291870117, + "kl_loss_2": 1085.5813049316407, + "kl_loss_3": 729.3066223144531, + "kl_loss_7": 181.0632652282715, + "learning_rate": 4.0682721746773344e-05, + "loss": 521.2992, + "step": 8720 + }, + { + "ce_loss_10": 3.4832905650138857, + "ce_loss_13": 3.4249367475509644, + "ce_loss_2": 3.961899662017822, + "ce_loss_3": 3.788464534282684, + "ce_loss_7": 3.527579641342163, + "epoch": 0.873, + "grad_norm": 370.0, + "kl_loss_10": 91.51293182373047, + "kl_loss_2": 1104.7394775390626, + "kl_loss_3": 759.5037414550782, + "kl_loss_7": 185.07400512695312, + "learning_rate": 4.0058131789920904e-05, + "loss": 521.9289, + "step": 8730 + }, + { + "ce_loss_10": 3.640140187740326, + "ce_loss_13": 3.57983558177948, + "ce_loss_2": 4.088211476802826, + "ce_loss_3": 3.927894616127014, + "ce_loss_7": 3.6845538139343263, + "epoch": 0.874, + "grad_norm": 438.0, + "kl_loss_10": 95.66121215820313, + "kl_loss_2": 1082.0109283447266, + "kl_loss_3": 751.8433319091797, + "kl_loss_7": 184.97217254638673, + "learning_rate": 3.9438173442575e-05, + "loss": 542.025, + "step": 8740 + }, + { + "ce_loss_10": 3.668476128578186, + "ce_loss_13": 3.6084399461746215, + "ce_loss_2": 4.114363825321197, + "ce_loss_3": 3.948890733718872, + "ce_loss_7": 3.712895894050598, + "epoch": 0.875, + "grad_norm": 360.0, + "kl_loss_10": 95.13606338500976, + "kl_loss_2": 1069.65205078125, + "kl_loss_3": 736.1352905273437, + "kl_loss_7": 185.31621551513672, + "learning_rate": 3.882285294770937e-05, + "loss": 524.7358, + "step": 8750 + }, + { + "ce_loss_10": 3.636470365524292, + "ce_loss_13": 3.576250433921814, + "ce_loss_2": 4.081735682487488, + "ce_loss_3": 3.9194202423095703, + "ce_loss_7": 3.6787103533744814, + "epoch": 0.876, + "grad_norm": 372.0, + "kl_loss_10": 97.42237510681153, + "kl_loss_2": 1070.8320678710938, + "kl_loss_3": 736.4440826416015, + "kl_loss_7": 186.42294464111328, + "learning_rate": 3.821217650159453e-05, + "loss": 528.159, + "step": 8760 + }, + { + "ce_loss_10": 3.501795244216919, + "ce_loss_13": 3.445420837402344, + "ce_loss_2": 3.993399131298065, + "ce_loss_3": 3.819171416759491, + "ce_loss_7": 3.5519042015075684, + "epoch": 0.877, + "grad_norm": 398.0, + "kl_loss_10": 91.19635620117188, + "kl_loss_2": 1126.038784790039, + "kl_loss_3": 777.8552947998047, + "kl_loss_7": 188.21297302246094, + "learning_rate": 3.760615025373543e-05, + "loss": 535.8912, + "step": 8770 + }, + { + "ce_loss_10": 3.687652599811554, + "ce_loss_13": 3.6275517463684084, + "ce_loss_2": 4.149944150447846, + "ce_loss_3": 3.984694278240204, + "ce_loss_7": 3.7361566066741942, + "epoch": 0.878, + "grad_norm": 426.0, + "kl_loss_10": 98.53735313415527, + "kl_loss_2": 1087.7767242431642, + "kl_loss_3": 754.1841644287109, + "kl_loss_7": 191.66405487060547, + "learning_rate": 3.700478030680987e-05, + "loss": 534.6525, + "step": 8780 + }, + { + "ce_loss_10": 3.672296917438507, + "ce_loss_13": 3.6126784920692443, + "ce_loss_2": 4.126206862926483, + "ce_loss_3": 3.9555336833000183, + "ce_loss_7": 3.7154035449028013, + "epoch": 0.879, + "grad_norm": 400.0, + "kl_loss_10": 95.93194694519043, + "kl_loss_2": 1067.7572967529297, + "kl_loss_3": 734.3840759277343, + "kl_loss_7": 185.99778594970704, + "learning_rate": 3.6408072716606344e-05, + "loss": 520.9604, + "step": 8790 + }, + { + "ce_loss_10": 3.5921829104423524, + "ce_loss_13": 3.5314606547355654, + "ce_loss_2": 4.064702832698822, + "ce_loss_3": 3.897125017642975, + "ce_loss_7": 3.639820373058319, + "epoch": 0.88, + "grad_norm": 424.0, + "kl_loss_10": 96.45306243896485, + "kl_loss_2": 1113.6997863769532, + "kl_loss_3": 769.2831970214844, + "kl_loss_7": 189.68171615600585, + "learning_rate": 3.5816033491963716e-05, + "loss": 546.457, + "step": 8800 + }, + { + "ce_loss_10": 3.4587510585784913, + "ce_loss_13": 3.398640847206116, + "ce_loss_2": 3.9295639514923097, + "ce_loss_3": 3.755736696720123, + "ce_loss_7": 3.502725625038147, + "epoch": 0.881, + "grad_norm": 374.0, + "kl_loss_10": 94.41120719909668, + "kl_loss_2": 1107.7318145751954, + "kl_loss_3": 762.6848449707031, + "kl_loss_7": 185.3354965209961, + "learning_rate": 3.522866859471047e-05, + "loss": 531.675, + "step": 8810 + }, + { + "ce_loss_10": 3.7003540635108947, + "ce_loss_13": 3.6417059302330017, + "ce_loss_2": 4.134489345550537, + "ce_loss_3": 3.972803270816803, + "ce_loss_7": 3.7418115973472594, + "epoch": 0.882, + "grad_norm": 620.0, + "kl_loss_10": 93.44988250732422, + "kl_loss_2": 1046.9635864257812, + "kl_loss_3": 718.645751953125, + "kl_loss_7": 180.43475570678712, + "learning_rate": 3.46459839396045e-05, + "loss": 519.2549, + "step": 8820 + }, + { + "ce_loss_10": 3.6235634326934814, + "ce_loss_13": 3.5625478267669677, + "ce_loss_2": 4.090062844753265, + "ce_loss_3": 3.9221726655960083, + "ce_loss_7": 3.6677647113800047, + "epoch": 0.883, + "grad_norm": 392.0, + "kl_loss_10": 97.41650848388672, + "kl_loss_2": 1090.359048461914, + "kl_loss_3": 752.6492370605469, + "kl_loss_7": 188.19114456176757, + "learning_rate": 3.406798539427386e-05, + "loss": 541.4702, + "step": 8830 + }, + { + "ce_loss_10": 3.6815385699272154, + "ce_loss_13": 3.622318422794342, + "ce_loss_2": 4.134820902347565, + "ce_loss_3": 3.9722886800765993, + "ce_loss_7": 3.7261940598487855, + "epoch": 0.884, + "grad_norm": 458.0, + "kl_loss_10": 95.14997901916504, + "kl_loss_2": 1087.6108123779297, + "kl_loss_3": 753.6235443115235, + "kl_loss_7": 186.09493026733398, + "learning_rate": 3.349467877915746e-05, + "loss": 532.4207, + "step": 8840 + }, + { + "ce_loss_10": 3.6383310556411743, + "ce_loss_13": 3.578685259819031, + "ce_loss_2": 4.10920352935791, + "ce_loss_3": 3.9395066857337953, + "ce_loss_7": 3.684439957141876, + "epoch": 0.885, + "grad_norm": 346.0, + "kl_loss_10": 94.56938552856445, + "kl_loss_2": 1107.4275299072265, + "kl_loss_3": 766.7192443847656, + "kl_loss_7": 187.05870895385743, + "learning_rate": 3.292606986744667e-05, + "loss": 544.0854, + "step": 8850 + }, + { + "ce_loss_10": 3.593039667606354, + "ce_loss_13": 3.5363111972808836, + "ce_loss_2": 4.061631453037262, + "ce_loss_3": 3.888974642753601, + "ce_loss_7": 3.6354947090148926, + "epoch": 0.886, + "grad_norm": 312.0, + "kl_loss_10": 94.36025886535644, + "kl_loss_2": 1094.437567138672, + "kl_loss_3": 755.0413787841796, + "kl_loss_7": 185.15854110717774, + "learning_rate": 3.23621643850267e-05, + "loss": 531.352, + "step": 8860 + }, + { + "ce_loss_10": 3.6675365686416628, + "ce_loss_13": 3.608867907524109, + "ce_loss_2": 4.1205101132392885, + "ce_loss_3": 3.9526678919792175, + "ce_loss_7": 3.71103777885437, + "epoch": 0.887, + "grad_norm": 398.0, + "kl_loss_10": 95.91901359558105, + "kl_loss_2": 1094.978707885742, + "kl_loss_3": 758.2980133056641, + "kl_loss_7": 187.99334793090821, + "learning_rate": 3.180296801041971e-05, + "loss": 525.304, + "step": 8870 + }, + { + "ce_loss_10": 3.6939959645271303, + "ce_loss_13": 3.6341704607009886, + "ce_loss_2": 4.136724853515625, + "ce_loss_3": 3.976076662540436, + "ce_loss_7": 3.7369011640548706, + "epoch": 0.888, + "grad_norm": 322.0, + "kl_loss_10": 96.13762168884277, + "kl_loss_2": 1061.462728881836, + "kl_loss_3": 731.0939331054688, + "kl_loss_7": 185.31768493652345, + "learning_rate": 3.124848637472688e-05, + "loss": 515.8721, + "step": 8880 + }, + { + "ce_loss_10": 3.5114728569984437, + "ce_loss_13": 3.452458143234253, + "ce_loss_2": 3.9819056034088134, + "ce_loss_3": 3.8095321655273438, + "ce_loss_7": 3.5549168229103087, + "epoch": 0.889, + "grad_norm": 430.0, + "kl_loss_10": 92.77987136840821, + "kl_loss_2": 1105.7576904296875, + "kl_loss_3": 760.3018249511719, + "kl_loss_7": 183.98031311035157, + "learning_rate": 3.069872506157212e-05, + "loss": 529.9256, + "step": 8890 + }, + { + "ce_loss_10": 3.6096359133720397, + "ce_loss_13": 3.5530964136123657, + "ce_loss_2": 4.066385662555694, + "ce_loss_3": 3.9037466764450075, + "ce_loss_7": 3.653862941265106, + "epoch": 0.89, + "grad_norm": 414.0, + "kl_loss_10": 94.68969841003418, + "kl_loss_2": 1082.7529907226562, + "kl_loss_3": 748.9955108642578, + "kl_loss_7": 186.7980583190918, + "learning_rate": 3.0153689607045842e-05, + "loss": 522.4292, + "step": 8900 + }, + { + "ce_loss_10": 3.5076727747917174, + "ce_loss_13": 3.4481669664382935, + "ce_loss_2": 3.998192644119263, + "ce_loss_3": 3.8251919507980348, + "ce_loss_7": 3.5543401718139647, + "epoch": 0.891, + "grad_norm": 462.0, + "kl_loss_10": 96.1771800994873, + "kl_loss_2": 1157.3876403808595, + "kl_loss_3": 799.3413696289062, + "kl_loss_7": 192.33385009765624, + "learning_rate": 2.9613385499648926e-05, + "loss": 537.2502, + "step": 8910 + }, + { + "ce_loss_10": 3.5617488503456114, + "ce_loss_13": 3.5028850078582763, + "ce_loss_2": 4.028625464439392, + "ce_loss_3": 3.8606330037117003, + "ce_loss_7": 3.60619056224823, + "epoch": 0.892, + "grad_norm": 364.0, + "kl_loss_10": 92.3734031677246, + "kl_loss_2": 1092.7289123535156, + "kl_loss_3": 755.3269073486329, + "kl_loss_7": 183.66201095581056, + "learning_rate": 2.9077818180237692e-05, + "loss": 529.899, + "step": 8920 + }, + { + "ce_loss_10": 3.611976993083954, + "ce_loss_13": 3.5523295164108277, + "ce_loss_2": 4.088427019119263, + "ce_loss_3": 3.911720395088196, + "ce_loss_7": 3.6568928718566895, + "epoch": 0.893, + "grad_norm": 604.0, + "kl_loss_10": 95.37241554260254, + "kl_loss_2": 1091.7466766357422, + "kl_loss_3": 749.5647033691406, + "kl_loss_7": 185.87219848632813, + "learning_rate": 2.8546993041969172e-05, + "loss": 528.8222, + "step": 8930 + }, + { + "ce_loss_10": 3.649553382396698, + "ce_loss_13": 3.5936214447021486, + "ce_loss_2": 4.095563900470734, + "ce_loss_3": 3.9343939542770388, + "ce_loss_7": 3.6919458627700807, + "epoch": 0.894, + "grad_norm": 356.0, + "kl_loss_10": 92.16914100646973, + "kl_loss_2": 1065.6531127929688, + "kl_loss_3": 739.0178924560547, + "kl_loss_7": 182.67144699096679, + "learning_rate": 2.802091543024671e-05, + "loss": 525.8132, + "step": 8940 + }, + { + "ce_loss_10": 3.6456188917160035, + "ce_loss_13": 3.5855357170104982, + "ce_loss_2": 4.1163407325744625, + "ce_loss_3": 3.9452737092971804, + "ce_loss_7": 3.690487289428711, + "epoch": 0.895, + "grad_norm": 376.0, + "kl_loss_10": 94.99068603515624, + "kl_loss_2": 1107.8523712158203, + "kl_loss_3": 763.5164489746094, + "kl_loss_7": 187.85556182861328, + "learning_rate": 2.7499590642665774e-05, + "loss": 543.5269, + "step": 8950 + }, + { + "ce_loss_10": 3.6521722793579103, + "ce_loss_13": 3.5920722246170045, + "ce_loss_2": 4.112611806392669, + "ce_loss_3": 3.942758357524872, + "ce_loss_7": 3.6924882411956785, + "epoch": 0.896, + "grad_norm": 434.0, + "kl_loss_10": 97.21023635864258, + "kl_loss_2": 1089.4108154296875, + "kl_loss_3": 742.6543731689453, + "kl_loss_7": 186.23975067138673, + "learning_rate": 2.6983023928961405e-05, + "loss": 523.9287, + "step": 8960 + }, + { + "ce_loss_10": 3.6287880539894104, + "ce_loss_13": 3.569942307472229, + "ce_loss_2": 4.086234021186828, + "ce_loss_3": 3.919290018081665, + "ce_loss_7": 3.6727704763412476, + "epoch": 0.897, + "grad_norm": 428.0, + "kl_loss_10": 96.33384323120117, + "kl_loss_2": 1081.610333251953, + "kl_loss_3": 747.162060546875, + "kl_loss_7": 187.28789825439452, + "learning_rate": 2.6471220490954628e-05, + "loss": 531.8677, + "step": 8970 + }, + { + "ce_loss_10": 3.6082414865493773, + "ce_loss_13": 3.5503612399101256, + "ce_loss_2": 4.054306983947754, + "ce_loss_3": 3.8875715851783754, + "ce_loss_7": 3.647981250286102, + "epoch": 0.898, + "grad_norm": 402.0, + "kl_loss_10": 93.92480773925782, + "kl_loss_2": 1068.1579833984374, + "kl_loss_3": 736.318814086914, + "kl_loss_7": 183.30384826660156, + "learning_rate": 2.596418548250029e-05, + "loss": 527.9295, + "step": 8980 + }, + { + "ce_loss_10": 3.6551415085792542, + "ce_loss_13": 3.5952192187309264, + "ce_loss_2": 4.1076843500137326, + "ce_loss_3": 3.944980025291443, + "ce_loss_7": 3.700137984752655, + "epoch": 0.899, + "grad_norm": 396.0, + "kl_loss_10": 97.98623161315918, + "kl_loss_2": 1081.954281616211, + "kl_loss_3": 746.2776489257812, + "kl_loss_7": 188.93777618408203, + "learning_rate": 2.5461924009435368e-05, + "loss": 524.2467, + "step": 8990 + }, + { + "ce_loss_10": 3.650333786010742, + "ce_loss_13": 3.590772497653961, + "ce_loss_2": 4.109632253646851, + "ce_loss_3": 3.9412980914115905, + "ce_loss_7": 3.6946743369102477, + "epoch": 0.9, + "grad_norm": 410.0, + "kl_loss_10": 96.09890708923339, + "kl_loss_2": 1079.7472290039063, + "kl_loss_3": 745.8318054199219, + "kl_loss_7": 186.11589736938475, + "learning_rate": 2.4964441129527336e-05, + "loss": 536.0899, + "step": 9000 + }, + { + "ce_loss_10": 3.6510029554367067, + "ce_loss_13": 3.590871715545654, + "ce_loss_2": 4.100390136241913, + "ce_loss_3": 3.932853305339813, + "ce_loss_7": 3.6917531371116636, + "epoch": 0.901, + "grad_norm": 418.0, + "kl_loss_10": 95.55135993957519, + "kl_loss_2": 1061.7380157470702, + "kl_loss_3": 727.2771514892578, + "kl_loss_7": 183.68069381713866, + "learning_rate": 2.4471741852423235e-05, + "loss": 518.1353, + "step": 9010 + }, + { + "ce_loss_10": 3.695908546447754, + "ce_loss_13": 3.6349289417266846, + "ce_loss_2": 4.151931369304657, + "ce_loss_3": 3.98497998714447, + "ce_loss_7": 3.739882934093475, + "epoch": 0.902, + "grad_norm": 392.0, + "kl_loss_10": 95.51335906982422, + "kl_loss_2": 1066.5906768798827, + "kl_loss_3": 733.3630157470703, + "kl_loss_7": 184.28593063354492, + "learning_rate": 2.3983831139599287e-05, + "loss": 522.8627, + "step": 9020 + }, + { + "ce_loss_10": 3.617437481880188, + "ce_loss_13": 3.558865213394165, + "ce_loss_2": 4.061969435214996, + "ce_loss_3": 3.8991889357566833, + "ce_loss_7": 3.660116195678711, + "epoch": 0.903, + "grad_norm": 456.0, + "kl_loss_10": 93.39376106262208, + "kl_loss_2": 1059.7717498779298, + "kl_loss_3": 733.3598663330079, + "kl_loss_7": 181.95840148925782, + "learning_rate": 2.3500713904311022e-05, + "loss": 512.7801, + "step": 9030 + }, + { + "ce_loss_10": 3.659070146083832, + "ce_loss_13": 3.5992442965507507, + "ce_loss_2": 4.08744889497757, + "ce_loss_3": 3.9278596162796022, + "ce_loss_7": 3.700530481338501, + "epoch": 0.904, + "grad_norm": 472.0, + "kl_loss_10": 95.7885025024414, + "kl_loss_2": 1036.0338073730468, + "kl_loss_3": 713.1754333496094, + "kl_loss_7": 181.65938034057618, + "learning_rate": 2.3022395011543685e-05, + "loss": 514.4845, + "step": 9040 + }, + { + "ce_loss_10": 3.6909992337226867, + "ce_loss_13": 3.630416977405548, + "ce_loss_2": 4.144919979572296, + "ce_loss_3": 3.98409184217453, + "ce_loss_7": 3.735574746131897, + "epoch": 0.905, + "grad_norm": 400.0, + "kl_loss_10": 95.80096397399902, + "kl_loss_2": 1091.1403015136718, + "kl_loss_3": 758.9450408935547, + "kl_loss_7": 188.74431228637695, + "learning_rate": 2.2548879277963063e-05, + "loss": 536.6219, + "step": 9050 + }, + { + "ce_loss_10": 3.6055094718933107, + "ce_loss_13": 3.5453344702720644, + "ce_loss_2": 4.055747485160827, + "ce_loss_3": 3.8876903891563415, + "ce_loss_7": 3.645590376853943, + "epoch": 0.906, + "grad_norm": 312.0, + "kl_loss_10": 94.81256561279297, + "kl_loss_2": 1081.8126281738282, + "kl_loss_3": 743.9638031005859, + "kl_loss_7": 185.8631164550781, + "learning_rate": 2.208017147186736e-05, + "loss": 517.0646, + "step": 9060 + }, + { + "ce_loss_10": 3.5984405398368837, + "ce_loss_13": 3.5392195105552675, + "ce_loss_2": 4.055430555343628, + "ce_loss_3": 3.8891077756881716, + "ce_loss_7": 3.643998312950134, + "epoch": 0.907, + "grad_norm": 424.0, + "kl_loss_10": 95.52283592224121, + "kl_loss_2": 1082.7356536865234, + "kl_loss_3": 749.8307952880859, + "kl_loss_7": 186.6390350341797, + "learning_rate": 2.1616276313139227e-05, + "loss": 522.272, + "step": 9070 + }, + { + "ce_loss_10": 3.6377461314201356, + "ce_loss_13": 3.5757868885993958, + "ce_loss_2": 4.087118625640869, + "ce_loss_3": 3.9254656434059143, + "ce_loss_7": 3.680292618274689, + "epoch": 0.908, + "grad_norm": 362.0, + "kl_loss_10": 96.6335952758789, + "kl_loss_2": 1071.57734375, + "kl_loss_3": 743.0760345458984, + "kl_loss_7": 186.97156448364257, + "learning_rate": 2.1157198473197415e-05, + "loss": 527.4616, + "step": 9080 + }, + { + "ce_loss_10": 3.7054911255836487, + "ce_loss_13": 3.646452081203461, + "ce_loss_2": 4.16020712852478, + "ce_loss_3": 3.99694961309433, + "ce_loss_7": 3.7527972936630247, + "epoch": 0.909, + "grad_norm": 428.0, + "kl_loss_10": 95.60770835876465, + "kl_loss_2": 1073.3848999023437, + "kl_loss_3": 744.7516662597657, + "kl_loss_7": 188.15945053100586, + "learning_rate": 2.0702942574950812e-05, + "loss": 526.0792, + "step": 9090 + }, + { + "ce_loss_10": 3.623731589317322, + "ce_loss_13": 3.5640787363052366, + "ce_loss_2": 4.083542311191559, + "ce_loss_3": 3.9220656394958495, + "ce_loss_7": 3.669620490074158, + "epoch": 0.91, + "grad_norm": 302.0, + "kl_loss_10": 95.35622863769531, + "kl_loss_2": 1087.3217651367188, + "kl_loss_3": 752.284033203125, + "kl_loss_7": 187.5697151184082, + "learning_rate": 2.025351319275137e-05, + "loss": 528.1311, + "step": 9100 + }, + { + "ce_loss_10": 3.761759030818939, + "ce_loss_13": 3.6962865233421325, + "ce_loss_2": 4.2175662279129025, + "ce_loss_3": 4.051000607013703, + "ce_loss_7": 3.8052276611328124, + "epoch": 0.911, + "grad_norm": 420.0, + "kl_loss_10": 101.6547290802002, + "kl_loss_2": 1108.3317321777345, + "kl_loss_3": 765.9157867431641, + "kl_loss_7": 194.34442520141602, + "learning_rate": 1.9808914852347816e-05, + "loss": 545.7752, + "step": 9110 + }, + { + "ce_loss_10": 3.599123954772949, + "ce_loss_13": 3.539510524272919, + "ce_loss_2": 4.069272911548614, + "ce_loss_3": 3.9009834051132204, + "ce_loss_7": 3.6455170154571532, + "epoch": 0.912, + "grad_norm": 416.0, + "kl_loss_10": 95.14377288818359, + "kl_loss_2": 1095.5253448486328, + "kl_loss_3": 750.8630340576171, + "kl_loss_7": 187.0247688293457, + "learning_rate": 1.9369152030840554e-05, + "loss": 527.6025, + "step": 9120 + }, + { + "ce_loss_10": 3.6806903958320616, + "ce_loss_13": 3.620557761192322, + "ce_loss_2": 4.135490739345551, + "ce_loss_3": 3.9653069972991943, + "ce_loss_7": 3.723483943939209, + "epoch": 0.913, + "grad_norm": 362.0, + "kl_loss_10": 97.92795066833496, + "kl_loss_2": 1089.1937438964844, + "kl_loss_3": 747.6420379638672, + "kl_loss_7": 187.34563446044922, + "learning_rate": 1.893422915663645e-05, + "loss": 529.2906, + "step": 9130 + }, + { + "ce_loss_10": 3.5489492774009705, + "ce_loss_13": 3.488741672039032, + "ce_loss_2": 4.032487225532532, + "ce_loss_3": 3.862708866596222, + "ce_loss_7": 3.594150650501251, + "epoch": 0.914, + "grad_norm": 460.0, + "kl_loss_10": 95.81211128234864, + "kl_loss_2": 1122.290625, + "kl_loss_3": 780.3386810302734, + "kl_loss_7": 190.92548141479492, + "learning_rate": 1.850415060940386e-05, + "loss": 539.4046, + "step": 9140 + }, + { + "ce_loss_10": 3.670183026790619, + "ce_loss_13": 3.611021101474762, + "ce_loss_2": 4.120828151702881, + "ce_loss_3": 3.9584792375564577, + "ce_loss_7": 3.712183046340942, + "epoch": 0.915, + "grad_norm": 418.0, + "kl_loss_10": 95.88972358703613, + "kl_loss_2": 1074.5135314941406, + "kl_loss_3": 738.371826171875, + "kl_loss_7": 185.7539405822754, + "learning_rate": 1.8078920720028978e-05, + "loss": 525.966, + "step": 9150 + }, + { + "ce_loss_10": 3.600800943374634, + "ce_loss_13": 3.5434103488922117, + "ce_loss_2": 4.046385419368744, + "ce_loss_3": 3.8842490911483765, + "ce_loss_7": 3.6435607194900514, + "epoch": 0.916, + "grad_norm": 468.0, + "kl_loss_10": 94.49675407409669, + "kl_loss_2": 1068.3072998046875, + "kl_loss_3": 736.1623046875, + "kl_loss_7": 182.35257797241212, + "learning_rate": 1.765854377057219e-05, + "loss": 533.5915, + "step": 9160 + }, + { + "ce_loss_10": 3.579929566383362, + "ce_loss_13": 3.52090607881546, + "ce_loss_2": 4.0303690195083615, + "ce_loss_3": 3.863832104206085, + "ce_loss_7": 3.621261489391327, + "epoch": 0.917, + "grad_norm": 344.0, + "kl_loss_10": 93.69845123291016, + "kl_loss_2": 1076.374838256836, + "kl_loss_3": 739.5320068359375, + "kl_loss_7": 182.73907394409179, + "learning_rate": 1.724302399422456e-05, + "loss": 525.9574, + "step": 9170 + }, + { + "ce_loss_10": 3.5273375153541564, + "ce_loss_13": 3.469092321395874, + "ce_loss_2": 3.98960462808609, + "ce_loss_3": 3.8235998272895815, + "ce_loss_7": 3.572177302837372, + "epoch": 0.918, + "grad_norm": 328.0, + "kl_loss_10": 94.86108894348145, + "kl_loss_2": 1092.3598358154297, + "kl_loss_3": 757.3310150146484, + "kl_loss_7": 188.48751983642578, + "learning_rate": 1.683236557526574e-05, + "loss": 533.8531, + "step": 9180 + }, + { + "ce_loss_10": 3.6514230132102967, + "ce_loss_13": 3.59556097984314, + "ce_loss_2": 4.083134496212006, + "ce_loss_3": 3.926029086112976, + "ce_loss_7": 3.693097734451294, + "epoch": 0.919, + "grad_norm": 276.0, + "kl_loss_10": 94.37221069335938, + "kl_loss_2": 1047.5379638671875, + "kl_loss_3": 720.9200286865234, + "kl_loss_7": 181.39565734863282, + "learning_rate": 1.6426572649021475e-05, + "loss": 520.5356, + "step": 9190 + }, + { + "ce_loss_10": 3.6877851486206055, + "ce_loss_13": 3.6274981617927553, + "ce_loss_2": 4.1144737839698795, + "ce_loss_3": 3.9595839619636535, + "ce_loss_7": 3.7264232993125916, + "epoch": 0.92, + "grad_norm": 430.0, + "kl_loss_10": 99.18587074279785, + "kl_loss_2": 1047.7421783447267, + "kl_loss_3": 721.9292663574219, + "kl_loss_7": 186.34831695556642, + "learning_rate": 1.6025649301821876e-05, + "loss": 520.097, + "step": 9200 + }, + { + "ce_loss_10": 3.6789560437202455, + "ce_loss_13": 3.6199841260910035, + "ce_loss_2": 4.116438376903534, + "ce_loss_3": 3.95575532913208, + "ce_loss_7": 3.720892333984375, + "epoch": 0.921, + "grad_norm": 430.0, + "kl_loss_10": 95.03273735046386, + "kl_loss_2": 1068.5045623779297, + "kl_loss_3": 740.7460571289063, + "kl_loss_7": 185.96430587768555, + "learning_rate": 1.5629599570960716e-05, + "loss": 522.4428, + "step": 9210 + }, + { + "ce_loss_10": 3.579318141937256, + "ce_loss_13": 3.5199381947517394, + "ce_loss_2": 4.029832947254181, + "ce_loss_3": 3.865503740310669, + "ce_loss_7": 3.6221681237220764, + "epoch": 0.922, + "grad_norm": 430.0, + "kl_loss_10": 94.97879791259766, + "kl_loss_2": 1084.768603515625, + "kl_loss_3": 748.8800231933594, + "kl_loss_7": 185.368741607666, + "learning_rate": 1.5238427444654367e-05, + "loss": 526.936, + "step": 9220 + }, + { + "ce_loss_10": 3.642410922050476, + "ce_loss_13": 3.5841264009475706, + "ce_loss_2": 4.090620064735413, + "ce_loss_3": 3.929516541957855, + "ce_loss_7": 3.68586403131485, + "epoch": 0.923, + "grad_norm": 340.0, + "kl_loss_10": 95.43446731567383, + "kl_loss_2": 1061.9394897460938, + "kl_loss_3": 729.8539154052735, + "kl_loss_7": 184.269775390625, + "learning_rate": 1.4852136862001764e-05, + "loss": 521.6809, + "step": 9230 + }, + { + "ce_loss_10": 3.6022266387939452, + "ce_loss_13": 3.5460850477218626, + "ce_loss_2": 4.056096696853638, + "ce_loss_3": 3.894578981399536, + "ce_loss_7": 3.6445172667503356, + "epoch": 0.924, + "grad_norm": 382.0, + "kl_loss_10": 90.83601989746094, + "kl_loss_2": 1070.5055114746094, + "kl_loss_3": 735.5364959716796, + "kl_loss_7": 180.06712493896484, + "learning_rate": 1.4470731712944884e-05, + "loss": 526.6606, + "step": 9240 + }, + { + "ce_loss_10": 3.632104980945587, + "ce_loss_13": 3.573563551902771, + "ce_loss_2": 4.086918556690216, + "ce_loss_3": 3.921724486351013, + "ce_loss_7": 3.676921808719635, + "epoch": 0.925, + "grad_norm": 404.0, + "kl_loss_10": 93.8505702972412, + "kl_loss_2": 1076.019464111328, + "kl_loss_3": 742.9348846435547, + "kl_loss_7": 185.7860206604004, + "learning_rate": 1.4094215838229174e-05, + "loss": 532.0963, + "step": 9250 + }, + { + "ce_loss_10": 3.5902254581451416, + "ce_loss_13": 3.531176710128784, + "ce_loss_2": 4.053838360309601, + "ce_loss_3": 3.8887827515602114, + "ce_loss_7": 3.634320020675659, + "epoch": 0.926, + "grad_norm": 440.0, + "kl_loss_10": 95.00082511901856, + "kl_loss_2": 1108.7564575195313, + "kl_loss_3": 761.1957458496094, + "kl_loss_7": 187.39419326782226, + "learning_rate": 1.372259302936546e-05, + "loss": 548.2919, + "step": 9260 + }, + { + "ce_loss_10": 3.7115341782569886, + "ce_loss_13": 3.6472853660583495, + "ce_loss_2": 4.159861445426941, + "ce_loss_3": 3.998417854309082, + "ce_loss_7": 3.7543888211250307, + "epoch": 0.927, + "grad_norm": 304.0, + "kl_loss_10": 100.11175384521485, + "kl_loss_2": 1075.1090118408204, + "kl_loss_3": 744.2237152099609, + "kl_loss_7": 190.9360038757324, + "learning_rate": 1.3355867028591206e-05, + "loss": 520.805, + "step": 9270 + }, + { + "ce_loss_10": 3.6113879919052123, + "ce_loss_13": 3.5496174573898314, + "ce_loss_2": 4.047625136375427, + "ce_loss_3": 3.8916648983955384, + "ce_loss_7": 3.653665769100189, + "epoch": 0.928, + "grad_norm": 334.0, + "kl_loss_10": 94.99486846923828, + "kl_loss_2": 1063.383090209961, + "kl_loss_3": 737.3780670166016, + "kl_loss_7": 184.87188415527345, + "learning_rate": 1.2994041528833267e-05, + "loss": 520.9468, + "step": 9280 + }, + { + "ce_loss_10": 3.612771439552307, + "ce_loss_13": 3.5519652009010314, + "ce_loss_2": 4.069023680686951, + "ce_loss_3": 3.9033527731895448, + "ce_loss_7": 3.653776025772095, + "epoch": 0.929, + "grad_norm": 394.0, + "kl_loss_10": 94.48731269836426, + "kl_loss_2": 1086.341064453125, + "kl_loss_3": 747.7527069091797, + "kl_loss_7": 184.27003555297853, + "learning_rate": 1.2637120173670358e-05, + "loss": 525.795, + "step": 9290 + }, + { + "ce_loss_10": 3.6342510104179384, + "ce_loss_13": 3.574049484729767, + "ce_loss_2": 4.097525525093078, + "ce_loss_3": 3.9327287077903748, + "ce_loss_7": 3.6803439974784853, + "epoch": 0.93, + "grad_norm": 492.0, + "kl_loss_10": 94.73881340026855, + "kl_loss_2": 1086.5091583251954, + "kl_loss_3": 750.7861236572265, + "kl_loss_7": 186.8117706298828, + "learning_rate": 1.2285106557296478e-05, + "loss": 526.7854, + "step": 9300 + }, + { + "ce_loss_10": 3.513438880443573, + "ce_loss_13": 3.453951287269592, + "ce_loss_2": 3.9955971360206606, + "ce_loss_3": 3.8230167746543886, + "ce_loss_7": 3.555509877204895, + "epoch": 0.931, + "grad_norm": 356.0, + "kl_loss_10": 93.80283432006836, + "kl_loss_2": 1116.4696807861328, + "kl_loss_3": 771.7997375488281, + "kl_loss_7": 186.52389373779297, + "learning_rate": 1.1938004224484989e-05, + "loss": 533.0822, + "step": 9310 + }, + { + "ce_loss_10": 3.7524689197540284, + "ce_loss_13": 3.6876933336257935, + "ce_loss_2": 4.20148618221283, + "ce_loss_3": 4.035860347747803, + "ce_loss_7": 3.7956905245780943, + "epoch": 0.932, + "grad_norm": 418.0, + "kl_loss_10": 99.70074195861817, + "kl_loss_2": 1085.114028930664, + "kl_loss_3": 747.7518859863281, + "kl_loss_7": 189.80009078979492, + "learning_rate": 1.1595816670552429e-05, + "loss": 536.128, + "step": 9320 + }, + { + "ce_loss_10": 3.6811413764953613, + "ce_loss_13": 3.619305157661438, + "ce_loss_2": 4.1267077088356015, + "ce_loss_3": 3.9628111124038696, + "ce_loss_7": 3.7232463002204894, + "epoch": 0.933, + "grad_norm": 430.0, + "kl_loss_10": 98.55138320922852, + "kl_loss_2": 1066.0611297607422, + "kl_loss_3": 732.6245086669921, + "kl_loss_7": 187.06882858276367, + "learning_rate": 1.1258547341323699e-05, + "loss": 518.9695, + "step": 9330 + }, + { + "ce_loss_10": 3.706856846809387, + "ce_loss_13": 3.6450837016105653, + "ce_loss_2": 4.152973532676697, + "ce_loss_3": 3.9891764402389525, + "ce_loss_7": 3.7481295585632326, + "epoch": 0.934, + "grad_norm": 394.0, + "kl_loss_10": 96.45535087585449, + "kl_loss_2": 1089.2688110351562, + "kl_loss_3": 747.8073425292969, + "kl_loss_7": 187.34025497436522, + "learning_rate": 1.0926199633097156e-05, + "loss": 527.061, + "step": 9340 + }, + { + "ce_loss_10": 3.7075893759727476, + "ce_loss_13": 3.6489187121391295, + "ce_loss_2": 4.135252356529236, + "ce_loss_3": 3.976875376701355, + "ce_loss_7": 3.747441065311432, + "epoch": 0.935, + "grad_norm": 428.0, + "kl_loss_10": 94.83727493286133, + "kl_loss_2": 1042.2317810058594, + "kl_loss_3": 718.6920349121094, + "kl_loss_7": 181.23108978271483, + "learning_rate": 1.0598776892610684e-05, + "loss": 526.2413, + "step": 9350 + }, + { + "ce_loss_10": 3.5169559955596923, + "ce_loss_13": 3.4603365540504454, + "ce_loss_2": 3.9802993655204775, + "ce_loss_3": 3.8121800780296327, + "ce_loss_7": 3.561786246299744, + "epoch": 0.936, + "grad_norm": 334.0, + "kl_loss_10": 92.96564292907715, + "kl_loss_2": 1091.1406646728515, + "kl_loss_3": 747.6543731689453, + "kl_loss_7": 183.7804039001465, + "learning_rate": 1.0276282417007399e-05, + "loss": 521.9861, + "step": 9360 + }, + { + "ce_loss_10": 3.6849416494369507, + "ce_loss_13": 3.626581645011902, + "ce_loss_2": 4.118964040279389, + "ce_loss_3": 3.9585147976875303, + "ce_loss_7": 3.7237794518470766, + "epoch": 0.937, + "grad_norm": 464.0, + "kl_loss_10": 95.02116394042969, + "kl_loss_2": 1044.2026397705079, + "kl_loss_3": 719.8276824951172, + "kl_loss_7": 182.06821365356444, + "learning_rate": 9.958719453803277e-06, + "loss": 518.1707, + "step": 9370 + }, + { + "ce_loss_10": 3.6774186968803404, + "ce_loss_13": 3.6149828910827635, + "ce_loss_2": 4.126804637908935, + "ce_loss_3": 3.964286994934082, + "ce_loss_7": 3.7206520080566405, + "epoch": 0.938, + "grad_norm": 364.0, + "kl_loss_10": 96.62460212707519, + "kl_loss_2": 1077.0972625732422, + "kl_loss_3": 746.5920196533203, + "kl_loss_7": 186.96116638183594, + "learning_rate": 9.646091200853802e-06, + "loss": 526.3039, + "step": 9380 + }, + { + "ce_loss_10": 3.633099365234375, + "ce_loss_13": 3.5745465636253355, + "ce_loss_2": 4.0883647203445435, + "ce_loss_3": 3.9242159128189087, + "ce_loss_7": 3.672658348083496, + "epoch": 0.939, + "grad_norm": 398.0, + "kl_loss_10": 93.04219818115234, + "kl_loss_2": 1075.2075500488281, + "kl_loss_3": 738.6250030517579, + "kl_loss_7": 181.5531784057617, + "learning_rate": 9.338400806321978e-06, + "loss": 512.8155, + "step": 9390 + }, + { + "ce_loss_10": 3.664756190776825, + "ce_loss_13": 3.603893756866455, + "ce_loss_2": 4.104370522499084, + "ce_loss_3": 3.941502547264099, + "ce_loss_7": 3.7107202291488646, + "epoch": 0.94, + "grad_norm": 330.0, + "kl_loss_10": 96.52969932556152, + "kl_loss_2": 1056.286117553711, + "kl_loss_3": 729.6215881347656, + "kl_loss_7": 186.73142929077147, + "learning_rate": 9.035651368646646e-06, + "loss": 517.5048, + "step": 9400 + }, + { + "ce_loss_10": 3.6749662160873413, + "ce_loss_13": 3.6150254607200623, + "ce_loss_2": 4.108079397678376, + "ce_loss_3": 3.9502787351608277, + "ce_loss_7": 3.71422598361969, + "epoch": 0.941, + "grad_norm": 368.0, + "kl_loss_10": 95.4813446044922, + "kl_loss_2": 1051.3231384277344, + "kl_loss_3": 730.8897918701172, + "kl_loss_7": 183.71395568847657, + "learning_rate": 8.737845936511335e-06, + "loss": 521.5386, + "step": 9410 + }, + { + "ce_loss_10": 3.621238374710083, + "ce_loss_13": 3.560182070732117, + "ce_loss_2": 4.075435829162598, + "ce_loss_3": 3.906463932991028, + "ce_loss_7": 3.6651031732559205, + "epoch": 0.942, + "grad_norm": 472.0, + "kl_loss_10": 95.50933799743652, + "kl_loss_2": 1087.418194580078, + "kl_loss_3": 749.9641418457031, + "kl_loss_7": 187.3939208984375, + "learning_rate": 8.444987508813451e-06, + "loss": 524.6778, + "step": 9420 + }, + { + "ce_loss_10": 3.567629599571228, + "ce_loss_13": 3.5098708271980286, + "ce_loss_2": 4.03240327835083, + "ce_loss_3": 3.868740451335907, + "ce_loss_7": 3.614664590358734, + "epoch": 0.943, + "grad_norm": 452.0, + "kl_loss_10": 95.83200073242188, + "kl_loss_2": 1111.0681640625, + "kl_loss_3": 769.0793914794922, + "kl_loss_7": 188.26431045532226, + "learning_rate": 8.157079034633974e-06, + "loss": 533.1891, + "step": 9430 + }, + { + "ce_loss_10": 3.5664173483848574, + "ce_loss_13": 3.5061603307724, + "ce_loss_2": 4.02851265668869, + "ce_loss_3": 3.862307035923004, + "ce_loss_7": 3.6107182621955873, + "epoch": 0.944, + "grad_norm": 426.0, + "kl_loss_10": 94.98325424194336, + "kl_loss_2": 1109.4172790527343, + "kl_loss_3": 762.6424713134766, + "kl_loss_7": 186.38191299438478, + "learning_rate": 7.874123413208145e-06, + "loss": 528.958, + "step": 9440 + }, + { + "ce_loss_10": 3.5382938742637635, + "ce_loss_13": 3.481018900871277, + "ce_loss_2": 4.006192743778229, + "ce_loss_3": 3.8386752605438232, + "ce_loss_7": 3.5831608533859254, + "epoch": 0.945, + "grad_norm": 338.0, + "kl_loss_10": 92.47231903076172, + "kl_loss_2": 1088.9563568115234, + "kl_loss_3": 753.4448974609375, + "kl_loss_7": 184.27166213989258, + "learning_rate": 7.59612349389599e-06, + "loss": 527.5225, + "step": 9450 + }, + { + "ce_loss_10": 3.633445167541504, + "ce_loss_13": 3.5758827209472654, + "ce_loss_2": 4.075440514087677, + "ce_loss_3": 3.9124983310699464, + "ce_loss_7": 3.6780736327171324, + "epoch": 0.946, + "grad_norm": 356.0, + "kl_loss_10": 91.38598556518555, + "kl_loss_2": 1046.8805053710937, + "kl_loss_3": 718.2211791992188, + "kl_loss_7": 180.72154998779297, + "learning_rate": 7.323082076153509e-06, + "loss": 519.5404, + "step": 9460 + }, + { + "ce_loss_10": 3.675933361053467, + "ce_loss_13": 3.616945672035217, + "ce_loss_2": 4.116010129451752, + "ce_loss_3": 3.954231834411621, + "ce_loss_7": 3.7195321679115296, + "epoch": 0.947, + "grad_norm": 376.0, + "kl_loss_10": 96.42714042663575, + "kl_loss_2": 1051.1879852294921, + "kl_loss_3": 727.5513549804688, + "kl_loss_7": 186.51647338867187, + "learning_rate": 7.055001909504755e-06, + "loss": 525.7655, + "step": 9470 + }, + { + "ce_loss_10": 3.7083083152770997, + "ce_loss_13": 3.647673761844635, + "ce_loss_2": 4.157342481613159, + "ce_loss_3": 3.991931939125061, + "ce_loss_7": 3.752028775215149, + "epoch": 0.948, + "grad_norm": 344.0, + "kl_loss_10": 96.79825706481934, + "kl_loss_2": 1084.5101806640625, + "kl_loss_3": 742.6272155761719, + "kl_loss_7": 187.0098518371582, + "learning_rate": 6.791885693514133e-06, + "loss": 528.4126, + "step": 9480 + }, + { + "ce_loss_10": 3.6131741404533386, + "ce_loss_13": 3.554737401008606, + "ce_loss_2": 4.069884133338928, + "ce_loss_3": 3.910088050365448, + "ce_loss_7": 3.657594072818756, + "epoch": 0.949, + "grad_norm": 444.0, + "kl_loss_10": 95.54262161254883, + "kl_loss_2": 1090.819403076172, + "kl_loss_3": 755.8211273193359, + "kl_loss_7": 187.30291366577148, + "learning_rate": 6.533736077758867e-06, + "loss": 532.407, + "step": 9490 + }, + { + "ce_loss_10": 3.5753329753875733, + "ce_loss_13": 3.5157718658447266, + "ce_loss_2": 4.050174379348755, + "ce_loss_3": 3.878748118877411, + "ce_loss_7": 3.621631395816803, + "epoch": 0.95, + "grad_norm": 454.0, + "kl_loss_10": 95.78313636779785, + "kl_loss_2": 1112.5021850585938, + "kl_loss_3": 766.8859832763671, + "kl_loss_7": 188.93851776123046, + "learning_rate": 6.2805556618028556e-06, + "loss": 531.8975, + "step": 9500 + }, + { + "ce_loss_10": 3.6739890694618227, + "ce_loss_13": 3.614563775062561, + "ce_loss_2": 4.105420649051666, + "ce_loss_3": 3.946949827671051, + "ce_loss_7": 3.713826298713684, + "epoch": 0.951, + "grad_norm": 428.0, + "kl_loss_10": 95.29025764465332, + "kl_loss_2": 1035.753839111328, + "kl_loss_3": 718.9863189697265, + "kl_loss_7": 182.34558639526367, + "learning_rate": 6.032346995169968e-06, + "loss": 506.1833, + "step": 9510 + }, + { + "ce_loss_10": 3.6744378566741944, + "ce_loss_13": 3.6160669803619383, + "ce_loss_2": 4.116178596019745, + "ce_loss_3": 3.952050065994263, + "ce_loss_7": 3.714146387577057, + "epoch": 0.952, + "grad_norm": 350.0, + "kl_loss_10": 95.77439384460449, + "kl_loss_2": 1065.6743865966796, + "kl_loss_3": 734.3932067871094, + "kl_loss_7": 184.87170867919923, + "learning_rate": 5.789112577318789e-06, + "loss": 520.2576, + "step": 9520 + }, + { + "ce_loss_10": 3.6489309549331663, + "ce_loss_13": 3.5895671963691713, + "ce_loss_2": 4.11376656293869, + "ce_loss_3": 3.946073520183563, + "ce_loss_7": 3.6925018429756165, + "epoch": 0.953, + "grad_norm": 460.0, + "kl_loss_10": 96.73359451293945, + "kl_loss_2": 1111.601629638672, + "kl_loss_3": 771.5278289794921, + "kl_loss_7": 187.8802848815918, + "learning_rate": 5.550854857617194e-06, + "loss": 527.3308, + "step": 9530 + }, + { + "ce_loss_10": 3.6415695905685426, + "ce_loss_13": 3.579833471775055, + "ce_loss_2": 4.102292227745056, + "ce_loss_3": 3.9383127331733703, + "ce_loss_7": 3.6863919377326964, + "epoch": 0.954, + "grad_norm": 398.0, + "kl_loss_10": 98.16804580688476, + "kl_loss_2": 1097.6046325683594, + "kl_loss_3": 757.5784729003906, + "kl_loss_7": 190.50857543945312, + "learning_rate": 5.317576235317756e-06, + "loss": 527.9396, + "step": 9540 + }, + { + "ce_loss_10": 3.6651427507400514, + "ce_loss_13": 3.604920470714569, + "ce_loss_2": 4.100248050689697, + "ce_loss_3": 3.94064177274704, + "ce_loss_7": 3.7060970425605775, + "epoch": 0.955, + "grad_norm": 386.0, + "kl_loss_10": 96.45015525817871, + "kl_loss_2": 1031.3038146972656, + "kl_loss_3": 712.4996978759766, + "kl_loss_7": 182.76630401611328, + "learning_rate": 5.089279059533658e-06, + "loss": 524.0002, + "step": 9550 + }, + { + "ce_loss_10": 3.7266568183898925, + "ce_loss_13": 3.663935911655426, + "ce_loss_2": 4.170814108848572, + "ce_loss_3": 4.006054651737213, + "ce_loss_7": 3.769794237613678, + "epoch": 0.956, + "grad_norm": 386.0, + "kl_loss_10": 100.15878944396972, + "kl_loss_2": 1068.9294799804688, + "kl_loss_3": 738.0209930419921, + "kl_loss_7": 192.08404541015625, + "learning_rate": 4.865965629214819e-06, + "loss": 520.8748, + "step": 9560 + }, + { + "ce_loss_10": 3.670477032661438, + "ce_loss_13": 3.611146903038025, + "ce_loss_2": 4.115479242801666, + "ce_loss_3": 3.9537983894348145, + "ce_loss_7": 3.7129539370536806, + "epoch": 0.957, + "grad_norm": 496.0, + "kl_loss_10": 96.79973983764648, + "kl_loss_2": 1085.6631072998048, + "kl_loss_3": 749.8902404785156, + "kl_loss_7": 188.74480895996095, + "learning_rate": 4.6476381931251366e-06, + "loss": 519.6521, + "step": 9570 + }, + { + "ce_loss_10": 3.646716892719269, + "ce_loss_13": 3.5878213763237, + "ce_loss_2": 4.089986479282379, + "ce_loss_3": 3.9314276933670045, + "ce_loss_7": 3.6911307334899903, + "epoch": 0.958, + "grad_norm": 318.0, + "kl_loss_10": 94.01541290283203, + "kl_loss_2": 1067.8105712890624, + "kl_loss_3": 740.1676208496094, + "kl_loss_7": 184.118741607666, + "learning_rate": 4.434298949819449e-06, + "loss": 523.6254, + "step": 9580 + }, + { + "ce_loss_10": 3.6008993268013, + "ce_loss_13": 3.538570249080658, + "ce_loss_2": 4.069638097286225, + "ce_loss_3": 3.8975520372390746, + "ce_loss_7": 3.6453381776809692, + "epoch": 0.959, + "grad_norm": 440.0, + "kl_loss_10": 97.41343994140625, + "kl_loss_2": 1125.892025756836, + "kl_loss_3": 772.14267578125, + "kl_loss_7": 189.9515396118164, + "learning_rate": 4.2259500476214406e-06, + "loss": 534.6609, + "step": 9590 + }, + { + "ce_loss_10": 3.58458696603775, + "ce_loss_13": 3.52560031414032, + "ce_loss_2": 4.040603399276733, + "ce_loss_3": 3.8742735624313354, + "ce_loss_7": 3.627805030345917, + "epoch": 0.96, + "grad_norm": 388.0, + "kl_loss_10": 94.08248367309571, + "kl_loss_2": 1083.009814453125, + "kl_loss_3": 746.2331970214843, + "kl_loss_7": 184.85717010498047, + "learning_rate": 4.02259358460233e-06, + "loss": 521.7564, + "step": 9600 + }, + { + "ce_loss_10": 3.6558929800987245, + "ce_loss_13": 3.5954962849617003, + "ce_loss_2": 4.101473760604859, + "ce_loss_3": 3.9380804181098936, + "ce_loss_7": 3.6987645506858824, + "epoch": 0.961, + "grad_norm": 544.0, + "kl_loss_10": 95.69773292541504, + "kl_loss_2": 1060.7937774658203, + "kl_loss_3": 733.2102172851562, + "kl_loss_7": 185.71547775268556, + "learning_rate": 3.8242316085594916e-06, + "loss": 516.8465, + "step": 9610 + }, + { + "ce_loss_10": 3.5343406558036805, + "ce_loss_13": 3.4767986059188845, + "ce_loss_2": 4.016193747520447, + "ce_loss_3": 3.8443652629852294, + "ce_loss_7": 3.580942380428314, + "epoch": 0.962, + "grad_norm": 366.0, + "kl_loss_10": 93.89258918762206, + "kl_loss_2": 1123.5916809082032, + "kl_loss_3": 780.3413696289062, + "kl_loss_7": 187.34277801513673, + "learning_rate": 3.630866116995757e-06, + "loss": 546.1011, + "step": 9620 + }, + { + "ce_loss_10": 3.6960983991622927, + "ce_loss_13": 3.635801446437836, + "ce_loss_2": 4.132487082481385, + "ce_loss_3": 3.9690314412117003, + "ce_loss_7": 3.737609100341797, + "epoch": 0.963, + "grad_norm": 312.0, + "kl_loss_10": 96.57149925231934, + "kl_loss_2": 1044.7675506591797, + "kl_loss_3": 718.4659484863281, + "kl_loss_7": 183.9634048461914, + "learning_rate": 3.4424990570994797e-06, + "loss": 523.2208, + "step": 9630 + }, + { + "ce_loss_10": 3.685701644420624, + "ce_loss_13": 3.624559962749481, + "ce_loss_2": 4.128798627853394, + "ce_loss_3": 3.968520772457123, + "ce_loss_7": 3.7257295846939087, + "epoch": 0.964, + "grad_norm": 280.0, + "kl_loss_10": 95.63589668273926, + "kl_loss_2": 1068.9191833496093, + "kl_loss_3": 737.6691131591797, + "kl_loss_7": 184.7596893310547, + "learning_rate": 3.2591323257248896e-06, + "loss": 522.5564, + "step": 9640 + }, + { + "ce_loss_10": 3.5315052390098574, + "ce_loss_13": 3.4732569575309755, + "ce_loss_2": 3.99234676361084, + "ce_loss_3": 3.822614312171936, + "ce_loss_7": 3.5727542638778687, + "epoch": 0.965, + "grad_norm": 338.0, + "kl_loss_10": 93.59828681945801, + "kl_loss_2": 1088.8541290283204, + "kl_loss_3": 750.0034118652344, + "kl_loss_7": 183.51124572753906, + "learning_rate": 3.0807677693729385e-06, + "loss": 528.9641, + "step": 9650 + }, + { + "ce_loss_10": 3.721923458576202, + "ce_loss_13": 3.6635044693946837, + "ce_loss_2": 4.157553017139435, + "ce_loss_3": 3.9980939745903017, + "ce_loss_7": 3.7649829506874086, + "epoch": 0.966, + "grad_norm": 328.0, + "kl_loss_10": 95.77610893249512, + "kl_loss_2": 1046.733694458008, + "kl_loss_3": 723.9284912109375, + "kl_loss_7": 183.63089752197266, + "learning_rate": 2.9074071841727055e-06, + "loss": 513.6759, + "step": 9660 + }, + { + "ce_loss_10": 3.6491685032844545, + "ce_loss_13": 3.5898547172546387, + "ce_loss_2": 4.10191251039505, + "ce_loss_3": 3.9377865552902223, + "ce_loss_7": 3.694057583808899, + "epoch": 0.967, + "grad_norm": 410.0, + "kl_loss_10": 94.75908012390137, + "kl_loss_2": 1074.1435485839843, + "kl_loss_3": 739.0172424316406, + "kl_loss_7": 185.9965072631836, + "learning_rate": 2.739052315863355e-06, + "loss": 514.4849, + "step": 9670 + }, + { + "ce_loss_10": 3.6381678700447084, + "ce_loss_13": 3.5745797991752624, + "ce_loss_2": 4.085923862457276, + "ce_loss_3": 3.9223034262657164, + "ce_loss_7": 3.679898130893707, + "epoch": 0.968, + "grad_norm": 400.0, + "kl_loss_10": 98.94500389099122, + "kl_loss_2": 1071.759048461914, + "kl_loss_3": 742.1754821777344, + "kl_loss_7": 186.4635383605957, + "learning_rate": 2.5757048597765396e-06, + "loss": 520.3133, + "step": 9680 + }, + { + "ce_loss_10": 3.6451838970184327, + "ce_loss_13": 3.584810471534729, + "ce_loss_2": 4.096628618240357, + "ce_loss_3": 3.9352465867996216, + "ce_loss_7": 3.6861096024513245, + "epoch": 0.969, + "grad_norm": 354.0, + "kl_loss_10": 95.84736633300781, + "kl_loss_2": 1089.9881774902344, + "kl_loss_3": 753.175894165039, + "kl_loss_7": 186.62908554077148, + "learning_rate": 2.417366460819359e-06, + "loss": 527.3621, + "step": 9690 + }, + { + "ce_loss_10": 3.6515438675880434, + "ce_loss_13": 3.5902424931526182, + "ce_loss_2": 4.121148645877838, + "ce_loss_3": 3.9508870244026184, + "ce_loss_7": 3.6973974823951723, + "epoch": 0.97, + "grad_norm": 378.0, + "kl_loss_10": 97.83438453674316, + "kl_loss_2": 1114.7044860839844, + "kl_loss_3": 766.2058898925782, + "kl_loss_7": 189.7753791809082, + "learning_rate": 2.2640387134577057e-06, + "loss": 528.5559, + "step": 9700 + }, + { + "ce_loss_10": 3.579375672340393, + "ce_loss_13": 3.5232587337493895, + "ce_loss_2": 4.008558976650238, + "ce_loss_3": 3.853218126296997, + "ce_loss_7": 3.621316111087799, + "epoch": 0.971, + "grad_norm": 346.0, + "kl_loss_10": 89.91974563598633, + "kl_loss_2": 1025.9577575683593, + "kl_loss_3": 709.8189392089844, + "kl_loss_7": 177.37805099487304, + "learning_rate": 2.115723161700278e-06, + "loss": 511.7921, + "step": 9710 + }, + { + "ce_loss_10": 3.5539788961410523, + "ce_loss_13": 3.493563008308411, + "ce_loss_2": 4.019102883338928, + "ce_loss_3": 3.8513848066329954, + "ce_loss_7": 3.6021719098091127, + "epoch": 0.972, + "grad_norm": 450.0, + "kl_loss_10": 97.08839912414551, + "kl_loss_2": 1102.8951354980468, + "kl_loss_3": 763.9947265625, + "kl_loss_7": 189.97327194213867, + "learning_rate": 1.9724212990830937e-06, + "loss": 534.7647, + "step": 9720 + }, + { + "ce_loss_10": 3.7055511236190797, + "ce_loss_13": 3.645791494846344, + "ce_loss_2": 4.164284873008728, + "ce_loss_3": 3.998200333118439, + "ce_loss_7": 3.748906970024109, + "epoch": 0.973, + "grad_norm": 306.0, + "kl_loss_10": 97.2132583618164, + "kl_loss_2": 1086.074758911133, + "kl_loss_3": 748.4372924804687, + "kl_loss_7": 187.37096481323243, + "learning_rate": 1.8341345686543331e-06, + "loss": 526.9427, + "step": 9730 + }, + { + "ce_loss_10": 3.688717949390411, + "ce_loss_13": 3.6282296895980837, + "ce_loss_2": 4.123041558265686, + "ce_loss_3": 3.963315784931183, + "ce_loss_7": 3.731441855430603, + "epoch": 0.974, + "grad_norm": 446.0, + "kl_loss_10": 95.65226020812989, + "kl_loss_2": 1053.469790649414, + "kl_loss_3": 725.7204162597657, + "kl_loss_7": 185.0056625366211, + "learning_rate": 1.7008643629596864e-06, + "loss": 524.4386, + "step": 9740 + }, + { + "ce_loss_10": 3.674058973789215, + "ce_loss_13": 3.6143284678459167, + "ce_loss_2": 4.119033622741699, + "ce_loss_3": 3.954865837097168, + "ce_loss_7": 3.7161202311515806, + "epoch": 0.975, + "grad_norm": 406.0, + "kl_loss_10": 96.88497962951661, + "kl_loss_2": 1081.9985229492188, + "kl_loss_3": 741.1380645751954, + "kl_loss_7": 186.00157089233397, + "learning_rate": 1.5726120240288633e-06, + "loss": 531.1466, + "step": 9750 + }, + { + "ce_loss_10": 3.569232928752899, + "ce_loss_13": 3.511077570915222, + "ce_loss_2": 4.014019024372101, + "ce_loss_3": 3.854006791114807, + "ce_loss_7": 3.6116040468215944, + "epoch": 0.976, + "grad_norm": 548.0, + "kl_loss_10": 94.0260025024414, + "kl_loss_2": 1069.765771484375, + "kl_loss_3": 740.6184356689453, + "kl_loss_7": 184.33246154785155, + "learning_rate": 1.4493788433612708e-06, + "loss": 520.0787, + "step": 9760 + }, + { + "ce_loss_10": 3.6905293107032775, + "ce_loss_13": 3.630939745903015, + "ce_loss_2": 4.1426611065864565, + "ce_loss_3": 3.9766762137413023, + "ce_loss_7": 3.7346643686294554, + "epoch": 0.977, + "grad_norm": 340.0, + "kl_loss_10": 95.88525352478027, + "kl_loss_2": 1083.8150268554687, + "kl_loss_3": 744.1571472167968, + "kl_loss_7": 186.4142593383789, + "learning_rate": 1.3311660619138578e-06, + "loss": 528.903, + "step": 9770 + }, + { + "ce_loss_10": 3.6836748480796815, + "ce_loss_13": 3.6255483746528627, + "ce_loss_2": 4.109962856769561, + "ce_loss_3": 3.9559788703918457, + "ce_loss_7": 3.7251157641410826, + "epoch": 0.978, + "grad_norm": 358.0, + "kl_loss_10": 94.83126564025879, + "kl_loss_2": 1041.5651123046875, + "kl_loss_3": 719.0739654541015, + "kl_loss_7": 183.43487930297852, + "learning_rate": 1.2179748700879012e-06, + "loss": 517.046, + "step": 9780 + }, + { + "ce_loss_10": 3.6114102602005005, + "ce_loss_13": 3.553410363197327, + "ce_loss_2": 4.060984289646148, + "ce_loss_3": 3.9008304476737976, + "ce_loss_7": 3.6546178460121155, + "epoch": 0.979, + "grad_norm": 448.0, + "kl_loss_10": 94.51130638122558, + "kl_loss_2": 1070.2796936035156, + "kl_loss_3": 734.1844604492187, + "kl_loss_7": 183.8596923828125, + "learning_rate": 1.1098064077174619e-06, + "loss": 522.2918, + "step": 9790 + }, + { + "ce_loss_10": 3.6468693137168886, + "ce_loss_13": 3.5864970564842222, + "ce_loss_2": 4.112797820568085, + "ce_loss_3": 3.9430916547775268, + "ce_loss_7": 3.6902678489685057, + "epoch": 0.98, + "grad_norm": 396.0, + "kl_loss_10": 94.27075653076172, + "kl_loss_2": 1089.721176147461, + "kl_loss_3": 749.4085998535156, + "kl_loss_7": 185.25458450317382, + "learning_rate": 1.006661764057837e-06, + "loss": 525.8869, + "step": 9800 + }, + { + "ce_loss_10": 3.6512062191963195, + "ce_loss_13": 3.591748607158661, + "ce_loss_2": 4.100599420070648, + "ce_loss_3": 3.94116724729538, + "ce_loss_7": 3.6929367065429686, + "epoch": 0.981, + "grad_norm": 370.0, + "kl_loss_10": 95.22041091918945, + "kl_loss_2": 1079.5743225097656, + "kl_loss_3": 744.3837371826172, + "kl_loss_7": 184.29767150878905, + "learning_rate": 9.085419777743465e-07, + "loss": 523.8814, + "step": 9810 + }, + { + "ce_loss_10": 3.5867176413536073, + "ce_loss_13": 3.5297884702682496, + "ce_loss_2": 4.040095067024231, + "ce_loss_3": 3.876063418388367, + "ce_loss_7": 3.6296881198883058, + "epoch": 0.982, + "grad_norm": 372.0, + "kl_loss_10": 92.43338165283203, + "kl_loss_2": 1068.0833831787108, + "kl_loss_3": 736.9401550292969, + "kl_loss_7": 179.86018447875978, + "learning_rate": 8.15448036932176e-07, + "loss": 515.9226, + "step": 9820 + }, + { + "ce_loss_10": 3.641873502731323, + "ce_loss_13": 3.5830901145935057, + "ce_loss_2": 4.088828957080841, + "ce_loss_3": 3.9226441621780395, + "ce_loss_7": 3.6845821142196655, + "epoch": 0.983, + "grad_norm": 450.0, + "kl_loss_10": 93.74083061218262, + "kl_loss_2": 1074.023745727539, + "kl_loss_3": 742.9956268310547, + "kl_loss_7": 184.04373016357422, + "learning_rate": 7.273808789862724e-07, + "loss": 527.5683, + "step": 9830 + }, + { + "ce_loss_10": 3.7283427119255066, + "ce_loss_13": 3.6678077578544617, + "ce_loss_2": 4.168006038665771, + "ce_loss_3": 4.004901099205017, + "ce_loss_7": 3.7688368439674376, + "epoch": 0.984, + "grad_norm": 302.0, + "kl_loss_10": 97.79412803649902, + "kl_loss_2": 1069.5015228271484, + "kl_loss_3": 732.57353515625, + "kl_loss_7": 187.13561325073243, + "learning_rate": 6.443413907720186e-07, + "loss": 519.9878, + "step": 9840 + }, + { + "ce_loss_10": 3.6514281272888183, + "ce_loss_13": 3.5926932096481323, + "ce_loss_2": 4.092191052436829, + "ce_loss_3": 3.930639326572418, + "ce_loss_7": 3.6929845571517945, + "epoch": 0.985, + "grad_norm": 370.0, + "kl_loss_10": 94.32459564208985, + "kl_loss_2": 1056.6608520507812, + "kl_loss_3": 730.5611633300781, + "kl_loss_7": 184.7391357421875, + "learning_rate": 5.663304084960185e-07, + "loss": 518.7671, + "step": 9850 + }, + { + "ce_loss_10": 3.5804072976112367, + "ce_loss_13": 3.520645248889923, + "ce_loss_2": 4.040905499458313, + "ce_loss_3": 3.8740663886070252, + "ce_loss_7": 3.6240461707115172, + "epoch": 0.986, + "grad_norm": 364.0, + "kl_loss_10": 96.15405921936035, + "kl_loss_2": 1090.2829711914062, + "kl_loss_3": 753.8562805175782, + "kl_loss_7": 186.33775329589844, + "learning_rate": 4.933487177280482e-07, + "loss": 518.7076, + "step": 9860 + }, + { + "ce_loss_10": 3.6774788737297057, + "ce_loss_13": 3.6186564683914186, + "ce_loss_2": 4.12188241481781, + "ce_loss_3": 3.958666682243347, + "ce_loss_7": 3.7196821093559267, + "epoch": 0.987, + "grad_norm": 408.0, + "kl_loss_10": 94.4230339050293, + "kl_loss_2": 1058.865460205078, + "kl_loss_3": 734.6985687255859, + "kl_loss_7": 181.19112243652344, + "learning_rate": 4.2539705339295075e-07, + "loss": 516.55, + "step": 9870 + }, + { + "ce_loss_10": 3.525623691082001, + "ce_loss_13": 3.46755028963089, + "ce_loss_2": 3.986714744567871, + "ce_loss_3": 3.828349435329437, + "ce_loss_7": 3.572091352939606, + "epoch": 0.988, + "grad_norm": 376.0, + "kl_loss_10": 93.51777114868165, + "kl_loss_2": 1095.3638244628905, + "kl_loss_3": 760.6636901855469, + "kl_loss_7": 187.32990493774415, + "learning_rate": 3.6247609976319816e-07, + "loss": 523.6327, + "step": 9880 + }, + { + "ce_loss_10": 3.6277820110321044, + "ce_loss_13": 3.567537808418274, + "ce_loss_2": 4.088609397411346, + "ce_loss_3": 3.924705386161804, + "ce_loss_7": 3.6743552684783936, + "epoch": 0.989, + "grad_norm": 476.0, + "kl_loss_10": 96.44798164367675, + "kl_loss_2": 1082.7717010498047, + "kl_loss_3": 749.9800903320313, + "kl_loss_7": 188.03020095825195, + "learning_rate": 3.0458649045211895e-07, + "loss": 536.5464, + "step": 9890 + }, + { + "ce_loss_10": 3.596536934375763, + "ce_loss_13": 3.5354753971099853, + "ce_loss_2": 4.064900302886963, + "ce_loss_3": 3.895892357826233, + "ce_loss_7": 3.6420591354370115, + "epoch": 0.99, + "grad_norm": 354.0, + "kl_loss_10": 95.05449028015137, + "kl_loss_2": 1090.392593383789, + "kl_loss_3": 754.5375762939453, + "kl_loss_7": 188.19856491088868, + "learning_rate": 2.517288084074587e-07, + "loss": 534.519, + "step": 9900 + }, + { + "ce_loss_10": 3.635116171836853, + "ce_loss_13": 3.574588453769684, + "ce_loss_2": 4.111752784252166, + "ce_loss_3": 3.944272756576538, + "ce_loss_7": 3.682980465888977, + "epoch": 0.991, + "grad_norm": 354.0, + "kl_loss_10": 95.2107322692871, + "kl_loss_2": 1113.8304351806642, + "kl_loss_3": 770.2767669677735, + "kl_loss_7": 189.45772018432618, + "learning_rate": 2.0390358590538505e-07, + "loss": 533.4635, + "step": 9910 + }, + { + "ce_loss_10": 3.644596242904663, + "ce_loss_13": 3.5852715373039246, + "ce_loss_2": 4.097472989559174, + "ce_loss_3": 3.93781635761261, + "ce_loss_7": 3.6912411212921143, + "epoch": 0.992, + "grad_norm": 360.0, + "kl_loss_10": 95.30287055969238, + "kl_loss_2": 1081.2569366455077, + "kl_loss_3": 748.5147064208984, + "kl_loss_7": 189.18702926635743, + "learning_rate": 1.61111304545436e-07, + "loss": 523.9828, + "step": 9920 + }, + { + "ce_loss_10": 3.6098355293273925, + "ce_loss_13": 3.5515360593795777, + "ce_loss_2": 4.0586741924285885, + "ce_loss_3": 3.895809698104858, + "ce_loss_7": 3.651991581916809, + "epoch": 0.993, + "grad_norm": 408.0, + "kl_loss_10": 94.64987220764161, + "kl_loss_2": 1077.9339660644532, + "kl_loss_3": 744.9157043457031, + "kl_loss_7": 185.0061477661133, + "learning_rate": 1.2335239524541298e-07, + "loss": 518.6489, + "step": 9930 + }, + { + "ce_loss_10": 3.5815568804740905, + "ce_loss_13": 3.523807632923126, + "ce_loss_2": 4.032287573814392, + "ce_loss_3": 3.871028816699982, + "ce_loss_7": 3.625322496891022, + "epoch": 0.994, + "grad_norm": 396.0, + "kl_loss_10": 94.44433670043945, + "kl_loss_2": 1070.128958129883, + "kl_loss_3": 738.4679992675781, + "kl_loss_7": 184.16616668701172, + "learning_rate": 9.06272382371065e-08, + "loss": 522.681, + "step": 9940 + }, + { + "ce_loss_10": 3.649178886413574, + "ce_loss_13": 3.5898856997489927, + "ce_loss_2": 4.107921350002289, + "ce_loss_3": 3.9442641377449035, + "ce_loss_7": 3.6927329182624815, + "epoch": 0.995, + "grad_norm": 366.0, + "kl_loss_10": 97.43391189575195, + "kl_loss_2": 1093.5839904785157, + "kl_loss_3": 755.0986083984375, + "kl_loss_7": 187.86676559448242, + "learning_rate": 6.293616306246586e-08, + "loss": 528.0195, + "step": 9950 + }, + { + "ce_loss_10": 3.6503029584884645, + "ce_loss_13": 3.5912461996078493, + "ce_loss_2": 4.083380007743836, + "ce_loss_3": 3.9251137375831604, + "ce_loss_7": 3.6914992809295653, + "epoch": 0.996, + "grad_norm": 386.0, + "kl_loss_10": 92.51536598205567, + "kl_loss_2": 1047.152279663086, + "kl_loss_3": 724.7971832275391, + "kl_loss_7": 180.47787857055664, + "learning_rate": 4.027944857032395e-08, + "loss": 508.9252, + "step": 9960 + }, + { + "ce_loss_10": 3.640582966804504, + "ce_loss_13": 3.5819292664527893, + "ce_loss_2": 4.071112728118896, + "ce_loss_3": 3.9061211466789247, + "ce_loss_7": 3.678541886806488, + "epoch": 0.997, + "grad_norm": 332.0, + "kl_loss_10": 94.77192039489746, + "kl_loss_2": 1031.1441497802734, + "kl_loss_3": 710.4731018066407, + "kl_loss_7": 178.55070953369142, + "learning_rate": 2.265732291356626e-08, + "loss": 508.5261, + "step": 9970 + }, + { + "ce_loss_10": 3.6857742786407472, + "ce_loss_13": 3.6264986276626585, + "ce_loss_2": 4.119817900657654, + "ce_loss_3": 3.9565661191940307, + "ce_loss_7": 3.7256898403167726, + "epoch": 0.998, + "grad_norm": 354.0, + "kl_loss_10": 95.05257987976074, + "kl_loss_2": 1045.786294555664, + "kl_loss_3": 725.6510559082031, + "kl_loss_7": 184.49017181396485, + "learning_rate": 1.0069963546743833e-08, + "loss": 527.3226, + "step": 9980 + }, + { + "ce_loss_10": 3.66132515668869, + "ce_loss_13": 3.6029880166053774, + "ce_loss_2": 4.108680582046508, + "ce_loss_3": 3.9468571662902834, + "ce_loss_7": 3.7058345794677736, + "epoch": 0.999, + "grad_norm": 358.0, + "kl_loss_10": 95.48841247558593, + "kl_loss_2": 1072.3943481445312, + "kl_loss_3": 746.4591552734375, + "kl_loss_7": 185.93264389038086, + "learning_rate": 2.517497224463483e-09, + "loss": 522.7165, + "step": 9990 + }, + { + "ce_loss_10": 3.6195040583610534, + "ce_loss_13": 3.559644305706024, + "ce_loss_2": 4.096893215179444, + "ce_loss_3": 3.9238691568374633, + "ce_loss_7": 3.6661298632621766, + "epoch": 1.0, + "grad_norm": 502.0, + "kl_loss_10": 96.35170402526856, + "kl_loss_2": 1110.4292907714844, + "kl_loss_3": 763.8701965332032, + "kl_loss_7": 189.7416961669922, + "learning_rate": 0.0, + "loss": 533.5189, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.177819035608023e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}