diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_12": 17.798757553100586, + "ce_loss_17": 12.109902381896973, + "ce_loss_23": 2.898716688156128, + "ce_loss_3": 16.791034698486328, + "ce_loss_6": 17.334301948547363, + "epoch": 0.0001, + "grad_norm": 211968.0, + "kl_loss_12": 31074.1875, + "kl_loss_17": 19915.5283203125, + "kl_loss_3": 28423.80078125, + "kl_loss_6": 29539.3955078125, + "learning_rate": 1e-05, + "loss": 27273.5469, + "step": 1 + }, + { + "ce_loss_12": 11.539109309514364, + "ce_loss_17": 9.017974032296074, + "ce_loss_23": 2.9512691365347967, + "ce_loss_3": 11.650188684463501, + "ce_loss_6": 11.816295120451185, + "epoch": 0.001, + "grad_norm": 26112.0, + "kl_loss_12": 17560.87771267361, + "kl_loss_17": 12742.507107204861, + "kl_loss_3": 17486.401258680555, + "kl_loss_6": 17750.530110677082, + "learning_rate": 0.0001, + "loss": 16390.9132, + "step": 10 + }, + { + "ce_loss_12": 6.662769317626953, + "ce_loss_17": 5.113725876808166, + "ce_loss_23": 2.958701026439667, + "ce_loss_3": 7.423222160339355, + "ce_loss_6": 7.240184164047241, + "epoch": 0.002, + "grad_norm": 10816.0, + "kl_loss_12": 7201.2345703125, + "kl_loss_17": 4226.757141113281, + "kl_loss_3": 8504.014624023437, + "kl_loss_6": 8126.985400390625, + "learning_rate": 0.0002, + "loss": 7110.9945, + "step": 20 + }, + { + "ce_loss_12": 5.635356903076172, + "ce_loss_17": 3.987026798725128, + "ce_loss_23": 2.7665123343467712, + "ce_loss_3": 6.693690013885498, + "ce_loss_6": 6.371592235565186, + "epoch": 0.003, + "grad_norm": 6592.0, + "kl_loss_12": 5527.604516601563, + "kl_loss_17": 2312.6288452148438, + "kl_loss_3": 7508.761254882813, + "kl_loss_6": 6876.302319335938, + "learning_rate": 0.0003, + "loss": 5459.1285, + "step": 30 + }, + { + "ce_loss_12": 5.312513065338135, + "ce_loss_17": 3.8958184838294985, + "ce_loss_23": 2.928679037094116, + "ce_loss_3": 6.311967754364014, + "ce_loss_6": 5.9457975149154665, + "epoch": 0.004, + "grad_norm": 9280.0, + "kl_loss_12": 4565.179907226562, + "kl_loss_17": 1834.186749267578, + "kl_loss_3": 6455.294677734375, + "kl_loss_6": 5751.1544921875, + "learning_rate": 0.0004, + "loss": 4687.6551, + "step": 40 + }, + { + "ce_loss_12": 5.002953696250915, + "ce_loss_17": 3.739673209190369, + "ce_loss_23": 2.8902602910995485, + "ce_loss_3": 6.069131588935852, + "ce_loss_6": 5.692244386672973, + "epoch": 0.005, + "grad_norm": 7552.0, + "kl_loss_12": 4123.054260253906, + "kl_loss_17": 1606.3025634765625, + "kl_loss_3": 6154.768359375, + "kl_loss_6": 5436.51982421875, + "learning_rate": 0.0005, + "loss": 4305.6285, + "step": 50 + }, + { + "ce_loss_12": 4.829141068458557, + "ce_loss_17": 3.635519802570343, + "ce_loss_23": 2.9109005689620973, + "ce_loss_3": 5.868700075149536, + "ce_loss_6": 5.501702117919922, + "epoch": 0.006, + "grad_norm": 11392.0, + "kl_loss_12": 3755.554541015625, + "kl_loss_17": 1389.9905212402343, + "kl_loss_3": 5720.950512695313, + "kl_loss_6": 5029.498291015625, + "learning_rate": 0.0006, + "loss": 3986.793, + "step": 60 + }, + { + "ce_loss_12": 4.645738911628723, + "ce_loss_17": 3.4978497982025147, + "ce_loss_23": 2.8339456021785736, + "ce_loss_3": 5.803532552719116, + "ce_loss_6": 5.397406053543091, + "epoch": 0.007, + "grad_norm": 8256.0, + "kl_loss_12": 3554.4886840820313, + "kl_loss_17": 1276.3660400390625, + "kl_loss_3": 5769.8087890625, + "kl_loss_6": 4980.427783203125, + "learning_rate": 0.0007, + "loss": 3883.4496, + "step": 70 + }, + { + "ce_loss_12": 4.597572684288025, + "ce_loss_17": 3.5362090349197386, + "ce_loss_23": 2.829783248901367, + "ce_loss_3": 5.797271776199341, + "ce_loss_6": 5.276372957229614, + "epoch": 0.008, + "grad_norm": 6976.0, + "kl_loss_12": 3471.6586669921876, + "kl_loss_17": 1371.0523620605468, + "kl_loss_3": 5753.631884765625, + "kl_loss_6": 4787.4904296875, + "learning_rate": 0.0008, + "loss": 3867.9441, + "step": 80 + }, + { + "ce_loss_12": 4.451014447212219, + "ce_loss_17": 3.5222262740135193, + "ce_loss_23": 2.7971161007881165, + "ce_loss_3": 5.97350127696991, + "ce_loss_6": 5.160103845596313, + "epoch": 0.009, + "grad_norm": 11968.0, + "kl_loss_12": 3281.931005859375, + "kl_loss_17": 1445.2955322265625, + "kl_loss_3": 6169.022705078125, + "kl_loss_6": 4631.531591796875, + "learning_rate": 0.0009000000000000001, + "loss": 3860.6953, + "step": 90 + }, + { + "ce_loss_12": 4.490131831169128, + "ce_loss_17": 3.5490355372428892, + "ce_loss_23": 2.903198480606079, + "ce_loss_3": 5.901812219619751, + "ce_loss_6": 5.335342597961426, + "epoch": 0.01, + "grad_norm": 7904.0, + "kl_loss_12": 3168.4276489257813, + "kl_loss_17": 1262.5684020996093, + "kl_loss_3": 5861.412744140625, + "kl_loss_6": 4760.437231445312, + "learning_rate": 0.001, + "loss": 3758.8113, + "step": 100 + }, + { + "ce_loss_12": 4.583330249786377, + "ce_loss_17": 3.447415602207184, + "ce_loss_23": 2.8641112565994264, + "ce_loss_3": 5.773125648498535, + "ce_loss_6": 5.602428388595581, + "epoch": 0.011, + "grad_norm": 16000.0, + "kl_loss_12": 3430.811181640625, + "kl_loss_17": 1154.315347290039, + "kl_loss_3": 5642.04091796875, + "kl_loss_6": 5399.360400390625, + "learning_rate": 0.0009999974825027757, + "loss": 3887.9992, + "step": 110 + }, + { + "ce_loss_12": 5.235488867759704, + "ce_loss_17": 3.5834272861480714, + "ce_loss_23": 2.9280850529670714, + "ce_loss_3": 5.679557681083679, + "ce_loss_6": 5.6021226167678835, + "epoch": 0.012, + "grad_norm": 14080.0, + "kl_loss_12": 4564.27548828125, + "kl_loss_17": 1343.9837036132812, + "kl_loss_3": 5395.271264648438, + "kl_loss_6": 5231.13349609375, + "learning_rate": 0.0009999899300364532, + "loss": 4105.0148, + "step": 120 + }, + { + "ce_loss_12": 5.704882669448852, + "ce_loss_17": 3.6224010229110717, + "ce_loss_23": 2.8817583322525024, + "ce_loss_3": 5.816133284568787, + "ce_loss_6": 5.332496976852417, + "epoch": 0.013, + "grad_norm": 12288.0, + "kl_loss_12": 5597.638903808594, + "kl_loss_17": 1380.1296264648438, + "kl_loss_3": 5679.002026367188, + "kl_loss_6": 4769.773095703125, + "learning_rate": 0.0009999773426770863, + "loss": 4395.3734, + "step": 130 + }, + { + "ce_loss_12": 4.891789817810059, + "ce_loss_17": 3.532665467262268, + "ce_loss_23": 2.929382526874542, + "ce_loss_3": 5.8127936840057375, + "ce_loss_6": 5.244004225730896, + "epoch": 0.014, + "grad_norm": 6912.0, + "kl_loss_12": 3870.0730102539064, + "kl_loss_17": 1216.6474182128907, + "kl_loss_3": 5634.811572265625, + "kl_loss_6": 4561.313623046875, + "learning_rate": 0.0009999597205514296, + "loss": 3838.6836, + "step": 140 + }, + { + "ce_loss_12": 4.74582872390747, + "ce_loss_17": 3.492581915855408, + "ce_loss_23": 2.88243225812912, + "ce_loss_3": 5.780796051025391, + "ce_loss_6": 5.230218529701233, + "epoch": 0.015, + "grad_norm": 3872.0, + "kl_loss_12": 3684.5867309570312, + "kl_loss_17": 1215.8963134765625, + "kl_loss_3": 5671.0294921875, + "kl_loss_6": 4582.3294921875, + "learning_rate": 0.0009999370638369377, + "loss": 3805.625, + "step": 150 + }, + { + "ce_loss_12": 4.55562047958374, + "ce_loss_17": 3.4915971159934998, + "ce_loss_23": 2.922529602050781, + "ce_loss_3": 5.791428542137146, + "ce_loss_6": 5.213034510612488, + "epoch": 0.016, + "grad_norm": 5344.0, + "kl_loss_12": 3269.4089599609374, + "kl_loss_17": 1116.4551055908203, + "kl_loss_3": 5598.087133789062, + "kl_loss_6": 4509.215734863281, + "learning_rate": 0.000999909372761763, + "loss": 3618.3555, + "step": 160 + }, + { + "ce_loss_12": 4.466858816146851, + "ce_loss_17": 3.4728991270065306, + "ce_loss_23": 2.860163617134094, + "ce_loss_3": 5.697683525085449, + "ce_loss_6": 5.133940243721009, + "epoch": 0.017, + "grad_norm": 2448.0, + "kl_loss_12": 3196.91201171875, + "kl_loss_17": 1242.2661376953124, + "kl_loss_3": 5557.818823242187, + "kl_loss_6": 4480.91279296875, + "learning_rate": 0.0009998766476047546, + "loss": 3649.5055, + "step": 170 + }, + { + "ce_loss_12": 4.527968239784241, + "ce_loss_17": 3.5157664895057676, + "ce_loss_23": 2.8960798501968386, + "ce_loss_3": 5.678347492218018, + "ce_loss_6": 5.090836000442505, + "epoch": 0.018, + "grad_norm": 2608.0, + "kl_loss_12": 3277.224816894531, + "kl_loss_17": 1209.9160522460938, + "kl_loss_3": 5431.537768554687, + "kl_loss_6": 4314.870227050781, + "learning_rate": 0.0009998388886954545, + "loss": 3574.5352, + "step": 180 + }, + { + "ce_loss_12": 4.394872999191284, + "ce_loss_17": 3.47138329744339, + "ce_loss_23": 2.8707207679748534, + "ce_loss_3": 5.5450726509094235, + "ce_loss_6": 5.056916284561157, + "epoch": 0.019, + "grad_norm": 2040.0, + "kl_loss_12": 3051.567834472656, + "kl_loss_17": 1180.9717346191405, + "kl_loss_3": 5247.836181640625, + "kl_loss_6": 4322.745776367187, + "learning_rate": 0.0009997960964140947, + "loss": 3442.2301, + "step": 190 + }, + { + "ce_loss_12": 4.349503815174103, + "ce_loss_17": 3.4052997827529907, + "ce_loss_23": 2.8605754256248472, + "ce_loss_3": 5.51627106666565, + "ce_loss_6": 5.015147471427918, + "epoch": 0.02, + "grad_norm": 2008.0, + "kl_loss_12": 2997.1719360351562, + "kl_loss_17": 1077.3347778320312, + "kl_loss_3": 5216.086987304688, + "kl_loss_6": 4270.710314941406, + "learning_rate": 0.0009997482711915926, + "loss": 3372.0453, + "step": 200 + }, + { + "ce_loss_12": 4.382658886909485, + "ce_loss_17": 3.366709041595459, + "ce_loss_23": 2.8402206182479857, + "ce_loss_3": 5.42800190448761, + "ce_loss_6": 4.934572362899781, + "epoch": 0.021, + "grad_norm": 2000.0, + "kl_loss_12": 3123.2125854492188, + "kl_loss_17": 1060.0697784423828, + "kl_loss_3": 5091.03828125, + "kl_loss_6": 4182.4359130859375, + "learning_rate": 0.0009996954135095479, + "loss": 3352.6879, + "step": 210 + }, + { + "ce_loss_12": 4.337828183174134, + "ce_loss_17": 3.416949248313904, + "ce_loss_23": 2.910671079158783, + "ce_loss_3": 5.403342247009277, + "ce_loss_6": 4.892907905578613, + "epoch": 0.022, + "grad_norm": 2832.0, + "kl_loss_12": 2856.504675292969, + "kl_loss_17": 997.9278289794922, + "kl_loss_3": 4914.284692382813, + "kl_loss_6": 3927.723681640625, + "learning_rate": 0.0009996375239002368, + "loss": 3177.3701, + "step": 220 + }, + { + "ce_loss_12": 4.298712420463562, + "ce_loss_17": 3.466365027427673, + "ce_loss_23": 2.9782427191734313, + "ce_loss_3": 5.402818751335144, + "ce_loss_6": 4.915108633041382, + "epoch": 0.023, + "grad_norm": 1928.0, + "kl_loss_12": 2680.0574462890627, + "kl_loss_17": 967.0066162109375, + "kl_loss_3": 4764.784228515625, + "kl_loss_6": 3848.396130371094, + "learning_rate": 0.0009995746029466072, + "loss": 3073.8885, + "step": 230 + }, + { + "ce_loss_12": 4.158153474330902, + "ce_loss_17": 3.2901054859161376, + "ce_loss_23": 2.786073011159897, + "ce_loss_3": 5.342944192886352, + "ce_loss_6": 4.852350068092346, + "epoch": 0.024, + "grad_norm": 2784.0, + "kl_loss_12": 2786.9808959960938, + "kl_loss_17": 1015.3545806884765, + "kl_loss_3": 5037.80654296875, + "kl_loss_6": 4118.17490234375, + "learning_rate": 0.0009995066512822719, + "loss": 3153.4832, + "step": 240 + }, + { + "ce_loss_12": 4.2036160111427305, + "ce_loss_17": 3.368748664855957, + "ce_loss_23": 2.8782119750976562, + "ce_loss_3": 5.450334095954895, + "ce_loss_6": 4.917315721511841, + "epoch": 0.025, + "grad_norm": 1688.0, + "kl_loss_12": 2688.274169921875, + "kl_loss_17": 961.38095703125, + "kl_loss_3": 5079.578344726562, + "kl_loss_6": 4061.3109497070313, + "learning_rate": 0.000999433669591504, + "loss": 3096.8104, + "step": 250 + }, + { + "ce_loss_12": 4.106511771678925, + "ce_loss_17": 3.2343634247779844, + "ce_loss_23": 2.7847194910049438, + "ce_loss_3": 5.36883807182312, + "ce_loss_6": 4.7796388387680055, + "epoch": 0.026, + "grad_norm": 1416.0, + "kl_loss_12": 2675.691442871094, + "kl_loss_17": 906.2499481201172, + "kl_loss_3": 5100.184497070312, + "kl_loss_6": 3969.0149169921874, + "learning_rate": 0.000999355658609228, + "loss": 3087.484, + "step": 260 + }, + { + "ce_loss_12": 4.176587176322937, + "ce_loss_17": 3.267600440979004, + "ce_loss_23": 2.809903073310852, + "ce_loss_3": 5.416643333435059, + "ce_loss_6": 4.862660479545593, + "epoch": 0.027, + "grad_norm": 1624.0, + "kl_loss_12": 2710.8676025390623, + "kl_loss_17": 894.4491577148438, + "kl_loss_3": 5104.5154296875, + "kl_loss_6": 4047.3380737304688, + "learning_rate": 0.0009992726191210138, + "loss": 3135.7184, + "step": 270 + }, + { + "ce_loss_12": 4.147988736629486, + "ce_loss_17": 3.2972861886024476, + "ce_loss_23": 2.8483713388442995, + "ce_loss_3": 5.283263945579529, + "ce_loss_6": 4.793030953407287, + "epoch": 0.028, + "grad_norm": 1936.0, + "kl_loss_12": 2634.4557495117188, + "kl_loss_17": 890.9104583740234, + "kl_loss_3": 4775.991870117187, + "kl_loss_6": 3846.7409912109374, + "learning_rate": 0.0009991845519630679, + "loss": 3005.9059, + "step": 280 + }, + { + "ce_loss_12": 4.069076085090638, + "ce_loss_17": 3.2262199759483337, + "ce_loss_23": 2.7426149249076843, + "ce_loss_3": 5.173510408401489, + "ce_loss_6": 4.663527464866638, + "epoch": 0.029, + "grad_norm": 1464.0, + "kl_loss_12": 2673.3083618164064, + "kl_loss_17": 964.3922149658204, + "kl_loss_3": 4793.293725585938, + "kl_loss_6": 3818.1897827148437, + "learning_rate": 0.0009990914580222257, + "loss": 3053.4721, + "step": 290 + }, + { + "ce_loss_12": 4.102203106880188, + "ce_loss_17": 3.312656319141388, + "ce_loss_23": 2.877616453170776, + "ce_loss_3": 5.184468483924865, + "ce_loss_6": 4.681406426429748, + "epoch": 0.03, + "grad_norm": 1424.0, + "kl_loss_12": 2515.7253540039064, + "kl_loss_17": 879.6887023925781, + "kl_loss_3": 4588.6248046875, + "kl_loss_6": 3630.072717285156, + "learning_rate": 0.0009989933382359422, + "loss": 2960.4674, + "step": 300 + }, + { + "ce_loss_12": 4.090289652347565, + "ce_loss_17": 3.3021645307540894, + "ce_loss_23": 2.8886025071144106, + "ce_loss_3": 5.160676980018616, + "ce_loss_6": 4.689880275726319, + "epoch": 0.031, + "grad_norm": 1744.0, + "kl_loss_12": 2450.2645874023438, + "kl_loss_17": 831.6864807128907, + "kl_loss_3": 4499.324084472656, + "kl_loss_6": 3599.02568359375, + "learning_rate": 0.0009988901935922825, + "loss": 2877.4172, + "step": 310 + }, + { + "ce_loss_12": 4.008204507827759, + "ce_loss_17": 3.2151699900627135, + "ce_loss_23": 2.7390822887420656, + "ce_loss_3": 5.131978940963745, + "ce_loss_6": 4.660629677772522, + "epoch": 0.032, + "grad_norm": 1288.0, + "kl_loss_12": 2575.4901611328123, + "kl_loss_17": 951.4900726318359, + "kl_loss_3": 4740.608227539063, + "kl_loss_6": 3837.3116943359373, + "learning_rate": 0.0009987820251299122, + "loss": 2955.9014, + "step": 320 + }, + { + "ce_loss_12": 4.053432321548462, + "ce_loss_17": 3.3205178499221804, + "ce_loss_23": 2.8587931513786318, + "ce_loss_3": 5.102370977401733, + "ce_loss_6": 4.653764081001282, + "epoch": 0.033, + "grad_norm": 1264.0, + "kl_loss_12": 2450.4338500976564, + "kl_loss_17": 935.9856903076172, + "kl_loss_3": 4473.6075439453125, + "kl_loss_6": 3609.1573974609373, + "learning_rate": 0.0009986688339380862, + "loss": 2848.4352, + "step": 330 + }, + { + "ce_loss_12": 3.957953178882599, + "ce_loss_17": 3.22804708480835, + "ce_loss_23": 2.8191253423690794, + "ce_loss_3": 4.9991214036941525, + "ce_loss_6": 4.5394447326660154, + "epoch": 0.034, + "grad_norm": 996.0, + "kl_loss_12": 2342.859753417969, + "kl_loss_17": 820.8218719482422, + "kl_loss_3": 4322.4416015625, + "kl_loss_6": 3451.667041015625, + "learning_rate": 0.0009985506211566387, + "loss": 2758.6914, + "step": 340 + }, + { + "ce_loss_12": 3.9520851135253907, + "ce_loss_17": 3.2317885398864745, + "ce_loss_23": 2.846635568141937, + "ce_loss_3": 5.002023839950562, + "ce_loss_6": 4.5321044921875, + "epoch": 0.035, + "grad_norm": 1592.0, + "kl_loss_12": 2274.8722229003906, + "kl_loss_17": 782.2494018554687, + "kl_loss_3": 4281.415600585938, + "kl_loss_6": 3387.6537963867186, + "learning_rate": 0.0009984273879759713, + "loss": 2705.5883, + "step": 350 + }, + { + "ce_loss_12": 4.0336832165718075, + "ce_loss_17": 3.288927936553955, + "ce_loss_23": 2.874334526062012, + "ce_loss_3": 5.094504308700562, + "ce_loss_6": 4.62630341053009, + "epoch": 0.036, + "grad_norm": 964.0, + "kl_loss_12": 2347.282421875, + "kl_loss_17": 819.1060791015625, + "kl_loss_3": 4394.406872558594, + "kl_loss_6": 3496.9873046875, + "learning_rate": 0.0009982991356370402, + "loss": 2793.3129, + "step": 360 + }, + { + "ce_loss_12": 3.9753374218940736, + "ce_loss_17": 3.2193778276443483, + "ce_loss_23": 2.8541990399360655, + "ce_loss_3": 5.038383626937867, + "ce_loss_6": 4.584095597267151, + "epoch": 0.037, + "grad_norm": 936.0, + "kl_loss_12": 2308.5005615234377, + "kl_loss_17": 759.3042938232422, + "kl_loss_3": 4345.300744628907, + "kl_loss_6": 3480.0982177734377, + "learning_rate": 0.0009981658654313456, + "loss": 2738.6379, + "step": 370 + }, + { + "ce_loss_12": 4.011369419097901, + "ce_loss_17": 3.2788562774658203, + "ce_loss_23": 2.916998362541199, + "ce_loss_3": 5.057713532447815, + "ce_loss_6": 4.593180727958679, + "epoch": 0.038, + "grad_norm": 1048.0, + "kl_loss_12": 2240.633892822266, + "kl_loss_17": 730.3999633789062, + "kl_loss_3": 4240.71103515625, + "kl_loss_6": 3363.1546142578127, + "learning_rate": 0.000998027578700917, + "loss": 2685.7131, + "step": 380 + }, + { + "ce_loss_12": 3.9916287541389464, + "ce_loss_17": 3.232191336154938, + "ce_loss_23": 2.871097719669342, + "ce_loss_3": 5.032142806053161, + "ce_loss_6": 4.558016991615295, + "epoch": 0.039, + "grad_norm": 1104.0, + "kl_loss_12": 2288.9161743164063, + "kl_loss_17": 730.8479858398438, + "kl_loss_3": 4279.519396972656, + "kl_loss_6": 3377.6620239257813, + "learning_rate": 0.0009978842768382998, + "loss": 2698.6039, + "step": 390 + }, + { + "ce_loss_12": 3.9320905208587646, + "ce_loss_17": 3.206320250034332, + "ce_loss_23": 2.873453605175018, + "ce_loss_3": 4.961167740821838, + "ce_loss_6": 4.500527572631836, + "epoch": 0.04, + "grad_norm": 1200.0, + "kl_loss_12": 2164.7929077148438, + "kl_loss_17": 678.6263000488282, + "kl_loss_3": 4138.049755859375, + "kl_loss_6": 3266.367578125, + "learning_rate": 0.0009977359612865424, + "loss": 2588.5998, + "step": 400 + }, + { + "ce_loss_12": 3.9608339428901673, + "ce_loss_17": 3.23901629447937, + "ce_loss_23": 2.890236461162567, + "ce_loss_3": 4.998275995254517, + "ce_loss_6": 4.56150221824646, + "epoch": 0.041, + "grad_norm": 1328.0, + "kl_loss_12": 2207.0178466796874, + "kl_loss_17": 721.2607818603516, + "kl_loss_3": 4192.5740966796875, + "kl_loss_6": 3363.844616699219, + "learning_rate": 0.0009975826335391806, + "loss": 2604.1301, + "step": 410 + }, + { + "ce_loss_12": 3.9477717757225035, + "ce_loss_17": 3.2436631083488465, + "ce_loss_23": 2.907458317279816, + "ce_loss_3": 4.954374933242798, + "ce_loss_6": 4.549765229225159, + "epoch": 0.042, + "grad_norm": 844.0, + "kl_loss_12": 2127.1870727539062, + "kl_loss_17": 682.032470703125, + "kl_loss_3": 4070.1541015625, + "kl_loss_6": 3305.5493408203124, + "learning_rate": 0.0009974242951402235, + "loss": 2572.7324, + "step": 420 + }, + { + "ce_loss_12": 3.950932002067566, + "ce_loss_17": 3.2564106822013854, + "ce_loss_23": 2.910490798950195, + "ce_loss_3": 4.9817784309387205, + "ce_loss_6": 4.567001938819885, + "epoch": 0.043, + "grad_norm": 1104.0, + "kl_loss_12": 2153.303643798828, + "kl_loss_17": 712.436978149414, + "kl_loss_3": 4140.805859375, + "kl_loss_6": 3338.912951660156, + "learning_rate": 0.0009972609476841367, + "loss": 2562.109, + "step": 430 + }, + { + "ce_loss_12": 3.9008285284042357, + "ce_loss_17": 3.1890947103500364, + "ce_loss_23": 2.8326279640197756, + "ce_loss_3": 4.932444286346436, + "ce_loss_6": 4.5064095735549925, + "epoch": 0.044, + "grad_norm": 948.0, + "kl_loss_12": 2182.2732788085937, + "kl_loss_17": 723.6691040039062, + "kl_loss_3": 4158.048254394531, + "kl_loss_6": 3358.2782592773438, + "learning_rate": 0.0009970925928158272, + "loss": 2605.9805, + "step": 440 + }, + { + "ce_loss_12": 3.8430684328079225, + "ce_loss_17": 3.1278234124183655, + "ce_loss_23": 2.7837769746780396, + "ce_loss_3": 4.891387319564819, + "ce_loss_6": 4.4741298913955685, + "epoch": 0.045, + "grad_norm": 912.0, + "kl_loss_12": 2194.3703674316407, + "kl_loss_17": 712.3570159912109, + "kl_loss_3": 4230.814050292969, + "kl_loss_6": 3414.0078369140624, + "learning_rate": 0.000996919232230627, + "loss": 2615.0797, + "step": 450 + }, + { + "ce_loss_12": 3.873635399341583, + "ce_loss_17": 3.1885122299194335, + "ce_loss_23": 2.8668599367141723, + "ce_loss_3": 4.8737973928451535, + "ce_loss_6": 4.461533617973328, + "epoch": 0.046, + "grad_norm": 1096.0, + "kl_loss_12": 2121.9797241210936, + "kl_loss_17": 668.4944366455078, + "kl_loss_3": 4048.25771484375, + "kl_loss_6": 3273.9767822265626, + "learning_rate": 0.0009967408676742752, + "loss": 2483.2707, + "step": 460 + }, + { + "ce_loss_12": 3.997884488105774, + "ce_loss_17": 3.330697500705719, + "ce_loss_23": 3.0016987800598143, + "ce_loss_3": 4.963337516784668, + "ce_loss_6": 4.5454404830932615, + "epoch": 0.047, + "grad_norm": 1168.0, + "kl_loss_12": 2082.495251464844, + "kl_loss_17": 688.2158508300781, + "kl_loss_3": 3956.098254394531, + "kl_loss_6": 3153.3085205078123, + "learning_rate": 0.0009965575009429006, + "loss": 2550.675, + "step": 470 + }, + { + "ce_loss_12": 3.828306031227112, + "ce_loss_17": 3.134593462944031, + "ce_loss_23": 2.7883999347686768, + "ce_loss_3": 4.853883934020996, + "ce_loss_6": 4.421608352661133, + "epoch": 0.048, + "grad_norm": 924.0, + "kl_loss_12": 2141.7112731933594, + "kl_loss_17": 703.4312683105469, + "kl_loss_3": 4134.702722167969, + "kl_loss_6": 3301.436169433594, + "learning_rate": 0.0009963691338830043, + "loss": 2537.2172, + "step": 480 + }, + { + "ce_loss_12": 3.8659058570861817, + "ce_loss_17": 3.199295365810394, + "ce_loss_23": 2.878894364833832, + "ce_loss_3": 4.872761750221253, + "ce_loss_6": 4.462412023544312, + "epoch": 0.049, + "grad_norm": 880.0, + "kl_loss_12": 2068.831591796875, + "kl_loss_17": 653.7953063964844, + "kl_loss_3": 4015.805126953125, + "kl_loss_6": 3236.064831542969, + "learning_rate": 0.0009961757683914405, + "loss": 2477.0184, + "step": 490 + }, + { + "ce_loss_12": 3.864358937740326, + "ce_loss_17": 3.1860976815223694, + "ce_loss_23": 2.864992415904999, + "ce_loss_3": 4.822944927215576, + "ce_loss_6": 4.406767797470093, + "epoch": 0.05, + "grad_norm": 952.0, + "kl_loss_12": 2072.2715576171877, + "kl_loss_17": 656.6796630859375, + "kl_loss_3": 3929.730029296875, + "kl_loss_6": 3145.1748901367187, + "learning_rate": 0.0009959774064153978, + "loss": 2476.4344, + "step": 500 + }, + { + "ce_loss_12": 3.8237053871154787, + "ce_loss_17": 3.1873385667800904, + "ce_loss_23": 2.8856295228004454, + "ce_loss_3": 4.799383807182312, + "ce_loss_6": 4.393183696269989, + "epoch": 0.051, + "grad_norm": 1104.0, + "kl_loss_12": 1984.5502136230468, + "kl_loss_17": 638.6908020019531, + "kl_loss_3": 3862.577673339844, + "kl_loss_6": 3089.1991943359376, + "learning_rate": 0.0009957740499523787, + "loss": 2434.852, + "step": 510 + }, + { + "ce_loss_12": 3.8531986594200136, + "ce_loss_17": 3.2021188259124758, + "ce_loss_23": 2.894522261619568, + "ce_loss_3": 4.832447981834411, + "ce_loss_6": 4.422991037368774, + "epoch": 0.052, + "grad_norm": 996.0, + "kl_loss_12": 2003.9843200683595, + "kl_loss_17": 635.9474334716797, + "kl_loss_3": 3881.0363525390626, + "kl_loss_6": 3101.2047607421873, + "learning_rate": 0.0009955657010501807, + "loss": 2418.8723, + "step": 520 + }, + { + "ce_loss_12": 3.8310924410820006, + "ce_loss_17": 3.1631489157676698, + "ce_loss_23": 2.848756265640259, + "ce_loss_3": 4.833318781852722, + "ce_loss_6": 4.419972848892212, + "epoch": 0.053, + "grad_norm": 1040.0, + "kl_loss_12": 2029.2051391601562, + "kl_loss_17": 638.5391387939453, + "kl_loss_3": 3972.6285766601563, + "kl_loss_6": 3185.361083984375, + "learning_rate": 0.000995352361806875, + "loss": 2437.1742, + "step": 530 + }, + { + "ce_loss_12": 3.8631009817123414, + "ce_loss_17": 3.212864434719086, + "ce_loss_23": 2.8967057943344114, + "ce_loss_3": 4.83975579738617, + "ce_loss_6": 4.445706224441528, + "epoch": 0.054, + "grad_norm": 812.0, + "kl_loss_12": 2037.0037414550782, + "kl_loss_17": 649.3562622070312, + "kl_loss_3": 3937.4210205078125, + "kl_loss_6": 3171.944091796875, + "learning_rate": 0.0009951340343707852, + "loss": 2466.9324, + "step": 540 + }, + { + "ce_loss_12": 3.90088312625885, + "ce_loss_17": 3.240309000015259, + "ce_loss_23": 2.9396286845207213, + "ce_loss_3": 4.886050772666931, + "ce_loss_6": 4.482292723655701, + "epoch": 0.055, + "grad_norm": 1056.0, + "kl_loss_12": 1982.8837341308595, + "kl_loss_17": 613.3875244140625, + "kl_loss_3": 3888.7989135742187, + "kl_loss_6": 3124.067333984375, + "learning_rate": 0.0009949107209404665, + "loss": 2422.4299, + "step": 550 + }, + { + "ce_loss_12": 3.8069709300994874, + "ce_loss_17": 3.1651495575904844, + "ce_loss_23": 2.8611255407333376, + "ce_loss_3": 4.779746437072754, + "ce_loss_6": 4.3871207475662235, + "epoch": 0.056, + "grad_norm": 904.0, + "kl_loss_12": 1977.2301452636718, + "kl_loss_17": 620.4258605957032, + "kl_loss_3": 3868.1070190429687, + "kl_loss_6": 3116.3320068359376, + "learning_rate": 0.0009946824237646824, + "loss": 2404.8865, + "step": 560 + }, + { + "ce_loss_12": 3.776531147956848, + "ce_loss_17": 3.1243035554885865, + "ce_loss_23": 2.8193069100379944, + "ce_loss_3": 4.7919842004776, + "ce_loss_6": 4.361578702926636, + "epoch": 0.057, + "grad_norm": 1008.0, + "kl_loss_12": 1993.1865539550781, + "kl_loss_17": 625.6534484863281, + "kl_loss_3": 3952.2551391601564, + "kl_loss_6": 3133.5532958984377, + "learning_rate": 0.0009944491451423828, + "loss": 2463.0141, + "step": 570 + }, + { + "ce_loss_12": 3.787885057926178, + "ce_loss_17": 3.125153052806854, + "ce_loss_23": 2.812381112575531, + "ce_loss_3": 4.800907230377197, + "ce_loss_6": 4.379597437381745, + "epoch": 0.058, + "grad_norm": 1072.0, + "kl_loss_12": 2041.7862365722656, + "kl_loss_17": 633.625210571289, + "kl_loss_3": 4005.790637207031, + "kl_loss_6": 3195.4418701171876, + "learning_rate": 0.0009942108874226813, + "loss": 2431.434, + "step": 580 + }, + { + "ce_loss_12": 3.8284475803375244, + "ce_loss_17": 3.205892300605774, + "ce_loss_23": 2.911292541027069, + "ce_loss_3": 4.796784925460815, + "ce_loss_6": 4.386137056350708, + "epoch": 0.059, + "grad_norm": 956.0, + "kl_loss_12": 1921.2669250488282, + "kl_loss_17": 609.1336517333984, + "kl_loss_3": 3794.235485839844, + "kl_loss_6": 3003.3477172851562, + "learning_rate": 0.00099396765300483, + "loss": 2323.3688, + "step": 590 + }, + { + "ce_loss_12": 3.813377547264099, + "ce_loss_17": 3.2030410766601562, + "ce_loss_23": 2.8971424460411073, + "ce_loss_3": 4.776553750038147, + "ce_loss_6": 4.383290815353393, + "epoch": 0.06, + "grad_norm": 1112.0, + "kl_loss_12": 1935.8569091796876, + "kl_loss_17": 634.0841278076172, + "kl_loss_3": 3804.7228149414063, + "kl_loss_6": 3041.823278808594, + "learning_rate": 0.0009937194443381972, + "loss": 2351.4145, + "step": 600 + }, + { + "ce_loss_12": 3.82430579662323, + "ce_loss_17": 3.239823818206787, + "ce_loss_23": 2.930612790584564, + "ce_loss_3": 4.761824107170105, + "ce_loss_6": 4.363707947731018, + "epoch": 0.061, + "grad_norm": 908.0, + "kl_loss_12": 1880.8529052734375, + "kl_loss_17": 646.1875213623047, + "kl_loss_3": 3713.361767578125, + "kl_loss_6": 2948.2285400390624, + "learning_rate": 0.0009934662639222412, + "loss": 2351.317, + "step": 610 + }, + { + "ce_loss_12": 3.8140470504760744, + "ce_loss_17": 3.1936643242836, + "ce_loss_23": 2.882937967777252, + "ce_loss_3": 4.7908659219741825, + "ce_loss_6": 4.390506052970887, + "epoch": 0.062, + "grad_norm": 952.0, + "kl_loss_12": 1963.47578125, + "kl_loss_17": 643.031982421875, + "kl_loss_3": 3858.526159667969, + "kl_loss_6": 3099.5319702148436, + "learning_rate": 0.000993208114306486, + "loss": 2373.4488, + "step": 620 + }, + { + "ce_loss_12": 3.7572766065597536, + "ce_loss_17": 3.1062564730644224, + "ce_loss_23": 2.814168381690979, + "ce_loss_3": 4.740272831916809, + "ce_loss_6": 4.334878396987915, + "epoch": 0.063, + "grad_norm": 924.0, + "kl_loss_12": 1978.2345825195312, + "kl_loss_17": 614.9058258056641, + "kl_loss_3": 3885.282019042969, + "kl_loss_6": 3110.789013671875, + "learning_rate": 0.0009929449980904952, + "loss": 2342.2086, + "step": 630 + }, + { + "ce_loss_12": 3.7797908902168276, + "ce_loss_17": 3.1566942811012266, + "ce_loss_23": 2.869984269142151, + "ce_loss_3": 4.747637248039245, + "ce_loss_6": 4.350768375396728, + "epoch": 0.064, + "grad_norm": 800.0, + "kl_loss_12": 1928.0524047851563, + "kl_loss_17": 596.5263305664063, + "kl_loss_3": 3814.7638916015626, + "kl_loss_6": 3054.167761230469, + "learning_rate": 0.0009926769179238466, + "loss": 2326.8762, + "step": 640 + }, + { + "ce_loss_12": 3.8469881892204283, + "ce_loss_17": 3.203332006931305, + "ce_loss_23": 2.903032958507538, + "ce_loss_3": 4.791834831237793, + "ce_loss_6": 4.398166084289551, + "epoch": 0.065, + "grad_norm": 1020.0, + "kl_loss_12": 1990.3420837402343, + "kl_loss_17": 627.5457916259766, + "kl_loss_3": 3828.7911743164063, + "kl_loss_6": 3068.594140625, + "learning_rate": 0.000992403876506104, + "loss": 2354.9871, + "step": 650 + }, + { + "ce_loss_12": 3.7656350135803223, + "ce_loss_17": 3.137479567527771, + "ce_loss_23": 2.847555148601532, + "ce_loss_3": 4.724980711936951, + "ce_loss_6": 4.33546097278595, + "epoch": 0.066, + "grad_norm": 872.0, + "kl_loss_12": 1947.3597778320313, + "kl_loss_17": 601.6236145019532, + "kl_loss_3": 3810.823876953125, + "kl_loss_6": 3051.07158203125, + "learning_rate": 0.0009921258765867918, + "loss": 2351.0516, + "step": 660 + }, + { + "ce_loss_12": 3.7432477951049803, + "ce_loss_17": 3.102054977416992, + "ce_loss_23": 2.8209555387496947, + "ce_loss_3": 4.726716303825379, + "ce_loss_6": 4.332750606536865, + "epoch": 0.067, + "grad_norm": 1048.0, + "kl_loss_12": 1934.0251159667969, + "kl_loss_17": 590.7294250488281, + "kl_loss_3": 3862.057373046875, + "kl_loss_6": 3099.9753173828126, + "learning_rate": 0.0009918429209653662, + "loss": 2332.4148, + "step": 670 + }, + { + "ce_loss_12": 3.79118971824646, + "ce_loss_17": 3.160488820075989, + "ce_loss_23": 2.8699307322502134, + "ce_loss_3": 4.746296501159668, + "ce_loss_6": 4.369972658157349, + "epoch": 0.068, + "grad_norm": 880.0, + "kl_loss_12": 1943.5599060058594, + "kl_loss_17": 621.3161865234375, + "kl_loss_3": 3822.037878417969, + "kl_loss_6": 3095.7862060546877, + "learning_rate": 0.0009915550124911866, + "loss": 2309.8324, + "step": 680 + }, + { + "ce_loss_12": 3.7744308590888975, + "ce_loss_17": 3.164461362361908, + "ce_loss_23": 2.871832025051117, + "ce_loss_3": 4.717565703392029, + "ce_loss_6": 4.334046483039856, + "epoch": 0.069, + "grad_norm": 928.0, + "kl_loss_12": 1894.8691345214843, + "kl_loss_17": 600.1791351318359, + "kl_loss_3": 3719.063757324219, + "kl_loss_6": 2989.2135620117188, + "learning_rate": 0.0009912621540634887, + "loss": 2298.6404, + "step": 690 + }, + { + "ce_loss_12": 3.7581581592559816, + "ce_loss_17": 3.175926184654236, + "ce_loss_23": 2.91041499376297, + "ce_loss_3": 4.693021059036255, + "ce_loss_6": 4.328584957122803, + "epoch": 0.07, + "grad_norm": 952.0, + "kl_loss_12": 1805.7688293457031, + "kl_loss_17": 557.3165618896485, + "kl_loss_3": 3636.874401855469, + "kl_loss_6": 2931.793310546875, + "learning_rate": 0.0009909643486313534, + "loss": 2253.7109, + "step": 700 + }, + { + "ce_loss_12": 3.694593846797943, + "ce_loss_17": 3.0789793491363526, + "ce_loss_23": 2.806851255893707, + "ce_loss_3": 4.675688862800598, + "ce_loss_6": 4.283495950698852, + "epoch": 0.071, + "grad_norm": 836.0, + "kl_loss_12": 1873.1919860839844, + "kl_loss_17": 561.667138671875, + "kl_loss_3": 3785.0610595703124, + "kl_loss_6": 3032.8089111328127, + "learning_rate": 0.000990661599193678, + "loss": 2339.2168, + "step": 710 + }, + { + "ce_loss_12": 3.781097650527954, + "ce_loss_17": 3.1813387274742126, + "ce_loss_23": 2.9092867970466614, + "ce_loss_3": 4.718334794044495, + "ce_loss_6": 4.346433448791504, + "epoch": 0.072, + "grad_norm": 844.0, + "kl_loss_12": 1837.0475830078126, + "kl_loss_17": 569.5842346191406, + "kl_loss_3": 3658.182763671875, + "kl_loss_6": 2955.2635498046875, + "learning_rate": 0.0009903539087991462, + "loss": 2259.2977, + "step": 720 + }, + { + "ce_loss_12": 3.7577704548835755, + "ce_loss_17": 3.175670492649078, + "ce_loss_23": 2.8987006187438964, + "ce_loss_3": 4.700696063041687, + "ce_loss_6": 4.328676080703735, + "epoch": 0.073, + "grad_norm": 828.0, + "kl_loss_12": 1822.2861938476562, + "kl_loss_17": 580.5244903564453, + "kl_loss_3": 3665.8289306640627, + "kl_loss_6": 2948.7177856445314, + "learning_rate": 0.0009900412805461966, + "loss": 2269.7988, + "step": 730 + }, + { + "ce_loss_12": 3.7941108107566834, + "ce_loss_17": 3.2247503757476808, + "ce_loss_23": 2.9608768463134765, + "ce_loss_3": 4.737755060195923, + "ce_loss_6": 4.366144275665283, + "epoch": 0.074, + "grad_norm": 1152.0, + "kl_loss_12": 1778.3082214355468, + "kl_loss_17": 545.759782409668, + "kl_loss_3": 3633.7240478515623, + "kl_loss_6": 2913.7588500976562, + "learning_rate": 0.0009897237175829927, + "loss": 2249.7711, + "step": 740 + }, + { + "ce_loss_12": 3.7247921228408813, + "ce_loss_17": 3.1244994401931763, + "ce_loss_23": 2.854635012149811, + "ce_loss_3": 4.680092597007752, + "ce_loss_6": 4.321918106079101, + "epoch": 0.075, + "grad_norm": 876.0, + "kl_loss_12": 1839.49482421875, + "kl_loss_17": 563.657942199707, + "kl_loss_3": 3715.802197265625, + "kl_loss_6": 3012.9723876953126, + "learning_rate": 0.0009894012231073895, + "loss": 2259.4498, + "step": 750 + }, + { + "ce_loss_12": 3.7520262837409972, + "ce_loss_17": 3.1635246515274047, + "ce_loss_23": 2.9015506505966187, + "ce_loss_3": 4.691110682487488, + "ce_loss_6": 4.32509058713913, + "epoch": 0.076, + "grad_norm": 908.0, + "kl_loss_12": 1785.3197631835938, + "kl_loss_17": 539.1061965942383, + "kl_loss_3": 3617.3318725585937, + "kl_loss_6": 2915.640002441406, + "learning_rate": 0.0009890738003669028, + "loss": 2248.3758, + "step": 760 + }, + { + "ce_loss_12": 3.7484350919723513, + "ce_loss_17": 3.133402609825134, + "ce_loss_23": 2.8702358484268187, + "ce_loss_3": 4.71472737789154, + "ce_loss_6": 4.340381741523743, + "epoch": 0.077, + "grad_norm": 772.0, + "kl_loss_12": 1869.190771484375, + "kl_loss_17": 557.6747970581055, + "kl_loss_3": 3771.08115234375, + "kl_loss_6": 3040.724951171875, + "learning_rate": 0.0009887414526586764, + "loss": 2246.727, + "step": 770 + }, + { + "ce_loss_12": 3.7698349475860597, + "ce_loss_17": 3.175296950340271, + "ce_loss_23": 2.920288693904877, + "ce_loss_3": 4.71705801486969, + "ce_loss_6": 4.331751799583435, + "epoch": 0.078, + "grad_norm": 788.0, + "kl_loss_12": 1792.9148071289062, + "kl_loss_17": 538.2495681762696, + "kl_loss_3": 3645.0748657226563, + "kl_loss_6": 2909.5169311523437, + "learning_rate": 0.0009884041833294476, + "loss": 2177.8914, + "step": 780 + }, + { + "ce_loss_12": 3.7557843208312987, + "ce_loss_17": 3.1842599511146545, + "ce_loss_23": 2.927360641956329, + "ce_loss_3": 4.686308670043945, + "ce_loss_6": 4.309764838218689, + "epoch": 0.079, + "grad_norm": 960.0, + "kl_loss_12": 1760.4378784179687, + "kl_loss_17": 537.1736328125, + "kl_loss_3": 3580.4406494140626, + "kl_loss_6": 2858.5263671875, + "learning_rate": 0.000988061995775515, + "loss": 2255.1971, + "step": 790 + }, + { + "ce_loss_12": 3.7003114342689516, + "ce_loss_17": 3.1278024435043337, + "ce_loss_23": 2.8603140830993654, + "ce_loss_3": 4.623929166793824, + "ce_loss_6": 4.252516531944275, + "epoch": 0.08, + "grad_norm": 960.0, + "kl_loss_12": 1791.1098693847657, + "kl_loss_17": 572.117807006836, + "kl_loss_3": 3605.273156738281, + "kl_loss_6": 2881.9625, + "learning_rate": 0.0009877148934427035, + "loss": 2211.3432, + "step": 800 + }, + { + "ce_loss_12": 3.7298121809959413, + "ce_loss_17": 3.1572980999946596, + "ce_loss_23": 2.8989072322845457, + "ce_loss_3": 4.679050588607788, + "ce_loss_6": 4.30648295879364, + "epoch": 0.081, + "grad_norm": 828.0, + "kl_loss_12": 1769.9298461914063, + "kl_loss_17": 549.3276397705079, + "kl_loss_3": 3633.3033081054687, + "kl_loss_6": 2918.2040771484376, + "learning_rate": 0.0009873628798263297, + "loss": 2195.8723, + "step": 810 + }, + { + "ce_loss_12": 3.6752089977264406, + "ce_loss_17": 3.12715106010437, + "ce_loss_23": 2.8626845240592957, + "ce_loss_3": 4.616334772109985, + "ce_loss_6": 4.246775162220001, + "epoch": 0.082, + "grad_norm": 928.0, + "kl_loss_12": 1739.75078125, + "kl_loss_17": 553.6329010009765, + "kl_loss_3": 3569.2022094726562, + "kl_loss_6": 2855.6306396484374, + "learning_rate": 0.0009870059584711668, + "loss": 2231.7055, + "step": 820 + }, + { + "ce_loss_12": 3.6967708706855773, + "ce_loss_17": 3.126306390762329, + "ce_loss_23": 2.8665772557258604, + "ce_loss_3": 4.637620544433593, + "ce_loss_6": 4.258155560493469, + "epoch": 0.083, + "grad_norm": 932.0, + "kl_loss_12": 1746.82158203125, + "kl_loss_17": 537.5760116577148, + "kl_loss_3": 3573.961315917969, + "kl_loss_6": 2853.324560546875, + "learning_rate": 0.000986644132971409, + "loss": 2181.06, + "step": 830 + }, + { + "ce_loss_12": 3.707110118865967, + "ce_loss_17": 3.124701976776123, + "ce_loss_23": 2.8559574365615843, + "ce_loss_3": 4.668276762962341, + "ce_loss_6": 4.28531403541565, + "epoch": 0.084, + "grad_norm": 808.0, + "kl_loss_12": 1796.9253723144532, + "kl_loss_17": 546.925749206543, + "kl_loss_3": 3666.6813110351563, + "kl_loss_6": 2930.093310546875, + "learning_rate": 0.0009862774069706345, + "loss": 2212.5172, + "step": 840 + }, + { + "ce_loss_12": 3.774817633628845, + "ce_loss_17": 3.231712806224823, + "ce_loss_23": 2.9828774333000183, + "ce_loss_3": 4.684700012207031, + "ce_loss_6": 4.320200824737549, + "epoch": 0.085, + "grad_norm": 800.0, + "kl_loss_12": 1732.337646484375, + "kl_loss_17": 536.6730453491211, + "kl_loss_3": 3509.0249267578124, + "kl_loss_6": 2806.8006591796875, + "learning_rate": 0.000985905784161771, + "loss": 2170.632, + "step": 850 + }, + { + "ce_loss_12": 3.715651345252991, + "ce_loss_17": 3.154586577415466, + "ce_loss_23": 2.9106888294219972, + "ce_loss_3": 4.637371349334717, + "ce_loss_6": 4.270102548599243, + "epoch": 0.086, + "grad_norm": 744.0, + "kl_loss_12": 1734.4763610839843, + "kl_loss_17": 514.9126983642578, + "kl_loss_3": 3539.1369018554688, + "kl_loss_6": 2834.6635498046876, + "learning_rate": 0.000985529268287055, + "loss": 2148.7227, + "step": 860 + }, + { + "ce_loss_12": 3.674989342689514, + "ce_loss_17": 3.1098021984100344, + "ce_loss_23": 2.8464756488800047, + "ce_loss_3": 4.642683744430542, + "ce_loss_6": 4.276080477237701, + "epoch": 0.087, + "grad_norm": 856.0, + "kl_loss_12": 1765.6208251953126, + "kl_loss_17": 545.826058959961, + "kl_loss_3": 3641.5214111328123, + "kl_loss_6": 2934.75615234375, + "learning_rate": 0.0009851478631379982, + "loss": 2215.8617, + "step": 870 + }, + { + "ce_loss_12": 3.7032182455062865, + "ce_loss_17": 3.157497191429138, + "ce_loss_23": 2.898158383369446, + "ce_loss_3": 4.64922685623169, + "ce_loss_6": 4.282612156867981, + "epoch": 0.088, + "grad_norm": 836.0, + "kl_loss_12": 1717.8744384765625, + "kl_loss_17": 551.1847427368164, + "kl_loss_3": 3571.97431640625, + "kl_loss_6": 2868.6826782226562, + "learning_rate": 0.0009847615725553456, + "loss": 2173.4846, + "step": 880 + }, + { + "ce_loss_12": 3.71795312166214, + "ce_loss_17": 3.1950910091400146, + "ce_loss_23": 2.9528208017349242, + "ce_loss_3": 4.61668553352356, + "ce_loss_6": 4.271125149726868, + "epoch": 0.089, + "grad_norm": 1016.0, + "kl_loss_12": 1644.957293701172, + "kl_loss_17": 517.1908355712891, + "kl_loss_3": 3410.7930419921877, + "kl_loss_6": 2743.155871582031, + "learning_rate": 0.0009843704004290394, + "loss": 2151.0193, + "step": 890 + }, + { + "ce_loss_12": 3.663389527797699, + "ce_loss_17": 3.1275211811065673, + "ce_loss_23": 2.864755928516388, + "ce_loss_3": 4.603832817077636, + "ce_loss_6": 4.239253675937652, + "epoch": 0.09, + "grad_norm": 716.0, + "kl_loss_12": 1722.57783203125, + "kl_loss_17": 551.8916763305664, + "kl_loss_3": 3570.454870605469, + "kl_loss_6": 2861.2793823242187, + "learning_rate": 0.0009839743506981783, + "loss": 2167.943, + "step": 900 + }, + { + "ce_loss_12": 3.6289228081703184, + "ce_loss_17": 3.0549963235855104, + "ce_loss_23": 2.787931501865387, + "ce_loss_3": 4.591374397277832, + "ce_loss_6": 4.221539330482483, + "epoch": 0.091, + "grad_norm": 768.0, + "kl_loss_12": 1782.277117919922, + "kl_loss_17": 558.6154708862305, + "kl_loss_3": 3686.1168212890625, + "kl_loss_6": 2968.0820068359376, + "learning_rate": 0.0009835734273509786, + "loss": 2207.0652, + "step": 910 + }, + { + "ce_loss_12": 3.693990683555603, + "ce_loss_17": 3.1360571503639223, + "ce_loss_23": 2.8765580654144287, + "ce_loss_3": 4.6382588863372805, + "ce_loss_6": 4.266951274871826, + "epoch": 0.092, + "grad_norm": 820.0, + "kl_loss_12": 1706.47421875, + "kl_loss_17": 539.3909072875977, + "kl_loss_3": 3556.4054931640626, + "kl_loss_6": 2838.4239868164063, + "learning_rate": 0.0009831676344247342, + "loss": 2159.5197, + "step": 920 + }, + { + "ce_loss_12": 3.683386981487274, + "ce_loss_17": 3.1375192284584044, + "ce_loss_23": 2.9041780829429626, + "ce_loss_3": 4.575248861312867, + "ce_loss_6": 4.217752540111542, + "epoch": 0.093, + "grad_norm": 832.0, + "kl_loss_12": 1681.6122253417968, + "kl_loss_17": 507.54964752197264, + "kl_loss_3": 3429.8853271484377, + "kl_loss_6": 2744.8274047851564, + "learning_rate": 0.0009827569760057755, + "loss": 2126.2113, + "step": 930 + }, + { + "ce_loss_12": 3.6735096335411073, + "ce_loss_17": 3.0771583557128905, + "ce_loss_23": 2.817060923576355, + "ce_loss_3": 4.632097887992859, + "ce_loss_6": 4.2522142887115475, + "epoch": 0.094, + "grad_norm": 920.0, + "kl_loss_12": 1796.5344604492188, + "kl_loss_17": 537.3643798828125, + "kl_loss_3": 3685.60615234375, + "kl_loss_6": 2951.1067016601564, + "learning_rate": 0.000982341456229428, + "loss": 2172.0842, + "step": 940 + }, + { + "ce_loss_12": 3.730703866481781, + "ce_loss_17": 3.161611866950989, + "ce_loss_23": 2.9119389176368715, + "ce_loss_3": 4.65162878036499, + "ce_loss_6": 4.285764884948731, + "epoch": 0.095, + "grad_norm": 876.0, + "kl_loss_12": 1769.41083984375, + "kl_loss_17": 528.7643432617188, + "kl_loss_3": 3566.267614746094, + "kl_loss_6": 2857.342639160156, + "learning_rate": 0.000981921079279971, + "loss": 2131.398, + "step": 950 + }, + { + "ce_loss_12": 3.6671907782554625, + "ce_loss_17": 3.14452269077301, + "ce_loss_23": 2.9202617764472962, + "ce_loss_3": 4.5672272682189945, + "ce_loss_6": 4.20267025232315, + "epoch": 0.096, + "grad_norm": 884.0, + "kl_loss_12": 1647.3808959960938, + "kl_loss_17": 494.3776519775391, + "kl_loss_3": 3406.887805175781, + "kl_loss_6": 2712.2851440429686, + "learning_rate": 0.0009814958493905962, + "loss": 2087.8811, + "step": 960 + }, + { + "ce_loss_12": 3.675451910495758, + "ce_loss_17": 3.126833999156952, + "ce_loss_23": 2.8827471137046814, + "ce_loss_3": 4.63006055355072, + "ce_loss_6": 4.257764625549316, + "epoch": 0.097, + "grad_norm": 828.0, + "kl_loss_12": 1702.2585144042969, + "kl_loss_17": 518.7686157226562, + "kl_loss_3": 3553.715295410156, + "kl_loss_6": 2842.6969970703126, + "learning_rate": 0.0009810657708433637, + "loss": 2178.4695, + "step": 970 + }, + { + "ce_loss_12": 3.699459207057953, + "ce_loss_17": 3.196008837223053, + "ce_loss_23": 2.953006935119629, + "ce_loss_3": 4.605400490760803, + "ce_loss_6": 4.242717599868774, + "epoch": 0.098, + "grad_norm": 792.0, + "kl_loss_12": 1612.577392578125, + "kl_loss_17": 508.1386352539063, + "kl_loss_3": 3371.6166381835938, + "kl_loss_6": 2683.5531372070313, + "learning_rate": 0.0009806308479691594, + "loss": 2072.0863, + "step": 980 + }, + { + "ce_loss_12": 3.746099066734314, + "ce_loss_17": 3.2276832103729247, + "ce_loss_23": 2.9513447999954225, + "ce_loss_3": 4.660121750831604, + "ce_loss_6": 4.313950896263123, + "epoch": 0.099, + "grad_norm": 1064.0, + "kl_loss_12": 1686.7129333496093, + "kl_loss_17": 575.0331268310547, + "kl_loss_3": 3487.668017578125, + "kl_loss_6": 2801.712780761719, + "learning_rate": 0.0009801910851476522, + "loss": 2125.6598, + "step": 990 + }, + { + "ce_loss_12": 3.6821831703186034, + "ce_loss_17": 3.1449294328689574, + "ce_loss_23": 2.890371763706207, + "ce_loss_3": 4.6326807022094725, + "ce_loss_6": 4.274430382251739, + "epoch": 0.1, + "grad_norm": 1080.0, + "kl_loss_12": 1714.1190612792968, + "kl_loss_17": 552.9030899047851, + "kl_loss_3": 3598.4555908203124, + "kl_loss_6": 2901.4451171875, + "learning_rate": 0.0009797464868072487, + "loss": 2147.7566, + "step": 1000 + }, + { + "ce_loss_12": 3.660784673690796, + "ce_loss_17": 3.1308478116989136, + "ce_loss_23": 2.8782097458839417, + "ce_loss_3": 4.59887433052063, + "ce_loss_6": 4.227726662158966, + "epoch": 0.101, + "grad_norm": 840.0, + "kl_loss_12": 1695.8429382324218, + "kl_loss_17": 536.8638442993164, + "kl_loss_3": 3530.2622924804687, + "kl_loss_6": 2826.7454345703127, + "learning_rate": 0.0009792970574250492, + "loss": 2137.6924, + "step": 1010 + }, + { + "ce_loss_12": 3.66312894821167, + "ce_loss_17": 3.1327234148979186, + "ce_loss_23": 2.888000476360321, + "ce_loss_3": 4.584208631515503, + "ce_loss_6": 4.2243523597717285, + "epoch": 0.102, + "grad_norm": 1020.0, + "kl_loss_12": 1655.641827392578, + "kl_loss_17": 515.9014770507813, + "kl_loss_3": 3461.872509765625, + "kl_loss_6": 2770.165979003906, + "learning_rate": 0.0009788428015268028, + "loss": 2076.8967, + "step": 1020 + }, + { + "ce_loss_12": 3.658862817287445, + "ce_loss_17": 3.1359084010124207, + "ce_loss_23": 2.8994009375572203, + "ce_loss_3": 4.570998239517212, + "ce_loss_6": 4.215702986717224, + "epoch": 0.103, + "grad_norm": 720.0, + "kl_loss_12": 1638.7247253417968, + "kl_loss_17": 514.6052276611329, + "kl_loss_3": 3429.5267700195313, + "kl_loss_6": 2738.1430419921876, + "learning_rate": 0.0009783837236868609, + "loss": 2080.224, + "step": 1030 + }, + { + "ce_loss_12": 3.6323411583900453, + "ce_loss_17": 3.0970885038375853, + "ce_loss_23": 2.8580228686332703, + "ce_loss_3": 4.537933397293091, + "ce_loss_6": 4.185244929790497, + "epoch": 0.104, + "grad_norm": 816.0, + "kl_loss_12": 1652.1881530761718, + "kl_loss_17": 504.38118438720704, + "kl_loss_3": 3413.4997436523436, + "kl_loss_6": 2735.0294799804688, + "learning_rate": 0.0009779198285281327, + "loss": 2065.5354, + "step": 1040 + }, + { + "ce_loss_12": 3.647919070720673, + "ce_loss_17": 3.0967610716819762, + "ce_loss_23": 2.863264966011047, + "ce_loss_3": 4.571896409988403, + "ce_loss_6": 4.211425912380219, + "epoch": 0.105, + "grad_norm": 780.0, + "kl_loss_12": 1667.45146484375, + "kl_loss_17": 492.2688583374023, + "kl_loss_3": 3491.690625, + "kl_loss_6": 2786.4512573242187, + "learning_rate": 0.0009774511207220368, + "loss": 2097.541, + "step": 1050 + }, + { + "ce_loss_12": 3.6793935298919678, + "ce_loss_17": 3.1396284341812133, + "ce_loss_23": 2.9056878805160524, + "ce_loss_3": 4.610803484916687, + "ce_loss_6": 4.24627730846405, + "epoch": 0.106, + "grad_norm": 680.0, + "kl_loss_12": 1664.4289855957031, + "kl_loss_17": 503.2407257080078, + "kl_loss_3": 3494.34267578125, + "kl_loss_6": 2796.4139770507813, + "learning_rate": 0.0009769776049884564, + "loss": 2101.0625, + "step": 1060 + }, + { + "ce_loss_12": 3.6110607266426085, + "ce_loss_17": 3.0582555770874023, + "ce_loss_23": 2.8185740232467653, + "ce_loss_3": 4.550081133842468, + "ce_loss_6": 4.188111197948456, + "epoch": 0.107, + "grad_norm": 804.0, + "kl_loss_12": 1693.1038452148437, + "kl_loss_17": 502.1560562133789, + "kl_loss_3": 3553.5575317382813, + "kl_loss_6": 2851.522509765625, + "learning_rate": 0.0009764992860956889, + "loss": 2160.6059, + "step": 1070 + }, + { + "ce_loss_12": 3.6763460516929625, + "ce_loss_17": 3.1736180782318115, + "ce_loss_23": 2.96165292263031, + "ce_loss_3": 4.553462076187134, + "ce_loss_6": 4.211921668052673, + "epoch": 0.108, + "grad_norm": 940.0, + "kl_loss_12": 1565.0017883300782, + "kl_loss_17": 465.6837585449219, + "kl_loss_3": 3285.5805053710938, + "kl_loss_6": 2630.6787231445314, + "learning_rate": 0.0009760161688604008, + "loss": 2027.6166, + "step": 1080 + }, + { + "ce_loss_12": 3.704931104183197, + "ce_loss_17": 3.183962869644165, + "ce_loss_23": 2.955583941936493, + "ce_loss_3": 4.6247032403945925, + "ce_loss_6": 4.262003123760223, + "epoch": 0.109, + "grad_norm": 780.0, + "kl_loss_12": 1606.7292419433593, + "kl_loss_17": 479.0166900634766, + "kl_loss_3": 3407.0781982421877, + "kl_loss_6": 2713.0140380859375, + "learning_rate": 0.0009755282581475768, + "loss": 2075.1375, + "step": 1090 + }, + { + "ce_loss_12": 3.735625076293945, + "ce_loss_17": 3.2278528928756716, + "ce_loss_23": 2.992156505584717, + "ce_loss_3": 4.650100636482239, + "ce_loss_6": 4.283954048156739, + "epoch": 0.11, + "grad_norm": 1120.0, + "kl_loss_12": 1597.375665283203, + "kl_loss_17": 487.4188934326172, + "kl_loss_3": 3389.489611816406, + "kl_loss_6": 2680.6547973632814, + "learning_rate": 0.0009750355588704727, + "loss": 2029.5996, + "step": 1100 + }, + { + "ce_loss_12": 3.602501726150513, + "ce_loss_17": 3.0762116074562074, + "ce_loss_23": 2.8531104922294617, + "ce_loss_3": 4.54400155544281, + "ce_loss_6": 4.181029498577118, + "epoch": 0.111, + "grad_norm": 892.0, + "kl_loss_12": 1600.1835693359376, + "kl_loss_17": 478.77005157470705, + "kl_loss_3": 3453.218310546875, + "kl_loss_6": 2753.7009399414064, + "learning_rate": 0.0009745380759905647, + "loss": 2102.3586, + "step": 1110 + }, + { + "ce_loss_12": 3.566656935214996, + "ce_loss_17": 3.036020517349243, + "ce_loss_23": 2.811892902851105, + "ce_loss_3": 4.49903085231781, + "ce_loss_6": 4.14581116437912, + "epoch": 0.112, + "grad_norm": 952.0, + "kl_loss_12": 1613.6444152832032, + "kl_loss_17": 472.362158203125, + "kl_loss_3": 3453.385888671875, + "kl_loss_6": 2762.3469360351564, + "learning_rate": 0.0009740358145174998, + "loss": 2112.1863, + "step": 1120 + }, + { + "ce_loss_12": 3.666201722621918, + "ce_loss_17": 3.163138520717621, + "ce_loss_23": 2.945633387565613, + "ce_loss_3": 4.552199673652649, + "ce_loss_6": 4.200457072257995, + "epoch": 0.113, + "grad_norm": 820.0, + "kl_loss_12": 1570.5922912597657, + "kl_loss_17": 465.73221893310546, + "kl_loss_3": 3322.1153564453125, + "kl_loss_6": 2639.3234130859373, + "learning_rate": 0.0009735287795090455, + "loss": 2028.2494, + "step": 1130 + }, + { + "ce_loss_12": 3.6025816917419435, + "ce_loss_17": 3.0846946001052857, + "ce_loss_23": 2.855406713485718, + "ce_loss_3": 4.512770676612854, + "ce_loss_6": 4.161376357078552, + "epoch": 0.114, + "grad_norm": 880.0, + "kl_loss_12": 1629.634259033203, + "kl_loss_17": 493.5605972290039, + "kl_loss_3": 3416.2345947265626, + "kl_loss_6": 2742.5260375976563, + "learning_rate": 0.0009730169760710386, + "loss": 2055.8789, + "step": 1140 + }, + { + "ce_loss_12": 3.655885374546051, + "ce_loss_17": 3.1461606860160827, + "ce_loss_23": 2.9138878703117372, + "ce_loss_3": 4.562855291366577, + "ce_loss_6": 4.201452386379242, + "epoch": 0.115, + "grad_norm": 812.0, + "kl_loss_12": 1582.1077880859375, + "kl_loss_17": 490.54324798583986, + "kl_loss_3": 3347.0728271484377, + "kl_loss_6": 2658.38212890625, + "learning_rate": 0.0009725004093573342, + "loss": 2048.8902, + "step": 1150 + }, + { + "ce_loss_12": 3.6162595748901367, + "ce_loss_17": 3.117063570022583, + "ce_loss_23": 2.8703216075897218, + "ce_loss_3": 4.53184130191803, + "ce_loss_6": 4.167289030551911, + "epoch": 0.116, + "grad_norm": 1064.0, + "kl_loss_12": 1582.6757202148438, + "kl_loss_17": 511.1013519287109, + "kl_loss_3": 3389.1841430664062, + "kl_loss_6": 2682.0026000976563, + "learning_rate": 0.0009719790845697534, + "loss": 2028.8348, + "step": 1160 + }, + { + "ce_loss_12": 3.5447800517082215, + "ce_loss_17": 3.0915211915969847, + "ce_loss_23": 2.8394047379493714, + "ce_loss_3": 4.442968082427979, + "ce_loss_6": 4.093847930431366, + "epoch": 0.117, + "grad_norm": 1304.0, + "kl_loss_12": 1529.0038330078125, + "kl_loss_17": 528.0925674438477, + "kl_loss_3": 3300.722692871094, + "kl_loss_6": 2623.0964965820312, + "learning_rate": 0.0009714530069580309, + "loss": 2001.9266, + "step": 1170 + }, + { + "ce_loss_12": 3.648385465145111, + "ce_loss_17": 3.163987386226654, + "ce_loss_23": 2.906623864173889, + "ce_loss_3": 4.570439338684082, + "ce_loss_6": 4.20642215013504, + "epoch": 0.118, + "grad_norm": 976.0, + "kl_loss_12": 1599.7222900390625, + "kl_loss_17": 545.9570709228516, + "kl_loss_3": 3392.21904296875, + "kl_loss_6": 2702.566052246094, + "learning_rate": 0.0009709221818197624, + "loss": 2045.9377, + "step": 1180 + }, + { + "ce_loss_12": 3.69852100610733, + "ce_loss_17": 3.195405375957489, + "ce_loss_23": 2.955050897598267, + "ce_loss_3": 4.611287760734558, + "ce_loss_6": 4.259045755863189, + "epoch": 0.119, + "grad_norm": 996.0, + "kl_loss_12": 1595.0409118652344, + "kl_loss_17": 510.2501693725586, + "kl_loss_3": 3394.6732788085938, + "kl_loss_6": 2711.4011474609374, + "learning_rate": 0.0009703866145003512, + "loss": 2048.791, + "step": 1190 + }, + { + "ce_loss_12": 3.6461198687553407, + "ce_loss_17": 3.1560633540153504, + "ce_loss_23": 2.9288500785827636, + "ce_loss_3": 4.544662046432495, + "ce_loss_6": 4.196171057224274, + "epoch": 0.12, + "grad_norm": 844.0, + "kl_loss_12": 1562.8870666503906, + "kl_loss_17": 476.9662322998047, + "kl_loss_3": 3339.506799316406, + "kl_loss_6": 2660.272998046875, + "learning_rate": 0.0009698463103929542, + "loss": 2039.2613, + "step": 1200 + }, + { + "ce_loss_12": 3.6362263798713683, + "ce_loss_17": 3.118537414073944, + "ce_loss_23": 2.890295457839966, + "ce_loss_3": 4.55160620212555, + "ce_loss_6": 4.197222125530243, + "epoch": 0.121, + "grad_norm": 840.0, + "kl_loss_12": 1606.2838256835937, + "kl_loss_17": 482.26617736816405, + "kl_loss_3": 3396.402978515625, + "kl_loss_6": 2713.9810546875, + "learning_rate": 0.0009693012749384279, + "loss": 2055.3727, + "step": 1210 + }, + { + "ce_loss_12": 3.620063138008118, + "ce_loss_17": 3.11392560005188, + "ce_loss_23": 2.894404947757721, + "ce_loss_3": 4.517776942253112, + "ce_loss_6": 4.170427846908569, + "epoch": 0.122, + "grad_norm": 896.0, + "kl_loss_12": 1568.8251892089843, + "kl_loss_17": 472.27161560058596, + "kl_loss_3": 3342.350476074219, + "kl_loss_6": 2666.2023193359373, + "learning_rate": 0.0009687515136252732, + "loss": 1995.2891, + "step": 1220 + }, + { + "ce_loss_12": 3.6086941242218016, + "ce_loss_17": 3.079377865791321, + "ce_loss_23": 2.861296200752258, + "ce_loss_3": 4.553121948242188, + "ce_loss_6": 4.194600236415863, + "epoch": 0.123, + "grad_norm": 920.0, + "kl_loss_12": 1611.9838623046876, + "kl_loss_17": 462.04967346191404, + "kl_loss_3": 3465.6461181640625, + "kl_loss_6": 2771.3404052734377, + "learning_rate": 0.0009681970319895803, + "loss": 2132.065, + "step": 1230 + }, + { + "ce_loss_12": 3.6665617704391478, + "ce_loss_17": 3.152220356464386, + "ce_loss_23": 2.9416170954704284, + "ce_loss_3": 4.568020367622376, + "ce_loss_6": 4.2054404497146605, + "epoch": 0.124, + "grad_norm": 1004.0, + "kl_loss_12": 1559.611993408203, + "kl_loss_17": 455.8671081542969, + "kl_loss_3": 3328.233264160156, + "kl_loss_6": 2633.26875, + "learning_rate": 0.0009676378356149733, + "loss": 1992.5469, + "step": 1240 + }, + { + "ce_loss_12": 3.606646728515625, + "ce_loss_17": 3.105606806278229, + "ce_loss_23": 2.9098575115203857, + "ce_loss_3": 4.509173798561096, + "ce_loss_6": 4.152543914318085, + "epoch": 0.125, + "grad_norm": 732.0, + "kl_loss_12": 1517.0133972167969, + "kl_loss_17": 436.5280731201172, + "kl_loss_3": 3289.8114501953123, + "kl_loss_6": 2606.249169921875, + "learning_rate": 0.0009670739301325534, + "loss": 1980.5193, + "step": 1250 + }, + { + "ce_loss_12": 3.5973118901252747, + "ce_loss_17": 3.0886258721351623, + "ce_loss_23": 2.8696954250335693, + "ce_loss_3": 4.481967115402222, + "ce_loss_6": 4.128626692295074, + "epoch": 0.126, + "grad_norm": 776.0, + "kl_loss_12": 1570.6747680664062, + "kl_loss_17": 457.1540771484375, + "kl_loss_3": 3311.004675292969, + "kl_loss_6": 2629.484521484375, + "learning_rate": 0.0009665053212208426, + "loss": 2021.3977, + "step": 1260 + }, + { + "ce_loss_12": 3.637851631641388, + "ce_loss_17": 3.1208248853683473, + "ce_loss_23": 2.904267394542694, + "ce_loss_3": 4.546870756149292, + "ce_loss_6": 4.183465051651001, + "epoch": 0.127, + "grad_norm": 920.0, + "kl_loss_12": 1595.9552673339845, + "kl_loss_17": 469.6421890258789, + "kl_loss_3": 3385.359619140625, + "kl_loss_6": 2681.4626708984374, + "learning_rate": 0.0009659320146057262, + "loss": 2027.4336, + "step": 1270 + }, + { + "ce_loss_12": 3.637918245792389, + "ce_loss_17": 3.1327763319015505, + "ce_loss_23": 2.92303524017334, + "ce_loss_3": 4.516714310646057, + "ce_loss_6": 4.169120907783508, + "epoch": 0.128, + "grad_norm": 1088.0, + "kl_loss_12": 1543.7512573242188, + "kl_loss_17": 451.88436431884764, + "kl_loss_3": 3297.9475708007812, + "kl_loss_6": 2619.7408935546873, + "learning_rate": 0.0009653540160603955, + "loss": 1990.6383, + "step": 1280 + }, + { + "ce_loss_12": 3.632475471496582, + "ce_loss_17": 3.1240146160125732, + "ce_loss_23": 2.922209882736206, + "ce_loss_3": 4.500276255607605, + "ce_loss_6": 4.169117403030396, + "epoch": 0.129, + "grad_norm": 1280.0, + "kl_loss_12": 1547.4067016601562, + "kl_loss_17": 442.0089508056641, + "kl_loss_3": 3277.050927734375, + "kl_loss_6": 2635.888952636719, + "learning_rate": 0.0009647713314052896, + "loss": 1965.3996, + "step": 1290 + }, + { + "ce_loss_12": 3.613161361217499, + "ce_loss_17": 3.081482803821564, + "ce_loss_23": 2.8680325150489807, + "ce_loss_3": 4.532515811920166, + "ce_loss_6": 4.186613607406616, + "epoch": 0.13, + "grad_norm": 956.0, + "kl_loss_12": 1596.8814331054687, + "kl_loss_17": 452.1222091674805, + "kl_loss_3": 3415.5057739257813, + "kl_loss_6": 2745.046984863281, + "learning_rate": 0.0009641839665080363, + "loss": 2033.1896, + "step": 1300 + }, + { + "ce_loss_12": 3.562652254104614, + "ce_loss_17": 3.0530904650688173, + "ce_loss_23": 2.8504271388053892, + "ce_loss_3": 4.474567329883575, + "ce_loss_6": 4.133738553524017, + "epoch": 0.131, + "grad_norm": 784.0, + "kl_loss_12": 1531.3059204101562, + "kl_loss_17": 432.8893524169922, + "kl_loss_3": 3327.0203857421875, + "kl_loss_6": 2671.88740234375, + "learning_rate": 0.0009635919272833937, + "loss": 1965.8117, + "step": 1310 + }, + { + "ce_loss_12": 3.5924949288368224, + "ce_loss_17": 3.0808891892433166, + "ce_loss_23": 2.8723941087722777, + "ce_loss_3": 4.503520846366882, + "ce_loss_6": 4.153036797046662, + "epoch": 0.132, + "grad_norm": 1048.0, + "kl_loss_12": 1532.0396484375, + "kl_loss_17": 437.7684783935547, + "kl_loss_3": 3331.197619628906, + "kl_loss_6": 2647.5683471679686, + "learning_rate": 0.0009629952196931902, + "loss": 1948.2967, + "step": 1320 + }, + { + "ce_loss_12": 3.5496832847595217, + "ce_loss_17": 3.0600673079490663, + "ce_loss_23": 2.86389017701149, + "ce_loss_3": 4.480193614959717, + "ce_loss_6": 4.126709806919098, + "epoch": 0.133, + "grad_norm": 920.0, + "kl_loss_12": 1499.148486328125, + "kl_loss_17": 434.60589294433595, + "kl_loss_3": 3326.1408447265626, + "kl_loss_6": 2646.4357788085936, + "learning_rate": 0.0009623938497462645, + "loss": 1959.7061, + "step": 1330 + }, + { + "ce_loss_12": 3.5570675134658813, + "ce_loss_17": 3.054165482521057, + "ce_loss_23": 2.8529660224914553, + "ce_loss_3": 4.460046792030335, + "ce_loss_6": 4.117917943000793, + "epoch": 0.134, + "grad_norm": 1240.0, + "kl_loss_12": 1530.305157470703, + "kl_loss_17": 441.2641830444336, + "kl_loss_3": 3308.531140136719, + "kl_loss_6": 2643.5994262695312, + "learning_rate": 0.0009617878234984055, + "loss": 1994.7871, + "step": 1340 + }, + { + "ce_loss_12": 3.614206826686859, + "ce_loss_17": 3.138014531135559, + "ce_loss_23": 2.938678300380707, + "ce_loss_3": 4.502785682678223, + "ce_loss_6": 4.152660512924195, + "epoch": 0.135, + "grad_norm": 1088.0, + "kl_loss_12": 1482.2549621582032, + "kl_loss_17": 428.1679397583008, + "kl_loss_3": 3236.7653930664064, + "kl_loss_6": 2564.1514282226562, + "learning_rate": 0.0009611771470522907, + "loss": 1953.1434, + "step": 1350 + }, + { + "ce_loss_12": 3.5755207896232606, + "ce_loss_17": 3.0807791352272034, + "ce_loss_23": 2.8702484488487245, + "ce_loss_3": 4.491902422904968, + "ce_loss_6": 4.136397743225098, + "epoch": 0.136, + "grad_norm": 772.0, + "kl_loss_12": 1496.8853271484375, + "kl_loss_17": 433.81241302490236, + "kl_loss_3": 3306.8744140625, + "kl_loss_6": 2615.6809326171874, + "learning_rate": 0.0009605618265574251, + "loss": 1944.1047, + "step": 1360 + }, + { + "ce_loss_12": 3.5688475489616396, + "ce_loss_17": 3.0453089475631714, + "ce_loss_23": 2.8437607407569887, + "ce_loss_3": 4.478715562820435, + "ce_loss_6": 4.137208127975464, + "epoch": 0.137, + "grad_norm": 812.0, + "kl_loss_12": 1564.785009765625, + "kl_loss_17": 442.6109817504883, + "kl_loss_3": 3373.6415283203123, + "kl_loss_6": 2701.3059326171874, + "learning_rate": 0.0009599418682100792, + "loss": 1991.9086, + "step": 1370 + }, + { + "ce_loss_12": 3.5749759554862974, + "ce_loss_17": 3.0835989832878115, + "ce_loss_23": 2.8831868410110473, + "ce_loss_3": 4.501394367218017, + "ce_loss_6": 4.144296360015869, + "epoch": 0.138, + "grad_norm": 804.0, + "kl_loss_12": 1509.7307250976562, + "kl_loss_17": 432.77608184814454, + "kl_loss_3": 3318.9124755859375, + "kl_loss_6": 2630.0392578125, + "learning_rate": 0.0009593172782532268, + "loss": 1973.3191, + "step": 1380 + }, + { + "ce_loss_12": 3.6028687715530396, + "ce_loss_17": 3.1176024317741393, + "ce_loss_23": 2.9136727333068846, + "ce_loss_3": 4.505716633796692, + "ce_loss_6": 4.151191294193268, + "epoch": 0.139, + "grad_norm": 848.0, + "kl_loss_12": 1499.2755004882813, + "kl_loss_17": 433.8134765625, + "kl_loss_3": 3275.23427734375, + "kl_loss_6": 2584.3994384765624, + "learning_rate": 0.0009586880629764817, + "loss": 1947.1092, + "step": 1390 + }, + { + "ce_loss_12": 3.5526779294013977, + "ce_loss_17": 3.062094521522522, + "ce_loss_23": 2.851840627193451, + "ce_loss_3": 4.468865394592285, + "ce_loss_6": 4.124651682376862, + "epoch": 0.14, + "grad_norm": 768.0, + "kl_loss_12": 1505.257391357422, + "kl_loss_17": 441.1879943847656, + "kl_loss_3": 3301.0413330078127, + "kl_loss_6": 2638.0770751953123, + "learning_rate": 0.0009580542287160348, + "loss": 1941.148, + "step": 1400 + }, + { + "ce_loss_12": 3.5124378323554994, + "ce_loss_17": 3.026162827014923, + "ce_loss_23": 2.8164494037628174, + "ce_loss_3": 4.431016111373902, + "ce_loss_6": 4.072873175144196, + "epoch": 0.141, + "grad_norm": 868.0, + "kl_loss_12": 1509.1204895019532, + "kl_loss_17": 451.11576385498046, + "kl_loss_3": 3324.8519775390623, + "kl_loss_6": 2641.1728759765624, + "learning_rate": 0.0009574157818545901, + "loss": 1946.1949, + "step": 1410 + }, + { + "ce_loss_12": 3.559919762611389, + "ce_loss_17": 3.088271903991699, + "ce_loss_23": 2.888034200668335, + "ce_loss_3": 4.447533106803894, + "ce_loss_6": 4.106405591964721, + "epoch": 0.142, + "grad_norm": 1032.0, + "kl_loss_12": 1453.2306091308594, + "kl_loss_17": 428.4953872680664, + "kl_loss_3": 3214.4719360351564, + "kl_loss_6": 2558.95302734375, + "learning_rate": 0.0009567727288213005, + "loss": 1957.8348, + "step": 1420 + }, + { + "ce_loss_12": 3.5663437843322754, + "ce_loss_17": 3.079855978488922, + "ce_loss_23": 2.8650198936462403, + "ce_loss_3": 4.477605485916138, + "ce_loss_6": 4.123521387577057, + "epoch": 0.143, + "grad_norm": 912.0, + "kl_loss_12": 1524.784912109375, + "kl_loss_17": 462.08311462402344, + "kl_loss_3": 3328.7591918945313, + "kl_loss_6": 2644.153955078125, + "learning_rate": 0.0009561250760917027, + "loss": 1960.2055, + "step": 1430 + }, + { + "ce_loss_12": 3.5682985663414, + "ce_loss_17": 3.092578685283661, + "ce_loss_23": 2.877074158191681, + "ce_loss_3": 4.466118907928466, + "ce_loss_6": 4.126705014705658, + "epoch": 0.144, + "grad_norm": 984.0, + "kl_loss_12": 1512.0598266601562, + "kl_loss_17": 466.4274627685547, + "kl_loss_3": 3282.262243652344, + "kl_loss_6": 2622.5625, + "learning_rate": 0.0009554728301876525, + "loss": 1924.617, + "step": 1440 + }, + { + "ce_loss_12": 3.6111833572387697, + "ce_loss_17": 3.1243973851203917, + "ce_loss_23": 2.9122658252716063, + "ce_loss_3": 4.4899966478347775, + "ce_loss_6": 4.147026920318604, + "epoch": 0.145, + "grad_norm": 780.0, + "kl_loss_12": 1503.76845703125, + "kl_loss_17": 453.64098052978517, + "kl_loss_3": 3236.64072265625, + "kl_loss_6": 2568.684216308594, + "learning_rate": 0.0009548159976772592, + "loss": 1997.7438, + "step": 1450 + }, + { + "ce_loss_12": 3.5708705306053163, + "ce_loss_17": 3.0883104681968687, + "ce_loss_23": 2.874672222137451, + "ce_loss_3": 4.492368054389954, + "ce_loss_6": 4.138778901100158, + "epoch": 0.146, + "grad_norm": 868.0, + "kl_loss_12": 1515.8531860351563, + "kl_loss_17": 453.29029693603513, + "kl_loss_3": 3321.64169921875, + "kl_loss_6": 2639.4420654296873, + "learning_rate": 0.0009541545851748186, + "loss": 1960.3789, + "step": 1460 + }, + { + "ce_loss_12": 3.4660339832305906, + "ce_loss_17": 2.9614037394523622, + "ce_loss_23": 2.751054549217224, + "ce_loss_3": 4.412300229072571, + "ce_loss_6": 4.054196834564209, + "epoch": 0.147, + "grad_norm": 964.0, + "kl_loss_12": 1522.1134399414063, + "kl_loss_17": 445.31788482666013, + "kl_loss_3": 3382.094104003906, + "kl_loss_6": 2679.044787597656, + "learning_rate": 0.0009534885993407473, + "loss": 1979.6027, + "step": 1470 + }, + { + "ce_loss_12": 3.591128182411194, + "ce_loss_17": 3.1027103424072267, + "ce_loss_23": 2.9019294023513793, + "ce_loss_3": 4.5010672330856325, + "ce_loss_6": 4.15693166255951, + "epoch": 0.148, + "grad_norm": 844.0, + "kl_loss_12": 1484.3631774902344, + "kl_loss_17": 428.38403778076173, + "kl_loss_3": 3270.4963989257812, + "kl_loss_6": 2609.3266723632814, + "learning_rate": 0.0009528180468815154, + "loss": 1953.434, + "step": 1480 + }, + { + "ce_loss_12": 3.6338334441185, + "ce_loss_17": 3.1534486651420592, + "ce_loss_23": 2.961154115200043, + "ce_loss_3": 4.526342129707336, + "ce_loss_6": 4.185048532485962, + "epoch": 0.149, + "grad_norm": 948.0, + "kl_loss_12": 1481.1465270996093, + "kl_loss_17": 430.0911437988281, + "kl_loss_3": 3230.5956787109376, + "kl_loss_6": 2562.666149902344, + "learning_rate": 0.0009521429345495787, + "loss": 1933.2992, + "step": 1490 + }, + { + "ce_loss_12": 3.598537302017212, + "ce_loss_17": 3.1211525917053224, + "ce_loss_23": 2.9331879019737244, + "ce_loss_3": 4.479608464241028, + "ce_loss_6": 4.137439227104187, + "epoch": 0.15, + "grad_norm": 960.0, + "kl_loss_12": 1462.8320007324219, + "kl_loss_17": 407.0600341796875, + "kl_loss_3": 3211.813586425781, + "kl_loss_6": 2545.463244628906, + "learning_rate": 0.0009514632691433108, + "loss": 1927.2254, + "step": 1500 + }, + { + "ce_loss_12": 3.5884127616882324, + "ce_loss_17": 3.0939092874526977, + "ce_loss_23": 2.8999706745147704, + "ce_loss_3": 4.481277489662171, + "ce_loss_6": 4.125718057155609, + "epoch": 0.151, + "grad_norm": 948.0, + "kl_loss_12": 1493.9746704101562, + "kl_loss_17": 429.4934387207031, + "kl_loss_3": 3266.7730102539062, + "kl_loss_6": 2581.5667236328127, + "learning_rate": 0.0009507790575069346, + "loss": 1948.7154, + "step": 1510 + }, + { + "ce_loss_12": 3.5742282629013062, + "ce_loss_17": 3.06930810213089, + "ce_loss_23": 2.8608696103096007, + "ce_loss_3": 4.483838415145874, + "ce_loss_6": 4.137926387786865, + "epoch": 0.152, + "grad_norm": 880.0, + "kl_loss_12": 1521.2948486328125, + "kl_loss_17": 445.56875, + "kl_loss_3": 3322.2888793945312, + "kl_loss_6": 2635.8153564453123, + "learning_rate": 0.0009500903065304539, + "loss": 1989.816, + "step": 1520 + }, + { + "ce_loss_12": 3.561050260066986, + "ce_loss_17": 3.0977537989616395, + "ce_loss_23": 2.9062703967094423, + "ce_loss_3": 4.4570392847061155, + "ce_loss_6": 4.105130457878113, + "epoch": 0.153, + "grad_norm": 988.0, + "kl_loss_12": 1437.3990417480468, + "kl_loss_17": 418.8599227905273, + "kl_loss_3": 3199.603796386719, + "kl_loss_6": 2526.914514160156, + "learning_rate": 0.0009493970231495835, + "loss": 1924.5223, + "step": 1530 + }, + { + "ce_loss_12": 3.5027644991874696, + "ce_loss_17": 3.0454777598381044, + "ce_loss_23": 2.8620609521865843, + "ce_loss_3": 4.382567358016968, + "ce_loss_6": 4.0437868475914005, + "epoch": 0.154, + "grad_norm": 916.0, + "kl_loss_12": 1412.9682373046876, + "kl_loss_17": 408.1348571777344, + "kl_loss_3": 3152.004931640625, + "kl_loss_6": 2493.6770629882812, + "learning_rate": 0.0009486992143456792, + "loss": 1884.0617, + "step": 1540 + }, + { + "ce_loss_12": 3.599140226840973, + "ce_loss_17": 3.080864226818085, + "ce_loss_23": 2.8673729300498962, + "ce_loss_3": 4.539337968826294, + "ce_loss_6": 4.187305796146393, + "epoch": 0.155, + "grad_norm": 820.0, + "kl_loss_12": 1557.6886962890626, + "kl_loss_17": 457.3323699951172, + "kl_loss_3": 3423.3054809570312, + "kl_loss_6": 2738.628857421875, + "learning_rate": 0.0009479968871456679, + "loss": 1985.3701, + "step": 1550 + }, + { + "ce_loss_12": 3.543421280384064, + "ce_loss_17": 3.055658221244812, + "ce_loss_23": 2.8478148102760317, + "ce_loss_3": 4.466424870491028, + "ce_loss_6": 4.108865821361542, + "epoch": 0.156, + "grad_norm": 1232.0, + "kl_loss_12": 1501.4693420410156, + "kl_loss_17": 452.0060317993164, + "kl_loss_3": 3327.9519897460937, + "kl_loss_6": 2635.226696777344, + "learning_rate": 0.0009472900486219768, + "loss": 1935.15, + "step": 1560 + }, + { + "ce_loss_12": 3.51930969953537, + "ce_loss_17": 3.0501848578453066, + "ce_loss_23": 2.839846873283386, + "ce_loss_3": 4.412303018569946, + "ce_loss_6": 4.060588872432708, + "epoch": 0.157, + "grad_norm": 1020.0, + "kl_loss_12": 1464.4576171875, + "kl_loss_17": 446.97486267089846, + "kl_loss_3": 3240.67470703125, + "kl_loss_6": 2549.5041015625, + "learning_rate": 0.000946578705892462, + "loss": 1938.8553, + "step": 1570 + }, + { + "ce_loss_12": 3.5285266995429994, + "ce_loss_17": 3.0789849877357485, + "ce_loss_23": 2.8734943747520445, + "ce_loss_3": 4.422140288352966, + "ce_loss_6": 4.075236809253693, + "epoch": 0.158, + "grad_norm": 768.0, + "kl_loss_12": 1412.7643310546875, + "kl_loss_17": 430.24098052978513, + "kl_loss_3": 3183.34345703125, + "kl_loss_6": 2502.259240722656, + "learning_rate": 0.0009458628661203367, + "loss": 1917.8164, + "step": 1580 + }, + { + "ce_loss_12": 3.571163034439087, + "ce_loss_17": 3.0778075575828554, + "ce_loss_23": 2.8818636178970336, + "ce_loss_3": 4.490660810470581, + "ce_loss_6": 4.131191611289978, + "epoch": 0.159, + "grad_norm": 1064.0, + "kl_loss_12": 1492.7530334472656, + "kl_loss_17": 436.4660186767578, + "kl_loss_3": 3318.567431640625, + "kl_loss_6": 2629.0233520507813, + "learning_rate": 0.0009451425365140996, + "loss": 1908.1531, + "step": 1590 + }, + { + "ce_loss_12": 3.594418454170227, + "ce_loss_17": 3.138779675960541, + "ce_loss_23": 2.9434367418289185, + "ce_loss_3": 4.466434574127197, + "ce_loss_6": 4.127680397033691, + "epoch": 0.16, + "grad_norm": 1144.0, + "kl_loss_12": 1422.8166015625, + "kl_loss_17": 416.85819549560546, + "kl_loss_3": 3158.031591796875, + "kl_loss_6": 2484.4607055664064, + "learning_rate": 0.0009444177243274617, + "loss": 1869.9365, + "step": 1600 + }, + { + "ce_loss_12": 3.498467671871185, + "ce_loss_17": 3.0184685468673704, + "ce_loss_23": 2.8139328956604004, + "ce_loss_3": 4.4108422040939335, + "ce_loss_6": 4.0624502301216125, + "epoch": 0.161, + "grad_norm": 860.0, + "kl_loss_12": 1490.5886535644531, + "kl_loss_17": 436.47232208251955, + "kl_loss_3": 3284.385534667969, + "kl_loss_6": 2611.4808715820313, + "learning_rate": 0.0009436884368592739, + "loss": 1933.7881, + "step": 1610 + }, + { + "ce_loss_12": 3.527223265171051, + "ce_loss_17": 3.0564520359039307, + "ce_loss_23": 2.8607160449028015, + "ce_loss_3": 4.418679213523864, + "ce_loss_6": 4.0709424138069155, + "epoch": 0.162, + "grad_norm": 876.0, + "kl_loss_12": 1436.9969604492187, + "kl_loss_17": 422.61949920654297, + "kl_loss_3": 3201.344482421875, + "kl_loss_6": 2530.6149291992188, + "learning_rate": 0.0009429546814534529, + "loss": 1937.7432, + "step": 1620 + }, + { + "ce_loss_12": 3.523609459400177, + "ce_loss_17": 3.058007037639618, + "ce_loss_23": 2.8733000993728637, + "ce_loss_3": 4.4185140371322635, + "ce_loss_6": 4.068736577033997, + "epoch": 0.163, + "grad_norm": 772.0, + "kl_loss_12": 1429.8490417480468, + "kl_loss_17": 410.2163909912109, + "kl_loss_3": 3195.8261840820314, + "kl_loss_6": 2515.595849609375, + "learning_rate": 0.0009422164654989072, + "loss": 1870.0762, + "step": 1630 + }, + { + "ce_loss_12": 3.613458776473999, + "ce_loss_17": 3.1591226935386656, + "ce_loss_23": 2.9718995690345764, + "ce_loss_3": 4.496907019615174, + "ce_loss_6": 4.157380700111389, + "epoch": 0.164, + "grad_norm": 1296.0, + "kl_loss_12": 1414.0604736328125, + "kl_loss_17": 415.0271514892578, + "kl_loss_3": 3165.8775146484377, + "kl_loss_6": 2493.7409057617188, + "learning_rate": 0.0009414737964294635, + "loss": 1882.8074, + "step": 1640 + }, + { + "ce_loss_12": 3.534224784374237, + "ce_loss_17": 3.0942984342575075, + "ce_loss_23": 2.9147730588912966, + "ce_loss_3": 4.40143461227417, + "ce_loss_6": 4.060476922988892, + "epoch": 0.165, + "grad_norm": 1064.0, + "kl_loss_12": 1357.9927368164062, + "kl_loss_17": 398.0754959106445, + "kl_loss_3": 3082.7246826171877, + "kl_loss_6": 2413.491162109375, + "learning_rate": 0.000940726681723791, + "loss": 1866.6898, + "step": 1650 + }, + { + "ce_loss_12": 3.450613260269165, + "ce_loss_17": 2.9654441356658934, + "ce_loss_23": 2.7755810499191282, + "ce_loss_3": 4.376272583007813, + "ce_loss_6": 4.026358532905578, + "epoch": 0.166, + "grad_norm": 1056.0, + "kl_loss_12": 1461.2494140625, + "kl_loss_17": 416.39320678710936, + "kl_loss_3": 3303.5306762695313, + "kl_loss_6": 2606.460546875, + "learning_rate": 0.0009399751289053266, + "loss": 1874.083, + "step": 1660 + }, + { + "ce_loss_12": 3.592409574985504, + "ce_loss_17": 3.1445668935775757, + "ce_loss_23": 2.953047680854797, + "ce_loss_3": 4.4779764175415036, + "ce_loss_6": 4.136311149597168, + "epoch": 0.167, + "grad_norm": 884.0, + "kl_loss_12": 1394.46318359375, + "kl_loss_17": 410.1771575927734, + "kl_loss_3": 3157.8220092773436, + "kl_loss_6": 2486.7565795898436, + "learning_rate": 0.0009392191455421988, + "loss": 1895.3063, + "step": 1670 + }, + { + "ce_loss_12": 3.590947449207306, + "ce_loss_17": 3.136911916732788, + "ce_loss_23": 2.9433919906616213, + "ce_loss_3": 4.479397130012512, + "ce_loss_6": 4.129423558712006, + "epoch": 0.168, + "grad_norm": 1032.0, + "kl_loss_12": 1427.6645935058593, + "kl_loss_17": 427.07548828125, + "kl_loss_3": 3183.6565185546874, + "kl_loss_6": 2503.090478515625, + "learning_rate": 0.0009384587392471515, + "loss": 1849.109, + "step": 1680 + }, + { + "ce_loss_12": 3.562043583393097, + "ce_loss_17": 3.113064157962799, + "ce_loss_23": 2.9287237524986267, + "ce_loss_3": 4.418517231941223, + "ce_loss_6": 4.085638308525086, + "epoch": 0.169, + "grad_norm": 1880.0, + "kl_loss_12": 1395.710760498047, + "kl_loss_17": 402.3734420776367, + "kl_loss_3": 3108.858435058594, + "kl_loss_6": 2451.7667602539063, + "learning_rate": 0.0009376939176774678, + "loss": 1843.5416, + "step": 1690 + }, + { + "ce_loss_12": 3.550597834587097, + "ce_loss_17": 3.0912911772727965, + "ce_loss_23": 2.901586651802063, + "ce_loss_3": 4.4385639190673825, + "ce_loss_6": 4.095955264568329, + "epoch": 0.17, + "grad_norm": 796.0, + "kl_loss_12": 1405.624041748047, + "kl_loss_17": 409.81105651855466, + "kl_loss_3": 3160.742041015625, + "kl_loss_6": 2495.841442871094, + "learning_rate": 0.0009369246885348925, + "loss": 1891.4168, + "step": 1700 + }, + { + "ce_loss_12": 3.5553791761398315, + "ce_loss_17": 3.076837086677551, + "ce_loss_23": 2.8917274832725526, + "ce_loss_3": 4.461563658714295, + "ce_loss_6": 4.118083095550537, + "epoch": 0.171, + "grad_norm": 900.0, + "kl_loss_12": 1449.8334106445313, + "kl_loss_17": 407.05690307617186, + "kl_loss_3": 3231.3767700195312, + "kl_loss_6": 2578.7130126953125, + "learning_rate": 0.0009361510595655545, + "loss": 1901.1266, + "step": 1710 + }, + { + "ce_loss_12": 3.528924262523651, + "ce_loss_17": 3.053530716896057, + "ce_loss_23": 2.852275025844574, + "ce_loss_3": 4.40249902009964, + "ce_loss_6": 4.066362845897674, + "epoch": 0.172, + "grad_norm": 924.0, + "kl_loss_12": 1457.4590148925781, + "kl_loss_17": 424.09732818603516, + "kl_loss_3": 3216.18837890625, + "kl_loss_6": 2553.196826171875, + "learning_rate": 0.0009353730385598887, + "loss": 1891.0375, + "step": 1720 + }, + { + "ce_loss_12": 3.4698773980140687, + "ce_loss_17": 2.9942647933959963, + "ce_loss_23": 2.8003952622413637, + "ce_loss_3": 4.3849413871765135, + "ce_loss_6": 4.0349366664886475, + "epoch": 0.173, + "grad_norm": 1096.0, + "kl_loss_12": 1450.6549926757812, + "kl_loss_17": 420.19408721923827, + "kl_loss_3": 3264.2931640625, + "kl_loss_6": 2581.879248046875, + "learning_rate": 0.0009345906333525581, + "loss": 1917.2275, + "step": 1730 + }, + { + "ce_loss_12": 3.487299084663391, + "ce_loss_17": 3.0379632472991944, + "ce_loss_23": 2.832411003112793, + "ce_loss_3": 4.398686468601227, + "ce_loss_6": 4.05125447511673, + "epoch": 0.174, + "grad_norm": 1064.0, + "kl_loss_12": 1426.8994995117187, + "kl_loss_17": 433.52008056640625, + "kl_loss_3": 3225.7196655273438, + "kl_loss_6": 2552.5757690429687, + "learning_rate": 0.0009338038518223745, + "loss": 1887.9564, + "step": 1740 + }, + { + "ce_loss_12": 3.549421525001526, + "ce_loss_17": 3.088615870475769, + "ce_loss_23": 2.8778313755989076, + "ce_loss_3": 4.448174166679382, + "ce_loss_6": 4.108810389041901, + "epoch": 0.175, + "grad_norm": 1176.0, + "kl_loss_12": 1455.715264892578, + "kl_loss_17": 449.5879791259766, + "kl_loss_3": 3243.080114746094, + "kl_loss_6": 2573.329064941406, + "learning_rate": 0.0009330127018922195, + "loss": 1961.3834, + "step": 1750 + }, + { + "ce_loss_12": 3.505111026763916, + "ce_loss_17": 3.049764132499695, + "ce_loss_23": 2.850164294242859, + "ce_loss_3": 4.395489168167114, + "ce_loss_6": 4.059174644947052, + "epoch": 0.176, + "grad_norm": 1024.0, + "kl_loss_12": 1426.4638122558595, + "kl_loss_17": 425.43538665771484, + "kl_loss_3": 3204.3296508789062, + "kl_loss_6": 2542.2288818359375, + "learning_rate": 0.0009322171915289634, + "loss": 1899.1223, + "step": 1760 + }, + { + "ce_loss_12": 3.518779253959656, + "ce_loss_17": 3.0761126041412354, + "ce_loss_23": 2.895628201961517, + "ce_loss_3": 4.394254040718079, + "ce_loss_6": 4.065056955814361, + "epoch": 0.177, + "grad_norm": 860.0, + "kl_loss_12": 1398.0104370117188, + "kl_loss_17": 413.5629913330078, + "kl_loss_3": 3148.1657836914064, + "kl_loss_6": 2497.1087890625, + "learning_rate": 0.0009314173287433873, + "loss": 1852.8078, + "step": 1770 + }, + { + "ce_loss_12": 3.51189923286438, + "ce_loss_17": 3.0561089396476744, + "ce_loss_23": 2.8649056434631346, + "ce_loss_3": 4.407578945159912, + "ce_loss_6": 4.058146071434021, + "epoch": 0.178, + "grad_norm": 940.0, + "kl_loss_12": 1422.9045959472655, + "kl_loss_17": 421.48497161865237, + "kl_loss_3": 3196.1125244140626, + "kl_loss_6": 2527.5512084960938, + "learning_rate": 0.0009306131215901003, + "loss": 1853.0984, + "step": 1780 + }, + { + "ce_loss_12": 3.5367927074432375, + "ce_loss_17": 3.089224076271057, + "ce_loss_23": 2.904616725444794, + "ce_loss_3": 4.4222025871276855, + "ce_loss_6": 4.077728879451752, + "epoch": 0.179, + "grad_norm": 1056.0, + "kl_loss_12": 1400.3500183105468, + "kl_loss_17": 412.55066833496096, + "kl_loss_3": 3161.286022949219, + "kl_loss_6": 2497.1654052734375, + "learning_rate": 0.0009298045781674596, + "loss": 1833.8826, + "step": 1790 + }, + { + "ce_loss_12": 3.5144901275634766, + "ce_loss_17": 3.0709169149398803, + "ce_loss_23": 2.879539155960083, + "ce_loss_3": 4.379846239089966, + "ce_loss_6": 4.051528000831604, + "epoch": 0.18, + "grad_norm": 1120.0, + "kl_loss_12": 1391.6433166503907, + "kl_loss_17": 422.61016235351565, + "kl_loss_3": 3123.181042480469, + "kl_loss_6": 2487.7217407226562, + "learning_rate": 0.0009289917066174886, + "loss": 1872.4902, + "step": 1800 + }, + { + "ce_loss_12": 3.489149105548859, + "ce_loss_17": 3.0734178900718687, + "ce_loss_23": 2.8796284437179565, + "ce_loss_3": 4.333601975440979, + "ce_loss_6": 4.019747495651245, + "epoch": 0.181, + "grad_norm": 1104.0, + "kl_loss_12": 1349.7888732910155, + "kl_loss_17": 424.239079284668, + "kl_loss_3": 3048.8976928710936, + "kl_loss_6": 2417.6389282226564, + "learning_rate": 0.0009281745151257945, + "loss": 1826.0723, + "step": 1810 + }, + { + "ce_loss_12": 3.53800128698349, + "ce_loss_17": 3.093510103225708, + "ce_loss_23": 2.8986947774887084, + "ce_loss_3": 4.429437685012817, + "ce_loss_6": 4.085277903079986, + "epoch": 0.182, + "grad_norm": 916.0, + "kl_loss_12": 1390.8755798339844, + "kl_loss_17": 430.1387771606445, + "kl_loss_3": 3154.4113159179688, + "kl_loss_6": 2487.5041625976564, + "learning_rate": 0.0009273530119214868, + "loss": 1881.2629, + "step": 1820 + }, + { + "ce_loss_12": 3.6042762279510496, + "ce_loss_17": 3.167820060253143, + "ce_loss_23": 2.9859367847442626, + "ce_loss_3": 4.4725106954574585, + "ce_loss_6": 4.148066067695618, + "epoch": 0.183, + "grad_norm": 856.0, + "kl_loss_12": 1389.0459228515624, + "kl_loss_17": 408.4911514282227, + "kl_loss_3": 3087.9786865234373, + "kl_loss_6": 2463.8032348632814, + "learning_rate": 0.0009265272052770935, + "loss": 1818.3871, + "step": 1830 + }, + { + "ce_loss_12": 3.4827219009399415, + "ce_loss_17": 3.0174628019332888, + "ce_loss_23": 2.824735474586487, + "ce_loss_3": 4.38643205165863, + "ce_loss_6": 4.036978626251221, + "epoch": 0.184, + "grad_norm": 864.0, + "kl_loss_12": 1409.780340576172, + "kl_loss_17": 412.9845397949219, + "kl_loss_3": 3203.962255859375, + "kl_loss_6": 2522.3953369140627, + "learning_rate": 0.0009256971035084784, + "loss": 1878.3439, + "step": 1840 + }, + { + "ce_loss_12": 3.448565185070038, + "ce_loss_17": 2.9642847418785094, + "ce_loss_23": 2.762493336200714, + "ce_loss_3": 4.362462449073791, + "ce_loss_6": 4.01518462896347, + "epoch": 0.185, + "grad_norm": 1160.0, + "kl_loss_12": 1466.8582885742187, + "kl_loss_17": 427.94970245361327, + "kl_loss_3": 3280.3394165039062, + "kl_loss_6": 2595.3165893554688, + "learning_rate": 0.0009248627149747573, + "loss": 1901.1352, + "step": 1850 + }, + { + "ce_loss_12": 3.5774747490882874, + "ce_loss_17": 3.132255482673645, + "ce_loss_23": 2.9522884011268617, + "ce_loss_3": 4.440769362449646, + "ce_loss_6": 4.11391499042511, + "epoch": 0.186, + "grad_norm": 1216.0, + "kl_loss_12": 1390.4288696289063, + "kl_loss_17": 399.74344940185546, + "kl_loss_3": 3114.68369140625, + "kl_loss_6": 2476.0990478515623, + "learning_rate": 0.0009240240480782129, + "loss": 1848.3285, + "step": 1860 + }, + { + "ce_loss_12": 3.504031074047089, + "ce_loss_17": 3.0449450731277468, + "ce_loss_23": 2.8575493335723876, + "ce_loss_3": 4.403111290931702, + "ce_loss_6": 4.054022586345672, + "epoch": 0.187, + "grad_norm": 932.0, + "kl_loss_12": 1397.6381225585938, + "kl_loss_17": 406.8374771118164, + "kl_loss_3": 3180.344445800781, + "kl_loss_6": 2515.5447021484374, + "learning_rate": 0.0009231811112642122, + "loss": 1848.7035, + "step": 1870 + }, + { + "ce_loss_12": 3.526438903808594, + "ce_loss_17": 3.087480330467224, + "ce_loss_23": 2.902469539642334, + "ce_loss_3": 4.3881865501403805, + "ce_loss_6": 4.057741189002991, + "epoch": 0.188, + "grad_norm": 1136.0, + "kl_loss_12": 1376.4404724121093, + "kl_loss_17": 405.33871002197264, + "kl_loss_3": 3085.746923828125, + "kl_loss_6": 2443.7936584472654, + "learning_rate": 0.0009223339130211192, + "loss": 1826.3887, + "step": 1880 + }, + { + "ce_loss_12": 3.399556028842926, + "ce_loss_17": 2.9548186540603636, + "ce_loss_23": 2.7765444099903105, + "ce_loss_3": 4.322276103496551, + "ce_loss_6": 3.9670225024223327, + "epoch": 0.189, + "grad_norm": 996.0, + "kl_loss_12": 1366.1088439941407, + "kl_loss_17": 390.74108123779297, + "kl_loss_3": 3201.887548828125, + "kl_loss_6": 2507.1484924316405, + "learning_rate": 0.0009214824618802108, + "loss": 1857.4459, + "step": 1890 + }, + { + "ce_loss_12": 3.5746437430381777, + "ce_loss_17": 3.1217092633247376, + "ce_loss_23": 2.9347758293151855, + "ce_loss_3": 4.446531271934509, + "ce_loss_6": 4.109218180179596, + "epoch": 0.19, + "grad_norm": 968.0, + "kl_loss_12": 1395.1021362304687, + "kl_loss_17": 405.2522399902344, + "kl_loss_3": 3118.518701171875, + "kl_loss_6": 2450.5746948242186, + "learning_rate": 0.0009206267664155906, + "loss": 1880.9684, + "step": 1900 + }, + { + "ce_loss_12": 3.5064347863197325, + "ce_loss_17": 3.0529030919075013, + "ce_loss_23": 2.865175986289978, + "ce_loss_3": 4.398700284957886, + "ce_loss_6": 4.056966316699982, + "epoch": 0.191, + "grad_norm": 828.0, + "kl_loss_12": 1388.640936279297, + "kl_loss_17": 400.7437454223633, + "kl_loss_3": 3160.6127685546876, + "kl_loss_6": 2485.7033813476564, + "learning_rate": 0.0009197668352441024, + "loss": 1862.618, + "step": 1910 + }, + { + "ce_loss_12": 3.540658414363861, + "ce_loss_17": 3.094384551048279, + "ce_loss_23": 2.9136697888374328, + "ce_loss_3": 4.423890471458435, + "ce_loss_6": 4.079328870773315, + "epoch": 0.192, + "grad_norm": 1020.0, + "kl_loss_12": 1373.4152404785157, + "kl_loss_17": 395.8149917602539, + "kl_loss_3": 3131.6627197265625, + "kl_loss_6": 2459.1564819335936, + "learning_rate": 0.0009189026770252437, + "loss": 1851.2279, + "step": 1920 + }, + { + "ce_loss_12": 3.56448096036911, + "ce_loss_17": 3.1186470866203306, + "ce_loss_23": 2.93763267993927, + "ce_loss_3": 4.442287898063659, + "ce_loss_6": 4.099389755725861, + "epoch": 0.193, + "grad_norm": 956.0, + "kl_loss_12": 1375.8356872558593, + "kl_loss_17": 395.9308288574219, + "kl_loss_3": 3114.5266235351564, + "kl_loss_6": 2440.400048828125, + "learning_rate": 0.000918034300461078, + "loss": 1889.0145, + "step": 1930 + }, + { + "ce_loss_12": 3.5736100912094115, + "ce_loss_17": 3.1367435455322266, + "ce_loss_23": 2.9566476941108704, + "ce_loss_3": 4.431257534027099, + "ce_loss_6": 4.098700952529907, + "epoch": 0.194, + "grad_norm": 980.0, + "kl_loss_12": 1361.859637451172, + "kl_loss_17": 396.3990768432617, + "kl_loss_3": 3055.080847167969, + "kl_loss_6": 2412.7113403320313, + "learning_rate": 0.0009171617142961477, + "loss": 1820.2039, + "step": 1940 + }, + { + "ce_loss_12": 3.551243340969086, + "ce_loss_17": 3.1035329580307005, + "ce_loss_23": 2.925469446182251, + "ce_loss_3": 4.420022678375244, + "ce_loss_6": 4.082593929767609, + "epoch": 0.195, + "grad_norm": 928.0, + "kl_loss_12": 1381.5827026367188, + "kl_loss_17": 391.83741607666013, + "kl_loss_3": 3100.8082275390625, + "kl_loss_6": 2442.37509765625, + "learning_rate": 0.0009162849273173857, + "loss": 1829.3063, + "step": 1950 + }, + { + "ce_loss_12": 3.4948492884635924, + "ce_loss_17": 3.055851709842682, + "ce_loss_23": 2.8745150208473205, + "ce_loss_3": 4.382340717315674, + "ce_loss_6": 4.030688488483429, + "epoch": 0.196, + "grad_norm": 1184.0, + "kl_loss_12": 1361.248681640625, + "kl_loss_17": 393.1595397949219, + "kl_loss_3": 3122.5375244140623, + "kl_loss_6": 2436.858264160156, + "learning_rate": 0.0009154039483540273, + "loss": 1850.5926, + "step": 1960 + }, + { + "ce_loss_12": 3.4737292528152466, + "ce_loss_17": 3.0338326692581177, + "ce_loss_23": 2.854222285747528, + "ce_loss_3": 4.370313429832459, + "ce_loss_6": 4.023552799224854, + "epoch": 0.197, + "grad_norm": 1016.0, + "kl_loss_12": 1363.5056640625, + "kl_loss_17": 396.76910400390625, + "kl_loss_3": 3134.1785522460937, + "kl_loss_6": 2465.798193359375, + "learning_rate": 0.0009145187862775209, + "loss": 1834.7617, + "step": 1970 + }, + { + "ce_loss_12": 3.5026198029518127, + "ce_loss_17": 3.0663907170295714, + "ce_loss_23": 2.8810113787651064, + "ce_loss_3": 4.366890490055084, + "ce_loss_6": 4.034711396694183, + "epoch": 0.198, + "grad_norm": 852.0, + "kl_loss_12": 1363.4940185546875, + "kl_loss_17": 409.51978454589846, + "kl_loss_3": 3089.037390136719, + "kl_loss_6": 2430.1923278808595, + "learning_rate": 0.0009136294500014386, + "loss": 1825.1098, + "step": 1980 + }, + { + "ce_loss_12": 3.4802589416503906, + "ce_loss_17": 3.024968659877777, + "ce_loss_23": 2.8380630016326904, + "ce_loss_3": 4.3972203373909, + "ce_loss_6": 4.06009738445282, + "epoch": 0.199, + "grad_norm": 1072.0, + "kl_loss_12": 1394.8473205566406, + "kl_loss_17": 410.90734405517577, + "kl_loss_3": 3194.697900390625, + "kl_loss_6": 2531.3602294921875, + "learning_rate": 0.000912735948481387, + "loss": 1883.1371, + "step": 1990 + }, + { + "ce_loss_12": 3.486107325553894, + "ce_loss_17": 3.051172912120819, + "ce_loss_23": 2.869850277900696, + "ce_loss_3": 4.371750068664551, + "ce_loss_6": 4.029321420192718, + "epoch": 0.2, + "grad_norm": 1008.0, + "kl_loss_12": 1377.7386535644532, + "kl_loss_17": 407.95068817138673, + "kl_loss_3": 3139.5554321289064, + "kl_loss_6": 2464.485803222656, + "learning_rate": 0.0009118382907149164, + "loss": 1813.9844, + "step": 2000 + }, + { + "ce_loss_12": 3.5094108819961547, + "ce_loss_17": 3.0742453813552855, + "ce_loss_23": 2.8915016531944273, + "ce_loss_3": 4.396561598777771, + "ce_loss_6": 4.0471575379371645, + "epoch": 0.201, + "grad_norm": 912.0, + "kl_loss_12": 1373.4571838378906, + "kl_loss_17": 401.7746154785156, + "kl_loss_3": 3119.1673217773437, + "kl_loss_6": 2440.0885620117188, + "learning_rate": 0.0009109364857414306, + "loss": 1819.1262, + "step": 2010 + }, + { + "ce_loss_12": 3.4708608746528626, + "ce_loss_17": 3.0348114490509035, + "ce_loss_23": 2.85721560716629, + "ce_loss_3": 4.349551677703857, + "ce_loss_6": 4.001498186588288, + "epoch": 0.202, + "grad_norm": 872.0, + "kl_loss_12": 1353.9906433105468, + "kl_loss_17": 394.17857055664064, + "kl_loss_3": 3116.371875, + "kl_loss_6": 2437.3400146484373, + "learning_rate": 0.0009100305426420956, + "loss": 1858.0932, + "step": 2020 + }, + { + "ce_loss_12": 3.465420198440552, + "ce_loss_17": 3.0086682796478272, + "ce_loss_23": 2.8314072251319886, + "ce_loss_3": 4.394584012031555, + "ce_loss_6": 4.039972257614136, + "epoch": 0.203, + "grad_norm": 928.0, + "kl_loss_12": 1402.326953125, + "kl_loss_17": 396.2106231689453, + "kl_loss_3": 3243.264025878906, + "kl_loss_6": 2553.330419921875, + "learning_rate": 0.0009091204705397484, + "loss": 1852.6303, + "step": 2030 + }, + { + "ce_loss_12": 3.461729180812836, + "ce_loss_17": 2.994718587398529, + "ce_loss_23": 2.8124589204788206, + "ce_loss_3": 4.370394968986512, + "ce_loss_6": 4.028866863250732, + "epoch": 0.204, + "grad_norm": 1096.0, + "kl_loss_12": 1412.3960754394532, + "kl_loss_17": 398.8710708618164, + "kl_loss_3": 3242.097900390625, + "kl_loss_6": 2569.238562011719, + "learning_rate": 0.0009082062785988049, + "loss": 1872.2047, + "step": 2040 + }, + { + "ce_loss_12": 3.547369456291199, + "ce_loss_17": 3.1226140022277833, + "ce_loss_23": 2.947987914085388, + "ce_loss_3": 4.40339457988739, + "ce_loss_6": 4.07686003446579, + "epoch": 0.205, + "grad_norm": 948.0, + "kl_loss_12": 1351.3995056152344, + "kl_loss_17": 389.1872100830078, + "kl_loss_3": 3056.0323120117187, + "kl_loss_6": 2417.5919677734373, + "learning_rate": 0.0009072879760251679, + "loss": 1828.1234, + "step": 2050 + }, + { + "ce_loss_12": 3.5337481617927553, + "ce_loss_17": 3.0816317319869997, + "ce_loss_23": 2.895327150821686, + "ce_loss_3": 4.419333148002624, + "ce_loss_6": 4.081639957427979, + "epoch": 0.206, + "grad_norm": 852.0, + "kl_loss_12": 1393.3875061035155, + "kl_loss_17": 401.1993011474609, + "kl_loss_3": 3170.3522705078126, + "kl_loss_6": 2516.9411743164064, + "learning_rate": 0.0009063655720661341, + "loss": 1844.2873, + "step": 2060 + }, + { + "ce_loss_12": 3.5453538060188294, + "ce_loss_17": 3.1116058468818664, + "ce_loss_23": 2.9324185848236084, + "ce_loss_3": 4.399405813217163, + "ce_loss_6": 4.07843269109726, + "epoch": 0.207, + "grad_norm": 808.0, + "kl_loss_12": 1355.4486755371095, + "kl_loss_17": 393.4923431396484, + "kl_loss_3": 3041.4896850585938, + "kl_loss_6": 2420.3547973632812, + "learning_rate": 0.000905439076010301, + "loss": 1818.3211, + "step": 2070 + }, + { + "ce_loss_12": 3.5257867217063903, + "ce_loss_17": 3.0792110562324524, + "ce_loss_23": 2.8893296360969543, + "ce_loss_3": 4.41441969871521, + "ce_loss_6": 4.0752364754676815, + "epoch": 0.208, + "grad_norm": 856.0, + "kl_loss_12": 1386.7962463378906, + "kl_loss_17": 410.9291061401367, + "kl_loss_3": 3146.4475708007812, + "kl_loss_6": 2472.69052734375, + "learning_rate": 0.0009045084971874737, + "loss": 1811.7352, + "step": 2080 + }, + { + "ce_loss_12": 3.500695765018463, + "ce_loss_17": 3.069680321216583, + "ce_loss_23": 2.8757070302963257, + "ce_loss_3": 4.375121593475342, + "ce_loss_6": 4.043542790412903, + "epoch": 0.209, + "grad_norm": 1152.0, + "kl_loss_12": 1371.920343017578, + "kl_loss_17": 412.9090606689453, + "kl_loss_3": 3107.186181640625, + "kl_loss_6": 2448.299768066406, + "learning_rate": 0.0009035738449685707, + "loss": 1857.4186, + "step": 2090 + }, + { + "ce_loss_12": 3.4602583169937136, + "ce_loss_17": 2.9988523602485655, + "ce_loss_23": 2.810432803630829, + "ce_loss_3": 4.37140154838562, + "ce_loss_6": 4.031448018550873, + "epoch": 0.21, + "grad_norm": 928.0, + "kl_loss_12": 1398.5671875, + "kl_loss_17": 402.3813186645508, + "kl_loss_3": 3212.7988159179686, + "kl_loss_6": 2537.6999389648436, + "learning_rate": 0.0009026351287655293, + "loss": 1838.1242, + "step": 2100 + }, + { + "ce_loss_12": 3.580909264087677, + "ce_loss_17": 3.174431824684143, + "ce_loss_23": 3.0090521216392516, + "ce_loss_3": 4.410317397117614, + "ce_loss_6": 4.08203741312027, + "epoch": 0.211, + "grad_norm": 1072.0, + "kl_loss_12": 1291.9223876953124, + "kl_loss_17": 377.8187744140625, + "kl_loss_3": 2940.6177612304687, + "kl_loss_6": 2313.8096557617187, + "learning_rate": 0.0009016923580312113, + "loss": 1745.3838, + "step": 2110 + }, + { + "ce_loss_12": 3.478354549407959, + "ce_loss_17": 3.0455804109573363, + "ce_loss_23": 2.873227298259735, + "ce_loss_3": 4.352131581306457, + "ce_loss_6": 4.007352542877197, + "epoch": 0.212, + "grad_norm": 1120.0, + "kl_loss_12": 1344.7798950195313, + "kl_loss_17": 390.32891540527345, + "kl_loss_3": 3080.3701904296877, + "kl_loss_6": 2408.41884765625, + "learning_rate": 0.0009007455422593077, + "loss": 1836.1477, + "step": 2120 + }, + { + "ce_loss_12": 3.5267333984375, + "ce_loss_17": 3.064280998706818, + "ce_loss_23": 2.881734824180603, + "ce_loss_3": 4.405498075485229, + "ce_loss_6": 4.059432685375214, + "epoch": 0.213, + "grad_norm": 1064.0, + "kl_loss_12": 1400.2952575683594, + "kl_loss_17": 398.35522003173827, + "kl_loss_3": 3146.431506347656, + "kl_loss_6": 2484.1463928222656, + "learning_rate": 0.0008997946909842425, + "loss": 1854.3893, + "step": 2130 + }, + { + "ce_loss_12": 3.5560362100601197, + "ce_loss_17": 3.0794739723205566, + "ce_loss_23": 2.8871533274650574, + "ce_loss_3": 4.470532035827636, + "ce_loss_6": 4.124138689041137, + "epoch": 0.214, + "grad_norm": 1120.0, + "kl_loss_12": 1434.1231384277344, + "kl_loss_17": 417.38670349121094, + "kl_loss_3": 3245.7887573242188, + "kl_loss_6": 2579.8198120117186, + "learning_rate": 0.0008988398137810777, + "loss": 1849.3316, + "step": 2140 + }, + { + "ce_loss_12": 3.537279522418976, + "ce_loss_17": 3.1066004991531373, + "ce_loss_23": 2.929929780960083, + "ce_loss_3": 4.395436775684357, + "ce_loss_6": 4.06287008523941, + "epoch": 0.215, + "grad_norm": 1012.0, + "kl_loss_12": 1343.9474609375, + "kl_loss_17": 388.11578979492185, + "kl_loss_3": 3054.53203125, + "kl_loss_6": 2400.9071044921875, + "learning_rate": 0.0008978809202654162, + "loss": 1788.1793, + "step": 2150 + }, + { + "ce_loss_12": 3.5223759412765503, + "ce_loss_17": 3.099969244003296, + "ce_loss_23": 2.9096084117889403, + "ce_loss_3": 4.392382049560547, + "ce_loss_6": 4.058804070949554, + "epoch": 0.216, + "grad_norm": 1024.0, + "kl_loss_12": 1335.4723693847657, + "kl_loss_17": 410.1502883911133, + "kl_loss_3": 3057.390380859375, + "kl_loss_6": 2406.6466796875, + "learning_rate": 0.0008969180200933046, + "loss": 1821.6504, + "step": 2160 + }, + { + "ce_loss_12": 3.5146302223205566, + "ce_loss_17": 3.066201627254486, + "ce_loss_23": 2.864242434501648, + "ce_loss_3": 4.410690546035767, + "ce_loss_6": 4.065093767642975, + "epoch": 0.217, + "grad_norm": 1280.0, + "kl_loss_12": 1387.5415649414062, + "kl_loss_17": 426.98915405273436, + "kl_loss_3": 3146.7869384765627, + "kl_loss_6": 2480.2459106445312, + "learning_rate": 0.0008959511229611376, + "loss": 1859.2805, + "step": 2170 + }, + { + "ce_loss_12": 3.547934627532959, + "ce_loss_17": 3.1149901032447813, + "ce_loss_23": 2.9360812187194822, + "ce_loss_3": 4.426415348052979, + "ce_loss_6": 4.092259585857391, + "epoch": 0.218, + "grad_norm": 944.0, + "kl_loss_12": 1348.66884765625, + "kl_loss_17": 391.7157409667969, + "kl_loss_3": 3096.0570434570313, + "kl_loss_6": 2439.3980102539062, + "learning_rate": 0.0008949802386055581, + "loss": 1815.5332, + "step": 2180 + }, + { + "ce_loss_12": 3.4251845479011536, + "ce_loss_17": 2.9923732399940492, + "ce_loss_23": 2.8120394349098206, + "ce_loss_3": 4.317071855068207, + "ce_loss_6": 3.9748199582099915, + "epoch": 0.219, + "grad_norm": 1184.0, + "kl_loss_12": 1339.4009887695313, + "kl_loss_17": 385.9377899169922, + "kl_loss_3": 3084.246716308594, + "kl_loss_6": 2414.5926025390627, + "learning_rate": 0.0008940053768033609, + "loss": 1844.834, + "step": 2190 + }, + { + "ce_loss_12": 3.5045410871505736, + "ce_loss_17": 3.0732019186019897, + "ce_loss_23": 2.8981399059295656, + "ce_loss_3": 4.353909540176391, + "ce_loss_6": 4.019334411621093, + "epoch": 0.22, + "grad_norm": 976.0, + "kl_loss_12": 1333.2462219238282, + "kl_loss_17": 388.82810974121094, + "kl_loss_3": 3036.5219604492186, + "kl_loss_6": 2388.822961425781, + "learning_rate": 0.0008930265473713938, + "loss": 1795.5426, + "step": 2200 + }, + { + "ce_loss_12": 3.47272390127182, + "ce_loss_17": 3.040353035926819, + "ce_loss_23": 2.855030930042267, + "ce_loss_3": 4.346427464485169, + "ce_loss_6": 4.012941122055054, + "epoch": 0.221, + "grad_norm": 1032.0, + "kl_loss_12": 1339.0766662597657, + "kl_loss_17": 402.6368606567383, + "kl_loss_3": 3076.573132324219, + "kl_loss_6": 2416.5378173828126, + "learning_rate": 0.0008920437601664579, + "loss": 1776.9988, + "step": 2210 + }, + { + "ce_loss_12": 3.4764568209648132, + "ce_loss_17": 3.0413265466690063, + "ce_loss_23": 2.8498608231544496, + "ce_loss_3": 4.359201240539551, + "ce_loss_6": 4.015230906009674, + "epoch": 0.222, + "grad_norm": 904.0, + "kl_loss_12": 1372.467919921875, + "kl_loss_17": 416.31578216552737, + "kl_loss_3": 3135.654150390625, + "kl_loss_6": 2461.368017578125, + "learning_rate": 0.0008910570250852097, + "loss": 1804.4967, + "step": 2220 + }, + { + "ce_loss_12": 3.5195571303367617, + "ce_loss_17": 3.1164942383766174, + "ce_loss_23": 2.9423655033111573, + "ce_loss_3": 4.369904780387879, + "ce_loss_6": 4.036461865901947, + "epoch": 0.223, + "grad_norm": 1112.0, + "kl_loss_12": 1290.9219665527344, + "kl_loss_17": 376.9027725219727, + "kl_loss_3": 2988.6837158203125, + "kl_loss_6": 2340.4248229980467, + "learning_rate": 0.0008900663520640604, + "loss": 1759.0195, + "step": 2230 + }, + { + "ce_loss_12": 3.5042008996009826, + "ce_loss_17": 3.0734793901443482, + "ce_loss_23": 2.9011544823646545, + "ce_loss_3": 4.36311182975769, + "ce_loss_6": 4.041118001937866, + "epoch": 0.224, + "grad_norm": 1004.0, + "kl_loss_12": 1334.5621520996094, + "kl_loss_17": 387.0037338256836, + "kl_loss_3": 3057.005822753906, + "kl_loss_6": 2425.3416015625, + "learning_rate": 0.0008890717510790764, + "loss": 1801.5563, + "step": 2240 + }, + { + "ce_loss_12": 3.466621422767639, + "ce_loss_17": 3.0410445094108582, + "ce_loss_23": 2.867064726352692, + "ce_loss_3": 4.357352328300476, + "ce_loss_6": 4.015378224849701, + "epoch": 0.225, + "grad_norm": 1088.0, + "kl_loss_12": 1319.4816345214845, + "kl_loss_17": 377.6472473144531, + "kl_loss_3": 3086.9820068359377, + "kl_loss_6": 2414.7611450195313, + "learning_rate": 0.0008880732321458784, + "loss": 1812.3871, + "step": 2250 + }, + { + "ce_loss_12": 3.4890703678131105, + "ce_loss_17": 3.0644241333007813, + "ce_loss_23": 2.8930245280265807, + "ce_loss_3": 4.359507060050964, + "ce_loss_6": 4.0214638948440555, + "epoch": 0.226, + "grad_norm": 1008.0, + "kl_loss_12": 1324.6771667480468, + "kl_loss_17": 375.7590301513672, + "kl_loss_3": 3045.208117675781, + "kl_loss_6": 2393.032666015625, + "learning_rate": 0.0008870708053195413, + "loss": 1810.9191, + "step": 2260 + }, + { + "ce_loss_12": 3.498258447647095, + "ce_loss_17": 3.079079532623291, + "ce_loss_23": 2.9166102409362793, + "ce_loss_3": 4.339015591144562, + "ce_loss_6": 4.020640075206757, + "epoch": 0.227, + "grad_norm": 1096.0, + "kl_loss_12": 1285.5239379882812, + "kl_loss_17": 359.8129364013672, + "kl_loss_3": 2961.5877197265627, + "kl_loss_6": 2341.482598876953, + "learning_rate": 0.0008860644806944918, + "loss": 1761.2666, + "step": 2270 + }, + { + "ce_loss_12": 3.470140528678894, + "ce_loss_17": 3.0365947604179384, + "ce_loss_23": 2.8646312832832335, + "ce_loss_3": 4.346737742424011, + "ce_loss_6": 4.01911609172821, + "epoch": 0.228, + "grad_norm": 956.0, + "kl_loss_12": 1334.8696166992188, + "kl_loss_17": 383.21412506103513, + "kl_loss_3": 3073.5315063476564, + "kl_loss_6": 2431.9812255859374, + "learning_rate": 0.0008850542684044079, + "loss": 1765.1102, + "step": 2280 + }, + { + "ce_loss_12": 3.4536983966827393, + "ce_loss_17": 3.001571798324585, + "ce_loss_23": 2.8181382417678833, + "ce_loss_3": 4.369045734405518, + "ce_loss_6": 4.026021492481232, + "epoch": 0.229, + "grad_norm": 1096.0, + "kl_loss_12": 1394.3187866210938, + "kl_loss_17": 398.59433898925784, + "kl_loss_3": 3213.7202880859377, + "kl_loss_6": 2537.492883300781, + "learning_rate": 0.0008840401786221159, + "loss": 1824.2578, + "step": 2290 + }, + { + "ce_loss_12": 3.549013364315033, + "ce_loss_17": 3.123672294616699, + "ce_loss_23": 2.960705578327179, + "ce_loss_3": 4.38698136806488, + "ce_loss_6": 4.058153986930847, + "epoch": 0.23, + "grad_norm": 844.0, + "kl_loss_12": 1296.4142211914063, + "kl_loss_17": 358.9579452514648, + "kl_loss_3": 2962.921838378906, + "kl_loss_6": 2329.2815368652346, + "learning_rate": 0.000883022221559489, + "loss": 1736.9637, + "step": 2300 + }, + { + "ce_loss_12": 3.524729681015015, + "ce_loss_17": 3.0923240423202514, + "ce_loss_23": 2.9268879771232603, + "ce_loss_3": 4.394909167289734, + "ce_loss_6": 4.064548134803772, + "epoch": 0.231, + "grad_norm": 1088.0, + "kl_loss_12": 1332.733349609375, + "kl_loss_17": 373.6583770751953, + "kl_loss_3": 3063.1235473632814, + "kl_loss_6": 2420.8246948242186, + "learning_rate": 0.0008820004074673434, + "loss": 1840.4422, + "step": 2310 + }, + { + "ce_loss_12": 3.4370501399040223, + "ce_loss_17": 3.0037574648857115, + "ce_loss_23": 2.8393426060676576, + "ce_loss_3": 4.316093397140503, + "ce_loss_6": 3.9695367217063904, + "epoch": 0.232, + "grad_norm": 924.0, + "kl_loss_12": 1337.5342590332032, + "kl_loss_17": 368.76837463378905, + "kl_loss_3": 3081.577917480469, + "kl_loss_6": 2412.1710571289063, + "learning_rate": 0.0008809747466353355, + "loss": 1776.1449, + "step": 2320 + }, + { + "ce_loss_12": 3.436672937870026, + "ce_loss_17": 3.0068650960922243, + "ce_loss_23": 2.8387425184249877, + "ce_loss_3": 4.318840408325196, + "ce_loss_6": 3.98241525888443, + "epoch": 0.233, + "grad_norm": 1792.0, + "kl_loss_12": 1313.316717529297, + "kl_loss_17": 369.51439056396487, + "kl_loss_3": 3074.159777832031, + "kl_loss_6": 2411.598767089844, + "learning_rate": 0.0008799452493918585, + "loss": 1800.6109, + "step": 2330 + }, + { + "ce_loss_12": 3.5013447403907776, + "ce_loss_17": 3.07126921415329, + "ce_loss_23": 2.9082824349403382, + "ce_loss_3": 4.362140035629272, + "ce_loss_6": 4.038045120239258, + "epoch": 0.234, + "grad_norm": 1000.0, + "kl_loss_12": 1312.60400390625, + "kl_loss_17": 364.74903259277346, + "kl_loss_3": 3032.4592041015626, + "kl_loss_6": 2394.586633300781, + "learning_rate": 0.0008789119261039385, + "loss": 1830.5715, + "step": 2340 + }, + { + "ce_loss_12": 3.4402097702026366, + "ce_loss_17": 2.992724096775055, + "ce_loss_23": 2.8270597219467164, + "ce_loss_3": 4.309396195411682, + "ce_loss_6": 3.9708672523498536, + "epoch": 0.235, + "grad_norm": 1040.0, + "kl_loss_12": 1336.757794189453, + "kl_loss_17": 369.18068542480466, + "kl_loss_3": 3064.9718383789063, + "kl_loss_6": 2405.208776855469, + "learning_rate": 0.0008778747871771292, + "loss": 1766.0102, + "step": 2350 + }, + { + "ce_loss_12": 3.4626588582992555, + "ce_loss_17": 3.0441460490226744, + "ce_loss_23": 2.883425784111023, + "ce_loss_3": 4.300020003318787, + "ce_loss_6": 3.9820852994918825, + "epoch": 0.236, + "grad_norm": 932.0, + "kl_loss_12": 1287.1029907226562, + "kl_loss_17": 355.5157272338867, + "kl_loss_3": 2961.4650268554688, + "kl_loss_6": 2333.040173339844, + "learning_rate": 0.0008768338430554083, + "loss": 1735.4672, + "step": 2360 + }, + { + "ce_loss_12": 3.491554284095764, + "ce_loss_17": 3.0689448595046995, + "ce_loss_23": 2.8880576729774474, + "ce_loss_3": 4.349581158161163, + "ce_loss_6": 4.016392803192138, + "epoch": 0.237, + "grad_norm": 940.0, + "kl_loss_12": 1315.7613159179687, + "kl_loss_17": 389.4289184570313, + "kl_loss_3": 3028.0147705078125, + "kl_loss_6": 2380.471240234375, + "learning_rate": 0.0008757891042210713, + "loss": 1790.2062, + "step": 2370 + }, + { + "ce_loss_12": 3.4883965253829956, + "ce_loss_17": 3.077411413192749, + "ce_loss_23": 2.907989227771759, + "ce_loss_3": 4.351418459415436, + "ce_loss_6": 4.019922041893006, + "epoch": 0.238, + "grad_norm": 944.0, + "kl_loss_12": 1290.36904296875, + "kl_loss_17": 373.3400344848633, + "kl_loss_3": 2995.101806640625, + "kl_loss_6": 2344.245819091797, + "learning_rate": 0.0008747405811946271, + "loss": 1763.9508, + "step": 2380 + }, + { + "ce_loss_12": 3.43811776638031, + "ce_loss_17": 2.9906313180923463, + "ce_loss_23": 2.8158723592758177, + "ce_loss_3": 4.325619494915008, + "ce_loss_6": 3.9896966576576234, + "epoch": 0.239, + "grad_norm": 872.0, + "kl_loss_12": 1358.5515502929688, + "kl_loss_17": 380.8702789306641, + "kl_loss_3": 3141.8859008789063, + "kl_loss_6": 2481.448974609375, + "learning_rate": 0.0008736882845346905, + "loss": 1779.4078, + "step": 2390 + }, + { + "ce_loss_12": 3.497787058353424, + "ce_loss_17": 3.0713407397270203, + "ce_loss_23": 2.896159815788269, + "ce_loss_3": 4.380762910842895, + "ce_loss_6": 4.03783073425293, + "epoch": 0.24, + "grad_norm": 1120.0, + "kl_loss_12": 1309.9198181152344, + "kl_loss_17": 382.0894317626953, + "kl_loss_3": 3064.660021972656, + "kl_loss_6": 2392.8274658203127, + "learning_rate": 0.0008726322248378774, + "loss": 1776.2117, + "step": 2400 + }, + { + "ce_loss_12": 3.496664881706238, + "ce_loss_17": 3.062252378463745, + "ce_loss_23": 2.8999626994132996, + "ce_loss_3": 4.393068075180054, + "ce_loss_6": 4.060015296936035, + "epoch": 0.241, + "grad_norm": 788.0, + "kl_loss_12": 1330.9965576171876, + "kl_loss_17": 364.06876373291016, + "kl_loss_3": 3115.4126953125, + "kl_loss_6": 2465.3824096679687, + "learning_rate": 0.0008715724127386971, + "loss": 1828.7469, + "step": 2410 + }, + { + "ce_loss_12": 3.5431973457336428, + "ce_loss_17": 3.131938111782074, + "ce_loss_23": 2.969063603878021, + "ce_loss_3": 4.4129211664199826, + "ce_loss_6": 4.077617800235748, + "epoch": 0.242, + "grad_norm": 920.0, + "kl_loss_12": 1289.0557250976562, + "kl_loss_17": 364.9691192626953, + "kl_loss_3": 3012.280969238281, + "kl_loss_6": 2371.3688720703126, + "learning_rate": 0.0008705088589094458, + "loss": 1775.2705, + "step": 2420 + }, + { + "ce_loss_12": 3.5635186672210692, + "ce_loss_17": 3.1368691086769105, + "ce_loss_23": 2.9768545746803285, + "ce_loss_3": 4.4323231220245365, + "ce_loss_6": 4.099137330055237, + "epoch": 0.243, + "grad_norm": 932.0, + "kl_loss_12": 1306.441229248047, + "kl_loss_17": 364.32171020507815, + "kl_loss_3": 3046.074914550781, + "kl_loss_6": 2390.725549316406, + "learning_rate": 0.0008694415740600988, + "loss": 1783.0297, + "step": 2430 + }, + { + "ce_loss_12": 3.435083818435669, + "ce_loss_17": 3.0188833594322206, + "ce_loss_23": 2.8394552946090696, + "ce_loss_3": 4.335052013397217, + "ce_loss_6": 4.008730459213257, + "epoch": 0.244, + "grad_norm": 1128.0, + "kl_loss_12": 1319.4216796875, + "kl_loss_17": 391.80567626953126, + "kl_loss_3": 3102.2006103515623, + "kl_loss_6": 2459.811145019531, + "learning_rate": 0.0008683705689382025, + "loss": 1790.7793, + "step": 2440 + }, + { + "ce_loss_12": 3.501308631896973, + "ce_loss_17": 3.086840510368347, + "ce_loss_23": 2.922885799407959, + "ce_loss_3": 4.338215970993042, + "ce_loss_6": 4.02147890329361, + "epoch": 0.245, + "grad_norm": 992.0, + "kl_loss_12": 1284.8777770996094, + "kl_loss_17": 372.9107971191406, + "kl_loss_3": 2967.5705322265626, + "kl_loss_6": 2348.475634765625, + "learning_rate": 0.0008672958543287666, + "loss": 1789.2359, + "step": 2450 + }, + { + "ce_loss_12": 3.5120221972465515, + "ce_loss_17": 3.0961593270301817, + "ce_loss_23": 2.926725244522095, + "ce_loss_3": 4.341679620742798, + "ce_loss_6": 4.023214590549469, + "epoch": 0.246, + "grad_norm": 1176.0, + "kl_loss_12": 1297.7512512207031, + "kl_loss_17": 373.1650756835937, + "kl_loss_3": 2975.7365478515626, + "kl_loss_6": 2330.557568359375, + "learning_rate": 0.0008662174410541554, + "loss": 1745.7102, + "step": 2460 + }, + { + "ce_loss_12": 3.470390427112579, + "ce_loss_17": 3.0552254199981688, + "ce_loss_23": 2.894935131072998, + "ce_loss_3": 4.322717833518982, + "ce_loss_6": 3.99176082611084, + "epoch": 0.247, + "grad_norm": 1032.0, + "kl_loss_12": 1274.985662841797, + "kl_loss_17": 363.30750732421876, + "kl_loss_3": 2987.416650390625, + "kl_loss_6": 2335.2953918457033, + "learning_rate": 0.0008651353399739787, + "loss": 1791.3639, + "step": 2470 + }, + { + "ce_loss_12": 3.4972774505615236, + "ce_loss_17": 3.0815531253814696, + "ce_loss_23": 2.915669393539429, + "ce_loss_3": 4.35245532989502, + "ce_loss_6": 4.024959123134613, + "epoch": 0.248, + "grad_norm": 884.0, + "kl_loss_12": 1287.859619140625, + "kl_loss_17": 361.7851028442383, + "kl_loss_3": 2980.0279296875, + "kl_loss_6": 2345.3993225097656, + "learning_rate": 0.0008640495619849821, + "loss": 1753.2402, + "step": 2480 + }, + { + "ce_loss_12": 3.459804022312164, + "ce_loss_17": 3.0489287614822387, + "ce_loss_23": 2.8821661949157713, + "ce_loss_3": 4.315313053131104, + "ce_loss_6": 3.983213782310486, + "epoch": 0.249, + "grad_norm": 1104.0, + "kl_loss_12": 1288.3751342773437, + "kl_loss_17": 367.5175216674805, + "kl_loss_3": 2994.886682128906, + "kl_loss_6": 2342.3803161621095, + "learning_rate": 0.0008629601180209381, + "loss": 1744.3105, + "step": 2490 + }, + { + "ce_loss_12": 3.4463224172592164, + "ce_loss_17": 3.037926363945007, + "ce_loss_23": 2.870476996898651, + "ce_loss_3": 4.3170277118682865, + "ce_loss_6": 3.9774172425270082, + "epoch": 0.25, + "grad_norm": 988.0, + "kl_loss_12": 1279.446728515625, + "kl_loss_17": 365.6539993286133, + "kl_loss_3": 2985.2022094726562, + "kl_loss_6": 2335.312158203125, + "learning_rate": 0.000861867019052535, + "loss": 1779.5314, + "step": 2500 + }, + { + "ce_loss_12": 3.405802917480469, + "ce_loss_17": 2.9695845365524294, + "ce_loss_23": 2.8024597764015198, + "ce_loss_3": 4.301933288574219, + "ce_loss_6": 3.963208818435669, + "epoch": 0.251, + "grad_norm": 1016.0, + "kl_loss_12": 1317.6872192382812, + "kl_loss_17": 365.97511596679686, + "kl_loss_3": 3089.4097900390625, + "kl_loss_6": 2429.868518066406, + "learning_rate": 0.0008607702760872678, + "loss": 1795.1891, + "step": 2510 + }, + { + "ce_loss_12": 3.4766680002212524, + "ce_loss_17": 3.0628212213516237, + "ce_loss_23": 2.902873623371124, + "ce_loss_3": 4.3218683242797855, + "ce_loss_6": 3.9976150155067445, + "epoch": 0.252, + "grad_norm": 1056.0, + "kl_loss_12": 1269.5998901367188, + "kl_loss_17": 356.7829559326172, + "kl_loss_3": 2949.064074707031, + "kl_loss_6": 2307.2, + "learning_rate": 0.0008596699001693256, + "loss": 1769.2199, + "step": 2520 + }, + { + "ce_loss_12": 3.4674214720726013, + "ce_loss_17": 3.0689143180847167, + "ce_loss_23": 2.9168426513671877, + "ce_loss_3": 4.330565083026886, + "ce_loss_6": 3.996849024295807, + "epoch": 0.253, + "grad_norm": 1040.0, + "kl_loss_12": 1256.2728576660156, + "kl_loss_17": 347.1592544555664, + "kl_loss_3": 2967.556774902344, + "kl_loss_6": 2312.917028808594, + "learning_rate": 0.0008585659023794818, + "loss": 1777.5496, + "step": 2530 + }, + { + "ce_loss_12": 3.488456690311432, + "ce_loss_17": 3.049675261974335, + "ce_loss_23": 2.8808682441711424, + "ce_loss_3": 4.386857700347901, + "ce_loss_6": 4.042752182483673, + "epoch": 0.254, + "grad_norm": 1024.0, + "kl_loss_12": 1329.540185546875, + "kl_loss_17": 367.1026809692383, + "kl_loss_3": 3106.331298828125, + "kl_loss_6": 2430.2492797851564, + "learning_rate": 0.0008574582938349817, + "loss": 1783.7406, + "step": 2540 + }, + { + "ce_loss_12": 3.4827857851982116, + "ce_loss_17": 3.033743643760681, + "ce_loss_23": 2.856909465789795, + "ce_loss_3": 4.363191175460815, + "ce_loss_6": 4.0178595662117, + "epoch": 0.255, + "grad_norm": 928.0, + "kl_loss_12": 1360.4717224121093, + "kl_loss_17": 387.29022216796875, + "kl_loss_3": 3103.3068725585936, + "kl_loss_6": 2429.029577636719, + "learning_rate": 0.0008563470856894315, + "loss": 1761.7795, + "step": 2550 + }, + { + "ce_loss_12": 3.453168773651123, + "ce_loss_17": 3.019764852523804, + "ce_loss_23": 2.8588831305503843, + "ce_loss_3": 4.325317311286926, + "ce_loss_6": 3.998428213596344, + "epoch": 0.256, + "grad_norm": 1136.0, + "kl_loss_12": 1323.9368774414063, + "kl_loss_17": 359.41111907958987, + "kl_loss_3": 3045.49697265625, + "kl_loss_6": 2401.3049194335936, + "learning_rate": 0.0008552322891326845, + "loss": 1768.4219, + "step": 2560 + }, + { + "ce_loss_12": 3.422929072380066, + "ce_loss_17": 3.001448130607605, + "ce_loss_23": 2.832953155040741, + "ce_loss_3": 4.307469379901886, + "ce_loss_6": 3.963564693927765, + "epoch": 0.257, + "grad_norm": 952.0, + "kl_loss_12": 1304.7041931152344, + "kl_loss_17": 367.04587707519534, + "kl_loss_3": 3057.959326171875, + "kl_loss_6": 2391.83447265625, + "learning_rate": 0.0008541139153907296, + "loss": 1751.0988, + "step": 2570 + }, + { + "ce_loss_12": 3.3787537217140198, + "ce_loss_17": 2.958510947227478, + "ce_loss_23": 2.7977961897850037, + "ce_loss_3": 4.249011647701264, + "ce_loss_6": 3.9145611047744753, + "epoch": 0.258, + "grad_norm": 1048.0, + "kl_loss_12": 1283.3902404785156, + "kl_loss_17": 355.2511352539062, + "kl_loss_3": 3008.0134521484374, + "kl_loss_6": 2352.897106933594, + "learning_rate": 0.0008529919757255782, + "loss": 1775.459, + "step": 2580 + }, + { + "ce_loss_12": 3.397475516796112, + "ce_loss_17": 2.9850598335266114, + "ce_loss_23": 2.833599638938904, + "ce_loss_3": 4.2180173873901365, + "ce_loss_6": 3.8947041869163512, + "epoch": 0.259, + "grad_norm": 992.0, + "kl_loss_12": 1272.6158386230468, + "kl_loss_17": 348.8061218261719, + "kl_loss_3": 2917.62060546875, + "kl_loss_6": 2278.41728515625, + "learning_rate": 0.0008518664814351503, + "loss": 1723.7156, + "step": 2590 + }, + { + "ce_loss_12": 3.398776721954346, + "ce_loss_17": 2.960651087760925, + "ce_loss_23": 2.7926816940307617, + "ce_loss_3": 4.279233813285828, + "ce_loss_6": 3.9336220383644105, + "epoch": 0.26, + "grad_norm": 984.0, + "kl_loss_12": 1338.0150085449218, + "kl_loss_17": 370.83314056396483, + "kl_loss_3": 3081.394177246094, + "kl_loss_6": 2407.3505981445314, + "learning_rate": 0.0008507374438531607, + "loss": 1831.2203, + "step": 2600 + }, + { + "ce_loss_12": 3.3708202481269836, + "ce_loss_17": 2.939207446575165, + "ce_loss_23": 2.7798258900642394, + "ce_loss_3": 4.22383736371994, + "ce_loss_6": 3.899499309062958, + "epoch": 0.261, + "grad_norm": 1012.0, + "kl_loss_12": 1302.8319213867187, + "kl_loss_17": 356.8877914428711, + "kl_loss_3": 2991.932568359375, + "kl_loss_6": 2358.0376098632814, + "learning_rate": 0.0008496048743490053, + "loss": 1752.1227, + "step": 2610 + }, + { + "ce_loss_12": 3.4945289492607117, + "ce_loss_17": 3.0854422569274904, + "ce_loss_23": 2.926167845726013, + "ce_loss_3": 4.32569922208786, + "ce_loss_6": 4.003281497955323, + "epoch": 0.262, + "grad_norm": 1464.0, + "kl_loss_12": 1268.1002868652345, + "kl_loss_17": 355.6738220214844, + "kl_loss_3": 2944.6495727539063, + "kl_loss_6": 2304.16591796875, + "learning_rate": 0.0008484687843276469, + "loss": 1737.8074, + "step": 2620 + }, + { + "ce_loss_12": 3.437246561050415, + "ce_loss_17": 3.01961772441864, + "ce_loss_23": 2.8550262570381166, + "ce_loss_3": 4.30987902879715, + "ce_loss_6": 3.969077003002167, + "epoch": 0.263, + "grad_norm": 924.0, + "kl_loss_12": 1300.986590576172, + "kl_loss_17": 363.2773895263672, + "kl_loss_3": 3022.796911621094, + "kl_loss_6": 2354.0840698242187, + "learning_rate": 0.0008473291852294987, + "loss": 1774.0008, + "step": 2630 + }, + { + "ce_loss_12": 3.4466328263282775, + "ce_loss_17": 3.0360899448394774, + "ce_loss_23": 2.8658475399017336, + "ce_loss_3": 4.297406530380249, + "ce_loss_6": 3.971087157726288, + "epoch": 0.264, + "grad_norm": 968.0, + "kl_loss_12": 1296.3372619628906, + "kl_loss_17": 367.7685241699219, + "kl_loss_3": 2995.4369140625, + "kl_loss_6": 2360.178326416016, + "learning_rate": 0.0008461860885303114, + "loss": 1742.2068, + "step": 2640 + }, + { + "ce_loss_12": 3.460123133659363, + "ce_loss_17": 3.050652766227722, + "ce_loss_23": 2.8907677292823792, + "ce_loss_3": 4.304844355583191, + "ce_loss_6": 3.9868418335914613, + "epoch": 0.265, + "grad_norm": 1440.0, + "kl_loss_12": 1261.4657043457032, + "kl_loss_17": 360.51488494873047, + "kl_loss_3": 2945.7997192382813, + "kl_loss_6": 2313.209997558594, + "learning_rate": 0.000845039505741056, + "loss": 1748.1709, + "step": 2650 + }, + { + "ce_loss_12": 3.465388464927673, + "ce_loss_17": 3.0377141714096068, + "ce_loss_23": 2.8722215056419373, + "ce_loss_3": 4.3243202567100525, + "ce_loss_6": 3.991137969493866, + "epoch": 0.266, + "grad_norm": 1012.0, + "kl_loss_12": 1322.8445251464843, + "kl_loss_17": 374.1597625732422, + "kl_loss_3": 3032.355578613281, + "kl_loss_6": 2393.8152709960937, + "learning_rate": 0.0008438894484078086, + "loss": 1821.5604, + "step": 2660 + }, + { + "ce_loss_12": 3.450354981422424, + "ce_loss_17": 3.056048274040222, + "ce_loss_23": 2.8842979192733766, + "ce_loss_3": 4.297944974899292, + "ce_loss_6": 3.983082890510559, + "epoch": 0.267, + "grad_norm": 928.0, + "kl_loss_12": 1266.1199890136718, + "kl_loss_17": 382.5615264892578, + "kl_loss_3": 2958.8844482421873, + "kl_loss_6": 2330.66201171875, + "learning_rate": 0.0008427359281116334, + "loss": 1752.3658, + "step": 2670 + }, + { + "ce_loss_12": 3.37357120513916, + "ce_loss_17": 2.9571158409118654, + "ce_loss_23": 2.7853593945503237, + "ce_loss_3": 4.2602328896522526, + "ce_loss_6": 3.9257378458976744, + "epoch": 0.268, + "grad_norm": 1448.0, + "kl_loss_12": 1305.2779907226563, + "kl_loss_17": 383.0894973754883, + "kl_loss_3": 3063.417590332031, + "kl_loss_6": 2411.9236450195312, + "learning_rate": 0.0008415789564684673, + "loss": 1772.8453, + "step": 2680 + }, + { + "ce_loss_12": 3.5913300633430483, + "ce_loss_17": 3.1934632778167726, + "ce_loss_23": 3.019876754283905, + "ce_loss_3": 4.42219307422638, + "ce_loss_6": 4.088761901855468, + "epoch": 0.269, + "grad_norm": 1136.0, + "kl_loss_12": 1267.3768493652344, + "kl_loss_17": 386.0883987426758, + "kl_loss_3": 2907.607763671875, + "kl_loss_6": 2264.976397705078, + "learning_rate": 0.0008404185451290017, + "loss": 1715.9904, + "step": 2690 + }, + { + "ce_loss_12": 3.4683207511901855, + "ce_loss_17": 3.07811154127121, + "ce_loss_23": 2.8965150237083437, + "ce_loss_3": 4.320998978614807, + "ce_loss_6": 3.992653453350067, + "epoch": 0.27, + "grad_norm": 1240.0, + "kl_loss_12": 1260.81787109375, + "kl_loss_17": 394.48106994628904, + "kl_loss_3": 2952.572314453125, + "kl_loss_6": 2312.224505615234, + "learning_rate": 0.0008392547057785661, + "loss": 1735.7289, + "step": 2700 + }, + { + "ce_loss_12": 3.4322686553001405, + "ce_loss_17": 3.0301548719406126, + "ce_loss_23": 2.8392791628837584, + "ce_loss_3": 4.310086572170258, + "ce_loss_6": 3.9823203444480897, + "epoch": 0.271, + "grad_norm": 880.0, + "kl_loss_12": 1318.2845825195313, + "kl_loss_17": 423.6947479248047, + "kl_loss_3": 3075.54521484375, + "kl_loss_6": 2435.184326171875, + "learning_rate": 0.0008380874501370098, + "loss": 1750.4709, + "step": 2710 + }, + { + "ce_loss_12": 3.4204657673835754, + "ce_loss_17": 3.0037129402160643, + "ce_loss_23": 2.821027398109436, + "ce_loss_3": 4.296162378787995, + "ce_loss_6": 3.9711161613464356, + "epoch": 0.272, + "grad_norm": 928.0, + "kl_loss_12": 1301.1679565429688, + "kl_loss_17": 401.3130187988281, + "kl_loss_3": 3049.724011230469, + "kl_loss_6": 2402.8673461914063, + "learning_rate": 0.0008369167899585841, + "loss": 1774.0307, + "step": 2720 + }, + { + "ce_loss_12": 3.487650990486145, + "ce_loss_17": 3.1002121210098266, + "ce_loss_23": 2.936123478412628, + "ce_loss_3": 4.30772819519043, + "ce_loss_6": 3.9867376923561095, + "epoch": 0.273, + "grad_norm": 1088.0, + "kl_loss_12": 1245.5649353027343, + "kl_loss_17": 369.61613006591796, + "kl_loss_3": 2892.830920410156, + "kl_loss_6": 2261.4607543945312, + "learning_rate": 0.0008357427370318238, + "loss": 1752.9934, + "step": 2730 + }, + { + "ce_loss_12": 3.45722119808197, + "ce_loss_17": 3.0482855081558227, + "ce_loss_23": 2.884982371330261, + "ce_loss_3": 4.328402829170227, + "ce_loss_6": 4.003220283985138, + "epoch": 0.274, + "grad_norm": 1320.0, + "kl_loss_12": 1274.736767578125, + "kl_loss_17": 362.33515014648435, + "kl_loss_3": 3004.6664428710938, + "kl_loss_6": 2365.4545654296876, + "learning_rate": 0.0008345653031794292, + "loss": 1758.5988, + "step": 2740 + }, + { + "ce_loss_12": 3.468037247657776, + "ce_loss_17": 3.0583531856536865, + "ce_loss_23": 2.8902764678001405, + "ce_loss_3": 4.325004518032074, + "ce_loss_6": 3.9913942217826843, + "epoch": 0.275, + "grad_norm": 1248.0, + "kl_loss_12": 1277.5965454101563, + "kl_loss_17": 370.7550796508789, + "kl_loss_3": 2985.575329589844, + "kl_loss_6": 2330.1687194824217, + "learning_rate": 0.0008333845002581458, + "loss": 1745.3102, + "step": 2750 + }, + { + "ce_loss_12": 3.415788435935974, + "ce_loss_17": 2.9950094342231752, + "ce_loss_23": 2.826165699958801, + "ce_loss_3": 4.2899749159812925, + "ce_loss_6": 3.957856571674347, + "epoch": 0.276, + "grad_norm": 1040.0, + "kl_loss_12": 1311.6923767089843, + "kl_loss_17": 368.13220062255857, + "kl_loss_3": 3057.960583496094, + "kl_loss_6": 2412.7806030273437, + "learning_rate": 0.0008322003401586462, + "loss": 1780.4906, + "step": 2760 + }, + { + "ce_loss_12": 3.4133952260017395, + "ce_loss_17": 3.0063074231147766, + "ce_loss_23": 2.8547223687171934, + "ce_loss_3": 4.248537862300873, + "ce_loss_6": 3.930584025382996, + "epoch": 0.277, + "grad_norm": 1120.0, + "kl_loss_12": 1237.89150390625, + "kl_loss_17": 345.06705474853516, + "kl_loss_3": 2913.3134033203123, + "kl_loss_6": 2285.3903869628907, + "learning_rate": 0.0008310128348054094, + "loss": 1679.1152, + "step": 2770 + }, + { + "ce_loss_12": 3.3873406529426573, + "ce_loss_17": 2.9817344427108763, + "ce_loss_23": 2.826510119438171, + "ce_loss_3": 4.259957826137542, + "ce_loss_6": 3.922231209278107, + "epoch": 0.278, + "grad_norm": 1056.0, + "kl_loss_12": 1256.4411865234374, + "kl_loss_17": 350.2546112060547, + "kl_loss_3": 2975.923962402344, + "kl_loss_6": 2313.7191162109375, + "learning_rate": 0.0008298219961566008, + "loss": 1732.8988, + "step": 2780 + }, + { + "ce_loss_12": 3.371931481361389, + "ce_loss_17": 2.9464340090751646, + "ce_loss_23": 2.7890324234962462, + "ce_loss_3": 4.25629768371582, + "ce_loss_6": 3.9189098596572878, + "epoch": 0.279, + "grad_norm": 1368.0, + "kl_loss_12": 1300.705126953125, + "kl_loss_17": 354.02294006347654, + "kl_loss_3": 3076.6906494140626, + "kl_loss_6": 2407.201062011719, + "learning_rate": 0.0008286278362039527, + "loss": 1741.5291, + "step": 2790 + }, + { + "ce_loss_12": 3.410293257236481, + "ce_loss_17": 2.983790063858032, + "ce_loss_23": 2.818021869659424, + "ce_loss_3": 4.301714396476745, + "ce_loss_6": 3.9640199065208437, + "epoch": 0.28, + "grad_norm": 1040.0, + "kl_loss_12": 1309.4822204589843, + "kl_loss_17": 358.61011657714846, + "kl_loss_3": 3090.107067871094, + "kl_loss_6": 2422.9576782226563, + "learning_rate": 0.0008274303669726426, + "loss": 1748.3598, + "step": 2800 + }, + { + "ce_loss_12": 3.3253208994865417, + "ce_loss_17": 2.8995402812957765, + "ce_loss_23": 2.737238872051239, + "ce_loss_3": 4.240236687660217, + "ce_loss_6": 3.903521752357483, + "epoch": 0.281, + "grad_norm": 900.0, + "kl_loss_12": 1297.9562866210938, + "kl_loss_17": 362.3501480102539, + "kl_loss_3": 3113.5695922851564, + "kl_loss_6": 2452.9843994140624, + "learning_rate": 0.0008262296005211721, + "loss": 1740.7102, + "step": 2810 + }, + { + "ce_loss_12": 3.437107729911804, + "ce_loss_17": 3.014890968799591, + "ce_loss_23": 2.8532368183135985, + "ce_loss_3": 4.30629141330719, + "ce_loss_6": 3.9686094522476196, + "epoch": 0.282, + "grad_norm": 1176.0, + "kl_loss_12": 1297.9700988769532, + "kl_loss_17": 360.28114929199216, + "kl_loss_3": 3020.598486328125, + "kl_loss_6": 2359.4715087890627, + "learning_rate": 0.0008250255489412463, + "loss": 1736.0588, + "step": 2820 + }, + { + "ce_loss_12": 3.5168994188308718, + "ce_loss_17": 3.103827476501465, + "ce_loss_23": 2.9425456881523133, + "ce_loss_3": 4.380283188819885, + "ce_loss_6": 4.0496561288833615, + "epoch": 0.283, + "grad_norm": 940.0, + "kl_loss_12": 1263.3302734375, + "kl_loss_17": 358.89661102294923, + "kl_loss_3": 2991.8945556640624, + "kl_loss_6": 2343.1708740234376, + "learning_rate": 0.0008238182243576511, + "loss": 1739.1707, + "step": 2830 + }, + { + "ce_loss_12": 3.4478026747703554, + "ce_loss_17": 3.0591145396232604, + "ce_loss_23": 2.9078246116638184, + "ce_loss_3": 4.258657145500183, + "ce_loss_6": 3.9365967869758607, + "epoch": 0.284, + "grad_norm": 988.0, + "kl_loss_12": 1218.239813232422, + "kl_loss_17": 343.7883529663086, + "kl_loss_3": 2836.483447265625, + "kl_loss_6": 2205.292608642578, + "learning_rate": 0.0008226076389281315, + "loss": 1684.1754, + "step": 2840 + }, + { + "ce_loss_12": 3.5043246269226076, + "ce_loss_17": 3.104953384399414, + "ce_loss_23": 2.9533308386802672, + "ce_loss_3": 4.344747185707092, + "ce_loss_6": 4.0262510776519775, + "epoch": 0.285, + "grad_norm": 1120.0, + "kl_loss_12": 1250.2501098632813, + "kl_loss_17": 346.8875228881836, + "kl_loss_3": 2944.923376464844, + "kl_loss_6": 2315.420861816406, + "learning_rate": 0.0008213938048432696, + "loss": 1694.4004, + "step": 2850 + }, + { + "ce_loss_12": 3.443698799610138, + "ce_loss_17": 3.0415454149246215, + "ce_loss_23": 2.8757529616355897, + "ce_loss_3": 4.296736359596252, + "ce_loss_6": 3.968770682811737, + "epoch": 0.286, + "grad_norm": 1400.0, + "kl_loss_12": 1260.857196044922, + "kl_loss_17": 360.1633895874023, + "kl_loss_3": 2964.88828125, + "kl_loss_6": 2322.2742919921875, + "learning_rate": 0.0008201767343263612, + "loss": 1735.0531, + "step": 2860 + }, + { + "ce_loss_12": 3.408144676685333, + "ce_loss_17": 2.992103934288025, + "ce_loss_23": 2.8287665486335754, + "ce_loss_3": 4.272946655750275, + "ce_loss_6": 3.9430832743644713, + "epoch": 0.287, + "grad_norm": 1032.0, + "kl_loss_12": 1275.9811218261718, + "kl_loss_17": 349.76739501953125, + "kl_loss_3": 3006.3862426757814, + "kl_loss_6": 2355.1337646484376, + "learning_rate": 0.0008189564396332927, + "loss": 1692.4711, + "step": 2870 + }, + { + "ce_loss_12": 3.3988120436668394, + "ce_loss_17": 2.9796494960784914, + "ce_loss_23": 2.822283446788788, + "ce_loss_3": 4.284251952171326, + "ce_loss_6": 3.934915232658386, + "epoch": 0.288, + "grad_norm": 956.0, + "kl_loss_12": 1261.259295654297, + "kl_loss_17": 351.22632751464846, + "kl_loss_3": 3018.626599121094, + "kl_loss_6": 2347.247448730469, + "learning_rate": 0.0008177329330524181, + "loss": 1743.3631, + "step": 2880 + }, + { + "ce_loss_12": 3.42569385766983, + "ce_loss_17": 3.0192829370498657, + "ce_loss_23": 2.860982358455658, + "ce_loss_3": 4.265313994884491, + "ce_loss_6": 3.939829432964325, + "epoch": 0.289, + "grad_norm": 936.0, + "kl_loss_12": 1248.4840698242188, + "kl_loss_17": 348.7205474853516, + "kl_loss_3": 2906.6572387695314, + "kl_loss_6": 2269.7896362304687, + "learning_rate": 0.0008165062269044352, + "loss": 1709.3297, + "step": 2890 + }, + { + "ce_loss_12": 3.422416353225708, + "ce_loss_17": 2.9858880400657655, + "ce_loss_23": 2.8259318113327025, + "ce_loss_3": 4.268325328826904, + "ce_loss_6": 3.9391417264938355, + "epoch": 0.29, + "grad_norm": 1264.0, + "kl_loss_12": 1315.4104431152343, + "kl_loss_17": 357.4076446533203, + "kl_loss_3": 3013.7338623046876, + "kl_loss_6": 2362.8879516601564, + "learning_rate": 0.0008152763335422613, + "loss": 1761.2363, + "step": 2900 + }, + { + "ce_loss_12": 3.4009305357933046, + "ce_loss_17": 2.974248266220093, + "ce_loss_23": 2.8042847514152527, + "ce_loss_3": 4.2600690722465515, + "ce_loss_6": 3.9201393485069276, + "epoch": 0.291, + "grad_norm": 1144.0, + "kl_loss_12": 1298.79541015625, + "kl_loss_17": 360.68007507324216, + "kl_loss_3": 3015.308837890625, + "kl_loss_6": 2350.725445556641, + "learning_rate": 0.0008140432653509088, + "loss": 1733.8855, + "step": 2910 + }, + { + "ce_loss_12": 3.433920180797577, + "ce_loss_17": 3.023169231414795, + "ce_loss_23": 2.8585532546043395, + "ce_loss_3": 4.279167592525482, + "ce_loss_6": 3.94570974111557, + "epoch": 0.292, + "grad_norm": 980.0, + "kl_loss_12": 1282.2089416503907, + "kl_loss_17": 364.0143417358398, + "kl_loss_3": 2988.808801269531, + "kl_loss_6": 2321.889697265625, + "learning_rate": 0.0008128070347473608, + "loss": 1725.4188, + "step": 2920 + }, + { + "ce_loss_12": 3.452561819553375, + "ce_loss_17": 3.036209762096405, + "ce_loss_23": 2.86896470785141, + "ce_loss_3": 4.338265037536621, + "ce_loss_6": 3.9918197631835937, + "epoch": 0.293, + "grad_norm": 1368.0, + "kl_loss_12": 1288.3324584960938, + "kl_loss_17": 367.0218811035156, + "kl_loss_3": 3065.8913818359374, + "kl_loss_6": 2384.238067626953, + "learning_rate": 0.0008115676541804455, + "loss": 1742.3186, + "step": 2930 + }, + { + "ce_loss_12": 3.429081213474274, + "ce_loss_17": 3.032915246486664, + "ce_loss_23": 2.8716418743133545, + "ce_loss_3": 4.276789140701294, + "ce_loss_6": 3.95506272315979, + "epoch": 0.294, + "grad_norm": 1160.0, + "kl_loss_12": 1249.3301696777344, + "kl_loss_17": 362.1129867553711, + "kl_loss_3": 2951.0944580078126, + "kl_loss_6": 2311.5976318359376, + "learning_rate": 0.0008103251361307119, + "loss": 1741.0098, + "step": 2940 + }, + { + "ce_loss_12": 3.467066729068756, + "ce_loss_17": 3.0619003772735596, + "ce_loss_23": 2.8990610003471375, + "ce_loss_3": 4.321110010147095, + "ce_loss_6": 3.990778183937073, + "epoch": 0.295, + "grad_norm": 984.0, + "kl_loss_12": 1265.3674926757812, + "kl_loss_17": 364.09894409179685, + "kl_loss_3": 2966.912878417969, + "kl_loss_6": 2325.2829406738283, + "learning_rate": 0.0008090794931103026, + "loss": 1712.6641, + "step": 2950 + }, + { + "ce_loss_12": 3.4314486742019654, + "ce_loss_17": 3.0392224907875063, + "ce_loss_23": 2.88177170753479, + "ce_loss_3": 4.275236093997956, + "ce_loss_6": 3.951506805419922, + "epoch": 0.296, + "grad_norm": 1048.0, + "kl_loss_12": 1241.217755126953, + "kl_loss_17": 350.6375930786133, + "kl_loss_3": 2931.218994140625, + "kl_loss_6": 2290.7966796875, + "learning_rate": 0.0008078307376628291, + "loss": 1712.4719, + "step": 2960 + }, + { + "ce_loss_12": 3.4813525915145873, + "ce_loss_17": 3.095889449119568, + "ce_loss_23": 2.9383904099464417, + "ce_loss_3": 4.30640218257904, + "ce_loss_6": 3.9789917945861815, + "epoch": 0.297, + "grad_norm": 1320.0, + "kl_loss_12": 1200.214617919922, + "kl_loss_17": 356.74906463623046, + "kl_loss_3": 2835.6501342773436, + "kl_loss_6": 2204.1634033203127, + "learning_rate": 0.000806578882363245, + "loss": 1654.0092, + "step": 2970 + }, + { + "ce_loss_12": 3.4101366281509398, + "ce_loss_17": 3.0129162073135376, + "ce_loss_23": 2.86002494096756, + "ce_loss_3": 4.239542484283447, + "ce_loss_6": 3.9237579584121702, + "epoch": 0.298, + "grad_norm": 996.0, + "kl_loss_12": 1232.5573425292969, + "kl_loss_17": 347.3342086791992, + "kl_loss_3": 2897.1833251953126, + "kl_loss_6": 2273.477685546875, + "learning_rate": 0.0008053239398177191, + "loss": 1723.9477, + "step": 2980 + }, + { + "ce_loss_12": 3.4091039061546327, + "ce_loss_17": 3.007597863674164, + "ce_loss_23": 2.847511053085327, + "ce_loss_3": 4.270219349861145, + "ce_loss_6": 3.9442752599716187, + "epoch": 0.299, + "grad_norm": 1000.0, + "kl_loss_12": 1248.1081665039062, + "kl_loss_17": 354.12125701904296, + "kl_loss_3": 2963.1808349609373, + "kl_loss_6": 2330.194610595703, + "learning_rate": 0.0008040659226635089, + "loss": 1757.8508, + "step": 2990 + }, + { + "ce_loss_12": 3.527987062931061, + "ce_loss_17": 3.1199743270874025, + "ce_loss_23": 2.954590117931366, + "ce_loss_3": 4.3627019882202145, + "ce_loss_6": 4.041871237754822, + "epoch": 0.3, + "grad_norm": 920.0, + "kl_loss_12": 1262.4868103027343, + "kl_loss_17": 369.21068878173827, + "kl_loss_3": 2945.7930297851562, + "kl_loss_6": 2307.741522216797, + "learning_rate": 0.0008028048435688333, + "loss": 1701.9465, + "step": 3000 + }, + { + "ce_loss_12": 3.424453687667847, + "ce_loss_17": 3.0054264664649963, + "ce_loss_23": 2.8507750153541567, + "ce_loss_3": 4.295927572250366, + "ce_loss_6": 3.9547128438949586, + "epoch": 0.301, + "grad_norm": 872.0, + "kl_loss_12": 1277.3141723632812, + "kl_loss_17": 354.91680297851565, + "kl_loss_3": 3015.4612426757812, + "kl_loss_6": 2361.0749755859374, + "learning_rate": 0.0008015407152327448, + "loss": 1730.5312, + "step": 3010 + }, + { + "ce_loss_12": 3.4446123480796813, + "ce_loss_17": 3.047043168544769, + "ce_loss_23": 2.8878801107406615, + "ce_loss_3": 4.314627742767334, + "ce_loss_6": 3.9756017565727233, + "epoch": 0.302, + "grad_norm": 1104.0, + "kl_loss_12": 1255.7592834472657, + "kl_loss_17": 354.1048187255859, + "kl_loss_3": 3005.7612426757814, + "kl_loss_6": 2343.475305175781, + "learning_rate": 0.0008002735503850016, + "loss": 1735.2307, + "step": 3020 + }, + { + "ce_loss_12": 3.3590387105941772, + "ce_loss_17": 2.944441223144531, + "ce_loss_23": 2.785951316356659, + "ce_loss_3": 4.251755249500275, + "ce_loss_6": 3.910470759868622, + "epoch": 0.303, + "grad_norm": 1208.0, + "kl_loss_12": 1281.0070678710938, + "kl_loss_17": 354.10951843261716, + "kl_loss_3": 3049.8285888671876, + "kl_loss_6": 2380.6233093261717, + "learning_rate": 0.0007990033617859396, + "loss": 1756.1746, + "step": 3030 + }, + { + "ce_loss_12": 3.398754870891571, + "ce_loss_17": 2.9987738013267515, + "ce_loss_23": 2.8411520957946776, + "ce_loss_3": 4.253010249137878, + "ce_loss_6": 3.915536332130432, + "epoch": 0.304, + "grad_norm": 996.0, + "kl_loss_12": 1240.5551879882812, + "kl_loss_17": 345.8853164672852, + "kl_loss_3": 2941.6995849609375, + "kl_loss_6": 2288.344030761719, + "learning_rate": 0.000797730162226344, + "loss": 1664.4863, + "step": 3040 + }, + { + "ce_loss_12": 3.423166370391846, + "ce_loss_17": 3.015391969680786, + "ce_loss_23": 2.857586407661438, + "ce_loss_3": 4.28016722202301, + "ce_loss_6": 3.9513394713401793, + "epoch": 0.305, + "grad_norm": 1280.0, + "kl_loss_12": 1252.754815673828, + "kl_loss_17": 347.2244384765625, + "kl_loss_3": 2958.3191040039064, + "kl_loss_6": 2312.717321777344, + "learning_rate": 0.0007964539645273203, + "loss": 1708.0309, + "step": 3050 + }, + { + "ce_loss_12": 3.4114735603332518, + "ce_loss_17": 3.025020956993103, + "ce_loss_23": 2.880479061603546, + "ce_loss_3": 4.265051865577698, + "ce_loss_6": 3.9374590635299684, + "epoch": 0.306, + "grad_norm": 984.0, + "kl_loss_12": 1201.3837463378907, + "kl_loss_17": 333.77726287841796, + "kl_loss_3": 2892.557958984375, + "kl_loss_6": 2244.813525390625, + "learning_rate": 0.000795174781540165, + "loss": 1696.2406, + "step": 3060 + }, + { + "ce_loss_12": 3.4819996476173403, + "ce_loss_17": 3.0955939888954163, + "ce_loss_23": 2.9482061147689818, + "ce_loss_3": 4.297872924804688, + "ce_loss_6": 3.9962024688720703, + "epoch": 0.307, + "grad_norm": 824.0, + "kl_loss_12": 1201.2342224121094, + "kl_loss_17": 335.6284637451172, + "kl_loss_3": 2830.093212890625, + "kl_loss_6": 2229.5277893066404, + "learning_rate": 0.0007938926261462366, + "loss": 1689.6482, + "step": 3070 + }, + { + "ce_loss_12": 3.4330130338668825, + "ce_loss_17": 3.051720929145813, + "ce_loss_23": 2.897457814216614, + "ce_loss_3": 4.26753751039505, + "ce_loss_6": 3.942421293258667, + "epoch": 0.308, + "grad_norm": 1032.0, + "kl_loss_12": 1214.312567138672, + "kl_loss_17": 344.4313339233398, + "kl_loss_3": 2897.050341796875, + "kl_loss_6": 2260.629071044922, + "learning_rate": 0.0007926075112568258, + "loss": 1714.8252, + "step": 3080 + }, + { + "ce_loss_12": 3.4475087881088258, + "ce_loss_17": 3.053488528728485, + "ce_loss_23": 2.8923556566238404, + "ce_loss_3": 4.28842933177948, + "ce_loss_6": 3.9646134853363035, + "epoch": 0.309, + "grad_norm": 928.0, + "kl_loss_12": 1240.5612548828126, + "kl_loss_17": 354.20672912597655, + "kl_loss_3": 2923.2433959960936, + "kl_loss_6": 2286.2694580078123, + "learning_rate": 0.0007913194498130252, + "loss": 1677.05, + "step": 3090 + }, + { + "ce_loss_12": 3.3970143675804136, + "ce_loss_17": 2.9865134000778197, + "ce_loss_23": 2.8217122316360475, + "ce_loss_3": 4.25620584487915, + "ce_loss_6": 3.9191672444343566, + "epoch": 0.31, + "grad_norm": 1240.0, + "kl_loss_12": 1247.7857238769532, + "kl_loss_17": 356.9169692993164, + "kl_loss_3": 2962.325329589844, + "kl_loss_6": 2299.6020629882814, + "learning_rate": 0.0007900284547855992, + "loss": 1725.1816, + "step": 3100 + }, + { + "ce_loss_12": 3.385082983970642, + "ce_loss_17": 2.99058358669281, + "ce_loss_23": 2.837337374687195, + "ce_loss_3": 4.213411450386047, + "ce_loss_6": 3.886853301525116, + "epoch": 0.311, + "grad_norm": 804.0, + "kl_loss_12": 1234.5562255859375, + "kl_loss_17": 350.2632308959961, + "kl_loss_3": 2897.912805175781, + "kl_loss_6": 2260.5111755371095, + "learning_rate": 0.0007887345391748532, + "loss": 1719.3498, + "step": 3110 + }, + { + "ce_loss_12": 3.4767756819725038, + "ce_loss_17": 3.0919549822807313, + "ce_loss_23": 2.939840757846832, + "ce_loss_3": 4.2933003306388855, + "ce_loss_6": 3.976503777503967, + "epoch": 0.312, + "grad_norm": 1264.0, + "kl_loss_12": 1205.0848999023438, + "kl_loss_17": 345.1719009399414, + "kl_loss_3": 2828.3422607421876, + "kl_loss_6": 2205.365673828125, + "learning_rate": 0.0007874377160105036, + "loss": 1645.6273, + "step": 3120 + }, + { + "ce_loss_12": 3.428488540649414, + "ce_loss_17": 3.0121246337890626, + "ce_loss_23": 2.8537957429885865, + "ce_loss_3": 4.28159236907959, + "ce_loss_6": 3.9406146883964537, + "epoch": 0.313, + "grad_norm": 928.0, + "kl_loss_12": 1269.7479125976563, + "kl_loss_17": 346.16997680664065, + "kl_loss_3": 2976.1847412109373, + "kl_loss_6": 2310.867999267578, + "learning_rate": 0.0007861379983515449, + "loss": 1755.5637, + "step": 3130 + }, + { + "ce_loss_12": 3.478686714172363, + "ce_loss_17": 3.074314284324646, + "ce_loss_23": 2.9215672969818116, + "ce_loss_3": 4.305224299430847, + "ce_loss_6": 3.9890227794647215, + "epoch": 0.314, + "grad_norm": 1012.0, + "kl_loss_12": 1248.4745849609376, + "kl_loss_17": 339.9023941040039, + "kl_loss_3": 2893.8021484375, + "kl_loss_6": 2270.5517761230467, + "learning_rate": 0.0007848353992861195, + "loss": 1679.8211, + "step": 3140 + }, + { + "ce_loss_12": 3.574088752269745, + "ce_loss_17": 3.160393226146698, + "ce_loss_23": 2.9927392840385436, + "ce_loss_3": 4.402963733673095, + "ce_loss_6": 4.084368908405304, + "epoch": 0.315, + "grad_norm": 932.0, + "kl_loss_12": 1281.5710205078126, + "kl_loss_17": 361.84169921875, + "kl_loss_3": 2930.8458740234373, + "kl_loss_6": 2299.417041015625, + "learning_rate": 0.0007835299319313853, + "loss": 1721.2396, + "step": 3150 + }, + { + "ce_loss_12": 3.441893661022186, + "ce_loss_17": 3.050514340400696, + "ce_loss_23": 2.9018136262893677, + "ce_loss_3": 4.269163203239441, + "ce_loss_6": 3.950773096084595, + "epoch": 0.316, + "grad_norm": 1112.0, + "kl_loss_12": 1213.3498229980469, + "kl_loss_17": 338.602278137207, + "kl_loss_3": 2869.5227783203127, + "kl_loss_6": 2248.8671264648438, + "learning_rate": 0.0007822216094333848, + "loss": 1721.7668, + "step": 3160 + }, + { + "ce_loss_12": 3.4707986950874328, + "ce_loss_17": 3.0577391386032104, + "ce_loss_23": 2.905358004570007, + "ce_loss_3": 4.321441674232483, + "ce_loss_6": 3.9936294078826906, + "epoch": 0.317, + "grad_norm": 1024.0, + "kl_loss_12": 1257.6893615722656, + "kl_loss_17": 341.61038665771486, + "kl_loss_3": 2954.4898315429687, + "kl_loss_6": 2308.9023193359376, + "learning_rate": 0.0007809104449669101, + "loss": 1698.5734, + "step": 3170 + }, + { + "ce_loss_12": 3.3872597932815554, + "ce_loss_17": 2.9946960330009462, + "ce_loss_23": 2.853222095966339, + "ce_loss_3": 4.220446193218232, + "ce_loss_6": 3.9008307218551637, + "epoch": 0.318, + "grad_norm": 1536.0, + "kl_loss_12": 1210.7290832519532, + "kl_loss_17": 331.215837097168, + "kl_loss_3": 2866.191455078125, + "kl_loss_6": 2238.608123779297, + "learning_rate": 0.0007795964517353734, + "loss": 1672.657, + "step": 3180 + }, + { + "ce_loss_12": 3.403947818279266, + "ce_loss_17": 3.000806951522827, + "ce_loss_23": 2.8538204431533813, + "ce_loss_3": 4.2604128241539, + "ce_loss_6": 3.930659818649292, + "epoch": 0.319, + "grad_norm": 980.0, + "kl_loss_12": 1233.635791015625, + "kl_loss_17": 336.13388977050784, + "kl_loss_3": 2938.8574462890624, + "kl_loss_6": 2292.6660705566405, + "learning_rate": 0.000778279642970672, + "loss": 1661.2771, + "step": 3190 + }, + { + "ce_loss_12": 3.3985164761543274, + "ce_loss_17": 3.008086919784546, + "ce_loss_23": 2.8599029898643495, + "ce_loss_3": 4.225405025482178, + "ce_loss_6": 3.910887622833252, + "epoch": 0.32, + "grad_norm": 1088.0, + "kl_loss_12": 1213.4981201171875, + "kl_loss_17": 333.7114624023437, + "kl_loss_3": 2869.0503173828124, + "kl_loss_6": 2258.674560546875, + "learning_rate": 0.0007769600319330552, + "loss": 1655.7844, + "step": 3200 + }, + { + "ce_loss_12": 3.447622609138489, + "ce_loss_17": 3.0317168831825256, + "ce_loss_23": 2.884088695049286, + "ce_loss_3": 4.318697500228882, + "ce_loss_6": 3.983830189704895, + "epoch": 0.321, + "grad_norm": 1224.0, + "kl_loss_12": 1245.9826690673829, + "kl_loss_17": 337.38295593261716, + "kl_loss_3": 2983.2697509765626, + "kl_loss_6": 2329.8124755859376, + "learning_rate": 0.0007756376319109917, + "loss": 1698.1805, + "step": 3210 + }, + { + "ce_loss_12": 3.4633454203605654, + "ce_loss_17": 3.067983663082123, + "ce_loss_23": 2.924458014965057, + "ce_loss_3": 4.289544808864593, + "ce_loss_6": 3.9699981570243836, + "epoch": 0.322, + "grad_norm": 1176.0, + "kl_loss_12": 1213.3118133544922, + "kl_loss_17": 330.83406219482424, + "kl_loss_3": 2866.3750732421877, + "kl_loss_6": 2242.109002685547, + "learning_rate": 0.0007743124562210351, + "loss": 1639.6686, + "step": 3220 + }, + { + "ce_loss_12": 3.475546681880951, + "ce_loss_17": 3.0862147331237795, + "ce_loss_23": 2.937591588497162, + "ce_loss_3": 4.305411899089814, + "ce_loss_6": 3.9904374480247498, + "epoch": 0.323, + "grad_norm": 1504.0, + "kl_loss_12": 1211.6309631347656, + "kl_loss_17": 333.8129486083984, + "kl_loss_3": 2886.2772583007813, + "kl_loss_6": 2262.567352294922, + "learning_rate": 0.0007729845182076895, + "loss": 1680.683, + "step": 3230 + }, + { + "ce_loss_12": 3.410216248035431, + "ce_loss_17": 3.0217228651046755, + "ce_loss_23": 2.877736246585846, + "ce_loss_3": 4.229400503635406, + "ce_loss_6": 3.9092357754707336, + "epoch": 0.324, + "grad_norm": 1048.0, + "kl_loss_12": 1201.3193969726562, + "kl_loss_17": 328.6470672607422, + "kl_loss_3": 2836.2654541015627, + "kl_loss_6": 2212.4843994140624, + "learning_rate": 0.0007716538312432765, + "loss": 1695.1324, + "step": 3240 + }, + { + "ce_loss_12": 3.390603744983673, + "ce_loss_17": 2.9847122192382813, + "ce_loss_23": 2.832513761520386, + "ce_loss_3": 4.25310173034668, + "ce_loss_6": 3.921470022201538, + "epoch": 0.325, + "grad_norm": 1240.0, + "kl_loss_12": 1255.1136657714844, + "kl_loss_17": 344.44752044677733, + "kl_loss_3": 2969.5409912109376, + "kl_loss_6": 2320.2986572265627, + "learning_rate": 0.0007703204087277988, + "loss": 1709.0459, + "step": 3250 + }, + { + "ce_loss_12": 3.451129651069641, + "ce_loss_17": 3.0698804378509523, + "ce_loss_23": 2.9265930533409117, + "ce_loss_3": 4.268451309204101, + "ce_loss_6": 3.950474727153778, + "epoch": 0.326, + "grad_norm": 908.0, + "kl_loss_12": 1186.6741638183594, + "kl_loss_17": 325.9498016357422, + "kl_loss_3": 2813.0351440429686, + "kl_loss_6": 2200.3617919921876, + "learning_rate": 0.0007689842640888063, + "loss": 1644.4848, + "step": 3260 + }, + { + "ce_loss_12": 3.4569345355033874, + "ce_loss_17": 3.0677335143089293, + "ce_loss_23": 2.9155858874320986, + "ce_loss_3": 4.279983472824097, + "ce_loss_6": 3.9633984923362733, + "epoch": 0.327, + "grad_norm": 1208.0, + "kl_loss_12": 1202.497802734375, + "kl_loss_17": 335.511946105957, + "kl_loss_3": 2833.3481323242186, + "kl_loss_6": 2213.333331298828, + "learning_rate": 0.0007676454107812607, + "loss": 1664.0393, + "step": 3270 + }, + { + "ce_loss_12": 3.4128082990646362, + "ce_loss_17": 3.0163416266441345, + "ce_loss_23": 2.8643674731254576, + "ce_loss_3": 4.269602704048157, + "ce_loss_6": 3.942162013053894, + "epoch": 0.328, + "grad_norm": 1072.0, + "kl_loss_12": 1230.0508850097656, + "kl_loss_17": 342.1333236694336, + "kl_loss_3": 2943.2237182617187, + "kl_loss_6": 2295.276818847656, + "learning_rate": 0.0007663038622873999, + "loss": 1674.4738, + "step": 3280 + }, + { + "ce_loss_12": 3.444205844402313, + "ce_loss_17": 3.0551442742347716, + "ce_loss_23": 2.9088870882987976, + "ce_loss_3": 4.284962522983551, + "ce_loss_6": 3.965476393699646, + "epoch": 0.329, + "grad_norm": 996.0, + "kl_loss_12": 1205.8692626953125, + "kl_loss_17": 337.4704162597656, + "kl_loss_3": 2886.572998046875, + "kl_loss_6": 2260.4553100585936, + "learning_rate": 0.0007649596321166025, + "loss": 1644.5602, + "step": 3290 + }, + { + "ce_loss_12": 3.3512764811515807, + "ce_loss_17": 2.963244104385376, + "ce_loss_23": 2.816838335990906, + "ce_loss_3": 4.173265266418457, + "ce_loss_6": 3.848917078971863, + "epoch": 0.33, + "grad_norm": 1176.0, + "kl_loss_12": 1188.5720703125, + "kl_loss_17": 328.98443298339845, + "kl_loss_3": 2824.7002563476562, + "kl_loss_6": 2182.885040283203, + "learning_rate": 0.0007636127338052513, + "loss": 1658.1311, + "step": 3300 + }, + { + "ce_loss_12": 3.459541440010071, + "ce_loss_17": 3.0586934447288514, + "ce_loss_23": 2.9092486619949343, + "ce_loss_3": 4.322417807579041, + "ce_loss_6": 4.004756271839142, + "epoch": 0.331, + "grad_norm": 956.0, + "kl_loss_12": 1244.9644409179687, + "kl_loss_17": 339.57240905761716, + "kl_loss_3": 2961.9379638671876, + "kl_loss_6": 2337.522784423828, + "learning_rate": 0.0007622631809165971, + "loss": 1680.2289, + "step": 3310 + }, + { + "ce_loss_12": 3.417771947383881, + "ce_loss_17": 3.0409359097480775, + "ce_loss_23": 2.904327702522278, + "ce_loss_3": 4.219339740276337, + "ce_loss_6": 3.9072014331817626, + "epoch": 0.332, + "grad_norm": 932.0, + "kl_loss_12": 1154.7458221435547, + "kl_loss_17": 314.9847137451172, + "kl_loss_3": 2750.251428222656, + "kl_loss_6": 2141.974853515625, + "learning_rate": 0.000760910987040623, + "loss": 1631.1079, + "step": 3320 + }, + { + "ce_loss_12": 3.4493923664093016, + "ce_loss_17": 3.0475847482681275, + "ce_loss_23": 2.890909481048584, + "ce_loss_3": 4.309184527397155, + "ce_loss_6": 3.9842631340026857, + "epoch": 0.333, + "grad_norm": 1004.0, + "kl_loss_12": 1254.3341552734375, + "kl_loss_17": 344.63670349121094, + "kl_loss_3": 2990.2296508789063, + "kl_loss_6": 2340.0262939453123, + "learning_rate": 0.000759556165793906, + "loss": 1678.9332, + "step": 3330 + }, + { + "ce_loss_12": 3.444781243801117, + "ce_loss_17": 3.0487191557884215, + "ce_loss_23": 2.8992719531059263, + "ce_loss_3": 4.283552169799805, + "ce_loss_6": 3.961264455318451, + "epoch": 0.334, + "grad_norm": 1312.0, + "kl_loss_12": 1226.6700500488282, + "kl_loss_17": 337.08323059082034, + "kl_loss_3": 2898.0889892578125, + "kl_loss_6": 2265.684948730469, + "learning_rate": 0.000758198730819481, + "loss": 1694.3148, + "step": 3340 + }, + { + "ce_loss_12": 3.4056811690330506, + "ce_loss_17": 3.015982913970947, + "ce_loss_23": 2.8739725828170775, + "ce_loss_3": 4.259327507019043, + "ce_loss_6": 3.92424658536911, + "epoch": 0.335, + "grad_norm": 1004.0, + "kl_loss_12": 1191.1532836914062, + "kl_loss_17": 325.79417724609374, + "kl_loss_3": 2897.7123168945313, + "kl_loss_6": 2244.7325927734373, + "learning_rate": 0.0007568386957867032, + "loss": 1669.3246, + "step": 3350 + }, + { + "ce_loss_12": 3.4579933881759644, + "ce_loss_17": 3.0634746313095094, + "ce_loss_23": 2.9128017902374266, + "ce_loss_3": 4.298311471939087, + "ce_loss_6": 3.96904639005661, + "epoch": 0.336, + "grad_norm": 984.0, + "kl_loss_12": 1210.9908325195313, + "kl_loss_17": 332.8085235595703, + "kl_loss_3": 2887.1035766601562, + "kl_loss_6": 2238.7325805664063, + "learning_rate": 0.0007554760743911103, + "loss": 1685.1012, + "step": 3360 + }, + { + "ce_loss_12": 3.3742502093315125, + "ce_loss_17": 2.9924378871917723, + "ce_loss_23": 2.8445406556129456, + "ce_loss_3": 4.203329575061798, + "ce_loss_6": 3.883717620372772, + "epoch": 0.337, + "grad_norm": 1232.0, + "kl_loss_12": 1176.7677856445312, + "kl_loss_17": 322.7069152832031, + "kl_loss_3": 2854.629248046875, + "kl_loss_6": 2216.13232421875, + "learning_rate": 0.0007541108803542846, + "loss": 1695.4902, + "step": 3370 + }, + { + "ce_loss_12": 3.405466413497925, + "ce_loss_17": 3.026802861690521, + "ce_loss_23": 2.8841339468955995, + "ce_loss_3": 4.250721335411072, + "ce_loss_6": 3.918993294239044, + "epoch": 0.338, + "grad_norm": 1032.0, + "kl_loss_12": 1186.8168060302735, + "kl_loss_17": 326.74443359375, + "kl_loss_3": 2882.3068725585936, + "kl_loss_6": 2238.0958862304688, + "learning_rate": 0.0007527431274237149, + "loss": 1734.5438, + "step": 3380 + }, + { + "ce_loss_12": 3.3811033248901365, + "ce_loss_17": 2.9985656142234802, + "ce_loss_23": 2.855313777923584, + "ce_loss_3": 4.223816525936127, + "ce_loss_6": 3.9034560084342957, + "epoch": 0.339, + "grad_norm": 836.0, + "kl_loss_12": 1187.6836853027344, + "kl_loss_17": 328.6970550537109, + "kl_loss_3": 2870.4953369140626, + "kl_loss_6": 2244.3517517089845, + "learning_rate": 0.0007513728293726579, + "loss": 1663.8422, + "step": 3390 + }, + { + "ce_loss_12": 3.4858251929283144, + "ce_loss_17": 3.1010074496269224, + "ce_loss_23": 2.9533849716186524, + "ce_loss_3": 4.3071588516235355, + "ce_loss_6": 3.9825177431106566, + "epoch": 0.34, + "grad_norm": 968.0, + "kl_loss_12": 1201.841552734375, + "kl_loss_17": 333.19029541015624, + "kl_loss_3": 2849.7021728515624, + "kl_loss_6": 2217.7927062988283, + "learning_rate": 0.00075, + "loss": 1647.9623, + "step": 3400 + }, + { + "ce_loss_12": 3.4936216950416563, + "ce_loss_17": 3.096825158596039, + "ce_loss_23": 2.946681487560272, + "ce_loss_3": 4.33717565536499, + "ce_loss_6": 4.007562124729157, + "epoch": 0.341, + "grad_norm": 1088.0, + "kl_loss_12": 1218.2007141113281, + "kl_loss_17": 336.65336151123046, + "kl_loss_3": 2905.7448852539064, + "kl_loss_6": 2266.000128173828, + "learning_rate": 0.0007486246531301177, + "loss": 1664.9313, + "step": 3410 + }, + { + "ce_loss_12": 3.3124721884727477, + "ce_loss_17": 2.9247117161750795, + "ce_loss_23": 2.7764694809913637, + "ce_loss_3": 4.156286108493805, + "ce_loss_6": 3.8327873945236206, + "epoch": 0.342, + "grad_norm": 972.0, + "kl_loss_12": 1194.6027465820312, + "kl_loss_17": 326.52222137451173, + "kl_loss_3": 2875.920751953125, + "kl_loss_6": 2236.224786376953, + "learning_rate": 0.0007472468026127384, + "loss": 1643.2096, + "step": 3420 + }, + { + "ce_loss_12": 3.467436170578003, + "ce_loss_17": 3.0640435934066774, + "ce_loss_23": 2.9080865740776063, + "ce_loss_3": 4.331029152870178, + "ce_loss_6": 3.997330093383789, + "epoch": 0.343, + "grad_norm": 1048.0, + "kl_loss_12": 1247.931414794922, + "kl_loss_17": 353.526628112793, + "kl_loss_3": 2997.3978515625, + "kl_loss_6": 2341.1341552734375, + "learning_rate": 0.000745866462322802, + "loss": 1710.0527, + "step": 3430 + }, + { + "ce_loss_12": 3.413029646873474, + "ce_loss_17": 3.0317874789237975, + "ce_loss_23": 2.889462399482727, + "ce_loss_3": 4.240648257732391, + "ce_loss_6": 3.920201134681702, + "epoch": 0.344, + "grad_norm": 992.0, + "kl_loss_12": 1178.7016906738281, + "kl_loss_17": 325.30846405029297, + "kl_loss_3": 2833.207421875, + "kl_loss_6": 2199.9562438964845, + "learning_rate": 0.0007444836461603195, + "loss": 1647.2428, + "step": 3440 + }, + { + "ce_loss_12": 3.4991490840911865, + "ce_loss_17": 3.1049185156822205, + "ce_loss_23": 2.9510151505470277, + "ce_loss_3": 4.332233214378357, + "ce_loss_6": 4.014215791225434, + "epoch": 0.345, + "grad_norm": 992.0, + "kl_loss_12": 1251.4855041503906, + "kl_loss_17": 351.6912017822266, + "kl_loss_3": 2915.772314453125, + "kl_loss_6": 2294.0759887695312, + "learning_rate": 0.0007430983680502344, + "loss": 1706.1598, + "step": 3450 + }, + { + "ce_loss_12": 3.3458803057670594, + "ce_loss_17": 2.950461220741272, + "ce_loss_23": 2.801449549198151, + "ce_loss_3": 4.203580784797668, + "ce_loss_6": 3.875659191608429, + "epoch": 0.346, + "grad_norm": 1104.0, + "kl_loss_12": 1221.081005859375, + "kl_loss_17": 339.3466766357422, + "kl_loss_3": 2943.6446044921877, + "kl_loss_6": 2290.779504394531, + "learning_rate": 0.0007417106419422819, + "loss": 1686.3721, + "step": 3460 + }, + { + "ce_loss_12": 3.422148883342743, + "ce_loss_17": 3.035439658164978, + "ce_loss_23": 2.886011064052582, + "ce_loss_3": 4.258232343196869, + "ce_loss_6": 3.929511618614197, + "epoch": 0.347, + "grad_norm": 1304.0, + "kl_loss_12": 1193.3747314453126, + "kl_loss_17": 330.8219955444336, + "kl_loss_3": 2852.3163208007813, + "kl_loss_6": 2209.255895996094, + "learning_rate": 0.0007403204818108486, + "loss": 1673.4871, + "step": 3470 + }, + { + "ce_loss_12": 3.408907175064087, + "ce_loss_17": 3.02507598400116, + "ce_loss_23": 2.87800327539444, + "ce_loss_3": 4.256623768806458, + "ce_loss_6": 3.9263039350509645, + "epoch": 0.348, + "grad_norm": 1200.0, + "kl_loss_12": 1221.0171936035156, + "kl_loss_17": 334.5439254760742, + "kl_loss_3": 2934.891650390625, + "kl_loss_6": 2288.8263549804688, + "learning_rate": 0.0007389279016548316, + "loss": 1638.8451, + "step": 3480 + }, + { + "ce_loss_12": 3.4307928681373596, + "ce_loss_17": 3.0258888959884644, + "ce_loss_23": 2.8691168308258055, + "ce_loss_3": 4.318265390396118, + "ce_loss_6": 3.977033627033234, + "epoch": 0.349, + "grad_norm": 1336.0, + "kl_loss_12": 1240.856460571289, + "kl_loss_17": 346.09679412841797, + "kl_loss_3": 3012.540246582031, + "kl_loss_6": 2333.396343994141, + "learning_rate": 0.0007375329154974975, + "loss": 1706.6148, + "step": 3490 + }, + { + "ce_loss_12": 3.3646591901779175, + "ce_loss_17": 2.97927565574646, + "ce_loss_23": 2.844258749485016, + "ce_loss_3": 4.202694976329804, + "ce_loss_6": 3.874116039276123, + "epoch": 0.35, + "grad_norm": 1012.0, + "kl_loss_12": 1182.0934997558593, + "kl_loss_17": 326.57828521728516, + "kl_loss_3": 2833.6731201171874, + "kl_loss_6": 2196.8786743164064, + "learning_rate": 0.0007361355373863414, + "loss": 1678.5336, + "step": 3500 + }, + { + "ce_loss_12": 3.408645486831665, + "ce_loss_17": 3.0301180243492127, + "ce_loss_23": 2.8835768818855287, + "ce_loss_3": 4.239638113975525, + "ce_loss_6": 3.914521038532257, + "epoch": 0.351, + "grad_norm": 996.0, + "kl_loss_12": 1170.8706909179687, + "kl_loss_17": 326.8454879760742, + "kl_loss_3": 2834.1891235351563, + "kl_loss_6": 2189.419958496094, + "learning_rate": 0.0007347357813929454, + "loss": 1679.9662, + "step": 3510 + }, + { + "ce_loss_12": 3.363666367530823, + "ce_loss_17": 2.9849454045295714, + "ce_loss_23": 2.8406106352806093, + "ce_loss_3": 4.193360352516175, + "ce_loss_6": 3.8782509207725524, + "epoch": 0.352, + "grad_norm": 1152.0, + "kl_loss_12": 1161.9197082519531, + "kl_loss_17": 326.0701385498047, + "kl_loss_3": 2821.089880371094, + "kl_loss_6": 2196.3210876464846, + "learning_rate": 0.0007333336616128369, + "loss": 1667.7982, + "step": 3520 + }, + { + "ce_loss_12": 3.363524007797241, + "ce_loss_17": 2.9640949726104737, + "ce_loss_23": 2.8081513166427614, + "ce_loss_3": 4.223999178409576, + "ce_loss_6": 3.899560832977295, + "epoch": 0.353, + "grad_norm": 940.0, + "kl_loss_12": 1227.8570373535156, + "kl_loss_17": 341.45946044921874, + "kl_loss_3": 2951.542346191406, + "kl_loss_6": 2311.4682556152343, + "learning_rate": 0.0007319291921653463, + "loss": 1696.3664, + "step": 3530 + }, + { + "ce_loss_12": 3.4368479251861572, + "ce_loss_17": 3.045628237724304, + "ce_loss_23": 2.88864551782608, + "ce_loss_3": 4.29784243106842, + "ce_loss_6": 3.960416281223297, + "epoch": 0.354, + "grad_norm": 1048.0, + "kl_loss_12": 1223.4083862304688, + "kl_loss_17": 345.2417556762695, + "kl_loss_3": 2935.894006347656, + "kl_loss_6": 2278.224365234375, + "learning_rate": 0.0007305223871934656, + "loss": 1665.9004, + "step": 3540 + }, + { + "ce_loss_12": 3.390989351272583, + "ce_loss_17": 3.007799005508423, + "ce_loss_23": 2.8593823671340943, + "ce_loss_3": 4.239778733253479, + "ce_loss_6": 3.9120181441307067, + "epoch": 0.355, + "grad_norm": 1088.0, + "kl_loss_12": 1199.7558227539062, + "kl_loss_17": 331.8055679321289, + "kl_loss_3": 2895.3523071289064, + "kl_loss_6": 2249.6681213378906, + "learning_rate": 0.0007291132608637052, + "loss": 1671.7258, + "step": 3550 + }, + { + "ce_loss_12": 3.4190950870513914, + "ce_loss_17": 2.9831862688064574, + "ce_loss_23": 2.8376792788505556, + "ce_loss_3": 4.28361166715622, + "ce_loss_6": 3.955883574485779, + "epoch": 0.356, + "grad_norm": 1248.0, + "kl_loss_12": 1284.4929809570312, + "kl_loss_17": 324.38274536132815, + "kl_loss_3": 3015.9632934570313, + "kl_loss_6": 2369.1489807128905, + "learning_rate": 0.0007277018273659516, + "loss": 1730.4875, + "step": 3560 + }, + { + "ce_loss_12": 3.499080014228821, + "ce_loss_17": 3.095122253894806, + "ce_loss_23": 2.944272482395172, + "ce_loss_3": 4.32317476272583, + "ce_loss_6": 4.0050243973732, + "epoch": 0.357, + "grad_norm": 860.0, + "kl_loss_12": 1239.7875671386719, + "kl_loss_17": 340.98800201416014, + "kl_loss_3": 2891.496875, + "kl_loss_6": 2267.2274780273438, + "learning_rate": 0.0007262881009133242, + "loss": 1674.6738, + "step": 3570 + }, + { + "ce_loss_12": 3.400438332557678, + "ce_loss_17": 3.0199944972991943, + "ce_loss_23": 2.8742507457733155, + "ce_loss_3": 4.232627630233765, + "ce_loss_6": 3.914644730091095, + "epoch": 0.358, + "grad_norm": 1012.0, + "kl_loss_12": 1179.560043334961, + "kl_loss_17": 324.86656494140624, + "kl_loss_3": 2846.779443359375, + "kl_loss_6": 2215.014270019531, + "learning_rate": 0.0007248720957420329, + "loss": 1637.5172, + "step": 3580 + }, + { + "ce_loss_12": 3.3900612473487852, + "ce_loss_17": 3.0157479643821716, + "ce_loss_23": 2.8782155990600584, + "ce_loss_3": 4.211673903465271, + "ce_loss_6": 3.8979337215423584, + "epoch": 0.359, + "grad_norm": 968.0, + "kl_loss_12": 1171.381411743164, + "kl_loss_17": 321.20471649169923, + "kl_loss_3": 2810.2108520507813, + "kl_loss_6": 2188.5156005859376, + "learning_rate": 0.0007234538261112341, + "loss": 1678.8098, + "step": 3590 + }, + { + "ce_loss_12": 3.4516560554504396, + "ce_loss_17": 3.0625590562820433, + "ce_loss_23": 2.9125977158546448, + "ce_loss_3": 4.298930358886719, + "ce_loss_6": 3.9716432213783266, + "epoch": 0.36, + "grad_norm": 1448.0, + "kl_loss_12": 1205.0357238769532, + "kl_loss_17": 335.10853118896483, + "kl_loss_3": 2908.1567504882814, + "kl_loss_6": 2268.4599182128904, + "learning_rate": 0.0007220333063028871, + "loss": 1654.9832, + "step": 3600 + }, + { + "ce_loss_12": 3.647151756286621, + "ce_loss_17": 3.100103199481964, + "ce_loss_23": 2.9450520038604737, + "ce_loss_3": 4.415069651603699, + "ce_loss_6": 4.093648076057434, + "epoch": 0.361, + "grad_norm": 920.0, + "kl_loss_12": 1537.5132202148438, + "kl_loss_17": 348.4029968261719, + "kl_loss_3": 3085.6149047851563, + "kl_loss_6": 2452.1618225097654, + "learning_rate": 0.0007206105506216106, + "loss": 1780.9383, + "step": 3610 + }, + { + "ce_loss_12": 3.3636831879615783, + "ce_loss_17": 2.970406544208527, + "ce_loss_23": 2.830918622016907, + "ce_loss_3": 4.1724036693572994, + "ce_loss_6": 3.860769248008728, + "epoch": 0.362, + "grad_norm": 1192.0, + "kl_loss_12": 1192.9184692382812, + "kl_loss_17": 321.7777557373047, + "kl_loss_3": 2808.0264526367187, + "kl_loss_6": 2189.515771484375, + "learning_rate": 0.0007191855733945387, + "loss": 1620.7258, + "step": 3620 + }, + { + "ce_loss_12": 3.45752409696579, + "ce_loss_17": 3.0582520365715027, + "ce_loss_23": 2.91327451467514, + "ce_loss_3": 4.273197531700134, + "ce_loss_6": 3.9577839732170106, + "epoch": 0.363, + "grad_norm": 1360.0, + "kl_loss_12": 1206.7141052246093, + "kl_loss_17": 327.1438919067383, + "kl_loss_3": 2851.4152221679688, + "kl_loss_6": 2226.528112792969, + "learning_rate": 0.0007177583889711762, + "loss": 1646.1391, + "step": 3630 + }, + { + "ce_loss_12": 3.3765084385871886, + "ce_loss_17": 2.9773260831832884, + "ce_loss_23": 2.8310837745666504, + "ce_loss_3": 4.207540595531464, + "ce_loss_6": 3.8836970210075377, + "epoch": 0.364, + "grad_norm": 1008.0, + "kl_loss_12": 1220.5072326660156, + "kl_loss_17": 330.72536010742186, + "kl_loss_3": 2894.7391357421875, + "kl_loss_6": 2249.510809326172, + "learning_rate": 0.0007163290117232541, + "loss": 1670.5426, + "step": 3640 + }, + { + "ce_loss_12": 3.449886155128479, + "ce_loss_17": 3.0827237129211427, + "ce_loss_23": 2.9416739106178285, + "ce_loss_3": 4.242349648475647, + "ce_loss_6": 3.9314298033714294, + "epoch": 0.365, + "grad_norm": 968.0, + "kl_loss_12": 1169.6258117675782, + "kl_loss_17": 319.44925079345705, + "kl_loss_3": 2779.614050292969, + "kl_loss_6": 2167.718853759766, + "learning_rate": 0.0007148974560445859, + "loss": 1634.0065, + "step": 3650 + }, + { + "ce_loss_12": 3.39327358007431, + "ce_loss_17": 3.011539709568024, + "ce_loss_23": 2.8679875254631044, + "ce_loss_3": 4.203790700435638, + "ce_loss_6": 3.8777782678604127, + "epoch": 0.366, + "grad_norm": 964.0, + "kl_loss_12": 1173.676431274414, + "kl_loss_17": 322.41929931640624, + "kl_loss_3": 2789.83759765625, + "kl_loss_6": 2164.4882751464843, + "learning_rate": 0.0007134637363509209, + "loss": 1611.7471, + "step": 3660 + }, + { + "ce_loss_12": 3.4866799235343935, + "ce_loss_17": 3.1163609504699705, + "ce_loss_23": 2.977469801902771, + "ce_loss_3": 4.299042820930481, + "ce_loss_6": 3.983168888092041, + "epoch": 0.367, + "grad_norm": 952.0, + "kl_loss_12": 1160.4010803222657, + "kl_loss_17": 316.6776519775391, + "kl_loss_3": 2768.0127685546877, + "kl_loss_6": 2150.191613769531, + "learning_rate": 0.0007120278670798009, + "loss": 1632.7604, + "step": 3670 + }, + { + "ce_loss_12": 3.3547308087348937, + "ce_loss_17": 2.9385184764862062, + "ce_loss_23": 2.793093574047089, + "ce_loss_3": 4.232029449939728, + "ce_loss_6": 3.904446530342102, + "epoch": 0.368, + "grad_norm": 1072.0, + "kl_loss_12": 1246.8237579345703, + "kl_loss_17": 336.25614776611326, + "kl_loss_3": 3006.0741333007813, + "kl_loss_6": 2360.845281982422, + "learning_rate": 0.0007105898626904133, + "loss": 1730.9002, + "step": 3680 + }, + { + "ce_loss_12": 3.4213064908981323, + "ce_loss_17": 3.026142966747284, + "ce_loss_23": 2.881701076030731, + "ce_loss_3": 4.266143536567688, + "ce_loss_6": 3.938157784938812, + "epoch": 0.369, + "grad_norm": 1176.0, + "kl_loss_12": 1195.0875213623046, + "kl_loss_17": 323.45970611572267, + "kl_loss_3": 2873.1806762695314, + "kl_loss_6": 2230.3562744140627, + "learning_rate": 0.0007091497376634463, + "loss": 1645.2902, + "step": 3690 + }, + { + "ce_loss_12": 3.3609025478363037, + "ce_loss_17": 2.976015532016754, + "ce_loss_23": 2.8323405981063843, + "ce_loss_3": 4.19185117483139, + "ce_loss_6": 3.8728413343429566, + "epoch": 0.37, + "grad_norm": 980.0, + "kl_loss_12": 1184.621435546875, + "kl_loss_17": 327.99839782714844, + "kl_loss_3": 2828.9660766601564, + "kl_loss_6": 2203.749591064453, + "learning_rate": 0.0007077075065009433, + "loss": 1666.8021, + "step": 3700 + }, + { + "ce_loss_12": 3.4689030408859254, + "ce_loss_17": 3.076432979106903, + "ce_loss_23": 2.9234078168869018, + "ce_loss_3": 4.315247058868408, + "ce_loss_6": 3.988802170753479, + "epoch": 0.371, + "grad_norm": 1104.0, + "kl_loss_12": 1224.3506103515624, + "kl_loss_17": 340.6742309570312, + "kl_loss_3": 2917.4282836914062, + "kl_loss_6": 2279.465539550781, + "learning_rate": 0.0007062631837261557, + "loss": 1674.5109, + "step": 3710 + }, + { + "ce_loss_12": 3.334757077693939, + "ce_loss_17": 2.955988013744354, + "ce_loss_23": 2.8153989911079407, + "ce_loss_3": 4.175846552848816, + "ce_loss_6": 3.8509674072265625, + "epoch": 0.372, + "grad_norm": 1128.0, + "kl_loss_12": 1177.7722412109374, + "kl_loss_17": 322.4950775146484, + "kl_loss_3": 2849.675048828125, + "kl_loss_6": 2210.367559814453, + "learning_rate": 0.0007048167838833977, + "loss": 1686.3625, + "step": 3720 + }, + { + "ce_loss_12": 3.4109480500221254, + "ce_loss_17": 3.0387460708618166, + "ce_loss_23": 2.890661919116974, + "ce_loss_3": 4.229937970638275, + "ce_loss_6": 3.915041470527649, + "epoch": 0.373, + "grad_norm": 1096.0, + "kl_loss_12": 1171.3883331298828, + "kl_loss_17": 328.7982147216797, + "kl_loss_3": 2824.0050537109373, + "kl_loss_6": 2200.1705627441406, + "learning_rate": 0.0007033683215379002, + "loss": 1640.442, + "step": 3730 + }, + { + "ce_loss_12": 3.400256597995758, + "ce_loss_17": 3.017388308048248, + "ce_loss_23": 2.8781059622764587, + "ce_loss_3": 4.242074239253998, + "ce_loss_6": 3.9133849501609803, + "epoch": 0.374, + "grad_norm": 1360.0, + "kl_loss_12": 1169.729736328125, + "kl_loss_17": 319.6404739379883, + "kl_loss_3": 2847.294885253906, + "kl_loss_6": 2206.2028259277345, + "learning_rate": 0.0007019178112756625, + "loss": 1663.7166, + "step": 3740 + }, + { + "ce_loss_12": 3.379054582118988, + "ce_loss_17": 2.994138073921204, + "ce_loss_23": 2.8540342330932615, + "ce_loss_3": 4.206314241886139, + "ce_loss_6": 3.8927628993988037, + "epoch": 0.375, + "grad_norm": 968.0, + "kl_loss_12": 1167.5954345703126, + "kl_loss_17": 319.94212341308594, + "kl_loss_3": 2816.092578125, + "kl_loss_6": 2200.663360595703, + "learning_rate": 0.0007004652677033068, + "loss": 1642.523, + "step": 3750 + }, + { + "ce_loss_12": 3.430193781852722, + "ce_loss_17": 3.06665323972702, + "ce_loss_23": 2.9323292851448057, + "ce_loss_3": 4.248442935943603, + "ce_loss_6": 3.927159142494202, + "epoch": 0.376, + "grad_norm": 1064.0, + "kl_loss_12": 1131.9261444091796, + "kl_loss_17": 309.78822937011716, + "kl_loss_3": 2785.084680175781, + "kl_loss_6": 2150.070684814453, + "learning_rate": 0.0006990107054479312, + "loss": 1620.216, + "step": 3760 + }, + { + "ce_loss_12": 3.425540065765381, + "ce_loss_17": 3.0395328879356383, + "ce_loss_23": 2.896018648147583, + "ce_loss_3": 4.24209463596344, + "ce_loss_6": 3.9303755879402162, + "epoch": 0.377, + "grad_norm": 1048.0, + "kl_loss_12": 1186.8247192382812, + "kl_loss_17": 326.4223037719727, + "kl_loss_3": 2820.041540527344, + "kl_loss_6": 2201.958917236328, + "learning_rate": 0.000697554139156961, + "loss": 1642.091, + "step": 3770 + }, + { + "ce_loss_12": 3.4275654792785644, + "ce_loss_17": 3.041707730293274, + "ce_loss_23": 2.8987489342689514, + "ce_loss_3": 4.269000458717346, + "ce_loss_6": 3.941159975528717, + "epoch": 0.378, + "grad_norm": 1072.0, + "kl_loss_12": 1198.9744995117187, + "kl_loss_17": 331.3607635498047, + "kl_loss_3": 2895.0713012695314, + "kl_loss_6": 2249.006982421875, + "learning_rate": 0.0006960955834980027, + "loss": 1626.0086, + "step": 3780 + }, + { + "ce_loss_12": 3.388811159133911, + "ce_loss_17": 3.0065065026283264, + "ce_loss_23": 2.865676498413086, + "ce_loss_3": 4.215176248550415, + "ce_loss_6": 3.8905073523521425, + "epoch": 0.379, + "grad_norm": 1024.0, + "kl_loss_12": 1166.9369812011719, + "kl_loss_17": 318.73981018066405, + "kl_loss_3": 2826.0754150390626, + "kl_loss_6": 2183.756945800781, + "learning_rate": 0.0006946350531586958, + "loss": 1629.8831, + "step": 3790 + }, + { + "ce_loss_12": 3.419374644756317, + "ce_loss_17": 3.0380242824554444, + "ce_loss_23": 2.899268925189972, + "ce_loss_3": 4.244865441322327, + "ce_loss_6": 3.9227930545806884, + "epoch": 0.38, + "grad_norm": 1432.0, + "kl_loss_12": 1178.4429138183593, + "kl_loss_17": 321.04165802001955, + "kl_loss_3": 2842.534814453125, + "kl_loss_6": 2213.956402587891, + "learning_rate": 0.0006931725628465643, + "loss": 1669.4293, + "step": 3800 + }, + { + "ce_loss_12": 3.4255444049835204, + "ce_loss_17": 3.039610135555267, + "ce_loss_23": 2.8914812207221985, + "ce_loss_3": 4.2611403465271, + "ce_loss_6": 3.9370644330978393, + "epoch": 0.381, + "grad_norm": 1088.0, + "kl_loss_12": 1179.518118286133, + "kl_loss_17": 327.4698013305664, + "kl_loss_3": 2840.5864013671876, + "kl_loss_6": 2207.823132324219, + "learning_rate": 0.0006917081272888696, + "loss": 1642.6992, + "step": 3810 + }, + { + "ce_loss_12": 3.3692954659461973, + "ce_loss_17": 2.965705895423889, + "ce_loss_23": 2.8230329275131227, + "ce_loss_3": 4.224563157558441, + "ce_loss_6": 3.8963645458221436, + "epoch": 0.382, + "grad_norm": 1136.0, + "kl_loss_12": 1231.950244140625, + "kl_loss_17": 326.74798278808595, + "kl_loss_3": 2944.267932128906, + "kl_loss_6": 2300.83447265625, + "learning_rate": 0.0006902417612324615, + "loss": 1656.0443, + "step": 3820 + }, + { + "ce_loss_12": 3.484429359436035, + "ce_loss_17": 3.083032763004303, + "ce_loss_23": 2.9321335554122925, + "ce_loss_3": 4.346240592002869, + "ce_loss_6": 4.015672600269317, + "epoch": 0.383, + "grad_norm": 996.0, + "kl_loss_12": 1232.4169067382813, + "kl_loss_17": 339.5805206298828, + "kl_loss_3": 2957.370556640625, + "kl_loss_6": 2296.0817749023436, + "learning_rate": 0.00068877347944363, + "loss": 1683.5805, + "step": 3830 + }, + { + "ce_loss_12": 3.4553491711616515, + "ce_loss_17": 3.077640438079834, + "ce_loss_23": 2.93577378988266, + "ce_loss_3": 4.262261152267456, + "ce_loss_6": 3.9459288239479067, + "epoch": 0.384, + "grad_norm": 952.0, + "kl_loss_12": 1174.015118408203, + "kl_loss_17": 322.96498718261716, + "kl_loss_3": 2797.076550292969, + "kl_loss_6": 2187.494738769531, + "learning_rate": 0.0006873032967079561, + "loss": 1649.0123, + "step": 3840 + }, + { + "ce_loss_12": 3.4251646637916564, + "ce_loss_17": 3.0583839654922484, + "ce_loss_23": 2.921176278591156, + "ce_loss_3": 4.226701831817627, + "ce_loss_6": 3.9207385420799254, + "epoch": 0.385, + "grad_norm": 1040.0, + "kl_loss_12": 1149.8529052734375, + "kl_loss_17": 320.24271392822266, + "kl_loss_3": 2766.314599609375, + "kl_loss_6": 2155.769970703125, + "learning_rate": 0.0006858312278301637, + "loss": 1607.1077, + "step": 3850 + }, + { + "ce_loss_12": 3.4586119294166564, + "ce_loss_17": 3.094258224964142, + "ce_loss_23": 2.959088897705078, + "ce_loss_3": 4.254314255714417, + "ce_loss_6": 3.939511406421661, + "epoch": 0.386, + "grad_norm": 996.0, + "kl_loss_12": 1156.5623840332032, + "kl_loss_17": 319.6972915649414, + "kl_loss_3": 2757.2991333007812, + "kl_loss_6": 2136.331689453125, + "learning_rate": 0.0006843572876339704, + "loss": 1605.4867, + "step": 3860 + }, + { + "ce_loss_12": 3.378196489810944, + "ce_loss_17": 3.0188413739204405, + "ce_loss_23": 2.8821682691574098, + "ce_loss_3": 4.178075551986694, + "ce_loss_6": 3.862470579147339, + "epoch": 0.387, + "grad_norm": 1400.0, + "kl_loss_12": 1141.6409393310546, + "kl_loss_17": 313.7957015991211, + "kl_loss_3": 2745.0942260742186, + "kl_loss_6": 2123.7592712402343, + "learning_rate": 0.0006828814909619373, + "loss": 1654.2504, + "step": 3870 + }, + { + "ce_loss_12": 3.521103036403656, + "ce_loss_17": 3.13949875831604, + "ce_loss_23": 2.9910845518112184, + "ce_loss_3": 4.34140408039093, + "ce_loss_6": 4.002995109558105, + "epoch": 0.388, + "grad_norm": 880.0, + "kl_loss_12": 1186.9687927246093, + "kl_loss_17": 334.6907043457031, + "kl_loss_3": 2817.365966796875, + "kl_loss_6": 2165.355920410156, + "learning_rate": 0.0006814038526753205, + "loss": 1611.627, + "step": 3880 + }, + { + "ce_loss_12": 3.4270783066749573, + "ce_loss_17": 3.042400801181793, + "ce_loss_23": 2.899458038806915, + "ce_loss_3": 4.239492547512055, + "ce_loss_6": 3.9202648162841798, + "epoch": 0.389, + "grad_norm": 1056.0, + "kl_loss_12": 1170.2104949951172, + "kl_loss_17": 323.59288482666017, + "kl_loss_3": 2790.2853149414063, + "kl_loss_6": 2166.1191528320314, + "learning_rate": 0.0006799243876539213, + "loss": 1620.8202, + "step": 3890 + }, + { + "ce_loss_12": 3.359401023387909, + "ce_loss_17": 2.9685895681381225, + "ce_loss_23": 2.830006313323975, + "ce_loss_3": 4.210623705387116, + "ce_loss_6": 3.882489597797394, + "epoch": 0.39, + "grad_norm": 1136.0, + "kl_loss_12": 1183.7762817382813, + "kl_loss_17": 320.4316207885742, + "kl_loss_3": 2891.90244140625, + "kl_loss_6": 2243.515478515625, + "learning_rate": 0.0006784431107959359, + "loss": 1667.743, + "step": 3900 + }, + { + "ce_loss_12": 3.4223373532295227, + "ce_loss_17": 3.0267418742179872, + "ce_loss_23": 2.879237198829651, + "ce_loss_3": 4.278204727172851, + "ce_loss_6": 3.946788024902344, + "epoch": 0.391, + "grad_norm": 1264.0, + "kl_loss_12": 1212.3030578613282, + "kl_loss_17": 330.3525680541992, + "kl_loss_3": 2935.582763671875, + "kl_loss_6": 2278.313610839844, + "learning_rate": 0.0006769600370178059, + "loss": 1661.9084, + "step": 3910 + }, + { + "ce_loss_12": 3.3743677854537966, + "ce_loss_17": 2.9920090198516847, + "ce_loss_23": 2.8529026985168455, + "ce_loss_3": 4.2079997777938845, + "ce_loss_6": 3.8821290016174315, + "epoch": 0.392, + "grad_norm": 820.0, + "kl_loss_12": 1180.332696533203, + "kl_loss_17": 319.2956573486328, + "kl_loss_3": 2833.5747680664062, + "kl_loss_6": 2189.435583496094, + "learning_rate": 0.0006754751812540679, + "loss": 1611.8293, + "step": 3920 + }, + { + "ce_loss_12": 3.4259696125984194, + "ce_loss_17": 3.041267991065979, + "ce_loss_23": 2.897170066833496, + "ce_loss_3": 4.2738687753677365, + "ce_loss_6": 3.9354104518890383, + "epoch": 0.393, + "grad_norm": 1280.0, + "kl_loss_12": 1197.7025756835938, + "kl_loss_17": 326.8806884765625, + "kl_loss_3": 2886.1544555664063, + "kl_loss_6": 2231.5798583984374, + "learning_rate": 0.0006739885584572025, + "loss": 1660.7541, + "step": 3930 + }, + { + "ce_loss_12": 3.4471525073051454, + "ce_loss_17": 3.051491439342499, + "ce_loss_23": 2.913893294334412, + "ce_loss_3": 4.300728058815002, + "ce_loss_6": 3.9704771399497987, + "epoch": 0.394, + "grad_norm": 992.0, + "kl_loss_12": 1213.9912811279296, + "kl_loss_17": 326.83617095947267, + "kl_loss_3": 2943.55283203125, + "kl_loss_6": 2295.3341735839845, + "learning_rate": 0.0006725001835974853, + "loss": 1649.1977, + "step": 3940 + }, + { + "ce_loss_12": 3.4414065361022947, + "ce_loss_17": 3.0564581513404847, + "ce_loss_23": 2.9085390329360963, + "ce_loss_3": 4.280353951454162, + "ce_loss_6": 3.960696303844452, + "epoch": 0.395, + "grad_norm": 960.0, + "kl_loss_12": 1196.9637329101563, + "kl_loss_17": 331.86663208007815, + "kl_loss_3": 2880.139501953125, + "kl_loss_6": 2242.0492309570313, + "learning_rate": 0.0006710100716628344, + "loss": 1626.1908, + "step": 3950 + }, + { + "ce_loss_12": 3.428723990917206, + "ce_loss_17": 3.038597285747528, + "ce_loss_23": 2.8930386543273925, + "ce_loss_3": 4.255900311470032, + "ce_loss_6": 3.936865043640137, + "epoch": 0.396, + "grad_norm": 1016.0, + "kl_loss_12": 1188.9954895019532, + "kl_loss_17": 319.97182312011716, + "kl_loss_3": 2847.898083496094, + "kl_loss_6": 2217.8192932128904, + "learning_rate": 0.0006695182376586602, + "loss": 1653.416, + "step": 3960 + }, + { + "ce_loss_12": 3.4169690012931824, + "ce_loss_17": 3.0513774871826174, + "ce_loss_23": 2.919622015953064, + "ce_loss_3": 4.2158072113990785, + "ce_loss_6": 3.893892788887024, + "epoch": 0.397, + "grad_norm": 1480.0, + "kl_loss_12": 1115.445260620117, + "kl_loss_17": 304.40807037353517, + "kl_loss_3": 2705.8389892578125, + "kl_loss_6": 2082.140728759766, + "learning_rate": 0.000668024696607715, + "loss": 1632.433, + "step": 3970 + }, + { + "ce_loss_12": 3.406718575954437, + "ce_loss_17": 3.031963884830475, + "ce_loss_23": 2.895033621788025, + "ce_loss_3": 4.22108142375946, + "ce_loss_6": 3.9037919998168946, + "epoch": 0.398, + "grad_norm": 1536.0, + "kl_loss_12": 1165.0296813964844, + "kl_loss_17": 317.1994743347168, + "kl_loss_3": 2799.8370971679688, + "kl_loss_6": 2181.7656982421877, + "learning_rate": 0.0006665294635499404, + "loss": 1619.0551, + "step": 3980 + }, + { + "ce_loss_12": 3.43480144739151, + "ce_loss_17": 3.041903519630432, + "ce_loss_23": 2.8935364723205566, + "ce_loss_3": 4.295881199836731, + "ce_loss_6": 3.973275625705719, + "epoch": 0.399, + "grad_norm": 1184.0, + "kl_loss_12": 1226.6514404296875, + "kl_loss_17": 339.84486083984376, + "kl_loss_3": 2951.3014892578126, + "kl_loss_6": 2316.8784423828124, + "learning_rate": 0.0006650325535423167, + "loss": 1668.1738, + "step": 3990 + }, + { + "ce_loss_12": 3.4110765933990477, + "ce_loss_17": 3.0551862835884096, + "ce_loss_23": 2.9150756716728212, + "ce_loss_3": 4.215204250812531, + "ce_loss_6": 3.8930283784866333, + "epoch": 0.4, + "grad_norm": 1176.0, + "kl_loss_12": 1126.5774383544922, + "kl_loss_17": 316.1091995239258, + "kl_loss_3": 2728.413671875, + "kl_loss_6": 2095.254010009766, + "learning_rate": 0.0006635339816587109, + "loss": 1618.082, + "step": 4000 + }, + { + "ce_loss_12": 3.3729982733726502, + "ce_loss_17": 2.992246413230896, + "ce_loss_23": 2.851309823989868, + "ce_loss_3": 4.219701409339905, + "ce_loss_6": 3.905067670345306, + "epoch": 0.401, + "grad_norm": 1400.0, + "kl_loss_12": 1184.4628173828125, + "kl_loss_17": 325.0249252319336, + "kl_loss_3": 2873.4937377929687, + "kl_loss_6": 2256.891387939453, + "learning_rate": 0.0006620337629897252, + "loss": 1631.4626, + "step": 4010 + }, + { + "ce_loss_12": 3.3826616048812865, + "ce_loss_17": 3.0062437295913695, + "ce_loss_23": 2.8597806096076965, + "ce_loss_3": 4.213565754890442, + "ce_loss_6": 3.8855297684669496, + "epoch": 0.402, + "grad_norm": 1056.0, + "kl_loss_12": 1164.632745361328, + "kl_loss_17": 325.6399185180664, + "kl_loss_3": 2830.8072875976563, + "kl_loss_6": 2184.3242370605467, + "learning_rate": 0.0006605319126425454, + "loss": 1655.3189, + "step": 4020 + }, + { + "ce_loss_12": 3.3020559906959535, + "ce_loss_17": 2.919109809398651, + "ce_loss_23": 2.781159979104996, + "ce_loss_3": 4.159004271030426, + "ce_loss_6": 3.836709499359131, + "epoch": 0.403, + "grad_norm": 1128.0, + "kl_loss_12": 1189.5907165527344, + "kl_loss_17": 322.5351135253906, + "kl_loss_3": 2899.326037597656, + "kl_loss_6": 2257.474090576172, + "learning_rate": 0.0006590284457407876, + "loss": 1652.3072, + "step": 4030 + }, + { + "ce_loss_12": 3.389311420917511, + "ce_loss_17": 3.010061573982239, + "ce_loss_23": 2.8683920979499815, + "ce_loss_3": 4.223417830467224, + "ce_loss_6": 3.894852077960968, + "epoch": 0.404, + "grad_norm": 1048.0, + "kl_loss_12": 1165.923388671875, + "kl_loss_17": 323.8217208862305, + "kl_loss_3": 2836.329638671875, + "kl_loss_6": 2193.572961425781, + "learning_rate": 0.0006575233774243465, + "loss": 1628.1803, + "step": 4040 + }, + { + "ce_loss_12": 3.3846017479896546, + "ce_loss_17": 3.006804585456848, + "ce_loss_23": 2.8631478548049927, + "ce_loss_3": 4.2313508033752445, + "ce_loss_6": 3.9021772384643554, + "epoch": 0.405, + "grad_norm": 1264.0, + "kl_loss_12": 1176.518975830078, + "kl_loss_17": 329.24969482421875, + "kl_loss_3": 2880.244873046875, + "kl_loss_6": 2226.158905029297, + "learning_rate": 0.0006560167228492435, + "loss": 1644.4455, + "step": 4050 + }, + { + "ce_loss_12": 3.4046581268310545, + "ce_loss_17": 3.041847562789917, + "ce_loss_23": 2.9070791721343996, + "ce_loss_3": 4.219157040119171, + "ce_loss_6": 3.8969243288040163, + "epoch": 0.406, + "grad_norm": 988.0, + "kl_loss_12": 1129.4734466552734, + "kl_loss_17": 312.1006332397461, + "kl_loss_3": 2755.409387207031, + "kl_loss_6": 2129.448858642578, + "learning_rate": 0.0006545084971874737, + "loss": 1629.0785, + "step": 4060 + }, + { + "ce_loss_12": 3.4048274517059327, + "ce_loss_17": 3.010120987892151, + "ce_loss_23": 2.8593595743179323, + "ce_loss_3": 4.262722325325012, + "ce_loss_6": 3.9252733469009398, + "epoch": 0.407, + "grad_norm": 1088.0, + "kl_loss_12": 1210.6922943115235, + "kl_loss_17": 337.23687744140625, + "kl_loss_3": 2925.420068359375, + "kl_loss_6": 2256.9163330078127, + "learning_rate": 0.0006529987156268526, + "loss": 1640.2656, + "step": 4070 + }, + { + "ce_loss_12": 3.317921507358551, + "ce_loss_17": 2.9338502049446107, + "ce_loss_23": 2.785680627822876, + "ce_loss_3": 4.1656157851219175, + "ce_loss_6": 3.837941527366638, + "epoch": 0.408, + "grad_norm": 1024.0, + "kl_loss_12": 1175.9814331054688, + "kl_loss_17": 324.7815673828125, + "kl_loss_3": 2871.5527587890624, + "kl_loss_6": 2227.038439941406, + "learning_rate": 0.0006514873933708637, + "loss": 1675.1437, + "step": 4080 + }, + { + "ce_loss_12": 3.408484864234924, + "ce_loss_17": 3.039537787437439, + "ce_loss_23": 2.8986527919769287, + "ce_loss_3": 4.231956946849823, + "ce_loss_6": 3.917720365524292, + "epoch": 0.409, + "grad_norm": 976.0, + "kl_loss_12": 1140.7272155761718, + "kl_loss_17": 314.2014724731445, + "kl_loss_3": 2784.7043701171874, + "kl_loss_6": 2163.5523376464844, + "learning_rate": 0.0006499745456385053, + "loss": 1610.7346, + "step": 4090 + }, + { + "ce_loss_12": 3.3876006484031675, + "ce_loss_17": 3.0119059801101686, + "ce_loss_23": 2.863646948337555, + "ce_loss_3": 4.21506804227829, + "ce_loss_6": 3.894675004482269, + "epoch": 0.41, + "grad_norm": 960.0, + "kl_loss_12": 1179.637158203125, + "kl_loss_17": 326.34833068847655, + "kl_loss_3": 2843.0706420898437, + "kl_loss_6": 2211.5627380371093, + "learning_rate": 0.0006484601876641375, + "loss": 1647.2432, + "step": 4100 + }, + { + "ce_loss_12": 3.358959376811981, + "ce_loss_17": 2.9945371627807615, + "ce_loss_23": 2.8585385918617248, + "ce_loss_3": 4.157709872722625, + "ce_loss_6": 3.844931447505951, + "epoch": 0.411, + "grad_norm": 1040.0, + "kl_loss_12": 1127.2386535644532, + "kl_loss_17": 313.78133392333984, + "kl_loss_3": 2728.8353637695313, + "kl_loss_6": 2121.688055419922, + "learning_rate": 0.000646944334697328, + "loss": 1597.0447, + "step": 4110 + }, + { + "ce_loss_12": 3.460660433769226, + "ce_loss_17": 3.096243751049042, + "ce_loss_23": 2.9564993023872375, + "ce_loss_3": 4.2486083745956424, + "ce_loss_6": 3.938153052330017, + "epoch": 0.412, + "grad_norm": 960.0, + "kl_loss_12": 1131.3982330322265, + "kl_loss_17": 315.8477813720703, + "kl_loss_3": 2707.6667724609374, + "kl_loss_6": 2098.8063354492188, + "learning_rate": 0.0006454270020026995, + "loss": 1574.3133, + "step": 4120 + }, + { + "ce_loss_12": 3.4204355835914613, + "ce_loss_17": 3.067693066596985, + "ce_loss_23": 2.9321154236793516, + "ce_loss_3": 4.215284967422486, + "ce_loss_6": 3.904770815372467, + "epoch": 0.413, + "grad_norm": 988.0, + "kl_loss_12": 1112.7341613769531, + "kl_loss_17": 306.37862396240234, + "kl_loss_3": 2706.292724609375, + "kl_loss_6": 2092.9628967285157, + "learning_rate": 0.0006439082048597755, + "loss": 1570.2234, + "step": 4130 + }, + { + "ce_loss_12": 3.4354603052139283, + "ce_loss_17": 3.0538384079933167, + "ce_loss_23": 2.9169345617294313, + "ce_loss_3": 4.2601546287536625, + "ce_loss_6": 3.9380828738212585, + "epoch": 0.414, + "grad_norm": 1144.0, + "kl_loss_12": 1169.7640869140625, + "kl_loss_17": 318.6606689453125, + "kl_loss_3": 2818.671984863281, + "kl_loss_6": 2193.7250061035156, + "learning_rate": 0.0006423879585628261, + "loss": 1629.1748, + "step": 4140 + }, + { + "ce_loss_12": 3.399686324596405, + "ce_loss_17": 3.018126440048218, + "ce_loss_23": 2.873788130283356, + "ce_loss_3": 4.243383419513703, + "ce_loss_6": 3.922605037689209, + "epoch": 0.415, + "grad_norm": 1384.0, + "kl_loss_12": 1186.7492919921874, + "kl_loss_17": 326.4311004638672, + "kl_loss_3": 2872.7005859375, + "kl_loss_6": 2243.2379943847654, + "learning_rate": 0.0006408662784207149, + "loss": 1650.5398, + "step": 4150 + }, + { + "ce_loss_12": 3.3609784603118897, + "ce_loss_17": 2.994195282459259, + "ce_loss_23": 2.8559187173843386, + "ce_loss_3": 4.1885595440864565, + "ce_loss_6": 3.861635887622833, + "epoch": 0.416, + "grad_norm": 1224.0, + "kl_loss_12": 1156.0622589111329, + "kl_loss_17": 314.15498046875, + "kl_loss_3": 2812.4429443359377, + "kl_loss_6": 2174.676873779297, + "learning_rate": 0.0006393431797567439, + "loss": 1619.142, + "step": 4160 + }, + { + "ce_loss_12": 3.4092275619506838, + "ce_loss_17": 3.059811198711395, + "ce_loss_23": 2.9277246236801147, + "ce_loss_3": 4.204770541191101, + "ce_loss_6": 3.888715922832489, + "epoch": 0.417, + "grad_norm": 1216.0, + "kl_loss_12": 1123.1555938720703, + "kl_loss_17": 312.6223335266113, + "kl_loss_3": 2726.036267089844, + "kl_loss_6": 2100.0018615722656, + "learning_rate": 0.0006378186779084996, + "loss": 1551.5569, + "step": 4170 + }, + { + "ce_loss_12": 3.294791781902313, + "ce_loss_17": 2.90387818813324, + "ce_loss_23": 2.7652917981147764, + "ce_loss_3": 4.132733774185181, + "ce_loss_6": 3.806008851528168, + "epoch": 0.418, + "grad_norm": 936.0, + "kl_loss_12": 1173.2480255126952, + "kl_loss_17": 320.46661376953125, + "kl_loss_3": 2840.6937744140623, + "kl_loss_6": 2205.0871704101564, + "learning_rate": 0.0006362927882276989, + "loss": 1644.2609, + "step": 4180 + }, + { + "ce_loss_12": 3.430036115646362, + "ce_loss_17": 3.0758618474006654, + "ce_loss_23": 2.9404595136642455, + "ce_loss_3": 4.2398931860923765, + "ce_loss_6": 3.9211429834365843, + "epoch": 0.419, + "grad_norm": 1424.0, + "kl_loss_12": 1106.8557525634765, + "kl_loss_17": 304.0802848815918, + "kl_loss_3": 2734.475732421875, + "kl_loss_6": 2099.411590576172, + "learning_rate": 0.000634765526080034, + "loss": 1556.1971, + "step": 4190 + }, + { + "ce_loss_12": 3.453889286518097, + "ce_loss_17": 3.0857287406921388, + "ce_loss_23": 2.9473395586013793, + "ce_loss_3": 4.255516767501831, + "ce_loss_6": 3.944706749916077, + "epoch": 0.42, + "grad_norm": 960.0, + "kl_loss_12": 1145.736996459961, + "kl_loss_17": 317.604719543457, + "kl_loss_3": 2756.877331542969, + "kl_loss_6": 2135.238446044922, + "learning_rate": 0.0006332369068450174, + "loss": 1586.098, + "step": 4200 + }, + { + "ce_loss_12": 3.404520869255066, + "ce_loss_17": 3.026888573169708, + "ce_loss_23": 2.8927263975143434, + "ce_loss_3": 4.221071326732636, + "ce_loss_6": 3.915247893333435, + "epoch": 0.421, + "grad_norm": 972.0, + "kl_loss_12": 1162.0391998291016, + "kl_loss_17": 316.31638107299807, + "kl_loss_3": 2793.0206665039063, + "kl_loss_6": 2186.566125488281, + "learning_rate": 0.0006317069459158283, + "loss": 1605.7854, + "step": 4210 + }, + { + "ce_loss_12": 3.4682124257087708, + "ce_loss_17": 3.117214620113373, + "ce_loss_23": 2.9843473196029664, + "ce_loss_3": 4.26329472064972, + "ce_loss_6": 3.9522085428237914, + "epoch": 0.422, + "grad_norm": 964.0, + "kl_loss_12": 1128.2288879394532, + "kl_loss_17": 313.6609832763672, + "kl_loss_3": 2728.2145874023436, + "kl_loss_6": 2108.8893432617188, + "learning_rate": 0.0006301756586991561, + "loss": 1583.7455, + "step": 4220 + }, + { + "ce_loss_12": 3.3013151049613954, + "ce_loss_17": 2.9211807370185854, + "ce_loss_23": 2.781052625179291, + "ce_loss_3": 4.149512720108032, + "ce_loss_6": 3.8239535689353943, + "epoch": 0.423, + "grad_norm": 1004.0, + "kl_loss_12": 1185.4845764160157, + "kl_loss_17": 321.148747253418, + "kl_loss_3": 2884.1349365234373, + "kl_loss_6": 2240.584826660156, + "learning_rate": 0.0006286430606150459, + "loss": 1639.8203, + "step": 4230 + }, + { + "ce_loss_12": 3.48389675617218, + "ce_loss_17": 3.1149152278900147, + "ce_loss_23": 2.977355432510376, + "ce_loss_3": 4.2954552412033085, + "ce_loss_6": 3.9828169465065004, + "epoch": 0.424, + "grad_norm": 984.0, + "kl_loss_12": 1147.0034240722657, + "kl_loss_17": 320.0153060913086, + "kl_loss_3": 2780.7621337890623, + "kl_loss_6": 2161.5279174804687, + "learning_rate": 0.0006271091670967436, + "loss": 1600.0794, + "step": 4240 + }, + { + "ce_loss_12": 3.424987781047821, + "ce_loss_17": 3.0343403697013853, + "ce_loss_23": 2.8883402824401854, + "ce_loss_3": 4.268402814865112, + "ce_loss_6": 3.942337465286255, + "epoch": 0.425, + "grad_norm": 1288.0, + "kl_loss_12": 1207.4884033203125, + "kl_loss_17": 332.36527862548826, + "kl_loss_3": 2902.5668090820313, + "kl_loss_6": 2257.4686645507813, + "learning_rate": 0.0006255739935905395, + "loss": 1638.8984, + "step": 4250 + }, + { + "ce_loss_12": 3.4310410976409913, + "ce_loss_17": 3.0717048764228823, + "ce_loss_23": 2.931931567192078, + "ce_loss_3": 4.242630553245545, + "ce_loss_6": 3.925148296356201, + "epoch": 0.426, + "grad_norm": 1144.0, + "kl_loss_12": 1145.0904571533204, + "kl_loss_17": 318.5279541015625, + "kl_loss_3": 2771.041711425781, + "kl_loss_6": 2146.984967041016, + "learning_rate": 0.0006240375555556145, + "loss": 1651.0617, + "step": 4260 + }, + { + "ce_loss_12": 3.448286783695221, + "ce_loss_17": 3.0631021738052366, + "ce_loss_23": 2.921405851840973, + "ce_loss_3": 4.293193435668945, + "ce_loss_6": 3.9641549587249756, + "epoch": 0.427, + "grad_norm": 828.0, + "kl_loss_12": 1183.8236541748047, + "kl_loss_17": 321.07748260498045, + "kl_loss_3": 2869.4075927734375, + "kl_loss_6": 2233.6482055664064, + "learning_rate": 0.000622499868463882, + "loss": 1637.2682, + "step": 4270 + }, + { + "ce_loss_12": 3.392454504966736, + "ce_loss_17": 3.0375906944274904, + "ce_loss_23": 2.9044872641563417, + "ce_loss_3": 4.181933903694153, + "ce_loss_6": 3.871629846096039, + "epoch": 0.428, + "grad_norm": 1200.0, + "kl_loss_12": 1119.5519989013671, + "kl_loss_17": 311.49632720947267, + "kl_loss_3": 2726.1117553710938, + "kl_loss_6": 2109.8937805175783, + "learning_rate": 0.0006209609477998338, + "loss": 1587.0687, + "step": 4280 + }, + { + "ce_loss_12": 3.463665783405304, + "ce_loss_17": 3.093734884262085, + "ce_loss_23": 2.9519339919090273, + "ce_loss_3": 4.2704680323600765, + "ce_loss_6": 3.9471255421638487, + "epoch": 0.429, + "grad_norm": 924.0, + "kl_loss_12": 1153.5091369628906, + "kl_loss_17": 319.26009674072264, + "kl_loss_3": 2781.630859375, + "kl_loss_6": 2142.5775146484375, + "learning_rate": 0.0006194208090603844, + "loss": 1618.8986, + "step": 4290 + }, + { + "ce_loss_12": 3.37452689409256, + "ce_loss_17": 3.0138617753982544, + "ce_loss_23": 2.878571164608002, + "ce_loss_3": 4.188263761997223, + "ce_loss_6": 3.878198838233948, + "epoch": 0.43, + "grad_norm": 948.0, + "kl_loss_12": 1118.5654907226562, + "kl_loss_17": 307.2293182373047, + "kl_loss_3": 2742.392199707031, + "kl_loss_6": 2129.879833984375, + "learning_rate": 0.0006178794677547138, + "loss": 1566.273, + "step": 4300 + }, + { + "ce_loss_12": 3.4126819610595702, + "ce_loss_17": 3.040526843070984, + "ce_loss_23": 2.901843559741974, + "ce_loss_3": 4.239118087291717, + "ce_loss_6": 3.921295201778412, + "epoch": 0.431, + "grad_norm": 1136.0, + "kl_loss_12": 1158.5338500976563, + "kl_loss_17": 319.30886993408205, + "kl_loss_3": 2805.543896484375, + "kl_loss_6": 2183.512487792969, + "learning_rate": 0.0006163369394041111, + "loss": 1605.8267, + "step": 4310 + }, + { + "ce_loss_12": 3.345866930484772, + "ce_loss_17": 2.974448561668396, + "ce_loss_23": 2.838093012571335, + "ce_loss_3": 4.1859783172607425, + "ce_loss_6": 3.8710575103759766, + "epoch": 0.432, + "grad_norm": 1128.0, + "kl_loss_12": 1158.7231079101562, + "kl_loss_17": 311.65046844482424, + "kl_loss_3": 2838.497119140625, + "kl_loss_6": 2212.0281005859374, + "learning_rate": 0.0006147932395418205, + "loss": 1657.0939, + "step": 4320 + }, + { + "ce_loss_12": 3.3821220636367797, + "ce_loss_17": 3.0188361406326294, + "ce_loss_23": 2.883270764350891, + "ce_loss_3": 4.1908201456069945, + "ce_loss_6": 3.872398817539215, + "epoch": 0.433, + "grad_norm": 1448.0, + "kl_loss_12": 1142.3731201171875, + "kl_loss_17": 313.32046966552736, + "kl_loss_3": 2772.5299682617188, + "kl_loss_6": 2137.9815185546877, + "learning_rate": 0.0006132483837128823, + "loss": 1581.4684, + "step": 4330 + }, + { + "ce_loss_12": 3.360012209415436, + "ce_loss_17": 2.994674015045166, + "ce_loss_23": 2.859726941585541, + "ce_loss_3": 4.202695202827454, + "ce_loss_6": 3.8732985854148865, + "epoch": 0.434, + "grad_norm": 912.0, + "kl_loss_12": 1148.7674255371094, + "kl_loss_17": 310.43812255859376, + "kl_loss_3": 2840.497424316406, + "kl_loss_6": 2196.349890136719, + "learning_rate": 0.0006117023874739772, + "loss": 1622.8654, + "step": 4340 + }, + { + "ce_loss_12": 3.367009127140045, + "ce_loss_17": 2.9907040119171144, + "ce_loss_23": 2.852668786048889, + "ce_loss_3": 4.191274094581604, + "ce_loss_6": 3.87186518907547, + "epoch": 0.435, + "grad_norm": 1024.0, + "kl_loss_12": 1161.118389892578, + "kl_loss_17": 313.2685745239258, + "kl_loss_3": 2827.9594970703124, + "kl_loss_6": 2188.6732482910156, + "learning_rate": 0.0006101552663932703, + "loss": 1632.4307, + "step": 4350 + }, + { + "ce_loss_12": 3.3917209982872008, + "ce_loss_17": 3.022636556625366, + "ce_loss_23": 2.884074628353119, + "ce_loss_3": 4.2065025806427006, + "ce_loss_6": 3.8965853214263917, + "epoch": 0.436, + "grad_norm": 876.0, + "kl_loss_12": 1152.564471435547, + "kl_loss_17": 320.2475158691406, + "kl_loss_3": 2783.341162109375, + "kl_loss_6": 2171.592877197266, + "learning_rate": 0.0006086070360502539, + "loss": 1609.012, + "step": 4360 + }, + { + "ce_loss_12": 3.3906732559204102, + "ce_loss_17": 3.0208441615104675, + "ce_loss_23": 2.8845319390296935, + "ce_loss_3": 4.21385703086853, + "ce_loss_6": 3.896357476711273, + "epoch": 0.437, + "grad_norm": 1016.0, + "kl_loss_12": 1148.9530700683595, + "kl_loss_17": 310.03978576660154, + "kl_loss_3": 2806.8066650390624, + "kl_loss_6": 2181.143347167969, + "learning_rate": 0.0006070577120355903, + "loss": 1614.329, + "step": 4370 + }, + { + "ce_loss_12": 3.3964044094085692, + "ce_loss_17": 3.0242053508758544, + "ce_loss_23": 2.890236794948578, + "ce_loss_3": 4.183533334732056, + "ce_loss_6": 3.871987521648407, + "epoch": 0.438, + "grad_norm": 960.0, + "kl_loss_12": 1123.01728515625, + "kl_loss_17": 306.5171600341797, + "kl_loss_3": 2704.7479614257813, + "kl_loss_6": 2085.7805908203127, + "learning_rate": 0.0006055073099509549, + "loss": 1585.2225, + "step": 4380 + }, + { + "ce_loss_12": 3.4402162551879885, + "ce_loss_17": 3.0814417481422423, + "ce_loss_23": 2.9468929171562195, + "ce_loss_3": 4.247218203544617, + "ce_loss_6": 3.933430850505829, + "epoch": 0.439, + "grad_norm": 1064.0, + "kl_loss_12": 1131.3260192871094, + "kl_loss_17": 311.39515380859376, + "kl_loss_3": 2749.497119140625, + "kl_loss_6": 2138.216192626953, + "learning_rate": 0.0006039558454088796, + "loss": 1612.5506, + "step": 4390 + }, + { + "ce_loss_12": 3.422533404827118, + "ce_loss_17": 3.0480231046676636, + "ce_loss_23": 2.9077144980430605, + "ce_loss_3": 4.241112649440765, + "ce_loss_6": 3.9317553758621218, + "epoch": 0.44, + "grad_norm": 1256.0, + "kl_loss_12": 1146.5663787841797, + "kl_loss_17": 313.3567604064941, + "kl_loss_3": 2788.4807006835936, + "kl_loss_6": 2172.3048828125, + "learning_rate": 0.0006024033340325954, + "loss": 1576.1466, + "step": 4400 + }, + { + "ce_loss_12": 3.464881455898285, + "ce_loss_17": 3.1081199288368224, + "ce_loss_23": 2.9811485767364503, + "ce_loss_3": 4.24443507194519, + "ce_loss_6": 3.940363574028015, + "epoch": 0.441, + "grad_norm": 884.0, + "kl_loss_12": 1095.667462158203, + "kl_loss_17": 298.9134033203125, + "kl_loss_3": 2663.990344238281, + "kl_loss_6": 2061.230120849609, + "learning_rate": 0.0006008497914558743, + "loss": 1566.9678, + "step": 4410 + }, + { + "ce_loss_12": 3.4442933082580565, + "ce_loss_17": 3.0712830781936646, + "ce_loss_23": 2.9269362568855284, + "ce_loss_3": 4.272569918632508, + "ce_loss_6": 3.945245790481567, + "epoch": 0.442, + "grad_norm": 1016.0, + "kl_loss_12": 1164.5831817626954, + "kl_loss_17": 326.71973876953126, + "kl_loss_3": 2832.883654785156, + "kl_loss_6": 2189.940539550781, + "learning_rate": 0.0005992952333228728, + "loss": 1621.5256, + "step": 4420 + }, + { + "ce_loss_12": 3.3685991048812864, + "ce_loss_17": 3.00733345746994, + "ce_loss_23": 2.8735783815383913, + "ce_loss_3": 4.198529326915741, + "ce_loss_6": 3.882496440410614, + "epoch": 0.443, + "grad_norm": 1032.0, + "kl_loss_12": 1138.6497192382812, + "kl_loss_17": 309.3685668945312, + "kl_loss_3": 2800.78447265625, + "kl_loss_6": 2179.3860107421874, + "learning_rate": 0.0005977396752879741, + "loss": 1591.6296, + "step": 4430 + }, + { + "ce_loss_12": 3.3082218766212463, + "ce_loss_17": 2.934286594390869, + "ce_loss_23": 2.795569658279419, + "ce_loss_3": 4.121199953556061, + "ce_loss_6": 3.8101864218711854, + "epoch": 0.444, + "grad_norm": 1288.0, + "kl_loss_12": 1158.0689025878905, + "kl_loss_17": 315.5136322021484, + "kl_loss_3": 2796.026611328125, + "kl_loss_6": 2176.3042724609377, + "learning_rate": 0.0005961831330156305, + "loss": 1591.341, + "step": 4440 + }, + { + "ce_loss_12": 3.4397592782974242, + "ce_loss_17": 3.0738757252693176, + "ce_loss_23": 2.934872305393219, + "ce_loss_3": 4.28535441160202, + "ce_loss_6": 3.9639587998390198, + "epoch": 0.445, + "grad_norm": 1232.0, + "kl_loss_12": 1144.8870971679687, + "kl_loss_17": 316.44542388916017, + "kl_loss_3": 2836.933642578125, + "kl_loss_6": 2195.8565368652344, + "learning_rate": 0.0005946256221802051, + "loss": 1641.4605, + "step": 4450 + }, + { + "ce_loss_12": 3.3827796936035157, + "ce_loss_17": 3.0436911463737486, + "ce_loss_23": 2.911225152015686, + "ce_loss_3": 4.180616104602814, + "ce_loss_6": 3.864882254600525, + "epoch": 0.446, + "grad_norm": 1104.0, + "kl_loss_12": 1090.165689086914, + "kl_loss_17": 312.66605987548826, + "kl_loss_3": 2692.0668334960938, + "kl_loss_6": 2074.98994140625, + "learning_rate": 0.0005930671584658151, + "loss": 1638.3902, + "step": 4460 + }, + { + "ce_loss_12": 3.419265556335449, + "ce_loss_17": 3.0543219804763795, + "ce_loss_23": 2.9189966201782225, + "ce_loss_3": 4.228051006793976, + "ce_loss_6": 3.9190755128860473, + "epoch": 0.447, + "grad_norm": 1144.0, + "kl_loss_12": 1135.6621948242187, + "kl_loss_17": 319.49698638916016, + "kl_loss_3": 2777.132958984375, + "kl_loss_6": 2155.2238891601564, + "learning_rate": 0.0005915077575661722, + "loss": 1618.4136, + "step": 4470 + }, + { + "ce_loss_12": 3.442099952697754, + "ce_loss_17": 3.07456750869751, + "ce_loss_23": 2.9299558758735658, + "ce_loss_3": 4.255296397209167, + "ce_loss_6": 3.9377192854881287, + "epoch": 0.448, + "grad_norm": 1072.0, + "kl_loss_12": 1161.6586822509767, + "kl_loss_17": 332.67717895507815, + "kl_loss_3": 2798.3115966796877, + "kl_loss_6": 2179.07138671875, + "learning_rate": 0.000589947435184427, + "loss": 1596.1469, + "step": 4480 + }, + { + "ce_loss_12": 3.470191848278046, + "ce_loss_17": 3.1280281066894533, + "ce_loss_23": 2.990417408943176, + "ce_loss_3": 4.238299298286438, + "ce_loss_6": 3.934940552711487, + "epoch": 0.449, + "grad_norm": 1136.0, + "kl_loss_12": 1124.0675689697266, + "kl_loss_17": 314.65479736328126, + "kl_loss_3": 2680.177978515625, + "kl_loss_6": 2075.8083312988283, + "learning_rate": 0.0005883862070330078, + "loss": 1577.6258, + "step": 4490 + }, + { + "ce_loss_12": 3.438168096542358, + "ce_loss_17": 3.063719856739044, + "ce_loss_23": 2.9275577425956727, + "ce_loss_3": 4.238969564437866, + "ce_loss_6": 3.9301570534706114, + "epoch": 0.45, + "grad_norm": 1168.0, + "kl_loss_12": 1152.8913330078126, + "kl_loss_17": 316.52400970458984, + "kl_loss_3": 2774.6293701171876, + "kl_loss_6": 2158.622393798828, + "learning_rate": 0.0005868240888334653, + "loss": 1597.5406, + "step": 4500 + }, + { + "ce_loss_12": 3.3346810936927795, + "ce_loss_17": 2.9612236976623536, + "ce_loss_23": 2.819701302051544, + "ce_loss_3": 4.172786235809326, + "ce_loss_6": 3.844736897945404, + "epoch": 0.451, + "grad_norm": 960.0, + "kl_loss_12": 1160.5079437255858, + "kl_loss_17": 321.31404266357424, + "kl_loss_3": 2839.0437744140627, + "kl_loss_6": 2195.8436950683595, + "learning_rate": 0.0005852610963163119, + "loss": 1618.7336, + "step": 4510 + }, + { + "ce_loss_12": 3.347758114337921, + "ce_loss_17": 2.9786689639091493, + "ce_loss_23": 2.8448096394538878, + "ce_loss_3": 4.152521347999572, + "ce_loss_6": 3.8433269381523134, + "epoch": 0.452, + "grad_norm": 1056.0, + "kl_loss_12": 1137.2014526367188, + "kl_loss_17": 310.4284866333008, + "kl_loss_3": 2761.944201660156, + "kl_loss_6": 2144.6801025390623, + "learning_rate": 0.0005836972452208654, + "loss": 1577.3804, + "step": 4520 + }, + { + "ce_loss_12": 3.3452757716178896, + "ce_loss_17": 2.9843344926834106, + "ce_loss_23": 2.8507073402404783, + "ce_loss_3": 4.172976124286651, + "ce_loss_6": 3.8575175166130067, + "epoch": 0.453, + "grad_norm": 1144.0, + "kl_loss_12": 1132.9714324951171, + "kl_loss_17": 313.4719665527344, + "kl_loss_3": 2796.802868652344, + "kl_loss_6": 2169.0428100585937, + "learning_rate": 0.0005821325512950885, + "loss": 1598.8304, + "step": 4530 + }, + { + "ce_loss_12": 3.3669161796569824, + "ce_loss_17": 3.005538260936737, + "ce_loss_23": 2.8709388375282288, + "ce_loss_3": 4.176382279396057, + "ce_loss_6": 3.8579967260360717, + "epoch": 0.454, + "grad_norm": 980.0, + "kl_loss_12": 1107.4463897705077, + "kl_loss_17": 305.12395706176756, + "kl_loss_3": 2722.3322875976564, + "kl_loss_6": 2097.39970703125, + "learning_rate": 0.0005805670302954321, + "loss": 1584.4574, + "step": 4540 + }, + { + "ce_loss_12": 3.3633339762687684, + "ce_loss_17": 3.010383832454681, + "ce_loss_23": 2.8782918214797975, + "ce_loss_3": 4.172574400901794, + "ce_loss_6": 3.8587594032287598, + "epoch": 0.455, + "grad_norm": 1056.0, + "kl_loss_12": 1112.6406524658203, + "kl_loss_17": 299.7900199890137, + "kl_loss_3": 2729.6116943359375, + "kl_loss_6": 2117.4360534667967, + "learning_rate": 0.000579000697986675, + "loss": 1563.8545, + "step": 4550 + }, + { + "ce_loss_12": 3.362711465358734, + "ce_loss_17": 2.9764938712120057, + "ce_loss_23": 2.835140883922577, + "ce_loss_3": 4.201532959938049, + "ce_loss_6": 3.8728991508483888, + "epoch": 0.456, + "grad_norm": 1552.0, + "kl_loss_12": 1178.7279296875, + "kl_loss_17": 323.01573944091797, + "kl_loss_3": 2854.2749877929687, + "kl_loss_6": 2213.0695861816407, + "learning_rate": 0.0005774335701417662, + "loss": 1613.0781, + "step": 4560 + }, + { + "ce_loss_12": 3.3355695724487306, + "ce_loss_17": 2.9634481072425842, + "ce_loss_23": 2.828409492969513, + "ce_loss_3": 4.187091267108917, + "ce_loss_6": 3.866722321510315, + "epoch": 0.457, + "grad_norm": 956.0, + "kl_loss_12": 1150.532940673828, + "kl_loss_17": 307.76855926513673, + "kl_loss_3": 2868.6227416992188, + "kl_loss_6": 2231.113848876953, + "learning_rate": 0.0005758656625416658, + "loss": 1613.2054, + "step": 4570 + }, + { + "ce_loss_12": 3.3876204609870912, + "ce_loss_17": 3.027148795127869, + "ce_loss_23": 2.8844519376754763, + "ce_loss_3": 4.2057746887207035, + "ce_loss_6": 3.8870466589927672, + "epoch": 0.458, + "grad_norm": 956.0, + "kl_loss_12": 1136.8851470947266, + "kl_loss_17": 316.8278106689453, + "kl_loss_3": 2786.901501464844, + "kl_loss_6": 2145.4644104003905, + "learning_rate": 0.0005742969909751859, + "loss": 1575.2155, + "step": 4580 + }, + { + "ce_loss_12": 3.4007017374038697, + "ce_loss_17": 3.034340000152588, + "ce_loss_23": 2.89749014377594, + "ce_loss_3": 4.228575205802917, + "ce_loss_6": 3.9102230429649354, + "epoch": 0.459, + "grad_norm": 944.0, + "kl_loss_12": 1143.504412841797, + "kl_loss_17": 312.67955169677737, + "kl_loss_3": 2813.0753662109373, + "kl_loss_6": 2178.4854736328125, + "learning_rate": 0.0005727275712388318, + "loss": 1615.2342, + "step": 4590 + }, + { + "ce_loss_12": 3.3975281357765197, + "ce_loss_17": 3.0452203273773195, + "ce_loss_23": 2.9190300822257997, + "ce_loss_3": 4.187395370006561, + "ce_loss_6": 3.8785125851631164, + "epoch": 0.46, + "grad_norm": 980.0, + "kl_loss_12": 1097.602850341797, + "kl_loss_17": 300.3045196533203, + "kl_loss_3": 2693.808935546875, + "kl_loss_6": 2085.7624572753907, + "learning_rate": 0.0005711574191366427, + "loss": 1563.5143, + "step": 4600 + }, + { + "ce_loss_12": 3.3609559893608094, + "ce_loss_17": 3.004966366291046, + "ce_loss_23": 2.868931531906128, + "ce_loss_3": 4.1772660374641415, + "ce_loss_6": 3.8511223554611207, + "epoch": 0.461, + "grad_norm": 828.0, + "kl_loss_12": 1116.0391632080077, + "kl_loss_17": 304.19537963867185, + "kl_loss_3": 2751.3671752929686, + "kl_loss_6": 2109.9775939941405, + "learning_rate": 0.0005695865504800327, + "loss": 1566.2516, + "step": 4610 + }, + { + "ce_loss_12": 3.3485647439956665, + "ce_loss_17": 2.95493905544281, + "ce_loss_23": 2.80878484249115, + "ce_loss_3": 4.229027032852173, + "ce_loss_6": 3.8901962757110597, + "epoch": 0.462, + "grad_norm": 1144.0, + "kl_loss_12": 1214.5166809082032, + "kl_loss_17": 328.5302200317383, + "kl_loss_3": 2971.7057861328126, + "kl_loss_6": 2304.507421875, + "learning_rate": 0.0005680149810876322, + "loss": 1641.5496, + "step": 4620 + }, + { + "ce_loss_12": 3.359517002105713, + "ce_loss_17": 2.9971216320991516, + "ce_loss_23": 2.866685378551483, + "ce_loss_3": 4.192817986011505, + "ce_loss_6": 3.871630370616913, + "epoch": 0.463, + "grad_norm": 1968.0, + "kl_loss_12": 1122.232159423828, + "kl_loss_17": 304.9923568725586, + "kl_loss_3": 2791.3302612304688, + "kl_loss_6": 2163.3309997558595, + "learning_rate": 0.0005664427267851271, + "loss": 1587.3979, + "step": 4630 + }, + { + "ce_loss_12": 3.2848004341125487, + "ce_loss_17": 2.923465597629547, + "ce_loss_23": 2.7880022048950197, + "ce_loss_3": 4.112053787708282, + "ce_loss_6": 3.796520805358887, + "epoch": 0.464, + "grad_norm": 1176.0, + "kl_loss_12": 1109.333233642578, + "kl_loss_17": 302.905500793457, + "kl_loss_3": 2762.598498535156, + "kl_loss_6": 2136.8950744628905, + "learning_rate": 0.0005648698034051009, + "loss": 1566.1207, + "step": 4640 + }, + { + "ce_loss_12": 3.3933666944503784, + "ce_loss_17": 3.0286192536354064, + "ce_loss_23": 2.893868792057037, + "ce_loss_3": 4.239399254322052, + "ce_loss_6": 3.918375754356384, + "epoch": 0.465, + "grad_norm": 976.0, + "kl_loss_12": 1127.1548950195313, + "kl_loss_17": 305.87921447753905, + "kl_loss_3": 2811.355078125, + "kl_loss_6": 2189.2998901367187, + "learning_rate": 0.0005632962267868747, + "loss": 1574.9604, + "step": 4650 + }, + { + "ce_loss_12": 3.3286603569984434, + "ce_loss_17": 2.9657166481018065, + "ce_loss_23": 2.83941011428833, + "ce_loss_3": 4.1424295663833615, + "ce_loss_6": 3.8242884278297424, + "epoch": 0.466, + "grad_norm": 1128.0, + "kl_loss_12": 1115.7930053710938, + "kl_loss_17": 298.15654907226565, + "kl_loss_3": 2751.6337646484376, + "kl_loss_6": 2123.6877258300783, + "learning_rate": 0.0005617220127763474, + "loss": 1588.1186, + "step": 4660 + }, + { + "ce_loss_12": 3.399213743209839, + "ce_loss_17": 3.049934506416321, + "ce_loss_23": 2.9157302737236024, + "ce_loss_3": 4.208151781558991, + "ce_loss_6": 3.887350296974182, + "epoch": 0.467, + "grad_norm": 984.0, + "kl_loss_12": 1117.3792602539063, + "kl_loss_17": 306.5261657714844, + "kl_loss_3": 2735.5549438476564, + "kl_loss_6": 2100.812664794922, + "learning_rate": 0.0005601471772258368, + "loss": 1586.6715, + "step": 4670 + }, + { + "ce_loss_12": 3.382119631767273, + "ce_loss_17": 3.033186662197113, + "ce_loss_23": 2.8996434092521666, + "ce_loss_3": 4.197882652282715, + "ce_loss_6": 3.8774981260299684, + "epoch": 0.468, + "grad_norm": 1048.0, + "kl_loss_12": 1095.82021484375, + "kl_loss_17": 304.3485252380371, + "kl_loss_3": 2719.297412109375, + "kl_loss_6": 2086.8393798828124, + "learning_rate": 0.0005585717359939192, + "loss": 1593.2576, + "step": 4680 + }, + { + "ce_loss_12": 3.3027979850769045, + "ce_loss_17": 2.945749342441559, + "ce_loss_23": 2.8152668833732606, + "ce_loss_3": 4.104500102996826, + "ce_loss_6": 3.7914631724357606, + "epoch": 0.469, + "grad_norm": 1224.0, + "kl_loss_12": 1105.9541046142579, + "kl_loss_17": 302.2694259643555, + "kl_loss_3": 2706.378515625, + "kl_loss_6": 2090.2652587890625, + "learning_rate": 0.0005569957049452703, + "loss": 1594.227, + "step": 4690 + }, + { + "ce_loss_12": 3.369704079627991, + "ce_loss_17": 3.0091665029525756, + "ce_loss_23": 2.871000623703003, + "ce_loss_3": 4.200240743160248, + "ce_loss_6": 3.874775540828705, + "epoch": 0.47, + "grad_norm": 1040.0, + "kl_loss_12": 1132.7062622070312, + "kl_loss_17": 311.7139724731445, + "kl_loss_3": 2804.237646484375, + "kl_loss_6": 2159.6342346191404, + "learning_rate": 0.0005554190999505056, + "loss": 1606.4697, + "step": 4700 + }, + { + "ce_loss_12": 3.491312766075134, + "ce_loss_17": 3.1184787154197693, + "ce_loss_23": 2.9828242897987365, + "ce_loss_3": 4.289120233058929, + "ce_loss_6": 3.9822962641716004, + "epoch": 0.471, + "grad_norm": 1096.0, + "kl_loss_12": 1157.3537475585938, + "kl_loss_17": 318.26240692138674, + "kl_loss_3": 2775.381311035156, + "kl_loss_6": 2164.731060791016, + "learning_rate": 0.0005538419368860196, + "loss": 1548.7658, + "step": 4710 + }, + { + "ce_loss_12": 3.3963905811309814, + "ce_loss_17": 3.0453217029571533, + "ce_loss_23": 2.909340965747833, + "ce_loss_3": 4.201114368438721, + "ce_loss_6": 3.8916338801383974, + "epoch": 0.472, + "grad_norm": 1040.0, + "kl_loss_12": 1125.0690490722657, + "kl_loss_17": 310.0855987548828, + "kl_loss_3": 2735.7884887695313, + "kl_loss_6": 2123.482470703125, + "learning_rate": 0.0005522642316338268, + "loss": 1607.8658, + "step": 4720 + }, + { + "ce_loss_12": 3.4153844356536864, + "ce_loss_17": 3.056465709209442, + "ce_loss_23": 2.9310761451721192, + "ce_loss_3": 4.222506582736969, + "ce_loss_6": 3.91341096162796, + "epoch": 0.473, + "grad_norm": 908.0, + "kl_loss_12": 1125.3278381347657, + "kl_loss_17": 305.9197311401367, + "kl_loss_3": 2736.0848754882813, + "kl_loss_6": 2125.858508300781, + "learning_rate": 0.0005506860000814017, + "loss": 1615.6734, + "step": 4730 + }, + { + "ce_loss_12": 3.418290138244629, + "ce_loss_17": 3.0670406818389893, + "ce_loss_23": 2.942454922199249, + "ce_loss_3": 4.213110136985779, + "ce_loss_6": 3.897139918804169, + "epoch": 0.474, + "grad_norm": 988.0, + "kl_loss_12": 1103.0640380859375, + "kl_loss_17": 293.4414855957031, + "kl_loss_3": 2694.6202392578125, + "kl_loss_6": 2075.2019470214846, + "learning_rate": 0.0005491072581215186, + "loss": 1574.7879, + "step": 4740 + }, + { + "ce_loss_12": 3.4341385841369627, + "ce_loss_17": 3.0710843801498413, + "ce_loss_23": 2.933401334285736, + "ce_loss_3": 4.239067649841308, + "ce_loss_6": 3.9195735335350035, + "epoch": 0.475, + "grad_norm": 1104.0, + "kl_loss_12": 1139.1321960449218, + "kl_loss_17": 314.00052337646486, + "kl_loss_3": 2772.883679199219, + "kl_loss_6": 2136.547479248047, + "learning_rate": 0.0005475280216520913, + "loss": 1562.3973, + "step": 4750 + }, + { + "ce_loss_12": 3.349986743927002, + "ce_loss_17": 2.997076177597046, + "ce_loss_23": 2.866178572177887, + "ce_loss_3": 4.152829468250275, + "ce_loss_6": 3.837746262550354, + "epoch": 0.476, + "grad_norm": 1192.0, + "kl_loss_12": 1096.0175872802733, + "kl_loss_17": 301.01770782470703, + "kl_loss_3": 2698.3632202148438, + "kl_loss_6": 2076.271063232422, + "learning_rate": 0.0005459483065760138, + "loss": 1595.573, + "step": 4760 + }, + { + "ce_loss_12": 3.3164255142211916, + "ce_loss_17": 2.941723871231079, + "ce_loss_23": 2.8090891242027283, + "ce_loss_3": 4.179085612297058, + "ce_loss_6": 3.8568483710289003, + "epoch": 0.477, + "grad_norm": 1048.0, + "kl_loss_12": 1156.2247375488282, + "kl_loss_17": 306.74070129394534, + "kl_loss_3": 2867.866845703125, + "kl_loss_6": 2235.567724609375, + "learning_rate": 0.0005443681288009991, + "loss": 1601.8034, + "step": 4770 + }, + { + "ce_loss_12": 3.3487496376037598, + "ce_loss_17": 2.9888516068458557, + "ce_loss_23": 2.8586570262908935, + "ce_loss_3": 4.1714806199073795, + "ce_loss_6": 3.8551082015037537, + "epoch": 0.478, + "grad_norm": 836.0, + "kl_loss_12": 1120.5207489013671, + "kl_loss_17": 301.96836547851564, + "kl_loss_3": 2774.1383666992188, + "kl_loss_6": 2145.277478027344, + "learning_rate": 0.0005427875042394199, + "loss": 1587.4909, + "step": 4780 + }, + { + "ce_loss_12": 3.3863983273506166, + "ce_loss_17": 3.0380003809928895, + "ce_loss_23": 2.8937386155128477, + "ce_loss_3": 4.200499761104584, + "ce_loss_6": 3.8739574551582336, + "epoch": 0.479, + "grad_norm": 1016.0, + "kl_loss_12": 1115.8671905517579, + "kl_loss_17": 314.9477005004883, + "kl_loss_3": 2738.422473144531, + "kl_loss_6": 2093.9318725585936, + "learning_rate": 0.0005412064488081482, + "loss": 1597.256, + "step": 4790 + }, + { + "ce_loss_12": 3.3651809096336365, + "ce_loss_17": 3.0185546040534974, + "ce_loss_23": 2.8903374314308166, + "ce_loss_3": 4.176959121227265, + "ce_loss_6": 3.8557177424430846, + "epoch": 0.48, + "grad_norm": 984.0, + "kl_loss_12": 1079.9235870361329, + "kl_loss_17": 297.1005088806152, + "kl_loss_3": 2698.806689453125, + "kl_loss_6": 2073.9032958984376, + "learning_rate": 0.0005396249784283942, + "loss": 1548.1096, + "step": 4800 + }, + { + "ce_loss_12": 3.410639774799347, + "ce_loss_17": 3.041554272174835, + "ce_loss_23": 2.904786264896393, + "ce_loss_3": 4.251404666900635, + "ce_loss_6": 3.9328576564788817, + "epoch": 0.481, + "grad_norm": 968.0, + "kl_loss_12": 1150.5845642089844, + "kl_loss_17": 312.17601776123047, + "kl_loss_3": 2835.917614746094, + "kl_loss_6": 2204.481726074219, + "learning_rate": 0.0005380431090255476, + "loss": 1610.3476, + "step": 4810 + }, + { + "ce_loss_12": 3.3847161650657656, + "ce_loss_17": 3.035974931716919, + "ce_loss_23": 2.9126381397247316, + "ce_loss_3": 4.18476665019989, + "ce_loss_6": 3.875488114356995, + "epoch": 0.482, + "grad_norm": 912.0, + "kl_loss_12": 1089.5160858154297, + "kl_loss_17": 290.29730834960935, + "kl_loss_3": 2687.77041015625, + "kl_loss_6": 2075.221826171875, + "learning_rate": 0.0005364608565290155, + "loss": 1548.8517, + "step": 4820 + }, + { + "ce_loss_12": 3.400247502326965, + "ce_loss_17": 3.043899357318878, + "ce_loss_23": 2.915937912464142, + "ce_loss_3": 4.2259934425354, + "ce_loss_6": 3.9076520800590515, + "epoch": 0.483, + "grad_norm": 996.0, + "kl_loss_12": 1115.6108764648438, + "kl_loss_17": 304.19869842529295, + "kl_loss_3": 2769.1885986328125, + "kl_loss_6": 2143.8648986816406, + "learning_rate": 0.0005348782368720626, + "loss": 1584.2228, + "step": 4830 + }, + { + "ce_loss_12": 3.3320335507392884, + "ce_loss_17": 2.9882596731185913, + "ce_loss_23": 2.854368841648102, + "ce_loss_3": 4.15036495923996, + "ce_loss_6": 3.8380707025527956, + "epoch": 0.484, + "grad_norm": 844.0, + "kl_loss_12": 1091.236019897461, + "kl_loss_17": 298.0321350097656, + "kl_loss_3": 2724.5786010742186, + "kl_loss_6": 2096.6154052734373, + "learning_rate": 0.000533295265991652, + "loss": 1576.1192, + "step": 4840 + }, + { + "ce_loss_12": 3.4043202638626098, + "ce_loss_17": 3.048448932170868, + "ce_loss_23": 2.914829695224762, + "ce_loss_3": 4.195201706886292, + "ce_loss_6": 3.8884942173957824, + "epoch": 0.485, + "grad_norm": 1088.0, + "kl_loss_12": 1097.879165649414, + "kl_loss_17": 301.0802597045898, + "kl_loss_3": 2693.1909423828124, + "kl_loss_6": 2091.048065185547, + "learning_rate": 0.0005317119598282822, + "loss": 1546.7378, + "step": 4850 + }, + { + "ce_loss_12": 3.4163004517555238, + "ce_loss_17": 3.055159831047058, + "ce_loss_23": 2.921207368373871, + "ce_loss_3": 4.212529075145722, + "ce_loss_6": 3.9011515617370605, + "epoch": 0.486, + "grad_norm": 1272.0, + "kl_loss_12": 1117.5282165527344, + "kl_loss_17": 304.5247299194336, + "kl_loss_3": 2715.294299316406, + "kl_loss_6": 2106.637139892578, + "learning_rate": 0.0005301283343258293, + "loss": 1562.6322, + "step": 4860 + }, + { + "ce_loss_12": 3.452582538127899, + "ce_loss_17": 3.1055136561393737, + "ce_loss_23": 2.97325998544693, + "ce_loss_3": 4.245788037776947, + "ce_loss_6": 3.9398349642753603, + "epoch": 0.487, + "grad_norm": 976.0, + "kl_loss_12": 1102.8094665527344, + "kl_loss_17": 303.8196731567383, + "kl_loss_3": 2696.489990234375, + "kl_loss_6": 2084.3579345703124, + "learning_rate": 0.000528544405431384, + "loss": 1544.8414, + "step": 4870 + }, + { + "ce_loss_12": 3.3599804520606993, + "ce_loss_17": 2.9914204597473146, + "ce_loss_23": 2.8525378108024597, + "ce_loss_3": 4.176129698753357, + "ce_loss_6": 3.85386004447937, + "epoch": 0.488, + "grad_norm": 1016.0, + "kl_loss_12": 1146.3044921875, + "kl_loss_17": 315.3286987304688, + "kl_loss_3": 2801.2627197265624, + "kl_loss_6": 2156.7280395507814, + "learning_rate": 0.000526960189095093, + "loss": 1600.166, + "step": 4880 + }, + { + "ce_loss_12": 3.334537076950073, + "ce_loss_17": 2.9861772298812865, + "ce_loss_23": 2.852927792072296, + "ce_loss_3": 4.146724760532379, + "ce_loss_6": 3.8291383743286134, + "epoch": 0.489, + "grad_norm": 972.0, + "kl_loss_12": 1091.358187866211, + "kl_loss_17": 299.0898208618164, + "kl_loss_3": 2717.929248046875, + "kl_loss_6": 2096.2663513183593, + "learning_rate": 0.0005253757012699972, + "loss": 1556.6723, + "step": 4890 + }, + { + "ce_loss_12": 3.3993546605110168, + "ce_loss_17": 3.0482656478881838, + "ce_loss_23": 2.9207078576087953, + "ce_loss_3": 4.2034744143486025, + "ce_loss_6": 3.888164055347443, + "epoch": 0.49, + "grad_norm": 1096.0, + "kl_loss_12": 1099.3933380126953, + "kl_loss_17": 298.8610366821289, + "kl_loss_3": 2719.4765380859376, + "kl_loss_6": 2089.0828125, + "learning_rate": 0.0005237909579118712, + "loss": 1578.4719, + "step": 4900 + }, + { + "ce_loss_12": 3.388143837451935, + "ce_loss_17": 3.023465883731842, + "ce_loss_23": 2.8811936497688295, + "ce_loss_3": 4.216781687736511, + "ce_loss_6": 3.900932049751282, + "epoch": 0.491, + "grad_norm": 1208.0, + "kl_loss_12": 1139.244564819336, + "kl_loss_17": 316.09378509521486, + "kl_loss_3": 2809.9015625, + "kl_loss_6": 2175.9791259765625, + "learning_rate": 0.0005222059749790631, + "loss": 1598.0844, + "step": 4910 + }, + { + "ce_loss_12": 3.4135035037994386, + "ce_loss_17": 3.07126407623291, + "ce_loss_23": 2.942831826210022, + "ce_loss_3": 4.193430387973786, + "ce_loss_6": 3.882127547264099, + "epoch": 0.492, + "grad_norm": 1128.0, + "kl_loss_12": 1084.961932373047, + "kl_loss_17": 296.9151382446289, + "kl_loss_3": 2666.9692993164062, + "kl_loss_6": 2048.114013671875, + "learning_rate": 0.0005206207684323337, + "loss": 1526.9676, + "step": 4920 + }, + { + "ce_loss_12": 3.4127950191497805, + "ce_loss_17": 3.055870497226715, + "ce_loss_23": 2.9254420518875124, + "ce_loss_3": 4.215585446357727, + "ce_loss_6": 3.9041973114013673, + "epoch": 0.493, + "grad_norm": 1224.0, + "kl_loss_12": 1125.9021087646483, + "kl_loss_17": 306.9901626586914, + "kl_loss_3": 2730.7314453125, + "kl_loss_6": 2122.603759765625, + "learning_rate": 0.000519035354234695, + "loss": 1590.4694, + "step": 4930 + }, + { + "ce_loss_12": 3.3999263286590575, + "ce_loss_17": 3.0387742042541506, + "ce_loss_23": 2.895954394340515, + "ce_loss_3": 4.199414372444153, + "ce_loss_6": 3.8858419179916384, + "epoch": 0.494, + "grad_norm": 1184.0, + "kl_loss_12": 1124.1252349853517, + "kl_loss_17": 314.1003082275391, + "kl_loss_3": 2728.172705078125, + "kl_loss_6": 2106.65966796875, + "learning_rate": 0.0005174497483512506, + "loss": 1552.0831, + "step": 4940 + }, + { + "ce_loss_12": 3.41754287481308, + "ce_loss_17": 3.0745951890945435, + "ce_loss_23": 2.9491618275642395, + "ce_loss_3": 4.21412056684494, + "ce_loss_6": 3.9022476434707642, + "epoch": 0.495, + "grad_norm": 976.0, + "kl_loss_12": 1096.221469116211, + "kl_loss_17": 295.2589080810547, + "kl_loss_3": 2714.2718505859375, + "kl_loss_6": 2097.348748779297, + "learning_rate": 0.0005158639667490339, + "loss": 1584.319, + "step": 4950 + }, + { + "ce_loss_12": 3.3561907529830934, + "ce_loss_17": 2.9901116251945496, + "ce_loss_23": 2.8571989059448244, + "ce_loss_3": 4.174978446960449, + "ce_loss_6": 3.845904302597046, + "epoch": 0.496, + "grad_norm": 1080.0, + "kl_loss_12": 1125.0650604248046, + "kl_loss_17": 303.3439971923828, + "kl_loss_3": 2763.3949584960938, + "kl_loss_6": 2120.0795471191404, + "learning_rate": 0.0005142780253968481, + "loss": 1574.7215, + "step": 4960 + }, + { + "ce_loss_12": 3.286413645744324, + "ce_loss_17": 2.9384931564331054, + "ce_loss_23": 2.8116806387901305, + "ce_loss_3": 4.088664150238037, + "ce_loss_6": 3.77538526058197, + "epoch": 0.497, + "grad_norm": 1376.0, + "kl_loss_12": 1079.2211120605468, + "kl_loss_17": 295.1123382568359, + "kl_loss_3": 2690.6939697265625, + "kl_loss_6": 2071.3447998046877, + "learning_rate": 0.0005126919402651053, + "loss": 1525.6309, + "step": 4970 + }, + { + "ce_loss_12": 3.370965826511383, + "ce_loss_17": 3.012047052383423, + "ce_loss_23": 2.875856566429138, + "ce_loss_3": 4.195962822437286, + "ce_loss_6": 3.884010601043701, + "epoch": 0.498, + "grad_norm": 1184.0, + "kl_loss_12": 1126.862042236328, + "kl_loss_17": 312.22010040283203, + "kl_loss_3": 2761.840673828125, + "kl_loss_6": 2139.622937011719, + "learning_rate": 0.0005111057273256647, + "loss": 1584.7812, + "step": 4980 + }, + { + "ce_loss_12": 3.419179451465607, + "ce_loss_17": 3.0902063131332396, + "ce_loss_23": 2.971741831302643, + "ce_loss_3": 4.177947437763214, + "ce_loss_6": 3.8815560221672056, + "epoch": 0.499, + "grad_norm": 1224.0, + "kl_loss_12": 1053.150732421875, + "kl_loss_17": 285.0224044799805, + "kl_loss_3": 2569.1810791015623, + "kl_loss_6": 1990.8180358886718, + "learning_rate": 0.0005095194025516733, + "loss": 1504.0578, + "step": 4990 + }, + { + "ce_loss_12": 3.374780237674713, + "ce_loss_17": 3.0321351408958437, + "ce_loss_23": 2.908969295024872, + "ce_loss_3": 4.176579833030701, + "ce_loss_6": 3.8689828872680665, + "epoch": 0.5, + "grad_norm": 1064.0, + "kl_loss_12": 1082.8041015625, + "kl_loss_17": 291.1903495788574, + "kl_loss_3": 2680.8746826171873, + "kl_loss_6": 2074.5982421875, + "learning_rate": 0.000507932981917404, + "loss": 1585.3881, + "step": 5000 + }, + { + "ce_loss_12": 3.362855243682861, + "ce_loss_17": 2.988880681991577, + "ce_loss_23": 2.8498858332633974, + "ce_loss_3": 4.198509395122528, + "ce_loss_6": 3.8798313617706297, + "epoch": 0.501, + "grad_norm": 976.0, + "kl_loss_12": 1162.0169952392578, + "kl_loss_17": 314.58503875732424, + "kl_loss_3": 2829.14833984375, + "kl_loss_6": 2203.0539306640626, + "learning_rate": 0.0005063464813980949, + "loss": 1620.7376, + "step": 5010 + }, + { + "ce_loss_12": 3.3213644504547117, + "ce_loss_17": 2.967113363742828, + "ce_loss_23": 2.8417882204055784, + "ce_loss_3": 4.131510305404663, + "ce_loss_6": 3.8222088694572447, + "epoch": 0.502, + "grad_norm": 1024.0, + "kl_loss_12": 1118.1484283447267, + "kl_loss_17": 299.0059448242188, + "kl_loss_3": 2753.0571533203124, + "kl_loss_6": 2133.07119140625, + "learning_rate": 0.0005047599169697884, + "loss": 1566.5135, + "step": 5020 + }, + { + "ce_loss_12": 3.2775539636611937, + "ce_loss_17": 2.914250874519348, + "ce_loss_23": 2.7824475884437563, + "ce_loss_3": 4.102517211437226, + "ce_loss_6": 3.785682499408722, + "epoch": 0.503, + "grad_norm": 1152.0, + "kl_loss_12": 1109.3389801025392, + "kl_loss_17": 302.9358444213867, + "kl_loss_3": 2754.7373901367187, + "kl_loss_6": 2128.6819702148437, + "learning_rate": 0.000503173304609171, + "loss": 1538.1774, + "step": 5030 + }, + { + "ce_loss_12": 3.3846050620079042, + "ce_loss_17": 3.028143787384033, + "ce_loss_23": 2.8941952705383303, + "ce_loss_3": 4.190663433074951, + "ce_loss_6": 3.878485178947449, + "epoch": 0.504, + "grad_norm": 1192.0, + "kl_loss_12": 1106.5670806884766, + "kl_loss_17": 296.38336486816405, + "kl_loss_3": 2712.2251586914062, + "kl_loss_6": 2105.7641845703124, + "learning_rate": 0.0005015866602934111, + "loss": 1537.4449, + "step": 5040 + }, + { + "ce_loss_12": 3.385323441028595, + "ce_loss_17": 3.0098569989204407, + "ce_loss_23": 2.8715598344802857, + "ce_loss_3": 4.201504063606262, + "ce_loss_6": 3.881215286254883, + "epoch": 0.505, + "grad_norm": 1128.0, + "kl_loss_12": 1161.3495056152344, + "kl_loss_17": 316.38443450927736, + "kl_loss_3": 2794.4003662109376, + "kl_loss_6": 2166.5137634277344, + "learning_rate": 0.0005, + "loss": 1584.4279, + "step": 5050 + }, + { + "ce_loss_12": 3.359061539173126, + "ce_loss_17": 2.9973837614059446, + "ce_loss_23": 2.866459846496582, + "ce_loss_3": 4.173029494285584, + "ce_loss_6": 3.8500274181365968, + "epoch": 0.506, + "grad_norm": 1368.0, + "kl_loss_12": 1112.7315063476562, + "kl_loss_17": 304.95944061279295, + "kl_loss_3": 2741.0337890625, + "kl_loss_6": 2107.1026306152344, + "learning_rate": 0.0004984133397065889, + "loss": 1546.8272, + "step": 5060 + }, + { + "ce_loss_12": 3.372469925880432, + "ce_loss_17": 3.005589461326599, + "ce_loss_23": 2.867159426212311, + "ce_loss_3": 4.18806174993515, + "ce_loss_6": 3.873115861415863, + "epoch": 0.507, + "grad_norm": 1032.0, + "kl_loss_12": 1123.6464630126952, + "kl_loss_17": 306.77745666503904, + "kl_loss_3": 2752.3882690429687, + "kl_loss_6": 2126.8781494140626, + "learning_rate": 0.0004968266953908291, + "loss": 1546.3459, + "step": 5070 + }, + { + "ce_loss_12": 3.3890419483184813, + "ce_loss_17": 3.0321525931358337, + "ce_loss_23": 2.9076250076293944, + "ce_loss_3": 4.213087391853333, + "ce_loss_6": 3.892174541950226, + "epoch": 0.508, + "grad_norm": 1128.0, + "kl_loss_12": 1105.1057861328125, + "kl_loss_17": 292.9428909301758, + "kl_loss_3": 2746.5344970703127, + "kl_loss_6": 2125.439532470703, + "learning_rate": 0.0004952400830302117, + "loss": 1561.7111, + "step": 5080 + }, + { + "ce_loss_12": 3.339200568199158, + "ce_loss_17": 2.973728096485138, + "ce_loss_23": 2.8369530916213987, + "ce_loss_3": 4.165874052047729, + "ce_loss_6": 3.8430152773857116, + "epoch": 0.509, + "grad_norm": 1128.0, + "kl_loss_12": 1136.2608703613282, + "kl_loss_17": 307.2293701171875, + "kl_loss_3": 2784.1413452148436, + "kl_loss_6": 2157.0075561523436, + "learning_rate": 0.0004936535186019053, + "loss": 1572.1361, + "step": 5090 + }, + { + "ce_loss_12": 3.3992201328277587, + "ce_loss_17": 3.059427273273468, + "ce_loss_23": 2.935956287384033, + "ce_loss_3": 4.1924993872642515, + "ce_loss_6": 3.8852957248687745, + "epoch": 0.51, + "grad_norm": 1144.0, + "kl_loss_12": 1067.5751892089843, + "kl_loss_17": 287.5902297973633, + "kl_loss_3": 2651.0362182617187, + "kl_loss_6": 2043.5480590820312, + "learning_rate": 0.000492067018082596, + "loss": 1536.1178, + "step": 5100 + }, + { + "ce_loss_12": 3.382169485092163, + "ce_loss_17": 3.0059693813323975, + "ce_loss_23": 2.8708268642425536, + "ce_loss_3": 4.224006390571594, + "ce_loss_6": 3.899908125400543, + "epoch": 0.511, + "grad_norm": 1392.0, + "kl_loss_12": 1148.9556762695313, + "kl_loss_17": 309.4052139282227, + "kl_loss_3": 2832.4399169921876, + "kl_loss_6": 2198.9977783203126, + "learning_rate": 0.0004904805974483267, + "loss": 1623.9365, + "step": 5110 + }, + { + "ce_loss_12": 3.4949843645095826, + "ce_loss_17": 3.1124078273773192, + "ce_loss_23": 2.9691667199134826, + "ce_loss_3": 4.304324913024902, + "ce_loss_6": 3.9900792956352236, + "epoch": 0.512, + "grad_norm": 1192.0, + "kl_loss_12": 1186.4844299316405, + "kl_loss_17": 324.86098022460936, + "kl_loss_3": 2814.8347045898436, + "kl_loss_6": 2192.9266662597656, + "learning_rate": 0.0004888942726743353, + "loss": 1644.2129, + "step": 5120 + }, + { + "ce_loss_12": 3.3473219513893127, + "ce_loss_17": 2.989601743221283, + "ce_loss_23": 2.8587637186050414, + "ce_loss_3": 4.163712787628174, + "ce_loss_6": 3.8459107637405396, + "epoch": 0.513, + "grad_norm": 1048.0, + "kl_loss_12": 1121.4485717773437, + "kl_loss_17": 305.6918441772461, + "kl_loss_3": 2768.2578125, + "kl_loss_6": 2135.605535888672, + "learning_rate": 0.0004873080597348947, + "loss": 1587.6139, + "step": 5130 + }, + { + "ce_loss_12": 3.262009656429291, + "ce_loss_17": 2.887168896198273, + "ce_loss_23": 2.75510116815567, + "ce_loss_3": 4.116335034370422, + "ce_loss_6": 3.7995688676834107, + "epoch": 0.514, + "grad_norm": 1144.0, + "kl_loss_12": 1153.4715423583984, + "kl_loss_17": 301.694108581543, + "kl_loss_3": 2854.0366943359377, + "kl_loss_6": 2224.2273193359374, + "learning_rate": 0.0004857219746031519, + "loss": 1596.3561, + "step": 5140 + }, + { + "ce_loss_12": 3.3927998185157775, + "ce_loss_17": 3.0430898785591127, + "ce_loss_23": 2.9166905879974365, + "ce_loss_3": 4.200037574768066, + "ce_loss_6": 3.877912938594818, + "epoch": 0.515, + "grad_norm": 1016.0, + "kl_loss_12": 1092.6485656738282, + "kl_loss_17": 300.26861572265625, + "kl_loss_3": 2706.573291015625, + "kl_loss_6": 2077.4241271972655, + "learning_rate": 0.0004841360332509663, + "loss": 1561.7504, + "step": 5150 + }, + { + "ce_loss_12": 3.350694715976715, + "ce_loss_17": 2.9989014267921448, + "ce_loss_23": 2.873697781562805, + "ce_loss_3": 4.148285734653473, + "ce_loss_6": 3.842498481273651, + "epoch": 0.516, + "grad_norm": 1368.0, + "kl_loss_12": 1091.5367370605468, + "kl_loss_17": 291.91494903564455, + "kl_loss_3": 2684.4546142578124, + "kl_loss_6": 2076.9037963867186, + "learning_rate": 0.0004825502516487497, + "loss": 1503.6863, + "step": 5160 + }, + { + "ce_loss_12": 3.3179797649383547, + "ce_loss_17": 2.9602678537368776, + "ce_loss_23": 2.835769760608673, + "ce_loss_3": 4.13564225435257, + "ce_loss_6": 3.835077476501465, + "epoch": 0.517, + "grad_norm": 1400.0, + "kl_loss_12": 1110.5461822509765, + "kl_loss_17": 297.78162994384763, + "kl_loss_3": 2746.1639770507813, + "kl_loss_6": 2157.3029541015626, + "learning_rate": 0.00048096464576530507, + "loss": 1586.2658, + "step": 5170 + }, + { + "ce_loss_12": 3.40576229095459, + "ce_loss_17": 3.0669244527816772, + "ce_loss_23": 2.9372299790382383, + "ce_loss_3": 4.170511937141418, + "ce_loss_6": 3.873363471031189, + "epoch": 0.518, + "grad_norm": 1056.0, + "kl_loss_12": 1087.6305053710937, + "kl_loss_17": 299.07730026245116, + "kl_loss_3": 2632.4614013671876, + "kl_loss_6": 2038.3814025878905, + "learning_rate": 0.00047937923156766646, + "loss": 1524.3064, + "step": 5180 + }, + { + "ce_loss_12": 3.434298062324524, + "ce_loss_17": 3.1022254109382628, + "ce_loss_23": 2.9763625144958494, + "ce_loss_3": 4.1991536021232605, + "ce_loss_6": 3.8984975099563597, + "epoch": 0.519, + "grad_norm": 1352.0, + "kl_loss_12": 1077.075503540039, + "kl_loss_17": 291.15290145874025, + "kl_loss_3": 2635.3423583984377, + "kl_loss_6": 2036.4578491210937, + "learning_rate": 0.00047779402502093696, + "loss": 1529.8139, + "step": 5190 + }, + { + "ce_loss_12": 3.417772078514099, + "ce_loss_17": 3.0711885690689087, + "ce_loss_23": 2.9444741368293763, + "ce_loss_3": 4.217487633228302, + "ce_loss_6": 3.904100239276886, + "epoch": 0.52, + "grad_norm": 1120.0, + "kl_loss_12": 1084.6034698486328, + "kl_loss_17": 296.61804046630857, + "kl_loss_3": 2687.315478515625, + "kl_loss_6": 2068.0660705566406, + "learning_rate": 0.0004762090420881289, + "loss": 1553.0215, + "step": 5200 + }, + { + "ce_loss_12": 3.3423298478126524, + "ce_loss_17": 2.992612087726593, + "ce_loss_23": 2.865809166431427, + "ce_loss_3": 4.1181949257850645, + "ce_loss_6": 3.8178300738334654, + "epoch": 0.521, + "grad_norm": 1104.0, + "kl_loss_12": 1091.2071472167968, + "kl_loss_17": 296.23180694580077, + "kl_loss_3": 2668.9665283203126, + "kl_loss_6": 2073.2246154785157, + "learning_rate": 0.00047462429873000296, + "loss": 1516.2294, + "step": 5210 + }, + { + "ce_loss_12": 3.4136949300765993, + "ce_loss_17": 3.0739993691444396, + "ce_loss_23": 2.944578742980957, + "ce_loss_3": 4.207600545883179, + "ce_loss_6": 3.8948861360549927, + "epoch": 0.522, + "grad_norm": 1272.0, + "kl_loss_12": 1078.202474975586, + "kl_loss_17": 302.248828125, + "kl_loss_3": 2676.73427734375, + "kl_loss_6": 2065.8719177246094, + "learning_rate": 0.0004730398109049071, + "loss": 1528.9749, + "step": 5220 + }, + { + "ce_loss_12": 3.379810082912445, + "ce_loss_17": 3.0066291213035585, + "ce_loss_23": 2.868514358997345, + "ce_loss_3": 4.210710990428924, + "ce_loss_6": 3.89647376537323, + "epoch": 0.523, + "grad_norm": 1208.0, + "kl_loss_12": 1150.9331085205079, + "kl_loss_17": 311.6774368286133, + "kl_loss_3": 2813.1722778320313, + "kl_loss_6": 2192.030694580078, + "learning_rate": 0.000471455594568616, + "loss": 1577.8427, + "step": 5230 + }, + { + "ce_loss_12": 3.407266581058502, + "ce_loss_17": 3.062355172634125, + "ce_loss_23": 2.939179801940918, + "ce_loss_3": 4.188946032524109, + "ce_loss_6": 3.8770066261291505, + "epoch": 0.524, + "grad_norm": 1264.0, + "kl_loss_12": 1081.1963348388672, + "kl_loss_17": 299.0507553100586, + "kl_loss_3": 2653.9161987304688, + "kl_loss_6": 2035.7987121582032, + "learning_rate": 0.00046987166567417086, + "loss": 1547.227, + "step": 5240 + }, + { + "ce_loss_12": 3.349161219596863, + "ce_loss_17": 2.9991302371025084, + "ce_loss_23": 2.8701635241508483, + "ce_loss_3": 4.162693417072296, + "ce_loss_6": 3.8393788576126098, + "epoch": 0.525, + "grad_norm": 996.0, + "kl_loss_12": 1095.2534759521484, + "kl_loss_17": 295.6771751403809, + "kl_loss_3": 2718.0575561523438, + "kl_loss_6": 2089.1939208984377, + "learning_rate": 0.00046828804017171776, + "loss": 1512.2344, + "step": 5250 + }, + { + "ce_loss_12": 3.4031213760375976, + "ce_loss_17": 3.0353243589401244, + "ce_loss_23": 2.9000794291496277, + "ce_loss_3": 4.230943822860718, + "ce_loss_6": 3.912229025363922, + "epoch": 0.526, + "grad_norm": 1312.0, + "kl_loss_12": 1117.801577758789, + "kl_loss_17": 304.54825592041016, + "kl_loss_3": 2767.648645019531, + "kl_loss_6": 2137.6414428710937, + "learning_rate": 0.00046670473400834805, + "loss": 1583.449, + "step": 5260 + }, + { + "ce_loss_12": 3.3178684949874877, + "ce_loss_17": 2.9749055981636046, + "ce_loss_23": 2.8483084440231323, + "ce_loss_3": 4.121661520004272, + "ce_loss_6": 3.8078757643699648, + "epoch": 0.527, + "grad_norm": 1064.0, + "kl_loss_12": 1067.9791290283204, + "kl_loss_17": 292.6219314575195, + "kl_loss_3": 2680.6882202148436, + "kl_loss_6": 2062.2185485839846, + "learning_rate": 0.00046512176312793734, + "loss": 1584.1877, + "step": 5270 + }, + { + "ce_loss_12": 3.322924852371216, + "ce_loss_17": 2.9707990288734436, + "ce_loss_23": 2.8352202653884886, + "ce_loss_3": 4.123947286605835, + "ce_loss_6": 3.817195165157318, + "epoch": 0.528, + "grad_norm": 1112.0, + "kl_loss_12": 1094.3742126464845, + "kl_loss_17": 296.32200927734374, + "kl_loss_3": 2715.054821777344, + "kl_loss_6": 2107.3768615722656, + "learning_rate": 0.00046353914347098467, + "loss": 1567.2074, + "step": 5280 + }, + { + "ce_loss_12": 3.4210985898971558, + "ce_loss_17": 3.067178690433502, + "ce_loss_23": 2.9379127740859987, + "ce_loss_3": 4.217587947845459, + "ce_loss_6": 3.9102158188819884, + "epoch": 0.529, + "grad_norm": 1048.0, + "kl_loss_12": 1083.2941864013671, + "kl_loss_17": 293.2905502319336, + "kl_loss_3": 2691.60302734375, + "kl_loss_6": 2081.5444458007814, + "learning_rate": 0.0004619568909744524, + "loss": 1570.5784, + "step": 5290 + }, + { + "ce_loss_12": 3.4063668847084045, + "ce_loss_17": 3.06554034948349, + "ce_loss_23": 2.939655864238739, + "ce_loss_3": 4.199694049358368, + "ce_loss_6": 3.88633279800415, + "epoch": 0.53, + "grad_norm": 1344.0, + "kl_loss_12": 1087.9134185791015, + "kl_loss_17": 297.81110229492185, + "kl_loss_3": 2678.2871826171877, + "kl_loss_6": 2065.655108642578, + "learning_rate": 0.00046037502157160573, + "loss": 1556.4295, + "step": 5300 + }, + { + "ce_loss_12": 3.3119312047958376, + "ce_loss_17": 2.9507867217063906, + "ce_loss_23": 2.8170288801193237, + "ce_loss_3": 4.108970201015472, + "ce_loss_6": 3.799880337715149, + "epoch": 0.531, + "grad_norm": 1048.0, + "kl_loss_12": 1108.8056365966797, + "kl_loss_17": 303.1284881591797, + "kl_loss_3": 2711.242626953125, + "kl_loss_6": 2104.8263671875, + "learning_rate": 0.00045879355119185207, + "loss": 1567.8275, + "step": 5310 + }, + { + "ce_loss_12": 3.3983437180519105, + "ce_loss_17": 3.0327823519706727, + "ce_loss_23": 2.9004740595817564, + "ce_loss_3": 4.212223029136657, + "ce_loss_6": 3.8889464497566224, + "epoch": 0.532, + "grad_norm": 1152.0, + "kl_loss_12": 1144.2750122070313, + "kl_loss_17": 307.9484588623047, + "kl_loss_3": 2772.0366333007814, + "kl_loss_6": 2136.8219482421873, + "learning_rate": 0.0004572124957605803, + "loss": 1590.5291, + "step": 5320 + }, + { + "ce_loss_12": 3.4015161991119385, + "ce_loss_17": 3.0423429012298584, + "ce_loss_23": 2.906565010547638, + "ce_loss_3": 4.199693059921264, + "ce_loss_6": 3.8853312730789185, + "epoch": 0.533, + "grad_norm": 1136.0, + "kl_loss_12": 1112.8039581298829, + "kl_loss_17": 303.7584228515625, + "kl_loss_3": 2726.5053955078124, + "kl_loss_6": 2097.246826171875, + "learning_rate": 0.00045563187119900103, + "loss": 1542.5536, + "step": 5330 + }, + { + "ce_loss_12": 3.2540330410003664, + "ce_loss_17": 2.89544860124588, + "ce_loss_23": 2.768121588230133, + "ce_loss_3": 4.084176480770111, + "ce_loss_6": 3.7669583201408385, + "epoch": 0.534, + "grad_norm": 1104.0, + "kl_loss_12": 1115.6181274414062, + "kl_loss_17": 299.1415328979492, + "kl_loss_3": 2771.7518310546875, + "kl_loss_6": 2149.6638732910155, + "learning_rate": 0.00045405169342398633, + "loss": 1576.5603, + "step": 5340 + }, + { + "ce_loss_12": 3.349810791015625, + "ce_loss_17": 2.9884414792060854, + "ce_loss_23": 2.8516027212142943, + "ce_loss_3": 4.166842401027679, + "ce_loss_6": 3.8521859884262084, + "epoch": 0.535, + "grad_norm": 1288.0, + "kl_loss_12": 1131.2634643554688, + "kl_loss_17": 310.653182220459, + "kl_loss_3": 2767.4667846679686, + "kl_loss_6": 2154.172009277344, + "learning_rate": 0.0004524719783479088, + "loss": 1551.4629, + "step": 5350 + }, + { + "ce_loss_12": 3.308687424659729, + "ce_loss_17": 2.9389654994010925, + "ce_loss_23": 2.8066301941871643, + "ce_loss_3": 4.13761465549469, + "ce_loss_6": 3.8201239943504333, + "epoch": 0.536, + "grad_norm": 1080.0, + "kl_loss_12": 1133.4922302246093, + "kl_loss_17": 305.0677856445312, + "kl_loss_3": 2793.434191894531, + "kl_loss_6": 2159.982745361328, + "learning_rate": 0.00045089274187848144, + "loss": 1552.6938, + "step": 5360 + }, + { + "ce_loss_12": 3.3914544343948365, + "ce_loss_17": 3.0516345858573914, + "ce_loss_23": 2.925180435180664, + "ce_loss_3": 4.187878561019898, + "ce_loss_6": 3.8809223055839537, + "epoch": 0.537, + "grad_norm": 940.0, + "kl_loss_12": 1087.690396118164, + "kl_loss_17": 295.78431701660156, + "kl_loss_3": 2695.1560668945312, + "kl_loss_6": 2088.261999511719, + "learning_rate": 0.00044931399991859835, + "loss": 1535.7311, + "step": 5370 + }, + { + "ce_loss_12": 3.263565754890442, + "ce_loss_17": 2.911922574043274, + "ce_loss_23": 2.7829471707344053, + "ce_loss_3": 4.075548827648163, + "ce_loss_6": 3.754881238937378, + "epoch": 0.538, + "grad_norm": 1096.0, + "kl_loss_12": 1100.927456665039, + "kl_loss_17": 295.7045013427734, + "kl_loss_3": 2745.293347167969, + "kl_loss_6": 2104.372979736328, + "learning_rate": 0.00044773576836617336, + "loss": 1537.6268, + "step": 5380 + }, + { + "ce_loss_12": 3.3768471837043763, + "ce_loss_17": 3.005662715435028, + "ce_loss_23": 2.875761556625366, + "ce_loss_3": 4.179002559185028, + "ce_loss_6": 3.868990218639374, + "epoch": 0.539, + "grad_norm": 1080.0, + "kl_loss_12": 1134.3475219726563, + "kl_loss_17": 303.7232040405273, + "kl_loss_3": 2747.5085571289064, + "kl_loss_6": 2136.5920349121093, + "learning_rate": 0.00044615806311398056, + "loss": 1599.1203, + "step": 5390 + }, + { + "ce_loss_12": 3.396114432811737, + "ce_loss_17": 3.074923348426819, + "ce_loss_23": 2.9501069307327272, + "ce_loss_3": 4.153258454799652, + "ce_loss_6": 3.8466654300689695, + "epoch": 0.54, + "grad_norm": 1016.0, + "kl_loss_12": 1052.32314453125, + "kl_loss_17": 289.0661148071289, + "kl_loss_3": 2593.190881347656, + "kl_loss_6": 1993.3581787109374, + "learning_rate": 0.00044458090004949454, + "loss": 1539.339, + "step": 5400 + }, + { + "ce_loss_12": 3.3335826635360717, + "ce_loss_17": 2.951065015792847, + "ce_loss_23": 2.8089526534080504, + "ce_loss_3": 4.174290323257447, + "ce_loss_6": 3.8532066226005552, + "epoch": 0.541, + "grad_norm": 964.0, + "kl_loss_12": 1162.954409790039, + "kl_loss_17": 317.13980560302736, + "kl_loss_3": 2863.15234375, + "kl_loss_6": 2221.428271484375, + "learning_rate": 0.0004430042950547297, + "loss": 1576.8596, + "step": 5410 + }, + { + "ce_loss_12": 3.396577227115631, + "ce_loss_17": 3.033453941345215, + "ce_loss_23": 2.8963638305664063, + "ce_loss_3": 4.211390352249145, + "ce_loss_6": 3.8926366686820986, + "epoch": 0.542, + "grad_norm": 1104.0, + "kl_loss_12": 1130.787789916992, + "kl_loss_17": 312.94815216064455, + "kl_loss_3": 2775.302355957031, + "kl_loss_6": 2144.0336853027343, + "learning_rate": 0.0004414282640060809, + "loss": 1563.72, + "step": 5420 + }, + { + "ce_loss_12": 3.461113154888153, + "ce_loss_17": 3.1103001713752745, + "ce_loss_23": 2.978768992424011, + "ce_loss_3": 4.233858203887939, + "ce_loss_6": 3.9294100046157836, + "epoch": 0.543, + "grad_norm": 1224.0, + "kl_loss_12": 1085.9322082519532, + "kl_loss_17": 298.91674041748047, + "kl_loss_3": 2630.5422973632812, + "kl_loss_6": 2034.4115905761719, + "learning_rate": 0.0004398528227741633, + "loss": 1532.4951, + "step": 5430 + }, + { + "ce_loss_12": 3.3482273697853087, + "ce_loss_17": 2.990015757083893, + "ce_loss_23": 2.857475447654724, + "ce_loss_3": 4.1610354542732235, + "ce_loss_6": 3.8469629645347596, + "epoch": 0.544, + "grad_norm": 1192.0, + "kl_loss_12": 1108.1345916748046, + "kl_loss_17": 309.11339111328124, + "kl_loss_3": 2719.430615234375, + "kl_loss_6": 2098.4219177246096, + "learning_rate": 0.00043827798722365264, + "loss": 1571.2654, + "step": 5440 + }, + { + "ce_loss_12": 3.436587190628052, + "ce_loss_17": 3.0982702732086183, + "ce_loss_23": 2.97260959148407, + "ce_loss_3": 4.212341475486755, + "ce_loss_6": 3.909819185733795, + "epoch": 0.545, + "grad_norm": 1120.0, + "kl_loss_12": 1080.8670532226563, + "kl_loss_17": 298.4113143920898, + "kl_loss_3": 2648.5758544921873, + "kl_loss_6": 2044.9892578125, + "learning_rate": 0.00043670377321312535, + "loss": 1519.2725, + "step": 5450 + }, + { + "ce_loss_12": 3.437877380847931, + "ce_loss_17": 3.1039116382598877, + "ce_loss_23": 2.9818585395812987, + "ce_loss_3": 4.211915624141693, + "ce_loss_6": 3.904364216327667, + "epoch": 0.546, + "grad_norm": 1072.0, + "kl_loss_12": 1069.5538299560546, + "kl_loss_17": 291.2031845092773, + "kl_loss_3": 2638.0355224609375, + "kl_loss_6": 2021.8220092773438, + "learning_rate": 0.0004351301965948991, + "loss": 1536.6615, + "step": 5460 + }, + { + "ce_loss_12": 3.3516125679016113, + "ce_loss_17": 3.0189313173294066, + "ce_loss_23": 2.8931252121925355, + "ce_loss_3": 4.143655979633332, + "ce_loss_6": 3.828027617931366, + "epoch": 0.547, + "grad_norm": 1272.0, + "kl_loss_12": 1059.2278198242188, + "kl_loss_17": 289.7262649536133, + "kl_loss_3": 2646.740625, + "kl_loss_6": 2029.8422607421876, + "learning_rate": 0.000433557273214873, + "loss": 1524.926, + "step": 5470 + }, + { + "ce_loss_12": 3.354721283912659, + "ce_loss_17": 3.0130093812942507, + "ce_loss_23": 2.8745198965072634, + "ce_loss_3": 4.152052211761474, + "ce_loss_6": 3.8364554405212403, + "epoch": 0.548, + "grad_norm": 964.0, + "kl_loss_12": 1084.157943725586, + "kl_loss_17": 300.56928100585935, + "kl_loss_3": 2678.8192504882813, + "kl_loss_6": 2063.3562622070312, + "learning_rate": 0.000431985018912368, + "loss": 1519.3572, + "step": 5480 + }, + { + "ce_loss_12": 3.3481543898582458, + "ce_loss_17": 2.9807775378227235, + "ce_loss_23": 2.849691128730774, + "ce_loss_3": 4.166516637802124, + "ce_loss_6": 3.854253661632538, + "epoch": 0.549, + "grad_norm": 1352.0, + "kl_loss_12": 1111.0652160644531, + "kl_loss_17": 300.24488677978513, + "kl_loss_3": 2764.147900390625, + "kl_loss_6": 2148.22568359375, + "learning_rate": 0.0004304134495199674, + "loss": 1530.8162, + "step": 5490 + }, + { + "ce_loss_12": 3.375580167770386, + "ce_loss_17": 3.0166457295417786, + "ce_loss_23": 2.8813437461853026, + "ce_loss_3": 4.167650175094605, + "ce_loss_6": 3.8590697288513183, + "epoch": 0.55, + "grad_norm": 972.0, + "kl_loss_12": 1129.8497314453125, + "kl_loss_17": 304.87682189941404, + "kl_loss_3": 2749.120849609375, + "kl_loss_6": 2130.23251953125, + "learning_rate": 0.0004288425808633575, + "loss": 1552.1418, + "step": 5500 + }, + { + "ce_loss_12": 3.338499128818512, + "ce_loss_17": 2.991779828071594, + "ce_loss_23": 2.8615185499191282, + "ce_loss_3": 4.142512774467468, + "ce_loss_6": 3.8318737506866456, + "epoch": 0.551, + "grad_norm": 1128.0, + "kl_loss_12": 1092.0799499511718, + "kl_loss_17": 295.32262191772463, + "kl_loss_3": 2709.7896728515625, + "kl_loss_6": 2087.308660888672, + "learning_rate": 0.0004272724287611684, + "loss": 1550.8097, + "step": 5510 + }, + { + "ce_loss_12": 3.3272794246673585, + "ce_loss_17": 2.9705721020698546, + "ce_loss_23": 2.840547788143158, + "ce_loss_3": 4.144940996170044, + "ce_loss_6": 3.8349496006965635, + "epoch": 0.552, + "grad_norm": 1024.0, + "kl_loss_12": 1108.8988861083985, + "kl_loss_17": 299.7927978515625, + "kl_loss_3": 2759.7510986328125, + "kl_loss_6": 2144.3437255859376, + "learning_rate": 0.00042570300902481425, + "loss": 1561.2407, + "step": 5520 + }, + { + "ce_loss_12": 3.3361517786979675, + "ce_loss_17": 2.996741271018982, + "ce_loss_23": 2.869708800315857, + "ce_loss_3": 4.126246166229248, + "ce_loss_6": 3.8227033257484435, + "epoch": 0.553, + "grad_norm": 1168.0, + "kl_loss_12": 1082.9974700927735, + "kl_loss_17": 294.6476860046387, + "kl_loss_3": 2682.4309326171874, + "kl_loss_6": 2074.345251464844, + "learning_rate": 0.00042413433745833423, + "loss": 1533.8751, + "step": 5530 + }, + { + "ce_loss_12": 3.3447516441345213, + "ce_loss_17": 2.9871742129325867, + "ce_loss_23": 2.8585495829582213, + "ce_loss_3": 4.156840574741364, + "ce_loss_6": 3.842957389354706, + "epoch": 0.554, + "grad_norm": 964.0, + "kl_loss_12": 1104.5059844970704, + "kl_loss_17": 297.8919593811035, + "kl_loss_3": 2739.675, + "kl_loss_6": 2113.0805847167967, + "learning_rate": 0.0004225664298582339, + "loss": 1513.3228, + "step": 5540 + }, + { + "ce_loss_12": 3.413621115684509, + "ce_loss_17": 3.0707615494728087, + "ce_loss_23": 2.9419044971466066, + "ce_loss_3": 4.194462668895722, + "ce_loss_6": 3.888316106796265, + "epoch": 0.555, + "grad_norm": 1184.0, + "kl_loss_12": 1069.3179565429687, + "kl_loss_17": 291.9802551269531, + "kl_loss_3": 2635.356201171875, + "kl_loss_6": 2029.8074829101563, + "learning_rate": 0.000420999302013325, + "loss": 1507.5443, + "step": 5550 + }, + { + "ce_loss_12": 3.346064102649689, + "ce_loss_17": 2.9775984048843385, + "ce_loss_23": 2.8422580122947694, + "ce_loss_3": 4.203102040290832, + "ce_loss_6": 3.8697832822799683, + "epoch": 0.556, + "grad_norm": 1136.0, + "kl_loss_12": 1128.6266021728516, + "kl_loss_17": 312.0568115234375, + "kl_loss_3": 2840.6982421875, + "kl_loss_6": 2181.8872680664062, + "learning_rate": 0.000419432969704568, + "loss": 1557.96, + "step": 5560 + }, + { + "ce_loss_12": 3.357046055793762, + "ce_loss_17": 3.008342170715332, + "ce_loss_23": 2.8809687376022337, + "ce_loss_3": 4.14723539352417, + "ce_loss_6": 3.8425039291381835, + "epoch": 0.557, + "grad_norm": 1024.0, + "kl_loss_12": 1081.215493774414, + "kl_loss_17": 295.9071792602539, + "kl_loss_3": 2669.309045410156, + "kl_loss_6": 2059.5223022460937, + "learning_rate": 0.00041786744870491154, + "loss": 1568.9957, + "step": 5570 + }, + { + "ce_loss_12": 3.3172632813453675, + "ce_loss_17": 2.955425262451172, + "ce_loss_23": 2.821260964870453, + "ce_loss_3": 4.119695484638214, + "ce_loss_6": 3.806350862979889, + "epoch": 0.558, + "grad_norm": 828.0, + "kl_loss_12": 1118.678955078125, + "kl_loss_17": 302.5011444091797, + "kl_loss_3": 2731.080847167969, + "kl_loss_6": 2115.9034118652344, + "learning_rate": 0.0004163027547791347, + "loss": 1544.0346, + "step": 5580 + }, + { + "ce_loss_12": 3.309348404407501, + "ce_loss_17": 2.947772943973541, + "ce_loss_23": 2.814731788635254, + "ce_loss_3": 4.156531059741974, + "ce_loss_6": 3.8248415350914002, + "epoch": 0.559, + "grad_norm": 1240.0, + "kl_loss_12": 1114.9301391601562, + "kl_loss_17": 301.8474456787109, + "kl_loss_3": 2811.576452636719, + "kl_loss_6": 2152.8734436035156, + "learning_rate": 0.0004147389036836881, + "loss": 1565.2141, + "step": 5590 + }, + { + "ce_loss_12": 3.3516384840011595, + "ce_loss_17": 2.987208640575409, + "ce_loss_23": 2.8559449911117554, + "ce_loss_3": 4.1557383179664615, + "ce_loss_6": 3.838567817211151, + "epoch": 0.56, + "grad_norm": 1152.0, + "kl_loss_12": 1114.13662109375, + "kl_loss_17": 300.4609680175781, + "kl_loss_3": 2724.2331665039064, + "kl_loss_6": 2099.025762939453, + "learning_rate": 0.00041317591116653486, + "loss": 1581.426, + "step": 5600 + }, + { + "ce_loss_12": 3.375890648365021, + "ce_loss_17": 3.016210114955902, + "ce_loss_23": 2.8871567368507387, + "ce_loss_3": 4.192247068881988, + "ce_loss_6": 3.8769724011421203, + "epoch": 0.561, + "grad_norm": 868.0, + "kl_loss_12": 1118.3572082519531, + "kl_loss_17": 306.61693572998047, + "kl_loss_3": 2753.4271606445313, + "kl_loss_6": 2124.9645141601563, + "learning_rate": 0.0004116137929669921, + "loss": 1550.451, + "step": 5610 + }, + { + "ce_loss_12": 3.370885908603668, + "ce_loss_17": 3.0194214582443237, + "ce_loss_23": 2.886621868610382, + "ce_loss_3": 4.162791633605957, + "ce_loss_6": 3.8525096774101257, + "epoch": 0.562, + "grad_norm": 984.0, + "kl_loss_12": 1096.9350799560548, + "kl_loss_17": 294.7433792114258, + "kl_loss_3": 2698.144323730469, + "kl_loss_6": 2083.495526123047, + "learning_rate": 0.00041005256481557305, + "loss": 1525.6439, + "step": 5620 + }, + { + "ce_loss_12": 3.4280372262001038, + "ce_loss_17": 3.095071315765381, + "ce_loss_23": 2.9774329543113707, + "ce_loss_3": 4.1962571144104, + "ce_loss_6": 3.8930899143218993, + "epoch": 0.563, + "grad_norm": 976.0, + "kl_loss_12": 1045.3597290039063, + "kl_loss_17": 284.91689529418943, + "kl_loss_3": 2594.1365478515627, + "kl_loss_6": 1992.5528503417968, + "learning_rate": 0.00040849224243382767, + "loss": 1503.5002, + "step": 5630 + }, + { + "ce_loss_12": 3.322995662689209, + "ce_loss_17": 2.9679643034935, + "ce_loss_23": 2.8381430983543394, + "ce_loss_3": 4.143428933620453, + "ce_loss_6": 3.8188948750495912, + "epoch": 0.564, + "grad_norm": 1144.0, + "kl_loss_12": 1107.2206665039062, + "kl_loss_17": 296.03601531982423, + "kl_loss_3": 2743.397814941406, + "kl_loss_6": 2105.9764099121094, + "learning_rate": 0.000406932841534185, + "loss": 1529.3828, + "step": 5640 + }, + { + "ce_loss_12": 3.2935484528541563, + "ce_loss_17": 2.938543951511383, + "ce_loss_23": 2.8072848439216616, + "ce_loss_3": 4.1067038655281065, + "ce_loss_6": 3.789441239833832, + "epoch": 0.565, + "grad_norm": 1280.0, + "kl_loss_12": 1104.0531219482423, + "kl_loss_17": 301.8814224243164, + "kl_loss_3": 2737.7357543945313, + "kl_loss_6": 2116.7823181152344, + "learning_rate": 0.0004053743778197951, + "loss": 1591.8746, + "step": 5650 + }, + { + "ce_loss_12": 3.391418957710266, + "ce_loss_17": 3.033774769306183, + "ce_loss_23": 2.9061689734458924, + "ce_loss_3": 4.182014131546021, + "ce_loss_6": 3.885739576816559, + "epoch": 0.566, + "grad_norm": 984.0, + "kl_loss_12": 1107.833056640625, + "kl_loss_17": 303.8883712768555, + "kl_loss_3": 2684.518017578125, + "kl_loss_6": 2084.1996826171876, + "learning_rate": 0.0004038168669843697, + "loss": 1568.2076, + "step": 5660 + }, + { + "ce_loss_12": 3.3283536434173584, + "ce_loss_17": 2.98902086019516, + "ce_loss_23": 2.8606817722320557, + "ce_loss_3": 4.110491549968719, + "ce_loss_6": 3.8071823596954344, + "epoch": 0.567, + "grad_norm": 1144.0, + "kl_loss_12": 1078.9445007324218, + "kl_loss_17": 296.4503967285156, + "kl_loss_3": 2661.1900146484377, + "kl_loss_6": 2052.0313110351562, + "learning_rate": 0.000402260324712026, + "loss": 1558.9467, + "step": 5670 + }, + { + "ce_loss_12": 3.3894649267196657, + "ce_loss_17": 3.0279223918914795, + "ce_loss_23": 2.903757655620575, + "ce_loss_3": 4.205927729606628, + "ce_loss_6": 3.8925053119659423, + "epoch": 0.568, + "grad_norm": 1080.0, + "kl_loss_12": 1102.922415161133, + "kl_loss_17": 291.1776184082031, + "kl_loss_3": 2740.9990234375, + "kl_loss_6": 2123.743518066406, + "learning_rate": 0.00040070476667712743, + "loss": 1536.7454, + "step": 5680 + }, + { + "ce_loss_12": 3.401654672622681, + "ce_loss_17": 3.0565669178962707, + "ce_loss_23": 2.927121162414551, + "ce_loss_3": 4.19632499217987, + "ce_loss_6": 3.8915048956871034, + "epoch": 0.569, + "grad_norm": 988.0, + "kl_loss_12": 1081.9984588623047, + "kl_loss_17": 295.2158767700195, + "kl_loss_3": 2684.343957519531, + "kl_loss_6": 2082.0970153808594, + "learning_rate": 0.0003991502085441259, + "loss": 1547.1768, + "step": 5690 + }, + { + "ce_loss_12": 3.4185086369514464, + "ce_loss_17": 3.090511643886566, + "ce_loss_23": 2.966333067417145, + "ce_loss_3": 4.192823874950409, + "ce_loss_6": 3.8864670515060427, + "epoch": 0.57, + "grad_norm": 1704.0, + "kl_loss_12": 1051.5930419921874, + "kl_loss_17": 288.72295532226565, + "kl_loss_3": 2605.3867065429686, + "kl_loss_6": 2003.20048828125, + "learning_rate": 0.0003975966659674047, + "loss": 1528.4219, + "step": 5700 + }, + { + "ce_loss_12": 3.411479675769806, + "ce_loss_17": 3.0676485657691956, + "ce_loss_23": 2.9371942400932314, + "ce_loss_3": 4.217548334598542, + "ce_loss_6": 3.8981226801872255, + "epoch": 0.571, + "grad_norm": 1040.0, + "kl_loss_12": 1084.3829620361328, + "kl_loss_17": 297.86733169555663, + "kl_loss_3": 2690.292272949219, + "kl_loss_6": 2069.090856933594, + "learning_rate": 0.0003960441545911204, + "loss": 1523.9072, + "step": 5710 + }, + { + "ce_loss_12": 3.3867371439933778, + "ce_loss_17": 3.0464473128318788, + "ce_loss_23": 2.924866199493408, + "ce_loss_3": 4.1862914800643924, + "ce_loss_6": 3.866020941734314, + "epoch": 0.572, + "grad_norm": 1008.0, + "kl_loss_12": 1086.786917114258, + "kl_loss_17": 293.1391548156738, + "kl_loss_3": 2695.4630493164063, + "kl_loss_6": 2060.2872924804688, + "learning_rate": 0.0003944926900490452, + "loss": 1528.1995, + "step": 5720 + }, + { + "ce_loss_12": 3.3431777358055115, + "ce_loss_17": 2.981088709831238, + "ce_loss_23": 2.847019040584564, + "ce_loss_3": 4.164076113700867, + "ce_loss_6": 3.845302677154541, + "epoch": 0.573, + "grad_norm": 1120.0, + "kl_loss_12": 1116.8780670166016, + "kl_loss_17": 300.9324645996094, + "kl_loss_3": 2760.368933105469, + "kl_loss_6": 2132.633331298828, + "learning_rate": 0.0003929422879644099, + "loss": 1547.4666, + "step": 5730 + }, + { + "ce_loss_12": 3.316387987136841, + "ce_loss_17": 2.9817095041275024, + "ce_loss_23": 2.8599035263061525, + "ce_loss_3": 4.11018670797348, + "ce_loss_6": 3.802216875553131, + "epoch": 0.574, + "grad_norm": 952.0, + "kl_loss_12": 1063.2005157470703, + "kl_loss_17": 289.11716537475587, + "kl_loss_3": 2670.6744995117188, + "kl_loss_6": 2049.6060180664062, + "learning_rate": 0.0003913929639497462, + "loss": 1492.7935, + "step": 5740 + }, + { + "ce_loss_12": 3.2893279433250426, + "ce_loss_17": 2.9399860501289368, + "ce_loss_23": 2.8129002928733824, + "ce_loss_3": 4.120655429363251, + "ce_loss_6": 3.805050051212311, + "epoch": 0.575, + "grad_norm": 1144.0, + "kl_loss_12": 1088.0894744873046, + "kl_loss_17": 292.67746505737307, + "kl_loss_3": 2748.174462890625, + "kl_loss_6": 2129.8866577148438, + "learning_rate": 0.00038984473360672965, + "loss": 1527.1544, + "step": 5750 + }, + { + "ce_loss_12": 3.3022054076194762, + "ce_loss_17": 2.9506754994392397, + "ce_loss_23": 2.822495758533478, + "ce_loss_3": 4.120715641975403, + "ce_loss_6": 3.8015459895133974, + "epoch": 0.576, + "grad_norm": 1072.0, + "kl_loss_12": 1087.9370819091796, + "kl_loss_17": 292.52199630737306, + "kl_loss_3": 2732.5151489257814, + "kl_loss_6": 2102.5725280761717, + "learning_rate": 0.0003882976125260229, + "loss": 1520.1432, + "step": 5760 + }, + { + "ce_loss_12": 3.3552067637443543, + "ce_loss_17": 3.0094610452651978, + "ce_loss_23": 2.87836571931839, + "ce_loss_3": 4.162405633926392, + "ce_loss_6": 3.8583031415939333, + "epoch": 0.577, + "grad_norm": 1168.0, + "kl_loss_12": 1082.5192932128907, + "kl_loss_17": 295.40897369384766, + "kl_loss_3": 2708.016064453125, + "kl_loss_6": 2094.8595703125, + "learning_rate": 0.00038675161628711776, + "loss": 1544.6213, + "step": 5770 + }, + { + "ce_loss_12": 3.385429286956787, + "ce_loss_17": 3.0453956604003904, + "ce_loss_23": 2.9156638741493226, + "ce_loss_3": 4.1767893671989444, + "ce_loss_6": 3.860312449932098, + "epoch": 0.578, + "grad_norm": 1088.0, + "kl_loss_12": 1070.3292022705077, + "kl_loss_17": 294.337052154541, + "kl_loss_3": 2652.4936645507814, + "kl_loss_6": 2038.156591796875, + "learning_rate": 0.0003852067604581794, + "loss": 1563.9108, + "step": 5780 + }, + { + "ce_loss_12": 3.3374809861183166, + "ce_loss_17": 2.9879353404045106, + "ce_loss_23": 2.8669142842292787, + "ce_loss_3": 4.147375965118409, + "ce_loss_6": 3.835199749469757, + "epoch": 0.579, + "grad_norm": 1144.0, + "kl_loss_12": 1090.973321533203, + "kl_loss_17": 289.15511627197264, + "kl_loss_3": 2724.677099609375, + "kl_loss_6": 2117.0361877441405, + "learning_rate": 0.0003836630605958888, + "loss": 1532.849, + "step": 5790 + }, + { + "ce_loss_12": 3.3952765345573424, + "ce_loss_17": 3.048290753364563, + "ce_loss_23": 2.9225462913513183, + "ce_loss_3": 4.181715559959412, + "ce_loss_6": 3.877750539779663, + "epoch": 0.58, + "grad_norm": 1376.0, + "kl_loss_12": 1092.2751861572265, + "kl_loss_17": 293.92593612670896, + "kl_loss_3": 2685.8295288085938, + "kl_loss_6": 2082.6134765625, + "learning_rate": 0.0003821205322452863, + "loss": 1585.8538, + "step": 5800 + }, + { + "ce_loss_12": 3.372221601009369, + "ce_loss_17": 3.0282418727874756, + "ce_loss_23": 2.9078637599945067, + "ce_loss_3": 4.161537182331085, + "ce_loss_6": 3.854393744468689, + "epoch": 0.581, + "grad_norm": 1152.0, + "kl_loss_12": 1074.3501007080079, + "kl_loss_17": 288.7527191162109, + "kl_loss_3": 2669.658166503906, + "kl_loss_6": 2063.708355712891, + "learning_rate": 0.0003805791909396155, + "loss": 1528.6066, + "step": 5810 + }, + { + "ce_loss_12": 3.33404426574707, + "ce_loss_17": 2.97887327671051, + "ce_loss_23": 2.8551398634910585, + "ce_loss_3": 4.130940783023834, + "ce_loss_6": 3.823299324512482, + "epoch": 0.582, + "grad_norm": 1208.0, + "kl_loss_12": 1079.6981231689454, + "kl_loss_17": 288.92149887084963, + "kl_loss_3": 2689.42265625, + "kl_loss_6": 2081.0817321777345, + "learning_rate": 0.0003790390522001662, + "loss": 1541.9953, + "step": 5820 + }, + { + "ce_loss_12": 3.2722179651260377, + "ce_loss_17": 2.9275118470191956, + "ce_loss_23": 2.8062780618667604, + "ce_loss_3": 4.0827684044837955, + "ce_loss_6": 3.774982988834381, + "epoch": 0.583, + "grad_norm": 908.0, + "kl_loss_12": 1081.150732421875, + "kl_loss_17": 287.35875778198243, + "kl_loss_3": 2712.231884765625, + "kl_loss_6": 2095.866046142578, + "learning_rate": 0.0003775001315361183, + "loss": 1519.208, + "step": 5830 + }, + { + "ce_loss_12": 3.374925982952118, + "ce_loss_17": 3.0226351737976076, + "ce_loss_23": 2.891891610622406, + "ce_loss_3": 4.189905488491059, + "ce_loss_6": 3.868648278713226, + "epoch": 0.584, + "grad_norm": 1264.0, + "kl_loss_12": 1097.4136840820313, + "kl_loss_17": 297.0824569702148, + "kl_loss_3": 2729.2060546875, + "kl_loss_6": 2104.550671386719, + "learning_rate": 0.0003759624444443858, + "loss": 1549.6725, + "step": 5840 + }, + { + "ce_loss_12": 3.3894322991371153, + "ce_loss_17": 3.0529601216316222, + "ce_loss_23": 2.9311428904533385, + "ce_loss_3": 4.178544974327087, + "ce_loss_6": 3.8757920384407045, + "epoch": 0.585, + "grad_norm": 1264.0, + "kl_loss_12": 1068.644955444336, + "kl_loss_17": 287.98411254882814, + "kl_loss_3": 2671.1638671875, + "kl_loss_6": 2065.733996582031, + "learning_rate": 0.00037442600640946044, + "loss": 1513.0081, + "step": 5850 + }, + { + "ce_loss_12": 3.3614978075027464, + "ce_loss_17": 3.022676134109497, + "ce_loss_23": 2.89876549243927, + "ce_loss_3": 4.150962948799133, + "ce_loss_6": 3.8397781610488892, + "epoch": 0.586, + "grad_norm": 1096.0, + "kl_loss_12": 1074.1620147705078, + "kl_loss_17": 291.1872756958008, + "kl_loss_3": 2674.4105712890623, + "kl_loss_6": 2057.2900573730467, + "learning_rate": 0.00037289083290325663, + "loss": 1500.6721, + "step": 5860 + }, + { + "ce_loss_12": 3.3385061502456663, + "ce_loss_17": 2.9948259949684144, + "ce_loss_23": 2.867872619628906, + "ce_loss_3": 4.128155767917633, + "ce_loss_6": 3.8193222999572756, + "epoch": 0.587, + "grad_norm": 1136.0, + "kl_loss_12": 1057.0669647216796, + "kl_loss_17": 289.5401077270508, + "kl_loss_3": 2640.21787109375, + "kl_loss_6": 2028.2294311523438, + "learning_rate": 0.0003713569393849543, + "loss": 1510.5493, + "step": 5870 + }, + { + "ce_loss_12": 3.3919139862060548, + "ce_loss_17": 3.051191568374634, + "ce_loss_23": 2.9242817759513855, + "ce_loss_3": 4.1811758518219, + "ce_loss_6": 3.875462770462036, + "epoch": 0.588, + "grad_norm": 1128.0, + "kl_loss_12": 1079.9062591552733, + "kl_loss_17": 290.8986526489258, + "kl_loss_3": 2669.1223266601564, + "kl_loss_6": 2065.6935485839845, + "learning_rate": 0.00036982434130084397, + "loss": 1528.3336, + "step": 5880 + }, + { + "ce_loss_12": 3.319986581802368, + "ce_loss_17": 2.9722229123115538, + "ce_loss_23": 2.8413925766944885, + "ce_loss_3": 4.113067412376404, + "ce_loss_6": 3.7913942337036133, + "epoch": 0.589, + "grad_norm": 1008.0, + "kl_loss_12": 1089.219302368164, + "kl_loss_17": 301.8997085571289, + "kl_loss_3": 2681.680615234375, + "kl_loss_6": 2054.2447814941406, + "learning_rate": 0.00036829305408417166, + "loss": 1546.4316, + "step": 5890 + }, + { + "ce_loss_12": 3.3140172004699706, + "ce_loss_17": 2.9589925050735473, + "ce_loss_23": 2.8260411262512206, + "ce_loss_3": 4.134636390209198, + "ce_loss_6": 3.8160529017448424, + "epoch": 0.59, + "grad_norm": 964.0, + "kl_loss_12": 1104.7077758789062, + "kl_loss_17": 300.90020446777345, + "kl_loss_3": 2747.9242553710938, + "kl_loss_6": 2121.038653564453, + "learning_rate": 0.0003667630931549826, + "loss": 1544.2424, + "step": 5900 + }, + { + "ce_loss_12": 3.2931908488273622, + "ce_loss_17": 2.9319966673851012, + "ce_loss_23": 2.8010067105293275, + "ce_loss_3": 4.142383825778961, + "ce_loss_6": 3.8219820141792296, + "epoch": 0.591, + "grad_norm": 1536.0, + "kl_loss_12": 1120.0817626953126, + "kl_loss_17": 296.99367065429686, + "kl_loss_3": 2830.34755859375, + "kl_loss_6": 2194.107342529297, + "learning_rate": 0.00036523447391996613, + "loss": 1573.1694, + "step": 5910 + }, + { + "ce_loss_12": 3.3541221261024474, + "ce_loss_17": 3.0086112022399902, + "ce_loss_23": 2.8847101092338563, + "ce_loss_3": 4.145113706588745, + "ce_loss_6": 3.8366657137870788, + "epoch": 0.592, + "grad_norm": 1160.0, + "kl_loss_12": 1060.8351257324218, + "kl_loss_17": 284.69307098388674, + "kl_loss_3": 2644.6882934570312, + "kl_loss_6": 2042.1166931152343, + "learning_rate": 0.00036370721177230114, + "loss": 1507.935, + "step": 5920 + }, + { + "ce_loss_12": 3.3653964400291443, + "ce_loss_17": 3.015224003791809, + "ce_loss_23": 2.8863983511924745, + "ce_loss_3": 4.187268912792206, + "ce_loss_6": 3.864014434814453, + "epoch": 0.593, + "grad_norm": 1064.0, + "kl_loss_12": 1098.9212554931642, + "kl_loss_17": 301.37811279296875, + "kl_loss_3": 2741.749865722656, + "kl_loss_6": 2103.893975830078, + "learning_rate": 0.00036218132209150044, + "loss": 1543.1186, + "step": 5930 + }, + { + "ce_loss_12": 3.3391342997550963, + "ce_loss_17": 2.9664016723632813, + "ce_loss_23": 2.832013189792633, + "ce_loss_3": 4.177560043334961, + "ce_loss_6": 3.8474175453186037, + "epoch": 0.594, + "grad_norm": 944.0, + "kl_loss_12": 1139.9402313232422, + "kl_loss_17": 309.586328125, + "kl_loss_3": 2825.787255859375, + "kl_loss_6": 2181.953405761719, + "learning_rate": 0.0003606568202432562, + "loss": 1574.9494, + "step": 5940 + }, + { + "ce_loss_12": 3.393070602416992, + "ce_loss_17": 3.0422707915306093, + "ce_loss_23": 2.9140780329704286, + "ce_loss_3": 4.2164586067199705, + "ce_loss_6": 3.901226615905762, + "epoch": 0.595, + "grad_norm": 1200.0, + "kl_loss_12": 1117.1515686035157, + "kl_loss_17": 299.17299728393556, + "kl_loss_3": 2776.9905517578127, + "kl_loss_6": 2153.523962402344, + "learning_rate": 0.0003591337215792851, + "loss": 1532.3694, + "step": 5950 + }, + { + "ce_loss_12": 3.402258062362671, + "ce_loss_17": 3.064120590686798, + "ce_loss_23": 2.9467490553855895, + "ce_loss_3": 4.156550872325897, + "ce_loss_6": 3.868350923061371, + "epoch": 0.596, + "grad_norm": 964.0, + "kl_loss_12": 1057.8695495605468, + "kl_loss_17": 279.5248458862305, + "kl_loss_3": 2594.1113037109376, + "kl_loss_6": 2025.8124206542968, + "learning_rate": 0.00035761204143717383, + "loss": 1517.4082, + "step": 5960 + }, + { + "ce_loss_12": 3.369770860671997, + "ce_loss_17": 3.0239328026771544, + "ce_loss_23": 2.8976298213005065, + "ce_loss_3": 4.161876916885376, + "ce_loss_6": 3.8617225527763366, + "epoch": 0.597, + "grad_norm": 1064.0, + "kl_loss_12": 1086.6250274658203, + "kl_loss_17": 296.2882781982422, + "kl_loss_3": 2684.7556884765627, + "kl_loss_6": 2089.285284423828, + "learning_rate": 0.0003560917951402245, + "loss": 1572.7229, + "step": 5970 + }, + { + "ce_loss_12": 3.3533798694610595, + "ce_loss_17": 3.0037330985069275, + "ce_loss_23": 2.8826130986213685, + "ce_loss_3": 4.144544363021851, + "ce_loss_6": 3.841200351715088, + "epoch": 0.598, + "grad_norm": 1304.0, + "kl_loss_12": 1080.194351196289, + "kl_loss_17": 288.0468276977539, + "kl_loss_3": 2674.7138916015624, + "kl_loss_6": 2076.2087158203126, + "learning_rate": 0.00035457299799730046, + "loss": 1528.053, + "step": 5980 + }, + { + "ce_loss_12": 3.4121166348457335, + "ce_loss_17": 3.0635475873947144, + "ce_loss_23": 2.939061403274536, + "ce_loss_3": 4.191158545017243, + "ce_loss_6": 3.8858134865760805, + "epoch": 0.599, + "grad_norm": 1056.0, + "kl_loss_12": 1076.3190643310547, + "kl_loss_17": 291.5554595947266, + "kl_loss_3": 2657.576770019531, + "kl_loss_6": 2046.1385803222656, + "learning_rate": 0.0003530556653026721, + "loss": 1528.4061, + "step": 5990 + }, + { + "ce_loss_12": 3.338744008541107, + "ce_loss_17": 2.991839051246643, + "ce_loss_23": 2.8644071221351624, + "ce_loss_3": 4.148921036720276, + "ce_loss_6": 3.8365323424339293, + "epoch": 0.6, + "grad_norm": 1528.0, + "kl_loss_12": 1065.605661010742, + "kl_loss_17": 288.03568267822266, + "kl_loss_3": 2682.4912841796877, + "kl_loss_6": 2081.639971923828, + "learning_rate": 0.00035153981233586274, + "loss": 1541.0759, + "step": 6000 + }, + { + "ce_loss_12": 3.308136820793152, + "ce_loss_17": 2.9552505135536196, + "ce_loss_23": 2.83356112241745, + "ce_loss_3": 4.110099685192108, + "ce_loss_6": 3.799723744392395, + "epoch": 0.601, + "grad_norm": 1104.0, + "kl_loss_12": 1082.6001922607422, + "kl_loss_17": 285.3096450805664, + "kl_loss_3": 2685.622314453125, + "kl_loss_6": 2073.3991088867188, + "learning_rate": 0.00035002545436149473, + "loss": 1580.2212, + "step": 6010 + }, + { + "ce_loss_12": 3.3342719674110413, + "ce_loss_17": 2.977822244167328, + "ce_loss_23": 2.8470508456230164, + "ce_loss_3": 4.141815936565399, + "ce_loss_6": 3.836508405208588, + "epoch": 0.602, + "grad_norm": 1056.0, + "kl_loss_12": 1109.7272827148438, + "kl_loss_17": 302.80249938964846, + "kl_loss_3": 2743.32158203125, + "kl_loss_6": 2125.801647949219, + "learning_rate": 0.0003485126066291364, + "loss": 1532.795, + "step": 6020 + }, + { + "ce_loss_12": 3.3519185185432434, + "ce_loss_17": 3.004382920265198, + "ce_loss_23": 2.879039800167084, + "ce_loss_3": 4.151576399803162, + "ce_loss_6": 3.853084945678711, + "epoch": 0.603, + "grad_norm": 928.0, + "kl_loss_12": 1079.1995330810546, + "kl_loss_17": 286.18796691894534, + "kl_loss_3": 2690.309191894531, + "kl_loss_6": 2088.8179321289062, + "learning_rate": 0.0003470012843731476, + "loss": 1541.0395, + "step": 6030 + }, + { + "ce_loss_12": 3.309786891937256, + "ce_loss_17": 2.958170974254608, + "ce_loss_23": 2.832332801818848, + "ce_loss_3": 4.122778058052063, + "ce_loss_6": 3.8115728855133058, + "epoch": 0.604, + "grad_norm": 976.0, + "kl_loss_12": 1085.630792236328, + "kl_loss_17": 290.9824615478516, + "kl_loss_3": 2714.005078125, + "kl_loss_6": 2106.4618286132813, + "learning_rate": 0.00034549150281252633, + "loss": 1575.2684, + "step": 6040 + }, + { + "ce_loss_12": 3.2901809930801393, + "ce_loss_17": 2.949860167503357, + "ce_loss_23": 2.8217196464538574, + "ce_loss_3": 4.078897643089294, + "ce_loss_6": 3.7691982269287108, + "epoch": 0.605, + "grad_norm": 844.0, + "kl_loss_12": 1068.8934967041016, + "kl_loss_17": 295.5742889404297, + "kl_loss_3": 2643.9244140625, + "kl_loss_6": 2034.2466369628905, + "learning_rate": 0.0003439832771507565, + "loss": 1510.9966, + "step": 6050 + }, + { + "ce_loss_12": 3.3043904185295103, + "ce_loss_17": 2.9475502729415894, + "ce_loss_23": 2.8230517506599426, + "ce_loss_3": 4.097569191455841, + "ce_loss_6": 3.7958645701408384, + "epoch": 0.606, + "grad_norm": 920.0, + "kl_loss_12": 1094.1976593017578, + "kl_loss_17": 292.10668869018554, + "kl_loss_3": 2699.2143432617186, + "kl_loss_6": 2095.2228515625, + "learning_rate": 0.0003424766225756537, + "loss": 1520.6666, + "step": 6060 + }, + { + "ce_loss_12": 3.352240562438965, + "ce_loss_17": 3.0022374629974364, + "ce_loss_23": 2.880287563800812, + "ce_loss_3": 4.148053014278412, + "ce_loss_6": 3.8432873129844665, + "epoch": 0.607, + "grad_norm": 1328.0, + "kl_loss_12": 1085.2191497802735, + "kl_loss_17": 289.7973358154297, + "kl_loss_3": 2683.3669921875, + "kl_loss_6": 2085.138427734375, + "learning_rate": 0.00034097155425921255, + "loss": 1509.973, + "step": 6070 + }, + { + "ce_loss_12": 3.2671317934989927, + "ce_loss_17": 2.9134520411491396, + "ce_loss_23": 2.7855666875839233, + "ce_loss_3": 4.085386347770691, + "ce_loss_6": 3.767521357536316, + "epoch": 0.608, + "grad_norm": 996.0, + "kl_loss_12": 1102.0127227783203, + "kl_loss_17": 293.87185592651366, + "kl_loss_3": 2754.059533691406, + "kl_loss_6": 2126.1028076171874, + "learning_rate": 0.0003394680873574546, + "loss": 1540.0117, + "step": 6080 + }, + { + "ce_loss_12": 3.368610906600952, + "ce_loss_17": 3.0045804500579836, + "ce_loss_23": 2.8808337569236757, + "ce_loss_3": 4.169866585731507, + "ce_loss_6": 3.8673511505126954, + "epoch": 0.609, + "grad_norm": 1152.0, + "kl_loss_12": 1115.8485717773438, + "kl_loss_17": 297.3110382080078, + "kl_loss_3": 2746.35283203125, + "kl_loss_6": 2137.1409240722655, + "learning_rate": 0.0003379662370102747, + "loss": 1539.9181, + "step": 6090 + }, + { + "ce_loss_12": 3.353055679798126, + "ce_loss_17": 3.0137892961502075, + "ce_loss_23": 2.893881106376648, + "ce_loss_3": 4.145450925827026, + "ce_loss_6": 3.8335773825645445, + "epoch": 0.61, + "grad_norm": 1064.0, + "kl_loss_12": 1075.17353515625, + "kl_loss_17": 288.5395881652832, + "kl_loss_3": 2682.5274169921877, + "kl_loss_6": 2063.0679138183596, + "learning_rate": 0.0003364660183412892, + "loss": 1531.6078, + "step": 6100 + }, + { + "ce_loss_12": 3.3445538878440857, + "ce_loss_17": 3.0021854639053345, + "ce_loss_23": 2.8739949345588682, + "ce_loss_3": 4.135062468051911, + "ce_loss_6": 3.8257460951805116, + "epoch": 0.611, + "grad_norm": 1480.0, + "kl_loss_12": 1095.9966613769532, + "kl_loss_17": 294.90844039916993, + "kl_loss_3": 2693.2832275390624, + "kl_loss_6": 2079.3878479003906, + "learning_rate": 0.0003349674464576834, + "loss": 1549.767, + "step": 6110 + }, + { + "ce_loss_12": 3.303148889541626, + "ce_loss_17": 2.95277715921402, + "ce_loss_23": 2.825856649875641, + "ce_loss_3": 4.105974912643433, + "ce_loss_6": 3.79825245141983, + "epoch": 0.612, + "grad_norm": 1136.0, + "kl_loss_12": 1088.9236877441406, + "kl_loss_17": 294.0137344360352, + "kl_loss_3": 2725.0036499023436, + "kl_loss_6": 2109.8044250488283, + "learning_rate": 0.00033347053645005966, + "loss": 1508.8463, + "step": 6120 + }, + { + "ce_loss_12": 3.3814459562301638, + "ce_loss_17": 3.038688623905182, + "ce_loss_23": 2.918603265285492, + "ce_loss_3": 4.158742451667786, + "ce_loss_6": 3.853513276576996, + "epoch": 0.613, + "grad_norm": 1264.0, + "kl_loss_12": 1059.3533569335937, + "kl_loss_17": 285.41724853515626, + "kl_loss_3": 2617.593151855469, + "kl_loss_6": 2014.1167114257812, + "learning_rate": 0.00033197530339228485, + "loss": 1518.3242, + "step": 6130 + }, + { + "ce_loss_12": 3.360202980041504, + "ce_loss_17": 3.005841851234436, + "ce_loss_23": 2.877064311504364, + "ce_loss_3": 4.150792407989502, + "ce_loss_6": 3.839804494380951, + "epoch": 0.614, + "grad_norm": 1232.0, + "kl_loss_12": 1087.1132446289062, + "kl_loss_17": 294.73849563598634, + "kl_loss_3": 2674.4955078125, + "kl_loss_6": 2059.744171142578, + "learning_rate": 0.00033048176234133967, + "loss": 1521.8021, + "step": 6140 + }, + { + "ce_loss_12": 3.341773736476898, + "ce_loss_17": 3.001604068279266, + "ce_loss_23": 2.877167117595673, + "ce_loss_3": 4.13272854089737, + "ce_loss_6": 3.822968912124634, + "epoch": 0.615, + "grad_norm": 1000.0, + "kl_loss_12": 1092.6182922363282, + "kl_loss_17": 295.6040756225586, + "kl_loss_3": 2697.6044677734376, + "kl_loss_6": 2079.6624145507812, + "learning_rate": 0.0003289899283371657, + "loss": 1546.1363, + "step": 6150 + }, + { + "ce_loss_12": 3.351723313331604, + "ce_loss_17": 3.0066920042037966, + "ce_loss_23": 2.885224533081055, + "ce_loss_3": 4.161774778366089, + "ce_loss_6": 3.8526275753974915, + "epoch": 0.616, + "grad_norm": 1056.0, + "kl_loss_12": 1060.0194366455078, + "kl_loss_17": 285.7883804321289, + "kl_loss_3": 2672.10927734375, + "kl_loss_6": 2070.3538452148437, + "learning_rate": 0.0003274998164025148, + "loss": 1549.4511, + "step": 6160 + }, + { + "ce_loss_12": 3.387555170059204, + "ce_loss_17": 3.046477997303009, + "ce_loss_23": 2.919628012180328, + "ce_loss_3": 4.181600368022918, + "ce_loss_6": 3.8731335401535034, + "epoch": 0.617, + "grad_norm": 1064.0, + "kl_loss_12": 1092.581869506836, + "kl_loss_17": 296.49526062011716, + "kl_loss_3": 2683.827392578125, + "kl_loss_6": 2079.1006103515624, + "learning_rate": 0.0003260114415427975, + "loss": 1567.9312, + "step": 6170 + }, + { + "ce_loss_12": 3.3219369292259215, + "ce_loss_17": 2.9732369661331175, + "ce_loss_23": 2.8491630494594573, + "ce_loss_3": 4.150906503200531, + "ce_loss_6": 3.8287518501281737, + "epoch": 0.618, + "grad_norm": 1208.0, + "kl_loss_12": 1091.444174194336, + "kl_loss_17": 292.62363357543944, + "kl_loss_3": 2767.002648925781, + "kl_loss_6": 2123.3348022460937, + "learning_rate": 0.0003245248187459323, + "loss": 1571.918, + "step": 6180 + }, + { + "ce_loss_12": 3.29248104095459, + "ce_loss_17": 2.964356517791748, + "ce_loss_23": 2.8416656017303468, + "ce_loss_3": 4.0751290082931515, + "ce_loss_6": 3.7665364861488344, + "epoch": 0.619, + "grad_norm": 1960.0, + "kl_loss_12": 1048.1108001708985, + "kl_loss_17": 282.0491600036621, + "kl_loss_3": 2624.9859619140625, + "kl_loss_6": 2019.6050048828124, + "learning_rate": 0.00032303996298219416, + "loss": 1493.8592, + "step": 6190 + }, + { + "ce_loss_12": 3.3730963468551636, + "ce_loss_17": 3.0364683270454407, + "ce_loss_23": 2.9132092356681825, + "ce_loss_3": 4.150227224826812, + "ce_loss_6": 3.8517947673797606, + "epoch": 0.62, + "grad_norm": 1160.0, + "kl_loss_12": 1046.0441528320312, + "kl_loss_17": 284.50682067871094, + "kl_loss_3": 2596.279650878906, + "kl_loss_6": 2006.559521484375, + "learning_rate": 0.00032155688920406414, + "loss": 1493.982, + "step": 6200 + }, + { + "ce_loss_12": 3.307856547832489, + "ce_loss_17": 2.9499720454216005, + "ce_loss_23": 2.8251369833946227, + "ce_loss_3": 4.141007030010224, + "ce_loss_6": 3.8185084462165833, + "epoch": 0.621, + "grad_norm": 1040.0, + "kl_loss_12": 1089.2369049072265, + "kl_loss_17": 294.6406188964844, + "kl_loss_3": 2752.1582153320314, + "kl_loss_6": 2122.614605712891, + "learning_rate": 0.0003200756123460788, + "loss": 1577.3812, + "step": 6210 + }, + { + "ce_loss_12": 3.351741683483124, + "ce_loss_17": 2.993086099624634, + "ce_loss_23": 2.863808012008667, + "ce_loss_3": 4.161297249794006, + "ce_loss_6": 3.848933458328247, + "epoch": 0.622, + "grad_norm": 1960.0, + "kl_loss_12": 1108.0446380615235, + "kl_loss_17": 299.9723045349121, + "kl_loss_3": 2750.4146240234377, + "kl_loss_6": 2129.6673889160156, + "learning_rate": 0.00031859614732467957, + "loss": 1558.543, + "step": 6220 + }, + { + "ce_loss_12": 3.375869262218475, + "ce_loss_17": 3.037969648838043, + "ce_loss_23": 2.9146955251693725, + "ce_loss_3": 4.169191467761993, + "ce_loss_6": 3.851213252544403, + "epoch": 0.623, + "grad_norm": 992.0, + "kl_loss_12": 1053.6349365234375, + "kl_loss_17": 285.6215026855469, + "kl_loss_3": 2644.1402099609377, + "kl_loss_6": 2024.4838745117188, + "learning_rate": 0.00031711850903806275, + "loss": 1507.5234, + "step": 6230 + }, + { + "ce_loss_12": 3.3069013476371767, + "ce_loss_17": 2.9518585205078125, + "ce_loss_23": 2.822894871234894, + "ce_loss_3": 4.115435206890107, + "ce_loss_6": 3.8047771573066713, + "epoch": 0.624, + "grad_norm": 1264.0, + "kl_loss_12": 1104.3774383544921, + "kl_loss_17": 299.824292755127, + "kl_loss_3": 2743.9949951171875, + "kl_loss_6": 2122.171771240234, + "learning_rate": 0.0003156427123660297, + "loss": 1525.2645, + "step": 6240 + }, + { + "ce_loss_12": 3.3772284746170045, + "ce_loss_17": 3.031772184371948, + "ce_loss_23": 2.9064966320991514, + "ce_loss_3": 4.147403049468994, + "ce_loss_6": 3.841344749927521, + "epoch": 0.625, + "grad_norm": 924.0, + "kl_loss_12": 1082.6211517333984, + "kl_loss_17": 290.51032104492185, + "kl_loss_3": 2642.9122924804688, + "kl_loss_6": 2037.5070373535157, + "learning_rate": 0.0003141687721698363, + "loss": 1529.8761, + "step": 6250 + }, + { + "ce_loss_12": 3.3317097783088685, + "ce_loss_17": 3.006670558452606, + "ce_loss_23": 2.889304530620575, + "ce_loss_3": 4.0970113515853885, + "ce_loss_6": 3.7971444487571717, + "epoch": 0.626, + "grad_norm": 952.0, + "kl_loss_12": 1023.0517028808594, + "kl_loss_17": 276.3337455749512, + "kl_loss_3": 2562.144384765625, + "kl_loss_6": 1967.785302734375, + "learning_rate": 0.00031269670329204396, + "loss": 1494.9123, + "step": 6260 + }, + { + "ce_loss_12": 3.380745196342468, + "ce_loss_17": 3.0420040488243103, + "ce_loss_23": 2.9203967571258547, + "ce_loss_3": 4.142084872722625, + "ce_loss_6": 3.8459377884864807, + "epoch": 0.627, + "grad_norm": 1280.0, + "kl_loss_12": 1069.5860168457032, + "kl_loss_17": 290.3002021789551, + "kl_loss_3": 2616.0244873046877, + "kl_loss_6": 2023.7755065917968, + "learning_rate": 0.00031122652055637015, + "loss": 1523.6999, + "step": 6270 + }, + { + "ce_loss_12": 3.344464898109436, + "ce_loss_17": 3.0021347045898437, + "ce_loss_23": 2.8809479236602784, + "ce_loss_3": 4.160500121116638, + "ce_loss_6": 3.8466029167175293, + "epoch": 0.628, + "grad_norm": 1264.0, + "kl_loss_12": 1071.2355590820312, + "kl_loss_17": 290.591535949707, + "kl_loss_3": 2714.581018066406, + "kl_loss_6": 2090.1380126953127, + "learning_rate": 0.0003097582387675385, + "loss": 1512.6952, + "step": 6280 + }, + { + "ce_loss_12": 3.388115203380585, + "ce_loss_17": 3.0414551854133607, + "ce_loss_23": 2.915836453437805, + "ce_loss_3": 4.170203387737274, + "ce_loss_6": 3.8668522119522093, + "epoch": 0.629, + "grad_norm": 1208.0, + "kl_loss_12": 1079.1004943847656, + "kl_loss_17": 292.55574188232424, + "kl_loss_3": 2678.459423828125, + "kl_loss_6": 2070.119000244141, + "learning_rate": 0.00030829187271113034, + "loss": 1520.3328, + "step": 6290 + }, + { + "ce_loss_12": 3.3620065093040465, + "ce_loss_17": 3.0299498438835144, + "ce_loss_23": 2.907538902759552, + "ce_loss_3": 4.149127829074859, + "ce_loss_6": 3.838905620574951, + "epoch": 0.63, + "grad_norm": 1240.0, + "kl_loss_12": 1040.4415649414063, + "kl_loss_17": 282.5463836669922, + "kl_loss_3": 2615.3244873046874, + "kl_loss_6": 2007.760760498047, + "learning_rate": 0.00030682743715343565, + "loss": 1521.68, + "step": 6300 + }, + { + "ce_loss_12": 3.335128140449524, + "ce_loss_17": 2.9822778344154357, + "ce_loss_23": 2.852949821949005, + "ce_loss_3": 4.134418356418609, + "ce_loss_6": 3.830398738384247, + "epoch": 0.631, + "grad_norm": 1248.0, + "kl_loss_12": 1089.9908935546875, + "kl_loss_17": 297.17050018310545, + "kl_loss_3": 2691.6098022460938, + "kl_loss_6": 2083.130157470703, + "learning_rate": 0.0003053649468413043, + "loss": 1558.2545, + "step": 6310 + }, + { + "ce_loss_12": 3.428669238090515, + "ce_loss_17": 3.0927268743515013, + "ce_loss_23": 2.966153454780579, + "ce_loss_3": 4.215055656433106, + "ce_loss_6": 3.9037797093391418, + "epoch": 0.632, + "grad_norm": 1544.0, + "kl_loss_12": 1072.3437133789062, + "kl_loss_17": 294.3572998046875, + "kl_loss_3": 2663.4129272460937, + "kl_loss_6": 2048.999542236328, + "learning_rate": 0.00030390441650199725, + "loss": 1512.9581, + "step": 6320 + }, + { + "ce_loss_12": 3.343301022052765, + "ce_loss_17": 2.9984962582588195, + "ce_loss_23": 2.8740360498428346, + "ce_loss_3": 4.133579123020172, + "ce_loss_6": 3.8244497656822203, + "epoch": 0.633, + "grad_norm": 1168.0, + "kl_loss_12": 1065.6994506835938, + "kl_loss_17": 289.3247261047363, + "kl_loss_3": 2648.2664428710937, + "kl_loss_6": 2032.6017517089845, + "learning_rate": 0.00030244586084303903, + "loss": 1500.9375, + "step": 6330 + }, + { + "ce_loss_12": 3.332288992404938, + "ce_loss_17": 2.9722416281700133, + "ce_loss_23": 2.8453716039657593, + "ce_loss_3": 4.135188567638397, + "ce_loss_6": 3.8267269372940063, + "epoch": 0.634, + "grad_norm": 856.0, + "kl_loss_12": 1104.3370056152344, + "kl_loss_17": 297.27563934326173, + "kl_loss_3": 2726.391882324219, + "kl_loss_6": 2114.6755859375, + "learning_rate": 0.00030098929455206903, + "loss": 1521.0924, + "step": 6340 + }, + { + "ce_loss_12": 3.3067553639411926, + "ce_loss_17": 2.9648497700691223, + "ce_loss_23": 2.845312809944153, + "ce_loss_3": 4.113324463367462, + "ce_loss_6": 3.8015464425086973, + "epoch": 0.635, + "grad_norm": 1040.0, + "kl_loss_12": 1069.1728210449219, + "kl_loss_17": 285.195866394043, + "kl_loss_3": 2687.285046386719, + "kl_loss_6": 2075.2121032714845, + "learning_rate": 0.00029953473229669324, + "loss": 1569.9994, + "step": 6350 + }, + { + "ce_loss_12": 3.3521832466125487, + "ce_loss_17": 2.9959980249404907, + "ce_loss_23": 2.873391032218933, + "ce_loss_3": 4.1363235235214235, + "ce_loss_6": 3.833011245727539, + "epoch": 0.636, + "grad_norm": 1392.0, + "kl_loss_12": 1088.97607421875, + "kl_loss_17": 287.1729057312012, + "kl_loss_3": 2675.118701171875, + "kl_loss_6": 2077.9879821777345, + "learning_rate": 0.00029808218872433767, + "loss": 1509.7584, + "step": 6360 + }, + { + "ce_loss_12": 3.3870229005813597, + "ce_loss_17": 3.052706515789032, + "ce_loss_23": 2.9297956585884095, + "ce_loss_3": 4.169206213951111, + "ce_loss_6": 3.870281684398651, + "epoch": 0.637, + "grad_norm": 1568.0, + "kl_loss_12": 1048.5180084228516, + "kl_loss_17": 282.48961334228517, + "kl_loss_3": 2630.5627685546874, + "kl_loss_6": 2032.5360290527344, + "learning_rate": 0.0002966316784621, + "loss": 1498.2122, + "step": 6370 + }, + { + "ce_loss_12": 3.3314394116401673, + "ce_loss_17": 2.9822089433670045, + "ce_loss_23": 2.847934913635254, + "ce_loss_3": 4.135004734992981, + "ce_loss_6": 3.8197688102722167, + "epoch": 0.638, + "grad_norm": 1072.0, + "kl_loss_12": 1095.9818908691407, + "kl_loss_17": 296.8167663574219, + "kl_loss_3": 2713.0668090820313, + "kl_loss_6": 2088.9831420898436, + "learning_rate": 0.0002951832161166024, + "loss": 1513.5551, + "step": 6380 + }, + { + "ce_loss_12": 3.391787350177765, + "ce_loss_17": 3.039981758594513, + "ce_loss_23": 2.9113774418830873, + "ce_loss_3": 4.174764740467071, + "ce_loss_6": 3.877041220664978, + "epoch": 0.639, + "grad_norm": 888.0, + "kl_loss_12": 1083.952947998047, + "kl_loss_17": 293.1161712646484, + "kl_loss_3": 2669.3549194335938, + "kl_loss_6": 2076.956903076172, + "learning_rate": 0.0002937368162738445, + "loss": 1502.0477, + "step": 6390 + }, + { + "ce_loss_12": 3.3227894425392153, + "ce_loss_17": 2.993958580493927, + "ce_loss_23": 2.8782469749450685, + "ce_loss_3": 4.11331399679184, + "ce_loss_6": 3.8117558360099792, + "epoch": 0.64, + "grad_norm": 1160.0, + "kl_loss_12": 1048.937274169922, + "kl_loss_17": 279.83716888427733, + "kl_loss_3": 2632.9319458007812, + "kl_loss_6": 2037.3876037597656, + "learning_rate": 0.0002922924934990568, + "loss": 1526.9837, + "step": 6400 + }, + { + "ce_loss_12": 3.2892454981803896, + "ce_loss_17": 2.9320313334465027, + "ce_loss_23": 2.8080975651741027, + "ce_loss_3": 4.108310544490815, + "ce_loss_6": 3.7949350237846375, + "epoch": 0.641, + "grad_norm": 1416.0, + "kl_loss_12": 1095.0140625, + "kl_loss_17": 289.45925369262693, + "kl_loss_3": 2754.233044433594, + "kl_loss_6": 2133.916400146484, + "learning_rate": 0.0002908502623365536, + "loss": 1539.6635, + "step": 6410 + }, + { + "ce_loss_12": 3.2228192329406737, + "ce_loss_17": 2.8717540979385374, + "ce_loss_23": 2.7454086601734162, + "ce_loss_3": 4.063563895225525, + "ce_loss_6": 3.7421645879745484, + "epoch": 0.642, + "grad_norm": 1376.0, + "kl_loss_12": 1095.3357879638672, + "kl_loss_17": 289.3959281921387, + "kl_loss_3": 2782.289697265625, + "kl_loss_6": 2145.7878540039064, + "learning_rate": 0.0002894101373095867, + "loss": 1544.1738, + "step": 6420 + }, + { + "ce_loss_12": 3.413454031944275, + "ce_loss_17": 3.074009048938751, + "ce_loss_23": 2.9470053911209106, + "ce_loss_3": 4.182444787025451, + "ce_loss_6": 3.886037003993988, + "epoch": 0.643, + "grad_norm": 868.0, + "kl_loss_12": 1073.1138610839844, + "kl_loss_17": 293.98568572998045, + "kl_loss_3": 2631.6056274414063, + "kl_loss_6": 2042.1914184570312, + "learning_rate": 0.00028797213292019926, + "loss": 1514.5645, + "step": 6430 + }, + { + "ce_loss_12": 3.394935941696167, + "ce_loss_17": 3.052765667438507, + "ce_loss_23": 2.925507402420044, + "ce_loss_3": 4.178702533245087, + "ce_loss_6": 3.871429920196533, + "epoch": 0.644, + "grad_norm": 1248.0, + "kl_loss_12": 1081.2356658935546, + "kl_loss_17": 293.6976516723633, + "kl_loss_3": 2645.124951171875, + "kl_loss_6": 2043.427099609375, + "learning_rate": 0.0002865362636490791, + "loss": 1547.284, + "step": 6440 + }, + { + "ce_loss_12": 3.4013839244842528, + "ce_loss_17": 3.0620115041732787, + "ce_loss_23": 2.942251706123352, + "ce_loss_3": 4.181041252613068, + "ce_loss_6": 3.874261713027954, + "epoch": 0.645, + "grad_norm": 1096.0, + "kl_loss_12": 1063.9557067871094, + "kl_loss_17": 287.7143745422363, + "kl_loss_3": 2624.9001586914064, + "kl_loss_6": 2024.624041748047, + "learning_rate": 0.0002851025439554142, + "loss": 1502.1726, + "step": 6450 + }, + { + "ce_loss_12": 3.3908503651618958, + "ce_loss_17": 3.0486684918403624, + "ce_loss_23": 2.9185595750808715, + "ce_loss_3": 4.1556107759475704, + "ce_loss_6": 3.8517315864562987, + "epoch": 0.646, + "grad_norm": 1080.0, + "kl_loss_12": 1064.3933685302734, + "kl_loss_17": 290.0317329406738, + "kl_loss_3": 2599.8388427734376, + "kl_loss_6": 2001.0165588378907, + "learning_rate": 0.00028367098827674573, + "loss": 1502.8248, + "step": 6460 + }, + { + "ce_loss_12": 3.3172547459602355, + "ce_loss_17": 2.9788630843162536, + "ce_loss_23": 2.8583949327468874, + "ce_loss_3": 4.1198078989982605, + "ce_loss_6": 3.804555869102478, + "epoch": 0.647, + "grad_norm": 1280.0, + "kl_loss_12": 1058.7646209716797, + "kl_loss_17": 283.33220596313475, + "kl_loss_3": 2653.0367065429687, + "kl_loss_6": 2037.1688110351563, + "learning_rate": 0.00028224161102882397, + "loss": 1525.8549, + "step": 6470 + }, + { + "ce_loss_12": 3.2954100370407104, + "ce_loss_17": 2.9568528294563294, + "ce_loss_23": 2.8395904183387755, + "ce_loss_3": 4.058874034881592, + "ce_loss_6": 3.7639296650886536, + "epoch": 0.648, + "grad_norm": 956.0, + "kl_loss_12": 1051.4144226074218, + "kl_loss_17": 277.5598617553711, + "kl_loss_3": 2598.404248046875, + "kl_loss_6": 2012.757061767578, + "learning_rate": 0.00028081442660546124, + "loss": 1513.174, + "step": 6480 + }, + { + "ce_loss_12": 3.359078085422516, + "ce_loss_17": 3.021183693408966, + "ce_loss_23": 2.895930051803589, + "ce_loss_3": 4.148511445522308, + "ce_loss_6": 3.834946537017822, + "epoch": 0.649, + "grad_norm": 1004.0, + "kl_loss_12": 1060.023629760742, + "kl_loss_17": 291.75023956298827, + "kl_loss_3": 2661.596643066406, + "kl_loss_6": 2033.6894104003907, + "learning_rate": 0.0002793894493783892, + "loss": 1514.7693, + "step": 6490 + }, + { + "ce_loss_12": 3.366551387310028, + "ce_loss_17": 3.030740213394165, + "ce_loss_23": 2.913418471813202, + "ce_loss_3": 4.158485841751099, + "ce_loss_6": 3.850329840183258, + "epoch": 0.65, + "grad_norm": 1112.0, + "kl_loss_12": 1045.378155517578, + "kl_loss_17": 278.98334350585935, + "kl_loss_3": 2624.3213745117187, + "kl_loss_6": 2024.8353698730468, + "learning_rate": 0.0002779666936971129, + "loss": 1499.6525, + "step": 6500 + }, + { + "ce_loss_12": 3.394643759727478, + "ce_loss_17": 3.04899320602417, + "ce_loss_23": 2.9235555052757265, + "ce_loss_3": 4.185719633102417, + "ce_loss_6": 3.8867220282554626, + "epoch": 0.651, + "grad_norm": 916.0, + "kl_loss_12": 1075.935726928711, + "kl_loss_17": 288.56647872924805, + "kl_loss_3": 2680.7698486328127, + "kl_loss_6": 2072.048992919922, + "learning_rate": 0.00027654617388876614, + "loss": 1532.4816, + "step": 6510 + }, + { + "ce_loss_12": 3.398653984069824, + "ce_loss_17": 3.0640782356262206, + "ce_loss_23": 2.938787519931793, + "ce_loss_3": 4.188610053062439, + "ce_loss_6": 3.878059720993042, + "epoch": 0.652, + "grad_norm": 908.0, + "kl_loss_12": 1061.6495239257813, + "kl_loss_17": 291.04235076904297, + "kl_loss_3": 2673.7321899414064, + "kl_loss_6": 2048.234454345703, + "learning_rate": 0.0002751279042579672, + "loss": 1528.4081, + "step": 6520 + }, + { + "ce_loss_12": 3.3495648741722106, + "ce_loss_17": 3.0123731017112734, + "ce_loss_23": 2.892154061794281, + "ce_loss_3": 4.12523752450943, + "ce_loss_6": 3.828623628616333, + "epoch": 0.653, + "grad_norm": 1344.0, + "kl_loss_12": 1051.1055145263672, + "kl_loss_17": 278.1377319335937, + "kl_loss_3": 2622.2849731445312, + "kl_loss_6": 2022.1887268066407, + "learning_rate": 0.00027371189908667604, + "loss": 1526.3842, + "step": 6530 + }, + { + "ce_loss_12": 3.4101003408432007, + "ce_loss_17": 3.0658029317855835, + "ce_loss_23": 2.9356747388839723, + "ce_loss_3": 4.236353695392609, + "ce_loss_6": 3.928320360183716, + "epoch": 0.654, + "grad_norm": 912.0, + "kl_loss_12": 1088.5187896728517, + "kl_loss_17": 305.0631164550781, + "kl_loss_3": 2743.589709472656, + "kl_loss_6": 2120.7795227050783, + "learning_rate": 0.00027229817263404863, + "loss": 1564.6652, + "step": 6540 + }, + { + "ce_loss_12": 3.372529911994934, + "ce_loss_17": 3.046334421634674, + "ce_loss_23": 2.929336595535278, + "ce_loss_3": 4.127618956565857, + "ce_loss_6": 3.825167953968048, + "epoch": 0.655, + "grad_norm": 1144.0, + "kl_loss_12": 1039.775701904297, + "kl_loss_17": 281.04506454467776, + "kl_loss_3": 2574.1316040039064, + "kl_loss_6": 1979.277850341797, + "learning_rate": 0.0002708867391362948, + "loss": 1498.607, + "step": 6550 + }, + { + "ce_loss_12": 3.348506212234497, + "ce_loss_17": 3.024095320701599, + "ce_loss_23": 2.908759653568268, + "ce_loss_3": 4.1206450939178465, + "ce_loss_6": 3.817743384838104, + "epoch": 0.656, + "grad_norm": 960.0, + "kl_loss_12": 1020.556494140625, + "kl_loss_17": 276.0389991760254, + "kl_loss_3": 2578.2486083984377, + "kl_loss_6": 1974.8206176757812, + "learning_rate": 0.0002694776128065345, + "loss": 1498.607, + "step": 6560 + }, + { + "ce_loss_12": 3.317186141014099, + "ce_loss_17": 2.9659634828567505, + "ce_loss_23": 2.8406397342681884, + "ce_loss_3": 4.101336324214936, + "ce_loss_6": 3.78921320438385, + "epoch": 0.657, + "grad_norm": 832.0, + "kl_loss_12": 1084.9080657958984, + "kl_loss_17": 291.7940902709961, + "kl_loss_3": 2676.329724121094, + "kl_loss_6": 2057.403973388672, + "learning_rate": 0.00026807080783465374, + "loss": 1499.9951, + "step": 6570 + }, + { + "ce_loss_12": 3.4112606763839723, + "ce_loss_17": 3.0685062646865844, + "ce_loss_23": 2.9431819200515745, + "ce_loss_3": 4.199546587467194, + "ce_loss_6": 3.8976390838623045, + "epoch": 0.658, + "grad_norm": 1128.0, + "kl_loss_12": 1085.2541046142578, + "kl_loss_17": 291.7502784729004, + "kl_loss_3": 2673.6155639648437, + "kl_loss_6": 2070.629815673828, + "learning_rate": 0.00026666633838716316, + "loss": 1540.0823, + "step": 6580 + }, + { + "ce_loss_12": 3.325250494480133, + "ce_loss_17": 2.9772164583206178, + "ce_loss_23": 2.849889171123505, + "ce_loss_3": 4.125745224952698, + "ce_loss_6": 3.8154043674468996, + "epoch": 0.659, + "grad_norm": 1104.0, + "kl_loss_12": 1087.6900970458985, + "kl_loss_17": 296.8415771484375, + "kl_loss_3": 2694.4278930664063, + "kl_loss_6": 2083.0305114746093, + "learning_rate": 0.00026526421860705474, + "loss": 1550.6973, + "step": 6590 + }, + { + "ce_loss_12": 3.3463248372077943, + "ce_loss_17": 2.9926101684570314, + "ce_loss_23": 2.8664312243461607, + "ce_loss_3": 4.144900977611542, + "ce_loss_6": 3.84015097618103, + "epoch": 0.66, + "grad_norm": 1272.0, + "kl_loss_12": 1082.672903442383, + "kl_loss_17": 295.27245864868166, + "kl_loss_3": 2689.79296875, + "kl_loss_6": 2084.4121032714843, + "learning_rate": 0.0002638644626136587, + "loss": 1518.3607, + "step": 6600 + }, + { + "ce_loss_12": 3.3497479915618897, + "ce_loss_17": 3.010710376501083, + "ce_loss_23": 2.8882995724678038, + "ce_loss_3": 4.14725239276886, + "ce_loss_6": 3.838615524768829, + "epoch": 0.661, + "grad_norm": 1048.0, + "kl_loss_12": 1062.1607543945313, + "kl_loss_17": 285.89757080078124, + "kl_loss_3": 2643.746630859375, + "kl_loss_6": 2037.8521606445313, + "learning_rate": 0.00026246708450250255, + "loss": 1512.559, + "step": 6610 + }, + { + "ce_loss_12": 3.3246140122413634, + "ce_loss_17": 2.9918368458747864, + "ce_loss_23": 2.8710078835487365, + "ce_loss_3": 4.096918213367462, + "ce_loss_6": 3.8028513193130493, + "epoch": 0.662, + "grad_norm": 984.0, + "kl_loss_12": 1048.273110961914, + "kl_loss_17": 283.497420501709, + "kl_loss_3": 2607.4280639648437, + "kl_loss_6": 2013.1176147460938, + "learning_rate": 0.00026107209834516854, + "loss": 1498.8475, + "step": 6620 + }, + { + "ce_loss_12": 3.3087719678878784, + "ce_loss_17": 2.9600358247756957, + "ce_loss_23": 2.8357106685638427, + "ce_loss_3": 4.133854627609253, + "ce_loss_6": 3.8167326927185057, + "epoch": 0.663, + "grad_norm": 1080.0, + "kl_loss_12": 1081.6213989257812, + "kl_loss_17": 288.49953842163086, + "kl_loss_3": 2732.234655761719, + "kl_loss_6": 2106.6643188476564, + "learning_rate": 0.0002596795181891514, + "loss": 1548.7227, + "step": 6630 + }, + { + "ce_loss_12": 3.3174493074417115, + "ce_loss_17": 2.971146559715271, + "ce_loss_23": 2.837556302547455, + "ce_loss_3": 4.109105908870697, + "ce_loss_6": 3.8051462888717653, + "epoch": 0.664, + "grad_norm": 1112.0, + "kl_loss_12": 1099.5311096191406, + "kl_loss_17": 299.4432113647461, + "kl_loss_3": 2689.6567504882814, + "kl_loss_6": 2086.8522766113283, + "learning_rate": 0.000258289358057718, + "loss": 1584.9252, + "step": 6640 + }, + { + "ce_loss_12": 3.3833224058151243, + "ce_loss_17": 3.027475082874298, + "ce_loss_23": 2.8935093522071837, + "ce_loss_3": 4.180250668525696, + "ce_loss_6": 3.8748958826065065, + "epoch": 0.665, + "grad_norm": 1120.0, + "kl_loss_12": 1098.4118103027345, + "kl_loss_17": 301.1476058959961, + "kl_loss_3": 2711.7373779296877, + "kl_loss_6": 2101.2721557617188, + "learning_rate": 0.0002569016319497657, + "loss": 1549.8691, + "step": 6650 + }, + { + "ce_loss_12": 3.3719239592552186, + "ce_loss_17": 3.0172632336616516, + "ce_loss_23": 2.8854519844055178, + "ce_loss_3": 4.161745226383209, + "ce_loss_6": 3.8583480596542357, + "epoch": 0.666, + "grad_norm": 1072.0, + "kl_loss_12": 1103.6511932373046, + "kl_loss_17": 303.22628631591795, + "kl_loss_3": 2711.325439453125, + "kl_loss_6": 2102.153436279297, + "learning_rate": 0.00025551635383968066, + "loss": 1558.8236, + "step": 6660 + }, + { + "ce_loss_12": 3.281505012512207, + "ce_loss_17": 2.940502107143402, + "ce_loss_23": 2.813677740097046, + "ce_loss_3": 4.103258204460144, + "ce_loss_6": 3.77959862947464, + "epoch": 0.667, + "grad_norm": 1048.0, + "kl_loss_12": 1085.2066711425782, + "kl_loss_17": 293.6140731811523, + "kl_loss_3": 2719.1686889648436, + "kl_loss_6": 2092.0775451660156, + "learning_rate": 0.00025413353767719804, + "loss": 1543.7244, + "step": 6670 + }, + { + "ce_loss_12": 3.3316211462020875, + "ce_loss_17": 2.990746355056763, + "ce_loss_23": 2.870693302154541, + "ce_loss_3": 4.122513997554779, + "ce_loss_6": 3.820575976371765, + "epoch": 0.668, + "grad_norm": 1136.0, + "kl_loss_12": 1068.2965698242188, + "kl_loss_17": 284.17504425048827, + "kl_loss_3": 2670.3581787109374, + "kl_loss_6": 2070.3725158691404, + "learning_rate": 0.0002527531973872617, + "loss": 1525.5031, + "step": 6680 + }, + { + "ce_loss_12": 3.338472878932953, + "ce_loss_17": 3.008737337589264, + "ce_loss_23": 2.8879444122314455, + "ce_loss_3": 4.122899007797241, + "ce_loss_6": 3.8144315242767335, + "epoch": 0.669, + "grad_norm": 928.0, + "kl_loss_12": 1054.9741912841796, + "kl_loss_17": 286.4005615234375, + "kl_loss_3": 2637.2769409179687, + "kl_loss_6": 2025.1039489746095, + "learning_rate": 0.0002513753468698826, + "loss": 1502.526, + "step": 6690 + }, + { + "ce_loss_12": 3.317826581001282, + "ce_loss_17": 2.9731616497039797, + "ce_loss_23": 2.849671816825867, + "ce_loss_3": 4.118358051776886, + "ce_loss_6": 3.8065997838973997, + "epoch": 0.67, + "grad_norm": 896.0, + "kl_loss_12": 1082.1374267578126, + "kl_loss_17": 293.6460014343262, + "kl_loss_3": 2702.1760009765626, + "kl_loss_6": 2087.3015747070312, + "learning_rate": 0.0002500000000000001, + "loss": 1532.8109, + "step": 6700 + }, + { + "ce_loss_12": 3.398632895946503, + "ce_loss_17": 3.0781195163726807, + "ce_loss_23": 2.9640834808349608, + "ce_loss_3": 4.1491206049919125, + "ce_loss_6": 3.8456847786903383, + "epoch": 0.671, + "grad_norm": 1064.0, + "kl_loss_12": 1029.509799194336, + "kl_loss_17": 277.89868240356446, + "kl_loss_3": 2540.9117431640625, + "kl_loss_6": 1946.9610656738282, + "learning_rate": 0.0002486271706273421, + "loss": 1526.3414, + "step": 6710 + }, + { + "ce_loss_12": 3.3420624256134035, + "ce_loss_17": 3.016208124160767, + "ce_loss_23": 2.900933396816254, + "ce_loss_3": 4.097201704978943, + "ce_loss_6": 3.7993552684783936, + "epoch": 0.672, + "grad_norm": 1136.0, + "kl_loss_12": 1028.4074829101562, + "kl_loss_17": 275.4583435058594, + "kl_loss_3": 2554.589733886719, + "kl_loss_6": 1960.41845703125, + "learning_rate": 0.0002472568725762853, + "loss": 1500.902, + "step": 6720 + }, + { + "ce_loss_12": 3.3320967674255373, + "ce_loss_17": 3.0137043356895448, + "ce_loss_23": 2.896807622909546, + "ce_loss_3": 4.0894329190254215, + "ce_loss_6": 3.7987982273101806, + "epoch": 0.673, + "grad_norm": 980.0, + "kl_loss_12": 1022.3379486083984, + "kl_loss_17": 274.11581802368164, + "kl_loss_3": 2557.5329467773436, + "kl_loss_6": 1979.3651306152344, + "learning_rate": 0.00024588911964571554, + "loss": 1478.16, + "step": 6730 + }, + { + "ce_loss_12": 3.383940541744232, + "ce_loss_17": 3.0269935369491576, + "ce_loss_23": 2.8932540893554686, + "ce_loss_3": 4.192104065418244, + "ce_loss_6": 3.879211986064911, + "epoch": 0.674, + "grad_norm": 1032.0, + "kl_loss_12": 1114.5952880859375, + "kl_loss_17": 303.4539489746094, + "kl_loss_3": 2712.537390136719, + "kl_loss_6": 2103.3569152832033, + "learning_rate": 0.00024452392560888974, + "loss": 1527.6742, + "step": 6740 + }, + { + "ce_loss_12": 3.258523499965668, + "ce_loss_17": 2.9190154671669006, + "ce_loss_23": 2.7954254031181334, + "ce_loss_3": 4.043025708198547, + "ce_loss_6": 3.7352750062942506, + "epoch": 0.675, + "grad_norm": 964.0, + "kl_loss_12": 1053.5061340332031, + "kl_loss_17": 279.17559814453125, + "kl_loss_3": 2647.2302368164064, + "kl_loss_6": 2033.0640502929687, + "learning_rate": 0.00024316130421329695, + "loss": 1493.335, + "step": 6750 + }, + { + "ce_loss_12": 3.324470043182373, + "ce_loss_17": 2.9908277809619905, + "ce_loss_23": 2.8736360669136047, + "ce_loss_3": 4.108500003814697, + "ce_loss_6": 3.8002394437789917, + "epoch": 0.676, + "grad_norm": 1016.0, + "kl_loss_12": 1059.7845489501954, + "kl_loss_17": 282.0549186706543, + "kl_loss_3": 2640.0760986328123, + "kl_loss_6": 2035.9004272460938, + "learning_rate": 0.00024180126918051909, + "loss": 1515.6507, + "step": 6760 + }, + { + "ce_loss_12": 3.3820838809013365, + "ce_loss_17": 3.0400675654411318, + "ce_loss_23": 2.9171111464500425, + "ce_loss_3": 4.148803424835205, + "ce_loss_6": 3.848258209228516, + "epoch": 0.677, + "grad_norm": 1056.0, + "kl_loss_12": 1065.7451202392579, + "kl_loss_17": 287.0917541503906, + "kl_loss_3": 2629.779504394531, + "kl_loss_6": 2024.6423583984374, + "learning_rate": 0.00024044383420609406, + "loss": 1495.1738, + "step": 6770 + }, + { + "ce_loss_12": 3.3741610646247864, + "ce_loss_17": 3.05090674161911, + "ce_loss_23": 2.9324339389801026, + "ce_loss_3": 4.128608286380768, + "ce_loss_6": 3.830838167667389, + "epoch": 0.678, + "grad_norm": 2096.0, + "kl_loss_12": 1040.0166320800781, + "kl_loss_17": 278.25733337402346, + "kl_loss_3": 2569.7828369140625, + "kl_loss_6": 1983.01318359375, + "learning_rate": 0.00023908901295937712, + "loss": 1514.7359, + "step": 6780 + }, + { + "ce_loss_12": 3.363990819454193, + "ce_loss_17": 3.0348567843437193, + "ce_loss_23": 2.9126704931259155, + "ce_loss_3": 4.140928709506989, + "ce_loss_6": 3.838523817062378, + "epoch": 0.679, + "grad_norm": 1088.0, + "kl_loss_12": 1038.0822326660157, + "kl_loss_17": 281.52527770996096, + "kl_loss_3": 2588.859130859375, + "kl_loss_6": 1988.4985778808593, + "learning_rate": 0.00023773681908340283, + "loss": 1513.9151, + "step": 6790 + }, + { + "ce_loss_12": 3.367039108276367, + "ce_loss_17": 3.0190672516822814, + "ce_loss_23": 2.8874718308448792, + "ce_loss_3": 4.159163236618042, + "ce_loss_6": 3.8588353276252745, + "epoch": 0.68, + "grad_norm": 1048.0, + "kl_loss_12": 1106.3126983642578, + "kl_loss_17": 303.4519523620605, + "kl_loss_3": 2704.4398681640623, + "kl_loss_6": 2104.3208129882814, + "learning_rate": 0.00023638726619474876, + "loss": 1562.6964, + "step": 6800 + }, + { + "ce_loss_12": 3.3643818736076354, + "ce_loss_17": 3.013008165359497, + "ce_loss_23": 2.878277862071991, + "ce_loss_3": 4.188300597667694, + "ce_loss_6": 3.8750722408294678, + "epoch": 0.681, + "grad_norm": 1232.0, + "kl_loss_12": 1105.7393890380858, + "kl_loss_17": 300.2103561401367, + "kl_loss_3": 2752.149304199219, + "kl_loss_6": 2135.510040283203, + "learning_rate": 0.0002350403678833976, + "loss": 1543.9213, + "step": 6810 + }, + { + "ce_loss_12": 3.2816458344459534, + "ce_loss_17": 2.9370277523994446, + "ce_loss_23": 2.8150463938713073, + "ce_loss_3": 4.0751855850219725, + "ce_loss_6": 3.76500483751297, + "epoch": 0.682, + "grad_norm": 856.0, + "kl_loss_12": 1076.2480041503907, + "kl_loss_17": 285.06641082763673, + "kl_loss_3": 2683.7720581054687, + "kl_loss_6": 2068.134649658203, + "learning_rate": 0.00023369613771260007, + "loss": 1512.8716, + "step": 6820 + }, + { + "ce_loss_12": 3.39626430273056, + "ce_loss_17": 3.047028458118439, + "ce_loss_23": 2.9260175466537475, + "ce_loss_3": 4.189331746101379, + "ce_loss_6": 3.8821977376937866, + "epoch": 0.683, + "grad_norm": 896.0, + "kl_loss_12": 1084.3677520751953, + "kl_loss_17": 290.67858200073243, + "kl_loss_3": 2690.972790527344, + "kl_loss_6": 2081.7094116210938, + "learning_rate": 0.00023235458921873925, + "loss": 1535.7379, + "step": 6830 + }, + { + "ce_loss_12": 3.386719024181366, + "ce_loss_17": 3.0164597988128663, + "ce_loss_23": 2.881801736354828, + "ce_loss_3": 4.203792822360993, + "ce_loss_6": 3.894656181335449, + "epoch": 0.684, + "grad_norm": 1016.0, + "kl_loss_12": 1140.004690551758, + "kl_loss_17": 307.1119552612305, + "kl_loss_3": 2795.7464233398437, + "kl_loss_6": 2184.1928283691404, + "learning_rate": 0.0002310157359111938, + "loss": 1590.245, + "step": 6840 + }, + { + "ce_loss_12": 3.281593942642212, + "ce_loss_17": 2.9068808436393736, + "ce_loss_23": 2.7744899153709413, + "ce_loss_3": 4.151088750362396, + "ce_loss_6": 3.8237022399902343, + "epoch": 0.685, + "grad_norm": 1040.0, + "kl_loss_12": 1121.4570068359376, + "kl_loss_17": 298.14404449462893, + "kl_loss_3": 2864.43203125, + "kl_loss_6": 2216.5655029296877, + "learning_rate": 0.0002296795912722014, + "loss": 1589.223, + "step": 6850 + }, + { + "ce_loss_12": 3.376678490638733, + "ce_loss_17": 3.0392310976982118, + "ce_loss_23": 2.9149879217147827, + "ce_loss_3": 4.142076396942139, + "ce_loss_6": 3.8350725889205934, + "epoch": 0.686, + "grad_norm": 1136.0, + "kl_loss_12": 1057.6813568115235, + "kl_loss_17": 285.86506271362305, + "kl_loss_3": 2618.06630859375, + "kl_loss_6": 2008.5849182128907, + "learning_rate": 0.0002283461687567236, + "loss": 1479.476, + "step": 6860 + }, + { + "ce_loss_12": 3.4147990345954895, + "ce_loss_17": 3.0852442264556883, + "ce_loss_23": 2.967425298690796, + "ce_loss_3": 4.163288021087647, + "ce_loss_6": 3.869793510437012, + "epoch": 0.687, + "grad_norm": 1088.0, + "kl_loss_12": 1037.484829711914, + "kl_loss_17": 282.1102653503418, + "kl_loss_3": 2554.0628662109375, + "kl_loss_6": 1967.2089477539062, + "learning_rate": 0.00022701548179231045, + "loss": 1506.1345, + "step": 6870 + }, + { + "ce_loss_12": 3.3873743414878845, + "ce_loss_17": 3.0464545249938966, + "ce_loss_23": 2.922188627719879, + "ce_loss_3": 4.175418794155121, + "ce_loss_6": 3.875636374950409, + "epoch": 0.688, + "grad_norm": 1400.0, + "kl_loss_12": 1075.7415679931642, + "kl_loss_17": 289.9831932067871, + "kl_loss_3": 2661.655517578125, + "kl_loss_6": 2067.484735107422, + "learning_rate": 0.00022568754377896516, + "loss": 1503.4135, + "step": 6880 + }, + { + "ce_loss_12": 3.385250473022461, + "ce_loss_17": 3.0440564274787905, + "ce_loss_23": 2.9164554595947267, + "ce_loss_3": 4.146143007278442, + "ce_loss_6": 3.846618139743805, + "epoch": 0.689, + "grad_norm": 952.0, + "kl_loss_12": 1077.9116333007812, + "kl_loss_17": 290.6547370910645, + "kl_loss_3": 2626.9868286132814, + "kl_loss_6": 2026.3530639648438, + "learning_rate": 0.00022436236808900844, + "loss": 1506.102, + "step": 6890 + }, + { + "ce_loss_12": 3.2836104154586794, + "ce_loss_17": 2.936374032497406, + "ce_loss_23": 2.810339403152466, + "ce_loss_3": 4.082259452342987, + "ce_loss_6": 3.7733787298202515, + "epoch": 0.69, + "grad_norm": 1000.0, + "kl_loss_12": 1083.3499816894532, + "kl_loss_17": 289.6112510681152, + "kl_loss_3": 2704.3146240234373, + "kl_loss_6": 2076.830242919922, + "learning_rate": 0.00022303996806694487, + "loss": 1520.6959, + "step": 6900 + }, + { + "ce_loss_12": 3.348386228084564, + "ce_loss_17": 3.005220341682434, + "ce_loss_23": 2.8856596112251283, + "ce_loss_3": 4.147955656051636, + "ce_loss_6": 3.8367886543273926, + "epoch": 0.691, + "grad_norm": 1096.0, + "kl_loss_12": 1068.3898193359375, + "kl_loss_17": 283.26347732543945, + "kl_loss_3": 2685.2141967773437, + "kl_loss_6": 2064.223016357422, + "learning_rate": 0.00022172035702932823, + "loss": 1514.9848, + "step": 6910 + }, + { + "ce_loss_12": 3.3900525689125063, + "ce_loss_17": 3.0595398545265198, + "ce_loss_23": 2.935157132148743, + "ce_loss_3": 4.146279001235962, + "ce_loss_6": 3.850438177585602, + "epoch": 0.692, + "grad_norm": 1012.0, + "kl_loss_12": 1046.249249267578, + "kl_loss_17": 287.8135139465332, + "kl_loss_3": 2567.0113037109377, + "kl_loss_6": 1990.5654724121093, + "learning_rate": 0.00022040354826462666, + "loss": 1482.2508, + "step": 6920 + }, + { + "ce_loss_12": 3.318897318840027, + "ce_loss_17": 2.9834712266922, + "ce_loss_23": 2.865597403049469, + "ce_loss_3": 4.112779140472412, + "ce_loss_6": 3.808480966091156, + "epoch": 0.693, + "grad_norm": 1096.0, + "kl_loss_12": 1055.834146118164, + "kl_loss_17": 281.22287368774414, + "kl_loss_3": 2656.381481933594, + "kl_loss_6": 2053.055072021484, + "learning_rate": 0.0002190895550330899, + "loss": 1526.6893, + "step": 6930 + }, + { + "ce_loss_12": 3.2867075204849243, + "ce_loss_17": 2.9261071562767027, + "ce_loss_23": 2.797209632396698, + "ce_loss_3": 4.094171130657196, + "ce_loss_6": 3.777874195575714, + "epoch": 0.694, + "grad_norm": 1584.0, + "kl_loss_12": 1091.2527709960937, + "kl_loss_17": 291.81100311279295, + "kl_loss_3": 2703.411511230469, + "kl_loss_6": 2074.94111328125, + "learning_rate": 0.00021777839056661552, + "loss": 1509.3366, + "step": 6940 + }, + { + "ce_loss_12": 3.337392258644104, + "ce_loss_17": 3.0037010431289675, + "ce_loss_23": 2.883526420593262, + "ce_loss_3": 4.121099376678467, + "ce_loss_6": 3.8113796710968018, + "epoch": 0.695, + "grad_norm": 1448.0, + "kl_loss_12": 1057.7788360595703, + "kl_loss_17": 283.61209259033205, + "kl_loss_3": 2636.60634765625, + "kl_loss_6": 2022.8482055664062, + "learning_rate": 0.0002164700680686147, + "loss": 1485.7182, + "step": 6950 + }, + { + "ce_loss_12": 3.3705799698829653, + "ce_loss_17": 3.0428457498550414, + "ce_loss_23": 2.91817193031311, + "ce_loss_3": 4.139367914199829, + "ce_loss_6": 3.8418300271034242, + "epoch": 0.696, + "grad_norm": 1576.0, + "kl_loss_12": 1039.181134033203, + "kl_loss_17": 285.70360260009767, + "kl_loss_3": 2583.8804931640625, + "kl_loss_6": 1989.1890197753905, + "learning_rate": 0.0002151646007138806, + "loss": 1487.7269, + "step": 6960 + }, + { + "ce_loss_12": 3.288752329349518, + "ce_loss_17": 2.940204656124115, + "ce_loss_23": 2.815975916385651, + "ce_loss_3": 4.087472057342529, + "ce_loss_6": 3.7803780317306517, + "epoch": 0.697, + "grad_norm": 1264.0, + "kl_loss_12": 1086.2156982421875, + "kl_loss_17": 290.2320617675781, + "kl_loss_3": 2702.2922485351564, + "kl_loss_6": 2095.7369689941406, + "learning_rate": 0.00021386200164845526, + "loss": 1518.6186, + "step": 6970 + }, + { + "ce_loss_12": 3.422125792503357, + "ce_loss_17": 3.0908151388168337, + "ce_loss_23": 2.9712757110595702, + "ce_loss_3": 4.169968152046204, + "ce_loss_6": 3.877488946914673, + "epoch": 0.698, + "grad_norm": 1056.0, + "kl_loss_12": 1040.7631591796876, + "kl_loss_17": 279.68602294921874, + "kl_loss_3": 2554.2770751953126, + "kl_loss_6": 1976.9941101074219, + "learning_rate": 0.0002125622839894964, + "loss": 1470.3584, + "step": 6980 + }, + { + "ce_loss_12": 3.3737497091293336, + "ce_loss_17": 3.0447206258773805, + "ce_loss_23": 2.9255369782447813, + "ce_loss_3": 4.149833381175995, + "ce_loss_6": 3.8436522603034975, + "epoch": 0.699, + "grad_norm": 1004.0, + "kl_loss_12": 1034.2772094726563, + "kl_loss_17": 279.0837905883789, + "kl_loss_3": 2585.775256347656, + "kl_loss_6": 1985.5286926269532, + "learning_rate": 0.00021126546082514663, + "loss": 1479.8258, + "step": 6990 + }, + { + "ce_loss_12": 3.3959160089492797, + "ce_loss_17": 3.066310930252075, + "ce_loss_23": 2.9466503143310545, + "ce_loss_3": 4.15298364162445, + "ce_loss_6": 3.8538186311721803, + "epoch": 0.7, + "grad_norm": 1056.0, + "kl_loss_12": 1044.9644104003905, + "kl_loss_17": 280.3296058654785, + "kl_loss_3": 2576.889367675781, + "kl_loss_6": 1987.39765625, + "learning_rate": 0.00020997154521440098, + "loss": 1473.4609, + "step": 7000 + }, + { + "ce_loss_12": 3.3464019894599915, + "ce_loss_17": 3.0117498874664306, + "ce_loss_23": 2.894484758377075, + "ce_loss_3": 4.125147533416748, + "ce_loss_6": 3.816951608657837, + "epoch": 0.701, + "grad_norm": 956.0, + "kl_loss_12": 1051.8970977783204, + "kl_loss_17": 278.6096450805664, + "kl_loss_3": 2619.6825317382813, + "kl_loss_6": 2012.2601745605468, + "learning_rate": 0.0002086805501869749, + "loss": 1484.8754, + "step": 7010 + }, + { + "ce_loss_12": 3.347380018234253, + "ce_loss_17": 2.995504927635193, + "ce_loss_23": 2.867575478553772, + "ce_loss_3": 4.148639333248139, + "ce_loss_6": 3.8330894231796266, + "epoch": 0.702, + "grad_norm": 1208.0, + "kl_loss_12": 1098.6046295166016, + "kl_loss_17": 294.9442642211914, + "kl_loss_3": 2722.4083984375, + "kl_loss_6": 2093.525006103516, + "learning_rate": 0.0002073924887431744, + "loss": 1525.892, + "step": 7020 + }, + { + "ce_loss_12": 3.336989998817444, + "ce_loss_17": 2.992552936077118, + "ce_loss_23": 2.874508500099182, + "ce_loss_3": 4.118464183807373, + "ce_loss_6": 3.824393606185913, + "epoch": 0.703, + "grad_norm": 1152.0, + "kl_loss_12": 1065.1422973632812, + "kl_loss_17": 284.58373031616213, + "kl_loss_3": 2654.9352783203126, + "kl_loss_6": 2063.013397216797, + "learning_rate": 0.00020610737385376348, + "loss": 1550.015, + "step": 7030 + }, + { + "ce_loss_12": 3.371172559261322, + "ce_loss_17": 3.0425814151763917, + "ce_loss_23": 2.9228823304176332, + "ce_loss_3": 4.125135231018066, + "ce_loss_6": 3.8306512594223023, + "epoch": 0.704, + "grad_norm": 1240.0, + "kl_loss_12": 1038.9294342041017, + "kl_loss_17": 283.898503112793, + "kl_loss_3": 2563.0609741210938, + "kl_loss_6": 1977.402685546875, + "learning_rate": 0.00020482521845983521, + "loss": 1507.833, + "step": 7040 + }, + { + "ce_loss_12": 3.3880650877952574, + "ce_loss_17": 3.0465856552124024, + "ce_loss_23": 2.917467784881592, + "ce_loss_3": 4.1669586300849915, + "ce_loss_6": 3.874874770641327, + "epoch": 0.705, + "grad_norm": 992.0, + "kl_loss_12": 1080.733804321289, + "kl_loss_17": 295.1897285461426, + "kl_loss_3": 2654.74990234375, + "kl_loss_6": 2066.9548767089846, + "learning_rate": 0.00020354603547267987, + "loss": 1535.6387, + "step": 7050 + }, + { + "ce_loss_12": 3.3857168674468996, + "ce_loss_17": 3.0348952293395994, + "ce_loss_23": 2.9074413180351257, + "ce_loss_3": 4.179165804386139, + "ce_loss_6": 3.8724570512771606, + "epoch": 0.706, + "grad_norm": 1152.0, + "kl_loss_12": 1086.6515533447266, + "kl_loss_17": 293.63476943969727, + "kl_loss_3": 2683.95380859375, + "kl_loss_6": 2069.7790588378907, + "learning_rate": 0.00020226983777365604, + "loss": 1563.2762, + "step": 7060 + }, + { + "ce_loss_12": 3.2863266706466674, + "ce_loss_17": 2.9430976510047913, + "ce_loss_23": 2.8257482767105104, + "ce_loss_3": 4.097273468971252, + "ce_loss_6": 3.79554363489151, + "epoch": 0.707, + "grad_norm": 1328.0, + "kl_loss_12": 1056.6638458251953, + "kl_loss_17": 279.99397430419924, + "kl_loss_3": 2690.1689331054686, + "kl_loss_6": 2093.4708618164063, + "learning_rate": 0.00020099663821406056, + "loss": 1516.3141, + "step": 7070 + }, + { + "ce_loss_12": 3.364639139175415, + "ce_loss_17": 3.0360579252243043, + "ce_loss_23": 2.914643609523773, + "ce_loss_3": 4.128908836841584, + "ce_loss_6": 3.832869017124176, + "epoch": 0.708, + "grad_norm": 1608.0, + "kl_loss_12": 1038.0157989501954, + "kl_loss_17": 278.99704666137694, + "kl_loss_3": 2584.8697265625, + "kl_loss_6": 1995.2526916503907, + "learning_rate": 0.00019972644961499853, + "loss": 1513.3617, + "step": 7080 + }, + { + "ce_loss_12": 3.3603764891624452, + "ce_loss_17": 3.01292085647583, + "ce_loss_23": 2.8856699466705322, + "ce_loss_3": 4.167693400382996, + "ce_loss_6": 3.852894461154938, + "epoch": 0.709, + "grad_norm": 916.0, + "kl_loss_12": 1086.388168334961, + "kl_loss_17": 295.21505279541014, + "kl_loss_3": 2708.575646972656, + "kl_loss_6": 2087.249658203125, + "learning_rate": 0.00019845928476725522, + "loss": 1522.1769, + "step": 7090 + }, + { + "ce_loss_12": 3.425241839885712, + "ce_loss_17": 3.0817944526672365, + "ce_loss_23": 2.9560640811920167, + "ce_loss_3": 4.197418344020844, + "ce_loss_6": 3.898772180080414, + "epoch": 0.71, + "grad_norm": 1272.0, + "kl_loss_12": 1077.3139404296876, + "kl_loss_17": 289.7147720336914, + "kl_loss_3": 2628.4650268554688, + "kl_loss_6": 2023.9876220703125, + "learning_rate": 0.00019719515643116677, + "loss": 1551.5217, + "step": 7100 + }, + { + "ce_loss_12": 3.347544848918915, + "ce_loss_17": 3.016925871372223, + "ce_loss_23": 2.8971351742744447, + "ce_loss_3": 4.12117292881012, + "ce_loss_6": 3.8190558791160583, + "epoch": 0.711, + "grad_norm": 1040.0, + "kl_loss_12": 1045.0277587890625, + "kl_loss_17": 283.63589630126955, + "kl_loss_3": 2617.9817504882812, + "kl_loss_6": 2007.0311584472656, + "learning_rate": 0.0001959340773364911, + "loss": 1514.3187, + "step": 7110 + }, + { + "ce_loss_12": 3.3740314126014708, + "ce_loss_17": 3.03394775390625, + "ce_loss_23": 2.911791443824768, + "ce_loss_3": 4.163285481929779, + "ce_loss_6": 3.8552634716033936, + "epoch": 0.712, + "grad_norm": 940.0, + "kl_loss_12": 1061.135043334961, + "kl_loss_17": 286.66785583496096, + "kl_loss_3": 2647.994091796875, + "kl_loss_6": 2041.245782470703, + "learning_rate": 0.0001946760601822809, + "loss": 1487.3021, + "step": 7120 + }, + { + "ce_loss_12": 3.414544379711151, + "ce_loss_17": 3.0850356340408327, + "ce_loss_23": 2.96371386051178, + "ce_loss_3": 4.186938786506653, + "ce_loss_6": 3.8713775396347048, + "epoch": 0.713, + "grad_norm": 1088.0, + "kl_loss_12": 1043.7035827636719, + "kl_loss_17": 281.5654457092285, + "kl_loss_3": 2607.409875488281, + "kl_loss_6": 1985.3408264160157, + "learning_rate": 0.00019342111763675512, + "loss": 1466.5036, + "step": 7130 + }, + { + "ce_loss_12": 3.4078126668930055, + "ce_loss_17": 3.082154428958893, + "ce_loss_23": 2.9556745529174804, + "ce_loss_3": 4.162794458866119, + "ce_loss_6": 3.864700162410736, + "epoch": 0.714, + "grad_norm": 980.0, + "kl_loss_12": 1041.3136169433594, + "kl_loss_17": 285.2400405883789, + "kl_loss_3": 2559.408154296875, + "kl_loss_6": 1978.7324157714843, + "learning_rate": 0.00019216926233717085, + "loss": 1469.756, + "step": 7140 + }, + { + "ce_loss_12": 3.324468493461609, + "ce_loss_17": 2.9787415981292726, + "ce_loss_23": 2.858863890171051, + "ce_loss_3": 4.164292752742767, + "ce_loss_6": 3.86098655462265, + "epoch": 0.715, + "grad_norm": 1328.0, + "kl_loss_12": 1068.1288116455078, + "kl_loss_17": 280.78184204101564, + "kl_loss_3": 2746.748254394531, + "kl_loss_6": 2145.286444091797, + "learning_rate": 0.00019092050688969737, + "loss": 1541.5396, + "step": 7150 + }, + { + "ce_loss_12": 3.3683008909225465, + "ce_loss_17": 3.042057716846466, + "ce_loss_23": 2.923891615867615, + "ce_loss_3": 4.135058331489563, + "ce_loss_6": 3.8425532221794128, + "epoch": 0.716, + "grad_norm": 1192.0, + "kl_loss_12": 1041.178155517578, + "kl_loss_17": 280.4268173217773, + "kl_loss_3": 2604.3657470703124, + "kl_loss_6": 2012.2357360839844, + "learning_rate": 0.00018967486386928817, + "loss": 1485.7764, + "step": 7160 + }, + { + "ce_loss_12": 3.2786318182945253, + "ce_loss_17": 2.9329994559288024, + "ce_loss_23": 2.8096609830856325, + "ce_loss_3": 4.083643531799316, + "ce_loss_6": 3.772729206085205, + "epoch": 0.717, + "grad_norm": 964.0, + "kl_loss_12": 1074.2320251464844, + "kl_loss_17": 287.8572372436523, + "kl_loss_3": 2699.5654418945314, + "kl_loss_6": 2074.710577392578, + "learning_rate": 0.00018843234581955443, + "loss": 1567.7979, + "step": 7170 + }, + { + "ce_loss_12": 3.285939705371857, + "ce_loss_17": 2.938218724727631, + "ce_loss_23": 2.8121312975883486, + "ce_loss_3": 4.080737113952637, + "ce_loss_6": 3.7792991995811462, + "epoch": 0.718, + "grad_norm": 1192.0, + "kl_loss_12": 1083.5157440185546, + "kl_loss_17": 289.6617492675781, + "kl_loss_3": 2676.434375, + "kl_loss_6": 2066.987451171875, + "learning_rate": 0.00018719296525263924, + "loss": 1525.8836, + "step": 7180 + }, + { + "ce_loss_12": 3.350790059566498, + "ce_loss_17": 3.0248377323150635, + "ce_loss_23": 2.9074713706970217, + "ce_loss_3": 4.104654264450073, + "ce_loss_6": 3.807921862602234, + "epoch": 0.719, + "grad_norm": 1464.0, + "kl_loss_12": 1024.6246978759766, + "kl_loss_17": 282.37331466674806, + "kl_loss_3": 2545.300085449219, + "kl_loss_6": 1961.3479431152343, + "learning_rate": 0.0001859567346490913, + "loss": 1467.8516, + "step": 7190 + }, + { + "ce_loss_12": 3.356687808036804, + "ce_loss_17": 3.021155858039856, + "ce_loss_23": 2.8927383184432984, + "ce_loss_3": 4.146722996234894, + "ce_loss_6": 3.845209336280823, + "epoch": 0.72, + "grad_norm": 1056.0, + "kl_loss_12": 1070.720932006836, + "kl_loss_17": 292.14399642944335, + "kl_loss_3": 2671.309716796875, + "kl_loss_6": 2063.0775756835938, + "learning_rate": 0.0001847236664577389, + "loss": 1501.4645, + "step": 7200 + }, + { + "ce_loss_12": 3.3498232841491697, + "ce_loss_17": 3.0289518237113953, + "ce_loss_23": 2.9100707530975343, + "ce_loss_3": 4.10675413608551, + "ce_loss_6": 3.8027848839759826, + "epoch": 0.721, + "grad_norm": 992.0, + "kl_loss_12": 1024.908447265625, + "kl_loss_17": 280.13957595825195, + "kl_loss_3": 2547.7384521484373, + "kl_loss_6": 1949.222998046875, + "learning_rate": 0.00018349377309556487, + "loss": 1461.0233, + "step": 7210 + }, + { + "ce_loss_12": 3.327030324935913, + "ce_loss_17": 2.983788788318634, + "ce_loss_23": 2.8595494508743284, + "ce_loss_3": 4.154975938796997, + "ce_loss_6": 3.8437779188156127, + "epoch": 0.722, + "grad_norm": 1336.0, + "kl_loss_12": 1097.6743377685548, + "kl_loss_17": 291.4554931640625, + "kl_loss_3": 2756.4244140625, + "kl_loss_6": 2143.5498779296877, + "learning_rate": 0.00018226706694758193, + "loss": 1543.6838, + "step": 7220 + }, + { + "ce_loss_12": 3.3858094096183775, + "ce_loss_17": 3.0542941093444824, + "ce_loss_23": 2.9364635705947877, + "ce_loss_3": 4.162681579589844, + "ce_loss_6": 3.8647823333740234, + "epoch": 0.723, + "grad_norm": 1000.0, + "kl_loss_12": 1060.308575439453, + "kl_loss_17": 282.78950729370115, + "kl_loss_3": 2636.62216796875, + "kl_loss_6": 2041.1465759277344, + "learning_rate": 0.0001810435603667075, + "loss": 1547.8955, + "step": 7230 + }, + { + "ce_loss_12": 3.253883945941925, + "ce_loss_17": 2.911012315750122, + "ce_loss_23": 2.788108789920807, + "ce_loss_3": 4.047508549690247, + "ce_loss_6": 3.7393125534057616, + "epoch": 0.724, + "grad_norm": 872.0, + "kl_loss_12": 1055.8319091796875, + "kl_loss_17": 280.26491088867186, + "kl_loss_3": 2649.6712646484375, + "kl_loss_6": 2038.39619140625, + "learning_rate": 0.0001798232656736389, + "loss": 1537.725, + "step": 7240 + }, + { + "ce_loss_12": 3.3978039264678954, + "ce_loss_17": 3.0690760612487793, + "ce_loss_23": 2.9458399534225466, + "ce_loss_3": 4.148194074630737, + "ce_loss_6": 3.857631707191467, + "epoch": 0.725, + "grad_norm": 1032.0, + "kl_loss_12": 1028.905044555664, + "kl_loss_17": 281.8951362609863, + "kl_loss_3": 2543.316760253906, + "kl_loss_6": 1961.9904541015626, + "learning_rate": 0.0001786061951567303, + "loss": 1484.9909, + "step": 7250 + }, + { + "ce_loss_12": 3.33192777633667, + "ce_loss_17": 2.991592597961426, + "ce_loss_23": 2.866880714893341, + "ce_loss_3": 4.11643146276474, + "ce_loss_6": 3.814874029159546, + "epoch": 0.726, + "grad_norm": 1072.0, + "kl_loss_12": 1066.6974670410157, + "kl_loss_17": 289.0210906982422, + "kl_loss_3": 2635.8052734375, + "kl_loss_6": 2039.5341430664062, + "learning_rate": 0.00017739236107186857, + "loss": 1521.1975, + "step": 7260 + }, + { + "ce_loss_12": 3.396472692489624, + "ce_loss_17": 3.0823714971542358, + "ce_loss_23": 2.963548684120178, + "ce_loss_3": 4.141474437713623, + "ce_loss_6": 3.8460238099098207, + "epoch": 0.727, + "grad_norm": 1104.0, + "kl_loss_12": 1017.0243896484375, + "kl_loss_17": 274.48282318115236, + "kl_loss_3": 2524.86728515625, + "kl_loss_6": 1935.4310913085938, + "learning_rate": 0.00017618177564234904, + "loss": 1473.3111, + "step": 7270 + }, + { + "ce_loss_12": 3.3675931453704835, + "ce_loss_17": 3.044634234905243, + "ce_loss_23": 2.9322256088256835, + "ce_loss_3": 4.122310245037079, + "ce_loss_6": 3.817670261859894, + "epoch": 0.728, + "grad_norm": 980.0, + "kl_loss_12": 1010.9489959716797, + "kl_loss_17": 269.3266403198242, + "kl_loss_3": 2518.7998168945314, + "kl_loss_6": 1925.0574768066406, + "learning_rate": 0.00017497445105875377, + "loss": 1465.2238, + "step": 7280 + }, + { + "ce_loss_12": 3.3209853768348694, + "ce_loss_17": 2.9679404616355898, + "ce_loss_23": 2.8431422114372253, + "ce_loss_3": 4.124039840698242, + "ce_loss_6": 3.8073510766029357, + "epoch": 0.729, + "grad_norm": 1144.0, + "kl_loss_12": 1086.3922210693358, + "kl_loss_17": 289.9881004333496, + "kl_loss_3": 2705.1694458007814, + "kl_loss_6": 2082.1453247070312, + "learning_rate": 0.000173770399478828, + "loss": 1522.562, + "step": 7290 + }, + { + "ce_loss_12": 3.230670762062073, + "ce_loss_17": 2.898967134952545, + "ce_loss_23": 2.7818917870521545, + "ce_loss_3": 4.020108902454377, + "ce_loss_6": 3.709551203250885, + "epoch": 0.73, + "grad_norm": 1064.0, + "kl_loss_12": 1040.022787475586, + "kl_loss_17": 277.5796829223633, + "kl_loss_3": 2642.2594360351563, + "kl_loss_6": 2024.7518676757813, + "learning_rate": 0.0001725696330273575, + "loss": 1532.9043, + "step": 7300 + }, + { + "ce_loss_12": 3.392570424079895, + "ce_loss_17": 3.0626121759414673, + "ce_loss_23": 2.943638336658478, + "ce_loss_3": 4.148163962364197, + "ce_loss_6": 3.8440015435218813, + "epoch": 0.731, + "grad_norm": 1264.0, + "kl_loss_12": 1028.6334869384766, + "kl_loss_17": 275.8746925354004, + "kl_loss_3": 2548.121789550781, + "kl_loss_6": 1954.8319152832032, + "learning_rate": 0.00017137216379604724, + "loss": 1459.7328, + "step": 7310 + }, + { + "ce_loss_12": 3.284358024597168, + "ce_loss_17": 2.9527658343315126, + "ce_loss_23": 2.829926073551178, + "ce_loss_3": 4.08096536397934, + "ce_loss_6": 3.7757596254348753, + "epoch": 0.732, + "grad_norm": 1192.0, + "kl_loss_12": 1043.1614227294922, + "kl_loss_17": 281.0181144714355, + "kl_loss_3": 2630.3141235351563, + "kl_loss_6": 2029.7836059570313, + "learning_rate": 0.00017017800384339925, + "loss": 1507.2828, + "step": 7320 + }, + { + "ce_loss_12": 3.26470342874527, + "ce_loss_17": 2.9141951203346252, + "ce_loss_23": 2.7884278655052186, + "ce_loss_3": 4.081666564941406, + "ce_loss_6": 3.7667925357818604, + "epoch": 0.733, + "grad_norm": 1000.0, + "kl_loss_12": 1087.1163970947266, + "kl_loss_17": 285.4456298828125, + "kl_loss_3": 2725.5192993164064, + "kl_loss_6": 2103.689501953125, + "learning_rate": 0.00016898716519459073, + "loss": 1497.8559, + "step": 7330 + }, + { + "ce_loss_12": 3.3825608849525453, + "ce_loss_17": 3.0331148505210876, + "ce_loss_23": 2.900582027435303, + "ce_loss_3": 4.195623028278351, + "ce_loss_6": 3.8816211581230164, + "epoch": 0.734, + "grad_norm": 960.0, + "kl_loss_12": 1088.0415496826172, + "kl_loss_17": 299.24062423706056, + "kl_loss_3": 2700.626806640625, + "kl_loss_6": 2086.225689697266, + "learning_rate": 0.00016779965984135375, + "loss": 1519.9778, + "step": 7340 + }, + { + "ce_loss_12": 3.2890339851379395, + "ce_loss_17": 2.952122926712036, + "ce_loss_23": 2.82895188331604, + "ce_loss_3": 4.083813881874084, + "ce_loss_6": 3.780845284461975, + "epoch": 0.735, + "grad_norm": 1192.0, + "kl_loss_12": 1042.695639038086, + "kl_loss_17": 277.04451141357424, + "kl_loss_3": 2625.741064453125, + "kl_loss_6": 2032.5084655761718, + "learning_rate": 0.00016661549974185424, + "loss": 1498.1021, + "step": 7350 + }, + { + "ce_loss_12": 3.323700475692749, + "ce_loss_17": 2.9858119606971742, + "ce_loss_23": 2.862323749065399, + "ce_loss_3": 4.097188651561737, + "ce_loss_6": 3.7945112943649293, + "epoch": 0.736, + "grad_norm": 1024.0, + "kl_loss_12": 1056.7614532470702, + "kl_loss_17": 287.83737258911134, + "kl_loss_3": 2622.837121582031, + "kl_loss_6": 2023.2848693847657, + "learning_rate": 0.00016543469682057105, + "loss": 1481.7643, + "step": 7360 + }, + { + "ce_loss_12": 3.3511133074760435, + "ce_loss_17": 3.00958833694458, + "ce_loss_23": 2.8835976123809814, + "ce_loss_3": 4.127915704250336, + "ce_loss_6": 3.8203643321990968, + "epoch": 0.737, + "grad_norm": 976.0, + "kl_loss_12": 1061.8063507080078, + "kl_loss_17": 289.0400161743164, + "kl_loss_3": 2634.0617919921874, + "kl_loss_6": 2024.7839965820312, + "learning_rate": 0.00016425726296817632, + "loss": 1493.3889, + "step": 7370 + }, + { + "ce_loss_12": 3.3494356870651245, + "ce_loss_17": 3.016697037220001, + "ce_loss_23": 2.901347589492798, + "ce_loss_3": 4.1259073138237, + "ce_loss_6": 3.823378086090088, + "epoch": 0.738, + "grad_norm": 996.0, + "kl_loss_12": 1033.489205932617, + "kl_loss_17": 279.1275939941406, + "kl_loss_3": 2588.6812133789062, + "kl_loss_6": 1985.3832580566407, + "learning_rate": 0.00016308321004141607, + "loss": 1489.5334, + "step": 7380 + }, + { + "ce_loss_12": 3.3241341948509215, + "ce_loss_17": 2.977702283859253, + "ce_loss_23": 2.8510108828544616, + "ce_loss_3": 4.11659916639328, + "ce_loss_6": 3.8045485615730286, + "epoch": 0.739, + "grad_norm": 1320.0, + "kl_loss_12": 1075.1243988037108, + "kl_loss_17": 294.54233703613284, + "kl_loss_3": 2660.0075805664064, + "kl_loss_6": 2044.3505432128907, + "learning_rate": 0.00016191254986299043, + "loss": 1490.9283, + "step": 7390 + }, + { + "ce_loss_12": 3.331904947757721, + "ce_loss_17": 3.0109179496765135, + "ce_loss_23": 2.8957800030708314, + "ce_loss_3": 4.101964116096497, + "ce_loss_6": 3.8015185356140138, + "epoch": 0.74, + "grad_norm": 936.0, + "kl_loss_12": 1030.34921875, + "kl_loss_17": 273.9654281616211, + "kl_loss_3": 2589.215710449219, + "kl_loss_6": 1992.3321166992187, + "learning_rate": 0.00016074529422143398, + "loss": 1507.4629, + "step": 7400 + }, + { + "ce_loss_12": 3.321945583820343, + "ce_loss_17": 2.9820314168930055, + "ce_loss_23": 2.858848738670349, + "ce_loss_3": 4.122950708866119, + "ce_loss_6": 3.816438043117523, + "epoch": 0.741, + "grad_norm": 1020.0, + "kl_loss_12": 1063.10849609375, + "kl_loss_17": 290.5937843322754, + "kl_loss_3": 2667.8557250976564, + "kl_loss_6": 2064.332342529297, + "learning_rate": 0.0001595814548709983, + "loss": 1533.617, + "step": 7410 + }, + { + "ce_loss_12": 3.3886276841163636, + "ce_loss_17": 3.0383246660232546, + "ce_loss_23": 2.9127967834472654, + "ce_loss_3": 4.178054928779602, + "ce_loss_6": 3.8697713732719423, + "epoch": 0.742, + "grad_norm": 1296.0, + "kl_loss_12": 1085.9440490722657, + "kl_loss_17": 295.9751800537109, + "kl_loss_3": 2689.67080078125, + "kl_loss_6": 2086.7424377441407, + "learning_rate": 0.00015842104353153285, + "loss": 1522.5532, + "step": 7420 + }, + { + "ce_loss_12": 3.3868244290351868, + "ce_loss_17": 3.0489043235778808, + "ce_loss_23": 2.925636887550354, + "ce_loss_3": 4.170488095283508, + "ce_loss_6": 3.8697841644287108, + "epoch": 0.743, + "grad_norm": 1020.0, + "kl_loss_12": 1065.0920440673829, + "kl_loss_17": 289.10176544189454, + "kl_loss_3": 2638.9310913085938, + "kl_loss_6": 2043.6178588867188, + "learning_rate": 0.0001572640718883667, + "loss": 1531.0913, + "step": 7430 + }, + { + "ce_loss_12": 3.3096341848373414, + "ce_loss_17": 2.987624776363373, + "ce_loss_23": 2.872477889060974, + "ce_loss_3": 4.082384061813355, + "ce_loss_6": 3.7801726818084718, + "epoch": 0.744, + "grad_norm": 964.0, + "kl_loss_12": 1032.089779663086, + "kl_loss_17": 277.3216491699219, + "kl_loss_3": 2569.769177246094, + "kl_loss_6": 1975.4082946777344, + "learning_rate": 0.0001561105515921915, + "loss": 1511.5431, + "step": 7440 + }, + { + "ce_loss_12": 3.2099051237106324, + "ce_loss_17": 2.857716774940491, + "ce_loss_23": 2.7396645665168764, + "ce_loss_3": 4.029573166370392, + "ce_loss_6": 3.706012415885925, + "epoch": 0.745, + "grad_norm": 1120.0, + "kl_loss_12": 1069.060369873047, + "kl_loss_17": 277.27220916748047, + "kl_loss_3": 2726.9617431640627, + "kl_loss_6": 2093.1573852539063, + "learning_rate": 0.0001549604942589441, + "loss": 1509.512, + "step": 7450 + }, + { + "ce_loss_12": 3.341689133644104, + "ce_loss_17": 3.0220096111297607, + "ce_loss_23": 2.9066612124443054, + "ce_loss_3": 4.086879503726959, + "ce_loss_6": 3.8007105112075807, + "epoch": 0.746, + "grad_norm": 896.0, + "kl_loss_12": 1009.4411987304687, + "kl_loss_17": 271.61903076171876, + "kl_loss_3": 2505.743408203125, + "kl_loss_6": 1934.8906188964843, + "learning_rate": 0.00015381391146968864, + "loss": 1461.8856, + "step": 7460 + }, + { + "ce_loss_12": 3.333513391017914, + "ce_loss_17": 2.9974658370018004, + "ce_loss_23": 2.879487907886505, + "ce_loss_3": 4.123572957515717, + "ce_loss_6": 3.821442413330078, + "epoch": 0.747, + "grad_norm": 1136.0, + "kl_loss_12": 1039.75849609375, + "kl_loss_17": 274.57335205078124, + "kl_loss_3": 2615.1775756835937, + "kl_loss_6": 2020.2796203613282, + "learning_rate": 0.00015267081477050133, + "loss": 1504.0042, + "step": 7470 + }, + { + "ce_loss_12": 3.427445709705353, + "ce_loss_17": 3.09232976436615, + "ce_loss_23": 2.965824806690216, + "ce_loss_3": 4.178172600269318, + "ce_loss_6": 3.883107364177704, + "epoch": 0.748, + "grad_norm": 968.0, + "kl_loss_12": 1057.313070678711, + "kl_loss_17": 290.85567474365234, + "kl_loss_3": 2577.955285644531, + "kl_loss_6": 1986.2766052246093, + "learning_rate": 0.00015153121567235335, + "loss": 1470.3311, + "step": 7480 + }, + { + "ce_loss_12": 3.3238179087638855, + "ce_loss_17": 2.9903075218200685, + "ce_loss_23": 2.874490833282471, + "ce_loss_3": 4.120837676525116, + "ce_loss_6": 3.8117267370223997, + "epoch": 0.749, + "grad_norm": 984.0, + "kl_loss_12": 1054.7883697509765, + "kl_loss_17": 282.8294624328613, + "kl_loss_3": 2665.5120239257812, + "kl_loss_6": 2052.0541809082033, + "learning_rate": 0.00015039512565099468, + "loss": 1472.3645, + "step": 7490 + }, + { + "ce_loss_12": 3.3772895336151123, + "ce_loss_17": 3.052878940105438, + "ce_loss_23": 2.937324583530426, + "ce_loss_3": 4.142999291419983, + "ce_loss_6": 3.8434554815292357, + "epoch": 0.75, + "grad_norm": 988.0, + "kl_loss_12": 1040.7686798095704, + "kl_loss_17": 278.93787689208983, + "kl_loss_3": 2590.99130859375, + "kl_loss_6": 1991.1614868164063, + "learning_rate": 0.00014926255614683932, + "loss": 1538.4063, + "step": 7500 + }, + { + "ce_loss_12": 3.318462574481964, + "ce_loss_17": 2.988450789451599, + "ce_loss_23": 2.8678772807121278, + "ce_loss_3": 4.096622204780578, + "ce_loss_6": 3.796446645259857, + "epoch": 0.751, + "grad_norm": 1072.0, + "kl_loss_12": 1038.6952239990235, + "kl_loss_17": 280.92377548217775, + "kl_loss_3": 2619.8683471679688, + "kl_loss_6": 2017.0190490722657, + "learning_rate": 0.0001481335185648498, + "loss": 1501.4311, + "step": 7510 + }, + { + "ce_loss_12": 3.347504699230194, + "ce_loss_17": 3.0147265672683714, + "ce_loss_23": 2.8962690949440004, + "ce_loss_3": 4.121276211738587, + "ce_loss_6": 3.8162226915359496, + "epoch": 0.752, + "grad_norm": 1056.0, + "kl_loss_12": 1048.723617553711, + "kl_loss_17": 281.2362464904785, + "kl_loss_3": 2618.826452636719, + "kl_loss_6": 2011.4213256835938, + "learning_rate": 0.0001470080242744218, + "loss": 1482.2265, + "step": 7520 + }, + { + "ce_loss_12": 3.34172899723053, + "ce_loss_17": 3.0092835426330566, + "ce_loss_23": 2.897358810901642, + "ce_loss_3": 4.1162315011024475, + "ce_loss_6": 3.8220934510231017, + "epoch": 0.753, + "grad_norm": 924.0, + "kl_loss_12": 1029.3202911376952, + "kl_loss_17": 273.0403007507324, + "kl_loss_3": 2603.049938964844, + "kl_loss_6": 2020.111151123047, + "learning_rate": 0.0001458860846092705, + "loss": 1497.3177, + "step": 7530 + }, + { + "ce_loss_12": 3.378802788257599, + "ce_loss_17": 3.055048716068268, + "ce_loss_23": 2.9335278511047362, + "ce_loss_3": 4.1248640537261965, + "ce_loss_6": 3.829790270328522, + "epoch": 0.754, + "grad_norm": 1176.0, + "kl_loss_12": 1033.308349609375, + "kl_loss_17": 281.56811904907227, + "kl_loss_3": 2530.413781738281, + "kl_loss_6": 1945.8234558105469, + "learning_rate": 0.00014476771086731566, + "loss": 1448.7382, + "step": 7540 + }, + { + "ce_loss_12": 3.4659211158752443, + "ce_loss_17": 3.1346755385398866, + "ce_loss_23": 3.0065076112747193, + "ce_loss_3": 4.226503646373748, + "ce_loss_6": 3.93116614818573, + "epoch": 0.755, + "grad_norm": 1128.0, + "kl_loss_12": 1052.2420623779296, + "kl_loss_17": 292.08596420288086, + "kl_loss_3": 2581.6936767578127, + "kl_loss_6": 1997.2755920410157, + "learning_rate": 0.00014365291431056872, + "loss": 1518.8756, + "step": 7550 + }, + { + "ce_loss_12": 3.3224751710891725, + "ce_loss_17": 2.9759204030036925, + "ce_loss_23": 2.8496909141540527, + "ce_loss_3": 4.109515285491943, + "ce_loss_6": 3.801195240020752, + "epoch": 0.756, + "grad_norm": 904.0, + "kl_loss_12": 1081.3564544677733, + "kl_loss_17": 294.1281547546387, + "kl_loss_3": 2679.63125, + "kl_loss_6": 2064.4482421875, + "learning_rate": 0.00014254170616501827, + "loss": 1514.05, + "step": 7560 + }, + { + "ce_loss_12": 3.2967641592025756, + "ce_loss_17": 2.930052900314331, + "ce_loss_23": 2.7977853178977967, + "ce_loss_3": 4.1107590913772585, + "ce_loss_6": 3.7909518480300903, + "epoch": 0.757, + "grad_norm": 1020.0, + "kl_loss_12": 1128.729232788086, + "kl_loss_17": 298.5667533874512, + "kl_loss_3": 2763.8598754882814, + "kl_loss_6": 2126.74951171875, + "learning_rate": 0.0001414340976205183, + "loss": 1567.2646, + "step": 7570 + }, + { + "ce_loss_12": 3.280392253398895, + "ce_loss_17": 2.9262807726860047, + "ce_loss_23": 2.807630729675293, + "ce_loss_3": 4.092207396030426, + "ce_loss_6": 3.7799305081367494, + "epoch": 0.758, + "grad_norm": 1336.0, + "kl_loss_12": 1065.8500915527343, + "kl_loss_17": 281.43665313720703, + "kl_loss_3": 2699.0353881835936, + "kl_loss_6": 2083.4188720703123, + "learning_rate": 0.00014033009983067452, + "loss": 1506.1648, + "step": 7580 + }, + { + "ce_loss_12": 3.394823133945465, + "ce_loss_17": 3.0780391693115234, + "ce_loss_23": 2.9608801007270813, + "ce_loss_3": 4.142879939079284, + "ce_loss_6": 3.8521243810653685, + "epoch": 0.759, + "grad_norm": 1056.0, + "kl_loss_12": 1015.8759643554688, + "kl_loss_17": 273.7043281555176, + "kl_loss_3": 2537.435986328125, + "kl_loss_6": 1953.083233642578, + "learning_rate": 0.00013922972391273224, + "loss": 1475.9166, + "step": 7590 + }, + { + "ce_loss_12": 3.409343111515045, + "ce_loss_17": 3.0831799149513244, + "ce_loss_23": 2.961395597457886, + "ce_loss_3": 4.194057762622833, + "ce_loss_6": 3.8991695284843444, + "epoch": 0.76, + "grad_norm": 1544.0, + "kl_loss_12": 1032.2313873291016, + "kl_loss_17": 281.07324600219727, + "kl_loss_3": 2602.2853759765626, + "kl_loss_6": 2026.8900756835938, + "learning_rate": 0.0001381329809474649, + "loss": 1492.928, + "step": 7600 + }, + { + "ce_loss_12": 3.3551380395889283, + "ce_loss_17": 3.0006386637687683, + "ce_loss_23": 2.871221125125885, + "ce_loss_3": 4.1550891399383545, + "ce_loss_6": 3.8454134941101072, + "epoch": 0.761, + "grad_norm": 1184.0, + "kl_loss_12": 1100.336163330078, + "kl_loss_17": 294.1468795776367, + "kl_loss_3": 2706.9808654785156, + "kl_loss_6": 2096.7616455078123, + "learning_rate": 0.0001370398819790621, + "loss": 1533.9725, + "step": 7610 + }, + { + "ce_loss_12": 3.4408915281295775, + "ce_loss_17": 3.116612160205841, + "ce_loss_23": 2.9966283202171327, + "ce_loss_3": 4.1998590469360355, + "ce_loss_6": 3.9062315940856935, + "epoch": 0.762, + "grad_norm": 1088.0, + "kl_loss_12": 1025.209161376953, + "kl_loss_17": 280.3527404785156, + "kl_loss_3": 2559.6067016601564, + "kl_loss_6": 1977.9031372070312, + "learning_rate": 0.00013595043801501794, + "loss": 1465.2758, + "step": 7620 + }, + { + "ce_loss_12": 3.2940911531448362, + "ce_loss_17": 2.9408129334449766, + "ce_loss_23": 2.8113720297813414, + "ce_loss_3": 4.135281682014465, + "ce_loss_6": 3.812657952308655, + "epoch": 0.763, + "grad_norm": 1280.0, + "kl_loss_12": 1105.1962646484376, + "kl_loss_17": 294.04699935913084, + "kl_loss_3": 2790.003955078125, + "kl_loss_6": 2151.4091979980467, + "learning_rate": 0.00013486466002602133, + "loss": 1539.4232, + "step": 7630 + }, + { + "ce_loss_12": 3.3600721836090086, + "ce_loss_17": 3.031892383098602, + "ce_loss_23": 2.9156318426132204, + "ce_loss_3": 4.1076583623886105, + "ce_loss_6": 3.8144946694374084, + "epoch": 0.764, + "grad_norm": 984.0, + "kl_loss_12": 1034.2011474609376, + "kl_loss_17": 277.2654144287109, + "kl_loss_3": 2565.522998046875, + "kl_loss_6": 1969.8522705078126, + "learning_rate": 0.00013378255894584462, + "loss": 1514.8982, + "step": 7640 + }, + { + "ce_loss_12": 3.3211570501327516, + "ce_loss_17": 2.979449915885925, + "ce_loss_23": 2.8517404437065124, + "ce_loss_3": 4.130272006988525, + "ce_loss_6": 3.8186199069023132, + "epoch": 0.765, + "grad_norm": 1144.0, + "kl_loss_12": 1071.8350494384765, + "kl_loss_17": 290.4520797729492, + "kl_loss_3": 2694.0873901367186, + "kl_loss_6": 2079.6744750976563, + "learning_rate": 0.0001327041456712334, + "loss": 1524.7129, + "step": 7650 + }, + { + "ce_loss_12": 3.353261423110962, + "ce_loss_17": 3.0194068431854246, + "ce_loss_23": 2.89362518787384, + "ce_loss_3": 4.14197006225586, + "ce_loss_6": 3.8328930497169496, + "epoch": 0.766, + "grad_norm": 1304.0, + "kl_loss_12": 1062.2486907958985, + "kl_loss_17": 287.6679061889648, + "kl_loss_3": 2641.623046875, + "kl_loss_6": 2034.7803100585938, + "learning_rate": 0.00013162943106179747, + "loss": 1519.9464, + "step": 7660 + }, + { + "ce_loss_12": 3.3228484749794007, + "ce_loss_17": 2.9905929923057557, + "ce_loss_23": 2.8718537211418154, + "ce_loss_3": 4.084289968013763, + "ce_loss_6": 3.7856014490127565, + "epoch": 0.767, + "grad_norm": 976.0, + "kl_loss_12": 1042.566226196289, + "kl_loss_17": 278.72392959594725, + "kl_loss_3": 2588.8190795898436, + "kl_loss_6": 1995.1788024902344, + "learning_rate": 0.00013055842593990132, + "loss": 1492.6238, + "step": 7670 + }, + { + "ce_loss_12": 3.2804736375808714, + "ce_loss_17": 2.943773651123047, + "ce_loss_23": 2.8240336775779724, + "ce_loss_3": 4.059450459480286, + "ce_loss_6": 3.747068452835083, + "epoch": 0.768, + "grad_norm": 992.0, + "kl_loss_12": 1031.083804321289, + "kl_loss_17": 278.0084037780762, + "kl_loss_3": 2579.5146728515624, + "kl_loss_6": 1981.6210876464843, + "learning_rate": 0.00012949114109055414, + "loss": 1509.4066, + "step": 7680 + }, + { + "ce_loss_12": 3.325702929496765, + "ce_loss_17": 2.985046076774597, + "ce_loss_23": 2.8618072509765624, + "ce_loss_3": 4.106758785247803, + "ce_loss_6": 3.806333029270172, + "epoch": 0.769, + "grad_norm": 1032.0, + "kl_loss_12": 1057.625018310547, + "kl_loss_17": 287.085506439209, + "kl_loss_3": 2639.7965576171873, + "kl_loss_6": 2039.7977416992187, + "learning_rate": 0.00012842758726130281, + "loss": 1521.547, + "step": 7690 + }, + { + "ce_loss_12": 3.3817939877510073, + "ce_loss_17": 3.0320732951164246, + "ce_loss_23": 2.9052831649780275, + "ce_loss_3": 4.1872913479805, + "ce_loss_6": 3.873805296421051, + "epoch": 0.77, + "grad_norm": 1048.0, + "kl_loss_12": 1073.136459350586, + "kl_loss_17": 289.16695938110354, + "kl_loss_3": 2685.2822509765624, + "kl_loss_6": 2067.8914428710937, + "learning_rate": 0.00012736777516212267, + "loss": 1502.7867, + "step": 7700 + }, + { + "ce_loss_12": 3.3680408596992493, + "ce_loss_17": 3.027958834171295, + "ce_loss_23": 2.9010915637016295, + "ce_loss_3": 4.150628173351288, + "ce_loss_6": 3.8480496644973754, + "epoch": 0.771, + "grad_norm": 1200.0, + "kl_loss_12": 1072.5316314697266, + "kl_loss_17": 292.39178771972655, + "kl_loss_3": 2632.5837646484374, + "kl_loss_6": 2040.2246887207032, + "learning_rate": 0.00012631171546530968, + "loss": 1486.1639, + "step": 7710 + }, + { + "ce_loss_12": 3.3786956071853638, + "ce_loss_17": 3.0343061208724977, + "ce_loss_23": 2.906955349445343, + "ce_loss_3": 4.14276841878891, + "ce_loss_6": 3.8478384733200075, + "epoch": 0.772, + "grad_norm": 1312.0, + "kl_loss_12": 1076.8528961181642, + "kl_loss_17": 291.54980926513673, + "kl_loss_3": 2633.0797729492188, + "kl_loss_6": 2043.3614379882813, + "learning_rate": 0.00012525941880537307, + "loss": 1524.2413, + "step": 7720 + }, + { + "ce_loss_12": 3.398939514160156, + "ce_loss_17": 3.0631134629249575, + "ce_loss_23": 2.942670977115631, + "ce_loss_3": 4.171182703971863, + "ce_loss_6": 3.86375207901001, + "epoch": 0.773, + "grad_norm": 1152.0, + "kl_loss_12": 1049.3153106689454, + "kl_loss_17": 280.57236404418944, + "kl_loss_3": 2612.383923339844, + "kl_loss_6": 1999.32705078125, + "learning_rate": 0.00012421089577892869, + "loss": 1495.2975, + "step": 7730 + }, + { + "ce_loss_12": 3.361254799365997, + "ce_loss_17": 3.017450821399689, + "ce_loss_23": 2.8957334995269775, + "ce_loss_3": 4.158638286590576, + "ce_loss_6": 3.8432446360588073, + "epoch": 0.774, + "grad_norm": 1012.0, + "kl_loss_12": 1077.780435180664, + "kl_loss_17": 284.0028190612793, + "kl_loss_3": 2675.105017089844, + "kl_loss_6": 2050.918780517578, + "learning_rate": 0.0001231661569445919, + "loss": 1516.5686, + "step": 7740 + }, + { + "ce_loss_12": 3.2308457136154174, + "ce_loss_17": 2.890022850036621, + "ce_loss_23": 2.7693028926849363, + "ce_loss_3": 4.028452098369598, + "ce_loss_6": 3.714640963077545, + "epoch": 0.775, + "grad_norm": 976.0, + "kl_loss_12": 1057.9297485351562, + "kl_loss_17": 286.21061248779296, + "kl_loss_3": 2668.4592529296874, + "kl_loss_6": 2049.3302795410154, + "learning_rate": 0.00012212521282287093, + "loss": 1540.696, + "step": 7750 + }, + { + "ce_loss_12": 3.3653706312179565, + "ce_loss_17": 3.0217082381248472, + "ce_loss_23": 2.8978288531303407, + "ce_loss_3": 4.125185596942901, + "ce_loss_6": 3.829509603977203, + "epoch": 0.776, + "grad_norm": 1064.0, + "kl_loss_12": 1065.6006988525392, + "kl_loss_17": 287.18448333740236, + "kl_loss_3": 2599.796826171875, + "kl_loss_6": 2009.5392761230469, + "learning_rate": 0.00012108807389606158, + "loss": 1524.3297, + "step": 7760 + }, + { + "ce_loss_12": 3.354585552215576, + "ce_loss_17": 3.024714708328247, + "ce_loss_23": 2.90908796787262, + "ce_loss_3": 4.133483743667602, + "ce_loss_6": 3.8332683205604554, + "epoch": 0.777, + "grad_norm": 1008.0, + "kl_loss_12": 1044.9940063476563, + "kl_loss_17": 277.4327453613281, + "kl_loss_3": 2609.7527587890627, + "kl_loss_6": 2017.0435913085937, + "learning_rate": 0.00012005475060814159, + "loss": 1490.6898, + "step": 7770 + }, + { + "ce_loss_12": 3.3110827565193177, + "ce_loss_17": 2.962108778953552, + "ce_loss_23": 2.842651915550232, + "ce_loss_3": 4.103477931022644, + "ce_loss_6": 3.805611324310303, + "epoch": 0.778, + "grad_norm": 1192.0, + "kl_loss_12": 1077.4869171142577, + "kl_loss_17": 285.6055603027344, + "kl_loss_3": 2677.163195800781, + "kl_loss_6": 2081.893310546875, + "learning_rate": 0.00011902525336466464, + "loss": 1515.6557, + "step": 7780 + }, + { + "ce_loss_12": 3.309223008155823, + "ce_loss_17": 2.957816791534424, + "ce_loss_23": 2.826165294647217, + "ce_loss_3": 4.1195541501045225, + "ce_loss_6": 3.808503520488739, + "epoch": 0.779, + "grad_norm": 916.0, + "kl_loss_12": 1092.967202758789, + "kl_loss_17": 293.0207763671875, + "kl_loss_3": 2728.019567871094, + "kl_loss_6": 2111.1691650390626, + "learning_rate": 0.00011799959253265668, + "loss": 1524.7547, + "step": 7790 + }, + { + "ce_loss_12": 3.3427767872810366, + "ce_loss_17": 3.005957913398743, + "ce_loss_23": 2.8837684988975525, + "ce_loss_3": 4.141287219524384, + "ce_loss_6": 3.8301820755004883, + "epoch": 0.78, + "grad_norm": 1004.0, + "kl_loss_12": 1063.709521484375, + "kl_loss_17": 287.99907302856445, + "kl_loss_3": 2670.677001953125, + "kl_loss_6": 2058.465539550781, + "learning_rate": 0.00011697777844051105, + "loss": 1512.9553, + "step": 7800 + }, + { + "ce_loss_12": 3.34201123714447, + "ce_loss_17": 3.0019322752952577, + "ce_loss_23": 2.875227212905884, + "ce_loss_3": 4.166788375377655, + "ce_loss_6": 3.8613690376281737, + "epoch": 0.781, + "grad_norm": 960.0, + "kl_loss_12": 1070.0397857666017, + "kl_loss_17": 291.10598678588866, + "kl_loss_3": 2726.98310546875, + "kl_loss_6": 2118.020654296875, + "learning_rate": 0.00011595982137788402, + "loss": 1532.2738, + "step": 7810 + }, + { + "ce_loss_12": 3.297192394733429, + "ce_loss_17": 2.974992501735687, + "ce_loss_23": 2.8550528407096865, + "ce_loss_3": 4.067392218112945, + "ce_loss_6": 3.7673613667488097, + "epoch": 0.782, + "grad_norm": 1496.0, + "kl_loss_12": 1030.9745788574219, + "kl_loss_17": 277.9621757507324, + "kl_loss_3": 2583.620556640625, + "kl_loss_6": 1984.6390380859375, + "learning_rate": 0.00011494573159559212, + "loss": 1494.2168, + "step": 7820 + }, + { + "ce_loss_12": 3.305099880695343, + "ce_loss_17": 2.9622052550315856, + "ce_loss_23": 2.839592659473419, + "ce_loss_3": 4.081231749057769, + "ce_loss_6": 3.786455988883972, + "epoch": 0.783, + "grad_norm": 1000.0, + "kl_loss_12": 1064.5903106689452, + "kl_loss_17": 289.8975769042969, + "kl_loss_3": 2638.9677490234376, + "kl_loss_6": 2053.720361328125, + "learning_rate": 0.00011393551930550828, + "loss": 1541.4899, + "step": 7830 + }, + { + "ce_loss_12": 3.4056118726730347, + "ce_loss_17": 3.078857898712158, + "ce_loss_23": 2.9554930329322815, + "ce_loss_3": 4.180590558052063, + "ce_loss_6": 3.875455212593079, + "epoch": 0.784, + "grad_norm": 1144.0, + "kl_loss_12": 1038.494302368164, + "kl_loss_17": 284.370703125, + "kl_loss_3": 2591.5422241210936, + "kl_loss_6": 1985.3228637695313, + "learning_rate": 0.00011292919468045875, + "loss": 1485.5578, + "step": 7840 + }, + { + "ce_loss_12": 3.3782568097114565, + "ce_loss_17": 3.0423492789268494, + "ce_loss_23": 2.9200599431991576, + "ce_loss_3": 4.162760663032532, + "ce_loss_6": 3.854156756401062, + "epoch": 0.785, + "grad_norm": 1096.0, + "kl_loss_12": 1049.5641845703126, + "kl_loss_17": 285.1061187744141, + "kl_loss_3": 2621.591650390625, + "kl_loss_6": 2012.33037109375, + "learning_rate": 0.00011192676785412154, + "loss": 1485.7816, + "step": 7850 + }, + { + "ce_loss_12": 3.3352497935295107, + "ce_loss_17": 2.9882391810417177, + "ce_loss_23": 2.8606590270996093, + "ce_loss_3": 4.143488276004791, + "ce_loss_6": 3.829927217960358, + "epoch": 0.786, + "grad_norm": 1248.0, + "kl_loss_12": 1070.5537658691405, + "kl_loss_17": 288.625439453125, + "kl_loss_3": 2696.9665283203126, + "kl_loss_6": 2076.2456298828124, + "learning_rate": 0.00011092824892092374, + "loss": 1522.1464, + "step": 7860 + }, + { + "ce_loss_12": 3.2732455015182493, + "ce_loss_17": 2.924168884754181, + "ce_loss_23": 2.8001551151275637, + "ce_loss_3": 4.080106019973755, + "ce_loss_6": 3.7724220633506773, + "epoch": 0.787, + "grad_norm": 1144.0, + "kl_loss_12": 1081.1447296142578, + "kl_loss_17": 282.0930236816406, + "kl_loss_3": 2704.5709228515625, + "kl_loss_6": 2092.5669921875, + "learning_rate": 0.0001099336479359398, + "loss": 1509.2197, + "step": 7870 + }, + { + "ce_loss_12": 3.3644967079162598, + "ce_loss_17": 3.0397923231124877, + "ce_loss_23": 2.9192729711532595, + "ce_loss_3": 4.132528138160706, + "ce_loss_6": 3.829935443401337, + "epoch": 0.788, + "grad_norm": 1064.0, + "kl_loss_12": 1039.1475769042968, + "kl_loss_17": 280.9910743713379, + "kl_loss_3": 2589.2807373046876, + "kl_loss_6": 1985.3061462402343, + "learning_rate": 0.00010894297491479043, + "loss": 1499.0057, + "step": 7880 + }, + { + "ce_loss_12": 3.3554076671600344, + "ce_loss_17": 3.021135950088501, + "ce_loss_23": 2.901791679859161, + "ce_loss_3": 4.140153288841248, + "ce_loss_6": 3.8371911525726317, + "epoch": 0.789, + "grad_norm": 1096.0, + "kl_loss_12": 1053.7805969238282, + "kl_loss_17": 281.94963760375975, + "kl_loss_3": 2625.732043457031, + "kl_loss_6": 2028.555682373047, + "learning_rate": 0.00010795623983354214, + "loss": 1492.3039, + "step": 7890 + }, + { + "ce_loss_12": 3.283448374271393, + "ce_loss_17": 2.9408915996551515, + "ce_loss_23": 2.8099642992019653, + "ce_loss_3": 4.072676730155945, + "ce_loss_6": 3.7639185547828675, + "epoch": 0.79, + "grad_norm": 1128.0, + "kl_loss_12": 1075.9224853515625, + "kl_loss_17": 296.4463150024414, + "kl_loss_3": 2683.721142578125, + "kl_loss_6": 2063.581365966797, + "learning_rate": 0.00010697345262860636, + "loss": 1513.3955, + "step": 7900 + }, + { + "ce_loss_12": 3.3812041759490965, + "ce_loss_17": 3.059047210216522, + "ce_loss_23": 2.9412465929985045, + "ce_loss_3": 4.161124658584595, + "ce_loss_6": 3.8484493136405944, + "epoch": 0.791, + "grad_norm": 1208.0, + "kl_loss_12": 1039.3051513671876, + "kl_loss_17": 281.3121871948242, + "kl_loss_3": 2603.398046875, + "kl_loss_6": 1995.908465576172, + "learning_rate": 0.00010599462319663906, + "loss": 1477.8745, + "step": 7910 + }, + { + "ce_loss_12": 3.3532153487205507, + "ce_loss_17": 3.0290448904037475, + "ce_loss_23": 2.9121936798095702, + "ce_loss_3": 4.107381510734558, + "ce_loss_6": 3.80359365940094, + "epoch": 0.792, + "grad_norm": 1120.0, + "kl_loss_12": 1027.3702545166016, + "kl_loss_17": 278.64846115112306, + "kl_loss_3": 2552.5097900390624, + "kl_loss_6": 1950.554150390625, + "learning_rate": 0.00010501976139444191, + "loss": 1462.499, + "step": 7920 + }, + { + "ce_loss_12": 3.379141628742218, + "ce_loss_17": 3.0500821232795716, + "ce_loss_23": 2.9308634400367737, + "ce_loss_3": 4.141736924648285, + "ce_loss_6": 3.85120313167572, + "epoch": 0.793, + "grad_norm": 1080.0, + "kl_loss_12": 1031.612240600586, + "kl_loss_17": 277.12482299804685, + "kl_loss_3": 2567.5821899414063, + "kl_loss_6": 1991.542919921875, + "learning_rate": 0.0001040488770388625, + "loss": 1497.1086, + "step": 7930 + }, + { + "ce_loss_12": 3.3468173861503603, + "ce_loss_17": 3.0071908950805666, + "ce_loss_23": 2.8908491253852846, + "ce_loss_3": 4.129796278476715, + "ce_loss_6": 3.827104127407074, + "epoch": 0.794, + "grad_norm": 1144.0, + "kl_loss_12": 1059.5995666503907, + "kl_loss_17": 281.8688194274902, + "kl_loss_3": 2650.525427246094, + "kl_loss_6": 2047.058837890625, + "learning_rate": 0.00010308197990669538, + "loss": 1497.3817, + "step": 7940 + }, + { + "ce_loss_12": 3.4481312155723574, + "ce_loss_17": 3.1162644386291505, + "ce_loss_23": 2.9944263815879824, + "ce_loss_3": 4.223077440261841, + "ce_loss_6": 3.91604346036911, + "epoch": 0.795, + "grad_norm": 1208.0, + "kl_loss_12": 1061.0736083984375, + "kl_loss_17": 288.20682525634766, + "kl_loss_3": 2627.3918579101564, + "kl_loss_6": 2017.8263366699218, + "learning_rate": 0.0001021190797345839, + "loss": 1486.8842, + "step": 7950 + }, + { + "ce_loss_12": 3.2265631914138795, + "ce_loss_17": 2.8669999957084658, + "ce_loss_23": 2.7341838479042053, + "ce_loss_3": 4.046359026432038, + "ce_loss_6": 3.724623703956604, + "epoch": 0.796, + "grad_norm": 1056.0, + "kl_loss_12": 1112.0119140625, + "kl_loss_17": 297.9104400634766, + "kl_loss_3": 2748.6629150390627, + "kl_loss_6": 2113.977429199219, + "learning_rate": 0.00010116018621892236, + "loss": 1528.2286, + "step": 7960 + }, + { + "ce_loss_12": 3.4046633005142213, + "ce_loss_17": 3.0650864958763124, + "ce_loss_23": 2.9383347630500793, + "ce_loss_3": 4.186934053897858, + "ce_loss_6": 3.8826000571250914, + "epoch": 0.797, + "grad_norm": 1024.0, + "kl_loss_12": 1086.240530395508, + "kl_loss_17": 299.4277030944824, + "kl_loss_3": 2658.250390625, + "kl_loss_6": 2064.7257446289063, + "learning_rate": 0.00010020530901575753, + "loss": 1481.2242, + "step": 7970 + }, + { + "ce_loss_12": 3.411669981479645, + "ce_loss_17": 3.077830362319946, + "ce_loss_23": 2.9566885828971863, + "ce_loss_3": 4.1832381844520565, + "ce_loss_6": 3.87999769449234, + "epoch": 0.798, + "grad_norm": 856.0, + "kl_loss_12": 1062.4724334716798, + "kl_loss_17": 287.22545928955077, + "kl_loss_3": 2635.3749145507813, + "kl_loss_6": 2028.0968811035157, + "learning_rate": 9.925445774069231e-05, + "loss": 1476.0003, + "step": 7980 + }, + { + "ce_loss_12": 3.3718875646591187, + "ce_loss_17": 3.032572162151337, + "ce_loss_23": 2.9057837605476378, + "ce_loss_3": 4.142505764961243, + "ce_loss_6": 3.843195044994354, + "epoch": 0.799, + "grad_norm": 916.0, + "kl_loss_12": 1043.4041320800782, + "kl_loss_17": 285.2698211669922, + "kl_loss_3": 2588.7929077148438, + "kl_loss_6": 1991.7592468261719, + "learning_rate": 9.830764196878872e-05, + "loss": 1457.0741, + "step": 7990 + }, + { + "ce_loss_12": 3.3205261707305906, + "ce_loss_17": 2.9875508427619932, + "ce_loss_23": 2.8695124864578245, + "ce_loss_3": 4.107181358337402, + "ce_loss_6": 3.799295961856842, + "epoch": 0.8, + "grad_norm": 1024.0, + "kl_loss_12": 1056.3955352783203, + "kl_loss_17": 278.84816665649413, + "kl_loss_3": 2666.8331298828125, + "kl_loss_6": 2049.211877441406, + "learning_rate": 9.736487123447069e-05, + "loss": 1501.2203, + "step": 8000 + }, + { + "ce_loss_12": 3.2938269853591917, + "ce_loss_17": 2.941650378704071, + "ce_loss_23": 2.8178758025169373, + "ce_loss_3": 4.113538646697998, + "ce_loss_6": 3.8093990683555603, + "epoch": 0.801, + "grad_norm": 936.0, + "kl_loss_12": 1090.296612548828, + "kl_loss_17": 287.49616012573244, + "kl_loss_3": 2765.3645263671874, + "kl_loss_6": 2153.184539794922, + "learning_rate": 9.642615503142926e-05, + "loss": 1547.2773, + "step": 8010 + }, + { + "ce_loss_12": 3.33432058095932, + "ce_loss_17": 2.996255397796631, + "ce_loss_23": 2.8753136098384857, + "ce_loss_3": 4.136964285373688, + "ce_loss_6": 3.8324382066726685, + "epoch": 0.802, + "grad_norm": 1408.0, + "kl_loss_12": 1055.0836669921875, + "kl_loss_17": 283.3932373046875, + "kl_loss_3": 2670.6496459960936, + "kl_loss_6": 2061.926062011719, + "learning_rate": 9.549150281252633e-05, + "loss": 1494.2036, + "step": 8020 + }, + { + "ce_loss_12": 3.3622690558433534, + "ce_loss_17": 3.023555374145508, + "ce_loss_23": 2.8994755148887634, + "ce_loss_3": 4.144161570072174, + "ce_loss_6": 3.842393147945404, + "epoch": 0.803, + "grad_norm": 1336.0, + "kl_loss_12": 1061.3064239501953, + "kl_loss_17": 287.6574089050293, + "kl_loss_3": 2649.08232421875, + "kl_loss_6": 2050.1166259765623, + "learning_rate": 9.4560923989699e-05, + "loss": 1525.1096, + "step": 8030 + }, + { + "ce_loss_12": 3.35483433008194, + "ce_loss_17": 3.019861912727356, + "ce_loss_23": 2.8949792861938475, + "ce_loss_3": 4.133480882644653, + "ce_loss_6": 3.8327292680740355, + "epoch": 0.804, + "grad_norm": 1176.0, + "kl_loss_12": 1054.61171875, + "kl_loss_17": 288.03228759765625, + "kl_loss_3": 2630.298449707031, + "kl_loss_6": 2022.528057861328, + "learning_rate": 9.363442793386607e-05, + "loss": 1524.4778, + "step": 8040 + }, + { + "ce_loss_12": 3.344035971164703, + "ce_loss_17": 2.9912624835968016, + "ce_loss_23": 2.8620379090309145, + "ce_loss_3": 4.154768025875091, + "ce_loss_6": 3.839399552345276, + "epoch": 0.805, + "grad_norm": 1192.0, + "kl_loss_12": 1087.6466186523437, + "kl_loss_17": 291.8809112548828, + "kl_loss_3": 2700.2523193359375, + "kl_loss_6": 2080.220086669922, + "learning_rate": 9.271202397483213e-05, + "loss": 1492.3334, + "step": 8050 + }, + { + "ce_loss_12": 3.3396072030067443, + "ce_loss_17": 3.0182195901870728, + "ce_loss_23": 2.898484635353088, + "ce_loss_3": 4.109511208534241, + "ce_loss_6": 3.807023787498474, + "epoch": 0.806, + "grad_norm": 1008.0, + "kl_loss_12": 1032.6840789794921, + "kl_loss_17": 277.4155899047852, + "kl_loss_3": 2581.357275390625, + "kl_loss_6": 1988.5574157714843, + "learning_rate": 9.179372140119524e-05, + "loss": 1505.3848, + "step": 8060 + }, + { + "ce_loss_12": 3.3005812406539916, + "ce_loss_17": 2.966699254512787, + "ce_loss_23": 2.8480862021446227, + "ce_loss_3": 4.078098475933075, + "ce_loss_6": 3.7792740941047667, + "epoch": 0.807, + "grad_norm": 1120.0, + "kl_loss_12": 1046.3826721191406, + "kl_loss_17": 281.6129913330078, + "kl_loss_3": 2604.1417602539063, + "kl_loss_6": 2018.3038818359375, + "learning_rate": 9.087952946025175e-05, + "loss": 1513.502, + "step": 8070 + }, + { + "ce_loss_12": 3.3790520071983337, + "ce_loss_17": 3.0631897926330565, + "ce_loss_23": 2.9467246413230894, + "ce_loss_3": 4.123933029174805, + "ce_loss_6": 3.831411051750183, + "epoch": 0.808, + "grad_norm": 1168.0, + "kl_loss_12": 1007.8353851318359, + "kl_loss_17": 273.74057693481444, + "kl_loss_3": 2523.880090332031, + "kl_loss_6": 1941.5342407226562, + "learning_rate": 8.996945735790446e-05, + "loss": 1490.9507, + "step": 8080 + }, + { + "ce_loss_12": 3.305538558959961, + "ce_loss_17": 2.9698716044425963, + "ce_loss_23": 2.853313446044922, + "ce_loss_3": 4.071282744407654, + "ce_loss_6": 3.770635890960693, + "epoch": 0.809, + "grad_norm": 1024.0, + "kl_loss_12": 1051.1560485839843, + "kl_loss_17": 278.94518890380857, + "kl_loss_3": 2616.7640014648437, + "kl_loss_6": 2014.3000610351562, + "learning_rate": 8.906351425856951e-05, + "loss": 1505.8312, + "step": 8090 + }, + { + "ce_loss_12": 3.3000259399414062, + "ce_loss_17": 2.955925440788269, + "ce_loss_23": 2.832998812198639, + "ce_loss_3": 4.092427730560303, + "ce_loss_6": 3.78441641330719, + "epoch": 0.81, + "grad_norm": 1000.0, + "kl_loss_12": 1072.9378540039063, + "kl_loss_17": 285.1971321105957, + "kl_loss_3": 2694.5508056640624, + "kl_loss_6": 2074.3757263183593, + "learning_rate": 8.816170928508365e-05, + "loss": 1531.4363, + "step": 8100 + }, + { + "ce_loss_12": 3.271656095981598, + "ce_loss_17": 2.9251946806907654, + "ce_loss_23": 2.801189124584198, + "ce_loss_3": 4.092127597332, + "ce_loss_6": 3.781342017650604, + "epoch": 0.811, + "grad_norm": 848.0, + "kl_loss_12": 1085.3644073486328, + "kl_loss_17": 287.12475357055666, + "kl_loss_3": 2738.927209472656, + "kl_loss_6": 2116.110144042969, + "learning_rate": 8.7264051518613e-05, + "loss": 1526.0476, + "step": 8110 + }, + { + "ce_loss_12": 3.3333471536636354, + "ce_loss_17": 3.0051140427589416, + "ce_loss_23": 2.8876123905181883, + "ce_loss_3": 4.099722731113434, + "ce_loss_6": 3.799349296092987, + "epoch": 0.812, + "grad_norm": 1240.0, + "kl_loss_12": 1029.06455078125, + "kl_loss_17": 275.13319396972656, + "kl_loss_3": 2574.5433715820313, + "kl_loss_6": 1977.8097045898437, + "learning_rate": 8.637054999856148e-05, + "loss": 1484.9934, + "step": 8120 + }, + { + "ce_loss_12": 3.340409016609192, + "ce_loss_17": 3.000631868839264, + "ce_loss_23": 2.872890567779541, + "ce_loss_3": 4.133741664886474, + "ce_loss_6": 3.820933222770691, + "epoch": 0.813, + "grad_norm": 1096.0, + "kl_loss_12": 1051.732928466797, + "kl_loss_17": 288.68270721435545, + "kl_loss_3": 2652.4419555664062, + "kl_loss_6": 2028.232440185547, + "learning_rate": 8.548121372247918e-05, + "loss": 1522.7607, + "step": 8130 + }, + { + "ce_loss_12": 3.38684196472168, + "ce_loss_17": 3.0672188639640807, + "ce_loss_23": 2.9500478506088257, + "ce_loss_3": 4.153112828731537, + "ce_loss_6": 3.8581340312957764, + "epoch": 0.814, + "grad_norm": 1056.0, + "kl_loss_12": 1031.712338256836, + "kl_loss_17": 279.44617614746096, + "kl_loss_3": 2592.561669921875, + "kl_loss_6": 2010.5955078125, + "learning_rate": 8.459605164597267e-05, + "loss": 1482.8579, + "step": 8140 + }, + { + "ce_loss_12": 3.291148364543915, + "ce_loss_17": 2.954191195964813, + "ce_loss_23": 2.839016282558441, + "ce_loss_3": 4.08526486158371, + "ce_loss_6": 3.7741797089576723, + "epoch": 0.815, + "grad_norm": 1168.0, + "kl_loss_12": 1047.2955047607422, + "kl_loss_17": 279.0842231750488, + "kl_loss_3": 2647.6363525390625, + "kl_loss_6": 2028.89501953125, + "learning_rate": 8.371507268261436e-05, + "loss": 1513.5294, + "step": 8150 + }, + { + "ce_loss_12": 3.359915280342102, + "ce_loss_17": 3.024343800544739, + "ce_loss_23": 2.8986566185951235, + "ce_loss_3": 4.138587176799774, + "ce_loss_6": 3.8319987058639526, + "epoch": 0.816, + "grad_norm": 980.0, + "kl_loss_12": 1049.5741241455078, + "kl_loss_17": 284.2603759765625, + "kl_loss_3": 2621.991748046875, + "kl_loss_6": 2012.3725524902343, + "learning_rate": 8.283828570385238e-05, + "loss": 1470.4635, + "step": 8160 + }, + { + "ce_loss_12": 3.3558297991752624, + "ce_loss_17": 3.0201908469200136, + "ce_loss_23": 2.9029954433441163, + "ce_loss_3": 4.142328202724457, + "ce_loss_6": 3.8327948093414306, + "epoch": 0.817, + "grad_norm": 952.0, + "kl_loss_12": 1052.2428649902345, + "kl_loss_17": 284.0148567199707, + "kl_loss_3": 2619.49140625, + "kl_loss_6": 2013.0584106445312, + "learning_rate": 8.196569953892202e-05, + "loss": 1499.2858, + "step": 8170 + }, + { + "ce_loss_12": 3.2958539009094237, + "ce_loss_17": 2.9550757884979246, + "ce_loss_23": 2.833250343799591, + "ce_loss_3": 4.080728983879089, + "ce_loss_6": 3.7691246271133423, + "epoch": 0.818, + "grad_norm": 1072.0, + "kl_loss_12": 1066.8218170166015, + "kl_loss_17": 286.78183517456057, + "kl_loss_3": 2626.90947265625, + "kl_loss_6": 2029.0765869140625, + "learning_rate": 8.109732297475635e-05, + "loss": 1492.1337, + "step": 8180 + }, + { + "ce_loss_12": 3.305049383640289, + "ce_loss_17": 2.9337514519691466, + "ce_loss_23": 2.8022228956222532, + "ce_loss_3": 4.12907395362854, + "ce_loss_6": 3.8098646640777587, + "epoch": 0.819, + "grad_norm": 1048.0, + "kl_loss_12": 1111.1580535888672, + "kl_loss_17": 295.3542709350586, + "kl_loss_3": 2750.808483886719, + "kl_loss_6": 2130.1484802246096, + "learning_rate": 8.023316475589754e-05, + "loss": 1544.1424, + "step": 8190 + }, + { + "ce_loss_12": 3.2693939805030823, + "ce_loss_17": 2.9022013545036316, + "ce_loss_23": 2.7660971879959106, + "ce_loss_3": 4.129703235626221, + "ce_loss_6": 3.8015416502952575, + "epoch": 0.82, + "grad_norm": 1464.0, + "kl_loss_12": 1125.4208984375, + "kl_loss_17": 307.3749938964844, + "kl_loss_3": 2841.758447265625, + "kl_loss_6": 2199.205700683594, + "learning_rate": 7.937323358440934e-05, + "loss": 1571.7113, + "step": 8200 + }, + { + "ce_loss_12": 3.3275758266448974, + "ce_loss_17": 3.0086806058883666, + "ce_loss_23": 2.8955679059028627, + "ce_loss_3": 4.080337619781494, + "ce_loss_6": 3.7854795098304748, + "epoch": 0.821, + "grad_norm": 1064.0, + "kl_loss_12": 1025.0358978271483, + "kl_loss_17": 275.9737190246582, + "kl_loss_3": 2537.994274902344, + "kl_loss_6": 1955.729638671875, + "learning_rate": 7.851753811978923e-05, + "loss": 1478.1028, + "step": 8210 + }, + { + "ce_loss_12": 3.361056423187256, + "ce_loss_17": 3.023615562915802, + "ce_loss_23": 2.899628448486328, + "ce_loss_3": 4.155423247814179, + "ce_loss_6": 3.857728958129883, + "epoch": 0.822, + "grad_norm": 1576.0, + "kl_loss_12": 1052.4446899414063, + "kl_loss_17": 286.32282104492185, + "kl_loss_3": 2650.950402832031, + "kl_loss_6": 2060.075762939453, + "learning_rate": 7.766608697888095e-05, + "loss": 1494.5828, + "step": 8220 + }, + { + "ce_loss_12": 3.3725324153900145, + "ce_loss_17": 3.033415162563324, + "ce_loss_23": 2.910620903968811, + "ce_loss_3": 4.164709949493409, + "ce_loss_6": 3.8546987891197206, + "epoch": 0.823, + "grad_norm": 1336.0, + "kl_loss_12": 1068.575357055664, + "kl_loss_17": 287.6273994445801, + "kl_loss_3": 2665.501123046875, + "kl_loss_6": 2060.4616455078126, + "learning_rate": 7.681888873578785e-05, + "loss": 1530.6572, + "step": 8230 + }, + { + "ce_loss_12": 3.3186384439468384, + "ce_loss_17": 2.964526128768921, + "ce_loss_23": 2.833543539047241, + "ce_loss_3": 4.121504092216492, + "ce_loss_6": 3.8054251790046694, + "epoch": 0.824, + "grad_norm": 1264.0, + "kl_loss_12": 1092.8309265136718, + "kl_loss_17": 294.7323173522949, + "kl_loss_3": 2704.2026489257814, + "kl_loss_6": 2085.3021728515623, + "learning_rate": 7.597595192178702e-05, + "loss": 1513.8944, + "step": 8240 + }, + { + "ce_loss_12": 3.3183600187301634, + "ce_loss_17": 2.966995060443878, + "ce_loss_23": 2.8392292618751527, + "ce_loss_3": 4.142496371269226, + "ce_loss_6": 3.8186073541641234, + "epoch": 0.825, + "grad_norm": 968.0, + "kl_loss_12": 1095.9381225585937, + "kl_loss_17": 291.71913833618163, + "kl_loss_3": 2756.4898681640625, + "kl_loss_6": 2119.0372924804688, + "learning_rate": 7.513728502524286e-05, + "loss": 1540.7781, + "step": 8250 + }, + { + "ce_loss_12": 3.2841219305992126, + "ce_loss_17": 2.955878269672394, + "ce_loss_23": 2.842096221446991, + "ce_loss_3": 4.0645402550697325, + "ce_loss_6": 3.76453902721405, + "epoch": 0.826, + "grad_norm": 840.0, + "kl_loss_12": 1022.670541381836, + "kl_loss_17": 271.8673599243164, + "kl_loss_3": 2579.4839721679687, + "kl_loss_6": 1995.1776184082032, + "learning_rate": 7.430289649152156e-05, + "loss": 1504.5896, + "step": 8260 + }, + { + "ce_loss_12": 3.2348284006118773, + "ce_loss_17": 2.876335549354553, + "ce_loss_23": 2.7529516220092773, + "ce_loss_3": 4.052449405193329, + "ce_loss_6": 3.739718866348267, + "epoch": 0.827, + "grad_norm": 1048.0, + "kl_loss_12": 1096.6505584716797, + "kl_loss_17": 289.0554008483887, + "kl_loss_3": 2760.6556762695313, + "kl_loss_6": 2129.0817260742188, + "learning_rate": 7.347279472290646e-05, + "loss": 1520.1464, + "step": 8270 + }, + { + "ce_loss_12": 3.350387394428253, + "ce_loss_17": 3.008963429927826, + "ce_loss_23": 2.88888795375824, + "ce_loss_3": 4.150562691688537, + "ce_loss_6": 3.839677166938782, + "epoch": 0.828, + "grad_norm": 1096.0, + "kl_loss_12": 1066.2569763183594, + "kl_loss_17": 284.117505645752, + "kl_loss_3": 2677.5921875, + "kl_loss_6": 2067.034716796875, + "learning_rate": 7.264698807851328e-05, + "loss": 1521.5023, + "step": 8280 + }, + { + "ce_loss_12": 3.3077462196350096, + "ce_loss_17": 2.9838837027549743, + "ce_loss_23": 2.8650826036930086, + "ce_loss_3": 4.088552296161652, + "ce_loss_6": 3.7806774139404298, + "epoch": 0.829, + "grad_norm": 1104.0, + "kl_loss_12": 1033.3657104492188, + "kl_loss_17": 279.8206199645996, + "kl_loss_3": 2593.388317871094, + "kl_loss_6": 1994.3953857421875, + "learning_rate": 7.182548487420554e-05, + "loss": 1490.687, + "step": 8290 + }, + { + "ce_loss_12": 3.3589147090911866, + "ce_loss_17": 3.0282956838607786, + "ce_loss_23": 2.9110039949417112, + "ce_loss_3": 4.137791180610657, + "ce_loss_6": 3.8347092270851135, + "epoch": 0.83, + "grad_norm": 1064.0, + "kl_loss_12": 1052.384588623047, + "kl_loss_17": 285.2430877685547, + "kl_loss_3": 2622.855578613281, + "kl_loss_6": 2030.3604919433594, + "learning_rate": 7.100829338251146e-05, + "loss": 1490.9061, + "step": 8300 + }, + { + "ce_loss_12": 3.3167926907539367, + "ce_loss_17": 2.970119321346283, + "ce_loss_23": 2.8363095045089723, + "ce_loss_3": 4.117564260959625, + "ce_loss_6": 3.808456206321716, + "epoch": 0.831, + "grad_norm": 1064.0, + "kl_loss_12": 1085.094046020508, + "kl_loss_17": 295.32879638671875, + "kl_loss_3": 2695.377307128906, + "kl_loss_6": 2076.8177795410156, + "learning_rate": 7.019542183254046e-05, + "loss": 1505.1945, + "step": 8310 + }, + { + "ce_loss_12": 3.3409668445587157, + "ce_loss_17": 3.0074267029762267, + "ce_loss_23": 2.8743056416511537, + "ce_loss_3": 4.114725065231323, + "ce_loss_6": 3.812414515018463, + "epoch": 0.832, + "grad_norm": 1304.0, + "kl_loss_12": 1072.4284088134766, + "kl_loss_17": 297.69879455566405, + "kl_loss_3": 2642.8615844726564, + "kl_loss_6": 2040.5767395019532, + "learning_rate": 6.938687840989971e-05, + "loss": 1498.9299, + "step": 8320 + }, + { + "ce_loss_12": 3.292984998226166, + "ce_loss_17": 2.9502809286117553, + "ce_loss_23": 2.8233898639678956, + "ce_loss_3": 4.075324869155883, + "ce_loss_6": 3.7751200199127197, + "epoch": 0.833, + "grad_norm": 1360.0, + "kl_loss_12": 1063.7236389160157, + "kl_loss_17": 292.75162200927736, + "kl_loss_3": 2629.8801879882812, + "kl_loss_6": 2039.0565246582032, + "learning_rate": 6.858267125661271e-05, + "loss": 1524.8443, + "step": 8330 + }, + { + "ce_loss_12": 3.34079647064209, + "ce_loss_17": 3.0030898213386537, + "ce_loss_23": 2.878040623664856, + "ce_loss_3": 4.137813937664032, + "ce_loss_6": 3.821487474441528, + "epoch": 0.834, + "grad_norm": 1232.0, + "kl_loss_12": 1057.2560974121093, + "kl_loss_17": 284.3835502624512, + "kl_loss_3": 2658.54013671875, + "kl_loss_6": 2034.9720642089844, + "learning_rate": 6.778280847103668e-05, + "loss": 1539.2473, + "step": 8340 + }, + { + "ce_loss_12": 3.356912088394165, + "ce_loss_17": 3.010690116882324, + "ce_loss_23": 2.88764488697052, + "ce_loss_3": 4.1245949268341064, + "ce_loss_6": 3.819116735458374, + "epoch": 0.835, + "grad_norm": 940.0, + "kl_loss_12": 1071.227099609375, + "kl_loss_17": 286.31872711181643, + "kl_loss_3": 2635.9383056640627, + "kl_loss_6": 2024.1812744140625, + "learning_rate": 6.698729810778065e-05, + "loss": 1500.8768, + "step": 8350 + }, + { + "ce_loss_12": 3.2641802549362184, + "ce_loss_17": 2.9241887450218202, + "ce_loss_23": 2.800455904006958, + "ce_loss_3": 4.078383791446686, + "ce_loss_6": 3.7547880887985228, + "epoch": 0.836, + "grad_norm": 1288.0, + "kl_loss_12": 1051.2379241943358, + "kl_loss_17": 278.18785400390624, + "kl_loss_3": 2676.7776123046874, + "kl_loss_6": 2041.9749389648437, + "learning_rate": 6.619614817762538e-05, + "loss": 1516.0025, + "step": 8360 + }, + { + "ce_loss_12": 3.2639304637908935, + "ce_loss_17": 2.907888662815094, + "ce_loss_23": 2.7812278270721436, + "ce_loss_3": 4.103890705108642, + "ce_loss_6": 3.7825687408447264, + "epoch": 0.837, + "grad_norm": 960.0, + "kl_loss_12": 1098.0486114501953, + "kl_loss_17": 289.22062225341796, + "kl_loss_3": 2781.6754760742188, + "kl_loss_6": 2142.4128173828126, + "learning_rate": 6.540936664744196e-05, + "loss": 1542.1457, + "step": 8370 + }, + { + "ce_loss_12": 3.373264420032501, + "ce_loss_17": 3.0289392709732055, + "ce_loss_23": 2.9055778861045836, + "ce_loss_3": 4.160033249855042, + "ce_loss_6": 3.859750437736511, + "epoch": 0.838, + "grad_norm": 884.0, + "kl_loss_12": 1066.5439331054688, + "kl_loss_17": 285.0770950317383, + "kl_loss_3": 2651.09287109375, + "kl_loss_6": 2055.8511474609377, + "learning_rate": 6.462696144011149e-05, + "loss": 1496.0637, + "step": 8380 + }, + { + "ce_loss_12": 3.3310264587402343, + "ce_loss_17": 2.991706573963165, + "ce_loss_23": 2.8699768662452696, + "ce_loss_3": 4.092860805988312, + "ce_loss_6": 3.7977572083473206, + "epoch": 0.839, + "grad_norm": 900.0, + "kl_loss_12": 1064.0408813476563, + "kl_loss_17": 290.1427459716797, + "kl_loss_3": 2610.905310058594, + "kl_loss_6": 2026.75966796875, + "learning_rate": 6.384894043444567e-05, + "loss": 1481.7455, + "step": 8390 + }, + { + "ce_loss_12": 3.3575948596000673, + "ce_loss_17": 3.0081547141075133, + "ce_loss_23": 2.8842729449272158, + "ce_loss_3": 4.1496823072433475, + "ce_loss_6": 3.8388915777206423, + "epoch": 0.84, + "grad_norm": 1256.0, + "kl_loss_12": 1065.0152313232422, + "kl_loss_17": 287.56776962280276, + "kl_loss_3": 2659.087756347656, + "kl_loss_6": 2051.4926208496095, + "learning_rate": 6.307531146510753e-05, + "loss": 1501.6816, + "step": 8400 + }, + { + "ce_loss_12": 3.3196689248085023, + "ce_loss_17": 2.9898666977882384, + "ce_loss_23": 2.8641279578208922, + "ce_loss_3": 4.084683573246002, + "ce_loss_6": 3.7808205842971803, + "epoch": 0.841, + "grad_norm": 984.0, + "kl_loss_12": 1036.9258270263672, + "kl_loss_17": 285.1266288757324, + "kl_loss_3": 2581.5780639648438, + "kl_loss_6": 1977.9350891113281, + "learning_rate": 6.230608232253226e-05, + "loss": 1472.0643, + "step": 8410 + }, + { + "ce_loss_12": 3.306604731082916, + "ce_loss_17": 2.9507232666015626, + "ce_loss_23": 2.8234981060028077, + "ce_loss_3": 4.131628477573395, + "ce_loss_6": 3.817171037197113, + "epoch": 0.842, + "grad_norm": 1104.0, + "kl_loss_12": 1087.105435180664, + "kl_loss_17": 289.3849044799805, + "kl_loss_3": 2721.5063720703124, + "kl_loss_6": 2106.2221618652343, + "learning_rate": 6.154126075284855e-05, + "loss": 1507.0301, + "step": 8420 + }, + { + "ce_loss_12": 3.3652244925498964, + "ce_loss_17": 3.0340421080589293, + "ce_loss_23": 2.9187233686447143, + "ce_loss_3": 4.130150222778321, + "ce_loss_6": 3.8256107330322267, + "epoch": 0.843, + "grad_norm": 1304.0, + "kl_loss_12": 1038.1420593261719, + "kl_loss_17": 276.6188430786133, + "kl_loss_3": 2575.8635498046874, + "kl_loss_6": 1973.4901733398438, + "learning_rate": 6.078085445780129e-05, + "loss": 1466.3076, + "step": 8430 + }, + { + "ce_loss_12": 3.3772589802742004, + "ce_loss_17": 3.0360066175460814, + "ce_loss_23": 2.9157355785369874, + "ce_loss_3": 4.171771287918091, + "ce_loss_6": 3.8688151121139525, + "epoch": 0.844, + "grad_norm": 968.0, + "kl_loss_12": 1063.171942138672, + "kl_loss_17": 283.9888038635254, + "kl_loss_3": 2673.1849975585938, + "kl_loss_6": 2074.389031982422, + "learning_rate": 6.002487109467347e-05, + "loss": 1489.1416, + "step": 8440 + }, + { + "ce_loss_12": 3.387488567829132, + "ce_loss_17": 3.050030696392059, + "ce_loss_23": 2.9240846395492555, + "ce_loss_3": 4.154106748104096, + "ce_loss_6": 3.851106035709381, + "epoch": 0.845, + "grad_norm": 1088.0, + "kl_loss_12": 1073.7497619628907, + "kl_loss_17": 292.6628059387207, + "kl_loss_3": 2634.9559326171875, + "kl_loss_6": 2031.09990234375, + "learning_rate": 5.927331827620902e-05, + "loss": 1492.8718, + "step": 8450 + }, + { + "ce_loss_12": 3.3563677072525024, + "ce_loss_17": 3.0263548135757445, + "ce_loss_23": 2.9048399567604064, + "ce_loss_3": 4.099981164932251, + "ce_loss_6": 3.8053762912750244, + "epoch": 0.846, + "grad_norm": 1072.0, + "kl_loss_12": 1037.7698760986327, + "kl_loss_17": 280.9472290039063, + "kl_loss_3": 2535.442004394531, + "kl_loss_6": 1953.500341796875, + "learning_rate": 5.852620357053651e-05, + "loss": 1478.8081, + "step": 8460 + }, + { + "ce_loss_12": 3.394399857521057, + "ce_loss_17": 3.067415416240692, + "ce_loss_23": 2.9515963554382325, + "ce_loss_3": 4.152213895320893, + "ce_loss_6": 3.857205033302307, + "epoch": 0.847, + "grad_norm": 1048.0, + "kl_loss_12": 1036.8290802001952, + "kl_loss_17": 277.5314636230469, + "kl_loss_3": 2560.67099609375, + "kl_loss_6": 1981.1140991210937, + "learning_rate": 5.778353450109286e-05, + "loss": 1482.0109, + "step": 8470 + }, + { + "ce_loss_12": 3.4367149829864503, + "ce_loss_17": 3.098689925670624, + "ce_loss_23": 2.9720311760902405, + "ce_loss_3": 4.227457308769226, + "ce_loss_6": 3.9236016511917113, + "epoch": 0.848, + "grad_norm": 1012.0, + "kl_loss_12": 1065.6336242675782, + "kl_loss_17": 288.5423187255859, + "kl_loss_3": 2651.488635253906, + "kl_loss_6": 2046.8683654785157, + "learning_rate": 5.7045318546547206e-05, + "loss": 1496.9844, + "step": 8480 + }, + { + "ce_loss_12": 3.3335933089256287, + "ce_loss_17": 2.9945171236991883, + "ce_loss_23": 2.8744940757751465, + "ce_loss_3": 4.126176583766937, + "ce_loss_6": 3.82481609582901, + "epoch": 0.849, + "grad_norm": 1328.0, + "kl_loss_12": 1058.5303649902344, + "kl_loss_17": 282.7712348937988, + "kl_loss_3": 2657.193566894531, + "kl_loss_6": 2055.022296142578, + "learning_rate": 5.631156314072605e-05, + "loss": 1494.8992, + "step": 8490 + }, + { + "ce_loss_12": 3.351343595981598, + "ce_loss_17": 3.025673341751099, + "ce_loss_23": 2.9065319418907167, + "ce_loss_3": 4.112990772724151, + "ce_loss_6": 3.8109803080558775, + "epoch": 0.85, + "grad_norm": 988.0, + "kl_loss_12": 1031.8004302978516, + "kl_loss_17": 282.2818008422852, + "kl_loss_3": 2580.2078125, + "kl_loss_6": 1985.7573120117188, + "learning_rate": 5.5582275672538315e-05, + "loss": 1471.2869, + "step": 8500 + }, + { + "ce_loss_12": 3.3047799110412597, + "ce_loss_17": 2.9414509415626524, + "ce_loss_23": 2.8145953178405763, + "ce_loss_3": 4.127846312522888, + "ce_loss_6": 3.8183951377868652, + "epoch": 0.851, + "grad_norm": 1120.0, + "kl_loss_12": 1114.0586822509765, + "kl_loss_17": 293.6786460876465, + "kl_loss_3": 2775.8820190429688, + "kl_loss_6": 2153.7723083496094, + "learning_rate": 5.4857463485900484e-05, + "loss": 1541.5318, + "step": 8510 + }, + { + "ce_loss_12": 3.3387150883674623, + "ce_loss_17": 3.0012975692749024, + "ce_loss_23": 2.8756859064102174, + "ce_loss_3": 4.103997337818146, + "ce_loss_6": 3.8028729438781737, + "epoch": 0.852, + "grad_norm": 1200.0, + "kl_loss_12": 1055.3402618408204, + "kl_loss_17": 281.1202537536621, + "kl_loss_3": 2608.085583496094, + "kl_loss_6": 2009.9069580078126, + "learning_rate": 5.413713387966329e-05, + "loss": 1492.8399, + "step": 8520 + }, + { + "ce_loss_12": 3.350775933265686, + "ce_loss_17": 3.0103574633598327, + "ce_loss_23": 2.8882513999938966, + "ce_loss_3": 4.137410664558411, + "ce_loss_6": 3.837936055660248, + "epoch": 0.853, + "grad_norm": 1360.0, + "kl_loss_12": 1058.6788848876954, + "kl_loss_17": 282.32394790649414, + "kl_loss_3": 2648.292834472656, + "kl_loss_6": 2052.374346923828, + "learning_rate": 5.34212941075381e-05, + "loss": 1502.7247, + "step": 8530 + }, + { + "ce_loss_12": 3.3418834924697878, + "ce_loss_17": 3.0199050664901734, + "ce_loss_23": 2.9050287127494814, + "ce_loss_3": 4.109955275058747, + "ce_loss_6": 3.8061795234680176, + "epoch": 0.854, + "grad_norm": 1240.0, + "kl_loss_12": 1008.4690399169922, + "kl_loss_17": 274.83275985717773, + "kl_loss_3": 2575.9906005859375, + "kl_loss_6": 1970.3159729003905, + "learning_rate": 5.270995137802315e-05, + "loss": 1473.9852, + "step": 8540 + }, + { + "ce_loss_12": 3.292176532745361, + "ce_loss_17": 2.96178058385849, + "ce_loss_23": 2.8435588598251345, + "ce_loss_3": 4.0793102264404295, + "ce_loss_6": 3.774105632305145, + "epoch": 0.855, + "grad_norm": 888.0, + "kl_loss_12": 1046.6531677246094, + "kl_loss_17": 279.5433349609375, + "kl_loss_3": 2637.3547973632812, + "kl_loss_6": 2033.5572509765625, + "learning_rate": 5.2003112854332125e-05, + "loss": 1511.5877, + "step": 8550 + }, + { + "ce_loss_12": 3.290618920326233, + "ce_loss_17": 2.9556390285491942, + "ce_loss_23": 2.842723000049591, + "ce_loss_3": 4.057822823524475, + "ce_loss_6": 3.7611496210098267, + "epoch": 0.856, + "grad_norm": 968.0, + "kl_loss_12": 1037.3160766601563, + "kl_loss_17": 272.4007507324219, + "kl_loss_3": 2604.338903808594, + "kl_loss_6": 2008.6603881835938, + "learning_rate": 5.130078565432089e-05, + "loss": 1464.4805, + "step": 8560 + }, + { + "ce_loss_12": 3.3430102467536926, + "ce_loss_17": 3.0166367650032044, + "ce_loss_23": 2.9037629127502442, + "ce_loss_3": 4.099278628826141, + "ce_loss_6": 3.806718420982361, + "epoch": 0.857, + "grad_norm": 1032.0, + "kl_loss_12": 1027.847314453125, + "kl_loss_17": 271.56771697998045, + "kl_loss_3": 2560.9104248046874, + "kl_loss_6": 1976.3161193847657, + "learning_rate": 5.060297685041659e-05, + "loss": 1452.2871, + "step": 8570 + }, + { + "ce_loss_12": 3.30127671957016, + "ce_loss_17": 2.9604085326194762, + "ce_loss_23": 2.8306247353553773, + "ce_loss_3": 4.100674736499786, + "ce_loss_6": 3.7946442484855654, + "epoch": 0.858, + "grad_norm": 1144.0, + "kl_loss_12": 1067.2820556640625, + "kl_loss_17": 292.8963912963867, + "kl_loss_3": 2683.639929199219, + "kl_loss_6": 2071.2172790527343, + "learning_rate": 4.99096934695461e-05, + "loss": 1524.3353, + "step": 8580 + }, + { + "ce_loss_12": 3.3519344210624693, + "ce_loss_17": 3.01632022857666, + "ce_loss_23": 2.896422302722931, + "ce_loss_3": 4.125617742538452, + "ce_loss_6": 3.8321552634239198, + "epoch": 0.859, + "grad_norm": 920.0, + "kl_loss_12": 1036.7369049072265, + "kl_loss_17": 277.91772079467773, + "kl_loss_3": 2593.2775024414063, + "kl_loss_6": 2007.981787109375, + "learning_rate": 4.922094249306558e-05, + "loss": 1469.8185, + "step": 8590 + }, + { + "ce_loss_12": 3.3896760821342466, + "ce_loss_17": 3.051762652397156, + "ce_loss_23": 2.927615690231323, + "ce_loss_3": 4.160779738426209, + "ce_loss_6": 3.8637610912322997, + "epoch": 0.86, + "grad_norm": 1136.0, + "kl_loss_12": 1061.075439453125, + "kl_loss_17": 288.38329010009767, + "kl_loss_3": 2624.784045410156, + "kl_loss_6": 2029.8816284179688, + "learning_rate": 4.853673085668947e-05, + "loss": 1474.2352, + "step": 8600 + }, + { + "ce_loss_12": 3.401752161979675, + "ce_loss_17": 3.05824556350708, + "ce_loss_23": 2.9379342675209044, + "ce_loss_3": 4.180785989761352, + "ce_loss_6": 3.8813895106315615, + "epoch": 0.861, + "grad_norm": 1104.0, + "kl_loss_12": 1066.3026214599608, + "kl_loss_17": 280.7836517333984, + "kl_loss_3": 2639.3175415039063, + "kl_loss_6": 2042.6779052734375, + "learning_rate": 4.78570654504214e-05, + "loss": 1505.4118, + "step": 8610 + }, + { + "ce_loss_12": 3.3483115792274476, + "ce_loss_17": 3.014393675327301, + "ce_loss_23": 2.892766237258911, + "ce_loss_3": 4.125595450401306, + "ce_loss_6": 3.8208566427230837, + "epoch": 0.862, + "grad_norm": 964.0, + "kl_loss_12": 1048.1334350585937, + "kl_loss_17": 280.49364166259767, + "kl_loss_3": 2634.9196533203126, + "kl_loss_6": 2022.7561828613282, + "learning_rate": 4.7181953118484556e-05, + "loss": 1496.3431, + "step": 8620 + }, + { + "ce_loss_12": 3.368147909641266, + "ce_loss_17": 3.0343467235565185, + "ce_loss_23": 2.9179905891418456, + "ce_loss_3": 4.131514084339142, + "ce_loss_6": 3.834225833415985, + "epoch": 0.863, + "grad_norm": 1136.0, + "kl_loss_12": 1044.3617065429687, + "kl_loss_17": 278.33188400268557, + "kl_loss_3": 2568.1588134765625, + "kl_loss_6": 1984.6631713867187, + "learning_rate": 4.651140065925269e-05, + "loss": 1506.2501, + "step": 8630 + }, + { + "ce_loss_12": 3.3051889181137084, + "ce_loss_17": 2.9729141354560853, + "ce_loss_23": 2.848560094833374, + "ce_loss_3": 4.0864248991012575, + "ce_loss_6": 3.7882192850112917, + "epoch": 0.864, + "grad_norm": 1152.0, + "kl_loss_12": 1045.531802368164, + "kl_loss_17": 283.5457000732422, + "kl_loss_3": 2622.3549560546876, + "kl_loss_6": 2034.6311279296874, + "learning_rate": 4.58454148251814e-05, + "loss": 1513.5437, + "step": 8640 + }, + { + "ce_loss_12": 3.328133535385132, + "ce_loss_17": 2.98028701543808, + "ce_loss_23": 2.856073999404907, + "ce_loss_3": 4.13912308216095, + "ce_loss_6": 3.828638470172882, + "epoch": 0.865, + "grad_norm": 1040.0, + "kl_loss_12": 1070.8560150146484, + "kl_loss_17": 281.1698196411133, + "kl_loss_3": 2693.3925903320314, + "kl_loss_6": 2070.704803466797, + "learning_rate": 4.518400232274078e-05, + "loss": 1507.2037, + "step": 8650 + }, + { + "ce_loss_12": 3.3494417786598207, + "ce_loss_17": 3.0160619020462036, + "ce_loss_23": 2.888498270511627, + "ce_loss_3": 4.126687633991241, + "ce_loss_6": 3.823049473762512, + "epoch": 0.866, + "grad_norm": 936.0, + "kl_loss_12": 1056.8975738525392, + "kl_loss_17": 288.9259567260742, + "kl_loss_3": 2613.984899902344, + "kl_loss_6": 2010.7535461425782, + "learning_rate": 4.452716981234745e-05, + "loss": 1461.7759, + "step": 8660 + }, + { + "ce_loss_12": 3.314436304569244, + "ce_loss_17": 2.9823337316513063, + "ce_loss_23": 2.864330458641052, + "ce_loss_3": 4.09505170583725, + "ce_loss_6": 3.78656405210495, + "epoch": 0.867, + "grad_norm": 984.0, + "kl_loss_12": 1041.6230499267579, + "kl_loss_17": 275.7464958190918, + "kl_loss_3": 2615.276159667969, + "kl_loss_6": 1999.0199157714844, + "learning_rate": 4.3874923908297335e-05, + "loss": 1460.9713, + "step": 8670 + }, + { + "ce_loss_12": 3.379078185558319, + "ce_loss_17": 3.034619677066803, + "ce_loss_23": 2.916077446937561, + "ce_loss_3": 4.167405915260315, + "ce_loss_6": 3.861358070373535, + "epoch": 0.868, + "grad_norm": 1320.0, + "kl_loss_12": 1071.1197692871094, + "kl_loss_17": 283.57053604125974, + "kl_loss_3": 2664.220324707031, + "kl_loss_6": 2069.8397705078123, + "learning_rate": 4.322727117869951e-05, + "loss": 1495.5064, + "step": 8680 + }, + { + "ce_loss_12": 3.38282253742218, + "ce_loss_17": 3.0405599594116213, + "ce_loss_23": 2.920660102367401, + "ce_loss_3": 4.173842930793763, + "ce_loss_6": 3.8629134774208067, + "epoch": 0.869, + "grad_norm": 1192.0, + "kl_loss_12": 1060.762957763672, + "kl_loss_17": 284.0414749145508, + "kl_loss_3": 2663.9480712890627, + "kl_loss_6": 2042.9997863769531, + "learning_rate": 4.2584218145409916e-05, + "loss": 1494.0689, + "step": 8690 + }, + { + "ce_loss_12": 3.392480957508087, + "ce_loss_17": 3.0751676082611086, + "ce_loss_23": 2.9581345558166503, + "ce_loss_3": 4.143105471134186, + "ce_loss_6": 3.8466254830360413, + "epoch": 0.87, + "grad_norm": 1248.0, + "kl_loss_12": 1023.1029602050781, + "kl_loss_17": 274.7999664306641, + "kl_loss_3": 2543.5913696289062, + "kl_loss_6": 1950.3782043457031, + "learning_rate": 4.194577128396521e-05, + "loss": 1453.3071, + "step": 8700 + }, + { + "ce_loss_12": 3.306734549999237, + "ce_loss_17": 2.971284508705139, + "ce_loss_23": 2.852522623538971, + "ce_loss_3": 4.084229207038879, + "ce_loss_6": 3.782787263393402, + "epoch": 0.871, + "grad_norm": 976.0, + "kl_loss_12": 1037.1832397460937, + "kl_loss_17": 276.3266456604004, + "kl_loss_3": 2611.7448486328126, + "kl_loss_6": 2011.7207641601562, + "learning_rate": 4.1311937023518264e-05, + "loss": 1502.6502, + "step": 8710 + }, + { + "ce_loss_12": 3.310939145088196, + "ce_loss_17": 2.9816059350967405, + "ce_loss_23": 2.8702484488487245, + "ce_loss_3": 4.1435425758361815, + "ce_loss_6": 3.833130669593811, + "epoch": 0.872, + "grad_norm": 968.0, + "kl_loss_12": 1030.4807495117188, + "kl_loss_17": 269.01084365844724, + "kl_loss_3": 2706.113879394531, + "kl_loss_6": 2098.7085510253905, + "learning_rate": 4.0682721746773344e-05, + "loss": 1497.9731, + "step": 8720 + }, + { + "ce_loss_12": 3.221154248714447, + "ce_loss_17": 2.868494462966919, + "ce_loss_23": 2.7455691933631896, + "ce_loss_3": 4.032175767421722, + "ce_loss_6": 3.7165164232254027, + "epoch": 0.873, + "grad_norm": 1008.0, + "kl_loss_12": 1079.8149719238281, + "kl_loss_17": 281.65873489379885, + "kl_loss_3": 2713.8420288085936, + "kl_loss_6": 2083.4974060058594, + "learning_rate": 4.0058131789920904e-05, + "loss": 1491.0676, + "step": 8730 + }, + { + "ce_loss_12": 3.3403757333755495, + "ce_loss_17": 3.00846403837204, + "ce_loss_23": 2.890472078323364, + "ce_loss_3": 4.107055354118347, + "ce_loss_6": 3.804210674762726, + "epoch": 0.874, + "grad_norm": 940.0, + "kl_loss_12": 1055.1203155517578, + "kl_loss_17": 273.9949554443359, + "kl_loss_3": 2620.6344970703126, + "kl_loss_6": 2015.965771484375, + "learning_rate": 3.9438173442575e-05, + "loss": 1535.924, + "step": 8740 + }, + { + "ce_loss_12": 3.3696256041526795, + "ce_loss_17": 3.0336912751197813, + "ce_loss_23": 2.913709211349487, + "ce_loss_3": 4.140432238578796, + "ce_loss_6": 3.8295344591140745, + "epoch": 0.875, + "grad_norm": 1064.0, + "kl_loss_12": 1040.0205291748048, + "kl_loss_17": 279.5762062072754, + "kl_loss_3": 2582.630651855469, + "kl_loss_6": 1982.918798828125, + "learning_rate": 3.882285294770937e-05, + "loss": 1477.0505, + "step": 8750 + }, + { + "ce_loss_12": 3.325684869289398, + "ce_loss_17": 2.9906407475471495, + "ce_loss_23": 2.872960686683655, + "ce_loss_3": 4.079718363285065, + "ce_loss_6": 3.7855276465415955, + "epoch": 0.876, + "grad_norm": 892.0, + "kl_loss_12": 1029.6630554199219, + "kl_loss_17": 275.07968978881837, + "kl_loss_3": 2572.990869140625, + "kl_loss_6": 1976.6401428222657, + "learning_rate": 3.821217650159453e-05, + "loss": 1497.0498, + "step": 8760 + }, + { + "ce_loss_12": 3.2449742555618286, + "ce_loss_17": 2.8872615814208986, + "ce_loss_23": 2.7629997372627257, + "ce_loss_3": 4.060509705543518, + "ce_loss_6": 3.7358765721321108, + "epoch": 0.877, + "grad_norm": 1016.0, + "kl_loss_12": 1094.626397705078, + "kl_loss_17": 287.1759391784668, + "kl_loss_3": 2719.6081665039064, + "kl_loss_6": 2092.2599670410154, + "learning_rate": 3.760615025373543e-05, + "loss": 1515.3271, + "step": 8770 + }, + { + "ce_loss_12": 3.392821705341339, + "ce_loss_17": 3.0452610850334167, + "ce_loss_23": 2.918588709831238, + "ce_loss_3": 4.189818334579468, + "ce_loss_6": 3.876577985286713, + "epoch": 0.878, + "grad_norm": 908.0, + "kl_loss_12": 1076.1950653076171, + "kl_loss_17": 291.6323013305664, + "kl_loss_3": 2688.911828613281, + "kl_loss_6": 2060.1080932617188, + "learning_rate": 3.700478030680987e-05, + "loss": 1532.3845, + "step": 8780 + }, + { + "ce_loss_12": 3.3736968278884887, + "ce_loss_17": 3.0386847376823427, + "ce_loss_23": 2.9209038853645324, + "ce_loss_3": 4.149859249591827, + "ce_loss_6": 3.8536314964294434, + "epoch": 0.879, + "grad_norm": 956.0, + "kl_loss_12": 1037.4135864257812, + "kl_loss_17": 275.3736259460449, + "kl_loss_3": 2587.9447998046876, + "kl_loss_6": 2007.5624267578125, + "learning_rate": 3.6408072716606344e-05, + "loss": 1480.7403, + "step": 8790 + }, + { + "ce_loss_12": 3.324361264705658, + "ce_loss_17": 2.973997724056244, + "ce_loss_23": 2.8534497022628784, + "ce_loss_3": 4.123316991329193, + "ce_loss_6": 3.815816307067871, + "epoch": 0.88, + "grad_norm": 1568.0, + "kl_loss_12": 1078.816357421875, + "kl_loss_17": 284.4879379272461, + "kl_loss_3": 2702.5100830078127, + "kl_loss_6": 2089.4851806640627, + "learning_rate": 3.5816033491963716e-05, + "loss": 1550.1549, + "step": 8800 + }, + { + "ce_loss_12": 3.1941446185112, + "ce_loss_17": 2.846931892633438, + "ce_loss_23": 2.7270780503749847, + "ce_loss_3": 4.030868780612946, + "ce_loss_6": 3.7038835287094116, + "epoch": 0.881, + "grad_norm": 1328.0, + "kl_loss_12": 1069.0301940917968, + "kl_loss_17": 280.2302864074707, + "kl_loss_3": 2742.7992553710938, + "kl_loss_6": 2101.755230712891, + "learning_rate": 3.522866859471047e-05, + "loss": 1520.2628, + "step": 8810 + }, + { + "ce_loss_12": 3.3734650373458863, + "ce_loss_17": 3.053483486175537, + "ce_loss_23": 2.9413180232048033, + "ce_loss_3": 4.11530956029892, + "ce_loss_6": 3.8248594403266907, + "epoch": 0.882, + "grad_norm": 1080.0, + "kl_loss_12": 1005.0269561767578, + "kl_loss_17": 268.26475830078124, + "kl_loss_3": 2496.3321533203125, + "kl_loss_6": 1920.503350830078, + "learning_rate": 3.46459839396045e-05, + "loss": 1460.0039, + "step": 8820 + }, + { + "ce_loss_12": 3.321186828613281, + "ce_loss_17": 2.975290596485138, + "ce_loss_23": 2.8529227137565614, + "ce_loss_3": 4.11351488828659, + "ce_loss_6": 3.8004450678825377, + "epoch": 0.883, + "grad_norm": 932.0, + "kl_loss_12": 1060.291082763672, + "kl_loss_17": 282.3150550842285, + "kl_loss_3": 2649.278857421875, + "kl_loss_6": 2030.3860168457031, + "learning_rate": 3.406798539427386e-05, + "loss": 1532.8661, + "step": 8830 + }, + { + "ce_loss_12": 3.3703830003738404, + "ce_loss_17": 3.037055957317352, + "ce_loss_23": 2.9183802366256715, + "ce_loss_3": 4.155374789237976, + "ce_loss_6": 3.849817657470703, + "epoch": 0.884, + "grad_norm": 1200.0, + "kl_loss_12": 1059.6869079589844, + "kl_loss_17": 279.1549919128418, + "kl_loss_3": 2642.8896484375, + "kl_loss_6": 2036.8998413085938, + "learning_rate": 3.349467877915746e-05, + "loss": 1503.1357, + "step": 8840 + }, + { + "ce_loss_12": 3.3473170399665833, + "ce_loss_17": 3.0034255027770995, + "ce_loss_23": 2.8850237488746644, + "ce_loss_3": 4.1362377285957335, + "ce_loss_6": 3.832878088951111, + "epoch": 0.885, + "grad_norm": 1384.0, + "kl_loss_12": 1077.8331237792968, + "kl_loss_17": 284.7055152893066, + "kl_loss_3": 2679.574426269531, + "kl_loss_6": 2071.843896484375, + "learning_rate": 3.292606986744667e-05, + "loss": 1544.5214, + "step": 8850 + }, + { + "ce_loss_12": 3.2922990322113037, + "ce_loss_17": 2.957613694667816, + "ce_loss_23": 2.8416423439979552, + "ce_loss_3": 4.080385613441467, + "ce_loss_6": 3.7736966252326964, + "epoch": 0.886, + "grad_norm": 1040.0, + "kl_loss_12": 1052.9543731689453, + "kl_loss_17": 274.8283012390137, + "kl_loss_3": 2630.809216308594, + "kl_loss_6": 2031.2622009277343, + "learning_rate": 3.23621643850267e-05, + "loss": 1492.9676, + "step": 8860 + }, + { + "ce_loss_12": 3.364612865447998, + "ce_loss_17": 3.027756369113922, + "ce_loss_23": 2.9094764232635497, + "ce_loss_3": 4.14275975227356, + "ce_loss_6": 3.8348689556121824, + "epoch": 0.887, + "grad_norm": 1056.0, + "kl_loss_12": 1063.6302703857423, + "kl_loss_17": 286.01028671264646, + "kl_loss_3": 2647.3041870117186, + "kl_loss_6": 2030.6291870117188, + "learning_rate": 3.180296801041971e-05, + "loss": 1483.5244, + "step": 8870 + }, + { + "ce_loss_12": 3.3761980295181275, + "ce_loss_17": 3.048375737667084, + "ce_loss_23": 2.932830846309662, + "ce_loss_3": 4.171050536632538, + "ce_loss_6": 3.8650680780410767, + "epoch": 0.888, + "grad_norm": 1032.0, + "kl_loss_12": 1039.5651916503907, + "kl_loss_17": 275.2946243286133, + "kl_loss_3": 2641.3774291992186, + "kl_loss_6": 2031.7088256835937, + "learning_rate": 3.124848637472688e-05, + "loss": 1466.1933, + "step": 8880 + }, + { + "ce_loss_12": 3.224430525302887, + "ce_loss_17": 2.885611081123352, + "ce_loss_23": 2.768226385116577, + "ce_loss_3": 4.015231025218964, + "ce_loss_6": 3.7112488865852358, + "epoch": 0.889, + "grad_norm": 1120.0, + "kl_loss_12": 1044.7563507080079, + "kl_loss_17": 271.87509994506837, + "kl_loss_3": 2641.4933227539063, + "kl_loss_6": 2030.9006042480469, + "learning_rate": 3.069872506157212e-05, + "loss": 1483.9009, + "step": 8890 + }, + { + "ce_loss_12": 3.314626932144165, + "ce_loss_17": 2.9772611618041993, + "ce_loss_23": 2.861090135574341, + "ce_loss_3": 4.088821363449097, + "ce_loss_6": 3.7842692494392396, + "epoch": 0.89, + "grad_norm": 1352.0, + "kl_loss_12": 1046.4569183349608, + "kl_loss_17": 275.9215934753418, + "kl_loss_3": 2622.6554321289063, + "kl_loss_6": 2010.8903015136718, + "learning_rate": 3.0153689607045842e-05, + "loss": 1481.274, + "step": 8900 + }, + { + "ce_loss_12": 3.261507201194763, + "ce_loss_17": 2.8940985679626463, + "ce_loss_23": 2.770907407999039, + "ce_loss_3": 4.088374328613281, + "ce_loss_6": 3.7789727330207823, + "epoch": 0.891, + "grad_norm": 1192.0, + "kl_loss_12": 1114.263607788086, + "kl_loss_17": 285.8632568359375, + "kl_loss_3": 2787.009423828125, + "kl_loss_6": 2162.252655029297, + "learning_rate": 2.9613385499648926e-05, + "loss": 1518.8045, + "step": 8910 + }, + { + "ce_loss_12": 3.2768518686294557, + "ce_loss_17": 2.9423407673835755, + "ce_loss_23": 2.8236806631088256, + "ce_loss_3": 4.042646539211273, + "ce_loss_6": 3.7386788964271545, + "epoch": 0.892, + "grad_norm": 904.0, + "kl_loss_12": 1038.9451446533203, + "kl_loss_17": 278.3830856323242, + "kl_loss_3": 2579.437939453125, + "kl_loss_6": 1984.3537109375, + "learning_rate": 2.9077818180237692e-05, + "loss": 1491.4748, + "step": 8920 + }, + { + "ce_loss_12": 3.31893972158432, + "ce_loss_17": 2.9778279304504394, + "ce_loss_23": 2.8547706723213198, + "ce_loss_3": 4.119372272491455, + "ce_loss_6": 3.811346185207367, + "epoch": 0.893, + "grad_norm": 1096.0, + "kl_loss_12": 1043.2595336914062, + "kl_loss_17": 278.8047241210937, + "kl_loss_3": 2641.9256103515627, + "kl_loss_6": 2022.1247985839843, + "learning_rate": 2.8546993041969172e-05, + "loss": 1490.9999, + "step": 8930 + }, + { + "ce_loss_12": 3.3433060526847838, + "ce_loss_17": 3.013638412952423, + "ce_loss_23": 2.8959841132164, + "ce_loss_3": 4.1012047290802, + "ce_loss_6": 3.7924872279167174, + "epoch": 0.894, + "grad_norm": 988.0, + "kl_loss_12": 1031.7579345703125, + "kl_loss_17": 273.2785339355469, + "kl_loss_3": 2578.4865966796874, + "kl_loss_6": 1958.0750671386718, + "learning_rate": 2.802091543024671e-05, + "loss": 1487.326, + "step": 8940 + }, + { + "ce_loss_12": 3.3517987966537475, + "ce_loss_17": 3.0131351947784424, + "ce_loss_23": 2.890279245376587, + "ce_loss_3": 4.135260903835297, + "ce_loss_6": 3.8338452577590942, + "epoch": 0.895, + "grad_norm": 1056.0, + "kl_loss_12": 1064.8489837646484, + "kl_loss_17": 281.3643424987793, + "kl_loss_3": 2666.0608520507812, + "kl_loss_6": 2056.2918701171875, + "learning_rate": 2.7499590642665774e-05, + "loss": 1530.04, + "step": 8950 + }, + { + "ce_loss_12": 3.3488635540008547, + "ce_loss_17": 3.025653636455536, + "ce_loss_23": 2.9073669075965882, + "ce_loss_3": 4.125187158584595, + "ce_loss_6": 3.82365905046463, + "epoch": 0.896, + "grad_norm": 1048.0, + "kl_loss_12": 1037.3114379882813, + "kl_loss_17": 279.98156356811523, + "kl_loss_3": 2598.100732421875, + "kl_loss_6": 1999.0735656738282, + "learning_rate": 2.6983023928961405e-05, + "loss": 1473.7261, + "step": 8960 + }, + { + "ce_loss_12": 3.3273321270942686, + "ce_loss_17": 2.992566633224487, + "ce_loss_23": 2.8700740814208983, + "ce_loss_3": 4.105877768993378, + "ce_loss_6": 3.8104355692863465, + "epoch": 0.897, + "grad_norm": 1440.0, + "kl_loss_12": 1041.3552825927734, + "kl_loss_17": 278.02837600708006, + "kl_loss_3": 2598.092138671875, + "kl_loss_6": 2015.8913024902345, + "learning_rate": 2.6471220490954628e-05, + "loss": 1505.1469, + "step": 8970 + }, + { + "ce_loss_12": 3.3092846512794494, + "ce_loss_17": 2.985722553730011, + "ce_loss_23": 2.875733423233032, + "ce_loss_3": 4.094814789295197, + "ce_loss_6": 3.7927043557167055, + "epoch": 0.898, + "grad_norm": 1160.0, + "kl_loss_12": 1027.3684204101562, + "kl_loss_17": 274.019766998291, + "kl_loss_3": 2601.745983886719, + "kl_loss_6": 2008.9629211425781, + "learning_rate": 2.596418548250029e-05, + "loss": 1486.4215, + "step": 8980 + }, + { + "ce_loss_12": 3.3536044120788575, + "ce_loss_17": 3.0218282103538514, + "ce_loss_23": 2.9010983228683473, + "ce_loss_3": 4.125899636745453, + "ce_loss_6": 3.829993963241577, + "epoch": 0.899, + "grad_norm": 1048.0, + "kl_loss_12": 1056.1575775146484, + "kl_loss_17": 281.9281745910645, + "kl_loss_3": 2629.8561401367188, + "kl_loss_6": 2032.8089904785156, + "learning_rate": 2.5461924009435368e-05, + "loss": 1478.6717, + "step": 8990 + }, + { + "ce_loss_12": 3.3432854652404784, + "ce_loss_17": 3.010783517360687, + "ce_loss_23": 2.8903890252113342, + "ce_loss_3": 4.118599140644074, + "ce_loss_6": 3.810559618473053, + "epoch": 0.9, + "grad_norm": 1072.0, + "kl_loss_12": 1052.9543823242188, + "kl_loss_17": 283.1170166015625, + "kl_loss_3": 2605.7826538085938, + "kl_loss_6": 1999.7805297851562, + "learning_rate": 2.4964441129527336e-05, + "loss": 1508.3487, + "step": 9000 + }, + { + "ce_loss_12": 3.333498179912567, + "ce_loss_17": 3.0122481107711794, + "ce_loss_23": 2.899858772754669, + "ce_loss_3": 4.097084081172943, + "ce_loss_6": 3.7975186944007873, + "epoch": 0.901, + "grad_norm": 2160.0, + "kl_loss_12": 1020.2845001220703, + "kl_loss_17": 271.0269744873047, + "kl_loss_3": 2563.192272949219, + "kl_loss_6": 1966.9169921875, + "learning_rate": 2.4471741852423235e-05, + "loss": 1465.4293, + "step": 9010 + }, + { + "ce_loss_12": 3.399704563617706, + "ce_loss_17": 3.0641690731048583, + "ce_loss_23": 2.943408727645874, + "ce_loss_3": 4.163009667396546, + "ce_loss_6": 3.8632459163665773, + "epoch": 0.902, + "grad_norm": 1176.0, + "kl_loss_12": 1035.918508911133, + "kl_loss_17": 277.75927505493166, + "kl_loss_3": 2568.3744995117186, + "kl_loss_6": 1979.5784423828125, + "learning_rate": 2.3983831139599287e-05, + "loss": 1475.4694, + "step": 9020 + }, + { + "ce_loss_12": 3.3126293659210204, + "ce_loss_17": 2.981877088546753, + "ce_loss_23": 2.8633044242858885, + "ce_loss_3": 4.085747599601746, + "ce_loss_6": 3.7825194239616393, + "epoch": 0.903, + "grad_norm": 1064.0, + "kl_loss_12": 1017.3647033691407, + "kl_loss_17": 272.69321899414064, + "kl_loss_3": 2586.8809204101562, + "kl_loss_6": 1981.6273864746095, + "learning_rate": 2.3500713904311022e-05, + "loss": 1444.9332, + "step": 9030 + }, + { + "ce_loss_12": 3.3346133708953856, + "ce_loss_17": 3.0174012422561645, + "ce_loss_23": 2.904342031478882, + "ce_loss_3": 4.085952639579773, + "ce_loss_6": 3.79553884267807, + "epoch": 0.904, + "grad_norm": 1144.0, + "kl_loss_12": 1006.2362091064454, + "kl_loss_17": 267.2675193786621, + "kl_loss_3": 2516.152868652344, + "kl_loss_6": 1946.0141967773438, + "learning_rate": 2.3022395011543685e-05, + "loss": 1443.6189, + "step": 9040 + }, + { + "ce_loss_12": 3.385925018787384, + "ce_loss_17": 3.0416790723800657, + "ce_loss_23": 2.9204747676849365, + "ce_loss_3": 4.156314277648926, + "ce_loss_6": 3.848258936405182, + "epoch": 0.905, + "grad_norm": 1448.0, + "kl_loss_12": 1069.7081512451173, + "kl_loss_17": 286.5039176940918, + "kl_loss_3": 2623.0983764648436, + "kl_loss_6": 2016.3146728515626, + "learning_rate": 2.2548879277963063e-05, + "loss": 1520.4957, + "step": 9050 + }, + { + "ce_loss_12": 3.2931102752685546, + "ce_loss_17": 2.968023347854614, + "ce_loss_23": 2.8518067359924317, + "ce_loss_3": 4.063058459758759, + "ce_loss_6": 3.7594056487083436, + "epoch": 0.906, + "grad_norm": 1088.0, + "kl_loss_12": 1022.5076141357422, + "kl_loss_17": 273.32446670532227, + "kl_loss_3": 2567.251330566406, + "kl_loss_6": 1970.2335693359375, + "learning_rate": 2.208017147186736e-05, + "loss": 1434.3865, + "step": 9060 + }, + { + "ce_loss_12": 3.2922698259353638, + "ce_loss_17": 2.9559255719184874, + "ce_loss_23": 2.837369775772095, + "ce_loss_3": 4.067186653614044, + "ce_loss_6": 3.764479637145996, + "epoch": 0.907, + "grad_norm": 1088.0, + "kl_loss_12": 1038.8773620605468, + "kl_loss_17": 273.9896865844727, + "kl_loss_3": 2609.279541015625, + "kl_loss_6": 2013.705078125, + "learning_rate": 2.1616276313139227e-05, + "loss": 1471.0474, + "step": 9070 + }, + { + "ce_loss_12": 3.3338735938072204, + "ce_loss_17": 3.0027177572250365, + "ce_loss_23": 2.8810632467269897, + "ce_loss_3": 4.112732636928558, + "ce_loss_6": 3.8151703476905823, + "epoch": 0.908, + "grad_norm": 1032.0, + "kl_loss_12": 1036.070867919922, + "kl_loss_17": 277.0100456237793, + "kl_loss_3": 2601.8918701171874, + "kl_loss_6": 2009.253973388672, + "learning_rate": 2.1157198473197415e-05, + "loss": 1500.7545, + "step": 9080 + }, + { + "ce_loss_12": 3.3962167143821715, + "ce_loss_17": 3.0618329763412477, + "ce_loss_23": 2.937837529182434, + "ce_loss_3": 4.186863827705383, + "ce_loss_6": 3.875001060962677, + "epoch": 0.909, + "grad_norm": 1032.0, + "kl_loss_12": 1066.4001190185547, + "kl_loss_17": 285.7182487487793, + "kl_loss_3": 2643.5581665039062, + "kl_loss_6": 2037.5399230957032, + "learning_rate": 2.0702942574950812e-05, + "loss": 1497.7828, + "step": 9090 + }, + { + "ce_loss_12": 3.3390152215957642, + "ce_loss_17": 2.999873125553131, + "ce_loss_23": 2.8761911392211914, + "ce_loss_3": 4.124549388885498, + "ce_loss_6": 3.8125956773757936, + "epoch": 0.91, + "grad_norm": 1048.0, + "kl_loss_12": 1069.9019256591796, + "kl_loss_17": 287.98951568603513, + "kl_loss_3": 2650.0345458984375, + "kl_loss_6": 2025.361865234375, + "learning_rate": 2.025351319275137e-05, + "loss": 1498.6699, + "step": 9100 + }, + { + "ce_loss_12": 3.4453240752220156, + "ce_loss_17": 3.1048492908477785, + "ce_loss_23": 2.9836212515830995, + "ce_loss_3": 4.217355704307556, + "ce_loss_6": 3.9146528840065002, + "epoch": 0.911, + "grad_norm": 1104.0, + "kl_loss_12": 1077.796401977539, + "kl_loss_17": 285.53475799560545, + "kl_loss_3": 2642.6656860351563, + "kl_loss_6": 2044.46884765625, + "learning_rate": 1.9808914852347816e-05, + "loss": 1532.4327, + "step": 9110 + }, + { + "ce_loss_12": 3.2942785143852236, + "ce_loss_17": 2.9586670756340028, + "ce_loss_23": 2.833610546588898, + "ce_loss_3": 4.07194048166275, + "ce_loss_6": 3.7556120276451113, + "epoch": 0.912, + "grad_norm": 1032.0, + "kl_loss_12": 1044.191128540039, + "kl_loss_17": 278.99835510253905, + "kl_loss_3": 2605.4442626953123, + "kl_loss_6": 1986.3443481445313, + "learning_rate": 1.9369152030840554e-05, + "loss": 1475.4129, + "step": 9120 + }, + { + "ce_loss_12": 3.3684799432754517, + "ce_loss_17": 3.038332152366638, + "ce_loss_23": 2.9220212578773497, + "ce_loss_3": 4.150288593769074, + "ce_loss_6": 3.8487510085105896, + "epoch": 0.913, + "grad_norm": 1368.0, + "kl_loss_12": 1052.0964294433593, + "kl_loss_17": 275.7938705444336, + "kl_loss_3": 2654.1997680664062, + "kl_loss_6": 2046.3094360351563, + "learning_rate": 1.893422915663645e-05, + "loss": 1500.7535, + "step": 9130 + }, + { + "ce_loss_12": 3.2739500761032105, + "ce_loss_17": 2.925761067867279, + "ce_loss_23": 2.8034397840499876, + "ce_loss_3": 4.088464558124542, + "ce_loss_6": 3.7661290884017946, + "epoch": 0.914, + "grad_norm": 868.0, + "kl_loss_12": 1069.2874298095703, + "kl_loss_17": 282.2047752380371, + "kl_loss_3": 2702.7495727539062, + "kl_loss_6": 2066.0608825683594, + "learning_rate": 1.850415060940386e-05, + "loss": 1517.72, + "step": 9140 + }, + { + "ce_loss_12": 3.3597048044204714, + "ce_loss_17": 3.0365005016326903, + "ce_loss_23": 2.921056258678436, + "ce_loss_3": 4.113396620750427, + "ce_loss_6": 3.813002622127533, + "epoch": 0.915, + "grad_norm": 1064.0, + "kl_loss_12": 1032.023666381836, + "kl_loss_17": 275.6730438232422, + "kl_loss_3": 2575.0119384765626, + "kl_loss_6": 1964.8316711425782, + "learning_rate": 1.8078920720028978e-05, + "loss": 1474.0727, + "step": 9150 + }, + { + "ce_loss_12": 3.283889102935791, + "ce_loss_17": 2.9600313901901245, + "ce_loss_23": 2.849053978919983, + "ce_loss_3": 4.043579959869385, + "ce_loss_6": 3.7437153458595276, + "epoch": 0.916, + "grad_norm": 1400.0, + "kl_loss_12": 1023.5445556640625, + "kl_loss_17": 270.17430953979493, + "kl_loss_3": 2535.034033203125, + "kl_loss_6": 1954.6587158203124, + "learning_rate": 1.765854377057219e-05, + "loss": 1481.7438, + "step": 9160 + }, + { + "ce_loss_12": 3.261327934265137, + "ce_loss_17": 2.9379109382629394, + "ce_loss_23": 2.826120972633362, + "ce_loss_3": 4.033465266227722, + "ce_loss_6": 3.731200802326202, + "epoch": 0.917, + "grad_norm": 1024.0, + "kl_loss_12": 1006.3924865722656, + "kl_loss_17": 265.4219459533691, + "kl_loss_3": 2555.5319702148436, + "kl_loss_6": 1970.5117919921875, + "learning_rate": 1.724302399422456e-05, + "loss": 1468.8849, + "step": 9170 + }, + { + "ce_loss_12": 3.259600067138672, + "ce_loss_17": 2.918641984462738, + "ce_loss_23": 2.7934481501579285, + "ce_loss_3": 4.037407219409943, + "ce_loss_6": 3.722501480579376, + "epoch": 0.918, + "grad_norm": 1024.0, + "kl_loss_12": 1066.1141998291016, + "kl_loss_17": 286.471826171875, + "kl_loss_3": 2632.22890625, + "kl_loss_6": 2012.1058410644532, + "learning_rate": 1.683236557526574e-05, + "loss": 1494.4383, + "step": 9180 + }, + { + "ce_loss_12": 3.332295763492584, + "ce_loss_17": 3.0142446875572206, + "ce_loss_23": 2.9020984530448914, + "ce_loss_3": 4.076275789737702, + "ce_loss_6": 3.778683769702911, + "epoch": 0.919, + "grad_norm": 832.0, + "kl_loss_12": 1006.3529113769531, + "kl_loss_17": 267.5891944885254, + "kl_loss_3": 2506.306994628906, + "kl_loss_6": 1916.475634765625, + "learning_rate": 1.6426572649021475e-05, + "loss": 1464.321, + "step": 9190 + }, + { + "ce_loss_12": 3.355605161190033, + "ce_loss_17": 3.045579528808594, + "ce_loss_23": 2.931298625469208, + "ce_loss_3": 4.095176577568054, + "ce_loss_6": 3.79943608045578, + "epoch": 0.92, + "grad_norm": 1352.0, + "kl_loss_12": 1008.678677368164, + "kl_loss_17": 274.654532623291, + "kl_loss_3": 2516.2755615234373, + "kl_loss_6": 1922.5859741210938, + "learning_rate": 1.6025649301821876e-05, + "loss": 1456.7072, + "step": 9200 + }, + { + "ce_loss_12": 3.3600135803222657, + "ce_loss_17": 3.036742627620697, + "ce_loss_23": 2.914152812957764, + "ce_loss_3": 4.0989128947258, + "ce_loss_6": 3.8089195847511292, + "epoch": 0.921, + "grad_norm": 1040.0, + "kl_loss_12": 1034.201220703125, + "kl_loss_17": 278.173348236084, + "kl_loss_3": 2540.3933715820312, + "kl_loss_6": 1959.813055419922, + "learning_rate": 1.5629599570960716e-05, + "loss": 1449.2359, + "step": 9210 + }, + { + "ce_loss_12": 3.281749439239502, + "ce_loss_17": 2.9552075147628782, + "ce_loss_23": 2.8368995785713196, + "ce_loss_3": 4.073426759243011, + "ce_loss_6": 3.761858856678009, + "epoch": 0.922, + "grad_norm": 864.0, + "kl_loss_12": 1043.070327758789, + "kl_loss_17": 276.25808944702146, + "kl_loss_3": 2643.6991577148438, + "kl_loss_6": 2027.4319091796874, + "learning_rate": 1.5238427444654367e-05, + "loss": 1485.8428, + "step": 9220 + }, + { + "ce_loss_12": 3.3239726305007933, + "ce_loss_17": 2.9993454456329345, + "ce_loss_23": 2.880155599117279, + "ce_loss_3": 4.090979540348053, + "ce_loss_6": 3.7864125967025757, + "epoch": 0.923, + "grad_norm": 812.0, + "kl_loss_12": 1024.8006652832032, + "kl_loss_17": 274.18183517456055, + "kl_loss_3": 2569.9059326171873, + "kl_loss_6": 1968.6023559570312, + "learning_rate": 1.4852136862001764e-05, + "loss": 1467.6075, + "step": 9230 + }, + { + "ce_loss_12": 3.2993701934814452, + "ce_loss_17": 2.9703409552574156, + "ce_loss_23": 2.854680836200714, + "ce_loss_3": 4.047952282428741, + "ce_loss_6": 3.7495328783988953, + "epoch": 0.924, + "grad_norm": 1088.0, + "kl_loss_12": 1021.3720611572265, + "kl_loss_17": 271.0319076538086, + "kl_loss_3": 2543.8160400390625, + "kl_loss_6": 1948.3985900878906, + "learning_rate": 1.4470731712944884e-05, + "loss": 1473.6795, + "step": 9240 + }, + { + "ce_loss_12": 3.330318236351013, + "ce_loss_17": 2.995251107215881, + "ce_loss_23": 2.874153757095337, + "ce_loss_3": 4.114708054065704, + "ce_loss_6": 3.7896088123321534, + "epoch": 0.925, + "grad_norm": 1312.0, + "kl_loss_12": 1042.4645599365235, + "kl_loss_17": 280.9706100463867, + "kl_loss_3": 2619.0177490234373, + "kl_loss_6": 1987.1992736816405, + "learning_rate": 1.4094215838229174e-05, + "loss": 1503.7229, + "step": 9250 + }, + { + "ce_loss_12": 3.3083136796951296, + "ce_loss_17": 2.9710094690322877, + "ce_loss_23": 2.851603698730469, + "ce_loss_3": 4.09444340467453, + "ce_loss_6": 3.784080910682678, + "epoch": 0.926, + "grad_norm": 1072.0, + "kl_loss_12": 1049.132028198242, + "kl_loss_17": 276.60772094726565, + "kl_loss_3": 2644.8538818359375, + "kl_loss_6": 2025.5753662109375, + "learning_rate": 1.372259302936546e-05, + "loss": 1540.0123, + "step": 9260 + }, + { + "ce_loss_12": 3.401785659790039, + "ce_loss_17": 3.0654093503952025, + "ce_loss_23": 2.944291520118713, + "ce_loss_3": 4.174069476127625, + "ce_loss_6": 3.8684807658195495, + "epoch": 0.927, + "grad_norm": 1200.0, + "kl_loss_12": 1050.0497802734376, + "kl_loss_17": 288.2478630065918, + "kl_loss_3": 2608.3745239257814, + "kl_loss_6": 2001.8755798339844, + "learning_rate": 1.3355867028591206e-05, + "loss": 1474.5029, + "step": 9270 + }, + { + "ce_loss_12": 3.2946610689163207, + "ce_loss_17": 2.9762078881263734, + "ce_loss_23": 2.857401442527771, + "ce_loss_3": 4.049395573139191, + "ce_loss_6": 3.7476461291313172, + "epoch": 0.928, + "grad_norm": 1272.0, + "kl_loss_12": 1026.7928558349608, + "kl_loss_17": 271.5105369567871, + "kl_loss_3": 2554.0462646484375, + "kl_loss_6": 1948.730401611328, + "learning_rate": 1.2994041528833267e-05, + "loss": 1462.0882, + "step": 9280 + }, + { + "ce_loss_12": 3.3038975596427917, + "ce_loss_17": 2.975873041152954, + "ce_loss_23": 2.8577345252037047, + "ce_loss_3": 4.068048667907715, + "ce_loss_6": 3.7713038444519045, + "epoch": 0.929, + "grad_norm": 1248.0, + "kl_loss_12": 1031.1834899902344, + "kl_loss_17": 270.2506500244141, + "kl_loss_3": 2588.7155151367188, + "kl_loss_6": 1989.1662902832031, + "learning_rate": 1.2637120173670358e-05, + "loss": 1462.8256, + "step": 9290 + }, + { + "ce_loss_12": 3.329246151447296, + "ce_loss_17": 2.9927667379379272, + "ce_loss_23": 2.871297037601471, + "ce_loss_3": 4.111742997169495, + "ce_loss_6": 3.8050437331199647, + "epoch": 0.93, + "grad_norm": 1480.0, + "kl_loss_12": 1047.7047393798828, + "kl_loss_17": 280.56040649414064, + "kl_loss_3": 2612.6916259765626, + "kl_loss_6": 2005.778125, + "learning_rate": 1.2285106557296478e-05, + "loss": 1476.817, + "step": 9300 + }, + { + "ce_loss_12": 3.2389184474945067, + "ce_loss_17": 2.891287457942963, + "ce_loss_23": 2.7736205101013183, + "ce_loss_3": 4.076551246643066, + "ce_loss_6": 3.7549044370651243, + "epoch": 0.931, + "grad_norm": 988.0, + "kl_loss_12": 1067.2639923095703, + "kl_loss_17": 278.7872940063477, + "kl_loss_3": 2738.7692749023436, + "kl_loss_6": 2110.3818420410157, + "learning_rate": 1.1938004224484989e-05, + "loss": 1510.7908, + "step": 9310 + }, + { + "ce_loss_12": 3.4300517201423646, + "ce_loss_17": 3.103699254989624, + "ce_loss_23": 2.9833187103271483, + "ce_loss_3": 4.195168387889862, + "ce_loss_6": 3.888136649131775, + "epoch": 0.932, + "grad_norm": 1012.0, + "kl_loss_12": 1042.8537139892578, + "kl_loss_17": 280.47555923461914, + "kl_loss_3": 2594.4358642578127, + "kl_loss_6": 1984.2061462402344, + "learning_rate": 1.1595816670552429e-05, + "loss": 1501.0504, + "step": 9320 + }, + { + "ce_loss_12": 3.3418365716934204, + "ce_loss_17": 3.026378798484802, + "ce_loss_23": 2.9092856764793398, + "ce_loss_3": 4.11734037399292, + "ce_loss_6": 3.8102577209472654, + "epoch": 0.933, + "grad_norm": 1544.0, + "kl_loss_12": 1018.7249389648438, + "kl_loss_17": 275.87775268554685, + "kl_loss_3": 2572.297644042969, + "kl_loss_6": 1973.426904296875, + "learning_rate": 1.1258547341323699e-05, + "loss": 1452.3419, + "step": 9330 + }, + { + "ce_loss_12": 3.384983277320862, + "ce_loss_17": 3.058712899684906, + "ce_loss_23": 2.9401206731796266, + "ce_loss_3": 4.143124210834503, + "ce_loss_6": 3.8467655301094057, + "epoch": 0.934, + "grad_norm": 1020.0, + "kl_loss_12": 1042.8963317871094, + "kl_loss_17": 279.26527938842776, + "kl_loss_3": 2582.5818603515627, + "kl_loss_6": 1990.2594421386718, + "learning_rate": 1.0926199633097156e-05, + "loss": 1467.6707, + "step": 9340 + }, + { + "ce_loss_12": 3.3787927627563477, + "ce_loss_17": 3.065500867366791, + "ce_loss_23": 2.9487602353096007, + "ce_loss_3": 4.118912374973297, + "ce_loss_6": 3.823423957824707, + "epoch": 0.935, + "grad_norm": 1040.0, + "kl_loss_12": 1013.5736785888672, + "kl_loss_17": 271.54649047851564, + "kl_loss_3": 2529.153271484375, + "kl_loss_6": 1936.4615600585937, + "learning_rate": 1.0598776892610684e-05, + "loss": 1487.6551, + "step": 9350 + }, + { + "ce_loss_12": 3.2322107553482056, + "ce_loss_17": 2.892566466331482, + "ce_loss_23": 2.778317618370056, + "ce_loss_3": 4.017880356311798, + "ce_loss_6": 3.70479074716568, + "epoch": 0.936, + "grad_norm": 1056.0, + "kl_loss_12": 1048.4545867919921, + "kl_loss_17": 272.3912384033203, + "kl_loss_3": 2631.8158935546876, + "kl_loss_6": 2015.51806640625, + "learning_rate": 1.0276282417007399e-05, + "loss": 1472.0926, + "step": 9360 + }, + { + "ce_loss_12": 3.348423981666565, + "ce_loss_17": 3.0298909902572633, + "ce_loss_23": 2.9166847348213194, + "ce_loss_3": 4.097442483901977, + "ce_loss_6": 3.80429527759552, + "epoch": 0.937, + "grad_norm": 1128.0, + "kl_loss_12": 1012.1721435546875, + "kl_loss_17": 268.58225708007814, + "kl_loss_3": 2518.1324829101563, + "kl_loss_6": 1936.9852722167968, + "learning_rate": 9.958719453803277e-06, + "loss": 1457.2605, + "step": 9370 + }, + { + "ce_loss_12": 3.3690081238746643, + "ce_loss_17": 3.030905532836914, + "ce_loss_23": 2.9100383877754212, + "ce_loss_3": 4.134298634529114, + "ce_loss_6": 3.834771382808685, + "epoch": 0.938, + "grad_norm": 1128.0, + "kl_loss_12": 1051.0932312011719, + "kl_loss_17": 276.5739906311035, + "kl_loss_3": 2599.291943359375, + "kl_loss_6": 2003.5282287597656, + "learning_rate": 9.646091200853802e-06, + "loss": 1469.2527, + "step": 9380 + }, + { + "ce_loss_12": 3.3185535073280334, + "ce_loss_17": 2.9908557176589965, + "ce_loss_23": 2.873013174533844, + "ce_loss_3": 4.07992981672287, + "ce_loss_6": 3.7722710847854612, + "epoch": 0.939, + "grad_norm": 2736.0, + "kl_loss_12": 1026.4719848632812, + "kl_loss_17": 272.8523124694824, + "kl_loss_3": 2563.2376708984375, + "kl_loss_6": 1953.9481689453125, + "learning_rate": 9.338400806321978e-06, + "loss": 1427.4537, + "step": 9390 + }, + { + "ce_loss_12": 3.352263617515564, + "ce_loss_17": 3.0240584135055544, + "ce_loss_23": 2.9010123133659365, + "ce_loss_3": 4.113032567501068, + "ce_loss_6": 3.8156125426292418, + "epoch": 0.94, + "grad_norm": 824.0, + "kl_loss_12": 1039.0976806640624, + "kl_loss_17": 283.21082916259763, + "kl_loss_3": 2561.940295410156, + "kl_loss_6": 1986.542413330078, + "learning_rate": 9.035651368646646e-06, + "loss": 1458.089, + "step": 9400 + }, + { + "ce_loss_12": 3.3536390900611877, + "ce_loss_17": 3.030505657196045, + "ce_loss_23": 2.9178043007850647, + "ce_loss_3": 4.11185348033905, + "ce_loss_6": 3.80875107049942, + "epoch": 0.941, + "grad_norm": 1144.0, + "kl_loss_12": 1023.1300567626953, + "kl_loss_17": 269.4735565185547, + "kl_loss_3": 2550.777551269531, + "kl_loss_6": 1962.2735717773437, + "learning_rate": 8.737845936511335e-06, + "loss": 1468.4475, + "step": 9410 + }, + { + "ce_loss_12": 3.318327307701111, + "ce_loss_17": 2.984967792034149, + "ce_loss_23": 2.862468791007996, + "ce_loss_3": 4.097449505329132, + "ce_loss_6": 3.794388008117676, + "epoch": 0.942, + "grad_norm": 1344.0, + "kl_loss_12": 1049.340396118164, + "kl_loss_17": 281.8879737854004, + "kl_loss_3": 2629.4581298828125, + "kl_loss_6": 2026.4955139160156, + "learning_rate": 8.444987508813451e-06, + "loss": 1479.8018, + "step": 9420 + }, + { + "ce_loss_12": 3.2956662774086, + "ce_loss_17": 2.946325933933258, + "ce_loss_23": 2.8255608439445496, + "ce_loss_3": 4.088038122653961, + "ce_loss_6": 3.7815781831741333, + "epoch": 0.943, + "grad_norm": 1352.0, + "kl_loss_12": 1077.2649169921874, + "kl_loss_17": 284.64258270263673, + "kl_loss_3": 2692.6649780273438, + "kl_loss_6": 2081.035852050781, + "learning_rate": 8.157079034633974e-06, + "loss": 1504.0758, + "step": 9430 + }, + { + "ce_loss_12": 3.269844686985016, + "ce_loss_17": 2.935304272174835, + "ce_loss_23": 2.81859130859375, + "ce_loss_3": 4.045350301265716, + "ce_loss_6": 3.7425882458686828, + "epoch": 0.944, + "grad_norm": 1240.0, + "kl_loss_12": 1048.7982055664063, + "kl_loss_17": 275.0502960205078, + "kl_loss_3": 2619.5218505859375, + "kl_loss_6": 2022.1584899902343, + "learning_rate": 7.874123413208145e-06, + "loss": 1472.9727, + "step": 9440 + }, + { + "ce_loss_12": 3.2593874454498293, + "ce_loss_17": 2.9077999234199523, + "ce_loss_23": 2.791432774066925, + "ce_loss_3": 4.056062710285187, + "ce_loss_6": 3.7430989146232605, + "epoch": 0.945, + "grad_norm": 920.0, + "kl_loss_12": 1054.8998809814452, + "kl_loss_17": 277.63961563110354, + "kl_loss_3": 2658.3320190429686, + "kl_loss_6": 2038.6107299804687, + "learning_rate": 7.59612349389599e-06, + "loss": 1498.3892, + "step": 9450 + }, + { + "ce_loss_12": 3.3173531889915466, + "ce_loss_17": 2.995698320865631, + "ce_loss_23": 2.881583273410797, + "ce_loss_3": 4.064676368236542, + "ce_loss_6": 3.7659632921218873, + "epoch": 0.946, + "grad_norm": 1368.0, + "kl_loss_12": 1005.65966796875, + "kl_loss_17": 268.5233100891113, + "kl_loss_3": 2510.1213134765626, + "kl_loss_6": 1912.729071044922, + "learning_rate": 7.323082076153509e-06, + "loss": 1452.6504, + "step": 9460 + }, + { + "ce_loss_12": 3.3648293733596804, + "ce_loss_17": 3.0392589926719666, + "ce_loss_23": 2.921140217781067, + "ce_loss_3": 4.111345851421357, + "ce_loss_6": 3.815649449825287, + "epoch": 0.947, + "grad_norm": 792.0, + "kl_loss_12": 1030.4669982910157, + "kl_loss_17": 280.2841407775879, + "kl_loss_3": 2527.8580322265625, + "kl_loss_6": 1948.9057983398438, + "learning_rate": 7.055001909504755e-06, + "loss": 1481.591, + "step": 9470 + }, + { + "ce_loss_12": 3.400721490383148, + "ce_loss_17": 3.0784170508384703, + "ce_loss_23": 2.9573259115219117, + "ce_loss_3": 4.157055807113648, + "ce_loss_6": 3.8579179406166078, + "epoch": 0.948, + "grad_norm": 1032.0, + "kl_loss_12": 1039.483480834961, + "kl_loss_17": 277.8528198242187, + "kl_loss_3": 2571.344763183594, + "kl_loss_6": 1984.4642028808594, + "learning_rate": 6.791885693514133e-06, + "loss": 1473.3818, + "step": 9480 + }, + { + "ce_loss_12": 3.3163992047309874, + "ce_loss_17": 2.985028529167175, + "ce_loss_23": 2.8660680651664734, + "ce_loss_3": 4.108574676513672, + "ce_loss_6": 3.7938371777534483, + "epoch": 0.949, + "grad_norm": 1536.0, + "kl_loss_12": 1044.1281616210938, + "kl_loss_17": 278.31127853393554, + "kl_loss_3": 2656.0906494140627, + "kl_loss_6": 2034.2154418945313, + "learning_rate": 6.533736077758867e-06, + "loss": 1498.9166, + "step": 9490 + }, + { + "ce_loss_12": 3.295397973060608, + "ce_loss_17": 2.953207588195801, + "ce_loss_23": 2.8314607620239256, + "ce_loss_3": 4.096197354793548, + "ce_loss_6": 3.7886332869529724, + "epoch": 0.95, + "grad_norm": 1320.0, + "kl_loss_12": 1081.0685180664063, + "kl_loss_17": 285.80748291015624, + "kl_loss_3": 2691.1257080078126, + "kl_loss_6": 2082.5201416015625, + "learning_rate": 6.2805556618028556e-06, + "loss": 1492.1794, + "step": 9500 + }, + { + "ce_loss_12": 3.3366139054298403, + "ce_loss_17": 3.0263991355895996, + "ce_loss_23": 2.9105852842330933, + "ce_loss_3": 4.09708423614502, + "ce_loss_6": 3.7928590297698976, + "epoch": 0.951, + "grad_norm": 1368.0, + "kl_loss_12": 986.2284301757812, + "kl_loss_17": 267.4480796813965, + "kl_loss_3": 2520.3510498046876, + "kl_loss_6": 1924.9207092285155, + "learning_rate": 6.032346995169968e-06, + "loss": 1412.5211, + "step": 9510 + }, + { + "ce_loss_12": 3.3517472743988037, + "ce_loss_17": 3.0306913375854494, + "ce_loss_23": 2.916367495059967, + "ce_loss_3": 4.121383500099182, + "ce_loss_6": 3.81394704580307, + "epoch": 0.952, + "grad_norm": 1040.0, + "kl_loss_12": 1032.4408020019532, + "kl_loss_17": 276.3887512207031, + "kl_loss_3": 2577.0925537109374, + "kl_loss_6": 1970.7738647460938, + "learning_rate": 5.789112577318789e-06, + "loss": 1458.9979, + "step": 9520 + }, + { + "ce_loss_12": 3.3498944520950316, + "ce_loss_17": 3.016447389125824, + "ce_loss_23": 2.89688059091568, + "ce_loss_3": 4.12940376996994, + "ce_loss_6": 3.8246996641159057, + "epoch": 0.953, + "grad_norm": 932.0, + "kl_loss_12": 1050.8054412841798, + "kl_loss_17": 279.5018035888672, + "kl_loss_3": 2623.3647338867186, + "kl_loss_6": 2023.7264892578125, + "learning_rate": 5.550854857617194e-06, + "loss": 1462.5711, + "step": 9530 + }, + { + "ce_loss_12": 3.3379459381103516, + "ce_loss_17": 3.001688039302826, + "ce_loss_23": 2.877410364151001, + "ce_loss_3": 4.137910008430481, + "ce_loss_6": 3.8291539192199706, + "epoch": 0.954, + "grad_norm": 948.0, + "kl_loss_12": 1063.9521179199219, + "kl_loss_17": 285.664315032959, + "kl_loss_3": 2678.9524658203127, + "kl_loss_6": 2059.6889831542967, + "learning_rate": 5.317576235317756e-06, + "loss": 1505.4752, + "step": 9540 + }, + { + "ce_loss_12": 3.3384133100509645, + "ce_loss_17": 3.017950987815857, + "ce_loss_23": 2.9062692165374755, + "ce_loss_3": 4.092934966087341, + "ce_loss_6": 3.778127145767212, + "epoch": 0.955, + "grad_norm": 1032.0, + "kl_loss_12": 995.489956665039, + "kl_loss_17": 267.39564056396483, + "kl_loss_3": 2516.7044067382812, + "kl_loss_6": 1901.7774536132813, + "learning_rate": 5.089279059533658e-06, + "loss": 1463.7273, + "step": 9550 + }, + { + "ce_loss_12": 3.407905399799347, + "ce_loss_17": 3.0725943088531493, + "ce_loss_23": 2.9485292077064513, + "ce_loss_3": 4.161868751049042, + "ce_loss_6": 3.857768952846527, + "epoch": 0.956, + "grad_norm": 1032.0, + "kl_loss_12": 1044.148989868164, + "kl_loss_17": 283.47854843139646, + "kl_loss_3": 2572.16533203125, + "kl_loss_6": 1962.809149169922, + "learning_rate": 4.865965629214819e-06, + "loss": 1457.177, + "step": 9560 + }, + { + "ce_loss_12": 3.3561294078826904, + "ce_loss_17": 3.022306752204895, + "ce_loss_23": 2.9044751048088076, + "ce_loss_3": 4.1291221380233765, + "ce_loss_6": 3.825023341178894, + "epoch": 0.957, + "grad_norm": 992.0, + "kl_loss_12": 1050.428485107422, + "kl_loss_17": 279.6668502807617, + "kl_loss_3": 2627.7093994140623, + "kl_loss_6": 2018.747003173828, + "learning_rate": 4.6476381931251366e-06, + "loss": 1466.3221, + "step": 9570 + }, + { + "ce_loss_12": 3.334410846233368, + "ce_loss_17": 3.010200834274292, + "ce_loss_23": 2.892228066921234, + "ce_loss_3": 4.09078129529953, + "ce_loss_6": 3.7890233635902404, + "epoch": 0.958, + "grad_norm": 936.0, + "kl_loss_12": 1018.1339416503906, + "kl_loss_17": 272.4146240234375, + "kl_loss_3": 2554.4165771484377, + "kl_loss_6": 1950.5408752441406, + "learning_rate": 4.434298949819449e-06, + "loss": 1460.3959, + "step": 9580 + }, + { + "ce_loss_12": 3.324360358715057, + "ce_loss_17": 2.981010985374451, + "ce_loss_23": 2.857758712768555, + "ce_loss_3": 4.125252890586853, + "ce_loss_6": 3.8098152041435243, + "epoch": 0.959, + "grad_norm": 1004.0, + "kl_loss_12": 1088.3455841064454, + "kl_loss_17": 288.2356979370117, + "kl_loss_3": 2720.9905395507812, + "kl_loss_6": 2097.9127197265625, + "learning_rate": 4.2259500476214406e-06, + "loss": 1509.5423, + "step": 9590 + }, + { + "ce_loss_12": 3.2877951502799987, + "ce_loss_17": 2.954547381401062, + "ce_loss_23": 2.8363230228424072, + "ce_loss_3": 4.070486485958099, + "ce_loss_6": 3.769054639339447, + "epoch": 0.96, + "grad_norm": 944.0, + "kl_loss_12": 1045.826885986328, + "kl_loss_17": 276.11616744995115, + "kl_loss_3": 2624.5047119140627, + "kl_loss_6": 2027.1597595214844, + "learning_rate": 4.02259358460233e-06, + "loss": 1472.7125, + "step": 9600 + }, + { + "ce_loss_12": 3.3414696574211122, + "ce_loss_17": 3.0194800734519958, + "ce_loss_23": 2.8990446448326113, + "ce_loss_3": 4.104188013076782, + "ce_loss_6": 3.8056410789489745, + "epoch": 0.961, + "grad_norm": 1072.0, + "kl_loss_12": 1023.0301330566406, + "kl_loss_17": 278.8875244140625, + "kl_loss_3": 2551.333874511719, + "kl_loss_6": 1957.9582397460938, + "learning_rate": 3.8242316085594916e-06, + "loss": 1456.6529, + "step": 9610 + }, + { + "ce_loss_12": 3.2622671365737914, + "ce_loss_17": 2.914546084403992, + "ce_loss_23": 2.791579818725586, + "ce_loss_3": 4.080668830871582, + "ce_loss_6": 3.7585220336914062, + "epoch": 0.962, + "grad_norm": 1456.0, + "kl_loss_12": 1076.800796508789, + "kl_loss_17": 285.5383560180664, + "kl_loss_3": 2724.994970703125, + "kl_loss_6": 2097.9461975097656, + "learning_rate": 3.630866116995757e-06, + "loss": 1529.8656, + "step": 9620 + }, + { + "ce_loss_12": 3.3696959376335145, + "ce_loss_17": 3.051957297325134, + "ce_loss_23": 2.9373905539512633, + "ce_loss_3": 4.120000338554382, + "ce_loss_6": 3.8250346064567564, + "epoch": 0.963, + "grad_norm": 944.0, + "kl_loss_12": 1015.1488037109375, + "kl_loss_17": 271.67421875, + "kl_loss_3": 2542.3350463867187, + "kl_loss_6": 1950.3465576171875, + "learning_rate": 3.4424990570994797e-06, + "loss": 1484.4218, + "step": 9630 + }, + { + "ce_loss_12": 3.3661725878715516, + "ce_loss_17": 3.044158565998077, + "ce_loss_23": 2.9250866651535032, + "ce_loss_3": 4.131625187397003, + "ce_loss_6": 3.826724684238434, + "epoch": 0.964, + "grad_norm": 1104.0, + "kl_loss_12": 1032.4666778564454, + "kl_loss_17": 274.765673828125, + "kl_loss_3": 2582.521203613281, + "kl_loss_6": 1976.779034423828, + "learning_rate": 3.2591323257248896e-06, + "loss": 1470.472, + "step": 9640 + }, + { + "ce_loss_12": 3.2373790383338927, + "ce_loss_17": 2.9063490629196167, + "ce_loss_23": 2.790112245082855, + "ce_loss_3": 4.014607954025268, + "ce_loss_6": 3.7152130365371705, + "epoch": 0.965, + "grad_norm": 1056.0, + "kl_loss_12": 1035.9166870117188, + "kl_loss_17": 273.9448547363281, + "kl_loss_3": 2597.5956787109376, + "kl_loss_6": 2007.8367492675782, + "learning_rate": 3.0807677693729385e-06, + "loss": 1492.4096, + "step": 9650 + }, + { + "ce_loss_12": 3.4023047566413878, + "ce_loss_17": 3.0736619353294374, + "ce_loss_23": 2.9579479098320007, + "ce_loss_3": 4.1481396675109865, + "ce_loss_6": 3.8566255927085877, + "epoch": 0.966, + "grad_norm": 956.0, + "kl_loss_12": 1027.349755859375, + "kl_loss_17": 272.975756072998, + "kl_loss_3": 2543.197399902344, + "kl_loss_6": 1959.232568359375, + "learning_rate": 2.9074071841727055e-06, + "loss": 1445.7012, + "step": 9660 + }, + { + "ce_loss_12": 3.3433603644371033, + "ce_loss_17": 3.0127647042274477, + "ce_loss_23": 2.8944259762763975, + "ce_loss_3": 4.101817774772644, + "ce_loss_6": 3.796090304851532, + "epoch": 0.967, + "grad_norm": 1224.0, + "kl_loss_12": 1034.7205352783203, + "kl_loss_17": 277.1310554504395, + "kl_loss_3": 2587.0287109375, + "kl_loss_6": 1977.2501525878906, + "learning_rate": 2.739052315863355e-06, + "loss": 1441.5164, + "step": 9670 + }, + { + "ce_loss_12": 3.308350610733032, + "ce_loss_17": 2.9846307158470156, + "ce_loss_23": 2.8726982355117796, + "ce_loss_3": 4.092323124408722, + "ce_loss_6": 3.7888726472854612, + "epoch": 0.968, + "grad_norm": 944.0, + "kl_loss_12": 1020.8781433105469, + "kl_loss_17": 271.05588684082034, + "kl_loss_3": 2602.898889160156, + "kl_loss_6": 2005.857647705078, + "learning_rate": 2.5757048597765396e-06, + "loss": 1460.2541, + "step": 9680 + }, + { + "ce_loss_12": 3.338445246219635, + "ce_loss_17": 3.0044411063194274, + "ce_loss_23": 2.8843555092811584, + "ce_loss_3": 4.114881277084351, + "ce_loss_6": 3.8083300948143006, + "epoch": 0.969, + "grad_norm": 1408.0, + "kl_loss_12": 1044.8596771240234, + "kl_loss_17": 275.3220375061035, + "kl_loss_3": 2604.5454223632814, + "kl_loss_6": 2008.3942138671875, + "learning_rate": 2.417366460819359e-06, + "loss": 1477.1682, + "step": 9690 + }, + { + "ce_loss_12": 3.3537409782409666, + "ce_loss_17": 3.017800807952881, + "ce_loss_23": 2.894302558898926, + "ce_loss_3": 4.140884184837342, + "ce_loss_6": 3.8325692772865296, + "epoch": 0.97, + "grad_norm": 1112.0, + "kl_loss_12": 1051.7619720458983, + "kl_loss_17": 282.70016326904295, + "kl_loss_3": 2649.907849121094, + "kl_loss_6": 2030.5795593261719, + "learning_rate": 2.2640387134577057e-06, + "loss": 1469.5803, + "step": 9700 + }, + { + "ce_loss_12": 3.2549274802207946, + "ce_loss_17": 2.939910852909088, + "ce_loss_23": 2.828316831588745, + "ce_loss_3": 4.001725673675537, + "ce_loss_6": 3.705301547050476, + "epoch": 0.971, + "grad_norm": 868.0, + "kl_loss_12": 978.7618072509765, + "kl_loss_17": 263.37414093017577, + "kl_loss_3": 2466.4112915039063, + "kl_loss_6": 1881.4376586914063, + "learning_rate": 2.115723161700278e-06, + "loss": 1432.3954, + "step": 9710 + }, + { + "ce_loss_12": 3.2688825488090516, + "ce_loss_17": 2.9281031847000123, + "ce_loss_23": 2.8067177176475524, + "ce_loss_3": 4.0559126257896425, + "ce_loss_6": 3.7535340428352355, + "epoch": 0.972, + "grad_norm": 956.0, + "kl_loss_12": 1062.6104888916016, + "kl_loss_17": 282.78052520751953, + "kl_loss_3": 2651.771044921875, + "kl_loss_6": 2049.60595703125, + "learning_rate": 1.9724212990830937e-06, + "loss": 1501.4723, + "step": 9720 + }, + { + "ce_loss_12": 3.3958194136619566, + "ce_loss_17": 3.057923400402069, + "ce_loss_23": 2.93945198059082, + "ce_loss_3": 4.177759373188019, + "ce_loss_6": 3.860970449447632, + "epoch": 0.973, + "grad_norm": 996.0, + "kl_loss_12": 1045.985891723633, + "kl_loss_17": 280.6972091674805, + "kl_loss_3": 2639.22490234375, + "kl_loss_6": 2008.8534240722656, + "learning_rate": 1.8341345686543331e-06, + "loss": 1483.7762, + "step": 9730 + }, + { + "ce_loss_12": 3.3623135566711424, + "ce_loss_17": 3.0398145794868467, + "ce_loss_23": 2.9244924068450926, + "ce_loss_3": 4.100259184837341, + "ce_loss_6": 3.8060136675834655, + "epoch": 0.974, + "grad_norm": 876.0, + "kl_loss_12": 1019.5402954101562, + "kl_loss_17": 271.8833610534668, + "kl_loss_3": 2513.6443725585937, + "kl_loss_6": 1932.6292114257812, + "learning_rate": 1.7008643629596864e-06, + "loss": 1476.4012, + "step": 9740 + }, + { + "ce_loss_12": 3.345140302181244, + "ce_loss_17": 3.0265002131462095, + "ce_loss_23": 2.9070200681686402, + "ce_loss_3": 4.126636505126953, + "ce_loss_6": 3.814054250717163, + "epoch": 0.975, + "grad_norm": 940.0, + "kl_loss_12": 1021.1226135253906, + "kl_loss_17": 276.88904571533203, + "kl_loss_3": 2610.3638305664062, + "kl_loss_6": 1993.3836791992187, + "learning_rate": 1.5726120240288633e-06, + "loss": 1493.1273, + "step": 9750 + }, + { + "ce_loss_12": 3.264871072769165, + "ce_loss_17": 2.9383165001869203, + "ce_loss_23": 2.8230011105537414, + "ce_loss_3": 4.03258649110794, + "ce_loss_6": 3.733627498149872, + "epoch": 0.976, + "grad_norm": 1072.0, + "kl_loss_12": 1035.583740234375, + "kl_loss_17": 273.65186462402346, + "kl_loss_3": 2586.930322265625, + "kl_loss_6": 1992.2733276367187, + "learning_rate": 1.4493788433612708e-06, + "loss": 1460.1872, + "step": 9760 + }, + { + "ce_loss_12": 3.380255401134491, + "ce_loss_17": 3.0444837689399717, + "ce_loss_23": 2.9262943625450135, + "ce_loss_3": 4.149372577667236, + "ce_loss_6": 3.8591771006584166, + "epoch": 0.977, + "grad_norm": 1192.0, + "kl_loss_12": 1043.110906982422, + "kl_loss_17": 277.4537742614746, + "kl_loss_3": 2614.2517211914064, + "kl_loss_6": 2021.9392700195312, + "learning_rate": 1.3311660619138578e-06, + "loss": 1488.0276, + "step": 9770 + }, + { + "ce_loss_12": 3.3608654618263243, + "ce_loss_17": 3.0446624159812927, + "ce_loss_23": 2.9237622380256654, + "ce_loss_3": 4.094170534610749, + "ce_loss_6": 3.795668828487396, + "epoch": 0.978, + "grad_norm": 960.0, + "kl_loss_12": 1019.3621154785156, + "kl_loss_17": 276.7036956787109, + "kl_loss_3": 2498.4484741210936, + "kl_loss_6": 1912.7908447265625, + "learning_rate": 1.2179748700879012e-06, + "loss": 1463.3352, + "step": 9780 + }, + { + "ce_loss_12": 3.304617428779602, + "ce_loss_17": 2.974220836162567, + "ce_loss_23": 2.8536474823951723, + "ce_loss_3": 4.064146685600281, + "ce_loss_6": 3.767940413951874, + "epoch": 0.979, + "grad_norm": 1248.0, + "kl_loss_12": 1021.9867584228516, + "kl_loss_17": 275.06206970214845, + "kl_loss_3": 2550.9173217773437, + "kl_loss_6": 1961.2408264160156, + "learning_rate": 1.1098064077174619e-06, + "loss": 1467.9999, + "step": 9790 + }, + { + "ce_loss_12": 3.3371457934379576, + "ce_loss_17": 3.0031715512275694, + "ce_loss_23": 2.884718680381775, + "ce_loss_3": 4.124406111240387, + "ce_loss_6": 3.829567217826843, + "epoch": 0.98, + "grad_norm": 984.0, + "kl_loss_12": 1038.0324310302735, + "kl_loss_17": 275.1086616516113, + "kl_loss_3": 2638.8526611328125, + "kl_loss_6": 2041.0841125488282, + "learning_rate": 1.006661764057837e-06, + "loss": 1482.5494, + "step": 9800 + }, + { + "ce_loss_12": 3.337872099876404, + "ce_loss_17": 3.011410188674927, + "ce_loss_23": 2.8942373633384704, + "ce_loss_3": 4.104587101936341, + "ce_loss_6": 3.799306297302246, + "epoch": 0.981, + "grad_norm": 1552.0, + "kl_loss_12": 1033.9137969970702, + "kl_loss_17": 271.6068618774414, + "kl_loss_3": 2588.8114990234376, + "kl_loss_6": 1984.8397277832032, + "learning_rate": 9.085419777743465e-07, + "loss": 1457.612, + "step": 9810 + }, + { + "ce_loss_12": 3.287680518627167, + "ce_loss_17": 2.964575207233429, + "ce_loss_23": 2.8532678127288817, + "ce_loss_3": 4.052232813835144, + "ce_loss_6": 3.7622761726379395, + "epoch": 0.982, + "grad_norm": 900.0, + "kl_loss_12": 1018.4030578613281, + "kl_loss_17": 267.15757751464844, + "kl_loss_3": 2563.1669799804686, + "kl_loss_6": 1980.1304626464844, + "learning_rate": 8.15448036932176e-07, + "loss": 1438.089, + "step": 9820 + }, + { + "ce_loss_12": 3.336066448688507, + "ce_loss_17": 3.0072415232658387, + "ce_loss_23": 2.8884667992591857, + "ce_loss_3": 4.095813620090484, + "ce_loss_6": 3.7954184889793394, + "epoch": 0.983, + "grad_norm": 1080.0, + "kl_loss_12": 1039.043862915039, + "kl_loss_17": 275.2289779663086, + "kl_loss_3": 2588.074365234375, + "kl_loss_6": 1992.1940795898438, + "learning_rate": 7.273808789862724e-07, + "loss": 1481.977, + "step": 9830 + }, + { + "ce_loss_12": 3.3937831997871397, + "ce_loss_17": 3.0694342851638794, + "ce_loss_23": 2.951556408405304, + "ce_loss_3": 4.159242665767669, + "ce_loss_6": 3.8498289585113525, + "epoch": 0.984, + "grad_norm": 988.0, + "kl_loss_12": 1037.1162841796875, + "kl_loss_17": 277.25635147094727, + "kl_loss_3": 2582.94873046875, + "kl_loss_6": 1974.8491821289062, + "learning_rate": 6.443413907720186e-07, + "loss": 1460.4299, + "step": 9840 + }, + { + "ce_loss_12": 3.335273730754852, + "ce_loss_17": 3.014903891086578, + "ce_loss_23": 2.8978246688842773, + "ce_loss_3": 4.110360252857208, + "ce_loss_6": 3.8039698004722595, + "epoch": 0.985, + "grad_norm": 1272.0, + "kl_loss_12": 1019.0031555175781, + "kl_loss_17": 274.9228126525879, + "kl_loss_3": 2559.80078125, + "kl_loss_6": 1957.603826904297, + "learning_rate": 5.663304084960185e-07, + "loss": 1449.5539, + "step": 9850 + }, + { + "ce_loss_12": 3.282993268966675, + "ce_loss_17": 2.944962537288666, + "ce_loss_23": 2.827660346031189, + "ce_loss_3": 4.065218067169189, + "ce_loss_6": 3.7537544131278993, + "epoch": 0.986, + "grad_norm": 1020.0, + "kl_loss_12": 1047.8510192871095, + "kl_loss_17": 278.3067687988281, + "kl_loss_3": 2623.368518066406, + "kl_loss_6": 2011.9999145507813, + "learning_rate": 4.933487177280482e-07, + "loss": 1457.2979, + "step": 9860 + }, + { + "ce_loss_12": 3.3598272681236265, + "ce_loss_17": 3.0410907745361326, + "ce_loss_23": 2.929271900653839, + "ce_loss_3": 4.1150998711586, + "ce_loss_6": 3.8177472591400146, + "epoch": 0.987, + "grad_norm": 1416.0, + "kl_loss_12": 1012.1949981689453, + "kl_loss_17": 266.2216857910156, + "kl_loss_3": 2549.34970703125, + "kl_loss_6": 1953.7899536132813, + "learning_rate": 4.2539705339295075e-07, + "loss": 1442.8347, + "step": 9870 + }, + { + "ce_loss_12": 3.2366280555725098, + "ce_loss_17": 2.9017109751701353, + "ce_loss_23": 2.7866319894790648, + "ce_loss_3": 4.015068626403808, + "ce_loss_6": 3.7159414172172545, + "epoch": 0.988, + "grad_norm": 972.0, + "kl_loss_12": 1028.9166107177734, + "kl_loss_17": 269.9573547363281, + "kl_loss_3": 2598.421826171875, + "kl_loss_6": 2006.371514892578, + "learning_rate": 3.6247609976319816e-07, + "loss": 1456.7176, + "step": 9880 + }, + { + "ce_loss_12": 3.32789089679718, + "ce_loss_17": 2.990896999835968, + "ce_loss_23": 2.8709633350372314, + "ce_loss_3": 4.108030164241791, + "ce_loss_6": 3.8056225538253785, + "epoch": 0.989, + "grad_norm": 1016.0, + "kl_loss_12": 1053.1518157958985, + "kl_loss_17": 278.55935821533205, + "kl_loss_3": 2620.5390747070314, + "kl_loss_6": 2017.3613342285157, + "learning_rate": 3.0458649045211895e-07, + "loss": 1506.2714, + "step": 9890 + }, + { + "ce_loss_12": 3.3056017994880675, + "ce_loss_17": 2.962406671047211, + "ce_loss_23": 2.8373791098594667, + "ce_loss_3": 4.081587100028992, + "ce_loss_6": 3.772992491722107, + "epoch": 0.99, + "grad_norm": 908.0, + "kl_loss_12": 1048.933447265625, + "kl_loss_17": 283.54878692626954, + "kl_loss_3": 2603.8252807617187, + "kl_loss_6": 2000.5755249023437, + "learning_rate": 2.517288084074587e-07, + "loss": 1495.7473, + "step": 9900 + }, + { + "ce_loss_12": 3.359211504459381, + "ce_loss_17": 3.006488561630249, + "ce_loss_23": 2.8813580870628357, + "ce_loss_3": 4.151961064338684, + "ce_loss_6": 3.842729997634888, + "epoch": 0.991, + "grad_norm": 1400.0, + "kl_loss_12": 1086.1037017822266, + "kl_loss_17": 288.2465118408203, + "kl_loss_3": 2684.929248046875, + "kl_loss_6": 2073.267474365234, + "learning_rate": 2.0390358590538505e-07, + "loss": 1503.6292, + "step": 9910 + }, + { + "ce_loss_12": 3.341462767124176, + "ce_loss_17": 3.008048748970032, + "ce_loss_23": 2.8869954109191895, + "ce_loss_3": 4.113165223598481, + "ce_loss_6": 3.8097443699836733, + "epoch": 0.992, + "grad_norm": 1072.0, + "kl_loss_12": 1045.311962890625, + "kl_loss_17": 279.93851470947266, + "kl_loss_3": 2600.6946655273437, + "kl_loss_6": 2003.008935546875, + "learning_rate": 1.61111304545436e-07, + "loss": 1464.2891, + "step": 9920 + }, + { + "ce_loss_12": 3.3059886693954468, + "ce_loss_17": 2.98328115940094, + "ce_loss_23": 2.863870179653168, + "ce_loss_3": 4.0723427653312685, + "ce_loss_6": 3.7720368027687075, + "epoch": 0.993, + "grad_norm": 904.0, + "kl_loss_12": 1034.073046875, + "kl_loss_17": 274.453141784668, + "kl_loss_3": 2587.8949584960938, + "kl_loss_6": 1989.5302185058595, + "learning_rate": 1.2335239524541298e-07, + "loss": 1448.7602, + "step": 9930 + }, + { + "ce_loss_12": 3.2763088941574097, + "ce_loss_17": 2.942208456993103, + "ce_loss_23": 2.8267432928085325, + "ce_loss_3": 4.041367328166961, + "ce_loss_6": 3.7408220410346984, + "epoch": 0.994, + "grad_norm": 956.0, + "kl_loss_12": 1025.5233093261718, + "kl_loss_17": 272.5311225891113, + "kl_loss_3": 2580.800732421875, + "kl_loss_6": 1972.7632751464844, + "learning_rate": 9.06272382371065e-08, + "loss": 1464.1678, + "step": 9940 + }, + { + "ce_loss_12": 3.348012113571167, + "ce_loss_17": 3.0078374981880187, + "ce_loss_23": 2.893405330181122, + "ce_loss_3": 4.12751475572586, + "ce_loss_6": 3.822390305995941, + "epoch": 0.995, + "grad_norm": 1360.0, + "kl_loss_12": 1059.0628814697266, + "kl_loss_17": 276.8828186035156, + "kl_loss_3": 2634.0769165039064, + "kl_loss_6": 2026.2205505371094, + "learning_rate": 6.293616306246586e-08, + "loss": 1481.2273, + "step": 9950 + }, + { + "ce_loss_12": 3.320646572113037, + "ce_loss_17": 2.999920296669006, + "ce_loss_23": 2.885223948955536, + "ce_loss_3": 4.070868468284607, + "ce_loss_6": 3.771753668785095, + "epoch": 0.996, + "grad_norm": 1008.0, + "kl_loss_12": 1013.6059875488281, + "kl_loss_17": 268.9860580444336, + "kl_loss_3": 2521.4711303710938, + "kl_loss_6": 1938.149951171875, + "learning_rate": 4.027944857032395e-08, + "loss": 1429.216, + "step": 9960 + }, + { + "ce_loss_12": 3.311885952949524, + "ce_loss_17": 2.9989118576049805, + "ce_loss_23": 2.8931652188301085, + "ce_loss_3": 4.040776383876801, + "ce_loss_6": 3.745835208892822, + "epoch": 0.997, + "grad_norm": 1216.0, + "kl_loss_12": 976.0381683349609, + "kl_loss_17": 258.4782745361328, + "kl_loss_3": 2448.4940795898438, + "kl_loss_6": 1869.9054077148437, + "learning_rate": 2.265732291356626e-08, + "loss": 1405.6646, + "step": 9970 + }, + { + "ce_loss_12": 3.3592669129371644, + "ce_loss_17": 3.0400989770889284, + "ce_loss_23": 2.925530707836151, + "ce_loss_3": 4.115602195262909, + "ce_loss_6": 3.8081278443336486, + "epoch": 0.998, + "grad_norm": 1056.0, + "kl_loss_12": 1014.3947631835938, + "kl_loss_17": 273.0302558898926, + "kl_loss_3": 2532.3029907226564, + "kl_loss_6": 1928.0451232910157, + "learning_rate": 1.0069963546743833e-08, + "loss": 1470.89, + "step": 9980 + }, + { + "ce_loss_12": 3.3478829503059386, + "ce_loss_17": 3.021211862564087, + "ce_loss_23": 2.89819540977478, + "ce_loss_3": 4.12130331993103, + "ce_loss_6": 3.81664662361145, + "epoch": 0.999, + "grad_norm": 1168.0, + "kl_loss_12": 1037.3339416503907, + "kl_loss_17": 278.43563385009764, + "kl_loss_3": 2601.9752197265625, + "kl_loss_6": 1998.0107849121093, + "learning_rate": 2.517497224463483e-09, + "loss": 1464.9984, + "step": 9990 + }, + { + "ce_loss_12": 3.3240800499916077, + "ce_loss_17": 2.977655363082886, + "ce_loss_23": 2.8543686270713806, + "ce_loss_3": 4.138887107372284, + "ce_loss_6": 3.817936360836029, + "epoch": 1.0, + "grad_norm": 1224.0, + "kl_loss_12": 1070.1510314941406, + "kl_loss_17": 283.68445587158203, + "kl_loss_3": 2717.0578735351564, + "kl_loss_6": 2080.7163208007814, + "learning_rate": 0.0, + "loss": 1512.9695, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.502582338838856e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}