{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_12": 17.798757553100586, "ce_loss_17": 12.109902381896973, "ce_loss_23": 2.898716688156128, "ce_loss_3": 16.791034698486328, "ce_loss_6": 17.334301948547363, "epoch": 0.0001, "grad_norm": 211968.0, "kl_loss_12": 31074.1875, "kl_loss_17": 19915.5283203125, "kl_loss_3": 28423.80078125, "kl_loss_6": 29539.3955078125, "learning_rate": 1e-05, "loss": 27273.5469, "step": 1 }, { "ce_loss_12": 11.539109309514364, "ce_loss_17": 9.017974032296074, "ce_loss_23": 2.9512691365347967, "ce_loss_3": 11.650188684463501, "ce_loss_6": 11.816295120451185, "epoch": 0.001, "grad_norm": 26112.0, "kl_loss_12": 17560.87771267361, "kl_loss_17": 12742.507107204861, "kl_loss_3": 17486.401258680555, "kl_loss_6": 17750.530110677082, "learning_rate": 0.0001, "loss": 16390.9132, "step": 10 }, { "ce_loss_12": 6.662769317626953, "ce_loss_17": 5.113725876808166, "ce_loss_23": 2.958701026439667, "ce_loss_3": 7.423222160339355, "ce_loss_6": 7.240184164047241, "epoch": 0.002, "grad_norm": 10816.0, "kl_loss_12": 7201.2345703125, "kl_loss_17": 4226.757141113281, "kl_loss_3": 8504.014624023437, "kl_loss_6": 8126.985400390625, "learning_rate": 0.0002, "loss": 7110.9945, "step": 20 }, { "ce_loss_12": 5.635356903076172, "ce_loss_17": 3.987026798725128, "ce_loss_23": 2.7665123343467712, "ce_loss_3": 6.693690013885498, "ce_loss_6": 6.371592235565186, "epoch": 0.003, "grad_norm": 6592.0, "kl_loss_12": 5527.604516601563, "kl_loss_17": 2312.6288452148438, "kl_loss_3": 7508.761254882813, "kl_loss_6": 6876.302319335938, "learning_rate": 0.0003, "loss": 5459.1285, "step": 30 }, { "ce_loss_12": 5.312513065338135, "ce_loss_17": 3.8958184838294985, "ce_loss_23": 2.928679037094116, "ce_loss_3": 6.311967754364014, "ce_loss_6": 5.9457975149154665, "epoch": 0.004, "grad_norm": 9280.0, "kl_loss_12": 4565.179907226562, "kl_loss_17": 1834.186749267578, "kl_loss_3": 6455.294677734375, "kl_loss_6": 5751.1544921875, "learning_rate": 0.0004, "loss": 4687.6551, "step": 40 }, { "ce_loss_12": 5.002953696250915, "ce_loss_17": 3.739673209190369, "ce_loss_23": 2.8902602910995485, "ce_loss_3": 6.069131588935852, "ce_loss_6": 5.692244386672973, "epoch": 0.005, "grad_norm": 7552.0, "kl_loss_12": 4123.054260253906, "kl_loss_17": 1606.3025634765625, "kl_loss_3": 6154.768359375, "kl_loss_6": 5436.51982421875, "learning_rate": 0.0005, "loss": 4305.6285, "step": 50 }, { "ce_loss_12": 4.829141068458557, "ce_loss_17": 3.635519802570343, "ce_loss_23": 2.9109005689620973, "ce_loss_3": 5.868700075149536, "ce_loss_6": 5.501702117919922, "epoch": 0.006, "grad_norm": 11392.0, "kl_loss_12": 3755.554541015625, "kl_loss_17": 1389.9905212402343, "kl_loss_3": 5720.950512695313, "kl_loss_6": 5029.498291015625, "learning_rate": 0.0006, "loss": 3986.793, "step": 60 }, { "ce_loss_12": 4.645738911628723, "ce_loss_17": 3.4978497982025147, "ce_loss_23": 2.8339456021785736, "ce_loss_3": 5.803532552719116, "ce_loss_6": 5.397406053543091, "epoch": 0.007, "grad_norm": 8256.0, "kl_loss_12": 3554.4886840820313, "kl_loss_17": 1276.3660400390625, "kl_loss_3": 5769.8087890625, "kl_loss_6": 4980.427783203125, "learning_rate": 0.0007, "loss": 3883.4496, "step": 70 }, { "ce_loss_12": 4.597572684288025, "ce_loss_17": 3.5362090349197386, "ce_loss_23": 2.829783248901367, "ce_loss_3": 5.797271776199341, "ce_loss_6": 5.276372957229614, "epoch": 0.008, "grad_norm": 6976.0, "kl_loss_12": 3471.6586669921876, "kl_loss_17": 1371.0523620605468, "kl_loss_3": 5753.631884765625, "kl_loss_6": 4787.4904296875, "learning_rate": 0.0008, "loss": 3867.9441, "step": 80 }, { "ce_loss_12": 4.451014447212219, "ce_loss_17": 3.5222262740135193, "ce_loss_23": 2.7971161007881165, "ce_loss_3": 5.97350127696991, "ce_loss_6": 5.160103845596313, "epoch": 0.009, "grad_norm": 11968.0, "kl_loss_12": 3281.931005859375, "kl_loss_17": 1445.2955322265625, "kl_loss_3": 6169.022705078125, "kl_loss_6": 4631.531591796875, "learning_rate": 0.0009000000000000001, "loss": 3860.6953, "step": 90 }, { "ce_loss_12": 4.490131831169128, "ce_loss_17": 3.5490355372428892, "ce_loss_23": 2.903198480606079, "ce_loss_3": 5.901812219619751, "ce_loss_6": 5.335342597961426, "epoch": 0.01, "grad_norm": 7904.0, "kl_loss_12": 3168.4276489257813, "kl_loss_17": 1262.5684020996093, "kl_loss_3": 5861.412744140625, "kl_loss_6": 4760.437231445312, "learning_rate": 0.001, "loss": 3758.8113, "step": 100 }, { "ce_loss_12": 4.583330249786377, "ce_loss_17": 3.447415602207184, "ce_loss_23": 2.8641112565994264, "ce_loss_3": 5.773125648498535, "ce_loss_6": 5.602428388595581, "epoch": 0.011, "grad_norm": 16000.0, "kl_loss_12": 3430.811181640625, "kl_loss_17": 1154.315347290039, "kl_loss_3": 5642.04091796875, "kl_loss_6": 5399.360400390625, "learning_rate": 0.0009999974825027757, "loss": 3887.9992, "step": 110 }, { "ce_loss_12": 5.235488867759704, "ce_loss_17": 3.5834272861480714, "ce_loss_23": 2.9280850529670714, "ce_loss_3": 5.679557681083679, "ce_loss_6": 5.6021226167678835, "epoch": 0.012, "grad_norm": 14080.0, "kl_loss_12": 4564.27548828125, "kl_loss_17": 1343.9837036132812, "kl_loss_3": 5395.271264648438, "kl_loss_6": 5231.13349609375, "learning_rate": 0.0009999899300364532, "loss": 4105.0148, "step": 120 }, { "ce_loss_12": 5.704882669448852, "ce_loss_17": 3.6224010229110717, "ce_loss_23": 2.8817583322525024, "ce_loss_3": 5.816133284568787, "ce_loss_6": 5.332496976852417, "epoch": 0.013, "grad_norm": 12288.0, "kl_loss_12": 5597.638903808594, "kl_loss_17": 1380.1296264648438, "kl_loss_3": 5679.002026367188, "kl_loss_6": 4769.773095703125, "learning_rate": 0.0009999773426770863, "loss": 4395.3734, "step": 130 }, { "ce_loss_12": 4.891789817810059, "ce_loss_17": 3.532665467262268, "ce_loss_23": 2.929382526874542, "ce_loss_3": 5.8127936840057375, "ce_loss_6": 5.244004225730896, "epoch": 0.014, "grad_norm": 6912.0, "kl_loss_12": 3870.0730102539064, "kl_loss_17": 1216.6474182128907, "kl_loss_3": 5634.811572265625, "kl_loss_6": 4561.313623046875, "learning_rate": 0.0009999597205514296, "loss": 3838.6836, "step": 140 }, { "ce_loss_12": 4.74582872390747, "ce_loss_17": 3.492581915855408, "ce_loss_23": 2.88243225812912, "ce_loss_3": 5.780796051025391, "ce_loss_6": 5.230218529701233, "epoch": 0.015, "grad_norm": 3872.0, "kl_loss_12": 3684.5867309570312, "kl_loss_17": 1215.8963134765625, "kl_loss_3": 5671.0294921875, "kl_loss_6": 4582.3294921875, "learning_rate": 0.0009999370638369377, "loss": 3805.625, "step": 150 }, { "ce_loss_12": 4.55562047958374, "ce_loss_17": 3.4915971159934998, "ce_loss_23": 2.922529602050781, "ce_loss_3": 5.791428542137146, "ce_loss_6": 5.213034510612488, "epoch": 0.016, "grad_norm": 5344.0, "kl_loss_12": 3269.4089599609374, "kl_loss_17": 1116.4551055908203, "kl_loss_3": 5598.087133789062, "kl_loss_6": 4509.215734863281, "learning_rate": 0.000999909372761763, "loss": 3618.3555, "step": 160 }, { "ce_loss_12": 4.466858816146851, "ce_loss_17": 3.4728991270065306, "ce_loss_23": 2.860163617134094, "ce_loss_3": 5.697683525085449, "ce_loss_6": 5.133940243721009, "epoch": 0.017, "grad_norm": 2448.0, "kl_loss_12": 3196.91201171875, "kl_loss_17": 1242.2661376953124, "kl_loss_3": 5557.818823242187, "kl_loss_6": 4480.91279296875, "learning_rate": 0.0009998766476047546, "loss": 3649.5055, "step": 170 }, { "ce_loss_12": 4.527968239784241, "ce_loss_17": 3.5157664895057676, "ce_loss_23": 2.8960798501968386, "ce_loss_3": 5.678347492218018, "ce_loss_6": 5.090836000442505, "epoch": 0.018, "grad_norm": 2608.0, "kl_loss_12": 3277.224816894531, "kl_loss_17": 1209.9160522460938, "kl_loss_3": 5431.537768554687, "kl_loss_6": 4314.870227050781, "learning_rate": 0.0009998388886954545, "loss": 3574.5352, "step": 180 }, { "ce_loss_12": 4.394872999191284, "ce_loss_17": 3.47138329744339, "ce_loss_23": 2.8707207679748534, "ce_loss_3": 5.5450726509094235, "ce_loss_6": 5.056916284561157, "epoch": 0.019, "grad_norm": 2040.0, "kl_loss_12": 3051.567834472656, "kl_loss_17": 1180.9717346191405, "kl_loss_3": 5247.836181640625, "kl_loss_6": 4322.745776367187, "learning_rate": 0.0009997960964140947, "loss": 3442.2301, "step": 190 }, { "ce_loss_12": 4.349503815174103, "ce_loss_17": 3.4052997827529907, "ce_loss_23": 2.8605754256248472, "ce_loss_3": 5.51627106666565, "ce_loss_6": 5.015147471427918, "epoch": 0.02, "grad_norm": 2008.0, "kl_loss_12": 2997.1719360351562, "kl_loss_17": 1077.3347778320312, "kl_loss_3": 5216.086987304688, "kl_loss_6": 4270.710314941406, "learning_rate": 0.0009997482711915926, "loss": 3372.0453, "step": 200 }, { "ce_loss_12": 4.382658886909485, "ce_loss_17": 3.366709041595459, "ce_loss_23": 2.8402206182479857, "ce_loss_3": 5.42800190448761, "ce_loss_6": 4.934572362899781, "epoch": 0.021, "grad_norm": 2000.0, "kl_loss_12": 3123.2125854492188, "kl_loss_17": 1060.0697784423828, "kl_loss_3": 5091.03828125, "kl_loss_6": 4182.4359130859375, "learning_rate": 0.0009996954135095479, "loss": 3352.6879, "step": 210 }, { "ce_loss_12": 4.337828183174134, "ce_loss_17": 3.416949248313904, "ce_loss_23": 2.910671079158783, "ce_loss_3": 5.403342247009277, "ce_loss_6": 4.892907905578613, "epoch": 0.022, "grad_norm": 2832.0, "kl_loss_12": 2856.504675292969, "kl_loss_17": 997.9278289794922, "kl_loss_3": 4914.284692382813, "kl_loss_6": 3927.723681640625, "learning_rate": 0.0009996375239002368, "loss": 3177.3701, "step": 220 }, { "ce_loss_12": 4.298712420463562, "ce_loss_17": 3.466365027427673, "ce_loss_23": 2.9782427191734313, "ce_loss_3": 5.402818751335144, "ce_loss_6": 4.915108633041382, "epoch": 0.023, "grad_norm": 1928.0, "kl_loss_12": 2680.0574462890627, "kl_loss_17": 967.0066162109375, "kl_loss_3": 4764.784228515625, "kl_loss_6": 3848.396130371094, "learning_rate": 0.0009995746029466072, "loss": 3073.8885, "step": 230 }, { "ce_loss_12": 4.158153474330902, "ce_loss_17": 3.2901054859161376, "ce_loss_23": 2.786073011159897, "ce_loss_3": 5.342944192886352, "ce_loss_6": 4.852350068092346, "epoch": 0.024, "grad_norm": 2784.0, "kl_loss_12": 2786.9808959960938, "kl_loss_17": 1015.3545806884765, "kl_loss_3": 5037.80654296875, "kl_loss_6": 4118.17490234375, "learning_rate": 0.0009995066512822719, "loss": 3153.4832, "step": 240 }, { "ce_loss_12": 4.2036160111427305, "ce_loss_17": 3.368748664855957, "ce_loss_23": 2.8782119750976562, "ce_loss_3": 5.450334095954895, "ce_loss_6": 4.917315721511841, "epoch": 0.025, "grad_norm": 1688.0, "kl_loss_12": 2688.274169921875, "kl_loss_17": 961.38095703125, "kl_loss_3": 5079.578344726562, "kl_loss_6": 4061.3109497070313, "learning_rate": 0.000999433669591504, "loss": 3096.8104, "step": 250 }, { "ce_loss_12": 4.106511771678925, "ce_loss_17": 3.2343634247779844, "ce_loss_23": 2.7847194910049438, "ce_loss_3": 5.36883807182312, "ce_loss_6": 4.7796388387680055, "epoch": 0.026, "grad_norm": 1416.0, "kl_loss_12": 2675.691442871094, "kl_loss_17": 906.2499481201172, "kl_loss_3": 5100.184497070312, "kl_loss_6": 3969.0149169921874, "learning_rate": 0.000999355658609228, "loss": 3087.484, "step": 260 }, { "ce_loss_12": 4.176587176322937, "ce_loss_17": 3.267600440979004, "ce_loss_23": 2.809903073310852, "ce_loss_3": 5.416643333435059, "ce_loss_6": 4.862660479545593, "epoch": 0.027, "grad_norm": 1624.0, "kl_loss_12": 2710.8676025390623, "kl_loss_17": 894.4491577148438, "kl_loss_3": 5104.5154296875, "kl_loss_6": 4047.3380737304688, "learning_rate": 0.0009992726191210138, "loss": 3135.7184, "step": 270 }, { "ce_loss_12": 4.147988736629486, "ce_loss_17": 3.2972861886024476, "ce_loss_23": 2.8483713388442995, "ce_loss_3": 5.283263945579529, "ce_loss_6": 4.793030953407287, "epoch": 0.028, "grad_norm": 1936.0, "kl_loss_12": 2634.4557495117188, "kl_loss_17": 890.9104583740234, "kl_loss_3": 4775.991870117187, "kl_loss_6": 3846.7409912109374, "learning_rate": 0.0009991845519630679, "loss": 3005.9059, "step": 280 }, { "ce_loss_12": 4.069076085090638, "ce_loss_17": 3.2262199759483337, "ce_loss_23": 2.7426149249076843, "ce_loss_3": 5.173510408401489, "ce_loss_6": 4.663527464866638, "epoch": 0.029, "grad_norm": 1464.0, "kl_loss_12": 2673.3083618164064, "kl_loss_17": 964.3922149658204, "kl_loss_3": 4793.293725585938, "kl_loss_6": 3818.1897827148437, "learning_rate": 0.0009990914580222257, "loss": 3053.4721, "step": 290 }, { "ce_loss_12": 4.102203106880188, "ce_loss_17": 3.312656319141388, "ce_loss_23": 2.877616453170776, "ce_loss_3": 5.184468483924865, "ce_loss_6": 4.681406426429748, "epoch": 0.03, "grad_norm": 1424.0, "kl_loss_12": 2515.7253540039064, "kl_loss_17": 879.6887023925781, "kl_loss_3": 4588.6248046875, "kl_loss_6": 3630.072717285156, "learning_rate": 0.0009989933382359422, "loss": 2960.4674, "step": 300 }, { "ce_loss_12": 4.090289652347565, "ce_loss_17": 3.3021645307540894, "ce_loss_23": 2.8886025071144106, "ce_loss_3": 5.160676980018616, "ce_loss_6": 4.689880275726319, "epoch": 0.031, "grad_norm": 1744.0, "kl_loss_12": 2450.2645874023438, "kl_loss_17": 831.6864807128907, "kl_loss_3": 4499.324084472656, "kl_loss_6": 3599.02568359375, "learning_rate": 0.0009988901935922825, "loss": 2877.4172, "step": 310 }, { "ce_loss_12": 4.008204507827759, "ce_loss_17": 3.2151699900627135, "ce_loss_23": 2.7390822887420656, "ce_loss_3": 5.131978940963745, "ce_loss_6": 4.660629677772522, "epoch": 0.032, "grad_norm": 1288.0, "kl_loss_12": 2575.4901611328123, "kl_loss_17": 951.4900726318359, "kl_loss_3": 4740.608227539063, "kl_loss_6": 3837.3116943359373, "learning_rate": 0.0009987820251299122, "loss": 2955.9014, "step": 320 }, { "ce_loss_12": 4.053432321548462, "ce_loss_17": 3.3205178499221804, "ce_loss_23": 2.8587931513786318, "ce_loss_3": 5.102370977401733, "ce_loss_6": 4.653764081001282, "epoch": 0.033, "grad_norm": 1264.0, "kl_loss_12": 2450.4338500976564, "kl_loss_17": 935.9856903076172, "kl_loss_3": 4473.6075439453125, "kl_loss_6": 3609.1573974609373, "learning_rate": 0.0009986688339380862, "loss": 2848.4352, "step": 330 }, { "ce_loss_12": 3.957953178882599, "ce_loss_17": 3.22804708480835, "ce_loss_23": 2.8191253423690794, "ce_loss_3": 4.9991214036941525, "ce_loss_6": 4.5394447326660154, "epoch": 0.034, "grad_norm": 996.0, "kl_loss_12": 2342.859753417969, "kl_loss_17": 820.8218719482422, "kl_loss_3": 4322.4416015625, "kl_loss_6": 3451.667041015625, "learning_rate": 0.0009985506211566387, "loss": 2758.6914, "step": 340 }, { "ce_loss_12": 3.9520851135253907, "ce_loss_17": 3.2317885398864745, "ce_loss_23": 2.846635568141937, "ce_loss_3": 5.002023839950562, "ce_loss_6": 4.5321044921875, "epoch": 0.035, "grad_norm": 1592.0, "kl_loss_12": 2274.8722229003906, "kl_loss_17": 782.2494018554687, "kl_loss_3": 4281.415600585938, "kl_loss_6": 3387.6537963867186, "learning_rate": 0.0009984273879759713, "loss": 2705.5883, "step": 350 }, { "ce_loss_12": 4.0336832165718075, "ce_loss_17": 3.288927936553955, "ce_loss_23": 2.874334526062012, "ce_loss_3": 5.094504308700562, "ce_loss_6": 4.62630341053009, "epoch": 0.036, "grad_norm": 964.0, "kl_loss_12": 2347.282421875, "kl_loss_17": 819.1060791015625, "kl_loss_3": 4394.406872558594, "kl_loss_6": 3496.9873046875, "learning_rate": 0.0009982991356370402, "loss": 2793.3129, "step": 360 }, { "ce_loss_12": 3.9753374218940736, "ce_loss_17": 3.2193778276443483, "ce_loss_23": 2.8541990399360655, "ce_loss_3": 5.038383626937867, "ce_loss_6": 4.584095597267151, "epoch": 0.037, "grad_norm": 936.0, "kl_loss_12": 2308.5005615234377, "kl_loss_17": 759.3042938232422, "kl_loss_3": 4345.300744628907, "kl_loss_6": 3480.0982177734377, "learning_rate": 0.0009981658654313456, "loss": 2738.6379, "step": 370 }, { "ce_loss_12": 4.011369419097901, "ce_loss_17": 3.2788562774658203, "ce_loss_23": 2.916998362541199, "ce_loss_3": 5.057713532447815, "ce_loss_6": 4.593180727958679, "epoch": 0.038, "grad_norm": 1048.0, "kl_loss_12": 2240.633892822266, "kl_loss_17": 730.3999633789062, "kl_loss_3": 4240.71103515625, "kl_loss_6": 3363.1546142578127, "learning_rate": 0.000998027578700917, "loss": 2685.7131, "step": 380 }, { "ce_loss_12": 3.9916287541389464, "ce_loss_17": 3.232191336154938, "ce_loss_23": 2.871097719669342, "ce_loss_3": 5.032142806053161, "ce_loss_6": 4.558016991615295, "epoch": 0.039, "grad_norm": 1104.0, "kl_loss_12": 2288.9161743164063, "kl_loss_17": 730.8479858398438, "kl_loss_3": 4279.519396972656, "kl_loss_6": 3377.6620239257813, "learning_rate": 0.0009978842768382998, "loss": 2698.6039, "step": 390 }, { "ce_loss_12": 3.9320905208587646, "ce_loss_17": 3.206320250034332, "ce_loss_23": 2.873453605175018, "ce_loss_3": 4.961167740821838, "ce_loss_6": 4.500527572631836, "epoch": 0.04, "grad_norm": 1200.0, "kl_loss_12": 2164.7929077148438, "kl_loss_17": 678.6263000488282, "kl_loss_3": 4138.049755859375, "kl_loss_6": 3266.367578125, "learning_rate": 0.0009977359612865424, "loss": 2588.5998, "step": 400 }, { "ce_loss_12": 3.9608339428901673, "ce_loss_17": 3.23901629447937, "ce_loss_23": 2.890236461162567, "ce_loss_3": 4.998275995254517, "ce_loss_6": 4.56150221824646, "epoch": 0.041, "grad_norm": 1328.0, "kl_loss_12": 2207.0178466796874, "kl_loss_17": 721.2607818603516, "kl_loss_3": 4192.5740966796875, "kl_loss_6": 3363.844616699219, "learning_rate": 0.0009975826335391806, "loss": 2604.1301, "step": 410 }, { "ce_loss_12": 3.9477717757225035, "ce_loss_17": 3.2436631083488465, "ce_loss_23": 2.907458317279816, "ce_loss_3": 4.954374933242798, "ce_loss_6": 4.549765229225159, "epoch": 0.042, "grad_norm": 844.0, "kl_loss_12": 2127.1870727539062, "kl_loss_17": 682.032470703125, "kl_loss_3": 4070.1541015625, "kl_loss_6": 3305.5493408203124, "learning_rate": 0.0009974242951402235, "loss": 2572.7324, "step": 420 }, { "ce_loss_12": 3.950932002067566, "ce_loss_17": 3.2564106822013854, "ce_loss_23": 2.910490798950195, "ce_loss_3": 4.9817784309387205, "ce_loss_6": 4.567001938819885, "epoch": 0.043, "grad_norm": 1104.0, "kl_loss_12": 2153.303643798828, "kl_loss_17": 712.436978149414, "kl_loss_3": 4140.805859375, "kl_loss_6": 3338.912951660156, "learning_rate": 0.0009972609476841367, "loss": 2562.109, "step": 430 }, { "ce_loss_12": 3.9008285284042357, "ce_loss_17": 3.1890947103500364, "ce_loss_23": 2.8326279640197756, "ce_loss_3": 4.932444286346436, "ce_loss_6": 4.5064095735549925, "epoch": 0.044, "grad_norm": 948.0, "kl_loss_12": 2182.2732788085937, "kl_loss_17": 723.6691040039062, "kl_loss_3": 4158.048254394531, "kl_loss_6": 3358.2782592773438, "learning_rate": 0.0009970925928158272, "loss": 2605.9805, "step": 440 }, { "ce_loss_12": 3.8430684328079225, "ce_loss_17": 3.1278234124183655, "ce_loss_23": 2.7837769746780396, "ce_loss_3": 4.891387319564819, "ce_loss_6": 4.4741298913955685, "epoch": 0.045, "grad_norm": 912.0, "kl_loss_12": 2194.3703674316407, "kl_loss_17": 712.3570159912109, "kl_loss_3": 4230.814050292969, "kl_loss_6": 3414.0078369140624, "learning_rate": 0.000996919232230627, "loss": 2615.0797, "step": 450 }, { "ce_loss_12": 3.873635399341583, "ce_loss_17": 3.1885122299194335, "ce_loss_23": 2.8668599367141723, "ce_loss_3": 4.8737973928451535, "ce_loss_6": 4.461533617973328, "epoch": 0.046, "grad_norm": 1096.0, "kl_loss_12": 2121.9797241210936, "kl_loss_17": 668.4944366455078, "kl_loss_3": 4048.25771484375, "kl_loss_6": 3273.9767822265626, "learning_rate": 0.0009967408676742752, "loss": 2483.2707, "step": 460 }, { "ce_loss_12": 3.997884488105774, "ce_loss_17": 3.330697500705719, "ce_loss_23": 3.0016987800598143, "ce_loss_3": 4.963337516784668, "ce_loss_6": 4.5454404830932615, "epoch": 0.047, "grad_norm": 1168.0, "kl_loss_12": 2082.495251464844, "kl_loss_17": 688.2158508300781, "kl_loss_3": 3956.098254394531, "kl_loss_6": 3153.3085205078123, "learning_rate": 0.0009965575009429006, "loss": 2550.675, "step": 470 }, { "ce_loss_12": 3.828306031227112, "ce_loss_17": 3.134593462944031, "ce_loss_23": 2.7883999347686768, "ce_loss_3": 4.853883934020996, "ce_loss_6": 4.421608352661133, "epoch": 0.048, "grad_norm": 924.0, "kl_loss_12": 2141.7112731933594, "kl_loss_17": 703.4312683105469, "kl_loss_3": 4134.702722167969, "kl_loss_6": 3301.436169433594, "learning_rate": 0.0009963691338830043, "loss": 2537.2172, "step": 480 }, { "ce_loss_12": 3.8659058570861817, "ce_loss_17": 3.199295365810394, "ce_loss_23": 2.878894364833832, "ce_loss_3": 4.872761750221253, "ce_loss_6": 4.462412023544312, "epoch": 0.049, "grad_norm": 880.0, "kl_loss_12": 2068.831591796875, "kl_loss_17": 653.7953063964844, "kl_loss_3": 4015.805126953125, "kl_loss_6": 3236.064831542969, "learning_rate": 0.0009961757683914405, "loss": 2477.0184, "step": 490 }, { "ce_loss_12": 3.864358937740326, "ce_loss_17": 3.1860976815223694, "ce_loss_23": 2.864992415904999, "ce_loss_3": 4.822944927215576, "ce_loss_6": 4.406767797470093, "epoch": 0.05, "grad_norm": 952.0, "kl_loss_12": 2072.2715576171877, "kl_loss_17": 656.6796630859375, "kl_loss_3": 3929.730029296875, "kl_loss_6": 3145.1748901367187, "learning_rate": 0.0009959774064153978, "loss": 2476.4344, "step": 500 }, { "ce_loss_12": 3.8237053871154787, "ce_loss_17": 3.1873385667800904, "ce_loss_23": 2.8856295228004454, "ce_loss_3": 4.799383807182312, "ce_loss_6": 4.393183696269989, "epoch": 0.051, "grad_norm": 1104.0, "kl_loss_12": 1984.5502136230468, "kl_loss_17": 638.6908020019531, "kl_loss_3": 3862.577673339844, "kl_loss_6": 3089.1991943359376, "learning_rate": 0.0009957740499523787, "loss": 2434.852, "step": 510 }, { "ce_loss_12": 3.8531986594200136, "ce_loss_17": 3.2021188259124758, "ce_loss_23": 2.894522261619568, "ce_loss_3": 4.832447981834411, "ce_loss_6": 4.422991037368774, "epoch": 0.052, "grad_norm": 996.0, "kl_loss_12": 2003.9843200683595, "kl_loss_17": 635.9474334716797, "kl_loss_3": 3881.0363525390626, "kl_loss_6": 3101.2047607421873, "learning_rate": 0.0009955657010501807, "loss": 2418.8723, "step": 520 }, { "ce_loss_12": 3.8310924410820006, "ce_loss_17": 3.1631489157676698, "ce_loss_23": 2.848756265640259, "ce_loss_3": 4.833318781852722, "ce_loss_6": 4.419972848892212, "epoch": 0.053, "grad_norm": 1040.0, "kl_loss_12": 2029.2051391601562, "kl_loss_17": 638.5391387939453, "kl_loss_3": 3972.6285766601563, "kl_loss_6": 3185.361083984375, "learning_rate": 0.000995352361806875, "loss": 2437.1742, "step": 530 }, { "ce_loss_12": 3.8631009817123414, "ce_loss_17": 3.212864434719086, "ce_loss_23": 2.8967057943344114, "ce_loss_3": 4.83975579738617, "ce_loss_6": 4.445706224441528, "epoch": 0.054, "grad_norm": 812.0, "kl_loss_12": 2037.0037414550782, "kl_loss_17": 649.3562622070312, "kl_loss_3": 3937.4210205078125, "kl_loss_6": 3171.944091796875, "learning_rate": 0.0009951340343707852, "loss": 2466.9324, "step": 540 }, { "ce_loss_12": 3.90088312625885, "ce_loss_17": 3.240309000015259, "ce_loss_23": 2.9396286845207213, "ce_loss_3": 4.886050772666931, "ce_loss_6": 4.482292723655701, "epoch": 0.055, "grad_norm": 1056.0, "kl_loss_12": 1982.8837341308595, "kl_loss_17": 613.3875244140625, "kl_loss_3": 3888.7989135742187, "kl_loss_6": 3124.067333984375, "learning_rate": 0.0009949107209404665, "loss": 2422.4299, "step": 550 }, { "ce_loss_12": 3.8069709300994874, "ce_loss_17": 3.1651495575904844, "ce_loss_23": 2.8611255407333376, "ce_loss_3": 4.779746437072754, "ce_loss_6": 4.3871207475662235, "epoch": 0.056, "grad_norm": 904.0, "kl_loss_12": 1977.2301452636718, "kl_loss_17": 620.4258605957032, "kl_loss_3": 3868.1070190429687, "kl_loss_6": 3116.3320068359376, "learning_rate": 0.0009946824237646824, "loss": 2404.8865, "step": 560 }, { "ce_loss_12": 3.776531147956848, "ce_loss_17": 3.1243035554885865, "ce_loss_23": 2.8193069100379944, "ce_loss_3": 4.7919842004776, "ce_loss_6": 4.361578702926636, "epoch": 0.057, "grad_norm": 1008.0, "kl_loss_12": 1993.1865539550781, "kl_loss_17": 625.6534484863281, "kl_loss_3": 3952.2551391601564, "kl_loss_6": 3133.5532958984377, "learning_rate": 0.0009944491451423828, "loss": 2463.0141, "step": 570 }, { "ce_loss_12": 3.787885057926178, "ce_loss_17": 3.125153052806854, "ce_loss_23": 2.812381112575531, "ce_loss_3": 4.800907230377197, "ce_loss_6": 4.379597437381745, "epoch": 0.058, "grad_norm": 1072.0, "kl_loss_12": 2041.7862365722656, "kl_loss_17": 633.625210571289, "kl_loss_3": 4005.790637207031, "kl_loss_6": 3195.4418701171876, "learning_rate": 0.0009942108874226813, "loss": 2431.434, "step": 580 }, { "ce_loss_12": 3.8284475803375244, "ce_loss_17": 3.205892300605774, "ce_loss_23": 2.911292541027069, "ce_loss_3": 4.796784925460815, "ce_loss_6": 4.386137056350708, "epoch": 0.059, "grad_norm": 956.0, "kl_loss_12": 1921.2669250488282, "kl_loss_17": 609.1336517333984, "kl_loss_3": 3794.235485839844, "kl_loss_6": 3003.3477172851562, "learning_rate": 0.00099396765300483, "loss": 2323.3688, "step": 590 }, { "ce_loss_12": 3.813377547264099, "ce_loss_17": 3.2030410766601562, "ce_loss_23": 2.8971424460411073, "ce_loss_3": 4.776553750038147, "ce_loss_6": 4.383290815353393, "epoch": 0.06, "grad_norm": 1112.0, "kl_loss_12": 1935.8569091796876, "kl_loss_17": 634.0841278076172, "kl_loss_3": 3804.7228149414063, "kl_loss_6": 3041.823278808594, "learning_rate": 0.0009937194443381972, "loss": 2351.4145, "step": 600 }, { "ce_loss_12": 3.82430579662323, "ce_loss_17": 3.239823818206787, "ce_loss_23": 2.930612790584564, "ce_loss_3": 4.761824107170105, "ce_loss_6": 4.363707947731018, "epoch": 0.061, "grad_norm": 908.0, "kl_loss_12": 1880.8529052734375, "kl_loss_17": 646.1875213623047, "kl_loss_3": 3713.361767578125, "kl_loss_6": 2948.2285400390624, "learning_rate": 0.0009934662639222412, "loss": 2351.317, "step": 610 }, { "ce_loss_12": 3.8140470504760744, "ce_loss_17": 3.1936643242836, "ce_loss_23": 2.882937967777252, "ce_loss_3": 4.7908659219741825, "ce_loss_6": 4.390506052970887, "epoch": 0.062, "grad_norm": 952.0, "kl_loss_12": 1963.47578125, "kl_loss_17": 643.031982421875, "kl_loss_3": 3858.526159667969, "kl_loss_6": 3099.5319702148436, "learning_rate": 0.000993208114306486, "loss": 2373.4488, "step": 620 }, { "ce_loss_12": 3.7572766065597536, "ce_loss_17": 3.1062564730644224, "ce_loss_23": 2.814168381690979, "ce_loss_3": 4.740272831916809, "ce_loss_6": 4.334878396987915, "epoch": 0.063, "grad_norm": 924.0, "kl_loss_12": 1978.2345825195312, "kl_loss_17": 614.9058258056641, "kl_loss_3": 3885.282019042969, "kl_loss_6": 3110.789013671875, "learning_rate": 0.0009929449980904952, "loss": 2342.2086, "step": 630 }, { "ce_loss_12": 3.7797908902168276, "ce_loss_17": 3.1566942811012266, "ce_loss_23": 2.869984269142151, "ce_loss_3": 4.747637248039245, "ce_loss_6": 4.350768375396728, "epoch": 0.064, "grad_norm": 800.0, "kl_loss_12": 1928.0524047851563, "kl_loss_17": 596.5263305664063, "kl_loss_3": 3814.7638916015626, "kl_loss_6": 3054.167761230469, "learning_rate": 0.0009926769179238466, "loss": 2326.8762, "step": 640 }, { "ce_loss_12": 3.8469881892204283, "ce_loss_17": 3.203332006931305, "ce_loss_23": 2.903032958507538, "ce_loss_3": 4.791834831237793, "ce_loss_6": 4.398166084289551, "epoch": 0.065, "grad_norm": 1020.0, "kl_loss_12": 1990.3420837402343, "kl_loss_17": 627.5457916259766, "kl_loss_3": 3828.7911743164063, "kl_loss_6": 3068.594140625, "learning_rate": 0.000992403876506104, "loss": 2354.9871, "step": 650 }, { "ce_loss_12": 3.7656350135803223, "ce_loss_17": 3.137479567527771, "ce_loss_23": 2.847555148601532, "ce_loss_3": 4.724980711936951, "ce_loss_6": 4.33546097278595, "epoch": 0.066, "grad_norm": 872.0, "kl_loss_12": 1947.3597778320313, "kl_loss_17": 601.6236145019532, "kl_loss_3": 3810.823876953125, "kl_loss_6": 3051.07158203125, "learning_rate": 0.0009921258765867918, "loss": 2351.0516, "step": 660 }, { "ce_loss_12": 3.7432477951049803, "ce_loss_17": 3.102054977416992, "ce_loss_23": 2.8209555387496947, "ce_loss_3": 4.726716303825379, "ce_loss_6": 4.332750606536865, "epoch": 0.067, "grad_norm": 1048.0, "kl_loss_12": 1934.0251159667969, "kl_loss_17": 590.7294250488281, "kl_loss_3": 3862.057373046875, "kl_loss_6": 3099.9753173828126, "learning_rate": 0.0009918429209653662, "loss": 2332.4148, "step": 670 }, { "ce_loss_12": 3.79118971824646, "ce_loss_17": 3.160488820075989, "ce_loss_23": 2.8699307322502134, "ce_loss_3": 4.746296501159668, "ce_loss_6": 4.369972658157349, "epoch": 0.068, "grad_norm": 880.0, "kl_loss_12": 1943.5599060058594, "kl_loss_17": 621.3161865234375, "kl_loss_3": 3822.037878417969, "kl_loss_6": 3095.7862060546877, "learning_rate": 0.0009915550124911866, "loss": 2309.8324, "step": 680 }, { "ce_loss_12": 3.7744308590888975, "ce_loss_17": 3.164461362361908, "ce_loss_23": 2.871832025051117, "ce_loss_3": 4.717565703392029, "ce_loss_6": 4.334046483039856, "epoch": 0.069, "grad_norm": 928.0, "kl_loss_12": 1894.8691345214843, "kl_loss_17": 600.1791351318359, "kl_loss_3": 3719.063757324219, "kl_loss_6": 2989.2135620117188, "learning_rate": 0.0009912621540634887, "loss": 2298.6404, "step": 690 }, { "ce_loss_12": 3.7581581592559816, "ce_loss_17": 3.175926184654236, "ce_loss_23": 2.91041499376297, "ce_loss_3": 4.693021059036255, "ce_loss_6": 4.328584957122803, "epoch": 0.07, "grad_norm": 952.0, "kl_loss_12": 1805.7688293457031, "kl_loss_17": 557.3165618896485, "kl_loss_3": 3636.874401855469, "kl_loss_6": 2931.793310546875, "learning_rate": 0.0009909643486313534, "loss": 2253.7109, "step": 700 }, { "ce_loss_12": 3.694593846797943, "ce_loss_17": 3.0789793491363526, "ce_loss_23": 2.806851255893707, "ce_loss_3": 4.675688862800598, "ce_loss_6": 4.283495950698852, "epoch": 0.071, "grad_norm": 836.0, "kl_loss_12": 1873.1919860839844, "kl_loss_17": 561.667138671875, "kl_loss_3": 3785.0610595703124, "kl_loss_6": 3032.8089111328127, "learning_rate": 0.000990661599193678, "loss": 2339.2168, "step": 710 }, { "ce_loss_12": 3.781097650527954, "ce_loss_17": 3.1813387274742126, "ce_loss_23": 2.9092867970466614, "ce_loss_3": 4.718334794044495, "ce_loss_6": 4.346433448791504, "epoch": 0.072, "grad_norm": 844.0, "kl_loss_12": 1837.0475830078126, "kl_loss_17": 569.5842346191406, "kl_loss_3": 3658.182763671875, "kl_loss_6": 2955.2635498046875, "learning_rate": 0.0009903539087991462, "loss": 2259.2977, "step": 720 }, { "ce_loss_12": 3.7577704548835755, "ce_loss_17": 3.175670492649078, "ce_loss_23": 2.8987006187438964, "ce_loss_3": 4.700696063041687, "ce_loss_6": 4.328676080703735, "epoch": 0.073, "grad_norm": 828.0, "kl_loss_12": 1822.2861938476562, "kl_loss_17": 580.5244903564453, "kl_loss_3": 3665.8289306640627, "kl_loss_6": 2948.7177856445314, "learning_rate": 0.0009900412805461966, "loss": 2269.7988, "step": 730 }, { "ce_loss_12": 3.7941108107566834, "ce_loss_17": 3.2247503757476808, "ce_loss_23": 2.9608768463134765, "ce_loss_3": 4.737755060195923, "ce_loss_6": 4.366144275665283, "epoch": 0.074, "grad_norm": 1152.0, "kl_loss_12": 1778.3082214355468, "kl_loss_17": 545.759782409668, "kl_loss_3": 3633.7240478515623, "kl_loss_6": 2913.7588500976562, "learning_rate": 0.0009897237175829927, "loss": 2249.7711, "step": 740 }, { "ce_loss_12": 3.7247921228408813, "ce_loss_17": 3.1244994401931763, "ce_loss_23": 2.854635012149811, "ce_loss_3": 4.680092597007752, "ce_loss_6": 4.321918106079101, "epoch": 0.075, "grad_norm": 876.0, "kl_loss_12": 1839.49482421875, "kl_loss_17": 563.657942199707, "kl_loss_3": 3715.802197265625, "kl_loss_6": 3012.9723876953126, "learning_rate": 0.0009894012231073895, "loss": 2259.4498, "step": 750 }, { "ce_loss_12": 3.7520262837409972, "ce_loss_17": 3.1635246515274047, "ce_loss_23": 2.9015506505966187, "ce_loss_3": 4.691110682487488, "ce_loss_6": 4.32509058713913, "epoch": 0.076, "grad_norm": 908.0, "kl_loss_12": 1785.3197631835938, "kl_loss_17": 539.1061965942383, "kl_loss_3": 3617.3318725585937, "kl_loss_6": 2915.640002441406, "learning_rate": 0.0009890738003669028, "loss": 2248.3758, "step": 760 }, { "ce_loss_12": 3.7484350919723513, "ce_loss_17": 3.133402609825134, "ce_loss_23": 2.8702358484268187, "ce_loss_3": 4.71472737789154, "ce_loss_6": 4.340381741523743, "epoch": 0.077, "grad_norm": 772.0, "kl_loss_12": 1869.190771484375, "kl_loss_17": 557.6747970581055, "kl_loss_3": 3771.08115234375, "kl_loss_6": 3040.724951171875, "learning_rate": 0.0009887414526586764, "loss": 2246.727, "step": 770 }, { "ce_loss_12": 3.7698349475860597, "ce_loss_17": 3.175296950340271, "ce_loss_23": 2.920288693904877, "ce_loss_3": 4.71705801486969, "ce_loss_6": 4.331751799583435, "epoch": 0.078, "grad_norm": 788.0, "kl_loss_12": 1792.9148071289062, "kl_loss_17": 538.2495681762696, "kl_loss_3": 3645.0748657226563, "kl_loss_6": 2909.5169311523437, "learning_rate": 0.0009884041833294476, "loss": 2177.8914, "step": 780 }, { "ce_loss_12": 3.7557843208312987, "ce_loss_17": 3.1842599511146545, "ce_loss_23": 2.927360641956329, "ce_loss_3": 4.686308670043945, "ce_loss_6": 4.309764838218689, "epoch": 0.079, "grad_norm": 960.0, "kl_loss_12": 1760.4378784179687, "kl_loss_17": 537.1736328125, "kl_loss_3": 3580.4406494140626, "kl_loss_6": 2858.5263671875, "learning_rate": 0.000988061995775515, "loss": 2255.1971, "step": 790 }, { "ce_loss_12": 3.7003114342689516, "ce_loss_17": 3.1278024435043337, "ce_loss_23": 2.8603140830993654, "ce_loss_3": 4.623929166793824, "ce_loss_6": 4.252516531944275, "epoch": 0.08, "grad_norm": 960.0, "kl_loss_12": 1791.1098693847657, "kl_loss_17": 572.117807006836, "kl_loss_3": 3605.273156738281, "kl_loss_6": 2881.9625, "learning_rate": 0.0009877148934427035, "loss": 2211.3432, "step": 800 }, { "ce_loss_12": 3.7298121809959413, "ce_loss_17": 3.1572980999946596, "ce_loss_23": 2.8989072322845457, "ce_loss_3": 4.679050588607788, "ce_loss_6": 4.30648295879364, "epoch": 0.081, "grad_norm": 828.0, "kl_loss_12": 1769.9298461914063, "kl_loss_17": 549.3276397705079, "kl_loss_3": 3633.3033081054687, "kl_loss_6": 2918.2040771484376, "learning_rate": 0.0009873628798263297, "loss": 2195.8723, "step": 810 }, { "ce_loss_12": 3.6752089977264406, "ce_loss_17": 3.12715106010437, "ce_loss_23": 2.8626845240592957, "ce_loss_3": 4.616334772109985, "ce_loss_6": 4.246775162220001, "epoch": 0.082, "grad_norm": 928.0, "kl_loss_12": 1739.75078125, "kl_loss_17": 553.6329010009765, "kl_loss_3": 3569.2022094726562, "kl_loss_6": 2855.6306396484374, "learning_rate": 0.0009870059584711668, "loss": 2231.7055, "step": 820 }, { "ce_loss_12": 3.6967708706855773, "ce_loss_17": 3.126306390762329, "ce_loss_23": 2.8665772557258604, "ce_loss_3": 4.637620544433593, "ce_loss_6": 4.258155560493469, "epoch": 0.083, "grad_norm": 932.0, "kl_loss_12": 1746.82158203125, "kl_loss_17": 537.5760116577148, "kl_loss_3": 3573.961315917969, "kl_loss_6": 2853.324560546875, "learning_rate": 0.000986644132971409, "loss": 2181.06, "step": 830 }, { "ce_loss_12": 3.707110118865967, "ce_loss_17": 3.124701976776123, "ce_loss_23": 2.8559574365615843, "ce_loss_3": 4.668276762962341, "ce_loss_6": 4.28531403541565, "epoch": 0.084, "grad_norm": 808.0, "kl_loss_12": 1796.9253723144532, "kl_loss_17": 546.925749206543, "kl_loss_3": 3666.6813110351563, "kl_loss_6": 2930.093310546875, "learning_rate": 0.0009862774069706345, "loss": 2212.5172, "step": 840 }, { "ce_loss_12": 3.774817633628845, "ce_loss_17": 3.231712806224823, "ce_loss_23": 2.9828774333000183, "ce_loss_3": 4.684700012207031, "ce_loss_6": 4.320200824737549, "epoch": 0.085, "grad_norm": 800.0, "kl_loss_12": 1732.337646484375, "kl_loss_17": 536.6730453491211, "kl_loss_3": 3509.0249267578124, "kl_loss_6": 2806.8006591796875, "learning_rate": 0.000985905784161771, "loss": 2170.632, "step": 850 }, { "ce_loss_12": 3.715651345252991, "ce_loss_17": 3.154586577415466, "ce_loss_23": 2.9106888294219972, "ce_loss_3": 4.637371349334717, "ce_loss_6": 4.270102548599243, "epoch": 0.086, "grad_norm": 744.0, "kl_loss_12": 1734.4763610839843, "kl_loss_17": 514.9126983642578, "kl_loss_3": 3539.1369018554688, "kl_loss_6": 2834.6635498046876, "learning_rate": 0.000985529268287055, "loss": 2148.7227, "step": 860 }, { "ce_loss_12": 3.674989342689514, "ce_loss_17": 3.1098021984100344, "ce_loss_23": 2.8464756488800047, "ce_loss_3": 4.642683744430542, "ce_loss_6": 4.276080477237701, "epoch": 0.087, "grad_norm": 856.0, "kl_loss_12": 1765.6208251953126, "kl_loss_17": 545.826058959961, "kl_loss_3": 3641.5214111328123, "kl_loss_6": 2934.75615234375, "learning_rate": 0.0009851478631379982, "loss": 2215.8617, "step": 870 }, { "ce_loss_12": 3.7032182455062865, "ce_loss_17": 3.157497191429138, "ce_loss_23": 2.898158383369446, "ce_loss_3": 4.64922685623169, "ce_loss_6": 4.282612156867981, "epoch": 0.088, "grad_norm": 836.0, "kl_loss_12": 1717.8744384765625, "kl_loss_17": 551.1847427368164, "kl_loss_3": 3571.97431640625, "kl_loss_6": 2868.6826782226562, "learning_rate": 0.0009847615725553456, "loss": 2173.4846, "step": 880 }, { "ce_loss_12": 3.71795312166214, "ce_loss_17": 3.1950910091400146, "ce_loss_23": 2.9528208017349242, "ce_loss_3": 4.61668553352356, "ce_loss_6": 4.271125149726868, "epoch": 0.089, "grad_norm": 1016.0, "kl_loss_12": 1644.957293701172, "kl_loss_17": 517.1908355712891, "kl_loss_3": 3410.7930419921877, "kl_loss_6": 2743.155871582031, "learning_rate": 0.0009843704004290394, "loss": 2151.0193, "step": 890 }, { "ce_loss_12": 3.663389527797699, "ce_loss_17": 3.1275211811065673, "ce_loss_23": 2.864755928516388, "ce_loss_3": 4.603832817077636, "ce_loss_6": 4.239253675937652, "epoch": 0.09, "grad_norm": 716.0, "kl_loss_12": 1722.57783203125, "kl_loss_17": 551.8916763305664, "kl_loss_3": 3570.454870605469, "kl_loss_6": 2861.2793823242187, "learning_rate": 0.0009839743506981783, "loss": 2167.943, "step": 900 }, { "ce_loss_12": 3.6289228081703184, "ce_loss_17": 3.0549963235855104, "ce_loss_23": 2.787931501865387, "ce_loss_3": 4.591374397277832, "ce_loss_6": 4.221539330482483, "epoch": 0.091, "grad_norm": 768.0, "kl_loss_12": 1782.277117919922, "kl_loss_17": 558.6154708862305, "kl_loss_3": 3686.1168212890625, "kl_loss_6": 2968.0820068359376, "learning_rate": 0.0009835734273509786, "loss": 2207.0652, "step": 910 }, { "ce_loss_12": 3.693990683555603, "ce_loss_17": 3.1360571503639223, "ce_loss_23": 2.8765580654144287, "ce_loss_3": 4.6382588863372805, "ce_loss_6": 4.266951274871826, "epoch": 0.092, "grad_norm": 820.0, "kl_loss_12": 1706.47421875, "kl_loss_17": 539.3909072875977, "kl_loss_3": 3556.4054931640626, "kl_loss_6": 2838.4239868164063, "learning_rate": 0.0009831676344247342, "loss": 2159.5197, "step": 920 }, { "ce_loss_12": 3.683386981487274, "ce_loss_17": 3.1375192284584044, "ce_loss_23": 2.9041780829429626, "ce_loss_3": 4.575248861312867, "ce_loss_6": 4.217752540111542, "epoch": 0.093, "grad_norm": 832.0, "kl_loss_12": 1681.6122253417968, "kl_loss_17": 507.54964752197264, "kl_loss_3": 3429.8853271484377, "kl_loss_6": 2744.8274047851564, "learning_rate": 0.0009827569760057755, "loss": 2126.2113, "step": 930 }, { "ce_loss_12": 3.6735096335411073, "ce_loss_17": 3.0771583557128905, "ce_loss_23": 2.817060923576355, "ce_loss_3": 4.632097887992859, "ce_loss_6": 4.2522142887115475, "epoch": 0.094, "grad_norm": 920.0, "kl_loss_12": 1796.5344604492188, "kl_loss_17": 537.3643798828125, "kl_loss_3": 3685.60615234375, "kl_loss_6": 2951.1067016601564, "learning_rate": 0.000982341456229428, "loss": 2172.0842, "step": 940 }, { "ce_loss_12": 3.730703866481781, "ce_loss_17": 3.161611866950989, "ce_loss_23": 2.9119389176368715, "ce_loss_3": 4.65162878036499, "ce_loss_6": 4.285764884948731, "epoch": 0.095, "grad_norm": 876.0, "kl_loss_12": 1769.41083984375, "kl_loss_17": 528.7643432617188, "kl_loss_3": 3566.267614746094, "kl_loss_6": 2857.342639160156, "learning_rate": 0.000981921079279971, "loss": 2131.398, "step": 950 }, { "ce_loss_12": 3.6671907782554625, "ce_loss_17": 3.14452269077301, "ce_loss_23": 2.9202617764472962, "ce_loss_3": 4.5672272682189945, "ce_loss_6": 4.20267025232315, "epoch": 0.096, "grad_norm": 884.0, "kl_loss_12": 1647.3808959960938, "kl_loss_17": 494.3776519775391, "kl_loss_3": 3406.887805175781, "kl_loss_6": 2712.2851440429686, "learning_rate": 0.0009814958493905962, "loss": 2087.8811, "step": 960 }, { "ce_loss_12": 3.675451910495758, "ce_loss_17": 3.126833999156952, "ce_loss_23": 2.8827471137046814, "ce_loss_3": 4.63006055355072, "ce_loss_6": 4.257764625549316, "epoch": 0.097, "grad_norm": 828.0, "kl_loss_12": 1702.2585144042969, "kl_loss_17": 518.7686157226562, "kl_loss_3": 3553.715295410156, "kl_loss_6": 2842.6969970703126, "learning_rate": 0.0009810657708433637, "loss": 2178.4695, "step": 970 }, { "ce_loss_12": 3.699459207057953, "ce_loss_17": 3.196008837223053, "ce_loss_23": 2.953006935119629, "ce_loss_3": 4.605400490760803, "ce_loss_6": 4.242717599868774, "epoch": 0.098, "grad_norm": 792.0, "kl_loss_12": 1612.577392578125, "kl_loss_17": 508.1386352539063, "kl_loss_3": 3371.6166381835938, "kl_loss_6": 2683.5531372070313, "learning_rate": 0.0009806308479691594, "loss": 2072.0863, "step": 980 }, { "ce_loss_12": 3.746099066734314, "ce_loss_17": 3.2276832103729247, "ce_loss_23": 2.9513447999954225, "ce_loss_3": 4.660121750831604, "ce_loss_6": 4.313950896263123, "epoch": 0.099, "grad_norm": 1064.0, "kl_loss_12": 1686.7129333496093, "kl_loss_17": 575.0331268310547, "kl_loss_3": 3487.668017578125, "kl_loss_6": 2801.712780761719, "learning_rate": 0.0009801910851476522, "loss": 2125.6598, "step": 990 }, { "ce_loss_12": 3.6821831703186034, "ce_loss_17": 3.1449294328689574, "ce_loss_23": 2.890371763706207, "ce_loss_3": 4.6326807022094725, "ce_loss_6": 4.274430382251739, "epoch": 0.1, "grad_norm": 1080.0, "kl_loss_12": 1714.1190612792968, "kl_loss_17": 552.9030899047851, "kl_loss_3": 3598.4555908203124, "kl_loss_6": 2901.4451171875, "learning_rate": 0.0009797464868072487, "loss": 2147.7566, "step": 1000 }, { "ce_loss_12": 3.660784673690796, "ce_loss_17": 3.1308478116989136, "ce_loss_23": 2.8782097458839417, "ce_loss_3": 4.59887433052063, "ce_loss_6": 4.227726662158966, "epoch": 0.101, "grad_norm": 840.0, "kl_loss_12": 1695.8429382324218, "kl_loss_17": 536.8638442993164, "kl_loss_3": 3530.2622924804687, "kl_loss_6": 2826.7454345703127, "learning_rate": 0.0009792970574250492, "loss": 2137.6924, "step": 1010 }, { "ce_loss_12": 3.66312894821167, "ce_loss_17": 3.1327234148979186, "ce_loss_23": 2.888000476360321, "ce_loss_3": 4.584208631515503, "ce_loss_6": 4.2243523597717285, "epoch": 0.102, "grad_norm": 1020.0, "kl_loss_12": 1655.641827392578, "kl_loss_17": 515.9014770507813, "kl_loss_3": 3461.872509765625, "kl_loss_6": 2770.165979003906, "learning_rate": 0.0009788428015268028, "loss": 2076.8967, "step": 1020 }, { "ce_loss_12": 3.658862817287445, "ce_loss_17": 3.1359084010124207, "ce_loss_23": 2.8994009375572203, "ce_loss_3": 4.570998239517212, "ce_loss_6": 4.215702986717224, "epoch": 0.103, "grad_norm": 720.0, "kl_loss_12": 1638.7247253417968, "kl_loss_17": 514.6052276611329, "kl_loss_3": 3429.5267700195313, "kl_loss_6": 2738.1430419921876, "learning_rate": 0.0009783837236868609, "loss": 2080.224, "step": 1030 }, { "ce_loss_12": 3.6323411583900453, "ce_loss_17": 3.0970885038375853, "ce_loss_23": 2.8580228686332703, "ce_loss_3": 4.537933397293091, "ce_loss_6": 4.185244929790497, "epoch": 0.104, "grad_norm": 816.0, "kl_loss_12": 1652.1881530761718, "kl_loss_17": 504.38118438720704, "kl_loss_3": 3413.4997436523436, "kl_loss_6": 2735.0294799804688, "learning_rate": 0.0009779198285281327, "loss": 2065.5354, "step": 1040 }, { "ce_loss_12": 3.647919070720673, "ce_loss_17": 3.0967610716819762, "ce_loss_23": 2.863264966011047, "ce_loss_3": 4.571896409988403, "ce_loss_6": 4.211425912380219, "epoch": 0.105, "grad_norm": 780.0, "kl_loss_12": 1667.45146484375, "kl_loss_17": 492.2688583374023, "kl_loss_3": 3491.690625, "kl_loss_6": 2786.4512573242187, "learning_rate": 0.0009774511207220368, "loss": 2097.541, "step": 1050 }, { "ce_loss_12": 3.6793935298919678, "ce_loss_17": 3.1396284341812133, "ce_loss_23": 2.9056878805160524, "ce_loss_3": 4.610803484916687, "ce_loss_6": 4.24627730846405, "epoch": 0.106, "grad_norm": 680.0, "kl_loss_12": 1664.4289855957031, "kl_loss_17": 503.2407257080078, "kl_loss_3": 3494.34267578125, "kl_loss_6": 2796.4139770507813, "learning_rate": 0.0009769776049884564, "loss": 2101.0625, "step": 1060 }, { "ce_loss_12": 3.6110607266426085, "ce_loss_17": 3.0582555770874023, "ce_loss_23": 2.8185740232467653, "ce_loss_3": 4.550081133842468, "ce_loss_6": 4.188111197948456, "epoch": 0.107, "grad_norm": 804.0, "kl_loss_12": 1693.1038452148437, "kl_loss_17": 502.1560562133789, "kl_loss_3": 3553.5575317382813, "kl_loss_6": 2851.522509765625, "learning_rate": 0.0009764992860956889, "loss": 2160.6059, "step": 1070 }, { "ce_loss_12": 3.6763460516929625, "ce_loss_17": 3.1736180782318115, "ce_loss_23": 2.96165292263031, "ce_loss_3": 4.553462076187134, "ce_loss_6": 4.211921668052673, "epoch": 0.108, "grad_norm": 940.0, "kl_loss_12": 1565.0017883300782, "kl_loss_17": 465.6837585449219, "kl_loss_3": 3285.5805053710938, "kl_loss_6": 2630.6787231445314, "learning_rate": 0.0009760161688604008, "loss": 2027.6166, "step": 1080 }, { "ce_loss_12": 3.704931104183197, "ce_loss_17": 3.183962869644165, "ce_loss_23": 2.955583941936493, "ce_loss_3": 4.6247032403945925, "ce_loss_6": 4.262003123760223, "epoch": 0.109, "grad_norm": 780.0, "kl_loss_12": 1606.7292419433593, "kl_loss_17": 479.0166900634766, "kl_loss_3": 3407.0781982421877, "kl_loss_6": 2713.0140380859375, "learning_rate": 0.0009755282581475768, "loss": 2075.1375, "step": 1090 }, { "ce_loss_12": 3.735625076293945, "ce_loss_17": 3.2278528928756716, "ce_loss_23": 2.992156505584717, "ce_loss_3": 4.650100636482239, "ce_loss_6": 4.283954048156739, "epoch": 0.11, "grad_norm": 1120.0, "kl_loss_12": 1597.375665283203, "kl_loss_17": 487.4188934326172, "kl_loss_3": 3389.489611816406, "kl_loss_6": 2680.6547973632814, "learning_rate": 0.0009750355588704727, "loss": 2029.5996, "step": 1100 }, { "ce_loss_12": 3.602501726150513, "ce_loss_17": 3.0762116074562074, "ce_loss_23": 2.8531104922294617, "ce_loss_3": 4.54400155544281, "ce_loss_6": 4.181029498577118, "epoch": 0.111, "grad_norm": 892.0, "kl_loss_12": 1600.1835693359376, "kl_loss_17": 478.77005157470705, "kl_loss_3": 3453.218310546875, "kl_loss_6": 2753.7009399414064, "learning_rate": 0.0009745380759905647, "loss": 2102.3586, "step": 1110 }, { "ce_loss_12": 3.566656935214996, "ce_loss_17": 3.036020517349243, "ce_loss_23": 2.811892902851105, "ce_loss_3": 4.49903085231781, "ce_loss_6": 4.14581116437912, "epoch": 0.112, "grad_norm": 952.0, "kl_loss_12": 1613.6444152832032, "kl_loss_17": 472.362158203125, "kl_loss_3": 3453.385888671875, "kl_loss_6": 2762.3469360351564, "learning_rate": 0.0009740358145174998, "loss": 2112.1863, "step": 1120 }, { "ce_loss_12": 3.666201722621918, "ce_loss_17": 3.163138520717621, "ce_loss_23": 2.945633387565613, "ce_loss_3": 4.552199673652649, "ce_loss_6": 4.200457072257995, "epoch": 0.113, "grad_norm": 820.0, "kl_loss_12": 1570.5922912597657, "kl_loss_17": 465.73221893310546, "kl_loss_3": 3322.1153564453125, "kl_loss_6": 2639.3234130859373, "learning_rate": 0.0009735287795090455, "loss": 2028.2494, "step": 1130 }, { "ce_loss_12": 3.6025816917419435, "ce_loss_17": 3.0846946001052857, "ce_loss_23": 2.855406713485718, "ce_loss_3": 4.512770676612854, "ce_loss_6": 4.161376357078552, "epoch": 0.114, "grad_norm": 880.0, "kl_loss_12": 1629.634259033203, "kl_loss_17": 493.5605972290039, "kl_loss_3": 3416.2345947265626, "kl_loss_6": 2742.5260375976563, "learning_rate": 0.0009730169760710386, "loss": 2055.8789, "step": 1140 }, { "ce_loss_12": 3.655885374546051, "ce_loss_17": 3.1461606860160827, "ce_loss_23": 2.9138878703117372, "ce_loss_3": 4.562855291366577, "ce_loss_6": 4.201452386379242, "epoch": 0.115, "grad_norm": 812.0, "kl_loss_12": 1582.1077880859375, "kl_loss_17": 490.54324798583986, "kl_loss_3": 3347.0728271484377, "kl_loss_6": 2658.38212890625, "learning_rate": 0.0009725004093573342, "loss": 2048.8902, "step": 1150 }, { "ce_loss_12": 3.6162595748901367, "ce_loss_17": 3.117063570022583, "ce_loss_23": 2.8703216075897218, "ce_loss_3": 4.53184130191803, "ce_loss_6": 4.167289030551911, "epoch": 0.116, "grad_norm": 1064.0, "kl_loss_12": 1582.6757202148438, "kl_loss_17": 511.1013519287109, "kl_loss_3": 3389.1841430664062, "kl_loss_6": 2682.0026000976563, "learning_rate": 0.0009719790845697534, "loss": 2028.8348, "step": 1160 }, { "ce_loss_12": 3.5447800517082215, "ce_loss_17": 3.0915211915969847, "ce_loss_23": 2.8394047379493714, "ce_loss_3": 4.442968082427979, "ce_loss_6": 4.093847930431366, "epoch": 0.117, "grad_norm": 1304.0, "kl_loss_12": 1529.0038330078125, "kl_loss_17": 528.0925674438477, "kl_loss_3": 3300.722692871094, "kl_loss_6": 2623.0964965820312, "learning_rate": 0.0009714530069580309, "loss": 2001.9266, "step": 1170 }, { "ce_loss_12": 3.648385465145111, "ce_loss_17": 3.163987386226654, "ce_loss_23": 2.906623864173889, "ce_loss_3": 4.570439338684082, "ce_loss_6": 4.20642215013504, "epoch": 0.118, "grad_norm": 976.0, "kl_loss_12": 1599.7222900390625, "kl_loss_17": 545.9570709228516, "kl_loss_3": 3392.21904296875, "kl_loss_6": 2702.566052246094, "learning_rate": 0.0009709221818197624, "loss": 2045.9377, "step": 1180 }, { "ce_loss_12": 3.69852100610733, "ce_loss_17": 3.195405375957489, "ce_loss_23": 2.955050897598267, "ce_loss_3": 4.611287760734558, "ce_loss_6": 4.259045755863189, "epoch": 0.119, "grad_norm": 996.0, "kl_loss_12": 1595.0409118652344, "kl_loss_17": 510.2501693725586, "kl_loss_3": 3394.6732788085938, "kl_loss_6": 2711.4011474609374, "learning_rate": 0.0009703866145003512, "loss": 2048.791, "step": 1190 }, { "ce_loss_12": 3.6461198687553407, "ce_loss_17": 3.1560633540153504, "ce_loss_23": 2.9288500785827636, "ce_loss_3": 4.544662046432495, "ce_loss_6": 4.196171057224274, "epoch": 0.12, "grad_norm": 844.0, "kl_loss_12": 1562.8870666503906, "kl_loss_17": 476.9662322998047, "kl_loss_3": 3339.506799316406, "kl_loss_6": 2660.272998046875, "learning_rate": 0.0009698463103929542, "loss": 2039.2613, "step": 1200 }, { "ce_loss_12": 3.6362263798713683, "ce_loss_17": 3.118537414073944, "ce_loss_23": 2.890295457839966, "ce_loss_3": 4.55160620212555, "ce_loss_6": 4.197222125530243, "epoch": 0.121, "grad_norm": 840.0, "kl_loss_12": 1606.2838256835937, "kl_loss_17": 482.26617736816405, "kl_loss_3": 3396.402978515625, "kl_loss_6": 2713.9810546875, "learning_rate": 0.0009693012749384279, "loss": 2055.3727, "step": 1210 }, { "ce_loss_12": 3.620063138008118, "ce_loss_17": 3.11392560005188, "ce_loss_23": 2.894404947757721, "ce_loss_3": 4.517776942253112, "ce_loss_6": 4.170427846908569, "epoch": 0.122, "grad_norm": 896.0, "kl_loss_12": 1568.8251892089843, "kl_loss_17": 472.27161560058596, "kl_loss_3": 3342.350476074219, "kl_loss_6": 2666.2023193359373, "learning_rate": 0.0009687515136252732, "loss": 1995.2891, "step": 1220 }, { "ce_loss_12": 3.6086941242218016, "ce_loss_17": 3.079377865791321, "ce_loss_23": 2.861296200752258, "ce_loss_3": 4.553121948242188, "ce_loss_6": 4.194600236415863, "epoch": 0.123, "grad_norm": 920.0, "kl_loss_12": 1611.9838623046876, "kl_loss_17": 462.04967346191404, "kl_loss_3": 3465.6461181640625, "kl_loss_6": 2771.3404052734377, "learning_rate": 0.0009681970319895803, "loss": 2132.065, "step": 1230 }, { "ce_loss_12": 3.6665617704391478, "ce_loss_17": 3.152220356464386, "ce_loss_23": 2.9416170954704284, "ce_loss_3": 4.568020367622376, "ce_loss_6": 4.2054404497146605, "epoch": 0.124, "grad_norm": 1004.0, "kl_loss_12": 1559.611993408203, "kl_loss_17": 455.8671081542969, "kl_loss_3": 3328.233264160156, "kl_loss_6": 2633.26875, "learning_rate": 0.0009676378356149733, "loss": 1992.5469, "step": 1240 }, { "ce_loss_12": 3.606646728515625, "ce_loss_17": 3.105606806278229, "ce_loss_23": 2.9098575115203857, "ce_loss_3": 4.509173798561096, "ce_loss_6": 4.152543914318085, "epoch": 0.125, "grad_norm": 732.0, "kl_loss_12": 1517.0133972167969, "kl_loss_17": 436.5280731201172, "kl_loss_3": 3289.8114501953123, "kl_loss_6": 2606.249169921875, "learning_rate": 0.0009670739301325534, "loss": 1980.5193, "step": 1250 }, { "ce_loss_12": 3.5973118901252747, "ce_loss_17": 3.0886258721351623, "ce_loss_23": 2.8696954250335693, "ce_loss_3": 4.481967115402222, "ce_loss_6": 4.128626692295074, "epoch": 0.126, "grad_norm": 776.0, "kl_loss_12": 1570.6747680664062, "kl_loss_17": 457.1540771484375, "kl_loss_3": 3311.004675292969, "kl_loss_6": 2629.484521484375, "learning_rate": 0.0009665053212208426, "loss": 2021.3977, "step": 1260 }, { "ce_loss_12": 3.637851631641388, "ce_loss_17": 3.1208248853683473, "ce_loss_23": 2.904267394542694, "ce_loss_3": 4.546870756149292, "ce_loss_6": 4.183465051651001, "epoch": 0.127, "grad_norm": 920.0, "kl_loss_12": 1595.9552673339845, "kl_loss_17": 469.6421890258789, "kl_loss_3": 3385.359619140625, "kl_loss_6": 2681.4626708984374, "learning_rate": 0.0009659320146057262, "loss": 2027.4336, "step": 1270 }, { "ce_loss_12": 3.637918245792389, "ce_loss_17": 3.1327763319015505, "ce_loss_23": 2.92303524017334, "ce_loss_3": 4.516714310646057, "ce_loss_6": 4.169120907783508, "epoch": 0.128, "grad_norm": 1088.0, "kl_loss_12": 1543.7512573242188, "kl_loss_17": 451.88436431884764, "kl_loss_3": 3297.9475708007812, "kl_loss_6": 2619.7408935546873, "learning_rate": 0.0009653540160603955, "loss": 1990.6383, "step": 1280 }, { "ce_loss_12": 3.632475471496582, "ce_loss_17": 3.1240146160125732, "ce_loss_23": 2.922209882736206, "ce_loss_3": 4.500276255607605, "ce_loss_6": 4.169117403030396, "epoch": 0.129, "grad_norm": 1280.0, "kl_loss_12": 1547.4067016601562, "kl_loss_17": 442.0089508056641, "kl_loss_3": 3277.050927734375, "kl_loss_6": 2635.888952636719, "learning_rate": 0.0009647713314052896, "loss": 1965.3996, "step": 1290 }, { "ce_loss_12": 3.613161361217499, "ce_loss_17": 3.081482803821564, "ce_loss_23": 2.8680325150489807, "ce_loss_3": 4.532515811920166, "ce_loss_6": 4.186613607406616, "epoch": 0.13, "grad_norm": 956.0, "kl_loss_12": 1596.8814331054687, "kl_loss_17": 452.1222091674805, "kl_loss_3": 3415.5057739257813, "kl_loss_6": 2745.046984863281, "learning_rate": 0.0009641839665080363, "loss": 2033.1896, "step": 1300 }, { "ce_loss_12": 3.562652254104614, "ce_loss_17": 3.0530904650688173, "ce_loss_23": 2.8504271388053892, "ce_loss_3": 4.474567329883575, "ce_loss_6": 4.133738553524017, "epoch": 0.131, "grad_norm": 784.0, "kl_loss_12": 1531.3059204101562, "kl_loss_17": 432.8893524169922, "kl_loss_3": 3327.0203857421875, "kl_loss_6": 2671.88740234375, "learning_rate": 0.0009635919272833937, "loss": 1965.8117, "step": 1310 }, { "ce_loss_12": 3.5924949288368224, "ce_loss_17": 3.0808891892433166, "ce_loss_23": 2.8723941087722777, "ce_loss_3": 4.503520846366882, "ce_loss_6": 4.153036797046662, "epoch": 0.132, "grad_norm": 1048.0, "kl_loss_12": 1532.0396484375, "kl_loss_17": 437.7684783935547, "kl_loss_3": 3331.197619628906, "kl_loss_6": 2647.5683471679686, "learning_rate": 0.0009629952196931902, "loss": 1948.2967, "step": 1320 }, { "ce_loss_12": 3.5496832847595217, "ce_loss_17": 3.0600673079490663, "ce_loss_23": 2.86389017701149, "ce_loss_3": 4.480193614959717, "ce_loss_6": 4.126709806919098, "epoch": 0.133, "grad_norm": 920.0, "kl_loss_12": 1499.148486328125, "kl_loss_17": 434.60589294433595, "kl_loss_3": 3326.1408447265626, "kl_loss_6": 2646.4357788085936, "learning_rate": 0.0009623938497462645, "loss": 1959.7061, "step": 1330 }, { "ce_loss_12": 3.5570675134658813, "ce_loss_17": 3.054165482521057, "ce_loss_23": 2.8529660224914553, "ce_loss_3": 4.460046792030335, "ce_loss_6": 4.117917943000793, "epoch": 0.134, "grad_norm": 1240.0, "kl_loss_12": 1530.305157470703, "kl_loss_17": 441.2641830444336, "kl_loss_3": 3308.531140136719, "kl_loss_6": 2643.5994262695312, "learning_rate": 0.0009617878234984055, "loss": 1994.7871, "step": 1340 }, { "ce_loss_12": 3.614206826686859, "ce_loss_17": 3.138014531135559, "ce_loss_23": 2.938678300380707, "ce_loss_3": 4.502785682678223, "ce_loss_6": 4.152660512924195, "epoch": 0.135, "grad_norm": 1088.0, "kl_loss_12": 1482.2549621582032, "kl_loss_17": 428.1679397583008, "kl_loss_3": 3236.7653930664064, "kl_loss_6": 2564.1514282226562, "learning_rate": 0.0009611771470522907, "loss": 1953.1434, "step": 1350 }, { "ce_loss_12": 3.5755207896232606, "ce_loss_17": 3.0807791352272034, "ce_loss_23": 2.8702484488487245, "ce_loss_3": 4.491902422904968, "ce_loss_6": 4.136397743225098, "epoch": 0.136, "grad_norm": 772.0, "kl_loss_12": 1496.8853271484375, "kl_loss_17": 433.81241302490236, "kl_loss_3": 3306.8744140625, "kl_loss_6": 2615.6809326171874, "learning_rate": 0.0009605618265574251, "loss": 1944.1047, "step": 1360 }, { "ce_loss_12": 3.5688475489616396, "ce_loss_17": 3.0453089475631714, "ce_loss_23": 2.8437607407569887, "ce_loss_3": 4.478715562820435, "ce_loss_6": 4.137208127975464, "epoch": 0.137, "grad_norm": 812.0, "kl_loss_12": 1564.785009765625, "kl_loss_17": 442.6109817504883, "kl_loss_3": 3373.6415283203123, "kl_loss_6": 2701.3059326171874, "learning_rate": 0.0009599418682100792, "loss": 1991.9086, "step": 1370 }, { "ce_loss_12": 3.5749759554862974, "ce_loss_17": 3.0835989832878115, "ce_loss_23": 2.8831868410110473, "ce_loss_3": 4.501394367218017, "ce_loss_6": 4.144296360015869, "epoch": 0.138, "grad_norm": 804.0, "kl_loss_12": 1509.7307250976562, "kl_loss_17": 432.77608184814454, "kl_loss_3": 3318.9124755859375, "kl_loss_6": 2630.0392578125, "learning_rate": 0.0009593172782532268, "loss": 1973.3191, "step": 1380 }, { "ce_loss_12": 3.6028687715530396, "ce_loss_17": 3.1176024317741393, "ce_loss_23": 2.9136727333068846, "ce_loss_3": 4.505716633796692, "ce_loss_6": 4.151191294193268, "epoch": 0.139, "grad_norm": 848.0, "kl_loss_12": 1499.2755004882813, "kl_loss_17": 433.8134765625, "kl_loss_3": 3275.23427734375, "kl_loss_6": 2584.3994384765624, "learning_rate": 0.0009586880629764817, "loss": 1947.1092, "step": 1390 }, { "ce_loss_12": 3.5526779294013977, "ce_loss_17": 3.062094521522522, "ce_loss_23": 2.851840627193451, "ce_loss_3": 4.468865394592285, "ce_loss_6": 4.124651682376862, "epoch": 0.14, "grad_norm": 768.0, "kl_loss_12": 1505.257391357422, "kl_loss_17": 441.1879943847656, "kl_loss_3": 3301.0413330078127, "kl_loss_6": 2638.0770751953123, "learning_rate": 0.0009580542287160348, "loss": 1941.148, "step": 1400 }, { "ce_loss_12": 3.5124378323554994, "ce_loss_17": 3.026162827014923, "ce_loss_23": 2.8164494037628174, "ce_loss_3": 4.431016111373902, "ce_loss_6": 4.072873175144196, "epoch": 0.141, "grad_norm": 868.0, "kl_loss_12": 1509.1204895019532, "kl_loss_17": 451.11576385498046, "kl_loss_3": 3324.8519775390623, "kl_loss_6": 2641.1728759765624, "learning_rate": 0.0009574157818545901, "loss": 1946.1949, "step": 1410 }, { "ce_loss_12": 3.559919762611389, "ce_loss_17": 3.088271903991699, "ce_loss_23": 2.888034200668335, "ce_loss_3": 4.447533106803894, "ce_loss_6": 4.106405591964721, "epoch": 0.142, "grad_norm": 1032.0, "kl_loss_12": 1453.2306091308594, "kl_loss_17": 428.4953872680664, "kl_loss_3": 3214.4719360351564, "kl_loss_6": 2558.95302734375, "learning_rate": 0.0009567727288213005, "loss": 1957.8348, "step": 1420 }, { "ce_loss_12": 3.5663437843322754, "ce_loss_17": 3.079855978488922, "ce_loss_23": 2.8650198936462403, "ce_loss_3": 4.477605485916138, "ce_loss_6": 4.123521387577057, "epoch": 0.143, "grad_norm": 912.0, "kl_loss_12": 1524.784912109375, "kl_loss_17": 462.08311462402344, "kl_loss_3": 3328.7591918945313, "kl_loss_6": 2644.153955078125, "learning_rate": 0.0009561250760917027, "loss": 1960.2055, "step": 1430 }, { "ce_loss_12": 3.5682985663414, "ce_loss_17": 3.092578685283661, "ce_loss_23": 2.877074158191681, "ce_loss_3": 4.466118907928466, "ce_loss_6": 4.126705014705658, "epoch": 0.144, "grad_norm": 984.0, "kl_loss_12": 1512.0598266601562, "kl_loss_17": 466.4274627685547, "kl_loss_3": 3282.262243652344, "kl_loss_6": 2622.5625, "learning_rate": 0.0009554728301876525, "loss": 1924.617, "step": 1440 }, { "ce_loss_12": 3.6111833572387697, "ce_loss_17": 3.1243973851203917, "ce_loss_23": 2.9122658252716063, "ce_loss_3": 4.4899966478347775, "ce_loss_6": 4.147026920318604, "epoch": 0.145, "grad_norm": 780.0, "kl_loss_12": 1503.76845703125, "kl_loss_17": 453.64098052978517, "kl_loss_3": 3236.64072265625, "kl_loss_6": 2568.684216308594, "learning_rate": 0.0009548159976772592, "loss": 1997.7438, "step": 1450 }, { "ce_loss_12": 3.5708705306053163, "ce_loss_17": 3.0883104681968687, "ce_loss_23": 2.874672222137451, "ce_loss_3": 4.492368054389954, "ce_loss_6": 4.138778901100158, "epoch": 0.146, "grad_norm": 868.0, "kl_loss_12": 1515.8531860351563, "kl_loss_17": 453.29029693603513, "kl_loss_3": 3321.64169921875, "kl_loss_6": 2639.4420654296873, "learning_rate": 0.0009541545851748186, "loss": 1960.3789, "step": 1460 }, { "ce_loss_12": 3.4660339832305906, "ce_loss_17": 2.9614037394523622, "ce_loss_23": 2.751054549217224, "ce_loss_3": 4.412300229072571, "ce_loss_6": 4.054196834564209, "epoch": 0.147, "grad_norm": 964.0, "kl_loss_12": 1522.1134399414063, "kl_loss_17": 445.31788482666013, "kl_loss_3": 3382.094104003906, "kl_loss_6": 2679.044787597656, "learning_rate": 0.0009534885993407473, "loss": 1979.6027, "step": 1470 }, { "ce_loss_12": 3.591128182411194, "ce_loss_17": 3.1027103424072267, "ce_loss_23": 2.9019294023513793, "ce_loss_3": 4.5010672330856325, "ce_loss_6": 4.15693166255951, "epoch": 0.148, "grad_norm": 844.0, "kl_loss_12": 1484.3631774902344, "kl_loss_17": 428.38403778076173, "kl_loss_3": 3270.4963989257812, "kl_loss_6": 2609.3266723632814, "learning_rate": 0.0009528180468815154, "loss": 1953.434, "step": 1480 }, { "ce_loss_12": 3.6338334441185, "ce_loss_17": 3.1534486651420592, "ce_loss_23": 2.961154115200043, "ce_loss_3": 4.526342129707336, "ce_loss_6": 4.185048532485962, "epoch": 0.149, "grad_norm": 948.0, "kl_loss_12": 1481.1465270996093, "kl_loss_17": 430.0911437988281, "kl_loss_3": 3230.5956787109376, "kl_loss_6": 2562.666149902344, "learning_rate": 0.0009521429345495787, "loss": 1933.2992, "step": 1490 }, { "ce_loss_12": 3.598537302017212, "ce_loss_17": 3.1211525917053224, "ce_loss_23": 2.9331879019737244, "ce_loss_3": 4.479608464241028, "ce_loss_6": 4.137439227104187, "epoch": 0.15, "grad_norm": 960.0, "kl_loss_12": 1462.8320007324219, "kl_loss_17": 407.0600341796875, "kl_loss_3": 3211.813586425781, "kl_loss_6": 2545.463244628906, "learning_rate": 0.0009514632691433108, "loss": 1927.2254, "step": 1500 }, { "ce_loss_12": 3.5884127616882324, "ce_loss_17": 3.0939092874526977, "ce_loss_23": 2.8999706745147704, "ce_loss_3": 4.481277489662171, "ce_loss_6": 4.125718057155609, "epoch": 0.151, "grad_norm": 948.0, "kl_loss_12": 1493.9746704101562, "kl_loss_17": 429.4934387207031, "kl_loss_3": 3266.7730102539062, "kl_loss_6": 2581.5667236328127, "learning_rate": 0.0009507790575069346, "loss": 1948.7154, "step": 1510 }, { "ce_loss_12": 3.5742282629013062, "ce_loss_17": 3.06930810213089, "ce_loss_23": 2.8608696103096007, "ce_loss_3": 4.483838415145874, "ce_loss_6": 4.137926387786865, "epoch": 0.152, "grad_norm": 880.0, "kl_loss_12": 1521.2948486328125, "kl_loss_17": 445.56875, "kl_loss_3": 3322.2888793945312, "kl_loss_6": 2635.8153564453123, "learning_rate": 0.0009500903065304539, "loss": 1989.816, "step": 1520 }, { "ce_loss_12": 3.561050260066986, "ce_loss_17": 3.0977537989616395, "ce_loss_23": 2.9062703967094423, "ce_loss_3": 4.4570392847061155, "ce_loss_6": 4.105130457878113, "epoch": 0.153, "grad_norm": 988.0, "kl_loss_12": 1437.3990417480468, "kl_loss_17": 418.8599227905273, "kl_loss_3": 3199.603796386719, "kl_loss_6": 2526.914514160156, "learning_rate": 0.0009493970231495835, "loss": 1924.5223, "step": 1530 }, { "ce_loss_12": 3.5027644991874696, "ce_loss_17": 3.0454777598381044, "ce_loss_23": 2.8620609521865843, "ce_loss_3": 4.382567358016968, "ce_loss_6": 4.0437868475914005, "epoch": 0.154, "grad_norm": 916.0, "kl_loss_12": 1412.9682373046876, "kl_loss_17": 408.1348571777344, "kl_loss_3": 3152.004931640625, "kl_loss_6": 2493.6770629882812, "learning_rate": 0.0009486992143456792, "loss": 1884.0617, "step": 1540 }, { "ce_loss_12": 3.599140226840973, "ce_loss_17": 3.080864226818085, "ce_loss_23": 2.8673729300498962, "ce_loss_3": 4.539337968826294, "ce_loss_6": 4.187305796146393, "epoch": 0.155, "grad_norm": 820.0, "kl_loss_12": 1557.6886962890626, "kl_loss_17": 457.3323699951172, "kl_loss_3": 3423.3054809570312, "kl_loss_6": 2738.628857421875, "learning_rate": 0.0009479968871456679, "loss": 1985.3701, "step": 1550 }, { "ce_loss_12": 3.543421280384064, "ce_loss_17": 3.055658221244812, "ce_loss_23": 2.8478148102760317, "ce_loss_3": 4.466424870491028, "ce_loss_6": 4.108865821361542, "epoch": 0.156, "grad_norm": 1232.0, "kl_loss_12": 1501.4693420410156, "kl_loss_17": 452.0060317993164, "kl_loss_3": 3327.9519897460937, "kl_loss_6": 2635.226696777344, "learning_rate": 0.0009472900486219768, "loss": 1935.15, "step": 1560 }, { "ce_loss_12": 3.51930969953537, "ce_loss_17": 3.0501848578453066, "ce_loss_23": 2.839846873283386, "ce_loss_3": 4.412303018569946, "ce_loss_6": 4.060588872432708, "epoch": 0.157, "grad_norm": 1020.0, "kl_loss_12": 1464.4576171875, "kl_loss_17": 446.97486267089846, "kl_loss_3": 3240.67470703125, "kl_loss_6": 2549.5041015625, "learning_rate": 0.000946578705892462, "loss": 1938.8553, "step": 1570 }, { "ce_loss_12": 3.5285266995429994, "ce_loss_17": 3.0789849877357485, "ce_loss_23": 2.8734943747520445, "ce_loss_3": 4.422140288352966, "ce_loss_6": 4.075236809253693, "epoch": 0.158, "grad_norm": 768.0, "kl_loss_12": 1412.7643310546875, "kl_loss_17": 430.24098052978513, "kl_loss_3": 3183.34345703125, "kl_loss_6": 2502.259240722656, "learning_rate": 0.0009458628661203367, "loss": 1917.8164, "step": 1580 }, { "ce_loss_12": 3.571163034439087, "ce_loss_17": 3.0778075575828554, "ce_loss_23": 2.8818636178970336, "ce_loss_3": 4.490660810470581, "ce_loss_6": 4.131191611289978, "epoch": 0.159, "grad_norm": 1064.0, "kl_loss_12": 1492.7530334472656, "kl_loss_17": 436.4660186767578, "kl_loss_3": 3318.567431640625, "kl_loss_6": 2629.0233520507813, "learning_rate": 0.0009451425365140996, "loss": 1908.1531, "step": 1590 }, { "ce_loss_12": 3.594418454170227, "ce_loss_17": 3.138779675960541, "ce_loss_23": 2.9434367418289185, "ce_loss_3": 4.466434574127197, "ce_loss_6": 4.127680397033691, "epoch": 0.16, "grad_norm": 1144.0, "kl_loss_12": 1422.8166015625, "kl_loss_17": 416.85819549560546, "kl_loss_3": 3158.031591796875, "kl_loss_6": 2484.4607055664064, "learning_rate": 0.0009444177243274617, "loss": 1869.9365, "step": 1600 }, { "ce_loss_12": 3.498467671871185, "ce_loss_17": 3.0184685468673704, "ce_loss_23": 2.8139328956604004, "ce_loss_3": 4.4108422040939335, "ce_loss_6": 4.0624502301216125, "epoch": 0.161, "grad_norm": 860.0, "kl_loss_12": 1490.5886535644531, "kl_loss_17": 436.47232208251955, "kl_loss_3": 3284.385534667969, "kl_loss_6": 2611.4808715820313, "learning_rate": 0.0009436884368592739, "loss": 1933.7881, "step": 1610 }, { "ce_loss_12": 3.527223265171051, "ce_loss_17": 3.0564520359039307, "ce_loss_23": 2.8607160449028015, "ce_loss_3": 4.418679213523864, "ce_loss_6": 4.0709424138069155, "epoch": 0.162, "grad_norm": 876.0, "kl_loss_12": 1436.9969604492187, "kl_loss_17": 422.61949920654297, "kl_loss_3": 3201.344482421875, "kl_loss_6": 2530.6149291992188, "learning_rate": 0.0009429546814534529, "loss": 1937.7432, "step": 1620 }, { "ce_loss_12": 3.523609459400177, "ce_loss_17": 3.058007037639618, "ce_loss_23": 2.8733000993728637, "ce_loss_3": 4.4185140371322635, "ce_loss_6": 4.068736577033997, "epoch": 0.163, "grad_norm": 772.0, "kl_loss_12": 1429.8490417480468, "kl_loss_17": 410.2163909912109, "kl_loss_3": 3195.8261840820314, "kl_loss_6": 2515.595849609375, "learning_rate": 0.0009422164654989072, "loss": 1870.0762, "step": 1630 }, { "ce_loss_12": 3.613458776473999, "ce_loss_17": 3.1591226935386656, "ce_loss_23": 2.9718995690345764, "ce_loss_3": 4.496907019615174, "ce_loss_6": 4.157380700111389, "epoch": 0.164, "grad_norm": 1296.0, "kl_loss_12": 1414.0604736328125, "kl_loss_17": 415.0271514892578, "kl_loss_3": 3165.8775146484377, "kl_loss_6": 2493.7409057617188, "learning_rate": 0.0009414737964294635, "loss": 1882.8074, "step": 1640 }, { "ce_loss_12": 3.534224784374237, "ce_loss_17": 3.0942984342575075, "ce_loss_23": 2.9147730588912966, "ce_loss_3": 4.40143461227417, "ce_loss_6": 4.060476922988892, "epoch": 0.165, "grad_norm": 1064.0, "kl_loss_12": 1357.9927368164062, "kl_loss_17": 398.0754959106445, "kl_loss_3": 3082.7246826171877, "kl_loss_6": 2413.491162109375, "learning_rate": 0.000940726681723791, "loss": 1866.6898, "step": 1650 }, { "ce_loss_12": 3.450613260269165, "ce_loss_17": 2.9654441356658934, "ce_loss_23": 2.7755810499191282, "ce_loss_3": 4.376272583007813, "ce_loss_6": 4.026358532905578, "epoch": 0.166, "grad_norm": 1056.0, "kl_loss_12": 1461.2494140625, "kl_loss_17": 416.39320678710936, "kl_loss_3": 3303.5306762695313, "kl_loss_6": 2606.460546875, "learning_rate": 0.0009399751289053266, "loss": 1874.083, "step": 1660 }, { "ce_loss_12": 3.592409574985504, "ce_loss_17": 3.1445668935775757, "ce_loss_23": 2.953047680854797, "ce_loss_3": 4.4779764175415036, "ce_loss_6": 4.136311149597168, "epoch": 0.167, "grad_norm": 884.0, "kl_loss_12": 1394.46318359375, "kl_loss_17": 410.1771575927734, "kl_loss_3": 3157.8220092773436, "kl_loss_6": 2486.7565795898436, "learning_rate": 0.0009392191455421988, "loss": 1895.3063, "step": 1670 }, { "ce_loss_12": 3.590947449207306, "ce_loss_17": 3.136911916732788, "ce_loss_23": 2.9433919906616213, "ce_loss_3": 4.479397130012512, "ce_loss_6": 4.129423558712006, "epoch": 0.168, "grad_norm": 1032.0, "kl_loss_12": 1427.6645935058593, "kl_loss_17": 427.07548828125, "kl_loss_3": 3183.6565185546874, "kl_loss_6": 2503.090478515625, "learning_rate": 0.0009384587392471515, "loss": 1849.109, "step": 1680 }, { "ce_loss_12": 3.562043583393097, "ce_loss_17": 3.113064157962799, "ce_loss_23": 2.9287237524986267, "ce_loss_3": 4.418517231941223, "ce_loss_6": 4.085638308525086, "epoch": 0.169, "grad_norm": 1880.0, "kl_loss_12": 1395.710760498047, "kl_loss_17": 402.3734420776367, "kl_loss_3": 3108.858435058594, "kl_loss_6": 2451.7667602539063, "learning_rate": 0.0009376939176774678, "loss": 1843.5416, "step": 1690 }, { "ce_loss_12": 3.550597834587097, "ce_loss_17": 3.0912911772727965, "ce_loss_23": 2.901586651802063, "ce_loss_3": 4.4385639190673825, "ce_loss_6": 4.095955264568329, "epoch": 0.17, "grad_norm": 796.0, "kl_loss_12": 1405.624041748047, "kl_loss_17": 409.81105651855466, "kl_loss_3": 3160.742041015625, "kl_loss_6": 2495.841442871094, "learning_rate": 0.0009369246885348925, "loss": 1891.4168, "step": 1700 }, { "ce_loss_12": 3.5553791761398315, "ce_loss_17": 3.076837086677551, "ce_loss_23": 2.8917274832725526, "ce_loss_3": 4.461563658714295, "ce_loss_6": 4.118083095550537, "epoch": 0.171, "grad_norm": 900.0, "kl_loss_12": 1449.8334106445313, "kl_loss_17": 407.05690307617186, "kl_loss_3": 3231.3767700195312, "kl_loss_6": 2578.7130126953125, "learning_rate": 0.0009361510595655545, "loss": 1901.1266, "step": 1710 }, { "ce_loss_12": 3.528924262523651, "ce_loss_17": 3.053530716896057, "ce_loss_23": 2.852275025844574, "ce_loss_3": 4.40249902009964, "ce_loss_6": 4.066362845897674, "epoch": 0.172, "grad_norm": 924.0, "kl_loss_12": 1457.4590148925781, "kl_loss_17": 424.09732818603516, "kl_loss_3": 3216.18837890625, "kl_loss_6": 2553.196826171875, "learning_rate": 0.0009353730385598887, "loss": 1891.0375, "step": 1720 }, { "ce_loss_12": 3.4698773980140687, "ce_loss_17": 2.9942647933959963, "ce_loss_23": 2.8003952622413637, "ce_loss_3": 4.3849413871765135, "ce_loss_6": 4.0349366664886475, "epoch": 0.173, "grad_norm": 1096.0, "kl_loss_12": 1450.6549926757812, "kl_loss_17": 420.19408721923827, "kl_loss_3": 3264.2931640625, "kl_loss_6": 2581.879248046875, "learning_rate": 0.0009345906333525581, "loss": 1917.2275, "step": 1730 }, { "ce_loss_12": 3.487299084663391, "ce_loss_17": 3.0379632472991944, "ce_loss_23": 2.832411003112793, "ce_loss_3": 4.398686468601227, "ce_loss_6": 4.05125447511673, "epoch": 0.174, "grad_norm": 1064.0, "kl_loss_12": 1426.8994995117187, "kl_loss_17": 433.52008056640625, "kl_loss_3": 3225.7196655273438, "kl_loss_6": 2552.5757690429687, "learning_rate": 0.0009338038518223745, "loss": 1887.9564, "step": 1740 }, { "ce_loss_12": 3.549421525001526, "ce_loss_17": 3.088615870475769, "ce_loss_23": 2.8778313755989076, "ce_loss_3": 4.448174166679382, "ce_loss_6": 4.108810389041901, "epoch": 0.175, "grad_norm": 1176.0, "kl_loss_12": 1455.715264892578, "kl_loss_17": 449.5879791259766, "kl_loss_3": 3243.080114746094, "kl_loss_6": 2573.329064941406, "learning_rate": 0.0009330127018922195, "loss": 1961.3834, "step": 1750 }, { "ce_loss_12": 3.505111026763916, "ce_loss_17": 3.049764132499695, "ce_loss_23": 2.850164294242859, "ce_loss_3": 4.395489168167114, "ce_loss_6": 4.059174644947052, "epoch": 0.176, "grad_norm": 1024.0, "kl_loss_12": 1426.4638122558595, "kl_loss_17": 425.43538665771484, "kl_loss_3": 3204.3296508789062, "kl_loss_6": 2542.2288818359375, "learning_rate": 0.0009322171915289634, "loss": 1899.1223, "step": 1760 }, { "ce_loss_12": 3.518779253959656, "ce_loss_17": 3.0761126041412354, "ce_loss_23": 2.895628201961517, "ce_loss_3": 4.394254040718079, "ce_loss_6": 4.065056955814361, "epoch": 0.177, "grad_norm": 860.0, "kl_loss_12": 1398.0104370117188, "kl_loss_17": 413.5629913330078, "kl_loss_3": 3148.1657836914064, "kl_loss_6": 2497.1087890625, "learning_rate": 0.0009314173287433873, "loss": 1852.8078, "step": 1770 }, { "ce_loss_12": 3.51189923286438, "ce_loss_17": 3.0561089396476744, "ce_loss_23": 2.8649056434631346, "ce_loss_3": 4.407578945159912, "ce_loss_6": 4.058146071434021, "epoch": 0.178, "grad_norm": 940.0, "kl_loss_12": 1422.9045959472655, "kl_loss_17": 421.48497161865237, "kl_loss_3": 3196.1125244140626, "kl_loss_6": 2527.5512084960938, "learning_rate": 0.0009306131215901003, "loss": 1853.0984, "step": 1780 }, { "ce_loss_12": 3.5367927074432375, "ce_loss_17": 3.089224076271057, "ce_loss_23": 2.904616725444794, "ce_loss_3": 4.4222025871276855, "ce_loss_6": 4.077728879451752, "epoch": 0.179, "grad_norm": 1056.0, "kl_loss_12": 1400.3500183105468, "kl_loss_17": 412.55066833496096, "kl_loss_3": 3161.286022949219, "kl_loss_6": 2497.1654052734375, "learning_rate": 0.0009298045781674596, "loss": 1833.8826, "step": 1790 }, { "ce_loss_12": 3.5144901275634766, "ce_loss_17": 3.0709169149398803, "ce_loss_23": 2.879539155960083, "ce_loss_3": 4.379846239089966, "ce_loss_6": 4.051528000831604, "epoch": 0.18, "grad_norm": 1120.0, "kl_loss_12": 1391.6433166503907, "kl_loss_17": 422.61016235351565, "kl_loss_3": 3123.181042480469, "kl_loss_6": 2487.7217407226562, "learning_rate": 0.0009289917066174886, "loss": 1872.4902, "step": 1800 }, { "ce_loss_12": 3.489149105548859, "ce_loss_17": 3.0734178900718687, "ce_loss_23": 2.8796284437179565, "ce_loss_3": 4.333601975440979, "ce_loss_6": 4.019747495651245, "epoch": 0.181, "grad_norm": 1104.0, "kl_loss_12": 1349.7888732910155, "kl_loss_17": 424.239079284668, "kl_loss_3": 3048.8976928710936, "kl_loss_6": 2417.6389282226564, "learning_rate": 0.0009281745151257945, "loss": 1826.0723, "step": 1810 }, { "ce_loss_12": 3.53800128698349, "ce_loss_17": 3.093510103225708, "ce_loss_23": 2.8986947774887084, "ce_loss_3": 4.429437685012817, "ce_loss_6": 4.085277903079986, "epoch": 0.182, "grad_norm": 916.0, "kl_loss_12": 1390.8755798339844, "kl_loss_17": 430.1387771606445, "kl_loss_3": 3154.4113159179688, "kl_loss_6": 2487.5041625976564, "learning_rate": 0.0009273530119214868, "loss": 1881.2629, "step": 1820 }, { "ce_loss_12": 3.6042762279510496, "ce_loss_17": 3.167820060253143, "ce_loss_23": 2.9859367847442626, "ce_loss_3": 4.4725106954574585, "ce_loss_6": 4.148066067695618, "epoch": 0.183, "grad_norm": 856.0, "kl_loss_12": 1389.0459228515624, "kl_loss_17": 408.4911514282227, "kl_loss_3": 3087.9786865234373, "kl_loss_6": 2463.8032348632814, "learning_rate": 0.0009265272052770935, "loss": 1818.3871, "step": 1830 }, { "ce_loss_12": 3.4827219009399415, "ce_loss_17": 3.0174628019332888, "ce_loss_23": 2.824735474586487, "ce_loss_3": 4.38643205165863, "ce_loss_6": 4.036978626251221, "epoch": 0.184, "grad_norm": 864.0, "kl_loss_12": 1409.780340576172, "kl_loss_17": 412.9845397949219, "kl_loss_3": 3203.962255859375, "kl_loss_6": 2522.3953369140627, "learning_rate": 0.0009256971035084784, "loss": 1878.3439, "step": 1840 }, { "ce_loss_12": 3.448565185070038, "ce_loss_17": 2.9642847418785094, "ce_loss_23": 2.762493336200714, "ce_loss_3": 4.362462449073791, "ce_loss_6": 4.01518462896347, "epoch": 0.185, "grad_norm": 1160.0, "kl_loss_12": 1466.8582885742187, "kl_loss_17": 427.94970245361327, "kl_loss_3": 3280.3394165039062, "kl_loss_6": 2595.3165893554688, "learning_rate": 0.0009248627149747573, "loss": 1901.1352, "step": 1850 }, { "ce_loss_12": 3.5774747490882874, "ce_loss_17": 3.132255482673645, "ce_loss_23": 2.9522884011268617, "ce_loss_3": 4.440769362449646, "ce_loss_6": 4.11391499042511, "epoch": 0.186, "grad_norm": 1216.0, "kl_loss_12": 1390.4288696289063, "kl_loss_17": 399.74344940185546, "kl_loss_3": 3114.68369140625, "kl_loss_6": 2476.0990478515623, "learning_rate": 0.0009240240480782129, "loss": 1848.3285, "step": 1860 }, { "ce_loss_12": 3.504031074047089, "ce_loss_17": 3.0449450731277468, "ce_loss_23": 2.8575493335723876, "ce_loss_3": 4.403111290931702, "ce_loss_6": 4.054022586345672, "epoch": 0.187, "grad_norm": 932.0, "kl_loss_12": 1397.6381225585938, "kl_loss_17": 406.8374771118164, "kl_loss_3": 3180.344445800781, "kl_loss_6": 2515.5447021484374, "learning_rate": 0.0009231811112642122, "loss": 1848.7035, "step": 1870 }, { "ce_loss_12": 3.526438903808594, "ce_loss_17": 3.087480330467224, "ce_loss_23": 2.902469539642334, "ce_loss_3": 4.3881865501403805, "ce_loss_6": 4.057741189002991, "epoch": 0.188, "grad_norm": 1136.0, "kl_loss_12": 1376.4404724121093, "kl_loss_17": 405.33871002197264, "kl_loss_3": 3085.746923828125, "kl_loss_6": 2443.7936584472654, "learning_rate": 0.0009223339130211192, "loss": 1826.3887, "step": 1880 }, { "ce_loss_12": 3.399556028842926, "ce_loss_17": 2.9548186540603636, "ce_loss_23": 2.7765444099903105, "ce_loss_3": 4.322276103496551, "ce_loss_6": 3.9670225024223327, "epoch": 0.189, "grad_norm": 996.0, "kl_loss_12": 1366.1088439941407, "kl_loss_17": 390.74108123779297, "kl_loss_3": 3201.887548828125, "kl_loss_6": 2507.1484924316405, "learning_rate": 0.0009214824618802108, "loss": 1857.4459, "step": 1890 }, { "ce_loss_12": 3.5746437430381777, "ce_loss_17": 3.1217092633247376, "ce_loss_23": 2.9347758293151855, "ce_loss_3": 4.446531271934509, "ce_loss_6": 4.109218180179596, "epoch": 0.19, "grad_norm": 968.0, "kl_loss_12": 1395.1021362304687, "kl_loss_17": 405.2522399902344, "kl_loss_3": 3118.518701171875, "kl_loss_6": 2450.5746948242186, "learning_rate": 0.0009206267664155906, "loss": 1880.9684, "step": 1900 }, { "ce_loss_12": 3.5064347863197325, "ce_loss_17": 3.0529030919075013, "ce_loss_23": 2.865175986289978, "ce_loss_3": 4.398700284957886, "ce_loss_6": 4.056966316699982, "epoch": 0.191, "grad_norm": 828.0, "kl_loss_12": 1388.640936279297, "kl_loss_17": 400.7437454223633, "kl_loss_3": 3160.6127685546876, "kl_loss_6": 2485.7033813476564, "learning_rate": 0.0009197668352441024, "loss": 1862.618, "step": 1910 }, { "ce_loss_12": 3.540658414363861, "ce_loss_17": 3.094384551048279, "ce_loss_23": 2.9136697888374328, "ce_loss_3": 4.423890471458435, "ce_loss_6": 4.079328870773315, "epoch": 0.192, "grad_norm": 1020.0, "kl_loss_12": 1373.4152404785157, "kl_loss_17": 395.8149917602539, "kl_loss_3": 3131.6627197265625, "kl_loss_6": 2459.1564819335936, "learning_rate": 0.0009189026770252437, "loss": 1851.2279, "step": 1920 }, { "ce_loss_12": 3.56448096036911, "ce_loss_17": 3.1186470866203306, "ce_loss_23": 2.93763267993927, "ce_loss_3": 4.442287898063659, "ce_loss_6": 4.099389755725861, "epoch": 0.193, "grad_norm": 956.0, "kl_loss_12": 1375.8356872558593, "kl_loss_17": 395.9308288574219, "kl_loss_3": 3114.5266235351564, "kl_loss_6": 2440.400048828125, "learning_rate": 0.000918034300461078, "loss": 1889.0145, "step": 1930 }, { "ce_loss_12": 3.5736100912094115, "ce_loss_17": 3.1367435455322266, "ce_loss_23": 2.9566476941108704, "ce_loss_3": 4.431257534027099, "ce_loss_6": 4.098700952529907, "epoch": 0.194, "grad_norm": 980.0, "kl_loss_12": 1361.859637451172, "kl_loss_17": 396.3990768432617, "kl_loss_3": 3055.080847167969, "kl_loss_6": 2412.7113403320313, "learning_rate": 0.0009171617142961477, "loss": 1820.2039, "step": 1940 }, { "ce_loss_12": 3.551243340969086, "ce_loss_17": 3.1035329580307005, "ce_loss_23": 2.925469446182251, "ce_loss_3": 4.420022678375244, "ce_loss_6": 4.082593929767609, "epoch": 0.195, "grad_norm": 928.0, "kl_loss_12": 1381.5827026367188, "kl_loss_17": 391.83741607666013, "kl_loss_3": 3100.8082275390625, "kl_loss_6": 2442.37509765625, "learning_rate": 0.0009162849273173857, "loss": 1829.3063, "step": 1950 }, { "ce_loss_12": 3.4948492884635924, "ce_loss_17": 3.055851709842682, "ce_loss_23": 2.8745150208473205, "ce_loss_3": 4.382340717315674, "ce_loss_6": 4.030688488483429, "epoch": 0.196, "grad_norm": 1184.0, "kl_loss_12": 1361.248681640625, "kl_loss_17": 393.1595397949219, "kl_loss_3": 3122.5375244140623, "kl_loss_6": 2436.858264160156, "learning_rate": 0.0009154039483540273, "loss": 1850.5926, "step": 1960 }, { "ce_loss_12": 3.4737292528152466, "ce_loss_17": 3.0338326692581177, "ce_loss_23": 2.854222285747528, "ce_loss_3": 4.370313429832459, "ce_loss_6": 4.023552799224854, "epoch": 0.197, "grad_norm": 1016.0, "kl_loss_12": 1363.5056640625, "kl_loss_17": 396.76910400390625, "kl_loss_3": 3134.1785522460937, "kl_loss_6": 2465.798193359375, "learning_rate": 0.0009145187862775209, "loss": 1834.7617, "step": 1970 }, { "ce_loss_12": 3.5026198029518127, "ce_loss_17": 3.0663907170295714, "ce_loss_23": 2.8810113787651064, "ce_loss_3": 4.366890490055084, "ce_loss_6": 4.034711396694183, "epoch": 0.198, "grad_norm": 852.0, "kl_loss_12": 1363.4940185546875, "kl_loss_17": 409.51978454589846, "kl_loss_3": 3089.037390136719, "kl_loss_6": 2430.1923278808595, "learning_rate": 0.0009136294500014386, "loss": 1825.1098, "step": 1980 }, { "ce_loss_12": 3.4802589416503906, "ce_loss_17": 3.024968659877777, "ce_loss_23": 2.8380630016326904, "ce_loss_3": 4.3972203373909, "ce_loss_6": 4.06009738445282, "epoch": 0.199, "grad_norm": 1072.0, "kl_loss_12": 1394.8473205566406, "kl_loss_17": 410.90734405517577, "kl_loss_3": 3194.697900390625, "kl_loss_6": 2531.3602294921875, "learning_rate": 0.000912735948481387, "loss": 1883.1371, "step": 1990 }, { "ce_loss_12": 3.486107325553894, "ce_loss_17": 3.051172912120819, "ce_loss_23": 2.869850277900696, "ce_loss_3": 4.371750068664551, "ce_loss_6": 4.029321420192718, "epoch": 0.2, "grad_norm": 1008.0, "kl_loss_12": 1377.7386535644532, "kl_loss_17": 407.95068817138673, "kl_loss_3": 3139.5554321289064, "kl_loss_6": 2464.485803222656, "learning_rate": 0.0009118382907149164, "loss": 1813.9844, "step": 2000 }, { "ce_loss_12": 3.5094108819961547, "ce_loss_17": 3.0742453813552855, "ce_loss_23": 2.8915016531944273, "ce_loss_3": 4.396561598777771, "ce_loss_6": 4.0471575379371645, "epoch": 0.201, "grad_norm": 912.0, "kl_loss_12": 1373.4571838378906, "kl_loss_17": 401.7746154785156, "kl_loss_3": 3119.1673217773437, "kl_loss_6": 2440.0885620117188, "learning_rate": 0.0009109364857414306, "loss": 1819.1262, "step": 2010 }, { "ce_loss_12": 3.4708608746528626, "ce_loss_17": 3.0348114490509035, "ce_loss_23": 2.85721560716629, "ce_loss_3": 4.349551677703857, "ce_loss_6": 4.001498186588288, "epoch": 0.202, "grad_norm": 872.0, "kl_loss_12": 1353.9906433105468, "kl_loss_17": 394.17857055664064, "kl_loss_3": 3116.371875, "kl_loss_6": 2437.3400146484373, "learning_rate": 0.0009100305426420956, "loss": 1858.0932, "step": 2020 }, { "ce_loss_12": 3.465420198440552, "ce_loss_17": 3.0086682796478272, "ce_loss_23": 2.8314072251319886, "ce_loss_3": 4.394584012031555, "ce_loss_6": 4.039972257614136, "epoch": 0.203, "grad_norm": 928.0, "kl_loss_12": 1402.326953125, "kl_loss_17": 396.2106231689453, "kl_loss_3": 3243.264025878906, "kl_loss_6": 2553.330419921875, "learning_rate": 0.0009091204705397484, "loss": 1852.6303, "step": 2030 }, { "ce_loss_12": 3.461729180812836, "ce_loss_17": 2.994718587398529, "ce_loss_23": 2.8124589204788206, "ce_loss_3": 4.370394968986512, "ce_loss_6": 4.028866863250732, "epoch": 0.204, "grad_norm": 1096.0, "kl_loss_12": 1412.3960754394532, "kl_loss_17": 398.8710708618164, "kl_loss_3": 3242.097900390625, "kl_loss_6": 2569.238562011719, "learning_rate": 0.0009082062785988049, "loss": 1872.2047, "step": 2040 }, { "ce_loss_12": 3.547369456291199, "ce_loss_17": 3.1226140022277833, "ce_loss_23": 2.947987914085388, "ce_loss_3": 4.40339457988739, "ce_loss_6": 4.07686003446579, "epoch": 0.205, "grad_norm": 948.0, "kl_loss_12": 1351.3995056152344, "kl_loss_17": 389.1872100830078, "kl_loss_3": 3056.0323120117187, "kl_loss_6": 2417.5919677734373, "learning_rate": 0.0009072879760251679, "loss": 1828.1234, "step": 2050 }, { "ce_loss_12": 3.5337481617927553, "ce_loss_17": 3.0816317319869997, "ce_loss_23": 2.895327150821686, "ce_loss_3": 4.419333148002624, "ce_loss_6": 4.081639957427979, "epoch": 0.206, "grad_norm": 852.0, "kl_loss_12": 1393.3875061035155, "kl_loss_17": 401.1993011474609, "kl_loss_3": 3170.3522705078126, "kl_loss_6": 2516.9411743164064, "learning_rate": 0.0009063655720661341, "loss": 1844.2873, "step": 2060 }, { "ce_loss_12": 3.5453538060188294, "ce_loss_17": 3.1116058468818664, "ce_loss_23": 2.9324185848236084, "ce_loss_3": 4.399405813217163, "ce_loss_6": 4.07843269109726, "epoch": 0.207, "grad_norm": 808.0, "kl_loss_12": 1355.4486755371095, "kl_loss_17": 393.4923431396484, "kl_loss_3": 3041.4896850585938, "kl_loss_6": 2420.3547973632812, "learning_rate": 0.000905439076010301, "loss": 1818.3211, "step": 2070 }, { "ce_loss_12": 3.5257867217063903, "ce_loss_17": 3.0792110562324524, "ce_loss_23": 2.8893296360969543, "ce_loss_3": 4.41441969871521, "ce_loss_6": 4.0752364754676815, "epoch": 0.208, "grad_norm": 856.0, "kl_loss_12": 1386.7962463378906, "kl_loss_17": 410.9291061401367, "kl_loss_3": 3146.4475708007812, "kl_loss_6": 2472.69052734375, "learning_rate": 0.0009045084971874737, "loss": 1811.7352, "step": 2080 }, { "ce_loss_12": 3.500695765018463, "ce_loss_17": 3.069680321216583, "ce_loss_23": 2.8757070302963257, "ce_loss_3": 4.375121593475342, "ce_loss_6": 4.043542790412903, "epoch": 0.209, "grad_norm": 1152.0, "kl_loss_12": 1371.920343017578, "kl_loss_17": 412.9090606689453, "kl_loss_3": 3107.186181640625, "kl_loss_6": 2448.299768066406, "learning_rate": 0.0009035738449685707, "loss": 1857.4186, "step": 2090 }, { "ce_loss_12": 3.4602583169937136, "ce_loss_17": 2.9988523602485655, "ce_loss_23": 2.810432803630829, "ce_loss_3": 4.37140154838562, "ce_loss_6": 4.031448018550873, "epoch": 0.21, "grad_norm": 928.0, "kl_loss_12": 1398.5671875, "kl_loss_17": 402.3813186645508, "kl_loss_3": 3212.7988159179686, "kl_loss_6": 2537.6999389648436, "learning_rate": 0.0009026351287655293, "loss": 1838.1242, "step": 2100 }, { "ce_loss_12": 3.580909264087677, "ce_loss_17": 3.174431824684143, "ce_loss_23": 3.0090521216392516, "ce_loss_3": 4.410317397117614, "ce_loss_6": 4.08203741312027, "epoch": 0.211, "grad_norm": 1072.0, "kl_loss_12": 1291.9223876953124, "kl_loss_17": 377.8187744140625, "kl_loss_3": 2940.6177612304687, "kl_loss_6": 2313.8096557617187, "learning_rate": 0.0009016923580312113, "loss": 1745.3838, "step": 2110 }, { "ce_loss_12": 3.478354549407959, "ce_loss_17": 3.0455804109573363, "ce_loss_23": 2.873227298259735, "ce_loss_3": 4.352131581306457, "ce_loss_6": 4.007352542877197, "epoch": 0.212, "grad_norm": 1120.0, "kl_loss_12": 1344.7798950195313, "kl_loss_17": 390.32891540527345, "kl_loss_3": 3080.3701904296877, "kl_loss_6": 2408.41884765625, "learning_rate": 0.0009007455422593077, "loss": 1836.1477, "step": 2120 }, { "ce_loss_12": 3.5267333984375, "ce_loss_17": 3.064280998706818, "ce_loss_23": 2.881734824180603, "ce_loss_3": 4.405498075485229, "ce_loss_6": 4.059432685375214, "epoch": 0.213, "grad_norm": 1064.0, "kl_loss_12": 1400.2952575683594, "kl_loss_17": 398.35522003173827, "kl_loss_3": 3146.431506347656, "kl_loss_6": 2484.1463928222656, "learning_rate": 0.0008997946909842425, "loss": 1854.3893, "step": 2130 }, { "ce_loss_12": 3.5560362100601197, "ce_loss_17": 3.0794739723205566, "ce_loss_23": 2.8871533274650574, "ce_loss_3": 4.470532035827636, "ce_loss_6": 4.124138689041137, "epoch": 0.214, "grad_norm": 1120.0, "kl_loss_12": 1434.1231384277344, "kl_loss_17": 417.38670349121094, "kl_loss_3": 3245.7887573242188, "kl_loss_6": 2579.8198120117186, "learning_rate": 0.0008988398137810777, "loss": 1849.3316, "step": 2140 }, { "ce_loss_12": 3.537279522418976, "ce_loss_17": 3.1066004991531373, "ce_loss_23": 2.929929780960083, "ce_loss_3": 4.395436775684357, "ce_loss_6": 4.06287008523941, "epoch": 0.215, "grad_norm": 1012.0, "kl_loss_12": 1343.9474609375, "kl_loss_17": 388.11578979492185, "kl_loss_3": 3054.53203125, "kl_loss_6": 2400.9071044921875, "learning_rate": 0.0008978809202654162, "loss": 1788.1793, "step": 2150 }, { "ce_loss_12": 3.5223759412765503, "ce_loss_17": 3.099969244003296, "ce_loss_23": 2.9096084117889403, "ce_loss_3": 4.392382049560547, "ce_loss_6": 4.058804070949554, "epoch": 0.216, "grad_norm": 1024.0, "kl_loss_12": 1335.4723693847657, "kl_loss_17": 410.1502883911133, "kl_loss_3": 3057.390380859375, "kl_loss_6": 2406.6466796875, "learning_rate": 0.0008969180200933046, "loss": 1821.6504, "step": 2160 }, { "ce_loss_12": 3.5146302223205566, "ce_loss_17": 3.066201627254486, "ce_loss_23": 2.864242434501648, "ce_loss_3": 4.410690546035767, "ce_loss_6": 4.065093767642975, "epoch": 0.217, "grad_norm": 1280.0, "kl_loss_12": 1387.5415649414062, "kl_loss_17": 426.98915405273436, "kl_loss_3": 3146.7869384765627, "kl_loss_6": 2480.2459106445312, "learning_rate": 0.0008959511229611376, "loss": 1859.2805, "step": 2170 }, { "ce_loss_12": 3.547934627532959, "ce_loss_17": 3.1149901032447813, "ce_loss_23": 2.9360812187194822, "ce_loss_3": 4.426415348052979, "ce_loss_6": 4.092259585857391, "epoch": 0.218, "grad_norm": 944.0, "kl_loss_12": 1348.66884765625, "kl_loss_17": 391.7157409667969, "kl_loss_3": 3096.0570434570313, "kl_loss_6": 2439.3980102539062, "learning_rate": 0.0008949802386055581, "loss": 1815.5332, "step": 2180 }, { "ce_loss_12": 3.4251845479011536, "ce_loss_17": 2.9923732399940492, "ce_loss_23": 2.8120394349098206, "ce_loss_3": 4.317071855068207, "ce_loss_6": 3.9748199582099915, "epoch": 0.219, "grad_norm": 1184.0, "kl_loss_12": 1339.4009887695313, "kl_loss_17": 385.9377899169922, "kl_loss_3": 3084.246716308594, "kl_loss_6": 2414.5926025390627, "learning_rate": 0.0008940053768033609, "loss": 1844.834, "step": 2190 }, { "ce_loss_12": 3.5045410871505736, "ce_loss_17": 3.0732019186019897, "ce_loss_23": 2.8981399059295656, "ce_loss_3": 4.353909540176391, "ce_loss_6": 4.019334411621093, "epoch": 0.22, "grad_norm": 976.0, "kl_loss_12": 1333.2462219238282, "kl_loss_17": 388.82810974121094, "kl_loss_3": 3036.5219604492186, "kl_loss_6": 2388.822961425781, "learning_rate": 0.0008930265473713938, "loss": 1795.5426, "step": 2200 }, { "ce_loss_12": 3.47272390127182, "ce_loss_17": 3.040353035926819, "ce_loss_23": 2.855030930042267, "ce_loss_3": 4.346427464485169, "ce_loss_6": 4.012941122055054, "epoch": 0.221, "grad_norm": 1032.0, "kl_loss_12": 1339.0766662597657, "kl_loss_17": 402.6368606567383, "kl_loss_3": 3076.573132324219, "kl_loss_6": 2416.5378173828126, "learning_rate": 0.0008920437601664579, "loss": 1776.9988, "step": 2210 }, { "ce_loss_12": 3.4764568209648132, "ce_loss_17": 3.0413265466690063, "ce_loss_23": 2.8498608231544496, "ce_loss_3": 4.359201240539551, "ce_loss_6": 4.015230906009674, "epoch": 0.222, "grad_norm": 904.0, "kl_loss_12": 1372.467919921875, "kl_loss_17": 416.31578216552737, "kl_loss_3": 3135.654150390625, "kl_loss_6": 2461.368017578125, "learning_rate": 0.0008910570250852097, "loss": 1804.4967, "step": 2220 }, { "ce_loss_12": 3.5195571303367617, "ce_loss_17": 3.1164942383766174, "ce_loss_23": 2.9423655033111573, "ce_loss_3": 4.369904780387879, "ce_loss_6": 4.036461865901947, "epoch": 0.223, "grad_norm": 1112.0, "kl_loss_12": 1290.9219665527344, "kl_loss_17": 376.9027725219727, "kl_loss_3": 2988.6837158203125, "kl_loss_6": 2340.4248229980467, "learning_rate": 0.0008900663520640604, "loss": 1759.0195, "step": 2230 }, { "ce_loss_12": 3.5042008996009826, "ce_loss_17": 3.0734793901443482, "ce_loss_23": 2.9011544823646545, "ce_loss_3": 4.36311182975769, "ce_loss_6": 4.041118001937866, "epoch": 0.224, "grad_norm": 1004.0, "kl_loss_12": 1334.5621520996094, "kl_loss_17": 387.0037338256836, "kl_loss_3": 3057.005822753906, "kl_loss_6": 2425.3416015625, "learning_rate": 0.0008890717510790764, "loss": 1801.5563, "step": 2240 }, { "ce_loss_12": 3.466621422767639, "ce_loss_17": 3.0410445094108582, "ce_loss_23": 2.867064726352692, "ce_loss_3": 4.357352328300476, "ce_loss_6": 4.015378224849701, "epoch": 0.225, "grad_norm": 1088.0, "kl_loss_12": 1319.4816345214845, "kl_loss_17": 377.6472473144531, "kl_loss_3": 3086.9820068359377, "kl_loss_6": 2414.7611450195313, "learning_rate": 0.0008880732321458784, "loss": 1812.3871, "step": 2250 }, { "ce_loss_12": 3.4890703678131105, "ce_loss_17": 3.0644241333007813, "ce_loss_23": 2.8930245280265807, "ce_loss_3": 4.359507060050964, "ce_loss_6": 4.0214638948440555, "epoch": 0.226, "grad_norm": 1008.0, "kl_loss_12": 1324.6771667480468, "kl_loss_17": 375.7590301513672, "kl_loss_3": 3045.208117675781, "kl_loss_6": 2393.032666015625, "learning_rate": 0.0008870708053195413, "loss": 1810.9191, "step": 2260 }, { "ce_loss_12": 3.498258447647095, "ce_loss_17": 3.079079532623291, "ce_loss_23": 2.9166102409362793, "ce_loss_3": 4.339015591144562, "ce_loss_6": 4.020640075206757, "epoch": 0.227, "grad_norm": 1096.0, "kl_loss_12": 1285.5239379882812, "kl_loss_17": 359.8129364013672, "kl_loss_3": 2961.5877197265627, "kl_loss_6": 2341.482598876953, "learning_rate": 0.0008860644806944918, "loss": 1761.2666, "step": 2270 }, { "ce_loss_12": 3.470140528678894, "ce_loss_17": 3.0365947604179384, "ce_loss_23": 2.8646312832832335, "ce_loss_3": 4.346737742424011, "ce_loss_6": 4.01911609172821, "epoch": 0.228, "grad_norm": 956.0, "kl_loss_12": 1334.8696166992188, "kl_loss_17": 383.21412506103513, "kl_loss_3": 3073.5315063476564, "kl_loss_6": 2431.9812255859374, "learning_rate": 0.0008850542684044079, "loss": 1765.1102, "step": 2280 }, { "ce_loss_12": 3.4536983966827393, "ce_loss_17": 3.001571798324585, "ce_loss_23": 2.8181382417678833, "ce_loss_3": 4.369045734405518, "ce_loss_6": 4.026021492481232, "epoch": 0.229, "grad_norm": 1096.0, "kl_loss_12": 1394.3187866210938, "kl_loss_17": 398.59433898925784, "kl_loss_3": 3213.7202880859377, "kl_loss_6": 2537.492883300781, "learning_rate": 0.0008840401786221159, "loss": 1824.2578, "step": 2290 }, { "ce_loss_12": 3.549013364315033, "ce_loss_17": 3.123672294616699, "ce_loss_23": 2.960705578327179, "ce_loss_3": 4.38698136806488, "ce_loss_6": 4.058153986930847, "epoch": 0.23, "grad_norm": 844.0, "kl_loss_12": 1296.4142211914063, "kl_loss_17": 358.9579452514648, "kl_loss_3": 2962.921838378906, "kl_loss_6": 2329.2815368652346, "learning_rate": 0.000883022221559489, "loss": 1736.9637, "step": 2300 }, { "ce_loss_12": 3.524729681015015, "ce_loss_17": 3.0923240423202514, "ce_loss_23": 2.9268879771232603, "ce_loss_3": 4.394909167289734, "ce_loss_6": 4.064548134803772, "epoch": 0.231, "grad_norm": 1088.0, "kl_loss_12": 1332.733349609375, "kl_loss_17": 373.6583770751953, "kl_loss_3": 3063.1235473632814, "kl_loss_6": 2420.8246948242186, "learning_rate": 0.0008820004074673434, "loss": 1840.4422, "step": 2310 }, { "ce_loss_12": 3.4370501399040223, "ce_loss_17": 3.0037574648857115, "ce_loss_23": 2.8393426060676576, "ce_loss_3": 4.316093397140503, "ce_loss_6": 3.9695367217063904, "epoch": 0.232, "grad_norm": 924.0, "kl_loss_12": 1337.5342590332032, "kl_loss_17": 368.76837463378905, "kl_loss_3": 3081.577917480469, "kl_loss_6": 2412.1710571289063, "learning_rate": 0.0008809747466353355, "loss": 1776.1449, "step": 2320 }, { "ce_loss_12": 3.436672937870026, "ce_loss_17": 3.0068650960922243, "ce_loss_23": 2.8387425184249877, "ce_loss_3": 4.318840408325196, "ce_loss_6": 3.98241525888443, "epoch": 0.233, "grad_norm": 1792.0, "kl_loss_12": 1313.316717529297, "kl_loss_17": 369.51439056396487, "kl_loss_3": 3074.159777832031, "kl_loss_6": 2411.598767089844, "learning_rate": 0.0008799452493918585, "loss": 1800.6109, "step": 2330 }, { "ce_loss_12": 3.5013447403907776, "ce_loss_17": 3.07126921415329, "ce_loss_23": 2.9082824349403382, "ce_loss_3": 4.362140035629272, "ce_loss_6": 4.038045120239258, "epoch": 0.234, "grad_norm": 1000.0, "kl_loss_12": 1312.60400390625, "kl_loss_17": 364.74903259277346, "kl_loss_3": 3032.4592041015626, "kl_loss_6": 2394.586633300781, "learning_rate": 0.0008789119261039385, "loss": 1830.5715, "step": 2340 }, { "ce_loss_12": 3.4402097702026366, "ce_loss_17": 2.992724096775055, "ce_loss_23": 2.8270597219467164, "ce_loss_3": 4.309396195411682, "ce_loss_6": 3.9708672523498536, "epoch": 0.235, "grad_norm": 1040.0, "kl_loss_12": 1336.757794189453, "kl_loss_17": 369.18068542480466, "kl_loss_3": 3064.9718383789063, "kl_loss_6": 2405.208776855469, "learning_rate": 0.0008778747871771292, "loss": 1766.0102, "step": 2350 }, { "ce_loss_12": 3.4626588582992555, "ce_loss_17": 3.0441460490226744, "ce_loss_23": 2.883425784111023, "ce_loss_3": 4.300020003318787, "ce_loss_6": 3.9820852994918825, "epoch": 0.236, "grad_norm": 932.0, "kl_loss_12": 1287.1029907226562, "kl_loss_17": 355.5157272338867, "kl_loss_3": 2961.4650268554688, "kl_loss_6": 2333.040173339844, "learning_rate": 0.0008768338430554083, "loss": 1735.4672, "step": 2360 }, { "ce_loss_12": 3.491554284095764, "ce_loss_17": 3.0689448595046995, "ce_loss_23": 2.8880576729774474, "ce_loss_3": 4.349581158161163, "ce_loss_6": 4.016392803192138, "epoch": 0.237, "grad_norm": 940.0, "kl_loss_12": 1315.7613159179687, "kl_loss_17": 389.4289184570313, "kl_loss_3": 3028.0147705078125, "kl_loss_6": 2380.471240234375, "learning_rate": 0.0008757891042210713, "loss": 1790.2062, "step": 2370 }, { "ce_loss_12": 3.4883965253829956, "ce_loss_17": 3.077411413192749, "ce_loss_23": 2.907989227771759, "ce_loss_3": 4.351418459415436, "ce_loss_6": 4.019922041893006, "epoch": 0.238, "grad_norm": 944.0, "kl_loss_12": 1290.36904296875, "kl_loss_17": 373.3400344848633, "kl_loss_3": 2995.101806640625, "kl_loss_6": 2344.245819091797, "learning_rate": 0.0008747405811946271, "loss": 1763.9508, "step": 2380 }, { "ce_loss_12": 3.43811776638031, "ce_loss_17": 2.9906313180923463, "ce_loss_23": 2.8158723592758177, "ce_loss_3": 4.325619494915008, "ce_loss_6": 3.9896966576576234, "epoch": 0.239, "grad_norm": 872.0, "kl_loss_12": 1358.5515502929688, "kl_loss_17": 380.8702789306641, "kl_loss_3": 3141.8859008789063, "kl_loss_6": 2481.448974609375, "learning_rate": 0.0008736882845346905, "loss": 1779.4078, "step": 2390 }, { "ce_loss_12": 3.497787058353424, "ce_loss_17": 3.0713407397270203, "ce_loss_23": 2.896159815788269, "ce_loss_3": 4.380762910842895, "ce_loss_6": 4.03783073425293, "epoch": 0.24, "grad_norm": 1120.0, "kl_loss_12": 1309.9198181152344, "kl_loss_17": 382.0894317626953, "kl_loss_3": 3064.660021972656, "kl_loss_6": 2392.8274658203127, "learning_rate": 0.0008726322248378774, "loss": 1776.2117, "step": 2400 }, { "ce_loss_12": 3.496664881706238, "ce_loss_17": 3.062252378463745, "ce_loss_23": 2.8999626994132996, "ce_loss_3": 4.393068075180054, "ce_loss_6": 4.060015296936035, "epoch": 0.241, "grad_norm": 788.0, "kl_loss_12": 1330.9965576171876, "kl_loss_17": 364.06876373291016, "kl_loss_3": 3115.4126953125, "kl_loss_6": 2465.3824096679687, "learning_rate": 0.0008715724127386971, "loss": 1828.7469, "step": 2410 }, { "ce_loss_12": 3.5431973457336428, "ce_loss_17": 3.131938111782074, "ce_loss_23": 2.969063603878021, "ce_loss_3": 4.4129211664199826, "ce_loss_6": 4.077617800235748, "epoch": 0.242, "grad_norm": 920.0, "kl_loss_12": 1289.0557250976562, "kl_loss_17": 364.9691192626953, "kl_loss_3": 3012.280969238281, "kl_loss_6": 2371.3688720703126, "learning_rate": 0.0008705088589094458, "loss": 1775.2705, "step": 2420 }, { "ce_loss_12": 3.5635186672210692, "ce_loss_17": 3.1368691086769105, "ce_loss_23": 2.9768545746803285, "ce_loss_3": 4.4323231220245365, "ce_loss_6": 4.099137330055237, "epoch": 0.243, "grad_norm": 932.0, "kl_loss_12": 1306.441229248047, "kl_loss_17": 364.32171020507815, "kl_loss_3": 3046.074914550781, "kl_loss_6": 2390.725549316406, "learning_rate": 0.0008694415740600988, "loss": 1783.0297, "step": 2430 }, { "ce_loss_12": 3.435083818435669, "ce_loss_17": 3.0188833594322206, "ce_loss_23": 2.8394552946090696, "ce_loss_3": 4.335052013397217, "ce_loss_6": 4.008730459213257, "epoch": 0.244, "grad_norm": 1128.0, "kl_loss_12": 1319.4216796875, "kl_loss_17": 391.80567626953126, "kl_loss_3": 3102.2006103515623, "kl_loss_6": 2459.811145019531, "learning_rate": 0.0008683705689382025, "loss": 1790.7793, "step": 2440 }, { "ce_loss_12": 3.501308631896973, "ce_loss_17": 3.086840510368347, "ce_loss_23": 2.922885799407959, "ce_loss_3": 4.338215970993042, "ce_loss_6": 4.02147890329361, "epoch": 0.245, "grad_norm": 992.0, "kl_loss_12": 1284.8777770996094, "kl_loss_17": 372.9107971191406, "kl_loss_3": 2967.5705322265626, "kl_loss_6": 2348.475634765625, "learning_rate": 0.0008672958543287666, "loss": 1789.2359, "step": 2450 }, { "ce_loss_12": 3.5120221972465515, "ce_loss_17": 3.0961593270301817, "ce_loss_23": 2.926725244522095, "ce_loss_3": 4.341679620742798, "ce_loss_6": 4.023214590549469, "epoch": 0.246, "grad_norm": 1176.0, "kl_loss_12": 1297.7512512207031, "kl_loss_17": 373.1650756835937, "kl_loss_3": 2975.7365478515626, "kl_loss_6": 2330.557568359375, "learning_rate": 0.0008662174410541554, "loss": 1745.7102, "step": 2460 }, { "ce_loss_12": 3.470390427112579, "ce_loss_17": 3.0552254199981688, "ce_loss_23": 2.894935131072998, "ce_loss_3": 4.322717833518982, "ce_loss_6": 3.99176082611084, "epoch": 0.247, "grad_norm": 1032.0, "kl_loss_12": 1274.985662841797, "kl_loss_17": 363.30750732421876, "kl_loss_3": 2987.416650390625, "kl_loss_6": 2335.2953918457033, "learning_rate": 0.0008651353399739787, "loss": 1791.3639, "step": 2470 }, { "ce_loss_12": 3.4972774505615236, "ce_loss_17": 3.0815531253814696, "ce_loss_23": 2.915669393539429, "ce_loss_3": 4.35245532989502, "ce_loss_6": 4.024959123134613, "epoch": 0.248, "grad_norm": 884.0, "kl_loss_12": 1287.859619140625, "kl_loss_17": 361.7851028442383, "kl_loss_3": 2980.0279296875, "kl_loss_6": 2345.3993225097656, "learning_rate": 0.0008640495619849821, "loss": 1753.2402, "step": 2480 }, { "ce_loss_12": 3.459804022312164, "ce_loss_17": 3.0489287614822387, "ce_loss_23": 2.8821661949157713, "ce_loss_3": 4.315313053131104, "ce_loss_6": 3.983213782310486, "epoch": 0.249, "grad_norm": 1104.0, "kl_loss_12": 1288.3751342773437, "kl_loss_17": 367.5175216674805, "kl_loss_3": 2994.886682128906, "kl_loss_6": 2342.3803161621095, "learning_rate": 0.0008629601180209381, "loss": 1744.3105, "step": 2490 }, { "ce_loss_12": 3.4463224172592164, "ce_loss_17": 3.037926363945007, "ce_loss_23": 2.870476996898651, "ce_loss_3": 4.3170277118682865, "ce_loss_6": 3.9774172425270082, "epoch": 0.25, "grad_norm": 988.0, "kl_loss_12": 1279.446728515625, "kl_loss_17": 365.6539993286133, "kl_loss_3": 2985.2022094726562, "kl_loss_6": 2335.312158203125, "learning_rate": 0.000861867019052535, "loss": 1779.5314, "step": 2500 }, { "ce_loss_12": 3.405802917480469, "ce_loss_17": 2.9695845365524294, "ce_loss_23": 2.8024597764015198, "ce_loss_3": 4.301933288574219, "ce_loss_6": 3.963208818435669, "epoch": 0.251, "grad_norm": 1016.0, "kl_loss_12": 1317.6872192382812, "kl_loss_17": 365.97511596679686, "kl_loss_3": 3089.4097900390625, "kl_loss_6": 2429.868518066406, "learning_rate": 0.0008607702760872678, "loss": 1795.1891, "step": 2510 }, { "ce_loss_12": 3.4766680002212524, "ce_loss_17": 3.0628212213516237, "ce_loss_23": 2.902873623371124, "ce_loss_3": 4.3218683242797855, "ce_loss_6": 3.9976150155067445, "epoch": 0.252, "grad_norm": 1056.0, "kl_loss_12": 1269.5998901367188, "kl_loss_17": 356.7829559326172, "kl_loss_3": 2949.064074707031, "kl_loss_6": 2307.2, "learning_rate": 0.0008596699001693256, "loss": 1769.2199, "step": 2520 }, { "ce_loss_12": 3.4674214720726013, "ce_loss_17": 3.0689143180847167, "ce_loss_23": 2.9168426513671877, "ce_loss_3": 4.330565083026886, "ce_loss_6": 3.996849024295807, "epoch": 0.253, "grad_norm": 1040.0, "kl_loss_12": 1256.2728576660156, "kl_loss_17": 347.1592544555664, "kl_loss_3": 2967.556774902344, "kl_loss_6": 2312.917028808594, "learning_rate": 0.0008585659023794818, "loss": 1777.5496, "step": 2530 }, { "ce_loss_12": 3.488456690311432, "ce_loss_17": 3.049675261974335, "ce_loss_23": 2.8808682441711424, "ce_loss_3": 4.386857700347901, "ce_loss_6": 4.042752182483673, "epoch": 0.254, "grad_norm": 1024.0, "kl_loss_12": 1329.540185546875, "kl_loss_17": 367.1026809692383, "kl_loss_3": 3106.331298828125, "kl_loss_6": 2430.2492797851564, "learning_rate": 0.0008574582938349817, "loss": 1783.7406, "step": 2540 }, { "ce_loss_12": 3.4827857851982116, "ce_loss_17": 3.033743643760681, "ce_loss_23": 2.856909465789795, "ce_loss_3": 4.363191175460815, "ce_loss_6": 4.0178595662117, "epoch": 0.255, "grad_norm": 928.0, "kl_loss_12": 1360.4717224121093, "kl_loss_17": 387.29022216796875, "kl_loss_3": 3103.3068725585936, "kl_loss_6": 2429.029577636719, "learning_rate": 0.0008563470856894315, "loss": 1761.7795, "step": 2550 }, { "ce_loss_12": 3.453168773651123, "ce_loss_17": 3.019764852523804, "ce_loss_23": 2.8588831305503843, "ce_loss_3": 4.325317311286926, "ce_loss_6": 3.998428213596344, "epoch": 0.256, "grad_norm": 1136.0, "kl_loss_12": 1323.9368774414063, "kl_loss_17": 359.41111907958987, "kl_loss_3": 3045.49697265625, "kl_loss_6": 2401.3049194335936, "learning_rate": 0.0008552322891326845, "loss": 1768.4219, "step": 2560 }, { "ce_loss_12": 3.422929072380066, "ce_loss_17": 3.001448130607605, "ce_loss_23": 2.832953155040741, "ce_loss_3": 4.307469379901886, "ce_loss_6": 3.963564693927765, "epoch": 0.257, "grad_norm": 952.0, "kl_loss_12": 1304.7041931152344, "kl_loss_17": 367.04587707519534, "kl_loss_3": 3057.959326171875, "kl_loss_6": 2391.83447265625, "learning_rate": 0.0008541139153907296, "loss": 1751.0988, "step": 2570 }, { "ce_loss_12": 3.3787537217140198, "ce_loss_17": 2.958510947227478, "ce_loss_23": 2.7977961897850037, "ce_loss_3": 4.249011647701264, "ce_loss_6": 3.9145611047744753, "epoch": 0.258, "grad_norm": 1048.0, "kl_loss_12": 1283.3902404785156, "kl_loss_17": 355.2511352539062, "kl_loss_3": 3008.0134521484374, "kl_loss_6": 2352.897106933594, "learning_rate": 0.0008529919757255782, "loss": 1775.459, "step": 2580 }, { "ce_loss_12": 3.397475516796112, "ce_loss_17": 2.9850598335266114, "ce_loss_23": 2.833599638938904, "ce_loss_3": 4.2180173873901365, "ce_loss_6": 3.8947041869163512, "epoch": 0.259, "grad_norm": 992.0, "kl_loss_12": 1272.6158386230468, "kl_loss_17": 348.8061218261719, "kl_loss_3": 2917.62060546875, "kl_loss_6": 2278.41728515625, "learning_rate": 0.0008518664814351503, "loss": 1723.7156, "step": 2590 }, { "ce_loss_12": 3.398776721954346, "ce_loss_17": 2.960651087760925, "ce_loss_23": 2.7926816940307617, "ce_loss_3": 4.279233813285828, "ce_loss_6": 3.9336220383644105, "epoch": 0.26, "grad_norm": 984.0, "kl_loss_12": 1338.0150085449218, "kl_loss_17": 370.83314056396483, "kl_loss_3": 3081.394177246094, "kl_loss_6": 2407.3505981445314, "learning_rate": 0.0008507374438531607, "loss": 1831.2203, "step": 2600 }, { "ce_loss_12": 3.3708202481269836, "ce_loss_17": 2.939207446575165, "ce_loss_23": 2.7798258900642394, "ce_loss_3": 4.22383736371994, "ce_loss_6": 3.899499309062958, "epoch": 0.261, "grad_norm": 1012.0, "kl_loss_12": 1302.8319213867187, "kl_loss_17": 356.8877914428711, "kl_loss_3": 2991.932568359375, "kl_loss_6": 2358.0376098632814, "learning_rate": 0.0008496048743490053, "loss": 1752.1227, "step": 2610 }, { "ce_loss_12": 3.4945289492607117, "ce_loss_17": 3.0854422569274904, "ce_loss_23": 2.926167845726013, "ce_loss_3": 4.32569922208786, "ce_loss_6": 4.003281497955323, "epoch": 0.262, "grad_norm": 1464.0, "kl_loss_12": 1268.1002868652345, "kl_loss_17": 355.6738220214844, "kl_loss_3": 2944.6495727539063, "kl_loss_6": 2304.16591796875, "learning_rate": 0.0008484687843276469, "loss": 1737.8074, "step": 2620 }, { "ce_loss_12": 3.437246561050415, "ce_loss_17": 3.01961772441864, "ce_loss_23": 2.8550262570381166, "ce_loss_3": 4.30987902879715, "ce_loss_6": 3.969077003002167, "epoch": 0.263, "grad_norm": 924.0, "kl_loss_12": 1300.986590576172, "kl_loss_17": 363.2773895263672, "kl_loss_3": 3022.796911621094, "kl_loss_6": 2354.0840698242187, "learning_rate": 0.0008473291852294987, "loss": 1774.0008, "step": 2630 }, { "ce_loss_12": 3.4466328263282775, "ce_loss_17": 3.0360899448394774, "ce_loss_23": 2.8658475399017336, "ce_loss_3": 4.297406530380249, "ce_loss_6": 3.971087157726288, "epoch": 0.264, "grad_norm": 968.0, "kl_loss_12": 1296.3372619628906, "kl_loss_17": 367.7685241699219, "kl_loss_3": 2995.4369140625, "kl_loss_6": 2360.178326416016, "learning_rate": 0.0008461860885303114, "loss": 1742.2068, "step": 2640 }, { "ce_loss_12": 3.460123133659363, "ce_loss_17": 3.050652766227722, "ce_loss_23": 2.8907677292823792, "ce_loss_3": 4.304844355583191, "ce_loss_6": 3.9868418335914613, "epoch": 0.265, "grad_norm": 1440.0, "kl_loss_12": 1261.4657043457032, "kl_loss_17": 360.51488494873047, "kl_loss_3": 2945.7997192382813, "kl_loss_6": 2313.209997558594, "learning_rate": 0.000845039505741056, "loss": 1748.1709, "step": 2650 }, { "ce_loss_12": 3.465388464927673, "ce_loss_17": 3.0377141714096068, "ce_loss_23": 2.8722215056419373, "ce_loss_3": 4.3243202567100525, "ce_loss_6": 3.991137969493866, "epoch": 0.266, "grad_norm": 1012.0, "kl_loss_12": 1322.8445251464843, "kl_loss_17": 374.1597625732422, "kl_loss_3": 3032.355578613281, "kl_loss_6": 2393.8152709960937, "learning_rate": 0.0008438894484078086, "loss": 1821.5604, "step": 2660 }, { "ce_loss_12": 3.450354981422424, "ce_loss_17": 3.056048274040222, "ce_loss_23": 2.8842979192733766, "ce_loss_3": 4.297944974899292, "ce_loss_6": 3.983082890510559, "epoch": 0.267, "grad_norm": 928.0, "kl_loss_12": 1266.1199890136718, "kl_loss_17": 382.5615264892578, "kl_loss_3": 2958.8844482421873, "kl_loss_6": 2330.66201171875, "learning_rate": 0.0008427359281116334, "loss": 1752.3658, "step": 2670 }, { "ce_loss_12": 3.37357120513916, "ce_loss_17": 2.9571158409118654, "ce_loss_23": 2.7853593945503237, "ce_loss_3": 4.2602328896522526, "ce_loss_6": 3.9257378458976744, "epoch": 0.268, "grad_norm": 1448.0, "kl_loss_12": 1305.2779907226563, "kl_loss_17": 383.0894973754883, "kl_loss_3": 3063.417590332031, "kl_loss_6": 2411.9236450195312, "learning_rate": 0.0008415789564684673, "loss": 1772.8453, "step": 2680 }, { "ce_loss_12": 3.5913300633430483, "ce_loss_17": 3.1934632778167726, "ce_loss_23": 3.019876754283905, "ce_loss_3": 4.42219307422638, "ce_loss_6": 4.088761901855468, "epoch": 0.269, "grad_norm": 1136.0, "kl_loss_12": 1267.3768493652344, "kl_loss_17": 386.0883987426758, "kl_loss_3": 2907.607763671875, "kl_loss_6": 2264.976397705078, "learning_rate": 0.0008404185451290017, "loss": 1715.9904, "step": 2690 }, { "ce_loss_12": 3.4683207511901855, "ce_loss_17": 3.07811154127121, "ce_loss_23": 2.8965150237083437, "ce_loss_3": 4.320998978614807, "ce_loss_6": 3.992653453350067, "epoch": 0.27, "grad_norm": 1240.0, "kl_loss_12": 1260.81787109375, "kl_loss_17": 394.48106994628904, "kl_loss_3": 2952.572314453125, "kl_loss_6": 2312.224505615234, "learning_rate": 0.0008392547057785661, "loss": 1735.7289, "step": 2700 }, { "ce_loss_12": 3.4322686553001405, "ce_loss_17": 3.0301548719406126, "ce_loss_23": 2.8392791628837584, "ce_loss_3": 4.310086572170258, "ce_loss_6": 3.9823203444480897, "epoch": 0.271, "grad_norm": 880.0, "kl_loss_12": 1318.2845825195313, "kl_loss_17": 423.6947479248047, "kl_loss_3": 3075.54521484375, "kl_loss_6": 2435.184326171875, "learning_rate": 0.0008380874501370098, "loss": 1750.4709, "step": 2710 }, { "ce_loss_12": 3.4204657673835754, "ce_loss_17": 3.0037129402160643, "ce_loss_23": 2.821027398109436, "ce_loss_3": 4.296162378787995, "ce_loss_6": 3.9711161613464356, "epoch": 0.272, "grad_norm": 928.0, "kl_loss_12": 1301.1679565429688, "kl_loss_17": 401.3130187988281, "kl_loss_3": 3049.724011230469, "kl_loss_6": 2402.8673461914063, "learning_rate": 0.0008369167899585841, "loss": 1774.0307, "step": 2720 }, { "ce_loss_12": 3.487650990486145, "ce_loss_17": 3.1002121210098266, "ce_loss_23": 2.936123478412628, "ce_loss_3": 4.30772819519043, "ce_loss_6": 3.9867376923561095, "epoch": 0.273, "grad_norm": 1088.0, "kl_loss_12": 1245.5649353027343, "kl_loss_17": 369.61613006591796, "kl_loss_3": 2892.830920410156, "kl_loss_6": 2261.4607543945312, "learning_rate": 0.0008357427370318238, "loss": 1752.9934, "step": 2730 }, { "ce_loss_12": 3.45722119808197, "ce_loss_17": 3.0482855081558227, "ce_loss_23": 2.884982371330261, "ce_loss_3": 4.328402829170227, "ce_loss_6": 4.003220283985138, "epoch": 0.274, "grad_norm": 1320.0, "kl_loss_12": 1274.736767578125, "kl_loss_17": 362.33515014648435, "kl_loss_3": 3004.6664428710938, "kl_loss_6": 2365.4545654296876, "learning_rate": 0.0008345653031794292, "loss": 1758.5988, "step": 2740 }, { "ce_loss_12": 3.468037247657776, "ce_loss_17": 3.0583531856536865, "ce_loss_23": 2.8902764678001405, "ce_loss_3": 4.325004518032074, "ce_loss_6": 3.9913942217826843, "epoch": 0.275, "grad_norm": 1248.0, "kl_loss_12": 1277.5965454101563, "kl_loss_17": 370.7550796508789, "kl_loss_3": 2985.575329589844, "kl_loss_6": 2330.1687194824217, "learning_rate": 0.0008333845002581458, "loss": 1745.3102, "step": 2750 }, { "ce_loss_12": 3.415788435935974, "ce_loss_17": 2.9950094342231752, "ce_loss_23": 2.826165699958801, "ce_loss_3": 4.2899749159812925, "ce_loss_6": 3.957856571674347, "epoch": 0.276, "grad_norm": 1040.0, "kl_loss_12": 1311.6923767089843, "kl_loss_17": 368.13220062255857, "kl_loss_3": 3057.960583496094, "kl_loss_6": 2412.7806030273437, "learning_rate": 0.0008322003401586462, "loss": 1780.4906, "step": 2760 }, { "ce_loss_12": 3.4133952260017395, "ce_loss_17": 3.0063074231147766, "ce_loss_23": 2.8547223687171934, "ce_loss_3": 4.248537862300873, "ce_loss_6": 3.930584025382996, "epoch": 0.277, "grad_norm": 1120.0, "kl_loss_12": 1237.89150390625, "kl_loss_17": 345.06705474853516, "kl_loss_3": 2913.3134033203123, "kl_loss_6": 2285.3903869628907, "learning_rate": 0.0008310128348054094, "loss": 1679.1152, "step": 2770 }, { "ce_loss_12": 3.3873406529426573, "ce_loss_17": 2.9817344427108763, "ce_loss_23": 2.826510119438171, "ce_loss_3": 4.259957826137542, "ce_loss_6": 3.922231209278107, "epoch": 0.278, "grad_norm": 1056.0, "kl_loss_12": 1256.4411865234374, "kl_loss_17": 350.2546112060547, "kl_loss_3": 2975.923962402344, "kl_loss_6": 2313.7191162109375, "learning_rate": 0.0008298219961566008, "loss": 1732.8988, "step": 2780 }, { "ce_loss_12": 3.371931481361389, "ce_loss_17": 2.9464340090751646, "ce_loss_23": 2.7890324234962462, "ce_loss_3": 4.25629768371582, "ce_loss_6": 3.9189098596572878, "epoch": 0.279, "grad_norm": 1368.0, "kl_loss_12": 1300.705126953125, "kl_loss_17": 354.02294006347654, "kl_loss_3": 3076.6906494140626, "kl_loss_6": 2407.201062011719, "learning_rate": 0.0008286278362039527, "loss": 1741.5291, "step": 2790 }, { "ce_loss_12": 3.410293257236481, "ce_loss_17": 2.983790063858032, "ce_loss_23": 2.818021869659424, "ce_loss_3": 4.301714396476745, "ce_loss_6": 3.9640199065208437, "epoch": 0.28, "grad_norm": 1040.0, "kl_loss_12": 1309.4822204589843, "kl_loss_17": 358.61011657714846, "kl_loss_3": 3090.107067871094, "kl_loss_6": 2422.9576782226563, "learning_rate": 0.0008274303669726426, "loss": 1748.3598, "step": 2800 }, { "ce_loss_12": 3.3253208994865417, "ce_loss_17": 2.8995402812957765, "ce_loss_23": 2.737238872051239, "ce_loss_3": 4.240236687660217, "ce_loss_6": 3.903521752357483, "epoch": 0.281, "grad_norm": 900.0, "kl_loss_12": 1297.9562866210938, "kl_loss_17": 362.3501480102539, "kl_loss_3": 3113.5695922851564, "kl_loss_6": 2452.9843994140624, "learning_rate": 0.0008262296005211721, "loss": 1740.7102, "step": 2810 }, { "ce_loss_12": 3.437107729911804, "ce_loss_17": 3.014890968799591, "ce_loss_23": 2.8532368183135985, "ce_loss_3": 4.30629141330719, "ce_loss_6": 3.9686094522476196, "epoch": 0.282, "grad_norm": 1176.0, "kl_loss_12": 1297.9700988769532, "kl_loss_17": 360.28114929199216, "kl_loss_3": 3020.598486328125, "kl_loss_6": 2359.4715087890627, "learning_rate": 0.0008250255489412463, "loss": 1736.0588, "step": 2820 }, { "ce_loss_12": 3.5168994188308718, "ce_loss_17": 3.103827476501465, "ce_loss_23": 2.9425456881523133, "ce_loss_3": 4.380283188819885, "ce_loss_6": 4.0496561288833615, "epoch": 0.283, "grad_norm": 940.0, "kl_loss_12": 1263.3302734375, "kl_loss_17": 358.89661102294923, "kl_loss_3": 2991.8945556640624, "kl_loss_6": 2343.1708740234376, "learning_rate": 0.0008238182243576511, "loss": 1739.1707, "step": 2830 }, { "ce_loss_12": 3.4478026747703554, "ce_loss_17": 3.0591145396232604, "ce_loss_23": 2.9078246116638184, "ce_loss_3": 4.258657145500183, "ce_loss_6": 3.9365967869758607, "epoch": 0.284, "grad_norm": 988.0, "kl_loss_12": 1218.239813232422, "kl_loss_17": 343.7883529663086, "kl_loss_3": 2836.483447265625, "kl_loss_6": 2205.292608642578, "learning_rate": 0.0008226076389281315, "loss": 1684.1754, "step": 2840 }, { "ce_loss_12": 3.5043246269226076, "ce_loss_17": 3.104953384399414, "ce_loss_23": 2.9533308386802672, "ce_loss_3": 4.344747185707092, "ce_loss_6": 4.0262510776519775, "epoch": 0.285, "grad_norm": 1120.0, "kl_loss_12": 1250.2501098632813, "kl_loss_17": 346.8875228881836, "kl_loss_3": 2944.923376464844, "kl_loss_6": 2315.420861816406, "learning_rate": 0.0008213938048432696, "loss": 1694.4004, "step": 2850 }, { "ce_loss_12": 3.443698799610138, "ce_loss_17": 3.0415454149246215, "ce_loss_23": 2.8757529616355897, "ce_loss_3": 4.296736359596252, "ce_loss_6": 3.968770682811737, "epoch": 0.286, "grad_norm": 1400.0, "kl_loss_12": 1260.857196044922, "kl_loss_17": 360.1633895874023, "kl_loss_3": 2964.88828125, "kl_loss_6": 2322.2742919921875, "learning_rate": 0.0008201767343263612, "loss": 1735.0531, "step": 2860 }, { "ce_loss_12": 3.408144676685333, "ce_loss_17": 2.992103934288025, "ce_loss_23": 2.8287665486335754, "ce_loss_3": 4.272946655750275, "ce_loss_6": 3.9430832743644713, "epoch": 0.287, "grad_norm": 1032.0, "kl_loss_12": 1275.9811218261718, "kl_loss_17": 349.76739501953125, "kl_loss_3": 3006.3862426757814, "kl_loss_6": 2355.1337646484376, "learning_rate": 0.0008189564396332927, "loss": 1692.4711, "step": 2870 }, { "ce_loss_12": 3.3988120436668394, "ce_loss_17": 2.9796494960784914, "ce_loss_23": 2.822283446788788, "ce_loss_3": 4.284251952171326, "ce_loss_6": 3.934915232658386, "epoch": 0.288, "grad_norm": 956.0, "kl_loss_12": 1261.259295654297, "kl_loss_17": 351.22632751464846, "kl_loss_3": 3018.626599121094, "kl_loss_6": 2347.247448730469, "learning_rate": 0.0008177329330524181, "loss": 1743.3631, "step": 2880 }, { "ce_loss_12": 3.42569385766983, "ce_loss_17": 3.0192829370498657, "ce_loss_23": 2.860982358455658, "ce_loss_3": 4.265313994884491, "ce_loss_6": 3.939829432964325, "epoch": 0.289, "grad_norm": 936.0, "kl_loss_12": 1248.4840698242188, "kl_loss_17": 348.7205474853516, "kl_loss_3": 2906.6572387695314, "kl_loss_6": 2269.7896362304687, "learning_rate": 0.0008165062269044352, "loss": 1709.3297, "step": 2890 }, { "ce_loss_12": 3.422416353225708, "ce_loss_17": 2.9858880400657655, "ce_loss_23": 2.8259318113327025, "ce_loss_3": 4.268325328826904, "ce_loss_6": 3.9391417264938355, "epoch": 0.29, "grad_norm": 1264.0, "kl_loss_12": 1315.4104431152343, "kl_loss_17": 357.4076446533203, "kl_loss_3": 3013.7338623046876, "kl_loss_6": 2362.8879516601564, "learning_rate": 0.0008152763335422613, "loss": 1761.2363, "step": 2900 }, { "ce_loss_12": 3.4009305357933046, "ce_loss_17": 2.974248266220093, "ce_loss_23": 2.8042847514152527, "ce_loss_3": 4.2600690722465515, "ce_loss_6": 3.9201393485069276, "epoch": 0.291, "grad_norm": 1144.0, "kl_loss_12": 1298.79541015625, "kl_loss_17": 360.68007507324216, "kl_loss_3": 3015.308837890625, "kl_loss_6": 2350.725445556641, "learning_rate": 0.0008140432653509088, "loss": 1733.8855, "step": 2910 }, { "ce_loss_12": 3.433920180797577, "ce_loss_17": 3.023169231414795, "ce_loss_23": 2.8585532546043395, "ce_loss_3": 4.279167592525482, "ce_loss_6": 3.94570974111557, "epoch": 0.292, "grad_norm": 980.0, "kl_loss_12": 1282.2089416503907, "kl_loss_17": 364.0143417358398, "kl_loss_3": 2988.808801269531, "kl_loss_6": 2321.889697265625, "learning_rate": 0.0008128070347473608, "loss": 1725.4188, "step": 2920 }, { "ce_loss_12": 3.452561819553375, "ce_loss_17": 3.036209762096405, "ce_loss_23": 2.86896470785141, "ce_loss_3": 4.338265037536621, "ce_loss_6": 3.9918197631835937, "epoch": 0.293, "grad_norm": 1368.0, "kl_loss_12": 1288.3324584960938, "kl_loss_17": 367.0218811035156, "kl_loss_3": 3065.8913818359374, "kl_loss_6": 2384.238067626953, "learning_rate": 0.0008115676541804455, "loss": 1742.3186, "step": 2930 }, { "ce_loss_12": 3.429081213474274, "ce_loss_17": 3.032915246486664, "ce_loss_23": 2.8716418743133545, "ce_loss_3": 4.276789140701294, "ce_loss_6": 3.95506272315979, "epoch": 0.294, "grad_norm": 1160.0, "kl_loss_12": 1249.3301696777344, "kl_loss_17": 362.1129867553711, "kl_loss_3": 2951.0944580078126, "kl_loss_6": 2311.5976318359376, "learning_rate": 0.0008103251361307119, "loss": 1741.0098, "step": 2940 }, { "ce_loss_12": 3.467066729068756, "ce_loss_17": 3.0619003772735596, "ce_loss_23": 2.8990610003471375, "ce_loss_3": 4.321110010147095, "ce_loss_6": 3.990778183937073, "epoch": 0.295, "grad_norm": 984.0, "kl_loss_12": 1265.3674926757812, "kl_loss_17": 364.09894409179685, "kl_loss_3": 2966.912878417969, "kl_loss_6": 2325.2829406738283, "learning_rate": 0.0008090794931103026, "loss": 1712.6641, "step": 2950 }, { "ce_loss_12": 3.4314486742019654, "ce_loss_17": 3.0392224907875063, "ce_loss_23": 2.88177170753479, "ce_loss_3": 4.275236093997956, "ce_loss_6": 3.951506805419922, "epoch": 0.296, "grad_norm": 1048.0, "kl_loss_12": 1241.217755126953, "kl_loss_17": 350.6375930786133, "kl_loss_3": 2931.218994140625, "kl_loss_6": 2290.7966796875, "learning_rate": 0.0008078307376628291, "loss": 1712.4719, "step": 2960 }, { "ce_loss_12": 3.4813525915145873, "ce_loss_17": 3.095889449119568, "ce_loss_23": 2.9383904099464417, "ce_loss_3": 4.30640218257904, "ce_loss_6": 3.9789917945861815, "epoch": 0.297, "grad_norm": 1320.0, "kl_loss_12": 1200.214617919922, "kl_loss_17": 356.74906463623046, "kl_loss_3": 2835.6501342773436, "kl_loss_6": 2204.1634033203127, "learning_rate": 0.000806578882363245, "loss": 1654.0092, "step": 2970 }, { "ce_loss_12": 3.4101366281509398, "ce_loss_17": 3.0129162073135376, "ce_loss_23": 2.86002494096756, "ce_loss_3": 4.239542484283447, "ce_loss_6": 3.9237579584121702, "epoch": 0.298, "grad_norm": 996.0, "kl_loss_12": 1232.5573425292969, "kl_loss_17": 347.3342086791992, "kl_loss_3": 2897.1833251953126, "kl_loss_6": 2273.477685546875, "learning_rate": 0.0008053239398177191, "loss": 1723.9477, "step": 2980 }, { "ce_loss_12": 3.4091039061546327, "ce_loss_17": 3.007597863674164, "ce_loss_23": 2.847511053085327, "ce_loss_3": 4.270219349861145, "ce_loss_6": 3.9442752599716187, "epoch": 0.299, "grad_norm": 1000.0, "kl_loss_12": 1248.1081665039062, "kl_loss_17": 354.12125701904296, "kl_loss_3": 2963.1808349609373, "kl_loss_6": 2330.194610595703, "learning_rate": 0.0008040659226635089, "loss": 1757.8508, "step": 2990 }, { "ce_loss_12": 3.527987062931061, "ce_loss_17": 3.1199743270874025, "ce_loss_23": 2.954590117931366, "ce_loss_3": 4.3627019882202145, "ce_loss_6": 4.041871237754822, "epoch": 0.3, "grad_norm": 920.0, "kl_loss_12": 1262.4868103027343, "kl_loss_17": 369.21068878173827, "kl_loss_3": 2945.7930297851562, "kl_loss_6": 2307.741522216797, "learning_rate": 0.0008028048435688333, "loss": 1701.9465, "step": 3000 }, { "ce_loss_12": 3.424453687667847, "ce_loss_17": 3.0054264664649963, "ce_loss_23": 2.8507750153541567, "ce_loss_3": 4.295927572250366, "ce_loss_6": 3.9547128438949586, "epoch": 0.301, "grad_norm": 872.0, "kl_loss_12": 1277.3141723632812, "kl_loss_17": 354.91680297851565, "kl_loss_3": 3015.4612426757812, "kl_loss_6": 2361.0749755859374, "learning_rate": 0.0008015407152327448, "loss": 1730.5312, "step": 3010 }, { "ce_loss_12": 3.4446123480796813, "ce_loss_17": 3.047043168544769, "ce_loss_23": 2.8878801107406615, "ce_loss_3": 4.314627742767334, "ce_loss_6": 3.9756017565727233, "epoch": 0.302, "grad_norm": 1104.0, "kl_loss_12": 1255.7592834472657, "kl_loss_17": 354.1048187255859, "kl_loss_3": 3005.7612426757814, "kl_loss_6": 2343.475305175781, "learning_rate": 0.0008002735503850016, "loss": 1735.2307, "step": 3020 }, { "ce_loss_12": 3.3590387105941772, "ce_loss_17": 2.944441223144531, "ce_loss_23": 2.785951316356659, "ce_loss_3": 4.251755249500275, "ce_loss_6": 3.910470759868622, "epoch": 0.303, "grad_norm": 1208.0, "kl_loss_12": 1281.0070678710938, "kl_loss_17": 354.10951843261716, "kl_loss_3": 3049.8285888671876, "kl_loss_6": 2380.6233093261717, "learning_rate": 0.0007990033617859396, "loss": 1756.1746, "step": 3030 }, { "ce_loss_12": 3.398754870891571, "ce_loss_17": 2.9987738013267515, "ce_loss_23": 2.8411520957946776, "ce_loss_3": 4.253010249137878, "ce_loss_6": 3.915536332130432, "epoch": 0.304, "grad_norm": 996.0, "kl_loss_12": 1240.5551879882812, "kl_loss_17": 345.8853164672852, "kl_loss_3": 2941.6995849609375, "kl_loss_6": 2288.344030761719, "learning_rate": 0.000797730162226344, "loss": 1664.4863, "step": 3040 }, { "ce_loss_12": 3.423166370391846, "ce_loss_17": 3.015391969680786, "ce_loss_23": 2.857586407661438, "ce_loss_3": 4.28016722202301, "ce_loss_6": 3.9513394713401793, "epoch": 0.305, "grad_norm": 1280.0, "kl_loss_12": 1252.754815673828, "kl_loss_17": 347.2244384765625, "kl_loss_3": 2958.3191040039064, "kl_loss_6": 2312.717321777344, "learning_rate": 0.0007964539645273203, "loss": 1708.0309, "step": 3050 }, { "ce_loss_12": 3.4114735603332518, "ce_loss_17": 3.025020956993103, "ce_loss_23": 2.880479061603546, "ce_loss_3": 4.265051865577698, "ce_loss_6": 3.9374590635299684, "epoch": 0.306, "grad_norm": 984.0, "kl_loss_12": 1201.3837463378907, "kl_loss_17": 333.77726287841796, "kl_loss_3": 2892.557958984375, "kl_loss_6": 2244.813525390625, "learning_rate": 0.000795174781540165, "loss": 1696.2406, "step": 3060 }, { "ce_loss_12": 3.4819996476173403, "ce_loss_17": 3.0955939888954163, "ce_loss_23": 2.9482061147689818, "ce_loss_3": 4.297872924804688, "ce_loss_6": 3.9962024688720703, "epoch": 0.307, "grad_norm": 824.0, "kl_loss_12": 1201.2342224121094, "kl_loss_17": 335.6284637451172, "kl_loss_3": 2830.093212890625, "kl_loss_6": 2229.5277893066404, "learning_rate": 0.0007938926261462366, "loss": 1689.6482, "step": 3070 }, { "ce_loss_12": 3.4330130338668825, "ce_loss_17": 3.051720929145813, "ce_loss_23": 2.897457814216614, "ce_loss_3": 4.26753751039505, "ce_loss_6": 3.942421293258667, "epoch": 0.308, "grad_norm": 1032.0, "kl_loss_12": 1214.312567138672, "kl_loss_17": 344.4313339233398, "kl_loss_3": 2897.050341796875, "kl_loss_6": 2260.629071044922, "learning_rate": 0.0007926075112568258, "loss": 1714.8252, "step": 3080 }, { "ce_loss_12": 3.4475087881088258, "ce_loss_17": 3.053488528728485, "ce_loss_23": 2.8923556566238404, "ce_loss_3": 4.28842933177948, "ce_loss_6": 3.9646134853363035, "epoch": 0.309, "grad_norm": 928.0, "kl_loss_12": 1240.5612548828126, "kl_loss_17": 354.20672912597655, "kl_loss_3": 2923.2433959960936, "kl_loss_6": 2286.2694580078123, "learning_rate": 0.0007913194498130252, "loss": 1677.05, "step": 3090 }, { "ce_loss_12": 3.3970143675804136, "ce_loss_17": 2.9865134000778197, "ce_loss_23": 2.8217122316360475, "ce_loss_3": 4.25620584487915, "ce_loss_6": 3.9191672444343566, "epoch": 0.31, "grad_norm": 1240.0, "kl_loss_12": 1247.7857238769532, "kl_loss_17": 356.9169692993164, "kl_loss_3": 2962.325329589844, "kl_loss_6": 2299.6020629882814, "learning_rate": 0.0007900284547855992, "loss": 1725.1816, "step": 3100 }, { "ce_loss_12": 3.385082983970642, "ce_loss_17": 2.99058358669281, "ce_loss_23": 2.837337374687195, "ce_loss_3": 4.213411450386047, "ce_loss_6": 3.886853301525116, "epoch": 0.311, "grad_norm": 804.0, "kl_loss_12": 1234.5562255859375, "kl_loss_17": 350.2632308959961, "kl_loss_3": 2897.912805175781, "kl_loss_6": 2260.5111755371095, "learning_rate": 0.0007887345391748532, "loss": 1719.3498, "step": 3110 }, { "ce_loss_12": 3.4767756819725038, "ce_loss_17": 3.0919549822807313, "ce_loss_23": 2.939840757846832, "ce_loss_3": 4.2933003306388855, "ce_loss_6": 3.976503777503967, "epoch": 0.312, "grad_norm": 1264.0, "kl_loss_12": 1205.0848999023438, "kl_loss_17": 345.1719009399414, "kl_loss_3": 2828.3422607421876, "kl_loss_6": 2205.365673828125, "learning_rate": 0.0007874377160105036, "loss": 1645.6273, "step": 3120 }, { "ce_loss_12": 3.428488540649414, "ce_loss_17": 3.0121246337890626, "ce_loss_23": 2.8537957429885865, "ce_loss_3": 4.28159236907959, "ce_loss_6": 3.9406146883964537, "epoch": 0.313, "grad_norm": 928.0, "kl_loss_12": 1269.7479125976563, "kl_loss_17": 346.16997680664065, "kl_loss_3": 2976.1847412109373, "kl_loss_6": 2310.867999267578, "learning_rate": 0.0007861379983515449, "loss": 1755.5637, "step": 3130 }, { "ce_loss_12": 3.478686714172363, "ce_loss_17": 3.074314284324646, "ce_loss_23": 2.9215672969818116, "ce_loss_3": 4.305224299430847, "ce_loss_6": 3.9890227794647215, "epoch": 0.314, "grad_norm": 1012.0, "kl_loss_12": 1248.4745849609376, "kl_loss_17": 339.9023941040039, "kl_loss_3": 2893.8021484375, "kl_loss_6": 2270.5517761230467, "learning_rate": 0.0007848353992861195, "loss": 1679.8211, "step": 3140 }, { "ce_loss_12": 3.574088752269745, "ce_loss_17": 3.160393226146698, "ce_loss_23": 2.9927392840385436, "ce_loss_3": 4.402963733673095, "ce_loss_6": 4.084368908405304, "epoch": 0.315, "grad_norm": 932.0, "kl_loss_12": 1281.5710205078126, "kl_loss_17": 361.84169921875, "kl_loss_3": 2930.8458740234373, "kl_loss_6": 2299.417041015625, "learning_rate": 0.0007835299319313853, "loss": 1721.2396, "step": 3150 }, { "ce_loss_12": 3.441893661022186, "ce_loss_17": 3.050514340400696, "ce_loss_23": 2.9018136262893677, "ce_loss_3": 4.269163203239441, "ce_loss_6": 3.950773096084595, "epoch": 0.316, "grad_norm": 1112.0, "kl_loss_12": 1213.3498229980469, "kl_loss_17": 338.602278137207, "kl_loss_3": 2869.5227783203127, "kl_loss_6": 2248.8671264648438, "learning_rate": 0.0007822216094333848, "loss": 1721.7668, "step": 3160 }, { "ce_loss_12": 3.4707986950874328, "ce_loss_17": 3.0577391386032104, "ce_loss_23": 2.905358004570007, "ce_loss_3": 4.321441674232483, "ce_loss_6": 3.9936294078826906, "epoch": 0.317, "grad_norm": 1024.0, "kl_loss_12": 1257.6893615722656, "kl_loss_17": 341.61038665771486, "kl_loss_3": 2954.4898315429687, "kl_loss_6": 2308.9023193359376, "learning_rate": 0.0007809104449669101, "loss": 1698.5734, "step": 3170 }, { "ce_loss_12": 3.3872597932815554, "ce_loss_17": 2.9946960330009462, "ce_loss_23": 2.853222095966339, "ce_loss_3": 4.220446193218232, "ce_loss_6": 3.9008307218551637, "epoch": 0.318, "grad_norm": 1536.0, "kl_loss_12": 1210.7290832519532, "kl_loss_17": 331.215837097168, "kl_loss_3": 2866.191455078125, "kl_loss_6": 2238.608123779297, "learning_rate": 0.0007795964517353734, "loss": 1672.657, "step": 3180 }, { "ce_loss_12": 3.403947818279266, "ce_loss_17": 3.000806951522827, "ce_loss_23": 2.8538204431533813, "ce_loss_3": 4.2604128241539, "ce_loss_6": 3.930659818649292, "epoch": 0.319, "grad_norm": 980.0, "kl_loss_12": 1233.635791015625, "kl_loss_17": 336.13388977050784, "kl_loss_3": 2938.8574462890624, "kl_loss_6": 2292.6660705566405, "learning_rate": 0.000778279642970672, "loss": 1661.2771, "step": 3190 }, { "ce_loss_12": 3.3985164761543274, "ce_loss_17": 3.008086919784546, "ce_loss_23": 2.8599029898643495, "ce_loss_3": 4.225405025482178, "ce_loss_6": 3.910887622833252, "epoch": 0.32, "grad_norm": 1088.0, "kl_loss_12": 1213.4981201171875, "kl_loss_17": 333.7114624023437, "kl_loss_3": 2869.0503173828124, "kl_loss_6": 2258.674560546875, "learning_rate": 0.0007769600319330552, "loss": 1655.7844, "step": 3200 }, { "ce_loss_12": 3.447622609138489, "ce_loss_17": 3.0317168831825256, "ce_loss_23": 2.884088695049286, "ce_loss_3": 4.318697500228882, "ce_loss_6": 3.983830189704895, "epoch": 0.321, "grad_norm": 1224.0, "kl_loss_12": 1245.9826690673829, "kl_loss_17": 337.38295593261716, "kl_loss_3": 2983.2697509765626, "kl_loss_6": 2329.8124755859376, "learning_rate": 0.0007756376319109917, "loss": 1698.1805, "step": 3210 }, { "ce_loss_12": 3.4633454203605654, "ce_loss_17": 3.067983663082123, "ce_loss_23": 2.924458014965057, "ce_loss_3": 4.289544808864593, "ce_loss_6": 3.9699981570243836, "epoch": 0.322, "grad_norm": 1176.0, "kl_loss_12": 1213.3118133544922, "kl_loss_17": 330.83406219482424, "kl_loss_3": 2866.3750732421877, "kl_loss_6": 2242.109002685547, "learning_rate": 0.0007743124562210351, "loss": 1639.6686, "step": 3220 }, { "ce_loss_12": 3.475546681880951, "ce_loss_17": 3.0862147331237795, "ce_loss_23": 2.937591588497162, "ce_loss_3": 4.305411899089814, "ce_loss_6": 3.9904374480247498, "epoch": 0.323, "grad_norm": 1504.0, "kl_loss_12": 1211.6309631347656, "kl_loss_17": 333.8129486083984, "kl_loss_3": 2886.2772583007813, "kl_loss_6": 2262.567352294922, "learning_rate": 0.0007729845182076895, "loss": 1680.683, "step": 3230 }, { "ce_loss_12": 3.410216248035431, "ce_loss_17": 3.0217228651046755, "ce_loss_23": 2.877736246585846, "ce_loss_3": 4.229400503635406, "ce_loss_6": 3.9092357754707336, "epoch": 0.324, "grad_norm": 1048.0, "kl_loss_12": 1201.3193969726562, "kl_loss_17": 328.6470672607422, "kl_loss_3": 2836.2654541015627, "kl_loss_6": 2212.4843994140624, "learning_rate": 0.0007716538312432765, "loss": 1695.1324, "step": 3240 }, { "ce_loss_12": 3.390603744983673, "ce_loss_17": 2.9847122192382813, "ce_loss_23": 2.832513761520386, "ce_loss_3": 4.25310173034668, "ce_loss_6": 3.921470022201538, "epoch": 0.325, "grad_norm": 1240.0, "kl_loss_12": 1255.1136657714844, "kl_loss_17": 344.44752044677733, "kl_loss_3": 2969.5409912109376, "kl_loss_6": 2320.2986572265627, "learning_rate": 0.0007703204087277988, "loss": 1709.0459, "step": 3250 }, { "ce_loss_12": 3.451129651069641, "ce_loss_17": 3.0698804378509523, "ce_loss_23": 2.9265930533409117, "ce_loss_3": 4.268451309204101, "ce_loss_6": 3.950474727153778, "epoch": 0.326, "grad_norm": 908.0, "kl_loss_12": 1186.6741638183594, "kl_loss_17": 325.9498016357422, "kl_loss_3": 2813.0351440429686, "kl_loss_6": 2200.3617919921876, "learning_rate": 0.0007689842640888063, "loss": 1644.4848, "step": 3260 }, { "ce_loss_12": 3.4569345355033874, "ce_loss_17": 3.0677335143089293, "ce_loss_23": 2.9155858874320986, "ce_loss_3": 4.279983472824097, "ce_loss_6": 3.9633984923362733, "epoch": 0.327, "grad_norm": 1208.0, "kl_loss_12": 1202.497802734375, "kl_loss_17": 335.511946105957, "kl_loss_3": 2833.3481323242186, "kl_loss_6": 2213.333331298828, "learning_rate": 0.0007676454107812607, "loss": 1664.0393, "step": 3270 }, { "ce_loss_12": 3.4128082990646362, "ce_loss_17": 3.0163416266441345, "ce_loss_23": 2.8643674731254576, "ce_loss_3": 4.269602704048157, "ce_loss_6": 3.942162013053894, "epoch": 0.328, "grad_norm": 1072.0, "kl_loss_12": 1230.0508850097656, "kl_loss_17": 342.1333236694336, "kl_loss_3": 2943.2237182617187, "kl_loss_6": 2295.276818847656, "learning_rate": 0.0007663038622873999, "loss": 1674.4738, "step": 3280 }, { "ce_loss_12": 3.444205844402313, "ce_loss_17": 3.0551442742347716, "ce_loss_23": 2.9088870882987976, "ce_loss_3": 4.284962522983551, "ce_loss_6": 3.965476393699646, "epoch": 0.329, "grad_norm": 996.0, "kl_loss_12": 1205.8692626953125, "kl_loss_17": 337.4704162597656, "kl_loss_3": 2886.572998046875, "kl_loss_6": 2260.4553100585936, "learning_rate": 0.0007649596321166025, "loss": 1644.5602, "step": 3290 }, { "ce_loss_12": 3.3512764811515807, "ce_loss_17": 2.963244104385376, "ce_loss_23": 2.816838335990906, "ce_loss_3": 4.173265266418457, "ce_loss_6": 3.848917078971863, "epoch": 0.33, "grad_norm": 1176.0, "kl_loss_12": 1188.5720703125, "kl_loss_17": 328.98443298339845, "kl_loss_3": 2824.7002563476562, "kl_loss_6": 2182.885040283203, "learning_rate": 0.0007636127338052513, "loss": 1658.1311, "step": 3300 }, { "ce_loss_12": 3.459541440010071, "ce_loss_17": 3.0586934447288514, "ce_loss_23": 2.9092486619949343, "ce_loss_3": 4.322417807579041, "ce_loss_6": 4.004756271839142, "epoch": 0.331, "grad_norm": 956.0, "kl_loss_12": 1244.9644409179687, "kl_loss_17": 339.57240905761716, "kl_loss_3": 2961.9379638671876, "kl_loss_6": 2337.522784423828, "learning_rate": 0.0007622631809165971, "loss": 1680.2289, "step": 3310 }, { "ce_loss_12": 3.417771947383881, "ce_loss_17": 3.0409359097480775, "ce_loss_23": 2.904327702522278, "ce_loss_3": 4.219339740276337, "ce_loss_6": 3.9072014331817626, "epoch": 0.332, "grad_norm": 932.0, "kl_loss_12": 1154.7458221435547, "kl_loss_17": 314.9847137451172, "kl_loss_3": 2750.251428222656, "kl_loss_6": 2141.974853515625, "learning_rate": 0.000760910987040623, "loss": 1631.1079, "step": 3320 }, { "ce_loss_12": 3.4493923664093016, "ce_loss_17": 3.0475847482681275, "ce_loss_23": 2.890909481048584, "ce_loss_3": 4.309184527397155, "ce_loss_6": 3.9842631340026857, "epoch": 0.333, "grad_norm": 1004.0, "kl_loss_12": 1254.3341552734375, "kl_loss_17": 344.63670349121094, "kl_loss_3": 2990.2296508789063, "kl_loss_6": 2340.0262939453123, "learning_rate": 0.000759556165793906, "loss": 1678.9332, "step": 3330 }, { "ce_loss_12": 3.444781243801117, "ce_loss_17": 3.0487191557884215, "ce_loss_23": 2.8992719531059263, "ce_loss_3": 4.283552169799805, "ce_loss_6": 3.961264455318451, "epoch": 0.334, "grad_norm": 1312.0, "kl_loss_12": 1226.6700500488282, "kl_loss_17": 337.08323059082034, "kl_loss_3": 2898.0889892578125, "kl_loss_6": 2265.684948730469, "learning_rate": 0.000758198730819481, "loss": 1694.3148, "step": 3340 }, { "ce_loss_12": 3.4056811690330506, "ce_loss_17": 3.015982913970947, "ce_loss_23": 2.8739725828170775, "ce_loss_3": 4.259327507019043, "ce_loss_6": 3.92424658536911, "epoch": 0.335, "grad_norm": 1004.0, "kl_loss_12": 1191.1532836914062, "kl_loss_17": 325.79417724609374, "kl_loss_3": 2897.7123168945313, "kl_loss_6": 2244.7325927734373, "learning_rate": 0.0007568386957867032, "loss": 1669.3246, "step": 3350 }, { "ce_loss_12": 3.4579933881759644, "ce_loss_17": 3.0634746313095094, "ce_loss_23": 2.9128017902374266, "ce_loss_3": 4.298311471939087, "ce_loss_6": 3.96904639005661, "epoch": 0.336, "grad_norm": 984.0, "kl_loss_12": 1210.9908325195313, "kl_loss_17": 332.8085235595703, "kl_loss_3": 2887.1035766601562, "kl_loss_6": 2238.7325805664063, "learning_rate": 0.0007554760743911103, "loss": 1685.1012, "step": 3360 }, { "ce_loss_12": 3.3742502093315125, "ce_loss_17": 2.9924378871917723, "ce_loss_23": 2.8445406556129456, "ce_loss_3": 4.203329575061798, "ce_loss_6": 3.883717620372772, "epoch": 0.337, "grad_norm": 1232.0, "kl_loss_12": 1176.7677856445312, "kl_loss_17": 322.7069152832031, "kl_loss_3": 2854.629248046875, "kl_loss_6": 2216.13232421875, "learning_rate": 0.0007541108803542846, "loss": 1695.4902, "step": 3370 }, { "ce_loss_12": 3.405466413497925, "ce_loss_17": 3.026802861690521, "ce_loss_23": 2.8841339468955995, "ce_loss_3": 4.250721335411072, "ce_loss_6": 3.918993294239044, "epoch": 0.338, "grad_norm": 1032.0, "kl_loss_12": 1186.8168060302735, "kl_loss_17": 326.74443359375, "kl_loss_3": 2882.3068725585936, "kl_loss_6": 2238.0958862304688, "learning_rate": 0.0007527431274237149, "loss": 1734.5438, "step": 3380 }, { "ce_loss_12": 3.3811033248901365, "ce_loss_17": 2.9985656142234802, "ce_loss_23": 2.855313777923584, "ce_loss_3": 4.223816525936127, "ce_loss_6": 3.9034560084342957, "epoch": 0.339, "grad_norm": 836.0, "kl_loss_12": 1187.6836853027344, "kl_loss_17": 328.6970550537109, "kl_loss_3": 2870.4953369140626, "kl_loss_6": 2244.3517517089845, "learning_rate": 0.0007513728293726579, "loss": 1663.8422, "step": 3390 }, { "ce_loss_12": 3.4858251929283144, "ce_loss_17": 3.1010074496269224, "ce_loss_23": 2.9533849716186524, "ce_loss_3": 4.3071588516235355, "ce_loss_6": 3.9825177431106566, "epoch": 0.34, "grad_norm": 968.0, "kl_loss_12": 1201.841552734375, "kl_loss_17": 333.19029541015624, "kl_loss_3": 2849.7021728515624, "kl_loss_6": 2217.7927062988283, "learning_rate": 0.00075, "loss": 1647.9623, "step": 3400 }, { "ce_loss_12": 3.4936216950416563, "ce_loss_17": 3.096825158596039, "ce_loss_23": 2.946681487560272, "ce_loss_3": 4.33717565536499, "ce_loss_6": 4.007562124729157, "epoch": 0.341, "grad_norm": 1088.0, "kl_loss_12": 1218.2007141113281, "kl_loss_17": 336.65336151123046, "kl_loss_3": 2905.7448852539064, "kl_loss_6": 2266.000128173828, "learning_rate": 0.0007486246531301177, "loss": 1664.9313, "step": 3410 }, { "ce_loss_12": 3.3124721884727477, "ce_loss_17": 2.9247117161750795, "ce_loss_23": 2.7764694809913637, "ce_loss_3": 4.156286108493805, "ce_loss_6": 3.8327873945236206, "epoch": 0.342, "grad_norm": 972.0, "kl_loss_12": 1194.6027465820312, "kl_loss_17": 326.52222137451173, "kl_loss_3": 2875.920751953125, "kl_loss_6": 2236.224786376953, "learning_rate": 0.0007472468026127384, "loss": 1643.2096, "step": 3420 }, { "ce_loss_12": 3.467436170578003, "ce_loss_17": 3.0640435934066774, "ce_loss_23": 2.9080865740776063, "ce_loss_3": 4.331029152870178, "ce_loss_6": 3.997330093383789, "epoch": 0.343, "grad_norm": 1048.0, "kl_loss_12": 1247.931414794922, "kl_loss_17": 353.526628112793, "kl_loss_3": 2997.3978515625, "kl_loss_6": 2341.1341552734375, "learning_rate": 0.000745866462322802, "loss": 1710.0527, "step": 3430 }, { "ce_loss_12": 3.413029646873474, "ce_loss_17": 3.0317874789237975, "ce_loss_23": 2.889462399482727, "ce_loss_3": 4.240648257732391, "ce_loss_6": 3.920201134681702, "epoch": 0.344, "grad_norm": 992.0, "kl_loss_12": 1178.7016906738281, "kl_loss_17": 325.30846405029297, "kl_loss_3": 2833.207421875, "kl_loss_6": 2199.9562438964845, "learning_rate": 0.0007444836461603195, "loss": 1647.2428, "step": 3440 }, { "ce_loss_12": 3.4991490840911865, "ce_loss_17": 3.1049185156822205, "ce_loss_23": 2.9510151505470277, "ce_loss_3": 4.332233214378357, "ce_loss_6": 4.014215791225434, "epoch": 0.345, "grad_norm": 992.0, "kl_loss_12": 1251.4855041503906, "kl_loss_17": 351.6912017822266, "kl_loss_3": 2915.772314453125, "kl_loss_6": 2294.0759887695312, "learning_rate": 0.0007430983680502344, "loss": 1706.1598, "step": 3450 }, { "ce_loss_12": 3.3458803057670594, "ce_loss_17": 2.950461220741272, "ce_loss_23": 2.801449549198151, "ce_loss_3": 4.203580784797668, "ce_loss_6": 3.875659191608429, "epoch": 0.346, "grad_norm": 1104.0, "kl_loss_12": 1221.081005859375, "kl_loss_17": 339.3466766357422, "kl_loss_3": 2943.6446044921877, "kl_loss_6": 2290.779504394531, "learning_rate": 0.0007417106419422819, "loss": 1686.3721, "step": 3460 }, { "ce_loss_12": 3.422148883342743, "ce_loss_17": 3.035439658164978, "ce_loss_23": 2.886011064052582, "ce_loss_3": 4.258232343196869, "ce_loss_6": 3.929511618614197, "epoch": 0.347, "grad_norm": 1304.0, "kl_loss_12": 1193.3747314453126, "kl_loss_17": 330.8219955444336, "kl_loss_3": 2852.3163208007813, "kl_loss_6": 2209.255895996094, "learning_rate": 0.0007403204818108486, "loss": 1673.4871, "step": 3470 }, { "ce_loss_12": 3.408907175064087, "ce_loss_17": 3.02507598400116, "ce_loss_23": 2.87800327539444, "ce_loss_3": 4.256623768806458, "ce_loss_6": 3.9263039350509645, "epoch": 0.348, "grad_norm": 1200.0, "kl_loss_12": 1221.0171936035156, "kl_loss_17": 334.5439254760742, "kl_loss_3": 2934.891650390625, "kl_loss_6": 2288.8263549804688, "learning_rate": 0.0007389279016548316, "loss": 1638.8451, "step": 3480 }, { "ce_loss_12": 3.4307928681373596, "ce_loss_17": 3.0258888959884644, "ce_loss_23": 2.8691168308258055, "ce_loss_3": 4.318265390396118, "ce_loss_6": 3.977033627033234, "epoch": 0.349, "grad_norm": 1336.0, "kl_loss_12": 1240.856460571289, "kl_loss_17": 346.09679412841797, "kl_loss_3": 3012.540246582031, "kl_loss_6": 2333.396343994141, "learning_rate": 0.0007375329154974975, "loss": 1706.6148, "step": 3490 }, { "ce_loss_12": 3.3646591901779175, "ce_loss_17": 2.97927565574646, "ce_loss_23": 2.844258749485016, "ce_loss_3": 4.202694976329804, "ce_loss_6": 3.874116039276123, "epoch": 0.35, "grad_norm": 1012.0, "kl_loss_12": 1182.0934997558593, "kl_loss_17": 326.57828521728516, "kl_loss_3": 2833.6731201171874, "kl_loss_6": 2196.8786743164064, "learning_rate": 0.0007361355373863414, "loss": 1678.5336, "step": 3500 }, { "ce_loss_12": 3.408645486831665, "ce_loss_17": 3.0301180243492127, "ce_loss_23": 2.8835768818855287, "ce_loss_3": 4.239638113975525, "ce_loss_6": 3.914521038532257, "epoch": 0.351, "grad_norm": 996.0, "kl_loss_12": 1170.8706909179687, "kl_loss_17": 326.8454879760742, "kl_loss_3": 2834.1891235351563, "kl_loss_6": 2189.419958496094, "learning_rate": 0.0007347357813929454, "loss": 1679.9662, "step": 3510 }, { "ce_loss_12": 3.363666367530823, "ce_loss_17": 2.9849454045295714, "ce_loss_23": 2.8406106352806093, "ce_loss_3": 4.193360352516175, "ce_loss_6": 3.8782509207725524, "epoch": 0.352, "grad_norm": 1152.0, "kl_loss_12": 1161.9197082519531, "kl_loss_17": 326.0701385498047, "kl_loss_3": 2821.089880371094, "kl_loss_6": 2196.3210876464846, "learning_rate": 0.0007333336616128369, "loss": 1667.7982, "step": 3520 }, { "ce_loss_12": 3.363524007797241, "ce_loss_17": 2.9640949726104737, "ce_loss_23": 2.8081513166427614, "ce_loss_3": 4.223999178409576, "ce_loss_6": 3.899560832977295, "epoch": 0.353, "grad_norm": 940.0, "kl_loss_12": 1227.8570373535156, "kl_loss_17": 341.45946044921874, "kl_loss_3": 2951.542346191406, "kl_loss_6": 2311.4682556152343, "learning_rate": 0.0007319291921653463, "loss": 1696.3664, "step": 3530 }, { "ce_loss_12": 3.4368479251861572, "ce_loss_17": 3.045628237724304, "ce_loss_23": 2.88864551782608, "ce_loss_3": 4.29784243106842, "ce_loss_6": 3.960416281223297, "epoch": 0.354, "grad_norm": 1048.0, "kl_loss_12": 1223.4083862304688, "kl_loss_17": 345.2417556762695, "kl_loss_3": 2935.894006347656, "kl_loss_6": 2278.224365234375, "learning_rate": 0.0007305223871934656, "loss": 1665.9004, "step": 3540 }, { "ce_loss_12": 3.390989351272583, "ce_loss_17": 3.007799005508423, "ce_loss_23": 2.8593823671340943, "ce_loss_3": 4.239778733253479, "ce_loss_6": 3.9120181441307067, "epoch": 0.355, "grad_norm": 1088.0, "kl_loss_12": 1199.7558227539062, "kl_loss_17": 331.8055679321289, "kl_loss_3": 2895.3523071289064, "kl_loss_6": 2249.6681213378906, "learning_rate": 0.0007291132608637052, "loss": 1671.7258, "step": 3550 }, { "ce_loss_12": 3.4190950870513914, "ce_loss_17": 2.9831862688064574, "ce_loss_23": 2.8376792788505556, "ce_loss_3": 4.28361166715622, "ce_loss_6": 3.955883574485779, "epoch": 0.356, "grad_norm": 1248.0, "kl_loss_12": 1284.4929809570312, "kl_loss_17": 324.38274536132815, "kl_loss_3": 3015.9632934570313, "kl_loss_6": 2369.1489807128905, "learning_rate": 0.0007277018273659516, "loss": 1730.4875, "step": 3560 }, { "ce_loss_12": 3.499080014228821, "ce_loss_17": 3.095122253894806, "ce_loss_23": 2.944272482395172, "ce_loss_3": 4.32317476272583, "ce_loss_6": 4.0050243973732, "epoch": 0.357, "grad_norm": 860.0, "kl_loss_12": 1239.7875671386719, "kl_loss_17": 340.98800201416014, "kl_loss_3": 2891.496875, "kl_loss_6": 2267.2274780273438, "learning_rate": 0.0007262881009133242, "loss": 1674.6738, "step": 3570 }, { "ce_loss_12": 3.400438332557678, "ce_loss_17": 3.0199944972991943, "ce_loss_23": 2.8742507457733155, "ce_loss_3": 4.232627630233765, "ce_loss_6": 3.914644730091095, "epoch": 0.358, "grad_norm": 1012.0, "kl_loss_12": 1179.560043334961, "kl_loss_17": 324.86656494140624, "kl_loss_3": 2846.779443359375, "kl_loss_6": 2215.014270019531, "learning_rate": 0.0007248720957420329, "loss": 1637.5172, "step": 3580 }, { "ce_loss_12": 3.3900612473487852, "ce_loss_17": 3.0157479643821716, "ce_loss_23": 2.8782155990600584, "ce_loss_3": 4.211673903465271, "ce_loss_6": 3.8979337215423584, "epoch": 0.359, "grad_norm": 968.0, "kl_loss_12": 1171.381411743164, "kl_loss_17": 321.20471649169923, "kl_loss_3": 2810.2108520507813, "kl_loss_6": 2188.5156005859376, "learning_rate": 0.0007234538261112341, "loss": 1678.8098, "step": 3590 }, { "ce_loss_12": 3.4516560554504396, "ce_loss_17": 3.0625590562820433, "ce_loss_23": 2.9125977158546448, "ce_loss_3": 4.298930358886719, "ce_loss_6": 3.9716432213783266, "epoch": 0.36, "grad_norm": 1448.0, "kl_loss_12": 1205.0357238769532, "kl_loss_17": 335.10853118896483, "kl_loss_3": 2908.1567504882814, "kl_loss_6": 2268.4599182128904, "learning_rate": 0.0007220333063028871, "loss": 1654.9832, "step": 3600 }, { "ce_loss_12": 3.647151756286621, "ce_loss_17": 3.100103199481964, "ce_loss_23": 2.9450520038604737, "ce_loss_3": 4.415069651603699, "ce_loss_6": 4.093648076057434, "epoch": 0.361, "grad_norm": 920.0, "kl_loss_12": 1537.5132202148438, "kl_loss_17": 348.4029968261719, "kl_loss_3": 3085.6149047851563, "kl_loss_6": 2452.1618225097654, "learning_rate": 0.0007206105506216106, "loss": 1780.9383, "step": 3610 }, { "ce_loss_12": 3.3636831879615783, "ce_loss_17": 2.970406544208527, "ce_loss_23": 2.830918622016907, "ce_loss_3": 4.1724036693572994, "ce_loss_6": 3.860769248008728, "epoch": 0.362, "grad_norm": 1192.0, "kl_loss_12": 1192.9184692382812, "kl_loss_17": 321.7777557373047, "kl_loss_3": 2808.0264526367187, "kl_loss_6": 2189.515771484375, "learning_rate": 0.0007191855733945387, "loss": 1620.7258, "step": 3620 }, { "ce_loss_12": 3.45752409696579, "ce_loss_17": 3.0582520365715027, "ce_loss_23": 2.91327451467514, "ce_loss_3": 4.273197531700134, "ce_loss_6": 3.9577839732170106, "epoch": 0.363, "grad_norm": 1360.0, "kl_loss_12": 1206.7141052246093, "kl_loss_17": 327.1438919067383, "kl_loss_3": 2851.4152221679688, "kl_loss_6": 2226.528112792969, "learning_rate": 0.0007177583889711762, "loss": 1646.1391, "step": 3630 }, { "ce_loss_12": 3.3765084385871886, "ce_loss_17": 2.9773260831832884, "ce_loss_23": 2.8310837745666504, "ce_loss_3": 4.207540595531464, "ce_loss_6": 3.8836970210075377, "epoch": 0.364, "grad_norm": 1008.0, "kl_loss_12": 1220.5072326660156, "kl_loss_17": 330.72536010742186, "kl_loss_3": 2894.7391357421875, "kl_loss_6": 2249.510809326172, "learning_rate": 0.0007163290117232541, "loss": 1670.5426, "step": 3640 }, { "ce_loss_12": 3.449886155128479, "ce_loss_17": 3.0827237129211427, "ce_loss_23": 2.9416739106178285, "ce_loss_3": 4.242349648475647, "ce_loss_6": 3.9314298033714294, "epoch": 0.365, "grad_norm": 968.0, "kl_loss_12": 1169.6258117675782, "kl_loss_17": 319.44925079345705, "kl_loss_3": 2779.614050292969, "kl_loss_6": 2167.718853759766, "learning_rate": 0.0007148974560445859, "loss": 1634.0065, "step": 3650 }, { "ce_loss_12": 3.39327358007431, "ce_loss_17": 3.011539709568024, "ce_loss_23": 2.8679875254631044, "ce_loss_3": 4.203790700435638, "ce_loss_6": 3.8777782678604127, "epoch": 0.366, "grad_norm": 964.0, "kl_loss_12": 1173.676431274414, "kl_loss_17": 322.41929931640624, "kl_loss_3": 2789.83759765625, "kl_loss_6": 2164.4882751464843, "learning_rate": 0.0007134637363509209, "loss": 1611.7471, "step": 3660 }, { "ce_loss_12": 3.4866799235343935, "ce_loss_17": 3.1163609504699705, "ce_loss_23": 2.977469801902771, "ce_loss_3": 4.299042820930481, "ce_loss_6": 3.983168888092041, "epoch": 0.367, "grad_norm": 952.0, "kl_loss_12": 1160.4010803222657, "kl_loss_17": 316.6776519775391, "kl_loss_3": 2768.0127685546877, "kl_loss_6": 2150.191613769531, "learning_rate": 0.0007120278670798009, "loss": 1632.7604, "step": 3670 }, { "ce_loss_12": 3.3547308087348937, "ce_loss_17": 2.9385184764862062, "ce_loss_23": 2.793093574047089, "ce_loss_3": 4.232029449939728, "ce_loss_6": 3.904446530342102, "epoch": 0.368, "grad_norm": 1072.0, "kl_loss_12": 1246.8237579345703, "kl_loss_17": 336.25614776611326, "kl_loss_3": 3006.0741333007813, "kl_loss_6": 2360.845281982422, "learning_rate": 0.0007105898626904133, "loss": 1730.9002, "step": 3680 }, { "ce_loss_12": 3.4213064908981323, "ce_loss_17": 3.026142966747284, "ce_loss_23": 2.881701076030731, "ce_loss_3": 4.266143536567688, "ce_loss_6": 3.938157784938812, "epoch": 0.369, "grad_norm": 1176.0, "kl_loss_12": 1195.0875213623046, "kl_loss_17": 323.45970611572267, "kl_loss_3": 2873.1806762695314, "kl_loss_6": 2230.3562744140627, "learning_rate": 0.0007091497376634463, "loss": 1645.2902, "step": 3690 }, { "ce_loss_12": 3.3609025478363037, "ce_loss_17": 2.976015532016754, "ce_loss_23": 2.8323405981063843, "ce_loss_3": 4.19185117483139, "ce_loss_6": 3.8728413343429566, "epoch": 0.37, "grad_norm": 980.0, "kl_loss_12": 1184.621435546875, "kl_loss_17": 327.99839782714844, "kl_loss_3": 2828.9660766601564, "kl_loss_6": 2203.749591064453, "learning_rate": 0.0007077075065009433, "loss": 1666.8021, "step": 3700 }, { "ce_loss_12": 3.4689030408859254, "ce_loss_17": 3.076432979106903, "ce_loss_23": 2.9234078168869018, "ce_loss_3": 4.315247058868408, "ce_loss_6": 3.988802170753479, "epoch": 0.371, "grad_norm": 1104.0, "kl_loss_12": 1224.3506103515624, "kl_loss_17": 340.6742309570312, "kl_loss_3": 2917.4282836914062, "kl_loss_6": 2279.465539550781, "learning_rate": 0.0007062631837261557, "loss": 1674.5109, "step": 3710 }, { "ce_loss_12": 3.334757077693939, "ce_loss_17": 2.955988013744354, "ce_loss_23": 2.8153989911079407, "ce_loss_3": 4.175846552848816, "ce_loss_6": 3.8509674072265625, "epoch": 0.372, "grad_norm": 1128.0, "kl_loss_12": 1177.7722412109374, "kl_loss_17": 322.4950775146484, "kl_loss_3": 2849.675048828125, "kl_loss_6": 2210.367559814453, "learning_rate": 0.0007048167838833977, "loss": 1686.3625, "step": 3720 }, { "ce_loss_12": 3.4109480500221254, "ce_loss_17": 3.0387460708618166, "ce_loss_23": 2.890661919116974, "ce_loss_3": 4.229937970638275, "ce_loss_6": 3.915041470527649, "epoch": 0.373, "grad_norm": 1096.0, "kl_loss_12": 1171.3883331298828, "kl_loss_17": 328.7982147216797, "kl_loss_3": 2824.0050537109373, "kl_loss_6": 2200.1705627441406, "learning_rate": 0.0007033683215379002, "loss": 1640.442, "step": 3730 }, { "ce_loss_12": 3.400256597995758, "ce_loss_17": 3.017388308048248, "ce_loss_23": 2.8781059622764587, "ce_loss_3": 4.242074239253998, "ce_loss_6": 3.9133849501609803, "epoch": 0.374, "grad_norm": 1360.0, "kl_loss_12": 1169.729736328125, "kl_loss_17": 319.6404739379883, "kl_loss_3": 2847.294885253906, "kl_loss_6": 2206.2028259277345, "learning_rate": 0.0007019178112756625, "loss": 1663.7166, "step": 3740 }, { "ce_loss_12": 3.379054582118988, "ce_loss_17": 2.994138073921204, "ce_loss_23": 2.8540342330932615, "ce_loss_3": 4.206314241886139, "ce_loss_6": 3.8927628993988037, "epoch": 0.375, "grad_norm": 968.0, "kl_loss_12": 1167.5954345703126, "kl_loss_17": 319.94212341308594, "kl_loss_3": 2816.092578125, "kl_loss_6": 2200.663360595703, "learning_rate": 0.0007004652677033068, "loss": 1642.523, "step": 3750 }, { "ce_loss_12": 3.430193781852722, "ce_loss_17": 3.06665323972702, "ce_loss_23": 2.9323292851448057, "ce_loss_3": 4.248442935943603, "ce_loss_6": 3.927159142494202, "epoch": 0.376, "grad_norm": 1064.0, "kl_loss_12": 1131.9261444091796, "kl_loss_17": 309.78822937011716, "kl_loss_3": 2785.084680175781, "kl_loss_6": 2150.070684814453, "learning_rate": 0.0006990107054479312, "loss": 1620.216, "step": 3760 }, { "ce_loss_12": 3.425540065765381, "ce_loss_17": 3.0395328879356383, "ce_loss_23": 2.896018648147583, "ce_loss_3": 4.24209463596344, "ce_loss_6": 3.9303755879402162, "epoch": 0.377, "grad_norm": 1048.0, "kl_loss_12": 1186.8247192382812, "kl_loss_17": 326.4223037719727, "kl_loss_3": 2820.041540527344, "kl_loss_6": 2201.958917236328, "learning_rate": 0.000697554139156961, "loss": 1642.091, "step": 3770 }, { "ce_loss_12": 3.4275654792785644, "ce_loss_17": 3.041707730293274, "ce_loss_23": 2.8987489342689514, "ce_loss_3": 4.269000458717346, "ce_loss_6": 3.941159975528717, "epoch": 0.378, "grad_norm": 1072.0, "kl_loss_12": 1198.9744995117187, "kl_loss_17": 331.3607635498047, "kl_loss_3": 2895.0713012695314, "kl_loss_6": 2249.006982421875, "learning_rate": 0.0006960955834980027, "loss": 1626.0086, "step": 3780 }, { "ce_loss_12": 3.388811159133911, "ce_loss_17": 3.0065065026283264, "ce_loss_23": 2.865676498413086, "ce_loss_3": 4.215176248550415, "ce_loss_6": 3.8905073523521425, "epoch": 0.379, "grad_norm": 1024.0, "kl_loss_12": 1166.9369812011719, "kl_loss_17": 318.73981018066405, "kl_loss_3": 2826.0754150390626, "kl_loss_6": 2183.756945800781, "learning_rate": 0.0006946350531586958, "loss": 1629.8831, "step": 3790 }, { "ce_loss_12": 3.419374644756317, "ce_loss_17": 3.0380242824554444, "ce_loss_23": 2.899268925189972, "ce_loss_3": 4.244865441322327, "ce_loss_6": 3.9227930545806884, "epoch": 0.38, "grad_norm": 1432.0, "kl_loss_12": 1178.4429138183593, "kl_loss_17": 321.04165802001955, "kl_loss_3": 2842.534814453125, "kl_loss_6": 2213.956402587891, "learning_rate": 0.0006931725628465643, "loss": 1669.4293, "step": 3800 }, { "ce_loss_12": 3.4255444049835204, "ce_loss_17": 3.039610135555267, "ce_loss_23": 2.8914812207221985, "ce_loss_3": 4.2611403465271, "ce_loss_6": 3.9370644330978393, "epoch": 0.381, "grad_norm": 1088.0, "kl_loss_12": 1179.518118286133, "kl_loss_17": 327.4698013305664, "kl_loss_3": 2840.5864013671876, "kl_loss_6": 2207.823132324219, "learning_rate": 0.0006917081272888696, "loss": 1642.6992, "step": 3810 }, { "ce_loss_12": 3.3692954659461973, "ce_loss_17": 2.965705895423889, "ce_loss_23": 2.8230329275131227, "ce_loss_3": 4.224563157558441, "ce_loss_6": 3.8963645458221436, "epoch": 0.382, "grad_norm": 1136.0, "kl_loss_12": 1231.950244140625, "kl_loss_17": 326.74798278808595, "kl_loss_3": 2944.267932128906, "kl_loss_6": 2300.83447265625, "learning_rate": 0.0006902417612324615, "loss": 1656.0443, "step": 3820 }, { "ce_loss_12": 3.484429359436035, "ce_loss_17": 3.083032763004303, "ce_loss_23": 2.9321335554122925, "ce_loss_3": 4.346240592002869, "ce_loss_6": 4.015672600269317, "epoch": 0.383, "grad_norm": 996.0, "kl_loss_12": 1232.4169067382813, "kl_loss_17": 339.5805206298828, "kl_loss_3": 2957.370556640625, "kl_loss_6": 2296.0817749023436, "learning_rate": 0.00068877347944363, "loss": 1683.5805, "step": 3830 }, { "ce_loss_12": 3.4553491711616515, "ce_loss_17": 3.077640438079834, "ce_loss_23": 2.93577378988266, "ce_loss_3": 4.262261152267456, "ce_loss_6": 3.9459288239479067, "epoch": 0.384, "grad_norm": 952.0, "kl_loss_12": 1174.015118408203, "kl_loss_17": 322.96498718261716, "kl_loss_3": 2797.076550292969, "kl_loss_6": 2187.494738769531, "learning_rate": 0.0006873032967079561, "loss": 1649.0123, "step": 3840 }, { "ce_loss_12": 3.4251646637916564, "ce_loss_17": 3.0583839654922484, "ce_loss_23": 2.921176278591156, "ce_loss_3": 4.226701831817627, "ce_loss_6": 3.9207385420799254, "epoch": 0.385, "grad_norm": 1040.0, "kl_loss_12": 1149.8529052734375, "kl_loss_17": 320.24271392822266, "kl_loss_3": 2766.314599609375, "kl_loss_6": 2155.769970703125, "learning_rate": 0.0006858312278301637, "loss": 1607.1077, "step": 3850 }, { "ce_loss_12": 3.4586119294166564, "ce_loss_17": 3.094258224964142, "ce_loss_23": 2.959088897705078, "ce_loss_3": 4.254314255714417, "ce_loss_6": 3.939511406421661, "epoch": 0.386, "grad_norm": 996.0, "kl_loss_12": 1156.5623840332032, "kl_loss_17": 319.6972915649414, "kl_loss_3": 2757.2991333007812, "kl_loss_6": 2136.331689453125, "learning_rate": 0.0006843572876339704, "loss": 1605.4867, "step": 3860 }, { "ce_loss_12": 3.378196489810944, "ce_loss_17": 3.0188413739204405, "ce_loss_23": 2.8821682691574098, "ce_loss_3": 4.178075551986694, "ce_loss_6": 3.862470579147339, "epoch": 0.387, "grad_norm": 1400.0, "kl_loss_12": 1141.6409393310546, "kl_loss_17": 313.7957015991211, "kl_loss_3": 2745.0942260742186, "kl_loss_6": 2123.7592712402343, "learning_rate": 0.0006828814909619373, "loss": 1654.2504, "step": 3870 }, { "ce_loss_12": 3.521103036403656, "ce_loss_17": 3.13949875831604, "ce_loss_23": 2.9910845518112184, "ce_loss_3": 4.34140408039093, "ce_loss_6": 4.002995109558105, "epoch": 0.388, "grad_norm": 880.0, "kl_loss_12": 1186.9687927246093, "kl_loss_17": 334.6907043457031, "kl_loss_3": 2817.365966796875, "kl_loss_6": 2165.355920410156, "learning_rate": 0.0006814038526753205, "loss": 1611.627, "step": 3880 }, { "ce_loss_12": 3.4270783066749573, "ce_loss_17": 3.042400801181793, "ce_loss_23": 2.899458038806915, "ce_loss_3": 4.239492547512055, "ce_loss_6": 3.9202648162841798, "epoch": 0.389, "grad_norm": 1056.0, "kl_loss_12": 1170.2104949951172, "kl_loss_17": 323.59288482666017, "kl_loss_3": 2790.2853149414063, "kl_loss_6": 2166.1191528320314, "learning_rate": 0.0006799243876539213, "loss": 1620.8202, "step": 3890 }, { "ce_loss_12": 3.359401023387909, "ce_loss_17": 2.9685895681381225, "ce_loss_23": 2.830006313323975, "ce_loss_3": 4.210623705387116, "ce_loss_6": 3.882489597797394, "epoch": 0.39, "grad_norm": 1136.0, "kl_loss_12": 1183.7762817382813, "kl_loss_17": 320.4316207885742, "kl_loss_3": 2891.90244140625, "kl_loss_6": 2243.515478515625, "learning_rate": 0.0006784431107959359, "loss": 1667.743, "step": 3900 }, { "ce_loss_12": 3.4223373532295227, "ce_loss_17": 3.0267418742179872, "ce_loss_23": 2.879237198829651, "ce_loss_3": 4.278204727172851, "ce_loss_6": 3.946788024902344, "epoch": 0.391, "grad_norm": 1264.0, "kl_loss_12": 1212.3030578613282, "kl_loss_17": 330.3525680541992, "kl_loss_3": 2935.582763671875, "kl_loss_6": 2278.313610839844, "learning_rate": 0.0006769600370178059, "loss": 1661.9084, "step": 3910 }, { "ce_loss_12": 3.3743677854537966, "ce_loss_17": 2.9920090198516847, "ce_loss_23": 2.8529026985168455, "ce_loss_3": 4.2079997777938845, "ce_loss_6": 3.8821290016174315, "epoch": 0.392, "grad_norm": 820.0, "kl_loss_12": 1180.332696533203, "kl_loss_17": 319.2956573486328, "kl_loss_3": 2833.5747680664062, "kl_loss_6": 2189.435583496094, "learning_rate": 0.0006754751812540679, "loss": 1611.8293, "step": 3920 }, { "ce_loss_12": 3.4259696125984194, "ce_loss_17": 3.041267991065979, "ce_loss_23": 2.897170066833496, "ce_loss_3": 4.2738687753677365, "ce_loss_6": 3.9354104518890383, "epoch": 0.393, "grad_norm": 1280.0, "kl_loss_12": 1197.7025756835938, "kl_loss_17": 326.8806884765625, "kl_loss_3": 2886.1544555664063, "kl_loss_6": 2231.5798583984374, "learning_rate": 0.0006739885584572025, "loss": 1660.7541, "step": 3930 }, { "ce_loss_12": 3.4471525073051454, "ce_loss_17": 3.051491439342499, "ce_loss_23": 2.913893294334412, "ce_loss_3": 4.300728058815002, "ce_loss_6": 3.9704771399497987, "epoch": 0.394, "grad_norm": 992.0, "kl_loss_12": 1213.9912811279296, "kl_loss_17": 326.83617095947267, "kl_loss_3": 2943.55283203125, "kl_loss_6": 2295.3341735839845, "learning_rate": 0.0006725001835974853, "loss": 1649.1977, "step": 3940 }, { "ce_loss_12": 3.4414065361022947, "ce_loss_17": 3.0564581513404847, "ce_loss_23": 2.9085390329360963, "ce_loss_3": 4.280353951454162, "ce_loss_6": 3.960696303844452, "epoch": 0.395, "grad_norm": 960.0, "kl_loss_12": 1196.9637329101563, "kl_loss_17": 331.86663208007815, "kl_loss_3": 2880.139501953125, "kl_loss_6": 2242.0492309570313, "learning_rate": 0.0006710100716628344, "loss": 1626.1908, "step": 3950 }, { "ce_loss_12": 3.428723990917206, "ce_loss_17": 3.038597285747528, "ce_loss_23": 2.8930386543273925, "ce_loss_3": 4.255900311470032, "ce_loss_6": 3.936865043640137, "epoch": 0.396, "grad_norm": 1016.0, "kl_loss_12": 1188.9954895019532, "kl_loss_17": 319.97182312011716, "kl_loss_3": 2847.898083496094, "kl_loss_6": 2217.8192932128904, "learning_rate": 0.0006695182376586602, "loss": 1653.416, "step": 3960 }, { "ce_loss_12": 3.4169690012931824, "ce_loss_17": 3.0513774871826174, "ce_loss_23": 2.919622015953064, "ce_loss_3": 4.2158072113990785, "ce_loss_6": 3.893892788887024, "epoch": 0.397, "grad_norm": 1480.0, "kl_loss_12": 1115.445260620117, "kl_loss_17": 304.40807037353517, "kl_loss_3": 2705.8389892578125, "kl_loss_6": 2082.140728759766, "learning_rate": 0.000668024696607715, "loss": 1632.433, "step": 3970 }, { "ce_loss_12": 3.406718575954437, "ce_loss_17": 3.031963884830475, "ce_loss_23": 2.895033621788025, "ce_loss_3": 4.22108142375946, "ce_loss_6": 3.9037919998168946, "epoch": 0.398, "grad_norm": 1536.0, "kl_loss_12": 1165.0296813964844, "kl_loss_17": 317.1994743347168, "kl_loss_3": 2799.8370971679688, "kl_loss_6": 2181.7656982421877, "learning_rate": 0.0006665294635499404, "loss": 1619.0551, "step": 3980 }, { "ce_loss_12": 3.43480144739151, "ce_loss_17": 3.041903519630432, "ce_loss_23": 2.8935364723205566, "ce_loss_3": 4.295881199836731, "ce_loss_6": 3.973275625705719, "epoch": 0.399, "grad_norm": 1184.0, "kl_loss_12": 1226.6514404296875, "kl_loss_17": 339.84486083984376, "kl_loss_3": 2951.3014892578126, "kl_loss_6": 2316.8784423828124, "learning_rate": 0.0006650325535423167, "loss": 1668.1738, "step": 3990 }, { "ce_loss_12": 3.4110765933990477, "ce_loss_17": 3.0551862835884096, "ce_loss_23": 2.9150756716728212, "ce_loss_3": 4.215204250812531, "ce_loss_6": 3.8930283784866333, "epoch": 0.4, "grad_norm": 1176.0, "kl_loss_12": 1126.5774383544922, "kl_loss_17": 316.1091995239258, "kl_loss_3": 2728.413671875, "kl_loss_6": 2095.254010009766, "learning_rate": 0.0006635339816587109, "loss": 1618.082, "step": 4000 }, { "ce_loss_12": 3.3729982733726502, "ce_loss_17": 2.992246413230896, "ce_loss_23": 2.851309823989868, "ce_loss_3": 4.219701409339905, "ce_loss_6": 3.905067670345306, "epoch": 0.401, "grad_norm": 1400.0, "kl_loss_12": 1184.4628173828125, "kl_loss_17": 325.0249252319336, "kl_loss_3": 2873.4937377929687, "kl_loss_6": 2256.891387939453, "learning_rate": 0.0006620337629897252, "loss": 1631.4626, "step": 4010 }, { "ce_loss_12": 3.3826616048812865, "ce_loss_17": 3.0062437295913695, "ce_loss_23": 2.8597806096076965, "ce_loss_3": 4.213565754890442, "ce_loss_6": 3.8855297684669496, "epoch": 0.402, "grad_norm": 1056.0, "kl_loss_12": 1164.632745361328, "kl_loss_17": 325.6399185180664, "kl_loss_3": 2830.8072875976563, "kl_loss_6": 2184.3242370605467, "learning_rate": 0.0006605319126425454, "loss": 1655.3189, "step": 4020 }, { "ce_loss_12": 3.3020559906959535, "ce_loss_17": 2.919109809398651, "ce_loss_23": 2.781159979104996, "ce_loss_3": 4.159004271030426, "ce_loss_6": 3.836709499359131, "epoch": 0.403, "grad_norm": 1128.0, "kl_loss_12": 1189.5907165527344, "kl_loss_17": 322.5351135253906, "kl_loss_3": 2899.326037597656, "kl_loss_6": 2257.474090576172, "learning_rate": 0.0006590284457407876, "loss": 1652.3072, "step": 4030 }, { "ce_loss_12": 3.389311420917511, "ce_loss_17": 3.010061573982239, "ce_loss_23": 2.8683920979499815, "ce_loss_3": 4.223417830467224, "ce_loss_6": 3.894852077960968, "epoch": 0.404, "grad_norm": 1048.0, "kl_loss_12": 1165.923388671875, "kl_loss_17": 323.8217208862305, "kl_loss_3": 2836.329638671875, "kl_loss_6": 2193.572961425781, "learning_rate": 0.0006575233774243465, "loss": 1628.1803, "step": 4040 }, { "ce_loss_12": 3.3846017479896546, "ce_loss_17": 3.006804585456848, "ce_loss_23": 2.8631478548049927, "ce_loss_3": 4.2313508033752445, "ce_loss_6": 3.9021772384643554, "epoch": 0.405, "grad_norm": 1264.0, "kl_loss_12": 1176.518975830078, "kl_loss_17": 329.24969482421875, "kl_loss_3": 2880.244873046875, "kl_loss_6": 2226.158905029297, "learning_rate": 0.0006560167228492435, "loss": 1644.4455, "step": 4050 }, { "ce_loss_12": 3.4046581268310545, "ce_loss_17": 3.041847562789917, "ce_loss_23": 2.9070791721343996, "ce_loss_3": 4.219157040119171, "ce_loss_6": 3.8969243288040163, "epoch": 0.406, "grad_norm": 988.0, "kl_loss_12": 1129.4734466552734, "kl_loss_17": 312.1006332397461, "kl_loss_3": 2755.409387207031, "kl_loss_6": 2129.448858642578, "learning_rate": 0.0006545084971874737, "loss": 1629.0785, "step": 4060 }, { "ce_loss_12": 3.4048274517059327, "ce_loss_17": 3.010120987892151, "ce_loss_23": 2.8593595743179323, "ce_loss_3": 4.262722325325012, "ce_loss_6": 3.9252733469009398, "epoch": 0.407, "grad_norm": 1088.0, "kl_loss_12": 1210.6922943115235, "kl_loss_17": 337.23687744140625, "kl_loss_3": 2925.420068359375, "kl_loss_6": 2256.9163330078127, "learning_rate": 0.0006529987156268526, "loss": 1640.2656, "step": 4070 }, { "ce_loss_12": 3.317921507358551, "ce_loss_17": 2.9338502049446107, "ce_loss_23": 2.785680627822876, "ce_loss_3": 4.1656157851219175, "ce_loss_6": 3.837941527366638, "epoch": 0.408, "grad_norm": 1024.0, "kl_loss_12": 1175.9814331054688, "kl_loss_17": 324.7815673828125, "kl_loss_3": 2871.5527587890624, "kl_loss_6": 2227.038439941406, "learning_rate": 0.0006514873933708637, "loss": 1675.1437, "step": 4080 }, { "ce_loss_12": 3.408484864234924, "ce_loss_17": 3.039537787437439, "ce_loss_23": 2.8986527919769287, "ce_loss_3": 4.231956946849823, "ce_loss_6": 3.917720365524292, "epoch": 0.409, "grad_norm": 976.0, "kl_loss_12": 1140.7272155761718, "kl_loss_17": 314.2014724731445, "kl_loss_3": 2784.7043701171874, "kl_loss_6": 2163.5523376464844, "learning_rate": 0.0006499745456385053, "loss": 1610.7346, "step": 4090 }, { "ce_loss_12": 3.3876006484031675, "ce_loss_17": 3.0119059801101686, "ce_loss_23": 2.863646948337555, "ce_loss_3": 4.21506804227829, "ce_loss_6": 3.894675004482269, "epoch": 0.41, "grad_norm": 960.0, "kl_loss_12": 1179.637158203125, "kl_loss_17": 326.34833068847655, "kl_loss_3": 2843.0706420898437, "kl_loss_6": 2211.5627380371093, "learning_rate": 0.0006484601876641375, "loss": 1647.2432, "step": 4100 }, { "ce_loss_12": 3.358959376811981, "ce_loss_17": 2.9945371627807615, "ce_loss_23": 2.8585385918617248, "ce_loss_3": 4.157709872722625, "ce_loss_6": 3.844931447505951, "epoch": 0.411, "grad_norm": 1040.0, "kl_loss_12": 1127.2386535644532, "kl_loss_17": 313.78133392333984, "kl_loss_3": 2728.8353637695313, "kl_loss_6": 2121.688055419922, "learning_rate": 0.000646944334697328, "loss": 1597.0447, "step": 4110 }, { "ce_loss_12": 3.460660433769226, "ce_loss_17": 3.096243751049042, "ce_loss_23": 2.9564993023872375, "ce_loss_3": 4.2486083745956424, "ce_loss_6": 3.938153052330017, "epoch": 0.412, "grad_norm": 960.0, "kl_loss_12": 1131.3982330322265, "kl_loss_17": 315.8477813720703, "kl_loss_3": 2707.6667724609374, "kl_loss_6": 2098.8063354492188, "learning_rate": 0.0006454270020026995, "loss": 1574.3133, "step": 4120 }, { "ce_loss_12": 3.4204355835914613, "ce_loss_17": 3.067693066596985, "ce_loss_23": 2.9321154236793516, "ce_loss_3": 4.215284967422486, "ce_loss_6": 3.904770815372467, "epoch": 0.413, "grad_norm": 988.0, "kl_loss_12": 1112.7341613769531, "kl_loss_17": 306.37862396240234, "kl_loss_3": 2706.292724609375, "kl_loss_6": 2092.9628967285157, "learning_rate": 0.0006439082048597755, "loss": 1570.2234, "step": 4130 }, { "ce_loss_12": 3.4354603052139283, "ce_loss_17": 3.0538384079933167, "ce_loss_23": 2.9169345617294313, "ce_loss_3": 4.2601546287536625, "ce_loss_6": 3.9380828738212585, "epoch": 0.414, "grad_norm": 1144.0, "kl_loss_12": 1169.7640869140625, "kl_loss_17": 318.6606689453125, "kl_loss_3": 2818.671984863281, "kl_loss_6": 2193.7250061035156, "learning_rate": 0.0006423879585628261, "loss": 1629.1748, "step": 4140 }, { "ce_loss_12": 3.399686324596405, "ce_loss_17": 3.018126440048218, "ce_loss_23": 2.873788130283356, "ce_loss_3": 4.243383419513703, "ce_loss_6": 3.922605037689209, "epoch": 0.415, "grad_norm": 1384.0, "kl_loss_12": 1186.7492919921874, "kl_loss_17": 326.4311004638672, "kl_loss_3": 2872.7005859375, "kl_loss_6": 2243.2379943847654, "learning_rate": 0.0006408662784207149, "loss": 1650.5398, "step": 4150 }, { "ce_loss_12": 3.3609784603118897, "ce_loss_17": 2.994195282459259, "ce_loss_23": 2.8559187173843386, "ce_loss_3": 4.1885595440864565, "ce_loss_6": 3.861635887622833, "epoch": 0.416, "grad_norm": 1224.0, "kl_loss_12": 1156.0622589111329, "kl_loss_17": 314.15498046875, "kl_loss_3": 2812.4429443359377, "kl_loss_6": 2174.676873779297, "learning_rate": 0.0006393431797567439, "loss": 1619.142, "step": 4160 }, { "ce_loss_12": 3.4092275619506838, "ce_loss_17": 3.059811198711395, "ce_loss_23": 2.9277246236801147, "ce_loss_3": 4.204770541191101, "ce_loss_6": 3.888715922832489, "epoch": 0.417, "grad_norm": 1216.0, "kl_loss_12": 1123.1555938720703, "kl_loss_17": 312.6223335266113, "kl_loss_3": 2726.036267089844, "kl_loss_6": 2100.0018615722656, "learning_rate": 0.0006378186779084996, "loss": 1551.5569, "step": 4170 }, { "ce_loss_12": 3.294791781902313, "ce_loss_17": 2.90387818813324, "ce_loss_23": 2.7652917981147764, "ce_loss_3": 4.132733774185181, "ce_loss_6": 3.806008851528168, "epoch": 0.418, "grad_norm": 936.0, "kl_loss_12": 1173.2480255126952, "kl_loss_17": 320.46661376953125, "kl_loss_3": 2840.6937744140623, "kl_loss_6": 2205.0871704101564, "learning_rate": 0.0006362927882276989, "loss": 1644.2609, "step": 4180 }, { "ce_loss_12": 3.430036115646362, "ce_loss_17": 3.0758618474006654, "ce_loss_23": 2.9404595136642455, "ce_loss_3": 4.2398931860923765, "ce_loss_6": 3.9211429834365843, "epoch": 0.419, "grad_norm": 1424.0, "kl_loss_12": 1106.8557525634765, "kl_loss_17": 304.0802848815918, "kl_loss_3": 2734.475732421875, "kl_loss_6": 2099.411590576172, "learning_rate": 0.000634765526080034, "loss": 1556.1971, "step": 4190 }, { "ce_loss_12": 3.453889286518097, "ce_loss_17": 3.0857287406921388, "ce_loss_23": 2.9473395586013793, "ce_loss_3": 4.255516767501831, "ce_loss_6": 3.944706749916077, "epoch": 0.42, "grad_norm": 960.0, "kl_loss_12": 1145.736996459961, "kl_loss_17": 317.604719543457, "kl_loss_3": 2756.877331542969, "kl_loss_6": 2135.238446044922, "learning_rate": 0.0006332369068450174, "loss": 1586.098, "step": 4200 }, { "ce_loss_12": 3.404520869255066, "ce_loss_17": 3.026888573169708, "ce_loss_23": 2.8927263975143434, "ce_loss_3": 4.221071326732636, "ce_loss_6": 3.915247893333435, "epoch": 0.421, "grad_norm": 972.0, "kl_loss_12": 1162.0391998291016, "kl_loss_17": 316.31638107299807, "kl_loss_3": 2793.0206665039063, "kl_loss_6": 2186.566125488281, "learning_rate": 0.0006317069459158283, "loss": 1605.7854, "step": 4210 }, { "ce_loss_12": 3.4682124257087708, "ce_loss_17": 3.117214620113373, "ce_loss_23": 2.9843473196029664, "ce_loss_3": 4.26329472064972, "ce_loss_6": 3.9522085428237914, "epoch": 0.422, "grad_norm": 964.0, "kl_loss_12": 1128.2288879394532, "kl_loss_17": 313.6609832763672, "kl_loss_3": 2728.2145874023436, "kl_loss_6": 2108.8893432617188, "learning_rate": 0.0006301756586991561, "loss": 1583.7455, "step": 4220 }, { "ce_loss_12": 3.3013151049613954, "ce_loss_17": 2.9211807370185854, "ce_loss_23": 2.781052625179291, "ce_loss_3": 4.149512720108032, "ce_loss_6": 3.8239535689353943, "epoch": 0.423, "grad_norm": 1004.0, "kl_loss_12": 1185.4845764160157, "kl_loss_17": 321.148747253418, "kl_loss_3": 2884.1349365234373, "kl_loss_6": 2240.584826660156, "learning_rate": 0.0006286430606150459, "loss": 1639.8203, "step": 4230 }, { "ce_loss_12": 3.48389675617218, "ce_loss_17": 3.1149152278900147, "ce_loss_23": 2.977355432510376, "ce_loss_3": 4.2954552412033085, "ce_loss_6": 3.9828169465065004, "epoch": 0.424, "grad_norm": 984.0, "kl_loss_12": 1147.0034240722657, "kl_loss_17": 320.0153060913086, "kl_loss_3": 2780.7621337890623, "kl_loss_6": 2161.5279174804687, "learning_rate": 0.0006271091670967436, "loss": 1600.0794, "step": 4240 }, { "ce_loss_12": 3.424987781047821, "ce_loss_17": 3.0343403697013853, "ce_loss_23": 2.8883402824401854, "ce_loss_3": 4.268402814865112, "ce_loss_6": 3.942337465286255, "epoch": 0.425, "grad_norm": 1288.0, "kl_loss_12": 1207.4884033203125, "kl_loss_17": 332.36527862548826, "kl_loss_3": 2902.5668090820313, "kl_loss_6": 2257.4686645507813, "learning_rate": 0.0006255739935905395, "loss": 1638.8984, "step": 4250 }, { "ce_loss_12": 3.4310410976409913, "ce_loss_17": 3.0717048764228823, "ce_loss_23": 2.931931567192078, "ce_loss_3": 4.242630553245545, "ce_loss_6": 3.925148296356201, "epoch": 0.426, "grad_norm": 1144.0, "kl_loss_12": 1145.0904571533204, "kl_loss_17": 318.5279541015625, "kl_loss_3": 2771.041711425781, "kl_loss_6": 2146.984967041016, "learning_rate": 0.0006240375555556145, "loss": 1651.0617, "step": 4260 }, { "ce_loss_12": 3.448286783695221, "ce_loss_17": 3.0631021738052366, "ce_loss_23": 2.921405851840973, "ce_loss_3": 4.293193435668945, "ce_loss_6": 3.9641549587249756, "epoch": 0.427, "grad_norm": 828.0, "kl_loss_12": 1183.8236541748047, "kl_loss_17": 321.07748260498045, "kl_loss_3": 2869.4075927734375, "kl_loss_6": 2233.6482055664064, "learning_rate": 0.000622499868463882, "loss": 1637.2682, "step": 4270 }, { "ce_loss_12": 3.392454504966736, "ce_loss_17": 3.0375906944274904, "ce_loss_23": 2.9044872641563417, "ce_loss_3": 4.181933903694153, "ce_loss_6": 3.871629846096039, "epoch": 0.428, "grad_norm": 1200.0, "kl_loss_12": 1119.5519989013671, "kl_loss_17": 311.49632720947267, "kl_loss_3": 2726.1117553710938, "kl_loss_6": 2109.8937805175783, "learning_rate": 0.0006209609477998338, "loss": 1587.0687, "step": 4280 }, { "ce_loss_12": 3.463665783405304, "ce_loss_17": 3.093734884262085, "ce_loss_23": 2.9519339919090273, "ce_loss_3": 4.2704680323600765, "ce_loss_6": 3.9471255421638487, "epoch": 0.429, "grad_norm": 924.0, "kl_loss_12": 1153.5091369628906, "kl_loss_17": 319.26009674072264, "kl_loss_3": 2781.630859375, "kl_loss_6": 2142.5775146484375, "learning_rate": 0.0006194208090603844, "loss": 1618.8986, "step": 4290 }, { "ce_loss_12": 3.37452689409256, "ce_loss_17": 3.0138617753982544, "ce_loss_23": 2.878571164608002, "ce_loss_3": 4.188263761997223, "ce_loss_6": 3.878198838233948, "epoch": 0.43, "grad_norm": 948.0, "kl_loss_12": 1118.5654907226562, "kl_loss_17": 307.2293182373047, "kl_loss_3": 2742.392199707031, "kl_loss_6": 2129.879833984375, "learning_rate": 0.0006178794677547138, "loss": 1566.273, "step": 4300 }, { "ce_loss_12": 3.4126819610595702, "ce_loss_17": 3.040526843070984, "ce_loss_23": 2.901843559741974, "ce_loss_3": 4.239118087291717, "ce_loss_6": 3.921295201778412, "epoch": 0.431, "grad_norm": 1136.0, "kl_loss_12": 1158.5338500976563, "kl_loss_17": 319.30886993408205, "kl_loss_3": 2805.543896484375, "kl_loss_6": 2183.512487792969, "learning_rate": 0.0006163369394041111, "loss": 1605.8267, "step": 4310 }, { "ce_loss_12": 3.345866930484772, "ce_loss_17": 2.974448561668396, "ce_loss_23": 2.838093012571335, "ce_loss_3": 4.1859783172607425, "ce_loss_6": 3.8710575103759766, "epoch": 0.432, "grad_norm": 1128.0, "kl_loss_12": 1158.7231079101562, "kl_loss_17": 311.65046844482424, "kl_loss_3": 2838.497119140625, "kl_loss_6": 2212.0281005859374, "learning_rate": 0.0006147932395418205, "loss": 1657.0939, "step": 4320 }, { "ce_loss_12": 3.3821220636367797, "ce_loss_17": 3.0188361406326294, "ce_loss_23": 2.883270764350891, "ce_loss_3": 4.1908201456069945, "ce_loss_6": 3.872398817539215, "epoch": 0.433, "grad_norm": 1448.0, "kl_loss_12": 1142.3731201171875, "kl_loss_17": 313.32046966552736, "kl_loss_3": 2772.5299682617188, "kl_loss_6": 2137.9815185546877, "learning_rate": 0.0006132483837128823, "loss": 1581.4684, "step": 4330 }, { "ce_loss_12": 3.360012209415436, "ce_loss_17": 2.994674015045166, "ce_loss_23": 2.859726941585541, "ce_loss_3": 4.202695202827454, "ce_loss_6": 3.8732985854148865, "epoch": 0.434, "grad_norm": 912.0, "kl_loss_12": 1148.7674255371094, "kl_loss_17": 310.43812255859376, "kl_loss_3": 2840.497424316406, "kl_loss_6": 2196.349890136719, "learning_rate": 0.0006117023874739772, "loss": 1622.8654, "step": 4340 }, { "ce_loss_12": 3.367009127140045, "ce_loss_17": 2.9907040119171144, "ce_loss_23": 2.852668786048889, "ce_loss_3": 4.191274094581604, "ce_loss_6": 3.87186518907547, "epoch": 0.435, "grad_norm": 1024.0, "kl_loss_12": 1161.118389892578, "kl_loss_17": 313.2685745239258, "kl_loss_3": 2827.9594970703124, "kl_loss_6": 2188.6732482910156, "learning_rate": 0.0006101552663932703, "loss": 1632.4307, "step": 4350 }, { "ce_loss_12": 3.3917209982872008, "ce_loss_17": 3.022636556625366, "ce_loss_23": 2.884074628353119, "ce_loss_3": 4.2065025806427006, "ce_loss_6": 3.8965853214263917, "epoch": 0.436, "grad_norm": 876.0, "kl_loss_12": 1152.564471435547, "kl_loss_17": 320.2475158691406, "kl_loss_3": 2783.341162109375, "kl_loss_6": 2171.592877197266, "learning_rate": 0.0006086070360502539, "loss": 1609.012, "step": 4360 }, { "ce_loss_12": 3.3906732559204102, "ce_loss_17": 3.0208441615104675, "ce_loss_23": 2.8845319390296935, "ce_loss_3": 4.21385703086853, "ce_loss_6": 3.896357476711273, "epoch": 0.437, "grad_norm": 1016.0, "kl_loss_12": 1148.9530700683595, "kl_loss_17": 310.03978576660154, "kl_loss_3": 2806.8066650390624, "kl_loss_6": 2181.143347167969, "learning_rate": 0.0006070577120355903, "loss": 1614.329, "step": 4370 }, { "ce_loss_12": 3.3964044094085692, "ce_loss_17": 3.0242053508758544, "ce_loss_23": 2.890236794948578, "ce_loss_3": 4.183533334732056, "ce_loss_6": 3.871987521648407, "epoch": 0.438, "grad_norm": 960.0, "kl_loss_12": 1123.01728515625, "kl_loss_17": 306.5171600341797, "kl_loss_3": 2704.7479614257813, "kl_loss_6": 2085.7805908203127, "learning_rate": 0.0006055073099509549, "loss": 1585.2225, "step": 4380 }, { "ce_loss_12": 3.4402162551879885, "ce_loss_17": 3.0814417481422423, "ce_loss_23": 2.9468929171562195, "ce_loss_3": 4.247218203544617, "ce_loss_6": 3.933430850505829, "epoch": 0.439, "grad_norm": 1064.0, "kl_loss_12": 1131.3260192871094, "kl_loss_17": 311.39515380859376, "kl_loss_3": 2749.497119140625, "kl_loss_6": 2138.216192626953, "learning_rate": 0.0006039558454088796, "loss": 1612.5506, "step": 4390 }, { "ce_loss_12": 3.422533404827118, "ce_loss_17": 3.0480231046676636, "ce_loss_23": 2.9077144980430605, "ce_loss_3": 4.241112649440765, "ce_loss_6": 3.9317553758621218, "epoch": 0.44, "grad_norm": 1256.0, "kl_loss_12": 1146.5663787841797, "kl_loss_17": 313.3567604064941, "kl_loss_3": 2788.4807006835936, "kl_loss_6": 2172.3048828125, "learning_rate": 0.0006024033340325954, "loss": 1576.1466, "step": 4400 }, { "ce_loss_12": 3.464881455898285, "ce_loss_17": 3.1081199288368224, "ce_loss_23": 2.9811485767364503, "ce_loss_3": 4.24443507194519, "ce_loss_6": 3.940363574028015, "epoch": 0.441, "grad_norm": 884.0, "kl_loss_12": 1095.667462158203, "kl_loss_17": 298.9134033203125, "kl_loss_3": 2663.990344238281, "kl_loss_6": 2061.230120849609, "learning_rate": 0.0006008497914558743, "loss": 1566.9678, "step": 4410 }, { "ce_loss_12": 3.4442933082580565, "ce_loss_17": 3.0712830781936646, "ce_loss_23": 2.9269362568855284, "ce_loss_3": 4.272569918632508, "ce_loss_6": 3.945245790481567, "epoch": 0.442, "grad_norm": 1016.0, "kl_loss_12": 1164.5831817626954, "kl_loss_17": 326.71973876953126, "kl_loss_3": 2832.883654785156, "kl_loss_6": 2189.940539550781, "learning_rate": 0.0005992952333228728, "loss": 1621.5256, "step": 4420 }, { "ce_loss_12": 3.3685991048812864, "ce_loss_17": 3.00733345746994, "ce_loss_23": 2.8735783815383913, "ce_loss_3": 4.198529326915741, "ce_loss_6": 3.882496440410614, "epoch": 0.443, "grad_norm": 1032.0, "kl_loss_12": 1138.6497192382812, "kl_loss_17": 309.3685668945312, "kl_loss_3": 2800.78447265625, "kl_loss_6": 2179.3860107421874, "learning_rate": 0.0005977396752879741, "loss": 1591.6296, "step": 4430 }, { "ce_loss_12": 3.3082218766212463, "ce_loss_17": 2.934286594390869, "ce_loss_23": 2.795569658279419, "ce_loss_3": 4.121199953556061, "ce_loss_6": 3.8101864218711854, "epoch": 0.444, "grad_norm": 1288.0, "kl_loss_12": 1158.0689025878905, "kl_loss_17": 315.5136322021484, "kl_loss_3": 2796.026611328125, "kl_loss_6": 2176.3042724609377, "learning_rate": 0.0005961831330156305, "loss": 1591.341, "step": 4440 }, { "ce_loss_12": 3.4397592782974242, "ce_loss_17": 3.0738757252693176, "ce_loss_23": 2.934872305393219, "ce_loss_3": 4.28535441160202, "ce_loss_6": 3.9639587998390198, "epoch": 0.445, "grad_norm": 1232.0, "kl_loss_12": 1144.8870971679687, "kl_loss_17": 316.44542388916017, "kl_loss_3": 2836.933642578125, "kl_loss_6": 2195.8565368652344, "learning_rate": 0.0005946256221802051, "loss": 1641.4605, "step": 4450 }, { "ce_loss_12": 3.3827796936035157, "ce_loss_17": 3.0436911463737486, "ce_loss_23": 2.911225152015686, "ce_loss_3": 4.180616104602814, "ce_loss_6": 3.864882254600525, "epoch": 0.446, "grad_norm": 1104.0, "kl_loss_12": 1090.165689086914, "kl_loss_17": 312.66605987548826, "kl_loss_3": 2692.0668334960938, "kl_loss_6": 2074.98994140625, "learning_rate": 0.0005930671584658151, "loss": 1638.3902, "step": 4460 }, { "ce_loss_12": 3.419265556335449, "ce_loss_17": 3.0543219804763795, "ce_loss_23": 2.9189966201782225, "ce_loss_3": 4.228051006793976, "ce_loss_6": 3.9190755128860473, "epoch": 0.447, "grad_norm": 1144.0, "kl_loss_12": 1135.6621948242187, "kl_loss_17": 319.49698638916016, "kl_loss_3": 2777.132958984375, "kl_loss_6": 2155.2238891601564, "learning_rate": 0.0005915077575661722, "loss": 1618.4136, "step": 4470 }, { "ce_loss_12": 3.442099952697754, "ce_loss_17": 3.07456750869751, "ce_loss_23": 2.9299558758735658, "ce_loss_3": 4.255296397209167, "ce_loss_6": 3.9377192854881287, "epoch": 0.448, "grad_norm": 1072.0, "kl_loss_12": 1161.6586822509767, "kl_loss_17": 332.67717895507815, "kl_loss_3": 2798.3115966796877, "kl_loss_6": 2179.07138671875, "learning_rate": 0.000589947435184427, "loss": 1596.1469, "step": 4480 }, { "ce_loss_12": 3.470191848278046, "ce_loss_17": 3.1280281066894533, "ce_loss_23": 2.990417408943176, "ce_loss_3": 4.238299298286438, "ce_loss_6": 3.934940552711487, "epoch": 0.449, "grad_norm": 1136.0, "kl_loss_12": 1124.0675689697266, "kl_loss_17": 314.65479736328126, "kl_loss_3": 2680.177978515625, "kl_loss_6": 2075.8083312988283, "learning_rate": 0.0005883862070330078, "loss": 1577.6258, "step": 4490 }, { "ce_loss_12": 3.438168096542358, "ce_loss_17": 3.063719856739044, "ce_loss_23": 2.9275577425956727, "ce_loss_3": 4.238969564437866, "ce_loss_6": 3.9301570534706114, "epoch": 0.45, "grad_norm": 1168.0, "kl_loss_12": 1152.8913330078126, "kl_loss_17": 316.52400970458984, "kl_loss_3": 2774.6293701171876, "kl_loss_6": 2158.622393798828, "learning_rate": 0.0005868240888334653, "loss": 1597.5406, "step": 4500 }, { "ce_loss_12": 3.3346810936927795, "ce_loss_17": 2.9612236976623536, "ce_loss_23": 2.819701302051544, "ce_loss_3": 4.172786235809326, "ce_loss_6": 3.844736897945404, "epoch": 0.451, "grad_norm": 960.0, "kl_loss_12": 1160.5079437255858, "kl_loss_17": 321.31404266357424, "kl_loss_3": 2839.0437744140627, "kl_loss_6": 2195.8436950683595, "learning_rate": 0.0005852610963163119, "loss": 1618.7336, "step": 4510 }, { "ce_loss_12": 3.347758114337921, "ce_loss_17": 2.9786689639091493, "ce_loss_23": 2.8448096394538878, "ce_loss_3": 4.152521347999572, "ce_loss_6": 3.8433269381523134, "epoch": 0.452, "grad_norm": 1056.0, "kl_loss_12": 1137.2014526367188, "kl_loss_17": 310.4284866333008, "kl_loss_3": 2761.944201660156, "kl_loss_6": 2144.6801025390623, "learning_rate": 0.0005836972452208654, "loss": 1577.3804, "step": 4520 }, { "ce_loss_12": 3.3452757716178896, "ce_loss_17": 2.9843344926834106, "ce_loss_23": 2.8507073402404783, "ce_loss_3": 4.172976124286651, "ce_loss_6": 3.8575175166130067, "epoch": 0.453, "grad_norm": 1144.0, "kl_loss_12": 1132.9714324951171, "kl_loss_17": 313.4719665527344, "kl_loss_3": 2796.802868652344, "kl_loss_6": 2169.0428100585937, "learning_rate": 0.0005821325512950885, "loss": 1598.8304, "step": 4530 }, { "ce_loss_12": 3.3669161796569824, "ce_loss_17": 3.005538260936737, "ce_loss_23": 2.8709388375282288, "ce_loss_3": 4.176382279396057, "ce_loss_6": 3.8579967260360717, "epoch": 0.454, "grad_norm": 980.0, "kl_loss_12": 1107.4463897705077, "kl_loss_17": 305.12395706176756, "kl_loss_3": 2722.3322875976564, "kl_loss_6": 2097.39970703125, "learning_rate": 0.0005805670302954321, "loss": 1584.4574, "step": 4540 }, { "ce_loss_12": 3.3633339762687684, "ce_loss_17": 3.010383832454681, "ce_loss_23": 2.8782918214797975, "ce_loss_3": 4.172574400901794, "ce_loss_6": 3.8587594032287598, "epoch": 0.455, "grad_norm": 1056.0, "kl_loss_12": 1112.6406524658203, "kl_loss_17": 299.7900199890137, "kl_loss_3": 2729.6116943359375, "kl_loss_6": 2117.4360534667967, "learning_rate": 0.000579000697986675, "loss": 1563.8545, "step": 4550 }, { "ce_loss_12": 3.362711465358734, "ce_loss_17": 2.9764938712120057, "ce_loss_23": 2.835140883922577, "ce_loss_3": 4.201532959938049, "ce_loss_6": 3.8728991508483888, "epoch": 0.456, "grad_norm": 1552.0, "kl_loss_12": 1178.7279296875, "kl_loss_17": 323.01573944091797, "kl_loss_3": 2854.2749877929687, "kl_loss_6": 2213.0695861816407, "learning_rate": 0.0005774335701417662, "loss": 1613.0781, "step": 4560 }, { "ce_loss_12": 3.3355695724487306, "ce_loss_17": 2.9634481072425842, "ce_loss_23": 2.828409492969513, "ce_loss_3": 4.187091267108917, "ce_loss_6": 3.866722321510315, "epoch": 0.457, "grad_norm": 956.0, "kl_loss_12": 1150.532940673828, "kl_loss_17": 307.76855926513673, "kl_loss_3": 2868.6227416992188, "kl_loss_6": 2231.113848876953, "learning_rate": 0.0005758656625416658, "loss": 1613.2054, "step": 4570 }, { "ce_loss_12": 3.3876204609870912, "ce_loss_17": 3.027148795127869, "ce_loss_23": 2.8844519376754763, "ce_loss_3": 4.2057746887207035, "ce_loss_6": 3.8870466589927672, "epoch": 0.458, "grad_norm": 956.0, "kl_loss_12": 1136.8851470947266, "kl_loss_17": 316.8278106689453, "kl_loss_3": 2786.901501464844, "kl_loss_6": 2145.4644104003905, "learning_rate": 0.0005742969909751859, "loss": 1575.2155, "step": 4580 }, { "ce_loss_12": 3.4007017374038697, "ce_loss_17": 3.034340000152588, "ce_loss_23": 2.89749014377594, "ce_loss_3": 4.228575205802917, "ce_loss_6": 3.9102230429649354, "epoch": 0.459, "grad_norm": 944.0, "kl_loss_12": 1143.504412841797, "kl_loss_17": 312.67955169677737, "kl_loss_3": 2813.0753662109373, "kl_loss_6": 2178.4854736328125, "learning_rate": 0.0005727275712388318, "loss": 1615.2342, "step": 4590 }, { "ce_loss_12": 3.3975281357765197, "ce_loss_17": 3.0452203273773195, "ce_loss_23": 2.9190300822257997, "ce_loss_3": 4.187395370006561, "ce_loss_6": 3.8785125851631164, "epoch": 0.46, "grad_norm": 980.0, "kl_loss_12": 1097.602850341797, "kl_loss_17": 300.3045196533203, "kl_loss_3": 2693.808935546875, "kl_loss_6": 2085.7624572753907, "learning_rate": 0.0005711574191366427, "loss": 1563.5143, "step": 4600 }, { "ce_loss_12": 3.3609559893608094, "ce_loss_17": 3.004966366291046, "ce_loss_23": 2.868931531906128, "ce_loss_3": 4.1772660374641415, "ce_loss_6": 3.8511223554611207, "epoch": 0.461, "grad_norm": 828.0, "kl_loss_12": 1116.0391632080077, "kl_loss_17": 304.19537963867185, "kl_loss_3": 2751.3671752929686, "kl_loss_6": 2109.9775939941405, "learning_rate": 0.0005695865504800327, "loss": 1566.2516, "step": 4610 }, { "ce_loss_12": 3.3485647439956665, "ce_loss_17": 2.95493905544281, "ce_loss_23": 2.80878484249115, "ce_loss_3": 4.229027032852173, "ce_loss_6": 3.8901962757110597, "epoch": 0.462, "grad_norm": 1144.0, "kl_loss_12": 1214.5166809082032, "kl_loss_17": 328.5302200317383, "kl_loss_3": 2971.7057861328126, "kl_loss_6": 2304.507421875, "learning_rate": 0.0005680149810876322, "loss": 1641.5496, "step": 4620 }, { "ce_loss_12": 3.359517002105713, "ce_loss_17": 2.9971216320991516, "ce_loss_23": 2.866685378551483, "ce_loss_3": 4.192817986011505, "ce_loss_6": 3.871630370616913, "epoch": 0.463, "grad_norm": 1968.0, "kl_loss_12": 1122.232159423828, "kl_loss_17": 304.9923568725586, "kl_loss_3": 2791.3302612304688, "kl_loss_6": 2163.3309997558595, "learning_rate": 0.0005664427267851271, "loss": 1587.3979, "step": 4630 }, { "ce_loss_12": 3.2848004341125487, "ce_loss_17": 2.923465597629547, "ce_loss_23": 2.7880022048950197, "ce_loss_3": 4.112053787708282, "ce_loss_6": 3.796520805358887, "epoch": 0.464, "grad_norm": 1176.0, "kl_loss_12": 1109.333233642578, "kl_loss_17": 302.905500793457, "kl_loss_3": 2762.598498535156, "kl_loss_6": 2136.8950744628905, "learning_rate": 0.0005648698034051009, "loss": 1566.1207, "step": 4640 }, { "ce_loss_12": 3.3933666944503784, "ce_loss_17": 3.0286192536354064, "ce_loss_23": 2.893868792057037, "ce_loss_3": 4.239399254322052, "ce_loss_6": 3.918375754356384, "epoch": 0.465, "grad_norm": 976.0, "kl_loss_12": 1127.1548950195313, "kl_loss_17": 305.87921447753905, "kl_loss_3": 2811.355078125, "kl_loss_6": 2189.2998901367187, "learning_rate": 0.0005632962267868747, "loss": 1574.9604, "step": 4650 }, { "ce_loss_12": 3.3286603569984434, "ce_loss_17": 2.9657166481018065, "ce_loss_23": 2.83941011428833, "ce_loss_3": 4.1424295663833615, "ce_loss_6": 3.8242884278297424, "epoch": 0.466, "grad_norm": 1128.0, "kl_loss_12": 1115.7930053710938, "kl_loss_17": 298.15654907226565, "kl_loss_3": 2751.6337646484376, "kl_loss_6": 2123.6877258300783, "learning_rate": 0.0005617220127763474, "loss": 1588.1186, "step": 4660 }, { "ce_loss_12": 3.399213743209839, "ce_loss_17": 3.049934506416321, "ce_loss_23": 2.9157302737236024, "ce_loss_3": 4.208151781558991, "ce_loss_6": 3.887350296974182, "epoch": 0.467, "grad_norm": 984.0, "kl_loss_12": 1117.3792602539063, "kl_loss_17": 306.5261657714844, "kl_loss_3": 2735.5549438476564, "kl_loss_6": 2100.812664794922, "learning_rate": 0.0005601471772258368, "loss": 1586.6715, "step": 4670 }, { "ce_loss_12": 3.382119631767273, "ce_loss_17": 3.033186662197113, "ce_loss_23": 2.8996434092521666, "ce_loss_3": 4.197882652282715, "ce_loss_6": 3.8774981260299684, "epoch": 0.468, "grad_norm": 1048.0, "kl_loss_12": 1095.82021484375, "kl_loss_17": 304.3485252380371, "kl_loss_3": 2719.297412109375, "kl_loss_6": 2086.8393798828124, "learning_rate": 0.0005585717359939192, "loss": 1593.2576, "step": 4680 }, { "ce_loss_12": 3.3027979850769045, "ce_loss_17": 2.945749342441559, "ce_loss_23": 2.8152668833732606, "ce_loss_3": 4.104500102996826, "ce_loss_6": 3.7914631724357606, "epoch": 0.469, "grad_norm": 1224.0, "kl_loss_12": 1105.9541046142579, "kl_loss_17": 302.2694259643555, "kl_loss_3": 2706.378515625, "kl_loss_6": 2090.2652587890625, "learning_rate": 0.0005569957049452703, "loss": 1594.227, "step": 4690 }, { "ce_loss_12": 3.369704079627991, "ce_loss_17": 3.0091665029525756, "ce_loss_23": 2.871000623703003, "ce_loss_3": 4.200240743160248, "ce_loss_6": 3.874775540828705, "epoch": 0.47, "grad_norm": 1040.0, "kl_loss_12": 1132.7062622070312, "kl_loss_17": 311.7139724731445, "kl_loss_3": 2804.237646484375, "kl_loss_6": 2159.6342346191404, "learning_rate": 0.0005554190999505056, "loss": 1606.4697, "step": 4700 }, { "ce_loss_12": 3.491312766075134, "ce_loss_17": 3.1184787154197693, "ce_loss_23": 2.9828242897987365, "ce_loss_3": 4.289120233058929, "ce_loss_6": 3.9822962641716004, "epoch": 0.471, "grad_norm": 1096.0, "kl_loss_12": 1157.3537475585938, "kl_loss_17": 318.26240692138674, "kl_loss_3": 2775.381311035156, "kl_loss_6": 2164.731060791016, "learning_rate": 0.0005538419368860196, "loss": 1548.7658, "step": 4710 }, { "ce_loss_12": 3.3963905811309814, "ce_loss_17": 3.0453217029571533, "ce_loss_23": 2.909340965747833, "ce_loss_3": 4.201114368438721, "ce_loss_6": 3.8916338801383974, "epoch": 0.472, "grad_norm": 1040.0, "kl_loss_12": 1125.0690490722657, "kl_loss_17": 310.0855987548828, "kl_loss_3": 2735.7884887695313, "kl_loss_6": 2123.482470703125, "learning_rate": 0.0005522642316338268, "loss": 1607.8658, "step": 4720 }, { "ce_loss_12": 3.4153844356536864, "ce_loss_17": 3.056465709209442, "ce_loss_23": 2.9310761451721192, "ce_loss_3": 4.222506582736969, "ce_loss_6": 3.91341096162796, "epoch": 0.473, "grad_norm": 908.0, "kl_loss_12": 1125.3278381347657, "kl_loss_17": 305.9197311401367, "kl_loss_3": 2736.0848754882813, "kl_loss_6": 2125.858508300781, "learning_rate": 0.0005506860000814017, "loss": 1615.6734, "step": 4730 }, { "ce_loss_12": 3.418290138244629, "ce_loss_17": 3.0670406818389893, "ce_loss_23": 2.942454922199249, "ce_loss_3": 4.213110136985779, "ce_loss_6": 3.897139918804169, "epoch": 0.474, "grad_norm": 988.0, "kl_loss_12": 1103.0640380859375, "kl_loss_17": 293.4414855957031, "kl_loss_3": 2694.6202392578125, "kl_loss_6": 2075.2019470214846, "learning_rate": 0.0005491072581215186, "loss": 1574.7879, "step": 4740 }, { "ce_loss_12": 3.4341385841369627, "ce_loss_17": 3.0710843801498413, "ce_loss_23": 2.933401334285736, "ce_loss_3": 4.239067649841308, "ce_loss_6": 3.9195735335350035, "epoch": 0.475, "grad_norm": 1104.0, "kl_loss_12": 1139.1321960449218, "kl_loss_17": 314.00052337646486, "kl_loss_3": 2772.883679199219, "kl_loss_6": 2136.547479248047, "learning_rate": 0.0005475280216520913, "loss": 1562.3973, "step": 4750 }, { "ce_loss_12": 3.349986743927002, "ce_loss_17": 2.997076177597046, "ce_loss_23": 2.866178572177887, "ce_loss_3": 4.152829468250275, "ce_loss_6": 3.837746262550354, "epoch": 0.476, "grad_norm": 1192.0, "kl_loss_12": 1096.0175872802733, "kl_loss_17": 301.01770782470703, "kl_loss_3": 2698.3632202148438, "kl_loss_6": 2076.271063232422, "learning_rate": 0.0005459483065760138, "loss": 1595.573, "step": 4760 }, { "ce_loss_12": 3.3164255142211916, "ce_loss_17": 2.941723871231079, "ce_loss_23": 2.8090891242027283, "ce_loss_3": 4.179085612297058, "ce_loss_6": 3.8568483710289003, "epoch": 0.477, "grad_norm": 1048.0, "kl_loss_12": 1156.2247375488282, "kl_loss_17": 306.74070129394534, "kl_loss_3": 2867.866845703125, "kl_loss_6": 2235.567724609375, "learning_rate": 0.0005443681288009991, "loss": 1601.8034, "step": 4770 }, { "ce_loss_12": 3.3487496376037598, "ce_loss_17": 2.9888516068458557, "ce_loss_23": 2.8586570262908935, "ce_loss_3": 4.1714806199073795, "ce_loss_6": 3.8551082015037537, "epoch": 0.478, "grad_norm": 836.0, "kl_loss_12": 1120.5207489013671, "kl_loss_17": 301.96836547851564, "kl_loss_3": 2774.1383666992188, "kl_loss_6": 2145.277478027344, "learning_rate": 0.0005427875042394199, "loss": 1587.4909, "step": 4780 }, { "ce_loss_12": 3.3863983273506166, "ce_loss_17": 3.0380003809928895, "ce_loss_23": 2.8937386155128477, "ce_loss_3": 4.200499761104584, "ce_loss_6": 3.8739574551582336, "epoch": 0.479, "grad_norm": 1016.0, "kl_loss_12": 1115.8671905517579, "kl_loss_17": 314.9477005004883, "kl_loss_3": 2738.422473144531, "kl_loss_6": 2093.9318725585936, "learning_rate": 0.0005412064488081482, "loss": 1597.256, "step": 4790 }, { "ce_loss_12": 3.3651809096336365, "ce_loss_17": 3.0185546040534974, "ce_loss_23": 2.8903374314308166, "ce_loss_3": 4.176959121227265, "ce_loss_6": 3.8557177424430846, "epoch": 0.48, "grad_norm": 984.0, "kl_loss_12": 1079.9235870361329, "kl_loss_17": 297.1005088806152, "kl_loss_3": 2698.806689453125, "kl_loss_6": 2073.9032958984376, "learning_rate": 0.0005396249784283942, "loss": 1548.1096, "step": 4800 }, { "ce_loss_12": 3.410639774799347, "ce_loss_17": 3.041554272174835, "ce_loss_23": 2.904786264896393, "ce_loss_3": 4.251404666900635, "ce_loss_6": 3.9328576564788817, "epoch": 0.481, "grad_norm": 968.0, "kl_loss_12": 1150.5845642089844, "kl_loss_17": 312.17601776123047, "kl_loss_3": 2835.917614746094, "kl_loss_6": 2204.481726074219, "learning_rate": 0.0005380431090255476, "loss": 1610.3476, "step": 4810 }, { "ce_loss_12": 3.3847161650657656, "ce_loss_17": 3.035974931716919, "ce_loss_23": 2.9126381397247316, "ce_loss_3": 4.18476665019989, "ce_loss_6": 3.875488114356995, "epoch": 0.482, "grad_norm": 912.0, "kl_loss_12": 1089.5160858154297, "kl_loss_17": 290.29730834960935, "kl_loss_3": 2687.77041015625, "kl_loss_6": 2075.221826171875, "learning_rate": 0.0005364608565290155, "loss": 1548.8517, "step": 4820 }, { "ce_loss_12": 3.400247502326965, "ce_loss_17": 3.043899357318878, "ce_loss_23": 2.915937912464142, "ce_loss_3": 4.2259934425354, "ce_loss_6": 3.9076520800590515, "epoch": 0.483, "grad_norm": 996.0, "kl_loss_12": 1115.6108764648438, "kl_loss_17": 304.19869842529295, "kl_loss_3": 2769.1885986328125, "kl_loss_6": 2143.8648986816406, "learning_rate": 0.0005348782368720626, "loss": 1584.2228, "step": 4830 }, { "ce_loss_12": 3.3320335507392884, "ce_loss_17": 2.9882596731185913, "ce_loss_23": 2.854368841648102, "ce_loss_3": 4.15036495923996, "ce_loss_6": 3.8380707025527956, "epoch": 0.484, "grad_norm": 844.0, "kl_loss_12": 1091.236019897461, "kl_loss_17": 298.0321350097656, "kl_loss_3": 2724.5786010742186, "kl_loss_6": 2096.6154052734373, "learning_rate": 0.000533295265991652, "loss": 1576.1192, "step": 4840 }, { "ce_loss_12": 3.4043202638626098, "ce_loss_17": 3.048448932170868, "ce_loss_23": 2.914829695224762, "ce_loss_3": 4.195201706886292, "ce_loss_6": 3.8884942173957824, "epoch": 0.485, "grad_norm": 1088.0, "kl_loss_12": 1097.879165649414, "kl_loss_17": 301.0802597045898, "kl_loss_3": 2693.1909423828124, "kl_loss_6": 2091.048065185547, "learning_rate": 0.0005317119598282822, "loss": 1546.7378, "step": 4850 }, { "ce_loss_12": 3.4163004517555238, "ce_loss_17": 3.055159831047058, "ce_loss_23": 2.921207368373871, "ce_loss_3": 4.212529075145722, "ce_loss_6": 3.9011515617370605, "epoch": 0.486, "grad_norm": 1272.0, "kl_loss_12": 1117.5282165527344, "kl_loss_17": 304.5247299194336, "kl_loss_3": 2715.294299316406, "kl_loss_6": 2106.637139892578, "learning_rate": 0.0005301283343258293, "loss": 1562.6322, "step": 4860 }, { "ce_loss_12": 3.452582538127899, "ce_loss_17": 3.1055136561393737, "ce_loss_23": 2.97325998544693, "ce_loss_3": 4.245788037776947, "ce_loss_6": 3.9398349642753603, "epoch": 0.487, "grad_norm": 976.0, "kl_loss_12": 1102.8094665527344, "kl_loss_17": 303.8196731567383, "kl_loss_3": 2696.489990234375, "kl_loss_6": 2084.3579345703124, "learning_rate": 0.000528544405431384, "loss": 1544.8414, "step": 4870 }, { "ce_loss_12": 3.3599804520606993, "ce_loss_17": 2.9914204597473146, "ce_loss_23": 2.8525378108024597, "ce_loss_3": 4.176129698753357, "ce_loss_6": 3.85386004447937, "epoch": 0.488, "grad_norm": 1016.0, "kl_loss_12": 1146.3044921875, "kl_loss_17": 315.3286987304688, "kl_loss_3": 2801.2627197265624, "kl_loss_6": 2156.7280395507814, "learning_rate": 0.000526960189095093, "loss": 1600.166, "step": 4880 }, { "ce_loss_12": 3.334537076950073, "ce_loss_17": 2.9861772298812865, "ce_loss_23": 2.852927792072296, "ce_loss_3": 4.146724760532379, "ce_loss_6": 3.8291383743286134, "epoch": 0.489, "grad_norm": 972.0, "kl_loss_12": 1091.358187866211, "kl_loss_17": 299.0898208618164, "kl_loss_3": 2717.929248046875, "kl_loss_6": 2096.2663513183593, "learning_rate": 0.0005253757012699972, "loss": 1556.6723, "step": 4890 }, { "ce_loss_12": 3.3993546605110168, "ce_loss_17": 3.0482656478881838, "ce_loss_23": 2.9207078576087953, "ce_loss_3": 4.2034744143486025, "ce_loss_6": 3.888164055347443, "epoch": 0.49, "grad_norm": 1096.0, "kl_loss_12": 1099.3933380126953, "kl_loss_17": 298.8610366821289, "kl_loss_3": 2719.4765380859376, "kl_loss_6": 2089.0828125, "learning_rate": 0.0005237909579118712, "loss": 1578.4719, "step": 4900 }, { "ce_loss_12": 3.388143837451935, "ce_loss_17": 3.023465883731842, "ce_loss_23": 2.8811936497688295, "ce_loss_3": 4.216781687736511, "ce_loss_6": 3.900932049751282, "epoch": 0.491, "grad_norm": 1208.0, "kl_loss_12": 1139.244564819336, "kl_loss_17": 316.09378509521486, "kl_loss_3": 2809.9015625, "kl_loss_6": 2175.9791259765625, "learning_rate": 0.0005222059749790631, "loss": 1598.0844, "step": 4910 }, { "ce_loss_12": 3.4135035037994386, "ce_loss_17": 3.07126407623291, "ce_loss_23": 2.942831826210022, "ce_loss_3": 4.193430387973786, "ce_loss_6": 3.882127547264099, "epoch": 0.492, "grad_norm": 1128.0, "kl_loss_12": 1084.961932373047, "kl_loss_17": 296.9151382446289, "kl_loss_3": 2666.9692993164062, "kl_loss_6": 2048.114013671875, "learning_rate": 0.0005206207684323337, "loss": 1526.9676, "step": 4920 }, { "ce_loss_12": 3.4127950191497805, "ce_loss_17": 3.055870497226715, "ce_loss_23": 2.9254420518875124, "ce_loss_3": 4.215585446357727, "ce_loss_6": 3.9041973114013673, "epoch": 0.493, "grad_norm": 1224.0, "kl_loss_12": 1125.9021087646483, "kl_loss_17": 306.9901626586914, "kl_loss_3": 2730.7314453125, "kl_loss_6": 2122.603759765625, "learning_rate": 0.000519035354234695, "loss": 1590.4694, "step": 4930 }, { "ce_loss_12": 3.3999263286590575, "ce_loss_17": 3.0387742042541506, "ce_loss_23": 2.895954394340515, "ce_loss_3": 4.199414372444153, "ce_loss_6": 3.8858419179916384, "epoch": 0.494, "grad_norm": 1184.0, "kl_loss_12": 1124.1252349853517, "kl_loss_17": 314.1003082275391, "kl_loss_3": 2728.172705078125, "kl_loss_6": 2106.65966796875, "learning_rate": 0.0005174497483512506, "loss": 1552.0831, "step": 4940 }, { "ce_loss_12": 3.41754287481308, "ce_loss_17": 3.0745951890945435, "ce_loss_23": 2.9491618275642395, "ce_loss_3": 4.21412056684494, "ce_loss_6": 3.9022476434707642, "epoch": 0.495, "grad_norm": 976.0, "kl_loss_12": 1096.221469116211, "kl_loss_17": 295.2589080810547, "kl_loss_3": 2714.2718505859375, "kl_loss_6": 2097.348748779297, "learning_rate": 0.0005158639667490339, "loss": 1584.319, "step": 4950 }, { "ce_loss_12": 3.3561907529830934, "ce_loss_17": 2.9901116251945496, "ce_loss_23": 2.8571989059448244, "ce_loss_3": 4.174978446960449, "ce_loss_6": 3.845904302597046, "epoch": 0.496, "grad_norm": 1080.0, "kl_loss_12": 1125.0650604248046, "kl_loss_17": 303.3439971923828, "kl_loss_3": 2763.3949584960938, "kl_loss_6": 2120.0795471191404, "learning_rate": 0.0005142780253968481, "loss": 1574.7215, "step": 4960 }, { "ce_loss_12": 3.286413645744324, "ce_loss_17": 2.9384931564331054, "ce_loss_23": 2.8116806387901305, "ce_loss_3": 4.088664150238037, "ce_loss_6": 3.77538526058197, "epoch": 0.497, "grad_norm": 1376.0, "kl_loss_12": 1079.2211120605468, "kl_loss_17": 295.1123382568359, "kl_loss_3": 2690.6939697265625, "kl_loss_6": 2071.3447998046877, "learning_rate": 0.0005126919402651053, "loss": 1525.6309, "step": 4970 }, { "ce_loss_12": 3.370965826511383, "ce_loss_17": 3.012047052383423, "ce_loss_23": 2.875856566429138, "ce_loss_3": 4.195962822437286, "ce_loss_6": 3.884010601043701, "epoch": 0.498, "grad_norm": 1184.0, "kl_loss_12": 1126.862042236328, "kl_loss_17": 312.22010040283203, "kl_loss_3": 2761.840673828125, "kl_loss_6": 2139.622937011719, "learning_rate": 0.0005111057273256647, "loss": 1584.7812, "step": 4980 }, { "ce_loss_12": 3.419179451465607, "ce_loss_17": 3.0902063131332396, "ce_loss_23": 2.971741831302643, "ce_loss_3": 4.177947437763214, "ce_loss_6": 3.8815560221672056, "epoch": 0.499, "grad_norm": 1224.0, "kl_loss_12": 1053.150732421875, "kl_loss_17": 285.0224044799805, "kl_loss_3": 2569.1810791015623, "kl_loss_6": 1990.8180358886718, "learning_rate": 0.0005095194025516733, "loss": 1504.0578, "step": 4990 }, { "ce_loss_12": 3.374780237674713, "ce_loss_17": 3.0321351408958437, "ce_loss_23": 2.908969295024872, "ce_loss_3": 4.176579833030701, "ce_loss_6": 3.8689828872680665, "epoch": 0.5, "grad_norm": 1064.0, "kl_loss_12": 1082.8041015625, "kl_loss_17": 291.1903495788574, "kl_loss_3": 2680.8746826171873, "kl_loss_6": 2074.5982421875, "learning_rate": 0.000507932981917404, "loss": 1585.3881, "step": 5000 }, { "ce_loss_12": 3.362855243682861, "ce_loss_17": 2.988880681991577, "ce_loss_23": 2.8498858332633974, "ce_loss_3": 4.198509395122528, "ce_loss_6": 3.8798313617706297, "epoch": 0.501, "grad_norm": 976.0, "kl_loss_12": 1162.0169952392578, "kl_loss_17": 314.58503875732424, "kl_loss_3": 2829.14833984375, "kl_loss_6": 2203.0539306640626, "learning_rate": 0.0005063464813980949, "loss": 1620.7376, "step": 5010 }, { "ce_loss_12": 3.3213644504547117, "ce_loss_17": 2.967113363742828, "ce_loss_23": 2.8417882204055784, "ce_loss_3": 4.131510305404663, "ce_loss_6": 3.8222088694572447, "epoch": 0.502, "grad_norm": 1024.0, "kl_loss_12": 1118.1484283447267, "kl_loss_17": 299.0059448242188, "kl_loss_3": 2753.0571533203124, "kl_loss_6": 2133.07119140625, "learning_rate": 0.0005047599169697884, "loss": 1566.5135, "step": 5020 }, { "ce_loss_12": 3.2775539636611937, "ce_loss_17": 2.914250874519348, "ce_loss_23": 2.7824475884437563, "ce_loss_3": 4.102517211437226, "ce_loss_6": 3.785682499408722, "epoch": 0.503, "grad_norm": 1152.0, "kl_loss_12": 1109.3389801025392, "kl_loss_17": 302.9358444213867, "kl_loss_3": 2754.7373901367187, "kl_loss_6": 2128.6819702148437, "learning_rate": 0.000503173304609171, "loss": 1538.1774, "step": 5030 }, { "ce_loss_12": 3.3846050620079042, "ce_loss_17": 3.028143787384033, "ce_loss_23": 2.8941952705383303, "ce_loss_3": 4.190663433074951, "ce_loss_6": 3.878485178947449, "epoch": 0.504, "grad_norm": 1192.0, "kl_loss_12": 1106.5670806884766, "kl_loss_17": 296.38336486816405, "kl_loss_3": 2712.2251586914062, "kl_loss_6": 2105.7641845703124, "learning_rate": 0.0005015866602934111, "loss": 1537.4449, "step": 5040 }, { "ce_loss_12": 3.385323441028595, "ce_loss_17": 3.0098569989204407, "ce_loss_23": 2.8715598344802857, "ce_loss_3": 4.201504063606262, "ce_loss_6": 3.881215286254883, "epoch": 0.505, "grad_norm": 1128.0, "kl_loss_12": 1161.3495056152344, "kl_loss_17": 316.38443450927736, "kl_loss_3": 2794.4003662109376, "kl_loss_6": 2166.5137634277344, "learning_rate": 0.0005, "loss": 1584.4279, "step": 5050 }, { "ce_loss_12": 3.359061539173126, "ce_loss_17": 2.9973837614059446, "ce_loss_23": 2.866459846496582, "ce_loss_3": 4.173029494285584, "ce_loss_6": 3.8500274181365968, "epoch": 0.506, "grad_norm": 1368.0, "kl_loss_12": 1112.7315063476562, "kl_loss_17": 304.95944061279295, "kl_loss_3": 2741.0337890625, "kl_loss_6": 2107.1026306152344, "learning_rate": 0.0004984133397065889, "loss": 1546.8272, "step": 5060 }, { "ce_loss_12": 3.372469925880432, "ce_loss_17": 3.005589461326599, "ce_loss_23": 2.867159426212311, "ce_loss_3": 4.18806174993515, "ce_loss_6": 3.873115861415863, "epoch": 0.507, "grad_norm": 1032.0, "kl_loss_12": 1123.6464630126952, "kl_loss_17": 306.77745666503904, "kl_loss_3": 2752.3882690429687, "kl_loss_6": 2126.8781494140626, "learning_rate": 0.0004968266953908291, "loss": 1546.3459, "step": 5070 }, { "ce_loss_12": 3.3890419483184813, "ce_loss_17": 3.0321525931358337, "ce_loss_23": 2.9076250076293944, "ce_loss_3": 4.213087391853333, "ce_loss_6": 3.892174541950226, "epoch": 0.508, "grad_norm": 1128.0, "kl_loss_12": 1105.1057861328125, "kl_loss_17": 292.9428909301758, "kl_loss_3": 2746.5344970703127, "kl_loss_6": 2125.439532470703, "learning_rate": 0.0004952400830302117, "loss": 1561.7111, "step": 5080 }, { "ce_loss_12": 3.339200568199158, "ce_loss_17": 2.973728096485138, "ce_loss_23": 2.8369530916213987, "ce_loss_3": 4.165874052047729, "ce_loss_6": 3.8430152773857116, "epoch": 0.509, "grad_norm": 1128.0, "kl_loss_12": 1136.2608703613282, "kl_loss_17": 307.2293701171875, "kl_loss_3": 2784.1413452148436, "kl_loss_6": 2157.0075561523436, "learning_rate": 0.0004936535186019053, "loss": 1572.1361, "step": 5090 }, { "ce_loss_12": 3.3992201328277587, "ce_loss_17": 3.059427273273468, "ce_loss_23": 2.935956287384033, "ce_loss_3": 4.1924993872642515, "ce_loss_6": 3.8852957248687745, "epoch": 0.51, "grad_norm": 1144.0, "kl_loss_12": 1067.5751892089843, "kl_loss_17": 287.5902297973633, "kl_loss_3": 2651.0362182617187, "kl_loss_6": 2043.5480590820312, "learning_rate": 0.000492067018082596, "loss": 1536.1178, "step": 5100 }, { "ce_loss_12": 3.382169485092163, "ce_loss_17": 3.0059693813323975, "ce_loss_23": 2.8708268642425536, "ce_loss_3": 4.224006390571594, "ce_loss_6": 3.899908125400543, "epoch": 0.511, "grad_norm": 1392.0, "kl_loss_12": 1148.9556762695313, "kl_loss_17": 309.4052139282227, "kl_loss_3": 2832.4399169921876, "kl_loss_6": 2198.9977783203126, "learning_rate": 0.0004904805974483267, "loss": 1623.9365, "step": 5110 }, { "ce_loss_12": 3.4949843645095826, "ce_loss_17": 3.1124078273773192, "ce_loss_23": 2.9691667199134826, "ce_loss_3": 4.304324913024902, "ce_loss_6": 3.9900792956352236, "epoch": 0.512, "grad_norm": 1192.0, "kl_loss_12": 1186.4844299316405, "kl_loss_17": 324.86098022460936, "kl_loss_3": 2814.8347045898436, "kl_loss_6": 2192.9266662597656, "learning_rate": 0.0004888942726743353, "loss": 1644.2129, "step": 5120 }, { "ce_loss_12": 3.3473219513893127, "ce_loss_17": 2.989601743221283, "ce_loss_23": 2.8587637186050414, "ce_loss_3": 4.163712787628174, "ce_loss_6": 3.8459107637405396, "epoch": 0.513, "grad_norm": 1048.0, "kl_loss_12": 1121.4485717773437, "kl_loss_17": 305.6918441772461, "kl_loss_3": 2768.2578125, "kl_loss_6": 2135.605535888672, "learning_rate": 0.0004873080597348947, "loss": 1587.6139, "step": 5130 }, { "ce_loss_12": 3.262009656429291, "ce_loss_17": 2.887168896198273, "ce_loss_23": 2.75510116815567, "ce_loss_3": 4.116335034370422, "ce_loss_6": 3.7995688676834107, "epoch": 0.514, "grad_norm": 1144.0, "kl_loss_12": 1153.4715423583984, "kl_loss_17": 301.694108581543, "kl_loss_3": 2854.0366943359377, "kl_loss_6": 2224.2273193359374, "learning_rate": 0.0004857219746031519, "loss": 1596.3561, "step": 5140 }, { "ce_loss_12": 3.3927998185157775, "ce_loss_17": 3.0430898785591127, "ce_loss_23": 2.9166905879974365, "ce_loss_3": 4.200037574768066, "ce_loss_6": 3.877912938594818, "epoch": 0.515, "grad_norm": 1016.0, "kl_loss_12": 1092.6485656738282, "kl_loss_17": 300.26861572265625, "kl_loss_3": 2706.573291015625, "kl_loss_6": 2077.4241271972655, "learning_rate": 0.0004841360332509663, "loss": 1561.7504, "step": 5150 }, { "ce_loss_12": 3.350694715976715, "ce_loss_17": 2.9989014267921448, "ce_loss_23": 2.873697781562805, "ce_loss_3": 4.148285734653473, "ce_loss_6": 3.842498481273651, "epoch": 0.516, "grad_norm": 1368.0, "kl_loss_12": 1091.5367370605468, "kl_loss_17": 291.91494903564455, "kl_loss_3": 2684.4546142578124, "kl_loss_6": 2076.9037963867186, "learning_rate": 0.0004825502516487497, "loss": 1503.6863, "step": 5160 }, { "ce_loss_12": 3.3179797649383547, "ce_loss_17": 2.9602678537368776, "ce_loss_23": 2.835769760608673, "ce_loss_3": 4.13564225435257, "ce_loss_6": 3.835077476501465, "epoch": 0.517, "grad_norm": 1400.0, "kl_loss_12": 1110.5461822509765, "kl_loss_17": 297.78162994384763, "kl_loss_3": 2746.1639770507813, "kl_loss_6": 2157.3029541015626, "learning_rate": 0.00048096464576530507, "loss": 1586.2658, "step": 5170 }, { "ce_loss_12": 3.40576229095459, "ce_loss_17": 3.0669244527816772, "ce_loss_23": 2.9372299790382383, "ce_loss_3": 4.170511937141418, "ce_loss_6": 3.873363471031189, "epoch": 0.518, "grad_norm": 1056.0, "kl_loss_12": 1087.6305053710937, "kl_loss_17": 299.07730026245116, "kl_loss_3": 2632.4614013671876, "kl_loss_6": 2038.3814025878905, "learning_rate": 0.00047937923156766646, "loss": 1524.3064, "step": 5180 }, { "ce_loss_12": 3.434298062324524, "ce_loss_17": 3.1022254109382628, "ce_loss_23": 2.9763625144958494, "ce_loss_3": 4.1991536021232605, "ce_loss_6": 3.8984975099563597, "epoch": 0.519, "grad_norm": 1352.0, "kl_loss_12": 1077.075503540039, "kl_loss_17": 291.15290145874025, "kl_loss_3": 2635.3423583984377, "kl_loss_6": 2036.4578491210937, "learning_rate": 0.00047779402502093696, "loss": 1529.8139, "step": 5190 }, { "ce_loss_12": 3.417772078514099, "ce_loss_17": 3.0711885690689087, "ce_loss_23": 2.9444741368293763, "ce_loss_3": 4.217487633228302, "ce_loss_6": 3.904100239276886, "epoch": 0.52, "grad_norm": 1120.0, "kl_loss_12": 1084.6034698486328, "kl_loss_17": 296.61804046630857, "kl_loss_3": 2687.315478515625, "kl_loss_6": 2068.0660705566406, "learning_rate": 0.0004762090420881289, "loss": 1553.0215, "step": 5200 }, { "ce_loss_12": 3.3423298478126524, "ce_loss_17": 2.992612087726593, "ce_loss_23": 2.865809166431427, "ce_loss_3": 4.1181949257850645, "ce_loss_6": 3.8178300738334654, "epoch": 0.521, "grad_norm": 1104.0, "kl_loss_12": 1091.2071472167968, "kl_loss_17": 296.23180694580077, "kl_loss_3": 2668.9665283203126, "kl_loss_6": 2073.2246154785157, "learning_rate": 0.00047462429873000296, "loss": 1516.2294, "step": 5210 }, { "ce_loss_12": 3.4136949300765993, "ce_loss_17": 3.0739993691444396, "ce_loss_23": 2.944578742980957, "ce_loss_3": 4.207600545883179, "ce_loss_6": 3.8948861360549927, "epoch": 0.522, "grad_norm": 1272.0, "kl_loss_12": 1078.202474975586, "kl_loss_17": 302.248828125, "kl_loss_3": 2676.73427734375, "kl_loss_6": 2065.8719177246094, "learning_rate": 0.0004730398109049071, "loss": 1528.9749, "step": 5220 }, { "ce_loss_12": 3.379810082912445, "ce_loss_17": 3.0066291213035585, "ce_loss_23": 2.868514358997345, "ce_loss_3": 4.210710990428924, "ce_loss_6": 3.89647376537323, "epoch": 0.523, "grad_norm": 1208.0, "kl_loss_12": 1150.9331085205079, "kl_loss_17": 311.6774368286133, "kl_loss_3": 2813.1722778320313, "kl_loss_6": 2192.030694580078, "learning_rate": 0.000471455594568616, "loss": 1577.8427, "step": 5230 }, { "ce_loss_12": 3.407266581058502, "ce_loss_17": 3.062355172634125, "ce_loss_23": 2.939179801940918, "ce_loss_3": 4.188946032524109, "ce_loss_6": 3.8770066261291505, "epoch": 0.524, "grad_norm": 1264.0, "kl_loss_12": 1081.1963348388672, "kl_loss_17": 299.0507553100586, "kl_loss_3": 2653.9161987304688, "kl_loss_6": 2035.7987121582032, "learning_rate": 0.00046987166567417086, "loss": 1547.227, "step": 5240 }, { "ce_loss_12": 3.349161219596863, "ce_loss_17": 2.9991302371025084, "ce_loss_23": 2.8701635241508483, "ce_loss_3": 4.162693417072296, "ce_loss_6": 3.8393788576126098, "epoch": 0.525, "grad_norm": 996.0, "kl_loss_12": 1095.2534759521484, "kl_loss_17": 295.6771751403809, "kl_loss_3": 2718.0575561523438, "kl_loss_6": 2089.1939208984377, "learning_rate": 0.00046828804017171776, "loss": 1512.2344, "step": 5250 }, { "ce_loss_12": 3.4031213760375976, "ce_loss_17": 3.0353243589401244, "ce_loss_23": 2.9000794291496277, "ce_loss_3": 4.230943822860718, "ce_loss_6": 3.912229025363922, "epoch": 0.526, "grad_norm": 1312.0, "kl_loss_12": 1117.801577758789, "kl_loss_17": 304.54825592041016, "kl_loss_3": 2767.648645019531, "kl_loss_6": 2137.6414428710937, "learning_rate": 0.00046670473400834805, "loss": 1583.449, "step": 5260 }, { "ce_loss_12": 3.3178684949874877, "ce_loss_17": 2.9749055981636046, "ce_loss_23": 2.8483084440231323, "ce_loss_3": 4.121661520004272, "ce_loss_6": 3.8078757643699648, "epoch": 0.527, "grad_norm": 1064.0, "kl_loss_12": 1067.9791290283204, "kl_loss_17": 292.6219314575195, "kl_loss_3": 2680.6882202148436, "kl_loss_6": 2062.2185485839846, "learning_rate": 0.00046512176312793734, "loss": 1584.1877, "step": 5270 }, { "ce_loss_12": 3.322924852371216, "ce_loss_17": 2.9707990288734436, "ce_loss_23": 2.8352202653884886, "ce_loss_3": 4.123947286605835, "ce_loss_6": 3.817195165157318, "epoch": 0.528, "grad_norm": 1112.0, "kl_loss_12": 1094.3742126464845, "kl_loss_17": 296.32200927734374, "kl_loss_3": 2715.054821777344, "kl_loss_6": 2107.3768615722656, "learning_rate": 0.00046353914347098467, "loss": 1567.2074, "step": 5280 }, { "ce_loss_12": 3.4210985898971558, "ce_loss_17": 3.067178690433502, "ce_loss_23": 2.9379127740859987, "ce_loss_3": 4.217587947845459, "ce_loss_6": 3.9102158188819884, "epoch": 0.529, "grad_norm": 1048.0, "kl_loss_12": 1083.2941864013671, "kl_loss_17": 293.2905502319336, "kl_loss_3": 2691.60302734375, "kl_loss_6": 2081.5444458007814, "learning_rate": 0.0004619568909744524, "loss": 1570.5784, "step": 5290 }, { "ce_loss_12": 3.4063668847084045, "ce_loss_17": 3.06554034948349, "ce_loss_23": 2.939655864238739, "ce_loss_3": 4.199694049358368, "ce_loss_6": 3.88633279800415, "epoch": 0.53, "grad_norm": 1344.0, "kl_loss_12": 1087.9134185791015, "kl_loss_17": 297.81110229492185, "kl_loss_3": 2678.2871826171877, "kl_loss_6": 2065.655108642578, "learning_rate": 0.00046037502157160573, "loss": 1556.4295, "step": 5300 }, { "ce_loss_12": 3.3119312047958376, "ce_loss_17": 2.9507867217063906, "ce_loss_23": 2.8170288801193237, "ce_loss_3": 4.108970201015472, "ce_loss_6": 3.799880337715149, "epoch": 0.531, "grad_norm": 1048.0, "kl_loss_12": 1108.8056365966797, "kl_loss_17": 303.1284881591797, "kl_loss_3": 2711.242626953125, "kl_loss_6": 2104.8263671875, "learning_rate": 0.00045879355119185207, "loss": 1567.8275, "step": 5310 }, { "ce_loss_12": 3.3983437180519105, "ce_loss_17": 3.0327823519706727, "ce_loss_23": 2.9004740595817564, "ce_loss_3": 4.212223029136657, "ce_loss_6": 3.8889464497566224, "epoch": 0.532, "grad_norm": 1152.0, "kl_loss_12": 1144.2750122070313, "kl_loss_17": 307.9484588623047, "kl_loss_3": 2772.0366333007814, "kl_loss_6": 2136.8219482421873, "learning_rate": 0.0004572124957605803, "loss": 1590.5291, "step": 5320 }, { "ce_loss_12": 3.4015161991119385, "ce_loss_17": 3.0423429012298584, "ce_loss_23": 2.906565010547638, "ce_loss_3": 4.199693059921264, "ce_loss_6": 3.8853312730789185, "epoch": 0.533, "grad_norm": 1136.0, "kl_loss_12": 1112.8039581298829, "kl_loss_17": 303.7584228515625, "kl_loss_3": 2726.5053955078124, "kl_loss_6": 2097.246826171875, "learning_rate": 0.00045563187119900103, "loss": 1542.5536, "step": 5330 }, { "ce_loss_12": 3.2540330410003664, "ce_loss_17": 2.89544860124588, "ce_loss_23": 2.768121588230133, "ce_loss_3": 4.084176480770111, "ce_loss_6": 3.7669583201408385, "epoch": 0.534, "grad_norm": 1104.0, "kl_loss_12": 1115.6181274414062, "kl_loss_17": 299.1415328979492, "kl_loss_3": 2771.7518310546875, "kl_loss_6": 2149.6638732910155, "learning_rate": 0.00045405169342398633, "loss": 1576.5603, "step": 5340 }, { "ce_loss_12": 3.349810791015625, "ce_loss_17": 2.9884414792060854, "ce_loss_23": 2.8516027212142943, "ce_loss_3": 4.166842401027679, "ce_loss_6": 3.8521859884262084, "epoch": 0.535, "grad_norm": 1288.0, "kl_loss_12": 1131.2634643554688, "kl_loss_17": 310.653182220459, "kl_loss_3": 2767.4667846679686, "kl_loss_6": 2154.172009277344, "learning_rate": 0.0004524719783479088, "loss": 1551.4629, "step": 5350 }, { "ce_loss_12": 3.308687424659729, "ce_loss_17": 2.9389654994010925, "ce_loss_23": 2.8066301941871643, "ce_loss_3": 4.13761465549469, "ce_loss_6": 3.8201239943504333, "epoch": 0.536, "grad_norm": 1080.0, "kl_loss_12": 1133.4922302246093, "kl_loss_17": 305.0677856445312, "kl_loss_3": 2793.434191894531, "kl_loss_6": 2159.982745361328, "learning_rate": 0.00045089274187848144, "loss": 1552.6938, "step": 5360 }, { "ce_loss_12": 3.3914544343948365, "ce_loss_17": 3.0516345858573914, "ce_loss_23": 2.925180435180664, "ce_loss_3": 4.187878561019898, "ce_loss_6": 3.8809223055839537, "epoch": 0.537, "grad_norm": 940.0, "kl_loss_12": 1087.690396118164, "kl_loss_17": 295.78431701660156, "kl_loss_3": 2695.1560668945312, "kl_loss_6": 2088.261999511719, "learning_rate": 0.00044931399991859835, "loss": 1535.7311, "step": 5370 }, { "ce_loss_12": 3.263565754890442, "ce_loss_17": 2.911922574043274, "ce_loss_23": 2.7829471707344053, "ce_loss_3": 4.075548827648163, "ce_loss_6": 3.754881238937378, "epoch": 0.538, "grad_norm": 1096.0, "kl_loss_12": 1100.927456665039, "kl_loss_17": 295.7045013427734, "kl_loss_3": 2745.293347167969, "kl_loss_6": 2104.372979736328, "learning_rate": 0.00044773576836617336, "loss": 1537.6268, "step": 5380 }, { "ce_loss_12": 3.3768471837043763, "ce_loss_17": 3.005662715435028, "ce_loss_23": 2.875761556625366, "ce_loss_3": 4.179002559185028, "ce_loss_6": 3.868990218639374, "epoch": 0.539, "grad_norm": 1080.0, "kl_loss_12": 1134.3475219726563, "kl_loss_17": 303.7232040405273, "kl_loss_3": 2747.5085571289064, "kl_loss_6": 2136.5920349121093, "learning_rate": 0.00044615806311398056, "loss": 1599.1203, "step": 5390 }, { "ce_loss_12": 3.396114432811737, "ce_loss_17": 3.074923348426819, "ce_loss_23": 2.9501069307327272, "ce_loss_3": 4.153258454799652, "ce_loss_6": 3.8466654300689695, "epoch": 0.54, "grad_norm": 1016.0, "kl_loss_12": 1052.32314453125, "kl_loss_17": 289.0661148071289, "kl_loss_3": 2593.190881347656, "kl_loss_6": 1993.3581787109374, "learning_rate": 0.00044458090004949454, "loss": 1539.339, "step": 5400 }, { "ce_loss_12": 3.3335826635360717, "ce_loss_17": 2.951065015792847, "ce_loss_23": 2.8089526534080504, "ce_loss_3": 4.174290323257447, "ce_loss_6": 3.8532066226005552, "epoch": 0.541, "grad_norm": 964.0, "kl_loss_12": 1162.954409790039, "kl_loss_17": 317.13980560302736, "kl_loss_3": 2863.15234375, "kl_loss_6": 2221.428271484375, "learning_rate": 0.0004430042950547297, "loss": 1576.8596, "step": 5410 }, { "ce_loss_12": 3.396577227115631, "ce_loss_17": 3.033453941345215, "ce_loss_23": 2.8963638305664063, "ce_loss_3": 4.211390352249145, "ce_loss_6": 3.8926366686820986, "epoch": 0.542, "grad_norm": 1104.0, "kl_loss_12": 1130.787789916992, "kl_loss_17": 312.94815216064455, "kl_loss_3": 2775.302355957031, "kl_loss_6": 2144.0336853027343, "learning_rate": 0.0004414282640060809, "loss": 1563.72, "step": 5420 }, { "ce_loss_12": 3.461113154888153, "ce_loss_17": 3.1103001713752745, "ce_loss_23": 2.978768992424011, "ce_loss_3": 4.233858203887939, "ce_loss_6": 3.9294100046157836, "epoch": 0.543, "grad_norm": 1224.0, "kl_loss_12": 1085.9322082519532, "kl_loss_17": 298.91674041748047, "kl_loss_3": 2630.5422973632812, "kl_loss_6": 2034.4115905761719, "learning_rate": 0.0004398528227741633, "loss": 1532.4951, "step": 5430 }, { "ce_loss_12": 3.3482273697853087, "ce_loss_17": 2.990015757083893, "ce_loss_23": 2.857475447654724, "ce_loss_3": 4.1610354542732235, "ce_loss_6": 3.8469629645347596, "epoch": 0.544, "grad_norm": 1192.0, "kl_loss_12": 1108.1345916748046, "kl_loss_17": 309.11339111328124, "kl_loss_3": 2719.430615234375, "kl_loss_6": 2098.4219177246096, "learning_rate": 0.00043827798722365264, "loss": 1571.2654, "step": 5440 }, { "ce_loss_12": 3.436587190628052, "ce_loss_17": 3.0982702732086183, "ce_loss_23": 2.97260959148407, "ce_loss_3": 4.212341475486755, "ce_loss_6": 3.909819185733795, "epoch": 0.545, "grad_norm": 1120.0, "kl_loss_12": 1080.8670532226563, "kl_loss_17": 298.4113143920898, "kl_loss_3": 2648.5758544921873, "kl_loss_6": 2044.9892578125, "learning_rate": 0.00043670377321312535, "loss": 1519.2725, "step": 5450 }, { "ce_loss_12": 3.437877380847931, "ce_loss_17": 3.1039116382598877, "ce_loss_23": 2.9818585395812987, "ce_loss_3": 4.211915624141693, "ce_loss_6": 3.904364216327667, "epoch": 0.546, "grad_norm": 1072.0, "kl_loss_12": 1069.5538299560546, "kl_loss_17": 291.2031845092773, "kl_loss_3": 2638.0355224609375, "kl_loss_6": 2021.8220092773438, "learning_rate": 0.0004351301965948991, "loss": 1536.6615, "step": 5460 }, { "ce_loss_12": 3.3516125679016113, "ce_loss_17": 3.0189313173294066, "ce_loss_23": 2.8931252121925355, "ce_loss_3": 4.143655979633332, "ce_loss_6": 3.828027617931366, "epoch": 0.547, "grad_norm": 1272.0, "kl_loss_12": 1059.2278198242188, "kl_loss_17": 289.7262649536133, "kl_loss_3": 2646.740625, "kl_loss_6": 2029.8422607421876, "learning_rate": 0.000433557273214873, "loss": 1524.926, "step": 5470 }, { "ce_loss_12": 3.354721283912659, "ce_loss_17": 3.0130093812942507, "ce_loss_23": 2.8745198965072634, "ce_loss_3": 4.152052211761474, "ce_loss_6": 3.8364554405212403, "epoch": 0.548, "grad_norm": 964.0, "kl_loss_12": 1084.157943725586, "kl_loss_17": 300.56928100585935, "kl_loss_3": 2678.8192504882813, "kl_loss_6": 2063.3562622070312, "learning_rate": 0.000431985018912368, "loss": 1519.3572, "step": 5480 }, { "ce_loss_12": 3.3481543898582458, "ce_loss_17": 2.9807775378227235, "ce_loss_23": 2.849691128730774, "ce_loss_3": 4.166516637802124, "ce_loss_6": 3.854253661632538, "epoch": 0.549, "grad_norm": 1352.0, "kl_loss_12": 1111.0652160644531, "kl_loss_17": 300.24488677978513, "kl_loss_3": 2764.147900390625, "kl_loss_6": 2148.22568359375, "learning_rate": 0.0004304134495199674, "loss": 1530.8162, "step": 5490 }, { "ce_loss_12": 3.375580167770386, "ce_loss_17": 3.0166457295417786, "ce_loss_23": 2.8813437461853026, "ce_loss_3": 4.167650175094605, "ce_loss_6": 3.8590697288513183, "epoch": 0.55, "grad_norm": 972.0, "kl_loss_12": 1129.8497314453125, "kl_loss_17": 304.87682189941404, "kl_loss_3": 2749.120849609375, "kl_loss_6": 2130.23251953125, "learning_rate": 0.0004288425808633575, "loss": 1552.1418, "step": 5500 }, { "ce_loss_12": 3.338499128818512, "ce_loss_17": 2.991779828071594, "ce_loss_23": 2.8615185499191282, "ce_loss_3": 4.142512774467468, "ce_loss_6": 3.8318737506866456, "epoch": 0.551, "grad_norm": 1128.0, "kl_loss_12": 1092.0799499511718, "kl_loss_17": 295.32262191772463, "kl_loss_3": 2709.7896728515625, "kl_loss_6": 2087.308660888672, "learning_rate": 0.0004272724287611684, "loss": 1550.8097, "step": 5510 }, { "ce_loss_12": 3.3272794246673585, "ce_loss_17": 2.9705721020698546, "ce_loss_23": 2.840547788143158, "ce_loss_3": 4.144940996170044, "ce_loss_6": 3.8349496006965635, "epoch": 0.552, "grad_norm": 1024.0, "kl_loss_12": 1108.8988861083985, "kl_loss_17": 299.7927978515625, "kl_loss_3": 2759.7510986328125, "kl_loss_6": 2144.3437255859376, "learning_rate": 0.00042570300902481425, "loss": 1561.2407, "step": 5520 }, { "ce_loss_12": 3.3361517786979675, "ce_loss_17": 2.996741271018982, "ce_loss_23": 2.869708800315857, "ce_loss_3": 4.126246166229248, "ce_loss_6": 3.8227033257484435, "epoch": 0.553, "grad_norm": 1168.0, "kl_loss_12": 1082.9974700927735, "kl_loss_17": 294.6476860046387, "kl_loss_3": 2682.4309326171874, "kl_loss_6": 2074.345251464844, "learning_rate": 0.00042413433745833423, "loss": 1533.8751, "step": 5530 }, { "ce_loss_12": 3.3447516441345213, "ce_loss_17": 2.9871742129325867, "ce_loss_23": 2.8585495829582213, "ce_loss_3": 4.156840574741364, "ce_loss_6": 3.842957389354706, "epoch": 0.554, "grad_norm": 964.0, "kl_loss_12": 1104.5059844970704, "kl_loss_17": 297.8919593811035, "kl_loss_3": 2739.675, "kl_loss_6": 2113.0805847167967, "learning_rate": 0.0004225664298582339, "loss": 1513.3228, "step": 5540 }, { "ce_loss_12": 3.413621115684509, "ce_loss_17": 3.0707615494728087, "ce_loss_23": 2.9419044971466066, "ce_loss_3": 4.194462668895722, "ce_loss_6": 3.888316106796265, "epoch": 0.555, "grad_norm": 1184.0, "kl_loss_12": 1069.3179565429687, "kl_loss_17": 291.9802551269531, "kl_loss_3": 2635.356201171875, "kl_loss_6": 2029.8074829101563, "learning_rate": 0.000420999302013325, "loss": 1507.5443, "step": 5550 }, { "ce_loss_12": 3.346064102649689, "ce_loss_17": 2.9775984048843385, "ce_loss_23": 2.8422580122947694, "ce_loss_3": 4.203102040290832, "ce_loss_6": 3.8697832822799683, "epoch": 0.556, "grad_norm": 1136.0, "kl_loss_12": 1128.6266021728516, "kl_loss_17": 312.0568115234375, "kl_loss_3": 2840.6982421875, "kl_loss_6": 2181.8872680664062, "learning_rate": 0.000419432969704568, "loss": 1557.96, "step": 5560 }, { "ce_loss_12": 3.357046055793762, "ce_loss_17": 3.008342170715332, "ce_loss_23": 2.8809687376022337, "ce_loss_3": 4.14723539352417, "ce_loss_6": 3.8425039291381835, "epoch": 0.557, "grad_norm": 1024.0, "kl_loss_12": 1081.215493774414, "kl_loss_17": 295.9071792602539, "kl_loss_3": 2669.309045410156, "kl_loss_6": 2059.5223022460937, "learning_rate": 0.00041786744870491154, "loss": 1568.9957, "step": 5570 }, { "ce_loss_12": 3.3172632813453675, "ce_loss_17": 2.955425262451172, "ce_loss_23": 2.821260964870453, "ce_loss_3": 4.119695484638214, "ce_loss_6": 3.806350862979889, "epoch": 0.558, "grad_norm": 828.0, "kl_loss_12": 1118.678955078125, "kl_loss_17": 302.5011444091797, "kl_loss_3": 2731.080847167969, "kl_loss_6": 2115.9034118652344, "learning_rate": 0.0004163027547791347, "loss": 1544.0346, "step": 5580 }, { "ce_loss_12": 3.309348404407501, "ce_loss_17": 2.947772943973541, "ce_loss_23": 2.814731788635254, "ce_loss_3": 4.156531059741974, "ce_loss_6": 3.8248415350914002, "epoch": 0.559, "grad_norm": 1240.0, "kl_loss_12": 1114.9301391601562, "kl_loss_17": 301.8474456787109, "kl_loss_3": 2811.576452636719, "kl_loss_6": 2152.8734436035156, "learning_rate": 0.0004147389036836881, "loss": 1565.2141, "step": 5590 }, { "ce_loss_12": 3.3516384840011595, "ce_loss_17": 2.987208640575409, "ce_loss_23": 2.8559449911117554, "ce_loss_3": 4.1557383179664615, "ce_loss_6": 3.838567817211151, "epoch": 0.56, "grad_norm": 1152.0, "kl_loss_12": 1114.13662109375, "kl_loss_17": 300.4609680175781, "kl_loss_3": 2724.2331665039064, "kl_loss_6": 2099.025762939453, "learning_rate": 0.00041317591116653486, "loss": 1581.426, "step": 5600 }, { "ce_loss_12": 3.375890648365021, "ce_loss_17": 3.016210114955902, "ce_loss_23": 2.8871567368507387, "ce_loss_3": 4.192247068881988, "ce_loss_6": 3.8769724011421203, "epoch": 0.561, "grad_norm": 868.0, "kl_loss_12": 1118.3572082519531, "kl_loss_17": 306.61693572998047, "kl_loss_3": 2753.4271606445313, "kl_loss_6": 2124.9645141601563, "learning_rate": 0.0004116137929669921, "loss": 1550.451, "step": 5610 }, { "ce_loss_12": 3.370885908603668, "ce_loss_17": 3.0194214582443237, "ce_loss_23": 2.886621868610382, "ce_loss_3": 4.162791633605957, "ce_loss_6": 3.8525096774101257, "epoch": 0.562, "grad_norm": 984.0, "kl_loss_12": 1096.9350799560548, "kl_loss_17": 294.7433792114258, "kl_loss_3": 2698.144323730469, "kl_loss_6": 2083.495526123047, "learning_rate": 0.00041005256481557305, "loss": 1525.6439, "step": 5620 }, { "ce_loss_12": 3.4280372262001038, "ce_loss_17": 3.095071315765381, "ce_loss_23": 2.9774329543113707, "ce_loss_3": 4.1962571144104, "ce_loss_6": 3.8930899143218993, "epoch": 0.563, "grad_norm": 976.0, "kl_loss_12": 1045.3597290039063, "kl_loss_17": 284.91689529418943, "kl_loss_3": 2594.1365478515627, "kl_loss_6": 1992.5528503417968, "learning_rate": 0.00040849224243382767, "loss": 1503.5002, "step": 5630 }, { "ce_loss_12": 3.322995662689209, "ce_loss_17": 2.9679643034935, "ce_loss_23": 2.8381430983543394, "ce_loss_3": 4.143428933620453, "ce_loss_6": 3.8188948750495912, "epoch": 0.564, "grad_norm": 1144.0, "kl_loss_12": 1107.2206665039062, "kl_loss_17": 296.03601531982423, "kl_loss_3": 2743.397814941406, "kl_loss_6": 2105.9764099121094, "learning_rate": 0.000406932841534185, "loss": 1529.3828, "step": 5640 }, { "ce_loss_12": 3.2935484528541563, "ce_loss_17": 2.938543951511383, "ce_loss_23": 2.8072848439216616, "ce_loss_3": 4.1067038655281065, "ce_loss_6": 3.789441239833832, "epoch": 0.565, "grad_norm": 1280.0, "kl_loss_12": 1104.0531219482423, "kl_loss_17": 301.8814224243164, "kl_loss_3": 2737.7357543945313, "kl_loss_6": 2116.7823181152344, "learning_rate": 0.0004053743778197951, "loss": 1591.8746, "step": 5650 }, { "ce_loss_12": 3.391418957710266, "ce_loss_17": 3.033774769306183, "ce_loss_23": 2.9061689734458924, "ce_loss_3": 4.182014131546021, "ce_loss_6": 3.885739576816559, "epoch": 0.566, "grad_norm": 984.0, "kl_loss_12": 1107.833056640625, "kl_loss_17": 303.8883712768555, "kl_loss_3": 2684.518017578125, "kl_loss_6": 2084.1996826171876, "learning_rate": 0.0004038168669843697, "loss": 1568.2076, "step": 5660 }, { "ce_loss_12": 3.3283536434173584, "ce_loss_17": 2.98902086019516, "ce_loss_23": 2.8606817722320557, "ce_loss_3": 4.110491549968719, "ce_loss_6": 3.8071823596954344, "epoch": 0.567, "grad_norm": 1144.0, "kl_loss_12": 1078.9445007324218, "kl_loss_17": 296.4503967285156, "kl_loss_3": 2661.1900146484377, "kl_loss_6": 2052.0313110351562, "learning_rate": 0.000402260324712026, "loss": 1558.9467, "step": 5670 }, { "ce_loss_12": 3.3894649267196657, "ce_loss_17": 3.0279223918914795, "ce_loss_23": 2.903757655620575, "ce_loss_3": 4.205927729606628, "ce_loss_6": 3.8925053119659423, "epoch": 0.568, "grad_norm": 1080.0, "kl_loss_12": 1102.922415161133, "kl_loss_17": 291.1776184082031, "kl_loss_3": 2740.9990234375, "kl_loss_6": 2123.743518066406, "learning_rate": 0.00040070476667712743, "loss": 1536.7454, "step": 5680 }, { "ce_loss_12": 3.401654672622681, "ce_loss_17": 3.0565669178962707, "ce_loss_23": 2.927121162414551, "ce_loss_3": 4.19632499217987, "ce_loss_6": 3.8915048956871034, "epoch": 0.569, "grad_norm": 988.0, "kl_loss_12": 1081.9984588623047, "kl_loss_17": 295.2158767700195, "kl_loss_3": 2684.343957519531, "kl_loss_6": 2082.0970153808594, "learning_rate": 0.0003991502085441259, "loss": 1547.1768, "step": 5690 }, { "ce_loss_12": 3.4185086369514464, "ce_loss_17": 3.090511643886566, "ce_loss_23": 2.966333067417145, "ce_loss_3": 4.192823874950409, "ce_loss_6": 3.8864670515060427, "epoch": 0.57, "grad_norm": 1704.0, "kl_loss_12": 1051.5930419921874, "kl_loss_17": 288.72295532226565, "kl_loss_3": 2605.3867065429686, "kl_loss_6": 2003.20048828125, "learning_rate": 0.0003975966659674047, "loss": 1528.4219, "step": 5700 }, { "ce_loss_12": 3.411479675769806, "ce_loss_17": 3.0676485657691956, "ce_loss_23": 2.9371942400932314, "ce_loss_3": 4.217548334598542, "ce_loss_6": 3.8981226801872255, "epoch": 0.571, "grad_norm": 1040.0, "kl_loss_12": 1084.3829620361328, "kl_loss_17": 297.86733169555663, "kl_loss_3": 2690.292272949219, "kl_loss_6": 2069.090856933594, "learning_rate": 0.0003960441545911204, "loss": 1523.9072, "step": 5710 }, { "ce_loss_12": 3.3867371439933778, "ce_loss_17": 3.0464473128318788, "ce_loss_23": 2.924866199493408, "ce_loss_3": 4.1862914800643924, "ce_loss_6": 3.866020941734314, "epoch": 0.572, "grad_norm": 1008.0, "kl_loss_12": 1086.786917114258, "kl_loss_17": 293.1391548156738, "kl_loss_3": 2695.4630493164063, "kl_loss_6": 2060.2872924804688, "learning_rate": 0.0003944926900490452, "loss": 1528.1995, "step": 5720 }, { "ce_loss_12": 3.3431777358055115, "ce_loss_17": 2.981088709831238, "ce_loss_23": 2.847019040584564, "ce_loss_3": 4.164076113700867, "ce_loss_6": 3.845302677154541, "epoch": 0.573, "grad_norm": 1120.0, "kl_loss_12": 1116.8780670166016, "kl_loss_17": 300.9324645996094, "kl_loss_3": 2760.368933105469, "kl_loss_6": 2132.633331298828, "learning_rate": 0.0003929422879644099, "loss": 1547.4666, "step": 5730 }, { "ce_loss_12": 3.316387987136841, "ce_loss_17": 2.9817095041275024, "ce_loss_23": 2.8599035263061525, "ce_loss_3": 4.11018670797348, "ce_loss_6": 3.802216875553131, "epoch": 0.574, "grad_norm": 952.0, "kl_loss_12": 1063.2005157470703, "kl_loss_17": 289.11716537475587, "kl_loss_3": 2670.6744995117188, "kl_loss_6": 2049.6060180664062, "learning_rate": 0.0003913929639497462, "loss": 1492.7935, "step": 5740 }, { "ce_loss_12": 3.2893279433250426, "ce_loss_17": 2.9399860501289368, "ce_loss_23": 2.8129002928733824, "ce_loss_3": 4.120655429363251, "ce_loss_6": 3.805050051212311, "epoch": 0.575, "grad_norm": 1144.0, "kl_loss_12": 1088.0894744873046, "kl_loss_17": 292.67746505737307, "kl_loss_3": 2748.174462890625, "kl_loss_6": 2129.8866577148438, "learning_rate": 0.00038984473360672965, "loss": 1527.1544, "step": 5750 }, { "ce_loss_12": 3.3022054076194762, "ce_loss_17": 2.9506754994392397, "ce_loss_23": 2.822495758533478, "ce_loss_3": 4.120715641975403, "ce_loss_6": 3.8015459895133974, "epoch": 0.576, "grad_norm": 1072.0, "kl_loss_12": 1087.9370819091796, "kl_loss_17": 292.52199630737306, "kl_loss_3": 2732.5151489257814, "kl_loss_6": 2102.5725280761717, "learning_rate": 0.0003882976125260229, "loss": 1520.1432, "step": 5760 }, { "ce_loss_12": 3.3552067637443543, "ce_loss_17": 3.0094610452651978, "ce_loss_23": 2.87836571931839, "ce_loss_3": 4.162405633926392, "ce_loss_6": 3.8583031415939333, "epoch": 0.577, "grad_norm": 1168.0, "kl_loss_12": 1082.5192932128907, "kl_loss_17": 295.40897369384766, "kl_loss_3": 2708.016064453125, "kl_loss_6": 2094.8595703125, "learning_rate": 0.00038675161628711776, "loss": 1544.6213, "step": 5770 }, { "ce_loss_12": 3.385429286956787, "ce_loss_17": 3.0453956604003904, "ce_loss_23": 2.9156638741493226, "ce_loss_3": 4.1767893671989444, "ce_loss_6": 3.860312449932098, "epoch": 0.578, "grad_norm": 1088.0, "kl_loss_12": 1070.3292022705077, "kl_loss_17": 294.337052154541, "kl_loss_3": 2652.4936645507814, "kl_loss_6": 2038.156591796875, "learning_rate": 0.0003852067604581794, "loss": 1563.9108, "step": 5780 }, { "ce_loss_12": 3.3374809861183166, "ce_loss_17": 2.9879353404045106, "ce_loss_23": 2.8669142842292787, "ce_loss_3": 4.147375965118409, "ce_loss_6": 3.835199749469757, "epoch": 0.579, "grad_norm": 1144.0, "kl_loss_12": 1090.973321533203, "kl_loss_17": 289.15511627197264, "kl_loss_3": 2724.677099609375, "kl_loss_6": 2117.0361877441405, "learning_rate": 0.0003836630605958888, "loss": 1532.849, "step": 5790 }, { "ce_loss_12": 3.3952765345573424, "ce_loss_17": 3.048290753364563, "ce_loss_23": 2.9225462913513183, "ce_loss_3": 4.181715559959412, "ce_loss_6": 3.877750539779663, "epoch": 0.58, "grad_norm": 1376.0, "kl_loss_12": 1092.2751861572265, "kl_loss_17": 293.92593612670896, "kl_loss_3": 2685.8295288085938, "kl_loss_6": 2082.6134765625, "learning_rate": 0.0003821205322452863, "loss": 1585.8538, "step": 5800 }, { "ce_loss_12": 3.372221601009369, "ce_loss_17": 3.0282418727874756, "ce_loss_23": 2.9078637599945067, "ce_loss_3": 4.161537182331085, "ce_loss_6": 3.854393744468689, "epoch": 0.581, "grad_norm": 1152.0, "kl_loss_12": 1074.3501007080079, "kl_loss_17": 288.7527191162109, "kl_loss_3": 2669.658166503906, "kl_loss_6": 2063.708355712891, "learning_rate": 0.0003805791909396155, "loss": 1528.6066, "step": 5810 }, { "ce_loss_12": 3.33404426574707, "ce_loss_17": 2.97887327671051, "ce_loss_23": 2.8551398634910585, "ce_loss_3": 4.130940783023834, "ce_loss_6": 3.823299324512482, "epoch": 0.582, "grad_norm": 1208.0, "kl_loss_12": 1079.6981231689454, "kl_loss_17": 288.92149887084963, "kl_loss_3": 2689.42265625, "kl_loss_6": 2081.0817321777345, "learning_rate": 0.0003790390522001662, "loss": 1541.9953, "step": 5820 }, { "ce_loss_12": 3.2722179651260377, "ce_loss_17": 2.9275118470191956, "ce_loss_23": 2.8062780618667604, "ce_loss_3": 4.0827684044837955, "ce_loss_6": 3.774982988834381, "epoch": 0.583, "grad_norm": 908.0, "kl_loss_12": 1081.150732421875, "kl_loss_17": 287.35875778198243, "kl_loss_3": 2712.231884765625, "kl_loss_6": 2095.866046142578, "learning_rate": 0.0003775001315361183, "loss": 1519.208, "step": 5830 }, { "ce_loss_12": 3.374925982952118, "ce_loss_17": 3.0226351737976076, "ce_loss_23": 2.891891610622406, "ce_loss_3": 4.189905488491059, "ce_loss_6": 3.868648278713226, "epoch": 0.584, "grad_norm": 1264.0, "kl_loss_12": 1097.4136840820313, "kl_loss_17": 297.0824569702148, "kl_loss_3": 2729.2060546875, "kl_loss_6": 2104.550671386719, "learning_rate": 0.0003759624444443858, "loss": 1549.6725, "step": 5840 }, { "ce_loss_12": 3.3894322991371153, "ce_loss_17": 3.0529601216316222, "ce_loss_23": 2.9311428904533385, "ce_loss_3": 4.178544974327087, "ce_loss_6": 3.8757920384407045, "epoch": 0.585, "grad_norm": 1264.0, "kl_loss_12": 1068.644955444336, "kl_loss_17": 287.98411254882814, "kl_loss_3": 2671.1638671875, "kl_loss_6": 2065.733996582031, "learning_rate": 0.00037442600640946044, "loss": 1513.0081, "step": 5850 }, { "ce_loss_12": 3.3614978075027464, "ce_loss_17": 3.022676134109497, "ce_loss_23": 2.89876549243927, "ce_loss_3": 4.150962948799133, "ce_loss_6": 3.8397781610488892, "epoch": 0.586, "grad_norm": 1096.0, "kl_loss_12": 1074.1620147705078, "kl_loss_17": 291.1872756958008, "kl_loss_3": 2674.4105712890623, "kl_loss_6": 2057.2900573730467, "learning_rate": 0.00037289083290325663, "loss": 1500.6721, "step": 5860 }, { "ce_loss_12": 3.3385061502456663, "ce_loss_17": 2.9948259949684144, "ce_loss_23": 2.867872619628906, "ce_loss_3": 4.128155767917633, "ce_loss_6": 3.8193222999572756, "epoch": 0.587, "grad_norm": 1136.0, "kl_loss_12": 1057.0669647216796, "kl_loss_17": 289.5401077270508, "kl_loss_3": 2640.21787109375, "kl_loss_6": 2028.2294311523438, "learning_rate": 0.0003713569393849543, "loss": 1510.5493, "step": 5870 }, { "ce_loss_12": 3.3919139862060548, "ce_loss_17": 3.051191568374634, "ce_loss_23": 2.9242817759513855, "ce_loss_3": 4.1811758518219, "ce_loss_6": 3.875462770462036, "epoch": 0.588, "grad_norm": 1128.0, "kl_loss_12": 1079.9062591552733, "kl_loss_17": 290.8986526489258, "kl_loss_3": 2669.1223266601564, "kl_loss_6": 2065.6935485839845, "learning_rate": 0.00036982434130084397, "loss": 1528.3336, "step": 5880 }, { "ce_loss_12": 3.319986581802368, "ce_loss_17": 2.9722229123115538, "ce_loss_23": 2.8413925766944885, "ce_loss_3": 4.113067412376404, "ce_loss_6": 3.7913942337036133, "epoch": 0.589, "grad_norm": 1008.0, "kl_loss_12": 1089.219302368164, "kl_loss_17": 301.8997085571289, "kl_loss_3": 2681.680615234375, "kl_loss_6": 2054.2447814941406, "learning_rate": 0.00036829305408417166, "loss": 1546.4316, "step": 5890 }, { "ce_loss_12": 3.3140172004699706, "ce_loss_17": 2.9589925050735473, "ce_loss_23": 2.8260411262512206, "ce_loss_3": 4.134636390209198, "ce_loss_6": 3.8160529017448424, "epoch": 0.59, "grad_norm": 964.0, "kl_loss_12": 1104.7077758789062, "kl_loss_17": 300.90020446777345, "kl_loss_3": 2747.9242553710938, "kl_loss_6": 2121.038653564453, "learning_rate": 0.0003667630931549826, "loss": 1544.2424, "step": 5900 }, { "ce_loss_12": 3.2931908488273622, "ce_loss_17": 2.9319966673851012, "ce_loss_23": 2.8010067105293275, "ce_loss_3": 4.142383825778961, "ce_loss_6": 3.8219820141792296, "epoch": 0.591, "grad_norm": 1536.0, "kl_loss_12": 1120.0817626953126, "kl_loss_17": 296.99367065429686, "kl_loss_3": 2830.34755859375, "kl_loss_6": 2194.107342529297, "learning_rate": 0.00036523447391996613, "loss": 1573.1694, "step": 5910 }, { "ce_loss_12": 3.3541221261024474, "ce_loss_17": 3.0086112022399902, "ce_loss_23": 2.8847101092338563, "ce_loss_3": 4.145113706588745, "ce_loss_6": 3.8366657137870788, "epoch": 0.592, "grad_norm": 1160.0, "kl_loss_12": 1060.8351257324218, "kl_loss_17": 284.69307098388674, "kl_loss_3": 2644.6882934570312, "kl_loss_6": 2042.1166931152343, "learning_rate": 0.00036370721177230114, "loss": 1507.935, "step": 5920 }, { "ce_loss_12": 3.3653964400291443, "ce_loss_17": 3.015224003791809, "ce_loss_23": 2.8863983511924745, "ce_loss_3": 4.187268912792206, "ce_loss_6": 3.864014434814453, "epoch": 0.593, "grad_norm": 1064.0, "kl_loss_12": 1098.9212554931642, "kl_loss_17": 301.37811279296875, "kl_loss_3": 2741.749865722656, "kl_loss_6": 2103.893975830078, "learning_rate": 0.00036218132209150044, "loss": 1543.1186, "step": 5930 }, { "ce_loss_12": 3.3391342997550963, "ce_loss_17": 2.9664016723632813, "ce_loss_23": 2.832013189792633, "ce_loss_3": 4.177560043334961, "ce_loss_6": 3.8474175453186037, "epoch": 0.594, "grad_norm": 944.0, "kl_loss_12": 1139.9402313232422, "kl_loss_17": 309.586328125, "kl_loss_3": 2825.787255859375, "kl_loss_6": 2181.953405761719, "learning_rate": 0.0003606568202432562, "loss": 1574.9494, "step": 5940 }, { "ce_loss_12": 3.393070602416992, "ce_loss_17": 3.0422707915306093, "ce_loss_23": 2.9140780329704286, "ce_loss_3": 4.2164586067199705, "ce_loss_6": 3.901226615905762, "epoch": 0.595, "grad_norm": 1200.0, "kl_loss_12": 1117.1515686035157, "kl_loss_17": 299.17299728393556, "kl_loss_3": 2776.9905517578127, "kl_loss_6": 2153.523962402344, "learning_rate": 0.0003591337215792851, "loss": 1532.3694, "step": 5950 }, { "ce_loss_12": 3.402258062362671, "ce_loss_17": 3.064120590686798, "ce_loss_23": 2.9467490553855895, "ce_loss_3": 4.156550872325897, "ce_loss_6": 3.868350923061371, "epoch": 0.596, "grad_norm": 964.0, "kl_loss_12": 1057.8695495605468, "kl_loss_17": 279.5248458862305, "kl_loss_3": 2594.1113037109376, "kl_loss_6": 2025.8124206542968, "learning_rate": 0.00035761204143717383, "loss": 1517.4082, "step": 5960 }, { "ce_loss_12": 3.369770860671997, "ce_loss_17": 3.0239328026771544, "ce_loss_23": 2.8976298213005065, "ce_loss_3": 4.161876916885376, "ce_loss_6": 3.8617225527763366, "epoch": 0.597, "grad_norm": 1064.0, "kl_loss_12": 1086.6250274658203, "kl_loss_17": 296.2882781982422, "kl_loss_3": 2684.7556884765627, "kl_loss_6": 2089.285284423828, "learning_rate": 0.0003560917951402245, "loss": 1572.7229, "step": 5970 }, { "ce_loss_12": 3.3533798694610595, "ce_loss_17": 3.0037330985069275, "ce_loss_23": 2.8826130986213685, "ce_loss_3": 4.144544363021851, "ce_loss_6": 3.841200351715088, "epoch": 0.598, "grad_norm": 1304.0, "kl_loss_12": 1080.194351196289, "kl_loss_17": 288.0468276977539, "kl_loss_3": 2674.7138916015624, "kl_loss_6": 2076.2087158203126, "learning_rate": 0.00035457299799730046, "loss": 1528.053, "step": 5980 }, { "ce_loss_12": 3.4121166348457335, "ce_loss_17": 3.0635475873947144, "ce_loss_23": 2.939061403274536, "ce_loss_3": 4.191158545017243, "ce_loss_6": 3.8858134865760805, "epoch": 0.599, "grad_norm": 1056.0, "kl_loss_12": 1076.3190643310547, "kl_loss_17": 291.5554595947266, "kl_loss_3": 2657.576770019531, "kl_loss_6": 2046.1385803222656, "learning_rate": 0.0003530556653026721, "loss": 1528.4061, "step": 5990 }, { "ce_loss_12": 3.338744008541107, "ce_loss_17": 2.991839051246643, "ce_loss_23": 2.8644071221351624, "ce_loss_3": 4.148921036720276, "ce_loss_6": 3.8365323424339293, "epoch": 0.6, "grad_norm": 1528.0, "kl_loss_12": 1065.605661010742, "kl_loss_17": 288.03568267822266, "kl_loss_3": 2682.4912841796877, "kl_loss_6": 2081.639971923828, "learning_rate": 0.00035153981233586274, "loss": 1541.0759, "step": 6000 }, { "ce_loss_12": 3.308136820793152, "ce_loss_17": 2.9552505135536196, "ce_loss_23": 2.83356112241745, "ce_loss_3": 4.110099685192108, "ce_loss_6": 3.799723744392395, "epoch": 0.601, "grad_norm": 1104.0, "kl_loss_12": 1082.6001922607422, "kl_loss_17": 285.3096450805664, "kl_loss_3": 2685.622314453125, "kl_loss_6": 2073.3991088867188, "learning_rate": 0.00035002545436149473, "loss": 1580.2212, "step": 6010 }, { "ce_loss_12": 3.3342719674110413, "ce_loss_17": 2.977822244167328, "ce_loss_23": 2.8470508456230164, "ce_loss_3": 4.141815936565399, "ce_loss_6": 3.836508405208588, "epoch": 0.602, "grad_norm": 1056.0, "kl_loss_12": 1109.7272827148438, "kl_loss_17": 302.80249938964846, "kl_loss_3": 2743.32158203125, "kl_loss_6": 2125.801647949219, "learning_rate": 0.0003485126066291364, "loss": 1532.795, "step": 6020 }, { "ce_loss_12": 3.3519185185432434, "ce_loss_17": 3.004382920265198, "ce_loss_23": 2.879039800167084, "ce_loss_3": 4.151576399803162, "ce_loss_6": 3.853084945678711, "epoch": 0.603, "grad_norm": 928.0, "kl_loss_12": 1079.1995330810546, "kl_loss_17": 286.18796691894534, "kl_loss_3": 2690.309191894531, "kl_loss_6": 2088.8179321289062, "learning_rate": 0.0003470012843731476, "loss": 1541.0395, "step": 6030 }, { "ce_loss_12": 3.309786891937256, "ce_loss_17": 2.958170974254608, "ce_loss_23": 2.832332801818848, "ce_loss_3": 4.122778058052063, "ce_loss_6": 3.8115728855133058, "epoch": 0.604, "grad_norm": 976.0, "kl_loss_12": 1085.630792236328, "kl_loss_17": 290.9824615478516, "kl_loss_3": 2714.005078125, "kl_loss_6": 2106.4618286132813, "learning_rate": 0.00034549150281252633, "loss": 1575.2684, "step": 6040 }, { "ce_loss_12": 3.2901809930801393, "ce_loss_17": 2.949860167503357, "ce_loss_23": 2.8217196464538574, "ce_loss_3": 4.078897643089294, "ce_loss_6": 3.7691982269287108, "epoch": 0.605, "grad_norm": 844.0, "kl_loss_12": 1068.8934967041016, "kl_loss_17": 295.5742889404297, "kl_loss_3": 2643.9244140625, "kl_loss_6": 2034.2466369628905, "learning_rate": 0.0003439832771507565, "loss": 1510.9966, "step": 6050 }, { "ce_loss_12": 3.3043904185295103, "ce_loss_17": 2.9475502729415894, "ce_loss_23": 2.8230517506599426, "ce_loss_3": 4.097569191455841, "ce_loss_6": 3.7958645701408384, "epoch": 0.606, "grad_norm": 920.0, "kl_loss_12": 1094.1976593017578, "kl_loss_17": 292.10668869018554, "kl_loss_3": 2699.2143432617186, "kl_loss_6": 2095.2228515625, "learning_rate": 0.0003424766225756537, "loss": 1520.6666, "step": 6060 }, { "ce_loss_12": 3.352240562438965, "ce_loss_17": 3.0022374629974364, "ce_loss_23": 2.880287563800812, "ce_loss_3": 4.148053014278412, "ce_loss_6": 3.8432873129844665, "epoch": 0.607, "grad_norm": 1328.0, "kl_loss_12": 1085.2191497802735, "kl_loss_17": 289.7973358154297, "kl_loss_3": 2683.3669921875, "kl_loss_6": 2085.138427734375, "learning_rate": 0.00034097155425921255, "loss": 1509.973, "step": 6070 }, { "ce_loss_12": 3.2671317934989927, "ce_loss_17": 2.9134520411491396, "ce_loss_23": 2.7855666875839233, "ce_loss_3": 4.085386347770691, "ce_loss_6": 3.767521357536316, "epoch": 0.608, "grad_norm": 996.0, "kl_loss_12": 1102.0127227783203, "kl_loss_17": 293.87185592651366, "kl_loss_3": 2754.059533691406, "kl_loss_6": 2126.1028076171874, "learning_rate": 0.0003394680873574546, "loss": 1540.0117, "step": 6080 }, { "ce_loss_12": 3.368610906600952, "ce_loss_17": 3.0045804500579836, "ce_loss_23": 2.8808337569236757, "ce_loss_3": 4.169866585731507, "ce_loss_6": 3.8673511505126954, "epoch": 0.609, "grad_norm": 1152.0, "kl_loss_12": 1115.8485717773438, "kl_loss_17": 297.3110382080078, "kl_loss_3": 2746.35283203125, "kl_loss_6": 2137.1409240722655, "learning_rate": 0.0003379662370102747, "loss": 1539.9181, "step": 6090 }, { "ce_loss_12": 3.353055679798126, "ce_loss_17": 3.0137892961502075, "ce_loss_23": 2.893881106376648, "ce_loss_3": 4.145450925827026, "ce_loss_6": 3.8335773825645445, "epoch": 0.61, "grad_norm": 1064.0, "kl_loss_12": 1075.17353515625, "kl_loss_17": 288.5395881652832, "kl_loss_3": 2682.5274169921877, "kl_loss_6": 2063.0679138183596, "learning_rate": 0.0003364660183412892, "loss": 1531.6078, "step": 6100 }, { "ce_loss_12": 3.3445538878440857, "ce_loss_17": 3.0021854639053345, "ce_loss_23": 2.8739949345588682, "ce_loss_3": 4.135062468051911, "ce_loss_6": 3.8257460951805116, "epoch": 0.611, "grad_norm": 1480.0, "kl_loss_12": 1095.9966613769532, "kl_loss_17": 294.90844039916993, "kl_loss_3": 2693.2832275390624, "kl_loss_6": 2079.3878479003906, "learning_rate": 0.0003349674464576834, "loss": 1549.767, "step": 6110 }, { "ce_loss_12": 3.303148889541626, "ce_loss_17": 2.95277715921402, "ce_loss_23": 2.825856649875641, "ce_loss_3": 4.105974912643433, "ce_loss_6": 3.79825245141983, "epoch": 0.612, "grad_norm": 1136.0, "kl_loss_12": 1088.9236877441406, "kl_loss_17": 294.0137344360352, "kl_loss_3": 2725.0036499023436, "kl_loss_6": 2109.8044250488283, "learning_rate": 0.00033347053645005966, "loss": 1508.8463, "step": 6120 }, { "ce_loss_12": 3.3814459562301638, "ce_loss_17": 3.038688623905182, "ce_loss_23": 2.918603265285492, "ce_loss_3": 4.158742451667786, "ce_loss_6": 3.853513276576996, "epoch": 0.613, "grad_norm": 1264.0, "kl_loss_12": 1059.3533569335937, "kl_loss_17": 285.41724853515626, "kl_loss_3": 2617.593151855469, "kl_loss_6": 2014.1167114257812, "learning_rate": 0.00033197530339228485, "loss": 1518.3242, "step": 6130 }, { "ce_loss_12": 3.360202980041504, "ce_loss_17": 3.005841851234436, "ce_loss_23": 2.877064311504364, "ce_loss_3": 4.150792407989502, "ce_loss_6": 3.839804494380951, "epoch": 0.614, "grad_norm": 1232.0, "kl_loss_12": 1087.1132446289062, "kl_loss_17": 294.73849563598634, "kl_loss_3": 2674.4955078125, "kl_loss_6": 2059.744171142578, "learning_rate": 0.00033048176234133967, "loss": 1521.8021, "step": 6140 }, { "ce_loss_12": 3.341773736476898, "ce_loss_17": 3.001604068279266, "ce_loss_23": 2.877167117595673, "ce_loss_3": 4.13272854089737, "ce_loss_6": 3.822968912124634, "epoch": 0.615, "grad_norm": 1000.0, "kl_loss_12": 1092.6182922363282, "kl_loss_17": 295.6040756225586, "kl_loss_3": 2697.6044677734376, "kl_loss_6": 2079.6624145507812, "learning_rate": 0.0003289899283371657, "loss": 1546.1363, "step": 6150 }, { "ce_loss_12": 3.351723313331604, "ce_loss_17": 3.0066920042037966, "ce_loss_23": 2.885224533081055, "ce_loss_3": 4.161774778366089, "ce_loss_6": 3.8526275753974915, "epoch": 0.616, "grad_norm": 1056.0, "kl_loss_12": 1060.0194366455078, "kl_loss_17": 285.7883804321289, "kl_loss_3": 2672.10927734375, "kl_loss_6": 2070.3538452148437, "learning_rate": 0.0003274998164025148, "loss": 1549.4511, "step": 6160 }, { "ce_loss_12": 3.387555170059204, "ce_loss_17": 3.046477997303009, "ce_loss_23": 2.919628012180328, "ce_loss_3": 4.181600368022918, "ce_loss_6": 3.8731335401535034, "epoch": 0.617, "grad_norm": 1064.0, "kl_loss_12": 1092.581869506836, "kl_loss_17": 296.49526062011716, "kl_loss_3": 2683.827392578125, "kl_loss_6": 2079.1006103515624, "learning_rate": 0.0003260114415427975, "loss": 1567.9312, "step": 6170 }, { "ce_loss_12": 3.3219369292259215, "ce_loss_17": 2.9732369661331175, "ce_loss_23": 2.8491630494594573, "ce_loss_3": 4.150906503200531, "ce_loss_6": 3.8287518501281737, "epoch": 0.618, "grad_norm": 1208.0, "kl_loss_12": 1091.444174194336, "kl_loss_17": 292.62363357543944, "kl_loss_3": 2767.002648925781, "kl_loss_6": 2123.3348022460937, "learning_rate": 0.0003245248187459323, "loss": 1571.918, "step": 6180 }, { "ce_loss_12": 3.29248104095459, "ce_loss_17": 2.964356517791748, "ce_loss_23": 2.8416656017303468, "ce_loss_3": 4.0751290082931515, "ce_loss_6": 3.7665364861488344, "epoch": 0.619, "grad_norm": 1960.0, "kl_loss_12": 1048.1108001708985, "kl_loss_17": 282.0491600036621, "kl_loss_3": 2624.9859619140625, "kl_loss_6": 2019.6050048828124, "learning_rate": 0.00032303996298219416, "loss": 1493.8592, "step": 6190 }, { "ce_loss_12": 3.3730963468551636, "ce_loss_17": 3.0364683270454407, "ce_loss_23": 2.9132092356681825, "ce_loss_3": 4.150227224826812, "ce_loss_6": 3.8517947673797606, "epoch": 0.62, "grad_norm": 1160.0, "kl_loss_12": 1046.0441528320312, "kl_loss_17": 284.50682067871094, "kl_loss_3": 2596.279650878906, "kl_loss_6": 2006.559521484375, "learning_rate": 0.00032155688920406414, "loss": 1493.982, "step": 6200 }, { "ce_loss_12": 3.307856547832489, "ce_loss_17": 2.9499720454216005, "ce_loss_23": 2.8251369833946227, "ce_loss_3": 4.141007030010224, "ce_loss_6": 3.8185084462165833, "epoch": 0.621, "grad_norm": 1040.0, "kl_loss_12": 1089.2369049072265, "kl_loss_17": 294.6406188964844, "kl_loss_3": 2752.1582153320314, "kl_loss_6": 2122.614605712891, "learning_rate": 0.0003200756123460788, "loss": 1577.3812, "step": 6210 }, { "ce_loss_12": 3.351741683483124, "ce_loss_17": 2.993086099624634, "ce_loss_23": 2.863808012008667, "ce_loss_3": 4.161297249794006, "ce_loss_6": 3.848933458328247, "epoch": 0.622, "grad_norm": 1960.0, "kl_loss_12": 1108.0446380615235, "kl_loss_17": 299.9723045349121, "kl_loss_3": 2750.4146240234377, "kl_loss_6": 2129.6673889160156, "learning_rate": 0.00031859614732467957, "loss": 1558.543, "step": 6220 }, { "ce_loss_12": 3.375869262218475, "ce_loss_17": 3.037969648838043, "ce_loss_23": 2.9146955251693725, "ce_loss_3": 4.169191467761993, "ce_loss_6": 3.851213252544403, "epoch": 0.623, "grad_norm": 992.0, "kl_loss_12": 1053.6349365234375, "kl_loss_17": 285.6215026855469, "kl_loss_3": 2644.1402099609377, "kl_loss_6": 2024.4838745117188, "learning_rate": 0.00031711850903806275, "loss": 1507.5234, "step": 6230 }, { "ce_loss_12": 3.3069013476371767, "ce_loss_17": 2.9518585205078125, "ce_loss_23": 2.822894871234894, "ce_loss_3": 4.115435206890107, "ce_loss_6": 3.8047771573066713, "epoch": 0.624, "grad_norm": 1264.0, "kl_loss_12": 1104.3774383544921, "kl_loss_17": 299.824292755127, "kl_loss_3": 2743.9949951171875, "kl_loss_6": 2122.171771240234, "learning_rate": 0.0003156427123660297, "loss": 1525.2645, "step": 6240 }, { "ce_loss_12": 3.3772284746170045, "ce_loss_17": 3.031772184371948, "ce_loss_23": 2.9064966320991514, "ce_loss_3": 4.147403049468994, "ce_loss_6": 3.841344749927521, "epoch": 0.625, "grad_norm": 924.0, "kl_loss_12": 1082.6211517333984, "kl_loss_17": 290.51032104492185, "kl_loss_3": 2642.9122924804688, "kl_loss_6": 2037.5070373535157, "learning_rate": 0.0003141687721698363, "loss": 1529.8761, "step": 6250 }, { "ce_loss_12": 3.3317097783088685, "ce_loss_17": 3.006670558452606, "ce_loss_23": 2.889304530620575, "ce_loss_3": 4.0970113515853885, "ce_loss_6": 3.7971444487571717, "epoch": 0.626, "grad_norm": 952.0, "kl_loss_12": 1023.0517028808594, "kl_loss_17": 276.3337455749512, "kl_loss_3": 2562.144384765625, "kl_loss_6": 1967.785302734375, "learning_rate": 0.00031269670329204396, "loss": 1494.9123, "step": 6260 }, { "ce_loss_12": 3.380745196342468, "ce_loss_17": 3.0420040488243103, "ce_loss_23": 2.9203967571258547, "ce_loss_3": 4.142084872722625, "ce_loss_6": 3.8459377884864807, "epoch": 0.627, "grad_norm": 1280.0, "kl_loss_12": 1069.5860168457032, "kl_loss_17": 290.3002021789551, "kl_loss_3": 2616.0244873046877, "kl_loss_6": 2023.7755065917968, "learning_rate": 0.00031122652055637015, "loss": 1523.6999, "step": 6270 }, { "ce_loss_12": 3.344464898109436, "ce_loss_17": 3.0021347045898437, "ce_loss_23": 2.8809479236602784, "ce_loss_3": 4.160500121116638, "ce_loss_6": 3.8466029167175293, "epoch": 0.628, "grad_norm": 1264.0, "kl_loss_12": 1071.2355590820312, "kl_loss_17": 290.591535949707, "kl_loss_3": 2714.581018066406, "kl_loss_6": 2090.1380126953127, "learning_rate": 0.0003097582387675385, "loss": 1512.6952, "step": 6280 }, { "ce_loss_12": 3.388115203380585, "ce_loss_17": 3.0414551854133607, "ce_loss_23": 2.915836453437805, "ce_loss_3": 4.170203387737274, "ce_loss_6": 3.8668522119522093, "epoch": 0.629, "grad_norm": 1208.0, "kl_loss_12": 1079.1004943847656, "kl_loss_17": 292.55574188232424, "kl_loss_3": 2678.459423828125, "kl_loss_6": 2070.119000244141, "learning_rate": 0.00030829187271113034, "loss": 1520.3328, "step": 6290 }, { "ce_loss_12": 3.3620065093040465, "ce_loss_17": 3.0299498438835144, "ce_loss_23": 2.907538902759552, "ce_loss_3": 4.149127829074859, "ce_loss_6": 3.838905620574951, "epoch": 0.63, "grad_norm": 1240.0, "kl_loss_12": 1040.4415649414063, "kl_loss_17": 282.5463836669922, "kl_loss_3": 2615.3244873046874, "kl_loss_6": 2007.760760498047, "learning_rate": 0.00030682743715343565, "loss": 1521.68, "step": 6300 }, { "ce_loss_12": 3.335128140449524, "ce_loss_17": 2.9822778344154357, "ce_loss_23": 2.852949821949005, "ce_loss_3": 4.134418356418609, "ce_loss_6": 3.830398738384247, "epoch": 0.631, "grad_norm": 1248.0, "kl_loss_12": 1089.9908935546875, "kl_loss_17": 297.17050018310545, "kl_loss_3": 2691.6098022460938, "kl_loss_6": 2083.130157470703, "learning_rate": 0.0003053649468413043, "loss": 1558.2545, "step": 6310 }, { "ce_loss_12": 3.428669238090515, "ce_loss_17": 3.0927268743515013, "ce_loss_23": 2.966153454780579, "ce_loss_3": 4.215055656433106, "ce_loss_6": 3.9037797093391418, "epoch": 0.632, "grad_norm": 1544.0, "kl_loss_12": 1072.3437133789062, "kl_loss_17": 294.3572998046875, "kl_loss_3": 2663.4129272460937, "kl_loss_6": 2048.999542236328, "learning_rate": 0.00030390441650199725, "loss": 1512.9581, "step": 6320 }, { "ce_loss_12": 3.343301022052765, "ce_loss_17": 2.9984962582588195, "ce_loss_23": 2.8740360498428346, "ce_loss_3": 4.133579123020172, "ce_loss_6": 3.8244497656822203, "epoch": 0.633, "grad_norm": 1168.0, "kl_loss_12": 1065.6994506835938, "kl_loss_17": 289.3247261047363, "kl_loss_3": 2648.2664428710937, "kl_loss_6": 2032.6017517089845, "learning_rate": 0.00030244586084303903, "loss": 1500.9375, "step": 6330 }, { "ce_loss_12": 3.332288992404938, "ce_loss_17": 2.9722416281700133, "ce_loss_23": 2.8453716039657593, "ce_loss_3": 4.135188567638397, "ce_loss_6": 3.8267269372940063, "epoch": 0.634, "grad_norm": 856.0, "kl_loss_12": 1104.3370056152344, "kl_loss_17": 297.27563934326173, "kl_loss_3": 2726.391882324219, "kl_loss_6": 2114.6755859375, "learning_rate": 0.00030098929455206903, "loss": 1521.0924, "step": 6340 }, { "ce_loss_12": 3.3067553639411926, "ce_loss_17": 2.9648497700691223, "ce_loss_23": 2.845312809944153, "ce_loss_3": 4.113324463367462, "ce_loss_6": 3.8015464425086973, "epoch": 0.635, "grad_norm": 1040.0, "kl_loss_12": 1069.1728210449219, "kl_loss_17": 285.195866394043, "kl_loss_3": 2687.285046386719, "kl_loss_6": 2075.2121032714845, "learning_rate": 0.00029953473229669324, "loss": 1569.9994, "step": 6350 }, { "ce_loss_12": 3.3521832466125487, "ce_loss_17": 2.9959980249404907, "ce_loss_23": 2.873391032218933, "ce_loss_3": 4.1363235235214235, "ce_loss_6": 3.833011245727539, "epoch": 0.636, "grad_norm": 1392.0, "kl_loss_12": 1088.97607421875, "kl_loss_17": 287.1729057312012, "kl_loss_3": 2675.118701171875, "kl_loss_6": 2077.9879821777345, "learning_rate": 0.00029808218872433767, "loss": 1509.7584, "step": 6360 }, { "ce_loss_12": 3.3870229005813597, "ce_loss_17": 3.052706515789032, "ce_loss_23": 2.9297956585884095, "ce_loss_3": 4.169206213951111, "ce_loss_6": 3.870281684398651, "epoch": 0.637, "grad_norm": 1568.0, "kl_loss_12": 1048.5180084228516, "kl_loss_17": 282.48961334228517, "kl_loss_3": 2630.5627685546874, "kl_loss_6": 2032.5360290527344, "learning_rate": 0.0002966316784621, "loss": 1498.2122, "step": 6370 }, { "ce_loss_12": 3.3314394116401673, "ce_loss_17": 2.9822089433670045, "ce_loss_23": 2.847934913635254, "ce_loss_3": 4.135004734992981, "ce_loss_6": 3.8197688102722167, "epoch": 0.638, "grad_norm": 1072.0, "kl_loss_12": 1095.9818908691407, "kl_loss_17": 296.8167663574219, "kl_loss_3": 2713.0668090820313, "kl_loss_6": 2088.9831420898436, "learning_rate": 0.0002951832161166024, "loss": 1513.5551, "step": 6380 }, { "ce_loss_12": 3.391787350177765, "ce_loss_17": 3.039981758594513, "ce_loss_23": 2.9113774418830873, "ce_loss_3": 4.174764740467071, "ce_loss_6": 3.877041220664978, "epoch": 0.639, "grad_norm": 888.0, "kl_loss_12": 1083.952947998047, "kl_loss_17": 293.1161712646484, "kl_loss_3": 2669.3549194335938, "kl_loss_6": 2076.956903076172, "learning_rate": 0.0002937368162738445, "loss": 1502.0477, "step": 6390 }, { "ce_loss_12": 3.3227894425392153, "ce_loss_17": 2.993958580493927, "ce_loss_23": 2.8782469749450685, "ce_loss_3": 4.11331399679184, "ce_loss_6": 3.8117558360099792, "epoch": 0.64, "grad_norm": 1160.0, "kl_loss_12": 1048.937274169922, "kl_loss_17": 279.83716888427733, "kl_loss_3": 2632.9319458007812, "kl_loss_6": 2037.3876037597656, "learning_rate": 0.0002922924934990568, "loss": 1526.9837, "step": 6400 }, { "ce_loss_12": 3.2892454981803896, "ce_loss_17": 2.9320313334465027, "ce_loss_23": 2.8080975651741027, "ce_loss_3": 4.108310544490815, "ce_loss_6": 3.7949350237846375, "epoch": 0.641, "grad_norm": 1416.0, "kl_loss_12": 1095.0140625, "kl_loss_17": 289.45925369262693, "kl_loss_3": 2754.233044433594, "kl_loss_6": 2133.916400146484, "learning_rate": 0.0002908502623365536, "loss": 1539.6635, "step": 6410 }, { "ce_loss_12": 3.2228192329406737, "ce_loss_17": 2.8717540979385374, "ce_loss_23": 2.7454086601734162, "ce_loss_3": 4.063563895225525, "ce_loss_6": 3.7421645879745484, "epoch": 0.642, "grad_norm": 1376.0, "kl_loss_12": 1095.3357879638672, "kl_loss_17": 289.3959281921387, "kl_loss_3": 2782.289697265625, "kl_loss_6": 2145.7878540039064, "learning_rate": 0.0002894101373095867, "loss": 1544.1738, "step": 6420 }, { "ce_loss_12": 3.413454031944275, "ce_loss_17": 3.074009048938751, "ce_loss_23": 2.9470053911209106, "ce_loss_3": 4.182444787025451, "ce_loss_6": 3.886037003993988, "epoch": 0.643, "grad_norm": 868.0, "kl_loss_12": 1073.1138610839844, "kl_loss_17": 293.98568572998045, "kl_loss_3": 2631.6056274414063, "kl_loss_6": 2042.1914184570312, "learning_rate": 0.00028797213292019926, "loss": 1514.5645, "step": 6430 }, { "ce_loss_12": 3.394935941696167, "ce_loss_17": 3.052765667438507, "ce_loss_23": 2.925507402420044, "ce_loss_3": 4.178702533245087, "ce_loss_6": 3.871429920196533, "epoch": 0.644, "grad_norm": 1248.0, "kl_loss_12": 1081.2356658935546, "kl_loss_17": 293.6976516723633, "kl_loss_3": 2645.124951171875, "kl_loss_6": 2043.427099609375, "learning_rate": 0.0002865362636490791, "loss": 1547.284, "step": 6440 }, { "ce_loss_12": 3.4013839244842528, "ce_loss_17": 3.0620115041732787, "ce_loss_23": 2.942251706123352, "ce_loss_3": 4.181041252613068, "ce_loss_6": 3.874261713027954, "epoch": 0.645, "grad_norm": 1096.0, "kl_loss_12": 1063.9557067871094, "kl_loss_17": 287.7143745422363, "kl_loss_3": 2624.9001586914064, "kl_loss_6": 2024.624041748047, "learning_rate": 0.0002851025439554142, "loss": 1502.1726, "step": 6450 }, { "ce_loss_12": 3.3908503651618958, "ce_loss_17": 3.0486684918403624, "ce_loss_23": 2.9185595750808715, "ce_loss_3": 4.1556107759475704, "ce_loss_6": 3.8517315864562987, "epoch": 0.646, "grad_norm": 1080.0, "kl_loss_12": 1064.3933685302734, "kl_loss_17": 290.0317329406738, "kl_loss_3": 2599.8388427734376, "kl_loss_6": 2001.0165588378907, "learning_rate": 0.00028367098827674573, "loss": 1502.8248, "step": 6460 }, { "ce_loss_12": 3.3172547459602355, "ce_loss_17": 2.9788630843162536, "ce_loss_23": 2.8583949327468874, "ce_loss_3": 4.1198078989982605, "ce_loss_6": 3.804555869102478, "epoch": 0.647, "grad_norm": 1280.0, "kl_loss_12": 1058.7646209716797, "kl_loss_17": 283.33220596313475, "kl_loss_3": 2653.0367065429687, "kl_loss_6": 2037.1688110351563, "learning_rate": 0.00028224161102882397, "loss": 1525.8549, "step": 6470 }, { "ce_loss_12": 3.2954100370407104, "ce_loss_17": 2.9568528294563294, "ce_loss_23": 2.8395904183387755, "ce_loss_3": 4.058874034881592, "ce_loss_6": 3.7639296650886536, "epoch": 0.648, "grad_norm": 956.0, "kl_loss_12": 1051.4144226074218, "kl_loss_17": 277.5598617553711, "kl_loss_3": 2598.404248046875, "kl_loss_6": 2012.757061767578, "learning_rate": 0.00028081442660546124, "loss": 1513.174, "step": 6480 }, { "ce_loss_12": 3.359078085422516, "ce_loss_17": 3.021183693408966, "ce_loss_23": 2.895930051803589, "ce_loss_3": 4.148511445522308, "ce_loss_6": 3.834946537017822, "epoch": 0.649, "grad_norm": 1004.0, "kl_loss_12": 1060.023629760742, "kl_loss_17": 291.75023956298827, "kl_loss_3": 2661.596643066406, "kl_loss_6": 2033.6894104003907, "learning_rate": 0.0002793894493783892, "loss": 1514.7693, "step": 6490 }, { "ce_loss_12": 3.366551387310028, "ce_loss_17": 3.030740213394165, "ce_loss_23": 2.913418471813202, "ce_loss_3": 4.158485841751099, "ce_loss_6": 3.850329840183258, "epoch": 0.65, "grad_norm": 1112.0, "kl_loss_12": 1045.378155517578, "kl_loss_17": 278.98334350585935, "kl_loss_3": 2624.3213745117187, "kl_loss_6": 2024.8353698730468, "learning_rate": 0.0002779666936971129, "loss": 1499.6525, "step": 6500 }, { "ce_loss_12": 3.394643759727478, "ce_loss_17": 3.04899320602417, "ce_loss_23": 2.9235555052757265, "ce_loss_3": 4.185719633102417, "ce_loss_6": 3.8867220282554626, "epoch": 0.651, "grad_norm": 916.0, "kl_loss_12": 1075.935726928711, "kl_loss_17": 288.56647872924805, "kl_loss_3": 2680.7698486328127, "kl_loss_6": 2072.048992919922, "learning_rate": 0.00027654617388876614, "loss": 1532.4816, "step": 6510 }, { "ce_loss_12": 3.398653984069824, "ce_loss_17": 3.0640782356262206, "ce_loss_23": 2.938787519931793, "ce_loss_3": 4.188610053062439, "ce_loss_6": 3.878059720993042, "epoch": 0.652, "grad_norm": 908.0, "kl_loss_12": 1061.6495239257813, "kl_loss_17": 291.04235076904297, "kl_loss_3": 2673.7321899414064, "kl_loss_6": 2048.234454345703, "learning_rate": 0.0002751279042579672, "loss": 1528.4081, "step": 6520 }, { "ce_loss_12": 3.3495648741722106, "ce_loss_17": 3.0123731017112734, "ce_loss_23": 2.892154061794281, "ce_loss_3": 4.12523752450943, "ce_loss_6": 3.828623628616333, "epoch": 0.653, "grad_norm": 1344.0, "kl_loss_12": 1051.1055145263672, "kl_loss_17": 278.1377319335937, "kl_loss_3": 2622.2849731445312, "kl_loss_6": 2022.1887268066407, "learning_rate": 0.00027371189908667604, "loss": 1526.3842, "step": 6530 }, { "ce_loss_12": 3.4101003408432007, "ce_loss_17": 3.0658029317855835, "ce_loss_23": 2.9356747388839723, "ce_loss_3": 4.236353695392609, "ce_loss_6": 3.928320360183716, "epoch": 0.654, "grad_norm": 912.0, "kl_loss_12": 1088.5187896728517, "kl_loss_17": 305.0631164550781, "kl_loss_3": 2743.589709472656, "kl_loss_6": 2120.7795227050783, "learning_rate": 0.00027229817263404863, "loss": 1564.6652, "step": 6540 }, { "ce_loss_12": 3.372529911994934, "ce_loss_17": 3.046334421634674, "ce_loss_23": 2.929336595535278, "ce_loss_3": 4.127618956565857, "ce_loss_6": 3.825167953968048, "epoch": 0.655, "grad_norm": 1144.0, "kl_loss_12": 1039.775701904297, "kl_loss_17": 281.04506454467776, "kl_loss_3": 2574.1316040039064, "kl_loss_6": 1979.277850341797, "learning_rate": 0.0002708867391362948, "loss": 1498.607, "step": 6550 }, { "ce_loss_12": 3.348506212234497, "ce_loss_17": 3.024095320701599, "ce_loss_23": 2.908759653568268, "ce_loss_3": 4.1206450939178465, "ce_loss_6": 3.817743384838104, "epoch": 0.656, "grad_norm": 960.0, "kl_loss_12": 1020.556494140625, "kl_loss_17": 276.0389991760254, "kl_loss_3": 2578.2486083984377, "kl_loss_6": 1974.8206176757812, "learning_rate": 0.0002694776128065345, "loss": 1498.607, "step": 6560 }, { "ce_loss_12": 3.317186141014099, "ce_loss_17": 2.9659634828567505, "ce_loss_23": 2.8406397342681884, "ce_loss_3": 4.101336324214936, "ce_loss_6": 3.78921320438385, "epoch": 0.657, "grad_norm": 832.0, "kl_loss_12": 1084.9080657958984, "kl_loss_17": 291.7940902709961, "kl_loss_3": 2676.329724121094, "kl_loss_6": 2057.403973388672, "learning_rate": 0.00026807080783465374, "loss": 1499.9951, "step": 6570 }, { "ce_loss_12": 3.4112606763839723, "ce_loss_17": 3.0685062646865844, "ce_loss_23": 2.9431819200515745, "ce_loss_3": 4.199546587467194, "ce_loss_6": 3.8976390838623045, "epoch": 0.658, "grad_norm": 1128.0, "kl_loss_12": 1085.2541046142578, "kl_loss_17": 291.7502784729004, "kl_loss_3": 2673.6155639648437, "kl_loss_6": 2070.629815673828, "learning_rate": 0.00026666633838716316, "loss": 1540.0823, "step": 6580 }, { "ce_loss_12": 3.325250494480133, "ce_loss_17": 2.9772164583206178, "ce_loss_23": 2.849889171123505, "ce_loss_3": 4.125745224952698, "ce_loss_6": 3.8154043674468996, "epoch": 0.659, "grad_norm": 1104.0, "kl_loss_12": 1087.6900970458985, "kl_loss_17": 296.8415771484375, "kl_loss_3": 2694.4278930664063, "kl_loss_6": 2083.0305114746093, "learning_rate": 0.00026526421860705474, "loss": 1550.6973, "step": 6590 }, { "ce_loss_12": 3.3463248372077943, "ce_loss_17": 2.9926101684570314, "ce_loss_23": 2.8664312243461607, "ce_loss_3": 4.144900977611542, "ce_loss_6": 3.84015097618103, "epoch": 0.66, "grad_norm": 1272.0, "kl_loss_12": 1082.672903442383, "kl_loss_17": 295.27245864868166, "kl_loss_3": 2689.79296875, "kl_loss_6": 2084.4121032714843, "learning_rate": 0.0002638644626136587, "loss": 1518.3607, "step": 6600 }, { "ce_loss_12": 3.3497479915618897, "ce_loss_17": 3.010710376501083, "ce_loss_23": 2.8882995724678038, "ce_loss_3": 4.14725239276886, "ce_loss_6": 3.838615524768829, "epoch": 0.661, "grad_norm": 1048.0, "kl_loss_12": 1062.1607543945313, "kl_loss_17": 285.89757080078124, "kl_loss_3": 2643.746630859375, "kl_loss_6": 2037.8521606445313, "learning_rate": 0.00026246708450250255, "loss": 1512.559, "step": 6610 }, { "ce_loss_12": 3.3246140122413634, "ce_loss_17": 2.9918368458747864, "ce_loss_23": 2.8710078835487365, "ce_loss_3": 4.096918213367462, "ce_loss_6": 3.8028513193130493, "epoch": 0.662, "grad_norm": 984.0, "kl_loss_12": 1048.273110961914, "kl_loss_17": 283.497420501709, "kl_loss_3": 2607.4280639648437, "kl_loss_6": 2013.1176147460938, "learning_rate": 0.00026107209834516854, "loss": 1498.8475, "step": 6620 }, { "ce_loss_12": 3.3087719678878784, "ce_loss_17": 2.9600358247756957, "ce_loss_23": 2.8357106685638427, "ce_loss_3": 4.133854627609253, "ce_loss_6": 3.8167326927185057, "epoch": 0.663, "grad_norm": 1080.0, "kl_loss_12": 1081.6213989257812, "kl_loss_17": 288.49953842163086, "kl_loss_3": 2732.234655761719, "kl_loss_6": 2106.6643188476564, "learning_rate": 0.0002596795181891514, "loss": 1548.7227, "step": 6630 }, { "ce_loss_12": 3.3174493074417115, "ce_loss_17": 2.971146559715271, "ce_loss_23": 2.837556302547455, "ce_loss_3": 4.109105908870697, "ce_loss_6": 3.8051462888717653, "epoch": 0.664, "grad_norm": 1112.0, "kl_loss_12": 1099.5311096191406, "kl_loss_17": 299.4432113647461, "kl_loss_3": 2689.6567504882814, "kl_loss_6": 2086.8522766113283, "learning_rate": 0.000258289358057718, "loss": 1584.9252, "step": 6640 }, { "ce_loss_12": 3.3833224058151243, "ce_loss_17": 3.027475082874298, "ce_loss_23": 2.8935093522071837, "ce_loss_3": 4.180250668525696, "ce_loss_6": 3.8748958826065065, "epoch": 0.665, "grad_norm": 1120.0, "kl_loss_12": 1098.4118103027345, "kl_loss_17": 301.1476058959961, "kl_loss_3": 2711.7373779296877, "kl_loss_6": 2101.2721557617188, "learning_rate": 0.0002569016319497657, "loss": 1549.8691, "step": 6650 }, { "ce_loss_12": 3.3719239592552186, "ce_loss_17": 3.0172632336616516, "ce_loss_23": 2.8854519844055178, "ce_loss_3": 4.161745226383209, "ce_loss_6": 3.8583480596542357, "epoch": 0.666, "grad_norm": 1072.0, "kl_loss_12": 1103.6511932373046, "kl_loss_17": 303.22628631591795, "kl_loss_3": 2711.325439453125, "kl_loss_6": 2102.153436279297, "learning_rate": 0.00025551635383968066, "loss": 1558.8236, "step": 6660 }, { "ce_loss_12": 3.281505012512207, "ce_loss_17": 2.940502107143402, "ce_loss_23": 2.813677740097046, "ce_loss_3": 4.103258204460144, "ce_loss_6": 3.77959862947464, "epoch": 0.667, "grad_norm": 1048.0, "kl_loss_12": 1085.2066711425782, "kl_loss_17": 293.6140731811523, "kl_loss_3": 2719.1686889648436, "kl_loss_6": 2092.0775451660156, "learning_rate": 0.00025413353767719804, "loss": 1543.7244, "step": 6670 }, { "ce_loss_12": 3.3316211462020875, "ce_loss_17": 2.990746355056763, "ce_loss_23": 2.870693302154541, "ce_loss_3": 4.122513997554779, "ce_loss_6": 3.820575976371765, "epoch": 0.668, "grad_norm": 1136.0, "kl_loss_12": 1068.2965698242188, "kl_loss_17": 284.17504425048827, "kl_loss_3": 2670.3581787109374, "kl_loss_6": 2070.3725158691404, "learning_rate": 0.0002527531973872617, "loss": 1525.5031, "step": 6680 }, { "ce_loss_12": 3.338472878932953, "ce_loss_17": 3.008737337589264, "ce_loss_23": 2.8879444122314455, "ce_loss_3": 4.122899007797241, "ce_loss_6": 3.8144315242767335, "epoch": 0.669, "grad_norm": 928.0, "kl_loss_12": 1054.9741912841796, "kl_loss_17": 286.4005615234375, "kl_loss_3": 2637.2769409179687, "kl_loss_6": 2025.1039489746095, "learning_rate": 0.0002513753468698826, "loss": 1502.526, "step": 6690 }, { "ce_loss_12": 3.317826581001282, "ce_loss_17": 2.9731616497039797, "ce_loss_23": 2.849671816825867, "ce_loss_3": 4.118358051776886, "ce_loss_6": 3.8065997838973997, "epoch": 0.67, "grad_norm": 896.0, "kl_loss_12": 1082.1374267578126, "kl_loss_17": 293.6460014343262, "kl_loss_3": 2702.1760009765626, "kl_loss_6": 2087.3015747070312, "learning_rate": 0.0002500000000000001, "loss": 1532.8109, "step": 6700 }, { "ce_loss_12": 3.398632895946503, "ce_loss_17": 3.0781195163726807, "ce_loss_23": 2.9640834808349608, "ce_loss_3": 4.1491206049919125, "ce_loss_6": 3.8456847786903383, "epoch": 0.671, "grad_norm": 1064.0, "kl_loss_12": 1029.509799194336, "kl_loss_17": 277.89868240356446, "kl_loss_3": 2540.9117431640625, "kl_loss_6": 1946.9610656738282, "learning_rate": 0.0002486271706273421, "loss": 1526.3414, "step": 6710 }, { "ce_loss_12": 3.3420624256134035, "ce_loss_17": 3.016208124160767, "ce_loss_23": 2.900933396816254, "ce_loss_3": 4.097201704978943, "ce_loss_6": 3.7993552684783936, "epoch": 0.672, "grad_norm": 1136.0, "kl_loss_12": 1028.4074829101562, "kl_loss_17": 275.4583435058594, "kl_loss_3": 2554.589733886719, "kl_loss_6": 1960.41845703125, "learning_rate": 0.0002472568725762853, "loss": 1500.902, "step": 6720 }, { "ce_loss_12": 3.3320967674255373, "ce_loss_17": 3.0137043356895448, "ce_loss_23": 2.896807622909546, "ce_loss_3": 4.0894329190254215, "ce_loss_6": 3.7987982273101806, "epoch": 0.673, "grad_norm": 980.0, "kl_loss_12": 1022.3379486083984, "kl_loss_17": 274.11581802368164, "kl_loss_3": 2557.5329467773436, "kl_loss_6": 1979.3651306152344, "learning_rate": 0.00024588911964571554, "loss": 1478.16, "step": 6730 }, { "ce_loss_12": 3.383940541744232, "ce_loss_17": 3.0269935369491576, "ce_loss_23": 2.8932540893554686, "ce_loss_3": 4.192104065418244, "ce_loss_6": 3.879211986064911, "epoch": 0.674, "grad_norm": 1032.0, "kl_loss_12": 1114.5952880859375, "kl_loss_17": 303.4539489746094, "kl_loss_3": 2712.537390136719, "kl_loss_6": 2103.3569152832033, "learning_rate": 0.00024452392560888974, "loss": 1527.6742, "step": 6740 }, { "ce_loss_12": 3.258523499965668, "ce_loss_17": 2.9190154671669006, "ce_loss_23": 2.7954254031181334, "ce_loss_3": 4.043025708198547, "ce_loss_6": 3.7352750062942506, "epoch": 0.675, "grad_norm": 964.0, "kl_loss_12": 1053.5061340332031, "kl_loss_17": 279.17559814453125, "kl_loss_3": 2647.2302368164064, "kl_loss_6": 2033.0640502929687, "learning_rate": 0.00024316130421329695, "loss": 1493.335, "step": 6750 }, { "ce_loss_12": 3.324470043182373, "ce_loss_17": 2.9908277809619905, "ce_loss_23": 2.8736360669136047, "ce_loss_3": 4.108500003814697, "ce_loss_6": 3.8002394437789917, "epoch": 0.676, "grad_norm": 1016.0, "kl_loss_12": 1059.7845489501954, "kl_loss_17": 282.0549186706543, "kl_loss_3": 2640.0760986328123, "kl_loss_6": 2035.9004272460938, "learning_rate": 0.00024180126918051909, "loss": 1515.6507, "step": 6760 }, { "ce_loss_12": 3.3820838809013365, "ce_loss_17": 3.0400675654411318, "ce_loss_23": 2.9171111464500425, "ce_loss_3": 4.148803424835205, "ce_loss_6": 3.848258209228516, "epoch": 0.677, "grad_norm": 1056.0, "kl_loss_12": 1065.7451202392579, "kl_loss_17": 287.0917541503906, "kl_loss_3": 2629.779504394531, "kl_loss_6": 2024.6423583984374, "learning_rate": 0.00024044383420609406, "loss": 1495.1738, "step": 6770 }, { "ce_loss_12": 3.3741610646247864, "ce_loss_17": 3.05090674161911, "ce_loss_23": 2.9324339389801026, "ce_loss_3": 4.128608286380768, "ce_loss_6": 3.830838167667389, "epoch": 0.678, "grad_norm": 2096.0, "kl_loss_12": 1040.0166320800781, "kl_loss_17": 278.25733337402346, "kl_loss_3": 2569.7828369140625, "kl_loss_6": 1983.01318359375, "learning_rate": 0.00023908901295937712, "loss": 1514.7359, "step": 6780 }, { "ce_loss_12": 3.363990819454193, "ce_loss_17": 3.0348567843437193, "ce_loss_23": 2.9126704931259155, "ce_loss_3": 4.140928709506989, "ce_loss_6": 3.838523817062378, "epoch": 0.679, "grad_norm": 1088.0, "kl_loss_12": 1038.0822326660157, "kl_loss_17": 281.52527770996096, "kl_loss_3": 2588.859130859375, "kl_loss_6": 1988.4985778808593, "learning_rate": 0.00023773681908340283, "loss": 1513.9151, "step": 6790 }, { "ce_loss_12": 3.367039108276367, "ce_loss_17": 3.0190672516822814, "ce_loss_23": 2.8874718308448792, "ce_loss_3": 4.159163236618042, "ce_loss_6": 3.8588353276252745, "epoch": 0.68, "grad_norm": 1048.0, "kl_loss_12": 1106.3126983642578, "kl_loss_17": 303.4519523620605, "kl_loss_3": 2704.4398681640623, "kl_loss_6": 2104.3208129882814, "learning_rate": 0.00023638726619474876, "loss": 1562.6964, "step": 6800 }, { "ce_loss_12": 3.3643818736076354, "ce_loss_17": 3.013008165359497, "ce_loss_23": 2.878277862071991, "ce_loss_3": 4.188300597667694, "ce_loss_6": 3.8750722408294678, "epoch": 0.681, "grad_norm": 1232.0, "kl_loss_12": 1105.7393890380858, "kl_loss_17": 300.2103561401367, "kl_loss_3": 2752.149304199219, "kl_loss_6": 2135.510040283203, "learning_rate": 0.0002350403678833976, "loss": 1543.9213, "step": 6810 }, { "ce_loss_12": 3.2816458344459534, "ce_loss_17": 2.9370277523994446, "ce_loss_23": 2.8150463938713073, "ce_loss_3": 4.0751855850219725, "ce_loss_6": 3.76500483751297, "epoch": 0.682, "grad_norm": 856.0, "kl_loss_12": 1076.2480041503907, "kl_loss_17": 285.06641082763673, "kl_loss_3": 2683.7720581054687, "kl_loss_6": 2068.134649658203, "learning_rate": 0.00023369613771260007, "loss": 1512.8716, "step": 6820 }, { "ce_loss_12": 3.39626430273056, "ce_loss_17": 3.047028458118439, "ce_loss_23": 2.9260175466537475, "ce_loss_3": 4.189331746101379, "ce_loss_6": 3.8821977376937866, "epoch": 0.683, "grad_norm": 896.0, "kl_loss_12": 1084.3677520751953, "kl_loss_17": 290.67858200073243, "kl_loss_3": 2690.972790527344, "kl_loss_6": 2081.7094116210938, "learning_rate": 0.00023235458921873925, "loss": 1535.7379, "step": 6830 }, { "ce_loss_12": 3.386719024181366, "ce_loss_17": 3.0164597988128663, "ce_loss_23": 2.881801736354828, "ce_loss_3": 4.203792822360993, "ce_loss_6": 3.894656181335449, "epoch": 0.684, "grad_norm": 1016.0, "kl_loss_12": 1140.004690551758, "kl_loss_17": 307.1119552612305, "kl_loss_3": 2795.7464233398437, "kl_loss_6": 2184.1928283691404, "learning_rate": 0.0002310157359111938, "loss": 1590.245, "step": 6840 }, { "ce_loss_12": 3.281593942642212, "ce_loss_17": 2.9068808436393736, "ce_loss_23": 2.7744899153709413, "ce_loss_3": 4.151088750362396, "ce_loss_6": 3.8237022399902343, "epoch": 0.685, "grad_norm": 1040.0, "kl_loss_12": 1121.4570068359376, "kl_loss_17": 298.14404449462893, "kl_loss_3": 2864.43203125, "kl_loss_6": 2216.5655029296877, "learning_rate": 0.0002296795912722014, "loss": 1589.223, "step": 6850 }, { "ce_loss_12": 3.376678490638733, "ce_loss_17": 3.0392310976982118, "ce_loss_23": 2.9149879217147827, "ce_loss_3": 4.142076396942139, "ce_loss_6": 3.8350725889205934, "epoch": 0.686, "grad_norm": 1136.0, "kl_loss_12": 1057.6813568115235, "kl_loss_17": 285.86506271362305, "kl_loss_3": 2618.06630859375, "kl_loss_6": 2008.5849182128907, "learning_rate": 0.0002283461687567236, "loss": 1479.476, "step": 6860 }, { "ce_loss_12": 3.4147990345954895, "ce_loss_17": 3.0852442264556883, "ce_loss_23": 2.967425298690796, "ce_loss_3": 4.163288021087647, "ce_loss_6": 3.869793510437012, "epoch": 0.687, "grad_norm": 1088.0, "kl_loss_12": 1037.484829711914, "kl_loss_17": 282.1102653503418, "kl_loss_3": 2554.0628662109375, "kl_loss_6": 1967.2089477539062, "learning_rate": 0.00022701548179231045, "loss": 1506.1345, "step": 6870 }, { "ce_loss_12": 3.3873743414878845, "ce_loss_17": 3.0464545249938966, "ce_loss_23": 2.922188627719879, "ce_loss_3": 4.175418794155121, "ce_loss_6": 3.875636374950409, "epoch": 0.688, "grad_norm": 1400.0, "kl_loss_12": 1075.7415679931642, "kl_loss_17": 289.9831932067871, "kl_loss_3": 2661.655517578125, "kl_loss_6": 2067.484735107422, "learning_rate": 0.00022568754377896516, "loss": 1503.4135, "step": 6880 }, { "ce_loss_12": 3.385250473022461, "ce_loss_17": 3.0440564274787905, "ce_loss_23": 2.9164554595947267, "ce_loss_3": 4.146143007278442, "ce_loss_6": 3.846618139743805, "epoch": 0.689, "grad_norm": 952.0, "kl_loss_12": 1077.9116333007812, "kl_loss_17": 290.6547370910645, "kl_loss_3": 2626.9868286132814, "kl_loss_6": 2026.3530639648438, "learning_rate": 0.00022436236808900844, "loss": 1506.102, "step": 6890 }, { "ce_loss_12": 3.2836104154586794, "ce_loss_17": 2.936374032497406, "ce_loss_23": 2.810339403152466, "ce_loss_3": 4.082259452342987, "ce_loss_6": 3.7733787298202515, "epoch": 0.69, "grad_norm": 1000.0, "kl_loss_12": 1083.3499816894532, "kl_loss_17": 289.6112510681152, "kl_loss_3": 2704.3146240234373, "kl_loss_6": 2076.830242919922, "learning_rate": 0.00022303996806694487, "loss": 1520.6959, "step": 6900 }, { "ce_loss_12": 3.348386228084564, "ce_loss_17": 3.005220341682434, "ce_loss_23": 2.8856596112251283, "ce_loss_3": 4.147955656051636, "ce_loss_6": 3.8367886543273926, "epoch": 0.691, "grad_norm": 1096.0, "kl_loss_12": 1068.3898193359375, "kl_loss_17": 283.26347732543945, "kl_loss_3": 2685.2141967773437, "kl_loss_6": 2064.223016357422, "learning_rate": 0.00022172035702932823, "loss": 1514.9848, "step": 6910 }, { "ce_loss_12": 3.3900525689125063, "ce_loss_17": 3.0595398545265198, "ce_loss_23": 2.935157132148743, "ce_loss_3": 4.146279001235962, "ce_loss_6": 3.850438177585602, "epoch": 0.692, "grad_norm": 1012.0, "kl_loss_12": 1046.249249267578, "kl_loss_17": 287.8135139465332, "kl_loss_3": 2567.0113037109377, "kl_loss_6": 1990.5654724121093, "learning_rate": 0.00022040354826462666, "loss": 1482.2508, "step": 6920 }, { "ce_loss_12": 3.318897318840027, "ce_loss_17": 2.9834712266922, "ce_loss_23": 2.865597403049469, "ce_loss_3": 4.112779140472412, "ce_loss_6": 3.808480966091156, "epoch": 0.693, "grad_norm": 1096.0, "kl_loss_12": 1055.834146118164, "kl_loss_17": 281.22287368774414, "kl_loss_3": 2656.381481933594, "kl_loss_6": 2053.055072021484, "learning_rate": 0.0002190895550330899, "loss": 1526.6893, "step": 6930 }, { "ce_loss_12": 3.2867075204849243, "ce_loss_17": 2.9261071562767027, "ce_loss_23": 2.797209632396698, "ce_loss_3": 4.094171130657196, "ce_loss_6": 3.777874195575714, "epoch": 0.694, "grad_norm": 1584.0, "kl_loss_12": 1091.2527709960937, "kl_loss_17": 291.81100311279295, "kl_loss_3": 2703.411511230469, "kl_loss_6": 2074.94111328125, "learning_rate": 0.00021777839056661552, "loss": 1509.3366, "step": 6940 }, { "ce_loss_12": 3.337392258644104, "ce_loss_17": 3.0037010431289675, "ce_loss_23": 2.883526420593262, "ce_loss_3": 4.121099376678467, "ce_loss_6": 3.8113796710968018, "epoch": 0.695, "grad_norm": 1448.0, "kl_loss_12": 1057.7788360595703, "kl_loss_17": 283.61209259033205, "kl_loss_3": 2636.60634765625, "kl_loss_6": 2022.8482055664062, "learning_rate": 0.0002164700680686147, "loss": 1485.7182, "step": 6950 }, { "ce_loss_12": 3.3705799698829653, "ce_loss_17": 3.0428457498550414, "ce_loss_23": 2.91817193031311, "ce_loss_3": 4.139367914199829, "ce_loss_6": 3.8418300271034242, "epoch": 0.696, "grad_norm": 1576.0, "kl_loss_12": 1039.181134033203, "kl_loss_17": 285.70360260009767, "kl_loss_3": 2583.8804931640625, "kl_loss_6": 1989.1890197753905, "learning_rate": 0.0002151646007138806, "loss": 1487.7269, "step": 6960 }, { "ce_loss_12": 3.288752329349518, "ce_loss_17": 2.940204656124115, "ce_loss_23": 2.815975916385651, "ce_loss_3": 4.087472057342529, "ce_loss_6": 3.7803780317306517, "epoch": 0.697, "grad_norm": 1264.0, "kl_loss_12": 1086.2156982421875, "kl_loss_17": 290.2320617675781, "kl_loss_3": 2702.2922485351564, "kl_loss_6": 2095.7369689941406, "learning_rate": 0.00021386200164845526, "loss": 1518.6186, "step": 6970 }, { "ce_loss_12": 3.422125792503357, "ce_loss_17": 3.0908151388168337, "ce_loss_23": 2.9712757110595702, "ce_loss_3": 4.169968152046204, "ce_loss_6": 3.877488946914673, "epoch": 0.698, "grad_norm": 1056.0, "kl_loss_12": 1040.7631591796876, "kl_loss_17": 279.68602294921874, "kl_loss_3": 2554.2770751953126, "kl_loss_6": 1976.9941101074219, "learning_rate": 0.0002125622839894964, "loss": 1470.3584, "step": 6980 }, { "ce_loss_12": 3.3737497091293336, "ce_loss_17": 3.0447206258773805, "ce_loss_23": 2.9255369782447813, "ce_loss_3": 4.149833381175995, "ce_loss_6": 3.8436522603034975, "epoch": 0.699, "grad_norm": 1004.0, "kl_loss_12": 1034.2772094726563, "kl_loss_17": 279.0837905883789, "kl_loss_3": 2585.775256347656, "kl_loss_6": 1985.5286926269532, "learning_rate": 0.00021126546082514663, "loss": 1479.8258, "step": 6990 }, { "ce_loss_12": 3.3959160089492797, "ce_loss_17": 3.066310930252075, "ce_loss_23": 2.9466503143310545, "ce_loss_3": 4.15298364162445, "ce_loss_6": 3.8538186311721803, "epoch": 0.7, "grad_norm": 1056.0, "kl_loss_12": 1044.9644104003905, "kl_loss_17": 280.3296058654785, "kl_loss_3": 2576.889367675781, "kl_loss_6": 1987.39765625, "learning_rate": 0.00020997154521440098, "loss": 1473.4609, "step": 7000 }, { "ce_loss_12": 3.3464019894599915, "ce_loss_17": 3.0117498874664306, "ce_loss_23": 2.894484758377075, "ce_loss_3": 4.125147533416748, "ce_loss_6": 3.816951608657837, "epoch": 0.701, "grad_norm": 956.0, "kl_loss_12": 1051.8970977783204, "kl_loss_17": 278.6096450805664, "kl_loss_3": 2619.6825317382813, "kl_loss_6": 2012.2601745605468, "learning_rate": 0.0002086805501869749, "loss": 1484.8754, "step": 7010 }, { "ce_loss_12": 3.347380018234253, "ce_loss_17": 2.995504927635193, "ce_loss_23": 2.867575478553772, "ce_loss_3": 4.148639333248139, "ce_loss_6": 3.8330894231796266, "epoch": 0.702, "grad_norm": 1208.0, "kl_loss_12": 1098.6046295166016, "kl_loss_17": 294.9442642211914, "kl_loss_3": 2722.4083984375, "kl_loss_6": 2093.525006103516, "learning_rate": 0.0002073924887431744, "loss": 1525.892, "step": 7020 }, { "ce_loss_12": 3.336989998817444, "ce_loss_17": 2.992552936077118, "ce_loss_23": 2.874508500099182, "ce_loss_3": 4.118464183807373, "ce_loss_6": 3.824393606185913, "epoch": 0.703, "grad_norm": 1152.0, "kl_loss_12": 1065.1422973632812, "kl_loss_17": 284.58373031616213, "kl_loss_3": 2654.9352783203126, "kl_loss_6": 2063.013397216797, "learning_rate": 0.00020610737385376348, "loss": 1550.015, "step": 7030 }, { "ce_loss_12": 3.371172559261322, "ce_loss_17": 3.0425814151763917, "ce_loss_23": 2.9228823304176332, "ce_loss_3": 4.125135231018066, "ce_loss_6": 3.8306512594223023, "epoch": 0.704, "grad_norm": 1240.0, "kl_loss_12": 1038.9294342041017, "kl_loss_17": 283.898503112793, "kl_loss_3": 2563.0609741210938, "kl_loss_6": 1977.402685546875, "learning_rate": 0.00020482521845983521, "loss": 1507.833, "step": 7040 }, { "ce_loss_12": 3.3880650877952574, "ce_loss_17": 3.0465856552124024, "ce_loss_23": 2.917467784881592, "ce_loss_3": 4.1669586300849915, "ce_loss_6": 3.874874770641327, "epoch": 0.705, "grad_norm": 992.0, "kl_loss_12": 1080.733804321289, "kl_loss_17": 295.1897285461426, "kl_loss_3": 2654.74990234375, "kl_loss_6": 2066.9548767089846, "learning_rate": 0.00020354603547267987, "loss": 1535.6387, "step": 7050 }, { "ce_loss_12": 3.3857168674468996, "ce_loss_17": 3.0348952293395994, "ce_loss_23": 2.9074413180351257, "ce_loss_3": 4.179165804386139, "ce_loss_6": 3.8724570512771606, "epoch": 0.706, "grad_norm": 1152.0, "kl_loss_12": 1086.6515533447266, "kl_loss_17": 293.63476943969727, "kl_loss_3": 2683.95380859375, "kl_loss_6": 2069.7790588378907, "learning_rate": 0.00020226983777365604, "loss": 1563.2762, "step": 7060 }, { "ce_loss_12": 3.2863266706466674, "ce_loss_17": 2.9430976510047913, "ce_loss_23": 2.8257482767105104, "ce_loss_3": 4.097273468971252, "ce_loss_6": 3.79554363489151, "epoch": 0.707, "grad_norm": 1328.0, "kl_loss_12": 1056.6638458251953, "kl_loss_17": 279.99397430419924, "kl_loss_3": 2690.1689331054686, "kl_loss_6": 2093.4708618164063, "learning_rate": 0.00020099663821406056, "loss": 1516.3141, "step": 7070 }, { "ce_loss_12": 3.364639139175415, "ce_loss_17": 3.0360579252243043, "ce_loss_23": 2.914643609523773, "ce_loss_3": 4.128908836841584, "ce_loss_6": 3.832869017124176, "epoch": 0.708, "grad_norm": 1608.0, "kl_loss_12": 1038.0157989501954, "kl_loss_17": 278.99704666137694, "kl_loss_3": 2584.8697265625, "kl_loss_6": 1995.2526916503907, "learning_rate": 0.00019972644961499853, "loss": 1513.3617, "step": 7080 }, { "ce_loss_12": 3.3603764891624452, "ce_loss_17": 3.01292085647583, "ce_loss_23": 2.8856699466705322, "ce_loss_3": 4.167693400382996, "ce_loss_6": 3.852894461154938, "epoch": 0.709, "grad_norm": 916.0, "kl_loss_12": 1086.388168334961, "kl_loss_17": 295.21505279541014, "kl_loss_3": 2708.575646972656, "kl_loss_6": 2087.249658203125, "learning_rate": 0.00019845928476725522, "loss": 1522.1769, "step": 7090 }, { "ce_loss_12": 3.425241839885712, "ce_loss_17": 3.0817944526672365, "ce_loss_23": 2.9560640811920167, "ce_loss_3": 4.197418344020844, "ce_loss_6": 3.898772180080414, "epoch": 0.71, "grad_norm": 1272.0, "kl_loss_12": 1077.3139404296876, "kl_loss_17": 289.7147720336914, "kl_loss_3": 2628.4650268554688, "kl_loss_6": 2023.9876220703125, "learning_rate": 0.00019719515643116677, "loss": 1551.5217, "step": 7100 }, { "ce_loss_12": 3.347544848918915, "ce_loss_17": 3.016925871372223, "ce_loss_23": 2.8971351742744447, "ce_loss_3": 4.12117292881012, "ce_loss_6": 3.8190558791160583, "epoch": 0.711, "grad_norm": 1040.0, "kl_loss_12": 1045.0277587890625, "kl_loss_17": 283.63589630126955, "kl_loss_3": 2617.9817504882812, "kl_loss_6": 2007.0311584472656, "learning_rate": 0.0001959340773364911, "loss": 1514.3187, "step": 7110 }, { "ce_loss_12": 3.3740314126014708, "ce_loss_17": 3.03394775390625, "ce_loss_23": 2.911791443824768, "ce_loss_3": 4.163285481929779, "ce_loss_6": 3.8552634716033936, "epoch": 0.712, "grad_norm": 940.0, "kl_loss_12": 1061.135043334961, "kl_loss_17": 286.66785583496096, "kl_loss_3": 2647.994091796875, "kl_loss_6": 2041.245782470703, "learning_rate": 0.0001946760601822809, "loss": 1487.3021, "step": 7120 }, { "ce_loss_12": 3.414544379711151, "ce_loss_17": 3.0850356340408327, "ce_loss_23": 2.96371386051178, "ce_loss_3": 4.186938786506653, "ce_loss_6": 3.8713775396347048, "epoch": 0.713, "grad_norm": 1088.0, "kl_loss_12": 1043.7035827636719, "kl_loss_17": 281.5654457092285, "kl_loss_3": 2607.409875488281, "kl_loss_6": 1985.3408264160157, "learning_rate": 0.00019342111763675512, "loss": 1466.5036, "step": 7130 }, { "ce_loss_12": 3.4078126668930055, "ce_loss_17": 3.082154428958893, "ce_loss_23": 2.9556745529174804, "ce_loss_3": 4.162794458866119, "ce_loss_6": 3.864700162410736, "epoch": 0.714, "grad_norm": 980.0, "kl_loss_12": 1041.3136169433594, "kl_loss_17": 285.2400405883789, "kl_loss_3": 2559.408154296875, "kl_loss_6": 1978.7324157714843, "learning_rate": 0.00019216926233717085, "loss": 1469.756, "step": 7140 }, { "ce_loss_12": 3.324468493461609, "ce_loss_17": 2.9787415981292726, "ce_loss_23": 2.858863890171051, "ce_loss_3": 4.164292752742767, "ce_loss_6": 3.86098655462265, "epoch": 0.715, "grad_norm": 1328.0, "kl_loss_12": 1068.1288116455078, "kl_loss_17": 280.78184204101564, "kl_loss_3": 2746.748254394531, "kl_loss_6": 2145.286444091797, "learning_rate": 0.00019092050688969737, "loss": 1541.5396, "step": 7150 }, { "ce_loss_12": 3.3683008909225465, "ce_loss_17": 3.042057716846466, "ce_loss_23": 2.923891615867615, "ce_loss_3": 4.135058331489563, "ce_loss_6": 3.8425532221794128, "epoch": 0.716, "grad_norm": 1192.0, "kl_loss_12": 1041.178155517578, "kl_loss_17": 280.4268173217773, "kl_loss_3": 2604.3657470703124, "kl_loss_6": 2012.2357360839844, "learning_rate": 0.00018967486386928817, "loss": 1485.7764, "step": 7160 }, { "ce_loss_12": 3.2786318182945253, "ce_loss_17": 2.9329994559288024, "ce_loss_23": 2.8096609830856325, "ce_loss_3": 4.083643531799316, "ce_loss_6": 3.772729206085205, "epoch": 0.717, "grad_norm": 964.0, "kl_loss_12": 1074.2320251464844, "kl_loss_17": 287.8572372436523, "kl_loss_3": 2699.5654418945314, "kl_loss_6": 2074.710577392578, "learning_rate": 0.00018843234581955443, "loss": 1567.7979, "step": 7170 }, { "ce_loss_12": 3.285939705371857, "ce_loss_17": 2.938218724727631, "ce_loss_23": 2.8121312975883486, "ce_loss_3": 4.080737113952637, "ce_loss_6": 3.7792991995811462, "epoch": 0.718, "grad_norm": 1192.0, "kl_loss_12": 1083.5157440185546, "kl_loss_17": 289.6617492675781, "kl_loss_3": 2676.434375, "kl_loss_6": 2066.987451171875, "learning_rate": 0.00018719296525263924, "loss": 1525.8836, "step": 7180 }, { "ce_loss_12": 3.350790059566498, "ce_loss_17": 3.0248377323150635, "ce_loss_23": 2.9074713706970217, "ce_loss_3": 4.104654264450073, "ce_loss_6": 3.807921862602234, "epoch": 0.719, "grad_norm": 1464.0, "kl_loss_12": 1024.6246978759766, "kl_loss_17": 282.37331466674806, "kl_loss_3": 2545.300085449219, "kl_loss_6": 1961.3479431152343, "learning_rate": 0.0001859567346490913, "loss": 1467.8516, "step": 7190 }, { "ce_loss_12": 3.356687808036804, "ce_loss_17": 3.021155858039856, "ce_loss_23": 2.8927383184432984, "ce_loss_3": 4.146722996234894, "ce_loss_6": 3.845209336280823, "epoch": 0.72, "grad_norm": 1056.0, "kl_loss_12": 1070.720932006836, "kl_loss_17": 292.14399642944335, "kl_loss_3": 2671.309716796875, "kl_loss_6": 2063.0775756835938, "learning_rate": 0.0001847236664577389, "loss": 1501.4645, "step": 7200 }, { "ce_loss_12": 3.3498232841491697, "ce_loss_17": 3.0289518237113953, "ce_loss_23": 2.9100707530975343, "ce_loss_3": 4.10675413608551, "ce_loss_6": 3.8027848839759826, "epoch": 0.721, "grad_norm": 992.0, "kl_loss_12": 1024.908447265625, "kl_loss_17": 280.13957595825195, "kl_loss_3": 2547.7384521484373, "kl_loss_6": 1949.222998046875, "learning_rate": 0.00018349377309556487, "loss": 1461.0233, "step": 7210 }, { "ce_loss_12": 3.327030324935913, "ce_loss_17": 2.983788788318634, "ce_loss_23": 2.8595494508743284, "ce_loss_3": 4.154975938796997, "ce_loss_6": 3.8437779188156127, "epoch": 0.722, "grad_norm": 1336.0, "kl_loss_12": 1097.6743377685548, "kl_loss_17": 291.4554931640625, "kl_loss_3": 2756.4244140625, "kl_loss_6": 2143.5498779296877, "learning_rate": 0.00018226706694758193, "loss": 1543.6838, "step": 7220 }, { "ce_loss_12": 3.3858094096183775, "ce_loss_17": 3.0542941093444824, "ce_loss_23": 2.9364635705947877, "ce_loss_3": 4.162681579589844, "ce_loss_6": 3.8647823333740234, "epoch": 0.723, "grad_norm": 1000.0, "kl_loss_12": 1060.308575439453, "kl_loss_17": 282.78950729370115, "kl_loss_3": 2636.62216796875, "kl_loss_6": 2041.1465759277344, "learning_rate": 0.0001810435603667075, "loss": 1547.8955, "step": 7230 }, { "ce_loss_12": 3.253883945941925, "ce_loss_17": 2.911012315750122, "ce_loss_23": 2.788108789920807, "ce_loss_3": 4.047508549690247, "ce_loss_6": 3.7393125534057616, "epoch": 0.724, "grad_norm": 872.0, "kl_loss_12": 1055.8319091796875, "kl_loss_17": 280.26491088867186, "kl_loss_3": 2649.6712646484375, "kl_loss_6": 2038.39619140625, "learning_rate": 0.0001798232656736389, "loss": 1537.725, "step": 7240 }, { "ce_loss_12": 3.3978039264678954, "ce_loss_17": 3.0690760612487793, "ce_loss_23": 2.9458399534225466, "ce_loss_3": 4.148194074630737, "ce_loss_6": 3.857631707191467, "epoch": 0.725, "grad_norm": 1032.0, "kl_loss_12": 1028.905044555664, "kl_loss_17": 281.8951362609863, "kl_loss_3": 2543.316760253906, "kl_loss_6": 1961.9904541015626, "learning_rate": 0.0001786061951567303, "loss": 1484.9909, "step": 7250 }, { "ce_loss_12": 3.33192777633667, "ce_loss_17": 2.991592597961426, "ce_loss_23": 2.866880714893341, "ce_loss_3": 4.11643146276474, "ce_loss_6": 3.814874029159546, "epoch": 0.726, "grad_norm": 1072.0, "kl_loss_12": 1066.6974670410157, "kl_loss_17": 289.0210906982422, "kl_loss_3": 2635.8052734375, "kl_loss_6": 2039.5341430664062, "learning_rate": 0.00017739236107186857, "loss": 1521.1975, "step": 7260 }, { "ce_loss_12": 3.396472692489624, "ce_loss_17": 3.0823714971542358, "ce_loss_23": 2.963548684120178, "ce_loss_3": 4.141474437713623, "ce_loss_6": 3.8460238099098207, "epoch": 0.727, "grad_norm": 1104.0, "kl_loss_12": 1017.0243896484375, "kl_loss_17": 274.48282318115236, "kl_loss_3": 2524.86728515625, "kl_loss_6": 1935.4310913085938, "learning_rate": 0.00017618177564234904, "loss": 1473.3111, "step": 7270 }, { "ce_loss_12": 3.3675931453704835, "ce_loss_17": 3.044634234905243, "ce_loss_23": 2.9322256088256835, "ce_loss_3": 4.122310245037079, "ce_loss_6": 3.817670261859894, "epoch": 0.728, "grad_norm": 980.0, "kl_loss_12": 1010.9489959716797, "kl_loss_17": 269.3266403198242, "kl_loss_3": 2518.7998168945314, "kl_loss_6": 1925.0574768066406, "learning_rate": 0.00017497445105875377, "loss": 1465.2238, "step": 7280 }, { "ce_loss_12": 3.3209853768348694, "ce_loss_17": 2.9679404616355898, "ce_loss_23": 2.8431422114372253, "ce_loss_3": 4.124039840698242, "ce_loss_6": 3.8073510766029357, "epoch": 0.729, "grad_norm": 1144.0, "kl_loss_12": 1086.3922210693358, "kl_loss_17": 289.9881004333496, "kl_loss_3": 2705.1694458007814, "kl_loss_6": 2082.1453247070312, "learning_rate": 0.000173770399478828, "loss": 1522.562, "step": 7290 }, { "ce_loss_12": 3.230670762062073, "ce_loss_17": 2.898967134952545, "ce_loss_23": 2.7818917870521545, "ce_loss_3": 4.020108902454377, "ce_loss_6": 3.709551203250885, "epoch": 0.73, "grad_norm": 1064.0, "kl_loss_12": 1040.022787475586, "kl_loss_17": 277.5796829223633, "kl_loss_3": 2642.2594360351563, "kl_loss_6": 2024.7518676757813, "learning_rate": 0.0001725696330273575, "loss": 1532.9043, "step": 7300 }, { "ce_loss_12": 3.392570424079895, "ce_loss_17": 3.0626121759414673, "ce_loss_23": 2.943638336658478, "ce_loss_3": 4.148163962364197, "ce_loss_6": 3.8440015435218813, "epoch": 0.731, "grad_norm": 1264.0, "kl_loss_12": 1028.6334869384766, "kl_loss_17": 275.8746925354004, "kl_loss_3": 2548.121789550781, "kl_loss_6": 1954.8319152832032, "learning_rate": 0.00017137216379604724, "loss": 1459.7328, "step": 7310 }, { "ce_loss_12": 3.284358024597168, "ce_loss_17": 2.9527658343315126, "ce_loss_23": 2.829926073551178, "ce_loss_3": 4.08096536397934, "ce_loss_6": 3.7757596254348753, "epoch": 0.732, "grad_norm": 1192.0, "kl_loss_12": 1043.1614227294922, "kl_loss_17": 281.0181144714355, "kl_loss_3": 2630.3141235351563, "kl_loss_6": 2029.7836059570313, "learning_rate": 0.00017017800384339925, "loss": 1507.2828, "step": 7320 }, { "ce_loss_12": 3.26470342874527, "ce_loss_17": 2.9141951203346252, "ce_loss_23": 2.7884278655052186, "ce_loss_3": 4.081666564941406, "ce_loss_6": 3.7667925357818604, "epoch": 0.733, "grad_norm": 1000.0, "kl_loss_12": 1087.1163970947266, "kl_loss_17": 285.4456298828125, "kl_loss_3": 2725.5192993164064, "kl_loss_6": 2103.689501953125, "learning_rate": 0.00016898716519459073, "loss": 1497.8559, "step": 7330 }, { "ce_loss_12": 3.3825608849525453, "ce_loss_17": 3.0331148505210876, "ce_loss_23": 2.900582027435303, "ce_loss_3": 4.195623028278351, "ce_loss_6": 3.8816211581230164, "epoch": 0.734, "grad_norm": 960.0, "kl_loss_12": 1088.0415496826172, "kl_loss_17": 299.24062423706056, "kl_loss_3": 2700.626806640625, "kl_loss_6": 2086.225689697266, "learning_rate": 0.00016779965984135375, "loss": 1519.9778, "step": 7340 }, { "ce_loss_12": 3.2890339851379395, "ce_loss_17": 2.952122926712036, "ce_loss_23": 2.82895188331604, "ce_loss_3": 4.083813881874084, "ce_loss_6": 3.780845284461975, "epoch": 0.735, "grad_norm": 1192.0, "kl_loss_12": 1042.695639038086, "kl_loss_17": 277.04451141357424, "kl_loss_3": 2625.741064453125, "kl_loss_6": 2032.5084655761718, "learning_rate": 0.00016661549974185424, "loss": 1498.1021, "step": 7350 }, { "ce_loss_12": 3.323700475692749, "ce_loss_17": 2.9858119606971742, "ce_loss_23": 2.862323749065399, "ce_loss_3": 4.097188651561737, "ce_loss_6": 3.7945112943649293, "epoch": 0.736, "grad_norm": 1024.0, "kl_loss_12": 1056.7614532470702, "kl_loss_17": 287.83737258911134, "kl_loss_3": 2622.837121582031, "kl_loss_6": 2023.2848693847657, "learning_rate": 0.00016543469682057105, "loss": 1481.7643, "step": 7360 }, { "ce_loss_12": 3.3511133074760435, "ce_loss_17": 3.00958833694458, "ce_loss_23": 2.8835976123809814, "ce_loss_3": 4.127915704250336, "ce_loss_6": 3.8203643321990968, "epoch": 0.737, "grad_norm": 976.0, "kl_loss_12": 1061.8063507080078, "kl_loss_17": 289.0400161743164, "kl_loss_3": 2634.0617919921874, "kl_loss_6": 2024.7839965820312, "learning_rate": 0.00016425726296817632, "loss": 1493.3889, "step": 7370 }, { "ce_loss_12": 3.3494356870651245, "ce_loss_17": 3.016697037220001, "ce_loss_23": 2.901347589492798, "ce_loss_3": 4.1259073138237, "ce_loss_6": 3.823378086090088, "epoch": 0.738, "grad_norm": 996.0, "kl_loss_12": 1033.489205932617, "kl_loss_17": 279.1275939941406, "kl_loss_3": 2588.6812133789062, "kl_loss_6": 1985.3832580566407, "learning_rate": 0.00016308321004141607, "loss": 1489.5334, "step": 7380 }, { "ce_loss_12": 3.3241341948509215, "ce_loss_17": 2.977702283859253, "ce_loss_23": 2.8510108828544616, "ce_loss_3": 4.11659916639328, "ce_loss_6": 3.8045485615730286, "epoch": 0.739, "grad_norm": 1320.0, "kl_loss_12": 1075.1243988037108, "kl_loss_17": 294.54233703613284, "kl_loss_3": 2660.0075805664064, "kl_loss_6": 2044.3505432128907, "learning_rate": 0.00016191254986299043, "loss": 1490.9283, "step": 7390 }, { "ce_loss_12": 3.331904947757721, "ce_loss_17": 3.0109179496765135, "ce_loss_23": 2.8957800030708314, "ce_loss_3": 4.101964116096497, "ce_loss_6": 3.8015185356140138, "epoch": 0.74, "grad_norm": 936.0, "kl_loss_12": 1030.34921875, "kl_loss_17": 273.9654281616211, "kl_loss_3": 2589.215710449219, "kl_loss_6": 1992.3321166992187, "learning_rate": 0.00016074529422143398, "loss": 1507.4629, "step": 7400 }, { "ce_loss_12": 3.321945583820343, "ce_loss_17": 2.9820314168930055, "ce_loss_23": 2.858848738670349, "ce_loss_3": 4.122950708866119, "ce_loss_6": 3.816438043117523, "epoch": 0.741, "grad_norm": 1020.0, "kl_loss_12": 1063.10849609375, "kl_loss_17": 290.5937843322754, "kl_loss_3": 2667.8557250976564, "kl_loss_6": 2064.332342529297, "learning_rate": 0.0001595814548709983, "loss": 1533.617, "step": 7410 }, { "ce_loss_12": 3.3886276841163636, "ce_loss_17": 3.0383246660232546, "ce_loss_23": 2.9127967834472654, "ce_loss_3": 4.178054928779602, "ce_loss_6": 3.8697713732719423, "epoch": 0.742, "grad_norm": 1296.0, "kl_loss_12": 1085.9440490722657, "kl_loss_17": 295.9751800537109, "kl_loss_3": 2689.67080078125, "kl_loss_6": 2086.7424377441407, "learning_rate": 0.00015842104353153285, "loss": 1522.5532, "step": 7420 }, { "ce_loss_12": 3.3868244290351868, "ce_loss_17": 3.0489043235778808, "ce_loss_23": 2.925636887550354, "ce_loss_3": 4.170488095283508, "ce_loss_6": 3.8697841644287108, "epoch": 0.743, "grad_norm": 1020.0, "kl_loss_12": 1065.0920440673829, "kl_loss_17": 289.10176544189454, "kl_loss_3": 2638.9310913085938, "kl_loss_6": 2043.6178588867188, "learning_rate": 0.0001572640718883667, "loss": 1531.0913, "step": 7430 }, { "ce_loss_12": 3.3096341848373414, "ce_loss_17": 2.987624776363373, "ce_loss_23": 2.872477889060974, "ce_loss_3": 4.082384061813355, "ce_loss_6": 3.7801726818084718, "epoch": 0.744, "grad_norm": 964.0, "kl_loss_12": 1032.089779663086, "kl_loss_17": 277.3216491699219, "kl_loss_3": 2569.769177246094, "kl_loss_6": 1975.4082946777344, "learning_rate": 0.0001561105515921915, "loss": 1511.5431, "step": 7440 }, { "ce_loss_12": 3.2099051237106324, "ce_loss_17": 2.857716774940491, "ce_loss_23": 2.7396645665168764, "ce_loss_3": 4.029573166370392, "ce_loss_6": 3.706012415885925, "epoch": 0.745, "grad_norm": 1120.0, "kl_loss_12": 1069.060369873047, "kl_loss_17": 277.27220916748047, "kl_loss_3": 2726.9617431640627, "kl_loss_6": 2093.1573852539063, "learning_rate": 0.0001549604942589441, "loss": 1509.512, "step": 7450 }, { "ce_loss_12": 3.341689133644104, "ce_loss_17": 3.0220096111297607, "ce_loss_23": 2.9066612124443054, "ce_loss_3": 4.086879503726959, "ce_loss_6": 3.8007105112075807, "epoch": 0.746, "grad_norm": 896.0, "kl_loss_12": 1009.4411987304687, "kl_loss_17": 271.61903076171876, "kl_loss_3": 2505.743408203125, "kl_loss_6": 1934.8906188964843, "learning_rate": 0.00015381391146968864, "loss": 1461.8856, "step": 7460 }, { "ce_loss_12": 3.333513391017914, "ce_loss_17": 2.9974658370018004, "ce_loss_23": 2.879487907886505, "ce_loss_3": 4.123572957515717, "ce_loss_6": 3.821442413330078, "epoch": 0.747, "grad_norm": 1136.0, "kl_loss_12": 1039.75849609375, "kl_loss_17": 274.57335205078124, "kl_loss_3": 2615.1775756835937, "kl_loss_6": 2020.2796203613282, "learning_rate": 0.00015267081477050133, "loss": 1504.0042, "step": 7470 }, { "ce_loss_12": 3.427445709705353, "ce_loss_17": 3.09232976436615, "ce_loss_23": 2.965824806690216, "ce_loss_3": 4.178172600269318, "ce_loss_6": 3.883107364177704, "epoch": 0.748, "grad_norm": 968.0, "kl_loss_12": 1057.313070678711, "kl_loss_17": 290.85567474365234, "kl_loss_3": 2577.955285644531, "kl_loss_6": 1986.2766052246093, "learning_rate": 0.00015153121567235335, "loss": 1470.3311, "step": 7480 }, { "ce_loss_12": 3.3238179087638855, "ce_loss_17": 2.9903075218200685, "ce_loss_23": 2.874490833282471, "ce_loss_3": 4.120837676525116, "ce_loss_6": 3.8117267370223997, "epoch": 0.749, "grad_norm": 984.0, "kl_loss_12": 1054.7883697509765, "kl_loss_17": 282.8294624328613, "kl_loss_3": 2665.5120239257812, "kl_loss_6": 2052.0541809082033, "learning_rate": 0.00015039512565099468, "loss": 1472.3645, "step": 7490 }, { "ce_loss_12": 3.3772895336151123, "ce_loss_17": 3.052878940105438, "ce_loss_23": 2.937324583530426, "ce_loss_3": 4.142999291419983, "ce_loss_6": 3.8434554815292357, "epoch": 0.75, "grad_norm": 988.0, "kl_loss_12": 1040.7686798095704, "kl_loss_17": 278.93787689208983, "kl_loss_3": 2590.99130859375, "kl_loss_6": 1991.1614868164063, "learning_rate": 0.00014926255614683932, "loss": 1538.4063, "step": 7500 }, { "ce_loss_12": 3.318462574481964, "ce_loss_17": 2.988450789451599, "ce_loss_23": 2.8678772807121278, "ce_loss_3": 4.096622204780578, "ce_loss_6": 3.796446645259857, "epoch": 0.751, "grad_norm": 1072.0, "kl_loss_12": 1038.6952239990235, "kl_loss_17": 280.92377548217775, "kl_loss_3": 2619.8683471679688, "kl_loss_6": 2017.0190490722657, "learning_rate": 0.0001481335185648498, "loss": 1501.4311, "step": 7510 }, { "ce_loss_12": 3.347504699230194, "ce_loss_17": 3.0147265672683714, "ce_loss_23": 2.8962690949440004, "ce_loss_3": 4.121276211738587, "ce_loss_6": 3.8162226915359496, "epoch": 0.752, "grad_norm": 1056.0, "kl_loss_12": 1048.723617553711, "kl_loss_17": 281.2362464904785, "kl_loss_3": 2618.826452636719, "kl_loss_6": 2011.4213256835938, "learning_rate": 0.0001470080242744218, "loss": 1482.2265, "step": 7520 }, { "ce_loss_12": 3.34172899723053, "ce_loss_17": 3.0092835426330566, "ce_loss_23": 2.897358810901642, "ce_loss_3": 4.1162315011024475, "ce_loss_6": 3.8220934510231017, "epoch": 0.753, "grad_norm": 924.0, "kl_loss_12": 1029.3202911376952, "kl_loss_17": 273.0403007507324, "kl_loss_3": 2603.049938964844, "kl_loss_6": 2020.111151123047, "learning_rate": 0.0001458860846092705, "loss": 1497.3177, "step": 7530 }, { "ce_loss_12": 3.378802788257599, "ce_loss_17": 3.055048716068268, "ce_loss_23": 2.9335278511047362, "ce_loss_3": 4.1248640537261965, "ce_loss_6": 3.829790270328522, "epoch": 0.754, "grad_norm": 1176.0, "kl_loss_12": 1033.308349609375, "kl_loss_17": 281.56811904907227, "kl_loss_3": 2530.413781738281, "kl_loss_6": 1945.8234558105469, "learning_rate": 0.00014476771086731566, "loss": 1448.7382, "step": 7540 }, { "ce_loss_12": 3.4659211158752443, "ce_loss_17": 3.1346755385398866, "ce_loss_23": 3.0065076112747193, "ce_loss_3": 4.226503646373748, "ce_loss_6": 3.93116614818573, "epoch": 0.755, "grad_norm": 1128.0, "kl_loss_12": 1052.2420623779296, "kl_loss_17": 292.08596420288086, "kl_loss_3": 2581.6936767578127, "kl_loss_6": 1997.2755920410157, "learning_rate": 0.00014365291431056872, "loss": 1518.8756, "step": 7550 }, { "ce_loss_12": 3.3224751710891725, "ce_loss_17": 2.9759204030036925, "ce_loss_23": 2.8496909141540527, "ce_loss_3": 4.109515285491943, "ce_loss_6": 3.801195240020752, "epoch": 0.756, "grad_norm": 904.0, "kl_loss_12": 1081.3564544677733, "kl_loss_17": 294.1281547546387, "kl_loss_3": 2679.63125, "kl_loss_6": 2064.4482421875, "learning_rate": 0.00014254170616501827, "loss": 1514.05, "step": 7560 }, { "ce_loss_12": 3.2967641592025756, "ce_loss_17": 2.930052900314331, "ce_loss_23": 2.7977853178977967, "ce_loss_3": 4.1107590913772585, "ce_loss_6": 3.7909518480300903, "epoch": 0.757, "grad_norm": 1020.0, "kl_loss_12": 1128.729232788086, "kl_loss_17": 298.5667533874512, "kl_loss_3": 2763.8598754882814, "kl_loss_6": 2126.74951171875, "learning_rate": 0.0001414340976205183, "loss": 1567.2646, "step": 7570 }, { "ce_loss_12": 3.280392253398895, "ce_loss_17": 2.9262807726860047, "ce_loss_23": 2.807630729675293, "ce_loss_3": 4.092207396030426, "ce_loss_6": 3.7799305081367494, "epoch": 0.758, "grad_norm": 1336.0, "kl_loss_12": 1065.8500915527343, "kl_loss_17": 281.43665313720703, "kl_loss_3": 2699.0353881835936, "kl_loss_6": 2083.4188720703123, "learning_rate": 0.00014033009983067452, "loss": 1506.1648, "step": 7580 }, { "ce_loss_12": 3.394823133945465, "ce_loss_17": 3.0780391693115234, "ce_loss_23": 2.9608801007270813, "ce_loss_3": 4.142879939079284, "ce_loss_6": 3.8521243810653685, "epoch": 0.759, "grad_norm": 1056.0, "kl_loss_12": 1015.8759643554688, "kl_loss_17": 273.7043281555176, "kl_loss_3": 2537.435986328125, "kl_loss_6": 1953.083233642578, "learning_rate": 0.00013922972391273224, "loss": 1475.9166, "step": 7590 }, { "ce_loss_12": 3.409343111515045, "ce_loss_17": 3.0831799149513244, "ce_loss_23": 2.961395597457886, "ce_loss_3": 4.194057762622833, "ce_loss_6": 3.8991695284843444, "epoch": 0.76, "grad_norm": 1544.0, "kl_loss_12": 1032.2313873291016, "kl_loss_17": 281.07324600219727, "kl_loss_3": 2602.2853759765626, "kl_loss_6": 2026.8900756835938, "learning_rate": 0.0001381329809474649, "loss": 1492.928, "step": 7600 }, { "ce_loss_12": 3.3551380395889283, "ce_loss_17": 3.0006386637687683, "ce_loss_23": 2.871221125125885, "ce_loss_3": 4.1550891399383545, "ce_loss_6": 3.8454134941101072, "epoch": 0.761, "grad_norm": 1184.0, "kl_loss_12": 1100.336163330078, "kl_loss_17": 294.1468795776367, "kl_loss_3": 2706.9808654785156, "kl_loss_6": 2096.7616455078123, "learning_rate": 0.0001370398819790621, "loss": 1533.9725, "step": 7610 }, { "ce_loss_12": 3.4408915281295775, "ce_loss_17": 3.116612160205841, "ce_loss_23": 2.9966283202171327, "ce_loss_3": 4.1998590469360355, "ce_loss_6": 3.9062315940856935, "epoch": 0.762, "grad_norm": 1088.0, "kl_loss_12": 1025.209161376953, "kl_loss_17": 280.3527404785156, "kl_loss_3": 2559.6067016601564, "kl_loss_6": 1977.9031372070312, "learning_rate": 0.00013595043801501794, "loss": 1465.2758, "step": 7620 }, { "ce_loss_12": 3.2940911531448362, "ce_loss_17": 2.9408129334449766, "ce_loss_23": 2.8113720297813414, "ce_loss_3": 4.135281682014465, "ce_loss_6": 3.812657952308655, "epoch": 0.763, "grad_norm": 1280.0, "kl_loss_12": 1105.1962646484376, "kl_loss_17": 294.04699935913084, "kl_loss_3": 2790.003955078125, "kl_loss_6": 2151.4091979980467, "learning_rate": 0.00013486466002602133, "loss": 1539.4232, "step": 7630 }, { "ce_loss_12": 3.3600721836090086, "ce_loss_17": 3.031892383098602, "ce_loss_23": 2.9156318426132204, "ce_loss_3": 4.1076583623886105, "ce_loss_6": 3.8144946694374084, "epoch": 0.764, "grad_norm": 984.0, "kl_loss_12": 1034.2011474609376, "kl_loss_17": 277.2654144287109, "kl_loss_3": 2565.522998046875, "kl_loss_6": 1969.8522705078126, "learning_rate": 0.00013378255894584462, "loss": 1514.8982, "step": 7640 }, { "ce_loss_12": 3.3211570501327516, "ce_loss_17": 2.979449915885925, "ce_loss_23": 2.8517404437065124, "ce_loss_3": 4.130272006988525, "ce_loss_6": 3.8186199069023132, "epoch": 0.765, "grad_norm": 1144.0, "kl_loss_12": 1071.8350494384765, "kl_loss_17": 290.4520797729492, "kl_loss_3": 2694.0873901367186, "kl_loss_6": 2079.6744750976563, "learning_rate": 0.0001327041456712334, "loss": 1524.7129, "step": 7650 }, { "ce_loss_12": 3.353261423110962, "ce_loss_17": 3.0194068431854246, "ce_loss_23": 2.89362518787384, "ce_loss_3": 4.14197006225586, "ce_loss_6": 3.8328930497169496, "epoch": 0.766, "grad_norm": 1304.0, "kl_loss_12": 1062.2486907958985, "kl_loss_17": 287.6679061889648, "kl_loss_3": 2641.623046875, "kl_loss_6": 2034.7803100585938, "learning_rate": 0.00013162943106179747, "loss": 1519.9464, "step": 7660 }, { "ce_loss_12": 3.3228484749794007, "ce_loss_17": 2.9905929923057557, "ce_loss_23": 2.8718537211418154, "ce_loss_3": 4.084289968013763, "ce_loss_6": 3.7856014490127565, "epoch": 0.767, "grad_norm": 976.0, "kl_loss_12": 1042.566226196289, "kl_loss_17": 278.72392959594725, "kl_loss_3": 2588.8190795898436, "kl_loss_6": 1995.1788024902344, "learning_rate": 0.00013055842593990132, "loss": 1492.6238, "step": 7670 }, { "ce_loss_12": 3.2804736375808714, "ce_loss_17": 2.943773651123047, "ce_loss_23": 2.8240336775779724, "ce_loss_3": 4.059450459480286, "ce_loss_6": 3.747068452835083, "epoch": 0.768, "grad_norm": 992.0, "kl_loss_12": 1031.083804321289, "kl_loss_17": 278.0084037780762, "kl_loss_3": 2579.5146728515624, "kl_loss_6": 1981.6210876464843, "learning_rate": 0.00012949114109055414, "loss": 1509.4066, "step": 7680 }, { "ce_loss_12": 3.325702929496765, "ce_loss_17": 2.985046076774597, "ce_loss_23": 2.8618072509765624, "ce_loss_3": 4.106758785247803, "ce_loss_6": 3.806333029270172, "epoch": 0.769, "grad_norm": 1032.0, "kl_loss_12": 1057.625018310547, "kl_loss_17": 287.085506439209, "kl_loss_3": 2639.7965576171873, "kl_loss_6": 2039.7977416992187, "learning_rate": 0.00012842758726130281, "loss": 1521.547, "step": 7690 }, { "ce_loss_12": 3.3817939877510073, "ce_loss_17": 3.0320732951164246, "ce_loss_23": 2.9052831649780275, "ce_loss_3": 4.1872913479805, "ce_loss_6": 3.873805296421051, "epoch": 0.77, "grad_norm": 1048.0, "kl_loss_12": 1073.136459350586, "kl_loss_17": 289.16695938110354, "kl_loss_3": 2685.2822509765624, "kl_loss_6": 2067.8914428710937, "learning_rate": 0.00012736777516212267, "loss": 1502.7867, "step": 7700 }, { "ce_loss_12": 3.3680408596992493, "ce_loss_17": 3.027958834171295, "ce_loss_23": 2.9010915637016295, "ce_loss_3": 4.150628173351288, "ce_loss_6": 3.8480496644973754, "epoch": 0.771, "grad_norm": 1200.0, "kl_loss_12": 1072.5316314697266, "kl_loss_17": 292.39178771972655, "kl_loss_3": 2632.5837646484374, "kl_loss_6": 2040.2246887207032, "learning_rate": 0.00012631171546530968, "loss": 1486.1639, "step": 7710 }, { "ce_loss_12": 3.3786956071853638, "ce_loss_17": 3.0343061208724977, "ce_loss_23": 2.906955349445343, "ce_loss_3": 4.14276841878891, "ce_loss_6": 3.8478384733200075, "epoch": 0.772, "grad_norm": 1312.0, "kl_loss_12": 1076.8528961181642, "kl_loss_17": 291.54980926513673, "kl_loss_3": 2633.0797729492188, "kl_loss_6": 2043.3614379882813, "learning_rate": 0.00012525941880537307, "loss": 1524.2413, "step": 7720 }, { "ce_loss_12": 3.398939514160156, "ce_loss_17": 3.0631134629249575, "ce_loss_23": 2.942670977115631, "ce_loss_3": 4.171182703971863, "ce_loss_6": 3.86375207901001, "epoch": 0.773, "grad_norm": 1152.0, "kl_loss_12": 1049.3153106689454, "kl_loss_17": 280.57236404418944, "kl_loss_3": 2612.383923339844, "kl_loss_6": 1999.32705078125, "learning_rate": 0.00012421089577892869, "loss": 1495.2975, "step": 7730 }, { "ce_loss_12": 3.361254799365997, "ce_loss_17": 3.017450821399689, "ce_loss_23": 2.8957334995269775, "ce_loss_3": 4.158638286590576, "ce_loss_6": 3.8432446360588073, "epoch": 0.774, "grad_norm": 1012.0, "kl_loss_12": 1077.780435180664, "kl_loss_17": 284.0028190612793, "kl_loss_3": 2675.105017089844, "kl_loss_6": 2050.918780517578, "learning_rate": 0.0001231661569445919, "loss": 1516.5686, "step": 7740 }, { "ce_loss_12": 3.2308457136154174, "ce_loss_17": 2.890022850036621, "ce_loss_23": 2.7693028926849363, "ce_loss_3": 4.028452098369598, "ce_loss_6": 3.714640963077545, "epoch": 0.775, "grad_norm": 976.0, "kl_loss_12": 1057.9297485351562, "kl_loss_17": 286.21061248779296, "kl_loss_3": 2668.4592529296874, "kl_loss_6": 2049.3302795410154, "learning_rate": 0.00012212521282287093, "loss": 1540.696, "step": 7750 }, { "ce_loss_12": 3.3653706312179565, "ce_loss_17": 3.0217082381248472, "ce_loss_23": 2.8978288531303407, "ce_loss_3": 4.125185596942901, "ce_loss_6": 3.829509603977203, "epoch": 0.776, "grad_norm": 1064.0, "kl_loss_12": 1065.6006988525392, "kl_loss_17": 287.18448333740236, "kl_loss_3": 2599.796826171875, "kl_loss_6": 2009.5392761230469, "learning_rate": 0.00012108807389606158, "loss": 1524.3297, "step": 7760 }, { "ce_loss_12": 3.354585552215576, "ce_loss_17": 3.024714708328247, "ce_loss_23": 2.90908796787262, "ce_loss_3": 4.133483743667602, "ce_loss_6": 3.8332683205604554, "epoch": 0.777, "grad_norm": 1008.0, "kl_loss_12": 1044.9940063476563, "kl_loss_17": 277.4327453613281, "kl_loss_3": 2609.7527587890627, "kl_loss_6": 2017.0435913085937, "learning_rate": 0.00012005475060814159, "loss": 1490.6898, "step": 7770 }, { "ce_loss_12": 3.3110827565193177, "ce_loss_17": 2.962108778953552, "ce_loss_23": 2.842651915550232, "ce_loss_3": 4.103477931022644, "ce_loss_6": 3.805611324310303, "epoch": 0.778, "grad_norm": 1192.0, "kl_loss_12": 1077.4869171142577, "kl_loss_17": 285.6055603027344, "kl_loss_3": 2677.163195800781, "kl_loss_6": 2081.893310546875, "learning_rate": 0.00011902525336466464, "loss": 1515.6557, "step": 7780 }, { "ce_loss_12": 3.309223008155823, "ce_loss_17": 2.957816791534424, "ce_loss_23": 2.826165294647217, "ce_loss_3": 4.1195541501045225, "ce_loss_6": 3.808503520488739, "epoch": 0.779, "grad_norm": 916.0, "kl_loss_12": 1092.967202758789, "kl_loss_17": 293.0207763671875, "kl_loss_3": 2728.019567871094, "kl_loss_6": 2111.1691650390626, "learning_rate": 0.00011799959253265668, "loss": 1524.7547, "step": 7790 }, { "ce_loss_12": 3.3427767872810366, "ce_loss_17": 3.005957913398743, "ce_loss_23": 2.8837684988975525, "ce_loss_3": 4.141287219524384, "ce_loss_6": 3.8301820755004883, "epoch": 0.78, "grad_norm": 1004.0, "kl_loss_12": 1063.709521484375, "kl_loss_17": 287.99907302856445, "kl_loss_3": 2670.677001953125, "kl_loss_6": 2058.465539550781, "learning_rate": 0.00011697777844051105, "loss": 1512.9553, "step": 7800 }, { "ce_loss_12": 3.34201123714447, "ce_loss_17": 3.0019322752952577, "ce_loss_23": 2.875227212905884, "ce_loss_3": 4.166788375377655, "ce_loss_6": 3.8613690376281737, "epoch": 0.781, "grad_norm": 960.0, "kl_loss_12": 1070.0397857666017, "kl_loss_17": 291.10598678588866, "kl_loss_3": 2726.98310546875, "kl_loss_6": 2118.020654296875, "learning_rate": 0.00011595982137788402, "loss": 1532.2738, "step": 7810 }, { "ce_loss_12": 3.297192394733429, "ce_loss_17": 2.974992501735687, "ce_loss_23": 2.8550528407096865, "ce_loss_3": 4.067392218112945, "ce_loss_6": 3.7673613667488097, "epoch": 0.782, "grad_norm": 1496.0, "kl_loss_12": 1030.9745788574219, "kl_loss_17": 277.9621757507324, "kl_loss_3": 2583.620556640625, "kl_loss_6": 1984.6390380859375, "learning_rate": 0.00011494573159559212, "loss": 1494.2168, "step": 7820 }, { "ce_loss_12": 3.305099880695343, "ce_loss_17": 2.9622052550315856, "ce_loss_23": 2.839592659473419, "ce_loss_3": 4.081231749057769, "ce_loss_6": 3.786455988883972, "epoch": 0.783, "grad_norm": 1000.0, "kl_loss_12": 1064.5903106689452, "kl_loss_17": 289.8975769042969, "kl_loss_3": 2638.9677490234376, "kl_loss_6": 2053.720361328125, "learning_rate": 0.00011393551930550828, "loss": 1541.4899, "step": 7830 }, { "ce_loss_12": 3.4056118726730347, "ce_loss_17": 3.078857898712158, "ce_loss_23": 2.9554930329322815, "ce_loss_3": 4.180590558052063, "ce_loss_6": 3.875455212593079, "epoch": 0.784, "grad_norm": 1144.0, "kl_loss_12": 1038.494302368164, "kl_loss_17": 284.370703125, "kl_loss_3": 2591.5422241210936, "kl_loss_6": 1985.3228637695313, "learning_rate": 0.00011292919468045875, "loss": 1485.5578, "step": 7840 }, { "ce_loss_12": 3.3782568097114565, "ce_loss_17": 3.0423492789268494, "ce_loss_23": 2.9200599431991576, "ce_loss_3": 4.162760663032532, "ce_loss_6": 3.854156756401062, "epoch": 0.785, "grad_norm": 1096.0, "kl_loss_12": 1049.5641845703126, "kl_loss_17": 285.1061187744141, "kl_loss_3": 2621.591650390625, "kl_loss_6": 2012.33037109375, "learning_rate": 0.00011192676785412154, "loss": 1485.7816, "step": 7850 }, { "ce_loss_12": 3.3352497935295107, "ce_loss_17": 2.9882391810417177, "ce_loss_23": 2.8606590270996093, "ce_loss_3": 4.143488276004791, "ce_loss_6": 3.829927217960358, "epoch": 0.786, "grad_norm": 1248.0, "kl_loss_12": 1070.5537658691405, "kl_loss_17": 288.625439453125, "kl_loss_3": 2696.9665283203126, "kl_loss_6": 2076.2456298828124, "learning_rate": 0.00011092824892092374, "loss": 1522.1464, "step": 7860 }, { "ce_loss_12": 3.2732455015182493, "ce_loss_17": 2.924168884754181, "ce_loss_23": 2.8001551151275637, "ce_loss_3": 4.080106019973755, "ce_loss_6": 3.7724220633506773, "epoch": 0.787, "grad_norm": 1144.0, "kl_loss_12": 1081.1447296142578, "kl_loss_17": 282.0930236816406, "kl_loss_3": 2704.5709228515625, "kl_loss_6": 2092.5669921875, "learning_rate": 0.0001099336479359398, "loss": 1509.2197, "step": 7870 }, { "ce_loss_12": 3.3644967079162598, "ce_loss_17": 3.0397923231124877, "ce_loss_23": 2.9192729711532595, "ce_loss_3": 4.132528138160706, "ce_loss_6": 3.829935443401337, "epoch": 0.788, "grad_norm": 1064.0, "kl_loss_12": 1039.1475769042968, "kl_loss_17": 280.9910743713379, "kl_loss_3": 2589.2807373046876, "kl_loss_6": 1985.3061462402343, "learning_rate": 0.00010894297491479043, "loss": 1499.0057, "step": 7880 }, { "ce_loss_12": 3.3554076671600344, "ce_loss_17": 3.021135950088501, "ce_loss_23": 2.901791679859161, "ce_loss_3": 4.140153288841248, "ce_loss_6": 3.8371911525726317, "epoch": 0.789, "grad_norm": 1096.0, "kl_loss_12": 1053.7805969238282, "kl_loss_17": 281.94963760375975, "kl_loss_3": 2625.732043457031, "kl_loss_6": 2028.555682373047, "learning_rate": 0.00010795623983354214, "loss": 1492.3039, "step": 7890 }, { "ce_loss_12": 3.283448374271393, "ce_loss_17": 2.9408915996551515, "ce_loss_23": 2.8099642992019653, "ce_loss_3": 4.072676730155945, "ce_loss_6": 3.7639185547828675, "epoch": 0.79, "grad_norm": 1128.0, "kl_loss_12": 1075.9224853515625, "kl_loss_17": 296.4463150024414, "kl_loss_3": 2683.721142578125, "kl_loss_6": 2063.581365966797, "learning_rate": 0.00010697345262860636, "loss": 1513.3955, "step": 7900 }, { "ce_loss_12": 3.3812041759490965, "ce_loss_17": 3.059047210216522, "ce_loss_23": 2.9412465929985045, "ce_loss_3": 4.161124658584595, "ce_loss_6": 3.8484493136405944, "epoch": 0.791, "grad_norm": 1208.0, "kl_loss_12": 1039.3051513671876, "kl_loss_17": 281.3121871948242, "kl_loss_3": 2603.398046875, "kl_loss_6": 1995.908465576172, "learning_rate": 0.00010599462319663906, "loss": 1477.8745, "step": 7910 }, { "ce_loss_12": 3.3532153487205507, "ce_loss_17": 3.0290448904037475, "ce_loss_23": 2.9121936798095702, "ce_loss_3": 4.107381510734558, "ce_loss_6": 3.80359365940094, "epoch": 0.792, "grad_norm": 1120.0, "kl_loss_12": 1027.3702545166016, "kl_loss_17": 278.64846115112306, "kl_loss_3": 2552.5097900390624, "kl_loss_6": 1950.554150390625, "learning_rate": 0.00010501976139444191, "loss": 1462.499, "step": 7920 }, { "ce_loss_12": 3.379141628742218, "ce_loss_17": 3.0500821232795716, "ce_loss_23": 2.9308634400367737, "ce_loss_3": 4.141736924648285, "ce_loss_6": 3.85120313167572, "epoch": 0.793, "grad_norm": 1080.0, "kl_loss_12": 1031.612240600586, "kl_loss_17": 277.12482299804685, "kl_loss_3": 2567.5821899414063, "kl_loss_6": 1991.542919921875, "learning_rate": 0.0001040488770388625, "loss": 1497.1086, "step": 7930 }, { "ce_loss_12": 3.3468173861503603, "ce_loss_17": 3.0071908950805666, "ce_loss_23": 2.8908491253852846, "ce_loss_3": 4.129796278476715, "ce_loss_6": 3.827104127407074, "epoch": 0.794, "grad_norm": 1144.0, "kl_loss_12": 1059.5995666503907, "kl_loss_17": 281.8688194274902, "kl_loss_3": 2650.525427246094, "kl_loss_6": 2047.058837890625, "learning_rate": 0.00010308197990669538, "loss": 1497.3817, "step": 7940 }, { "ce_loss_12": 3.4481312155723574, "ce_loss_17": 3.1162644386291505, "ce_loss_23": 2.9944263815879824, "ce_loss_3": 4.223077440261841, "ce_loss_6": 3.91604346036911, "epoch": 0.795, "grad_norm": 1208.0, "kl_loss_12": 1061.0736083984375, "kl_loss_17": 288.20682525634766, "kl_loss_3": 2627.3918579101564, "kl_loss_6": 2017.8263366699218, "learning_rate": 0.0001021190797345839, "loss": 1486.8842, "step": 7950 }, { "ce_loss_12": 3.2265631914138795, "ce_loss_17": 2.8669999957084658, "ce_loss_23": 2.7341838479042053, "ce_loss_3": 4.046359026432038, "ce_loss_6": 3.724623703956604, "epoch": 0.796, "grad_norm": 1056.0, "kl_loss_12": 1112.0119140625, "kl_loss_17": 297.9104400634766, "kl_loss_3": 2748.6629150390627, "kl_loss_6": 2113.977429199219, "learning_rate": 0.00010116018621892236, "loss": 1528.2286, "step": 7960 }, { "ce_loss_12": 3.4046633005142213, "ce_loss_17": 3.0650864958763124, "ce_loss_23": 2.9383347630500793, "ce_loss_3": 4.186934053897858, "ce_loss_6": 3.8826000571250914, "epoch": 0.797, "grad_norm": 1024.0, "kl_loss_12": 1086.240530395508, "kl_loss_17": 299.4277030944824, "kl_loss_3": 2658.250390625, "kl_loss_6": 2064.7257446289063, "learning_rate": 0.00010020530901575753, "loss": 1481.2242, "step": 7970 }, { "ce_loss_12": 3.411669981479645, "ce_loss_17": 3.077830362319946, "ce_loss_23": 2.9566885828971863, "ce_loss_3": 4.1832381844520565, "ce_loss_6": 3.87999769449234, "epoch": 0.798, "grad_norm": 856.0, "kl_loss_12": 1062.4724334716798, "kl_loss_17": 287.22545928955077, "kl_loss_3": 2635.3749145507813, "kl_loss_6": 2028.0968811035157, "learning_rate": 9.925445774069231e-05, "loss": 1476.0003, "step": 7980 }, { "ce_loss_12": 3.3718875646591187, "ce_loss_17": 3.032572162151337, "ce_loss_23": 2.9057837605476378, "ce_loss_3": 4.142505764961243, "ce_loss_6": 3.843195044994354, "epoch": 0.799, "grad_norm": 916.0, "kl_loss_12": 1043.4041320800782, "kl_loss_17": 285.2698211669922, "kl_loss_3": 2588.7929077148438, "kl_loss_6": 1991.7592468261719, "learning_rate": 9.830764196878872e-05, "loss": 1457.0741, "step": 7990 }, { "ce_loss_12": 3.3205261707305906, "ce_loss_17": 2.9875508427619932, "ce_loss_23": 2.8695124864578245, "ce_loss_3": 4.107181358337402, "ce_loss_6": 3.799295961856842, "epoch": 0.8, "grad_norm": 1024.0, "kl_loss_12": 1056.3955352783203, "kl_loss_17": 278.84816665649413, "kl_loss_3": 2666.8331298828125, "kl_loss_6": 2049.211877441406, "learning_rate": 9.736487123447069e-05, "loss": 1501.2203, "step": 8000 }, { "ce_loss_12": 3.2938269853591917, "ce_loss_17": 2.941650378704071, "ce_loss_23": 2.8178758025169373, "ce_loss_3": 4.113538646697998, "ce_loss_6": 3.8093990683555603, "epoch": 0.801, "grad_norm": 936.0, "kl_loss_12": 1090.296612548828, "kl_loss_17": 287.49616012573244, "kl_loss_3": 2765.3645263671874, "kl_loss_6": 2153.184539794922, "learning_rate": 9.642615503142926e-05, "loss": 1547.2773, "step": 8010 }, { "ce_loss_12": 3.33432058095932, "ce_loss_17": 2.996255397796631, "ce_loss_23": 2.8753136098384857, "ce_loss_3": 4.136964285373688, "ce_loss_6": 3.8324382066726685, "epoch": 0.802, "grad_norm": 1408.0, "kl_loss_12": 1055.0836669921875, "kl_loss_17": 283.3932373046875, "kl_loss_3": 2670.6496459960936, "kl_loss_6": 2061.926062011719, "learning_rate": 9.549150281252633e-05, "loss": 1494.2036, "step": 8020 }, { "ce_loss_12": 3.3622690558433534, "ce_loss_17": 3.023555374145508, "ce_loss_23": 2.8994755148887634, "ce_loss_3": 4.144161570072174, "ce_loss_6": 3.842393147945404, "epoch": 0.803, "grad_norm": 1336.0, "kl_loss_12": 1061.3064239501953, "kl_loss_17": 287.6574089050293, "kl_loss_3": 2649.08232421875, "kl_loss_6": 2050.1166259765623, "learning_rate": 9.4560923989699e-05, "loss": 1525.1096, "step": 8030 }, { "ce_loss_12": 3.35483433008194, "ce_loss_17": 3.019861912727356, "ce_loss_23": 2.8949792861938475, "ce_loss_3": 4.133480882644653, "ce_loss_6": 3.8327292680740355, "epoch": 0.804, "grad_norm": 1176.0, "kl_loss_12": 1054.61171875, "kl_loss_17": 288.03228759765625, "kl_loss_3": 2630.298449707031, "kl_loss_6": 2022.528057861328, "learning_rate": 9.363442793386607e-05, "loss": 1524.4778, "step": 8040 }, { "ce_loss_12": 3.344035971164703, "ce_loss_17": 2.9912624835968016, "ce_loss_23": 2.8620379090309145, "ce_loss_3": 4.154768025875091, "ce_loss_6": 3.839399552345276, "epoch": 0.805, "grad_norm": 1192.0, "kl_loss_12": 1087.6466186523437, "kl_loss_17": 291.8809112548828, "kl_loss_3": 2700.2523193359375, "kl_loss_6": 2080.220086669922, "learning_rate": 9.271202397483213e-05, "loss": 1492.3334, "step": 8050 }, { "ce_loss_12": 3.3396072030067443, "ce_loss_17": 3.0182195901870728, "ce_loss_23": 2.898484635353088, "ce_loss_3": 4.109511208534241, "ce_loss_6": 3.807023787498474, "epoch": 0.806, "grad_norm": 1008.0, "kl_loss_12": 1032.6840789794921, "kl_loss_17": 277.4155899047852, "kl_loss_3": 2581.357275390625, "kl_loss_6": 1988.5574157714843, "learning_rate": 9.179372140119524e-05, "loss": 1505.3848, "step": 8060 }, { "ce_loss_12": 3.3005812406539916, "ce_loss_17": 2.966699254512787, "ce_loss_23": 2.8480862021446227, "ce_loss_3": 4.078098475933075, "ce_loss_6": 3.7792740941047667, "epoch": 0.807, "grad_norm": 1120.0, "kl_loss_12": 1046.3826721191406, "kl_loss_17": 281.6129913330078, "kl_loss_3": 2604.1417602539063, "kl_loss_6": 2018.3038818359375, "learning_rate": 9.087952946025175e-05, "loss": 1513.502, "step": 8070 }, { "ce_loss_12": 3.3790520071983337, "ce_loss_17": 3.0631897926330565, "ce_loss_23": 2.9467246413230894, "ce_loss_3": 4.123933029174805, "ce_loss_6": 3.831411051750183, "epoch": 0.808, "grad_norm": 1168.0, "kl_loss_12": 1007.8353851318359, "kl_loss_17": 273.74057693481444, "kl_loss_3": 2523.880090332031, "kl_loss_6": 1941.5342407226562, "learning_rate": 8.996945735790446e-05, "loss": 1490.9507, "step": 8080 }, { "ce_loss_12": 3.305538558959961, "ce_loss_17": 2.9698716044425963, "ce_loss_23": 2.853313446044922, "ce_loss_3": 4.071282744407654, "ce_loss_6": 3.770635890960693, "epoch": 0.809, "grad_norm": 1024.0, "kl_loss_12": 1051.1560485839843, "kl_loss_17": 278.94518890380857, "kl_loss_3": 2616.7640014648437, "kl_loss_6": 2014.3000610351562, "learning_rate": 8.906351425856951e-05, "loss": 1505.8312, "step": 8090 }, { "ce_loss_12": 3.3000259399414062, "ce_loss_17": 2.955925440788269, "ce_loss_23": 2.832998812198639, "ce_loss_3": 4.092427730560303, "ce_loss_6": 3.78441641330719, "epoch": 0.81, "grad_norm": 1000.0, "kl_loss_12": 1072.9378540039063, "kl_loss_17": 285.1971321105957, "kl_loss_3": 2694.5508056640624, "kl_loss_6": 2074.3757263183593, "learning_rate": 8.816170928508365e-05, "loss": 1531.4363, "step": 8100 }, { "ce_loss_12": 3.271656095981598, "ce_loss_17": 2.9251946806907654, "ce_loss_23": 2.801189124584198, "ce_loss_3": 4.092127597332, "ce_loss_6": 3.781342017650604, "epoch": 0.811, "grad_norm": 848.0, "kl_loss_12": 1085.3644073486328, "kl_loss_17": 287.12475357055666, "kl_loss_3": 2738.927209472656, "kl_loss_6": 2116.110144042969, "learning_rate": 8.7264051518613e-05, "loss": 1526.0476, "step": 8110 }, { "ce_loss_12": 3.3333471536636354, "ce_loss_17": 3.0051140427589416, "ce_loss_23": 2.8876123905181883, "ce_loss_3": 4.099722731113434, "ce_loss_6": 3.799349296092987, "epoch": 0.812, "grad_norm": 1240.0, "kl_loss_12": 1029.06455078125, "kl_loss_17": 275.13319396972656, "kl_loss_3": 2574.5433715820313, "kl_loss_6": 1977.8097045898437, "learning_rate": 8.637054999856148e-05, "loss": 1484.9934, "step": 8120 }, { "ce_loss_12": 3.340409016609192, "ce_loss_17": 3.000631868839264, "ce_loss_23": 2.872890567779541, "ce_loss_3": 4.133741664886474, "ce_loss_6": 3.820933222770691, "epoch": 0.813, "grad_norm": 1096.0, "kl_loss_12": 1051.732928466797, "kl_loss_17": 288.68270721435545, "kl_loss_3": 2652.4419555664062, "kl_loss_6": 2028.232440185547, "learning_rate": 8.548121372247918e-05, "loss": 1522.7607, "step": 8130 }, { "ce_loss_12": 3.38684196472168, "ce_loss_17": 3.0672188639640807, "ce_loss_23": 2.9500478506088257, "ce_loss_3": 4.153112828731537, "ce_loss_6": 3.8581340312957764, "epoch": 0.814, "grad_norm": 1056.0, "kl_loss_12": 1031.712338256836, "kl_loss_17": 279.44617614746096, "kl_loss_3": 2592.561669921875, "kl_loss_6": 2010.5955078125, "learning_rate": 8.459605164597267e-05, "loss": 1482.8579, "step": 8140 }, { "ce_loss_12": 3.291148364543915, "ce_loss_17": 2.954191195964813, "ce_loss_23": 2.839016282558441, "ce_loss_3": 4.08526486158371, "ce_loss_6": 3.7741797089576723, "epoch": 0.815, "grad_norm": 1168.0, "kl_loss_12": 1047.2955047607422, "kl_loss_17": 279.0842231750488, "kl_loss_3": 2647.6363525390625, "kl_loss_6": 2028.89501953125, "learning_rate": 8.371507268261436e-05, "loss": 1513.5294, "step": 8150 }, { "ce_loss_12": 3.359915280342102, "ce_loss_17": 3.024343800544739, "ce_loss_23": 2.8986566185951235, "ce_loss_3": 4.138587176799774, "ce_loss_6": 3.8319987058639526, "epoch": 0.816, "grad_norm": 980.0, "kl_loss_12": 1049.5741241455078, "kl_loss_17": 284.2603759765625, "kl_loss_3": 2621.991748046875, "kl_loss_6": 2012.3725524902343, "learning_rate": 8.283828570385238e-05, "loss": 1470.4635, "step": 8160 }, { "ce_loss_12": 3.3558297991752624, "ce_loss_17": 3.0201908469200136, "ce_loss_23": 2.9029954433441163, "ce_loss_3": 4.142328202724457, "ce_loss_6": 3.8327948093414306, "epoch": 0.817, "grad_norm": 952.0, "kl_loss_12": 1052.2428649902345, "kl_loss_17": 284.0148567199707, "kl_loss_3": 2619.49140625, "kl_loss_6": 2013.0584106445312, "learning_rate": 8.196569953892202e-05, "loss": 1499.2858, "step": 8170 }, { "ce_loss_12": 3.2958539009094237, "ce_loss_17": 2.9550757884979246, "ce_loss_23": 2.833250343799591, "ce_loss_3": 4.080728983879089, "ce_loss_6": 3.7691246271133423, "epoch": 0.818, "grad_norm": 1072.0, "kl_loss_12": 1066.8218170166015, "kl_loss_17": 286.78183517456057, "kl_loss_3": 2626.90947265625, "kl_loss_6": 2029.0765869140625, "learning_rate": 8.109732297475635e-05, "loss": 1492.1337, "step": 8180 }, { "ce_loss_12": 3.305049383640289, "ce_loss_17": 2.9337514519691466, "ce_loss_23": 2.8022228956222532, "ce_loss_3": 4.12907395362854, "ce_loss_6": 3.8098646640777587, "epoch": 0.819, "grad_norm": 1048.0, "kl_loss_12": 1111.1580535888672, "kl_loss_17": 295.3542709350586, "kl_loss_3": 2750.808483886719, "kl_loss_6": 2130.1484802246096, "learning_rate": 8.023316475589754e-05, "loss": 1544.1424, "step": 8190 }, { "ce_loss_12": 3.2693939805030823, "ce_loss_17": 2.9022013545036316, "ce_loss_23": 2.7660971879959106, "ce_loss_3": 4.129703235626221, "ce_loss_6": 3.8015416502952575, "epoch": 0.82, "grad_norm": 1464.0, "kl_loss_12": 1125.4208984375, "kl_loss_17": 307.3749938964844, "kl_loss_3": 2841.758447265625, "kl_loss_6": 2199.205700683594, "learning_rate": 7.937323358440934e-05, "loss": 1571.7113, "step": 8200 }, { "ce_loss_12": 3.3275758266448974, "ce_loss_17": 3.0086806058883666, "ce_loss_23": 2.8955679059028627, "ce_loss_3": 4.080337619781494, "ce_loss_6": 3.7854795098304748, "epoch": 0.821, "grad_norm": 1064.0, "kl_loss_12": 1025.0358978271483, "kl_loss_17": 275.9737190246582, "kl_loss_3": 2537.994274902344, "kl_loss_6": 1955.729638671875, "learning_rate": 7.851753811978923e-05, "loss": 1478.1028, "step": 8210 }, { "ce_loss_12": 3.361056423187256, "ce_loss_17": 3.023615562915802, "ce_loss_23": 2.899628448486328, "ce_loss_3": 4.155423247814179, "ce_loss_6": 3.857728958129883, "epoch": 0.822, "grad_norm": 1576.0, "kl_loss_12": 1052.4446899414063, "kl_loss_17": 286.32282104492185, "kl_loss_3": 2650.950402832031, "kl_loss_6": 2060.075762939453, "learning_rate": 7.766608697888095e-05, "loss": 1494.5828, "step": 8220 }, { "ce_loss_12": 3.3725324153900145, "ce_loss_17": 3.033415162563324, "ce_loss_23": 2.910620903968811, "ce_loss_3": 4.164709949493409, "ce_loss_6": 3.8546987891197206, "epoch": 0.823, "grad_norm": 1336.0, "kl_loss_12": 1068.575357055664, "kl_loss_17": 287.6273994445801, "kl_loss_3": 2665.501123046875, "kl_loss_6": 2060.4616455078126, "learning_rate": 7.681888873578785e-05, "loss": 1530.6572, "step": 8230 }, { "ce_loss_12": 3.3186384439468384, "ce_loss_17": 2.964526128768921, "ce_loss_23": 2.833543539047241, "ce_loss_3": 4.121504092216492, "ce_loss_6": 3.8054251790046694, "epoch": 0.824, "grad_norm": 1264.0, "kl_loss_12": 1092.8309265136718, "kl_loss_17": 294.7323173522949, "kl_loss_3": 2704.2026489257814, "kl_loss_6": 2085.3021728515623, "learning_rate": 7.597595192178702e-05, "loss": 1513.8944, "step": 8240 }, { "ce_loss_12": 3.3183600187301634, "ce_loss_17": 2.966995060443878, "ce_loss_23": 2.8392292618751527, "ce_loss_3": 4.142496371269226, "ce_loss_6": 3.8186073541641234, "epoch": 0.825, "grad_norm": 968.0, "kl_loss_12": 1095.9381225585937, "kl_loss_17": 291.71913833618163, "kl_loss_3": 2756.4898681640625, "kl_loss_6": 2119.0372924804688, "learning_rate": 7.513728502524286e-05, "loss": 1540.7781, "step": 8250 }, { "ce_loss_12": 3.2841219305992126, "ce_loss_17": 2.955878269672394, "ce_loss_23": 2.842096221446991, "ce_loss_3": 4.0645402550697325, "ce_loss_6": 3.76453902721405, "epoch": 0.826, "grad_norm": 840.0, "kl_loss_12": 1022.670541381836, "kl_loss_17": 271.8673599243164, "kl_loss_3": 2579.4839721679687, "kl_loss_6": 1995.1776184082032, "learning_rate": 7.430289649152156e-05, "loss": 1504.5896, "step": 8260 }, { "ce_loss_12": 3.2348284006118773, "ce_loss_17": 2.876335549354553, "ce_loss_23": 2.7529516220092773, "ce_loss_3": 4.052449405193329, "ce_loss_6": 3.739718866348267, "epoch": 0.827, "grad_norm": 1048.0, "kl_loss_12": 1096.6505584716797, "kl_loss_17": 289.0554008483887, "kl_loss_3": 2760.6556762695313, "kl_loss_6": 2129.0817260742188, "learning_rate": 7.347279472290646e-05, "loss": 1520.1464, "step": 8270 }, { "ce_loss_12": 3.350387394428253, "ce_loss_17": 3.008963429927826, "ce_loss_23": 2.88888795375824, "ce_loss_3": 4.150562691688537, "ce_loss_6": 3.839677166938782, "epoch": 0.828, "grad_norm": 1096.0, "kl_loss_12": 1066.2569763183594, "kl_loss_17": 284.117505645752, "kl_loss_3": 2677.5921875, "kl_loss_6": 2067.034716796875, "learning_rate": 7.264698807851328e-05, "loss": 1521.5023, "step": 8280 }, { "ce_loss_12": 3.3077462196350096, "ce_loss_17": 2.9838837027549743, "ce_loss_23": 2.8650826036930086, "ce_loss_3": 4.088552296161652, "ce_loss_6": 3.7806774139404298, "epoch": 0.829, "grad_norm": 1104.0, "kl_loss_12": 1033.3657104492188, "kl_loss_17": 279.8206199645996, "kl_loss_3": 2593.388317871094, "kl_loss_6": 1994.3953857421875, "learning_rate": 7.182548487420554e-05, "loss": 1490.687, "step": 8290 }, { "ce_loss_12": 3.3589147090911866, "ce_loss_17": 3.0282956838607786, "ce_loss_23": 2.9110039949417112, "ce_loss_3": 4.137791180610657, "ce_loss_6": 3.8347092270851135, "epoch": 0.83, "grad_norm": 1064.0, "kl_loss_12": 1052.384588623047, "kl_loss_17": 285.2430877685547, "kl_loss_3": 2622.855578613281, "kl_loss_6": 2030.3604919433594, "learning_rate": 7.100829338251146e-05, "loss": 1490.9061, "step": 8300 }, { "ce_loss_12": 3.3167926907539367, "ce_loss_17": 2.970119321346283, "ce_loss_23": 2.8363095045089723, "ce_loss_3": 4.117564260959625, "ce_loss_6": 3.808456206321716, "epoch": 0.831, "grad_norm": 1064.0, "kl_loss_12": 1085.094046020508, "kl_loss_17": 295.32879638671875, "kl_loss_3": 2695.377307128906, "kl_loss_6": 2076.8177795410156, "learning_rate": 7.019542183254046e-05, "loss": 1505.1945, "step": 8310 }, { "ce_loss_12": 3.3409668445587157, "ce_loss_17": 3.0074267029762267, "ce_loss_23": 2.8743056416511537, "ce_loss_3": 4.114725065231323, "ce_loss_6": 3.812414515018463, "epoch": 0.832, "grad_norm": 1304.0, "kl_loss_12": 1072.4284088134766, "kl_loss_17": 297.69879455566405, "kl_loss_3": 2642.8615844726564, "kl_loss_6": 2040.5767395019532, "learning_rate": 6.938687840989971e-05, "loss": 1498.9299, "step": 8320 }, { "ce_loss_12": 3.292984998226166, "ce_loss_17": 2.9502809286117553, "ce_loss_23": 2.8233898639678956, "ce_loss_3": 4.075324869155883, "ce_loss_6": 3.7751200199127197, "epoch": 0.833, "grad_norm": 1360.0, "kl_loss_12": 1063.7236389160157, "kl_loss_17": 292.75162200927736, "kl_loss_3": 2629.8801879882812, "kl_loss_6": 2039.0565246582032, "learning_rate": 6.858267125661271e-05, "loss": 1524.8443, "step": 8330 }, { "ce_loss_12": 3.34079647064209, "ce_loss_17": 3.0030898213386537, "ce_loss_23": 2.878040623664856, "ce_loss_3": 4.137813937664032, "ce_loss_6": 3.821487474441528, "epoch": 0.834, "grad_norm": 1232.0, "kl_loss_12": 1057.2560974121093, "kl_loss_17": 284.3835502624512, "kl_loss_3": 2658.54013671875, "kl_loss_6": 2034.9720642089844, "learning_rate": 6.778280847103668e-05, "loss": 1539.2473, "step": 8340 }, { "ce_loss_12": 3.356912088394165, "ce_loss_17": 3.010690116882324, "ce_loss_23": 2.88764488697052, "ce_loss_3": 4.1245949268341064, "ce_loss_6": 3.819116735458374, "epoch": 0.835, "grad_norm": 940.0, "kl_loss_12": 1071.227099609375, "kl_loss_17": 286.31872711181643, "kl_loss_3": 2635.9383056640627, "kl_loss_6": 2024.1812744140625, "learning_rate": 6.698729810778065e-05, "loss": 1500.8768, "step": 8350 }, { "ce_loss_12": 3.2641802549362184, "ce_loss_17": 2.9241887450218202, "ce_loss_23": 2.800455904006958, "ce_loss_3": 4.078383791446686, "ce_loss_6": 3.7547880887985228, "epoch": 0.836, "grad_norm": 1288.0, "kl_loss_12": 1051.2379241943358, "kl_loss_17": 278.18785400390624, "kl_loss_3": 2676.7776123046874, "kl_loss_6": 2041.9749389648437, "learning_rate": 6.619614817762538e-05, "loss": 1516.0025, "step": 8360 }, { "ce_loss_12": 3.2639304637908935, "ce_loss_17": 2.907888662815094, "ce_loss_23": 2.7812278270721436, "ce_loss_3": 4.103890705108642, "ce_loss_6": 3.7825687408447264, "epoch": 0.837, "grad_norm": 960.0, "kl_loss_12": 1098.0486114501953, "kl_loss_17": 289.22062225341796, "kl_loss_3": 2781.6754760742188, "kl_loss_6": 2142.4128173828126, "learning_rate": 6.540936664744196e-05, "loss": 1542.1457, "step": 8370 }, { "ce_loss_12": 3.373264420032501, "ce_loss_17": 3.0289392709732055, "ce_loss_23": 2.9055778861045836, "ce_loss_3": 4.160033249855042, "ce_loss_6": 3.859750437736511, "epoch": 0.838, "grad_norm": 884.0, "kl_loss_12": 1066.5439331054688, "kl_loss_17": 285.0770950317383, "kl_loss_3": 2651.09287109375, "kl_loss_6": 2055.8511474609377, "learning_rate": 6.462696144011149e-05, "loss": 1496.0637, "step": 8380 }, { "ce_loss_12": 3.3310264587402343, "ce_loss_17": 2.991706573963165, "ce_loss_23": 2.8699768662452696, "ce_loss_3": 4.092860805988312, "ce_loss_6": 3.7977572083473206, "epoch": 0.839, "grad_norm": 900.0, "kl_loss_12": 1064.0408813476563, "kl_loss_17": 290.1427459716797, "kl_loss_3": 2610.905310058594, "kl_loss_6": 2026.75966796875, "learning_rate": 6.384894043444567e-05, "loss": 1481.7455, "step": 8390 }, { "ce_loss_12": 3.3575948596000673, "ce_loss_17": 3.0081547141075133, "ce_loss_23": 2.8842729449272158, "ce_loss_3": 4.1496823072433475, "ce_loss_6": 3.8388915777206423, "epoch": 0.84, "grad_norm": 1256.0, "kl_loss_12": 1065.0152313232422, "kl_loss_17": 287.56776962280276, "kl_loss_3": 2659.087756347656, "kl_loss_6": 2051.4926208496095, "learning_rate": 6.307531146510753e-05, "loss": 1501.6816, "step": 8400 }, { "ce_loss_12": 3.3196689248085023, "ce_loss_17": 2.9898666977882384, "ce_loss_23": 2.8641279578208922, "ce_loss_3": 4.084683573246002, "ce_loss_6": 3.7808205842971803, "epoch": 0.841, "grad_norm": 984.0, "kl_loss_12": 1036.9258270263672, "kl_loss_17": 285.1266288757324, "kl_loss_3": 2581.5780639648438, "kl_loss_6": 1977.9350891113281, "learning_rate": 6.230608232253226e-05, "loss": 1472.0643, "step": 8410 }, { "ce_loss_12": 3.306604731082916, "ce_loss_17": 2.9507232666015626, "ce_loss_23": 2.8234981060028077, "ce_loss_3": 4.131628477573395, "ce_loss_6": 3.817171037197113, "epoch": 0.842, "grad_norm": 1104.0, "kl_loss_12": 1087.105435180664, "kl_loss_17": 289.3849044799805, "kl_loss_3": 2721.5063720703124, "kl_loss_6": 2106.2221618652343, "learning_rate": 6.154126075284855e-05, "loss": 1507.0301, "step": 8420 }, { "ce_loss_12": 3.3652244925498964, "ce_loss_17": 3.0340421080589293, "ce_loss_23": 2.9187233686447143, "ce_loss_3": 4.130150222778321, "ce_loss_6": 3.8256107330322267, "epoch": 0.843, "grad_norm": 1304.0, "kl_loss_12": 1038.1420593261719, "kl_loss_17": 276.6188430786133, "kl_loss_3": 2575.8635498046874, "kl_loss_6": 1973.4901733398438, "learning_rate": 6.078085445780129e-05, "loss": 1466.3076, "step": 8430 }, { "ce_loss_12": 3.3772589802742004, "ce_loss_17": 3.0360066175460814, "ce_loss_23": 2.9157355785369874, "ce_loss_3": 4.171771287918091, "ce_loss_6": 3.8688151121139525, "epoch": 0.844, "grad_norm": 968.0, "kl_loss_12": 1063.171942138672, "kl_loss_17": 283.9888038635254, "kl_loss_3": 2673.1849975585938, "kl_loss_6": 2074.389031982422, "learning_rate": 6.002487109467347e-05, "loss": 1489.1416, "step": 8440 }, { "ce_loss_12": 3.387488567829132, "ce_loss_17": 3.050030696392059, "ce_loss_23": 2.9240846395492555, "ce_loss_3": 4.154106748104096, "ce_loss_6": 3.851106035709381, "epoch": 0.845, "grad_norm": 1088.0, "kl_loss_12": 1073.7497619628907, "kl_loss_17": 292.6628059387207, "kl_loss_3": 2634.9559326171875, "kl_loss_6": 2031.09990234375, "learning_rate": 5.927331827620902e-05, "loss": 1492.8718, "step": 8450 }, { "ce_loss_12": 3.3563677072525024, "ce_loss_17": 3.0263548135757445, "ce_loss_23": 2.9048399567604064, "ce_loss_3": 4.099981164932251, "ce_loss_6": 3.8053762912750244, "epoch": 0.846, "grad_norm": 1072.0, "kl_loss_12": 1037.7698760986327, "kl_loss_17": 280.9472290039063, "kl_loss_3": 2535.442004394531, "kl_loss_6": 1953.500341796875, "learning_rate": 5.852620357053651e-05, "loss": 1478.8081, "step": 8460 }, { "ce_loss_12": 3.394399857521057, "ce_loss_17": 3.067415416240692, "ce_loss_23": 2.9515963554382325, "ce_loss_3": 4.152213895320893, "ce_loss_6": 3.857205033302307, "epoch": 0.847, "grad_norm": 1048.0, "kl_loss_12": 1036.8290802001952, "kl_loss_17": 277.5314636230469, "kl_loss_3": 2560.67099609375, "kl_loss_6": 1981.1140991210937, "learning_rate": 5.778353450109286e-05, "loss": 1482.0109, "step": 8470 }, { "ce_loss_12": 3.4367149829864503, "ce_loss_17": 3.098689925670624, "ce_loss_23": 2.9720311760902405, "ce_loss_3": 4.227457308769226, "ce_loss_6": 3.9236016511917113, "epoch": 0.848, "grad_norm": 1012.0, "kl_loss_12": 1065.6336242675782, "kl_loss_17": 288.5423187255859, "kl_loss_3": 2651.488635253906, "kl_loss_6": 2046.8683654785157, "learning_rate": 5.7045318546547206e-05, "loss": 1496.9844, "step": 8480 }, { "ce_loss_12": 3.3335933089256287, "ce_loss_17": 2.9945171236991883, "ce_loss_23": 2.8744940757751465, "ce_loss_3": 4.126176583766937, "ce_loss_6": 3.82481609582901, "epoch": 0.849, "grad_norm": 1328.0, "kl_loss_12": 1058.5303649902344, "kl_loss_17": 282.7712348937988, "kl_loss_3": 2657.193566894531, "kl_loss_6": 2055.022296142578, "learning_rate": 5.631156314072605e-05, "loss": 1494.8992, "step": 8490 }, { "ce_loss_12": 3.351343595981598, "ce_loss_17": 3.025673341751099, "ce_loss_23": 2.9065319418907167, "ce_loss_3": 4.112990772724151, "ce_loss_6": 3.8109803080558775, "epoch": 0.85, "grad_norm": 988.0, "kl_loss_12": 1031.8004302978516, "kl_loss_17": 282.2818008422852, "kl_loss_3": 2580.2078125, "kl_loss_6": 1985.7573120117188, "learning_rate": 5.5582275672538315e-05, "loss": 1471.2869, "step": 8500 }, { "ce_loss_12": 3.3047799110412597, "ce_loss_17": 2.9414509415626524, "ce_loss_23": 2.8145953178405763, "ce_loss_3": 4.127846312522888, "ce_loss_6": 3.8183951377868652, "epoch": 0.851, "grad_norm": 1120.0, "kl_loss_12": 1114.0586822509765, "kl_loss_17": 293.6786460876465, "kl_loss_3": 2775.8820190429688, "kl_loss_6": 2153.7723083496094, "learning_rate": 5.4857463485900484e-05, "loss": 1541.5318, "step": 8510 }, { "ce_loss_12": 3.3387150883674623, "ce_loss_17": 3.0012975692749024, "ce_loss_23": 2.8756859064102174, "ce_loss_3": 4.103997337818146, "ce_loss_6": 3.8028729438781737, "epoch": 0.852, "grad_norm": 1200.0, "kl_loss_12": 1055.3402618408204, "kl_loss_17": 281.1202537536621, "kl_loss_3": 2608.085583496094, "kl_loss_6": 2009.9069580078126, "learning_rate": 5.413713387966329e-05, "loss": 1492.8399, "step": 8520 }, { "ce_loss_12": 3.350775933265686, "ce_loss_17": 3.0103574633598327, "ce_loss_23": 2.8882513999938966, "ce_loss_3": 4.137410664558411, "ce_loss_6": 3.837936055660248, "epoch": 0.853, "grad_norm": 1360.0, "kl_loss_12": 1058.6788848876954, "kl_loss_17": 282.32394790649414, "kl_loss_3": 2648.292834472656, "kl_loss_6": 2052.374346923828, "learning_rate": 5.34212941075381e-05, "loss": 1502.7247, "step": 8530 }, { "ce_loss_12": 3.3418834924697878, "ce_loss_17": 3.0199050664901734, "ce_loss_23": 2.9050287127494814, "ce_loss_3": 4.109955275058747, "ce_loss_6": 3.8061795234680176, "epoch": 0.854, "grad_norm": 1240.0, "kl_loss_12": 1008.4690399169922, "kl_loss_17": 274.83275985717773, "kl_loss_3": 2575.9906005859375, "kl_loss_6": 1970.3159729003905, "learning_rate": 5.270995137802315e-05, "loss": 1473.9852, "step": 8540 }, { "ce_loss_12": 3.292176532745361, "ce_loss_17": 2.96178058385849, "ce_loss_23": 2.8435588598251345, "ce_loss_3": 4.0793102264404295, "ce_loss_6": 3.774105632305145, "epoch": 0.855, "grad_norm": 888.0, "kl_loss_12": 1046.6531677246094, "kl_loss_17": 279.5433349609375, "kl_loss_3": 2637.3547973632812, "kl_loss_6": 2033.5572509765625, "learning_rate": 5.2003112854332125e-05, "loss": 1511.5877, "step": 8550 }, { "ce_loss_12": 3.290618920326233, "ce_loss_17": 2.9556390285491942, "ce_loss_23": 2.842723000049591, "ce_loss_3": 4.057822823524475, "ce_loss_6": 3.7611496210098267, "epoch": 0.856, "grad_norm": 968.0, "kl_loss_12": 1037.3160766601563, "kl_loss_17": 272.4007507324219, "kl_loss_3": 2604.338903808594, "kl_loss_6": 2008.6603881835938, "learning_rate": 5.130078565432089e-05, "loss": 1464.4805, "step": 8560 }, { "ce_loss_12": 3.3430102467536926, "ce_loss_17": 3.0166367650032044, "ce_loss_23": 2.9037629127502442, "ce_loss_3": 4.099278628826141, "ce_loss_6": 3.806718420982361, "epoch": 0.857, "grad_norm": 1032.0, "kl_loss_12": 1027.847314453125, "kl_loss_17": 271.56771697998045, "kl_loss_3": 2560.9104248046874, "kl_loss_6": 1976.3161193847657, "learning_rate": 5.060297685041659e-05, "loss": 1452.2871, "step": 8570 }, { "ce_loss_12": 3.30127671957016, "ce_loss_17": 2.9604085326194762, "ce_loss_23": 2.8306247353553773, "ce_loss_3": 4.100674736499786, "ce_loss_6": 3.7946442484855654, "epoch": 0.858, "grad_norm": 1144.0, "kl_loss_12": 1067.2820556640625, "kl_loss_17": 292.8963912963867, "kl_loss_3": 2683.639929199219, "kl_loss_6": 2071.2172790527343, "learning_rate": 4.99096934695461e-05, "loss": 1524.3353, "step": 8580 }, { "ce_loss_12": 3.3519344210624693, "ce_loss_17": 3.01632022857666, "ce_loss_23": 2.896422302722931, "ce_loss_3": 4.125617742538452, "ce_loss_6": 3.8321552634239198, "epoch": 0.859, "grad_norm": 920.0, "kl_loss_12": 1036.7369049072265, "kl_loss_17": 277.91772079467773, "kl_loss_3": 2593.2775024414063, "kl_loss_6": 2007.981787109375, "learning_rate": 4.922094249306558e-05, "loss": 1469.8185, "step": 8590 }, { "ce_loss_12": 3.3896760821342466, "ce_loss_17": 3.051762652397156, "ce_loss_23": 2.927615690231323, "ce_loss_3": 4.160779738426209, "ce_loss_6": 3.8637610912322997, "epoch": 0.86, "grad_norm": 1136.0, "kl_loss_12": 1061.075439453125, "kl_loss_17": 288.38329010009767, "kl_loss_3": 2624.784045410156, "kl_loss_6": 2029.8816284179688, "learning_rate": 4.853673085668947e-05, "loss": 1474.2352, "step": 8600 }, { "ce_loss_12": 3.401752161979675, "ce_loss_17": 3.05824556350708, "ce_loss_23": 2.9379342675209044, "ce_loss_3": 4.180785989761352, "ce_loss_6": 3.8813895106315615, "epoch": 0.861, "grad_norm": 1104.0, "kl_loss_12": 1066.3026214599608, "kl_loss_17": 280.7836517333984, "kl_loss_3": 2639.3175415039063, "kl_loss_6": 2042.6779052734375, "learning_rate": 4.78570654504214e-05, "loss": 1505.4118, "step": 8610 }, { "ce_loss_12": 3.3483115792274476, "ce_loss_17": 3.014393675327301, "ce_loss_23": 2.892766237258911, "ce_loss_3": 4.125595450401306, "ce_loss_6": 3.8208566427230837, "epoch": 0.862, "grad_norm": 964.0, "kl_loss_12": 1048.1334350585937, "kl_loss_17": 280.49364166259767, "kl_loss_3": 2634.9196533203126, "kl_loss_6": 2022.7561828613282, "learning_rate": 4.7181953118484556e-05, "loss": 1496.3431, "step": 8620 }, { "ce_loss_12": 3.368147909641266, "ce_loss_17": 3.0343467235565185, "ce_loss_23": 2.9179905891418456, "ce_loss_3": 4.131514084339142, "ce_loss_6": 3.834225833415985, "epoch": 0.863, "grad_norm": 1136.0, "kl_loss_12": 1044.3617065429687, "kl_loss_17": 278.33188400268557, "kl_loss_3": 2568.1588134765625, "kl_loss_6": 1984.6631713867187, "learning_rate": 4.651140065925269e-05, "loss": 1506.2501, "step": 8630 }, { "ce_loss_12": 3.3051889181137084, "ce_loss_17": 2.9729141354560853, "ce_loss_23": 2.848560094833374, "ce_loss_3": 4.0864248991012575, "ce_loss_6": 3.7882192850112917, "epoch": 0.864, "grad_norm": 1152.0, "kl_loss_12": 1045.531802368164, "kl_loss_17": 283.5457000732422, "kl_loss_3": 2622.3549560546876, "kl_loss_6": 2034.6311279296874, "learning_rate": 4.58454148251814e-05, "loss": 1513.5437, "step": 8640 }, { "ce_loss_12": 3.328133535385132, "ce_loss_17": 2.98028701543808, "ce_loss_23": 2.856073999404907, "ce_loss_3": 4.13912308216095, "ce_loss_6": 3.828638470172882, "epoch": 0.865, "grad_norm": 1040.0, "kl_loss_12": 1070.8560150146484, "kl_loss_17": 281.1698196411133, "kl_loss_3": 2693.3925903320314, "kl_loss_6": 2070.704803466797, "learning_rate": 4.518400232274078e-05, "loss": 1507.2037, "step": 8650 }, { "ce_loss_12": 3.3494417786598207, "ce_loss_17": 3.0160619020462036, "ce_loss_23": 2.888498270511627, "ce_loss_3": 4.126687633991241, "ce_loss_6": 3.823049473762512, "epoch": 0.866, "grad_norm": 936.0, "kl_loss_12": 1056.8975738525392, "kl_loss_17": 288.9259567260742, "kl_loss_3": 2613.984899902344, "kl_loss_6": 2010.7535461425782, "learning_rate": 4.452716981234745e-05, "loss": 1461.7759, "step": 8660 }, { "ce_loss_12": 3.314436304569244, "ce_loss_17": 2.9823337316513063, "ce_loss_23": 2.864330458641052, "ce_loss_3": 4.09505170583725, "ce_loss_6": 3.78656405210495, "epoch": 0.867, "grad_norm": 984.0, "kl_loss_12": 1041.6230499267579, "kl_loss_17": 275.7464958190918, "kl_loss_3": 2615.276159667969, "kl_loss_6": 1999.0199157714844, "learning_rate": 4.3874923908297335e-05, "loss": 1460.9713, "step": 8670 }, { "ce_loss_12": 3.379078185558319, "ce_loss_17": 3.034619677066803, "ce_loss_23": 2.916077446937561, "ce_loss_3": 4.167405915260315, "ce_loss_6": 3.861358070373535, "epoch": 0.868, "grad_norm": 1320.0, "kl_loss_12": 1071.1197692871094, "kl_loss_17": 283.57053604125974, "kl_loss_3": 2664.220324707031, "kl_loss_6": 2069.8397705078123, "learning_rate": 4.322727117869951e-05, "loss": 1495.5064, "step": 8680 }, { "ce_loss_12": 3.38282253742218, "ce_loss_17": 3.0405599594116213, "ce_loss_23": 2.920660102367401, "ce_loss_3": 4.173842930793763, "ce_loss_6": 3.8629134774208067, "epoch": 0.869, "grad_norm": 1192.0, "kl_loss_12": 1060.762957763672, "kl_loss_17": 284.0414749145508, "kl_loss_3": 2663.9480712890627, "kl_loss_6": 2042.9997863769531, "learning_rate": 4.2584218145409916e-05, "loss": 1494.0689, "step": 8690 }, { "ce_loss_12": 3.392480957508087, "ce_loss_17": 3.0751676082611086, "ce_loss_23": 2.9581345558166503, "ce_loss_3": 4.143105471134186, "ce_loss_6": 3.8466254830360413, "epoch": 0.87, "grad_norm": 1248.0, "kl_loss_12": 1023.1029602050781, "kl_loss_17": 274.7999664306641, "kl_loss_3": 2543.5913696289062, "kl_loss_6": 1950.3782043457031, "learning_rate": 4.194577128396521e-05, "loss": 1453.3071, "step": 8700 }, { "ce_loss_12": 3.306734549999237, "ce_loss_17": 2.971284508705139, "ce_loss_23": 2.852522623538971, "ce_loss_3": 4.084229207038879, "ce_loss_6": 3.782787263393402, "epoch": 0.871, "grad_norm": 976.0, "kl_loss_12": 1037.1832397460937, "kl_loss_17": 276.3266456604004, "kl_loss_3": 2611.7448486328126, "kl_loss_6": 2011.7207641601562, "learning_rate": 4.1311937023518264e-05, "loss": 1502.6502, "step": 8710 }, { "ce_loss_12": 3.310939145088196, "ce_loss_17": 2.9816059350967405, "ce_loss_23": 2.8702484488487245, "ce_loss_3": 4.1435425758361815, "ce_loss_6": 3.833130669593811, "epoch": 0.872, "grad_norm": 968.0, "kl_loss_12": 1030.4807495117188, "kl_loss_17": 269.01084365844724, "kl_loss_3": 2706.113879394531, "kl_loss_6": 2098.7085510253905, "learning_rate": 4.0682721746773344e-05, "loss": 1497.9731, "step": 8720 }, { "ce_loss_12": 3.221154248714447, "ce_loss_17": 2.868494462966919, "ce_loss_23": 2.7455691933631896, "ce_loss_3": 4.032175767421722, "ce_loss_6": 3.7165164232254027, "epoch": 0.873, "grad_norm": 1008.0, "kl_loss_12": 1079.8149719238281, "kl_loss_17": 281.65873489379885, "kl_loss_3": 2713.8420288085936, "kl_loss_6": 2083.4974060058594, "learning_rate": 4.0058131789920904e-05, "loss": 1491.0676, "step": 8730 }, { "ce_loss_12": 3.3403757333755495, "ce_loss_17": 3.00846403837204, "ce_loss_23": 2.890472078323364, "ce_loss_3": 4.107055354118347, "ce_loss_6": 3.804210674762726, "epoch": 0.874, "grad_norm": 940.0, "kl_loss_12": 1055.1203155517578, "kl_loss_17": 273.9949554443359, "kl_loss_3": 2620.6344970703126, "kl_loss_6": 2015.965771484375, "learning_rate": 3.9438173442575e-05, "loss": 1535.924, "step": 8740 }, { "ce_loss_12": 3.3696256041526795, "ce_loss_17": 3.0336912751197813, "ce_loss_23": 2.913709211349487, "ce_loss_3": 4.140432238578796, "ce_loss_6": 3.8295344591140745, "epoch": 0.875, "grad_norm": 1064.0, "kl_loss_12": 1040.0205291748048, "kl_loss_17": 279.5762062072754, "kl_loss_3": 2582.630651855469, "kl_loss_6": 1982.918798828125, "learning_rate": 3.882285294770937e-05, "loss": 1477.0505, "step": 8750 }, { "ce_loss_12": 3.325684869289398, "ce_loss_17": 2.9906407475471495, "ce_loss_23": 2.872960686683655, "ce_loss_3": 4.079718363285065, "ce_loss_6": 3.7855276465415955, "epoch": 0.876, "grad_norm": 892.0, "kl_loss_12": 1029.6630554199219, "kl_loss_17": 275.07968978881837, "kl_loss_3": 2572.990869140625, "kl_loss_6": 1976.6401428222657, "learning_rate": 3.821217650159453e-05, "loss": 1497.0498, "step": 8760 }, { "ce_loss_12": 3.2449742555618286, "ce_loss_17": 2.8872615814208986, "ce_loss_23": 2.7629997372627257, "ce_loss_3": 4.060509705543518, "ce_loss_6": 3.7358765721321108, "epoch": 0.877, "grad_norm": 1016.0, "kl_loss_12": 1094.626397705078, "kl_loss_17": 287.1759391784668, "kl_loss_3": 2719.6081665039064, "kl_loss_6": 2092.2599670410154, "learning_rate": 3.760615025373543e-05, "loss": 1515.3271, "step": 8770 }, { "ce_loss_12": 3.392821705341339, "ce_loss_17": 3.0452610850334167, "ce_loss_23": 2.918588709831238, "ce_loss_3": 4.189818334579468, "ce_loss_6": 3.876577985286713, "epoch": 0.878, "grad_norm": 908.0, "kl_loss_12": 1076.1950653076171, "kl_loss_17": 291.6323013305664, "kl_loss_3": 2688.911828613281, "kl_loss_6": 2060.1080932617188, "learning_rate": 3.700478030680987e-05, "loss": 1532.3845, "step": 8780 }, { "ce_loss_12": 3.3736968278884887, "ce_loss_17": 3.0386847376823427, "ce_loss_23": 2.9209038853645324, "ce_loss_3": 4.149859249591827, "ce_loss_6": 3.8536314964294434, "epoch": 0.879, "grad_norm": 956.0, "kl_loss_12": 1037.4135864257812, "kl_loss_17": 275.3736259460449, "kl_loss_3": 2587.9447998046876, "kl_loss_6": 2007.5624267578125, "learning_rate": 3.6408072716606344e-05, "loss": 1480.7403, "step": 8790 }, { "ce_loss_12": 3.324361264705658, "ce_loss_17": 2.973997724056244, "ce_loss_23": 2.8534497022628784, "ce_loss_3": 4.123316991329193, "ce_loss_6": 3.815816307067871, "epoch": 0.88, "grad_norm": 1568.0, "kl_loss_12": 1078.816357421875, "kl_loss_17": 284.4879379272461, "kl_loss_3": 2702.5100830078127, "kl_loss_6": 2089.4851806640627, "learning_rate": 3.5816033491963716e-05, "loss": 1550.1549, "step": 8800 }, { "ce_loss_12": 3.1941446185112, "ce_loss_17": 2.846931892633438, "ce_loss_23": 2.7270780503749847, "ce_loss_3": 4.030868780612946, "ce_loss_6": 3.7038835287094116, "epoch": 0.881, "grad_norm": 1328.0, "kl_loss_12": 1069.0301940917968, "kl_loss_17": 280.2302864074707, "kl_loss_3": 2742.7992553710938, "kl_loss_6": 2101.755230712891, "learning_rate": 3.522866859471047e-05, "loss": 1520.2628, "step": 8810 }, { "ce_loss_12": 3.3734650373458863, "ce_loss_17": 3.053483486175537, "ce_loss_23": 2.9413180232048033, "ce_loss_3": 4.11530956029892, "ce_loss_6": 3.8248594403266907, "epoch": 0.882, "grad_norm": 1080.0, "kl_loss_12": 1005.0269561767578, "kl_loss_17": 268.26475830078124, "kl_loss_3": 2496.3321533203125, "kl_loss_6": 1920.503350830078, "learning_rate": 3.46459839396045e-05, "loss": 1460.0039, "step": 8820 }, { "ce_loss_12": 3.321186828613281, "ce_loss_17": 2.975290596485138, "ce_loss_23": 2.8529227137565614, "ce_loss_3": 4.11351488828659, "ce_loss_6": 3.8004450678825377, "epoch": 0.883, "grad_norm": 932.0, "kl_loss_12": 1060.291082763672, "kl_loss_17": 282.3150550842285, "kl_loss_3": 2649.278857421875, "kl_loss_6": 2030.3860168457031, "learning_rate": 3.406798539427386e-05, "loss": 1532.8661, "step": 8830 }, { "ce_loss_12": 3.3703830003738404, "ce_loss_17": 3.037055957317352, "ce_loss_23": 2.9183802366256715, "ce_loss_3": 4.155374789237976, "ce_loss_6": 3.849817657470703, "epoch": 0.884, "grad_norm": 1200.0, "kl_loss_12": 1059.6869079589844, "kl_loss_17": 279.1549919128418, "kl_loss_3": 2642.8896484375, "kl_loss_6": 2036.8998413085938, "learning_rate": 3.349467877915746e-05, "loss": 1503.1357, "step": 8840 }, { "ce_loss_12": 3.3473170399665833, "ce_loss_17": 3.0034255027770995, "ce_loss_23": 2.8850237488746644, "ce_loss_3": 4.1362377285957335, "ce_loss_6": 3.832878088951111, "epoch": 0.885, "grad_norm": 1384.0, "kl_loss_12": 1077.8331237792968, "kl_loss_17": 284.7055152893066, "kl_loss_3": 2679.574426269531, "kl_loss_6": 2071.843896484375, "learning_rate": 3.292606986744667e-05, "loss": 1544.5214, "step": 8850 }, { "ce_loss_12": 3.2922990322113037, "ce_loss_17": 2.957613694667816, "ce_loss_23": 2.8416423439979552, "ce_loss_3": 4.080385613441467, "ce_loss_6": 3.7736966252326964, "epoch": 0.886, "grad_norm": 1040.0, "kl_loss_12": 1052.9543731689453, "kl_loss_17": 274.8283012390137, "kl_loss_3": 2630.809216308594, "kl_loss_6": 2031.2622009277343, "learning_rate": 3.23621643850267e-05, "loss": 1492.9676, "step": 8860 }, { "ce_loss_12": 3.364612865447998, "ce_loss_17": 3.027756369113922, "ce_loss_23": 2.9094764232635497, "ce_loss_3": 4.14275975227356, "ce_loss_6": 3.8348689556121824, "epoch": 0.887, "grad_norm": 1056.0, "kl_loss_12": 1063.6302703857423, "kl_loss_17": 286.01028671264646, "kl_loss_3": 2647.3041870117186, "kl_loss_6": 2030.6291870117188, "learning_rate": 3.180296801041971e-05, "loss": 1483.5244, "step": 8870 }, { "ce_loss_12": 3.3761980295181275, "ce_loss_17": 3.048375737667084, "ce_loss_23": 2.932830846309662, "ce_loss_3": 4.171050536632538, "ce_loss_6": 3.8650680780410767, "epoch": 0.888, "grad_norm": 1032.0, "kl_loss_12": 1039.5651916503907, "kl_loss_17": 275.2946243286133, "kl_loss_3": 2641.3774291992186, "kl_loss_6": 2031.7088256835937, "learning_rate": 3.124848637472688e-05, "loss": 1466.1933, "step": 8880 }, { "ce_loss_12": 3.224430525302887, "ce_loss_17": 2.885611081123352, "ce_loss_23": 2.768226385116577, "ce_loss_3": 4.015231025218964, "ce_loss_6": 3.7112488865852358, "epoch": 0.889, "grad_norm": 1120.0, "kl_loss_12": 1044.7563507080079, "kl_loss_17": 271.87509994506837, "kl_loss_3": 2641.4933227539063, "kl_loss_6": 2030.9006042480469, "learning_rate": 3.069872506157212e-05, "loss": 1483.9009, "step": 8890 }, { "ce_loss_12": 3.314626932144165, "ce_loss_17": 2.9772611618041993, "ce_loss_23": 2.861090135574341, "ce_loss_3": 4.088821363449097, "ce_loss_6": 3.7842692494392396, "epoch": 0.89, "grad_norm": 1352.0, "kl_loss_12": 1046.4569183349608, "kl_loss_17": 275.9215934753418, "kl_loss_3": 2622.6554321289063, "kl_loss_6": 2010.8903015136718, "learning_rate": 3.0153689607045842e-05, "loss": 1481.274, "step": 8900 }, { "ce_loss_12": 3.261507201194763, "ce_loss_17": 2.8940985679626463, "ce_loss_23": 2.770907407999039, "ce_loss_3": 4.088374328613281, "ce_loss_6": 3.7789727330207823, "epoch": 0.891, "grad_norm": 1192.0, "kl_loss_12": 1114.263607788086, "kl_loss_17": 285.8632568359375, "kl_loss_3": 2787.009423828125, "kl_loss_6": 2162.252655029297, "learning_rate": 2.9613385499648926e-05, "loss": 1518.8045, "step": 8910 }, { "ce_loss_12": 3.2768518686294557, "ce_loss_17": 2.9423407673835755, "ce_loss_23": 2.8236806631088256, "ce_loss_3": 4.042646539211273, "ce_loss_6": 3.7386788964271545, "epoch": 0.892, "grad_norm": 904.0, "kl_loss_12": 1038.9451446533203, "kl_loss_17": 278.3830856323242, "kl_loss_3": 2579.437939453125, "kl_loss_6": 1984.3537109375, "learning_rate": 2.9077818180237692e-05, "loss": 1491.4748, "step": 8920 }, { "ce_loss_12": 3.31893972158432, "ce_loss_17": 2.9778279304504394, "ce_loss_23": 2.8547706723213198, "ce_loss_3": 4.119372272491455, "ce_loss_6": 3.811346185207367, "epoch": 0.893, "grad_norm": 1096.0, "kl_loss_12": 1043.2595336914062, "kl_loss_17": 278.8047241210937, "kl_loss_3": 2641.9256103515627, "kl_loss_6": 2022.1247985839843, "learning_rate": 2.8546993041969172e-05, "loss": 1490.9999, "step": 8930 }, { "ce_loss_12": 3.3433060526847838, "ce_loss_17": 3.013638412952423, "ce_loss_23": 2.8959841132164, "ce_loss_3": 4.1012047290802, "ce_loss_6": 3.7924872279167174, "epoch": 0.894, "grad_norm": 988.0, "kl_loss_12": 1031.7579345703125, "kl_loss_17": 273.2785339355469, "kl_loss_3": 2578.4865966796874, "kl_loss_6": 1958.0750671386718, "learning_rate": 2.802091543024671e-05, "loss": 1487.326, "step": 8940 }, { "ce_loss_12": 3.3517987966537475, "ce_loss_17": 3.0131351947784424, "ce_loss_23": 2.890279245376587, "ce_loss_3": 4.135260903835297, "ce_loss_6": 3.8338452577590942, "epoch": 0.895, "grad_norm": 1056.0, "kl_loss_12": 1064.8489837646484, "kl_loss_17": 281.3643424987793, "kl_loss_3": 2666.0608520507812, "kl_loss_6": 2056.2918701171875, "learning_rate": 2.7499590642665774e-05, "loss": 1530.04, "step": 8950 }, { "ce_loss_12": 3.3488635540008547, "ce_loss_17": 3.025653636455536, "ce_loss_23": 2.9073669075965882, "ce_loss_3": 4.125187158584595, "ce_loss_6": 3.82365905046463, "epoch": 0.896, "grad_norm": 1048.0, "kl_loss_12": 1037.3114379882813, "kl_loss_17": 279.98156356811523, "kl_loss_3": 2598.100732421875, "kl_loss_6": 1999.0735656738282, "learning_rate": 2.6983023928961405e-05, "loss": 1473.7261, "step": 8960 }, { "ce_loss_12": 3.3273321270942686, "ce_loss_17": 2.992566633224487, "ce_loss_23": 2.8700740814208983, "ce_loss_3": 4.105877768993378, "ce_loss_6": 3.8104355692863465, "epoch": 0.897, "grad_norm": 1440.0, "kl_loss_12": 1041.3552825927734, "kl_loss_17": 278.02837600708006, "kl_loss_3": 2598.092138671875, "kl_loss_6": 2015.8913024902345, "learning_rate": 2.6471220490954628e-05, "loss": 1505.1469, "step": 8970 }, { "ce_loss_12": 3.3092846512794494, "ce_loss_17": 2.985722553730011, "ce_loss_23": 2.875733423233032, "ce_loss_3": 4.094814789295197, "ce_loss_6": 3.7927043557167055, "epoch": 0.898, "grad_norm": 1160.0, "kl_loss_12": 1027.3684204101562, "kl_loss_17": 274.019766998291, "kl_loss_3": 2601.745983886719, "kl_loss_6": 2008.9629211425781, "learning_rate": 2.596418548250029e-05, "loss": 1486.4215, "step": 8980 }, { "ce_loss_12": 3.3536044120788575, "ce_loss_17": 3.0218282103538514, "ce_loss_23": 2.9010983228683473, "ce_loss_3": 4.125899636745453, "ce_loss_6": 3.829993963241577, "epoch": 0.899, "grad_norm": 1048.0, "kl_loss_12": 1056.1575775146484, "kl_loss_17": 281.9281745910645, "kl_loss_3": 2629.8561401367188, "kl_loss_6": 2032.8089904785156, "learning_rate": 2.5461924009435368e-05, "loss": 1478.6717, "step": 8990 }, { "ce_loss_12": 3.3432854652404784, "ce_loss_17": 3.010783517360687, "ce_loss_23": 2.8903890252113342, "ce_loss_3": 4.118599140644074, "ce_loss_6": 3.810559618473053, "epoch": 0.9, "grad_norm": 1072.0, "kl_loss_12": 1052.9543823242188, "kl_loss_17": 283.1170166015625, "kl_loss_3": 2605.7826538085938, "kl_loss_6": 1999.7805297851562, "learning_rate": 2.4964441129527336e-05, "loss": 1508.3487, "step": 9000 }, { "ce_loss_12": 3.333498179912567, "ce_loss_17": 3.0122481107711794, "ce_loss_23": 2.899858772754669, "ce_loss_3": 4.097084081172943, "ce_loss_6": 3.7975186944007873, "epoch": 0.901, "grad_norm": 2160.0, "kl_loss_12": 1020.2845001220703, "kl_loss_17": 271.0269744873047, "kl_loss_3": 2563.192272949219, "kl_loss_6": 1966.9169921875, "learning_rate": 2.4471741852423235e-05, "loss": 1465.4293, "step": 9010 }, { "ce_loss_12": 3.399704563617706, "ce_loss_17": 3.0641690731048583, "ce_loss_23": 2.943408727645874, "ce_loss_3": 4.163009667396546, "ce_loss_6": 3.8632459163665773, "epoch": 0.902, "grad_norm": 1176.0, "kl_loss_12": 1035.918508911133, "kl_loss_17": 277.75927505493166, "kl_loss_3": 2568.3744995117186, "kl_loss_6": 1979.5784423828125, "learning_rate": 2.3983831139599287e-05, "loss": 1475.4694, "step": 9020 }, { "ce_loss_12": 3.3126293659210204, "ce_loss_17": 2.981877088546753, "ce_loss_23": 2.8633044242858885, "ce_loss_3": 4.085747599601746, "ce_loss_6": 3.7825194239616393, "epoch": 0.903, "grad_norm": 1064.0, "kl_loss_12": 1017.3647033691407, "kl_loss_17": 272.69321899414064, "kl_loss_3": 2586.8809204101562, "kl_loss_6": 1981.6273864746095, "learning_rate": 2.3500713904311022e-05, "loss": 1444.9332, "step": 9030 }, { "ce_loss_12": 3.3346133708953856, "ce_loss_17": 3.0174012422561645, "ce_loss_23": 2.904342031478882, "ce_loss_3": 4.085952639579773, "ce_loss_6": 3.79553884267807, "epoch": 0.904, "grad_norm": 1144.0, "kl_loss_12": 1006.2362091064454, "kl_loss_17": 267.2675193786621, "kl_loss_3": 2516.152868652344, "kl_loss_6": 1946.0141967773438, "learning_rate": 2.3022395011543685e-05, "loss": 1443.6189, "step": 9040 }, { "ce_loss_12": 3.385925018787384, "ce_loss_17": 3.0416790723800657, "ce_loss_23": 2.9204747676849365, "ce_loss_3": 4.156314277648926, "ce_loss_6": 3.848258936405182, "epoch": 0.905, "grad_norm": 1448.0, "kl_loss_12": 1069.7081512451173, "kl_loss_17": 286.5039176940918, "kl_loss_3": 2623.0983764648436, "kl_loss_6": 2016.3146728515626, "learning_rate": 2.2548879277963063e-05, "loss": 1520.4957, "step": 9050 }, { "ce_loss_12": 3.2931102752685546, "ce_loss_17": 2.968023347854614, "ce_loss_23": 2.8518067359924317, "ce_loss_3": 4.063058459758759, "ce_loss_6": 3.7594056487083436, "epoch": 0.906, "grad_norm": 1088.0, "kl_loss_12": 1022.5076141357422, "kl_loss_17": 273.32446670532227, "kl_loss_3": 2567.251330566406, "kl_loss_6": 1970.2335693359375, "learning_rate": 2.208017147186736e-05, "loss": 1434.3865, "step": 9060 }, { "ce_loss_12": 3.2922698259353638, "ce_loss_17": 2.9559255719184874, "ce_loss_23": 2.837369775772095, "ce_loss_3": 4.067186653614044, "ce_loss_6": 3.764479637145996, "epoch": 0.907, "grad_norm": 1088.0, "kl_loss_12": 1038.8773620605468, "kl_loss_17": 273.9896865844727, "kl_loss_3": 2609.279541015625, "kl_loss_6": 2013.705078125, "learning_rate": 2.1616276313139227e-05, "loss": 1471.0474, "step": 9070 }, { "ce_loss_12": 3.3338735938072204, "ce_loss_17": 3.0027177572250365, "ce_loss_23": 2.8810632467269897, "ce_loss_3": 4.112732636928558, "ce_loss_6": 3.8151703476905823, "epoch": 0.908, "grad_norm": 1032.0, "kl_loss_12": 1036.070867919922, "kl_loss_17": 277.0100456237793, "kl_loss_3": 2601.8918701171874, "kl_loss_6": 2009.253973388672, "learning_rate": 2.1157198473197415e-05, "loss": 1500.7545, "step": 9080 }, { "ce_loss_12": 3.3962167143821715, "ce_loss_17": 3.0618329763412477, "ce_loss_23": 2.937837529182434, "ce_loss_3": 4.186863827705383, "ce_loss_6": 3.875001060962677, "epoch": 0.909, "grad_norm": 1032.0, "kl_loss_12": 1066.4001190185547, "kl_loss_17": 285.7182487487793, "kl_loss_3": 2643.5581665039062, "kl_loss_6": 2037.5399230957032, "learning_rate": 2.0702942574950812e-05, "loss": 1497.7828, "step": 9090 }, { "ce_loss_12": 3.3390152215957642, "ce_loss_17": 2.999873125553131, "ce_loss_23": 2.8761911392211914, "ce_loss_3": 4.124549388885498, "ce_loss_6": 3.8125956773757936, "epoch": 0.91, "grad_norm": 1048.0, "kl_loss_12": 1069.9019256591796, "kl_loss_17": 287.98951568603513, "kl_loss_3": 2650.0345458984375, "kl_loss_6": 2025.361865234375, "learning_rate": 2.025351319275137e-05, "loss": 1498.6699, "step": 9100 }, { "ce_loss_12": 3.4453240752220156, "ce_loss_17": 3.1048492908477785, "ce_loss_23": 2.9836212515830995, "ce_loss_3": 4.217355704307556, "ce_loss_6": 3.9146528840065002, "epoch": 0.911, "grad_norm": 1104.0, "kl_loss_12": 1077.796401977539, "kl_loss_17": 285.53475799560545, "kl_loss_3": 2642.6656860351563, "kl_loss_6": 2044.46884765625, "learning_rate": 1.9808914852347816e-05, "loss": 1532.4327, "step": 9110 }, { "ce_loss_12": 3.2942785143852236, "ce_loss_17": 2.9586670756340028, "ce_loss_23": 2.833610546588898, "ce_loss_3": 4.07194048166275, "ce_loss_6": 3.7556120276451113, "epoch": 0.912, "grad_norm": 1032.0, "kl_loss_12": 1044.191128540039, "kl_loss_17": 278.99835510253905, "kl_loss_3": 2605.4442626953123, "kl_loss_6": 1986.3443481445313, "learning_rate": 1.9369152030840554e-05, "loss": 1475.4129, "step": 9120 }, { "ce_loss_12": 3.3684799432754517, "ce_loss_17": 3.038332152366638, "ce_loss_23": 2.9220212578773497, "ce_loss_3": 4.150288593769074, "ce_loss_6": 3.8487510085105896, "epoch": 0.913, "grad_norm": 1368.0, "kl_loss_12": 1052.0964294433593, "kl_loss_17": 275.7938705444336, "kl_loss_3": 2654.1997680664062, "kl_loss_6": 2046.3094360351563, "learning_rate": 1.893422915663645e-05, "loss": 1500.7535, "step": 9130 }, { "ce_loss_12": 3.2739500761032105, "ce_loss_17": 2.925761067867279, "ce_loss_23": 2.8034397840499876, "ce_loss_3": 4.088464558124542, "ce_loss_6": 3.7661290884017946, "epoch": 0.914, "grad_norm": 868.0, "kl_loss_12": 1069.2874298095703, "kl_loss_17": 282.2047752380371, "kl_loss_3": 2702.7495727539062, "kl_loss_6": 2066.0608825683594, "learning_rate": 1.850415060940386e-05, "loss": 1517.72, "step": 9140 }, { "ce_loss_12": 3.3597048044204714, "ce_loss_17": 3.0365005016326903, "ce_loss_23": 2.921056258678436, "ce_loss_3": 4.113396620750427, "ce_loss_6": 3.813002622127533, "epoch": 0.915, "grad_norm": 1064.0, "kl_loss_12": 1032.023666381836, "kl_loss_17": 275.6730438232422, "kl_loss_3": 2575.0119384765626, "kl_loss_6": 1964.8316711425782, "learning_rate": 1.8078920720028978e-05, "loss": 1474.0727, "step": 9150 }, { "ce_loss_12": 3.283889102935791, "ce_loss_17": 2.9600313901901245, "ce_loss_23": 2.849053978919983, "ce_loss_3": 4.043579959869385, "ce_loss_6": 3.7437153458595276, "epoch": 0.916, "grad_norm": 1400.0, "kl_loss_12": 1023.5445556640625, "kl_loss_17": 270.17430953979493, "kl_loss_3": 2535.034033203125, "kl_loss_6": 1954.6587158203124, "learning_rate": 1.765854377057219e-05, "loss": 1481.7438, "step": 9160 }, { "ce_loss_12": 3.261327934265137, "ce_loss_17": 2.9379109382629394, "ce_loss_23": 2.826120972633362, "ce_loss_3": 4.033465266227722, "ce_loss_6": 3.731200802326202, "epoch": 0.917, "grad_norm": 1024.0, "kl_loss_12": 1006.3924865722656, "kl_loss_17": 265.4219459533691, "kl_loss_3": 2555.5319702148436, "kl_loss_6": 1970.5117919921875, "learning_rate": 1.724302399422456e-05, "loss": 1468.8849, "step": 9170 }, { "ce_loss_12": 3.259600067138672, "ce_loss_17": 2.918641984462738, "ce_loss_23": 2.7934481501579285, "ce_loss_3": 4.037407219409943, "ce_loss_6": 3.722501480579376, "epoch": 0.918, "grad_norm": 1024.0, "kl_loss_12": 1066.1141998291016, "kl_loss_17": 286.471826171875, "kl_loss_3": 2632.22890625, "kl_loss_6": 2012.1058410644532, "learning_rate": 1.683236557526574e-05, "loss": 1494.4383, "step": 9180 }, { "ce_loss_12": 3.332295763492584, "ce_loss_17": 3.0142446875572206, "ce_loss_23": 2.9020984530448914, "ce_loss_3": 4.076275789737702, "ce_loss_6": 3.778683769702911, "epoch": 0.919, "grad_norm": 832.0, "kl_loss_12": 1006.3529113769531, "kl_loss_17": 267.5891944885254, "kl_loss_3": 2506.306994628906, "kl_loss_6": 1916.475634765625, "learning_rate": 1.6426572649021475e-05, "loss": 1464.321, "step": 9190 }, { "ce_loss_12": 3.355605161190033, "ce_loss_17": 3.045579528808594, "ce_loss_23": 2.931298625469208, "ce_loss_3": 4.095176577568054, "ce_loss_6": 3.79943608045578, "epoch": 0.92, "grad_norm": 1352.0, "kl_loss_12": 1008.678677368164, "kl_loss_17": 274.654532623291, "kl_loss_3": 2516.2755615234373, "kl_loss_6": 1922.5859741210938, "learning_rate": 1.6025649301821876e-05, "loss": 1456.7072, "step": 9200 }, { "ce_loss_12": 3.3600135803222657, "ce_loss_17": 3.036742627620697, "ce_loss_23": 2.914152812957764, "ce_loss_3": 4.0989128947258, "ce_loss_6": 3.8089195847511292, "epoch": 0.921, "grad_norm": 1040.0, "kl_loss_12": 1034.201220703125, "kl_loss_17": 278.173348236084, "kl_loss_3": 2540.3933715820312, "kl_loss_6": 1959.813055419922, "learning_rate": 1.5629599570960716e-05, "loss": 1449.2359, "step": 9210 }, { "ce_loss_12": 3.281749439239502, "ce_loss_17": 2.9552075147628782, "ce_loss_23": 2.8368995785713196, "ce_loss_3": 4.073426759243011, "ce_loss_6": 3.761858856678009, "epoch": 0.922, "grad_norm": 864.0, "kl_loss_12": 1043.070327758789, "kl_loss_17": 276.25808944702146, "kl_loss_3": 2643.6991577148438, "kl_loss_6": 2027.4319091796874, "learning_rate": 1.5238427444654367e-05, "loss": 1485.8428, "step": 9220 }, { "ce_loss_12": 3.3239726305007933, "ce_loss_17": 2.9993454456329345, "ce_loss_23": 2.880155599117279, "ce_loss_3": 4.090979540348053, "ce_loss_6": 3.7864125967025757, "epoch": 0.923, "grad_norm": 812.0, "kl_loss_12": 1024.8006652832032, "kl_loss_17": 274.18183517456055, "kl_loss_3": 2569.9059326171873, "kl_loss_6": 1968.6023559570312, "learning_rate": 1.4852136862001764e-05, "loss": 1467.6075, "step": 9230 }, { "ce_loss_12": 3.2993701934814452, "ce_loss_17": 2.9703409552574156, "ce_loss_23": 2.854680836200714, "ce_loss_3": 4.047952282428741, "ce_loss_6": 3.7495328783988953, "epoch": 0.924, "grad_norm": 1088.0, "kl_loss_12": 1021.3720611572265, "kl_loss_17": 271.0319076538086, "kl_loss_3": 2543.8160400390625, "kl_loss_6": 1948.3985900878906, "learning_rate": 1.4470731712944884e-05, "loss": 1473.6795, "step": 9240 }, { "ce_loss_12": 3.330318236351013, "ce_loss_17": 2.995251107215881, "ce_loss_23": 2.874153757095337, "ce_loss_3": 4.114708054065704, "ce_loss_6": 3.7896088123321534, "epoch": 0.925, "grad_norm": 1312.0, "kl_loss_12": 1042.4645599365235, "kl_loss_17": 280.9706100463867, "kl_loss_3": 2619.0177490234373, "kl_loss_6": 1987.1992736816405, "learning_rate": 1.4094215838229174e-05, "loss": 1503.7229, "step": 9250 }, { "ce_loss_12": 3.3083136796951296, "ce_loss_17": 2.9710094690322877, "ce_loss_23": 2.851603698730469, "ce_loss_3": 4.09444340467453, "ce_loss_6": 3.784080910682678, "epoch": 0.926, "grad_norm": 1072.0, "kl_loss_12": 1049.132028198242, "kl_loss_17": 276.60772094726565, "kl_loss_3": 2644.8538818359375, "kl_loss_6": 2025.5753662109375, "learning_rate": 1.372259302936546e-05, "loss": 1540.0123, "step": 9260 }, { "ce_loss_12": 3.401785659790039, "ce_loss_17": 3.0654093503952025, "ce_loss_23": 2.944291520118713, "ce_loss_3": 4.174069476127625, "ce_loss_6": 3.8684807658195495, "epoch": 0.927, "grad_norm": 1200.0, "kl_loss_12": 1050.0497802734376, "kl_loss_17": 288.2478630065918, "kl_loss_3": 2608.3745239257814, "kl_loss_6": 2001.8755798339844, "learning_rate": 1.3355867028591206e-05, "loss": 1474.5029, "step": 9270 }, { "ce_loss_12": 3.2946610689163207, "ce_loss_17": 2.9762078881263734, "ce_loss_23": 2.857401442527771, "ce_loss_3": 4.049395573139191, "ce_loss_6": 3.7476461291313172, "epoch": 0.928, "grad_norm": 1272.0, "kl_loss_12": 1026.7928558349608, "kl_loss_17": 271.5105369567871, "kl_loss_3": 2554.0462646484375, "kl_loss_6": 1948.730401611328, "learning_rate": 1.2994041528833267e-05, "loss": 1462.0882, "step": 9280 }, { "ce_loss_12": 3.3038975596427917, "ce_loss_17": 2.975873041152954, "ce_loss_23": 2.8577345252037047, "ce_loss_3": 4.068048667907715, "ce_loss_6": 3.7713038444519045, "epoch": 0.929, "grad_norm": 1248.0, "kl_loss_12": 1031.1834899902344, "kl_loss_17": 270.2506500244141, "kl_loss_3": 2588.7155151367188, "kl_loss_6": 1989.1662902832031, "learning_rate": 1.2637120173670358e-05, "loss": 1462.8256, "step": 9290 }, { "ce_loss_12": 3.329246151447296, "ce_loss_17": 2.9927667379379272, "ce_loss_23": 2.871297037601471, "ce_loss_3": 4.111742997169495, "ce_loss_6": 3.8050437331199647, "epoch": 0.93, "grad_norm": 1480.0, "kl_loss_12": 1047.7047393798828, "kl_loss_17": 280.56040649414064, "kl_loss_3": 2612.6916259765626, "kl_loss_6": 2005.778125, "learning_rate": 1.2285106557296478e-05, "loss": 1476.817, "step": 9300 }, { "ce_loss_12": 3.2389184474945067, "ce_loss_17": 2.891287457942963, "ce_loss_23": 2.7736205101013183, "ce_loss_3": 4.076551246643066, "ce_loss_6": 3.7549044370651243, "epoch": 0.931, "grad_norm": 988.0, "kl_loss_12": 1067.2639923095703, "kl_loss_17": 278.7872940063477, "kl_loss_3": 2738.7692749023436, "kl_loss_6": 2110.3818420410157, "learning_rate": 1.1938004224484989e-05, "loss": 1510.7908, "step": 9310 }, { "ce_loss_12": 3.4300517201423646, "ce_loss_17": 3.103699254989624, "ce_loss_23": 2.9833187103271483, "ce_loss_3": 4.195168387889862, "ce_loss_6": 3.888136649131775, "epoch": 0.932, "grad_norm": 1012.0, "kl_loss_12": 1042.8537139892578, "kl_loss_17": 280.47555923461914, "kl_loss_3": 2594.4358642578127, "kl_loss_6": 1984.2061462402344, "learning_rate": 1.1595816670552429e-05, "loss": 1501.0504, "step": 9320 }, { "ce_loss_12": 3.3418365716934204, "ce_loss_17": 3.026378798484802, "ce_loss_23": 2.9092856764793398, "ce_loss_3": 4.11734037399292, "ce_loss_6": 3.8102577209472654, "epoch": 0.933, "grad_norm": 1544.0, "kl_loss_12": 1018.7249389648438, "kl_loss_17": 275.87775268554685, "kl_loss_3": 2572.297644042969, "kl_loss_6": 1973.426904296875, "learning_rate": 1.1258547341323699e-05, "loss": 1452.3419, "step": 9330 }, { "ce_loss_12": 3.384983277320862, "ce_loss_17": 3.058712899684906, "ce_loss_23": 2.9401206731796266, "ce_loss_3": 4.143124210834503, "ce_loss_6": 3.8467655301094057, "epoch": 0.934, "grad_norm": 1020.0, "kl_loss_12": 1042.8963317871094, "kl_loss_17": 279.26527938842776, "kl_loss_3": 2582.5818603515627, "kl_loss_6": 1990.2594421386718, "learning_rate": 1.0926199633097156e-05, "loss": 1467.6707, "step": 9340 }, { "ce_loss_12": 3.3787927627563477, "ce_loss_17": 3.065500867366791, "ce_loss_23": 2.9487602353096007, "ce_loss_3": 4.118912374973297, "ce_loss_6": 3.823423957824707, "epoch": 0.935, "grad_norm": 1040.0, "kl_loss_12": 1013.5736785888672, "kl_loss_17": 271.54649047851564, "kl_loss_3": 2529.153271484375, "kl_loss_6": 1936.4615600585937, "learning_rate": 1.0598776892610684e-05, "loss": 1487.6551, "step": 9350 }, { "ce_loss_12": 3.2322107553482056, "ce_loss_17": 2.892566466331482, "ce_loss_23": 2.778317618370056, "ce_loss_3": 4.017880356311798, "ce_loss_6": 3.70479074716568, "epoch": 0.936, "grad_norm": 1056.0, "kl_loss_12": 1048.4545867919921, "kl_loss_17": 272.3912384033203, "kl_loss_3": 2631.8158935546876, "kl_loss_6": 2015.51806640625, "learning_rate": 1.0276282417007399e-05, "loss": 1472.0926, "step": 9360 }, { "ce_loss_12": 3.348423981666565, "ce_loss_17": 3.0298909902572633, "ce_loss_23": 2.9166847348213194, "ce_loss_3": 4.097442483901977, "ce_loss_6": 3.80429527759552, "epoch": 0.937, "grad_norm": 1128.0, "kl_loss_12": 1012.1721435546875, "kl_loss_17": 268.58225708007814, "kl_loss_3": 2518.1324829101563, "kl_loss_6": 1936.9852722167968, "learning_rate": 9.958719453803277e-06, "loss": 1457.2605, "step": 9370 }, { "ce_loss_12": 3.3690081238746643, "ce_loss_17": 3.030905532836914, "ce_loss_23": 2.9100383877754212, "ce_loss_3": 4.134298634529114, "ce_loss_6": 3.834771382808685, "epoch": 0.938, "grad_norm": 1128.0, "kl_loss_12": 1051.0932312011719, "kl_loss_17": 276.5739906311035, "kl_loss_3": 2599.291943359375, "kl_loss_6": 2003.5282287597656, "learning_rate": 9.646091200853802e-06, "loss": 1469.2527, "step": 9380 }, { "ce_loss_12": 3.3185535073280334, "ce_loss_17": 2.9908557176589965, "ce_loss_23": 2.873013174533844, "ce_loss_3": 4.07992981672287, "ce_loss_6": 3.7722710847854612, "epoch": 0.939, "grad_norm": 2736.0, "kl_loss_12": 1026.4719848632812, "kl_loss_17": 272.8523124694824, "kl_loss_3": 2563.2376708984375, "kl_loss_6": 1953.9481689453125, "learning_rate": 9.338400806321978e-06, "loss": 1427.4537, "step": 9390 }, { "ce_loss_12": 3.352263617515564, "ce_loss_17": 3.0240584135055544, "ce_loss_23": 2.9010123133659365, "ce_loss_3": 4.113032567501068, "ce_loss_6": 3.8156125426292418, "epoch": 0.94, "grad_norm": 824.0, "kl_loss_12": 1039.0976806640624, "kl_loss_17": 283.21082916259763, "kl_loss_3": 2561.940295410156, "kl_loss_6": 1986.542413330078, "learning_rate": 9.035651368646646e-06, "loss": 1458.089, "step": 9400 }, { "ce_loss_12": 3.3536390900611877, "ce_loss_17": 3.030505657196045, "ce_loss_23": 2.9178043007850647, "ce_loss_3": 4.11185348033905, "ce_loss_6": 3.80875107049942, "epoch": 0.941, "grad_norm": 1144.0, "kl_loss_12": 1023.1300567626953, "kl_loss_17": 269.4735565185547, "kl_loss_3": 2550.777551269531, "kl_loss_6": 1962.2735717773437, "learning_rate": 8.737845936511335e-06, "loss": 1468.4475, "step": 9410 }, { "ce_loss_12": 3.318327307701111, "ce_loss_17": 2.984967792034149, "ce_loss_23": 2.862468791007996, "ce_loss_3": 4.097449505329132, "ce_loss_6": 3.794388008117676, "epoch": 0.942, "grad_norm": 1344.0, "kl_loss_12": 1049.340396118164, "kl_loss_17": 281.8879737854004, "kl_loss_3": 2629.4581298828125, "kl_loss_6": 2026.4955139160156, "learning_rate": 8.444987508813451e-06, "loss": 1479.8018, "step": 9420 }, { "ce_loss_12": 3.2956662774086, "ce_loss_17": 2.946325933933258, "ce_loss_23": 2.8255608439445496, "ce_loss_3": 4.088038122653961, "ce_loss_6": 3.7815781831741333, "epoch": 0.943, "grad_norm": 1352.0, "kl_loss_12": 1077.2649169921874, "kl_loss_17": 284.64258270263673, "kl_loss_3": 2692.6649780273438, "kl_loss_6": 2081.035852050781, "learning_rate": 8.157079034633974e-06, "loss": 1504.0758, "step": 9430 }, { "ce_loss_12": 3.269844686985016, "ce_loss_17": 2.935304272174835, "ce_loss_23": 2.81859130859375, "ce_loss_3": 4.045350301265716, "ce_loss_6": 3.7425882458686828, "epoch": 0.944, "grad_norm": 1240.0, "kl_loss_12": 1048.7982055664063, "kl_loss_17": 275.0502960205078, "kl_loss_3": 2619.5218505859375, "kl_loss_6": 2022.1584899902343, "learning_rate": 7.874123413208145e-06, "loss": 1472.9727, "step": 9440 }, { "ce_loss_12": 3.2593874454498293, "ce_loss_17": 2.9077999234199523, "ce_loss_23": 2.791432774066925, "ce_loss_3": 4.056062710285187, "ce_loss_6": 3.7430989146232605, "epoch": 0.945, "grad_norm": 920.0, "kl_loss_12": 1054.8998809814452, "kl_loss_17": 277.63961563110354, "kl_loss_3": 2658.3320190429686, "kl_loss_6": 2038.6107299804687, "learning_rate": 7.59612349389599e-06, "loss": 1498.3892, "step": 9450 }, { "ce_loss_12": 3.3173531889915466, "ce_loss_17": 2.995698320865631, "ce_loss_23": 2.881583273410797, "ce_loss_3": 4.064676368236542, "ce_loss_6": 3.7659632921218873, "epoch": 0.946, "grad_norm": 1368.0, "kl_loss_12": 1005.65966796875, "kl_loss_17": 268.5233100891113, "kl_loss_3": 2510.1213134765626, "kl_loss_6": 1912.729071044922, "learning_rate": 7.323082076153509e-06, "loss": 1452.6504, "step": 9460 }, { "ce_loss_12": 3.3648293733596804, "ce_loss_17": 3.0392589926719666, "ce_loss_23": 2.921140217781067, "ce_loss_3": 4.111345851421357, "ce_loss_6": 3.815649449825287, "epoch": 0.947, "grad_norm": 792.0, "kl_loss_12": 1030.4669982910157, "kl_loss_17": 280.2841407775879, "kl_loss_3": 2527.8580322265625, "kl_loss_6": 1948.9057983398438, "learning_rate": 7.055001909504755e-06, "loss": 1481.591, "step": 9470 }, { "ce_loss_12": 3.400721490383148, "ce_loss_17": 3.0784170508384703, "ce_loss_23": 2.9573259115219117, "ce_loss_3": 4.157055807113648, "ce_loss_6": 3.8579179406166078, "epoch": 0.948, "grad_norm": 1032.0, "kl_loss_12": 1039.483480834961, "kl_loss_17": 277.8528198242187, "kl_loss_3": 2571.344763183594, "kl_loss_6": 1984.4642028808594, "learning_rate": 6.791885693514133e-06, "loss": 1473.3818, "step": 9480 }, { "ce_loss_12": 3.3163992047309874, "ce_loss_17": 2.985028529167175, "ce_loss_23": 2.8660680651664734, "ce_loss_3": 4.108574676513672, "ce_loss_6": 3.7938371777534483, "epoch": 0.949, "grad_norm": 1536.0, "kl_loss_12": 1044.1281616210938, "kl_loss_17": 278.31127853393554, "kl_loss_3": 2656.0906494140627, "kl_loss_6": 2034.2154418945313, "learning_rate": 6.533736077758867e-06, "loss": 1498.9166, "step": 9490 }, { "ce_loss_12": 3.295397973060608, "ce_loss_17": 2.953207588195801, "ce_loss_23": 2.8314607620239256, "ce_loss_3": 4.096197354793548, "ce_loss_6": 3.7886332869529724, "epoch": 0.95, "grad_norm": 1320.0, "kl_loss_12": 1081.0685180664063, "kl_loss_17": 285.80748291015624, "kl_loss_3": 2691.1257080078126, "kl_loss_6": 2082.5201416015625, "learning_rate": 6.2805556618028556e-06, "loss": 1492.1794, "step": 9500 }, { "ce_loss_12": 3.3366139054298403, "ce_loss_17": 3.0263991355895996, "ce_loss_23": 2.9105852842330933, "ce_loss_3": 4.09708423614502, "ce_loss_6": 3.7928590297698976, "epoch": 0.951, "grad_norm": 1368.0, "kl_loss_12": 986.2284301757812, "kl_loss_17": 267.4480796813965, "kl_loss_3": 2520.3510498046876, "kl_loss_6": 1924.9207092285155, "learning_rate": 6.032346995169968e-06, "loss": 1412.5211, "step": 9510 }, { "ce_loss_12": 3.3517472743988037, "ce_loss_17": 3.0306913375854494, "ce_loss_23": 2.916367495059967, "ce_loss_3": 4.121383500099182, "ce_loss_6": 3.81394704580307, "epoch": 0.952, "grad_norm": 1040.0, "kl_loss_12": 1032.4408020019532, "kl_loss_17": 276.3887512207031, "kl_loss_3": 2577.0925537109374, "kl_loss_6": 1970.7738647460938, "learning_rate": 5.789112577318789e-06, "loss": 1458.9979, "step": 9520 }, { "ce_loss_12": 3.3498944520950316, "ce_loss_17": 3.016447389125824, "ce_loss_23": 2.89688059091568, "ce_loss_3": 4.12940376996994, "ce_loss_6": 3.8246996641159057, "epoch": 0.953, "grad_norm": 932.0, "kl_loss_12": 1050.8054412841798, "kl_loss_17": 279.5018035888672, "kl_loss_3": 2623.3647338867186, "kl_loss_6": 2023.7264892578125, "learning_rate": 5.550854857617194e-06, "loss": 1462.5711, "step": 9530 }, { "ce_loss_12": 3.3379459381103516, "ce_loss_17": 3.001688039302826, "ce_loss_23": 2.877410364151001, "ce_loss_3": 4.137910008430481, "ce_loss_6": 3.8291539192199706, "epoch": 0.954, "grad_norm": 948.0, "kl_loss_12": 1063.9521179199219, "kl_loss_17": 285.664315032959, "kl_loss_3": 2678.9524658203127, "kl_loss_6": 2059.6889831542967, "learning_rate": 5.317576235317756e-06, "loss": 1505.4752, "step": 9540 }, { "ce_loss_12": 3.3384133100509645, "ce_loss_17": 3.017950987815857, "ce_loss_23": 2.9062692165374755, "ce_loss_3": 4.092934966087341, "ce_loss_6": 3.778127145767212, "epoch": 0.955, "grad_norm": 1032.0, "kl_loss_12": 995.489956665039, "kl_loss_17": 267.39564056396483, "kl_loss_3": 2516.7044067382812, "kl_loss_6": 1901.7774536132813, "learning_rate": 5.089279059533658e-06, "loss": 1463.7273, "step": 9550 }, { "ce_loss_12": 3.407905399799347, "ce_loss_17": 3.0725943088531493, "ce_loss_23": 2.9485292077064513, "ce_loss_3": 4.161868751049042, "ce_loss_6": 3.857768952846527, "epoch": 0.956, "grad_norm": 1032.0, "kl_loss_12": 1044.148989868164, "kl_loss_17": 283.47854843139646, "kl_loss_3": 2572.16533203125, "kl_loss_6": 1962.809149169922, "learning_rate": 4.865965629214819e-06, "loss": 1457.177, "step": 9560 }, { "ce_loss_12": 3.3561294078826904, "ce_loss_17": 3.022306752204895, "ce_loss_23": 2.9044751048088076, "ce_loss_3": 4.1291221380233765, "ce_loss_6": 3.825023341178894, "epoch": 0.957, "grad_norm": 992.0, "kl_loss_12": 1050.428485107422, "kl_loss_17": 279.6668502807617, "kl_loss_3": 2627.7093994140623, "kl_loss_6": 2018.747003173828, "learning_rate": 4.6476381931251366e-06, "loss": 1466.3221, "step": 9570 }, { "ce_loss_12": 3.334410846233368, "ce_loss_17": 3.010200834274292, "ce_loss_23": 2.892228066921234, "ce_loss_3": 4.09078129529953, "ce_loss_6": 3.7890233635902404, "epoch": 0.958, "grad_norm": 936.0, "kl_loss_12": 1018.1339416503906, "kl_loss_17": 272.4146240234375, "kl_loss_3": 2554.4165771484377, "kl_loss_6": 1950.5408752441406, "learning_rate": 4.434298949819449e-06, "loss": 1460.3959, "step": 9580 }, { "ce_loss_12": 3.324360358715057, "ce_loss_17": 2.981010985374451, "ce_loss_23": 2.857758712768555, "ce_loss_3": 4.125252890586853, "ce_loss_6": 3.8098152041435243, "epoch": 0.959, "grad_norm": 1004.0, "kl_loss_12": 1088.3455841064454, "kl_loss_17": 288.2356979370117, "kl_loss_3": 2720.9905395507812, "kl_loss_6": 2097.9127197265625, "learning_rate": 4.2259500476214406e-06, "loss": 1509.5423, "step": 9590 }, { "ce_loss_12": 3.2877951502799987, "ce_loss_17": 2.954547381401062, "ce_loss_23": 2.8363230228424072, "ce_loss_3": 4.070486485958099, "ce_loss_6": 3.769054639339447, "epoch": 0.96, "grad_norm": 944.0, "kl_loss_12": 1045.826885986328, "kl_loss_17": 276.11616744995115, "kl_loss_3": 2624.5047119140627, "kl_loss_6": 2027.1597595214844, "learning_rate": 4.02259358460233e-06, "loss": 1472.7125, "step": 9600 }, { "ce_loss_12": 3.3414696574211122, "ce_loss_17": 3.0194800734519958, "ce_loss_23": 2.8990446448326113, "ce_loss_3": 4.104188013076782, "ce_loss_6": 3.8056410789489745, "epoch": 0.961, "grad_norm": 1072.0, "kl_loss_12": 1023.0301330566406, "kl_loss_17": 278.8875244140625, "kl_loss_3": 2551.333874511719, "kl_loss_6": 1957.9582397460938, "learning_rate": 3.8242316085594916e-06, "loss": 1456.6529, "step": 9610 }, { "ce_loss_12": 3.2622671365737914, "ce_loss_17": 2.914546084403992, "ce_loss_23": 2.791579818725586, "ce_loss_3": 4.080668830871582, "ce_loss_6": 3.7585220336914062, "epoch": 0.962, "grad_norm": 1456.0, "kl_loss_12": 1076.800796508789, "kl_loss_17": 285.5383560180664, "kl_loss_3": 2724.994970703125, "kl_loss_6": 2097.9461975097656, "learning_rate": 3.630866116995757e-06, "loss": 1529.8656, "step": 9620 }, { "ce_loss_12": 3.3696959376335145, "ce_loss_17": 3.051957297325134, "ce_loss_23": 2.9373905539512633, "ce_loss_3": 4.120000338554382, "ce_loss_6": 3.8250346064567564, "epoch": 0.963, "grad_norm": 944.0, "kl_loss_12": 1015.1488037109375, "kl_loss_17": 271.67421875, "kl_loss_3": 2542.3350463867187, "kl_loss_6": 1950.3465576171875, "learning_rate": 3.4424990570994797e-06, "loss": 1484.4218, "step": 9630 }, { "ce_loss_12": 3.3661725878715516, "ce_loss_17": 3.044158565998077, "ce_loss_23": 2.9250866651535032, "ce_loss_3": 4.131625187397003, "ce_loss_6": 3.826724684238434, "epoch": 0.964, "grad_norm": 1104.0, "kl_loss_12": 1032.4666778564454, "kl_loss_17": 274.765673828125, "kl_loss_3": 2582.521203613281, "kl_loss_6": 1976.779034423828, "learning_rate": 3.2591323257248896e-06, "loss": 1470.472, "step": 9640 }, { "ce_loss_12": 3.2373790383338927, "ce_loss_17": 2.9063490629196167, "ce_loss_23": 2.790112245082855, "ce_loss_3": 4.014607954025268, "ce_loss_6": 3.7152130365371705, "epoch": 0.965, "grad_norm": 1056.0, "kl_loss_12": 1035.9166870117188, "kl_loss_17": 273.9448547363281, "kl_loss_3": 2597.5956787109376, "kl_loss_6": 2007.8367492675782, "learning_rate": 3.0807677693729385e-06, "loss": 1492.4096, "step": 9650 }, { "ce_loss_12": 3.4023047566413878, "ce_loss_17": 3.0736619353294374, "ce_loss_23": 2.9579479098320007, "ce_loss_3": 4.1481396675109865, "ce_loss_6": 3.8566255927085877, "epoch": 0.966, "grad_norm": 956.0, "kl_loss_12": 1027.349755859375, "kl_loss_17": 272.975756072998, "kl_loss_3": 2543.197399902344, "kl_loss_6": 1959.232568359375, "learning_rate": 2.9074071841727055e-06, "loss": 1445.7012, "step": 9660 }, { "ce_loss_12": 3.3433603644371033, "ce_loss_17": 3.0127647042274477, "ce_loss_23": 2.8944259762763975, "ce_loss_3": 4.101817774772644, "ce_loss_6": 3.796090304851532, "epoch": 0.967, "grad_norm": 1224.0, "kl_loss_12": 1034.7205352783203, "kl_loss_17": 277.1310554504395, "kl_loss_3": 2587.0287109375, "kl_loss_6": 1977.2501525878906, "learning_rate": 2.739052315863355e-06, "loss": 1441.5164, "step": 9670 }, { "ce_loss_12": 3.308350610733032, "ce_loss_17": 2.9846307158470156, "ce_loss_23": 2.8726982355117796, "ce_loss_3": 4.092323124408722, "ce_loss_6": 3.7888726472854612, "epoch": 0.968, "grad_norm": 944.0, "kl_loss_12": 1020.8781433105469, "kl_loss_17": 271.05588684082034, "kl_loss_3": 2602.898889160156, "kl_loss_6": 2005.857647705078, "learning_rate": 2.5757048597765396e-06, "loss": 1460.2541, "step": 9680 }, { "ce_loss_12": 3.338445246219635, "ce_loss_17": 3.0044411063194274, "ce_loss_23": 2.8843555092811584, "ce_loss_3": 4.114881277084351, "ce_loss_6": 3.8083300948143006, "epoch": 0.969, "grad_norm": 1408.0, "kl_loss_12": 1044.8596771240234, "kl_loss_17": 275.3220375061035, "kl_loss_3": 2604.5454223632814, "kl_loss_6": 2008.3942138671875, "learning_rate": 2.417366460819359e-06, "loss": 1477.1682, "step": 9690 }, { "ce_loss_12": 3.3537409782409666, "ce_loss_17": 3.017800807952881, "ce_loss_23": 2.894302558898926, "ce_loss_3": 4.140884184837342, "ce_loss_6": 3.8325692772865296, "epoch": 0.97, "grad_norm": 1112.0, "kl_loss_12": 1051.7619720458983, "kl_loss_17": 282.70016326904295, "kl_loss_3": 2649.907849121094, "kl_loss_6": 2030.5795593261719, "learning_rate": 2.2640387134577057e-06, "loss": 1469.5803, "step": 9700 }, { "ce_loss_12": 3.2549274802207946, "ce_loss_17": 2.939910852909088, "ce_loss_23": 2.828316831588745, "ce_loss_3": 4.001725673675537, "ce_loss_6": 3.705301547050476, "epoch": 0.971, "grad_norm": 868.0, "kl_loss_12": 978.7618072509765, "kl_loss_17": 263.37414093017577, "kl_loss_3": 2466.4112915039063, "kl_loss_6": 1881.4376586914063, "learning_rate": 2.115723161700278e-06, "loss": 1432.3954, "step": 9710 }, { "ce_loss_12": 3.2688825488090516, "ce_loss_17": 2.9281031847000123, "ce_loss_23": 2.8067177176475524, "ce_loss_3": 4.0559126257896425, "ce_loss_6": 3.7535340428352355, "epoch": 0.972, "grad_norm": 956.0, "kl_loss_12": 1062.6104888916016, "kl_loss_17": 282.78052520751953, "kl_loss_3": 2651.771044921875, "kl_loss_6": 2049.60595703125, "learning_rate": 1.9724212990830937e-06, "loss": 1501.4723, "step": 9720 }, { "ce_loss_12": 3.3958194136619566, "ce_loss_17": 3.057923400402069, "ce_loss_23": 2.93945198059082, "ce_loss_3": 4.177759373188019, "ce_loss_6": 3.860970449447632, "epoch": 0.973, "grad_norm": 996.0, "kl_loss_12": 1045.985891723633, "kl_loss_17": 280.6972091674805, "kl_loss_3": 2639.22490234375, "kl_loss_6": 2008.8534240722656, "learning_rate": 1.8341345686543331e-06, "loss": 1483.7762, "step": 9730 }, { "ce_loss_12": 3.3623135566711424, "ce_loss_17": 3.0398145794868467, "ce_loss_23": 2.9244924068450926, "ce_loss_3": 4.100259184837341, "ce_loss_6": 3.8060136675834655, "epoch": 0.974, "grad_norm": 876.0, "kl_loss_12": 1019.5402954101562, "kl_loss_17": 271.8833610534668, "kl_loss_3": 2513.6443725585937, "kl_loss_6": 1932.6292114257812, "learning_rate": 1.7008643629596864e-06, "loss": 1476.4012, "step": 9740 }, { "ce_loss_12": 3.345140302181244, "ce_loss_17": 3.0265002131462095, "ce_loss_23": 2.9070200681686402, "ce_loss_3": 4.126636505126953, "ce_loss_6": 3.814054250717163, "epoch": 0.975, "grad_norm": 940.0, "kl_loss_12": 1021.1226135253906, "kl_loss_17": 276.88904571533203, "kl_loss_3": 2610.3638305664062, "kl_loss_6": 1993.3836791992187, "learning_rate": 1.5726120240288633e-06, "loss": 1493.1273, "step": 9750 }, { "ce_loss_12": 3.264871072769165, "ce_loss_17": 2.9383165001869203, "ce_loss_23": 2.8230011105537414, "ce_loss_3": 4.03258649110794, "ce_loss_6": 3.733627498149872, "epoch": 0.976, "grad_norm": 1072.0, "kl_loss_12": 1035.583740234375, "kl_loss_17": 273.65186462402346, "kl_loss_3": 2586.930322265625, "kl_loss_6": 1992.2733276367187, "learning_rate": 1.4493788433612708e-06, "loss": 1460.1872, "step": 9760 }, { "ce_loss_12": 3.380255401134491, "ce_loss_17": 3.0444837689399717, "ce_loss_23": 2.9262943625450135, "ce_loss_3": 4.149372577667236, "ce_loss_6": 3.8591771006584166, "epoch": 0.977, "grad_norm": 1192.0, "kl_loss_12": 1043.110906982422, "kl_loss_17": 277.4537742614746, "kl_loss_3": 2614.2517211914064, "kl_loss_6": 2021.9392700195312, "learning_rate": 1.3311660619138578e-06, "loss": 1488.0276, "step": 9770 }, { "ce_loss_12": 3.3608654618263243, "ce_loss_17": 3.0446624159812927, "ce_loss_23": 2.9237622380256654, "ce_loss_3": 4.094170534610749, "ce_loss_6": 3.795668828487396, "epoch": 0.978, "grad_norm": 960.0, "kl_loss_12": 1019.3621154785156, "kl_loss_17": 276.7036956787109, "kl_loss_3": 2498.4484741210936, "kl_loss_6": 1912.7908447265625, "learning_rate": 1.2179748700879012e-06, "loss": 1463.3352, "step": 9780 }, { "ce_loss_12": 3.304617428779602, "ce_loss_17": 2.974220836162567, "ce_loss_23": 2.8536474823951723, "ce_loss_3": 4.064146685600281, "ce_loss_6": 3.767940413951874, "epoch": 0.979, "grad_norm": 1248.0, "kl_loss_12": 1021.9867584228516, "kl_loss_17": 275.06206970214845, "kl_loss_3": 2550.9173217773437, "kl_loss_6": 1961.2408264160156, "learning_rate": 1.1098064077174619e-06, "loss": 1467.9999, "step": 9790 }, { "ce_loss_12": 3.3371457934379576, "ce_loss_17": 3.0031715512275694, "ce_loss_23": 2.884718680381775, "ce_loss_3": 4.124406111240387, "ce_loss_6": 3.829567217826843, "epoch": 0.98, "grad_norm": 984.0, "kl_loss_12": 1038.0324310302735, "kl_loss_17": 275.1086616516113, "kl_loss_3": 2638.8526611328125, "kl_loss_6": 2041.0841125488282, "learning_rate": 1.006661764057837e-06, "loss": 1482.5494, "step": 9800 }, { "ce_loss_12": 3.337872099876404, "ce_loss_17": 3.011410188674927, "ce_loss_23": 2.8942373633384704, "ce_loss_3": 4.104587101936341, "ce_loss_6": 3.799306297302246, "epoch": 0.981, "grad_norm": 1552.0, "kl_loss_12": 1033.9137969970702, "kl_loss_17": 271.6068618774414, "kl_loss_3": 2588.8114990234376, "kl_loss_6": 1984.8397277832032, "learning_rate": 9.085419777743465e-07, "loss": 1457.612, "step": 9810 }, { "ce_loss_12": 3.287680518627167, "ce_loss_17": 2.964575207233429, "ce_loss_23": 2.8532678127288817, "ce_loss_3": 4.052232813835144, "ce_loss_6": 3.7622761726379395, "epoch": 0.982, "grad_norm": 900.0, "kl_loss_12": 1018.4030578613281, "kl_loss_17": 267.15757751464844, "kl_loss_3": 2563.1669799804686, "kl_loss_6": 1980.1304626464844, "learning_rate": 8.15448036932176e-07, "loss": 1438.089, "step": 9820 }, { "ce_loss_12": 3.336066448688507, "ce_loss_17": 3.0072415232658387, "ce_loss_23": 2.8884667992591857, "ce_loss_3": 4.095813620090484, "ce_loss_6": 3.7954184889793394, "epoch": 0.983, "grad_norm": 1080.0, "kl_loss_12": 1039.043862915039, "kl_loss_17": 275.2289779663086, "kl_loss_3": 2588.074365234375, "kl_loss_6": 1992.1940795898438, "learning_rate": 7.273808789862724e-07, "loss": 1481.977, "step": 9830 }, { "ce_loss_12": 3.3937831997871397, "ce_loss_17": 3.0694342851638794, "ce_loss_23": 2.951556408405304, "ce_loss_3": 4.159242665767669, "ce_loss_6": 3.8498289585113525, "epoch": 0.984, "grad_norm": 988.0, "kl_loss_12": 1037.1162841796875, "kl_loss_17": 277.25635147094727, "kl_loss_3": 2582.94873046875, "kl_loss_6": 1974.8491821289062, "learning_rate": 6.443413907720186e-07, "loss": 1460.4299, "step": 9840 }, { "ce_loss_12": 3.335273730754852, "ce_loss_17": 3.014903891086578, "ce_loss_23": 2.8978246688842773, "ce_loss_3": 4.110360252857208, "ce_loss_6": 3.8039698004722595, "epoch": 0.985, "grad_norm": 1272.0, "kl_loss_12": 1019.0031555175781, "kl_loss_17": 274.9228126525879, "kl_loss_3": 2559.80078125, "kl_loss_6": 1957.603826904297, "learning_rate": 5.663304084960185e-07, "loss": 1449.5539, "step": 9850 }, { "ce_loss_12": 3.282993268966675, "ce_loss_17": 2.944962537288666, "ce_loss_23": 2.827660346031189, "ce_loss_3": 4.065218067169189, "ce_loss_6": 3.7537544131278993, "epoch": 0.986, "grad_norm": 1020.0, "kl_loss_12": 1047.8510192871095, "kl_loss_17": 278.3067687988281, "kl_loss_3": 2623.368518066406, "kl_loss_6": 2011.9999145507813, "learning_rate": 4.933487177280482e-07, "loss": 1457.2979, "step": 9860 }, { "ce_loss_12": 3.3598272681236265, "ce_loss_17": 3.0410907745361326, "ce_loss_23": 2.929271900653839, "ce_loss_3": 4.1150998711586, "ce_loss_6": 3.8177472591400146, "epoch": 0.987, "grad_norm": 1416.0, "kl_loss_12": 1012.1949981689453, "kl_loss_17": 266.2216857910156, "kl_loss_3": 2549.34970703125, "kl_loss_6": 1953.7899536132813, "learning_rate": 4.2539705339295075e-07, "loss": 1442.8347, "step": 9870 }, { "ce_loss_12": 3.2366280555725098, "ce_loss_17": 2.9017109751701353, "ce_loss_23": 2.7866319894790648, "ce_loss_3": 4.015068626403808, "ce_loss_6": 3.7159414172172545, "epoch": 0.988, "grad_norm": 972.0, "kl_loss_12": 1028.9166107177734, "kl_loss_17": 269.9573547363281, "kl_loss_3": 2598.421826171875, "kl_loss_6": 2006.371514892578, "learning_rate": 3.6247609976319816e-07, "loss": 1456.7176, "step": 9880 }, { "ce_loss_12": 3.32789089679718, "ce_loss_17": 2.990896999835968, "ce_loss_23": 2.8709633350372314, "ce_loss_3": 4.108030164241791, "ce_loss_6": 3.8056225538253785, "epoch": 0.989, "grad_norm": 1016.0, "kl_loss_12": 1053.1518157958985, "kl_loss_17": 278.55935821533205, "kl_loss_3": 2620.5390747070314, "kl_loss_6": 2017.3613342285157, "learning_rate": 3.0458649045211895e-07, "loss": 1506.2714, "step": 9890 }, { "ce_loss_12": 3.3056017994880675, "ce_loss_17": 2.962406671047211, "ce_loss_23": 2.8373791098594667, "ce_loss_3": 4.081587100028992, "ce_loss_6": 3.772992491722107, "epoch": 0.99, "grad_norm": 908.0, "kl_loss_12": 1048.933447265625, "kl_loss_17": 283.54878692626954, "kl_loss_3": 2603.8252807617187, "kl_loss_6": 2000.5755249023437, "learning_rate": 2.517288084074587e-07, "loss": 1495.7473, "step": 9900 }, { "ce_loss_12": 3.359211504459381, "ce_loss_17": 3.006488561630249, "ce_loss_23": 2.8813580870628357, "ce_loss_3": 4.151961064338684, "ce_loss_6": 3.842729997634888, "epoch": 0.991, "grad_norm": 1400.0, "kl_loss_12": 1086.1037017822266, "kl_loss_17": 288.2465118408203, "kl_loss_3": 2684.929248046875, "kl_loss_6": 2073.267474365234, "learning_rate": 2.0390358590538505e-07, "loss": 1503.6292, "step": 9910 }, { "ce_loss_12": 3.341462767124176, "ce_loss_17": 3.008048748970032, "ce_loss_23": 2.8869954109191895, "ce_loss_3": 4.113165223598481, "ce_loss_6": 3.8097443699836733, "epoch": 0.992, "grad_norm": 1072.0, "kl_loss_12": 1045.311962890625, "kl_loss_17": 279.93851470947266, "kl_loss_3": 2600.6946655273437, "kl_loss_6": 2003.008935546875, "learning_rate": 1.61111304545436e-07, "loss": 1464.2891, "step": 9920 }, { "ce_loss_12": 3.3059886693954468, "ce_loss_17": 2.98328115940094, "ce_loss_23": 2.863870179653168, "ce_loss_3": 4.0723427653312685, "ce_loss_6": 3.7720368027687075, "epoch": 0.993, "grad_norm": 904.0, "kl_loss_12": 1034.073046875, "kl_loss_17": 274.453141784668, "kl_loss_3": 2587.8949584960938, "kl_loss_6": 1989.5302185058595, "learning_rate": 1.2335239524541298e-07, "loss": 1448.7602, "step": 9930 }, { "ce_loss_12": 3.2763088941574097, "ce_loss_17": 2.942208456993103, "ce_loss_23": 2.8267432928085325, "ce_loss_3": 4.041367328166961, "ce_loss_6": 3.7408220410346984, "epoch": 0.994, "grad_norm": 956.0, "kl_loss_12": 1025.5233093261718, "kl_loss_17": 272.5311225891113, "kl_loss_3": 2580.800732421875, "kl_loss_6": 1972.7632751464844, "learning_rate": 9.06272382371065e-08, "loss": 1464.1678, "step": 9940 }, { "ce_loss_12": 3.348012113571167, "ce_loss_17": 3.0078374981880187, "ce_loss_23": 2.893405330181122, "ce_loss_3": 4.12751475572586, "ce_loss_6": 3.822390305995941, "epoch": 0.995, "grad_norm": 1360.0, "kl_loss_12": 1059.0628814697266, "kl_loss_17": 276.8828186035156, "kl_loss_3": 2634.0769165039064, "kl_loss_6": 2026.2205505371094, "learning_rate": 6.293616306246586e-08, "loss": 1481.2273, "step": 9950 }, { "ce_loss_12": 3.320646572113037, "ce_loss_17": 2.999920296669006, "ce_loss_23": 2.885223948955536, "ce_loss_3": 4.070868468284607, "ce_loss_6": 3.771753668785095, "epoch": 0.996, "grad_norm": 1008.0, "kl_loss_12": 1013.6059875488281, "kl_loss_17": 268.9860580444336, "kl_loss_3": 2521.4711303710938, "kl_loss_6": 1938.149951171875, "learning_rate": 4.027944857032395e-08, "loss": 1429.216, "step": 9960 }, { "ce_loss_12": 3.311885952949524, "ce_loss_17": 2.9989118576049805, "ce_loss_23": 2.8931652188301085, "ce_loss_3": 4.040776383876801, "ce_loss_6": 3.745835208892822, "epoch": 0.997, "grad_norm": 1216.0, "kl_loss_12": 976.0381683349609, "kl_loss_17": 258.4782745361328, "kl_loss_3": 2448.4940795898438, "kl_loss_6": 1869.9054077148437, "learning_rate": 2.265732291356626e-08, "loss": 1405.6646, "step": 9970 }, { "ce_loss_12": 3.3592669129371644, "ce_loss_17": 3.0400989770889284, "ce_loss_23": 2.925530707836151, "ce_loss_3": 4.115602195262909, "ce_loss_6": 3.8081278443336486, "epoch": 0.998, "grad_norm": 1056.0, "kl_loss_12": 1014.3947631835938, "kl_loss_17": 273.0302558898926, "kl_loss_3": 2532.3029907226564, "kl_loss_6": 1928.0451232910157, "learning_rate": 1.0069963546743833e-08, "loss": 1470.89, "step": 9980 }, { "ce_loss_12": 3.3478829503059386, "ce_loss_17": 3.021211862564087, "ce_loss_23": 2.89819540977478, "ce_loss_3": 4.12130331993103, "ce_loss_6": 3.81664662361145, "epoch": 0.999, "grad_norm": 1168.0, "kl_loss_12": 1037.3339416503907, "kl_loss_17": 278.43563385009764, "kl_loss_3": 2601.9752197265625, "kl_loss_6": 1998.0107849121093, "learning_rate": 2.517497224463483e-09, "loss": 1464.9984, "step": 9990 }, { "ce_loss_12": 3.3240800499916077, "ce_loss_17": 2.977655363082886, "ce_loss_23": 2.8543686270713806, "ce_loss_3": 4.138887107372284, "ce_loss_6": 3.817936360836029, "epoch": 1.0, "grad_norm": 1224.0, "kl_loss_12": 1070.1510314941406, "kl_loss_17": 283.68445587158203, "kl_loss_3": 2717.0578735351564, "kl_loss_6": 2080.7163208007814, "learning_rate": 0.0, "loss": 1512.9695, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.502582338838856e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }